From 0962313ebd45abb116d6ad2ee0eb754f587fc299 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 8 Nov 2023 13:11:13 +0100 Subject: garage_rpc: reorder functions in layout.rs --- src/rpc/layout.rs | 223 +++++++++++++++++++++++++++--------------------------- 1 file changed, 113 insertions(+), 110 deletions(-) (limited to 'src') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index e02a180b..368a9d2c 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -278,86 +278,7 @@ impl ClusterLayout { ret } - fn calculate_staging_hash(&self) -> Hash { - let hashed_tuple = (&self.staging_roles, &self.staging_parameters); - blake2sum(&nonversioned_encode(&hashed_tuple).unwrap()[..]) - } - - pub fn merge(&mut self, other: &ClusterLayout) -> bool { - match other.version.cmp(&self.version) { - Ordering::Greater => { - *self = other.clone(); - true - } - Ordering::Equal => { - self.staging_parameters.merge(&other.staging_parameters); - self.staging_roles.merge(&other.staging_roles); - - let new_staging_hash = self.calculate_staging_hash(); - let changed = new_staging_hash != self.staging_hash; - - self.staging_hash = new_staging_hash; - - changed - } - Ordering::Less => false, - } - } - - pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self, Message), Error> { - match version { - None => { - let error = r#" -Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. -To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. - "#; - return Err(Error::Message(error.into())); - } - Some(v) => { - if v != self.version + 1 { - return Err(Error::Message("Invalid new layout version".into())); - } - } - } - - self.roles.merge(&self.staging_roles); - self.roles.retain(|(_, _, v)| v.0.is_some()); - self.parameters = *self.staging_parameters.get(); - - self.staging_roles.clear(); - self.staging_hash = self.calculate_staging_hash(); - - let msg = self.calculate_partition_assignment()?; - - self.version += 1; - - Ok((self, msg)) - } - - pub fn revert_staged_changes(mut self, version: Option) -> Result { - match version { - None => { - let error = r#" -Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. -To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. - "#; - return Err(Error::Message(error.into())); - } - Some(v) => { - if v != self.version + 1 { - return Err(Error::Message("Invalid new layout version".into())); - } - } - } - - self.staging_roles.clear(); - self.staging_parameters.update(self.parameters); - self.staging_hash = self.calculate_staging_hash(); - - self.version += 1; - - Ok(self) - } + // ===================== accessors ====================== /// Returns a list of IDs of nodes that currently have /// a role in the cluster @@ -377,28 +298,6 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - /// Returns the uuids of the non_gateway nodes in self.node_id_vec. - fn nongateway_nodes(&self) -> Vec { - let mut result = Vec::::new(); - for uuid in self.node_id_vec.iter() { - match self.node_role(uuid) { - Some(role) if role.capacity.is_some() => result.push(*uuid), - _ => (), - } - } - result - } - - /// Given a node uuids, this function returns the label of its zone - fn get_node_zone(&self, uuid: &Uuid) -> Result { - match self.node_role(uuid) { - Some(role) => Ok(role.zone.clone()), - _ => Err(Error::Message( - "The Uuid does not correspond to a node present in the cluster.".into(), - )), - } - } - /// Given a node uuids, this function returns its capacity or fails if it does not have any pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { match self.node_role(uuid) { @@ -435,6 +334,30 @@ To know the correct value of the new layout version, invoke `garage layout show` )) } + // ===================== internal information extractors ====================== + + /// Returns the uuids of the non_gateway nodes in self.node_id_vec. + fn nongateway_nodes(&self) -> Vec { + let mut result = Vec::::new(); + for uuid in self.node_id_vec.iter() { + match self.node_role(uuid) { + Some(role) if role.capacity.is_some() => result.push(*uuid), + _ => (), + } + } + result + } + + /// Given a node uuids, this function returns the label of its zone + fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { + match self.node_role(uuid) { + Some(role) => Ok(&role.zone), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the cluster.".into(), + )), + } + } + /// Returns the sum of capacities of non gateway nodes in the cluster fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; @@ -461,6 +384,89 @@ To know the correct value of the new layout version, invoke `garage layout show` } } + fn calculate_staging_hash(&self) -> Hash { + let hashed_tuple = (&self.staging_roles, &self.staging_parameters); + blake2sum(&nonversioned_encode(&hashed_tuple).unwrap()[..]) + } + + // ================== updates to layout, public interface =================== + + pub fn merge(&mut self, other: &ClusterLayout) -> bool { + match other.version.cmp(&self.version) { + Ordering::Greater => { + *self = other.clone(); + true + } + Ordering::Equal => { + self.staging_parameters.merge(&other.staging_parameters); + self.staging_roles.merge(&other.staging_roles); + + let new_staging_hash = self.calculate_staging_hash(); + let changed = new_staging_hash != self.staging_hash; + + self.staging_hash = new_staging_hash; + + changed + } + Ordering::Less => false, + } + } + + pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self, Message), Error> { + match version { + None => { + let error = r#" +Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. +To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. + "#; + return Err(Error::Message(error.into())); + } + Some(v) => { + if v != self.version + 1 { + return Err(Error::Message("Invalid new layout version".into())); + } + } + } + + self.roles.merge(&self.staging_roles); + self.roles.retain(|(_, _, v)| v.0.is_some()); + self.parameters = *self.staging_parameters.get(); + + self.staging_roles.clear(); + self.staging_hash = self.calculate_staging_hash(); + + let msg = self.calculate_partition_assignment()?; + + self.version += 1; + + Ok((self, msg)) + } + + pub fn revert_staged_changes(mut self, version: Option) -> Result { + match version { + None => { + let error = r#" +Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. +To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. + "#; + return Err(Error::Message(error.into())); + } + Some(v) => { + if v != self.version + 1 { + return Err(Error::Message("Invalid new layout version".into())); + } + } + } + + self.staging_roles.clear(); + self.staging_parameters.update(self.parameters); + self.staging_hash = self.calculate_staging_hash(); + + self.version += 1; + + Ok(self) + } + /// Check a cluster layout for internal consistency /// (assignment, roles, parameters, partition size) /// returns true if consistent, false if error @@ -574,12 +580,9 @@ To know the correct value of the new layout version, invoke `garage layout show` Ok(()) } -} -// ==================================================================================== + // ================== updates to layout, internals =================== -// Implementation of the ClusterLayout methods related to the assignment algorithm. -impl ClusterLayout { /// This function calculates a new partition-to-node assignment. /// The computed assignment respects the node replication factor /// and the zone redundancy parameter It maximizes the capacity of a @@ -867,7 +870,7 @@ impl ClusterLayout { } for n in 0..self.nongateway_nodes().len() { let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; + let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; for p in 0..NB_PARTITIONS { if !exclude_assoc.contains(&(p, n)) { @@ -913,7 +916,7 @@ impl ClusterLayout { // The algorithm is such that it will start with the flow that we just computed // and find ameliorating paths from that. for (p, n) in exclude_edge.iter() { - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; } g.compute_maximal_flow()?; @@ -933,7 +936,7 @@ impl ClusterLayout { let mut cost = CostFunction::new(); for (p, assoc_p) in prev_assign.iter().enumerate() { for n in assoc_p.iter() { - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); } } @@ -1035,7 +1038,7 @@ impl ClusterLayout { let mut old_zones_of_p = Vec::::new(); for n in prev_assign[p].iter() { old_zones_of_p - .push(zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); + .push(zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]); } if !old_zones_of_p.contains(&z) { new_partitions_zone[z] += 1; -- cgit v1.2.3 From 12d1dbfc6b884be488e2d79c0b9e3c47490f5442 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 8 Nov 2023 15:41:24 +0100 Subject: remove Ring and use ClusterLayout everywhere --- src/api/admin/bucket.rs | 4 +- src/api/k2v/index.rs | 8 +- src/api/s3/put.rs | 2 +- src/garage/admin/bucket.rs | 4 +- src/garage/admin/mod.rs | 20 ++--- src/model/helper/bucket.rs | 6 +- src/model/index_counter.rs | 6 +- src/rpc/layout.rs | 72 ++++++++++++++-- src/rpc/lib.rs | 1 - src/rpc/ring.rs | 164 ------------------------------------ src/rpc/rpc_helper.rs | 14 +-- src/rpc/system.rs | 55 ++++++------ src/table/merkle.rs | 2 +- src/table/replication/fullcopy.rs | 8 +- src/table/replication/parameters.rs | 2 +- src/table/replication/sharded.rs | 14 +-- src/table/sync.rs | 20 ++--- 17 files changed, 148 insertions(+), 254 deletions(-) delete mode 100644 src/rpc/ring.rs (limited to 'src') diff --git a/src/api/admin/bucket.rs b/src/api/admin/bucket.rs index 17f46c30..6bff7e9f 100644 --- a/src/api/admin/bucket.rs +++ b/src/api/admin/bucket.rs @@ -122,7 +122,7 @@ async fn bucket_info_results( .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&garage.system.ring.borrow())) + .map(|x| x.filtered_values(&garage.system.layout_watch.borrow())) .unwrap_or_default(); let mpu_counters = garage @@ -130,7 +130,7 @@ async fn bucket_info_results( .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&garage.system.ring.borrow())) + .map(|x| x.filtered_values(&garage.system.layout_watch.borrow())) .unwrap_or_default(); let mut relevant_keys = HashMap::new(); diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs index 6c1d4a91..ff8beda3 100644 --- a/src/api/k2v/index.rs +++ b/src/api/k2v/index.rs @@ -5,7 +5,7 @@ use serde::Serialize; use garage_util::data::*; -use garage_rpc::ring::Ring; +use garage_rpc::layout::ClusterLayout; use garage_table::util::*; use garage_model::garage::Garage; @@ -26,7 +26,7 @@ pub async fn handle_read_index( ) -> Result, Error> { let reverse = reverse.unwrap_or(false); - let ring: Arc = garage.system.ring.borrow().clone(); + let layout: Arc = garage.system.layout_watch.borrow().clone(); let (partition_keys, more, next_start) = read_range( &garage.k2v.counter_table.table, @@ -35,7 +35,7 @@ pub async fn handle_read_index( &start, &end, limit, - Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())), + Some((DeletedFilter::NotDeleted, layout.node_id_vec.clone())), EnumerationOrder::from_reverse(reverse), ) .await?; @@ -54,7 +54,7 @@ pub async fn handle_read_index( partition_keys: partition_keys .into_iter() .map(|part| { - let vals = part.filtered_values(&ring); + let vals = part.filtered_values(&layout); ReadIndexResponseEntry { pk: part.sk, entries: *vals.get(&s_entries).unwrap_or(&0), diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index 606facc4..fc17ed03 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -253,7 +253,7 @@ pub(crate) async fn check_quotas( .await?; let counters = counters - .map(|x| x.filtered_values(&garage.system.ring.borrow())) + .map(|x| x.filtered_values(&garage.system.layout_watch.borrow())) .unwrap_or_default(); let (prev_cnt_obj, prev_cnt_size) = match prev_object { diff --git a/src/garage/admin/bucket.rs b/src/garage/admin/bucket.rs index 0781cb8b..34e48292 100644 --- a/src/garage/admin/bucket.rs +++ b/src/garage/admin/bucket.rs @@ -70,7 +70,7 @@ impl AdminRpcHandler { .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&self.garage.system.ring.borrow())) + .map(|x| x.filtered_values(&self.garage.system.layout_watch.borrow())) .unwrap_or_default(); let mpu_counters = self @@ -79,7 +79,7 @@ impl AdminRpcHandler { .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&self.garage.system.ring.borrow())) + .map(|x| x.filtered_values(&self.garage.system.layout_watch.borrow())) .unwrap_or_default(); let mut relevant_keys = HashMap::new(); diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index b6f9c426..006f71cd 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -18,7 +18,7 @@ use garage_util::error::Error as GarageError; use garage_table::replication::*; use garage_table::*; -use garage_rpc::ring::PARTITION_BITS; +use garage_rpc::layout::PARTITION_BITS; use garage_rpc::*; use garage_block::manager::BlockResyncErrorInfo; @@ -126,8 +126,8 @@ impl AdminRpcHandler { opt_to_send.all_nodes = false; let mut failures = vec![]; - let ring = self.garage.system.ring.borrow().clone(); - for node in ring.layout.node_ids().iter() { + let layout = self.garage.system.layout_watch.borrow().clone(); + for node in layout.node_ids().iter() { let node = (*node).into(); let resp = self .endpoint @@ -163,9 +163,9 @@ impl AdminRpcHandler { async fn handle_stats(&self, opt: StatsOpt) -> Result { if opt.all_nodes { let mut ret = String::new(); - let ring = self.garage.system.ring.borrow().clone(); + let layout = self.garage.system.layout_watch.borrow().clone(); - for node in ring.layout.node_ids().iter() { + for node in layout.node_ids().iter() { let mut opt = opt.clone(); opt.all_nodes = false; opt.skip_global = true; @@ -275,7 +275,7 @@ impl AdminRpcHandler { let mut ret = String::new(); // Gather storage node and free space statistics - let layout = &self.garage.system.ring.borrow().layout; + let layout = &self.garage.system.layout_watch.borrow(); let mut node_partition_count = HashMap::::new(); for short_id in layout.ring_assignment_data.iter() { let id = layout.node_id_vec[*short_id as usize]; @@ -440,8 +440,8 @@ impl AdminRpcHandler { ) -> Result { if all_nodes { let mut ret = vec![]; - let ring = self.garage.system.ring.borrow().clone(); - for node in ring.layout.node_ids().iter() { + let layout = self.garage.system.layout_watch.borrow().clone(); + for node in layout.node_ids().iter() { let node = (*node).into(); match self .endpoint @@ -488,8 +488,8 @@ impl AdminRpcHandler { ) -> Result { if all_nodes { let mut ret = vec![]; - let ring = self.garage.system.ring.borrow().clone(); - for node in ring.layout.node_ids().iter() { + let layout = self.garage.system.layout_watch.borrow().clone(); + for node in layout.node_ids().iter() { let node = (*node).into(); match self .endpoint diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index 576d03f3..d43d7e96 100644 --- a/src/model/helper/bucket.rs +++ b/src/model/helper/bucket.rs @@ -450,10 +450,10 @@ impl<'a> BucketHelper<'a> { #[cfg(feature = "k2v")] { - use garage_rpc::ring::Ring; + use garage_rpc::layout::ClusterLayout; use std::sync::Arc; - let ring: Arc = self.0.system.ring.borrow().clone(); + let layout: Arc = self.0.system.layout_watch.borrow().clone(); let k2vindexes = self .0 .k2v @@ -462,7 +462,7 @@ impl<'a> BucketHelper<'a> { .get_range( &bucket_id, None, - Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())), + Some((DeletedFilter::NotDeleted, layout.node_id_vec.clone())), 10, EnumerationOrder::Forward, ) diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs index a46c165f..d514cb06 100644 --- a/src/model/index_counter.rs +++ b/src/model/index_counter.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use garage_db as db; -use garage_rpc::ring::Ring; +use garage_rpc::layout::ClusterLayout; use garage_rpc::system::System; use garage_util::background::BackgroundRunner; use garage_util::data::*; @@ -83,8 +83,8 @@ impl Entry for CounterEntry { } impl CounterEntry { - pub fn filtered_values(&self, ring: &Ring) -> HashMap { - let nodes = &ring.layout.node_id_vec[..]; + pub fn filtered_values(&self, layout: &ClusterLayout) -> HashMap { + let nodes = &layout.node_id_vec[..]; self.filtered_values_with_nodes(nodes) } diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 368a9d2c..2b5b6606 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -13,17 +13,39 @@ use garage_util::error::*; use crate::graph_algo::*; -use crate::ring::*; - use std::convert::TryInto; +// ---- defines: partitions ---- + +/// A partition id, which is stored on 16 bits +/// i.e. we have up to 2**16 partitions. +/// (in practice we have exactly 2**PARTITION_BITS partitions) +pub type Partition = u16; + +// TODO: make this constant parametrizable in the config file +// For deployments with many nodes it might make sense to bump +// it up to 10. +// Maximum value : 16 +/// How many bits from the hash are used to make partitions. Higher numbers means more fairness in +/// presence of numerous nodes, but exponentially bigger ring. Max 16 +pub const PARTITION_BITS: usize = 8; + const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; +// ---- defines: nodes ---- + +// Type to store compactly the id of a node in the system +// Change this to u16 the day we want to have more than 256 nodes in a cluster +pub type CompactNodeType = u8; +pub const MAX_NODE_NUMBER: usize = 256; + +// ---- defines: other ---- + // The Message type will be used to collect information on the algorithm. -type Message = Vec; +pub type Message = Vec; mod v08 { - use crate::ring::CompactNodeType; + use super::CompactNodeType; use garage_util::crdt::LwwMap; use garage_util::data::{Hash, Uuid}; use serde::{Deserialize, Serialize}; @@ -76,7 +98,7 @@ mod v08 { mod v09 { use super::v08; - use crate::ring::CompactNodeType; + use super::CompactNodeType; use garage_util::crdt::{Lww, LwwMap}; use garage_util::data::{Hash, Uuid}; use serde::{Deserialize, Serialize}; @@ -334,6 +356,46 @@ impl ClusterLayout { )) } + /// Get the partition in which data would fall on + pub fn partition_of(&self, position: &Hash) -> Partition { + let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap()); + top >> (16 - PARTITION_BITS) + } + + /// Get the list of partitions and the first hash of a partition key that would fall in it + pub fn partitions(&self) -> Vec<(Partition, Hash)> { + (0..(1 << PARTITION_BITS)) + .map(|i| { + let top = (i as u16) << (16 - PARTITION_BITS); + let mut location = [0u8; 32]; + location[..2].copy_from_slice(&u16::to_be_bytes(top)[..]); + (i as u16, Hash::from(location)) + }) + .collect::>() + } + + /// Walk the ring to find the n servers in which data should be replicated + pub fn nodes_of(&self, position: &Hash, n: usize) -> Vec { + assert_eq!(n, self.replication_factor); + + let data = &self.ring_assignment_data; + + if data.len() != self.replication_factor * (1 << PARTITION_BITS) { + warn!("Ring not yet ready, read/writes will be lost!"); + return vec![]; + } + + let partition_idx = self.partition_of(position) as usize; + let partition_start = partition_idx * self.replication_factor; + let partition_end = (partition_idx + 1) * self.replication_factor; + let partition_nodes = &data[partition_start..partition_end]; + + partition_nodes + .iter() + .map(|i| self.node_id_vec[*i as usize]) + .collect::>() + } + // ===================== internal information extractors ====================== /// Returns the uuids of the non_gateway nodes in self.node_id_vec. diff --git a/src/rpc/lib.rs b/src/rpc/lib.rs index a5f8fc6e..1af8b78e 100644 --- a/src/rpc/lib.rs +++ b/src/rpc/lib.rs @@ -14,7 +14,6 @@ mod kubernetes; pub mod graph_algo; pub mod layout; pub mod replication_mode; -pub mod ring; pub mod system; pub mod rpc_helper; diff --git a/src/rpc/ring.rs b/src/rpc/ring.rs deleted file mode 100644 index 6a2e5c72..00000000 --- a/src/rpc/ring.rs +++ /dev/null @@ -1,164 +0,0 @@ -//! Module containing types related to computing nodes which should receive a copy of data blocks -//! and metadata -use std::convert::TryInto; - -use garage_util::data::*; - -use crate::layout::ClusterLayout; - -/// A partition id, which is stored on 16 bits -/// i.e. we have up to 2**16 partitions. -/// (in practice we have exactly 2**PARTITION_BITS partitions) -pub type Partition = u16; - -// TODO: make this constant parametrizable in the config file -// For deployments with many nodes it might make sense to bump -// it up to 10. -// Maximum value : 16 -/// How many bits from the hash are used to make partitions. Higher numbers means more fairness in -/// presence of numerous nodes, but exponentially bigger ring. Max 16 -pub const PARTITION_BITS: usize = 8; - -const PARTITION_MASK_U16: u16 = ((1 << PARTITION_BITS) - 1) << (16 - PARTITION_BITS); - -/// A ring distributing fairly objects to nodes -#[derive(Clone)] -pub struct Ring { - /// The replication factor for this ring - pub replication_factor: usize, - - /// The network configuration used to generate this ring - pub layout: ClusterLayout, - - // Internal order of nodes used to make a more compact representation of the ring - nodes: Vec, - - // The list of entries in the ring - ring: Vec, -} - -// Type to store compactly the id of a node in the system -// Change this to u16 the day we want to have more than 256 nodes in a cluster -pub type CompactNodeType = u8; -pub const MAX_NODE_NUMBER: usize = 256; - -// The maximum number of times an object might get replicated -// This must be at least 3 because Garage supports 3-way replication -// Here we use 6 so that the size of a ring entry is 8 bytes -// (2 bytes partition id, 6 bytes node numbers as u8s) -const MAX_REPLICATION: usize = 6; - -/// An entry in the ring -#[derive(Clone, Debug)] -struct RingEntry { - // The two first bytes of the first hash that goes in this partition - // (the next bytes are zeroes) - hash_prefix: u16, - // The nodes that store this partition, stored as a list of positions in the `nodes` - // field of the Ring structure - // Only items 0 up to ring.replication_factor - 1 are used, others are zeros - nodes_buf: [CompactNodeType; MAX_REPLICATION], -} - -impl Ring { - pub(crate) fn new(layout: ClusterLayout, replication_factor: usize) -> Self { - if replication_factor != layout.replication_factor { - warn!("Could not build ring: replication factor does not match between local configuration and network role assignment."); - return Self::empty(layout, replication_factor); - } - - if layout.ring_assignment_data.len() != replication_factor * (1 << PARTITION_BITS) { - warn!("Could not build ring: network role assignment data has invalid length"); - return Self::empty(layout, replication_factor); - } - - let nodes = layout.node_id_vec.clone(); - let ring = (0..(1 << PARTITION_BITS)) - .map(|i| { - let top = (i as u16) << (16 - PARTITION_BITS); - let mut nodes_buf = [0u8; MAX_REPLICATION]; - nodes_buf[..replication_factor].copy_from_slice( - &layout.ring_assignment_data - [replication_factor * i..replication_factor * (i + 1)], - ); - RingEntry { - hash_prefix: top, - nodes_buf, - } - }) - .collect::>(); - - Self { - replication_factor, - layout, - nodes, - ring, - } - } - - fn empty(layout: ClusterLayout, replication_factor: usize) -> Self { - Self { - replication_factor, - layout, - nodes: vec![], - ring: vec![], - } - } - - /// Get the partition in which data would fall on - pub fn partition_of(&self, position: &Hash) -> Partition { - let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap()); - top >> (16 - PARTITION_BITS) - } - - /// Get the list of partitions and the first hash of a partition key that would fall in it - pub fn partitions(&self) -> Vec<(Partition, Hash)> { - let mut ret = vec![]; - - for (i, entry) in self.ring.iter().enumerate() { - let mut location = [0u8; 32]; - location[..2].copy_from_slice(&u16::to_be_bytes(entry.hash_prefix)[..]); - ret.push((i as u16, location.into())); - } - if !ret.is_empty() { - assert_eq!(ret[0].1, [0u8; 32].into()); - } - - ret - } - - /// Walk the ring to find the n servers in which data should be replicated - pub fn get_nodes(&self, position: &Hash, n: usize) -> Vec { - if self.ring.len() != 1 << PARTITION_BITS { - warn!("Ring not yet ready, read/writes will be lost!"); - return vec![]; - } - - let partition_idx = self.partition_of(position) as usize; - let partition = &self.ring[partition_idx]; - - let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap()); - // Check that we haven't messed up our partition table, i.e. that this partition - // table entrey indeed corresponds to the item we are storing - assert_eq!( - partition.hash_prefix & PARTITION_MASK_U16, - top & PARTITION_MASK_U16 - ); - - assert!(n <= self.replication_factor); - partition.nodes_buf[..n] - .iter() - .map(|i| self.nodes[*i as usize]) - .collect::>() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_ring_entry_size() { - assert_eq!(std::mem::size_of::(), 8); - } -} diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index e59c372a..56bef2f3 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -26,8 +26,8 @@ use garage_util::data::*; use garage_util::error::Error; use garage_util::metrics::RecordDuration; +use crate::layout::ClusterLayout; use crate::metrics::RpcMetrics; -use crate::ring::Ring; // Default RPC timeout = 5 minutes const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300); @@ -91,7 +91,7 @@ pub struct RpcHelper(Arc); struct RpcHelperInner { our_node_id: Uuid, fullmesh: Arc, - ring: watch::Receiver>, + layout_watch: watch::Receiver>, metrics: RpcMetrics, rpc_timeout: Duration, } @@ -100,7 +100,7 @@ impl RpcHelper { pub(crate) fn new( our_node_id: Uuid, fullmesh: Arc, - ring: watch::Receiver>, + layout_watch: watch::Receiver>, rpc_timeout: Option, ) -> Self { let metrics = RpcMetrics::new(); @@ -108,7 +108,7 @@ impl RpcHelper { Self(Arc::new(RpcHelperInner { our_node_id, fullmesh, - ring, + layout_watch, metrics, rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT), })) @@ -392,8 +392,8 @@ impl RpcHelper { pub fn request_order(&self, nodes: &[Uuid]) -> Vec { // Retrieve some status variables that we will use to sort requests let peer_list = self.0.fullmesh.get_peer_list(); - let ring: Arc = self.0.ring.borrow().clone(); - let our_zone = match ring.layout.node_role(&self.0.our_node_id) { + let layout: Arc = self.0.layout_watch.borrow().clone(); + let our_zone = match layout.node_role(&self.0.our_node_id) { Some(pc) => &pc.zone, None => "", }; @@ -407,7 +407,7 @@ impl RpcHelper { let mut nodes = nodes .iter() .map(|to| { - let peer_zone = match ring.layout.node_role(to) { + let peer_zone = match layout.node_role(to) { Some(pc) => &pc.zone, None => "", }; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 4b40bec4..106e9f8c 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -36,7 +36,6 @@ use crate::consul::ConsulDiscovery; use crate::kubernetes::*; use crate::layout::*; use crate::replication_mode::*; -use crate::ring::*; use crate::rpc_helper::*; use crate::system_metrics::*; @@ -112,9 +111,9 @@ pub struct System { replication_mode: ReplicationMode, replication_factor: usize, - /// The ring - pub ring: watch::Receiver>, - update_ring: Mutex>>, + /// The layout + pub layout_watch: watch::Receiver>, + update_layout: Mutex>>, /// Path to metadata directory pub metadata_dir: PathBuf, @@ -286,8 +285,7 @@ impl System { let mut local_status = NodeStatus::initial(replication_factor, &cluster_layout); local_status.update_disk_usage(&config.metadata_dir, &config.data_dir, &metrics); - let ring = Ring::new(cluster_layout, replication_factor); - let (update_ring, ring) = watch::channel(Arc::new(ring)); + let (update_layout, layout_watch) = watch::channel(Arc::new(cluster_layout)); let rpc_public_addr = match &config.rpc_public_addr { Some(a_str) => { @@ -362,7 +360,7 @@ impl System { rpc: RpcHelper::new( netapp.id.into(), fullmesh, - ring.clone(), + layout_watch.clone(), config.rpc_timeout_msec.map(Duration::from_millis), ), system_endpoint, @@ -378,8 +376,8 @@ impl System { kubernetes_discovery: config.kubernetes_discovery.clone(), metrics, - ring, - update_ring: Mutex::new(update_ring), + layout_watch, + update_layout: Mutex::new(update_layout), metadata_dir: config.metadata_dir.clone(), data_dir: config.data_dir.clone(), }); @@ -426,7 +424,7 @@ impl System { } pub fn get_cluster_layout(&self) -> ClusterLayout { - self.ring.borrow().layout.clone() + self.layout_watch.borrow().as_ref().clone() } pub async fn update_cluster_layout( @@ -466,7 +464,7 @@ impl System { } pub fn health(&self) -> ClusterHealth { - let ring: Arc<_> = self.ring.borrow().clone(); + let layout: Arc<_> = self.layout_watch.borrow().clone(); let quorum = self.replication_mode.write_quorum(); let replication_factor = self.replication_factor; @@ -477,8 +475,7 @@ impl System { .collect::>(); let connected_nodes = nodes.iter().filter(|(_, n)| n.is_up).count(); - let storage_nodes = ring - .layout + let storage_nodes = layout .roles .items() .iter() @@ -489,11 +486,11 @@ impl System { .filter(|(x, _, _)| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) .count(); - let partitions = ring.partitions(); + let partitions = layout.partitions(); let partitions_n_up = partitions .iter() .map(|(_, h)| { - let pn = ring.get_nodes(h, ring.replication_factor); + let pn = layout.nodes_of(h, layout.replication_factor); pn.iter() .filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) .count() @@ -584,9 +581,9 @@ impl System { /// Save network configuration to disc async fn save_cluster_layout(&self) -> Result<(), Error> { - let ring: Arc = self.ring.borrow().clone(); + let layout: Arc = self.layout_watch.borrow().clone(); self.persist_cluster_layout - .save_async(&ring.layout) + .save_async(&layout) .await .expect("Cannot save current cluster layout"); Ok(()) @@ -595,9 +592,9 @@ impl System { fn update_local_status(&self) { let mut new_si: NodeStatus = self.local_status.load().as_ref().clone(); - let ring = self.ring.borrow(); - new_si.cluster_layout_version = ring.layout.version; - new_si.cluster_layout_staging_hash = ring.layout.staging_hash; + let layout = self.layout_watch.borrow(); + new_si.cluster_layout_version = layout.version; + new_si.cluster_layout_staging_hash = layout.staging_hash; new_si.update_disk_usage(&self.metadata_dir, &self.data_dir, &self.metrics); @@ -612,8 +609,8 @@ impl System { } fn handle_pull_cluster_layout(&self) -> SystemRpc { - let ring = self.ring.borrow().clone(); - SystemRpc::AdvertiseClusterLayout(ring.layout.clone()) + let layout = self.layout_watch.borrow().as_ref().clone(); + SystemRpc::AdvertiseClusterLayout(layout) } fn handle_get_known_nodes(&self) -> SystemRpc { @@ -663,8 +660,9 @@ impl System { return Err(Error::Message(msg)); } - let update_ring = self.update_ring.lock().await; - let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); + let update_layout = self.update_layout.lock().await; + // TODO: don't clone each time an AdvertiseClusterLayout is received + let mut layout: ClusterLayout = self.layout_watch.borrow().as_ref().clone(); let prev_layout_check = layout.check().is_ok(); if layout.merge(adv) { @@ -675,9 +673,8 @@ impl System { )); } - let ring = Ring::new(layout.clone(), self.replication_factor); - update_ring.send(Arc::new(ring))?; - drop(update_ring); + update_layout.send(Arc::new(layout.clone()))?; + drop(update_layout); let self2 = self.clone(); tokio::spawn(async move { @@ -725,9 +722,9 @@ impl System { async fn discovery_loop(self: &Arc, mut stop_signal: watch::Receiver) { while !*stop_signal.borrow() { - let not_configured = self.ring.borrow().layout.check().is_err(); + let not_configured = self.layout_watch.borrow().check().is_err(); let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor; - let expected_n_nodes = self.ring.borrow().layout.num_nodes(); + let expected_n_nodes = self.layout_watch.borrow().num_nodes(); let bad_peers = self .fullmesh .get_peer_list() diff --git a/src/table/merkle.rs b/src/table/merkle.rs index 4577f872..01271c58 100644 --- a/src/table/merkle.rs +++ b/src/table/merkle.rs @@ -13,7 +13,7 @@ use garage_util::data::*; use garage_util::encode::{nonversioned_decode, nonversioned_encode}; use garage_util::error::Error; -use garage_rpc::ring::*; +use garage_rpc::layout::*; use crate::data::*; use crate::replication::*; diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index 18682ace..f8b7cacc 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use garage_rpc::ring::*; +use garage_rpc::layout::*; use garage_rpc::system::System; use garage_util::data::*; @@ -27,11 +27,11 @@ impl TableReplication for TableFullReplication { } fn write_nodes(&self, _hash: &Hash) -> Vec { - let ring = self.system.ring.borrow(); - ring.layout.node_ids().to_vec() + let layout = self.system.layout_watch.borrow(); + layout.node_ids().to_vec() } fn write_quorum(&self) -> usize { - let nmembers = self.system.ring.borrow().layout.node_ids().len(); + let nmembers = self.system.layout_watch.borrow().node_ids().len(); if nmembers > self.max_faults { nmembers - self.max_faults } else { diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index f00815a2..19b306f2 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -1,4 +1,4 @@ -use garage_rpc::ring::*; +use garage_rpc::layout::*; use garage_util::data::*; /// Trait to describe how a table shall be replicated diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 1cf964af..95901a5a 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use garage_rpc::ring::*; +use garage_rpc::layout::*; use garage_rpc::system::System; use garage_util::data::*; @@ -26,16 +26,16 @@ pub struct TableShardedReplication { impl TableReplication for TableShardedReplication { fn read_nodes(&self, hash: &Hash) -> Vec { - let ring = self.system.ring.borrow(); - ring.get_nodes(hash, self.replication_factor) + let layout = self.system.layout_watch.borrow(); + layout.nodes_of(hash, self.replication_factor) } fn read_quorum(&self) -> usize { self.read_quorum } fn write_nodes(&self, hash: &Hash) -> Vec { - let ring = self.system.ring.borrow(); - ring.get_nodes(hash, self.replication_factor) + let layout = self.system.layout_watch.borrow(); + layout.nodes_of(hash, self.replication_factor) } fn write_quorum(&self) -> usize { self.write_quorum @@ -45,9 +45,9 @@ impl TableReplication for TableShardedReplication { } fn partition_of(&self, hash: &Hash) -> Partition { - self.system.ring.borrow().partition_of(hash) + self.system.layout_watch.borrow().partition_of(hash) } fn partitions(&self) -> Vec<(Partition, Hash)> { - self.system.ring.borrow().partitions() + self.system.layout_watch.borrow().partitions() } } diff --git a/src/table/sync.rs b/src/table/sync.rs index 92a353c6..b2600013 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -17,7 +17,7 @@ use garage_util::data::*; use garage_util::encode::{debug_serialize, nonversioned_encode}; use garage_util::error::{Error, OkOrMessage}; -use garage_rpc::ring::*; +use garage_rpc::layout::*; use garage_rpc::system::System; use garage_rpc::*; @@ -91,8 +91,8 @@ impl TableSyncer { bg.spawn_worker(SyncWorker { syncer: self.clone(), - ring_recv: self.system.ring.clone(), - ring: self.system.ring.borrow().clone(), + layout_watch: self.system.layout_watch.clone(), + layout: self.system.layout_watch.borrow().clone(), add_full_sync_rx, todo: vec![], next_full_sync: Instant::now() + Duration::from_secs(20), @@ -492,8 +492,8 @@ impl EndpointHandler for TableSync struct SyncWorker { syncer: Arc>, - ring_recv: watch::Receiver>, - ring: Arc, + layout_watch: watch::Receiver>, + layout: Arc, add_full_sync_rx: mpsc::UnboundedReceiver<()>, todo: Vec, next_full_sync: Instant, @@ -593,11 +593,11 @@ impl Worker for SyncWorker { self.add_full_sync(); } }, - _ = self.ring_recv.changed() => { - let new_ring = self.ring_recv.borrow(); - if !Arc::ptr_eq(&new_ring, &self.ring) { - self.ring = new_ring.clone(); - drop(new_ring); + _ = self.layout_watch.changed() => { + let new_layout = self.layout_watch.borrow(); + if !Arc::ptr_eq(&new_layout, &self.layout) { + self.layout = new_layout.clone(); + drop(new_layout); debug!("({}) Ring changed, adding full sync to syncer todo list", F::TABLE_NAME); self.add_full_sync(); } -- cgit v1.2.3 From 4a9c94514f49aa4e9880a8e0f5cf5a52d11ae993 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 8 Nov 2023 16:41:00 +0100 Subject: avoid using layout_watch in System directly --- src/api/admin/bucket.rs | 4 ++-- src/api/admin/cluster.rs | 10 +++++----- src/api/k2v/index.rs | 2 +- src/api/s3/put.rs | 2 +- src/garage/admin/bucket.rs | 4 ++-- src/garage/admin/mod.rs | 10 +++++----- src/model/helper/bucket.rs | 2 +- src/rpc/system.rs | 4 ++-- src/table/replication/fullcopy.rs | 4 ++-- src/table/replication/sharded.rs | 8 ++++---- src/table/sync.rs | 2 +- 11 files changed, 26 insertions(+), 26 deletions(-) (limited to 'src') diff --git a/src/api/admin/bucket.rs b/src/api/admin/bucket.rs index 6bff7e9f..65929d61 100644 --- a/src/api/admin/bucket.rs +++ b/src/api/admin/bucket.rs @@ -122,7 +122,7 @@ async fn bucket_info_results( .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&garage.system.layout_watch.borrow())) + .map(|x| x.filtered_values(&garage.system.cluster_layout())) .unwrap_or_default(); let mpu_counters = garage @@ -130,7 +130,7 @@ async fn bucket_info_results( .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&garage.system.layout_watch.borrow())) + .map(|x| x.filtered_values(&garage.system.cluster_layout())) .unwrap_or_default(); let mut relevant_keys = HashMap::new(); diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index c8107b82..01ff9885 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -33,7 +33,7 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result) -> Result, Error> { - let res = format_cluster_layout(&garage.system.get_cluster_layout()); + let res = format_cluster_layout(&garage.system.cluster_layout()); Ok(json_ok_response(&res)?) } @@ -207,7 +207,7 @@ pub async fn handle_update_cluster_layout( ) -> Result, Error> { let updates = parse_json_body::(req).await?; - let mut layout = garage.system.get_cluster_layout(); + let mut layout = garage.system.cluster_layout().as_ref().clone(); let mut roles = layout.roles.clone(); roles.merge(&layout.staging_roles); @@ -247,7 +247,7 @@ pub async fn handle_apply_cluster_layout( ) -> Result, Error> { let param = parse_json_body::(req).await?; - let layout = garage.system.get_cluster_layout(); + let layout = garage.system.cluster_layout().as_ref().clone(); let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; garage.system.update_cluster_layout(&layout).await?; @@ -265,7 +265,7 @@ pub async fn handle_revert_cluster_layout( ) -> Result, Error> { let param = parse_json_body::(req).await?; - let layout = garage.system.get_cluster_layout(); + let layout = garage.system.cluster_layout().as_ref().clone(); let layout = layout.revert_staged_changes(Some(param.version))?; garage.system.update_cluster_layout(&layout).await?; diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs index ff8beda3..3fc39de6 100644 --- a/src/api/k2v/index.rs +++ b/src/api/k2v/index.rs @@ -26,7 +26,7 @@ pub async fn handle_read_index( ) -> Result, Error> { let reverse = reverse.unwrap_or(false); - let layout: Arc = garage.system.layout_watch.borrow().clone(); + let layout: Arc = garage.system.cluster_layout().clone(); let (partition_keys, more, next_start) = read_range( &garage.k2v.counter_table.table, diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index fc17ed03..d1c88a76 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -253,7 +253,7 @@ pub(crate) async fn check_quotas( .await?; let counters = counters - .map(|x| x.filtered_values(&garage.system.layout_watch.borrow())) + .map(|x| x.filtered_values(&garage.system.cluster_layout())) .unwrap_or_default(); let (prev_cnt_obj, prev_cnt_size) = match prev_object { diff --git a/src/garage/admin/bucket.rs b/src/garage/admin/bucket.rs index 34e48292..9e642f57 100644 --- a/src/garage/admin/bucket.rs +++ b/src/garage/admin/bucket.rs @@ -70,7 +70,7 @@ impl AdminRpcHandler { .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&self.garage.system.layout_watch.borrow())) + .map(|x| x.filtered_values(&self.garage.system.cluster_layout())) .unwrap_or_default(); let mpu_counters = self @@ -79,7 +79,7 @@ impl AdminRpcHandler { .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&self.garage.system.layout_watch.borrow())) + .map(|x| x.filtered_values(&self.garage.system.cluster_layout())) .unwrap_or_default(); let mut relevant_keys = HashMap::new(); diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index 006f71cd..c3fa801a 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -126,7 +126,7 @@ impl AdminRpcHandler { opt_to_send.all_nodes = false; let mut failures = vec![]; - let layout = self.garage.system.layout_watch.borrow().clone(); + let layout = self.garage.system.cluster_layout().clone(); for node in layout.node_ids().iter() { let node = (*node).into(); let resp = self @@ -163,7 +163,7 @@ impl AdminRpcHandler { async fn handle_stats(&self, opt: StatsOpt) -> Result { if opt.all_nodes { let mut ret = String::new(); - let layout = self.garage.system.layout_watch.borrow().clone(); + let layout = self.garage.system.cluster_layout().clone(); for node in layout.node_ids().iter() { let mut opt = opt.clone(); @@ -275,7 +275,7 @@ impl AdminRpcHandler { let mut ret = String::new(); // Gather storage node and free space statistics - let layout = &self.garage.system.layout_watch.borrow(); + let layout = &self.garage.system.cluster_layout(); let mut node_partition_count = HashMap::::new(); for short_id in layout.ring_assignment_data.iter() { let id = layout.node_id_vec[*short_id as usize]; @@ -440,7 +440,7 @@ impl AdminRpcHandler { ) -> Result { if all_nodes { let mut ret = vec![]; - let layout = self.garage.system.layout_watch.borrow().clone(); + let layout = self.garage.system.cluster_layout().clone(); for node in layout.node_ids().iter() { let node = (*node).into(); match self @@ -488,7 +488,7 @@ impl AdminRpcHandler { ) -> Result { if all_nodes { let mut ret = vec![]; - let layout = self.garage.system.layout_watch.borrow().clone(); + let layout = self.garage.system.cluster_layout().clone(); for node in layout.node_ids().iter() { let node = (*node).into(); match self diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index d43d7e96..8cd5b27b 100644 --- a/src/model/helper/bucket.rs +++ b/src/model/helper/bucket.rs @@ -453,7 +453,7 @@ impl<'a> BucketHelper<'a> { use garage_rpc::layout::ClusterLayout; use std::sync::Arc; - let layout: Arc = self.0.system.layout_watch.borrow().clone(); + let layout: Arc = self.0.system.cluster_layout().clone(); let k2vindexes = self .0 .k2v diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 106e9f8c..93144e39 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -423,8 +423,8 @@ impl System { known_nodes } - pub fn get_cluster_layout(&self) -> ClusterLayout { - self.layout_watch.borrow().as_ref().clone() + pub fn cluster_layout(&self) -> watch::Ref> { + self.layout_watch.borrow() } pub async fn update_cluster_layout( diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index f8b7cacc..34807e3d 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -27,11 +27,11 @@ impl TableReplication for TableFullReplication { } fn write_nodes(&self, _hash: &Hash) -> Vec { - let layout = self.system.layout_watch.borrow(); + let layout = self.system.cluster_layout(); layout.node_ids().to_vec() } fn write_quorum(&self) -> usize { - let nmembers = self.system.layout_watch.borrow().node_ids().len(); + let nmembers = self.system.cluster_layout().node_ids().len(); if nmembers > self.max_faults { nmembers - self.max_faults } else { diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 95901a5a..60c95cb4 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -26,7 +26,7 @@ pub struct TableShardedReplication { impl TableReplication for TableShardedReplication { fn read_nodes(&self, hash: &Hash) -> Vec { - let layout = self.system.layout_watch.borrow(); + let layout = self.system.cluster_layout(); layout.nodes_of(hash, self.replication_factor) } fn read_quorum(&self) -> usize { @@ -34,7 +34,7 @@ impl TableReplication for TableShardedReplication { } fn write_nodes(&self, hash: &Hash) -> Vec { - let layout = self.system.layout_watch.borrow(); + let layout = self.system.cluster_layout(); layout.nodes_of(hash, self.replication_factor) } fn write_quorum(&self) -> usize { @@ -45,9 +45,9 @@ impl TableReplication for TableShardedReplication { } fn partition_of(&self, hash: &Hash) -> Partition { - self.system.layout_watch.borrow().partition_of(hash) + self.system.cluster_layout().partition_of(hash) } fn partitions(&self) -> Vec<(Partition, Hash)> { - self.system.layout_watch.borrow().partitions() + self.system.cluster_layout().partitions() } } diff --git a/src/table/sync.rs b/src/table/sync.rs index b2600013..65eff7cd 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -92,7 +92,7 @@ impl TableSyncer { bg.spawn_worker(SyncWorker { syncer: self.clone(), layout_watch: self.system.layout_watch.clone(), - layout: self.system.layout_watch.borrow().clone(), + layout: self.system.cluster_layout().clone(), add_full_sync_rx, todo: vec![], next_full_sync: Instant::now() + Duration::from_secs(20), -- cgit v1.2.3 From fe9af1dcaae31a117528a9cfa10c422c9a850201 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 8 Nov 2023 17:49:06 +0100 Subject: WIP: garage_rpc: store layout version history --- src/rpc/layout.rs | 1358 --------------------------------------------- src/rpc/layout/history.rs | 170 ++++++ src/rpc/layout/mod.rs | 32 ++ src/rpc/layout/schema.rs | 286 ++++++++++ src/rpc/layout/tracker.rs | 21 + src/rpc/layout/version.rs | 1052 +++++++++++++++++++++++++++++++++++ src/rpc/rpc_helper.rs | 12 +- src/rpc/system.rs | 44 +- 8 files changed, 1590 insertions(+), 1385 deletions(-) delete mode 100644 src/rpc/layout.rs create mode 100644 src/rpc/layout/history.rs create mode 100644 src/rpc/layout/mod.rs create mode 100644 src/rpc/layout/schema.rs create mode 100644 src/rpc/layout/tracker.rs create mode 100644 src/rpc/layout/version.rs (limited to 'src') diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs deleted file mode 100644 index 2b5b6606..00000000 --- a/src/rpc/layout.rs +++ /dev/null @@ -1,1358 +0,0 @@ -use std::cmp::Ordering; -use std::collections::HashMap; -use std::collections::HashSet; -use std::fmt; - -use bytesize::ByteSize; -use itertools::Itertools; - -use garage_util::crdt::{AutoCrdt, Crdt, Lww, LwwMap}; -use garage_util::data::*; -use garage_util::encode::nonversioned_encode; -use garage_util::error::*; - -use crate::graph_algo::*; - -use std::convert::TryInto; - -// ---- defines: partitions ---- - -/// A partition id, which is stored on 16 bits -/// i.e. we have up to 2**16 partitions. -/// (in practice we have exactly 2**PARTITION_BITS partitions) -pub type Partition = u16; - -// TODO: make this constant parametrizable in the config file -// For deployments with many nodes it might make sense to bump -// it up to 10. -// Maximum value : 16 -/// How many bits from the hash are used to make partitions. Higher numbers means more fairness in -/// presence of numerous nodes, but exponentially bigger ring. Max 16 -pub const PARTITION_BITS: usize = 8; - -const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; - -// ---- defines: nodes ---- - -// Type to store compactly the id of a node in the system -// Change this to u16 the day we want to have more than 256 nodes in a cluster -pub type CompactNodeType = u8; -pub const MAX_NODE_NUMBER: usize = 256; - -// ---- defines: other ---- - -// The Message type will be used to collect information on the algorithm. -pub type Message = Vec; - -mod v08 { - use super::CompactNodeType; - use garage_util::crdt::LwwMap; - use garage_util::data::{Hash, Uuid}; - use serde::{Deserialize, Serialize}; - - /// The layout of the cluster, i.e. the list of roles - /// which are assigned to each cluster node - #[derive(Clone, Debug, Serialize, Deserialize)] - pub struct ClusterLayout { - pub version: u64, - - pub replication_factor: usize, - pub roles: LwwMap, - - /// node_id_vec: a vector of node IDs with a role assigned - /// in the system (this includes gateway nodes). - /// The order here is different than the vec stored by `roles`, because: - /// 1. non-gateway nodes are first so that they have lower numbers - /// 2. nodes that don't have a role are excluded (but they need to - /// stay in the CRDT as tombstones) - pub node_id_vec: Vec, - /// the assignation of data partitions to node, the values - /// are indices in node_id_vec - #[serde(with = "serde_bytes")] - pub ring_assignation_data: Vec, - - /// Role changes which are staged for the next version of the layout - pub staging: LwwMap, - pub staging_hash: Hash, - } - - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] - pub struct NodeRoleV(pub Option); - - /// The user-assigned roles of cluster nodes - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] - pub struct NodeRole { - /// Datacenter at which this entry belong. This information is used to - /// perform a better geodistribution - pub zone: String, - /// The capacity of the node - /// If this is set to None, the node does not participate in storing data for the system - /// and is only active as an API gateway to other nodes - pub capacity: Option, - /// A set of tags to recognize the node - pub tags: Vec, - } - - impl garage_util::migrate::InitialFormat for ClusterLayout {} -} - -mod v09 { - use super::v08; - use super::CompactNodeType; - use garage_util::crdt::{Lww, LwwMap}; - use garage_util::data::{Hash, Uuid}; - use serde::{Deserialize, Serialize}; - pub use v08::{NodeRole, NodeRoleV}; - - /// The layout of the cluster, i.e. the list of roles - /// which are assigned to each cluster node - #[derive(Clone, Debug, Serialize, Deserialize)] - pub struct ClusterLayout { - pub version: u64, - - pub replication_factor: usize, - - /// This attribute is only used to retain the previously computed partition size, - /// to know to what extent does it change with the layout update. - pub partition_size: u64, - /// Parameters used to compute the assignment currently given by - /// ring_assignment_data - pub parameters: LayoutParameters, - - pub roles: LwwMap, - - /// see comment in v08::ClusterLayout - pub node_id_vec: Vec, - /// see comment in v08::ClusterLayout - #[serde(with = "serde_bytes")] - pub ring_assignment_data: Vec, - - /// Parameters to be used in the next partition assignment computation. - pub staging_parameters: Lww, - /// Role changes which are staged for the next version of the layout - pub staging_roles: LwwMap, - pub staging_hash: Hash, - } - - /// This struct is used to set the parameters to be used in the assignment computation - /// algorithm. It is stored as a Crdt. - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] - pub struct LayoutParameters { - pub zone_redundancy: ZoneRedundancy, - } - - /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies - /// of each partition on at least that number of different zones. - /// Otherwise, copies will be stored on the maximum possible number of zones. - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] - pub enum ZoneRedundancy { - AtLeast(usize), - Maximum, - } - - impl garage_util::migrate::Migrate for ClusterLayout { - const VERSION_MARKER: &'static [u8] = b"G09layout"; - - type Previous = v08::ClusterLayout; - - fn migrate(previous: Self::Previous) -> Self { - use itertools::Itertools; - - // In the old layout, capacities are in an arbitrary unit, - // but in the new layout they are in bytes. - // Here we arbitrarily multiply everything by 1G, - // such that 1 old capacity unit = 1GB in the new units. - // This is totally arbitrary and won't work for most users. - let cap_mul = 1024 * 1024 * 1024; - let roles = multiply_all_capacities(previous.roles, cap_mul); - let staging_roles = multiply_all_capacities(previous.staging, cap_mul); - let node_id_vec = previous.node_id_vec; - - // Determine partition size - let mut tmp = previous.ring_assignation_data.clone(); - tmp.sort(); - let partition_size = tmp - .into_iter() - .dedup_with_count() - .map(|(npart, node)| { - roles - .get(&node_id_vec[node as usize]) - .and_then(|p| p.0.as_ref().and_then(|r| r.capacity)) - .unwrap_or(0) / npart as u64 - }) - .min() - .unwrap_or(0); - - // By default, zone_redundancy is maximum possible value - let parameters = LayoutParameters { - zone_redundancy: ZoneRedundancy::Maximum, - }; - - let mut res = Self { - version: previous.version, - replication_factor: previous.replication_factor, - partition_size, - parameters, - roles, - node_id_vec, - ring_assignment_data: previous.ring_assignation_data, - staging_parameters: Lww::new(parameters), - staging_roles, - staging_hash: [0u8; 32].into(), - }; - res.staging_hash = res.calculate_staging_hash(); - res - } - } - - fn multiply_all_capacities( - old_roles: LwwMap, - mul: u64, - ) -> LwwMap { - let mut new_roles = LwwMap::new(); - for (node, ts, role) in old_roles.items() { - let mut role = role.clone(); - if let NodeRoleV(Some(NodeRole { - capacity: Some(ref mut cap), - .. - })) = role - { - *cap *= mul; - } - new_roles.merge_raw(node, *ts, &role); - } - new_roles - } -} - -pub use v09::*; - -impl AutoCrdt for LayoutParameters { - const WARN_IF_DIFFERENT: bool = true; -} - -impl AutoCrdt for NodeRoleV { - const WARN_IF_DIFFERENT: bool = true; -} - -impl NodeRole { - pub fn capacity_string(&self) -> String { - match self.capacity { - Some(c) => ByteSize::b(c).to_string_as(false), - None => "gateway".to_string(), - } - } - - pub fn tags_string(&self) -> String { - self.tags.join(",") - } -} - -impl fmt::Display for ZoneRedundancy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - ZoneRedundancy::Maximum => write!(f, "maximum"), - ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), - } - } -} - -impl core::str::FromStr for ZoneRedundancy { - type Err = &'static str; - fn from_str(s: &str) -> Result { - match s { - "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), - x => { - let v = x - .parse::() - .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; - Ok(ZoneRedundancy::AtLeast(v)) - } - } - } -} - -// Implementation of the ClusterLayout methods unrelated to the assignment algorithm. -impl ClusterLayout { - pub fn new(replication_factor: usize) -> Self { - // We set the default zone redundancy to be Maximum, meaning that the maximum - // possible value will be used depending on the cluster topology - let parameters = LayoutParameters { - zone_redundancy: ZoneRedundancy::Maximum, - }; - let staging_parameters = Lww::::new(parameters); - - let empty_lwwmap = LwwMap::new(); - - let mut ret = ClusterLayout { - version: 0, - replication_factor, - partition_size: 0, - roles: LwwMap::new(), - node_id_vec: Vec::new(), - ring_assignment_data: Vec::new(), - parameters, - staging_parameters, - staging_roles: empty_lwwmap, - staging_hash: [0u8; 32].into(), - }; - ret.staging_hash = ret.calculate_staging_hash(); - ret - } - - // ===================== accessors ====================== - - /// Returns a list of IDs of nodes that currently have - /// a role in the cluster - pub fn node_ids(&self) -> &[Uuid] { - &self.node_id_vec[..] - } - - pub fn num_nodes(&self) -> usize { - self.node_id_vec.len() - } - - /// Returns the role of a node in the layout - pub fn node_role(&self, node: &Uuid) -> Option<&NodeRole> { - match self.roles.get(node) { - Some(NodeRoleV(Some(v))) => Some(v), - _ => None, - } - } - - /// Given a node uuids, this function returns its capacity or fails if it does not have any - pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { - match self.node_role(uuid) { - Some(NodeRole { - capacity: Some(cap), - zone: _, - tags: _, - }) => Ok(*cap), - _ => Err(Error::Message( - "The Uuid does not correspond to a node present in the \ - cluster or this node does not have a positive capacity." - .into(), - )), - } - } - - /// Returns the number of partitions associated to this node in the ring - pub fn get_node_usage(&self, uuid: &Uuid) -> Result { - for (i, id) in self.node_id_vec.iter().enumerate() { - if id == uuid { - let mut count = 0; - for nod in self.ring_assignment_data.iter() { - if i as u8 == *nod { - count += 1 - } - } - return Ok(count); - } - } - Err(Error::Message( - "The Uuid does not correspond to a node present in the \ - cluster or this node does not have a positive capacity." - .into(), - )) - } - - /// Get the partition in which data would fall on - pub fn partition_of(&self, position: &Hash) -> Partition { - let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap()); - top >> (16 - PARTITION_BITS) - } - - /// Get the list of partitions and the first hash of a partition key that would fall in it - pub fn partitions(&self) -> Vec<(Partition, Hash)> { - (0..(1 << PARTITION_BITS)) - .map(|i| { - let top = (i as u16) << (16 - PARTITION_BITS); - let mut location = [0u8; 32]; - location[..2].copy_from_slice(&u16::to_be_bytes(top)[..]); - (i as u16, Hash::from(location)) - }) - .collect::>() - } - - /// Walk the ring to find the n servers in which data should be replicated - pub fn nodes_of(&self, position: &Hash, n: usize) -> Vec { - assert_eq!(n, self.replication_factor); - - let data = &self.ring_assignment_data; - - if data.len() != self.replication_factor * (1 << PARTITION_BITS) { - warn!("Ring not yet ready, read/writes will be lost!"); - return vec![]; - } - - let partition_idx = self.partition_of(position) as usize; - let partition_start = partition_idx * self.replication_factor; - let partition_end = (partition_idx + 1) * self.replication_factor; - let partition_nodes = &data[partition_start..partition_end]; - - partition_nodes - .iter() - .map(|i| self.node_id_vec[*i as usize]) - .collect::>() - } - - // ===================== internal information extractors ====================== - - /// Returns the uuids of the non_gateway nodes in self.node_id_vec. - fn nongateway_nodes(&self) -> Vec { - let mut result = Vec::::new(); - for uuid in self.node_id_vec.iter() { - match self.node_role(uuid) { - Some(role) if role.capacity.is_some() => result.push(*uuid), - _ => (), - } - } - result - } - - /// Given a node uuids, this function returns the label of its zone - fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { - match self.node_role(uuid) { - Some(role) => Ok(&role.zone), - _ => Err(Error::Message( - "The Uuid does not correspond to a node present in the cluster.".into(), - )), - } - } - - /// Returns the sum of capacities of non gateway nodes in the cluster - fn get_total_capacity(&self) -> Result { - let mut total_capacity = 0; - for uuid in self.nongateway_nodes().iter() { - total_capacity += self.get_node_capacity(uuid)?; - } - Ok(total_capacity) - } - - /// Returns the effective value of the zone_redundancy parameter - fn effective_zone_redundancy(&self) -> usize { - match self.parameters.zone_redundancy { - ZoneRedundancy::AtLeast(v) => v, - ZoneRedundancy::Maximum => { - let n_zones = self - .roles - .items() - .iter() - .filter_map(|(_, _, role)| role.0.as_ref().map(|x| x.zone.as_str())) - .collect::>() - .len(); - std::cmp::min(n_zones, self.replication_factor) - } - } - } - - fn calculate_staging_hash(&self) -> Hash { - let hashed_tuple = (&self.staging_roles, &self.staging_parameters); - blake2sum(&nonversioned_encode(&hashed_tuple).unwrap()[..]) - } - - // ================== updates to layout, public interface =================== - - pub fn merge(&mut self, other: &ClusterLayout) -> bool { - match other.version.cmp(&self.version) { - Ordering::Greater => { - *self = other.clone(); - true - } - Ordering::Equal => { - self.staging_parameters.merge(&other.staging_parameters); - self.staging_roles.merge(&other.staging_roles); - - let new_staging_hash = self.calculate_staging_hash(); - let changed = new_staging_hash != self.staging_hash; - - self.staging_hash = new_staging_hash; - - changed - } - Ordering::Less => false, - } - } - - pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self, Message), Error> { - match version { - None => { - let error = r#" -Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. -To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. - "#; - return Err(Error::Message(error.into())); - } - Some(v) => { - if v != self.version + 1 { - return Err(Error::Message("Invalid new layout version".into())); - } - } - } - - self.roles.merge(&self.staging_roles); - self.roles.retain(|(_, _, v)| v.0.is_some()); - self.parameters = *self.staging_parameters.get(); - - self.staging_roles.clear(); - self.staging_hash = self.calculate_staging_hash(); - - let msg = self.calculate_partition_assignment()?; - - self.version += 1; - - Ok((self, msg)) - } - - pub fn revert_staged_changes(mut self, version: Option) -> Result { - match version { - None => { - let error = r#" -Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. -To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. - "#; - return Err(Error::Message(error.into())); - } - Some(v) => { - if v != self.version + 1 { - return Err(Error::Message("Invalid new layout version".into())); - } - } - } - - self.staging_roles.clear(); - self.staging_parameters.update(self.parameters); - self.staging_hash = self.calculate_staging_hash(); - - self.version += 1; - - Ok(self) - } - - /// Check a cluster layout for internal consistency - /// (assignment, roles, parameters, partition size) - /// returns true if consistent, false if error - pub fn check(&self) -> Result<(), String> { - // Check that the hash of the staging data is correct - let staging_hash = self.calculate_staging_hash(); - if staging_hash != self.staging_hash { - return Err("staging_hash is incorrect".into()); - } - - // Check that node_id_vec contains the correct list of nodes - let mut expected_nodes = self - .roles - .items() - .iter() - .filter(|(_, _, v)| v.0.is_some()) - .map(|(id, _, _)| *id) - .collect::>(); - expected_nodes.sort(); - let mut node_id_vec = self.node_id_vec.clone(); - node_id_vec.sort(); - if expected_nodes != node_id_vec { - return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes)); - } - - // Check that the assignment data has the correct length - let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor; - if self.ring_assignment_data.len() != expected_assignment_data_len { - return Err(format!( - "ring_assignment_data has incorrect length {} instead of {}", - self.ring_assignment_data.len(), - expected_assignment_data_len - )); - } - - // Check that the assigned nodes are correct identifiers - // of nodes that are assigned a role - // and that role is not the role of a gateway nodes - for x in self.ring_assignment_data.iter() { - if *x as usize >= self.node_id_vec.len() { - return Err(format!( - "ring_assignment_data contains invalid node id {}", - *x - )); - } - let node = self.node_id_vec[*x as usize]; - match self.roles.get(&node) { - Some(NodeRoleV(Some(x))) if x.capacity.is_some() => (), - _ => return Err("ring_assignment_data contains id of a gateway node".into()), - } - } - - // Check that every partition is associated to distinct nodes - let zone_redundancy = self.effective_zone_redundancy(); - let rf = self.replication_factor; - for p in 0..(1 << PARTITION_BITS) { - let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec(); - if nodes_of_p.iter().unique().count() != rf { - return Err(format!("partition does not contain {} unique node ids", rf)); - } - // Check that every partition is spread over at least zone_redundancy zones. - let zones_of_p = nodes_of_p - .iter() - .map(|n| { - self.get_node_zone(&self.node_id_vec[*n as usize]) - .expect("Zone not found.") - }) - .collect::>(); - if zones_of_p.iter().unique().count() < zone_redundancy { - return Err(format!( - "nodes of partition are in less than {} distinct zones", - zone_redundancy - )); - } - } - - // Check that the nodes capacities is consistent with the stored partitions - let mut node_usage = vec![0; MAX_NODE_NUMBER]; - for n in self.ring_assignment_data.iter() { - node_usage[*n as usize] += 1; - } - for (n, usage) in node_usage.iter().enumerate() { - if *usage > 0 { - let uuid = self.node_id_vec[n]; - let partusage = usage * self.partition_size; - let nodecap = self.get_node_capacity(&uuid).unwrap(); - if partusage > nodecap { - return Err(format!( - "node usage ({}) is bigger than node capacity ({})", - usage * self.partition_size, - nodecap - )); - } - } - } - - // Check that the partition size stored is the one computed by the asignation - // algorithm. - let cl2 = self.clone(); - let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); - match cl2.compute_optimal_partition_size(&zone_to_id, zone_redundancy) { - Ok(s) if s != self.partition_size => { - return Err(format!( - "partition_size ({}) is different than optimal value ({})", - self.partition_size, s - )) - } - Err(e) => return Err(format!("could not calculate optimal partition size: {}", e)), - _ => (), - } - - Ok(()) - } - - // ================== updates to layout, internals =================== - - /// This function calculates a new partition-to-node assignment. - /// The computed assignment respects the node replication factor - /// and the zone redundancy parameter It maximizes the capacity of a - /// partition (assuming all partitions have the same size). - /// Among such optimal assignment, it minimizes the distance to - /// the former assignment (if any) to minimize the amount of - /// data to be moved. - /// Staged role changes must be merged with nodes roles before calling this function, - /// hence it must only be called from apply_staged_changes() and hence is not public. - fn calculate_partition_assignment(&mut self) -> Result { - // We update the node ids, since the node role list might have changed with the - // changes in the layout. We retrieve the old_assignment reframed with new ids - let old_assignment_opt = self.update_node_id_vec()?; - - let zone_redundancy = self.effective_zone_redundancy(); - - let mut msg = Message::new(); - msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); - msg.push("".into()); - msg.push(format!( - "Partitions are \ - replicated {} times on at least {} distinct zones.", - self.replication_factor, zone_redundancy - )); - - // We generate for once numerical ids for the zones of non gateway nodes, - // to use them as indices in the flow graphs. - let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; - - let nb_nongateway_nodes = self.nongateway_nodes().len(); - if nb_nongateway_nodes < self.replication_factor { - return Err(Error::Message(format!( - "The number of nodes with positive \ - capacity ({}) is smaller than the replication factor ({}).", - nb_nongateway_nodes, self.replication_factor - ))); - } - if id_to_zone.len() < zone_redundancy { - return Err(Error::Message(format!( - "The number of zones with non-gateway \ - nodes ({}) is smaller than the redundancy parameter ({})", - id_to_zone.len(), - zone_redundancy - ))); - } - - // We compute the optimal partition size - // Capacities should be given in a unit so that partition size is at least 100. - // In this case, integer rounding plays a marginal role in the percentages of - // optimality. - let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?; - - msg.push("".into()); - if old_assignment_opt.is_some() { - msg.push(format!( - "Optimal partition size: {} ({} in previous layout)", - ByteSize::b(partition_size).to_string_as(false), - ByteSize::b(self.partition_size).to_string_as(false) - )); - } else { - msg.push(format!( - "Optimal partition size: {}", - ByteSize::b(partition_size).to_string_as(false) - )); - } - // We write the partition size. - self.partition_size = partition_size; - - if partition_size < 100 { - msg.push( - "WARNING: The partition size is low (< 100), make sure the capacities of your nodes are correct and are of at least a few MB" - .into(), - ); - } - - // We compute a first flow/assignment that is heuristically close to the previous - // assignment - let mut gflow = - self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt, zone_redundancy)?; - if let Some(assoc) = &old_assignment_opt { - // We minimize the distance to the previous assignment. - self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; - } - - // We display statistics of the computation - msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?); - - // We update the layout structure - self.update_ring_from_flow(id_to_zone.len(), &gflow)?; - - if let Err(e) = self.check() { - return Err(Error::Message( - format!("Layout check returned an error: {}\nOriginal result of computation: <<<<\n{}\n>>>>", e, msg.join("\n")) - )); - } - - Ok(msg) - } - - /// The LwwMap of node roles might have changed. This function updates the node_id_vec - /// and returns the assignment given by ring, with the new indices of the nodes, and - /// None if the node is not present anymore. - /// We work with the assumption that only this function and calculate_new_assignment - /// do modify assignment_ring and node_id_vec. - fn update_node_id_vec(&mut self) -> Result>>, Error> { - // (1) We compute the new node list - // Non gateway nodes should be coded on 8bits, hence they must be first in the list - // We build the new node ids - let new_non_gateway_nodes: Vec = self - .roles - .items() - .iter() - .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity.is_some())) - .map(|(k, _, _)| *k) - .collect(); - - if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { - return Err(Error::Message(format!( - "There are more than {} non-gateway nodes in the new \ - layout. This is not allowed.", - MAX_NODE_NUMBER - ))); - } - - let new_gateway_nodes: Vec = self - .roles - .items() - .iter() - .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_none())) - .map(|(k, _, _)| *k) - .collect(); - - let mut new_node_id_vec = Vec::::new(); - new_node_id_vec.extend(new_non_gateway_nodes); - new_node_id_vec.extend(new_gateway_nodes); - - let old_node_id_vec = self.node_id_vec.clone(); - self.node_id_vec = new_node_id_vec.clone(); - - // (2) We retrieve the old association - // We rewrite the old association with the new indices. We only consider partition - // to node assignments where the node is still in use. - if self.ring_assignment_data.is_empty() { - // This is a new association - return Ok(None); - } - - if self.ring_assignment_data.len() != NB_PARTITIONS * self.replication_factor { - return Err(Error::Message( - "The old assignment does not have a size corresponding to \ - the old replication factor or the number of partitions." - .into(), - )); - } - - // We build a translation table between the uuid and new ids - let mut uuid_to_new_id = HashMap::::new(); - - // We add the indices of only the new non-gateway nodes that can be used in the - // association ring - for (i, uuid) in new_node_id_vec.iter().enumerate() { - uuid_to_new_id.insert(*uuid, i); - } - - let mut old_assignment = vec![Vec::::new(); NB_PARTITIONS]; - let rf = self.replication_factor; - - for (p, old_assign_p) in old_assignment.iter_mut().enumerate() { - for old_id in &self.ring_assignment_data[p * rf..(p + 1) * rf] { - let uuid = old_node_id_vec[*old_id as usize]; - if uuid_to_new_id.contains_key(&uuid) { - old_assign_p.push(uuid_to_new_id[&uuid]); - } - } - } - - // We write the ring - self.ring_assignment_data = Vec::::new(); - - Ok(Some(old_assignment)) - } - - /// This function generates ids for the zone of the nodes appearing in - /// self.node_id_vec. - fn generate_nongateway_zone_ids(&self) -> Result<(Vec, HashMap), Error> { - let mut id_to_zone = Vec::::new(); - let mut zone_to_id = HashMap::::new(); - - for uuid in self.nongateway_nodes().iter() { - let r = self.node_role(uuid).unwrap(); - if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() { - zone_to_id.insert(r.zone.clone(), id_to_zone.len()); - id_to_zone.push(r.zone.clone()); - } - } - Ok((id_to_zone, zone_to_id)) - } - - /// This function computes by dichotomy the largest realizable partition size, given - /// the layout roles and parameters. - fn compute_optimal_partition_size( - &self, - zone_to_id: &HashMap, - zone_redundancy: usize, - ) -> Result { - let empty_set = HashSet::<(usize, usize)>::new(); - let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set, zone_redundancy)?; - g.compute_maximal_flow()?; - if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { - return Err(Error::Message( - "The storage capacity of he cluster is to small. It is \ - impossible to store partitions of size 1." - .into(), - )); - } - - let mut s_down = 1; - let mut s_up = self.get_total_capacity()?; - while s_down + 1 < s_up { - g = self.generate_flow_graph( - (s_down + s_up) / 2, - zone_to_id, - &empty_set, - zone_redundancy, - )?; - g.compute_maximal_flow()?; - if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { - s_up = (s_down + s_up) / 2; - } else { - s_down = (s_down + s_up) / 2; - } - } - - Ok(s_down) - } - - fn generate_graph_vertices(nb_zones: usize, nb_nodes: usize) -> Vec { - let mut vertices = vec![Vertex::Source, Vertex::Sink]; - for p in 0..NB_PARTITIONS { - vertices.push(Vertex::Pup(p)); - vertices.push(Vertex::Pdown(p)); - for z in 0..nb_zones { - vertices.push(Vertex::PZ(p, z)); - } - } - for n in 0..nb_nodes { - vertices.push(Vertex::N(n)); - } - vertices - } - - /// Generates the graph to compute the maximal flow corresponding to the optimal - /// partition assignment. - /// exclude_assoc is the set of (partition, node) association that we are forbidden - /// to use (hence we do not add the corresponding edge to the graph). This parameter - /// is used to compute a first flow that uses only edges appearing in the previous - /// assignment. This produces a solution that heuristically should be close to the - /// previous one. - fn generate_flow_graph( - &self, - partition_size: u64, - zone_to_id: &HashMap, - exclude_assoc: &HashSet<(usize, usize)>, - zone_redundancy: usize, - ) -> Result, Error> { - let vertices = - ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); - let mut g = Graph::::new(&vertices); - let nb_zones = zone_to_id.len(); - for p in 0..NB_PARTITIONS { - g.add_edge(Vertex::Source, Vertex::Pup(p), zone_redundancy as u64)?; - g.add_edge( - Vertex::Source, - Vertex::Pdown(p), - (self.replication_factor - zone_redundancy) as u64, - )?; - for z in 0..nb_zones { - g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; - g.add_edge( - Vertex::Pdown(p), - Vertex::PZ(p, z), - self.replication_factor as u64, - )?; - } - } - for n in 0..self.nongateway_nodes().len() { - let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; - let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; - g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; - for p in 0..NB_PARTITIONS { - if !exclude_assoc.contains(&(p, n)) { - g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; - } - } - } - Ok(g) - } - - /// This function computes a first optimal assignment (in the form of a flow graph). - fn compute_candidate_assignment( - &self, - zone_to_id: &HashMap, - prev_assign_opt: &Option>>, - zone_redundancy: usize, - ) -> Result, Error> { - // We list the (partition,node) associations that are not used in the - // previous assignment - let mut exclude_edge = HashSet::<(usize, usize)>::new(); - if let Some(prev_assign) = prev_assign_opt { - let nb_nodes = self.nongateway_nodes().len(); - for (p, prev_assign_p) in prev_assign.iter().enumerate() { - for n in 0..nb_nodes { - exclude_edge.insert((p, n)); - } - for n in prev_assign_p.iter() { - exclude_edge.remove(&(p, *n)); - } - } - } - - // We compute the best flow using only the edges used in the previous assignment - let mut g = self.generate_flow_graph( - self.partition_size, - zone_to_id, - &exclude_edge, - zone_redundancy, - )?; - g.compute_maximal_flow()?; - - // We add the excluded edges and compute the maximal flow with the full graph. - // The algorithm is such that it will start with the flow that we just computed - // and find ameliorating paths from that. - for (p, n) in exclude_edge.iter() { - let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; - g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; - } - g.compute_maximal_flow()?; - Ok(g) - } - - /// This function updates the flow graph gflow to minimize the distance between - /// its corresponding assignment and the previous one - fn minimize_rebalance_load( - &self, - gflow: &mut Graph, - zone_to_id: &HashMap, - prev_assign: &[Vec], - ) -> Result<(), Error> { - // We define a cost function on the edges (pairs of vertices) corresponding - // to the distance between the two assignments. - let mut cost = CostFunction::new(); - for (p, assoc_p) in prev_assign.iter().enumerate() { - for n in assoc_p.iter() { - let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; - cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); - } - } - - // We compute the maximal length of a simple path in gflow. It is used in the - // Bellman-Ford algorithm in optimize_flow_with_cost to set the number - // of iterations. - let nb_nodes = self.nongateway_nodes().len(); - let path_length = 4 * nb_nodes; - gflow.optimize_flow_with_cost(&cost, path_length)?; - - Ok(()) - } - - /// This function updates the assignment ring from the flow graph. - fn update_ring_from_flow( - &mut self, - nb_zones: usize, - gflow: &Graph, - ) -> Result<(), Error> { - self.ring_assignment_data = Vec::::new(); - for p in 0..NB_PARTITIONS { - for z in 0..nb_zones { - let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; - for vertex in assoc_vertex.iter() { - if let Vertex::N(n) = vertex { - self.ring_assignment_data.push((*n).try_into().unwrap()); - } - } - } - } - - if self.ring_assignment_data.len() != NB_PARTITIONS * self.replication_factor { - return Err(Error::Message( - "Critical Error : the association ring we produced does not \ - have the right size." - .into(), - )); - } - Ok(()) - } - - /// This function returns a message summing up the partition repartition of the new - /// layout, and other statistics of the partition assignment computation. - fn output_stat( - &self, - gflow: &Graph, - prev_assign_opt: &Option>>, - zone_to_id: &HashMap, - id_to_zone: &[String], - ) -> Result { - let mut msg = Message::new(); - - let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; - let total_cap = self.get_total_capacity()?; - let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); - msg.push(format!( - "Usable capacity / total cluster capacity: {} / {} ({:.1} %)", - ByteSize::b(used_cap).to_string_as(false), - ByteSize::b(total_cap).to_string_as(false), - percent_cap - )); - msg.push(format!( - "Effective capacity (replication factor {}): {}", - self.replication_factor, - ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false) - )); - if percent_cap < 80. { - msg.push("".into()); - msg.push( - "If the percentage is too low, it might be that the \ - cluster topology and redundancy constraints are forcing the use of nodes/zones with small \ - storage capacities." - .into(), - ); - msg.push( - "You might want to move storage capacity between zones or relax the redundancy constraint." - .into(), - ); - msg.push( - "See the detailed statistics below and look for saturated nodes/zones.".into(), - ); - } - - // We define and fill in the following tables - let storing_nodes = self.nongateway_nodes(); - let mut new_partitions = vec![0; storing_nodes.len()]; - let mut stored_partitions = vec![0; storing_nodes.len()]; - - let mut new_partitions_zone = vec![0; id_to_zone.len()]; - let mut stored_partitions_zone = vec![0; id_to_zone.len()]; - - for p in 0..NB_PARTITIONS { - for z in 0..id_to_zone.len() { - let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; - if !pz_nodes.is_empty() { - stored_partitions_zone[z] += 1; - if let Some(prev_assign) = prev_assign_opt { - let mut old_zones_of_p = Vec::::new(); - for n in prev_assign[p].iter() { - old_zones_of_p - .push(zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]); - } - if !old_zones_of_p.contains(&z) { - new_partitions_zone[z] += 1; - } - } - } - for vert in pz_nodes.iter() { - if let Vertex::N(n) = *vert { - stored_partitions[n] += 1; - if let Some(prev_assign) = prev_assign_opt { - if !prev_assign[p].contains(&n) { - new_partitions[n] += 1; - } - } - } - } - } - } - - if prev_assign_opt.is_none() { - new_partitions = stored_partitions.clone(); - //new_partitions_zone = stored_partitions_zone.clone(); - } - - // We display the statistics - - msg.push("".into()); - if prev_assign_opt.is_some() { - let total_new_partitions: usize = new_partitions.iter().sum(); - msg.push(format!( - "A total of {} new copies of partitions need to be \ - transferred.", - total_new_partitions - )); - msg.push("".into()); - } - - let mut table = vec![]; - for z in 0..id_to_zone.len() { - let mut nodes_of_z = Vec::::new(); - for n in 0..storing_nodes.len() { - if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { - nodes_of_z.push(n); - } - } - let replicated_partitions: usize = - nodes_of_z.iter().map(|n| stored_partitions[*n]).sum(); - table.push(format!( - "{}\tTags\tPartitions\tCapacity\tUsable capacity", - id_to_zone[z] - )); - - let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; - let mut total_cap_z = 0; - for n in nodes_of_z.iter() { - total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; - } - let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); - - for n in nodes_of_z.iter() { - let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; - let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; - let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or(""))?.tags_string(); - table.push(format!( - " {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)", - self.node_id_vec[*n], - tags_n, - stored_partitions[*n], - new_partitions[*n], - ByteSize::b(total_cap_n).to_string_as(false), - ByteSize::b(available_cap_n).to_string_as(false), - (available_cap_n as f32) / (total_cap_n as f32) * 100.0, - )); - } - - table.push(format!( - " TOTAL\t\t{} ({} unique)\t{}\t{} ({:.1}%)", - replicated_partitions, - stored_partitions_zone[z], - //new_partitions_zone[z], - ByteSize::b(total_cap_z).to_string_as(false), - ByteSize::b(available_cap_z).to_string_as(false), - percent_cap_z - )); - table.push("".into()); - } - msg.push(format_table::format_table_to_string(table)); - - Ok(msg) - } -} - -// ==================================================================================== - -#[cfg(test)] -mod tests { - use super::{Error, *}; - use std::cmp::min; - - // This function checks that the partition size S computed is at least better than the - // one given by a very naive algorithm. To do so, we try to run the naive algorithm - // assuming a partion size of S+1. If we succed, it means that the optimal assignment - // was not optimal. The naive algorithm is the following : - // - we compute the max number of partitions associated to every node, capped at the - // partition number. It gives the number of tokens of every node. - // - every zone has a number of tokens equal to the sum of the tokens of its nodes. - // - we cycle over the partitions and associate zone tokens while respecting the - // zone redundancy constraint. - // NOTE: the naive algorithm is not optimal. Counter example: - // take nb_partition = 3 ; replication_factor = 5; redundancy = 4; - // number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) - // With these parameters, the naive algo fails, whereas there is a solution: - // (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) - fn check_against_naive(cl: &ClusterLayout) -> Result { - let over_size = cl.partition_size + 1; - let mut zone_token = HashMap::::new(); - - let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; - - if zones.is_empty() { - return Ok(false); - } - - for z in zones.iter() { - zone_token.insert(z.clone(), 0); - } - for uuid in cl.nongateway_nodes().iter() { - let z = cl.get_node_zone(uuid)?; - let c = cl.get_node_capacity(uuid)?; - zone_token.insert( - z.clone(), - zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), - ); - } - - // For every partition, we count the number of zone already associated and - // the name of the last zone associated - - let mut id_zone_token = vec![0; zones.len()]; - for (z, t) in zone_token.iter() { - id_zone_token[zone_to_id[z]] = *t; - } - - let mut nb_token = vec![0; NB_PARTITIONS]; - let mut last_zone = vec![zones.len(); NB_PARTITIONS]; - - let mut curr_zone = 0; - - let redundancy = cl.effective_zone_redundancy(); - - for replic in 0..cl.replication_factor { - for p in 0..NB_PARTITIONS { - while id_zone_token[curr_zone] == 0 - || (last_zone[p] == curr_zone - && redundancy - nb_token[p] <= cl.replication_factor - replic) - { - curr_zone += 1; - if curr_zone >= zones.len() { - return Ok(true); - } - } - id_zone_token[curr_zone] -= 1; - if last_zone[p] != curr_zone { - nb_token[p] += 1; - last_zone[p] = curr_zone; - } - } - } - - return Ok(false); - } - - fn show_msg(msg: &Message) { - for s in msg.iter() { - println!("{}", s); - } - } - - fn update_layout( - cl: &mut ClusterLayout, - node_id_vec: &Vec, - node_capacity_vec: &Vec, - node_zone_vec: &Vec, - zone_redundancy: usize, - ) { - for i in 0..node_id_vec.len() { - if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { - cl.node_id_vec.push(x); - } - - let update = cl.staging_roles.update_mutator( - cl.node_id_vec[i], - NodeRoleV(Some(NodeRole { - zone: (node_zone_vec[i].to_string()), - capacity: (Some(node_capacity_vec[i])), - tags: (vec![]), - })), - ); - cl.staging_roles.merge(&update); - } - cl.staging_parameters.update(LayoutParameters { - zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), - }); - cl.staging_hash = cl.calculate_staging_hash(); - } - - #[test] - fn test_assignment() { - let mut node_id_vec = vec![1, 2, 3]; - let mut node_capacity_vec = vec![4000, 1000, 2000]; - let mut node_zone_vec = vec!["A", "B", "C"] - .into_iter() - .map(|x| x.to_string()) - .collect(); - - let mut cl = ClusterLayout::new(3); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; - node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; - node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"] - .into_iter() - .map(|x| x.to_string()) - .collect(); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_capacity_vec = vec![ - 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, - ]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); - let v = cl.version; - let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - } -} diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs new file mode 100644 index 00000000..b3019f58 --- /dev/null +++ b/src/rpc/layout/history.rs @@ -0,0 +1,170 @@ +use std::cmp::Ordering; +use std::sync::Arc; + +use garage_util::crdt::{Crdt, Lww, LwwMap}; +use garage_util::data::*; +use garage_util::encode::nonversioned_encode; +use garage_util::error::*; + +use super::schema::*; +use super::*; + +impl LayoutHistory { + pub fn new(replication_factor: usize) -> Self { + let version = LayoutVersion::new(replication_factor); + + let staging_parameters = Lww::::new(version.parameters); + let empty_lwwmap = LwwMap::new(); + + let mut ret = LayoutHistory { + versions: vec![version].into_boxed_slice().into(), + update_trackers: Default::default(), + staging_parameters, + staging_roles: empty_lwwmap, + staging_hash: [0u8; 32].into(), + }; + ret.staging_hash = ret.calculate_staging_hash(); + ret + } + + pub fn current(&self) -> &LayoutVersion { + self.versions.last().as_ref().unwrap() + } + + pub(crate) fn calculate_staging_hash(&self) -> Hash { + let hashed_tuple = (&self.staging_roles, &self.staging_parameters); + blake2sum(&nonversioned_encode(&hashed_tuple).unwrap()[..]) + } + + // ================== updates to layout, public interface =================== + + pub fn merge(&mut self, other: &LayoutHistory) -> bool { + let mut changed = false; + + // Merge staged layout changes + match other.current().version.cmp(&self.current().version) { + Ordering::Greater => { + self.staging_parameters = other.staging_parameters.clone(); + self.staging_roles = other.staging_roles.clone(); + self.staging_hash = other.staging_hash; + changed = true; + } + Ordering::Equal => { + self.staging_parameters.merge(&other.staging_parameters); + self.staging_roles.merge(&other.staging_roles); + + let new_staging_hash = self.calculate_staging_hash(); + if new_staging_hash != self.staging_hash { + changed = true; + } + + self.staging_hash = new_staging_hash; + } + Ordering::Less => (), + } + + // Add any new versions to history + let mut versions = self.versions.to_vec(); + for v2 in other.versions.iter() { + if let Some(v1) = versions.iter().find(|v| v.version == v2.version) { + if v1 != v2 { + error!("Inconsistent layout histories: different layout compositions for version {}. Your cluster will be broken as long as this layout version is not replaced.", v2.version); + } + } else if versions.iter().all(|v| v.version != v2.version - 1) { + error!( + "Cannot receive new layout version {}, version {} is missing", + v2.version, + v2.version - 1 + ); + } else { + versions.push(v2.clone()); + changed = true; + } + } + self.versions = Arc::from(versions.into_boxed_slice()); + + // Merge trackers + self.update_trackers.merge(&other.update_trackers); + + changed + } + + pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self, Message), Error> { + match version { + None => { + let error = r#" +Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. +To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. + "#; + return Err(Error::Message(error.into())); + } + Some(v) => { + if v != self.current().version + 1 { + return Err(Error::Message("Invalid new layout version".into())); + } + } + } + + let mut new_version = self.current().clone(); + new_version.version += 1; + + new_version.roles.merge(&self.staging_roles); + new_version.roles.retain(|(_, _, v)| v.0.is_some()); + new_version.parameters = *self.staging_parameters.get(); + + self.staging_roles.clear(); + self.staging_hash = self.calculate_staging_hash(); + + let msg = new_version.calculate_partition_assignment()?; + + let mut versions = self.versions.to_vec(); + versions.push(new_version); + self.versions = Arc::from(versions.into_boxed_slice()); + + Ok((self, msg)) + } + + pub fn revert_staged_changes(mut self, version: Option) -> Result { + match version { + None => { + let error = r#" +Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. +To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. + "#; + return Err(Error::Message(error.into())); + } + Some(v) => { + if v != self.current().version + 1 { + return Err(Error::Message("Invalid new layout version".into())); + } + } + } + + self.staging_roles.clear(); + self.staging_parameters.update(self.current().parameters); + self.staging_hash = self.calculate_staging_hash(); + + // TODO this is stupid, we should have a separate version counter/LWW + // for the staging params + let mut new_version = self.current().clone(); + new_version.version += 1; + + let mut versions = self.versions.to_vec(); + versions.push(new_version); + self.versions = Arc::from(versions.into_boxed_slice()); + + Ok(self) + } + + pub fn check(&self) -> Result<(), String> { + // Check that the hash of the staging data is correct + let staging_hash = self.calculate_staging_hash(); + if staging_hash != self.staging_hash { + return Err("staging_hash is incorrect".into()); + } + + // TODO: anythign more ? + + self.current().check() + } +} diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs new file mode 100644 index 00000000..122d4b65 --- /dev/null +++ b/src/rpc/layout/mod.rs @@ -0,0 +1,32 @@ +mod history; +mod schema; +mod tracker; +mod version; + +pub use history::*; +pub use schema::*; +pub use version::*; + +// ---- defines: partitions ---- + +/// A partition id, which is stored on 16 bits +/// i.e. we have up to 2**16 partitions. +/// (in practice we have exactly 2**PARTITION_BITS partitions) +pub type Partition = u16; + +// TODO: make this constant parametrizable in the config file +// For deployments with many nodes it might make sense to bump +// it up to 10. +// Maximum value : 16 +/// How many bits from the hash are used to make partitions. Higher numbers means more fairness in +/// presence of numerous nodes, but exponentially bigger ring. Max 16 +pub const PARTITION_BITS: usize = 8; + +const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; + +// ---- defines: nodes ---- + +// Type to store compactly the id of a node in the system +// Change this to u16 the day we want to have more than 256 nodes in a cluster +pub type CompactNodeType = u8; +pub const MAX_NODE_NUMBER: usize = 256; diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs new file mode 100644 index 00000000..fa0822fa --- /dev/null +++ b/src/rpc/layout/schema.rs @@ -0,0 +1,286 @@ +mod v08 { + use crate::layout::CompactNodeType; + use garage_util::crdt::LwwMap; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + + /// The layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct ClusterLayout { + pub version: u64, + + pub replication_factor: usize, + pub roles: LwwMap, + + /// node_id_vec: a vector of node IDs with a role assigned + /// in the system (this includes gateway nodes). + /// The order here is different than the vec stored by `roles`, because: + /// 1. non-gateway nodes are first so that they have lower numbers + /// 2. nodes that don't have a role are excluded (but they need to + /// stay in the CRDT as tombstones) + pub node_id_vec: Vec, + /// the assignation of data partitions to node, the values + /// are indices in node_id_vec + #[serde(with = "serde_bytes")] + pub ring_assignation_data: Vec, + + /// Role changes which are staged for the next version of the layout + pub staging: LwwMap, + pub staging_hash: Hash, + } + + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct NodeRoleV(pub Option); + + /// The user-assigned roles of cluster nodes + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct NodeRole { + /// Datacenter at which this entry belong. This information is used to + /// perform a better geodistribution + pub zone: String, + /// The capacity of the node + /// If this is set to None, the node does not participate in storing data for the system + /// and is only active as an API gateway to other nodes + pub capacity: Option, + /// A set of tags to recognize the node + pub tags: Vec, + } + + impl garage_util::migrate::InitialFormat for ClusterLayout {} +} + +mod v09 { + use super::v08; + use crate::layout::CompactNodeType; + use garage_util::crdt::{Lww, LwwMap}; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + pub use v08::{NodeRole, NodeRoleV}; + + /// The layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct ClusterLayout { + pub version: u64, + + pub replication_factor: usize, + + /// This attribute is only used to retain the previously computed partition size, + /// to know to what extent does it change with the layout update. + pub partition_size: u64, + /// Parameters used to compute the assignment currently given by + /// ring_assignment_data + pub parameters: LayoutParameters, + + pub roles: LwwMap, + + /// see comment in v08::ClusterLayout + pub node_id_vec: Vec, + /// see comment in v08::ClusterLayout + #[serde(with = "serde_bytes")] + pub ring_assignment_data: Vec, + + /// Parameters to be used in the next partition assignment computation. + pub staging_parameters: Lww, + /// Role changes which are staged for the next version of the layout + pub staging_roles: LwwMap, + pub staging_hash: Hash, + } + + /// This struct is used to set the parameters to be used in the assignment computation + /// algorithm. It is stored as a Crdt. + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub struct LayoutParameters { + pub zone_redundancy: ZoneRedundancy, + } + + /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies + /// of each partition on at least that number of different zones. + /// Otherwise, copies will be stored on the maximum possible number of zones. + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub enum ZoneRedundancy { + AtLeast(usize), + Maximum, + } + + impl garage_util::migrate::Migrate for ClusterLayout { + const VERSION_MARKER: &'static [u8] = b"G09layout"; + + type Previous = v08::ClusterLayout; + + fn migrate(previous: Self::Previous) -> Self { + use itertools::Itertools; + + // In the old layout, capacities are in an arbitrary unit, + // but in the new layout they are in bytes. + // Here we arbitrarily multiply everything by 1G, + // such that 1 old capacity unit = 1GB in the new units. + // This is totally arbitrary and won't work for most users. + let cap_mul = 1024 * 1024 * 1024; + let roles = multiply_all_capacities(previous.roles, cap_mul); + let staging_roles = multiply_all_capacities(previous.staging, cap_mul); + let node_id_vec = previous.node_id_vec; + + // Determine partition size + let mut tmp = previous.ring_assignation_data.clone(); + tmp.sort(); + let partition_size = tmp + .into_iter() + .dedup_with_count() + .map(|(npart, node)| { + roles + .get(&node_id_vec[node as usize]) + .and_then(|p| p.0.as_ref().and_then(|r| r.capacity)) + .unwrap_or(0) / npart as u64 + }) + .min() + .unwrap_or(0); + + // By default, zone_redundancy is maximum possible value + let parameters = LayoutParameters { + zone_redundancy: ZoneRedundancy::Maximum, + }; + + Self { + version: previous.version, + replication_factor: previous.replication_factor, + partition_size, + parameters, + roles, + node_id_vec, + ring_assignment_data: previous.ring_assignation_data, + staging_parameters: Lww::new(parameters), + staging_roles, + staging_hash: [0u8; 32].into(), // will be set in the next migration + } + } + } + + fn multiply_all_capacities( + old_roles: LwwMap, + mul: u64, + ) -> LwwMap { + let mut new_roles = LwwMap::new(); + for (node, ts, role) in old_roles.items() { + let mut role = role.clone(); + if let NodeRoleV(Some(NodeRole { + capacity: Some(ref mut cap), + .. + })) = role + { + *cap *= mul; + } + new_roles.merge_raw(node, *ts, &role); + } + new_roles + } +} + +mod v010 { + use super::v09; + use crate::layout::CompactNodeType; + use garage_util::crdt::{Lww, LwwMap}; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + use std::collections::HashMap; + use std::sync::Arc; + pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; + + /// The layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] + pub struct LayoutVersion { + pub version: u64, + + pub replication_factor: usize, + + /// This attribute is only used to retain the previously computed partition size, + /// to know to what extent does it change with the layout update. + pub partition_size: u64, + /// Parameters used to compute the assignment currently given by + /// ring_assignment_data + pub parameters: LayoutParameters, + + pub roles: LwwMap, + + /// see comment in v08::ClusterLayout + pub node_id_vec: Vec, + /// see comment in v08::ClusterLayout + #[serde(with = "serde_bytes")] + pub ring_assignment_data: Vec, + } + + /// The history of cluster layouts + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct LayoutHistory { + /// The versions currently in use in the cluster + pub versions: Arc<[LayoutVersion]>, + + /// Update trackers + pub update_trackers: UpdateTrackers, + + /// Parameters to be used in the next partition assignment computation. + pub staging_parameters: Lww, + /// Role changes which are staged for the next version of the layout + pub staging_roles: LwwMap, + /// Hash of the serialized staging_parameters + staging_roles + pub staging_hash: Hash, + } + + /// The tracker of acknowlegments and data syncs around the cluster + #[derive(Clone, Debug, Serialize, Deserialize, Default)] + pub struct UpdateTrackers { + /// The highest layout version number each node has ack'ed + pub ack_map: UpdateTracker, + /// The highest layout version number each node has synced data for + pub sync_map: UpdateTracker, + /// The highest layout version number each node has + /// ack'ed that all other nodes have synced data for + pub sync_ack_map: UpdateTracker, + } + + /// The history of cluster layouts + #[derive(Clone, Debug, Serialize, Deserialize, Default)] + pub struct UpdateTracker(pub HashMap); + + impl garage_util::migrate::Migrate for LayoutHistory { + const VERSION_MARKER: &'static [u8] = b"G010lh"; + + type Previous = v09::ClusterLayout; + + fn migrate(previous: Self::Previous) -> Self { + let version = LayoutVersion { + version: previous.version, + replication_factor: previous.replication_factor, + partition_size: previous.partition_size, + parameters: previous.parameters, + roles: previous.roles, + node_id_vec: previous.node_id_vec, + ring_assignment_data: previous.ring_assignment_data, + }; + let update_tracker = UpdateTracker( + version + .nongateway_nodes() + .iter() + .map(|x| (*x, version.version)) + .collect::>(), + ); + let mut ret = Self { + versions: Arc::from(vec![version].into_boxed_slice()), + update_trackers: UpdateTrackers { + ack_map: update_tracker.clone(), + sync_map: update_tracker.clone(), + sync_ack_map: update_tracker.clone(), + }, + staging_parameters: previous.staging_parameters, + staging_roles: previous.staging_roles, + staging_hash: [0u8; 32].into(), + }; + ret.staging_hash = ret.calculate_staging_hash(); + ret + } + } +} + +pub use v010::*; diff --git a/src/rpc/layout/tracker.rs b/src/rpc/layout/tracker.rs new file mode 100644 index 00000000..778121e4 --- /dev/null +++ b/src/rpc/layout/tracker.rs @@ -0,0 +1,21 @@ +use super::*; + +impl UpdateTracker { + fn merge(&mut self, other: &UpdateTracker) { + for (k, v) in other.0.iter() { + if let Some(v_mut) = self.0.get_mut(k) { + *v_mut = std::cmp::max(*v_mut, *v); + } else { + self.0.insert(*k, *v); + } + } + } +} + +impl UpdateTrackers { + pub(crate) fn merge(&mut self, other: &UpdateTrackers) { + self.ack_map.merge(&other.ack_map); + self.sync_map.merge(&other.sync_map); + self.sync_ack_map.merge(&other.sync_ack_map); + } +} diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs new file mode 100644 index 00000000..363bc204 --- /dev/null +++ b/src/rpc/layout/version.rs @@ -0,0 +1,1052 @@ +use std::collections::HashMap; +use std::collections::HashSet; +use std::fmt; + +use bytesize::ByteSize; +use itertools::Itertools; + +use garage_util::crdt::{AutoCrdt, LwwMap}; +use garage_util::data::*; +use garage_util::error::*; + +use crate::graph_algo::*; + +use std::convert::TryInto; + +use super::schema::*; +use super::*; + +// The Message type will be used to collect information on the algorithm. +pub type Message = Vec; + +impl AutoCrdt for LayoutParameters { + const WARN_IF_DIFFERENT: bool = true; +} + +impl AutoCrdt for NodeRoleV { + const WARN_IF_DIFFERENT: bool = true; +} + +impl NodeRole { + pub fn capacity_string(&self) -> String { + match self.capacity { + Some(c) => ByteSize::b(c).to_string_as(false), + None => "gateway".to_string(), + } + } + + pub fn tags_string(&self) -> String { + self.tags.join(",") + } +} + +impl fmt::Display for ZoneRedundancy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ZoneRedundancy::Maximum => write!(f, "maximum"), + ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), + } + } +} + +impl core::str::FromStr for ZoneRedundancy { + type Err = &'static str; + fn from_str(s: &str) -> Result { + match s { + "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), + x => { + let v = x + .parse::() + .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; + Ok(ZoneRedundancy::AtLeast(v)) + } + } + } +} + +impl LayoutVersion { + pub fn new(replication_factor: usize) -> Self { + // We set the default zone redundancy to be Maximum, meaning that the maximum + // possible value will be used depending on the cluster topology + let parameters = LayoutParameters { + zone_redundancy: ZoneRedundancy::Maximum, + }; + + LayoutVersion { + version: 0, + replication_factor, + partition_size: 0, + roles: LwwMap::new(), + node_id_vec: Vec::new(), + ring_assignment_data: Vec::new(), + parameters, + } + } + + // ===================== accessors ====================== + + /// Returns a list of IDs of nodes that currently have + /// a role in the cluster + pub fn node_ids(&self) -> &[Uuid] { + &self.node_id_vec[..] + } + + pub fn num_nodes(&self) -> usize { + self.node_id_vec.len() + } + + /// Returns the role of a node in the layout + pub fn node_role(&self, node: &Uuid) -> Option<&NodeRole> { + match self.roles.get(node) { + Some(NodeRoleV(Some(v))) => Some(v), + _ => None, + } + } + + /// Given a node uuids, this function returns its capacity or fails if it does not have any + pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { + match self.node_role(uuid) { + Some(NodeRole { + capacity: Some(cap), + zone: _, + tags: _, + }) => Ok(*cap), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity." + .into(), + )), + } + } + + /// Returns the number of partitions associated to this node in the ring + pub fn get_node_usage(&self, uuid: &Uuid) -> Result { + for (i, id) in self.node_id_vec.iter().enumerate() { + if id == uuid { + let mut count = 0; + for nod in self.ring_assignment_data.iter() { + if i as u8 == *nod { + count += 1 + } + } + return Ok(count); + } + } + Err(Error::Message( + "The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity." + .into(), + )) + } + + /// Get the partition in which data would fall on + pub fn partition_of(&self, position: &Hash) -> Partition { + let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap()); + top >> (16 - PARTITION_BITS) + } + + /// Get the list of partitions and the first hash of a partition key that would fall in it + pub fn partitions(&self) -> Vec<(Partition, Hash)> { + (0..(1 << PARTITION_BITS)) + .map(|i| { + let top = (i as u16) << (16 - PARTITION_BITS); + let mut location = [0u8; 32]; + location[..2].copy_from_slice(&u16::to_be_bytes(top)[..]); + (i as u16, Hash::from(location)) + }) + .collect::>() + } + + /// Walk the ring to find the n servers in which data should be replicated + pub fn nodes_of(&self, position: &Hash, n: usize) -> Vec { + assert_eq!(n, self.replication_factor); + + let data = &self.ring_assignment_data; + + if data.len() != self.replication_factor * (1 << PARTITION_BITS) { + warn!("Ring not yet ready, read/writes will be lost!"); + return vec![]; + } + + let partition_idx = self.partition_of(position) as usize; + let partition_start = partition_idx * self.replication_factor; + let partition_end = (partition_idx + 1) * self.replication_factor; + let partition_nodes = &data[partition_start..partition_end]; + + partition_nodes + .iter() + .map(|i| self.node_id_vec[*i as usize]) + .collect::>() + } + + // ===================== internal information extractors ====================== + + /// Returns the uuids of the non_gateway nodes in self.node_id_vec. + pub(crate) fn nongateway_nodes(&self) -> Vec { + let mut result = Vec::::new(); + for uuid in self.node_id_vec.iter() { + match self.node_role(uuid) { + Some(role) if role.capacity.is_some() => result.push(*uuid), + _ => (), + } + } + result + } + + /// Given a node uuids, this function returns the label of its zone + fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { + match self.node_role(uuid) { + Some(role) => Ok(&role.zone), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the cluster.".into(), + )), + } + } + + /// Returns the sum of capacities of non gateway nodes in the cluster + fn get_total_capacity(&self) -> Result { + let mut total_capacity = 0; + for uuid in self.nongateway_nodes().iter() { + total_capacity += self.get_node_capacity(uuid)?; + } + Ok(total_capacity) + } + + /// Returns the effective value of the zone_redundancy parameter + fn effective_zone_redundancy(&self) -> usize { + match self.parameters.zone_redundancy { + ZoneRedundancy::AtLeast(v) => v, + ZoneRedundancy::Maximum => { + let n_zones = self + .roles + .items() + .iter() + .filter_map(|(_, _, role)| role.0.as_ref().map(|x| x.zone.as_str())) + .collect::>() + .len(); + std::cmp::min(n_zones, self.replication_factor) + } + } + } + + /// Check a cluster layout for internal consistency + /// (assignment, roles, parameters, partition size) + /// returns true if consistent, false if error + pub fn check(&self) -> Result<(), String> { + // Check that node_id_vec contains the correct list of nodes + let mut expected_nodes = self + .roles + .items() + .iter() + .filter(|(_, _, v)| v.0.is_some()) + .map(|(id, _, _)| *id) + .collect::>(); + expected_nodes.sort(); + let mut node_id_vec = self.node_id_vec.clone(); + node_id_vec.sort(); + if expected_nodes != node_id_vec { + return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes)); + } + + // Check that the assignment data has the correct length + let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor; + if self.ring_assignment_data.len() != expected_assignment_data_len { + return Err(format!( + "ring_assignment_data has incorrect length {} instead of {}", + self.ring_assignment_data.len(), + expected_assignment_data_len + )); + } + + // Check that the assigned nodes are correct identifiers + // of nodes that are assigned a role + // and that role is not the role of a gateway nodes + for x in self.ring_assignment_data.iter() { + if *x as usize >= self.node_id_vec.len() { + return Err(format!( + "ring_assignment_data contains invalid node id {}", + *x + )); + } + let node = self.node_id_vec[*x as usize]; + match self.roles.get(&node) { + Some(NodeRoleV(Some(x))) if x.capacity.is_some() => (), + _ => return Err("ring_assignment_data contains id of a gateway node".into()), + } + } + + // Check that every partition is associated to distinct nodes + let zone_redundancy = self.effective_zone_redundancy(); + let rf = self.replication_factor; + for p in 0..(1 << PARTITION_BITS) { + let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec(); + if nodes_of_p.iter().unique().count() != rf { + return Err(format!("partition does not contain {} unique node ids", rf)); + } + // Check that every partition is spread over at least zone_redundancy zones. + let zones_of_p = nodes_of_p + .iter() + .map(|n| { + self.get_node_zone(&self.node_id_vec[*n as usize]) + .expect("Zone not found.") + }) + .collect::>(); + if zones_of_p.iter().unique().count() < zone_redundancy { + return Err(format!( + "nodes of partition are in less than {} distinct zones", + zone_redundancy + )); + } + } + + // Check that the nodes capacities is consistent with the stored partitions + let mut node_usage = vec![0; MAX_NODE_NUMBER]; + for n in self.ring_assignment_data.iter() { + node_usage[*n as usize] += 1; + } + for (n, usage) in node_usage.iter().enumerate() { + if *usage > 0 { + let uuid = self.node_id_vec[n]; + let partusage = usage * self.partition_size; + let nodecap = self.get_node_capacity(&uuid).unwrap(); + if partusage > nodecap { + return Err(format!( + "node usage ({}) is bigger than node capacity ({})", + usage * self.partition_size, + nodecap + )); + } + } + } + + // Check that the partition size stored is the one computed by the asignation + // algorithm. + let cl2 = self.clone(); + let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); + match cl2.compute_optimal_partition_size(&zone_to_id, zone_redundancy) { + Ok(s) if s != self.partition_size => { + return Err(format!( + "partition_size ({}) is different than optimal value ({})", + self.partition_size, s + )) + } + Err(e) => return Err(format!("could not calculate optimal partition size: {}", e)), + _ => (), + } + + Ok(()) + } + + // ================== updates to layout, internals =================== + + /// This function calculates a new partition-to-node assignment. + /// The computed assignment respects the node replication factor + /// and the zone redundancy parameter It maximizes the capacity of a + /// partition (assuming all partitions have the same size). + /// Among such optimal assignment, it minimizes the distance to + /// the former assignment (if any) to minimize the amount of + /// data to be moved. + /// Staged role changes must be merged with nodes roles before calling this function, + /// hence it must only be called from apply_staged_changes() and hence is not public. + pub(crate) fn calculate_partition_assignment(&mut self) -> Result { + // We update the node ids, since the node role list might have changed with the + // changes in the layout. We retrieve the old_assignment reframed with new ids + let old_assignment_opt = self.update_node_id_vec()?; + + let zone_redundancy = self.effective_zone_redundancy(); + + let mut msg = Message::new(); + msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); + msg.push("".into()); + msg.push(format!( + "Partitions are \ + replicated {} times on at least {} distinct zones.", + self.replication_factor, zone_redundancy + )); + + // We generate for once numerical ids for the zones of non gateway nodes, + // to use them as indices in the flow graphs. + let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; + + let nb_nongateway_nodes = self.nongateway_nodes().len(); + if nb_nongateway_nodes < self.replication_factor { + return Err(Error::Message(format!( + "The number of nodes with positive \ + capacity ({}) is smaller than the replication factor ({}).", + nb_nongateway_nodes, self.replication_factor + ))); + } + if id_to_zone.len() < zone_redundancy { + return Err(Error::Message(format!( + "The number of zones with non-gateway \ + nodes ({}) is smaller than the redundancy parameter ({})", + id_to_zone.len(), + zone_redundancy + ))); + } + + // We compute the optimal partition size + // Capacities should be given in a unit so that partition size is at least 100. + // In this case, integer rounding plays a marginal role in the percentages of + // optimality. + let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?; + + msg.push("".into()); + if old_assignment_opt.is_some() { + msg.push(format!( + "Optimal partition size: {} ({} in previous layout)", + ByteSize::b(partition_size).to_string_as(false), + ByteSize::b(self.partition_size).to_string_as(false) + )); + } else { + msg.push(format!( + "Optimal partition size: {}", + ByteSize::b(partition_size).to_string_as(false) + )); + } + // We write the partition size. + self.partition_size = partition_size; + + if partition_size < 100 { + msg.push( + "WARNING: The partition size is low (< 100), make sure the capacities of your nodes are correct and are of at least a few MB" + .into(), + ); + } + + // We compute a first flow/assignment that is heuristically close to the previous + // assignment + let mut gflow = + self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt, zone_redundancy)?; + if let Some(assoc) = &old_assignment_opt { + // We minimize the distance to the previous assignment. + self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; + } + + // We display statistics of the computation + msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?); + + // We update the layout structure + self.update_ring_from_flow(id_to_zone.len(), &gflow)?; + + if let Err(e) = self.check() { + return Err(Error::Message( + format!("Layout check returned an error: {}\nOriginal result of computation: <<<<\n{}\n>>>>", e, msg.join("\n")) + )); + } + + Ok(msg) + } + + /// The LwwMap of node roles might have changed. This function updates the node_id_vec + /// and returns the assignment given by ring, with the new indices of the nodes, and + /// None if the node is not present anymore. + /// We work with the assumption that only this function and calculate_new_assignment + /// do modify assignment_ring and node_id_vec. + fn update_node_id_vec(&mut self) -> Result>>, Error> { + // (1) We compute the new node list + // Non gateway nodes should be coded on 8bits, hence they must be first in the list + // We build the new node ids + let new_non_gateway_nodes: Vec = self + .roles + .items() + .iter() + .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity.is_some())) + .map(|(k, _, _)| *k) + .collect(); + + if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { + return Err(Error::Message(format!( + "There are more than {} non-gateway nodes in the new \ + layout. This is not allowed.", + MAX_NODE_NUMBER + ))); + } + + let new_gateway_nodes: Vec = self + .roles + .items() + .iter() + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_none())) + .map(|(k, _, _)| *k) + .collect(); + + let mut new_node_id_vec = Vec::::new(); + new_node_id_vec.extend(new_non_gateway_nodes); + new_node_id_vec.extend(new_gateway_nodes); + + let old_node_id_vec = self.node_id_vec.clone(); + self.node_id_vec = new_node_id_vec.clone(); + + // (2) We retrieve the old association + // We rewrite the old association with the new indices. We only consider partition + // to node assignments where the node is still in use. + if self.ring_assignment_data.is_empty() { + // This is a new association + return Ok(None); + } + + if self.ring_assignment_data.len() != NB_PARTITIONS * self.replication_factor { + return Err(Error::Message( + "The old assignment does not have a size corresponding to \ + the old replication factor or the number of partitions." + .into(), + )); + } + + // We build a translation table between the uuid and new ids + let mut uuid_to_new_id = HashMap::::new(); + + // We add the indices of only the new non-gateway nodes that can be used in the + // association ring + for (i, uuid) in new_node_id_vec.iter().enumerate() { + uuid_to_new_id.insert(*uuid, i); + } + + let mut old_assignment = vec![Vec::::new(); NB_PARTITIONS]; + let rf = self.replication_factor; + + for (p, old_assign_p) in old_assignment.iter_mut().enumerate() { + for old_id in &self.ring_assignment_data[p * rf..(p + 1) * rf] { + let uuid = old_node_id_vec[*old_id as usize]; + if uuid_to_new_id.contains_key(&uuid) { + old_assign_p.push(uuid_to_new_id[&uuid]); + } + } + } + + // We write the ring + self.ring_assignment_data = Vec::::new(); + + Ok(Some(old_assignment)) + } + + /// This function generates ids for the zone of the nodes appearing in + /// self.node_id_vec. + fn generate_nongateway_zone_ids(&self) -> Result<(Vec, HashMap), Error> { + let mut id_to_zone = Vec::::new(); + let mut zone_to_id = HashMap::::new(); + + for uuid in self.nongateway_nodes().iter() { + let r = self.node_role(uuid).unwrap(); + if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() { + zone_to_id.insert(r.zone.clone(), id_to_zone.len()); + id_to_zone.push(r.zone.clone()); + } + } + Ok((id_to_zone, zone_to_id)) + } + + /// This function computes by dichotomy the largest realizable partition size, given + /// the layout roles and parameters. + fn compute_optimal_partition_size( + &self, + zone_to_id: &HashMap, + zone_redundancy: usize, + ) -> Result { + let empty_set = HashSet::<(usize, usize)>::new(); + let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set, zone_redundancy)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { + return Err(Error::Message( + "The storage capacity of he cluster is to small. It is \ + impossible to store partitions of size 1." + .into(), + )); + } + + let mut s_down = 1; + let mut s_up = self.get_total_capacity()?; + while s_down + 1 < s_up { + g = self.generate_flow_graph( + (s_down + s_up) / 2, + zone_to_id, + &empty_set, + zone_redundancy, + )?; + g.compute_maximal_flow()?; + if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { + s_up = (s_down + s_up) / 2; + } else { + s_down = (s_down + s_up) / 2; + } + } + + Ok(s_down) + } + + fn generate_graph_vertices(nb_zones: usize, nb_nodes: usize) -> Vec { + let mut vertices = vec![Vertex::Source, Vertex::Sink]; + for p in 0..NB_PARTITIONS { + vertices.push(Vertex::Pup(p)); + vertices.push(Vertex::Pdown(p)); + for z in 0..nb_zones { + vertices.push(Vertex::PZ(p, z)); + } + } + for n in 0..nb_nodes { + vertices.push(Vertex::N(n)); + } + vertices + } + + /// Generates the graph to compute the maximal flow corresponding to the optimal + /// partition assignment. + /// exclude_assoc is the set of (partition, node) association that we are forbidden + /// to use (hence we do not add the corresponding edge to the graph). This parameter + /// is used to compute a first flow that uses only edges appearing in the previous + /// assignment. This produces a solution that heuristically should be close to the + /// previous one. + fn generate_flow_graph( + &self, + partition_size: u64, + zone_to_id: &HashMap, + exclude_assoc: &HashSet<(usize, usize)>, + zone_redundancy: usize, + ) -> Result, Error> { + let vertices = + LayoutVersion::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); + let mut g = Graph::::new(&vertices); + let nb_zones = zone_to_id.len(); + for p in 0..NB_PARTITIONS { + g.add_edge(Vertex::Source, Vertex::Pup(p), zone_redundancy as u64)?; + g.add_edge( + Vertex::Source, + Vertex::Pdown(p), + (self.replication_factor - zone_redundancy) as u64, + )?; + for z in 0..nb_zones { + g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; + g.add_edge( + Vertex::Pdown(p), + Vertex::PZ(p, z), + self.replication_factor as u64, + )?; + } + } + for n in 0..self.nongateway_nodes().len() { + let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; + let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; + g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; + for p in 0..NB_PARTITIONS { + if !exclude_assoc.contains(&(p, n)) { + g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; + } + } + } + Ok(g) + } + + /// This function computes a first optimal assignment (in the form of a flow graph). + fn compute_candidate_assignment( + &self, + zone_to_id: &HashMap, + prev_assign_opt: &Option>>, + zone_redundancy: usize, + ) -> Result, Error> { + // We list the (partition,node) associations that are not used in the + // previous assignment + let mut exclude_edge = HashSet::<(usize, usize)>::new(); + if let Some(prev_assign) = prev_assign_opt { + let nb_nodes = self.nongateway_nodes().len(); + for (p, prev_assign_p) in prev_assign.iter().enumerate() { + for n in 0..nb_nodes { + exclude_edge.insert((p, n)); + } + for n in prev_assign_p.iter() { + exclude_edge.remove(&(p, *n)); + } + } + } + + // We compute the best flow using only the edges used in the previous assignment + let mut g = self.generate_flow_graph( + self.partition_size, + zone_to_id, + &exclude_edge, + zone_redundancy, + )?; + g.compute_maximal_flow()?; + + // We add the excluded edges and compute the maximal flow with the full graph. + // The algorithm is such that it will start with the flow that we just computed + // and find ameliorating paths from that. + for (p, n) in exclude_edge.iter() { + let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; + g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; + } + g.compute_maximal_flow()?; + Ok(g) + } + + /// This function updates the flow graph gflow to minimize the distance between + /// its corresponding assignment and the previous one + fn minimize_rebalance_load( + &self, + gflow: &mut Graph, + zone_to_id: &HashMap, + prev_assign: &[Vec], + ) -> Result<(), Error> { + // We define a cost function on the edges (pairs of vertices) corresponding + // to the distance between the two assignments. + let mut cost = CostFunction::new(); + for (p, assoc_p) in prev_assign.iter().enumerate() { + for n in assoc_p.iter() { + let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; + cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); + } + } + + // We compute the maximal length of a simple path in gflow. It is used in the + // Bellman-Ford algorithm in optimize_flow_with_cost to set the number + // of iterations. + let nb_nodes = self.nongateway_nodes().len(); + let path_length = 4 * nb_nodes; + gflow.optimize_flow_with_cost(&cost, path_length)?; + + Ok(()) + } + + /// This function updates the assignment ring from the flow graph. + fn update_ring_from_flow( + &mut self, + nb_zones: usize, + gflow: &Graph, + ) -> Result<(), Error> { + self.ring_assignment_data = Vec::::new(); + for p in 0..NB_PARTITIONS { + for z in 0..nb_zones { + let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; + for vertex in assoc_vertex.iter() { + if let Vertex::N(n) = vertex { + self.ring_assignment_data.push((*n).try_into().unwrap()); + } + } + } + } + + if self.ring_assignment_data.len() != NB_PARTITIONS * self.replication_factor { + return Err(Error::Message( + "Critical Error : the association ring we produced does not \ + have the right size." + .into(), + )); + } + Ok(()) + } + + /// This function returns a message summing up the partition repartition of the new + /// layout, and other statistics of the partition assignment computation. + fn output_stat( + &self, + gflow: &Graph, + prev_assign_opt: &Option>>, + zone_to_id: &HashMap, + id_to_zone: &[String], + ) -> Result { + let mut msg = Message::new(); + + let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; + let total_cap = self.get_total_capacity()?; + let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); + msg.push(format!( + "Usable capacity / total cluster capacity: {} / {} ({:.1} %)", + ByteSize::b(used_cap).to_string_as(false), + ByteSize::b(total_cap).to_string_as(false), + percent_cap + )); + msg.push(format!( + "Effective capacity (replication factor {}): {}", + self.replication_factor, + ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false) + )); + if percent_cap < 80. { + msg.push("".into()); + msg.push( + "If the percentage is too low, it might be that the \ + cluster topology and redundancy constraints are forcing the use of nodes/zones with small \ + storage capacities." + .into(), + ); + msg.push( + "You might want to move storage capacity between zones or relax the redundancy constraint." + .into(), + ); + msg.push( + "See the detailed statistics below and look for saturated nodes/zones.".into(), + ); + } + + // We define and fill in the following tables + let storing_nodes = self.nongateway_nodes(); + let mut new_partitions = vec![0; storing_nodes.len()]; + let mut stored_partitions = vec![0; storing_nodes.len()]; + + let mut new_partitions_zone = vec![0; id_to_zone.len()]; + let mut stored_partitions_zone = vec![0; id_to_zone.len()]; + + for p in 0..NB_PARTITIONS { + for z in 0..id_to_zone.len() { + let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; + if !pz_nodes.is_empty() { + stored_partitions_zone[z] += 1; + if let Some(prev_assign) = prev_assign_opt { + let mut old_zones_of_p = Vec::::new(); + for n in prev_assign[p].iter() { + old_zones_of_p + .push(zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]); + } + if !old_zones_of_p.contains(&z) { + new_partitions_zone[z] += 1; + } + } + } + for vert in pz_nodes.iter() { + if let Vertex::N(n) = *vert { + stored_partitions[n] += 1; + if let Some(prev_assign) = prev_assign_opt { + if !prev_assign[p].contains(&n) { + new_partitions[n] += 1; + } + } + } + } + } + } + + if prev_assign_opt.is_none() { + new_partitions = stored_partitions.clone(); + //new_partitions_zone = stored_partitions_zone.clone(); + } + + // We display the statistics + + msg.push("".into()); + if prev_assign_opt.is_some() { + let total_new_partitions: usize = new_partitions.iter().sum(); + msg.push(format!( + "A total of {} new copies of partitions need to be \ + transferred.", + total_new_partitions + )); + msg.push("".into()); + } + + let mut table = vec![]; + for z in 0..id_to_zone.len() { + let mut nodes_of_z = Vec::::new(); + for n in 0..storing_nodes.len() { + if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { + nodes_of_z.push(n); + } + } + let replicated_partitions: usize = + nodes_of_z.iter().map(|n| stored_partitions[*n]).sum(); + table.push(format!( + "{}\tTags\tPartitions\tCapacity\tUsable capacity", + id_to_zone[z] + )); + + let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; + let mut total_cap_z = 0; + for n in nodes_of_z.iter() { + total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; + } + let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); + + for n in nodes_of_z.iter() { + let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; + let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; + let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or(""))?.tags_string(); + table.push(format!( + " {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)", + self.node_id_vec[*n], + tags_n, + stored_partitions[*n], + new_partitions[*n], + ByteSize::b(total_cap_n).to_string_as(false), + ByteSize::b(available_cap_n).to_string_as(false), + (available_cap_n as f32) / (total_cap_n as f32) * 100.0, + )); + } + + table.push(format!( + " TOTAL\t\t{} ({} unique)\t{}\t{} ({:.1}%)", + replicated_partitions, + stored_partitions_zone[z], + //new_partitions_zone[z], + ByteSize::b(total_cap_z).to_string_as(false), + ByteSize::b(available_cap_z).to_string_as(false), + percent_cap_z + )); + table.push("".into()); + } + msg.push(format_table::format_table_to_string(table)); + + Ok(msg) + } +} + +// ==================================================================================== + +#[cfg(test)] +mod tests { + use super::{Error, *}; + use std::cmp::min; + + // This function checks that the partition size S computed is at least better than the + // one given by a very naive algorithm. To do so, we try to run the naive algorithm + // assuming a partion size of S+1. If we succed, it means that the optimal assignment + // was not optimal. The naive algorithm is the following : + // - we compute the max number of partitions associated to every node, capped at the + // partition number. It gives the number of tokens of every node. + // - every zone has a number of tokens equal to the sum of the tokens of its nodes. + // - we cycle over the partitions and associate zone tokens while respecting the + // zone redundancy constraint. + // NOTE: the naive algorithm is not optimal. Counter example: + // take nb_partition = 3 ; replication_factor = 5; redundancy = 4; + // number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) + // With these parameters, the naive algo fails, whereas there is a solution: + // (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) + fn check_against_naive(cl: &LayoutVersion) -> Result { + let over_size = cl.partition_size + 1; + let mut zone_token = HashMap::::new(); + + let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; + + if zones.is_empty() { + return Ok(false); + } + + for z in zones.iter() { + zone_token.insert(z.clone(), 0); + } + for uuid in cl.nongateway_nodes().iter() { + let z = cl.get_node_zone(uuid)?; + let c = cl.get_node_capacity(uuid)?; + zone_token.insert( + z.clone(), + zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), + ); + } + + // For every partition, we count the number of zone already associated and + // the name of the last zone associated + + let mut id_zone_token = vec![0; zones.len()]; + for (z, t) in zone_token.iter() { + id_zone_token[zone_to_id[z]] = *t; + } + + let mut nb_token = vec![0; NB_PARTITIONS]; + let mut last_zone = vec![zones.len(); NB_PARTITIONS]; + + let mut curr_zone = 0; + + let redundancy = cl.effective_zone_redundancy(); + + for replic in 0..cl.replication_factor { + for p in 0..NB_PARTITIONS { + while id_zone_token[curr_zone] == 0 + || (last_zone[p] == curr_zone + && redundancy - nb_token[p] <= cl.replication_factor - replic) + { + curr_zone += 1; + if curr_zone >= zones.len() { + return Ok(true); + } + } + id_zone_token[curr_zone] -= 1; + if last_zone[p] != curr_zone { + nb_token[p] += 1; + last_zone[p] = curr_zone; + } + } + } + + return Ok(false); + } + + fn show_msg(msg: &Message) { + for s in msg.iter() { + println!("{}", s); + } + } + + fn update_layout( + cl: &mut LayoutVersion, + node_id_vec: &Vec, + node_capacity_vec: &Vec, + node_zone_vec: &Vec, + zone_redundancy: usize, + ) { + for i in 0..node_id_vec.len() { + if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { + cl.node_id_vec.push(x); + } + + let update = cl.staging_roles.update_mutator( + cl.node_id_vec[i], + NodeRoleV(Some(NodeRole { + zone: (node_zone_vec[i].to_string()), + capacity: (Some(node_capacity_vec[i])), + tags: (vec![]), + })), + ); + cl.staging_roles.merge(&update); + } + cl.staging_parameters.update(LayoutParameters { + zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), + }); + cl.staging_hash = cl.calculate_staging_hash(); + } + + #[test] + fn test_assignment() { + let mut node_id_vec = vec![1, 2, 3]; + let mut node_capacity_vec = vec![4000, 1000, 2000]; + let mut node_zone_vec = vec!["A", "B", "C"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + + let mut cl = LayoutVersion::new(3); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); + + node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; + node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; + node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); + + node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); + + node_capacity_vec = vec![ + 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, + ]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); + let v = cl.version; + let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); + } +} diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 56bef2f3..3fdb4acd 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -26,7 +26,7 @@ use garage_util::data::*; use garage_util::error::Error; use garage_util::metrics::RecordDuration; -use crate::layout::ClusterLayout; +use crate::layout::LayoutHistory; use crate::metrics::RpcMetrics; // Default RPC timeout = 5 minutes @@ -91,7 +91,7 @@ pub struct RpcHelper(Arc); struct RpcHelperInner { our_node_id: Uuid, fullmesh: Arc, - layout_watch: watch::Receiver>, + layout_watch: watch::Receiver>, metrics: RpcMetrics, rpc_timeout: Duration, } @@ -100,7 +100,7 @@ impl RpcHelper { pub(crate) fn new( our_node_id: Uuid, fullmesh: Arc, - layout_watch: watch::Receiver>, + layout_watch: watch::Receiver>, rpc_timeout: Option, ) -> Self { let metrics = RpcMetrics::new(); @@ -392,8 +392,8 @@ impl RpcHelper { pub fn request_order(&self, nodes: &[Uuid]) -> Vec { // Retrieve some status variables that we will use to sort requests let peer_list = self.0.fullmesh.get_peer_list(); - let layout: Arc = self.0.layout_watch.borrow().clone(); - let our_zone = match layout.node_role(&self.0.our_node_id) { + let layout: Arc = self.0.layout_watch.borrow().clone(); + let our_zone = match layout.current().node_role(&self.0.our_node_id) { Some(pc) => &pc.zone, None => "", }; @@ -407,7 +407,7 @@ impl RpcHelper { let mut nodes = nodes .iter() .map(|to| { - let peer_zone = match layout.node_role(to) { + let peer_zone = match layout.current().node_role(to) { Some(pc) => &pc.zone, None => "", }; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 93144e39..86d724f1 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -64,7 +64,7 @@ pub enum SystemRpc { /// Exchanged with every node on a regular basis. AdvertiseStatus(NodeStatus), /// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout - AdvertiseClusterLayout(ClusterLayout), + AdvertiseClusterLayout(LayoutHistory), /// Get known nodes states GetKnownNodes, /// Return known nodes @@ -84,7 +84,7 @@ pub struct System { /// The id of this node pub id: Uuid, - persist_cluster_layout: Persister, + persist_cluster_layout: Persister, persist_peer_list: Persister, local_status: ArcSwap, @@ -112,8 +112,8 @@ pub struct System { replication_factor: usize, /// The layout - pub layout_watch: watch::Receiver>, - update_layout: Mutex>>, + pub layout_watch: watch::Receiver>, + update_layout: Mutex>>, /// Path to metadata directory pub metadata_dir: PathBuf, @@ -256,16 +256,16 @@ impl System { hex::encode(&node_key.public_key()[..8]) ); - let persist_cluster_layout: Persister = + let persist_cluster_layout: Persister = Persister::new(&config.metadata_dir, "cluster_layout"); let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list"); let cluster_layout = match persist_cluster_layout.load() { Ok(x) => { - if x.replication_factor != replication_factor { + if x.current().replication_factor != replication_factor { return Err(Error::Message(format!( "Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", - x.replication_factor, + x.current().replication_factor, replication_factor ))); } @@ -276,7 +276,7 @@ impl System { "No valid previous cluster layout stored ({}), starting fresh.", e ); - ClusterLayout::new(replication_factor) + LayoutHistory::new(replication_factor) } }; @@ -423,13 +423,13 @@ impl System { known_nodes } - pub fn cluster_layout(&self) -> watch::Ref> { + pub fn cluster_layout(&self) -> watch::Ref> { self.layout_watch.borrow() } pub async fn update_cluster_layout( self: &Arc, - layout: &ClusterLayout, + layout: &LayoutHistory, ) -> Result<(), Error> { self.handle_advertise_cluster_layout(layout).await?; Ok(()) @@ -475,7 +475,9 @@ impl System { .collect::>(); let connected_nodes = nodes.iter().filter(|(_, n)| n.is_up).count(); + // TODO: not only layout.current() let storage_nodes = layout + .current() .roles .items() .iter() @@ -486,11 +488,11 @@ impl System { .filter(|(x, _, _)| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) .count(); - let partitions = layout.partitions(); + let partitions = layout.current().partitions(); let partitions_n_up = partitions .iter() .map(|(_, h)| { - let pn = layout.nodes_of(h, layout.replication_factor); + let pn = layout.current().nodes_of(h, replication_factor); pn.iter() .filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) .count() @@ -581,7 +583,7 @@ impl System { /// Save network configuration to disc async fn save_cluster_layout(&self) -> Result<(), Error> { - let layout: Arc = self.layout_watch.borrow().clone(); + let layout: Arc = self.layout_watch.borrow().clone(); self.persist_cluster_layout .save_async(&layout) .await @@ -593,7 +595,7 @@ impl System { let mut new_si: NodeStatus = self.local_status.load().as_ref().clone(); let layout = self.layout_watch.borrow(); - new_si.cluster_layout_version = layout.version; + new_si.cluster_layout_version = layout.current().version; new_si.cluster_layout_staging_hash = layout.staging_hash; new_si.update_disk_usage(&self.metadata_dir, &self.data_dir, &self.metrics); @@ -648,12 +650,12 @@ impl System { async fn handle_advertise_cluster_layout( self: &Arc, - adv: &ClusterLayout, + adv: &LayoutHistory, ) -> Result { - if adv.replication_factor != self.replication_factor { + if adv.current().replication_factor != self.replication_factor { let msg = format!( "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.", - adv.replication_factor, + adv.current().replication_factor, self.replication_factor ); error!("{}", msg); @@ -662,7 +664,7 @@ impl System { let update_layout = self.update_layout.lock().await; // TODO: don't clone each time an AdvertiseClusterLayout is received - let mut layout: ClusterLayout = self.layout_watch.borrow().as_ref().clone(); + let mut layout: LayoutHistory = self.layout_watch.borrow().as_ref().clone(); let prev_layout_check = layout.check().is_ok(); if layout.merge(adv) { @@ -724,7 +726,7 @@ impl System { while !*stop_signal.borrow() { let not_configured = self.layout_watch.borrow().check().is_err(); let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor; - let expected_n_nodes = self.layout_watch.borrow().num_nodes(); + let expected_n_nodes = self.layout_watch.borrow().current().num_nodes(); let bad_peers = self .fullmesh .get_peer_list() @@ -863,13 +865,13 @@ impl EndpointHandler for System { } impl NodeStatus { - fn initial(replication_factor: usize, layout: &ClusterLayout) -> Self { + fn initial(replication_factor: usize, layout: &LayoutHistory) -> Self { NodeStatus { hostname: gethostname::gethostname() .into_string() .unwrap_or_else(|_| "".to_string()), replication_factor, - cluster_layout_version: layout.version, + cluster_layout_version: layout.current().version, cluster_layout_staging_hash: layout.staging_hash, meta_disk_avail: None, data_disk_avail: None, -- cgit v1.2.3 From 8dccee3ccfe7793c42203f28c1e91c6f989b6899 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 8 Nov 2023 19:28:36 +0100 Subject: cluster layout: adapt all uses of ClusterLayout to LayoutHistory --- src/api/admin/cluster.rs | 9 +++++---- src/api/k2v/index.rs | 9 ++++++--- src/garage/admin/mod.rs | 14 +++++++------- src/garage/cli/cmd.rs | 17 +++++++++++------ src/garage/cli/layout.rs | 38 +++++++++++++++++++++----------------- src/model/helper/bucket.rs | 9 ++++++--- src/model/index_counter.rs | 6 +++--- src/rpc/layout/history.rs | 17 +++++------------ src/rpc/layout/schema.rs | 5 ++--- src/table/replication/fullcopy.rs | 5 ++--- src/table/replication/sharded.rs | 16 ++++++++++------ src/table/sync.rs | 4 ++-- 12 files changed, 80 insertions(+), 69 deletions(-) (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 01ff9885..6dd2e8da 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -89,8 +89,9 @@ pub async fn handle_get_cluster_layout(garage: &Arc) -> Result GetClusterLayoutResponse { +fn format_cluster_layout(layout: &layout::LayoutHistory) -> GetClusterLayoutResponse { let roles = layout + .current() .roles .items() .iter() @@ -107,7 +108,7 @@ fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResp .staging_roles .items() .iter() - .filter(|(k, _, v)| layout.roles.get(k) != Some(v)) + .filter(|(k, _, v)| layout.current().roles.get(k) != Some(v)) .map(|(k, _, v)| match &v.0 { None => NodeRoleChange { id: hex::encode(k), @@ -125,7 +126,7 @@ fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResp .collect::>(); GetClusterLayoutResponse { - version: layout.version, + version: layout.current().version, roles, staged_role_changes, } @@ -209,7 +210,7 @@ pub async fn handle_update_cluster_layout( let mut layout = garage.system.cluster_layout().as_ref().clone(); - let mut roles = layout.roles.clone(); + let mut roles = layout.current().roles.clone(); roles.merge(&layout.staging_roles); for change in updates { diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs index 3fc39de6..a9bc3826 100644 --- a/src/api/k2v/index.rs +++ b/src/api/k2v/index.rs @@ -5,7 +5,7 @@ use serde::Serialize; use garage_util::data::*; -use garage_rpc::layout::ClusterLayout; +use garage_rpc::layout::LayoutHistory; use garage_table::util::*; use garage_model::garage::Garage; @@ -26,7 +26,7 @@ pub async fn handle_read_index( ) -> Result, Error> { let reverse = reverse.unwrap_or(false); - let layout: Arc = garage.system.cluster_layout().clone(); + let layout: Arc = garage.system.cluster_layout().clone(); let (partition_keys, more, next_start) = read_range( &garage.k2v.counter_table.table, @@ -35,7 +35,10 @@ pub async fn handle_read_index( &start, &end, limit, - Some((DeletedFilter::NotDeleted, layout.node_id_vec.clone())), + Some(( + DeletedFilter::NotDeleted, + layout.current().node_id_vec.clone(), + )), EnumerationOrder::from_reverse(reverse), ) .await?; diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index c3fa801a..e3ba6d35 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -127,7 +127,7 @@ impl AdminRpcHandler { let mut failures = vec![]; let layout = self.garage.system.cluster_layout().clone(); - for node in layout.node_ids().iter() { + for node in layout.current().node_ids().iter() { let node = (*node).into(); let resp = self .endpoint @@ -165,7 +165,7 @@ impl AdminRpcHandler { let mut ret = String::new(); let layout = self.garage.system.cluster_layout().clone(); - for node in layout.node_ids().iter() { + for node in layout.current().node_ids().iter() { let mut opt = opt.clone(); opt.all_nodes = false; opt.skip_global = true; @@ -277,8 +277,8 @@ impl AdminRpcHandler { // Gather storage node and free space statistics let layout = &self.garage.system.cluster_layout(); let mut node_partition_count = HashMap::::new(); - for short_id in layout.ring_assignment_data.iter() { - let id = layout.node_id_vec[*short_id as usize]; + for short_id in layout.current().ring_assignment_data.iter() { + let id = layout.current().node_id_vec[*short_id as usize]; *node_partition_count.entry(id).or_default() += 1; } let node_info = self @@ -293,7 +293,7 @@ impl AdminRpcHandler { for (id, parts) in node_partition_count.iter() { let info = node_info.get(id); let status = info.map(|x| &x.status); - let role = layout.roles.get(id).and_then(|x| x.0.as_ref()); + let role = layout.current().roles.get(id).and_then(|x| x.0.as_ref()); let hostname = status.map(|x| x.hostname.as_str()).unwrap_or("?"); let zone = role.map(|x| x.zone.as_str()).unwrap_or("?"); let capacity = role @@ -441,7 +441,7 @@ impl AdminRpcHandler { if all_nodes { let mut ret = vec![]; let layout = self.garage.system.cluster_layout().clone(); - for node in layout.node_ids().iter() { + for node in layout.current().node_ids().iter() { let node = (*node).into(); match self .endpoint @@ -489,7 +489,7 @@ impl AdminRpcHandler { if all_nodes { let mut ret = vec![]; let layout = self.garage.system.cluster_layout().clone(); - for node in layout.node_ids().iter() { + for node in layout.current().node_ids().iter() { let node = (*node).into(); match self .endpoint diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 48359614..8be43873 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -62,7 +62,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let mut healthy_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; for adv in status.iter().filter(|adv| adv.is_up) { - match layout.roles.get(&adv.id) { + match layout.current().roles.get(&adv.id) { Some(NodeRoleV(Some(cfg))) => { let data_avail = match &adv.status.data_disk_avail { _ if cfg.capacity.is_none() => "N/A".into(), @@ -102,10 +102,15 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> format_table(healthy_nodes); let status_keys = status.iter().map(|adv| adv.id).collect::>(); - let failure_case_1 = status - .iter() - .any(|adv| !adv.is_up && matches!(layout.roles.get(&adv.id), Some(NodeRoleV(Some(_))))); + let failure_case_1 = status.iter().any(|adv| { + !adv.is_up + && matches!( + layout.current().roles.get(&adv.id), + Some(NodeRoleV(Some(_))) + ) + }); let failure_case_2 = layout + .current() .roles .items() .iter() @@ -115,7 +120,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let mut failed_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; for adv in status.iter().filter(|adv| !adv.is_up) { - if let Some(NodeRoleV(Some(cfg))) = layout.roles.get(&adv.id) { + if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { let tf = timeago::Formatter::new(); failed_nodes.push(format!( "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", @@ -132,7 +137,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> )); } } - for (id, _, role_v) in layout.roles.items().iter() { + for (id, _, role_v) in layout.current().roles.items().iter() { if let NodeRoleV(Some(cfg)) = role_v { if !status_keys.contains(id) { failed_nodes.push(format!( diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index ce2b11e0..4a617337 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -58,17 +58,18 @@ pub async fn cmd_assign_role( status .iter() .map(|adv| adv.id) - .chain(layout.node_ids().iter().cloned()), + .chain(layout.current().node_ids().iter().cloned()), node_id, ) }) .collect::, _>>()?; - let mut roles = layout.roles.clone(); + let mut roles = layout.current().roles.clone(); roles.merge(&layout.staging_roles); for replaced in args.replace.iter() { - let replaced_node = find_matching_node(layout.node_ids().iter().cloned(), replaced)?; + let replaced_node = + find_matching_node(layout.current().node_ids().iter().cloned(), replaced)?; match roles.get(&replaced_node) { Some(NodeRoleV(Some(_))) => { layout @@ -149,7 +150,7 @@ pub async fn cmd_remove_role( ) -> Result<(), Error> { let mut layout = fetch_layout(rpc_cli, rpc_host).await?; - let mut roles = layout.roles.clone(); + let mut roles = layout.current().roles.clone(); roles.merge(&layout.staging_roles); let deleted_node = @@ -174,13 +175,16 @@ pub async fn cmd_show_layout( let layout = fetch_layout(rpc_cli, rpc_host).await?; println!("==== CURRENT CLUSTER LAYOUT ===="); - print_cluster_layout(&layout, "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes."); + print_cluster_layout(layout.current(), "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes."); println!(); - println!("Current cluster layout version: {}", layout.version); + println!( + "Current cluster layout version: {}", + layout.current().version + ); let has_role_changes = print_staging_role_changes(&layout); if has_role_changes { - let v = layout.version; + let v = layout.current().version; let res_apply = layout.apply_staged_changes(Some(v + 1)); // this will print the stats of what partitions @@ -189,7 +193,7 @@ pub async fn cmd_show_layout( Ok((layout, msg)) => { println!(); println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ===="); - print_cluster_layout(&layout, "No nodes have a role in the new layout."); + print_cluster_layout(layout.current(), "No nodes have a role in the new layout."); println!(); for line in msg.iter() { @@ -266,11 +270,11 @@ pub async fn cmd_config_layout( .parse::() .ok_or_message("invalid zone redundancy value")?; if let ZoneRedundancy::AtLeast(r_int) = r { - if r_int > layout.replication_factor { + if r_int > layout.current().replication_factor { return Err(Error::Message(format!( "The zone redundancy must be smaller or equal to the \ replication factor ({}).", - layout.replication_factor + layout.current().replication_factor ))); } else if r_int < 1 { return Err(Error::Message( @@ -302,7 +306,7 @@ pub async fn cmd_config_layout( pub async fn fetch_layout( rpc_cli: &Endpoint, rpc_host: NodeID, -) -> Result { +) -> Result { match rpc_cli .call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL) .await?? @@ -315,7 +319,7 @@ pub async fn fetch_layout( pub async fn send_layout( rpc_cli: &Endpoint, rpc_host: NodeID, - layout: ClusterLayout, + layout: LayoutHistory, ) -> Result<(), Error> { rpc_cli .call( @@ -327,7 +331,7 @@ pub async fn send_layout( Ok(()) } -pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) { +pub fn print_cluster_layout(layout: &LayoutVersion, empty_msg: &str) { let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()]; for (id, _, role) in layout.roles.items().iter() { let role = match &role.0 { @@ -366,13 +370,13 @@ pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) { } } -pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { +pub fn print_staging_role_changes(layout: &LayoutHistory) -> bool { let has_role_changes = layout .staging_roles .items() .iter() - .any(|(k, _, v)| layout.roles.get(k) != Some(v)); - let has_layout_changes = *layout.staging_parameters.get() != layout.parameters; + .any(|(k, _, v)| layout.current().roles.get(k) != Some(v)); + let has_layout_changes = *layout.staging_parameters.get() != layout.current().parameters; if has_role_changes || has_layout_changes { println!(); @@ -380,7 +384,7 @@ pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { if has_role_changes { let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; for (id, _, role) in layout.staging_roles.items().iter() { - if layout.roles.get(id) == Some(role) { + if layout.current().roles.get(id) == Some(role) { continue; } if let Some(role) = &role.0 { diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index 8cd5b27b..18904c8d 100644 --- a/src/model/helper/bucket.rs +++ b/src/model/helper/bucket.rs @@ -450,10 +450,10 @@ impl<'a> BucketHelper<'a> { #[cfg(feature = "k2v")] { - use garage_rpc::layout::ClusterLayout; + use garage_rpc::layout::LayoutHistory; use std::sync::Arc; - let layout: Arc = self.0.system.cluster_layout().clone(); + let layout: Arc = self.0.system.cluster_layout().clone(); let k2vindexes = self .0 .k2v @@ -462,7 +462,10 @@ impl<'a> BucketHelper<'a> { .get_range( &bucket_id, None, - Some((DeletedFilter::NotDeleted, layout.node_id_vec.clone())), + Some(( + DeletedFilter::NotDeleted, + layout.current().node_id_vec.clone(), + )), 10, EnumerationOrder::Forward, ) diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs index d514cb06..9637cc4c 100644 --- a/src/model/index_counter.rs +++ b/src/model/index_counter.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use garage_db as db; -use garage_rpc::layout::ClusterLayout; +use garage_rpc::layout::LayoutHistory; use garage_rpc::system::System; use garage_util::background::BackgroundRunner; use garage_util::data::*; @@ -83,8 +83,8 @@ impl Entry for CounterEntry { } impl CounterEntry { - pub fn filtered_values(&self, layout: &ClusterLayout) -> HashMap { - let nodes = &layout.node_id_vec[..]; + pub fn filtered_values(&self, layout: &LayoutHistory) -> HashMap { + let nodes = &layout.current().node_id_vec[..]; self.filtered_values_with_nodes(nodes) } diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index b3019f58..e59c9e9c 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -1,5 +1,4 @@ use std::cmp::Ordering; -use std::sync::Arc; use garage_util::crdt::{Crdt, Lww, LwwMap}; use garage_util::data::*; @@ -64,24 +63,22 @@ impl LayoutHistory { } // Add any new versions to history - let mut versions = self.versions.to_vec(); for v2 in other.versions.iter() { - if let Some(v1) = versions.iter().find(|v| v.version == v2.version) { + if let Some(v1) = self.versions.iter().find(|v| v.version == v2.version) { if v1 != v2 { error!("Inconsistent layout histories: different layout compositions for version {}. Your cluster will be broken as long as this layout version is not replaced.", v2.version); } - } else if versions.iter().all(|v| v.version != v2.version - 1) { + } else if self.versions.iter().all(|v| v.version != v2.version - 1) { error!( "Cannot receive new layout version {}, version {} is missing", v2.version, v2.version - 1 ); } else { - versions.push(v2.clone()); + self.versions.push(v2.clone()); changed = true; } } - self.versions = Arc::from(versions.into_boxed_slice()); // Merge trackers self.update_trackers.merge(&other.update_trackers); @@ -117,9 +114,7 @@ To know the correct value of the new layout version, invoke `garage layout show` let msg = new_version.calculate_partition_assignment()?; - let mut versions = self.versions.to_vec(); - versions.push(new_version); - self.versions = Arc::from(versions.into_boxed_slice()); + self.versions.push(new_version); Ok((self, msg)) } @@ -149,9 +144,7 @@ To know the correct value of the new layout version, invoke `garage layout show` let mut new_version = self.current().clone(); new_version.version += 1; - let mut versions = self.versions.to_vec(); - versions.push(new_version); - self.versions = Arc::from(versions.into_boxed_slice()); + self.versions.push(new_version); Ok(self) } diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index fa0822fa..14e797be 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -184,7 +184,6 @@ mod v010 { use garage_util::data::{Hash, Uuid}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; - use std::sync::Arc; pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; /// The layout of the cluster, i.e. the list of roles @@ -215,7 +214,7 @@ mod v010 { #[derive(Clone, Debug, Serialize, Deserialize)] pub struct LayoutHistory { /// The versions currently in use in the cluster - pub versions: Arc<[LayoutVersion]>, + pub versions: Vec, /// Update trackers pub update_trackers: UpdateTrackers, @@ -267,7 +266,7 @@ mod v010 { .collect::>(), ); let mut ret = Self { - versions: Arc::from(vec![version].into_boxed_slice()), + versions: vec![version], update_trackers: UpdateTrackers { ack_map: update_tracker.clone(), sync_map: update_tracker.clone(), diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index 34807e3d..a5c83d0f 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -27,11 +27,10 @@ impl TableReplication for TableFullReplication { } fn write_nodes(&self, _hash: &Hash) -> Vec { - let layout = self.system.cluster_layout(); - layout.node_ids().to_vec() + self.system.cluster_layout().current().node_ids().to_vec() } fn write_quorum(&self) -> usize { - let nmembers = self.system.cluster_layout().node_ids().len(); + let nmembers = self.system.cluster_layout().current().node_ids().len(); if nmembers > self.max_faults { nmembers - self.max_faults } else { diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 60c95cb4..793d87fd 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -26,16 +26,20 @@ pub struct TableShardedReplication { impl TableReplication for TableShardedReplication { fn read_nodes(&self, hash: &Hash) -> Vec { - let layout = self.system.cluster_layout(); - layout.nodes_of(hash, self.replication_factor) + self.system + .cluster_layout() + .current() + .nodes_of(hash, self.replication_factor) } fn read_quorum(&self) -> usize { self.read_quorum } fn write_nodes(&self, hash: &Hash) -> Vec { - let layout = self.system.cluster_layout(); - layout.nodes_of(hash, self.replication_factor) + self.system + .cluster_layout() + .current() + .nodes_of(hash, self.replication_factor) } fn write_quorum(&self) -> usize { self.write_quorum @@ -45,9 +49,9 @@ impl TableReplication for TableShardedReplication { } fn partition_of(&self, hash: &Hash) -> Partition { - self.system.cluster_layout().partition_of(hash) + self.system.cluster_layout().current().partition_of(hash) } fn partitions(&self) -> Vec<(Partition, Hash)> { - self.system.cluster_layout().partitions() + self.system.cluster_layout().current().partitions() } } diff --git a/src/table/sync.rs b/src/table/sync.rs index 65eff7cd..620d83b9 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -492,8 +492,8 @@ impl EndpointHandler for TableSync struct SyncWorker { syncer: Arc>, - layout_watch: watch::Receiver>, - layout: Arc, + layout_watch: watch::Receiver>, + layout: Arc, add_full_sync_rx: mpsc::UnboundedReceiver<()>, todo: Vec, next_full_sync: Instant, -- cgit v1.2.3 From 1da0a5676edcd20fc5c7412596edb5772da9f606 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 8 Nov 2023 19:30:58 +0100 Subject: bump garage protocol version tag to 0x000A (0.10) --- src/rpc/system.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 86d724f1..a7433b68 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -46,7 +46,7 @@ const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10); /// Version tag used for version check upon Netapp connection. /// Cluster nodes with different version tags are deemed /// incompatible and will refuse to connect. -pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650008; // garage 0x0008 +pub const GARAGE_VERSION_TAG: u64 = 0x676172616765000A; // garage 0x000A /// RPC endpoint used for calls related to membership pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc"; -- cgit v1.2.3 From 523d2ecb9511f74e144cd116b942d6c1bf0f546d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 11:19:43 +0100 Subject: layout: use separate CRDT for staged layout changes --- src/api/admin/api_server.rs | 2 +- src/api/admin/cluster.rs | 23 ++- src/garage/cli/cmd.rs | 2 +- src/garage/cli/layout.rs | 47 +++-- src/garage/cli/structs.rs | 6 +- src/rpc/graph_algo.rs | 415 ------------------------------------------- src/rpc/layout/graph_algo.rs | 405 +++++++++++++++++++++++++++++++++++++++++ src/rpc/layout/history.rs | 82 +++------ src/rpc/layout/mod.rs | 4 +- src/rpc/layout/schema.rs | 106 ++++++++++- src/rpc/layout/tracker.rs | 21 --- src/rpc/layout/version.rs | 54 +----- src/rpc/lib.rs | 1 - 13 files changed, 580 insertions(+), 588 deletions(-) delete mode 100644 src/rpc/graph_algo.rs create mode 100644 src/rpc/layout/graph_algo.rs delete mode 100644 src/rpc/layout/tracker.rs (limited to 'src') diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs index 4779f924..d9bd600e 100644 --- a/src/api/admin/api_server.rs +++ b/src/api/admin/api_server.rs @@ -279,7 +279,7 @@ impl ApiHandler for AdminApiServer { Endpoint::GetClusterLayout => handle_get_cluster_layout(&self.garage).await, Endpoint::UpdateClusterLayout => handle_update_cluster_layout(&self.garage, req).await, Endpoint::ApplyClusterLayout => handle_apply_cluster_layout(&self.garage, req).await, - Endpoint::RevertClusterLayout => handle_revert_cluster_layout(&self.garage, req).await, + Endpoint::RevertClusterLayout => handle_revert_cluster_layout(&self.garage).await, // Keys Endpoint::ListKeys => handle_list_keys(&self.garage).await, Endpoint::GetKeyInfo { diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 6dd2e8da..fe8e8764 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -105,7 +105,9 @@ fn format_cluster_layout(layout: &layout::LayoutHistory) -> GetClusterLayoutResp .collect::>(); let staged_role_changes = layout - .staging_roles + .staging + .get() + .roles .items() .iter() .filter(|(k, _, v)| layout.current().roles.get(k) != Some(v)) @@ -211,7 +213,7 @@ pub async fn handle_update_cluster_layout( let mut layout = garage.system.cluster_layout().as_ref().clone(); let mut roles = layout.current().roles.clone(); - roles.merge(&layout.staging_roles); + roles.merge(&layout.staging.get().roles); for change in updates { let node = hex::decode(&change.id).ok_or_bad_request("Invalid node identifier")?; @@ -232,7 +234,9 @@ pub async fn handle_update_cluster_layout( }; layout - .staging_roles + .staging + .get_mut() + .roles .merge(&roles.update_mutator(node, layout::NodeRoleV(new_role))); } @@ -246,7 +250,7 @@ pub async fn handle_apply_cluster_layout( garage: &Arc, req: Request, ) -> Result, Error> { - let param = parse_json_body::(req).await?; + let param = parse_json_body::(req).await?; let layout = garage.system.cluster_layout().as_ref().clone(); let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; @@ -260,14 +264,9 @@ pub async fn handle_apply_cluster_layout( Ok(json_ok_response(&res)?) } -pub async fn handle_revert_cluster_layout( - garage: &Arc, - req: Request, -) -> Result, Error> { - let param = parse_json_body::(req).await?; - +pub async fn handle_revert_cluster_layout(garage: &Arc) -> Result, Error> { let layout = garage.system.cluster_layout().as_ref().clone(); - let layout = layout.revert_staged_changes(Some(param.version))?; + let layout = layout.revert_staged_changes()?; garage.system.update_cluster_layout(&layout).await?; let res = format_cluster_layout(&layout); @@ -280,7 +279,7 @@ type UpdateClusterLayoutRequest = Vec; #[derive(Deserialize)] #[serde(rename_all = "camelCase")] -struct ApplyRevertLayoutRequest { +struct ApplyLayoutRequest { version: u64, } diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 8be43873..1a054025 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -85,7 +85,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> )); } _ => { - let new_role = match layout.staging_roles.get(&adv.id) { + let new_role = match layout.staging.get().roles.get(&adv.id) { Some(NodeRoleV(Some(_))) => "(pending)", _ => "NO ROLE ASSIGNED", }; diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 4a617337..269d92f4 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -65,7 +65,7 @@ pub async fn cmd_assign_role( .collect::, _>>()?; let mut roles = layout.current().roles.clone(); - roles.merge(&layout.staging_roles); + roles.merge(&layout.staging.get().roles); for replaced in args.replace.iter() { let replaced_node = @@ -73,7 +73,9 @@ pub async fn cmd_assign_role( match roles.get(&replaced_node) { Some(NodeRoleV(Some(_))) => { layout - .staging_roles + .staging + .get_mut() + .roles .merge(&roles.update_mutator(replaced_node, NodeRoleV(None))); } _ => { @@ -131,7 +133,9 @@ pub async fn cmd_assign_role( }; layout - .staging_roles + .staging + .get_mut() + .roles .merge(&roles.update_mutator(added_node, NodeRoleV(Some(new_entry)))); } @@ -151,13 +155,15 @@ pub async fn cmd_remove_role( let mut layout = fetch_layout(rpc_cli, rpc_host).await?; let mut roles = layout.current().roles.clone(); - roles.merge(&layout.staging_roles); + roles.merge(&layout.staging.get().roles); let deleted_node = find_matching_node(roles.items().iter().map(|(id, _, _)| *id), &args.node_id)?; layout - .staging_roles + .staging + .get_mut() + .roles .merge(&roles.update_mutator(deleted_node, NodeRoleV(None))); send_layout(rpc_cli, rpc_host, layout).await?; @@ -203,16 +209,12 @@ pub async fn cmd_show_layout( println!(); println!(" garage layout apply --version {}", v + 1); println!(); - println!( - "You can also revert all proposed changes with: garage layout revert --version {}", - v + 1) + println!("You can also revert all proposed changes with: garage layout revert"); } Err(e) => { println!("Error while trying to compute the assignment: {}", e); println!("This new layout cannot yet be applied."); - println!( - "You can also revert all proposed changes with: garage layout revert --version {}", - v + 1) + println!("You can also revert all proposed changes with: garage layout revert"); } } } @@ -245,9 +247,15 @@ pub async fn cmd_revert_layout( rpc_host: NodeID, revert_opt: RevertLayoutOpt, ) -> Result<(), Error> { + if !revert_opt.yes { + return Err(Error::Message( + "Please add the --yes flag to run the layout revert operation".into(), + )); + } + let layout = fetch_layout(rpc_cli, rpc_host).await?; - let layout = layout.revert_staged_changes(revert_opt.version)?; + let layout = layout.revert_staged_changes()?; send_layout(rpc_cli, rpc_host, layout).await?; @@ -284,7 +292,9 @@ pub async fn cmd_config_layout( } layout - .staging_parameters + .staging + .get_mut() + .parameters .update(LayoutParameters { zone_redundancy: r }); println!("The zone redundancy parameter has been set to '{}'.", r); did_something = true; @@ -371,19 +381,20 @@ pub fn print_cluster_layout(layout: &LayoutVersion, empty_msg: &str) { } pub fn print_staging_role_changes(layout: &LayoutHistory) -> bool { - let has_role_changes = layout - .staging_roles + let staging = layout.staging.get(); + let has_role_changes = staging + .roles .items() .iter() .any(|(k, _, v)| layout.current().roles.get(k) != Some(v)); - let has_layout_changes = *layout.staging_parameters.get() != layout.current().parameters; + let has_layout_changes = *staging.parameters.get() != layout.current().parameters; if has_role_changes || has_layout_changes { println!(); println!("==== STAGED ROLE CHANGES ===="); if has_role_changes { let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; - for (id, _, role) in layout.staging_roles.items().iter() { + for (id, _, role) in staging.roles.items().iter() { if layout.current().roles.get(id) == Some(role) { continue; } @@ -406,7 +417,7 @@ pub fn print_staging_role_changes(layout: &LayoutHistory) -> bool { if has_layout_changes { println!( "Zone redundancy: {}", - layout.staging_parameters.get().zone_redundancy + staging.parameters.get().zone_redundancy ); } true diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index aba57551..3badc447 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -164,9 +164,9 @@ pub struct ApplyLayoutOpt { #[derive(StructOpt, Debug)] pub struct RevertLayoutOpt { - /// Version number of old configuration to which to revert - #[structopt(long = "version")] - pub(crate) version: Option, + /// The revert operation will not be ran unless this flag is added + #[structopt(long = "yes")] + pub(crate) yes: bool, } #[derive(Serialize, Deserialize, StructOpt, Debug)] diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs deleted file mode 100644 index d8c6c9b9..00000000 --- a/src/rpc/graph_algo.rs +++ /dev/null @@ -1,415 +0,0 @@ -//! This module deals with graph algorithms. -//! It is used in layout.rs to build the partition to node assignment. - -use rand::prelude::{SeedableRng, SliceRandom}; -use std::cmp::{max, min}; -use std::collections::HashMap; -use std::collections::VecDeque; - -/// Vertex data structures used in all the graphs used in layout.rs. -/// usize parameters correspond to node/zone/partitions ids. -/// To understand the vertex roles below, please refer to the formal description -/// of the layout computation algorithm. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub enum Vertex { - Source, - Pup(usize), // The vertex p+ of partition p - Pdown(usize), // The vertex p- of partition p - PZ(usize, usize), // The vertex corresponding to x_(partition p, zone z) - N(usize), // The vertex corresponding to node n - Sink, -} - -/// Edge data structure for the flow algorithm. -#[derive(Clone, Copy, Debug)] -pub struct FlowEdge { - cap: u64, // flow maximal capacity of the edge - flow: i64, // flow value on the edge - dest: usize, // destination vertex id - rev: usize, // index of the reversed edge (v, self) in the edge list of vertex v -} - -/// Edge data structure for the detection of negative cycles. -#[derive(Clone, Copy, Debug)] -pub struct WeightedEdge { - w: i64, // weight of the edge - dest: usize, -} - -pub trait Edge: Clone + Copy {} -impl Edge for FlowEdge {} -impl Edge for WeightedEdge {} - -/// Struct for the graph structure. We do encapsulation here to be able to both -/// provide user friendly Vertex enum to address vertices, and to use internally usize -/// indices and Vec instead of HashMap in the graph algorithm to optimize execution speed. -pub struct Graph { - vertex_to_id: HashMap, - id_to_vertex: Vec, - - // The graph is stored as an adjacency list - graph: Vec>, -} - -pub type CostFunction = HashMap<(Vertex, Vertex), i64>; - -impl Graph { - pub fn new(vertices: &[Vertex]) -> Self { - let mut map = HashMap::::new(); - for (i, vert) in vertices.iter().enumerate() { - map.insert(*vert, i); - } - Graph:: { - vertex_to_id: map, - id_to_vertex: vertices.to_vec(), - graph: vec![Vec::::new(); vertices.len()], - } - } - - fn get_vertex_id(&self, v: &Vertex) -> Result { - self.vertex_to_id - .get(v) - .cloned() - .ok_or_else(|| format!("The graph does not contain vertex {:?}", v)) - } -} - -impl Graph { - /// This function adds a directed edge to the graph with capacity c, and the - /// corresponding reversed edge with capacity 0. - pub fn add_edge(&mut self, u: Vertex, v: Vertex, c: u64) -> Result<(), String> { - let idu = self.get_vertex_id(&u)?; - let idv = self.get_vertex_id(&v)?; - if idu == idv { - return Err("Cannot add edge from vertex to itself in flow graph".into()); - } - - let rev_u = self.graph[idu].len(); - let rev_v = self.graph[idv].len(); - self.graph[idu].push(FlowEdge { - cap: c, - dest: idv, - flow: 0, - rev: rev_v, - }); - self.graph[idv].push(FlowEdge { - cap: 0, - dest: idu, - flow: 0, - rev: rev_u, - }); - Ok(()) - } - - /// This function returns the list of vertices that receive a positive flow from - /// vertex v. - pub fn get_positive_flow_from(&self, v: Vertex) -> Result, String> { - let idv = self.get_vertex_id(&v)?; - let mut result = Vec::::new(); - for edge in self.graph[idv].iter() { - if edge.flow > 0 { - result.push(self.id_to_vertex[edge.dest]); - } - } - Ok(result) - } - - /// This function returns the value of the flow incoming to v. - pub fn get_inflow(&self, v: Vertex) -> Result { - let idv = self.get_vertex_id(&v)?; - let mut result = 0; - for edge in self.graph[idv].iter() { - result += max(0, self.graph[edge.dest][edge.rev].flow); - } - Ok(result) - } - - /// This function returns the value of the flow outgoing from v. - pub fn get_outflow(&self, v: Vertex) -> Result { - let idv = self.get_vertex_id(&v)?; - let mut result = 0; - for edge in self.graph[idv].iter() { - result += max(0, edge.flow); - } - Ok(result) - } - - /// This function computes the flow total value by computing the outgoing flow - /// from the source. - pub fn get_flow_value(&mut self) -> Result { - self.get_outflow(Vertex::Source) - } - - /// This function shuffles the order of the edge lists. It keeps the ids of the - /// reversed edges consistent. - fn shuffle_edges(&mut self) { - // We use deterministic randomness so that the layout calculation algorihtm - // will output the same thing every time it is run. This way, the results - // pre-calculated in `garage layout show` will match exactly those used - // in practice with `garage layout apply` - let mut rng = rand::rngs::StdRng::from_seed([0x12u8; 32]); - for i in 0..self.graph.len() { - self.graph[i].shuffle(&mut rng); - // We need to update the ids of the reverse edges. - for j in 0..self.graph[i].len() { - let target_v = self.graph[i][j].dest; - let target_rev = self.graph[i][j].rev; - self.graph[target_v][target_rev].rev = j; - } - } - } - - /// Computes an upper bound of the flow on the graph - pub fn flow_upper_bound(&self) -> Result { - let idsource = self.get_vertex_id(&Vertex::Source)?; - let mut flow_upper_bound = 0; - for edge in self.graph[idsource].iter() { - flow_upper_bound += edge.cap; - } - Ok(flow_upper_bound) - } - - /// This function computes the maximal flow using Dinic's algorithm. It starts with - /// the flow values already present in the graph. So it is possible to add some edge to - /// the graph, compute a flow, add other edges, update the flow. - pub fn compute_maximal_flow(&mut self) -> Result<(), String> { - let idsource = self.get_vertex_id(&Vertex::Source)?; - let idsink = self.get_vertex_id(&Vertex::Sink)?; - - let nb_vertices = self.graph.len(); - - let flow_upper_bound = self.flow_upper_bound()?; - - // To ensure the dispersion of the associations generated by the - // assignment, we shuffle the neighbours of the nodes. Hence, - // the vertices do not consider their neighbours in the same order. - self.shuffle_edges(); - - // We run Dinic's max flow algorithm - loop { - // We build the level array from Dinic's algorithm. - let mut level = vec![None; nb_vertices]; - - let mut fifo = VecDeque::new(); - fifo.push_back((idsource, 0)); - while let Some((id, lvl)) = fifo.pop_front() { - if level[id].is_none() { - // it means id has not yet been reached - level[id] = Some(lvl); - for edge in self.graph[id].iter() { - if edge.cap as i64 - edge.flow > 0 { - fifo.push_back((edge.dest, lvl + 1)); - } - } - } - } - if level[idsink].is_none() { - // There is no residual flow - break; - } - // Now we run DFS respecting the level array - let mut next_nbd = vec![0; nb_vertices]; - let mut lifo = Vec::new(); - - lifo.push((idsource, flow_upper_bound)); - - while let Some((id, f)) = lifo.last().cloned() { - if id == idsink { - // The DFS reached the sink, we can add a - // residual flow. - lifo.pop(); - while let Some((id, _)) = lifo.pop() { - let nbd = next_nbd[id]; - self.graph[id][nbd].flow += f as i64; - let id_rev = self.graph[id][nbd].dest; - let nbd_rev = self.graph[id][nbd].rev; - self.graph[id_rev][nbd_rev].flow -= f as i64; - } - lifo.push((idsource, flow_upper_bound)); - continue; - } - // else we did not reach the sink - let nbd = next_nbd[id]; - if nbd >= self.graph[id].len() { - // There is nothing to explore from id anymore - lifo.pop(); - if let Some((parent, _)) = lifo.last() { - next_nbd[*parent] += 1; - } - continue; - } - // else we can try to send flow from id to its nbd - let new_flow = min( - f as i64, - self.graph[id][nbd].cap as i64 - self.graph[id][nbd].flow, - ) as u64; - if new_flow == 0 { - next_nbd[id] += 1; - continue; - } - if let (Some(lvldest), Some(lvlid)) = (level[self.graph[id][nbd].dest], level[id]) { - if lvldest <= lvlid { - // We cannot send flow to nbd. - next_nbd[id] += 1; - continue; - } - } - // otherwise, we send flow to nbd. - lifo.push((self.graph[id][nbd].dest, new_flow)); - } - } - Ok(()) - } - - /// This function takes a flow, and a cost function on the edges, and tries to find an - /// equivalent flow with a better cost, by finding improving overflow cycles. It uses - /// as subroutine the Bellman Ford algorithm run up to path_length. - /// We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and - /// only one needs to be present in the cost function. - pub fn optimize_flow_with_cost( - &mut self, - cost: &CostFunction, - path_length: usize, - ) -> Result<(), String> { - // We build the weighted graph g where we will look for negative cycle - let mut gf = self.build_cost_graph(cost)?; - let mut cycles = gf.list_negative_cycles(path_length); - while !cycles.is_empty() { - // we enumerate negative cycles - for c in cycles.iter() { - for i in 0..c.len() { - // We add one flow unit to the edge (u,v) of cycle c - let idu = self.vertex_to_id[&c[i]]; - let idv = self.vertex_to_id[&c[(i + 1) % c.len()]]; - for j in 0..self.graph[idu].len() { - // since idu appears at most once in the cycles, we enumerate every - // edge at most once. - let edge = self.graph[idu][j]; - if edge.dest == idv { - self.graph[idu][j].flow += 1; - self.graph[idv][edge.rev].flow -= 1; - break; - } - } - } - } - - gf = self.build_cost_graph(cost)?; - cycles = gf.list_negative_cycles(path_length); - } - Ok(()) - } - - /// Construct the weighted graph G_f from the flow and the cost function - fn build_cost_graph(&self, cost: &CostFunction) -> Result, String> { - let mut g = Graph::::new(&self.id_to_vertex); - let nb_vertices = self.id_to_vertex.len(); - for i in 0..nb_vertices { - for edge in self.graph[i].iter() { - if edge.cap as i64 - edge.flow > 0 { - // It is possible to send overflow through this edge - let u = self.id_to_vertex[i]; - let v = self.id_to_vertex[edge.dest]; - if cost.contains_key(&(u, v)) { - g.add_edge(u, v, cost[&(u, v)])?; - } else if cost.contains_key(&(v, u)) { - g.add_edge(u, v, -cost[&(v, u)])?; - } else { - g.add_edge(u, v, 0)?; - } - } - } - } - Ok(g) - } -} - -impl Graph { - /// This function adds a single directed weighted edge to the graph. - pub fn add_edge(&mut self, u: Vertex, v: Vertex, w: i64) -> Result<(), String> { - let idu = self.get_vertex_id(&u)?; - let idv = self.get_vertex_id(&v)?; - self.graph[idu].push(WeightedEdge { w, dest: idv }); - Ok(()) - } - - /// This function lists the negative cycles it manages to find after path_length - /// iterations of the main loop of the Bellman-Ford algorithm. For the classical - /// algorithm, path_length needs to be equal to the number of vertices. However, - /// for particular graph structures like in our case, the algorithm is still correct - /// when path_length is the length of the longest possible simple path. - /// See the formal description of the algorithm for more details. - fn list_negative_cycles(&self, path_length: usize) -> Vec> { - let nb_vertices = self.graph.len(); - - // We start with every vertex at distance 0 of some imaginary extra -1 vertex. - let mut distance = vec![0; nb_vertices]; - // The prev vector collects for every vertex from where does the shortest path come - let mut prev = vec![None; nb_vertices]; - - for _ in 0..path_length + 1 { - for id in 0..nb_vertices { - for e in self.graph[id].iter() { - if distance[id] + e.w < distance[e.dest] { - distance[e.dest] = distance[id] + e.w; - prev[e.dest] = Some(id); - } - } - } - } - - // If self.graph contains a negative cycle, then at this point the graph described - // by prev (which is a directed 1-forest/functional graph) - // must contain a cycle. We list the cycles of prev. - let cycles_prev = cycles_of_1_forest(&prev); - - // Remark that the cycle in prev is in the reverse order compared to the cycle - // in the graph. Thus the .rev(). - return cycles_prev - .iter() - .map(|cycle| { - cycle - .iter() - .rev() - .map(|id| self.id_to_vertex[*id]) - .collect() - }) - .collect(); - } -} - -/// This function returns the list of cycles of a directed 1 forest. It does not -/// check for the consistency of the input. -fn cycles_of_1_forest(forest: &[Option]) -> Vec> { - let mut cycles = Vec::>::new(); - let mut time_of_discovery = vec![None; forest.len()]; - - for t in 0..forest.len() { - let mut id = t; - // while we are on a valid undiscovered node - while time_of_discovery[id].is_none() { - time_of_discovery[id] = Some(t); - if let Some(i) = forest[id] { - id = i; - } else { - break; - } - } - if forest[id].is_some() && time_of_discovery[id] == Some(t) { - // We discovered an id that we explored at this iteration t. - // It means we are on a cycle - let mut cy = vec![id; 1]; - let mut id2 = id; - while let Some(id_next) = forest[id2] { - id2 = id_next; - if id2 != id { - cy.push(id2); - } else { - break; - } - } - cycles.push(cy); - } - } - cycles -} diff --git a/src/rpc/layout/graph_algo.rs b/src/rpc/layout/graph_algo.rs new file mode 100644 index 00000000..bd33e97f --- /dev/null +++ b/src/rpc/layout/graph_algo.rs @@ -0,0 +1,405 @@ +//! This module deals with graph algorithms. +//! It is used in layout.rs to build the partition to node assignment. + +use rand::prelude::{SeedableRng, SliceRandom}; +use std::cmp::{max, min}; +use std::collections::HashMap; +use std::collections::VecDeque; + +/// Vertex data structures used in all the graphs used in layout.rs. +/// usize parameters correspond to node/zone/partitions ids. +/// To understand the vertex roles below, please refer to the formal description +/// of the layout computation algorithm. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum Vertex { + Source, + Pup(usize), // The vertex p+ of partition p + Pdown(usize), // The vertex p- of partition p + PZ(usize, usize), // The vertex corresponding to x_(partition p, zone z) + N(usize), // The vertex corresponding to node n + Sink, +} + +/// Edge data structure for the flow algorithm. +#[derive(Clone, Copy, Debug)] +pub struct FlowEdge { + cap: u64, // flow maximal capacity of the edge + flow: i64, // flow value on the edge + dest: usize, // destination vertex id + rev: usize, // index of the reversed edge (v, self) in the edge list of vertex v +} + +/// Edge data structure for the detection of negative cycles. +#[derive(Clone, Copy, Debug)] +pub struct WeightedEdge { + w: i64, // weight of the edge + dest: usize, +} + +pub trait Edge: Clone + Copy {} +impl Edge for FlowEdge {} +impl Edge for WeightedEdge {} + +/// Struct for the graph structure. We do encapsulation here to be able to both +/// provide user friendly Vertex enum to address vertices, and to use internally usize +/// indices and Vec instead of HashMap in the graph algorithm to optimize execution speed. +pub struct Graph { + vertex_to_id: HashMap, + id_to_vertex: Vec, + + // The graph is stored as an adjacency list + graph: Vec>, +} + +pub type CostFunction = HashMap<(Vertex, Vertex), i64>; + +impl Graph { + pub fn new(vertices: &[Vertex]) -> Self { + let mut map = HashMap::::new(); + for (i, vert) in vertices.iter().enumerate() { + map.insert(*vert, i); + } + Graph:: { + vertex_to_id: map, + id_to_vertex: vertices.to_vec(), + graph: vec![Vec::::new(); vertices.len()], + } + } + + fn get_vertex_id(&self, v: &Vertex) -> Result { + self.vertex_to_id + .get(v) + .cloned() + .ok_or_else(|| format!("The graph does not contain vertex {:?}", v)) + } +} + +impl Graph { + /// This function adds a directed edge to the graph with capacity c, and the + /// corresponding reversed edge with capacity 0. + pub fn add_edge(&mut self, u: Vertex, v: Vertex, c: u64) -> Result<(), String> { + let idu = self.get_vertex_id(&u)?; + let idv = self.get_vertex_id(&v)?; + if idu == idv { + return Err("Cannot add edge from vertex to itself in flow graph".into()); + } + + let rev_u = self.graph[idu].len(); + let rev_v = self.graph[idv].len(); + self.graph[idu].push(FlowEdge { + cap: c, + dest: idv, + flow: 0, + rev: rev_v, + }); + self.graph[idv].push(FlowEdge { + cap: 0, + dest: idu, + flow: 0, + rev: rev_u, + }); + Ok(()) + } + + /// This function returns the list of vertices that receive a positive flow from + /// vertex v. + pub fn get_positive_flow_from(&self, v: Vertex) -> Result, String> { + let idv = self.get_vertex_id(&v)?; + let mut result = Vec::::new(); + for edge in self.graph[idv].iter() { + if edge.flow > 0 { + result.push(self.id_to_vertex[edge.dest]); + } + } + Ok(result) + } + + /// This function returns the value of the flow outgoing from v. + pub fn get_outflow(&self, v: Vertex) -> Result { + let idv = self.get_vertex_id(&v)?; + let mut result = 0; + for edge in self.graph[idv].iter() { + result += max(0, edge.flow); + } + Ok(result) + } + + /// This function computes the flow total value by computing the outgoing flow + /// from the source. + pub fn get_flow_value(&mut self) -> Result { + self.get_outflow(Vertex::Source) + } + + /// This function shuffles the order of the edge lists. It keeps the ids of the + /// reversed edges consistent. + fn shuffle_edges(&mut self) { + // We use deterministic randomness so that the layout calculation algorihtm + // will output the same thing every time it is run. This way, the results + // pre-calculated in `garage layout show` will match exactly those used + // in practice with `garage layout apply` + let mut rng = rand::rngs::StdRng::from_seed([0x12u8; 32]); + for i in 0..self.graph.len() { + self.graph[i].shuffle(&mut rng); + // We need to update the ids of the reverse edges. + for j in 0..self.graph[i].len() { + let target_v = self.graph[i][j].dest; + let target_rev = self.graph[i][j].rev; + self.graph[target_v][target_rev].rev = j; + } + } + } + + /// Computes an upper bound of the flow on the graph + pub fn flow_upper_bound(&self) -> Result { + let idsource = self.get_vertex_id(&Vertex::Source)?; + let mut flow_upper_bound = 0; + for edge in self.graph[idsource].iter() { + flow_upper_bound += edge.cap; + } + Ok(flow_upper_bound) + } + + /// This function computes the maximal flow using Dinic's algorithm. It starts with + /// the flow values already present in the graph. So it is possible to add some edge to + /// the graph, compute a flow, add other edges, update the flow. + pub fn compute_maximal_flow(&mut self) -> Result<(), String> { + let idsource = self.get_vertex_id(&Vertex::Source)?; + let idsink = self.get_vertex_id(&Vertex::Sink)?; + + let nb_vertices = self.graph.len(); + + let flow_upper_bound = self.flow_upper_bound()?; + + // To ensure the dispersion of the associations generated by the + // assignment, we shuffle the neighbours of the nodes. Hence, + // the vertices do not consider their neighbours in the same order. + self.shuffle_edges(); + + // We run Dinic's max flow algorithm + loop { + // We build the level array from Dinic's algorithm. + let mut level = vec![None; nb_vertices]; + + let mut fifo = VecDeque::new(); + fifo.push_back((idsource, 0)); + while let Some((id, lvl)) = fifo.pop_front() { + if level[id].is_none() { + // it means id has not yet been reached + level[id] = Some(lvl); + for edge in self.graph[id].iter() { + if edge.cap as i64 - edge.flow > 0 { + fifo.push_back((edge.dest, lvl + 1)); + } + } + } + } + if level[idsink].is_none() { + // There is no residual flow + break; + } + // Now we run DFS respecting the level array + let mut next_nbd = vec![0; nb_vertices]; + let mut lifo = Vec::new(); + + lifo.push((idsource, flow_upper_bound)); + + while let Some((id, f)) = lifo.last().cloned() { + if id == idsink { + // The DFS reached the sink, we can add a + // residual flow. + lifo.pop(); + while let Some((id, _)) = lifo.pop() { + let nbd = next_nbd[id]; + self.graph[id][nbd].flow += f as i64; + let id_rev = self.graph[id][nbd].dest; + let nbd_rev = self.graph[id][nbd].rev; + self.graph[id_rev][nbd_rev].flow -= f as i64; + } + lifo.push((idsource, flow_upper_bound)); + continue; + } + // else we did not reach the sink + let nbd = next_nbd[id]; + if nbd >= self.graph[id].len() { + // There is nothing to explore from id anymore + lifo.pop(); + if let Some((parent, _)) = lifo.last() { + next_nbd[*parent] += 1; + } + continue; + } + // else we can try to send flow from id to its nbd + let new_flow = min( + f as i64, + self.graph[id][nbd].cap as i64 - self.graph[id][nbd].flow, + ) as u64; + if new_flow == 0 { + next_nbd[id] += 1; + continue; + } + if let (Some(lvldest), Some(lvlid)) = (level[self.graph[id][nbd].dest], level[id]) { + if lvldest <= lvlid { + // We cannot send flow to nbd. + next_nbd[id] += 1; + continue; + } + } + // otherwise, we send flow to nbd. + lifo.push((self.graph[id][nbd].dest, new_flow)); + } + } + Ok(()) + } + + /// This function takes a flow, and a cost function on the edges, and tries to find an + /// equivalent flow with a better cost, by finding improving overflow cycles. It uses + /// as subroutine the Bellman Ford algorithm run up to path_length. + /// We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and + /// only one needs to be present in the cost function. + pub fn optimize_flow_with_cost( + &mut self, + cost: &CostFunction, + path_length: usize, + ) -> Result<(), String> { + // We build the weighted graph g where we will look for negative cycle + let mut gf = self.build_cost_graph(cost)?; + let mut cycles = gf.list_negative_cycles(path_length); + while !cycles.is_empty() { + // we enumerate negative cycles + for c in cycles.iter() { + for i in 0..c.len() { + // We add one flow unit to the edge (u,v) of cycle c + let idu = self.vertex_to_id[&c[i]]; + let idv = self.vertex_to_id[&c[(i + 1) % c.len()]]; + for j in 0..self.graph[idu].len() { + // since idu appears at most once in the cycles, we enumerate every + // edge at most once. + let edge = self.graph[idu][j]; + if edge.dest == idv { + self.graph[idu][j].flow += 1; + self.graph[idv][edge.rev].flow -= 1; + break; + } + } + } + } + + gf = self.build_cost_graph(cost)?; + cycles = gf.list_negative_cycles(path_length); + } + Ok(()) + } + + /// Construct the weighted graph G_f from the flow and the cost function + fn build_cost_graph(&self, cost: &CostFunction) -> Result, String> { + let mut g = Graph::::new(&self.id_to_vertex); + let nb_vertices = self.id_to_vertex.len(); + for i in 0..nb_vertices { + for edge in self.graph[i].iter() { + if edge.cap as i64 - edge.flow > 0 { + // It is possible to send overflow through this edge + let u = self.id_to_vertex[i]; + let v = self.id_to_vertex[edge.dest]; + if cost.contains_key(&(u, v)) { + g.add_edge(u, v, cost[&(u, v)])?; + } else if cost.contains_key(&(v, u)) { + g.add_edge(u, v, -cost[&(v, u)])?; + } else { + g.add_edge(u, v, 0)?; + } + } + } + } + Ok(g) + } +} + +impl Graph { + /// This function adds a single directed weighted edge to the graph. + pub fn add_edge(&mut self, u: Vertex, v: Vertex, w: i64) -> Result<(), String> { + let idu = self.get_vertex_id(&u)?; + let idv = self.get_vertex_id(&v)?; + self.graph[idu].push(WeightedEdge { w, dest: idv }); + Ok(()) + } + + /// This function lists the negative cycles it manages to find after path_length + /// iterations of the main loop of the Bellman-Ford algorithm. For the classical + /// algorithm, path_length needs to be equal to the number of vertices. However, + /// for particular graph structures like in our case, the algorithm is still correct + /// when path_length is the length of the longest possible simple path. + /// See the formal description of the algorithm for more details. + fn list_negative_cycles(&self, path_length: usize) -> Vec> { + let nb_vertices = self.graph.len(); + + // We start with every vertex at distance 0 of some imaginary extra -1 vertex. + let mut distance = vec![0; nb_vertices]; + // The prev vector collects for every vertex from where does the shortest path come + let mut prev = vec![None; nb_vertices]; + + for _ in 0..path_length + 1 { + for id in 0..nb_vertices { + for e in self.graph[id].iter() { + if distance[id] + e.w < distance[e.dest] { + distance[e.dest] = distance[id] + e.w; + prev[e.dest] = Some(id); + } + } + } + } + + // If self.graph contains a negative cycle, then at this point the graph described + // by prev (which is a directed 1-forest/functional graph) + // must contain a cycle. We list the cycles of prev. + let cycles_prev = cycles_of_1_forest(&prev); + + // Remark that the cycle in prev is in the reverse order compared to the cycle + // in the graph. Thus the .rev(). + return cycles_prev + .iter() + .map(|cycle| { + cycle + .iter() + .rev() + .map(|id| self.id_to_vertex[*id]) + .collect() + }) + .collect(); + } +} + +/// This function returns the list of cycles of a directed 1 forest. It does not +/// check for the consistency of the input. +fn cycles_of_1_forest(forest: &[Option]) -> Vec> { + let mut cycles = Vec::>::new(); + let mut time_of_discovery = vec![None; forest.len()]; + + for t in 0..forest.len() { + let mut id = t; + // while we are on a valid undiscovered node + while time_of_discovery[id].is_none() { + time_of_discovery[id] = Some(t); + if let Some(i) = forest[id] { + id = i; + } else { + break; + } + } + if forest[id].is_some() && time_of_discovery[id] == Some(t) { + // We discovered an id that we explored at this iteration t. + // It means we are on a cycle + let mut cy = vec![id; 1]; + let mut id2 = id; + while let Some(id_next) = forest[id2] { + id2 = id_next; + if id2 != id { + cy.push(id2); + } else { + break; + } + } + cycles.push(cy); + } + } + cycles +} diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index e59c9e9c..9ae28887 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -1,5 +1,3 @@ -use std::cmp::Ordering; - use garage_util::crdt::{Crdt, Lww, LwwMap}; use garage_util::data::*; use garage_util::encode::nonversioned_encode; @@ -12,14 +10,15 @@ impl LayoutHistory { pub fn new(replication_factor: usize) -> Self { let version = LayoutVersion::new(replication_factor); - let staging_parameters = Lww::::new(version.parameters); - let empty_lwwmap = LwwMap::new(); + let staging = LayoutStaging { + parameters: Lww::::new(version.parameters), + roles: LwwMap::new(), + }; let mut ret = LayoutHistory { versions: vec![version].into_boxed_slice().into(), update_trackers: Default::default(), - staging_parameters, - staging_roles: empty_lwwmap, + staging: Lww::raw(0, staging), staging_hash: [0u8; 32].into(), }; ret.staging_hash = ret.calculate_staging_hash(); @@ -31,8 +30,7 @@ impl LayoutHistory { } pub(crate) fn calculate_staging_hash(&self) -> Hash { - let hashed_tuple = (&self.staging_roles, &self.staging_parameters); - blake2sum(&nonversioned_encode(&hashed_tuple).unwrap()[..]) + blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) } // ================== updates to layout, public interface =================== @@ -41,26 +39,10 @@ impl LayoutHistory { let mut changed = false; // Merge staged layout changes - match other.current().version.cmp(&self.current().version) { - Ordering::Greater => { - self.staging_parameters = other.staging_parameters.clone(); - self.staging_roles = other.staging_roles.clone(); - self.staging_hash = other.staging_hash; - changed = true; - } - Ordering::Equal => { - self.staging_parameters.merge(&other.staging_parameters); - self.staging_roles.merge(&other.staging_roles); - - let new_staging_hash = self.calculate_staging_hash(); - if new_staging_hash != self.staging_hash { - changed = true; - } - - self.staging_hash = new_staging_hash; - } - Ordering::Less => (), + if self.staging != other.staging { + changed = true; } + self.staging.merge(&other.staging); // Add any new versions to history for v2 in other.versions.iter() { @@ -102,50 +84,34 @@ To know the correct value of the new layout version, invoke `garage layout show` } } + // Compute new version and add it to history let mut new_version = self.current().clone(); new_version.version += 1; - new_version.roles.merge(&self.staging_roles); + new_version.roles.merge(&self.staging.get().roles); new_version.roles.retain(|(_, _, v)| v.0.is_some()); - new_version.parameters = *self.staging_parameters.get(); - - self.staging_roles.clear(); - self.staging_hash = self.calculate_staging_hash(); + new_version.parameters = *self.staging.get().parameters.get(); let msg = new_version.calculate_partition_assignment()?; - self.versions.push(new_version); + // Reset the staged layout changes + self.staging.update(LayoutStaging { + parameters: self.staging.get().parameters.clone(), + roles: LwwMap::new(), + }); + self.staging_hash = self.calculate_staging_hash(); + Ok((self, msg)) } - pub fn revert_staged_changes(mut self, version: Option) -> Result { - match version { - None => { - let error = r#" -Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. -To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. - "#; - return Err(Error::Message(error.into())); - } - Some(v) => { - if v != self.current().version + 1 { - return Err(Error::Message("Invalid new layout version".into())); - } - } - } - - self.staging_roles.clear(); - self.staging_parameters.update(self.current().parameters); + pub fn revert_staged_changes(mut self) -> Result { + self.staging.update(LayoutStaging { + parameters: Lww::new(self.current().parameters.clone()), + roles: LwwMap::new(), + }); self.staging_hash = self.calculate_staging_hash(); - // TODO this is stupid, we should have a separate version counter/LWW - // for the staging params - let mut new_version = self.current().clone(); - new_version.version += 1; - - self.versions.push(new_version); - Ok(self) } diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index 122d4b65..7c15988a 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -1,8 +1,10 @@ +mod graph_algo; mod history; mod schema; -mod tracker; mod version; +// ---- re-exports ---- + pub use history::*; pub use schema::*; pub use version::*; diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 14e797be..c5b9b1d3 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -1,3 +1,9 @@ +use std::fmt; + +use bytesize::ByteSize; + +use garage_util::crdt::{AutoCrdt, Crdt}; + mod v08 { use crate::layout::CompactNodeType; use garage_util::crdt::LwwMap; @@ -210,6 +216,15 @@ mod v010 { pub ring_assignment_data: Vec, } + /// The staged changes for the next layout version + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] + pub struct LayoutStaging { + /// Parameters to be used in the next partition assignment computation. + pub parameters: Lww, + /// Role changes which are staged for the next version of the layout + pub roles: LwwMap, + } + /// The history of cluster layouts #[derive(Clone, Debug, Serialize, Deserialize)] pub struct LayoutHistory { @@ -219,10 +234,8 @@ mod v010 { /// Update trackers pub update_trackers: UpdateTrackers, - /// Parameters to be used in the next partition assignment computation. - pub staging_parameters: Lww, - /// Role changes which are staged for the next version of the layout - pub staging_roles: LwwMap, + /// Staged changes for the next version + pub staging: Lww, /// Hash of the serialized staging_parameters + staging_roles pub staging_hash: Hash, } @@ -265,6 +278,10 @@ mod v010 { .map(|x| (*x, version.version)) .collect::>(), ); + let staging = LayoutStaging { + parameters: previous.staging_parameters, + roles: previous.staging_roles, + }; let mut ret = Self { versions: vec![version], update_trackers: UpdateTrackers { @@ -272,8 +289,7 @@ mod v010 { sync_map: update_tracker.clone(), sync_ack_map: update_tracker.clone(), }, - staging_parameters: previous.staging_parameters, - staging_roles: previous.staging_roles, + staging: Lww::raw(previous.version, staging), staging_hash: [0u8; 32].into(), }; ret.staging_hash = ret.calculate_staging_hash(); @@ -283,3 +299,81 @@ mod v010 { } pub use v010::*; + +// ---- utility functions ---- + +impl AutoCrdt for LayoutParameters { + const WARN_IF_DIFFERENT: bool = true; +} + +impl AutoCrdt for NodeRoleV { + const WARN_IF_DIFFERENT: bool = true; +} + +impl Crdt for LayoutStaging { + fn merge(&mut self, other: &LayoutStaging) { + self.parameters.merge(&other.parameters); + self.roles.merge(&other.roles); + } +} + +impl NodeRole { + pub fn capacity_string(&self) -> String { + match self.capacity { + Some(c) => ByteSize::b(c).to_string_as(false), + None => "gateway".to_string(), + } + } + + pub fn tags_string(&self) -> String { + self.tags.join(",") + } +} + +impl fmt::Display for ZoneRedundancy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ZoneRedundancy::Maximum => write!(f, "maximum"), + ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), + } + } +} + +impl core::str::FromStr for ZoneRedundancy { + type Err = &'static str; + fn from_str(s: &str) -> Result { + match s { + "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), + x => { + let v = x + .parse::() + .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; + Ok(ZoneRedundancy::AtLeast(v)) + } + } + } +} + +impl UpdateTracker { + fn merge(&mut self, other: &UpdateTracker) { + for (k, v) in other.0.iter() { + if let Some(v_mut) = self.0.get_mut(k) { + *v_mut = std::cmp::max(*v_mut, *v); + } else { + self.0.insert(*k, *v); + } + } + } + + pub(crate) fn min(&self) -> u64 { + self.0.iter().map(|(_, v)| *v).min().unwrap_or(0) + } +} + +impl UpdateTrackers { + pub(crate) fn merge(&mut self, other: &UpdateTrackers) { + self.ack_map.merge(&other.ack_map); + self.sync_map.merge(&other.sync_map); + self.sync_ack_map.merge(&other.sync_ack_map); + } +} diff --git a/src/rpc/layout/tracker.rs b/src/rpc/layout/tracker.rs deleted file mode 100644 index 778121e4..00000000 --- a/src/rpc/layout/tracker.rs +++ /dev/null @@ -1,21 +0,0 @@ -use super::*; - -impl UpdateTracker { - fn merge(&mut self, other: &UpdateTracker) { - for (k, v) in other.0.iter() { - if let Some(v_mut) = self.0.get_mut(k) { - *v_mut = std::cmp::max(*v_mut, *v); - } else { - self.0.insert(*k, *v); - } - } - } -} - -impl UpdateTrackers { - pub(crate) fn merge(&mut self, other: &UpdateTrackers) { - self.ack_map.merge(&other.ack_map); - self.sync_map.merge(&other.sync_map); - self.sync_ack_map.merge(&other.sync_ack_map); - } -} diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 363bc204..6918fdf9 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -1,69 +1,21 @@ use std::collections::HashMap; use std::collections::HashSet; -use std::fmt; +use std::convert::TryInto; use bytesize::ByteSize; use itertools::Itertools; -use garage_util::crdt::{AutoCrdt, LwwMap}; +use garage_util::crdt::LwwMap; use garage_util::data::*; use garage_util::error::*; -use crate::graph_algo::*; - -use std::convert::TryInto; - +use super::graph_algo::*; use super::schema::*; use super::*; // The Message type will be used to collect information on the algorithm. pub type Message = Vec; -impl AutoCrdt for LayoutParameters { - const WARN_IF_DIFFERENT: bool = true; -} - -impl AutoCrdt for NodeRoleV { - const WARN_IF_DIFFERENT: bool = true; -} - -impl NodeRole { - pub fn capacity_string(&self) -> String { - match self.capacity { - Some(c) => ByteSize::b(c).to_string_as(false), - None => "gateway".to_string(), - } - } - - pub fn tags_string(&self) -> String { - self.tags.join(",") - } -} - -impl fmt::Display for ZoneRedundancy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - ZoneRedundancy::Maximum => write!(f, "maximum"), - ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), - } - } -} - -impl core::str::FromStr for ZoneRedundancy { - type Err = &'static str; - fn from_str(s: &str) -> Result { - match s { - "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), - x => { - let v = x - .parse::() - .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; - Ok(ZoneRedundancy::AtLeast(v)) - } - } - } -} - impl LayoutVersion { pub fn new(replication_factor: usize) -> Self { // We set the default zone redundancy to be Maximum, meaning that the maximum diff --git a/src/rpc/lib.rs b/src/rpc/lib.rs index 1af8b78e..b5b31c05 100644 --- a/src/rpc/lib.rs +++ b/src/rpc/lib.rs @@ -11,7 +11,6 @@ mod consul; #[cfg(feature = "kubernetes-discovery")] mod kubernetes; -pub mod graph_algo; pub mod layout; pub mod replication_mode; pub mod system; -- cgit v1.2.3 From 8a2b1dd422fb57abe611d8c1cf3cb0b55f487189 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 12:55:36 +0100 Subject: wip: split out layout management from System into separate LayoutManager --- src/api/admin/cluster.rs | 18 ++- src/block/manager.rs | 10 +- src/block/resync.rs | 4 +- src/model/k2v/rpc.rs | 20 ++-- src/rpc/layout/manager.rs | 177 ++++++++++++++++++++++++++++ src/rpc/layout/mod.rs | 2 + src/rpc/system.rs | 295 +++++++++++++++++----------------------------- src/table/gc.rs | 4 +- src/table/sync.rs | 10 +- src/table/table.rs | 10 +- 10 files changed, 331 insertions(+), 219 deletions(-) create mode 100644 src/rpc/layout/manager.rs (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index fe8e8764..f5483451 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -240,7 +240,11 @@ pub async fn handle_update_cluster_layout( .merge(&roles.update_mutator(node, layout::NodeRoleV(new_role))); } - garage.system.update_cluster_layout(&layout).await?; + garage + .system + .layout_manager + .update_cluster_layout(&layout) + .await?; let res = format_cluster_layout(&layout); Ok(json_ok_response(&res)?) @@ -255,7 +259,11 @@ pub async fn handle_apply_cluster_layout( let layout = garage.system.cluster_layout().as_ref().clone(); let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; - garage.system.update_cluster_layout(&layout).await?; + garage + .system + .layout_manager + .update_cluster_layout(&layout) + .await?; let res = ApplyClusterLayoutResponse { message: msg, @@ -267,7 +275,11 @@ pub async fn handle_apply_cluster_layout( pub async fn handle_revert_cluster_layout(garage: &Arc) -> Result, Error> { let layout = garage.system.cluster_layout().as_ref().clone(); let layout = layout.revert_staged_changes()?; - garage.system.update_cluster_layout(&layout).await?; + garage + .system + .layout_manager + .update_cluster_layout(&layout) + .await?; let res = format_cluster_layout(&layout); Ok(json_ok_response(&res)?) diff --git a/src/block/manager.rs b/src/block/manager.rs index 2d1b5c67..72b4ea66 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -265,7 +265,7 @@ impl BlockManager { Fut: futures::Future>, { let who = self.replication.read_nodes(hash); - let who = self.system.rpc.request_order(&who); + let who = self.system.rpc_helper().request_order(&who); for node in who.iter() { let node_id = NodeID::from(*node); @@ -305,7 +305,7 @@ impl BlockManager { // if the first one doesn't succeed rapidly // TODO: keep first request running when initiating a new one and take the // one that finishes earlier - _ = tokio::time::sleep(self.system.rpc.rpc_timeout()) => { + _ = tokio::time::sleep(self.system.rpc_helper().rpc_timeout()) => { debug!("Get block {:?}: node {:?} didn't return block in time, trying next.", hash, node); } }; @@ -363,7 +363,7 @@ impl BlockManager { Req::new(BlockRpc::PutBlock { hash, header })?.with_stream_from_buffer(bytes); self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &who[..], @@ -439,7 +439,7 @@ impl BlockManager { tokio::spawn(async move { if let Err(e) = this .resync - .put_to_resync(&hash, 2 * this.system.rpc.rpc_timeout()) + .put_to_resync(&hash, 2 * this.system.rpc_helper().rpc_timeout()) { error!("Block {:?} could not be put in resync queue: {}.", hash, e); } @@ -533,7 +533,7 @@ impl BlockManager { None => { // Not found but maybe we should have had it ?? self.resync - .put_to_resync(hash, 2 * self.system.rpc.rpc_timeout())?; + .put_to_resync(hash, 2 * self.system.rpc_helper().rpc_timeout())?; return Err(Error::Message(format!( "block {:?} not found on node", hash diff --git a/src/block/resync.rs b/src/block/resync.rs index 9c1da4a7..fedcd6f5 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -385,7 +385,7 @@ impl BlockResyncManager { let who_needs_resps = manager .system - .rpc + .rpc_helper() .call_many( &manager.endpoint, &who, @@ -431,7 +431,7 @@ impl BlockResyncManager { .with_stream_from_buffer(bytes); manager .system - .rpc + .rpc_helper() .try_call_many( &manager.endpoint, &need_nodes[..], diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs index 37e142f6..2f548ad7 100644 --- a/src/model/k2v/rpc.rs +++ b/src/model/k2v/rpc.rs @@ -131,7 +131,7 @@ impl K2VRpcHandler { who.sort(); self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &who[..], @@ -187,7 +187,7 @@ impl K2VRpcHandler { let call_futures = call_list.into_iter().map(|(nodes, items)| async move { let resp = self .system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &nodes[..], @@ -229,7 +229,7 @@ impl K2VRpcHandler { .replication .write_nodes(&poll_key.partition.hash()); - let rpc = self.system.rpc.try_call_many( + let rpc = self.system.rpc_helper().try_call_many( &self.endpoint, &nodes[..], K2VRpc::PollItem { @@ -241,7 +241,8 @@ impl K2VRpcHandler { .with_quorum(self.item_table.data.replication.read_quorum()) .without_timeout(), ); - let timeout_duration = Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout(); + let timeout_duration = + Duration::from_millis(timeout_msec) + self.system.rpc_helper().rpc_timeout(); let resps = select! { r = rpc => r?, _ = tokio::time::sleep(timeout_duration) => return Ok(None), @@ -300,7 +301,11 @@ impl K2VRpcHandler { let rs = RequestStrategy::with_priority(PRIO_NORMAL).without_timeout(); let mut requests = nodes .iter() - .map(|node| self.system.rpc.call(&self.endpoint, *node, msg.clone(), rs)) + .map(|node| { + self.system + .rpc_helper() + .call(&self.endpoint, *node, msg.clone(), rs) + }) .collect::>(); // Fetch responses. This procedure stops fetching responses when any of the following @@ -316,8 +321,9 @@ impl K2VRpcHandler { // kind: all items produced by that node until time ts have been returned, so we can // bump the entry in the global vector clock and possibly remove some item-specific // vector clocks) - let mut deadline = - Instant::now() + Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout(); + let mut deadline = Instant::now() + + Duration::from_millis(timeout_msec) + + self.system.rpc_helper().rpc_timeout(); let mut resps = vec![]; let mut errors = vec![]; loop { diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs new file mode 100644 index 00000000..a8a77139 --- /dev/null +++ b/src/rpc/layout/manager.rs @@ -0,0 +1,177 @@ +use std::sync::Arc; +use std::time::Duration; + +use tokio::sync::watch; +use tokio::sync::Mutex; + +use netapp::endpoint::Endpoint; +use netapp::peering::fullmesh::FullMeshPeeringStrategy; +use netapp::NodeID; + +use garage_util::config::Config; +use garage_util::data::*; +use garage_util::error::*; +use garage_util::persister::Persister; + +use super::*; +use crate::rpc_helper::*; +use crate::system::*; + +pub struct LayoutManager { + replication_factor: usize, + persist_cluster_layout: Persister, + + pub layout_watch: watch::Receiver>, + update_layout: Mutex>>, + + pub(crate) rpc_helper: RpcHelper, + system_endpoint: Arc>, +} + +impl LayoutManager { + pub fn new( + config: &Config, + node_id: NodeID, + system_endpoint: Arc>, + fullmesh: Arc, + replication_factor: usize, + ) -> Result { + let persist_cluster_layout: Persister = + Persister::new(&config.metadata_dir, "cluster_layout"); + + let cluster_layout = match persist_cluster_layout.load() { + Ok(x) => { + if x.current().replication_factor != replication_factor { + return Err(Error::Message(format!( + "Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", + x.current().replication_factor, + replication_factor + ))); + } + x + } + Err(e) => { + info!( + "No valid previous cluster layout stored ({}), starting fresh.", + e + ); + LayoutHistory::new(replication_factor) + } + }; + + let (update_layout, layout_watch) = watch::channel(Arc::new(cluster_layout)); + + let rpc_helper = RpcHelper::new( + node_id.into(), + fullmesh, + layout_watch.clone(), + config.rpc_timeout_msec.map(Duration::from_millis), + ); + + Ok(Self { + replication_factor, + persist_cluster_layout, + layout_watch, + update_layout: Mutex::new(update_layout), + system_endpoint, + rpc_helper, + }) + } + + // ---- PUBLIC INTERFACE ---- + + pub async fn update_cluster_layout(&self, layout: &LayoutHistory) -> Result<(), Error> { + self.handle_advertise_cluster_layout(layout).await?; + Ok(()) + } + + pub fn history(&self) -> watch::Ref> { + self.layout_watch.borrow() + } + + pub(crate) async fn pull_cluster_layout(&self, peer: Uuid) { + let resp = self + .rpc_helper + .call( + &self.system_endpoint, + peer, + SystemRpc::PullClusterLayout, + RequestStrategy::with_priority(PRIO_HIGH), + ) + .await; + if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp { + let _: Result<_, _> = self.handle_advertise_cluster_layout(&layout).await; + } + } + + // ---- INTERNALS --- + + /// Save network configuration to disc + async fn save_cluster_layout(&self) -> Result<(), Error> { + let layout: Arc = self.layout_watch.borrow().clone(); + self.persist_cluster_layout + .save_async(&layout) + .await + .expect("Cannot save current cluster layout"); + Ok(()) + } + + // ---- RPC HANDLERS ---- + + pub(crate) fn handle_pull_cluster_layout(&self) -> SystemRpc { + let layout = self.layout_watch.borrow().as_ref().clone(); + SystemRpc::AdvertiseClusterLayout(layout) + } + + pub(crate) async fn handle_advertise_cluster_layout( + &self, + adv: &LayoutHistory, + ) -> Result { + if adv.current().replication_factor != self.replication_factor { + let msg = format!( + "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.", + adv.current().replication_factor, + self.replication_factor + ); + error!("{}", msg); + return Err(Error::Message(msg)); + } + + let update_layout = self.update_layout.lock().await; + // TODO: don't clone each time an AdvertiseClusterLayout is received + let mut layout: LayoutHistory = self.layout_watch.borrow().as_ref().clone(); + + let prev_layout_check = layout.check().is_ok(); + if layout.merge(adv) { + if prev_layout_check && layout.check().is_err() { + error!("New cluster layout is invalid, discarding."); + return Err(Error::Message( + "New cluster layout is invalid, discarding.".into(), + )); + } + + update_layout.send(Arc::new(layout.clone()))?; + drop(update_layout); + + /* TODO + tokio::spawn(async move { + if let Err(e) = system + .rpc_helper() + .broadcast( + &system.system_endpoint, + SystemRpc::AdvertiseClusterLayout(layout), + RequestStrategy::with_priority(PRIO_HIGH), + ) + .await + { + warn!("Error while broadcasting new cluster layout: {}", e); + } + }); + */ + + self.save_cluster_layout().await?; + } + + Ok(SystemRpc::Ok) + } +} diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index 7c15988a..cd3764bc 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -3,6 +3,8 @@ mod history; mod schema; mod version; +pub mod manager; + // ---- re-exports ---- pub use history::*; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index a7433b68..a8e88425 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -14,7 +14,6 @@ use serde::{Deserialize, Serialize}; use sodiumoxide::crypto::sign::ed25519; use tokio::select; use tokio::sync::watch; -use tokio::sync::Mutex; use netapp::endpoint::{Endpoint, EndpointHandler}; use netapp::message::*; @@ -34,6 +33,7 @@ use garage_util::time::*; use crate::consul::ConsulDiscovery; #[cfg(feature = "kubernetes-discovery")] use crate::kubernetes::*; +use crate::layout::manager::LayoutManager; use crate::layout::*; use crate::replication_mode::*; use crate::rpc_helper::*; @@ -49,7 +49,7 @@ const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10); pub const GARAGE_VERSION_TAG: u64 = 0x676172616765000A; // garage 0x000A /// RPC endpoint used for calls related to membership -pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc"; +pub const SYSTEM_RPC_PATH: &str = "garage_rpc/system.rs/SystemRpc"; /// RPC messages related to membership #[derive(Debug, Serialize, Deserialize, Clone)] @@ -58,17 +58,17 @@ pub enum SystemRpc { Ok, /// Request to connect to a specific node (in @: format) Connect(String), - /// Ask other node its cluster layout. Answered with AdvertiseClusterLayout - PullClusterLayout, /// Advertise Garage status. Answered with another AdvertiseStatus. /// Exchanged with every node on a regular basis. AdvertiseStatus(NodeStatus), - /// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout - AdvertiseClusterLayout(LayoutHistory), /// Get known nodes states GetKnownNodes, /// Return known nodes ReturnKnownNodes(Vec), + /// Ask other node its cluster layout. Answered with AdvertiseClusterLayout + PullClusterLayout, + /// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout + AdvertiseClusterLayout(LayoutHistory), } impl Rpc for SystemRpc { @@ -84,7 +84,6 @@ pub struct System { /// The id of this node pub id: Uuid, - persist_cluster_layout: Persister, persist_peer_list: Persister, local_status: ArcSwap, @@ -92,9 +91,8 @@ pub struct System { pub netapp: Arc, fullmesh: Arc, - pub rpc: RpcHelper, - system_endpoint: Arc>, + pub(crate) system_endpoint: Arc>, rpc_listen_addr: SocketAddr, #[cfg(any(feature = "consul-discovery", feature = "kubernetes-discovery"))] @@ -106,15 +104,13 @@ pub struct System { #[cfg(feature = "kubernetes-discovery")] kubernetes_discovery: Option, + pub layout_manager: LayoutManager, + metrics: SystemMetrics, replication_mode: ReplicationMode, replication_factor: usize, - /// The layout - pub layout_watch: watch::Receiver>, - update_layout: Mutex>>, - /// Path to metadata directory pub metadata_dir: PathBuf, /// Path to data directory @@ -128,8 +124,11 @@ pub struct NodeStatus { /// Replication factor configured on the node pub replication_factor: usize, + /// Cluster layout version pub cluster_layout_version: u64, + /// Hash of cluster layout update trackers + // (TODO) pub cluster_layout_trackers_hash: Hash, /// Hash of cluster layout staging data pub cluster_layout_staging_hash: Hash, @@ -247,8 +246,7 @@ impl System { replication_mode: ReplicationMode, config: &Config, ) -> Result, Error> { - let replication_factor = replication_mode.replication_factor(); - + // ---- setup netapp RPC protocol ---- let node_key = gen_node_key(&config.metadata_dir).expect("Unable to read or generate node ID"); info!( @@ -256,81 +254,40 @@ impl System { hex::encode(&node_key.public_key()[..8]) ); - let persist_cluster_layout: Persister = - Persister::new(&config.metadata_dir, "cluster_layout"); - let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list"); - - let cluster_layout = match persist_cluster_layout.load() { - Ok(x) => { - if x.current().replication_factor != replication_factor { - return Err(Error::Message(format!( - "Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", - x.current().replication_factor, - replication_factor - ))); - } - x - } - Err(e) => { - info!( - "No valid previous cluster layout stored ({}), starting fresh.", - e - ); - LayoutHistory::new(replication_factor) - } - }; - - let metrics = SystemMetrics::new(replication_factor); - - let mut local_status = NodeStatus::initial(replication_factor, &cluster_layout); - local_status.update_disk_usage(&config.metadata_dir, &config.data_dir, &metrics); + let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key); + let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into()); - let (update_layout, layout_watch) = watch::channel(Arc::new(cluster_layout)); - - let rpc_public_addr = match &config.rpc_public_addr { - Some(a_str) => { - use std::net::ToSocketAddrs; - match a_str.to_socket_addrs() { - Err(e) => { - error!( - "Cannot resolve rpc_public_addr {} from config file: {}.", - a_str, e - ); - None - } - Ok(a) => { - let a = a.collect::>(); - if a.is_empty() { - error!("rpc_public_addr {} resolve to no known IP address", a_str); - } - if a.len() > 1 { - warn!("Multiple possible resolutions for rpc_public_addr: {:?}. Taking the first one.", a); - } - a.into_iter().next() - } - } - } - None => { - let addr = - get_default_ip().map(|ip| SocketAddr::new(ip, config.rpc_bind_addr.port())); - if let Some(a) = addr { - warn!("Using autodetected rpc_public_addr: {}. Consider specifying it explicitly in configuration file if possible.", a); - } - addr - } - }; + // ---- setup netapp public listener and full mesh peering strategy ---- + let rpc_public_addr = get_rpc_public_addr(config); if rpc_public_addr.is_none() { warn!("This Garage node does not know its publicly reachable RPC address, this might hamper intra-cluster communication."); } - let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key); let fullmesh = FullMeshPeeringStrategy::new(netapp.clone(), vec![], rpc_public_addr); if let Some(ping_timeout) = config.rpc_ping_timeout_msec { fullmesh.set_ping_timeout_millis(ping_timeout); } - let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into()); + let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list"); + // ---- setup cluster layout and layout manager ---- + let replication_factor = replication_mode.replication_factor(); + + let layout_manager = LayoutManager::new( + config, + netapp.id, + system_endpoint.clone(), + fullmesh.clone(), + replication_factor, + )?; + + // ---- set up metrics and status exchange ---- + let metrics = SystemMetrics::new(replication_factor); + + let mut local_status = NodeStatus::initial(replication_factor, &layout_manager.history()); + local_status.update_disk_usage(&config.metadata_dir, &config.data_dir, &metrics); + + // ---- if enabled, set up additionnal peer discovery methods ---- #[cfg(feature = "consul-discovery")] let consul_discovery = match &config.consul_discovery { Some(cfg) => Some( @@ -349,20 +306,14 @@ impl System { warn!("Kubernetes discovery is not enabled in this build."); } + // ---- done ---- let sys = Arc::new(System { id: netapp.id.into(), - persist_cluster_layout, persist_peer_list, local_status: ArcSwap::new(Arc::new(local_status)), node_status: RwLock::new(HashMap::new()), netapp: netapp.clone(), fullmesh: fullmesh.clone(), - rpc: RpcHelper::new( - netapp.id.into(), - fullmesh, - layout_watch.clone(), - config.rpc_timeout_msec.map(Duration::from_millis), - ), system_endpoint, replication_mode, replication_factor, @@ -374,10 +325,9 @@ impl System { consul_discovery, #[cfg(feature = "kubernetes-discovery")] kubernetes_discovery: config.kubernetes_discovery.clone(), + layout_manager, metrics, - layout_watch, - update_layout: Mutex::new(update_layout), metadata_dir: config.metadata_dir.clone(), data_dir: config.data_dir.clone(), }); @@ -397,6 +347,20 @@ impl System { ); } + // ---- Public utilities / accessors ---- + + pub fn cluster_layout(&self) -> watch::Ref> { + self.layout_manager.history() + } + + pub fn layout_watch(&self) -> watch::Receiver> { + self.layout_manager.layout_watch.clone() + } + + pub fn rpc_helper(&self) -> &RpcHelper { + &self.layout_manager.rpc_helper + } + // ---- Administrative operations (directly available and // also available through RPC) ---- @@ -423,18 +387,6 @@ impl System { known_nodes } - pub fn cluster_layout(&self) -> watch::Ref> { - self.layout_watch.borrow() - } - - pub async fn update_cluster_layout( - self: &Arc, - layout: &LayoutHistory, - ) -> Result<(), Error> { - self.handle_advertise_cluster_layout(layout).await?; - Ok(()) - } - pub async fn connect(&self, node: &str) -> Result<(), Error> { let (pubkey, addrs) = parse_and_resolve_peer_addr_async(node) .await @@ -464,7 +416,7 @@ impl System { } pub fn health(&self) -> ClusterHealth { - let layout: Arc<_> = self.layout_watch.borrow().clone(); + let layout: Arc<_> = self.cluster_layout().clone(); let quorum = self.replication_mode.write_quorum(); let replication_factor = self.replication_factor; @@ -581,20 +533,10 @@ impl System { } } - /// Save network configuration to disc - async fn save_cluster_layout(&self) -> Result<(), Error> { - let layout: Arc = self.layout_watch.borrow().clone(); - self.persist_cluster_layout - .save_async(&layout) - .await - .expect("Cannot save current cluster layout"); - Ok(()) - } - fn update_local_status(&self) { let mut new_si: NodeStatus = self.local_status.load().as_ref().clone(); - let layout = self.layout_watch.borrow(); + let layout = self.cluster_layout(); new_si.cluster_layout_version = layout.current().version; new_si.cluster_layout_staging_hash = layout.staging_hash; @@ -610,11 +552,6 @@ impl System { Ok(SystemRpc::Ok) } - fn handle_pull_cluster_layout(&self) -> SystemRpc { - let layout = self.layout_watch.borrow().as_ref().clone(); - SystemRpc::AdvertiseClusterLayout(layout) - } - fn handle_get_known_nodes(&self) -> SystemRpc { let known_nodes = self.get_known_nodes(); SystemRpc::ReturnKnownNodes(known_nodes) @@ -637,7 +574,10 @@ impl System { if info.cluster_layout_version > local_info.cluster_layout_version || info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash { - tokio::spawn(self.clone().pull_cluster_layout(from)); + tokio::spawn({ + let system = self.clone(); + async move { system.layout_manager.pull_cluster_layout(from).await } + }); } self.node_status @@ -648,57 +588,6 @@ impl System { Ok(SystemRpc::Ok) } - async fn handle_advertise_cluster_layout( - self: &Arc, - adv: &LayoutHistory, - ) -> Result { - if adv.current().replication_factor != self.replication_factor { - let msg = format!( - "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.", - adv.current().replication_factor, - self.replication_factor - ); - error!("{}", msg); - return Err(Error::Message(msg)); - } - - let update_layout = self.update_layout.lock().await; - // TODO: don't clone each time an AdvertiseClusterLayout is received - let mut layout: LayoutHistory = self.layout_watch.borrow().as_ref().clone(); - - let prev_layout_check = layout.check().is_ok(); - if layout.merge(adv) { - if prev_layout_check && layout.check().is_err() { - error!("New cluster layout is invalid, discarding."); - return Err(Error::Message( - "New cluster layout is invalid, discarding.".into(), - )); - } - - update_layout.send(Arc::new(layout.clone()))?; - drop(update_layout); - - let self2 = self.clone(); - tokio::spawn(async move { - if let Err(e) = self2 - .rpc - .broadcast( - &self2.system_endpoint, - SystemRpc::AdvertiseClusterLayout(layout), - RequestStrategy::with_priority(PRIO_HIGH), - ) - .await - { - warn!("Error while broadcasting new cluster layout: {}", e); - } - }); - - self.save_cluster_layout().await?; - } - - Ok(SystemRpc::Ok) - } - async fn status_exchange_loop(&self, mut stop_signal: watch::Receiver) { while !*stop_signal.borrow() { let restart_at = Instant::now() + STATUS_EXCHANGE_INTERVAL; @@ -706,7 +595,7 @@ impl System { self.update_local_status(); let local_status: NodeStatus = self.local_status.load().as_ref().clone(); let _ = self - .rpc + .rpc_helper() .broadcast( &self.system_endpoint, SystemRpc::AdvertiseStatus(local_status), @@ -724,9 +613,9 @@ impl System { async fn discovery_loop(self: &Arc, mut stop_signal: watch::Receiver) { while !*stop_signal.borrow() { - let not_configured = self.layout_watch.borrow().check().is_err(); + let not_configured = self.cluster_layout().check().is_err(); let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor; - let expected_n_nodes = self.layout_watch.borrow().current().num_nodes(); + let expected_n_nodes = self.cluster_layout().current().num_nodes(); let bad_peers = self .fullmesh .get_peer_list() @@ -831,34 +720,26 @@ impl System { .save_async(&PeerList(peer_list)) .await } - - async fn pull_cluster_layout(self: Arc, peer: Uuid) { - let resp = self - .rpc - .call( - &self.system_endpoint, - peer, - SystemRpc::PullClusterLayout, - RequestStrategy::with_priority(PRIO_HIGH), - ) - .await; - if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp { - let _: Result<_, _> = self.handle_advertise_cluster_layout(&layout).await; - } - } } #[async_trait] impl EndpointHandler for System { async fn handle(self: &Arc, msg: &SystemRpc, from: NodeID) -> Result { match msg { + // ---- system functions -> System ---- SystemRpc::Connect(node) => self.handle_connect(node).await, - SystemRpc::PullClusterLayout => Ok(self.handle_pull_cluster_layout()), SystemRpc::AdvertiseStatus(adv) => self.handle_advertise_status(from.into(), adv).await, + SystemRpc::GetKnownNodes => Ok(self.handle_get_known_nodes()), + + // ---- layout functions -> LayoutManager ---- + SystemRpc::PullClusterLayout => Ok(self.layout_manager.handle_pull_cluster_layout()), SystemRpc::AdvertiseClusterLayout(adv) => { - self.clone().handle_advertise_cluster_layout(adv).await + self.layout_manager + .handle_advertise_cluster_layout(adv) + .await } - SystemRpc::GetKnownNodes => Ok(self.handle_get_known_nodes()), + + // ---- other -> Error ---- m => Err(Error::unexpected_rpc_message(m)), } } @@ -962,6 +843,40 @@ fn get_default_ip() -> Option { .map(|a| a.ip()) } +fn get_rpc_public_addr(config: &Config) -> Option { + match &config.rpc_public_addr { + Some(a_str) => { + use std::net::ToSocketAddrs; + match a_str.to_socket_addrs() { + Err(e) => { + error!( + "Cannot resolve rpc_public_addr {} from config file: {}.", + a_str, e + ); + None + } + Ok(a) => { + let a = a.collect::>(); + if a.is_empty() { + error!("rpc_public_addr {} resolve to no known IP address", a_str); + } + if a.len() > 1 { + warn!("Multiple possible resolutions for rpc_public_addr: {:?}. Taking the first one.", a); + } + a.into_iter().next() + } + } + } + None => { + let addr = get_default_ip().map(|ip| SocketAddr::new(ip, config.rpc_bind_addr.port())); + if let Some(a) = addr { + warn!("Using autodetected rpc_public_addr: {}. Consider specifying it explicitly in configuration file if possible.", a); + } + addr + } + } +} + async fn resolve_peers(peers: &[String]) -> Vec<(NodeID, SocketAddr)> { let mut ret = vec![]; diff --git a/src/table/gc.rs b/src/table/gc.rs index 5b9124a7..2135a358 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -227,7 +227,7 @@ impl TableGc { // GC'ing is not a critical function of the system, so it's not a big // deal if we can't do it right now. self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &nodes[..], @@ -248,7 +248,7 @@ impl TableGc { // it means that the garbage collection wasn't completed and has // to be retried later. self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &nodes[..], diff --git a/src/table/sync.rs b/src/table/sync.rs index 620d83b9..2da1bfe7 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -91,7 +91,7 @@ impl TableSyncer { bg.spawn_worker(SyncWorker { syncer: self.clone(), - layout_watch: self.system.layout_watch.clone(), + layout_watch: self.system.layout_watch(), layout: self.system.cluster_layout().clone(), add_full_sync_rx, todo: vec![], @@ -244,7 +244,7 @@ impl TableSyncer { } self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, nodes, @@ -305,7 +305,7 @@ impl TableSyncer { // If so, do nothing. let root_resp = self .system - .rpc + .rpc_helper() .call( &self.endpoint, who, @@ -361,7 +361,7 @@ impl TableSyncer { // and compare it with local node let remote_node = match self .system - .rpc + .rpc_helper() .call( &self.endpoint, who, @@ -437,7 +437,7 @@ impl TableSyncer { let rpc_resp = self .system - .rpc + .rpc_helper() .call( &self.endpoint, who, diff --git a/src/table/table.rs b/src/table/table.rs index 7ad79677..3e3fd138 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -123,7 +123,7 @@ impl Table { let rpc = TableRpc::::Update(vec![e_enc]); self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &who[..], @@ -181,7 +181,7 @@ impl Table { let resp = self .system - .rpc + .rpc_helper() .call( &self.endpoint, node, @@ -236,7 +236,7 @@ impl Table { let rpc = TableRpc::::ReadEntry(partition_key.clone(), sort_key.clone()); let resps = self .system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &who[..], @@ -332,7 +332,7 @@ impl Table { let resps = self .system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &who[..], @@ -411,7 +411,7 @@ impl Table { async fn repair_on_read(&self, who: &[Uuid], what: F::E) -> Result<(), Error> { let what_enc = Arc::new(ByteBuf::from(what.encode()?)); self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, who, -- cgit v1.2.3 From 19ef1ec8e7fee3a6c670e6e35dfcc83f0801e604 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 13:34:14 +0100 Subject: layout: more refactoring --- src/garage/cli/layout.rs | 6 ++- src/rpc/layout/manager.rs | 116 ++++++++++++++++++++++++++++++---------------- src/rpc/layout/schema.rs | 6 +-- src/rpc/system.rs | 40 ++++++---------- 4 files changed, 97 insertions(+), 71 deletions(-) (limited to 'src') diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 269d92f4..bffc81d3 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use bytesize::ByteSize; use format_table::format_table; @@ -321,7 +323,7 @@ pub async fn fetch_layout( .call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL) .await?? { - SystemRpc::AdvertiseClusterLayout(t) => Ok(t), + SystemRpc::AdvertiseClusterLayout(t) => Ok(Arc::try_unwrap(t).unwrap()), resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), } } @@ -334,7 +336,7 @@ pub async fn send_layout( rpc_cli .call( &rpc_host, - SystemRpc::AdvertiseClusterLayout(layout), + SystemRpc::AdvertiseClusterLayout(Arc::new(layout)), PRIO_NORMAL, ) .await??; diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index a8a77139..351e0959 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -1,6 +1,8 @@ use std::sync::Arc; use std::time::Duration; +use serde::{Deserialize, Serialize}; + use tokio::sync::watch; use tokio::sync::Mutex; @@ -28,6 +30,16 @@ pub struct LayoutManager { system_endpoint: Arc>, } +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct LayoutStatus { + /// Cluster layout version + pub cluster_layout_version: u64, + /// Hash of cluster layout update trackers + // (TODO) pub cluster_layout_trackers_hash: Hash, + /// Hash of cluster layout staging data + pub cluster_layout_staging_hash: Hash, +} + impl LayoutManager { pub fn new( config: &Config, @@ -35,7 +47,7 @@ impl LayoutManager { system_endpoint: Arc>, fullmesh: Arc, replication_factor: usize, - ) -> Result { + ) -> Result, Error> { let persist_cluster_layout: Persister = Persister::new(&config.metadata_dir, "cluster_layout"); @@ -68,28 +80,39 @@ impl LayoutManager { config.rpc_timeout_msec.map(Duration::from_millis), ); - Ok(Self { + Ok(Arc::new(Self { replication_factor, persist_cluster_layout, layout_watch, update_layout: Mutex::new(update_layout), system_endpoint, rpc_helper, - }) + })) } // ---- PUBLIC INTERFACE ---- - pub async fn update_cluster_layout(&self, layout: &LayoutHistory) -> Result<(), Error> { + pub fn status(&self) -> LayoutStatus { + let layout = self.layout(); + LayoutStatus { + cluster_layout_version: layout.current().version, + cluster_layout_staging_hash: layout.staging_hash, + } + } + + pub async fn update_cluster_layout( + self: &Arc, + layout: &LayoutHistory, + ) -> Result<(), Error> { self.handle_advertise_cluster_layout(layout).await?; Ok(()) } - pub fn history(&self) -> watch::Ref> { + pub fn layout(&self) -> watch::Ref> { self.layout_watch.borrow() } - pub(crate) async fn pull_cluster_layout(&self, peer: Uuid) { + pub(crate) async fn pull_cluster_layout(self: &Arc, peer: Uuid) { let resp = self .rpc_helper .call( @@ -118,13 +141,25 @@ impl LayoutManager { // ---- RPC HANDLERS ---- + pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, status: &LayoutStatus) { + let local_status = self.status(); + if status.cluster_layout_version > local_status.cluster_layout_version + || status.cluster_layout_staging_hash != local_status.cluster_layout_staging_hash + { + tokio::spawn({ + let this = self.clone(); + async move { this.pull_cluster_layout(from).await } + }); + } + } + pub(crate) fn handle_pull_cluster_layout(&self) -> SystemRpc { - let layout = self.layout_watch.borrow().as_ref().clone(); + let layout = self.layout_watch.borrow().clone(); SystemRpc::AdvertiseClusterLayout(layout) } pub(crate) async fn handle_advertise_cluster_layout( - &self, + self: &Arc, adv: &LayoutHistory, ) -> Result { if adv.current().replication_factor != self.replication_factor { @@ -137,39 +172,42 @@ impl LayoutManager { return Err(Error::Message(msg)); } - let update_layout = self.update_layout.lock().await; - // TODO: don't clone each time an AdvertiseClusterLayout is received - let mut layout: LayoutHistory = self.layout_watch.borrow().as_ref().clone(); - - let prev_layout_check = layout.check().is_ok(); - if layout.merge(adv) { - if prev_layout_check && layout.check().is_err() { - error!("New cluster layout is invalid, discarding."); - return Err(Error::Message( - "New cluster layout is invalid, discarding.".into(), - )); - } - - update_layout.send(Arc::new(layout.clone()))?; - drop(update_layout); - - /* TODO - tokio::spawn(async move { - if let Err(e) = system - .rpc_helper() - .broadcast( - &system.system_endpoint, - SystemRpc::AdvertiseClusterLayout(layout), - RequestStrategy::with_priority(PRIO_HIGH), - ) - .await - { - warn!("Error while broadcasting new cluster layout: {}", e); + if *adv != **self.layout_watch.borrow() { + let update_layout = self.update_layout.lock().await; + let mut layout: LayoutHistory = self.layout_watch.borrow().as_ref().clone(); + + let prev_layout_check = layout.check().is_ok(); + if layout.merge(adv) { + if prev_layout_check && layout.check().is_err() { + error!("New cluster layout is invalid, discarding."); + return Err(Error::Message( + "New cluster layout is invalid, discarding.".into(), + )); } - }); - */ - self.save_cluster_layout().await?; + let layout = Arc::new(layout); + update_layout.send(layout.clone())?; + drop(update_layout); // release mutex + + tokio::spawn({ + let this = self.clone(); + async move { + if let Err(e) = this + .rpc_helper + .broadcast( + &this.system_endpoint, + SystemRpc::AdvertiseClusterLayout(layout), + RequestStrategy::with_priority(PRIO_HIGH), + ) + .await + { + warn!("Error while broadcasting new cluster layout: {}", e); + } + } + }); + + self.save_cluster_layout().await?; + } } Ok(SystemRpc::Ok) diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index c5b9b1d3..d587a6cb 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -226,7 +226,7 @@ mod v010 { } /// The history of cluster layouts - #[derive(Clone, Debug, Serialize, Deserialize)] + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] pub struct LayoutHistory { /// The versions currently in use in the cluster pub versions: Vec, @@ -241,7 +241,7 @@ mod v010 { } /// The tracker of acknowlegments and data syncs around the cluster - #[derive(Clone, Debug, Serialize, Deserialize, Default)] + #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] pub struct UpdateTrackers { /// The highest layout version number each node has ack'ed pub ack_map: UpdateTracker, @@ -253,7 +253,7 @@ mod v010 { } /// The history of cluster layouts - #[derive(Clone, Debug, Serialize, Deserialize, Default)] + #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] pub struct UpdateTracker(pub HashMap); impl garage_util::migrate::Migrate for LayoutHistory { diff --git a/src/rpc/system.rs b/src/rpc/system.rs index a8e88425..88c4d443 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -33,7 +33,7 @@ use garage_util::time::*; use crate::consul::ConsulDiscovery; #[cfg(feature = "kubernetes-discovery")] use crate::kubernetes::*; -use crate::layout::manager::LayoutManager; +use crate::layout::manager::{LayoutManager, LayoutStatus}; use crate::layout::*; use crate::replication_mode::*; use crate::rpc_helper::*; @@ -68,7 +68,7 @@ pub enum SystemRpc { /// Ask other node its cluster layout. Answered with AdvertiseClusterLayout PullClusterLayout, /// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout - AdvertiseClusterLayout(LayoutHistory), + AdvertiseClusterLayout(Arc), } impl Rpc for SystemRpc { @@ -104,7 +104,7 @@ pub struct System { #[cfg(feature = "kubernetes-discovery")] kubernetes_discovery: Option, - pub layout_manager: LayoutManager, + pub layout_manager: Arc, metrics: SystemMetrics, @@ -125,12 +125,8 @@ pub struct NodeStatus { /// Replication factor configured on the node pub replication_factor: usize, - /// Cluster layout version - pub cluster_layout_version: u64, - /// Hash of cluster layout update trackers - // (TODO) pub cluster_layout_trackers_hash: Hash, - /// Hash of cluster layout staging data - pub cluster_layout_staging_hash: Hash, + /// Layout status + pub layout_status: LayoutStatus, /// Disk usage on partition containing metadata directory (tuple: `(avail, total)`) #[serde(default)] @@ -284,7 +280,7 @@ impl System { // ---- set up metrics and status exchange ---- let metrics = SystemMetrics::new(replication_factor); - let mut local_status = NodeStatus::initial(replication_factor, &layout_manager.history()); + let mut local_status = NodeStatus::initial(replication_factor, &layout_manager); local_status.update_disk_usage(&config.metadata_dir, &config.data_dir, &metrics); // ---- if enabled, set up additionnal peer discovery methods ---- @@ -350,7 +346,7 @@ impl System { // ---- Public utilities / accessors ---- pub fn cluster_layout(&self) -> watch::Ref> { - self.layout_manager.history() + self.layout_manager.layout() } pub fn layout_watch(&self) -> watch::Receiver> { @@ -536,9 +532,7 @@ impl System { fn update_local_status(&self) { let mut new_si: NodeStatus = self.local_status.load().as_ref().clone(); - let layout = self.cluster_layout(); - new_si.cluster_layout_version = layout.current().version; - new_si.cluster_layout_staging_hash = layout.staging_hash; + new_si.layout_status = self.layout_manager.status(); new_si.update_disk_usage(&self.metadata_dir, &self.data_dir, &self.metrics); @@ -571,14 +565,8 @@ impl System { std::process::exit(1); } - if info.cluster_layout_version > local_info.cluster_layout_version - || info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash - { - tokio::spawn({ - let system = self.clone(); - async move { system.layout_manager.pull_cluster_layout(from).await } - }); - } + self.layout_manager + .handle_advertise_status(from, &info.layout_status); self.node_status .write() @@ -746,14 +734,13 @@ impl EndpointHandler for System { } impl NodeStatus { - fn initial(replication_factor: usize, layout: &LayoutHistory) -> Self { + fn initial(replication_factor: usize, layout_manager: &LayoutManager) -> Self { NodeStatus { hostname: gethostname::gethostname() .into_string() .unwrap_or_else(|_| "".to_string()), replication_factor, - cluster_layout_version: layout.current().version, - cluster_layout_staging_hash: layout.staging_hash, + layout_status: layout_manager.status(), meta_disk_avail: None, data_disk_avail: None, } @@ -763,8 +750,7 @@ impl NodeStatus { NodeStatus { hostname: "?".to_string(), replication_factor: 0, - cluster_layout_version: 0, - cluster_layout_staging_hash: Hash::from([0u8; 32]), + layout_status: Default::default(), meta_disk_avail: None, data_disk_avail: None, } -- cgit v1.2.3 From bfb1845fdc981a370539d641a5d80f438f184f07 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 14:12:05 +0100 Subject: layout: refactor to use a RwLock on LayoutHistory --- src/api/admin/cluster.rs | 6 +-- src/api/k2v/index.rs | 11 ++---- src/garage/cli/layout.rs | 6 +-- src/model/helper/bucket.rs | 11 ++---- src/rpc/layout/manager.rs | 93 +++++++++++++++++++++++----------------------- src/rpc/rpc_helper.rs | 11 +++--- src/rpc/system.rs | 15 ++++---- src/table/sync.rs | 21 +++++------ 8 files changed, 82 insertions(+), 92 deletions(-) (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index f5483451..593bd778 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -210,7 +210,7 @@ pub async fn handle_update_cluster_layout( ) -> Result, Error> { let updates = parse_json_body::(req).await?; - let mut layout = garage.system.cluster_layout().as_ref().clone(); + let mut layout = garage.system.cluster_layout().clone(); let mut roles = layout.current().roles.clone(); roles.merge(&layout.staging.get().roles); @@ -256,7 +256,7 @@ pub async fn handle_apply_cluster_layout( ) -> Result, Error> { let param = parse_json_body::(req).await?; - let layout = garage.system.cluster_layout().as_ref().clone(); + let layout = garage.system.cluster_layout().clone(); let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; garage @@ -273,7 +273,7 @@ pub async fn handle_apply_cluster_layout( } pub async fn handle_revert_cluster_layout(garage: &Arc) -> Result, Error> { - let layout = garage.system.cluster_layout().as_ref().clone(); + let layout = garage.system.cluster_layout().clone(); let layout = layout.revert_staged_changes()?; garage .system diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs index a9bc3826..3c2f51a9 100644 --- a/src/api/k2v/index.rs +++ b/src/api/k2v/index.rs @@ -5,7 +5,6 @@ use serde::Serialize; use garage_util::data::*; -use garage_rpc::layout::LayoutHistory; use garage_table::util::*; use garage_model::garage::Garage; @@ -26,7 +25,8 @@ pub async fn handle_read_index( ) -> Result, Error> { let reverse = reverse.unwrap_or(false); - let layout: Arc = garage.system.cluster_layout().clone(); + // TODO: not only current + let node_id_vec = garage.system.cluster_layout().current().node_ids().to_vec(); let (partition_keys, more, next_start) = read_range( &garage.k2v.counter_table.table, @@ -35,10 +35,7 @@ pub async fn handle_read_index( &start, &end, limit, - Some(( - DeletedFilter::NotDeleted, - layout.current().node_id_vec.clone(), - )), + Some((DeletedFilter::NotDeleted, node_id_vec)), EnumerationOrder::from_reverse(reverse), ) .await?; @@ -57,7 +54,7 @@ pub async fn handle_read_index( partition_keys: partition_keys .into_iter() .map(|part| { - let vals = part.filtered_values(&layout); + let vals = part.filtered_values(&garage.system.cluster_layout()); ReadIndexResponseEntry { pk: part.sk, entries: *vals.get(&s_entries).unwrap_or(&0), diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index bffc81d3..269d92f4 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -1,5 +1,3 @@ -use std::sync::Arc; - use bytesize::ByteSize; use format_table::format_table; @@ -323,7 +321,7 @@ pub async fn fetch_layout( .call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL) .await?? { - SystemRpc::AdvertiseClusterLayout(t) => Ok(Arc::try_unwrap(t).unwrap()), + SystemRpc::AdvertiseClusterLayout(t) => Ok(t), resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), } } @@ -336,7 +334,7 @@ pub async fn send_layout( rpc_cli .call( &rpc_host, - SystemRpc::AdvertiseClusterLayout(Arc::new(layout)), + SystemRpc::AdvertiseClusterLayout(layout), PRIO_NORMAL, ) .await??; diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index 18904c8d..2a9c0fb1 100644 --- a/src/model/helper/bucket.rs +++ b/src/model/helper/bucket.rs @@ -450,10 +450,8 @@ impl<'a> BucketHelper<'a> { #[cfg(feature = "k2v")] { - use garage_rpc::layout::LayoutHistory; - use std::sync::Arc; - - let layout: Arc = self.0.system.cluster_layout().clone(); + // TODO: not only current + let node_id_vec = self.0.system.cluster_layout().current().node_ids().to_vec(); let k2vindexes = self .0 .k2v @@ -462,10 +460,7 @@ impl<'a> BucketHelper<'a> { .get_range( &bucket_id, None, - Some(( - DeletedFilter::NotDeleted, - layout.current().node_id_vec.clone(), - )), + Some((DeletedFilter::NotDeleted, node_id_vec)), 10, EnumerationOrder::Forward, ) diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 351e0959..c021039b 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -1,10 +1,9 @@ -use std::sync::Arc; +use std::sync::{Arc, RwLock, RwLockReadGuard}; use std::time::Duration; use serde::{Deserialize, Serialize}; -use tokio::sync::watch; -use tokio::sync::Mutex; +use tokio::sync::Notify; use netapp::endpoint::Endpoint; use netapp::peering::fullmesh::FullMeshPeeringStrategy; @@ -23,8 +22,8 @@ pub struct LayoutManager { replication_factor: usize, persist_cluster_layout: Persister, - pub layout_watch: watch::Receiver>, - update_layout: Mutex>>, + layout: Arc>, + pub(crate) change_notify: Arc, pub(crate) rpc_helper: RpcHelper, system_endpoint: Arc>, @@ -71,20 +70,21 @@ impl LayoutManager { } }; - let (update_layout, layout_watch) = watch::channel(Arc::new(cluster_layout)); + let layout = Arc::new(RwLock::new(cluster_layout)); + let change_notify = Arc::new(Notify::new()); let rpc_helper = RpcHelper::new( node_id.into(), fullmesh, - layout_watch.clone(), + layout.clone(), config.rpc_timeout_msec.map(Duration::from_millis), ); Ok(Arc::new(Self { replication_factor, persist_cluster_layout, - layout_watch, - update_layout: Mutex::new(update_layout), + layout, + change_notify, system_endpoint, rpc_helper, })) @@ -108,8 +108,8 @@ impl LayoutManager { Ok(()) } - pub fn layout(&self) -> watch::Ref> { - self.layout_watch.borrow() + pub fn layout(&self) -> RwLockReadGuard<'_, LayoutHistory> { + self.layout.read().unwrap() } pub(crate) async fn pull_cluster_layout(self: &Arc, peer: Uuid) { @@ -131,7 +131,7 @@ impl LayoutManager { /// Save network configuration to disc async fn save_cluster_layout(&self) -> Result<(), Error> { - let layout: Arc = self.layout_watch.borrow().clone(); + let layout = self.layout.read().unwrap().clone(); // TODO: avoid cloning self.persist_cluster_layout .save_async(&layout) .await @@ -139,6 +139,22 @@ impl LayoutManager { Ok(()) } + fn merge_layout(&self, adv: &LayoutHistory) -> Option { + let mut layout = self.layout.write().unwrap(); + let prev_layout_check = layout.check().is_ok(); + + if !prev_layout_check || adv.check().is_ok() { + if layout.merge(adv) { + if prev_layout_check && layout.check().is_err() { + panic!("Merged two correct layouts and got an incorrect layout."); + } + + return Some(layout.clone()); + } + } + None + } + // ---- RPC HANDLERS ---- pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, status: &LayoutStatus) { @@ -154,7 +170,7 @@ impl LayoutManager { } pub(crate) fn handle_pull_cluster_layout(&self) -> SystemRpc { - let layout = self.layout_watch.borrow().clone(); + let layout = self.layout.read().unwrap().clone(); // TODO: avoid cloning SystemRpc::AdvertiseClusterLayout(layout) } @@ -172,42 +188,27 @@ impl LayoutManager { return Err(Error::Message(msg)); } - if *adv != **self.layout_watch.borrow() { - let update_layout = self.update_layout.lock().await; - let mut layout: LayoutHistory = self.layout_watch.borrow().as_ref().clone(); + if let Some(new_layout) = self.merge_layout(adv) { + self.change_notify.notify_waiters(); - let prev_layout_check = layout.check().is_ok(); - if layout.merge(adv) { - if prev_layout_check && layout.check().is_err() { - error!("New cluster layout is invalid, discarding."); - return Err(Error::Message( - "New cluster layout is invalid, discarding.".into(), - )); - } - - let layout = Arc::new(layout); - update_layout.send(layout.clone())?; - drop(update_layout); // release mutex - - tokio::spawn({ - let this = self.clone(); - async move { - if let Err(e) = this - .rpc_helper - .broadcast( - &this.system_endpoint, - SystemRpc::AdvertiseClusterLayout(layout), - RequestStrategy::with_priority(PRIO_HIGH), - ) - .await - { - warn!("Error while broadcasting new cluster layout: {}", e); - } + tokio::spawn({ + let this = self.clone(); + async move { + if let Err(e) = this + .rpc_helper + .broadcast( + &this.system_endpoint, + SystemRpc::AdvertiseClusterLayout(new_layout), + RequestStrategy::with_priority(PRIO_HIGH), + ) + .await + { + warn!("Error while broadcasting new cluster layout: {}", e); } - }); + } + }); - self.save_cluster_layout().await?; - } + self.save_cluster_layout().await?; } Ok(SystemRpc::Ok) diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 3fdb4acd..ce291068 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -1,12 +1,11 @@ //! Contain structs related to making RPCs -use std::sync::Arc; +use std::sync::{Arc, RwLock}; use std::time::Duration; use futures::future::join_all; use futures::stream::futures_unordered::FuturesUnordered; use futures::stream::StreamExt; use tokio::select; -use tokio::sync::watch; use opentelemetry::KeyValue; use opentelemetry::{ @@ -91,7 +90,7 @@ pub struct RpcHelper(Arc); struct RpcHelperInner { our_node_id: Uuid, fullmesh: Arc, - layout_watch: watch::Receiver>, + layout: Arc>, metrics: RpcMetrics, rpc_timeout: Duration, } @@ -100,7 +99,7 @@ impl RpcHelper { pub(crate) fn new( our_node_id: Uuid, fullmesh: Arc, - layout_watch: watch::Receiver>, + layout: Arc>, rpc_timeout: Option, ) -> Self { let metrics = RpcMetrics::new(); @@ -108,7 +107,7 @@ impl RpcHelper { Self(Arc::new(RpcHelperInner { our_node_id, fullmesh, - layout_watch, + layout, metrics, rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT), })) @@ -392,7 +391,7 @@ impl RpcHelper { pub fn request_order(&self, nodes: &[Uuid]) -> Vec { // Retrieve some status variables that we will use to sort requests let peer_list = self.0.fullmesh.get_peer_list(); - let layout: Arc = self.0.layout_watch.borrow().clone(); + let layout = self.0.layout.read().unwrap(); let our_zone = match layout.current().node_role(&self.0.our_node_id) { Some(pc) => &pc.zone, None => "", diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 88c4d443..cb3af3fe 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -4,7 +4,7 @@ use std::io::{Read, Write}; use std::net::{IpAddr, SocketAddr}; use std::path::{Path, PathBuf}; use std::sync::atomic::Ordering; -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, RwLock, RwLockReadGuard}; use std::time::{Duration, Instant}; use arc_swap::ArcSwap; @@ -13,7 +13,7 @@ use futures::join; use serde::{Deserialize, Serialize}; use sodiumoxide::crypto::sign::ed25519; use tokio::select; -use tokio::sync::watch; +use tokio::sync::{watch, Notify}; use netapp::endpoint::{Endpoint, EndpointHandler}; use netapp::message::*; @@ -68,7 +68,7 @@ pub enum SystemRpc { /// Ask other node its cluster layout. Answered with AdvertiseClusterLayout PullClusterLayout, /// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout - AdvertiseClusterLayout(Arc), + AdvertiseClusterLayout(LayoutHistory), } impl Rpc for SystemRpc { @@ -345,12 +345,12 @@ impl System { // ---- Public utilities / accessors ---- - pub fn cluster_layout(&self) -> watch::Ref> { + pub fn cluster_layout(&self) -> RwLockReadGuard<'_, LayoutHistory> { self.layout_manager.layout() } - pub fn layout_watch(&self) -> watch::Receiver> { - self.layout_manager.layout_watch.clone() + pub fn layout_notify(&self) -> Arc { + self.layout_manager.change_notify.clone() } pub fn rpc_helper(&self) -> &RpcHelper { @@ -412,7 +412,6 @@ impl System { } pub fn health(&self) -> ClusterHealth { - let layout: Arc<_> = self.cluster_layout().clone(); let quorum = self.replication_mode.write_quorum(); let replication_factor = self.replication_factor; @@ -423,6 +422,8 @@ impl System { .collect::>(); let connected_nodes = nodes.iter().filter(|(_, n)| n.is_up).count(); + let layout = self.cluster_layout(); // acquires a rwlock + // TODO: not only layout.current() let storage_nodes = layout .current() diff --git a/src/table/sync.rs b/src/table/sync.rs index 2da1bfe7..4355bd9e 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -10,7 +10,7 @@ use rand::Rng; use serde::{Deserialize, Serialize}; use serde_bytes::ByteBuf; use tokio::select; -use tokio::sync::{mpsc, watch}; +use tokio::sync::{mpsc, watch, Notify}; use garage_util::background::*; use garage_util::data::*; @@ -91,8 +91,8 @@ impl TableSyncer { bg.spawn_worker(SyncWorker { syncer: self.clone(), - layout_watch: self.system.layout_watch(), - layout: self.system.cluster_layout().clone(), + layout_notify: self.system.layout_notify(), + layout_version: self.system.cluster_layout().current().version, add_full_sync_rx, todo: vec![], next_full_sync: Instant::now() + Duration::from_secs(20), @@ -492,8 +492,8 @@ impl EndpointHandler for TableSync struct SyncWorker { syncer: Arc>, - layout_watch: watch::Receiver>, - layout: Arc, + layout_notify: Arc, + layout_version: u64, add_full_sync_rx: mpsc::UnboundedReceiver<()>, todo: Vec, next_full_sync: Instant, @@ -593,12 +593,11 @@ impl Worker for SyncWorker { self.add_full_sync(); } }, - _ = self.layout_watch.changed() => { - let new_layout = self.layout_watch.borrow(); - if !Arc::ptr_eq(&new_layout, &self.layout) { - self.layout = new_layout.clone(); - drop(new_layout); - debug!("({}) Ring changed, adding full sync to syncer todo list", F::TABLE_NAME); + _ = self.layout_notify.notified() => { + let new_version = self.syncer.system.cluster_layout().current().version; + if new_version > self.layout_version { + self.layout_version = new_version; + debug!("({}) Layout changed, adding full sync to syncer todo list", F::TABLE_NAME); self.add_full_sync(); } }, -- cgit v1.2.3 From 94caf9c0c1342ce1d2ba3ac7af39fb133721ee83 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 14:53:34 +0100 Subject: layout: separate code path for synchronizing update trackers only --- src/rpc/layout/history.rs | 51 ++++++++++++----- src/rpc/layout/manager.rs | 140 +++++++++++++++++++++++++++++++++------------- src/rpc/layout/schema.rs | 23 ++++++-- src/rpc/system.rs | 15 ++++- 4 files changed, 168 insertions(+), 61 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 9ae28887..357b9d62 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -18,10 +18,11 @@ impl LayoutHistory { let mut ret = LayoutHistory { versions: vec![version].into_boxed_slice().into(), update_trackers: Default::default(), + trackers_hash: [0u8; 32].into(), staging: Lww::raw(0, staging), staging_hash: [0u8; 32].into(), }; - ret.staging_hash = ret.calculate_staging_hash(); + ret.update_hashes(); ret } @@ -29,6 +30,15 @@ impl LayoutHistory { self.versions.last().as_ref().unwrap() } + pub(crate) fn update_hashes(&mut self) { + self.trackers_hash = self.calculate_trackers_hash(); + self.staging_hash = self.calculate_staging_hash(); + } + + pub(crate) fn calculate_trackers_hash(&self) -> Hash { + blake2sum(&nonversioned_encode(&self.update_trackers).unwrap()[..]) + } + pub(crate) fn calculate_staging_hash(&self) -> Hash { blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) } @@ -38,12 +48,6 @@ impl LayoutHistory { pub fn merge(&mut self, other: &LayoutHistory) -> bool { let mut changed = false; - // Merge staged layout changes - if self.staging != other.staging { - changed = true; - } - self.staging.merge(&other.staging); - // Add any new versions to history for v2 in other.versions.iter() { if let Some(v1) = self.versions.iter().find(|v| v.version == v2.version) { @@ -63,7 +67,21 @@ impl LayoutHistory { } // Merge trackers - self.update_trackers.merge(&other.update_trackers); + if self.update_trackers != other.update_trackers { + let c = self.update_trackers.merge(&other.update_trackers); + changed = changed || c; + } + + // Merge staged layout changes + if self.staging != other.staging { + self.staging.merge(&other.staging); + changed = true; + } + + // Update hashes if there are changes + if changed { + self.update_hashes(); + } changed } @@ -100,7 +118,7 @@ To know the correct value of the new layout version, invoke `garage layout show` parameters: self.staging.get().parameters.clone(), roles: LwwMap::new(), }); - self.staging_hash = self.calculate_staging_hash(); + self.update_hashes(); Ok((self, msg)) } @@ -110,20 +128,25 @@ To know the correct value of the new layout version, invoke `garage layout show` parameters: Lww::new(self.current().parameters.clone()), roles: LwwMap::new(), }); - self.staging_hash = self.calculate_staging_hash(); + self.update_hashes(); Ok(self) } pub fn check(&self) -> Result<(), String> { // Check that the hash of the staging data is correct - let staging_hash = self.calculate_staging_hash(); - if staging_hash != self.staging_hash { + if self.trackers_hash != self.calculate_trackers_hash() { + return Err("trackers_hash is incorrect".into()); + } + if self.staging_hash != self.calculate_staging_hash() { return Err("staging_hash is incorrect".into()); } - // TODO: anythign more ? + for version in self.versions.iter() { + version.check()?; + } - self.current().check() + // TODO: anythign more ? + Ok(()) } } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index c021039b..a2502f58 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -19,6 +19,7 @@ use crate::rpc_helper::*; use crate::system::*; pub struct LayoutManager { + node_id: Uuid, replication_factor: usize, persist_cluster_layout: Persister, @@ -34,7 +35,7 @@ pub struct LayoutStatus { /// Cluster layout version pub cluster_layout_version: u64, /// Hash of cluster layout update trackers - // (TODO) pub cluster_layout_trackers_hash: Hash, + pub cluster_layout_trackers_hash: Hash, /// Hash of cluster layout staging data pub cluster_layout_staging_hash: Hash, } @@ -81,6 +82,7 @@ impl LayoutManager { ); Ok(Arc::new(Self { + node_id: node_id.into(), replication_factor, persist_cluster_layout, layout, @@ -92,10 +94,15 @@ impl LayoutManager { // ---- PUBLIC INTERFACE ---- + pub fn layout(&self) -> RwLockReadGuard<'_, LayoutHistory> { + self.layout.read().unwrap() + } + pub fn status(&self) -> LayoutStatus { let layout = self.layout(); LayoutStatus { cluster_layout_version: layout.current().version, + cluster_layout_trackers_hash: layout.trackers_hash, cluster_layout_staging_hash: layout.staging_hash, } } @@ -108,11 +115,35 @@ impl LayoutManager { Ok(()) } - pub fn layout(&self) -> RwLockReadGuard<'_, LayoutHistory> { - self.layout.read().unwrap() + // ---- INTERNALS --- + + fn merge_layout(&self, adv: &LayoutHistory) -> Option { + let mut layout = self.layout.write().unwrap(); + let prev_layout_check = layout.check().is_ok(); + + if !prev_layout_check || adv.check().is_ok() { + if layout.merge(adv) { + if prev_layout_check && layout.check().is_err() { + panic!("Merged two correct layouts and got an incorrect layout."); + } + + return Some(layout.clone()); + } + } + None } - pub(crate) async fn pull_cluster_layout(self: &Arc, peer: Uuid) { + fn merge_layout_trackers(&self, adv: &UpdateTrackers) -> Option { + let mut layout = self.layout.write().unwrap(); + if layout.update_trackers != *adv { + if layout.update_trackers.merge(adv) { + return Some(layout.update_trackers.clone()); + } + } + None + } + + async fn pull_cluster_layout(self: &Arc, peer: Uuid) { let resp = self .rpc_helper .call( @@ -123,15 +154,35 @@ impl LayoutManager { ) .await; if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp { - let _: Result<_, _> = self.handle_advertise_cluster_layout(&layout).await; + if let Err(e) = self.handle_advertise_cluster_layout(&layout).await { + warn!("In pull_cluster_layout: {}", e); + } } } - // ---- INTERNALS --- + async fn pull_cluster_layout_trackers(self: &Arc, peer: Uuid) { + let resp = self + .rpc_helper + .call( + &self.system_endpoint, + peer, + SystemRpc::PullClusterLayoutTrackers, + RequestStrategy::with_priority(PRIO_HIGH), + ) + .await; + if let Ok(SystemRpc::AdvertiseClusterLayoutTrackers(trackers)) = resp { + if let Err(e) = self + .handle_advertise_cluster_layout_trackers(&trackers) + .await + { + warn!("In pull_cluster_layout_trackers: {}", e); + } + } + } - /// Save network configuration to disc + /// Save cluster layout data to disk async fn save_cluster_layout(&self) -> Result<(), Error> { - let layout = self.layout.read().unwrap().clone(); // TODO: avoid cloning + let layout = self.layout.read().unwrap().clone(); self.persist_cluster_layout .save_async(&layout) .await @@ -139,33 +190,41 @@ impl LayoutManager { Ok(()) } - fn merge_layout(&self, adv: &LayoutHistory) -> Option { - let mut layout = self.layout.write().unwrap(); - let prev_layout_check = layout.check().is_ok(); - - if !prev_layout_check || adv.check().is_ok() { - if layout.merge(adv) { - if prev_layout_check && layout.check().is_err() { - panic!("Merged two correct layouts and got an incorrect layout."); + fn broadcast_update(self: &Arc, rpc: SystemRpc) { + tokio::spawn({ + let this = self.clone(); + async move { + if let Err(e) = this + .rpc_helper + .broadcast( + &this.system_endpoint, + rpc, + RequestStrategy::with_priority(PRIO_HIGH), + ) + .await + { + warn!("Error while broadcasting new cluster layout: {}", e); } - - return Some(layout.clone()); } - } - None + }); } // ---- RPC HANDLERS ---- - pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, status: &LayoutStatus) { - let local_status = self.status(); - if status.cluster_layout_version > local_status.cluster_layout_version - || status.cluster_layout_staging_hash != local_status.cluster_layout_staging_hash + pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, remote: &LayoutStatus) { + let local = self.status(); + if remote.cluster_layout_version > local.cluster_layout_version + || remote.cluster_layout_staging_hash != local.cluster_layout_staging_hash { tokio::spawn({ let this = self.clone(); async move { this.pull_cluster_layout(from).await } }); + } else if remote.cluster_layout_trackers_hash != local.cluster_layout_trackers_hash { + tokio::spawn({ + let this = self.clone(); + async move { this.pull_cluster_layout_trackers(from).await } + }); } } @@ -174,6 +233,11 @@ impl LayoutManager { SystemRpc::AdvertiseClusterLayout(layout) } + pub(crate) fn handle_pull_cluster_layout_trackers(&self) -> SystemRpc { + let layout = self.layout.read().unwrap(); + SystemRpc::AdvertiseClusterLayoutTrackers(layout.update_trackers.clone()) + } + pub(crate) async fn handle_advertise_cluster_layout( self: &Arc, adv: &LayoutHistory, @@ -190,24 +254,20 @@ impl LayoutManager { if let Some(new_layout) = self.merge_layout(adv) { self.change_notify.notify_waiters(); + self.broadcast_update(SystemRpc::AdvertiseClusterLayout(new_layout)); + self.save_cluster_layout().await?; + } - tokio::spawn({ - let this = self.clone(); - async move { - if let Err(e) = this - .rpc_helper - .broadcast( - &this.system_endpoint, - SystemRpc::AdvertiseClusterLayout(new_layout), - RequestStrategy::with_priority(PRIO_HIGH), - ) - .await - { - warn!("Error while broadcasting new cluster layout: {}", e); - } - } - }); + Ok(SystemRpc::Ok) + } + pub(crate) async fn handle_advertise_cluster_layout_trackers( + self: &Arc, + trackers: &UpdateTrackers, + ) -> Result { + if let Some(new_trackers) = self.merge_layout_trackers(trackers) { + self.change_notify.notify_waiters(); + self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(new_trackers)); self.save_cluster_layout().await?; } diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index d587a6cb..abae5bd8 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -233,6 +233,8 @@ mod v010 { /// Update trackers pub update_trackers: UpdateTrackers, + /// Hash of the update trackers + pub trackers_hash: Hash, /// Staged changes for the next version pub staging: Lww, @@ -289,10 +291,12 @@ mod v010 { sync_map: update_tracker.clone(), sync_ack_map: update_tracker.clone(), }, + trackers_hash: [0u8; 32].into(), staging: Lww::raw(previous.version, staging), staging_hash: [0u8; 32].into(), }; ret.staging_hash = ret.calculate_staging_hash(); + ret.trackers_hash = ret.calculate_trackers_hash(); ret } } @@ -355,14 +359,20 @@ impl core::str::FromStr for ZoneRedundancy { } impl UpdateTracker { - fn merge(&mut self, other: &UpdateTracker) { + fn merge(&mut self, other: &UpdateTracker) -> bool { + let mut changed = false; for (k, v) in other.0.iter() { if let Some(v_mut) = self.0.get_mut(k) { - *v_mut = std::cmp::max(*v_mut, *v); + if *v > *v_mut { + *v_mut = *v; + changed = true; + } } else { self.0.insert(*k, *v); + changed = true; } } + changed } pub(crate) fn min(&self) -> u64 { @@ -371,9 +381,10 @@ impl UpdateTracker { } impl UpdateTrackers { - pub(crate) fn merge(&mut self, other: &UpdateTrackers) { - self.ack_map.merge(&other.ack_map); - self.sync_map.merge(&other.sync_map); - self.sync_ack_map.merge(&other.sync_ack_map); + pub(crate) fn merge(&mut self, other: &UpdateTrackers) -> bool { + let c1 = self.ack_map.merge(&other.ack_map); + let c2 = self.sync_map.merge(&other.sync_map); + let c3 = self.sync_ack_map.merge(&other.sync_ack_map); + c1 || c2 || c3 } } diff --git a/src/rpc/system.rs b/src/rpc/system.rs index cb3af3fe..6ce13d0d 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -34,7 +34,7 @@ use crate::consul::ConsulDiscovery; #[cfg(feature = "kubernetes-discovery")] use crate::kubernetes::*; use crate::layout::manager::{LayoutManager, LayoutStatus}; -use crate::layout::*; +use crate::layout::{self, LayoutHistory, NodeRoleV}; use crate::replication_mode::*; use crate::rpc_helper::*; @@ -65,10 +65,15 @@ pub enum SystemRpc { GetKnownNodes, /// Return known nodes ReturnKnownNodes(Vec), + /// Ask other node its cluster layout. Answered with AdvertiseClusterLayout PullClusterLayout, /// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout AdvertiseClusterLayout(LayoutHistory), + /// Ask other node its cluster layout update trackers. + PullClusterLayoutTrackers, + /// Advertisement of cluster layout update trackers. + AdvertiseClusterLayoutTrackers(layout::UpdateTrackers), } impl Rpc for SystemRpc { @@ -727,6 +732,14 @@ impl EndpointHandler for System { .handle_advertise_cluster_layout(adv) .await } + SystemRpc::PullClusterLayoutTrackers => { + Ok(self.layout_manager.handle_pull_cluster_layout_trackers()) + } + SystemRpc::AdvertiseClusterLayoutTrackers(adv) => { + self.layout_manager + .handle_advertise_cluster_layout_trackers(adv) + .await + } // ---- other -> Error ---- m => Err(Error::unexpected_rpc_message(m)), -- cgit v1.2.3 From 03ebf18830dff1983f09abe6ecb8d8d26daeb446 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 15:31:59 +0100 Subject: layout: begin managing the update tracker values --- src/rpc/layout/history.rs | 74 +++++++++++++++++++++++++++++++++++++++++++---- src/rpc/layout/manager.rs | 7 +++-- src/rpc/layout/schema.rs | 15 +++++++--- src/rpc/layout/version.rs | 46 +++++++++++++++-------------- 4 files changed, 109 insertions(+), 33 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 357b9d62..347f03db 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -1,3 +1,5 @@ +use std::collections::HashSet; + use garage_util::crdt::{Crdt, Lww, LwwMap}; use garage_util::data::*; use garage_util::encode::nonversioned_encode; @@ -30,6 +32,14 @@ impl LayoutHistory { self.versions.last().as_ref().unwrap() } + pub fn all_storage_nodes(&self) -> HashSet { + self.versions + .iter() + .map(|x| x.nongateway_nodes()) + .flatten() + .collect::>() + } + pub(crate) fn update_hashes(&mut self) { self.trackers_hash = self.calculate_trackers_hash(); self.staging_hash = self.calculate_staging_hash(); @@ -43,6 +53,65 @@ impl LayoutHistory { blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) } + // ------------------ update tracking --------------- + + pub(crate) fn update_trackers(&mut self, node_id: Uuid) { + // Ensure trackers for this node's values are up-to-date + + // 1. Acknowledge the last layout version in the history + self.ack_last(node_id); + + // 2. Assume the data on this node is sync'ed up at least to + // the first layout version in the history + self.sync_first(node_id); + + // 3. Acknowledge everyone has synced up to min(self.sync_map) + self.sync_ack(node_id); + + // 4. Cleanup layout versions that are not needed anymore + self.cleanup_old_versions(); + + info!("ack_map: {:?}", self.update_trackers.ack_map); + info!("sync_map: {:?}", self.update_trackers.sync_map); + info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); + + // Finally, update hashes + self.update_hashes(); + } + + pub(crate) fn ack_last(&mut self, node: Uuid) { + let last_version = self.current().version; + self.update_trackers.ack_map.set_max(node, last_version); + } + + pub(crate) fn sync_first(&mut self, node: Uuid) { + let first_version = self.versions.first().as_ref().unwrap().version; + self.update_trackers.sync_map.set_max(node, first_version); + } + + pub(crate) fn sync_ack(&mut self, node: Uuid) { + self.update_trackers.sync_ack_map.set_max( + node, + self.calculate_global_min(&self.update_trackers.sync_map), + ); + } + + pub(crate) fn cleanup_old_versions(&mut self) { + let min_sync_ack = self.calculate_global_min(&self.update_trackers.sync_ack_map); + while self.versions.first().as_ref().unwrap().version < min_sync_ack { + self.versions.remove(0); + } + } + + pub(crate) fn calculate_global_min(&self, tracker: &UpdateTracker) -> u64 { + let storage_nodes = self.all_storage_nodes(); + storage_nodes + .iter() + .map(|x| tracker.0.get(x).copied().unwrap_or(0)) + .min() + .unwrap_or(0) + } + // ================== updates to layout, public interface =================== pub fn merge(&mut self, other: &LayoutHistory) -> bool { @@ -78,11 +147,6 @@ impl LayoutHistory { changed = true; } - // Update hashes if there are changes - if changed { - self.update_hashes(); - } - changed } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index a2502f58..ffcc938b 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -51,7 +51,7 @@ impl LayoutManager { let persist_cluster_layout: Persister = Persister::new(&config.metadata_dir, "cluster_layout"); - let cluster_layout = match persist_cluster_layout.load() { + let mut cluster_layout = match persist_cluster_layout.load() { Ok(x) => { if x.current().replication_factor != replication_factor { return Err(Error::Message(format!( @@ -71,6 +71,8 @@ impl LayoutManager { } }; + cluster_layout.update_trackers(node_id.into()); + let layout = Arc::new(RwLock::new(cluster_layout)); let change_notify = Arc::new(Notify::new()); @@ -126,7 +128,7 @@ impl LayoutManager { if prev_layout_check && layout.check().is_err() { panic!("Merged two correct layouts and got an incorrect layout."); } - + layout.update_trackers(self.node_id); return Some(layout.clone()); } } @@ -137,6 +139,7 @@ impl LayoutManager { let mut layout = self.layout.write().unwrap(); if layout.update_trackers != *adv { if layout.update_trackers.merge(adv) { + layout.update_trackers(self.node_id); return Some(layout.update_trackers.clone()); } } diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index abae5bd8..9f5d6f62 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -3,6 +3,7 @@ use std::fmt; use bytesize::ByteSize; use garage_util::crdt::{AutoCrdt, Crdt}; +use garage_util::data::Uuid; mod v08 { use crate::layout::CompactNodeType; @@ -276,8 +277,7 @@ mod v010 { let update_tracker = UpdateTracker( version .nongateway_nodes() - .iter() - .map(|x| (*x, version.version)) + .map(|x| (x, version.version)) .collect::>(), ); let staging = LayoutStaging { @@ -375,8 +375,15 @@ impl UpdateTracker { changed } - pub(crate) fn min(&self) -> u64 { - self.0.iter().map(|(_, v)| *v).min().unwrap_or(0) + pub(crate) fn set_max(&mut self, peer: Uuid, value: u64) { + match self.0.get_mut(&peer) { + Some(e) => { + *e = std::cmp::max(*e, value); + } + None => { + self.0.insert(peer, value); + } + } } } diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 6918fdf9..65c62f63 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -134,15 +134,14 @@ impl LayoutVersion { // ===================== internal information extractors ====================== /// Returns the uuids of the non_gateway nodes in self.node_id_vec. - pub(crate) fn nongateway_nodes(&self) -> Vec { - let mut result = Vec::::new(); - for uuid in self.node_id_vec.iter() { - match self.node_role(uuid) { - Some(role) if role.capacity.is_some() => result.push(*uuid), - _ => (), - } - } - result + pub(crate) fn nongateway_nodes(&self) -> impl Iterator + '_ { + self.node_id_vec + .iter() + .copied() + .filter(move |uuid| match self.node_role(uuid) { + Some(role) if role.capacity.is_some() => true, + _ => false, + }) } /// Given a node uuids, this function returns the label of its zone @@ -158,8 +157,8 @@ impl LayoutVersion { /// Returns the sum of capacities of non gateway nodes in the cluster fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; - for uuid in self.nongateway_nodes().iter() { - total_capacity += self.get_node_capacity(uuid)?; + for uuid in self.nongateway_nodes() { + total_capacity += self.get_node_capacity(&uuid)?; } Ok(total_capacity) } @@ -320,7 +319,7 @@ impl LayoutVersion { // to use them as indices in the flow graphs. let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; - let nb_nongateway_nodes = self.nongateway_nodes().len(); + let nb_nongateway_nodes = self.nongateway_nodes().count(); if nb_nongateway_nodes < self.replication_factor { return Err(Error::Message(format!( "The number of nodes with positive \ @@ -479,7 +478,8 @@ impl LayoutVersion { let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); - for uuid in self.nongateway_nodes().iter() { + let nongateway_nodes = self.nongateway_nodes().collect::>(); + for uuid in nongateway_nodes.iter() { let r = self.node_role(uuid).unwrap(); if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() { zone_to_id.insert(r.zone.clone(), id_to_zone.len()); @@ -556,8 +556,10 @@ impl LayoutVersion { exclude_assoc: &HashSet<(usize, usize)>, zone_redundancy: usize, ) -> Result, Error> { - let vertices = - LayoutVersion::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); + let vertices = LayoutVersion::generate_graph_vertices( + zone_to_id.len(), + self.nongateway_nodes().count(), + ); let mut g = Graph::::new(&vertices); let nb_zones = zone_to_id.len(); for p in 0..NB_PARTITIONS { @@ -576,7 +578,7 @@ impl LayoutVersion { )?; } } - for n in 0..self.nongateway_nodes().len() { + for n in 0..self.nongateway_nodes().count() { let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; @@ -600,7 +602,7 @@ impl LayoutVersion { // previous assignment let mut exclude_edge = HashSet::<(usize, usize)>::new(); if let Some(prev_assign) = prev_assign_opt { - let nb_nodes = self.nongateway_nodes().len(); + let nb_nodes = self.nongateway_nodes().count(); for (p, prev_assign_p) in prev_assign.iter().enumerate() { for n in 0..nb_nodes { exclude_edge.insert((p, n)); @@ -652,7 +654,7 @@ impl LayoutVersion { // We compute the maximal length of a simple path in gflow. It is used in the // Bellman-Ford algorithm in optimize_flow_with_cost to set the number // of iterations. - let nb_nodes = self.nongateway_nodes().len(); + let nb_nodes = self.nongateway_nodes().count(); let path_length = 4 * nb_nodes; gflow.optimize_flow_with_cost(&cost, path_length)?; @@ -730,7 +732,7 @@ impl LayoutVersion { } // We define and fill in the following tables - let storing_nodes = self.nongateway_nodes(); + let storing_nodes = self.nongateway_nodes().collect::>(); let mut new_partitions = vec![0; storing_nodes.len()]; let mut stored_partitions = vec![0; storing_nodes.len()]; @@ -873,9 +875,9 @@ mod tests { for z in zones.iter() { zone_token.insert(z.clone(), 0); } - for uuid in cl.nongateway_nodes().iter() { - let z = cl.get_node_zone(uuid)?; - let c = cl.get_node_capacity(uuid)?; + for uuid in cl.nongateway_nodes() { + let z = cl.get_node_zone(&uuid)?; + let c = cl.get_node_capacity(&uuid)?; zone_token.insert( z.clone(), zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), -- cgit v1.2.3 From bad7cc812ead88e9f334405c5c082d79c14c8898 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 15:42:10 +0100 Subject: layout admin: add missing calls to update_hash --- src/api/admin/cluster.rs | 1 + src/garage/cli/layout.rs | 3 ++- src/rpc/layout/history.rs | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 593bd778..d912b58f 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -240,6 +240,7 @@ pub async fn handle_update_cluster_layout( .merge(&roles.update_mutator(node, layout::NodeRoleV(new_role))); } + layout.update_hashes(); garage .system .layout_manager diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 269d92f4..15727448 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -329,8 +329,9 @@ pub async fn fetch_layout( pub async fn send_layout( rpc_cli: &Endpoint, rpc_host: NodeID, - layout: LayoutHistory, + mut layout: LayoutHistory, ) -> Result<(), Error> { + layout.update_hashes(); rpc_cli .call( &rpc_host, diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 347f03db..e17a1c77 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -40,7 +40,7 @@ impl LayoutHistory { .collect::>() } - pub(crate) fn update_hashes(&mut self) { + pub fn update_hashes(&mut self) { self.trackers_hash = self.calculate_trackers_hash(); self.staging_hash = self.calculate_staging_hash(); } -- cgit v1.2.3 From 9d95f6f7040c1899715ae4f984313427b1432758 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 15:52:45 +0100 Subject: layout: fix tracker bugs --- src/rpc/layout/manager.rs | 8 +++++++- src/rpc/layout/schema.rs | 6 +++--- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index ffcc938b..c1417dac 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -125,10 +125,10 @@ impl LayoutManager { if !prev_layout_check || adv.check().is_ok() { if layout.merge(adv) { + layout.update_trackers(self.node_id); if prev_layout_check && layout.check().is_err() { panic!("Merged two correct layouts and got an incorrect layout."); } - layout.update_trackers(self.node_id); return Some(layout.clone()); } } @@ -245,6 +245,8 @@ impl LayoutManager { self: &Arc, adv: &LayoutHistory, ) -> Result { + debug!("handle_advertise_cluster_layout: {:?}", adv); + if adv.current().replication_factor != self.replication_factor { let msg = format!( "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.", @@ -256,6 +258,8 @@ impl LayoutManager { } if let Some(new_layout) = self.merge_layout(adv) { + debug!("handle_advertise_cluster_layout: some changes were added to the current stuff"); + self.change_notify.notify_waiters(); self.broadcast_update(SystemRpc::AdvertiseClusterLayout(new_layout)); self.save_cluster_layout().await?; @@ -268,6 +272,8 @@ impl LayoutManager { self: &Arc, trackers: &UpdateTrackers, ) -> Result { + debug!("handle_advertise_cluster_layout_trackers: {:?}", trackers); + if let Some(new_trackers) = self.merge_layout_trackers(trackers) { self.change_notify.notify_waiters(); self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(new_trackers)); diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 9f5d6f62..db60c806 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -190,7 +190,7 @@ mod v010 { use garage_util::crdt::{Lww, LwwMap}; use garage_util::data::{Hash, Uuid}; use serde::{Deserialize, Serialize}; - use std::collections::HashMap; + use std::collections::BTreeMap; pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; /// The layout of the cluster, i.e. the list of roles @@ -257,7 +257,7 @@ mod v010 { /// The history of cluster layouts #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] - pub struct UpdateTracker(pub HashMap); + pub struct UpdateTracker(pub BTreeMap); impl garage_util::migrate::Migrate for LayoutHistory { const VERSION_MARKER: &'static [u8] = b"G010lh"; @@ -278,7 +278,7 @@ mod v010 { version .nongateway_nodes() .map(|x| (x, version.version)) - .collect::>(), + .collect::>(), ); let staging = LayoutStaging { parameters: previous.staging_parameters, -- cgit v1.2.3 From df36cf3099f6010c4fc62109b85d4d1e62f160cc Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 16:32:31 +0100 Subject: layout: add helpers to LayoutHistory and prepare integration with Table --- src/rpc/layout/history.rs | 41 +++++++++++++++++++++++++++++++++-------- src/rpc/layout/manager.rs | 33 ++++++++++++++++++++++++++++++++- src/rpc/layout/schema.rs | 9 ++++++--- src/rpc/layout/version.rs | 2 +- src/table/table.rs | 2 ++ 5 files changed, 74 insertions(+), 13 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index e17a1c77..dbb02269 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -32,14 +32,6 @@ impl LayoutHistory { self.versions.last().as_ref().unwrap() } - pub fn all_storage_nodes(&self) -> HashSet { - self.versions - .iter() - .map(|x| x.nongateway_nodes()) - .flatten() - .collect::>() - } - pub fn update_hashes(&mut self) { self.trackers_hash = self.calculate_trackers_hash(); self.staging_hash = self.calculate_staging_hash(); @@ -53,6 +45,39 @@ impl LayoutHistory { blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) } + // ------------------ who stores what now? --------------- + + pub fn max_ack(&self) -> u64 { + self.calculate_global_min(&self.update_trackers.ack_map) + } + + pub fn all_storage_nodes(&self) -> HashSet { + // TODO: cache this + self.versions + .iter() + .map(|x| x.nongateway_nodes()) + .flatten() + .collect::>() + } + + pub fn read_nodes_of(&self, position: &Hash) -> Vec { + let sync_min = self.calculate_global_min(&self.update_trackers.sync_map); + let version = self + .versions + .iter() + .find(|x| x.version == sync_min) + .or(self.versions.last()) + .unwrap(); + version.nodes_of(position, version.replication_factor) + } + + pub fn write_sets_of(&self, position: &Hash) -> Vec> { + self.versions + .iter() + .map(|x| x.nodes_of(position, x.replication_factor)) + .collect::>() + } + // ------------------ update tracking --------------- pub(crate) fn update_trackers(&mut self, node_id: Uuid) { diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index c1417dac..b0302b12 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -1,4 +1,5 @@ -use std::sync::{Arc, RwLock, RwLockReadGuard}; +use std::collections::HashMap; +use std::sync::{Arc, Mutex, RwLock, RwLockReadGuard}; use std::time::Duration; use serde::{Deserialize, Serialize}; @@ -26,6 +27,8 @@ pub struct LayoutManager { layout: Arc>, pub(crate) change_notify: Arc, + table_sync_version: Mutex>, + pub(crate) rpc_helper: RpcHelper, system_endpoint: Arc>, } @@ -117,6 +120,34 @@ impl LayoutManager { Ok(()) } + pub fn add_table(&self, table_name: &'static str) { + let first_version = self.layout().versions.first().unwrap().version; + + self.table_sync_version + .lock() + .unwrap() + .insert(table_name.to_string(), first_version); + } + + pub fn sync_table_until(self: &Arc, table_name: &'static str, version: u64) { + let mut table_sync_version = self.table_sync_version.lock().unwrap(); + *table_sync_version.get_mut(table_name).unwrap() = version; + let sync_until = table_sync_version.iter().map(|(_, v)| *v).max().unwrap(); + drop(table_sync_version); + + let mut layout = self.layout.write().unwrap(); + if layout + .update_trackers + .sync_map + .set_max(self.node_id, sync_until) + { + layout.update_hashes(); + self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( + layout.update_trackers.clone(), + )); + } + } + // ---- INTERNALS --- fn merge_layout(&self, adv: &LayoutHistory) -> Option { diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index db60c806..89f5c361 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -375,14 +375,17 @@ impl UpdateTracker { changed } - pub(crate) fn set_max(&mut self, peer: Uuid, value: u64) { + pub(crate) fn set_max(&mut self, peer: Uuid, value: u64) -> bool { match self.0.get_mut(&peer) { - Some(e) => { - *e = std::cmp::max(*e, value); + Some(e) if *e < value => { + *e = value; + true } None => { self.0.insert(peer, value); + true } + _ => false, } } } diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 65c62f63..8133672a 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -109,7 +109,7 @@ impl LayoutVersion { .collect::>() } - /// Walk the ring to find the n servers in which data should be replicated + /// Return the n servers in which data for this hash should be replicated pub fn nodes_of(&self, position: &Hash, n: usize) -> Vec { assert_eq!(n, self.replication_factor); diff --git a/src/table/table.rs b/src/table/table.rs index 3e3fd138..997fd7dc 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -80,6 +80,8 @@ impl Table { let syncer = TableSyncer::new(system.clone(), data.clone(), merkle_updater.clone()); let gc = TableGc::new(system.clone(), data.clone()); + system.layout_manager.add_table(F::TABLE_NAME); + let table = Arc::new(Self { system, data, -- cgit v1.2.3 From ce89d1ddabe3b9e638b0173949726522ae9a0311 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sat, 11 Nov 2023 12:08:32 +0100 Subject: table sync: adapt to new layout history --- src/rpc/layout/history.rs | 21 +++-- src/rpc/layout/manager.rs | 1 + src/rpc/layout/version.rs | 16 ++-- src/rpc/system.rs | 2 +- src/table/replication/fullcopy.rs | 25 ++++- src/table/replication/parameters.rs | 19 +++- src/table/replication/sharded.rs | 39 +++++++- src/table/sync.rs | 178 +++++++++++++++--------------------- 8 files changed, 172 insertions(+), 129 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index dbb02269..185dbb27 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -47,11 +47,19 @@ impl LayoutHistory { // ------------------ who stores what now? --------------- - pub fn max_ack(&self) -> u64 { + pub fn all_ack(&self) -> u64 { self.calculate_global_min(&self.update_trackers.ack_map) } - pub fn all_storage_nodes(&self) -> HashSet { + pub fn min_stored(&self) -> u64 { + self.versions.first().as_ref().unwrap().version + } + + pub fn sync_versions(&self) -> (u64, u64, u64) { + (self.current().version, self.all_ack(), self.min_stored()) + } + + pub fn all_nongateway_nodes(&self) -> HashSet { // TODO: cache this self.versions .iter() @@ -71,11 +79,10 @@ impl LayoutHistory { version.nodes_of(position, version.replication_factor) } - pub fn write_sets_of(&self, position: &Hash) -> Vec> { + pub fn write_sets_of<'a>(&'a self, position: &'a Hash) -> impl Iterator> + 'a { self.versions .iter() - .map(|x| x.nodes_of(position, x.replication_factor)) - .collect::>() + .map(move |x| x.nodes_of(position, x.replication_factor)) } // ------------------ update tracking --------------- @@ -129,7 +136,9 @@ impl LayoutHistory { } pub(crate) fn calculate_global_min(&self, tracker: &UpdateTracker) -> u64 { - let storage_nodes = self.all_storage_nodes(); + // TODO: for TableFullReplication, counting gateway nodes might be + // necessary? Think about this more. + let storage_nodes = self.all_nongateway_nodes(); storage_nodes .iter() .map(|x| tracker.0.get(x).copied().unwrap_or(0)) diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index b0302b12..7d60bae6 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -92,6 +92,7 @@ impl LayoutManager { persist_cluster_layout, layout, change_notify, + table_sync_version: Mutex::new(HashMap::new()), system_endpoint, rpc_helper, })) diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 8133672a..f45a3c35 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -98,15 +98,13 @@ impl LayoutVersion { } /// Get the list of partitions and the first hash of a partition key that would fall in it - pub fn partitions(&self) -> Vec<(Partition, Hash)> { - (0..(1 << PARTITION_BITS)) - .map(|i| { - let top = (i as u16) << (16 - PARTITION_BITS); - let mut location = [0u8; 32]; - location[..2].copy_from_slice(&u16::to_be_bytes(top)[..]); - (i as u16, Hash::from(location)) - }) - .collect::>() + pub fn partitions(&self) -> impl Iterator + '_ { + (0..(1 << PARTITION_BITS)).map(|i| { + let top = (i as u16) << (16 - PARTITION_BITS); + let mut location = [0u8; 32]; + location[..2].copy_from_slice(&u16::to_be_bytes(top)[..]); + (i as u16, Hash::from(location)) + }) } /// Return the n servers in which data for this hash should be replicated diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 6ce13d0d..3418600b 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -442,7 +442,7 @@ impl System { .filter(|(x, _, _)| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) .count(); - let partitions = layout.current().partitions(); + let partitions = layout.current().partitions().collect::>(); let partitions_n_up = partitions .iter() .map(|(_, h)| { diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index a5c83d0f..5653a229 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -1,3 +1,4 @@ +use std::iter::FromIterator; use std::sync::Arc; use garage_rpc::layout::*; @@ -6,10 +7,17 @@ use garage_util::data::*; use crate::replication::*; +// TODO: find a way to track layout changes for this as well +// The hard thing is that this data is stored also on gateway nodes, +// whereas sharded data is stored only on non-Gateway nodes (storage nodes) +// Also we want to be more tolerant to failures of gateways so we don't +// want to do too much holding back of data when progress of gateway +// nodes is not reported in the layout history's ack/sync/sync_ack maps. + /// Full replication schema: all nodes store everything -/// Writes are disseminated in an epidemic manner in the network /// Advantage: do all reads locally, extremely fast /// Inconvenient: only suitable to reasonably small tables +/// Inconvenient: if some writes fail, nodes will read outdated data #[derive(Clone)] pub struct TableFullReplication { /// The membership manager of this node @@ -44,7 +52,18 @@ impl TableReplication for TableFullReplication { fn partition_of(&self, _hash: &Hash) -> Partition { 0u16 } - fn partitions(&self) -> Vec<(Partition, Hash)> { - vec![(0u16, [0u8; 32].into())] + + fn sync_partitions(&self) -> SyncPartitions { + let layout = self.system.cluster_layout(); + let layout_version = layout.current().version; + SyncPartitions { + layout_version, + partitions: vec![SyncPartition { + partition: 0u16, + first_hash: [0u8; 32].into(), + last_hash: [0xff; 32].into(), + storage_nodes: Vec::from_iter(layout.current().node_ids().to_vec()), + }], + } } } diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index 19b306f2..2a7d3585 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -20,6 +20,21 @@ pub trait TableReplication: Send + Sync + 'static { // Accessing partitions, for Merkle tree & sync /// Get partition for data with given hash fn partition_of(&self, hash: &Hash) -> Partition; - /// List of existing partitions - fn partitions(&self) -> Vec<(Partition, Hash)>; + + /// List of partitions and nodes to sync with in current layout + fn sync_partitions(&self) -> SyncPartitions; +} + +#[derive(Debug)] +pub struct SyncPartitions { + pub layout_version: u64, + pub partitions: Vec, +} + +#[derive(Debug)] +pub struct SyncPartition { + pub partition: Partition, + pub first_hash: Hash, + pub last_hash: Hash, + pub storage_nodes: Vec, } diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 793d87fd..f02b1d66 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -51,7 +51,42 @@ impl TableReplication for TableShardedReplication { fn partition_of(&self, hash: &Hash) -> Partition { self.system.cluster_layout().current().partition_of(hash) } - fn partitions(&self) -> Vec<(Partition, Hash)> { - self.system.cluster_layout().current().partitions() + + fn sync_partitions(&self) -> SyncPartitions { + let layout = self.system.cluster_layout(); + let layout_version = layout.all_ack(); + + let mut partitions = layout + .current() + .partitions() + .map(|(partition, first_hash)| { + let mut storage_nodes = layout + .write_sets_of(&first_hash) + .map(|x| x.into_iter()) + .flatten() + .collect::>(); + storage_nodes.sort(); + storage_nodes.dedup(); + SyncPartition { + partition, + first_hash, + last_hash: [0u8; 32].into(), // filled in just after + storage_nodes, + } + }) + .collect::>(); + + for i in 0..partitions.len() { + partitions[i].last_hash = if i + 1 < partitions.len() { + partitions[i + 1].first_hash + } else { + [0xFFu8; 32].into() + }; + } + + SyncPartitions { + layout_version, + partitions, + } } } diff --git a/src/table/sync.rs b/src/table/sync.rs index 4355bd9e..43636faa 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -6,7 +6,7 @@ use arc_swap::ArcSwapOption; use async_trait::async_trait; use futures_util::stream::*; use opentelemetry::KeyValue; -use rand::Rng; +use rand::prelude::*; use serde::{Deserialize, Serialize}; use serde_bytes::ByteBuf; use tokio::select; @@ -52,16 +52,6 @@ impl Rpc for SyncRpc { type Response = Result; } -#[derive(Debug, Clone)] -struct TodoPartition { - partition: Partition, - begin: Hash, - end: Hash, - - // Are we a node that stores this partition or not? - retain: bool, -} - impl TableSyncer { pub(crate) fn new( system: Arc, @@ -92,9 +82,9 @@ impl TableSyncer { bg.spawn_worker(SyncWorker { syncer: self.clone(), layout_notify: self.system.layout_notify(), - layout_version: self.system.cluster_layout().current().version, + layout_versions: self.system.cluster_layout().sync_versions(), add_full_sync_rx, - todo: vec![], + todo: None, next_full_sync: Instant::now() + Duration::from_secs(20), }); } @@ -112,31 +102,26 @@ impl TableSyncer { async fn sync_partition( self: &Arc, - partition: &TodoPartition, + partition: &SyncPartition, must_exit: &mut watch::Receiver, ) -> Result<(), Error> { - if partition.retain { - let my_id = self.system.id; - - let nodes = self - .data - .replication - .write_nodes(&partition.begin) - .into_iter() - .filter(|node| *node != my_id) - .collect::>(); + let my_id = self.system.id; + let retain = partition.storage_nodes.contains(&my_id); + if retain { debug!( "({}) Syncing {:?} with {:?}...", F::TABLE_NAME, partition, - nodes + partition.storage_nodes ); - let mut sync_futures = nodes + let mut sync_futures = partition + .storage_nodes .iter() + .filter(|node| **node != my_id) .map(|node| { self.clone() - .do_sync_with(partition.clone(), *node, must_exit.clone()) + .do_sync_with(&partition, *node, must_exit.clone()) }) .collect::>(); @@ -147,14 +132,14 @@ impl TableSyncer { warn!("({}) Sync error: {}", F::TABLE_NAME, e); } } - if n_errors > self.data.replication.max_write_errors() { + if n_errors > 0 { return Err(Error::Message(format!( - "Sync failed with too many nodes (should have been: {:?}).", - nodes + "Sync failed with {} nodes.", + n_errors ))); } } else { - self.offload_partition(&partition.begin, &partition.end, must_exit) + self.offload_partition(&partition.first_hash, &partition.last_hash, must_exit) .await?; } @@ -285,7 +270,7 @@ impl TableSyncer { async fn do_sync_with( self: Arc, - partition: TodoPartition, + partition: &SyncPartition, who: Uuid, must_exit: watch::Receiver, ) -> Result<(), Error> { @@ -492,76 +477,23 @@ impl EndpointHandler for TableSync struct SyncWorker { syncer: Arc>, + layout_notify: Arc, - layout_version: u64, + layout_versions: (u64, u64, u64), + add_full_sync_rx: mpsc::UnboundedReceiver<()>, - todo: Vec, next_full_sync: Instant, + + todo: Option, } impl SyncWorker { fn add_full_sync(&mut self) { - let system = &self.syncer.system; - let data = &self.syncer.data; - - let my_id = system.id; - - self.todo.clear(); - - let partitions = data.replication.partitions(); - - for i in 0..partitions.len() { - let begin = partitions[i].1; - - let end = if i + 1 < partitions.len() { - partitions[i + 1].1 - } else { - [0xFFu8; 32].into() - }; - - let nodes = data.replication.write_nodes(&begin); - - let retain = nodes.contains(&my_id); - if !retain { - // Check if we have some data to send, otherwise skip - match data.store.range(begin..end) { - Ok(mut iter) => { - if iter.next().is_none() { - continue; - } - } - Err(e) => { - warn!("DB error in add_full_sync: {}", e); - continue; - } - } - } - - self.todo.push(TodoPartition { - partition: partitions[i].0, - begin, - end, - retain, - }); - } - + let mut partitions = self.syncer.data.replication.sync_partitions(); + partitions.partitions.shuffle(&mut thread_rng()); + self.todo = Some(partitions); self.next_full_sync = Instant::now() + ANTI_ENTROPY_INTERVAL; } - - fn pop_task(&mut self) -> Option { - if self.todo.is_empty() { - return None; - } - - let i = rand::thread_rng().gen_range(0..self.todo.len()); - if i == self.todo.len() - 1 { - self.todo.pop() - } else { - let replacement = self.todo.pop().unwrap(); - let ret = std::mem::replace(&mut self.todo[i], replacement); - Some(ret) - } - } } #[async_trait] @@ -572,18 +504,46 @@ impl Worker for SyncWorker { fn status(&self) -> WorkerStatus { WorkerStatus { - queue_length: Some(self.todo.len() as u64), + queue_length: Some(self.todo.as_ref().map(|x| x.partitions.len()).unwrap_or(0) as u64), ..Default::default() } } async fn work(&mut self, must_exit: &mut watch::Receiver) -> Result { - if let Some(partition) = self.pop_task() { - self.syncer.sync_partition(&partition, must_exit).await?; - Ok(WorkerState::Busy) - } else { - Ok(WorkerState::Idle) + if let Some(todo) = &mut self.todo { + let partition = todo.partitions.pop().unwrap(); + + // process partition + if let Err(e) = self.syncer.sync_partition(&partition, must_exit).await { + error!( + "{}: Failed to sync partition {:?}: {}", + F::TABLE_NAME, + partition, + e + ); + // if error, put partition back at the other side of the queue, + // so that other partitions will be tried in the meantime + todo.partitions.insert(0, partition); + // TODO: returning an error here will cause the background job worker + // to delay this task for some time, but maybe we don't want to + // delay it if there are lots of failures from nodes that are gone + // (we also don't want zero delays as that will cause lots of useless retries) + return Err(e); + } + + // done + if !todo.partitions.is_empty() { + return Ok(WorkerState::Busy); + } + + self.syncer + .system + .layout_manager + .sync_table_until(F::TABLE_NAME, todo.layout_version); } + + self.todo = None; + Ok(WorkerState::Idle) } async fn wait_for_work(&mut self) -> WorkerState { @@ -594,10 +554,16 @@ impl Worker for SyncWorker { } }, _ = self.layout_notify.notified() => { - let new_version = self.syncer.system.cluster_layout().current().version; - if new_version > self.layout_version { - self.layout_version = new_version; - debug!("({}) Layout changed, adding full sync to syncer todo list", F::TABLE_NAME); + let layout_versions = self.syncer.system.cluster_layout().sync_versions(); + if layout_versions != self.layout_versions { + self.layout_versions = layout_versions; + debug!( + "({}) Layout versions changed (max={}, ack={}, min stored={}), adding full sync to syncer todo list", + F::TABLE_NAME, + layout_versions.0, + layout_versions.1, + layout_versions.2 + ); self.add_full_sync(); } }, @@ -605,9 +571,9 @@ impl Worker for SyncWorker { self.add_full_sync(); } } - match self.todo.is_empty() { - false => WorkerState::Busy, - true => WorkerState::Idle, + match self.todo.is_some() { + true => WorkerState::Busy, + false => WorkerState::Idle, } } } -- cgit v1.2.3 From df24bb806d64d5d5e748c35efe3f49ad3dda709e Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sat, 11 Nov 2023 12:37:33 +0100 Subject: layout/sync: fix bugs and add tracing --- src/rpc/layout/history.rs | 3 ++- src/rpc/layout/manager.rs | 10 ++++++-- src/table/sync.rs | 60 ++++++++++++++++++++++++++++++----------------- 3 files changed, 48 insertions(+), 25 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 185dbb27..cef56647 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -131,7 +131,8 @@ impl LayoutHistory { pub(crate) fn cleanup_old_versions(&mut self) { let min_sync_ack = self.calculate_global_min(&self.update_trackers.sync_ack_map); while self.versions.first().as_ref().unwrap().version < min_sync_ack { - self.versions.remove(0); + let removed = self.versions.remove(0); + info!("Layout history: pruning old version {}", removed.version); } } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 7d60bae6..ce8b6f61 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -133,7 +133,7 @@ impl LayoutManager { pub fn sync_table_until(self: &Arc, table_name: &'static str, version: u64) { let mut table_sync_version = self.table_sync_version.lock().unwrap(); *table_sync_version.get_mut(table_name).unwrap() = version; - let sync_until = table_sync_version.iter().map(|(_, v)| *v).max().unwrap(); + let sync_until = table_sync_version.iter().map(|(_, v)| *v).min().unwrap(); drop(table_sync_version); let mut layout = self.layout.write().unwrap(); @@ -142,6 +142,7 @@ impl LayoutManager { .sync_map .set_max(self.node_id, sync_until) { + debug!("sync_until updated to {}", sync_until); layout.update_hashes(); self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( layout.update_trackers.clone(), @@ -277,7 +278,12 @@ impl LayoutManager { self: &Arc, adv: &LayoutHistory, ) -> Result { - debug!("handle_advertise_cluster_layout: {:?}", adv); + debug!( + "handle_advertise_cluster_layout: {} versions, last={}, trackers={:?}", + adv.versions.len(), + adv.current().version, + adv.update_trackers + ); if adv.current().replication_factor != self.replication_factor { let msg = format!( diff --git a/src/table/sync.rs b/src/table/sync.rs index 43636faa..8c21db8b 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -488,8 +488,29 @@ struct SyncWorker { } impl SyncWorker { + fn check_add_full_sync(&mut self) { + let layout_versions = self.syncer.system.cluster_layout().sync_versions(); + if layout_versions != self.layout_versions { + self.layout_versions = layout_versions; + info!( + "({}) Layout versions changed (max={}, ack={}, min stored={}), adding full sync to syncer todo list", + F::TABLE_NAME, + layout_versions.0, + layout_versions.1, + layout_versions.2 + ); + self.add_full_sync(); + } + } + fn add_full_sync(&mut self) { let mut partitions = self.syncer.data.replication.sync_partitions(); + info!( + "{}: Adding full sync for ack layout version {}", + F::TABLE_NAME, + partitions.layout_version + ); + partitions.partitions.shuffle(&mut thread_rng()); self.todo = Some(partitions); self.next_full_sync = Instant::now() + ANTI_ENTROPY_INTERVAL; @@ -510,6 +531,8 @@ impl Worker for SyncWorker { } async fn work(&mut self, must_exit: &mut watch::Receiver) -> Result { + self.check_add_full_sync(); + if let Some(todo) = &mut self.todo { let partition = todo.partitions.pop().unwrap(); @@ -531,19 +554,23 @@ impl Worker for SyncWorker { return Err(e); } - // done - if !todo.partitions.is_empty() { - return Ok(WorkerState::Busy); + if todo.partitions.is_empty() { + info!( + "{}: Completed full sync for ack layout version {}", + F::TABLE_NAME, + todo.layout_version + ); + self.syncer + .system + .layout_manager + .sync_table_until(F::TABLE_NAME, todo.layout_version); + self.todo = None; } - self.syncer - .system - .layout_manager - .sync_table_until(F::TABLE_NAME, todo.layout_version); + Ok(WorkerState::Busy) + } else { + Ok(WorkerState::Idle) } - - self.todo = None; - Ok(WorkerState::Idle) } async fn wait_for_work(&mut self) -> WorkerState { @@ -554,18 +581,7 @@ impl Worker for SyncWorker { } }, _ = self.layout_notify.notified() => { - let layout_versions = self.syncer.system.cluster_layout().sync_versions(); - if layout_versions != self.layout_versions { - self.layout_versions = layout_versions; - debug!( - "({}) Layout versions changed (max={}, ack={}, min stored={}), adding full sync to syncer todo list", - F::TABLE_NAME, - layout_versions.0, - layout_versions.1, - layout_versions.2 - ); - self.add_full_sync(); - } + self.check_add_full_sync(); }, _ = tokio::time::sleep_until(self.next_full_sync.into()) => { self.add_full_sync(); -- cgit v1.2.3 From 9a491fa1372a23e91c793ee1d2b313607752826a Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sat, 11 Nov 2023 13:10:59 +0100 Subject: layout: fix test --- src/rpc/layout/history.rs | 9 ++- src/rpc/layout/mod.rs | 3 + src/rpc/layout/test.rs | 159 ++++++++++++++++++++++++++++++++++++++++++ src/rpc/layout/version.rs | 172 ++-------------------------------------------- 4 files changed, 174 insertions(+), 169 deletions(-) create mode 100644 src/rpc/layout/test.rs (limited to 'src') diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index cef56647..050f5d0a 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -18,7 +18,7 @@ impl LayoutHistory { }; let mut ret = LayoutHistory { - versions: vec![version].into_boxed_slice().into(), + versions: vec![version], update_trackers: Default::default(), trackers_hash: [0u8; 32].into(), staging: Lww::raw(0, staging), @@ -211,6 +211,11 @@ To know the correct value of the new layout version, invoke `garage layout show` let msg = new_version.calculate_partition_assignment()?; self.versions.push(new_version); + if self.current().check().is_ok() { + while self.versions.first().unwrap().check().is_err() { + self.versions.remove(0); + } + } // Reset the staged layout changes self.staging.update(LayoutStaging { @@ -245,7 +250,7 @@ To know the correct value of the new layout version, invoke `garage layout show` version.check()?; } - // TODO: anythign more ? + // TODO: anything more ? Ok(()) } } diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index cd3764bc..577b32fb 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -3,6 +3,9 @@ mod history; mod schema; mod version; +#[cfg(test)] +mod test; + pub mod manager; // ---- re-exports ---- diff --git a/src/rpc/layout/test.rs b/src/rpc/layout/test.rs new file mode 100644 index 00000000..0ce090d2 --- /dev/null +++ b/src/rpc/layout/test.rs @@ -0,0 +1,159 @@ +use std::cmp::min; +use std::collections::HashMap; + +use garage_util::crdt::Crdt; +use garage_util::error::*; + +use crate::layout::*; + +// This function checks that the partition size S computed is at least better than the +// one given by a very naive algorithm. To do so, we try to run the naive algorithm +// assuming a partion size of S+1. If we succed, it means that the optimal assignment +// was not optimal. The naive algorithm is the following : +// - we compute the max number of partitions associated to every node, capped at the +// partition number. It gives the number of tokens of every node. +// - every zone has a number of tokens equal to the sum of the tokens of its nodes. +// - we cycle over the partitions and associate zone tokens while respecting the +// zone redundancy constraint. +// NOTE: the naive algorithm is not optimal. Counter example: +// take nb_partition = 3 ; replication_factor = 5; redundancy = 4; +// number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) +// With these parameters, the naive algo fails, whereas there is a solution: +// (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) +fn check_against_naive(cl: &LayoutVersion) -> Result { + let over_size = cl.partition_size + 1; + let mut zone_token = HashMap::::new(); + + let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; + + if zones.is_empty() { + return Ok(false); + } + + for z in zones.iter() { + zone_token.insert(z.clone(), 0); + } + for uuid in cl.nongateway_nodes() { + let z = cl.get_node_zone(&uuid)?; + let c = cl.get_node_capacity(&uuid)?; + zone_token.insert( + z.to_string(), + zone_token[z] + min(NB_PARTITIONS, (c / over_size) as usize), + ); + } + + // For every partition, we count the number of zone already associated and + // the name of the last zone associated + + let mut id_zone_token = vec![0; zones.len()]; + for (z, t) in zone_token.iter() { + id_zone_token[zone_to_id[z]] = *t; + } + + let mut nb_token = vec![0; NB_PARTITIONS]; + let mut last_zone = vec![zones.len(); NB_PARTITIONS]; + + let mut curr_zone = 0; + + let redundancy = cl.effective_zone_redundancy(); + + for replic in 0..cl.replication_factor { + for p in 0..NB_PARTITIONS { + while id_zone_token[curr_zone] == 0 + || (last_zone[p] == curr_zone + && redundancy - nb_token[p] <= cl.replication_factor - replic) + { + curr_zone += 1; + if curr_zone >= zones.len() { + return Ok(true); + } + } + id_zone_token[curr_zone] -= 1; + if last_zone[p] != curr_zone { + nb_token[p] += 1; + last_zone[p] = curr_zone; + } + } + } + + return Ok(false); +} + +fn show_msg(msg: &Message) { + for s in msg.iter() { + println!("{}", s); + } +} + +fn update_layout( + cl: &mut LayoutHistory, + node_capacity_vec: &[u64], + node_zone_vec: &[&'static str], + zone_redundancy: usize, +) { + let staging = cl.staging.get_mut(); + + for (i, (capacity, zone)) in node_capacity_vec + .iter() + .zip(node_zone_vec.iter()) + .enumerate() + { + let node_id = [i as u8; 32].into(); + + let update = staging.roles.update_mutator( + node_id, + NodeRoleV(Some(NodeRole { + zone: zone.to_string(), + capacity: Some(*capacity), + tags: (vec![]), + })), + ); + staging.roles.merge(&update); + } + staging.parameters.update(LayoutParameters { + zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), + }); + + cl.update_hashes(); +} + +#[test] +fn test_assignment() { + let mut node_capacity_vec = vec![4000, 1000, 2000]; + let mut node_zone_vec = vec!["A", "B", "C"]; + + let mut cl = LayoutHistory::new(3); + update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3); + let v = cl.current().version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(check_against_naive(cl.current()).unwrap()); + + node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; + node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"]; + update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 2); + let v = cl.current().version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(check_against_naive(cl.current()).unwrap()); + + node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; + update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3); + let v = cl.current().version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(check_against_naive(cl.current()).unwrap()); + + node_capacity_vec = vec![ + 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, + ]; + update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 1); + let v = cl.current().version; + let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(check_against_naive(cl.current()).unwrap()); +} diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index f45a3c35..ffbdf277 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -143,7 +143,7 @@ impl LayoutVersion { } /// Given a node uuids, this function returns the label of its zone - fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { + pub(crate) fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { match self.node_role(uuid) { Some(role) => Ok(&role.zone), _ => Err(Error::Message( @@ -162,7 +162,7 @@ impl LayoutVersion { } /// Returns the effective value of the zone_redundancy parameter - fn effective_zone_redundancy(&self) -> usize { + pub(crate) fn effective_zone_redundancy(&self) -> usize { match self.parameters.zone_redundancy { ZoneRedundancy::AtLeast(v) => v, ZoneRedundancy::Maximum => { @@ -472,7 +472,9 @@ impl LayoutVersion { /// This function generates ids for the zone of the nodes appearing in /// self.node_id_vec. - fn generate_nongateway_zone_ids(&self) -> Result<(Vec, HashMap), Error> { + pub(crate) fn generate_nongateway_zone_ids( + &self, + ) -> Result<(Vec, HashMap), Error> { let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); @@ -838,167 +840,3 @@ impl LayoutVersion { Ok(msg) } } - -// ==================================================================================== - -#[cfg(test)] -mod tests { - use super::{Error, *}; - use std::cmp::min; - - // This function checks that the partition size S computed is at least better than the - // one given by a very naive algorithm. To do so, we try to run the naive algorithm - // assuming a partion size of S+1. If we succed, it means that the optimal assignment - // was not optimal. The naive algorithm is the following : - // - we compute the max number of partitions associated to every node, capped at the - // partition number. It gives the number of tokens of every node. - // - every zone has a number of tokens equal to the sum of the tokens of its nodes. - // - we cycle over the partitions and associate zone tokens while respecting the - // zone redundancy constraint. - // NOTE: the naive algorithm is not optimal. Counter example: - // take nb_partition = 3 ; replication_factor = 5; redundancy = 4; - // number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) - // With these parameters, the naive algo fails, whereas there is a solution: - // (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) - fn check_against_naive(cl: &LayoutVersion) -> Result { - let over_size = cl.partition_size + 1; - let mut zone_token = HashMap::::new(); - - let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; - - if zones.is_empty() { - return Ok(false); - } - - for z in zones.iter() { - zone_token.insert(z.clone(), 0); - } - for uuid in cl.nongateway_nodes() { - let z = cl.get_node_zone(&uuid)?; - let c = cl.get_node_capacity(&uuid)?; - zone_token.insert( - z.clone(), - zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), - ); - } - - // For every partition, we count the number of zone already associated and - // the name of the last zone associated - - let mut id_zone_token = vec![0; zones.len()]; - for (z, t) in zone_token.iter() { - id_zone_token[zone_to_id[z]] = *t; - } - - let mut nb_token = vec![0; NB_PARTITIONS]; - let mut last_zone = vec![zones.len(); NB_PARTITIONS]; - - let mut curr_zone = 0; - - let redundancy = cl.effective_zone_redundancy(); - - for replic in 0..cl.replication_factor { - for p in 0..NB_PARTITIONS { - while id_zone_token[curr_zone] == 0 - || (last_zone[p] == curr_zone - && redundancy - nb_token[p] <= cl.replication_factor - replic) - { - curr_zone += 1; - if curr_zone >= zones.len() { - return Ok(true); - } - } - id_zone_token[curr_zone] -= 1; - if last_zone[p] != curr_zone { - nb_token[p] += 1; - last_zone[p] = curr_zone; - } - } - } - - return Ok(false); - } - - fn show_msg(msg: &Message) { - for s in msg.iter() { - println!("{}", s); - } - } - - fn update_layout( - cl: &mut LayoutVersion, - node_id_vec: &Vec, - node_capacity_vec: &Vec, - node_zone_vec: &Vec, - zone_redundancy: usize, - ) { - for i in 0..node_id_vec.len() { - if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { - cl.node_id_vec.push(x); - } - - let update = cl.staging_roles.update_mutator( - cl.node_id_vec[i], - NodeRoleV(Some(NodeRole { - zone: (node_zone_vec[i].to_string()), - capacity: (Some(node_capacity_vec[i])), - tags: (vec![]), - })), - ); - cl.staging_roles.merge(&update); - } - cl.staging_parameters.update(LayoutParameters { - zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), - }); - cl.staging_hash = cl.calculate_staging_hash(); - } - - #[test] - fn test_assignment() { - let mut node_id_vec = vec![1, 2, 3]; - let mut node_capacity_vec = vec![4000, 1000, 2000]; - let mut node_zone_vec = vec!["A", "B", "C"] - .into_iter() - .map(|x| x.to_string()) - .collect(); - - let mut cl = LayoutVersion::new(3); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; - node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; - node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"] - .into_iter() - .map(|x| x.to_string()) - .collect(); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_capacity_vec = vec![ - 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, - ]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); - let v = cl.version; - let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - } -} -- cgit v1.2.3 From 8e292e06b3fde1d3b5b019a26eabd4f0d9ac22c3 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 12:48:38 +0100 Subject: layout: some refactoring of nongateway nodes --- src/api/k2v/index.rs | 7 +++- src/model/index_counter.rs | 4 +- src/rpc/layout/history.rs | 30 ++++++++------ src/rpc/layout/schema.rs | 17 ++++++++ src/rpc/layout/version.rs | 98 +++++++++++++++++++++++++--------------------- 5 files changed, 95 insertions(+), 61 deletions(-) (limited to 'src') diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs index 3c2f51a9..c189232a 100644 --- a/src/api/k2v/index.rs +++ b/src/api/k2v/index.rs @@ -25,8 +25,11 @@ pub async fn handle_read_index( ) -> Result, Error> { let reverse = reverse.unwrap_or(false); - // TODO: not only current - let node_id_vec = garage.system.cluster_layout().current().node_ids().to_vec(); + let node_id_vec = garage + .system + .cluster_layout() + .all_nongateway_nodes() + .into_owned(); let (partition_keys, more, next_start) = read_range( &garage.k2v.counter_table.table, diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs index 9637cc4c..2d968733 100644 --- a/src/model/index_counter.rs +++ b/src/model/index_counter.rs @@ -84,8 +84,8 @@ impl Entry for CounterEntry { impl CounterEntry { pub fn filtered_values(&self, layout: &LayoutHistory) -> HashMap { - let nodes = &layout.current().node_id_vec[..]; - self.filtered_values_with_nodes(nodes) + let nodes = layout.all_nongateway_nodes(); + self.filtered_values_with_nodes(&nodes) } pub fn filtered_values_with_nodes(&self, nodes: &[Uuid]) -> HashMap { diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 050f5d0a..877ad3a7 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::collections::HashSet; use garage_util::crdt::{Crdt, Lww, LwwMap}; @@ -59,13 +60,19 @@ impl LayoutHistory { (self.current().version, self.all_ack(), self.min_stored()) } - pub fn all_nongateway_nodes(&self) -> HashSet { + pub fn all_nongateway_nodes(&self) -> Cow<'_, [Uuid]> { // TODO: cache this - self.versions - .iter() - .map(|x| x.nongateway_nodes()) - .flatten() - .collect::>() + if self.versions.len() == 1 { + self.versions[0].nongateway_nodes().into() + } else { + let set = self + .versions + .iter() + .map(|x| x.nongateway_nodes()) + .flatten() + .collect::>(); + set.into_iter().copied().collect::>().into() + } } pub fn read_nodes_of(&self, position: &Hash) -> Vec { @@ -202,14 +209,11 @@ To know the correct value of the new layout version, invoke `garage layout show` } // Compute new version and add it to history - let mut new_version = self.current().clone(); - new_version.version += 1; - - new_version.roles.merge(&self.staging.get().roles); - new_version.roles.retain(|(_, _, v)| v.0.is_some()); - new_version.parameters = *self.staging.get().parameters.get(); + let (new_version, msg) = self + .current() + .clone() + .calculate_next_version(&self.staging.get())?; - let msg = new_version.calculate_partition_assignment()?; self.versions.push(new_version); if self.current().check().is_ok() { while self.versions.first().unwrap().check().is_err() { diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 89f5c361..db298ee6 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -212,6 +212,8 @@ mod v010 { /// see comment in v08::ClusterLayout pub node_id_vec: Vec, + /// number of non-gateway nodes, which are the first ids in node_id_vec + pub nongateway_node_count: usize, /// see comment in v08::ClusterLayout #[serde(with = "serde_bytes")] pub ring_assignment_data: Vec, @@ -265,6 +267,18 @@ mod v010 { type Previous = v09::ClusterLayout; fn migrate(previous: Self::Previous) -> Self { + let nongateway_node_count = previous + .node_id_vec + .iter() + .enumerate() + .filter(|(_, uuid)| { + let role = previous.roles.get(uuid); + matches!(role, Some(NodeRoleV(Some(role))) if role.capacity.is_some()) + }) + .map(|(i, _)| i) + .max() + .unwrap_or(0); + let version = LayoutVersion { version: previous.version, replication_factor: previous.replication_factor, @@ -272,11 +286,14 @@ mod v010 { parameters: previous.parameters, roles: previous.roles, node_id_vec: previous.node_id_vec, + nongateway_node_count, ring_assignment_data: previous.ring_assignment_data, }; let update_tracker = UpdateTracker( version .nongateway_nodes() + .iter() + .copied() .map(|x| (x, version.version)) .collect::>(), ); diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index ffbdf277..a7f387b6 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -5,7 +5,7 @@ use std::convert::TryInto; use bytesize::ByteSize; use itertools::Itertools; -use garage_util::crdt::LwwMap; +use garage_util::crdt::{Crdt, LwwMap}; use garage_util::data::*; use garage_util::error::*; @@ -30,6 +30,7 @@ impl LayoutVersion { partition_size: 0, roles: LwwMap::new(), node_id_vec: Vec::new(), + nongateway_node_count: 0, ring_assignment_data: Vec::new(), parameters, } @@ -43,6 +44,11 @@ impl LayoutVersion { &self.node_id_vec[..] } + /// Returns the uuids of the non_gateway nodes in self.node_id_vec. + pub fn nongateway_nodes(&self) -> &[Uuid] { + &self.node_id_vec[..self.nongateway_node_count] + } + pub fn num_nodes(&self) -> usize { self.node_id_vec.len() } @@ -56,18 +62,14 @@ impl LayoutVersion { } /// Given a node uuids, this function returns its capacity or fails if it does not have any - pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { + pub fn get_node_capacity(&self, uuid: &Uuid) -> Option { match self.node_role(uuid) { Some(NodeRole { capacity: Some(cap), zone: _, tags: _, - }) => Ok(*cap), - _ => Err(Error::Message( - "The Uuid does not correspond to a node present in the \ - cluster or this node does not have a positive capacity." - .into(), - )), + }) => Some(*cap), + _ => None, } } @@ -131,17 +133,6 @@ impl LayoutVersion { // ===================== internal information extractors ====================== - /// Returns the uuids of the non_gateway nodes in self.node_id_vec. - pub(crate) fn nongateway_nodes(&self) -> impl Iterator + '_ { - self.node_id_vec - .iter() - .copied() - .filter(move |uuid| match self.node_role(uuid) { - Some(role) if role.capacity.is_some() => true, - _ => false, - }) - } - /// Given a node uuids, this function returns the label of its zone pub(crate) fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { match self.node_role(uuid) { @@ -152,11 +143,16 @@ impl LayoutVersion { } } + fn expect_get_node_capacity(&self, uuid: &Uuid) -> u64 { + self.get_node_capacity(&uuid) + .expect("non-gateway node with zero capacity") + } + /// Returns the sum of capacities of non gateway nodes in the cluster fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; for uuid in self.nongateway_nodes() { - total_capacity += self.get_node_capacity(&uuid)?; + total_capacity += self.expect_get_node_capacity(&uuid); } Ok(total_capacity) } @@ -257,7 +253,7 @@ impl LayoutVersion { if *usage > 0 { let uuid = self.node_id_vec[n]; let partusage = usage * self.partition_size; - let nodecap = self.get_node_capacity(&uuid).unwrap(); + let nodecap = self.expect_get_node_capacity(&uuid); if partusage > nodecap { return Err(format!( "node usage ({}) is bigger than node capacity ({})", @@ -288,6 +284,21 @@ impl LayoutVersion { // ================== updates to layout, internals =================== + pub(crate) fn calculate_next_version( + mut self, + staging: &LayoutStaging, + ) -> Result<(Self, Message), Error> { + self.version += 1; + + self.roles.merge(&staging.roles); + self.roles.retain(|(_, _, v)| v.0.is_some()); + self.parameters = *staging.parameters.get(); + + let msg = self.calculate_partition_assignment()?; + + Ok((self, msg)) + } + /// This function calculates a new partition-to-node assignment. /// The computed assignment respects the node replication factor /// and the zone redundancy parameter It maximizes the capacity of a @@ -297,7 +308,7 @@ impl LayoutVersion { /// data to be moved. /// Staged role changes must be merged with nodes roles before calling this function, /// hence it must only be called from apply_staged_changes() and hence is not public. - pub(crate) fn calculate_partition_assignment(&mut self) -> Result { + fn calculate_partition_assignment(&mut self) -> Result { // We update the node ids, since the node role list might have changed with the // changes in the layout. We retrieve the old_assignment reframed with new ids let old_assignment_opt = self.update_node_id_vec()?; @@ -317,12 +328,12 @@ impl LayoutVersion { // to use them as indices in the flow graphs. let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; - let nb_nongateway_nodes = self.nongateway_nodes().count(); - if nb_nongateway_nodes < self.replication_factor { + if self.nongateway_nodes().len() < self.replication_factor { return Err(Error::Message(format!( "The number of nodes with positive \ capacity ({}) is smaller than the replication factor ({}).", - nb_nongateway_nodes, self.replication_factor + self.nongateway_nodes().len(), + self.replication_factor ))); } if id_to_zone.len() < zone_redundancy { @@ -420,12 +431,14 @@ impl LayoutVersion { .map(|(k, _, _)| *k) .collect(); - let mut new_node_id_vec = Vec::::new(); - new_node_id_vec.extend(new_non_gateway_nodes); - new_node_id_vec.extend(new_gateway_nodes); + let old_node_id_vec = std::mem::take(&mut self.node_id_vec); + + self.nongateway_node_count = new_non_gateway_nodes.len(); + self.node_id_vec.clear(); + self.node_id_vec.extend(new_non_gateway_nodes); + self.node_id_vec.extend(new_gateway_nodes); - let old_node_id_vec = self.node_id_vec.clone(); - self.node_id_vec = new_node_id_vec.clone(); + let new_node_id_vec = &self.node_id_vec; // (2) We retrieve the old association // We rewrite the old association with the new indices. We only consider partition @@ -464,7 +477,7 @@ impl LayoutVersion { } } - // We write the ring + // We clear the ring assignemnt data self.ring_assignment_data = Vec::::new(); Ok(Some(old_assignment)) @@ -478,8 +491,7 @@ impl LayoutVersion { let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); - let nongateway_nodes = self.nongateway_nodes().collect::>(); - for uuid in nongateway_nodes.iter() { + for uuid in self.nongateway_nodes().iter() { let r = self.node_role(uuid).unwrap(); if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() { zone_to_id.insert(r.zone.clone(), id_to_zone.len()); @@ -556,10 +568,8 @@ impl LayoutVersion { exclude_assoc: &HashSet<(usize, usize)>, zone_redundancy: usize, ) -> Result, Error> { - let vertices = LayoutVersion::generate_graph_vertices( - zone_to_id.len(), - self.nongateway_nodes().count(), - ); + let vertices = + LayoutVersion::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); let mut g = Graph::::new(&vertices); let nb_zones = zone_to_id.len(); for p in 0..NB_PARTITIONS { @@ -578,8 +588,8 @@ impl LayoutVersion { )?; } } - for n in 0..self.nongateway_nodes().count() { - let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; + for n in 0..self.nongateway_nodes().len() { + let node_capacity = self.expect_get_node_capacity(&self.node_id_vec[n]); let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; for p in 0..NB_PARTITIONS { @@ -602,7 +612,7 @@ impl LayoutVersion { // previous assignment let mut exclude_edge = HashSet::<(usize, usize)>::new(); if let Some(prev_assign) = prev_assign_opt { - let nb_nodes = self.nongateway_nodes().count(); + let nb_nodes = self.nongateway_nodes().len(); for (p, prev_assign_p) in prev_assign.iter().enumerate() { for n in 0..nb_nodes { exclude_edge.insert((p, n)); @@ -654,7 +664,7 @@ impl LayoutVersion { // We compute the maximal length of a simple path in gflow. It is used in the // Bellman-Ford algorithm in optimize_flow_with_cost to set the number // of iterations. - let nb_nodes = self.nongateway_nodes().count(); + let nb_nodes = self.nongateway_nodes().len(); let path_length = 4 * nb_nodes; gflow.optimize_flow_with_cost(&cost, path_length)?; @@ -732,7 +742,7 @@ impl LayoutVersion { } // We define and fill in the following tables - let storing_nodes = self.nongateway_nodes().collect::>(); + let storing_nodes = self.nongateway_nodes(); let mut new_partitions = vec![0; storing_nodes.len()]; let mut stored_partitions = vec![0; storing_nodes.len()]; @@ -804,13 +814,13 @@ impl LayoutVersion { let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; let mut total_cap_z = 0; for n in nodes_of_z.iter() { - total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; + total_cap_z += self.expect_get_node_capacity(&self.node_id_vec[*n]); } let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); for n in nodes_of_z.iter() { let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; - let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; + let total_cap_n = self.expect_get_node_capacity(&self.node_id_vec[*n]); let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or(""))?.tags_string(); table.push(format!( " {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)", -- cgit v1.2.3 From 1aab1f4e688ebc3f3adcb41c817c16c688a3291c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 13:06:16 +0100 Subject: layout: refactoring of all_nodes --- src/garage/admin/mod.rs | 17 +++++++++-------- src/garage/cli/layout.rs | 6 +++--- src/model/helper/bucket.rs | 8 ++++++-- src/rpc/layout/history.rs | 15 +++++++++++++++ src/rpc/layout/version.rs | 17 +++++++---------- src/rpc/system.rs | 2 +- src/table/replication/fullcopy.rs | 6 +++--- 7 files changed, 44 insertions(+), 27 deletions(-) (limited to 'src') diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index e3ba6d35..77918a0f 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -126,8 +126,8 @@ impl AdminRpcHandler { opt_to_send.all_nodes = false; let mut failures = vec![]; - let layout = self.garage.system.cluster_layout().clone(); - for node in layout.current().node_ids().iter() { + let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec(); + for node in all_nodes.iter() { let node = (*node).into(); let resp = self .endpoint @@ -163,9 +163,9 @@ impl AdminRpcHandler { async fn handle_stats(&self, opt: StatsOpt) -> Result { if opt.all_nodes { let mut ret = String::new(); - let layout = self.garage.system.cluster_layout().clone(); + let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec(); - for node in layout.current().node_ids().iter() { + for node in all_nodes.iter() { let mut opt = opt.clone(); opt.all_nodes = false; opt.skip_global = true; @@ -275,6 +275,7 @@ impl AdminRpcHandler { let mut ret = String::new(); // Gather storage node and free space statistics + // TODO: not only layout.current() ??? let layout = &self.garage.system.cluster_layout(); let mut node_partition_count = HashMap::::new(); for short_id in layout.current().ring_assignment_data.iter() { @@ -440,8 +441,8 @@ impl AdminRpcHandler { ) -> Result { if all_nodes { let mut ret = vec![]; - let layout = self.garage.system.cluster_layout().clone(); - for node in layout.current().node_ids().iter() { + let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec(); + for node in all_nodes.iter() { let node = (*node).into(); match self .endpoint @@ -488,8 +489,8 @@ impl AdminRpcHandler { ) -> Result { if all_nodes { let mut ret = vec![]; - let layout = self.garage.system.cluster_layout().clone(); - for node in layout.current().node_ids().iter() { + let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec(); + for node in all_nodes.iter() { let node = (*node).into(); match self .endpoint diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 15727448..0f01a37a 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -49,6 +49,7 @@ pub async fn cmd_assign_role( }; let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + let all_nodes = layout.all_nodes().into_owned(); let added_nodes = args .node_ids @@ -58,7 +59,7 @@ pub async fn cmd_assign_role( status .iter() .map(|adv| adv.id) - .chain(layout.current().node_ids().iter().cloned()), + .chain(all_nodes.iter().cloned()), node_id, ) }) @@ -68,8 +69,7 @@ pub async fn cmd_assign_role( roles.merge(&layout.staging.get().roles); for replaced in args.replace.iter() { - let replaced_node = - find_matching_node(layout.current().node_ids().iter().cloned(), replaced)?; + let replaced_node = find_matching_node(all_nodes.iter().cloned(), replaced)?; match roles.get(&replaced_node) { Some(NodeRoleV(Some(_))) => { layout diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index 2a9c0fb1..2cb53424 100644 --- a/src/model/helper/bucket.rs +++ b/src/model/helper/bucket.rs @@ -450,8 +450,12 @@ impl<'a> BucketHelper<'a> { #[cfg(feature = "k2v")] { - // TODO: not only current - let node_id_vec = self.0.system.cluster_layout().current().node_ids().to_vec(); + let node_id_vec = self + .0 + .system + .cluster_layout() + .all_nongateway_nodes() + .into_owned(); let k2vindexes = self .0 .k2v diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 877ad3a7..69348873 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -60,6 +60,21 @@ impl LayoutHistory { (self.current().version, self.all_ack(), self.min_stored()) } + pub fn all_nodes(&self) -> Cow<'_, [Uuid]> { + // TODO: cache this + if self.versions.len() == 1 { + self.versions[0].all_nodes().into() + } else { + let set = self + .versions + .iter() + .map(|x| x.all_nodes()) + .flatten() + .collect::>(); + set.into_iter().copied().collect::>().into() + } + } + pub fn all_nongateway_nodes(&self) -> Cow<'_, [Uuid]> { // TODO: cache this if self.versions.len() == 1 { diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index a7f387b6..2cbdcee2 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -38,22 +38,19 @@ impl LayoutVersion { // ===================== accessors ====================== - /// Returns a list of IDs of nodes that currently have - /// a role in the cluster - pub fn node_ids(&self) -> &[Uuid] { + /// Returns a list of IDs of nodes that have a role in this + /// version of the cluster layout, including gateway nodes + pub fn all_nodes(&self) -> &[Uuid] { &self.node_id_vec[..] } - /// Returns the uuids of the non_gateway nodes in self.node_id_vec. + /// Returns a list of IDs of nodes that have a storage capacity + /// assigned in this version of the cluster layout pub fn nongateway_nodes(&self) -> &[Uuid] { &self.node_id_vec[..self.nongateway_node_count] } - pub fn num_nodes(&self) -> usize { - self.node_id_vec.len() - } - - /// Returns the role of a node in the layout + /// Returns the role of a node in the layout, if it has one pub fn node_role(&self, node: &Uuid) -> Option<&NodeRole> { match self.roles.get(node) { Some(NodeRoleV(Some(v))) => Some(v), @@ -61,7 +58,7 @@ impl LayoutVersion { } } - /// Given a node uuids, this function returns its capacity or fails if it does not have any + /// Returns the capacity of a node in the layout, if it has one pub fn get_node_capacity(&self, uuid: &Uuid) -> Option { match self.node_role(uuid) { Some(NodeRole { diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 3418600b..ab3c96b8 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -609,7 +609,7 @@ impl System { while !*stop_signal.borrow() { let not_configured = self.cluster_layout().check().is_err(); let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor; - let expected_n_nodes = self.cluster_layout().current().num_nodes(); + let expected_n_nodes = self.cluster_layout().all_nodes().len(); let bad_peers = self .fullmesh .get_peer_list() diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index 5653a229..beaacc2b 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -35,10 +35,10 @@ impl TableReplication for TableFullReplication { } fn write_nodes(&self, _hash: &Hash) -> Vec { - self.system.cluster_layout().current().node_ids().to_vec() + self.system.cluster_layout().current().all_nodes().to_vec() } fn write_quorum(&self) -> usize { - let nmembers = self.system.cluster_layout().current().node_ids().len(); + let nmembers = self.system.cluster_layout().current().all_nodes().len(); if nmembers > self.max_faults { nmembers - self.max_faults } else { @@ -62,7 +62,7 @@ impl TableReplication for TableFullReplication { partition: 0u16, first_hash: [0u8; 32].into(), last_hash: [0xff; 32].into(), - storage_nodes: Vec::from_iter(layout.current().node_ids().to_vec()), + storage_nodes: Vec::from_iter(layout.current().all_nodes().to_vec()), }], } } -- cgit v1.2.3 From 83a11374ca45831a6f54928dfe726fac65493b00 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 13:29:26 +0100 Subject: layout: fixes in schema --- src/rpc/layout/schema.rs | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index db298ee6..79440a47 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -193,7 +193,25 @@ mod v010 { use std::collections::BTreeMap; pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; - /// The layout of the cluster, i.e. the list of roles + /// The history of cluster layouts, with trackers to keep a record + /// of which nodes are up-to-date to current cluster data + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] + pub struct LayoutHistory { + /// The versions currently in use in the cluster + pub versions: Vec, + + /// Update trackers + pub update_trackers: UpdateTrackers, + /// Hash of the update trackers + pub trackers_hash: Hash, + + /// Staged changes for the next version + pub staging: Lww, + /// Hash of the serialized staging_parameters + staging_roles + pub staging_hash: Hash, + } + + /// A version of the layout of the cluster, i.e. the list of roles /// which are assigned to each cluster node #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] pub struct LayoutVersion { @@ -228,23 +246,6 @@ mod v010 { pub roles: LwwMap, } - /// The history of cluster layouts - #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] - pub struct LayoutHistory { - /// The versions currently in use in the cluster - pub versions: Vec, - - /// Update trackers - pub update_trackers: UpdateTrackers, - /// Hash of the update trackers - pub trackers_hash: Hash, - - /// Staged changes for the next version - pub staging: Lww, - /// Hash of the serialized staging_parameters + staging_roles - pub staging_hash: Hash, - } - /// The tracker of acknowlegments and data syncs around the cluster #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] pub struct UpdateTrackers { @@ -275,7 +276,7 @@ mod v010 { let role = previous.roles.get(uuid); matches!(role, Some(NodeRoleV(Some(role))) if role.capacity.is_some()) }) - .map(|(i, _)| i) + .map(|(i, _)| i + 1) .max() .unwrap_or(0); @@ -312,8 +313,7 @@ mod v010 { staging: Lww::raw(previous.version, staging), staging_hash: [0u8; 32].into(), }; - ret.staging_hash = ret.calculate_staging_hash(); - ret.trackers_hash = ret.calculate_trackers_hash(); + ret.update_hashes(); ret } } -- cgit v1.2.3 From 866196750fca74c1911ade2a90611f3663e60046 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 13:36:58 +0100 Subject: system: add todo wrt new layout --- src/rpc/system.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index ab3c96b8..86c02e86 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -417,6 +417,9 @@ impl System { } pub fn health(&self) -> ClusterHealth { + // TODO: adapt this function to take into account layout history + // when estimating cluster health, and not just use current layout + let quorum = self.replication_mode.write_quorum(); let replication_factor = self.replication_factor; @@ -429,7 +432,6 @@ impl System { let layout = self.cluster_layout(); // acquires a rwlock - // TODO: not only layout.current() let storage_nodes = layout .current() .roles -- cgit v1.2.3 From 3b361d2959e3d577bdae6f8a5ccb0c9d5526b7ea Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 14:28:16 +0100 Subject: layout: prepare for write sets --- src/block/manager.rs | 3 ++- src/block/resync.rs | 2 +- src/model/k2v/rpc.rs | 10 ++++++---- src/rpc/layout/history.rs | 19 ++++++++++++++++--- src/rpc/layout/version.rs | 21 ++++++++++----------- src/rpc/system.rs | 3 +-- src/table/data.rs | 3 ++- src/table/gc.rs | 2 +- src/table/replication/fullcopy.rs | 9 +++++++-- src/table/replication/parameters.rs | 8 +++++--- src/table/replication/sharded.rs | 24 ++++++++---------------- src/table/sync.rs | 2 +- src/table/table.rs | 6 ++++-- 13 files changed, 64 insertions(+), 48 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index 72b4ea66..2bb9c23d 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -354,7 +354,8 @@ impl BlockManager { /// Send block to nodes that should have it pub async fn rpc_put_block(&self, hash: Hash, data: Bytes) -> Result<(), Error> { - let who = self.replication.write_nodes(&hash); + // TODO: use quorums among latest write set + let who = self.replication.storage_nodes(&hash); let (header, bytes) = DataBlock::from_buffer(data, self.compression_level) .await diff --git a/src/block/resync.rs b/src/block/resync.rs index fedcd6f5..122d0142 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -377,7 +377,7 @@ impl BlockResyncManager { info!("Resync block {:?}: offloading and deleting", hash); let existing_path = existing_path.unwrap(); - let mut who = manager.replication.write_nodes(hash); + let mut who = manager.replication.storage_nodes(hash); if who.len() < manager.replication.write_quorum() { return Err(Error::Message("Not trying to offload block because we don't have a quorum of nodes to write to".to_string())); } diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs index 2f548ad7..aa3323d5 100644 --- a/src/model/k2v/rpc.rs +++ b/src/model/k2v/rpc.rs @@ -127,7 +127,7 @@ impl K2VRpcHandler { .item_table .data .replication - .write_nodes(&partition.hash()); + .storage_nodes(&partition.hash()); who.sort(); self.system @@ -168,7 +168,7 @@ impl K2VRpcHandler { .item_table .data .replication - .write_nodes(&partition.hash()); + .storage_nodes(&partition.hash()); who.sort(); call_list.entry(who).or_default().push(InsertedItem { @@ -223,11 +223,12 @@ impl K2VRpcHandler { }, sort_key, }; + // TODO figure this out with write sets, does it still work???? let nodes = self .item_table .data .replication - .write_nodes(&poll_key.partition.hash()); + .read_nodes(&poll_key.partition.hash()); let rpc = self.system.rpc_helper().try_call_many( &self.endpoint, @@ -284,11 +285,12 @@ impl K2VRpcHandler { seen.restrict(&range); // Prepare PollRange RPC to send to the storage nodes responsible for the parititon + // TODO figure this out with write sets, does it still work???? let nodes = self .item_table .data .replication - .write_nodes(&range.partition.hash()); + .read_nodes(&range.partition.hash()); let quorum = self.item_table.data.replication.read_quorum(); let msg = K2VRpc::PollRange { range, diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 69348873..dce492c9 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -98,13 +98,26 @@ impl LayoutHistory { .find(|x| x.version == sync_min) .or(self.versions.last()) .unwrap(); - version.nodes_of(position, version.replication_factor) + version + .nodes_of(position, version.replication_factor) + .collect() } - pub fn write_sets_of<'a>(&'a self, position: &'a Hash) -> impl Iterator> + 'a { + pub fn write_sets_of(&self, position: &Hash) -> Vec> { self.versions .iter() - .map(move |x| x.nodes_of(position, x.replication_factor)) + .map(|x| x.nodes_of(position, x.replication_factor).collect()) + .collect() + } + + pub fn storage_nodes_of(&self, position: &Hash) -> Vec { + let mut ret = vec![]; + for version in self.versions.iter() { + ret.extend(version.nodes_of(position, version.replication_factor)); + } + ret.sort(); + ret.dedup(); + ret } // ------------------ update tracking --------------- diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 2cbdcee2..912ee538 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -107,25 +107,24 @@ impl LayoutVersion { } /// Return the n servers in which data for this hash should be replicated - pub fn nodes_of(&self, position: &Hash, n: usize) -> Vec { + pub fn nodes_of(&self, position: &Hash, n: usize) -> impl Iterator + '_ { assert_eq!(n, self.replication_factor); let data = &self.ring_assignment_data; - if data.len() != self.replication_factor * (1 << PARTITION_BITS) { + let partition_nodes = if data.len() == self.replication_factor * (1 << PARTITION_BITS) { + let partition_idx = self.partition_of(position) as usize; + let partition_start = partition_idx * self.replication_factor; + let partition_end = (partition_idx + 1) * self.replication_factor; + &data[partition_start..partition_end] + } else { warn!("Ring not yet ready, read/writes will be lost!"); - return vec![]; - } - - let partition_idx = self.partition_of(position) as usize; - let partition_start = partition_idx * self.replication_factor; - let partition_end = (partition_idx + 1) * self.replication_factor; - let partition_nodes = &data[partition_start..partition_end]; + &[] + }; partition_nodes .iter() - .map(|i| self.node_id_vec[*i as usize]) - .collect::>() + .map(move |i| self.node_id_vec[*i as usize]) } // ===================== internal information extractors ====================== diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 86c02e86..31d78bf6 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -449,8 +449,7 @@ impl System { .iter() .map(|(_, h)| { let pn = layout.current().nodes_of(h, replication_factor); - pn.iter() - .filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) + pn.filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) .count() }) .collect::>(); diff --git a/src/table/data.rs b/src/table/data.rs index bbfdf58b..7f6b7847 100644 --- a/src/table/data.rs +++ b/src/table/data.rs @@ -254,7 +254,8 @@ impl TableData { // of the GC algorithm, as in all cases GC is suspended if // any node of the partition is unavailable. let pk_hash = Hash::try_from(&tree_key[..32]).unwrap(); - let nodes = self.replication.write_nodes(&pk_hash); + // TODO: this probably breaks when the layout changes + let nodes = self.replication.storage_nodes(&pk_hash); if nodes.first() == Some(&self.system.id) { GcTodoEntry::new(tree_key, new_bytes_hash).save(&self.gc_todo)?; } diff --git a/src/table/gc.rs b/src/table/gc.rs index 2135a358..002cfbf4 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -152,7 +152,7 @@ impl TableGc { let mut partitions = HashMap::new(); for entry in entries { let pkh = Hash::try_from(&entry.key[..32]).unwrap(); - let mut nodes = self.data.replication.write_nodes(&pkh); + let mut nodes = self.data.replication.storage_nodes(&pkh); nodes.retain(|x| *x != self.system.id); nodes.sort(); diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index beaacc2b..cb5471af 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -27,6 +27,11 @@ pub struct TableFullReplication { } impl TableReplication for TableFullReplication { + fn storage_nodes(&self, _hash: &Hash) -> Vec { + let layout = self.system.cluster_layout(); + layout.current().all_nodes().to_vec() + } + fn read_nodes(&self, _hash: &Hash) -> Vec { vec![self.system.id] } @@ -34,8 +39,8 @@ impl TableReplication for TableFullReplication { 1 } - fn write_nodes(&self, _hash: &Hash) -> Vec { - self.system.cluster_layout().current().all_nodes().to_vec() + fn write_sets(&self, hash: &Hash) -> Vec> { + vec![self.storage_nodes(hash)] } fn write_quorum(&self) -> usize { let nmembers = self.system.cluster_layout().current().all_nodes().len(); diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index 2a7d3585..2f842409 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -6,21 +6,23 @@ pub trait TableReplication: Send + Sync + 'static { // See examples in table_sharded.rs and table_fullcopy.rs // To understand various replication methods + /// The entire list of all nodes that store a partition + fn storage_nodes(&self, hash: &Hash) -> Vec; + /// Which nodes to send read requests to fn read_nodes(&self, hash: &Hash) -> Vec; /// Responses needed to consider a read succesfull fn read_quorum(&self) -> usize; /// Which nodes to send writes to - fn write_nodes(&self, hash: &Hash) -> Vec; - /// Responses needed to consider a write succesfull + fn write_sets(&self, hash: &Hash) -> Vec>; + /// Responses needed to consider a write succesfull in each set fn write_quorum(&self) -> usize; fn max_write_errors(&self) -> usize; // Accessing partitions, for Merkle tree & sync /// Get partition for data with given hash fn partition_of(&self, hash: &Hash) -> Partition; - /// List of partitions and nodes to sync with in current layout fn sync_partitions(&self) -> SyncPartitions; } diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index f02b1d66..1320a189 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -25,21 +25,19 @@ pub struct TableShardedReplication { } impl TableReplication for TableShardedReplication { + fn storage_nodes(&self, hash: &Hash) -> Vec { + self.system.cluster_layout().storage_nodes_of(hash) + } + fn read_nodes(&self, hash: &Hash) -> Vec { - self.system - .cluster_layout() - .current() - .nodes_of(hash, self.replication_factor) + self.system.cluster_layout().read_nodes_of(hash) } fn read_quorum(&self) -> usize { self.read_quorum } - fn write_nodes(&self, hash: &Hash) -> Vec { - self.system - .cluster_layout() - .current() - .nodes_of(hash, self.replication_factor) + fn write_sets(&self, hash: &Hash) -> Vec> { + self.system.cluster_layout().write_sets_of(hash) } fn write_quorum(&self) -> usize { self.write_quorum @@ -60,13 +58,7 @@ impl TableReplication for TableShardedReplication { .current() .partitions() .map(|(partition, first_hash)| { - let mut storage_nodes = layout - .write_sets_of(&first_hash) - .map(|x| x.into_iter()) - .flatten() - .collect::>(); - storage_nodes.sort(); - storage_nodes.dedup(); + let storage_nodes = layout.storage_nodes_of(&first_hash); SyncPartition { partition, first_hash, diff --git a/src/table/sync.rs b/src/table/sync.rs index 8c21db8b..b67cdd79 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -176,7 +176,7 @@ impl TableSyncer { let nodes = self .data .replication - .write_nodes(begin) + .storage_nodes(begin) .into_iter() .collect::>(); if nodes.contains(&self.system.id) { diff --git a/src/table/table.rs b/src/table/table.rs index 997fd7dc..bf08d5a0 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -119,7 +119,8 @@ impl Table { async fn insert_internal(&self, e: &F::E) -> Result<(), Error> { let hash = e.partition_key().hash(); - let who = self.data.replication.write_nodes(&hash); + // TODO: use write sets + let who = self.data.replication.storage_nodes(&hash); let e_enc = Arc::new(ByteBuf::from(e.encode()?)); let rpc = TableRpc::::Update(vec![e_enc]); @@ -171,7 +172,8 @@ impl Table { for entry in entries.into_iter() { let entry = entry.borrow(); let hash = entry.partition_key().hash(); - let who = self.data.replication.write_nodes(&hash); + // TODO: use write sets + let who = self.data.replication.storage_nodes(&hash); let e_enc = Arc::new(ByteBuf::from(entry.encode()?)); for node in who { call_list.entry(node).or_default().push(e_enc.clone()); -- cgit v1.2.3 From 90e1619b1e9f5d81e59da371f04717f0c4fe5afc Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 15:40:46 +0100 Subject: table: take into account multiple write sets in inserts --- src/block/manager.rs | 7 +- src/block/resync.rs | 2 +- src/model/k2v/rpc.rs | 15 ++- src/rpc/rpc_helper.rs | 278 ++++++++++++++++++++++++++++++-------------------- src/table/gc.rs | 4 +- src/table/table.rs | 17 ++- 6 files changed, 189 insertions(+), 134 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index 2bb9c23d..0ca8bc31 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -354,8 +354,7 @@ impl BlockManager { /// Send block to nodes that should have it pub async fn rpc_put_block(&self, hash: Hash, data: Bytes) -> Result<(), Error> { - // TODO: use quorums among latest write set - let who = self.replication.storage_nodes(&hash); + let who = self.replication.write_sets(&hash); let (header, bytes) = DataBlock::from_buffer(data, self.compression_level) .await @@ -365,9 +364,9 @@ impl BlockManager { self.system .rpc_helper() - .try_call_many( + .try_write_many_sets( &self.endpoint, - &who[..], + &who, put_block_rpc, RequestStrategy::with_priority(PRIO_NORMAL | PRIO_SECONDARY) .with_quorum(self.replication.write_quorum()), diff --git a/src/block/resync.rs b/src/block/resync.rs index 122d0142..15f210e4 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -434,7 +434,7 @@ impl BlockResyncManager { .rpc_helper() .try_call_many( &manager.endpoint, - &need_nodes[..], + &need_nodes, put_block_message, RequestStrategy::with_priority(PRIO_BACKGROUND) .with_quorum(need_nodes.len()), diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs index aa3323d5..863a068a 100644 --- a/src/model/k2v/rpc.rs +++ b/src/model/k2v/rpc.rs @@ -134,16 +134,14 @@ impl K2VRpcHandler { .rpc_helper() .try_call_many( &self.endpoint, - &who[..], + &who, K2VRpc::InsertItem(InsertedItem { partition, sort_key, causal_context, value, }), - RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(1) - .interrupt_after_quorum(true), + RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(1), ) .await?; @@ -192,9 +190,7 @@ impl K2VRpcHandler { &self.endpoint, &nodes[..], K2VRpc::InsertManyItems(items), - RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(1) - .interrupt_after_quorum(true), + RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(1), ) .await?; Ok::<_, Error>((nodes, resp)) @@ -223,7 +219,7 @@ impl K2VRpcHandler { }, sort_key, }; - // TODO figure this out with write sets, does it still work???? + // TODO figure this out with write sets, is it still appropriate??? let nodes = self .item_table .data @@ -232,7 +228,7 @@ impl K2VRpcHandler { let rpc = self.system.rpc_helper().try_call_many( &self.endpoint, - &nodes[..], + &nodes, K2VRpc::PollItem { key: poll_key, causal_context, @@ -240,6 +236,7 @@ impl K2VRpcHandler { }, RequestStrategy::with_priority(PRIO_NORMAL) .with_quorum(self.item_table.data.replication.read_quorum()) + .send_all_at_once(true) .without_timeout(), ); let timeout_duration = diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index ce291068..12d073b6 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -1,4 +1,5 @@ //! Contain structs related to making RPCs +use std::collections::HashMap; use std::sync::{Arc, RwLock}; use std::time::Duration; @@ -35,11 +36,11 @@ const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300); #[derive(Copy, Clone)] pub struct RequestStrategy { /// Min number of response to consider the request successful - pub rs_quorum: Option, - /// Should requests be dropped after enough response are received - pub rs_interrupt_after_quorum: bool, + rs_quorum: Option, + /// Send all requests at once + rs_send_all_at_once: Option, /// Request priority - pub rs_priority: RequestPriority, + rs_priority: RequestPriority, /// Custom timeout for this request rs_timeout: Timeout, } @@ -56,7 +57,7 @@ impl RequestStrategy { pub fn with_priority(prio: RequestPriority) -> Self { RequestStrategy { rs_quorum: None, - rs_interrupt_after_quorum: false, + rs_send_all_at_once: None, rs_priority: prio, rs_timeout: Timeout::Default, } @@ -66,10 +67,9 @@ impl RequestStrategy { self.rs_quorum = Some(quorum); self } - /// Set if requests can be dropped after quorum has been reached - /// In general true for read requests, and false for write - pub fn interrupt_after_quorum(mut self, interrupt: bool) -> Self { - self.rs_interrupt_after_quorum = interrupt; + /// Set quorum to be reached for request + pub fn send_all_at_once(mut self, value: bool) -> Self { + self.rs_send_all_at_once = Some(value); self } /// Deactivate timeout for this request @@ -235,31 +235,19 @@ impl RpcHelper { let quorum = strategy.rs_quorum.unwrap_or(to.len()); let tracer = opentelemetry::global::tracer("garage"); - let span_name = if strategy.rs_interrupt_after_quorum { - format!("RPC {} to {} of {}", endpoint.path(), quorum, to.len()) - } else { - format!( - "RPC {} to {} (quorum {})", - endpoint.path(), - to.len(), - quorum - ) - }; + let span_name = format!("Read RPC {} to {} of {}", endpoint.path(), quorum, to.len()); + let mut span = tracer.start(span_name); span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id))); span.set_attribute(KeyValue::new("to", format!("{:?}", to))); span.set_attribute(KeyValue::new("quorum", quorum as i64)); - span.set_attribute(KeyValue::new( - "interrupt_after_quorum", - strategy.rs_interrupt_after_quorum.to_string(), - )); - self.try_call_many_internal(endpoint, to, msg, strategy, quorum) + self.try_call_many_inner(endpoint, to, msg, strategy, quorum) .with_context(Context::current_with_span(span)) .await } - async fn try_call_many_internal( + async fn try_call_many_inner( &self, endpoint: &Arc>, to: &[Uuid], @@ -273,12 +261,20 @@ impl RpcHelper { H: StreamingEndpointHandler + 'static, S: Send + 'static, { - let msg = msg.into_req().map_err(netapp::error::Error::from)?; + // Once quorum is reached, other requests don't matter. + // What we do here is only send the required number of requests + // to reach a quorum, priorizing nodes with the lowest latency. + // When there are errors, we start new requests to compensate. + + // Reorder requests to priorize closeness / low latency + let request_order = self.request_order(to); + let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false); // Build future for each request // They are not started now: they are added below in a FuturesUnordered // object that will take care of polling them (see below) - let requests = to.iter().cloned().map(|to| { + let msg = msg.into_req().map_err(netapp::error::Error::from)?; + let mut requests = request_order.into_iter().map(|to| { let self2 = self.clone(); let msg = msg.clone(); let endpoint2 = endpoint.clone(); @@ -291,93 +287,40 @@ impl RpcHelper { let mut successes = vec![]; let mut errors = vec![]; - if strategy.rs_interrupt_after_quorum { - // Case 1: once quorum is reached, other requests don't matter. - // What we do here is only send the required number of requests - // to reach a quorum, priorizing nodes with the lowest latency. - // When there are errors, we start new requests to compensate. - - // Reorder requests to priorize closeness / low latency - let request_order = self.request_order(to); - let mut ord_requests = vec![(); request_order.len()] - .into_iter() - .map(|_| None) - .collect::>(); - for (to, fut) in requests { - let i = request_order.iter().position(|x| *x == to).unwrap(); - ord_requests[i] = Some((to, fut)); + // resp_stream will contain all of the requests that are currently in flight. + // (for the moment none, they will be added in the loop below) + let mut resp_stream = FuturesUnordered::new(); + + // Do some requests and collect results + while successes.len() < quorum { + // If the current set of requests that are running is not enough to possibly + // reach quorum, start some new requests. + while send_all_at_once || successes.len() + resp_stream.len() < quorum { + if let Some((req_to, fut)) = requests.next() { + let tracer = opentelemetry::global::tracer("garage"); + let span = tracer.start(format!("RPC to {:?}", req_to)); + resp_stream.push(tokio::spawn( + fut.with_context(Context::current_with_span(span)), + )); + } else { + break; + } } - // Make an iterator to take requests in their sorted order - let mut requests = ord_requests.into_iter().map(Option::unwrap); - - // resp_stream will contain all of the requests that are currently in flight. - // (for the moment none, they will be added in the loop below) - let mut resp_stream = FuturesUnordered::new(); - - // Do some requests and collect results - 'request_loop: while successes.len() < quorum { - // If the current set of requests that are running is not enough to possibly - // reach quorum, start some new requests. - while successes.len() + resp_stream.len() < quorum { - if let Some((req_to, fut)) = requests.next() { - let tracer = opentelemetry::global::tracer("garage"); - let span = tracer.start(format!("RPC to {:?}", req_to)); - resp_stream.push(tokio::spawn( - fut.with_context(Context::current_with_span(span)), - )); - } else { - // If we have no request to add, we know that we won't ever - // reach quorum: bail out now. - break 'request_loop; - } - } - assert!(!resp_stream.is_empty()); // because of loop invariants + if successes.len() + resp_stream.len() < quorum { + // We know we won't ever reach quorum + break; + } - // Wait for one request to terminate - match resp_stream.next().await.unwrap().unwrap() { - Ok(msg) => { - successes.push(msg); - } - Err(e) => { - errors.push(e); - } + // Wait for one request to terminate + match resp_stream.next().await.unwrap().unwrap() { + Ok(msg) => { + successes.push(msg); } - } - } else { - // Case 2: all of the requests need to be sent in all cases, - // and need to terminate. (this is the case for writes that - // must be spread to n nodes) - // Just start all the requests in parallel and return as soon - // as the quorum is reached. - let mut resp_stream = requests - .map(|(_, fut)| fut) - .collect::>(); - - while let Some(resp) = resp_stream.next().await { - match resp { - Ok(msg) => { - successes.push(msg); - if successes.len() >= quorum { - break; - } - } - Err(e) => { - errors.push(e); - } + Err(e) => { + errors.push(e); } } - - if !resp_stream.is_empty() { - // Continue remaining requests in background. - // Note: these requests can get interrupted on process shutdown, - // we must not count on them being executed for certain. - // For all background things that have to happen with certainty, - // they have to be put in a proper queue that is persisted to disk. - tokio::spawn(async move { - resp_stream.collect::>>().await; - }); - } } if successes.len() >= quorum { @@ -432,4 +375,123 @@ impl RpcHelper { .map(|(_, _, _, to)| to) .collect::>() } + + pub async fn try_write_many_sets( + &self, + endpoint: &Arc>, + to_sets: &[Vec], + msg: N, + strategy: RequestStrategy, + ) -> Result, Error> + where + M: Rpc> + 'static, + N: IntoReq, + H: StreamingEndpointHandler + 'static, + S: Send + 'static, + { + let quorum = strategy + .rs_quorum + .expect("internal error: missing quroum in try_write_many_sets"); + + let tracer = opentelemetry::global::tracer("garage"); + let span_name = format!( + "Write RPC {} (quorum {} in {} sets)", + endpoint.path(), + quorum, + to_sets.len() + ); + + let mut span = tracer.start(span_name); + span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id))); + span.set_attribute(KeyValue::new("to", format!("{:?}", to_sets))); + span.set_attribute(KeyValue::new("quorum", quorum as i64)); + + self.try_write_many_sets_inner(endpoint, to_sets, msg, strategy, quorum) + .with_context(Context::current_with_span(span)) + .await + } + + async fn try_write_many_sets_inner( + &self, + endpoint: &Arc>, + to_sets: &[Vec], + msg: N, + strategy: RequestStrategy, + quorum: usize, + ) -> Result, Error> + where + M: Rpc> + 'static, + N: IntoReq, + H: StreamingEndpointHandler + 'static, + S: Send + 'static, + { + let msg = msg.into_req().map_err(netapp::error::Error::from)?; + + let mut peers = HashMap::>::new(); + for (i, set) in to_sets.iter().enumerate() { + for peer in set.iter() { + peers.entry(*peer).or_default().push(i); + } + } + + let requests = peers.iter().map(|(peer, _)| { + let self2 = self.clone(); + let msg = msg.clone(); + let endpoint2 = endpoint.clone(); + let to = *peer; + let tracer = opentelemetry::global::tracer("garage"); + let span = tracer.start(format!("RPC to {:?}", to)); + let fut = async move { (to, self2.call(&endpoint2, to, msg, strategy).await) }; + tokio::spawn(fut.with_context(Context::current_with_span(span))) + }); + let mut resp_stream = requests.collect::>(); + + let mut successes = vec![]; + let mut errors = vec![]; + + let mut set_counters = vec![(0, 0); to_sets.len()]; + + while !resp_stream.is_empty() { + let (node, resp) = resp_stream.next().await.unwrap().unwrap(); + + match resp { + Ok(msg) => { + for set in peers.get(&node).unwrap().iter() { + set_counters[*set].0 += 1; + } + successes.push(msg); + } + Err(e) => { + for set in peers.get(&node).unwrap().iter() { + set_counters[*set].1 += 1; + } + errors.push(e); + } + } + + if set_counters.iter().all(|x| x.0 > quorum) { + // Success + + // Continue all other requets in background + tokio::spawn(async move { + resp_stream.collect::>>().await; + }); + + return Ok(successes); + } + + if set_counters + .iter() + .enumerate() + .any(|(i, x)| x.1 + quorum > to_sets[i].len()) + { + // Too many errors in this set, we know we won't get a quorum + break; + } + } + + // Failure, could not get quorum + let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); + Err(Error::Quorum(quorum, successes.len(), peers.len(), errors)) + } } diff --git a/src/table/gc.rs b/src/table/gc.rs index 002cfbf4..ef788749 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -230,7 +230,7 @@ impl TableGc { .rpc_helper() .try_call_many( &self.endpoint, - &nodes[..], + &nodes, GcRpc::Update(updates), RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()), ) @@ -251,7 +251,7 @@ impl TableGc { .rpc_helper() .try_call_many( &self.endpoint, - &nodes[..], + &nodes, GcRpc::DeleteIfEqualHash(deletes), RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()), ) diff --git a/src/table/table.rs b/src/table/table.rs index bf08d5a0..c2efaeaf 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -119,17 +119,16 @@ impl Table { async fn insert_internal(&self, e: &F::E) -> Result<(), Error> { let hash = e.partition_key().hash(); - // TODO: use write sets - let who = self.data.replication.storage_nodes(&hash); + let who = self.data.replication.write_sets(&hash); let e_enc = Arc::new(ByteBuf::from(e.encode()?)); let rpc = TableRpc::::Update(vec![e_enc]); self.system .rpc_helper() - .try_call_many( + .try_write_many_sets( &self.endpoint, - &who[..], + &who, rpc, RequestStrategy::with_priority(PRIO_NORMAL) .with_quorum(self.data.replication.write_quorum()), @@ -243,11 +242,10 @@ impl Table { .rpc_helper() .try_call_many( &self.endpoint, - &who[..], + &who, rpc, RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(self.data.replication.read_quorum()) - .interrupt_after_quorum(true), + .with_quorum(self.data.replication.read_quorum()), ) .await?; @@ -339,11 +337,10 @@ impl Table { .rpc_helper() .try_call_many( &self.endpoint, - &who[..], + &who, rpc, RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(self.data.replication.read_quorum()) - .interrupt_after_quorum(true), + .with_quorum(self.data.replication.read_quorum()), ) .await?; -- cgit v1.2.3 From 7ef2c231208073db5a0a0a8674e2dd2d2ecb2222 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 15:45:01 +0100 Subject: layout: fix test --- src/rpc/layout/test.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/rpc/layout/test.rs b/src/rpc/layout/test.rs index 0ce090d2..e9639073 100644 --- a/src/rpc/layout/test.rs +++ b/src/rpc/layout/test.rs @@ -35,7 +35,7 @@ fn check_against_naive(cl: &LayoutVersion) -> Result { } for uuid in cl.nongateway_nodes() { let z = cl.get_node_zone(&uuid)?; - let c = cl.get_node_capacity(&uuid)?; + let c = cl.get_node_capacity(&uuid).unwrap(); zone_token.insert( z.to_string(), zone_token[z] + min(NB_PARTITIONS, (c / over_size) as usize), -- cgit v1.2.3 From b3e729f4b8ec3b06593f8d3b161c76b1263d9f13 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 12:15:58 +0100 Subject: layout history merge: rm invalid versions when valid versions are added --- src/rpc/layout/history.rs | 18 ++++++++++++++++++ src/rpc/layout/version.rs | 20 ++++++++++---------- 2 files changed, 28 insertions(+), 10 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index dce492c9..2346b14a 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -211,6 +211,24 @@ impl LayoutHistory { changed = changed || c; } + // If there are invalid versions before valid versions, remove them, + // and increment update trackers + if self.versions.len() > 1 && self.current().check().is_ok() { + while self.versions.first().unwrap().check().is_err() { + self.versions.remove(0); + changed = true; + } + if changed { + let min_v = self.versions.first().unwrap().version; + let nodes = self.all_nongateway_nodes().into_owned(); + for node in nodes { + self.update_trackers.ack_map.set_max(node, min_v); + self.update_trackers.sync_map.set_max(node, min_v); + self.update_trackers.sync_ack_map.set_max(node, min_v); + } + } + } + // Merge staged layout changes if self.staging != other.staging { self.staging.merge(&other.staging); diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 912ee538..947fab56 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -174,6 +174,16 @@ impl LayoutVersion { /// (assignment, roles, parameters, partition size) /// returns true if consistent, false if error pub fn check(&self) -> Result<(), String> { + // Check that the assignment data has the correct length + let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor; + if self.ring_assignment_data.len() != expected_assignment_data_len { + return Err(format!( + "ring_assignment_data has incorrect length {} instead of {}", + self.ring_assignment_data.len(), + expected_assignment_data_len + )); + } + // Check that node_id_vec contains the correct list of nodes let mut expected_nodes = self .roles @@ -189,16 +199,6 @@ impl LayoutVersion { return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes)); } - // Check that the assignment data has the correct length - let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor; - if self.ring_assignment_data.len() != expected_assignment_data_len { - return Err(format!( - "ring_assignment_data has incorrect length {} instead of {}", - self.ring_assignment_data.len(), - expected_assignment_data_len - )); - } - // Check that the assigned nodes are correct identifiers // of nodes that are assigned a role // and that role is not the role of a gateway nodes -- cgit v1.2.3 From 46007bf01dd2e5b489d145ca0a5499ffa7257b96 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 12:56:52 +0100 Subject: integration test: print stdout and stderr on subcommand crash --- src/garage/tests/common/ext/process.rs | 44 +++++++++------------------------- src/garage/tests/common/garage.rs | 2 +- 2 files changed, 12 insertions(+), 34 deletions(-) (limited to 'src') diff --git a/src/garage/tests/common/ext/process.rs b/src/garage/tests/common/ext/process.rs index ba533b6c..8e20bf7c 100644 --- a/src/garage/tests/common/ext/process.rs +++ b/src/garage/tests/common/ext/process.rs @@ -14,42 +14,20 @@ impl CommandExt for process::Command { } fn expect_success_status(&mut self, msg: &str) -> process::ExitStatus { - let status = self.status().expect(msg); - status.expect_success(msg); - status + self.expect_success_output(msg).status } fn expect_success_output(&mut self, msg: &str) -> process::Output { let output = self.output().expect(msg); - output.expect_success(msg); - output - } -} - -pub trait OutputExt { - fn expect_success(&self, msg: &str); -} - -impl OutputExt for process::Output { - fn expect_success(&self, msg: &str) { - self.status.expect_success(msg) - } -} - -pub trait ExitStatusExt { - fn expect_success(&self, msg: &str); -} - -impl ExitStatusExt for process::ExitStatus { - fn expect_success(&self, msg: &str) { - if !self.success() { - match self.code() { - Some(code) => panic!( - "Command exited with code {code}: {msg}", - code = code, - msg = msg - ), - None => panic!("Command exited with signal: {msg}", msg = msg), - } + if !output.status.success() { + panic!( + "{}: command {:?} exited with error {:?}\nSTDOUT: {}\nSTDERR: {}", + msg, + self, + output.status.code(), + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); } + output } } diff --git a/src/garage/tests/common/garage.rs b/src/garage/tests/common/garage.rs index d1f0867a..ebc82f37 100644 --- a/src/garage/tests/common/garage.rs +++ b/src/garage/tests/common/garage.rs @@ -96,7 +96,7 @@ api_bind_addr = "127.0.0.1:{admin_port}" .arg("server") .stdout(stdout) .stderr(stderr) - .env("RUST_LOG", "garage=info,garage_api=trace") + .env("RUST_LOG", "garage=debug,garage_api=trace") .spawn() .expect("Could not start garage"); -- cgit v1.2.3 From acd49de9f97bd27409232691262bd5827983388d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 13:07:42 +0100 Subject: rpc: fix write set quorums --- src/api/common_error.rs | 8 ++------ src/model/k2v/rpc.rs | 2 +- src/rpc/rpc_helper.rs | 18 +++++++++++++++--- src/util/error.rs | 7 ++++--- 4 files changed, 22 insertions(+), 13 deletions(-) (limited to 'src') diff --git a/src/api/common_error.rs b/src/api/common_error.rs index 20f9f266..ecb22fd8 100644 --- a/src/api/common_error.rs +++ b/src/api/common_error.rs @@ -53,9 +53,7 @@ impl CommonError { pub fn http_status_code(&self) -> StatusCode { match self { CommonError::InternalError( - GarageError::Timeout - | GarageError::RemoteError(_) - | GarageError::Quorum(_, _, _, _), + GarageError::Timeout | GarageError::RemoteError(_) | GarageError::Quorum(..), ) => StatusCode::SERVICE_UNAVAILABLE, CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => { StatusCode::INTERNAL_SERVER_ERROR @@ -72,9 +70,7 @@ impl CommonError { match self { CommonError::Forbidden(_) => "AccessDenied", CommonError::InternalError( - GarageError::Timeout - | GarageError::RemoteError(_) - | GarageError::Quorum(_, _, _, _), + GarageError::Timeout | GarageError::RemoteError(_) | GarageError::Quorum(..), ) => "ServiceUnavailable", CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => { "InternalError" diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs index 863a068a..3c759181 100644 --- a/src/model/k2v/rpc.rs +++ b/src/model/k2v/rpc.rs @@ -344,7 +344,7 @@ impl K2VRpcHandler { } if errors.len() > nodes.len() - quorum { let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); - return Err(Error::Quorum(quorum, resps.len(), nodes.len(), errors).into()); + return Err(Error::Quorum(quorum, None, resps.len(), nodes.len(), errors).into()); } // Take all returned items into account to produce the response. diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 12d073b6..1bad495b 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -327,7 +327,13 @@ impl RpcHelper { Ok(successes) } else { let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); - Err(Error::Quorum(quorum, successes.len(), to.len(), errors)) + Err(Error::Quorum( + quorum, + None, + successes.len(), + to.len(), + errors, + )) } } @@ -469,7 +475,7 @@ impl RpcHelper { } } - if set_counters.iter().all(|x| x.0 > quorum) { + if set_counters.iter().all(|x| x.0 >= quorum) { // Success // Continue all other requets in background @@ -492,6 +498,12 @@ impl RpcHelper { // Failure, could not get quorum let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); - Err(Error::Quorum(quorum, successes.len(), peers.len(), errors)) + Err(Error::Quorum( + quorum, + Some(to_sets.len()), + successes.len(), + peers.len(), + errors, + )) } } diff --git a/src/util/error.rs b/src/util/error.rs index 3fcee71d..be7cdfdb 100644 --- a/src/util/error.rs +++ b/src/util/error.rs @@ -55,13 +55,14 @@ pub enum Error { Timeout, #[error( - display = "Could not reach quorum of {}. {} of {} request succeeded, others returned errors: {:?}", + display = "Could not reach quorum of {} (sets={:?}). {} of {} request succeeded, others returned errors: {:?}", _0, _1, _2, - _3 + _3, + _4 )] - Quorum(usize, usize, usize, Vec), + Quorum(usize, Option, usize, usize, Vec), #[error(display = "Unexpected RPC message: {}", _0)] UnexpectedRpcMessage(String), -- cgit v1.2.3 From 65066c70640371cc318faddfb4c05c96de18e86d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 13:28:30 +0100 Subject: layout: wip cache global mins --- src/rpc/layout/history.rs | 46 +++++++++++++++++++++++++++------------------- src/rpc/layout/manager.rs | 6 +++--- src/rpc/layout/schema.rs | 36 +++++++++++++++++++++++++++--------- 3 files changed, 57 insertions(+), 31 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 2346b14a..1684918e 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -9,6 +9,7 @@ use garage_util::error::*; use super::schema::*; use super::*; + impl LayoutHistory { pub fn new(replication_factor: usize) -> Self { let version = LayoutVersion::new(replication_factor); @@ -49,7 +50,7 @@ impl LayoutHistory { // ------------------ who stores what now? --------------- pub fn all_ack(&self) -> u64 { - self.calculate_global_min(&self.update_trackers.ack_map) + self.update_trackers.ack_map.current_min } pub fn min_stored(&self) -> u64 { @@ -91,7 +92,7 @@ impl LayoutHistory { } pub fn read_nodes_of(&self, position: &Hash) -> Vec { - let sync_min = self.calculate_global_min(&self.update_trackers.sync_map); + let sync_min = self.update_trackers.sync_map.current_min; let version = self .versions .iter() @@ -122,7 +123,7 @@ impl LayoutHistory { // ------------------ update tracking --------------- - pub(crate) fn update_trackers(&mut self, node_id: Uuid) { + pub(crate) fn update_trackers_of(&mut self, node_id: Uuid) { // Ensure trackers for this node's values are up-to-date // 1. Acknowledge the last layout version in the history @@ -138,6 +139,9 @@ impl LayoutHistory { // 4. Cleanup layout versions that are not needed anymore self.cleanup_old_versions(); + // 5. Recalculate global minima + self.update_trackers_min(); + info!("ack_map: {:?}", self.update_trackers.ack_map); info!("sync_map: {:?}", self.update_trackers.sync_map); info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); @@ -146,42 +150,41 @@ impl LayoutHistory { self.update_hashes(); } + fn update_trackers_min(&mut self) { + // TODO: for TableFullReplication, counting gateway nodes might be + // necessary? Think about this more. + let storage_nodes = self.all_nongateway_nodes().into_owned(); + let min_version = self.versions.first().unwrap().version; + self.update_trackers.update_min(&storage_nodes, min_version); + } + pub(crate) fn ack_last(&mut self, node: Uuid) { let last_version = self.current().version; self.update_trackers.ack_map.set_max(node, last_version); + self.update_trackers_min(); } pub(crate) fn sync_first(&mut self, node: Uuid) { let first_version = self.versions.first().as_ref().unwrap().version; self.update_trackers.sync_map.set_max(node, first_version); + self.update_trackers_min(); } pub(crate) fn sync_ack(&mut self, node: Uuid) { - self.update_trackers.sync_ack_map.set_max( - node, - self.calculate_global_min(&self.update_trackers.sync_map), - ); + self.update_trackers + .sync_ack_map + .set_max(node, self.update_trackers.sync_map.current_min); + self.update_trackers_min(); } pub(crate) fn cleanup_old_versions(&mut self) { - let min_sync_ack = self.calculate_global_min(&self.update_trackers.sync_ack_map); + let min_sync_ack = self.update_trackers.sync_ack_map.current_min; while self.versions.first().as_ref().unwrap().version < min_sync_ack { let removed = self.versions.remove(0); info!("Layout history: pruning old version {}", removed.version); } } - pub(crate) fn calculate_global_min(&self, tracker: &UpdateTracker) -> u64 { - // TODO: for TableFullReplication, counting gateway nodes might be - // necessary? Think about this more. - let storage_nodes = self.all_nongateway_nodes(); - storage_nodes - .iter() - .map(|x| tracker.0.get(x).copied().unwrap_or(0)) - .min() - .unwrap_or(0) - } - // ================== updates to layout, public interface =================== pub fn merge(&mut self, other: &LayoutHistory) -> bool { @@ -229,6 +232,11 @@ impl LayoutHistory { } } + // Update the current_min value in trackers if anything changed + if changed { + self.update_trackers_min(); + } + // Merge staged layout changes if self.staging != other.staging { self.staging.merge(&other.staging); diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index ce8b6f61..21ec2d8d 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -74,7 +74,7 @@ impl LayoutManager { } }; - cluster_layout.update_trackers(node_id.into()); + cluster_layout.update_trackers_of(node_id.into()); let layout = Arc::new(RwLock::new(cluster_layout)); let change_notify = Arc::new(Notify::new()); @@ -158,7 +158,7 @@ impl LayoutManager { if !prev_layout_check || adv.check().is_ok() { if layout.merge(adv) { - layout.update_trackers(self.node_id); + layout.update_trackers_of(self.node_id); if prev_layout_check && layout.check().is_err() { panic!("Merged two correct layouts and got an incorrect layout."); } @@ -172,7 +172,7 @@ impl LayoutManager { let mut layout = self.layout.write().unwrap(); if layout.update_trackers != *adv { if layout.update_trackers.merge(adv) { - layout.update_trackers(self.node_id); + layout.update_trackers_of(self.node_id); return Some(layout.update_trackers.clone()); } } diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 79440a47..969f5a0b 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -260,7 +260,10 @@ mod v010 { /// The history of cluster layouts #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] - pub struct UpdateTracker(pub BTreeMap); + pub struct UpdateTracker { + pub values: BTreeMap, + pub current_min: u64, + } impl garage_util::migrate::Migrate for LayoutHistory { const VERSION_MARKER: &'static [u8] = b"G010lh"; @@ -290,14 +293,15 @@ mod v010 { nongateway_node_count, ring_assignment_data: previous.ring_assignment_data, }; - let update_tracker = UpdateTracker( - version + let update_tracker = UpdateTracker { + values: version .nongateway_nodes() .iter() .copied() .map(|x| (x, version.version)) .collect::>(), - ); + current_min: 0, + }; let staging = LayoutStaging { parameters: previous.staging_parameters, roles: previous.staging_roles, @@ -378,14 +382,14 @@ impl core::str::FromStr for ZoneRedundancy { impl UpdateTracker { fn merge(&mut self, other: &UpdateTracker) -> bool { let mut changed = false; - for (k, v) in other.0.iter() { - if let Some(v_mut) = self.0.get_mut(k) { + for (k, v) in other.values.iter() { + if let Some(v_mut) = self.values.get_mut(k) { if *v > *v_mut { *v_mut = *v; changed = true; } } else { - self.0.insert(*k, *v); + self.values.insert(*k, *v); changed = true; } } @@ -393,18 +397,26 @@ impl UpdateTracker { } pub(crate) fn set_max(&mut self, peer: Uuid, value: u64) -> bool { - match self.0.get_mut(&peer) { + match self.values.get_mut(&peer) { Some(e) if *e < value => { *e = value; true } None => { - self.0.insert(peer, value); + self.values.insert(peer, value); true } _ => false, } } + + fn update_min(&mut self, storage_nodes: &[Uuid], min_version: u64) { + self.current_min = storage_nodes + .iter() + .map(|x| self.values.get(x).copied().unwrap_or(min_version)) + .min() + .unwrap_or(min_version) + } } impl UpdateTrackers { @@ -414,4 +426,10 @@ impl UpdateTrackers { let c3 = self.sync_ack_map.merge(&other.sync_ack_map); c1 || c2 || c3 } + + pub(crate) fn update_min(&mut self, storage_nodes: &[Uuid], min_version: u64) { + self.ack_map.update_min(&storage_nodes, min_version); + self.sync_map.update_min(&storage_nodes, min_version); + self.sync_ack_map.update_min(&storage_nodes, min_version); + } } -- cgit v1.2.3 From 393c4d4515e0cdadadc8de8ae2df12e4371cff88 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 14:20:50 +0100 Subject: layout: add helper for cached/external values to centralize recomputation --- src/api/admin/cluster.rs | 1 - src/api/k2v/index.rs | 2 +- src/garage/cli/layout.rs | 3 +- src/model/helper/bucket.rs | 2 +- src/model/index_counter.rs | 4 +- src/rpc/layout/history.rs | 311 +++++++++++++++++++++++++++------------------ src/rpc/layout/manager.rs | 22 ++-- src/rpc/layout/schema.rs | 48 +++---- src/rpc/rpc_helper.rs | 6 +- src/rpc/system.rs | 4 +- 10 files changed, 222 insertions(+), 181 deletions(-) (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index d912b58f..593bd778 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -240,7 +240,6 @@ pub async fn handle_update_cluster_layout( .merge(&roles.update_mutator(node, layout::NodeRoleV(new_role))); } - layout.update_hashes(); garage .system .layout_manager diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs index c189232a..e8cd1fba 100644 --- a/src/api/k2v/index.rs +++ b/src/api/k2v/index.rs @@ -29,7 +29,7 @@ pub async fn handle_read_index( .system .cluster_layout() .all_nongateway_nodes() - .into_owned(); + .to_vec(); let (partition_keys, more, next_start) = read_range( &garage.k2v.counter_table.table, diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 0f01a37a..51774314 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -49,7 +49,7 @@ pub async fn cmd_assign_role( }; let mut layout = fetch_layout(rpc_cli, rpc_host).await?; - let all_nodes = layout.all_nodes().into_owned(); + let all_nodes = layout.get_all_nodes(); let added_nodes = args .node_ids @@ -331,7 +331,6 @@ pub async fn send_layout( rpc_host: NodeID, mut layout: LayoutHistory, ) -> Result<(), Error> { - layout.update_hashes(); rpc_cli .call( &rpc_host, diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index 2cb53424..efa3e27b 100644 --- a/src/model/helper/bucket.rs +++ b/src/model/helper/bucket.rs @@ -455,7 +455,7 @@ impl<'a> BucketHelper<'a> { .system .cluster_layout() .all_nongateway_nodes() - .into_owned(); + .to_vec(); let k2vindexes = self .0 .k2v diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs index 2d968733..e8702bf1 100644 --- a/src/model/index_counter.rs +++ b/src/model/index_counter.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use garage_db as db; -use garage_rpc::layout::LayoutHistory; +use garage_rpc::layout::LayoutHelper; use garage_rpc::system::System; use garage_util::background::BackgroundRunner; use garage_util::data::*; @@ -83,7 +83,7 @@ impl Entry for CounterEntry { } impl CounterEntry { - pub fn filtered_values(&self, layout: &LayoutHistory) -> HashMap { + pub fn filtered_values(&self, layout: &LayoutHelper) -> HashMap { let nodes = layout.all_nongateway_nodes(); self.filtered_values_with_nodes(&nodes) } diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 1684918e..b6f0e495 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -1,5 +1,5 @@ -use std::borrow::Cow; use std::collections::HashSet; +use std::ops::Deref; use garage_util::crdt::{Crdt, Lww, LwwMap}; use garage_util::data::*; @@ -9,95 +9,106 @@ use garage_util::error::*; use super::schema::*; use super::*; +pub struct LayoutHelper { + layout: Option, -impl LayoutHistory { - pub fn new(replication_factor: usize) -> Self { - let version = LayoutVersion::new(replication_factor); + // cached values + ack_map_min: u64, + sync_map_min: u64, - let staging = LayoutStaging { - parameters: Lww::::new(version.parameters), - roles: LwwMap::new(), - }; + all_nodes: Vec, + all_nongateway_nodes: Vec, - let mut ret = LayoutHistory { - versions: vec![version], - update_trackers: Default::default(), - trackers_hash: [0u8; 32].into(), - staging: Lww::raw(0, staging), - staging_hash: [0u8; 32].into(), - }; - ret.update_hashes(); - ret - } + trackers_hash: Hash, + staging_hash: Hash, +} - pub fn current(&self) -> &LayoutVersion { - self.versions.last().as_ref().unwrap() +impl Deref for LayoutHelper { + type Target = LayoutHistory; + fn deref(&self) -> &LayoutHistory { + self.layout() } +} - pub fn update_hashes(&mut self) { - self.trackers_hash = self.calculate_trackers_hash(); - self.staging_hash = self.calculate_staging_hash(); +impl LayoutHelper { + pub fn new(mut layout: LayoutHistory) -> Self { + layout.cleanup_old_versions(); + + let all_nongateway_nodes = layout.get_all_nongateway_nodes(); + layout.clamp_update_trackers(&all_nongateway_nodes); + + let min_version = layout.min_stored(); + let ack_map_min = layout + .update_trackers + .ack_map + .min(&all_nongateway_nodes, min_version); + let sync_map_min = layout + .update_trackers + .sync_map + .min(&all_nongateway_nodes, min_version); + + let all_nodes = layout.get_all_nodes(); + let trackers_hash = layout.calculate_trackers_hash(); + let staging_hash = layout.calculate_staging_hash(); + + LayoutHelper { + layout: Some(layout), + ack_map_min, + sync_map_min, + all_nodes, + all_nongateway_nodes, + trackers_hash, + staging_hash, + } } - pub(crate) fn calculate_trackers_hash(&self) -> Hash { - blake2sum(&nonversioned_encode(&self.update_trackers).unwrap()[..]) - } + // ------------------ single updating function -------------- - pub(crate) fn calculate_staging_hash(&self) -> Hash { - blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) + fn layout(&self) -> &LayoutHistory { + self.layout.as_ref().unwrap() } - // ------------------ who stores what now? --------------- - - pub fn all_ack(&self) -> u64 { - self.update_trackers.ack_map.current_min + pub(crate) fn update(&mut self, f: F) -> bool + where + F: FnOnce(&mut LayoutHistory) -> bool, + { + let changed = f(&mut self.layout.as_mut().unwrap()); + if changed { + *self = Self::new(self.layout.take().unwrap()); + } + changed } - pub fn min_stored(&self) -> u64 { - self.versions.first().as_ref().unwrap().version + // ------------------ read helpers --------------- + + pub fn all_nodes(&self) -> &[Uuid] { + &self.all_nodes } - pub fn sync_versions(&self) -> (u64, u64, u64) { - (self.current().version, self.all_ack(), self.min_stored()) + pub fn all_nongateway_nodes(&self) -> &[Uuid] { + &self.all_nongateway_nodes } - pub fn all_nodes(&self) -> Cow<'_, [Uuid]> { - // TODO: cache this - if self.versions.len() == 1 { - self.versions[0].all_nodes().into() - } else { - let set = self - .versions - .iter() - .map(|x| x.all_nodes()) - .flatten() - .collect::>(); - set.into_iter().copied().collect::>().into() - } + pub fn all_ack(&self) -> u64 { + self.ack_map_min } - pub fn all_nongateway_nodes(&self) -> Cow<'_, [Uuid]> { - // TODO: cache this - if self.versions.len() == 1 { - self.versions[0].nongateway_nodes().into() - } else { - let set = self - .versions - .iter() - .map(|x| x.nongateway_nodes()) - .flatten() - .collect::>(); - set.into_iter().copied().collect::>().into() - } + pub fn sync_versions(&self) -> (u64, u64, u64) { + ( + self.layout().current().version, + self.all_ack(), + self.layout().min_stored(), + ) } pub fn read_nodes_of(&self, position: &Hash) -> Vec { - let sync_min = self.update_trackers.sync_map.current_min; + let sync_min = self.sync_map_min; let version = self + .layout() .versions .iter() .find(|x| x.version == sync_min) - .or(self.versions.last()) + .or(self.layout().versions.last()) .unwrap(); version .nodes_of(position, version.replication_factor) @@ -105,7 +116,8 @@ impl LayoutHistory { } pub fn write_sets_of(&self, position: &Hash) -> Vec> { - self.versions + self.layout() + .versions .iter() .map(|x| x.nodes_of(position, x.replication_factor).collect()) .collect() @@ -113,7 +125,7 @@ impl LayoutHistory { pub fn storage_nodes_of(&self, position: &Hash) -> Vec { let mut ret = vec![]; - for version in self.versions.iter() { + for version in self.layout().versions.iter() { ret.extend(version.nodes_of(position, version.replication_factor)); } ret.sort(); @@ -121,7 +133,35 @@ impl LayoutHistory { ret } - // ------------------ update tracking --------------- + pub fn trackers_hash(&self) -> Hash { + self.trackers_hash + } + + pub fn staging_hash(&self) -> Hash { + self.staging_hash + } + + // ------------------ helpers for update tracking --------------- + + pub(crate) fn sync_first(&mut self, node: Uuid) { + let first_version = self.versions.first().as_ref().unwrap().version; + self.update(|layout| layout.update_trackers.sync_map.set_max(node, first_version)); + } + + pub(crate) fn sync_ack(&mut self, node: Uuid) { + let sync_map_min = self.sync_map_min; + self.update(|layout| { + layout + .update_trackers + .sync_ack_map + .set_max(node, sync_map_min) + }); + } + + pub(crate) fn ack_last(&mut self, node: Uuid) { + let last_version = self.current().version; + self.update(|layout| layout.update_trackers.ack_map.set_max(node, last_version)); + } pub(crate) fn update_trackers_of(&mut self, node_id: Uuid) { // Ensure trackers for this node's values are up-to-date @@ -136,55 +176,104 @@ impl LayoutHistory { // 3. Acknowledge everyone has synced up to min(self.sync_map) self.sync_ack(node_id); - // 4. Cleanup layout versions that are not needed anymore - self.cleanup_old_versions(); - - // 5. Recalculate global minima - self.update_trackers_min(); - info!("ack_map: {:?}", self.update_trackers.ack_map); info!("sync_map: {:?}", self.update_trackers.sync_map); info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); + } +} - // Finally, update hashes - self.update_hashes(); +// ---- + +impl LayoutHistory { + pub fn new(replication_factor: usize) -> Self { + let version = LayoutVersion::new(replication_factor); + + let staging = LayoutStaging { + parameters: Lww::::new(version.parameters), + roles: LwwMap::new(), + }; + + LayoutHistory { + versions: vec![version], + update_trackers: Default::default(), + staging: Lww::raw(0, staging), + } } - fn update_trackers_min(&mut self) { - // TODO: for TableFullReplication, counting gateway nodes might be - // necessary? Think about this more. - let storage_nodes = self.all_nongateway_nodes().into_owned(); - let min_version = self.versions.first().unwrap().version; - self.update_trackers.update_min(&storage_nodes, min_version); + // ------------------ who stores what now? --------------- + + pub fn current(&self) -> &LayoutVersion { + self.versions.last().as_ref().unwrap() } - pub(crate) fn ack_last(&mut self, node: Uuid) { - let last_version = self.current().version; - self.update_trackers.ack_map.set_max(node, last_version); - self.update_trackers_min(); + pub fn min_stored(&self) -> u64 { + self.versions.first().as_ref().unwrap().version } - pub(crate) fn sync_first(&mut self, node: Uuid) { - let first_version = self.versions.first().as_ref().unwrap().version; - self.update_trackers.sync_map.set_max(node, first_version); - self.update_trackers_min(); + pub fn get_all_nodes(&self) -> Vec { + if self.versions.len() == 1 { + self.versions[0].all_nodes().to_vec() + } else { + let set = self + .versions + .iter() + .map(|x| x.all_nodes()) + .flatten() + .collect::>(); + set.into_iter().copied().collect::>() + } } - pub(crate) fn sync_ack(&mut self, node: Uuid) { - self.update_trackers - .sync_ack_map - .set_max(node, self.update_trackers.sync_map.current_min); - self.update_trackers_min(); + fn get_all_nongateway_nodes(&self) -> Vec { + if self.versions.len() == 1 { + self.versions[0].nongateway_nodes().to_vec() + } else { + let set = self + .versions + .iter() + .map(|x| x.nongateway_nodes()) + .flatten() + .collect::>(); + set.into_iter().copied().collect::>() + } } - pub(crate) fn cleanup_old_versions(&mut self) { - let min_sync_ack = self.update_trackers.sync_ack_map.current_min; - while self.versions.first().as_ref().unwrap().version < min_sync_ack { - let removed = self.versions.remove(0); - info!("Layout history: pruning old version {}", removed.version); + // ---- housekeeping (all invoked by LayoutHelper) ---- + + fn cleanup_old_versions(&mut self) { + loop { + let all_nongateway_nodes = self.get_all_nongateway_nodes(); + let min_version = self.min_stored(); + let sync_ack_map_min = self + .update_trackers + .sync_ack_map + .min(&all_nongateway_nodes, min_version); + if self.min_stored() < sync_ack_map_min { + let removed = self.versions.remove(0); + info!("Layout history: pruning old version {}", removed.version); + } else { + break; + } } } + fn clamp_update_trackers(&mut self, nodes: &[Uuid]) { + let min_v = self.min_stored(); + for node in nodes { + self.update_trackers.ack_map.set_max(*node, min_v); + self.update_trackers.sync_map.set_max(*node, min_v); + self.update_trackers.sync_ack_map.set_max(*node, min_v); + } + } + + fn calculate_trackers_hash(&self) -> Hash { + blake2sum(&nonversioned_encode(&self.update_trackers).unwrap()[..]) + } + + fn calculate_staging_hash(&self) -> Hash { + blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) + } + // ================== updates to layout, public interface =================== pub fn merge(&mut self, other: &LayoutHistory) -> bool { @@ -221,20 +310,6 @@ impl LayoutHistory { self.versions.remove(0); changed = true; } - if changed { - let min_v = self.versions.first().unwrap().version; - let nodes = self.all_nongateway_nodes().into_owned(); - for node in nodes { - self.update_trackers.ack_map.set_max(node, min_v); - self.update_trackers.sync_map.set_max(node, min_v); - self.update_trackers.sync_ack_map.set_max(node, min_v); - } - } - } - - // Update the current_min value in trackers if anything changed - if changed { - self.update_trackers_min(); } // Merge staged layout changes @@ -280,7 +355,6 @@ To know the correct value of the new layout version, invoke `garage layout show` parameters: self.staging.get().parameters.clone(), roles: LwwMap::new(), }); - self.update_hashes(); Ok((self, msg)) } @@ -290,20 +364,11 @@ To know the correct value of the new layout version, invoke `garage layout show` parameters: Lww::new(self.current().parameters.clone()), roles: LwwMap::new(), }); - self.update_hashes(); Ok(self) } pub fn check(&self) -> Result<(), String> { - // Check that the hash of the staging data is correct - if self.trackers_hash != self.calculate_trackers_hash() { - return Err("trackers_hash is incorrect".into()); - } - if self.staging_hash != self.calculate_staging_hash() { - return Err("staging_hash is incorrect".into()); - } - for version in self.versions.iter() { version.check()?; } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 21ec2d8d..e270ad21 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -24,7 +24,7 @@ pub struct LayoutManager { replication_factor: usize, persist_cluster_layout: Persister, - layout: Arc>, + layout: Arc>, pub(crate) change_notify: Arc, table_sync_version: Mutex>, @@ -54,7 +54,7 @@ impl LayoutManager { let persist_cluster_layout: Persister = Persister::new(&config.metadata_dir, "cluster_layout"); - let mut cluster_layout = match persist_cluster_layout.load() { + let cluster_layout = match persist_cluster_layout.load() { Ok(x) => { if x.current().replication_factor != replication_factor { return Err(Error::Message(format!( @@ -74,6 +74,7 @@ impl LayoutManager { } }; + let mut cluster_layout = LayoutHelper::new(cluster_layout); cluster_layout.update_trackers_of(node_id.into()); let layout = Arc::new(RwLock::new(cluster_layout)); @@ -100,7 +101,7 @@ impl LayoutManager { // ---- PUBLIC INTERFACE ---- - pub fn layout(&self) -> RwLockReadGuard<'_, LayoutHistory> { + pub fn layout(&self) -> RwLockReadGuard<'_, LayoutHelper> { self.layout.read().unwrap() } @@ -108,8 +109,8 @@ impl LayoutManager { let layout = self.layout(); LayoutStatus { cluster_layout_version: layout.current().version, - cluster_layout_trackers_hash: layout.trackers_hash, - cluster_layout_staging_hash: layout.staging_hash, + cluster_layout_trackers_hash: layout.trackers_hash(), + cluster_layout_staging_hash: layout.staging_hash(), } } @@ -137,13 +138,8 @@ impl LayoutManager { drop(table_sync_version); let mut layout = self.layout.write().unwrap(); - if layout - .update_trackers - .sync_map - .set_max(self.node_id, sync_until) - { + if layout.update(|l| l.update_trackers.sync_map.set_max(self.node_id, sync_until)) { debug!("sync_until updated to {}", sync_until); - layout.update_hashes(); self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( layout.update_trackers.clone(), )); @@ -157,7 +153,7 @@ impl LayoutManager { let prev_layout_check = layout.check().is_ok(); if !prev_layout_check || adv.check().is_ok() { - if layout.merge(adv) { + if layout.update(|l| l.merge(adv)) { layout.update_trackers_of(self.node_id); if prev_layout_check && layout.check().is_err() { panic!("Merged two correct layouts and got an incorrect layout."); @@ -171,7 +167,7 @@ impl LayoutManager { fn merge_layout_trackers(&self, adv: &UpdateTrackers) -> Option { let mut layout = self.layout.write().unwrap(); if layout.update_trackers != *adv { - if layout.update_trackers.merge(adv) { + if layout.update(|l| l.update_trackers.merge(adv)) { layout.update_trackers_of(self.node_id); return Some(layout.update_trackers.clone()); } diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 969f5a0b..00a2c017 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -188,7 +188,7 @@ mod v010 { use super::v09; use crate::layout::CompactNodeType; use garage_util::crdt::{Lww, LwwMap}; - use garage_util::data::{Hash, Uuid}; + use garage_util::data::Uuid; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; @@ -202,13 +202,9 @@ mod v010 { /// Update trackers pub update_trackers: UpdateTrackers, - /// Hash of the update trackers - pub trackers_hash: Hash, /// Staged changes for the next version pub staging: Lww, - /// Hash of the serialized staging_parameters + staging_roles - pub staging_hash: Hash, } /// A version of the layout of the cluster, i.e. the list of roles @@ -260,10 +256,7 @@ mod v010 { /// The history of cluster layouts #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] - pub struct UpdateTracker { - pub values: BTreeMap, - pub current_min: u64, - } + pub struct UpdateTracker(pub BTreeMap); impl garage_util::migrate::Migrate for LayoutHistory { const VERSION_MARKER: &'static [u8] = b"G010lh"; @@ -293,32 +286,27 @@ mod v010 { nongateway_node_count, ring_assignment_data: previous.ring_assignment_data, }; - let update_tracker = UpdateTracker { - values: version + let update_tracker = UpdateTracker( + version .nongateway_nodes() .iter() .copied() .map(|x| (x, version.version)) .collect::>(), - current_min: 0, - }; + ); let staging = LayoutStaging { parameters: previous.staging_parameters, roles: previous.staging_roles, }; - let mut ret = Self { + Self { versions: vec![version], update_trackers: UpdateTrackers { ack_map: update_tracker.clone(), sync_map: update_tracker.clone(), sync_ack_map: update_tracker.clone(), }, - trackers_hash: [0u8; 32].into(), staging: Lww::raw(previous.version, staging), - staging_hash: [0u8; 32].into(), - }; - ret.update_hashes(); - ret + } } } } @@ -382,14 +370,14 @@ impl core::str::FromStr for ZoneRedundancy { impl UpdateTracker { fn merge(&mut self, other: &UpdateTracker) -> bool { let mut changed = false; - for (k, v) in other.values.iter() { - if let Some(v_mut) = self.values.get_mut(k) { + for (k, v) in other.0.iter() { + if let Some(v_mut) = self.0.get_mut(k) { if *v > *v_mut { *v_mut = *v; changed = true; } } else { - self.values.insert(*k, *v); + self.0.insert(*k, *v); changed = true; } } @@ -397,23 +385,23 @@ impl UpdateTracker { } pub(crate) fn set_max(&mut self, peer: Uuid, value: u64) -> bool { - match self.values.get_mut(&peer) { + match self.0.get_mut(&peer) { Some(e) if *e < value => { *e = value; true } None => { - self.values.insert(peer, value); + self.0.insert(peer, value); true } _ => false, } } - fn update_min(&mut self, storage_nodes: &[Uuid], min_version: u64) { - self.current_min = storage_nodes + pub(crate) fn min(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 { + storage_nodes .iter() - .map(|x| self.values.get(x).copied().unwrap_or(min_version)) + .map(|x| self.0.get(x).copied().unwrap_or(min_version)) .min() .unwrap_or(min_version) } @@ -426,10 +414,4 @@ impl UpdateTrackers { let c3 = self.sync_ack_map.merge(&other.sync_ack_map); c1 || c2 || c3 } - - pub(crate) fn update_min(&mut self, storage_nodes: &[Uuid], min_version: u64) { - self.ack_map.update_min(&storage_nodes, min_version); - self.sync_map.update_min(&storage_nodes, min_version); - self.sync_ack_map.update_min(&storage_nodes, min_version); - } } diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 1bad495b..e269ddaa 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -26,7 +26,7 @@ use garage_util::data::*; use garage_util::error::Error; use garage_util::metrics::RecordDuration; -use crate::layout::LayoutHistory; +use crate::layout::LayoutHelper; use crate::metrics::RpcMetrics; // Default RPC timeout = 5 minutes @@ -90,7 +90,7 @@ pub struct RpcHelper(Arc); struct RpcHelperInner { our_node_id: Uuid, fullmesh: Arc, - layout: Arc>, + layout: Arc>, metrics: RpcMetrics, rpc_timeout: Duration, } @@ -99,7 +99,7 @@ impl RpcHelper { pub(crate) fn new( our_node_id: Uuid, fullmesh: Arc, - layout: Arc>, + layout: Arc>, rpc_timeout: Option, ) -> Self { let metrics = RpcMetrics::new(); diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 31d78bf6..d74dc2a1 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -34,7 +34,7 @@ use crate::consul::ConsulDiscovery; #[cfg(feature = "kubernetes-discovery")] use crate::kubernetes::*; use crate::layout::manager::{LayoutManager, LayoutStatus}; -use crate::layout::{self, LayoutHistory, NodeRoleV}; +use crate::layout::{self, LayoutHelper, LayoutHistory, NodeRoleV}; use crate::replication_mode::*; use crate::rpc_helper::*; @@ -350,7 +350,7 @@ impl System { // ---- Public utilities / accessors ---- - pub fn cluster_layout(&self) -> RwLockReadGuard<'_, LayoutHistory> { + pub fn cluster_layout(&self) -> RwLockReadGuard<'_, LayoutHelper> { self.layout_manager.layout() } -- cgit v1.2.3 From 33c8a489b0a9c0e869282bfc19c548f5a3e02e8c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 15:40:44 +0100 Subject: layou: implement ack locking --- src/block/manager.rs | 2 +- src/garage/cli/layout.rs | 2 +- src/rpc/layout/history.rs | 98 +++++++++++++++++++++++++++---------- src/rpc/layout/manager.rs | 74 +++++++++++++++++++++++++--- src/rpc/layout/mod.rs | 1 + src/table/replication/fullcopy.rs | 4 +- src/table/replication/parameters.rs | 4 +- src/table/replication/sharded.rs | 6 ++- src/table/sync.rs | 9 +--- src/table/table.rs | 2 +- 10 files changed, 156 insertions(+), 46 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index 0ca8bc31..be2e4951 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -366,7 +366,7 @@ impl BlockManager { .rpc_helper() .try_write_many_sets( &self.endpoint, - &who, + who.as_ref(), put_block_rpc, RequestStrategy::with_priority(PRIO_NORMAL | PRIO_SECONDARY) .with_quorum(self.replication.write_quorum()), diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 51774314..0be8278f 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -329,7 +329,7 @@ pub async fn fetch_layout( pub async fn send_layout( rpc_cli: &Endpoint, rpc_host: NodeID, - mut layout: LayoutHistory, + layout: LayoutHistory, ) -> Result<(), Error> { rpc_cli .call( diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index b6f0e495..dd38efa7 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -1,5 +1,7 @@ +use std::collections::HashMap; use std::collections::HashSet; use std::ops::Deref; +use std::sync::atomic::{AtomicUsize, Ordering}; use garage_util::crdt::{Crdt, Lww, LwwMap}; use garage_util::data::*; @@ -21,6 +23,11 @@ pub struct LayoutHelper { trackers_hash: Hash, staging_hash: Hash, + + // ack lock: counts in-progress write operations for each + // layout version ; we don't increase the ack update tracker + // while this lock is nonzero + pub(crate) ack_lock: HashMap, } impl Deref for LayoutHelper { @@ -31,7 +38,7 @@ impl Deref for LayoutHelper { } impl LayoutHelper { - pub fn new(mut layout: LayoutHistory) -> Self { + pub fn new(mut layout: LayoutHistory, mut ack_lock: HashMap) -> Self { layout.cleanup_old_versions(); let all_nongateway_nodes = layout.get_all_nongateway_nodes(); @@ -51,6 +58,11 @@ impl LayoutHelper { let trackers_hash = layout.calculate_trackers_hash(); let staging_hash = layout.calculate_staging_hash(); + ack_lock.retain(|_, cnt| *cnt.get_mut() > 0); + ack_lock + .entry(layout.current().version) + .or_insert(AtomicUsize::new(0)); + LayoutHelper { layout: Some(layout), ack_map_min, @@ -59,6 +71,7 @@ impl LayoutHelper { all_nongateway_nodes, trackers_hash, staging_hash, + ack_lock, } } @@ -74,7 +87,10 @@ impl LayoutHelper { { let changed = f(&mut self.layout.as_mut().unwrap()); if changed { - *self = Self::new(self.layout.take().unwrap()); + *self = Self::new( + self.layout.take().unwrap(), + std::mem::take(&mut self.ack_lock), + ); } changed } @@ -115,7 +131,7 @@ impl LayoutHelper { .collect() } - pub fn write_sets_of(&self, position: &Hash) -> Vec> { + pub(crate) fn write_sets_of(&self, position: &Hash) -> Vec> { self.layout() .versions .iter() @@ -143,42 +159,72 @@ impl LayoutHelper { // ------------------ helpers for update tracking --------------- - pub(crate) fn sync_first(&mut self, node: Uuid) { + pub(crate) fn update_trackers(&mut self, local_node_id: Uuid) { + // Ensure trackers for this node's values are up-to-date + + // 1. Acknowledge the last layout version which is not currently + // locked by an in-progress write operation + self.ack_max_free(local_node_id); + + // 2. Assume the data on this node is sync'ed up at least to + // the first layout version in the history + self.sync_first(local_node_id); + + // 3. Acknowledge everyone has synced up to min(self.sync_map) + self.sync_ack(local_node_id); + + info!("ack_map: {:?}", self.update_trackers.ack_map); + info!("sync_map: {:?}", self.update_trackers.sync_map); + info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); + } + + fn sync_first(&mut self, local_node_id: Uuid) { let first_version = self.versions.first().as_ref().unwrap().version; - self.update(|layout| layout.update_trackers.sync_map.set_max(node, first_version)); + self.update(|layout| { + layout + .update_trackers + .sync_map + .set_max(local_node_id, first_version) + }); } - pub(crate) fn sync_ack(&mut self, node: Uuid) { + fn sync_ack(&mut self, local_node_id: Uuid) { let sync_map_min = self.sync_map_min; self.update(|layout| { layout .update_trackers .sync_ack_map - .set_max(node, sync_map_min) + .set_max(local_node_id, sync_map_min) }); } - pub(crate) fn ack_last(&mut self, node: Uuid) { - let last_version = self.current().version; - self.update(|layout| layout.update_trackers.ack_map.set_max(node, last_version)); + pub(crate) fn ack_max_free(&mut self, local_node_id: Uuid) -> bool { + let max_ack = self.max_free_ack(); + let changed = self.update(|layout| { + layout + .update_trackers + .ack_map + .set_max(local_node_id, max_ack) + }); + if changed { + info!("ack_until updated to {}", max_ack); + } + changed } - pub(crate) fn update_trackers_of(&mut self, node_id: Uuid) { - // Ensure trackers for this node's values are up-to-date - - // 1. Acknowledge the last layout version in the history - self.ack_last(node_id); - - // 2. Assume the data on this node is sync'ed up at least to - // the first layout version in the history - self.sync_first(node_id); - - // 3. Acknowledge everyone has synced up to min(self.sync_map) - self.sync_ack(node_id); - - info!("ack_map: {:?}", self.update_trackers.ack_map); - info!("sync_map: {:?}", self.update_trackers.sync_map); - info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); + pub(crate) fn max_free_ack(&self) -> u64 { + self.layout() + .versions + .iter() + .map(|x| x.version) + .take_while(|v| { + self.ack_lock + .get(v) + .map(|x| x.load(Ordering::Relaxed) == 0) + .unwrap_or(true) + }) + .max() + .unwrap_or(self.min_stored()) } } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index e270ad21..4e073d1f 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -1,5 +1,5 @@ use std::collections::HashMap; -use std::sync::{Arc, Mutex, RwLock, RwLockReadGuard}; +use std::sync::{atomic::Ordering, Arc, Mutex, RwLock, RwLockReadGuard}; use std::time::Duration; use serde::{Deserialize, Serialize}; @@ -74,8 +74,8 @@ impl LayoutManager { } }; - let mut cluster_layout = LayoutHelper::new(cluster_layout); - cluster_layout.update_trackers_of(node_id.into()); + let mut cluster_layout = LayoutHelper::new(cluster_layout, Default::default()); + cluster_layout.update_trackers(node_id.into()); let layout = Arc::new(RwLock::new(cluster_layout)); let change_notify = Arc::new(Notify::new()); @@ -139,13 +139,36 @@ impl LayoutManager { let mut layout = self.layout.write().unwrap(); if layout.update(|l| l.update_trackers.sync_map.set_max(self.node_id, sync_until)) { - debug!("sync_until updated to {}", sync_until); + info!("sync_until updated to {}", sync_until); self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( layout.update_trackers.clone(), )); } } + fn ack_new_version(self: &Arc) { + let mut layout = self.layout.write().unwrap(); + if layout.ack_max_free(self.node_id) { + self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( + layout.update_trackers.clone(), + )); + } + } + + // ---- ACK LOCKING ---- + + pub fn write_sets_of(self: &Arc, position: &Hash) -> WriteLock>> { + let layout = self.layout(); + let version = layout.current().version; + let nodes = layout.write_sets_of(position); + layout + .ack_lock + .get(&version) + .unwrap() + .fetch_add(1, Ordering::Relaxed); + WriteLock::new(version, self, nodes) + } + // ---- INTERNALS --- fn merge_layout(&self, adv: &LayoutHistory) -> Option { @@ -154,7 +177,7 @@ impl LayoutManager { if !prev_layout_check || adv.check().is_ok() { if layout.update(|l| l.merge(adv)) { - layout.update_trackers_of(self.node_id); + layout.update_trackers(self.node_id); if prev_layout_check && layout.check().is_err() { panic!("Merged two correct layouts and got an incorrect layout."); } @@ -168,7 +191,7 @@ impl LayoutManager { let mut layout = self.layout.write().unwrap(); if layout.update_trackers != *adv { if layout.update(|l| l.update_trackers.merge(adv)) { - layout.update_trackers_of(self.node_id); + layout.update_trackers(self.node_id); return Some(layout.update_trackers.clone()); } } @@ -317,3 +340,42 @@ impl LayoutManager { Ok(SystemRpc::Ok) } } + +// ---- ack lock ---- + +pub struct WriteLock { + layout_version: u64, + layout_manager: Arc, + value: T, +} + +impl WriteLock { + fn new(version: u64, layout_manager: &Arc, value: T) -> Self { + Self { + layout_version: version, + layout_manager: layout_manager.clone(), + value, + } + } +} + +impl AsRef for WriteLock { + fn as_ref(&self) -> &T { + &self.value + } +} + +impl Drop for WriteLock { + fn drop(&mut self) { + let layout = self.layout_manager.layout(); // acquire read lock + if let Some(counter) = layout.ack_lock.get(&self.layout_version) { + let prev_lock = counter.fetch_sub(1, Ordering::Relaxed); + if prev_lock == 1 && layout.current().version > self.layout_version { + drop(layout); // release read lock, write lock will be acquired + self.layout_manager.ack_new_version(); + } + } else { + error!("Could not find ack lock counter for layout version {}. This probably indicates a bug in Garage.", self.layout_version); + } + } +} diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index 577b32fb..859287c8 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -11,6 +11,7 @@ pub mod manager; // ---- re-exports ---- pub use history::*; +pub use manager::WriteLock; pub use schema::*; pub use version::*; diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index cb5471af..df930224 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -27,6 +27,8 @@ pub struct TableFullReplication { } impl TableReplication for TableFullReplication { + type WriteSets = Vec>; + fn storage_nodes(&self, _hash: &Hash) -> Vec { let layout = self.system.cluster_layout(); layout.current().all_nodes().to_vec() @@ -39,7 +41,7 @@ impl TableReplication for TableFullReplication { 1 } - fn write_sets(&self, hash: &Hash) -> Vec> { + fn write_sets(&self, hash: &Hash) -> Self::WriteSets { vec![self.storage_nodes(hash)] } fn write_quorum(&self) -> usize { diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index 2f842409..a4e701bb 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -3,6 +3,8 @@ use garage_util::data::*; /// Trait to describe how a table shall be replicated pub trait TableReplication: Send + Sync + 'static { + type WriteSets: AsRef>> + Send + Sync + 'static; + // See examples in table_sharded.rs and table_fullcopy.rs // To understand various replication methods @@ -15,7 +17,7 @@ pub trait TableReplication: Send + Sync + 'static { fn read_quorum(&self) -> usize; /// Which nodes to send writes to - fn write_sets(&self, hash: &Hash) -> Vec>; + fn write_sets(&self, hash: &Hash) -> Self::WriteSets; /// Responses needed to consider a write succesfull in each set fn write_quorum(&self) -> usize; fn max_write_errors(&self) -> usize; diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 1320a189..2a16bc0c 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -25,6 +25,8 @@ pub struct TableShardedReplication { } impl TableReplication for TableShardedReplication { + type WriteSets = WriteLock>>; + fn storage_nodes(&self, hash: &Hash) -> Vec { self.system.cluster_layout().storage_nodes_of(hash) } @@ -36,8 +38,8 @@ impl TableReplication for TableShardedReplication { self.read_quorum } - fn write_sets(&self, hash: &Hash) -> Vec> { - self.system.cluster_layout().write_sets_of(hash) + fn write_sets(&self, hash: &Hash) -> Self::WriteSets { + self.system.layout_manager.write_sets_of(hash) } fn write_quorum(&self) -> usize { self.write_quorum diff --git a/src/table/sync.rs b/src/table/sync.rs index b67cdd79..efeac402 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -173,12 +173,7 @@ impl TableSyncer { } if !items.is_empty() { - let nodes = self - .data - .replication - .storage_nodes(begin) - .into_iter() - .collect::>(); + let nodes = self.data.replication.storage_nodes(begin); if nodes.contains(&self.system.id) { warn!( "({}) Interrupting offload as partitions seem to have changed", @@ -202,7 +197,7 @@ impl TableSyncer { end, counter ); - self.offload_items(&items, &nodes[..]).await?; + self.offload_items(&items, &nodes).await?; } else { break; } diff --git a/src/table/table.rs b/src/table/table.rs index c2efaeaf..5ec9eb0a 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -128,7 +128,7 @@ impl Table { .rpc_helper() .try_write_many_sets( &self.endpoint, - &who, + who.as_ref(), rpc, RequestStrategy::with_priority(PRIO_NORMAL) .with_quorum(self.data.replication.write_quorum()), -- cgit v1.2.3 From d4df03424f1c7f3cc1eaba9e16d2e1d049131b97 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 15:56:57 +0100 Subject: layout: fix test --- src/rpc/layout/test.rs | 2 -- 1 file changed, 2 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/test.rs b/src/rpc/layout/test.rs index e9639073..bb072c97 100644 --- a/src/rpc/layout/test.rs +++ b/src/rpc/layout/test.rs @@ -113,8 +113,6 @@ fn update_layout( staging.parameters.update(LayoutParameters { zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), }); - - cl.update_hashes(); } #[test] -- cgit v1.2.3 From ad5c6f779f7fdfdc0569920c830c59197023515a Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 16 Nov 2023 13:26:43 +0100 Subject: layout: split helper in separate file; more precise difference tracking --- src/rpc/layout/helper.rs | 224 +++++++++++++++++++++++++++++++++++++ src/rpc/layout/history.rs | 278 +++++----------------------------------------- src/rpc/layout/manager.rs | 5 +- src/rpc/layout/mod.rs | 3 +- 4 files changed, 256 insertions(+), 254 deletions(-) create mode 100644 src/rpc/layout/helper.rs (limited to 'src') diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs new file mode 100644 index 00000000..ed3da498 --- /dev/null +++ b/src/rpc/layout/helper.rs @@ -0,0 +1,224 @@ +use std::collections::HashMap; +use std::ops::Deref; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use garage_util::data::*; + +use super::schema::*; + +pub struct LayoutHelper { + layout: Option, + + // cached values + ack_map_min: u64, + sync_map_min: u64, + + all_nodes: Vec, + all_nongateway_nodes: Vec, + + pub(crate) trackers_hash: Hash, + pub(crate) staging_hash: Hash, + + // ack lock: counts in-progress write operations for each + // layout version ; we don't increase the ack update tracker + // while this lock is nonzero + pub(crate) ack_lock: HashMap, +} + +impl Deref for LayoutHelper { + type Target = LayoutHistory; + fn deref(&self) -> &LayoutHistory { + self.layout() + } +} + +impl LayoutHelper { + pub fn new(mut layout: LayoutHistory, mut ack_lock: HashMap) -> Self { + layout.cleanup_old_versions(); + + let all_nongateway_nodes = layout.get_all_nongateway_nodes(); + layout.clamp_update_trackers(&all_nongateway_nodes); + + let min_version = layout.min_stored(); + let ack_map_min = layout + .update_trackers + .ack_map + .min(&all_nongateway_nodes, min_version); + let sync_map_min = layout + .update_trackers + .sync_map + .min(&all_nongateway_nodes, min_version); + + let all_nodes = layout.get_all_nodes(); + let trackers_hash = layout.calculate_trackers_hash(); + let staging_hash = layout.calculate_staging_hash(); + + ack_lock.retain(|_, cnt| *cnt.get_mut() > 0); + ack_lock + .entry(layout.current().version) + .or_insert(AtomicUsize::new(0)); + + LayoutHelper { + layout: Some(layout), + ack_map_min, + sync_map_min, + all_nodes, + all_nongateway_nodes, + trackers_hash, + staging_hash, + ack_lock, + } + } + + // ------------------ single updating function -------------- + + fn layout(&self) -> &LayoutHistory { + self.layout.as_ref().unwrap() + } + + pub(crate) fn update(&mut self, f: F) -> bool + where + F: FnOnce(&mut LayoutHistory) -> bool, + { + let changed = f(&mut self.layout.as_mut().unwrap()); + if changed { + *self = Self::new( + self.layout.take().unwrap(), + std::mem::take(&mut self.ack_lock), + ); + } + changed + } + + // ------------------ read helpers --------------- + + pub fn all_nodes(&self) -> &[Uuid] { + &self.all_nodes + } + + pub fn all_nongateway_nodes(&self) -> &[Uuid] { + &self.all_nongateway_nodes + } + + pub fn all_ack(&self) -> u64 { + self.ack_map_min + } + + pub fn sync_versions(&self) -> (u64, u64, u64) { + ( + self.layout().current().version, + self.all_ack(), + self.layout().min_stored(), + ) + } + + pub fn read_nodes_of(&self, position: &Hash) -> Vec { + let sync_min = self.sync_map_min; + let version = self + .layout() + .versions + .iter() + .find(|x| x.version == sync_min) + .or(self.layout().versions.last()) + .unwrap(); + version + .nodes_of(position, version.replication_factor) + .collect() + } + + pub(crate) fn write_sets_of(&self, position: &Hash) -> Vec> { + self.layout() + .versions + .iter() + .map(|x| x.nodes_of(position, x.replication_factor).collect()) + .collect() + } + + pub fn storage_nodes_of(&self, position: &Hash) -> Vec { + let mut ret = vec![]; + for version in self.layout().versions.iter() { + ret.extend(version.nodes_of(position, version.replication_factor)); + } + ret.sort(); + ret.dedup(); + ret + } + + pub fn trackers_hash(&self) -> Hash { + self.trackers_hash + } + + pub fn staging_hash(&self) -> Hash { + self.staging_hash + } + + // ------------------ helpers for update tracking --------------- + + pub(crate) fn update_trackers(&mut self, local_node_id: Uuid) { + // Ensure trackers for this node's values are up-to-date + + // 1. Acknowledge the last layout version which is not currently + // locked by an in-progress write operation + self.ack_max_free(local_node_id); + + // 2. Assume the data on this node is sync'ed up at least to + // the first layout version in the history + self.sync_first(local_node_id); + + // 3. Acknowledge everyone has synced up to min(self.sync_map) + self.sync_ack(local_node_id); + + info!("ack_map: {:?}", self.update_trackers.ack_map); + info!("sync_map: {:?}", self.update_trackers.sync_map); + info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); + } + + fn sync_first(&mut self, local_node_id: Uuid) { + let first_version = self.versions.first().as_ref().unwrap().version; + self.update(|layout| { + layout + .update_trackers + .sync_map + .set_max(local_node_id, first_version) + }); + } + + fn sync_ack(&mut self, local_node_id: Uuid) { + let sync_map_min = self.sync_map_min; + self.update(|layout| { + layout + .update_trackers + .sync_ack_map + .set_max(local_node_id, sync_map_min) + }); + } + + pub(crate) fn ack_max_free(&mut self, local_node_id: Uuid) -> bool { + let max_ack = self.max_free_ack(); + let changed = self.update(|layout| { + layout + .update_trackers + .ack_map + .set_max(local_node_id, max_ack) + }); + if changed { + info!("ack_until updated to {}", max_ack); + } + changed + } + + pub(crate) fn max_free_ack(&self) -> u64 { + self.layout() + .versions + .iter() + .map(|x| x.version) + .take_while(|v| { + self.ack_lock + .get(v) + .map(|x| x.load(Ordering::Relaxed) == 0) + .unwrap_or(true) + }) + .max() + .unwrap_or(self.min_stored()) + } +} diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index dd38efa7..0a139549 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -1,7 +1,4 @@ -use std::collections::HashMap; use std::collections::HashSet; -use std::ops::Deref; -use std::sync::atomic::{AtomicUsize, Ordering}; use garage_util::crdt::{Crdt, Lww, LwwMap}; use garage_util::data::*; @@ -11,225 +8,6 @@ use garage_util::error::*; use super::schema::*; use super::*; -pub struct LayoutHelper { - layout: Option, - - // cached values - ack_map_min: u64, - sync_map_min: u64, - - all_nodes: Vec, - all_nongateway_nodes: Vec, - - trackers_hash: Hash, - staging_hash: Hash, - - // ack lock: counts in-progress write operations for each - // layout version ; we don't increase the ack update tracker - // while this lock is nonzero - pub(crate) ack_lock: HashMap, -} - -impl Deref for LayoutHelper { - type Target = LayoutHistory; - fn deref(&self) -> &LayoutHistory { - self.layout() - } -} - -impl LayoutHelper { - pub fn new(mut layout: LayoutHistory, mut ack_lock: HashMap) -> Self { - layout.cleanup_old_versions(); - - let all_nongateway_nodes = layout.get_all_nongateway_nodes(); - layout.clamp_update_trackers(&all_nongateway_nodes); - - let min_version = layout.min_stored(); - let ack_map_min = layout - .update_trackers - .ack_map - .min(&all_nongateway_nodes, min_version); - let sync_map_min = layout - .update_trackers - .sync_map - .min(&all_nongateway_nodes, min_version); - - let all_nodes = layout.get_all_nodes(); - let trackers_hash = layout.calculate_trackers_hash(); - let staging_hash = layout.calculate_staging_hash(); - - ack_lock.retain(|_, cnt| *cnt.get_mut() > 0); - ack_lock - .entry(layout.current().version) - .or_insert(AtomicUsize::new(0)); - - LayoutHelper { - layout: Some(layout), - ack_map_min, - sync_map_min, - all_nodes, - all_nongateway_nodes, - trackers_hash, - staging_hash, - ack_lock, - } - } - - // ------------------ single updating function -------------- - - fn layout(&self) -> &LayoutHistory { - self.layout.as_ref().unwrap() - } - - pub(crate) fn update(&mut self, f: F) -> bool - where - F: FnOnce(&mut LayoutHistory) -> bool, - { - let changed = f(&mut self.layout.as_mut().unwrap()); - if changed { - *self = Self::new( - self.layout.take().unwrap(), - std::mem::take(&mut self.ack_lock), - ); - } - changed - } - - // ------------------ read helpers --------------- - - pub fn all_nodes(&self) -> &[Uuid] { - &self.all_nodes - } - - pub fn all_nongateway_nodes(&self) -> &[Uuid] { - &self.all_nongateway_nodes - } - - pub fn all_ack(&self) -> u64 { - self.ack_map_min - } - - pub fn sync_versions(&self) -> (u64, u64, u64) { - ( - self.layout().current().version, - self.all_ack(), - self.layout().min_stored(), - ) - } - - pub fn read_nodes_of(&self, position: &Hash) -> Vec { - let sync_min = self.sync_map_min; - let version = self - .layout() - .versions - .iter() - .find(|x| x.version == sync_min) - .or(self.layout().versions.last()) - .unwrap(); - version - .nodes_of(position, version.replication_factor) - .collect() - } - - pub(crate) fn write_sets_of(&self, position: &Hash) -> Vec> { - self.layout() - .versions - .iter() - .map(|x| x.nodes_of(position, x.replication_factor).collect()) - .collect() - } - - pub fn storage_nodes_of(&self, position: &Hash) -> Vec { - let mut ret = vec![]; - for version in self.layout().versions.iter() { - ret.extend(version.nodes_of(position, version.replication_factor)); - } - ret.sort(); - ret.dedup(); - ret - } - - pub fn trackers_hash(&self) -> Hash { - self.trackers_hash - } - - pub fn staging_hash(&self) -> Hash { - self.staging_hash - } - - // ------------------ helpers for update tracking --------------- - - pub(crate) fn update_trackers(&mut self, local_node_id: Uuid) { - // Ensure trackers for this node's values are up-to-date - - // 1. Acknowledge the last layout version which is not currently - // locked by an in-progress write operation - self.ack_max_free(local_node_id); - - // 2. Assume the data on this node is sync'ed up at least to - // the first layout version in the history - self.sync_first(local_node_id); - - // 3. Acknowledge everyone has synced up to min(self.sync_map) - self.sync_ack(local_node_id); - - info!("ack_map: {:?}", self.update_trackers.ack_map); - info!("sync_map: {:?}", self.update_trackers.sync_map); - info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); - } - - fn sync_first(&mut self, local_node_id: Uuid) { - let first_version = self.versions.first().as_ref().unwrap().version; - self.update(|layout| { - layout - .update_trackers - .sync_map - .set_max(local_node_id, first_version) - }); - } - - fn sync_ack(&mut self, local_node_id: Uuid) { - let sync_map_min = self.sync_map_min; - self.update(|layout| { - layout - .update_trackers - .sync_ack_map - .set_max(local_node_id, sync_map_min) - }); - } - - pub(crate) fn ack_max_free(&mut self, local_node_id: Uuid) -> bool { - let max_ack = self.max_free_ack(); - let changed = self.update(|layout| { - layout - .update_trackers - .ack_map - .set_max(local_node_id, max_ack) - }); - if changed { - info!("ack_until updated to {}", max_ack); - } - changed - } - - pub(crate) fn max_free_ack(&self) -> u64 { - self.layout() - .versions - .iter() - .map(|x| x.version) - .take_while(|v| { - self.ack_lock - .get(v) - .map(|x| x.load(Ordering::Relaxed) == 0) - .unwrap_or(true) - }) - .max() - .unwrap_or(self.min_stored()) - } -} - -// ---- - impl LayoutHistory { pub fn new(replication_factor: usize) -> Self { let version = LayoutVersion::new(replication_factor); @@ -270,7 +48,7 @@ impl LayoutHistory { } } - fn get_all_nongateway_nodes(&self) -> Vec { + pub(crate) fn get_all_nongateway_nodes(&self) -> Vec { if self.versions.len() == 1 { self.versions[0].nongateway_nodes().to_vec() } else { @@ -286,8 +64,21 @@ impl LayoutHistory { // ---- housekeeping (all invoked by LayoutHelper) ---- - fn cleanup_old_versions(&mut self) { - loop { + pub(crate) fn cleanup_old_versions(&mut self) { + // If there are invalid versions before valid versions, remove them + if self.versions.len() > 1 && self.current().check().is_ok() { + while self.versions.len() > 1 && self.versions.first().unwrap().check().is_err() { + let removed = self.versions.remove(0); + info!( + "Layout history: pruning old invalid version {}", + removed.version + ); + } + } + + // If there are old versions that no one is reading from anymore, + // remove them + while self.versions.len() > 1 { let all_nongateway_nodes = self.get_all_nongateway_nodes(); let min_version = self.min_stored(); let sync_ack_map_min = self @@ -303,7 +94,7 @@ impl LayoutHistory { } } - fn clamp_update_trackers(&mut self, nodes: &[Uuid]) { + pub(crate) fn clamp_update_trackers(&mut self, nodes: &[Uuid]) { let min_v = self.min_stored(); for node in nodes { self.update_trackers.ack_map.set_max(*node, min_v); @@ -312,11 +103,11 @@ impl LayoutHistory { } } - fn calculate_trackers_hash(&self) -> Hash { + pub(crate) fn calculate_trackers_hash(&self) -> Hash { blake2sum(&nonversioned_encode(&self.update_trackers).unwrap()[..]) } - fn calculate_staging_hash(&self) -> Hash { + pub(crate) fn calculate_staging_hash(&self) -> Hash { blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) } @@ -328,6 +119,7 @@ impl LayoutHistory { // Add any new versions to history for v2 in other.versions.iter() { if let Some(v1) = self.versions.iter().find(|v| v.version == v2.version) { + // Version is already present, check consistency if v1 != v2 { error!("Inconsistent layout histories: different layout compositions for version {}. Your cluster will be broken as long as this layout version is not replaced.", v2.version); } @@ -344,24 +136,14 @@ impl LayoutHistory { } // Merge trackers - if self.update_trackers != other.update_trackers { - let c = self.update_trackers.merge(&other.update_trackers); - changed = changed || c; - } - - // If there are invalid versions before valid versions, remove them, - // and increment update trackers - if self.versions.len() > 1 && self.current().check().is_ok() { - while self.versions.first().unwrap().check().is_err() { - self.versions.remove(0); - changed = true; - } - } + let c = self.update_trackers.merge(&other.update_trackers); + changed = changed || c; // Merge staged layout changes if self.staging != other.staging { + let prev_staging = self.staging.clone(); self.staging.merge(&other.staging); - changed = true; + changed = changed || self.staging != prev_staging; } changed @@ -390,11 +172,7 @@ To know the correct value of the new layout version, invoke `garage layout show` .calculate_next_version(&self.staging.get())?; self.versions.push(new_version); - if self.current().check().is_ok() { - while self.versions.first().unwrap().check().is_err() { - self.versions.remove(0); - } - } + self.cleanup_old_versions(); // Reset the staged layout changes self.staging.update(LayoutStaging { @@ -415,11 +193,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } pub fn check(&self) -> Result<(), String> { - for version in self.versions.iter() { - version.check()?; - } - // TODO: anything more ? - Ok(()) + self.current().check() } } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 4e073d1f..85d94ffa 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -184,17 +184,20 @@ impl LayoutManager { return Some(layout.clone()); } } + None } fn merge_layout_trackers(&self, adv: &UpdateTrackers) -> Option { let mut layout = self.layout.write().unwrap(); + if layout.update_trackers != *adv { if layout.update(|l| l.update_trackers.merge(adv)) { layout.update_trackers(self.node_id); return Some(layout.update_trackers.clone()); } } + None } @@ -284,7 +287,7 @@ impl LayoutManager { } pub(crate) fn handle_pull_cluster_layout(&self) -> SystemRpc { - let layout = self.layout.read().unwrap().clone(); // TODO: avoid cloning + let layout = self.layout.read().unwrap().clone(); SystemRpc::AdvertiseClusterLayout(layout) } diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index 859287c8..91151ab4 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -1,4 +1,5 @@ mod graph_algo; +mod helper; mod history; mod schema; mod version; @@ -10,7 +11,7 @@ pub mod manager; // ---- re-exports ---- -pub use history::*; +pub use helper::LayoutHelper; pub use manager::WriteLock; pub use schema::*; pub use version::*; -- cgit v1.2.3 From 707442f5de416fdbed4681a33b739f0a787b7834 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 16 Nov 2023 13:51:40 +0100 Subject: layout: refactor digests and add "!=" assertions before epidemic bcast --- src/rpc/layout/helper.rs | 27 +++++++++++++++++++++++++-- src/rpc/layout/history.rs | 1 - src/rpc/layout/manager.rs | 36 ++++++++++-------------------------- src/rpc/layout/mod.rs | 2 +- src/rpc/system.rs | 17 +++++++++-------- 5 files changed, 45 insertions(+), 38 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index ed3da498..0d746ea3 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -2,10 +2,24 @@ use std::collections::HashMap; use std::ops::Deref; use std::sync::atomic::{AtomicUsize, Ordering}; +use serde::{Deserialize, Serialize}; + use garage_util::data::*; use super::schema::*; +#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] +pub struct LayoutDigest { + /// Cluster layout version + pub current_version: u64, + /// Number of active layout versions + pub active_versions: usize, + /// Hash of cluster layout update trackers + pub trackers_hash: Hash, + /// Hash of cluster layout staging data + pub staging_hash: Hash, +} + pub struct LayoutHelper { layout: Option, @@ -16,8 +30,8 @@ pub struct LayoutHelper { all_nodes: Vec, all_nongateway_nodes: Vec, - pub(crate) trackers_hash: Hash, - pub(crate) staging_hash: Hash, + trackers_hash: Hash, + staging_hash: Hash, // ack lock: counts in-progress write operations for each // layout version ; we don't increase the ack update tracker @@ -152,6 +166,15 @@ impl LayoutHelper { self.staging_hash } + pub fn digest(&self) -> LayoutDigest { + LayoutDigest { + current_version: self.current().version, + active_versions: self.versions.len(), + trackers_hash: self.trackers_hash, + staging_hash: self.staging_hash, + } + } + // ------------------ helpers for update tracking --------------- pub(crate) fn update_trackers(&mut self, local_node_id: Uuid) { diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 0a139549..653d2a48 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -5,7 +5,6 @@ use garage_util::data::*; use garage_util::encode::nonversioned_encode; use garage_util::error::*; -use super::schema::*; use super::*; impl LayoutHistory { diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 85d94ffa..c65831a2 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -2,8 +2,6 @@ use std::collections::HashMap; use std::sync::{atomic::Ordering, Arc, Mutex, RwLock, RwLockReadGuard}; use std::time::Duration; -use serde::{Deserialize, Serialize}; - use tokio::sync::Notify; use netapp::endpoint::Endpoint; @@ -33,16 +31,6 @@ pub struct LayoutManager { system_endpoint: Arc>, } -#[derive(Debug, Clone, Serialize, Deserialize, Default)] -pub struct LayoutStatus { - /// Cluster layout version - pub cluster_layout_version: u64, - /// Hash of cluster layout update trackers - pub cluster_layout_trackers_hash: Hash, - /// Hash of cluster layout staging data - pub cluster_layout_staging_hash: Hash, -} - impl LayoutManager { pub fn new( config: &Config, @@ -105,15 +93,6 @@ impl LayoutManager { self.layout.read().unwrap() } - pub fn status(&self) -> LayoutStatus { - let layout = self.layout(); - LayoutStatus { - cluster_layout_version: layout.current().version, - cluster_layout_trackers_hash: layout.trackers_hash(), - cluster_layout_staging_hash: layout.staging_hash(), - } - } - pub async fn update_cluster_layout( self: &Arc, layout: &LayoutHistory, @@ -173,6 +152,7 @@ impl LayoutManager { fn merge_layout(&self, adv: &LayoutHistory) -> Option { let mut layout = self.layout.write().unwrap(); + let prev_digest = layout.digest(); let prev_layout_check = layout.check().is_ok(); if !prev_layout_check || adv.check().is_ok() { @@ -181,6 +161,7 @@ impl LayoutManager { if prev_layout_check && layout.check().is_err() { panic!("Merged two correct layouts and got an incorrect layout."); } + assert!(layout.digest() != prev_digest); return Some(layout.clone()); } } @@ -190,10 +171,12 @@ impl LayoutManager { fn merge_layout_trackers(&self, adv: &UpdateTrackers) -> Option { let mut layout = self.layout.write().unwrap(); + let prev_digest = layout.digest(); if layout.update_trackers != *adv { if layout.update(|l| l.update_trackers.merge(adv)) { layout.update_trackers(self.node_id); + assert!(layout.digest() != prev_digest); return Some(layout.update_trackers.clone()); } } @@ -269,16 +252,17 @@ impl LayoutManager { // ---- RPC HANDLERS ---- - pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, remote: &LayoutStatus) { - let local = self.status(); - if remote.cluster_layout_version > local.cluster_layout_version - || remote.cluster_layout_staging_hash != local.cluster_layout_staging_hash + pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, remote: &LayoutDigest) { + let local = self.layout().digest(); + if remote.current_version > local.current_version + || remote.active_versions != local.active_versions + || remote.staging_hash != local.staging_hash { tokio::spawn({ let this = self.clone(); async move { this.pull_cluster_layout(from).await } }); - } else if remote.cluster_layout_trackers_hash != local.cluster_layout_trackers_hash { + } else if remote.trackers_hash != local.trackers_hash { tokio::spawn({ let this = self.clone(); async move { this.pull_cluster_layout_trackers(from).await } diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index 91151ab4..eb127fda 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -11,7 +11,7 @@ pub mod manager; // ---- re-exports ---- -pub use helper::LayoutHelper; +pub use helper::{LayoutDigest, LayoutHelper}; pub use manager::WriteLock; pub use schema::*; pub use version::*; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index d74dc2a1..dc127afb 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -33,8 +33,9 @@ use garage_util::time::*; use crate::consul::ConsulDiscovery; #[cfg(feature = "kubernetes-discovery")] use crate::kubernetes::*; -use crate::layout::manager::{LayoutManager, LayoutStatus}; -use crate::layout::{self, LayoutHelper, LayoutHistory, NodeRoleV}; +use crate::layout::{ + self, manager::LayoutManager, LayoutDigest, LayoutHelper, LayoutHistory, NodeRoleV, +}; use crate::replication_mode::*; use crate::rpc_helper::*; @@ -130,8 +131,8 @@ pub struct NodeStatus { /// Replication factor configured on the node pub replication_factor: usize, - /// Layout status - pub layout_status: LayoutStatus, + /// Cluster layout digest + pub layout_digest: LayoutDigest, /// Disk usage on partition containing metadata directory (tuple: `(avail, total)`) #[serde(default)] @@ -539,7 +540,7 @@ impl System { fn update_local_status(&self) { let mut new_si: NodeStatus = self.local_status.load().as_ref().clone(); - new_si.layout_status = self.layout_manager.status(); + new_si.layout_digest = self.layout_manager.layout().digest(); new_si.update_disk_usage(&self.metadata_dir, &self.data_dir, &self.metrics); @@ -573,7 +574,7 @@ impl System { } self.layout_manager - .handle_advertise_status(from, &info.layout_status); + .handle_advertise_status(from, &info.layout_digest); self.node_status .write() @@ -755,7 +756,7 @@ impl NodeStatus { .into_string() .unwrap_or_else(|_| "".to_string()), replication_factor, - layout_status: layout_manager.status(), + layout_digest: layout_manager.layout().digest(), meta_disk_avail: None, data_disk_avail: None, } @@ -765,7 +766,7 @@ impl NodeStatus { NodeStatus { hostname: "?".to_string(), replication_factor: 0, - layout_status: Default::default(), + layout_digest: Default::default(), meta_disk_avail: None, data_disk_avail: None, } -- cgit v1.2.3 From 22f38808e744ea5b30ad771fcb344a29579b56d4 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 16 Nov 2023 16:34:01 +0100 Subject: rpc_helper: don't use tokio::spawn for individual requests --- src/rpc/rpc_helper.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index e269ddaa..7e9fabd7 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -299,9 +299,7 @@ impl RpcHelper { if let Some((req_to, fut)) = requests.next() { let tracer = opentelemetry::global::tracer("garage"); let span = tracer.start(format!("RPC to {:?}", req_to)); - resp_stream.push(tokio::spawn( - fut.with_context(Context::current_with_span(span)), - )); + resp_stream.push(fut.with_context(Context::current_with_span(span))); } else { break; } @@ -313,7 +311,7 @@ impl RpcHelper { } // Wait for one request to terminate - match resp_stream.next().await.unwrap().unwrap() { + match resp_stream.next().await.unwrap() { Ok(msg) => { successes.push(msg); } @@ -448,7 +446,7 @@ impl RpcHelper { let tracer = opentelemetry::global::tracer("garage"); let span = tracer.start(format!("RPC to {:?}", to)); let fut = async move { (to, self2.call(&endpoint2, to, msg, strategy).await) }; - tokio::spawn(fut.with_context(Context::current_with_span(span))) + fut.with_context(Context::current_with_span(span)) }); let mut resp_stream = requests.collect::>(); @@ -457,9 +455,7 @@ impl RpcHelper { let mut set_counters = vec![(0, 0); to_sets.len()]; - while !resp_stream.is_empty() { - let (node, resp) = resp_stream.next().await.unwrap().unwrap(); - + while let Some((node, resp)) = resp_stream.next().await { match resp { Ok(msg) => { for set in peers.get(&node).unwrap().iter() { @@ -475,12 +471,12 @@ impl RpcHelper { } } - if set_counters.iter().all(|x| x.0 >= quorum) { + if set_counters.iter().all(|(ok_cnt, _)| *ok_cnt >= quorum) { // Success // Continue all other requets in background tokio::spawn(async move { - resp_stream.collect::>>().await; + resp_stream.collect::)>>().await; }); return Ok(successes); @@ -489,7 +485,7 @@ impl RpcHelper { if set_counters .iter() .enumerate() - .any(|(i, x)| x.1 + quorum > to_sets[i].len()) + .any(|(i, (_, err_cnt))| err_cnt + quorum > to_sets[i].len()) { // Too many errors in this set, we know we won't get a quorum break; -- cgit v1.2.3 From 3ecd14b9f6202ad3c5513c6ad7422bd408134002 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 16 Nov 2023 16:41:45 +0100 Subject: table: implement write sets for insert_many --- src/table/table.rs | 157 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 127 insertions(+), 30 deletions(-) (limited to 'src') diff --git a/src/table/table.rs b/src/table/table.rs index 5ec9eb0a..7d1ff31c 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -143,7 +143,7 @@ impl Table { self.data.queue_insert(tx, e) } - pub async fn insert_many(&self, entries: I) -> Result<(), Error> + pub async fn insert_many(self: &Arc, entries: I) -> Result<(), Error> where I: IntoIterator + Send + Sync, IE: Borrow + Send + Sync, @@ -161,52 +161,149 @@ impl Table { Ok(()) } - async fn insert_many_internal(&self, entries: I) -> Result<(), Error> + async fn insert_many_internal(self: &Arc, entries: I) -> Result<(), Error> where I: IntoIterator + Send + Sync, IE: Borrow + Send + Sync, { - let mut call_list: HashMap<_, Vec<_>> = HashMap::new(); - + // The different items will have to be stored on possibly different nodes. + // We will here batch all items into a single request for each concerned + // node, with all of the entries it must store within that request. + // Each entry has to be saved to a specific list of "write sets", i.e. a set + // of node within wich a quorum must be achieved. In normal operation, there + // is a single write set which corresponds to the quorum in the current + // cluster layout, but when the layout is updated, multiple write sets might + // have to be handled at once. Here, since we are sending many entries, we + // will have to handle many write sets in all cases. The algorihtm is thus + // to send one request to each node with all the items it must save, + // and keep track of the OK responses within each write set: if for all sets + // a quorum of nodes has answered OK, then the insert has succeeded and + // consistency properties (read-after-write) are preserved. + + // Some code here might feel redundant with RpcHelper::try_write_many_sets, + // but I think deduplicating could lead to more spaghetti instead of + // improving the readability, so I'm leaving as is. + + let quorum = self.data.replication.write_quorum(); + + // Serialize all entries and compute the write sets for each of them. + // In the case of sharded table replication, this also takes an "ack lock" + // to the layout manager to avoid ack'ing newer versions which are not + // taken into account by writes in progress (the ack can happen later, once + // all writes that didn't take the new layout into account are finished). + // These locks are released when entries_vec is dropped, i.e. when this + // function returns. + let mut entries_vec = Vec::new(); for entry in entries.into_iter() { let entry = entry.borrow(); let hash = entry.partition_key().hash(); - // TODO: use write sets - let who = self.data.replication.storage_nodes(&hash); + let write_sets = self.data.replication.write_sets(&hash); let e_enc = Arc::new(ByteBuf::from(entry.encode()?)); - for node in who { - call_list.entry(node).or_default().push(e_enc.clone()); + entries_vec.push((write_sets, e_enc)); + } + + // Compute a deduplicated list of all of the write sets, + // and compute an index from each node to the position of the sets in which + // it takes part, to optimize the detection of a quorum. + let mut write_sets = entries_vec + .iter() + .map(|(wss, _)| wss.as_ref().iter().map(|ws| ws.as_slice())) + .flatten() + .collect::>(); + write_sets.sort(); + write_sets.dedup(); + let mut write_set_index = HashMap::<&Uuid, Vec>::new(); + for (i, write_set) in write_sets.iter().enumerate() { + for node in write_set.iter() { + write_set_index.entry(node).or_default().push(i); } } - let call_futures = call_list.drain().map(|(node, entries)| async move { - let rpc = TableRpc::::Update(entries); - - let resp = self - .system - .rpc_helper() - .call( - &self.endpoint, - node, - rpc, - RequestStrategy::with_priority(PRIO_NORMAL), - ) - .await?; - Ok::<_, Error>((node, resp)) + // Build a map of all nodes to the entries that must be sent to that node. + let mut call_list: HashMap> = HashMap::new(); + for (write_sets, entry_enc) in entries_vec.iter() { + for write_set in write_sets.as_ref().iter() { + for node in write_set.iter() { + call_list.entry(*node).or_default().push(entry_enc.clone()) + } + } + } + + // Build futures to actually perform each of the corresponding RPC calls + let call_count = call_list.len(); + let call_futures = call_list.into_iter().map(|(node, entries)| { + let this = self.clone(); + let tracer = opentelemetry::global::tracer("garage"); + let span = tracer.start(format!("RPC to {:?}", node)); + let fut = async move { + let rpc = TableRpc::::Update(entries); + let resp = this + .system + .rpc_helper() + .call( + &this.endpoint, + node, + rpc, + RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(quorum), + ) + .await; + (node, resp) + }; + fut.with_context(Context::current_with_span(span)) }); + + // Run all requests in parallel thanks to FuturesUnordered, and collect results. let mut resps = call_futures.collect::>(); + let mut set_counters = vec![(0, 0); write_sets.len()]; + let mut successes = 0; let mut errors = vec![]; - while let Some(resp) = resps.next().await { - if let Err(e) = resp { - errors.push(e); + while let Some((node, resp)) = resps.next().await { + match resp { + Ok(_) => { + successes += 1; + for set in write_set_index.get(&node).unwrap().iter() { + set_counters[*set].0 += 1; + } + } + Err(e) => { + errors.push(e); + for set in write_set_index.get(&node).unwrap().iter() { + set_counters[*set].1 += 1; + } + } + } + + if set_counters.iter().all(|(ok_cnt, _)| *ok_cnt >= quorum) { + // Success + + // Continue all other requests in background + tokio::spawn(async move { + resps.collect::)>>().await; + }); + + return Ok(()); + } + + if set_counters + .iter() + .enumerate() + .any(|(i, (_, err_cnt))| err_cnt + quorum > write_sets[i].len()) + { + // Too many errors in this set, we know we won't get a quorum + break; } } - if errors.len() > self.data.replication.max_write_errors() { - Err(Error::Message("Too many errors".into())) - } else { - Ok(()) - } + + // Failure, could not get quorum within at least one set + let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); + Err(Error::Quorum( + quorum, + Some(write_sets.len()), + successes, + call_count, + errors, + )) } pub async fn get( -- cgit v1.2.3 From d6d239fc7909cbd017da6ea35cceb3d561a87cca Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 27 Nov 2023 11:52:57 +0100 Subject: block manager: read_block using old layout versions if necessary --- src/block/manager.rs | 6 ++++-- src/rpc/layout/helper.rs | 23 +++++++++++++++++++++++ src/rpc/layout/history.rs | 12 +++++++++++- src/rpc/layout/schema.rs | 7 +++++++ src/rpc/rpc_helper.rs | 11 +++++------ 5 files changed, 50 insertions(+), 9 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index be2e4951..47111160 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -264,8 +264,10 @@ impl BlockManager { F: Fn(DataBlockHeader, ByteStream) -> Fut, Fut: futures::Future>, { - let who = self.replication.read_nodes(hash); - let who = self.system.rpc_helper().request_order(&who); + let who = self + .system + .cluster_layout() + .block_read_nodes_of(hash, self.system.rpc_helper()); for node in who.iter() { let node_id = NodeID::from(*node); diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 0d746ea3..5d159f3e 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize}; use garage_util::data::*; use super::schema::*; +use crate::rpc_helper::RpcHelper; #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] pub struct LayoutDigest { @@ -140,6 +141,28 @@ impl LayoutHelper { .collect() } + pub fn block_read_nodes_of(&self, position: &Hash, rpc_helper: &RpcHelper) -> Vec { + let mut ret = Vec::with_capacity(12); + let ver_iter = self + .layout() + .versions + .iter() + .rev() + .chain(self.layout().old_versions.iter().rev()); + for ver in ver_iter { + if ver.version > self.sync_map_min { + continue; + } + let nodes = ver.nodes_of(position, ver.replication_factor); + for node in rpc_helper.request_order(nodes) { + if !ret.contains(&node) { + ret.push(node); + } + } + } + ret + } + pub(crate) fn write_sets_of(&self, position: &Hash) -> Vec> { self.layout() .versions diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 653d2a48..7d4a1b48 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -18,6 +18,7 @@ impl LayoutHistory { LayoutHistory { versions: vec![version], + old_versions: vec![], update_trackers: Default::default(), staging: Lww::raw(0, staging), } @@ -86,11 +87,20 @@ impl LayoutHistory { .min(&all_nongateway_nodes, min_version); if self.min_stored() < sync_ack_map_min { let removed = self.versions.remove(0); - info!("Layout history: pruning old version {}", removed.version); + info!( + "Layout history: moving version {} to old_versions", + removed.version + ); + self.old_versions.push(removed); } else { break; } } + + while self.old_versions.len() > OLD_VERSION_COUNT { + let removed = self.old_versions.remove(0); + info!("Layout history: removing old_version {}", removed.version); + } } pub(crate) fn clamp_update_trackers(&mut self, nodes: &[Uuid]) { diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 00a2c017..08db44ca 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -193,12 +193,18 @@ mod v010 { use std::collections::BTreeMap; pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; + pub const OLD_VERSION_COUNT: usize = 5; + /// The history of cluster layouts, with trackers to keep a record /// of which nodes are up-to-date to current cluster data #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] pub struct LayoutHistory { /// The versions currently in use in the cluster pub versions: Vec, + /// At most 5 of the previous versions, not used by the garage_table + /// module, but usefull for the garage_block module to find data blocks + /// that have not yet been moved + pub old_versions: Vec, /// Update trackers pub update_trackers: UpdateTrackers, @@ -300,6 +306,7 @@ mod v010 { }; Self { versions: vec![version], + old_versions: vec![], update_trackers: UpdateTrackers { ack_map: update_tracker.clone(), sync_map: update_tracker.clone(), diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 7e9fabd7..e9a9143f 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -267,7 +267,7 @@ impl RpcHelper { // When there are errors, we start new requests to compensate. // Reorder requests to priorize closeness / low latency - let request_order = self.request_order(to); + let request_order = self.request_order(to.iter().copied()); let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false); // Build future for each request @@ -335,7 +335,7 @@ impl RpcHelper { } } - pub fn request_order(&self, nodes: &[Uuid]) -> Vec { + pub fn request_order(&self, nodes: impl Iterator) -> Vec { // Retrieve some status variables that we will use to sort requests let peer_list = self.0.fullmesh.get_peer_list(); let layout = self.0.layout.read().unwrap(); @@ -351,9 +351,8 @@ impl RpcHelper { // By sorting this vec, we priorize ourself, then nodes in the same zone, // and within a same zone we priorize nodes with the lowest latency. let mut nodes = nodes - .iter() .map(|to| { - let peer_zone = match layout.current().node_role(to) { + let peer_zone = match layout.current().node_role(&to) { Some(pc) => &pc.zone, None => "", }; @@ -363,10 +362,10 @@ impl RpcHelper { .and_then(|pi| pi.avg_ping) .unwrap_or_else(|| Duration::from_secs(10)); ( - *to != self.0.our_node_id, + to != self.0.our_node_id, peer_zone != our_zone, peer_avg_ping, - *to, + to, ) }) .collect::>(); -- cgit v1.2.3 From 78362140f5a177340a06690d9c9ea98bd831e7a4 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 27 Nov 2023 12:10:21 +0100 Subject: rpc: update system::health to take into account write sets for all partitions --- src/rpc/system.rs | 77 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 32 deletions(-) (limited to 'src') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index dc127afb..c7d41ee4 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -1,5 +1,5 @@ //! Module containing structs related to membership management -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::io::{Read, Write}; use std::net::{IpAddr, SocketAddr}; use std::path::{Path, PathBuf}; @@ -418,48 +418,61 @@ impl System { } pub fn health(&self) -> ClusterHealth { - // TODO: adapt this function to take into account layout history - // when estimating cluster health, and not just use current layout - let quorum = self.replication_mode.write_quorum(); - let replication_factor = self.replication_factor; + // Gather information about running nodes. + // Technically, `nodes` contains currently running nodes, as well + // as nodes that this Garage process has been connected to at least + // once since it started. let nodes = self .get_known_nodes() .into_iter() .map(|n| (n.id, n)) .collect::>(); let connected_nodes = nodes.iter().filter(|(_, n)| n.is_up).count(); + let node_up = |x: &Uuid| nodes.get(x).map(|n| n.is_up).unwrap_or(false); + + // Acquire a rwlock read-lock to the current cluster layout + let layout = self.cluster_layout(); + + // Obtain information about nodes that have a role as storage nodes + // in one of the active layout versions + let mut storage_nodes = HashSet::::with_capacity(16); + for ver in layout.versions.iter() { + storage_nodes.extend( + ver.roles + .items() + .iter() + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_some())) + .map(|(n, _, _)| *n), + ) + } + let storage_nodes_ok = storage_nodes.iter().filter(|x| node_up(x)).count(); - let layout = self.cluster_layout(); // acquires a rwlock - - let storage_nodes = layout - .current() - .roles - .items() - .iter() - .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_some())) - .collect::>(); - let storage_nodes_ok = storage_nodes - .iter() - .filter(|(x, _, _)| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) - .count(); - + // Determine the number of partitions that have: + // - a quorum of up nodes for all write sets (i.e. are available) + // - for which all nodes in all write sets are up (i.e. are fully healthy) let partitions = layout.current().partitions().collect::>(); - let partitions_n_up = partitions - .iter() - .map(|(_, h)| { - let pn = layout.current().nodes_of(h, replication_factor); - pn.filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) - .count() - }) - .collect::>(); - let partitions_all_ok = partitions_n_up - .iter() - .filter(|c| **c == replication_factor) - .count(); - let partitions_quorum = partitions_n_up.iter().filter(|c| **c >= quorum).count(); + let mut partitions_quorum = 0; + let mut partitions_all_ok = 0; + for (_, hash) in partitions.iter() { + let write_sets = layout + .versions + .iter() + .map(|x| x.nodes_of(hash, x.replication_factor)); + let has_quorum = write_sets + .clone() + .all(|set| set.filter(|x| node_up(x)).count() >= quorum); + let all_ok = write_sets.clone().all(|mut set| set.all(|x| node_up(&x))); + if has_quorum { + partitions_quorum += 1; + } + if all_ok { + partitions_all_ok += 1; + } + } + // Determine overall cluster status let status = if partitions_quorum == partitions.len() && storage_nodes_ok == storage_nodes.len() { ClusterHealthStatus::Healthy -- cgit v1.2.3 From 539a920313fff010b8a4291aeef58ec9a14ee635 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 27 Nov 2023 13:18:59 +0100 Subject: cli: show when nodes are draining metadata --- src/garage/cli/cmd.rs | 172 +++++++++++++++++++++++++++++++------------------- 1 file changed, 108 insertions(+), 64 deletions(-) (limited to 'src') diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 1a054025..c99243b9 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::time::Duration; use format_table::format_table; @@ -62,35 +62,69 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let mut healthy_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; for adv in status.iter().filter(|adv| adv.is_up) { - match layout.current().roles.get(&adv.id) { - Some(NodeRoleV(Some(cfg))) => { - let data_avail = match &adv.status.data_disk_avail { - _ if cfg.capacity.is_none() => "N/A".into(), - Some((avail, total)) => { - let pct = (*avail as f64) / (*total as f64) * 100.; - let avail = bytesize::ByteSize::b(*avail); - format!("{} ({:.1}%)", avail, pct) - } - None => "?".into(), - }; + if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { + let data_avail = match &adv.status.data_disk_avail { + _ if cfg.capacity.is_none() => "N/A".into(), + Some((avail, total)) => { + let pct = (*avail as f64) / (*total as f64) * 100.; + let avail = bytesize::ByteSize::b(*avail); + format!("{} ({:.1}%)", avail, pct) + } + None => "?".into(), + }; + healthy_nodes.push(format!( + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", + id = adv.id, + host = adv.status.hostname, + addr = adv.addr, + tags = cfg.tags.join(","), + zone = cfg.zone, + capacity = cfg.capacity_string(), + data_avail = data_avail, + )); + } else { + let prev_role = layout + .versions + .iter() + .rev() + .find_map(|x| match x.roles.get(&adv.id) { + Some(NodeRoleV(Some(cfg))) => Some(cfg), + _ => None, + }); + let historic_role = + layout + .old_versions + .iter() + .rev() + .find_map(|x| match x.roles.get(&adv.id) { + Some(NodeRoleV(Some(cfg))) => Some(cfg), + _ => None, + }); + if let Some(cfg) = prev_role { healthy_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...", id = adv.id, host = adv.status.hostname, addr = adv.addr, tags = cfg.tags.join(","), zone = cfg.zone, - capacity = cfg.capacity_string(), - data_avail = data_avail, )); - } - _ => { + } else if let Some(cfg) = historic_role { + healthy_nodes.push(format!( + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tremoved, metadata drained", + id = adv.id, + host = adv.status.hostname, + addr = adv.addr, + tags = cfg.tags.join(","), + zone = cfg.zone, + )); + } else { let new_role = match layout.staging.get().roles.get(&adv.id) { - Some(NodeRoleV(Some(_))) => "(pending)", + Some(NodeRoleV(Some(_))) => "pending...", _ => "NO ROLE ASSIGNED", }; healthy_nodes.push(format!( - "{id:?}\t{h}\t{addr}\t{new_role}", + "{id:?}\t{h}\t{addr}\t\t\t{new_role}", id = adv.id, h = adv.status.hostname, addr = adv.addr, @@ -101,55 +135,65 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> } format_table(healthy_nodes); - let status_keys = status.iter().map(|adv| adv.id).collect::>(); - let failure_case_1 = status.iter().any(|adv| { - !adv.is_up - && matches!( - layout.current().roles.get(&adv.id), - Some(NodeRoleV(Some(_))) - ) - }); - let failure_case_2 = layout - .current() - .roles - .items() + // Determine which nodes are unhealthy and print that to stdout + let status_map = status .iter() - .any(|(id, _, v)| !status_keys.contains(id) && v.0.is_some()); - if failure_case_1 || failure_case_2 { - println!("\n==== FAILED NODES ===="); - let mut failed_nodes = - vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; - for adv in status.iter().filter(|adv| !adv.is_up) { - if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { - let tf = timeago::Formatter::new(); - failed_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", - id = adv.id, - host = adv.status.hostname, - addr = adv.addr, - tags = cfg.tags.join(","), - zone = cfg.zone, - capacity = cfg.capacity_string(), - last_seen = adv - .last_seen_secs_ago - .map(|s| tf.convert(Duration::from_secs(s))) - .unwrap_or_else(|| "never seen".into()), - )); + .map(|adv| (adv.id, adv)) + .collect::>(); + + let tf = timeago::Formatter::new(); + let mut failed_nodes = + vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; + let mut listed = HashSet::new(); + for ver in layout.versions.iter().rev() { + for (node, _, role) in ver.roles.items().iter() { + let cfg = match role { + NodeRoleV(Some(role)) if role.capacity.is_some() => role, + _ => continue, + }; + + if listed.contains(node) { + continue; } - } - for (id, _, role_v) in layout.current().roles.items().iter() { - if let NodeRoleV(Some(cfg)) = role_v { - if !status_keys.contains(id) { - failed_nodes.push(format!( - "{id:?}\t??\t??\t[{tags}]\t{zone}\t{capacity}\tnever seen", - id = id, - tags = cfg.tags.join(","), - zone = cfg.zone, - capacity = cfg.capacity_string(), - )); - } + listed.insert(*node); + + let adv = status_map.get(node); + if adv.map(|x| x.is_up).unwrap_or(false) { + continue; } + + // Node is in a layout version, is not a gateway node, and is not up: + // it is in a failed state, add proper line to the output + let (host, addr, last_seen) = match adv { + Some(adv) => ( + adv.status.hostname.as_str(), + adv.addr.to_string(), + adv.last_seen_secs_ago + .map(|s| tf.convert(Duration::from_secs(s))) + .unwrap_or_else(|| "never seen".into()), + ), + None => ("??", "??".into(), "never seen".into()), + }; + let capacity = if ver.version == layout.current().version { + cfg.capacity_string() + } else { + "draining metadata...".to_string() + }; + failed_nodes.push(format!( + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", + id = node, + host = host, + addr = addr, + tags = cfg.tags.join(","), + zone = cfg.zone, + capacity = capacity, + last_seen = last_seen, + )); } + } + + if failed_nodes.len() > 1 { + println!("\n==== FAILED NODES ===="); format_table(failed_nodes); } -- cgit v1.2.3 From 11e6fef93ce3ca56584fc99223b71da77d320dd7 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 27 Nov 2023 16:17:41 +0100 Subject: cli: add layout history and layout assume-sync commands --- src/garage/cli/cmd.rs | 14 +++++- src/garage/cli/layout.rs | 111 ++++++++++++++++++++++++++++++++++++++++++++++ src/garage/cli/structs.rs | 16 +++++++ src/rpc/layout/schema.rs | 9 +++- 4 files changed, 147 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index c99243b9..08ed00cf 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -135,13 +135,14 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> } format_table(healthy_nodes); - // Determine which nodes are unhealthy and print that to stdout + // Determine which nodes are unhealthy and print that to stdout let status_map = status .iter() .map(|adv| (adv.id, adv)) .collect::>(); let tf = timeago::Formatter::new(); + let mut drain_msg = false; let mut failed_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; let mut listed = HashSet::new(); @@ -163,7 +164,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> } // Node is in a layout version, is not a gateway node, and is not up: - // it is in a failed state, add proper line to the output + // it is in a failed state, add proper line to the output let (host, addr, last_seen) = match adv { Some(adv) => ( adv.status.hostname.as_str(), @@ -177,6 +178,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let capacity = if ver.version == layout.current().version { cfg.capacity_string() } else { + drain_msg = true; "draining metadata...".to_string() }; failed_nodes.push(format!( @@ -195,6 +197,14 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> if failed_nodes.len() > 1 { println!("\n==== FAILED NODES ===="); format_table(failed_nodes); + if drain_msg { + println!(); + println!("Your cluster is expecting to drain data from nodes that are currently unavailable."); + println!("If these nodes are definitely dead, please review the layout history with"); + println!( + "`garage layout history` and use `garage layout assume-sync` to force progress." + ); + } } if print_staging_role_changes(&layout) { diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 0be8278f..3c7843bd 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -32,6 +32,10 @@ pub async fn cli_layout_command_dispatch( LayoutOperation::Config(config_opt) => { cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await } + LayoutOperation::History => cmd_layout_history(system_rpc_endpoint, rpc_host).await, + LayoutOperation::AssumeSync(assume_sync_opt) => { + cmd_layout_assume_sync(system_rpc_endpoint, rpc_host, assume_sync_opt).await + } } } @@ -311,6 +315,113 @@ pub async fn cmd_config_layout( Ok(()) } +pub async fn cmd_layout_history( + rpc_cli: &Endpoint, + rpc_host: NodeID, +) -> Result<(), Error> { + let layout = fetch_layout(rpc_cli, rpc_host).await?; + let min_stored = layout.min_stored(); + + println!("==== LAYOUT HISTORY ===="); + let mut table = vec!["Version\tStatus\tStorage nodes\tGateway nodes".to_string()]; + for ver in layout + .versions + .iter() + .rev() + .chain(layout.old_versions.iter().rev()) + { + let status = if ver.version == layout.current().version { + "current" + } else if ver.version >= min_stored { + "draining" + } else { + "historical" + }; + table.push(format!( + "#{}\t{}\t{}\t{}", + ver.version, + status, + ver.roles + .items() + .iter() + .filter(|(_, _, x)| matches!(x, NodeRoleV(Some(c)) if c.capacity.is_some())) + .count(), + ver.roles + .items() + .iter() + .filter(|(_, _, x)| matches!(x, NodeRoleV(Some(c)) if c.capacity.is_none())) + .count(), + )); + } + format_table(table); + + println!(); + println!("==== UPDATE TRACKERS ===="); + println!("This is the internal data that Garage stores to know which nodes have what data."); + println!(); + let mut table = vec!["Node\tAck\tSync\tSync_ack".to_string()]; + let all_nodes = layout.get_all_nodes(); + for node in all_nodes.iter() { + table.push(format!( + "{:?}\t#{}\t#{}\t#{}", + node, + layout.update_trackers.ack_map.get(node), + layout.update_trackers.sync_map.get(node), + layout.update_trackers.sync_ack_map.get(node), + )); + } + table[1..].sort(); + format_table(table); + + if layout.versions.len() > 1 { + println!(); + println!( + "If some nodes are not catching up to the latest layout version in the update tracker," + ); + println!("it might be because they are offline or unable to complete a sync successfully."); + println!( + "You may force progress using `garage layout assume-sync --version {}`", + layout.current().version + ); + } + + Ok(()) +} + +pub async fn cmd_layout_assume_sync( + rpc_cli: &Endpoint, + rpc_host: NodeID, + opt: AssumeSyncOpt, +) -> Result<(), Error> { + let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + + let min_v = layout.min_stored(); + if opt.version <= min_v || opt.version > layout.current().version { + return Err(Error::Message(format!( + "Invalid version, you may use the following version numbers: {}", + (min_v + 1..=layout.current().version) + .map(|x| x.to_string()) + .collect::>() + .join(" ") + ))); + } + + let all_nodes = layout.get_all_nodes(); + for node in all_nodes.iter() { + layout.update_trackers.ack_map.set_max(*node, opt.version); + layout.update_trackers.sync_map.set_max(*node, opt.version); + layout + .update_trackers + .sync_ack_map + .set_max(*node, opt.version); + } + + send_layout(rpc_cli, rpc_host, layout).await?; + println!("Success."); + + Ok(()) +} + // --- utility --- pub async fn fetch_layout( diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 3badc447..c4b400f4 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -112,6 +112,14 @@ pub enum LayoutOperation { /// Revert staged changes to cluster layout #[structopt(name = "revert", version = garage_version())] Revert(RevertLayoutOpt), + + /// View the history of layouts in the cluster + #[structopt(name = "history", version = garage_version())] + History, + + /// Assume all nodes are synchronized up to a certain layout version + #[structopt(name = "assume-sync", version = garage_version())] + AssumeSync(AssumeSyncOpt), } #[derive(StructOpt, Debug)] @@ -169,6 +177,14 @@ pub struct RevertLayoutOpt { pub(crate) yes: bool, } +#[derive(StructOpt, Debug)] +pub struct AssumeSyncOpt { + /// Version number of the layout to assume is currently up-to-date. + /// This will generally be the current layout version. + #[structopt(long = "version")] + pub(crate) version: u64, +} + #[derive(Serialize, Deserialize, StructOpt, Debug)] pub enum BucketOperation { /// List buckets diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 08db44ca..cb36297d 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -391,7 +391,10 @@ impl UpdateTracker { changed } - pub(crate) fn set_max(&mut self, peer: Uuid, value: u64) -> bool { + /// This bumps the update tracker for a given node up to the specified value. + /// This has potential impacts on the correctness of Garage and should only + /// be used in very specific circumstances. + pub fn set_max(&mut self, peer: Uuid, value: u64) -> bool { match self.0.get_mut(&peer) { Some(e) if *e < value => { *e = value; @@ -412,6 +415,10 @@ impl UpdateTracker { .min() .unwrap_or(min_version) } + + pub fn get(&self, node: &Uuid) -> u64 { + self.0.get(node).copied().unwrap_or(0) + } } impl UpdateTrackers { -- cgit v1.2.3 From c539077d30809c9d2232aa0fe107a9652dcb7c26 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 27 Nov 2023 16:20:19 +0100 Subject: cli: remove historic layout info from status --- src/garage/cli/cmd.rs | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'src') diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 08ed00cf..4d1306b6 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -91,15 +91,6 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> Some(NodeRoleV(Some(cfg))) => Some(cfg), _ => None, }); - let historic_role = - layout - .old_versions - .iter() - .rev() - .find_map(|x| match x.roles.get(&adv.id) { - Some(NodeRoleV(Some(cfg))) => Some(cfg), - _ => None, - }); if let Some(cfg) = prev_role { healthy_nodes.push(format!( "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...", @@ -109,15 +100,6 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> tags = cfg.tags.join(","), zone = cfg.zone, )); - } else if let Some(cfg) = historic_role { - healthy_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tremoved, metadata drained", - id = adv.id, - host = adv.status.hostname, - addr = adv.addr, - tags = cfg.tags.join(","), - zone = cfg.zone, - )); } else { let new_role = match layout.staging.get().roles.get(&adv.id) { Some(NodeRoleV(Some(_))) => "pending...", -- cgit v1.2.3 From 539af6eac434bd94acbcabcc5bb5c10450b71c5d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 28 Nov 2023 11:12:39 +0100 Subject: rpc helper: write comments + small refactoring of tracing --- src/rpc/rpc_helper.rs | 105 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 88 insertions(+), 17 deletions(-) (limited to 'src') diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index e9a9143f..f71f5ae7 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -129,6 +129,12 @@ impl RpcHelper { N: IntoReq + Send, H: StreamingEndpointHandler, { + let tracer = opentelemetry::global::tracer("garage"); + let span_name = format!("RPC [{}] to {:?}", endpoint.path(), to); + let mut span = tracer.start(span_name); + span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id))); + span.set_attribute(KeyValue::new("to", format!("{:?}", to))); + let metric_tags = [ KeyValue::new("rpc_endpoint", endpoint.path().to_string()), KeyValue::new("from", format!("{:?}", self.0.our_node_id)), @@ -140,6 +146,7 @@ impl RpcHelper { let node_id = to.into(); let rpc_call = endpoint .call_streaming(&node_id, msg, strat.rs_priority) + .with_context(Context::current_with_span(span)) .record_duration(&self.0.metrics.rpc_duration, &metric_tags); let timeout = async { @@ -182,12 +189,17 @@ impl RpcHelper { N: IntoReq, H: StreamingEndpointHandler, { + let tracer = opentelemetry::global::tracer("garage"); + let span_name = format!("RPC [{}] call_many {} nodes", endpoint.path(), to.len()); + let span = tracer.start(span_name); + let msg = msg.into_req().map_err(netapp::error::Error::from)?; let resps = join_all( to.iter() .map(|to| self.call(endpoint, *to, msg.clone(), strat)), ) + .with_context(Context::current_with_span(span)) .await; Ok(to .iter() @@ -219,6 +231,22 @@ impl RpcHelper { /// Make a RPC call to multiple servers, returning either a Vec of responses, /// or an error if quorum could not be reached due to too many errors + /// + /// If RequestStrategy has send_all_at_once set, then all requests will be + /// sent at once, and `try_call_many` will return as soon as a quorum of + /// responses is achieved, dropping and cancelling the remaining requests. + /// + /// Otherwise, `quorum` requests will be sent at the same time, and if an + /// error response is received, a new request will be sent to replace it. + /// The ordering of nodes to which requests are sent is determined by + /// the `RpcHelper::request_order` function, which takes into account + /// parameters such as node zones and measured ping values. + /// + /// In both cases, the basic contract of this function is that even in the + /// absence of failures, the RPC call might not be driven to completion + /// on all of the specified nodes. It is therefore unfit for broadcast + /// write operations where we expect all nodes to successfully store + /// the written date. pub async fn try_call_many( &self, endpoint: &Arc>, @@ -235,7 +263,12 @@ impl RpcHelper { let quorum = strategy.rs_quorum.unwrap_or(to.len()); let tracer = opentelemetry::global::tracer("garage"); - let span_name = format!("Read RPC {} to {} of {}", endpoint.path(), quorum, to.len()); + let span_name = format!( + "RPC [{}] try_call_many (quorum {}/{})", + endpoint.path(), + quorum, + to.len() + ); let mut span = tracer.start(span_name); span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id))); @@ -266,6 +299,10 @@ impl RpcHelper { // to reach a quorum, priorizing nodes with the lowest latency. // When there are errors, we start new requests to compensate. + // TODO: this could be made more aggressive, e.g. if after 2x the + // average ping of a given request, the response is not yet received, + // preemptively send an additional request to any remaining nodes. + // Reorder requests to priorize closeness / low latency let request_order = self.request_order(to.iter().copied()); let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false); @@ -278,9 +315,7 @@ impl RpcHelper { let self2 = self.clone(); let msg = msg.clone(); let endpoint2 = endpoint.clone(); - (to, async move { - self2.call(&endpoint2, to, msg, strategy).await - }) + async move { self2.call(&endpoint2, to, msg, strategy).await } }); // Vectors in which success results and errors will be collected @@ -296,10 +331,8 @@ impl RpcHelper { // If the current set of requests that are running is not enough to possibly // reach quorum, start some new requests. while send_all_at_once || successes.len() + resp_stream.len() < quorum { - if let Some((req_to, fut)) = requests.next() { - let tracer = opentelemetry::global::tracer("garage"); - let span = tracer.start(format!("RPC to {:?}", req_to)); - resp_stream.push(fut.with_context(Context::current_with_span(span))); + if let Some(fut) = requests.next() { + resp_stream.push(fut) } else { break; } @@ -379,6 +412,25 @@ impl RpcHelper { .collect::>() } + /// Make a RPC call to multiple servers, returning either a Vec of responses, + /// or an error if quorum could not be reached due to too many errors + /// + /// Contrary to try_call_many, this fuction is especially made for broadcast + /// write operations. In particular: + /// + /// - The request are sent to all specified nodes as soon as `try_write_many_sets` + /// is invoked. + /// + /// - When `try_write_many_sets` returns, all remaining requests that haven't + /// completed move to a background task so that they have a chance to + /// complete successfully if there are no failures. + /// + /// In addition, the nodes to which requests should be sent are divided in + /// "quorum sets", and `try_write_many_sets` only returns once a quorum + /// has been validated in each set. This is used in the case of cluster layout + /// changes, where data has to be written both in the old layout and in the + /// new one as long as all nodes have not successfully tranisitionned and + /// moved all data to the new layout. pub async fn try_write_many_sets( &self, endpoint: &Arc>, @@ -394,11 +446,11 @@ impl RpcHelper { { let quorum = strategy .rs_quorum - .expect("internal error: missing quroum in try_write_many_sets"); + .expect("internal error: missing quorum value in try_write_many_sets"); let tracer = opentelemetry::global::tracer("garage"); let span_name = format!( - "Write RPC {} (quorum {} in {} sets)", + "RPC [{}] try_write_many_sets (quorum {} in {} sets)", endpoint.path(), quorum, to_sets.len() @@ -430,6 +482,8 @@ impl RpcHelper { { let msg = msg.into_req().map_err(netapp::error::Error::from)?; + // Peers may appear in many quorum sets. Here, build a list of peers, + // mapping to the index of the quorum sets in which they appear. let mut peers = HashMap::>::new(); for (i, set) in to_sets.iter().enumerate() { for peer in set.iter() { @@ -437,24 +491,30 @@ impl RpcHelper { } } + // Send one request to each peer of the quorum sets let requests = peers.iter().map(|(peer, _)| { let self2 = self.clone(); let msg = msg.clone(); let endpoint2 = endpoint.clone(); let to = *peer; - let tracer = opentelemetry::global::tracer("garage"); - let span = tracer.start(format!("RPC to {:?}", to)); - let fut = async move { (to, self2.call(&endpoint2, to, msg, strategy).await) }; - fut.with_context(Context::current_with_span(span)) + async move { (to, self2.call(&endpoint2, to, msg, strategy).await) } }); let mut resp_stream = requests.collect::>(); + // Success and error responses will be collected in these two vectors let mut successes = vec![]; let mut errors = vec![]; + // `set_counters` is used to keep track of how many success and error + // responses are received within each quorum set. When a node returns + // its response, it counts as a sucess/an error for all of the quorum + // sets which it is part of. let mut set_counters = vec![(0, 0); to_sets.len()]; + // Drive requests to completion while let Some((node, resp)) = resp_stream.next().await { + // Store the response in the correct vector and increment the + // appropriate counters match resp { Ok(msg) => { for set in peers.get(&node).unwrap().iter() { @@ -470,9 +530,8 @@ impl RpcHelper { } } + // If we have a quorum of ok in all quorum sets, then it's a success! if set_counters.iter().all(|(ok_cnt, _)| *ok_cnt >= quorum) { - // Success - // Continue all other requets in background tokio::spawn(async move { resp_stream.collect::)>>().await; @@ -481,16 +540,28 @@ impl RpcHelper { return Ok(successes); } + // If there is a quorum set for which too many errors were received, + // we know it's impossible to get a quorum, so return immediately. if set_counters .iter() .enumerate() .any(|(i, (_, err_cnt))| err_cnt + quorum > to_sets[i].len()) { - // Too many errors in this set, we know we won't get a quorum break; } } + // At this point, there is no quorum and we know that a quorum + // will never be achieved. Currently, we drop all remaining requests. + // Should we still move them to background so that they can continue + // for non-failed nodes? Not doing so has no impact on correctness, + // but it means that more cancellation messages will be sent. Idk. + // (When an in-progress request future is dropped, Netapp automatically + // sends a cancellation message to the remote node to inform it that + // the result is no longer needed. In turn, if the remote node receives + // the cancellation message in time, it interrupts the task of the + // running request handler.) + // Failure, could not get quorum let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); Err(Error::Quorum( -- cgit v1.2.3 From c04dd8788a3764da2f307b1d10c2d56b7b0e4a61 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 28 Nov 2023 14:25:04 +0100 Subject: admin: more info in admin GetClusterStatus --- src/api/admin/cluster.rs | 122 +++++++++++++++++++++++++++++++++++++++-------- src/garage/admin/mod.rs | 2 +- src/garage/cli/cmd.rs | 9 ++-- src/rpc/system.rs | 12 +++-- 4 files changed, 116 insertions(+), 29 deletions(-) (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 593bd778..3ce1b254 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::net::SocketAddr; use std::sync::Arc; @@ -15,25 +16,95 @@ use crate::admin::error::*; use crate::helpers::{json_ok_response, parse_json_body}; pub async fn handle_get_cluster_status(garage: &Arc) -> Result, Error> { + let layout = garage.system.cluster_layout(); + let mut nodes = garage + .system + .get_known_nodes() + .into_iter() + .map(|i| { + ( + i.id, + NodeResp { + id: hex::encode(i.id), + addr: Some(i.addr), + hostname: i.status.hostname, + is_up: i.is_up, + last_seen_secs_ago: i.last_seen_secs_ago, + data_partition: i + .status + .data_disk_avail + .map(|(avail, total)| FreeSpaceResp { + available: avail, + total, + }), + metadata_partition: i.status.meta_disk_avail.map(|(avail, total)| { + FreeSpaceResp { + available: avail, + total, + } + }), + ..Default::default() + }, + ) + }) + .collect::>(); + + for (id, _, role) in layout.current().roles.items().iter() { + if let layout::NodeRoleV(Some(r)) = role { + let role = NodeRoleResp { + id: hex::encode(id), + zone: r.zone.to_string(), + capacity: r.capacity, + tags: r.tags.clone(), + }; + match nodes.get_mut(id) { + None => { + nodes.insert( + *id, + NodeResp { + id: hex::encode(id), + role: Some(role), + ..Default::default() + }, + ); + } + Some(n) => { + if n.role.is_none() { + n.role = Some(role); + } + } + } + } + } + + for ver in layout.versions.iter().rev().skip(1) { + for (id, _, role) in ver.roles.items().iter() { + if let layout::NodeRoleV(Some(r)) = role { + if !nodes.contains_key(id) && r.capacity.is_some() { + nodes.insert( + *id, + NodeResp { + id: hex::encode(id), + draining: true, + ..Default::default() + }, + ); + } + } + } + } + + let mut nodes = nodes.into_iter().map(|(_, v)| v).collect::>(); + nodes.sort_by(|x, y| x.id.cmp(&y.id)); + let res = GetClusterStatusResponse { node: hex::encode(garage.system.id), garage_version: garage_util::version::garage_version(), garage_features: garage_util::version::garage_features(), rust_version: garage_util::version::rust_version(), db_engine: garage.db.engine(), - known_nodes: garage - .system - .get_known_nodes() - .into_iter() - .map(|i| KnownNodeResp { - id: hex::encode(i.id), - addr: i.addr, - is_up: i.is_up, - last_seen_secs_ago: i.last_seen_secs_ago, - hostname: i.status.hostname, - }) - .collect(), - layout: format_cluster_layout(&garage.system.cluster_layout()), + layout_version: layout.current().version, + nodes, }; Ok(json_ok_response(&res)?) @@ -157,8 +228,8 @@ struct GetClusterStatusResponse { garage_features: Option<&'static [&'static str]>, rust_version: &'static str, db_engine: String, - known_nodes: Vec, - layout: GetClusterLayoutResponse, + layout_version: u64, + nodes: Vec, } #[derive(Serialize)] @@ -192,14 +263,27 @@ struct NodeRoleResp { tags: Vec, } -#[derive(Serialize)] +#[derive(Serialize, Default)] +#[serde(rename_all = "camelCase")] +struct FreeSpaceResp { + available: u64, + total: u64, +} + +#[derive(Serialize, Default)] #[serde(rename_all = "camelCase")] -struct KnownNodeResp { +struct NodeResp { id: String, - addr: SocketAddr, + role: Option, + addr: Option, + hostname: Option, is_up: bool, last_seen_secs_ago: Option, - hostname: String, + draining: bool, + #[serde(skip_serializing_if = "Option::is_none")] + data_partition: Option, + #[serde(skip_serializing_if = "Option::is_none")] + metadata_partition: Option, } // ---- update functions ---- diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index 77918a0f..da4226cf 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -295,7 +295,7 @@ impl AdminRpcHandler { let info = node_info.get(id); let status = info.map(|x| &x.status); let role = layout.current().roles.get(id).and_then(|x| x.0.as_ref()); - let hostname = status.map(|x| x.hostname.as_str()).unwrap_or("?"); + let hostname = status.and_then(|x| x.hostname.as_deref()).unwrap_or("?"); let zone = role.map(|x| x.zone.as_str()).unwrap_or("?"); let capacity = role .map(|x| x.capacity_string()) diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 4d1306b6..c7f0ad2b 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -62,6 +62,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let mut healthy_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; for adv in status.iter().filter(|adv| adv.is_up) { + let host = adv.status.hostname.as_deref().unwrap_or("?"); if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { let data_avail = match &adv.status.data_disk_avail { _ if cfg.capacity.is_none() => "N/A".into(), @@ -75,7 +76,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> healthy_nodes.push(format!( "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", id = adv.id, - host = adv.status.hostname, + host = host, addr = adv.addr, tags = cfg.tags.join(","), zone = cfg.zone, @@ -95,7 +96,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> healthy_nodes.push(format!( "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...", id = adv.id, - host = adv.status.hostname, + host = host, addr = adv.addr, tags = cfg.tags.join(","), zone = cfg.zone, @@ -108,7 +109,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> healthy_nodes.push(format!( "{id:?}\t{h}\t{addr}\t\t\t{new_role}", id = adv.id, - h = adv.status.hostname, + h = host, addr = adv.addr, new_role = new_role, )); @@ -149,7 +150,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> // it is in a failed state, add proper line to the output let (host, addr, last_seen) = match adv { Some(adv) => ( - adv.status.hostname.as_str(), + adv.status.hostname.as_deref().unwrap_or("?"), adv.addr.to_string(), adv.last_seen_secs_ago .map(|s| tf.convert(Duration::from_secs(s))) diff --git a/src/rpc/system.rs b/src/rpc/system.rs index c7d41ee4..be4aefa2 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -126,7 +126,7 @@ pub struct System { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct NodeStatus { /// Hostname of the node - pub hostname: String, + pub hostname: Option, /// Replication factor configured on the node pub replication_factor: usize, @@ -765,9 +765,11 @@ impl EndpointHandler for System { impl NodeStatus { fn initial(replication_factor: usize, layout_manager: &LayoutManager) -> Self { NodeStatus { - hostname: gethostname::gethostname() - .into_string() - .unwrap_or_else(|_| "".to_string()), + hostname: Some( + gethostname::gethostname() + .into_string() + .unwrap_or_else(|_| "".to_string()), + ), replication_factor, layout_digest: layout_manager.layout().digest(), meta_disk_avail: None, @@ -777,7 +779,7 @@ impl NodeStatus { fn unknown() -> Self { NodeStatus { - hostname: "?".to_string(), + hostname: None, replication_factor: 0, layout_digest: Default::default(), meta_disk_avail: None, -- cgit v1.2.3 From c8356a91d9bf1d1488ec288099f2a55a1019918f Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 7 Dec 2023 10:30:26 +0100 Subject: layout updates: fix the set of nodes among which minima are calculated --- src/rpc/layout/helper.rs | 25 +++++++++++++++++++++---- src/rpc/layout/history.rs | 8 +++++--- src/rpc/layout/schema.rs | 2 +- 3 files changed, 27 insertions(+), 8 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 5d159f3e..881a039e 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -51,20 +51,37 @@ impl LayoutHelper { pub fn new(mut layout: LayoutHistory, mut ack_lock: HashMap) -> Self { layout.cleanup_old_versions(); + let all_nodes = layout.get_all_nodes(); let all_nongateway_nodes = layout.get_all_nongateway_nodes(); - layout.clamp_update_trackers(&all_nongateway_nodes); + + layout.clamp_update_trackers(&all_nodes); let min_version = layout.min_stored(); + + // ack_map_min is the minimum value of ack_map among all nodes + // in the cluster (gateway, non-gateway, current and previous layouts). + // It is the highest layout version which all of these nodes have + // acknowledged, indicating that they are aware of it and are no + // longer processing write operations that did not take it into account. let ack_map_min = layout .update_trackers .ack_map - .min(&all_nongateway_nodes, min_version); + .min_among(&all_nodes, min_version); + + // sync_map_min is the minimum value of sync_map among all storage nodes + // in the cluster (non-gateway nodes only, current and previous layouts). + // It is the highest layout version for which we know that all relevant + // storage nodes have fullfilled a sync, and therefore it is safe to + // use a read quorum within that layout to ensure consistency. + // Gateway nodes are excluded here because they hold no relevant data + // (they store the bucket and access key tables, but we don't have + // consistency on those). + // TODO: this value could take quorums into account instead. let sync_map_min = layout .update_trackers .sync_map - .min(&all_nongateway_nodes, min_version); + .min_among(&all_nongateway_nodes, min_version); - let all_nodes = layout.get_all_nodes(); let trackers_hash = layout.calculate_trackers_hash(); let staging_hash = layout.calculate_staging_hash(); diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 7d4a1b48..c448ac24 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -77,14 +77,16 @@ impl LayoutHistory { } // If there are old versions that no one is reading from anymore, - // remove them + // remove them (keep them in self.old_versions). + // ASSUMPTION: we only care about where nodes in the current layout version + // are reading from, as we assume older nodes are being discarded. while self.versions.len() > 1 { - let all_nongateway_nodes = self.get_all_nongateway_nodes(); + let current_nodes = &self.current().node_id_vec; let min_version = self.min_stored(); let sync_ack_map_min = self .update_trackers .sync_ack_map - .min(&all_nongateway_nodes, min_version); + .min_among(¤t_nodes, min_version); if self.min_stored() < sync_ack_map_min { let removed = self.versions.remove(0); info!( diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index cb36297d..49e84420 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -408,7 +408,7 @@ impl UpdateTracker { } } - pub(crate) fn min(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 { + pub(crate) fn min_among(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 { storage_nodes .iter() .map(|x| self.0.get(x).copied().unwrap_or(min_version)) -- cgit v1.2.3 From 95eb13eb08d517d328e3c8aeb222440a27211ee9 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 7 Dec 2023 10:55:15 +0100 Subject: rpc: refactor result tracking for quorum sets --- src/rpc/layout/manager.rs | 6 ++ src/rpc/rpc_helper.rs | 147 +++++++++++++++++++++++++----------- src/table/replication/parameters.rs | 2 +- src/table/table.rs | 54 +++---------- 4 files changed, 121 insertions(+), 88 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index c65831a2..17465019 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -352,6 +352,12 @@ impl AsRef for WriteLock { } } +impl AsMut for WriteLock { + fn as_mut(&mut self) -> &mut T { + &mut self.value + } +} + impl Drop for WriteLock { fn drop(&mut self) { let layout = self.layout_manager.layout(); // acquire read lock diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index f71f5ae7..c6dcbe75 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -484,15 +484,10 @@ impl RpcHelper { // Peers may appear in many quorum sets. Here, build a list of peers, // mapping to the index of the quorum sets in which they appear. - let mut peers = HashMap::>::new(); - for (i, set) in to_sets.iter().enumerate() { - for peer in set.iter() { - peers.entry(*peer).or_default().push(i); - } - } + let mut result_tracker = QuorumSetResultTracker::new(to_sets, quorum); // Send one request to each peer of the quorum sets - let requests = peers.iter().map(|(peer, _)| { + let requests = result_tracker.nodes.iter().map(|(peer, _)| { let self2 = self.clone(); let msg = msg.clone(); let endpoint2 = endpoint.clone(); @@ -501,52 +496,25 @@ impl RpcHelper { }); let mut resp_stream = requests.collect::>(); - // Success and error responses will be collected in these two vectors - let mut successes = vec![]; - let mut errors = vec![]; - - // `set_counters` is used to keep track of how many success and error - // responses are received within each quorum set. When a node returns - // its response, it counts as a sucess/an error for all of the quorum - // sets which it is part of. - let mut set_counters = vec![(0, 0); to_sets.len()]; - // Drive requests to completion while let Some((node, resp)) = resp_stream.next().await { // Store the response in the correct vector and increment the // appropriate counters - match resp { - Ok(msg) => { - for set in peers.get(&node).unwrap().iter() { - set_counters[*set].0 += 1; - } - successes.push(msg); - } - Err(e) => { - for set in peers.get(&node).unwrap().iter() { - set_counters[*set].1 += 1; - } - errors.push(e); - } - } + result_tracker.register_result(node, resp); // If we have a quorum of ok in all quorum sets, then it's a success! - if set_counters.iter().all(|(ok_cnt, _)| *ok_cnt >= quorum) { + if result_tracker.all_quorums_ok() { // Continue all other requets in background tokio::spawn(async move { resp_stream.collect::)>>().await; }); - return Ok(successes); + return Ok(result_tracker.success_values()); } // If there is a quorum set for which too many errors were received, // we know it's impossible to get a quorum, so return immediately. - if set_counters - .iter() - .enumerate() - .any(|(i, (_, err_cnt))| err_cnt + quorum > to_sets[i].len()) - { + if result_tracker.too_many_failures() { break; } } @@ -563,13 +531,104 @@ impl RpcHelper { // running request handler.) // Failure, could not get quorum - let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); - Err(Error::Quorum( + Err(result_tracker.quorum_error()) + } +} + +// ------- utility for tracking successes/errors among write sets -------- + +pub struct QuorumSetResultTracker { + // The set of nodes and the quorum sets they belong to + pub nodes: HashMap>, + pub quorum: usize, + + // The success and error responses received + pub successes: Vec<(Uuid, S)>, + pub failures: Vec<(Uuid, E)>, + + // The counters for successes and failures in each set + pub success_counters: Box<[usize]>, + pub failure_counters: Box<[usize]>, + pub set_lens: Box<[usize]>, +} + +impl QuorumSetResultTracker { + pub fn new(sets: &[A], quorum: usize) -> Self + where + A: AsRef<[Uuid]>, + { + let mut nodes = HashMap::>::new(); + for (i, set) in sets.iter().enumerate() { + for node in set.as_ref().iter() { + nodes.entry(*node).or_default().push(i); + } + } + + let num_nodes = nodes.len(); + Self { + nodes, quorum, - Some(to_sets.len()), - successes.len(), - peers.len(), + successes: Vec::with_capacity(num_nodes), + failures: vec![], + success_counters: vec![0; sets.len()].into_boxed_slice(), + failure_counters: vec![0; sets.len()].into_boxed_slice(), + set_lens: sets + .iter() + .map(|x| x.as_ref().len()) + .collect::>() + .into_boxed_slice(), + } + } + + pub fn register_result(&mut self, node: Uuid, result: Result) { + match result { + Ok(s) => { + self.successes.push((node, s)); + for set in self.nodes.get(&node).unwrap().iter() { + self.success_counters[*set] += 1; + } + } + Err(e) => { + self.failures.push((node, e)); + for set in self.nodes.get(&node).unwrap().iter() { + self.failure_counters[*set] += 1; + } + } + } + } + + pub fn all_quorums_ok(&self) -> bool { + self.success_counters + .iter() + .all(|ok_cnt| *ok_cnt >= self.quorum) + } + + pub fn too_many_failures(&self) -> bool { + self.failure_counters + .iter() + .zip(self.set_lens.iter()) + .any(|(err_cnt, set_len)| *err_cnt + self.quorum > *set_len) + } + + pub fn success_values(self) -> Vec { + self.successes + .into_iter() + .map(|(_, x)| x) + .collect::>() + } + + pub fn quorum_error(self) -> Error { + let errors = self + .failures + .iter() + .map(|(n, e)| format!("{:?}: {}", n, e)) + .collect::>(); + Error::Quorum( + self.quorum, + Some(self.set_lens.len()), + self.successes.len(), + self.nodes.len(), errors, - )) + ) } } diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index a4e701bb..db11ff5f 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -3,7 +3,7 @@ use garage_util::data::*; /// Trait to describe how a table shall be replicated pub trait TableReplication: Send + Sync + 'static { - type WriteSets: AsRef>> + Send + Sync + 'static; + type WriteSets: AsRef>> + AsMut>> + Send + Sync + 'static; // See examples in table_sharded.rs and table_fullcopy.rs // To understand various replication methods diff --git a/src/table/table.rs b/src/table/table.rs index 7d1ff31c..6508cf5d 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -20,6 +20,7 @@ use garage_util::error::Error; use garage_util::metrics::RecordDuration; use garage_util::migrate::Migrate; +use garage_rpc::rpc_helper::QuorumSetResultTracker; use garage_rpc::system::System; use garage_rpc::*; @@ -180,10 +181,6 @@ impl Table { // a quorum of nodes has answered OK, then the insert has succeeded and // consistency properties (read-after-write) are preserved. - // Some code here might feel redundant with RpcHelper::try_write_many_sets, - // but I think deduplicating could lead to more spaghetti instead of - // improving the readability, so I'm leaving as is. - let quorum = self.data.replication.write_quorum(); // Serialize all entries and compute the write sets for each of them. @@ -197,7 +194,10 @@ impl Table { for entry in entries.into_iter() { let entry = entry.borrow(); let hash = entry.partition_key().hash(); - let write_sets = self.data.replication.write_sets(&hash); + let mut write_sets = self.data.replication.write_sets(&hash); + for set in write_sets.as_mut().iter_mut() { + set.sort(); + } let e_enc = Arc::new(ByteBuf::from(entry.encode()?)); entries_vec.push((write_sets, e_enc)); } @@ -212,12 +212,8 @@ impl Table { .collect::>(); write_sets.sort(); write_sets.dedup(); - let mut write_set_index = HashMap::<&Uuid, Vec>::new(); - for (i, write_set) in write_sets.iter().enumerate() { - for node in write_set.iter() { - write_set_index.entry(node).or_default().push(i); - } - } + + let mut result_tracker = QuorumSetResultTracker::new(&write_sets, quorum); // Build a map of all nodes to the entries that must be sent to that node. let mut call_list: HashMap> = HashMap::new(); @@ -230,7 +226,6 @@ impl Table { } // Build futures to actually perform each of the corresponding RPC calls - let call_count = call_list.len(); let call_futures = call_list.into_iter().map(|(node, entries)| { let this = self.clone(); let tracer = opentelemetry::global::tracer("garage"); @@ -254,27 +249,11 @@ impl Table { // Run all requests in parallel thanks to FuturesUnordered, and collect results. let mut resps = call_futures.collect::>(); - let mut set_counters = vec![(0, 0); write_sets.len()]; - let mut successes = 0; - let mut errors = vec![]; while let Some((node, resp)) = resps.next().await { - match resp { - Ok(_) => { - successes += 1; - for set in write_set_index.get(&node).unwrap().iter() { - set_counters[*set].0 += 1; - } - } - Err(e) => { - errors.push(e); - for set in write_set_index.get(&node).unwrap().iter() { - set_counters[*set].1 += 1; - } - } - } + result_tracker.register_result(node, resp.map(|_| ())); - if set_counters.iter().all(|(ok_cnt, _)| *ok_cnt >= quorum) { + if result_tracker.all_quorums_ok() { // Success // Continue all other requests in background @@ -285,25 +264,14 @@ impl Table { return Ok(()); } - if set_counters - .iter() - .enumerate() - .any(|(i, (_, err_cnt))| err_cnt + quorum > write_sets[i].len()) - { + if result_tracker.too_many_failures() { // Too many errors in this set, we know we won't get a quorum break; } } // Failure, could not get quorum within at least one set - let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); - Err(Error::Quorum( - quorum, - Some(write_sets.len()), - successes, - call_count, - errors, - )) + Err(result_tracker.quorum_error()) } pub async fn get( -- cgit v1.2.3 From d90de365b3b30cb631b22fcd62c98bddb5a91549 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 7 Dec 2023 11:16:10 +0100 Subject: table sync: use write quorums to report global success or failure of sync --- src/rpc/layout/helper.rs | 2 +- src/rpc/layout/manager.rs | 2 +- src/table/replication/fullcopy.rs | 3 +-- src/table/replication/parameters.rs | 2 +- src/table/replication/sharded.rs | 4 +-- src/table/sync.rs | 51 ++++++++++++++++++++++--------------- 6 files changed, 36 insertions(+), 28 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 881a039e..0aa7c6aa 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -180,7 +180,7 @@ impl LayoutHelper { ret } - pub(crate) fn write_sets_of(&self, position: &Hash) -> Vec> { + pub fn storage_sets_of(&self, position: &Hash) -> Vec> { self.layout() .versions .iter() diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 17465019..dc963ba0 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -139,7 +139,7 @@ impl LayoutManager { pub fn write_sets_of(self: &Arc, position: &Hash) -> WriteLock>> { let layout = self.layout(); let version = layout.current().version; - let nodes = layout.write_sets_of(position); + let nodes = layout.storage_sets_of(position); layout .ack_lock .get(&version) diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index df930224..30122f39 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -1,4 +1,3 @@ -use std::iter::FromIterator; use std::sync::Arc; use garage_rpc::layout::*; @@ -69,7 +68,7 @@ impl TableReplication for TableFullReplication { partition: 0u16, first_hash: [0u8; 32].into(), last_hash: [0xff; 32].into(), - storage_nodes: Vec::from_iter(layout.current().all_nodes().to_vec()), + storage_sets: vec![layout.current().all_nodes().to_vec()], }], } } diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index db11ff5f..78470f35 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -40,5 +40,5 @@ pub struct SyncPartition { pub partition: Partition, pub first_hash: Hash, pub last_hash: Hash, - pub storage_nodes: Vec, + pub storage_sets: Vec>, } diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 2a16bc0c..55d0029d 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -60,12 +60,12 @@ impl TableReplication for TableShardedReplication { .current() .partitions() .map(|(partition, first_hash)| { - let storage_nodes = layout.storage_nodes_of(&first_hash); + let storage_sets = layout.storage_sets_of(&first_hash); SyncPartition { partition, first_hash, last_hash: [0u8; 32].into(), // filled in just after - storage_nodes, + storage_sets, } }) .collect::>(); diff --git a/src/table/sync.rs b/src/table/sync.rs index efeac402..cfcbc4b5 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -18,6 +18,7 @@ use garage_util::encode::{debug_serialize, nonversioned_encode}; use garage_util::error::{Error, OkOrMessage}; use garage_rpc::layout::*; +use garage_rpc::rpc_helper::QuorumSetResultTracker; use garage_rpc::system::System; use garage_rpc::*; @@ -106,44 +107,52 @@ impl TableSyncer { must_exit: &mut watch::Receiver, ) -> Result<(), Error> { let my_id = self.system.id; - let retain = partition.storage_nodes.contains(&my_id); + let retain = partition.storage_sets.iter().any(|x| x.contains(&my_id)); if retain { debug!( "({}) Syncing {:?} with {:?}...", F::TABLE_NAME, partition, - partition.storage_nodes + partition.storage_sets ); - let mut sync_futures = partition - .storage_nodes + let mut result_tracker = QuorumSetResultTracker::new( + &partition.storage_sets, + self.data.replication.write_quorum(), + ); + + let mut sync_futures = result_tracker + .nodes .iter() - .filter(|node| **node != my_id) + .map(|(node, _)| *node) .map(|node| { - self.clone() - .do_sync_with(&partition, *node, must_exit.clone()) + let must_exit = must_exit.clone(); + async move { + if node == my_id { + (node, Ok(())) + } else { + (node, self.do_sync_with(&partition, node, must_exit).await) + } + } }) .collect::>(); - let mut n_errors = 0; - while let Some(r) = sync_futures.next().await { - if let Err(e) = r { - n_errors += 1; - warn!("({}) Sync error: {}", F::TABLE_NAME, e); + while let Some((node, res)) = sync_futures.next().await { + if let Err(e) = &res { + warn!("({}) Sync error with {:?}: {}", F::TABLE_NAME, node, e); } + result_tracker.register_result(node, res); } - if n_errors > 0 { - return Err(Error::Message(format!( - "Sync failed with {} nodes.", - n_errors - ))); + + if result_tracker.too_many_failures() { + return Err(result_tracker.quorum_error()); + } else { + Ok(()) } } else { self.offload_partition(&partition.first_hash, &partition.last_hash, must_exit) - .await?; + .await } - - Ok(()) } // Offload partition: this partition is not something we are storing, @@ -264,7 +273,7 @@ impl TableSyncer { } async fn do_sync_with( - self: Arc, + self: &Arc, partition: &SyncPartition, who: Uuid, must_exit: watch::Receiver, -- cgit v1.2.3 From aa59059a910eb6e1e824b84413a66909d697ef8a Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 7 Dec 2023 11:50:00 +0100 Subject: layout cli: safer skip-dead-nodes command --- src/garage/cli/cmd.rs | 23 ++++++++++++++++------- src/garage/cli/layout.rs | 35 +++++++++++++++++++++++++---------- src/garage/cli/structs.rs | 12 ++++++++---- 3 files changed, 49 insertions(+), 21 deletions(-) (limited to 'src') diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index c7f0ad2b..196c0cb3 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -49,13 +49,7 @@ pub async fn cli_command_dispatch( } pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> Result<(), Error> { - let status = match rpc_cli - .call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL) - .await?? - { - SystemRpc::ReturnKnownNodes(nodes) => nodes, - resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), - }; + let status = fetch_status(rpc_cli, rpc_host).await?; let layout = fetch_layout(rpc_cli, rpc_host).await?; println!("==== HEALTHY NODES ===="); @@ -268,3 +262,18 @@ pub async fn cmd_admin( } Ok(()) } + +// ---- utility ---- + +pub async fn fetch_status( + rpc_cli: &Endpoint, + rpc_host: NodeID, +) -> Result, Error> { + match rpc_cli + .call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL) + .await?? + { + SystemRpc::ReturnKnownNodes(nodes) => Ok(nodes), + resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), + } +} diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 3c7843bd..cdf77c04 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -33,8 +33,8 @@ pub async fn cli_layout_command_dispatch( cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await } LayoutOperation::History => cmd_layout_history(system_rpc_endpoint, rpc_host).await, - LayoutOperation::AssumeSync(assume_sync_opt) => { - cmd_layout_assume_sync(system_rpc_endpoint, rpc_host, assume_sync_opt).await + LayoutOperation::SkipDeadNodes(assume_sync_opt) => { + cmd_layout_skip_dead_nodes(system_rpc_endpoint, rpc_host, assume_sync_opt).await } } } @@ -388,13 +388,21 @@ pub async fn cmd_layout_history( Ok(()) } -pub async fn cmd_layout_assume_sync( +pub async fn cmd_layout_skip_dead_nodes( rpc_cli: &Endpoint, rpc_host: NodeID, - opt: AssumeSyncOpt, + opt: SkipDeadNodesOpt, ) -> Result<(), Error> { + let status = fetch_status(rpc_cli, rpc_host).await?; let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + if layout.versions.len() == 1 { + return Err(Error::Message( + "This command cannot be called when there is only one live cluster layout version" + .into(), + )); + } + let min_v = layout.min_stored(); if opt.version <= min_v || opt.version > layout.current().version { return Err(Error::Message(format!( @@ -408,12 +416,19 @@ pub async fn cmd_layout_assume_sync( let all_nodes = layout.get_all_nodes(); for node in all_nodes.iter() { - layout.update_trackers.ack_map.set_max(*node, opt.version); - layout.update_trackers.sync_map.set_max(*node, opt.version); - layout - .update_trackers - .sync_ack_map - .set_max(*node, opt.version); + if status.iter().any(|x| x.id == *node && x.is_up) { + continue; + } + + if layout.update_trackers.ack_map.set_max(*node, opt.version) { + println!("Increased the ACK tracker for node {:?}", node); + } + + if opt.allow_missing_data { + if layout.update_trackers.sync_map.set_max(*node, opt.version) { + println!("Increased the SYNC tracker for node {:?}", node); + } + } } send_layout(rpc_cli, rpc_host, layout).await?; diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index c4b400f4..6bc3da22 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -117,9 +117,9 @@ pub enum LayoutOperation { #[structopt(name = "history", version = garage_version())] History, - /// Assume all nodes are synchronized up to a certain layout version - #[structopt(name = "assume-sync", version = garage_version())] - AssumeSync(AssumeSyncOpt), + /// Skip dead nodes when awaiting for a new layout version to be synchronized + #[structopt(name = "skip-dead-nodes", version = garage_version())] + SkipDeadNodes(SkipDeadNodesOpt), } #[derive(StructOpt, Debug)] @@ -178,11 +178,15 @@ pub struct RevertLayoutOpt { } #[derive(StructOpt, Debug)] -pub struct AssumeSyncOpt { +pub struct SkipDeadNodesOpt { /// Version number of the layout to assume is currently up-to-date. /// This will generally be the current layout version. #[structopt(long = "version")] pub(crate) version: u64, + /// Allow the skip even if a quorum of ndoes could not be found for + /// the data among the remaining nodes + #[structopt(long = "allow-missing-data")] + pub(crate) allow_missing_data: bool, } #[derive(Serialize, Deserialize, StructOpt, Debug)] -- cgit v1.2.3 From 9cecea64d4509e95ac9793b29c947e2ecf9bb0b8 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 7 Dec 2023 14:27:53 +0100 Subject: layout: allow sync update tracker to progress with only quorums --- src/garage/cli/layout.rs | 6 +-- src/rpc/layout/helper.rs | 33 ++++++++++++--- src/rpc/layout/history.rs | 101 ++++++++++++++++++++++++++++++++++++++++++++ src/rpc/layout/manager.rs | 18 +++++--- src/rpc/layout/schema.rs | 6 +-- src/rpc/replication_mode.rs | 7 +++ src/rpc/system.rs | 2 +- 7 files changed, 152 insertions(+), 21 deletions(-) (limited to 'src') diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index cdf77c04..fac826f5 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -365,9 +365,9 @@ pub async fn cmd_layout_history( table.push(format!( "{:?}\t#{}\t#{}\t#{}", node, - layout.update_trackers.ack_map.get(node), - layout.update_trackers.sync_map.get(node), - layout.update_trackers.sync_ack_map.get(node), + layout.update_trackers.ack_map.get(node, min_stored), + layout.update_trackers.sync_map.get(node, min_stored), + layout.update_trackers.sync_ack_map.get(node, min_stored), )); } table[1..].sort(); diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 0aa7c6aa..eeaf4ffa 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize}; use garage_util::data::*; use super::schema::*; +use crate::replication_mode::ReplicationMode; use crate::rpc_helper::RpcHelper; #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] @@ -22,6 +23,7 @@ pub struct LayoutDigest { } pub struct LayoutHelper { + replication_mode: ReplicationMode, layout: Option, // cached values @@ -48,7 +50,23 @@ impl Deref for LayoutHelper { } impl LayoutHelper { - pub fn new(mut layout: LayoutHistory, mut ack_lock: HashMap) -> Self { + pub fn new( + replication_mode: ReplicationMode, + mut layout: LayoutHistory, + mut ack_lock: HashMap, + ) -> Self { + // In the new() function of the helper, we do a bunch of cleanup + // and calculations on the layout history to make sure things are + // correct and we have rapid access to important values such as + // the layout versions to use when reading to ensure consistency. + + if !replication_mode.is_read_after_write_consistent() { + // Fast path for when no consistency is required. + // In this case we only need to keep the last version of the layout, + // we don't care about coordinating stuff in the cluster. + layout.keep_current_version_only(); + } + layout.cleanup_old_versions(); let all_nodes = layout.get_all_nodes(); @@ -68,7 +86,7 @@ impl LayoutHelper { .ack_map .min_among(&all_nodes, min_version); - // sync_map_min is the minimum value of sync_map among all storage nodes + // sync_map_min is the minimum value of sync_map among storage nodes // in the cluster (non-gateway nodes only, current and previous layouts). // It is the highest layout version for which we know that all relevant // storage nodes have fullfilled a sync, and therefore it is safe to @@ -76,11 +94,10 @@ impl LayoutHelper { // Gateway nodes are excluded here because they hold no relevant data // (they store the bucket and access key tables, but we don't have // consistency on those). - // TODO: this value could take quorums into account instead. - let sync_map_min = layout - .update_trackers - .sync_map - .min_among(&all_nongateway_nodes, min_version); + // This value is calculated using quorums to allow progress even + // if not all nodes have successfully completed a sync. + let sync_map_min = + layout.calculate_sync_map_min_with_quorum(replication_mode, &all_nongateway_nodes); let trackers_hash = layout.calculate_trackers_hash(); let staging_hash = layout.calculate_staging_hash(); @@ -91,6 +108,7 @@ impl LayoutHelper { .or_insert(AtomicUsize::new(0)); LayoutHelper { + replication_mode, layout: Some(layout), ack_map_min, sync_map_min, @@ -115,6 +133,7 @@ impl LayoutHelper { let changed = f(&mut self.layout.as_mut().unwrap()); if changed { *self = Self::new( + self.replication_mode, self.layout.take().unwrap(), std::mem::take(&mut self.ack_lock), ); diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index c448ac24..a53256cc 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -6,6 +6,7 @@ use garage_util::encode::nonversioned_encode; use garage_util::error::*; use super::*; +use crate::replication_mode::ReplicationMode; impl LayoutHistory { pub fn new(replication_factor: usize) -> Self { @@ -64,6 +65,13 @@ impl LayoutHistory { // ---- housekeeping (all invoked by LayoutHelper) ---- + pub(crate) fn keep_current_version_only(&mut self) { + while self.versions.len() > 1 { + let removed = self.versions.remove(0); + self.old_versions.push(removed); + } + } + pub(crate) fn cleanup_old_versions(&mut self) { // If there are invalid versions before valid versions, remove them if self.versions.len() > 1 && self.current().check().is_ok() { @@ -114,6 +122,99 @@ impl LayoutHistory { } } + pub(crate) fn calculate_sync_map_min_with_quorum( + &self, + replication_mode: ReplicationMode, + all_nongateway_nodes: &[Uuid], + ) -> u64 { + // This function calculates the minimum layout version from which + // it is safe to read if we want to maintain read-after-write consistency. + // In the general case the computation can be a bit expensive so + // we try to optimize it in several ways. + + // If there is only one layout version, we know that's the one + // we need to read from. + if self.versions.len() == 1 { + return self.current().version; + } + + let quorum = replication_mode.write_quorum(); + + let min_version = self.min_stored(); + let global_min = self + .update_trackers + .sync_map + .min_among(&all_nongateway_nodes, min_version); + + // If the write quorums are equal to the total number of nodes, + // i.e. no writes can succeed while they are not written to all nodes, + // then we must in all case wait for all nodes to complete a sync. + // This is represented by reading from the layout with version + // number global_min, the smallest layout version for which all nodes + // have completed a sync. + if quorum == self.current().replication_factor { + return global_min; + } + + // In the general case, we need to look at all write sets for all partitions, + // and find a safe layout version to read for that partition. We then + // take the minimum value among all partition as the safe layout version + // to read in all cases (the layout version to which all reads are directed). + let mut current_min = self.current().version; + let mut sets_done = HashSet::>::new(); + + for (_, p_hash) in self.current().partitions() { + for v in self.versions.iter() { + if v.version == self.current().version { + // We don't care about whether nodes in the latest layout version + // have completed a sync or not, as the sync is push-only + // and by definition nodes in the latest layout version do not + // hold data that must be pushed to nodes in the latest layout + // version, since that's the same version (any data that's + // already in the latest version is assumed to have been written + // by an operation that ensured a quorum of writes within + // that version). + continue; + } + + // Determine set of nodes for partition p in layout version v. + // Sort the node set to avoid duplicate computations. + let mut set = v + .nodes_of(&p_hash, v.replication_factor) + .collect::>(); + set.sort(); + + // If this set was already processed, skip it. + if sets_done.contains(&set) { + continue; + } + + // Find the value of the sync update trackers that is the + // highest possible minimum within a quorum of nodes. + let mut sync_values = set + .iter() + .map(|x| self.update_trackers.sync_map.get(x, min_version)) + .collect::>(); + sync_values.sort(); + let set_min = sync_values[sync_values.len() - quorum]; + if set_min < current_min { + current_min = set_min; + } + // defavorable case, we know we are at the smallest possible version, + // so we can stop early + assert!(current_min >= global_min); + if current_min == global_min { + return current_min; + } + + // Add set to already processed sets + sets_done.insert(set); + } + } + + current_min + } + pub(crate) fn calculate_trackers_hash(&self) -> Hash { blake2sum(&nonversioned_encode(&self.update_trackers).unwrap()[..]) } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index dc963ba0..ec8a2a15 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -14,12 +14,13 @@ use garage_util::error::*; use garage_util::persister::Persister; use super::*; +use crate::replication_mode::ReplicationMode; use crate::rpc_helper::*; use crate::system::*; pub struct LayoutManager { node_id: Uuid, - replication_factor: usize, + replication_mode: ReplicationMode, persist_cluster_layout: Persister, layout: Arc>, @@ -37,14 +38,16 @@ impl LayoutManager { node_id: NodeID, system_endpoint: Arc>, fullmesh: Arc, - replication_factor: usize, + replication_mode: ReplicationMode, ) -> Result, Error> { + let replication_factor = replication_mode.replication_factor(); + let persist_cluster_layout: Persister = Persister::new(&config.metadata_dir, "cluster_layout"); let cluster_layout = match persist_cluster_layout.load() { Ok(x) => { - if x.current().replication_factor != replication_factor { + if x.current().replication_factor != replication_mode.replication_factor() { return Err(Error::Message(format!( "Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", x.current().replication_factor, @@ -62,7 +65,8 @@ impl LayoutManager { } }; - let mut cluster_layout = LayoutHelper::new(cluster_layout, Default::default()); + let mut cluster_layout = + LayoutHelper::new(replication_mode, cluster_layout, Default::default()); cluster_layout.update_trackers(node_id.into()); let layout = Arc::new(RwLock::new(cluster_layout)); @@ -77,7 +81,7 @@ impl LayoutManager { Ok(Arc::new(Self { node_id: node_id.into(), - replication_factor, + replication_mode, persist_cluster_layout, layout, change_notify, @@ -291,11 +295,11 @@ impl LayoutManager { adv.update_trackers ); - if adv.current().replication_factor != self.replication_factor { + if adv.current().replication_factor != self.replication_mode.replication_factor() { let msg = format!( "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.", adv.current().replication_factor, - self.replication_factor + self.replication_mode.replication_factor() ); error!("{}", msg); return Err(Error::Message(msg)); diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 49e84420..df949906 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -411,13 +411,13 @@ impl UpdateTracker { pub(crate) fn min_among(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 { storage_nodes .iter() - .map(|x| self.0.get(x).copied().unwrap_or(min_version)) + .map(|x| self.get(x, min_version)) .min() .unwrap_or(min_version) } - pub fn get(&self, node: &Uuid) -> u64 { - self.0.get(node).copied().unwrap_or(0) + pub fn get(&self, node: &Uuid, min_version: u64) -> u64 { + self.0.get(node).copied().unwrap_or(min_version) } } diff --git a/src/rpc/replication_mode.rs b/src/rpc/replication_mode.rs index e244e063..2f7e2fec 100644 --- a/src/rpc/replication_mode.rs +++ b/src/rpc/replication_mode.rs @@ -54,4 +54,11 @@ impl ReplicationMode { Self::ThreeWayDangerous => 1, } } + + pub fn is_read_after_write_consistent(&self) -> bool { + match self { + Self::None | Self::TwoWay | Self::ThreeWay => true, + _ => false, + } + } } diff --git a/src/rpc/system.rs b/src/rpc/system.rs index be4aefa2..81a47ff3 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -280,7 +280,7 @@ impl System { netapp.id, system_endpoint.clone(), fullmesh.clone(), - replication_factor, + replication_mode, )?; // ---- set up metrics and status exchange ---- -- cgit v1.2.3 From 431b28e0cfdc9cac6c649193cf602108a8b02997 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 7 Dec 2023 15:15:59 +0100 Subject: fix build with discovery features --- src/rpc/system.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 81a47ff3..adfef6b6 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -514,7 +514,7 @@ impl System { if let Err(e) = c .publish_consul_service( self.netapp.id, - &self.local_status.load_full().hostname, + &self.local_status.load_full().hostname.as_deref().unwrap(), rpc_public_addr, ) .await @@ -541,7 +541,7 @@ impl System { if let Err(e) = publish_kubernetes_node( k, self.netapp.id, - &self.local_status.load_full().hostname, + &self.local_status.load_full().hostname.as_deref().unwrap(), rpc_public_addr, ) .await -- cgit v1.2.3 From 91b874c4efa40e64663368369a712e0a5a389e53 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 10:36:37 +0100 Subject: rpc: fix system::health --- src/rpc/system.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index adfef6b6..a8f12852 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -456,14 +456,14 @@ impl System { let mut partitions_quorum = 0; let mut partitions_all_ok = 0; for (_, hash) in partitions.iter() { - let write_sets = layout + let mut write_sets = layout .versions .iter() .map(|x| x.nodes_of(hash, x.replication_factor)); let has_quorum = write_sets .clone() .all(|set| set.filter(|x| node_up(x)).count() >= quorum); - let all_ok = write_sets.clone().all(|mut set| set.all(|x| node_up(&x))); + let all_ok = write_sets.all(|mut set| set.all(|x| node_up(&x))); if has_quorum { partitions_quorum += 1; } @@ -474,7 +474,7 @@ impl System { // Determine overall cluster status let status = - if partitions_quorum == partitions.len() && storage_nodes_ok == storage_nodes.len() { + if partitions_all_ok == partitions.len() && storage_nodes_ok == storage_nodes.len() { ClusterHealthStatus::Healthy } else if partitions_quorum == partitions.len() { ClusterHealthStatus::Degraded -- cgit v1.2.3 From 7f2541101f15614c79020b35d3d7dab767c32676 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 11:24:23 +0100 Subject: cli: improvements to the layout commands when multiple layouts are live --- src/garage/admin/mod.rs | 3 +-- src/garage/cli/cmd.rs | 4 +-- src/garage/cli/layout.rs | 67 +++++++++++++++++++++++++++++++----------------- src/garage/cli/util.rs | 4 ++- 4 files changed, 49 insertions(+), 29 deletions(-) (limited to 'src') diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index da4226cf..de7851e1 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -274,8 +274,7 @@ impl AdminRpcHandler { fn gather_cluster_stats(&self) -> String { let mut ret = String::new(); - // Gather storage node and free space statistics - // TODO: not only layout.current() ??? + // Gather storage node and free space statistics for current nodes let layout = &self.garage.system.cluster_layout(); let mut node_partition_count = HashMap::::new(); for short_id in layout.current().ring_assignment_data.iter() { diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 196c0cb3..fb6dface 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -179,7 +179,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> println!("Your cluster is expecting to drain data from nodes that are currently unavailable."); println!("If these nodes are definitely dead, please review the layout history with"); println!( - "`garage layout history` and use `garage layout assume-sync` to force progress." + "`garage layout history` and use `garage layout skip-dead-nodes` to force progress." ); } } @@ -274,6 +274,6 @@ pub async fn fetch_status( .await?? { SystemRpc::ReturnKnownNodes(nodes) => Ok(nodes), - resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), + resp => Err(Error::unexpected_rpc_message(resp)), } } diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index fac826f5..f76e33c5 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -354,35 +354,44 @@ pub async fn cmd_layout_history( )); } format_table(table); - - println!(); - println!("==== UPDATE TRACKERS ===="); - println!("This is the internal data that Garage stores to know which nodes have what data."); println!(); - let mut table = vec!["Node\tAck\tSync\tSync_ack".to_string()]; - let all_nodes = layout.get_all_nodes(); - for node in all_nodes.iter() { - table.push(format!( - "{:?}\t#{}\t#{}\t#{}", - node, - layout.update_trackers.ack_map.get(node, min_stored), - layout.update_trackers.sync_map.get(node, min_stored), - layout.update_trackers.sync_ack_map.get(node, min_stored), - )); - } - table[1..].sort(); - format_table(table); if layout.versions.len() > 1 { + println!("==== UPDATE TRACKERS ===="); + println!("Several layout versions are currently live in the version, and data is being migrated."); + println!( + "This is the internal data that Garage stores to know which nodes have what data." + ); + println!(); + let mut table = vec!["Node\tAck\tSync\tSync_ack".to_string()]; + let all_nodes = layout.get_all_nodes(); + for node in all_nodes.iter() { + table.push(format!( + "{:?}\t#{}\t#{}\t#{}", + node, + layout.update_trackers.ack_map.get(node, min_stored), + layout.update_trackers.sync_map.get(node, min_stored), + layout.update_trackers.sync_ack_map.get(node, min_stored), + )); + } + table[1..].sort(); + format_table(table); + println!(); println!( - "If some nodes are not catching up to the latest layout version in the update tracker," + "If some nodes are not catching up to the latest layout version in the update trackers," ); println!("it might be because they are offline or unable to complete a sync successfully."); println!( - "You may force progress using `garage layout assume-sync --version {}`", + "You may force progress using `garage layout skip-dead-nodes --version {}`", layout.current().version ); + } else { + println!("Your cluster is currently in a stable state with a single live layout version."); + println!("No metadata migration is in progress. Note that the migration of data blocks is not tracked,"); + println!( + "so you might want to keep old nodes online until their data directories become empty." + ); } Ok(()) @@ -415,6 +424,7 @@ pub async fn cmd_layout_skip_dead_nodes( } let all_nodes = layout.get_all_nodes(); + let mut did_something = false; for node in all_nodes.iter() { if status.iter().any(|x| x.id == *node && x.is_up) { continue; @@ -422,19 +432,28 @@ pub async fn cmd_layout_skip_dead_nodes( if layout.update_trackers.ack_map.set_max(*node, opt.version) { println!("Increased the ACK tracker for node {:?}", node); + did_something = true; } if opt.allow_missing_data { if layout.update_trackers.sync_map.set_max(*node, opt.version) { println!("Increased the SYNC tracker for node {:?}", node); + did_something = true; } } } - send_layout(rpc_cli, rpc_host, layout).await?; - println!("Success."); - - Ok(()) + if did_something { + send_layout(rpc_cli, rpc_host, layout).await?; + println!("Success."); + Ok(()) + } else if !opt.allow_missing_data { + Err(Error::Message("Nothing was done, try passing the `--allow-missing-data` flag to force progress even when not enough nodes can complete a metadata sync.".into())) + } else { + Err(Error::Message( + "Sorry, there is nothing I can do for you. Please wait patiently. If you ask for help, please send the output of the `garage layout history` command.".into(), + )) + } } // --- utility --- @@ -448,7 +467,7 @@ pub async fn fetch_layout( .await?? { SystemRpc::AdvertiseClusterLayout(t) => Ok(t), - resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), + resp => Err(Error::unexpected_rpc_message(resp)), } } diff --git a/src/garage/cli/util.rs b/src/garage/cli/util.rs index 2232d395..0511e2b1 100644 --- a/src/garage/cli/util.rs +++ b/src/garage/cli/util.rs @@ -450,6 +450,8 @@ pub fn print_block_info( if refcount != nondeleted_count { println!(); - println!("Warning: refcount does not match number of non-deleted versions"); + println!( + "Warning: refcount does not match number of non-deleted versions (see issue #644)." + ); } } -- cgit v1.2.3 From 063294dd569e10c6d85e29eb6507249eece00956 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 11:50:58 +0100 Subject: layout version: refactor get_node_zone --- src/rpc/layout/test.rs | 4 ++-- src/rpc/layout/version.rs | 47 +++++++++++++++++++++++------------------------ 2 files changed, 25 insertions(+), 26 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/test.rs b/src/rpc/layout/test.rs index bb072c97..88eb518e 100644 --- a/src/rpc/layout/test.rs +++ b/src/rpc/layout/test.rs @@ -34,8 +34,8 @@ fn check_against_naive(cl: &LayoutVersion) -> Result { zone_token.insert(z.clone(), 0); } for uuid in cl.nongateway_nodes() { - let z = cl.get_node_zone(&uuid)?; - let c = cl.get_node_capacity(&uuid).unwrap(); + let z = cl.expect_get_node_zone(&uuid); + let c = cl.expect_get_node_capacity(&uuid); zone_token.insert( z.to_string(), zone_token[z] + min(NB_PARTITIONS, (c / over_size) as usize), diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 947fab56..cbfbee94 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -70,6 +70,14 @@ impl LayoutVersion { } } + /// Given a node uuids, this function returns the label of its zone if it has one + pub fn get_node_zone(&self, uuid: &Uuid) -> Option<&str> { + match self.node_role(uuid) { + Some(role) => Some(&role.zone), + _ => None, + } + } + /// Returns the number of partitions associated to this node in the ring pub fn get_node_usage(&self, uuid: &Uuid) -> Result { for (i, id) in self.node_id_vec.iter().enumerate() { @@ -129,28 +137,22 @@ impl LayoutVersion { // ===================== internal information extractors ====================== - /// Given a node uuids, this function returns the label of its zone - pub(crate) fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { - match self.node_role(uuid) { - Some(role) => Ok(&role.zone), - _ => Err(Error::Message( - "The Uuid does not correspond to a node present in the cluster.".into(), - )), - } - } - - fn expect_get_node_capacity(&self, uuid: &Uuid) -> u64 { + pub(crate) fn expect_get_node_capacity(&self, uuid: &Uuid) -> u64 { self.get_node_capacity(&uuid) .expect("non-gateway node with zero capacity") } + pub(crate) fn expect_get_node_zone(&self, uuid: &Uuid) -> &str { + self.get_node_zone(&uuid).expect("node without a zone") + } + /// Returns the sum of capacities of non gateway nodes in the cluster - fn get_total_capacity(&self) -> Result { + fn get_total_capacity(&self) -> u64 { let mut total_capacity = 0; for uuid in self.nongateway_nodes() { total_capacity += self.expect_get_node_capacity(&uuid); } - Ok(total_capacity) + total_capacity } /// Returns the effective value of the zone_redundancy parameter @@ -227,10 +229,7 @@ impl LayoutVersion { // Check that every partition is spread over at least zone_redundancy zones. let zones_of_p = nodes_of_p .iter() - .map(|n| { - self.get_node_zone(&self.node_id_vec[*n as usize]) - .expect("Zone not found.") - }) + .map(|n| self.expect_get_node_zone(&self.node_id_vec[*n as usize])) .collect::>(); if zones_of_p.iter().unique().count() < zone_redundancy { return Err(format!( @@ -516,7 +515,7 @@ impl LayoutVersion { } let mut s_down = 1; - let mut s_up = self.get_total_capacity()?; + let mut s_up = self.get_total_capacity(); while s_down + 1 < s_up { g = self.generate_flow_graph( (s_down + s_up) / 2, @@ -586,7 +585,7 @@ impl LayoutVersion { } for n in 0..self.nongateway_nodes().len() { let node_capacity = self.expect_get_node_capacity(&self.node_id_vec[n]); - let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; + let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[n])]; g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; for p in 0..NB_PARTITIONS { if !exclude_assoc.contains(&(p, n)) { @@ -632,7 +631,7 @@ impl LayoutVersion { // The algorithm is such that it will start with the flow that we just computed // and find ameliorating paths from that. for (p, n) in exclude_edge.iter() { - let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; + let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])]; g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; } g.compute_maximal_flow()?; @@ -652,7 +651,7 @@ impl LayoutVersion { let mut cost = CostFunction::new(); for (p, assoc_p) in prev_assign.iter().enumerate() { for n in assoc_p.iter() { - let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; + let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])]; cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); } } @@ -707,7 +706,7 @@ impl LayoutVersion { let mut msg = Message::new(); let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; - let total_cap = self.get_total_capacity()?; + let total_cap = self.get_total_capacity(); let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); msg.push(format!( "Usable capacity / total cluster capacity: {} / {} ({:.1} %)", @@ -754,7 +753,7 @@ impl LayoutVersion { let mut old_zones_of_p = Vec::::new(); for n in prev_assign[p].iter() { old_zones_of_p - .push(zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]); + .push(zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])]); } if !old_zones_of_p.contains(&z) { new_partitions_zone[z] += 1; @@ -796,7 +795,7 @@ impl LayoutVersion { for z in 0..id_to_zone.len() { let mut nodes_of_z = Vec::::new(); for n in 0..storing_nodes.len() { - if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { + if self.expect_get_node_zone(&self.node_id_vec[n]) == id_to_zone[z] { nodes_of_z.push(n); } } -- cgit v1.2.3 From 5dd200c015aed786173f0e11541b0505f95dd6d1 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 12:02:24 +0100 Subject: layout: move block_read_nodes_of to rpc_helper to avoid double-locking (in theory, this could have caused a deadlock) --- src/block/manager.rs | 2 +- src/rpc/layout/helper.rs | 27 ++--------- src/rpc/rpc_helper.rs | 121 +++++++++++++++++++++++++++++------------------ 3 files changed, 80 insertions(+), 70 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index 47111160..bfd390ee 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -266,7 +266,7 @@ impl BlockManager { { let who = self .system - .cluster_layout() + .rpc_helper() .block_read_nodes_of(hash, self.system.rpc_helper()); for node in who.iter() { diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index eeaf4ffa..147c8b4f 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -8,7 +8,6 @@ use garage_util::data::*; use super::schema::*; use crate::replication_mode::ReplicationMode; -use crate::rpc_helper::RpcHelper; #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] pub struct LayoutDigest { @@ -155,6 +154,10 @@ impl LayoutHelper { self.ack_map_min } + pub fn all_sync(&self) -> u64 { + self.sync_map_min + } + pub fn sync_versions(&self) -> (u64, u64, u64) { ( self.layout().current().version, @@ -177,28 +180,6 @@ impl LayoutHelper { .collect() } - pub fn block_read_nodes_of(&self, position: &Hash, rpc_helper: &RpcHelper) -> Vec { - let mut ret = Vec::with_capacity(12); - let ver_iter = self - .layout() - .versions - .iter() - .rev() - .chain(self.layout().old_versions.iter().rev()); - for ver in ver_iter { - if ver.version > self.sync_map_min { - continue; - } - let nodes = ver.nodes_of(position, ver.replication_factor); - for node in rpc_helper.request_order(nodes) { - if !ret.contains(&node) { - ret.push(node); - } - } - } - ret - } - pub fn storage_sets_of(&self, position: &Hash) -> Vec> { self.layout() .versions diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index c6dcbe75..7e1387ed 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -26,7 +26,7 @@ use garage_util::data::*; use garage_util::error::Error; use garage_util::metrics::RecordDuration; -use crate::layout::LayoutHelper; +use crate::layout::{LayoutHelper, LayoutHistory}; use crate::metrics::RpcMetrics; // Default RPC timeout = 5 minutes @@ -304,7 +304,7 @@ impl RpcHelper { // preemptively send an additional request to any remaining nodes. // Reorder requests to priorize closeness / low latency - let request_order = self.request_order(to.iter().copied()); + let request_order = self.request_order(&self.0.layout.read().unwrap(), to.iter().copied()); let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false); // Build future for each request @@ -368,50 +368,6 @@ impl RpcHelper { } } - pub fn request_order(&self, nodes: impl Iterator) -> Vec { - // Retrieve some status variables that we will use to sort requests - let peer_list = self.0.fullmesh.get_peer_list(); - let layout = self.0.layout.read().unwrap(); - let our_zone = match layout.current().node_role(&self.0.our_node_id) { - Some(pc) => &pc.zone, - None => "", - }; - - // Augment requests with some information used to sort them. - // The tuples are as follows: - // (is another node?, is another zone?, latency, node ID, request future) - // We store all of these tuples in a vec that we can sort. - // By sorting this vec, we priorize ourself, then nodes in the same zone, - // and within a same zone we priorize nodes with the lowest latency. - let mut nodes = nodes - .map(|to| { - let peer_zone = match layout.current().node_role(&to) { - Some(pc) => &pc.zone, - None => "", - }; - let peer_avg_ping = peer_list - .iter() - .find(|x| x.id.as_ref() == to.as_slice()) - .and_then(|pi| pi.avg_ping) - .unwrap_or_else(|| Duration::from_secs(10)); - ( - to != self.0.our_node_id, - peer_zone != our_zone, - peer_avg_ping, - to, - ) - }) - .collect::>(); - - // Sort requests by (priorize ourself, priorize same zone, priorize low latency) - nodes.sort_by_key(|(diffnode, diffzone, ping, _to)| (*diffnode, *diffzone, *ping)); - - nodes - .into_iter() - .map(|(_, _, _, to)| to) - .collect::>() - } - /// Make a RPC call to multiple servers, returning either a Vec of responses, /// or an error if quorum could not be reached due to too many errors /// @@ -533,6 +489,79 @@ impl RpcHelper { // Failure, could not get quorum Err(result_tracker.quorum_error()) } + + // ---- functions not related to MAKING RPCs, but just determining to what nodes + // they should be made and in which order ---- + + pub fn block_read_nodes_of(&self, position: &Hash, rpc_helper: &RpcHelper) -> Vec { + let layout = self.0.layout.read().unwrap(); + + let mut ret = Vec::with_capacity(12); + let ver_iter = layout + .versions + .iter() + .rev() + .chain(layout.old_versions.iter().rev()); + for ver in ver_iter { + if ver.version > layout.all_sync() { + continue; + } + let nodes = ver.nodes_of(position, ver.replication_factor); + for node in rpc_helper.request_order(&layout, nodes) { + if !ret.contains(&node) { + ret.push(node); + } + } + } + ret + } + + fn request_order( + &self, + layout: &LayoutHistory, + nodes: impl Iterator, + ) -> Vec { + // Retrieve some status variables that we will use to sort requests + let peer_list = self.0.fullmesh.get_peer_list(); + let our_zone = match layout.current().node_role(&self.0.our_node_id) { + Some(pc) => &pc.zone, + None => "", + }; + + // Augment requests with some information used to sort them. + // The tuples are as follows: + // (is another node?, is another zone?, latency, node ID, request future) + // We store all of these tuples in a vec that we can sort. + // By sorting this vec, we priorize ourself, then nodes in the same zone, + // and within a same zone we priorize nodes with the lowest latency. + let mut nodes = nodes + .map(|to| { + let peer_zone = match layout.current().node_role(&to) { + Some(pc) => &pc.zone, + None => "", + }; + let peer_avg_ping = peer_list + .iter() + .find(|x| x.id.as_ref() == to.as_slice()) + .and_then(|pi| pi.avg_ping) + .unwrap_or_else(|| Duration::from_secs(10)); + ( + to != self.0.our_node_id, + peer_zone != our_zone, + peer_avg_ping, + to, + ) + }) + .collect::>(); + + // Sort requests by (priorize ourself, priorize same zone, priorize low latency) + nodes.sort_by_key(|(diffnode, diffzone, ping, _to)| (*diffnode, *diffzone, *ping)); + + nodes + .into_iter() + .map(|(_, _, _, to)| to) + .collect::>() + } } // ------- utility for tracking successes/errors among write sets -------- -- cgit v1.2.3 From 64a6e557a4ff6aa1ad833a1b25ef8c85cf9ee3f3 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 12:18:12 +0100 Subject: rpc helper: small refactorings --- src/rpc/rpc_helper.rs | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'src') diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 7e1387ed..65af8901 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -436,13 +436,12 @@ impl RpcHelper { H: StreamingEndpointHandler + 'static, S: Send + 'static, { - let msg = msg.into_req().map_err(netapp::error::Error::from)?; - // Peers may appear in many quorum sets. Here, build a list of peers, // mapping to the index of the quorum sets in which they appear. let mut result_tracker = QuorumSetResultTracker::new(to_sets, quorum); // Send one request to each peer of the quorum sets + let msg = msg.into_req().map_err(netapp::error::Error::from)?; let requests = result_tracker.nodes.iter().map(|(peer, _)| { let self2 = self.clone(); let msg = msg.clone(); @@ -523,10 +522,10 @@ impl RpcHelper { ) -> Vec { // Retrieve some status variables that we will use to sort requests let peer_list = self.0.fullmesh.get_peer_list(); - let our_zone = match layout.current().node_role(&self.0.our_node_id) { - Some(pc) => &pc.zone, - None => "", - }; + let our_zone = layout + .current() + .get_node_zone(&self.0.our_node_id) + .unwrap_or(""); // Augment requests with some information used to sort them. // The tuples are as follows: @@ -536,10 +535,7 @@ impl RpcHelper { // and within a same zone we priorize nodes with the lowest latency. let mut nodes = nodes .map(|to| { - let peer_zone = match layout.current().node_role(&to) { - Some(pc) => &pc.zone, - None => "", - }; + let peer_zone = layout.current().get_node_zone(&to).unwrap_or(""); let peer_avg_ping = peer_list .iter() .find(|x| x.id.as_ref() == to.as_slice()) @@ -567,21 +563,28 @@ impl RpcHelper { // ------- utility for tracking successes/errors among write sets -------- pub struct QuorumSetResultTracker { - // The set of nodes and the quorum sets they belong to + /// The set of nodes and the index of the quorum sets they belong to pub nodes: HashMap>, + /// The quorum value, i.e. number of success responses to await in each set pub quorum: usize, - // The success and error responses received + /// The success responses received pub successes: Vec<(Uuid, S)>, + /// The error responses received pub failures: Vec<(Uuid, E)>, - // The counters for successes and failures in each set + /// The counters for successes in each set pub success_counters: Box<[usize]>, + /// The counters for failures in each set pub failure_counters: Box<[usize]>, + /// The total number of nodes in each set pub set_lens: Box<[usize]>, } -impl QuorumSetResultTracker { +impl QuorumSetResultTracker +where + E: std::fmt::Display, +{ pub fn new(sets: &[A], quorum: usize) -> Self where A: AsRef<[Uuid]>, -- cgit v1.2.3 From 4dbf254512327ef4e7abbd5525b89bfa5b7ecb6f Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 14:15:52 +0100 Subject: layout: refactoring, merge two files --- src/rpc/layout/helper.rs | 2 +- src/rpc/layout/mod.rs | 441 +++++++++++++++++++++++++++++++++++++++++++++- src/rpc/layout/schema.rs | 431 -------------------------------------------- src/rpc/layout/version.rs | 1 - 4 files changed, 440 insertions(+), 435 deletions(-) delete mode 100644 src/rpc/layout/schema.rs (limited to 'src') diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 147c8b4f..2ba010b8 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize}; use garage_util::data::*; -use super::schema::*; +use super::*; use crate::replication_mode::ReplicationMode; #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index eb127fda..facdb2ce 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -1,7 +1,13 @@ +use std::fmt; + +use bytesize::ByteSize; + +use garage_util::crdt::{AutoCrdt, Crdt}; +use garage_util::data::Uuid; + mod graph_algo; mod helper; mod history; -mod schema; mod version; #[cfg(test)] @@ -13,7 +19,6 @@ pub mod manager; pub use helper::{LayoutDigest, LayoutHelper}; pub use manager::WriteLock; -pub use schema::*; pub use version::*; // ---- defines: partitions ---- @@ -39,3 +44,435 @@ const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; // Change this to u16 the day we want to have more than 256 nodes in a cluster pub type CompactNodeType = u8; pub const MAX_NODE_NUMBER: usize = 256; + +// ======== actual data structures for the layout data ======== +// ======== that is persisted to disk ======== +// some small utility impls are at the end of this file, +// but most of the code that actually computes stuff is in +// version.rs, history.rs and helper.rs + +mod v08 { + use crate::layout::CompactNodeType; + use garage_util::crdt::LwwMap; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + + /// The layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct ClusterLayout { + pub version: u64, + + pub replication_factor: usize, + pub roles: LwwMap, + + // see comments in v010::ClusterLayout + pub node_id_vec: Vec, + #[serde(with = "serde_bytes")] + pub ring_assignation_data: Vec, + + /// Role changes which are staged for the next version of the layout + pub staging: LwwMap, + pub staging_hash: Hash, + } + + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct NodeRoleV(pub Option); + + /// The user-assigned roles of cluster nodes + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct NodeRole { + /// Datacenter at which this entry belong. This information is used to + /// perform a better geodistribution + pub zone: String, + /// The capacity of the node + /// If this is set to None, the node does not participate in storing data for the system + /// and is only active as an API gateway to other nodes + pub capacity: Option, + /// A set of tags to recognize the node + pub tags: Vec, + } + + impl garage_util::migrate::InitialFormat for ClusterLayout {} +} + +mod v09 { + use super::v08; + use crate::layout::CompactNodeType; + use garage_util::crdt::{Lww, LwwMap}; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + pub use v08::{NodeRole, NodeRoleV}; + + /// The layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct ClusterLayout { + pub version: u64, + + pub replication_factor: usize, + + /// This attribute is only used to retain the previously computed partition size, + /// to know to what extent does it change with the layout update. + pub partition_size: u64, + /// Parameters used to compute the assignment currently given by + /// ring_assignment_data + pub parameters: LayoutParameters, + + pub roles: LwwMap, + + // see comments in v010::ClusterLayout + pub node_id_vec: Vec, + #[serde(with = "serde_bytes")] + pub ring_assignment_data: Vec, + + /// Parameters to be used in the next partition assignment computation. + pub staging_parameters: Lww, + /// Role changes which are staged for the next version of the layout + pub staging_roles: LwwMap, + pub staging_hash: Hash, + } + + /// This struct is used to set the parameters to be used in the assignment computation + /// algorithm. It is stored as a Crdt. + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub struct LayoutParameters { + pub zone_redundancy: ZoneRedundancy, + } + + /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies + /// of each partition on at least that number of different zones. + /// Otherwise, copies will be stored on the maximum possible number of zones. + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub enum ZoneRedundancy { + AtLeast(usize), + Maximum, + } + + impl garage_util::migrate::Migrate for ClusterLayout { + const VERSION_MARKER: &'static [u8] = b"G09layout"; + + type Previous = v08::ClusterLayout; + + fn migrate(previous: Self::Previous) -> Self { + use itertools::Itertools; + + // In the old layout, capacities are in an arbitrary unit, + // but in the new layout they are in bytes. + // Here we arbitrarily multiply everything by 1G, + // such that 1 old capacity unit = 1GB in the new units. + // This is totally arbitrary and won't work for most users. + let cap_mul = 1024 * 1024 * 1024; + let roles = multiply_all_capacities(previous.roles, cap_mul); + let staging_roles = multiply_all_capacities(previous.staging, cap_mul); + let node_id_vec = previous.node_id_vec; + + // Determine partition size + let mut tmp = previous.ring_assignation_data.clone(); + tmp.sort(); + let partition_size = tmp + .into_iter() + .dedup_with_count() + .map(|(npart, node)| { + roles + .get(&node_id_vec[node as usize]) + .and_then(|p| p.0.as_ref().and_then(|r| r.capacity)) + .unwrap_or(0) / npart as u64 + }) + .min() + .unwrap_or(0); + + // By default, zone_redundancy is maximum possible value + let parameters = LayoutParameters { + zone_redundancy: ZoneRedundancy::Maximum, + }; + + Self { + version: previous.version, + replication_factor: previous.replication_factor, + partition_size, + parameters, + roles, + node_id_vec, + ring_assignment_data: previous.ring_assignation_data, + staging_parameters: Lww::new(parameters), + staging_roles, + staging_hash: [0u8; 32].into(), // will be set in the next migration + } + } + } + + fn multiply_all_capacities( + old_roles: LwwMap, + mul: u64, + ) -> LwwMap { + let mut new_roles = LwwMap::new(); + for (node, ts, role) in old_roles.items() { + let mut role = role.clone(); + if let NodeRoleV(Some(NodeRole { + capacity: Some(ref mut cap), + .. + })) = role + { + *cap *= mul; + } + new_roles.merge_raw(node, *ts, &role); + } + new_roles + } +} + +mod v010 { + use super::v09; + use crate::layout::CompactNodeType; + use garage_util::crdt::{Lww, LwwMap}; + use garage_util::data::Uuid; + use serde::{Deserialize, Serialize}; + use std::collections::BTreeMap; + pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; + + /// Number of old (non-live) versions to keep, see LayoutHistory::old_versions + pub const OLD_VERSION_COUNT: usize = 5; + + /// The history of cluster layouts, with trackers to keep a record + /// of which nodes are up-to-date to current cluster data + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] + pub struct LayoutHistory { + /// The versions currently in use in the cluster + pub versions: Vec, + /// At most 5 of the previous versions, not used by the garage_table + /// module, but usefull for the garage_block module to find data blocks + /// that have not yet been moved + pub old_versions: Vec, + + /// Update trackers + pub update_trackers: UpdateTrackers, + + /// Staged changes for the next version + pub staging: Lww, + } + + /// A version of the layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] + pub struct LayoutVersion { + /// The number of this version + pub version: u64, + + /// Roles assigned to nodes in this version + pub roles: LwwMap, + /// Parameters used to compute the assignment currently given by + /// ring_assignment_data + pub parameters: LayoutParameters, + + /// The number of replicas for each data partition + pub replication_factor: usize, + /// This attribute is only used to retain the previously computed partition size, + /// to know to what extent does it change with the layout update. + pub partition_size: u64, + + /// node_id_vec: a vector of node IDs with a role assigned + /// in the system (this includes gateway nodes). + /// The order here is different than the vec stored by `roles`, because: + /// 1. non-gateway nodes are first so that they have lower numbers + /// 2. nodes that don't have a role are excluded (but they need to + /// stay in the CRDT as tombstones) + pub node_id_vec: Vec, + /// number of non-gateway nodes, which are the first ids in node_id_vec + pub nongateway_node_count: usize, + /// The assignation of data partitions to nodes, the values + /// are indices in node_id_vec + #[serde(with = "serde_bytes")] + pub ring_assignment_data: Vec, + } + + /// The staged changes for the next layout version + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] + pub struct LayoutStaging { + /// Parameters to be used in the next partition assignment computation. + pub parameters: Lww, + /// Role changes which are staged for the next version of the layout + pub roles: LwwMap, + } + + /// The tracker of acknowlegments and data syncs around the cluster + #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] + pub struct UpdateTrackers { + /// The highest layout version number each node has ack'ed + pub ack_map: UpdateTracker, + /// The highest layout version number each node has synced data for + pub sync_map: UpdateTracker, + /// The highest layout version number each node has + /// ack'ed that all other nodes have synced data for + pub sync_ack_map: UpdateTracker, + } + + /// Generic update tracker struct + #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] + pub struct UpdateTracker(pub BTreeMap); + + impl garage_util::migrate::Migrate for LayoutHistory { + const VERSION_MARKER: &'static [u8] = b"G010lh"; + + type Previous = v09::ClusterLayout; + + fn migrate(previous: Self::Previous) -> Self { + let nongateway_node_count = previous + .node_id_vec + .iter() + .enumerate() + .filter(|(_, uuid)| { + let role = previous.roles.get(uuid); + matches!(role, Some(NodeRoleV(Some(role))) if role.capacity.is_some()) + }) + .map(|(i, _)| i + 1) + .max() + .unwrap_or(0); + + let version = LayoutVersion { + version: previous.version, + replication_factor: previous.replication_factor, + partition_size: previous.partition_size, + parameters: previous.parameters, + roles: previous.roles, + node_id_vec: previous.node_id_vec, + nongateway_node_count, + ring_assignment_data: previous.ring_assignment_data, + }; + let update_tracker = UpdateTracker( + version + .nongateway_nodes() + .iter() + .copied() + .map(|x| (x, version.version)) + .collect::>(), + ); + let staging = LayoutStaging { + parameters: previous.staging_parameters, + roles: previous.staging_roles, + }; + Self { + versions: vec![version], + old_versions: vec![], + update_trackers: UpdateTrackers { + ack_map: update_tracker.clone(), + sync_map: update_tracker.clone(), + sync_ack_map: update_tracker.clone(), + }, + staging: Lww::raw(previous.version, staging), + } + } + } +} + +pub use v010::*; + +// ---- utility functions ---- + +impl AutoCrdt for LayoutParameters { + const WARN_IF_DIFFERENT: bool = true; +} + +impl AutoCrdt for NodeRoleV { + const WARN_IF_DIFFERENT: bool = true; +} + +impl Crdt for LayoutStaging { + fn merge(&mut self, other: &LayoutStaging) { + self.parameters.merge(&other.parameters); + self.roles.merge(&other.roles); + } +} + +impl NodeRole { + pub fn capacity_string(&self) -> String { + match self.capacity { + Some(c) => ByteSize::b(c).to_string_as(false), + None => "gateway".to_string(), + } + } + + pub fn tags_string(&self) -> String { + self.tags.join(",") + } +} + +impl fmt::Display for ZoneRedundancy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ZoneRedundancy::Maximum => write!(f, "maximum"), + ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), + } + } +} + +impl core::str::FromStr for ZoneRedundancy { + type Err = &'static str; + fn from_str(s: &str) -> Result { + match s { + "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), + x => { + let v = x + .parse::() + .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; + Ok(ZoneRedundancy::AtLeast(v)) + } + } + } +} + +impl UpdateTracker { + fn merge(&mut self, other: &UpdateTracker) -> bool { + let mut changed = false; + for (k, v) in other.0.iter() { + if let Some(v_mut) = self.0.get_mut(k) { + if *v > *v_mut { + *v_mut = *v; + changed = true; + } + } else { + self.0.insert(*k, *v); + changed = true; + } + } + changed + } + + /// This bumps the update tracker for a given node up to the specified value. + /// This has potential impacts on the correctness of Garage and should only + /// be used in very specific circumstances. + pub fn set_max(&mut self, peer: Uuid, value: u64) -> bool { + match self.0.get_mut(&peer) { + Some(e) if *e < value => { + *e = value; + true + } + None => { + self.0.insert(peer, value); + true + } + _ => false, + } + } + + pub(crate) fn min_among(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 { + storage_nodes + .iter() + .map(|x| self.get(x, min_version)) + .min() + .unwrap_or(min_version) + } + + pub fn get(&self, node: &Uuid, min_version: u64) -> u64 { + self.0.get(node).copied().unwrap_or(min_version) + } +} + +impl UpdateTrackers { + pub(crate) fn merge(&mut self, other: &UpdateTrackers) -> bool { + let c1 = self.ack_map.merge(&other.ack_map); + let c2 = self.sync_map.merge(&other.sync_map); + let c3 = self.sync_ack_map.merge(&other.sync_ack_map); + c1 || c2 || c3 + } +} diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs deleted file mode 100644 index df949906..00000000 --- a/src/rpc/layout/schema.rs +++ /dev/null @@ -1,431 +0,0 @@ -use std::fmt; - -use bytesize::ByteSize; - -use garage_util::crdt::{AutoCrdt, Crdt}; -use garage_util::data::Uuid; - -mod v08 { - use crate::layout::CompactNodeType; - use garage_util::crdt::LwwMap; - use garage_util::data::{Hash, Uuid}; - use serde::{Deserialize, Serialize}; - - /// The layout of the cluster, i.e. the list of roles - /// which are assigned to each cluster node - #[derive(Clone, Debug, Serialize, Deserialize)] - pub struct ClusterLayout { - pub version: u64, - - pub replication_factor: usize, - pub roles: LwwMap, - - /// node_id_vec: a vector of node IDs with a role assigned - /// in the system (this includes gateway nodes). - /// The order here is different than the vec stored by `roles`, because: - /// 1. non-gateway nodes are first so that they have lower numbers - /// 2. nodes that don't have a role are excluded (but they need to - /// stay in the CRDT as tombstones) - pub node_id_vec: Vec, - /// the assignation of data partitions to node, the values - /// are indices in node_id_vec - #[serde(with = "serde_bytes")] - pub ring_assignation_data: Vec, - - /// Role changes which are staged for the next version of the layout - pub staging: LwwMap, - pub staging_hash: Hash, - } - - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] - pub struct NodeRoleV(pub Option); - - /// The user-assigned roles of cluster nodes - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] - pub struct NodeRole { - /// Datacenter at which this entry belong. This information is used to - /// perform a better geodistribution - pub zone: String, - /// The capacity of the node - /// If this is set to None, the node does not participate in storing data for the system - /// and is only active as an API gateway to other nodes - pub capacity: Option, - /// A set of tags to recognize the node - pub tags: Vec, - } - - impl garage_util::migrate::InitialFormat for ClusterLayout {} -} - -mod v09 { - use super::v08; - use crate::layout::CompactNodeType; - use garage_util::crdt::{Lww, LwwMap}; - use garage_util::data::{Hash, Uuid}; - use serde::{Deserialize, Serialize}; - pub use v08::{NodeRole, NodeRoleV}; - - /// The layout of the cluster, i.e. the list of roles - /// which are assigned to each cluster node - #[derive(Clone, Debug, Serialize, Deserialize)] - pub struct ClusterLayout { - pub version: u64, - - pub replication_factor: usize, - - /// This attribute is only used to retain the previously computed partition size, - /// to know to what extent does it change with the layout update. - pub partition_size: u64, - /// Parameters used to compute the assignment currently given by - /// ring_assignment_data - pub parameters: LayoutParameters, - - pub roles: LwwMap, - - /// see comment in v08::ClusterLayout - pub node_id_vec: Vec, - /// see comment in v08::ClusterLayout - #[serde(with = "serde_bytes")] - pub ring_assignment_data: Vec, - - /// Parameters to be used in the next partition assignment computation. - pub staging_parameters: Lww, - /// Role changes which are staged for the next version of the layout - pub staging_roles: LwwMap, - pub staging_hash: Hash, - } - - /// This struct is used to set the parameters to be used in the assignment computation - /// algorithm. It is stored as a Crdt. - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] - pub struct LayoutParameters { - pub zone_redundancy: ZoneRedundancy, - } - - /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies - /// of each partition on at least that number of different zones. - /// Otherwise, copies will be stored on the maximum possible number of zones. - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] - pub enum ZoneRedundancy { - AtLeast(usize), - Maximum, - } - - impl garage_util::migrate::Migrate for ClusterLayout { - const VERSION_MARKER: &'static [u8] = b"G09layout"; - - type Previous = v08::ClusterLayout; - - fn migrate(previous: Self::Previous) -> Self { - use itertools::Itertools; - - // In the old layout, capacities are in an arbitrary unit, - // but in the new layout they are in bytes. - // Here we arbitrarily multiply everything by 1G, - // such that 1 old capacity unit = 1GB in the new units. - // This is totally arbitrary and won't work for most users. - let cap_mul = 1024 * 1024 * 1024; - let roles = multiply_all_capacities(previous.roles, cap_mul); - let staging_roles = multiply_all_capacities(previous.staging, cap_mul); - let node_id_vec = previous.node_id_vec; - - // Determine partition size - let mut tmp = previous.ring_assignation_data.clone(); - tmp.sort(); - let partition_size = tmp - .into_iter() - .dedup_with_count() - .map(|(npart, node)| { - roles - .get(&node_id_vec[node as usize]) - .and_then(|p| p.0.as_ref().and_then(|r| r.capacity)) - .unwrap_or(0) / npart as u64 - }) - .min() - .unwrap_or(0); - - // By default, zone_redundancy is maximum possible value - let parameters = LayoutParameters { - zone_redundancy: ZoneRedundancy::Maximum, - }; - - Self { - version: previous.version, - replication_factor: previous.replication_factor, - partition_size, - parameters, - roles, - node_id_vec, - ring_assignment_data: previous.ring_assignation_data, - staging_parameters: Lww::new(parameters), - staging_roles, - staging_hash: [0u8; 32].into(), // will be set in the next migration - } - } - } - - fn multiply_all_capacities( - old_roles: LwwMap, - mul: u64, - ) -> LwwMap { - let mut new_roles = LwwMap::new(); - for (node, ts, role) in old_roles.items() { - let mut role = role.clone(); - if let NodeRoleV(Some(NodeRole { - capacity: Some(ref mut cap), - .. - })) = role - { - *cap *= mul; - } - new_roles.merge_raw(node, *ts, &role); - } - new_roles - } -} - -mod v010 { - use super::v09; - use crate::layout::CompactNodeType; - use garage_util::crdt::{Lww, LwwMap}; - use garage_util::data::Uuid; - use serde::{Deserialize, Serialize}; - use std::collections::BTreeMap; - pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; - - pub const OLD_VERSION_COUNT: usize = 5; - - /// The history of cluster layouts, with trackers to keep a record - /// of which nodes are up-to-date to current cluster data - #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] - pub struct LayoutHistory { - /// The versions currently in use in the cluster - pub versions: Vec, - /// At most 5 of the previous versions, not used by the garage_table - /// module, but usefull for the garage_block module to find data blocks - /// that have not yet been moved - pub old_versions: Vec, - - /// Update trackers - pub update_trackers: UpdateTrackers, - - /// Staged changes for the next version - pub staging: Lww, - } - - /// A version of the layout of the cluster, i.e. the list of roles - /// which are assigned to each cluster node - #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] - pub struct LayoutVersion { - pub version: u64, - - pub replication_factor: usize, - - /// This attribute is only used to retain the previously computed partition size, - /// to know to what extent does it change with the layout update. - pub partition_size: u64, - /// Parameters used to compute the assignment currently given by - /// ring_assignment_data - pub parameters: LayoutParameters, - - pub roles: LwwMap, - - /// see comment in v08::ClusterLayout - pub node_id_vec: Vec, - /// number of non-gateway nodes, which are the first ids in node_id_vec - pub nongateway_node_count: usize, - /// see comment in v08::ClusterLayout - #[serde(with = "serde_bytes")] - pub ring_assignment_data: Vec, - } - - /// The staged changes for the next layout version - #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] - pub struct LayoutStaging { - /// Parameters to be used in the next partition assignment computation. - pub parameters: Lww, - /// Role changes which are staged for the next version of the layout - pub roles: LwwMap, - } - - /// The tracker of acknowlegments and data syncs around the cluster - #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] - pub struct UpdateTrackers { - /// The highest layout version number each node has ack'ed - pub ack_map: UpdateTracker, - /// The highest layout version number each node has synced data for - pub sync_map: UpdateTracker, - /// The highest layout version number each node has - /// ack'ed that all other nodes have synced data for - pub sync_ack_map: UpdateTracker, - } - - /// The history of cluster layouts - #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] - pub struct UpdateTracker(pub BTreeMap); - - impl garage_util::migrate::Migrate for LayoutHistory { - const VERSION_MARKER: &'static [u8] = b"G010lh"; - - type Previous = v09::ClusterLayout; - - fn migrate(previous: Self::Previous) -> Self { - let nongateway_node_count = previous - .node_id_vec - .iter() - .enumerate() - .filter(|(_, uuid)| { - let role = previous.roles.get(uuid); - matches!(role, Some(NodeRoleV(Some(role))) if role.capacity.is_some()) - }) - .map(|(i, _)| i + 1) - .max() - .unwrap_or(0); - - let version = LayoutVersion { - version: previous.version, - replication_factor: previous.replication_factor, - partition_size: previous.partition_size, - parameters: previous.parameters, - roles: previous.roles, - node_id_vec: previous.node_id_vec, - nongateway_node_count, - ring_assignment_data: previous.ring_assignment_data, - }; - let update_tracker = UpdateTracker( - version - .nongateway_nodes() - .iter() - .copied() - .map(|x| (x, version.version)) - .collect::>(), - ); - let staging = LayoutStaging { - parameters: previous.staging_parameters, - roles: previous.staging_roles, - }; - Self { - versions: vec![version], - old_versions: vec![], - update_trackers: UpdateTrackers { - ack_map: update_tracker.clone(), - sync_map: update_tracker.clone(), - sync_ack_map: update_tracker.clone(), - }, - staging: Lww::raw(previous.version, staging), - } - } - } -} - -pub use v010::*; - -// ---- utility functions ---- - -impl AutoCrdt for LayoutParameters { - const WARN_IF_DIFFERENT: bool = true; -} - -impl AutoCrdt for NodeRoleV { - const WARN_IF_DIFFERENT: bool = true; -} - -impl Crdt for LayoutStaging { - fn merge(&mut self, other: &LayoutStaging) { - self.parameters.merge(&other.parameters); - self.roles.merge(&other.roles); - } -} - -impl NodeRole { - pub fn capacity_string(&self) -> String { - match self.capacity { - Some(c) => ByteSize::b(c).to_string_as(false), - None => "gateway".to_string(), - } - } - - pub fn tags_string(&self) -> String { - self.tags.join(",") - } -} - -impl fmt::Display for ZoneRedundancy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - ZoneRedundancy::Maximum => write!(f, "maximum"), - ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), - } - } -} - -impl core::str::FromStr for ZoneRedundancy { - type Err = &'static str; - fn from_str(s: &str) -> Result { - match s { - "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), - x => { - let v = x - .parse::() - .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; - Ok(ZoneRedundancy::AtLeast(v)) - } - } - } -} - -impl UpdateTracker { - fn merge(&mut self, other: &UpdateTracker) -> bool { - let mut changed = false; - for (k, v) in other.0.iter() { - if let Some(v_mut) = self.0.get_mut(k) { - if *v > *v_mut { - *v_mut = *v; - changed = true; - } - } else { - self.0.insert(*k, *v); - changed = true; - } - } - changed - } - - /// This bumps the update tracker for a given node up to the specified value. - /// This has potential impacts on the correctness of Garage and should only - /// be used in very specific circumstances. - pub fn set_max(&mut self, peer: Uuid, value: u64) -> bool { - match self.0.get_mut(&peer) { - Some(e) if *e < value => { - *e = value; - true - } - None => { - self.0.insert(peer, value); - true - } - _ => false, - } - } - - pub(crate) fn min_among(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 { - storage_nodes - .iter() - .map(|x| self.get(x, min_version)) - .min() - .unwrap_or(min_version) - } - - pub fn get(&self, node: &Uuid, min_version: u64) -> u64 { - self.0.get(node).copied().unwrap_or(min_version) - } -} - -impl UpdateTrackers { - pub(crate) fn merge(&mut self, other: &UpdateTrackers) -> bool { - let c1 = self.ack_map.merge(&other.ack_map); - let c2 = self.sync_map.merge(&other.sync_map); - let c3 = self.sync_ack_map.merge(&other.sync_ack_map); - c1 || c2 || c3 - } -} diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index cbfbee94..5b307156 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -10,7 +10,6 @@ use garage_util::data::*; use garage_util::error::*; use super::graph_algo::*; -use super::schema::*; use super::*; // The Message type will be used to collect information on the algorithm. -- cgit v1.2.3 From f8df90b79b93e4a1391839435718bad8c697246d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 14:54:11 +0100 Subject: table: fix insert_many to not send duplicates --- src/table/table.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/table/table.rs b/src/table/table.rs index 6508cf5d..59cfdd07 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -196,6 +196,8 @@ impl Table { let hash = entry.partition_key().hash(); let mut write_sets = self.data.replication.write_sets(&hash); for set in write_sets.as_mut().iter_mut() { + // Sort nodes in each write sets to merge write sets with same + // nodes but in possibly different orders set.sort(); } let e_enc = Arc::new(ByteBuf::from(entry.encode()?)); @@ -220,7 +222,16 @@ impl Table { for (write_sets, entry_enc) in entries_vec.iter() { for write_set in write_sets.as_ref().iter() { for node in write_set.iter() { - call_list.entry(*node).or_default().push(entry_enc.clone()) + let node_entries = call_list.entry(*node).or_default(); + match node_entries.last() { + Some(x) if Arc::ptr_eq(x, entry_enc) => { + // skip if entry already in list to send to this node + // (could happen if node is in several write sets for this entry) + } + _ => { + node_entries.push(entry_enc.clone()); + } + } } } } -- cgit v1.2.3 From e4f493b48156e6e30f16fba10f300f6cb5fe0b0d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 11 Dec 2023 14:57:42 +0100 Subject: table: remove redundant tracing in insert_many --- src/table/table.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'src') diff --git a/src/table/table.rs b/src/table/table.rs index 59cfdd07..05a0dab1 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -239,9 +239,7 @@ impl Table { // Build futures to actually perform each of the corresponding RPC calls let call_futures = call_list.into_iter().map(|(node, entries)| { let this = self.clone(); - let tracer = opentelemetry::global::tracer("garage"); - let span = tracer.start(format!("RPC to {:?}", node)); - let fut = async move { + async move { let rpc = TableRpc::::Update(entries); let resp = this .system @@ -254,8 +252,7 @@ impl Table { ) .await; (node, resp) - }; - fut.with_context(Context::current_with_span(span)) + } }); // Run all requests in parallel thanks to FuturesUnordered, and collect results. -- cgit v1.2.3 From 85b5a6bcd11c0a7651e4c589569e1935a3d18e46 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 11 Dec 2023 15:31:47 +0100 Subject: fix some clippy lints --- src/api/admin/cluster.rs | 2 +- src/rpc/layout/helper.rs | 2 +- src/rpc/layout/history.rs | 14 ++++++-------- src/rpc/layout/mod.rs | 2 +- src/rpc/layout/version.rs | 6 +++--- src/rpc/rpc_helper.rs | 2 +- src/rpc/system.rs | 2 +- src/table/sync.rs | 8 ++++---- src/table/table.rs | 3 +-- 9 files changed, 19 insertions(+), 22 deletions(-) (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 3ce1b254..8677257d 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -94,7 +94,7 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result>(); + let mut nodes = nodes.into_values().collect::>(); nodes.sort_by(|x, y| x.id.cmp(&y.id)); let res = GetClusterStatusResponse { diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 2ba010b8..7e5d37e9 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -129,7 +129,7 @@ impl LayoutHelper { where F: FnOnce(&mut LayoutHistory) -> bool, { - let changed = f(&mut self.layout.as_mut().unwrap()); + let changed = f(self.layout.as_mut().unwrap()); if changed { *self = Self::new( self.replication_mode, diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index a53256cc..23196aee 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -42,8 +42,7 @@ impl LayoutHistory { let set = self .versions .iter() - .map(|x| x.all_nodes()) - .flatten() + .flat_map(|x| x.all_nodes()) .collect::>(); set.into_iter().copied().collect::>() } @@ -56,8 +55,7 @@ impl LayoutHistory { let set = self .versions .iter() - .map(|x| x.nongateway_nodes()) - .flatten() + .flat_map(|x| x.nongateway_nodes()) .collect::>(); set.into_iter().copied().collect::>() } @@ -94,7 +92,7 @@ impl LayoutHistory { let sync_ack_map_min = self .update_trackers .sync_ack_map - .min_among(¤t_nodes, min_version); + .min_among(current_nodes, min_version); if self.min_stored() < sync_ack_map_min { let removed = self.versions.remove(0); info!( @@ -144,7 +142,7 @@ impl LayoutHistory { let global_min = self .update_trackers .sync_map - .min_among(&all_nongateway_nodes, min_version); + .min_among(all_nongateway_nodes, min_version); // If the write quorums are equal to the total number of nodes, // i.e. no writes can succeed while they are not written to all nodes, @@ -281,7 +279,7 @@ To know the correct value of the new layout version, invoke `garage layout show` let (new_version, msg) = self .current() .clone() - .calculate_next_version(&self.staging.get())?; + .calculate_next_version(self.staging.get())?; self.versions.push(new_version); self.cleanup_old_versions(); @@ -297,7 +295,7 @@ To know the correct value of the new layout version, invoke `garage layout show` pub fn revert_staged_changes(mut self) -> Result { self.staging.update(LayoutStaging { - parameters: Lww::new(self.current().parameters.clone()), + parameters: Lww::new(self.current().parameters), roles: LwwMap::new(), }); diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index facdb2ce..162e3c6e 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -357,7 +357,7 @@ mod v010 { update_trackers: UpdateTrackers { ack_map: update_tracker.clone(), sync_map: update_tracker.clone(), - sync_ack_map: update_tracker.clone(), + sync_ack_map: update_tracker, }, staging: Lww::raw(previous.version, staging), } diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 5b307156..ee4b2821 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -137,19 +137,19 @@ impl LayoutVersion { // ===================== internal information extractors ====================== pub(crate) fn expect_get_node_capacity(&self, uuid: &Uuid) -> u64 { - self.get_node_capacity(&uuid) + self.get_node_capacity(uuid) .expect("non-gateway node with zero capacity") } pub(crate) fn expect_get_node_zone(&self, uuid: &Uuid) -> &str { - self.get_node_zone(&uuid).expect("node without a zone") + self.get_node_zone(uuid).expect("node without a zone") } /// Returns the sum of capacities of non gateway nodes in the cluster fn get_total_capacity(&self) -> u64 { let mut total_capacity = 0; for uuid in self.nongateway_nodes() { - total_capacity += self.expect_get_node_capacity(&uuid); + total_capacity += self.expect_get_node_capacity(uuid); } total_capacity } diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 65af8901..77a36ca1 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -442,7 +442,7 @@ impl RpcHelper { // Send one request to each peer of the quorum sets let msg = msg.into_req().map_err(netapp::error::Error::from)?; - let requests = result_tracker.nodes.iter().map(|(peer, _)| { + let requests = result_tracker.nodes.keys().map(|peer| { let self2 = self.clone(); let msg = msg.clone(); let endpoint2 = endpoint.clone(); diff --git a/src/rpc/system.rs b/src/rpc/system.rs index a8f12852..41d76177 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -315,7 +315,7 @@ impl System { local_status: ArcSwap::new(Arc::new(local_status)), node_status: RwLock::new(HashMap::new()), netapp: netapp.clone(), - fullmesh: fullmesh.clone(), + fullmesh, system_endpoint, replication_mode, replication_factor, diff --git a/src/table/sync.rs b/src/table/sync.rs index cfcbc4b5..1561a2e5 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -123,15 +123,15 @@ impl TableSyncer { let mut sync_futures = result_tracker .nodes - .iter() - .map(|(node, _)| *node) + .keys() + .copied() .map(|node| { let must_exit = must_exit.clone(); async move { if node == my_id { (node, Ok(())) } else { - (node, self.do_sync_with(&partition, node, must_exit).await) + (node, self.do_sync_with(partition, node, must_exit).await) } } }) @@ -145,7 +145,7 @@ impl TableSyncer { } if result_tracker.too_many_failures() { - return Err(result_tracker.quorum_error()); + Err(result_tracker.quorum_error()) } else { Ok(()) } diff --git a/src/table/table.rs b/src/table/table.rs index 05a0dab1..a5be2910 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -209,8 +209,7 @@ impl Table { // it takes part, to optimize the detection of a quorum. let mut write_sets = entries_vec .iter() - .map(|(wss, _)| wss.as_ref().iter().map(|ws| ws.as_slice())) - .flatten() + .flat_map(|(wss, _)| wss.as_ref().iter().map(|ws| ws.as_slice())) .collect::>(); write_sets.sort(); write_sets.dedup(); -- cgit v1.2.3 From adccce1145d5d82581e4a5da707be35badb2d5a6 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 11 Dec 2023 15:45:14 +0100 Subject: layout: refactor/fix bad while loop --- src/rpc/layout/history.rs | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 23196aee..b8cc27da 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -86,23 +86,20 @@ impl LayoutHistory { // remove them (keep them in self.old_versions). // ASSUMPTION: we only care about where nodes in the current layout version // are reading from, as we assume older nodes are being discarded. - while self.versions.len() > 1 { - let current_nodes = &self.current().node_id_vec; - let min_version = self.min_stored(); - let sync_ack_map_min = self - .update_trackers - .sync_ack_map - .min_among(current_nodes, min_version); - if self.min_stored() < sync_ack_map_min { - let removed = self.versions.remove(0); - info!( - "Layout history: moving version {} to old_versions", - removed.version - ); - self.old_versions.push(removed); - } else { - break; - } + let current_nodes = &self.current().node_id_vec; + let min_version = self.min_stored(); + let sync_ack_map_min = self + .update_trackers + .sync_ack_map + .min_among(current_nodes, min_version); + while self.min_stored() < sync_ack_map_min { + assert!(self.versions.len() > 1); + let removed = self.versions.remove(0); + info!( + "Layout history: moving version {} to old_versions", + removed.version + ); + self.old_versions.push(removed); } while self.old_versions.len() > OLD_VERSION_COUNT { -- cgit v1.2.3 From 0041b013a473e3ae72f50209d8f79db75a72848b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 11 Dec 2023 16:09:22 +0100 Subject: layout: refactoring and fix in layout helper --- src/rpc/layout/helper.rs | 43 +++++++++++++++++++++++----------------- src/rpc/layout/manager.rs | 2 +- src/rpc/layout/mod.rs | 2 +- src/rpc/rpc_helper.rs | 2 +- src/rpc/system.rs | 4 ++-- src/table/replication/sharded.rs | 2 +- src/table/sync.rs | 16 +++++++-------- 7 files changed, 38 insertions(+), 33 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 7e5d37e9..9fb738ea 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -10,7 +10,7 @@ use super::*; use crate::replication_mode::ReplicationMode; #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] -pub struct LayoutDigest { +pub struct RpcLayoutDigest { /// Cluster layout version pub current_version: u64, /// Number of active layout versions @@ -21,6 +21,13 @@ pub struct LayoutDigest { pub staging_hash: Hash, } +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub struct SyncLayoutDigest { + current: u64, + ack_map_min: u64, + min_stored: u64, +} + pub struct LayoutHelper { replication_mode: ReplicationMode, layout: Option, @@ -150,20 +157,20 @@ impl LayoutHelper { &self.all_nongateway_nodes } - pub fn all_ack(&self) -> u64 { + pub fn ack_map_min(&self) -> u64 { self.ack_map_min } - pub fn all_sync(&self) -> u64 { + pub fn sync_map_min(&self) -> u64 { self.sync_map_min } - pub fn sync_versions(&self) -> (u64, u64, u64) { - ( - self.layout().current().version, - self.all_ack(), - self.layout().min_stored(), - ) + pub fn sync_digest(&self) -> SyncLayoutDigest { + SyncLayoutDigest { + current: self.layout().current().version, + ack_map_min: self.ack_map_min(), + min_stored: self.layout().min_stored(), + } } pub fn read_nodes_of(&self, position: &Hash) -> Vec { @@ -206,8 +213,8 @@ impl LayoutHelper { self.staging_hash } - pub fn digest(&self) -> LayoutDigest { - LayoutDigest { + pub fn digest(&self) -> RpcLayoutDigest { + RpcLayoutDigest { current_version: self.current().version, active_versions: self.versions.len(), trackers_hash: self.trackers_hash, @@ -231,13 +238,13 @@ impl LayoutHelper { // 3. Acknowledge everyone has synced up to min(self.sync_map) self.sync_ack(local_node_id); - info!("ack_map: {:?}", self.update_trackers.ack_map); - info!("sync_map: {:?}", self.update_trackers.sync_map); - info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); + debug!("ack_map: {:?}", self.update_trackers.ack_map); + debug!("sync_map: {:?}", self.update_trackers.sync_map); + debug!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); } fn sync_first(&mut self, local_node_id: Uuid) { - let first_version = self.versions.first().as_ref().unwrap().version; + let first_version = self.min_stored(); self.update(|layout| { layout .update_trackers @@ -275,13 +282,13 @@ impl LayoutHelper { .versions .iter() .map(|x| x.version) - .take_while(|v| { + .skip_while(|v| { self.ack_lock .get(v) .map(|x| x.load(Ordering::Relaxed) == 0) .unwrap_or(true) }) - .max() - .unwrap_or(self.min_stored()) + .next() + .unwrap_or(self.current().version) } } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index ec8a2a15..6747b79d 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -256,7 +256,7 @@ impl LayoutManager { // ---- RPC HANDLERS ---- - pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, remote: &LayoutDigest) { + pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, remote: &RpcLayoutDigest) { let local = self.layout().digest(); if remote.current_version > local.current_version || remote.active_versions != local.active_versions diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index 162e3c6e..33676c37 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -17,7 +17,7 @@ pub mod manager; // ---- re-exports ---- -pub use helper::{LayoutDigest, LayoutHelper}; +pub use helper::{LayoutHelper, RpcLayoutDigest, SyncLayoutDigest}; pub use manager::WriteLock; pub use version::*; diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 77a36ca1..ae3a19c4 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -502,7 +502,7 @@ impl RpcHelper { .rev() .chain(layout.old_versions.iter().rev()); for ver in ver_iter { - if ver.version > layout.all_sync() { + if ver.version > layout.sync_map_min() { continue; } let nodes = ver.nodes_of(position, ver.replication_factor); diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 41d76177..83cc6816 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -34,7 +34,7 @@ use crate::consul::ConsulDiscovery; #[cfg(feature = "kubernetes-discovery")] use crate::kubernetes::*; use crate::layout::{ - self, manager::LayoutManager, LayoutDigest, LayoutHelper, LayoutHistory, NodeRoleV, + self, manager::LayoutManager, LayoutHelper, LayoutHistory, NodeRoleV, RpcLayoutDigest, }; use crate::replication_mode::*; use crate::rpc_helper::*; @@ -132,7 +132,7 @@ pub struct NodeStatus { pub replication_factor: usize, /// Cluster layout digest - pub layout_digest: LayoutDigest, + pub layout_digest: RpcLayoutDigest, /// Disk usage on partition containing metadata directory (tuple: `(avail, total)`) #[serde(default)] diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 55d0029d..8ba3700f 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -54,7 +54,7 @@ impl TableReplication for TableShardedReplication { fn sync_partitions(&self) -> SyncPartitions { let layout = self.system.cluster_layout(); - let layout_version = layout.all_ack(); + let layout_version = layout.ack_map_min(); let mut partitions = layout .current() diff --git a/src/table/sync.rs b/src/table/sync.rs index 1561a2e5..cd080df0 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -83,7 +83,7 @@ impl TableSyncer { bg.spawn_worker(SyncWorker { syncer: self.clone(), layout_notify: self.system.layout_notify(), - layout_versions: self.system.cluster_layout().sync_versions(), + layout_digest: self.system.cluster_layout().sync_digest(), add_full_sync_rx, todo: None, next_full_sync: Instant::now() + Duration::from_secs(20), @@ -483,7 +483,7 @@ struct SyncWorker { syncer: Arc>, layout_notify: Arc, - layout_versions: (u64, u64, u64), + layout_digest: SyncLayoutDigest, add_full_sync_rx: mpsc::UnboundedReceiver<()>, next_full_sync: Instant, @@ -493,15 +493,13 @@ struct SyncWorker { impl SyncWorker { fn check_add_full_sync(&mut self) { - let layout_versions = self.syncer.system.cluster_layout().sync_versions(); - if layout_versions != self.layout_versions { - self.layout_versions = layout_versions; + let layout_digest = self.syncer.system.cluster_layout().sync_digest(); + if layout_digest != self.layout_digest { + self.layout_digest = layout_digest; info!( - "({}) Layout versions changed (max={}, ack={}, min stored={}), adding full sync to syncer todo list", + "({}) Layout versions changed ({:?}), adding full sync to syncer todo list", F::TABLE_NAME, - layout_versions.0, - layout_versions.1, - layout_versions.2 + layout_digest, ); self.add_full_sync(); } -- cgit v1.2.3 From db48dd3d6c1f9e86a62e9b8edfce2c1620bcd5f3 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 11 Jan 2024 12:05:51 +0100 Subject: bump crate versions to 0.10.0 --- src/api/Cargo.toml | 2 +- src/block/Cargo.toml | 2 +- src/db/Cargo.toml | 2 +- src/garage/Cargo.toml | 2 +- src/model/Cargo.toml | 2 +- src/rpc/Cargo.toml | 2 +- src/table/Cargo.toml | 2 +- src/util/Cargo.toml | 2 +- src/web/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) (limited to 'src') diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index e8cbc1c8..15bf757e 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_api" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/block/Cargo.toml b/src/block/Cargo.toml index f6aa5f64..e4265cbe 100644 --- a/src/block/Cargo.toml +++ b/src/block/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_block" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml index 67af4a7c..530f1966 100644 --- a/src/db/Cargo.toml +++ b/src/db/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_db" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 7c3a79cb..dce7ea73 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 42b7ffdb..124b84b0 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_model" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index f450718f..e19f80a8 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_rpc" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml index 08ccec72..62cffac7 100644 --- a/src/table/Cargo.toml +++ b/src/table/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_table" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index 6554ac13..afc4d3c3 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_util" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/web/Cargo.toml b/src/web/Cargo.toml index 88cf1486..9f7720da 100644 --- a/src/web/Cargo.toml +++ b/src/web/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_web" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat ", "Quentin Dufour "] edition = "2018" license = "AGPL-3.0" -- cgit v1.2.3 From 75e591727d9cfda0133200604872a38419c178a1 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 20 Feb 2024 17:08:31 +0100 Subject: [next-0.10] cluster node status metrics: report nodes of all active layout versions --- src/rpc/system_metrics.rs | 68 +++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 34 deletions(-) (limited to 'src') diff --git a/src/rpc/system_metrics.rs b/src/rpc/system_metrics.rs index fb3c983c..0bb55bf3 100644 --- a/src/rpc/system_metrics.rs +++ b/src/rpc/system_metrics.rs @@ -216,12 +216,12 @@ impl SystemMetrics { .u64_value_observer("cluster_layout_node_connected", move |observer| { let layout = system.cluster_layout(); let nodes = system.get_known_nodes(); - for (id, _, config) in layout.current().roles.items().iter() { - if let Some(role) = &config.0 { - let mut kv = vec![ - KeyValue::new("id", format!("{:?}", id)), - KeyValue::new("role_zone", role.zone.clone()), - ]; + for id in layout.all_nodes().iter() { + let mut kv = vec![KeyValue::new("id", format!("{:?}", id))]; + if let Some(role) = + layout.current().roles.get(id).and_then(|r| r.0.as_ref()) + { + kv.push(KeyValue::new("role_zone", role.zone.clone())); match role.capacity { Some(cap) => { kv.push(KeyValue::new("role_capacity", cap as i64)); @@ -231,24 +231,24 @@ impl SystemMetrics { kv.push(KeyValue::new("role_gateway", 1)); } } + } - let value; - if let Some(node) = nodes.iter().find(|n| n.id == *id) { - value = if node.is_up { 1 } else { 0 }; + let value; + if let Some(node) = nodes.iter().find(|n| n.id == *id) { // TODO: if we add address and hostname, and those change, we // get duplicate metrics, due to bad otel aggregation :( // Can probably be fixed when we upgrade opentelemetry // kv.push(KeyValue::new("address", node.addr.to_string())); // kv.push(KeyValue::new( - // "hostname", - // node.status.hostname.clone(), + // "hostname", + // node.status.hostname.clone(), // )); - } else { - value = 0; - } - - observer.observe(value, &kv); + value = if node.is_up { 1 } else { 0 }; + } else { + value = 0; } + + observer.observe(value, &kv); } }) .with_description("Connection status for nodes in the cluster layout") @@ -260,12 +260,12 @@ impl SystemMetrics { .u64_value_observer("cluster_layout_node_disconnected_time", move |observer| { let layout = system.cluster_layout(); let nodes = system.get_known_nodes(); - for (id, _, config) in layout.current().roles.items().iter() { - if let Some(role) = &config.0 { - let mut kv = vec![ - KeyValue::new("id", format!("{:?}", id)), - KeyValue::new("role_zone", role.zone.clone()), - ]; + for id in layout.all_nodes().iter() { + let mut kv = vec![KeyValue::new("id", format!("{:?}", id))]; + if let Some(role) = + layout.current().roles.get(id).and_then(|r| r.0.as_ref()) + { + kv.push(KeyValue::new("role_zone", role.zone.clone())); match role.capacity { Some(cap) => { kv.push(KeyValue::new("role_capacity", cap as i64)); @@ -275,19 +275,19 @@ impl SystemMetrics { kv.push(KeyValue::new("role_gateway", 1)); } } + } - if let Some(node) = nodes.iter().find(|n| n.id == *id) { - // TODO: see comment above - // kv.push(KeyValue::new("address", node.addr.to_string())); - // kv.push(KeyValue::new( - // "hostname", - // node.status.hostname.clone(), - // )); - if node.is_up { - observer.observe(0, &kv); - } else if let Some(secs) = node.last_seen_secs_ago { - observer.observe(secs, &kv); - } + if let Some(node) = nodes.iter().find(|n| n.id == *id) { + // TODO: see comment above + // kv.push(KeyValue::new("address", node.addr.to_string())); + // kv.push(KeyValue::new( + // "hostname", + // node.status.hostname.clone(), + // )); + if node.is_up { + observer.observe(0, &kv); + } else if let Some(secs) = node.last_seen_secs_ago { + observer.observe(secs, &kv); } } } -- cgit v1.2.3 From 81cebdd12415381f67747e96591e83b1a4a8cc0b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 22 Feb 2024 15:53:47 +0100 Subject: [next-0.10] fix build --- src/rpc/system.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index e8844f29..1c668306 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -520,7 +520,7 @@ impl System { } }; - let hostname = self.local_status.read().unwrap().hostname.clone(); + let hostname = self.local_status.read().unwrap().hostname.clone().unwrap(); if let Err(e) = c .publish_consul_service(self.netapp.id, &hostname, rpc_public_addr) .await @@ -544,7 +544,7 @@ impl System { } }; - let hostname = self.local_status.read().unwrap().hostname.clone(); + let hostname = self.local_status.read().unwrap().hostname.clone().unwrap(); if let Err(e) = publish_kubernetes_node(k, self.netapp.id, &hostname, rpc_public_addr).await { error!("Error while publishing node to Kubernetes: {}", e); -- cgit v1.2.3 From 6760895926c23112583dfc53a01ecbcfad02a276 Mon Sep 17 00:00:00 2001 From: Yureka Date: Mon, 4 Mar 2024 18:37:00 +0100 Subject: refactor: remove max_write_errors and max_faults --- src/model/garage.rs | 1 - src/rpc/replication_mode.rs | 7 ------- src/table/replication/fullcopy.rs | 12 +++++------- src/table/replication/parameters.rs | 1 - src/table/replication/sharded.rs | 3 --- 5 files changed, 5 insertions(+), 19 deletions(-) (limited to 'src') diff --git a/src/model/garage.rs b/src/model/garage.rs index fe38a760..561aca8f 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -247,7 +247,6 @@ impl Garage { let control_rep_param = TableFullReplication { system: system.clone(), - max_faults: replication_mode.control_write_max_faults(), }; info!("Initialize block manager..."); diff --git a/src/rpc/replication_mode.rs b/src/rpc/replication_mode.rs index 2f7e2fec..b142ea10 100644 --- a/src/rpc/replication_mode.rs +++ b/src/rpc/replication_mode.rs @@ -21,13 +21,6 @@ impl ReplicationMode { } } - pub fn control_write_max_faults(&self) -> usize { - match self { - Self::None => 0, - _ => 1, - } - } - pub fn replication_factor(&self) -> usize { match self { Self::None => 1, diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index 30122f39..1e52bb47 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -21,8 +21,6 @@ use crate::replication::*; pub struct TableFullReplication { /// The membership manager of this node pub system: Arc, - /// Max number of faults allowed while replicating a record - pub max_faults: usize, } impl TableReplication for TableFullReplication { @@ -45,15 +43,15 @@ impl TableReplication for TableFullReplication { } fn write_quorum(&self) -> usize { let nmembers = self.system.cluster_layout().current().all_nodes().len(); - if nmembers > self.max_faults { - nmembers - self.max_faults + + let max_faults = if nmembers > 1 { 1 } else { 0 }; + + if nmembers > max_faults { + nmembers - max_faults } else { 1 } } - fn max_write_errors(&self) -> usize { - self.max_faults - } fn partition_of(&self, _hash: &Hash) -> Partition { 0u16 diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index 78470f35..682c1ea6 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -20,7 +20,6 @@ pub trait TableReplication: Send + Sync + 'static { fn write_sets(&self, hash: &Hash) -> Self::WriteSets; /// Responses needed to consider a write succesfull in each set fn write_quorum(&self) -> usize; - fn max_write_errors(&self) -> usize; // Accessing partitions, for Merkle tree & sync /// Get partition for data with given hash diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 8ba3700f..e0245949 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -44,9 +44,6 @@ impl TableReplication for TableShardedReplication { fn write_quorum(&self) -> usize { self.write_quorum } - fn max_write_errors(&self) -> usize { - self.replication_factor - self.write_quorum - } fn partition_of(&self, hash: &Hash) -> Partition { self.system.cluster_layout().current().partition_of(hash) -- cgit v1.2.3 From c1769bbe69f723fb3980cf4fdac7615cfb782720 Mon Sep 17 00:00:00 2001 From: Yureka Date: Mon, 4 Mar 2024 19:58:32 +0100 Subject: ReplicationMode -> ConsistencyMode+ReplicationFactor --- src/garage/secrets.rs | 6 +- src/garage/tests/common/garage.rs | 2 +- src/model/garage.rs | 23 ++++---- src/rpc/layout/helper.rs | 18 +++--- src/rpc/layout/history.rs | 10 ++-- src/rpc/layout/manager.rs | 27 +++++---- src/rpc/layout/test.rs | 3 +- src/rpc/replication_mode.rs | 119 +++++++++++++++++++++++++------------- src/rpc/system.rs | 22 +++---- src/rpc/system_metrics.rs | 2 +- src/util/config.rs | 27 ++++++--- 11 files changed, 158 insertions(+), 101 deletions(-) (limited to 'src') diff --git a/src/garage/secrets.rs b/src/garage/secrets.rs index c3d704aa..8d2ff475 100644 --- a/src/garage/secrets.rs +++ b/src/garage/secrets.rs @@ -163,7 +163,7 @@ mod tests { r#" metadata_dir = "/tmp/garage/meta" data_dir = "/tmp/garage/data" - replication_mode = "3" + replication_factor = 3 rpc_bind_addr = "[::]:3901" rpc_secret_file = "{}" @@ -185,7 +185,7 @@ mod tests { r#" metadata_dir = "/tmp/garage/meta" data_dir = "/tmp/garage/data" - replication_mode = "3" + replication_factor = 3 rpc_bind_addr = "[::]:3901" rpc_secret_file = "{}" allow_world_readable_secrets = true @@ -296,7 +296,7 @@ mod tests { r#" metadata_dir = "/tmp/garage/meta" data_dir = "/tmp/garage/data" - replication_mode = "3" + replication_factor = 3 rpc_bind_addr = "[::]:3901" rpc_secret= "dummy" rpc_secret_file = "dummy" diff --git a/src/garage/tests/common/garage.rs b/src/garage/tests/common/garage.rs index ebc82f37..f1c1efc8 100644 --- a/src/garage/tests/common/garage.rs +++ b/src/garage/tests/common/garage.rs @@ -54,7 +54,7 @@ metadata_dir = "{path}/meta" data_dir = "{path}/data" db_engine = "lmdb" -replication_mode = "1" +replication_factor = 1 rpc_bind_addr = "127.0.0.1:{rpc_port}" rpc_public_addr = "127.0.0.1:{rpc_port}" diff --git a/src/model/garage.rs b/src/model/garage.rs index 561aca8f..19f58077 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -9,7 +9,7 @@ use garage_util::config::*; use garage_util::error::*; use garage_util::persister::PersisterShared; -use garage_rpc::replication_mode::ReplicationMode; +use garage_rpc::replication_mode::*; use garage_rpc::system::System; use garage_block::manager::*; @@ -39,8 +39,8 @@ pub struct Garage { /// The set of background variables that can be viewed/modified at runtime pub bg_vars: vars::BgVars, - /// The replication mode of this cluster - pub replication_mode: ReplicationMode, + /// The replication factor of this cluster + pub replication_factor: ReplicationFactor, /// The local database pub db: db::Db, @@ -222,27 +222,26 @@ impl Garage { .and_then(|x| NetworkKey::from_slice(&x)) .ok_or_message("Invalid RPC secret key")?; - let replication_mode = ReplicationMode::parse(&config.replication_mode) - .ok_or_message("Invalid replication_mode in config file.")?; + let (replication_factor, consistency_mode) = parse_replication_mode(&config)?; info!("Initialize background variable system..."); let mut bg_vars = vars::BgVars::new(); info!("Initialize membership management system..."); - let system = System::new(network_key, replication_mode, &config)?; + let system = System::new(network_key, replication_factor, consistency_mode, &config)?; let data_rep_param = TableShardedReplication { system: system.clone(), - replication_factor: replication_mode.replication_factor(), - write_quorum: replication_mode.write_quorum(), + replication_factor: replication_factor.into(), + write_quorum: replication_factor.write_quorum(consistency_mode), read_quorum: 1, }; let meta_rep_param = TableShardedReplication { system: system.clone(), - replication_factor: replication_mode.replication_factor(), - write_quorum: replication_mode.write_quorum(), - read_quorum: replication_mode.read_quorum(), + replication_factor: replication_factor.into(), + write_quorum: replication_factor.write_quorum(consistency_mode), + read_quorum: replication_factor.read_quorum(consistency_mode), }; let control_rep_param = TableFullReplication { @@ -338,7 +337,7 @@ impl Garage { Ok(Arc::new(Self { config, bg_vars, - replication_mode, + replication_factor, db, system, block_manager, diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 9fb738ea..2835347a 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use garage_util::data::*; use super::*; -use crate::replication_mode::ReplicationMode; +use crate::replication_mode::*; #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] pub struct RpcLayoutDigest { @@ -29,7 +29,8 @@ pub struct SyncLayoutDigest { } pub struct LayoutHelper { - replication_mode: ReplicationMode, + replication_factor: ReplicationFactor, + consistency_mode: ConsistencyMode, layout: Option, // cached values @@ -57,7 +58,8 @@ impl Deref for LayoutHelper { impl LayoutHelper { pub fn new( - replication_mode: ReplicationMode, + replication_factor: ReplicationFactor, + consistency_mode: ConsistencyMode, mut layout: LayoutHistory, mut ack_lock: HashMap, ) -> Self { @@ -66,7 +68,7 @@ impl LayoutHelper { // correct and we have rapid access to important values such as // the layout versions to use when reading to ensure consistency. - if !replication_mode.is_read_after_write_consistent() { + if consistency_mode != ConsistencyMode::Consistent { // Fast path for when no consistency is required. // In this case we only need to keep the last version of the layout, // we don't care about coordinating stuff in the cluster. @@ -103,7 +105,7 @@ impl LayoutHelper { // This value is calculated using quorums to allow progress even // if not all nodes have successfully completed a sync. let sync_map_min = - layout.calculate_sync_map_min_with_quorum(replication_mode, &all_nongateway_nodes); + layout.calculate_sync_map_min_with_quorum(replication_factor, &all_nongateway_nodes); let trackers_hash = layout.calculate_trackers_hash(); let staging_hash = layout.calculate_staging_hash(); @@ -114,7 +116,8 @@ impl LayoutHelper { .or_insert(AtomicUsize::new(0)); LayoutHelper { - replication_mode, + replication_factor, + consistency_mode, layout: Some(layout), ack_map_min, sync_map_min, @@ -139,7 +142,8 @@ impl LayoutHelper { let changed = f(self.layout.as_mut().unwrap()); if changed { *self = Self::new( - self.replication_mode, + self.replication_factor, + self.consistency_mode, self.layout.take().unwrap(), std::mem::take(&mut self.ack_lock), ); diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index b8cc27da..290f058d 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -6,11 +6,11 @@ use garage_util::encode::nonversioned_encode; use garage_util::error::*; use super::*; -use crate::replication_mode::ReplicationMode; +use crate::replication_mode::*; impl LayoutHistory { - pub fn new(replication_factor: usize) -> Self { - let version = LayoutVersion::new(replication_factor); + pub fn new(replication_factor: ReplicationFactor) -> Self { + let version = LayoutVersion::new(replication_factor.into()); let staging = LayoutStaging { parameters: Lww::::new(version.parameters), @@ -119,7 +119,7 @@ impl LayoutHistory { pub(crate) fn calculate_sync_map_min_with_quorum( &self, - replication_mode: ReplicationMode, + replication_factor: ReplicationFactor, all_nongateway_nodes: &[Uuid], ) -> u64 { // This function calculates the minimum layout version from which @@ -133,7 +133,7 @@ impl LayoutHistory { return self.current().version; } - let quorum = replication_mode.write_quorum(); + let quorum = replication_factor.write_quorum(ConsistencyMode::Consistent); let min_version = self.min_stored(); let global_min = self diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 0b6c7e63..8a6eb1c3 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -14,13 +14,13 @@ use garage_util::error::*; use garage_util::persister::Persister; use super::*; -use crate::replication_mode::ReplicationMode; +use crate::replication_mode::*; use crate::rpc_helper::*; use crate::system::*; pub struct LayoutManager { node_id: Uuid, - replication_mode: ReplicationMode, + replication_factor: ReplicationFactor, persist_cluster_layout: Persister, layout: Arc>, @@ -38,20 +38,19 @@ impl LayoutManager { node_id: NodeID, system_endpoint: Arc>, peering: Arc, - replication_mode: ReplicationMode, + replication_factor: ReplicationFactor, + consistency_mode: ConsistencyMode, ) -> Result, Error> { - let replication_factor = replication_mode.replication_factor(); - let persist_cluster_layout: Persister = Persister::new(&config.metadata_dir, "cluster_layout"); let cluster_layout = match persist_cluster_layout.load() { Ok(x) => { - if x.current().replication_factor != replication_mode.replication_factor() { + if x.current().replication_factor != replication_factor.replication_factor() { return Err(Error::Message(format!( "Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", x.current().replication_factor, - replication_factor + replication_factor.replication_factor() ))); } x @@ -65,8 +64,12 @@ impl LayoutManager { } }; - let mut cluster_layout = - LayoutHelper::new(replication_mode, cluster_layout, Default::default()); + let mut cluster_layout = LayoutHelper::new( + replication_factor, + consistency_mode, + cluster_layout, + Default::default(), + ); cluster_layout.update_trackers(node_id.into()); let layout = Arc::new(RwLock::new(cluster_layout)); @@ -81,7 +84,7 @@ impl LayoutManager { Ok(Arc::new(Self { node_id: node_id.into(), - replication_mode, + replication_factor, persist_cluster_layout, layout, change_notify, @@ -295,11 +298,11 @@ impl LayoutManager { adv.update_trackers ); - if adv.current().replication_factor != self.replication_mode.replication_factor() { + if adv.current().replication_factor != self.replication_factor.replication_factor() { let msg = format!( "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.", adv.current().replication_factor, - self.replication_mode.replication_factor() + self.replication_factor.replication_factor() ); error!("{}", msg); return Err(Error::Message(msg)); diff --git a/src/rpc/layout/test.rs b/src/rpc/layout/test.rs index 88eb518e..fcbb9dfc 100644 --- a/src/rpc/layout/test.rs +++ b/src/rpc/layout/test.rs @@ -5,6 +5,7 @@ use garage_util::crdt::Crdt; use garage_util::error::*; use crate::layout::*; +use crate::replication_mode::ReplicationFactor; // This function checks that the partition size S computed is at least better than the // one given by a very naive algorithm. To do so, we try to run the naive algorithm @@ -120,7 +121,7 @@ fn test_assignment() { let mut node_capacity_vec = vec![4000, 1000, 2000]; let mut node_zone_vec = vec!["A", "B", "C"]; - let mut cl = LayoutHistory::new(3); + let mut cl = LayoutHistory::new(ReplicationFactor::new(3).unwrap()); update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3); let v = cl.current().version; let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); diff --git a/src/rpc/replication_mode.rs b/src/rpc/replication_mode.rs index b142ea10..a3a94085 100644 --- a/src/rpc/replication_mode.rs +++ b/src/rpc/replication_mode.rs @@ -1,57 +1,94 @@ -#[derive(Clone, Copy)] -pub enum ReplicationMode { - None, - TwoWay, - TwoWayDangerous, - ThreeWay, - ThreeWayDegraded, - ThreeWayDangerous, +use garage_util::config::Config; +use garage_util::crdt::AutoCrdt; +use garage_util::error::*; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[serde(transparent)] +pub struct ReplicationFactor(usize); + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Default, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ConsistencyMode { + /// Read- and Write-quorum are 1 + Dangerous, + /// Read-quorum is 1 + Degraded, + /// Read- and Write-quorum are determined for read-after-write-consistency + #[default] + Consistent, +} + +impl ConsistencyMode { + pub fn parse(s: &str) -> Option { + serde_json::from_value(serde_json::Value::String(s.to_string())).ok() + } +} + +impl AutoCrdt for ConsistencyMode { + const WARN_IF_DIFFERENT: bool = true; } -impl ReplicationMode { - pub fn parse(v: &str) -> Option { - match v { - "none" | "1" => Some(Self::None), - "2" => Some(Self::TwoWay), - "2-dangerous" => Some(Self::TwoWayDangerous), - "3" => Some(Self::ThreeWay), - "3-degraded" => Some(Self::ThreeWayDegraded), - "3-dangerous" => Some(Self::ThreeWayDangerous), - _ => None, +impl ReplicationFactor { + pub fn new(replication_factor: usize) -> Option { + if replication_factor < 1 { + None + } else { + Some(Self(replication_factor)) } } pub fn replication_factor(&self) -> usize { - match self { - Self::None => 1, - Self::TwoWay | Self::TwoWayDangerous => 2, - Self::ThreeWay | Self::ThreeWayDegraded | Self::ThreeWayDangerous => 3, - } + self.0 } - pub fn read_quorum(&self) -> usize { - match self { - Self::None => 1, - Self::TwoWay | Self::TwoWayDangerous => 1, - Self::ThreeWay => 2, - Self::ThreeWayDegraded | Self::ThreeWayDangerous => 1, + pub fn read_quorum(&self, consistency_mode: ConsistencyMode) -> usize { + match consistency_mode { + ConsistencyMode::Dangerous | ConsistencyMode::Degraded => 1, + ConsistencyMode::Consistent => self.replication_factor().div_ceil(2), } } - pub fn write_quorum(&self) -> usize { - match self { - Self::None => 1, - Self::TwoWay => 2, - Self::TwoWayDangerous => 1, - Self::ThreeWay | Self::ThreeWayDegraded => 2, - Self::ThreeWayDangerous => 1, + pub fn write_quorum(&self, consistency_mode: ConsistencyMode) -> usize { + match consistency_mode { + ConsistencyMode::Dangerous => 1, + ConsistencyMode::Degraded | ConsistencyMode::Consistent => { + (self.replication_factor() + 1) - self.read_quorum(ConsistencyMode::Consistent) + } } } +} - pub fn is_read_after_write_consistent(&self) -> bool { - match self { - Self::None | Self::TwoWay | Self::ThreeWay => true, - _ => false, - } +impl std::convert::From for usize { + fn from(replication_factor: ReplicationFactor) -> usize { + replication_factor.0 } } + +pub fn parse_replication_mode( + config: &Config, +) -> Result<(ReplicationFactor, ConsistencyMode), Error> { + match (&config.replication_mode, config.replication_factor, config.consistency_mode.as_str()) { + (Some(replication_mode), None, "consistent") => { + tracing::warn!("Legacy config option replication_mode in use. Please migrate to replication_factor and consistency_mode"); + let parsed_replication_mode = match replication_mode.as_str() { + "1" | "none" => Some((ReplicationFactor(1), ConsistencyMode::Consistent)), + "2" => Some((ReplicationFactor(2), ConsistencyMode::Consistent)), + "2-dangerous" => Some((ReplicationFactor(2), ConsistencyMode::Dangerous)), + "3" => Some((ReplicationFactor(3), ConsistencyMode::Consistent)), + "3-degraded" => Some((ReplicationFactor(3), ConsistencyMode::Degraded)), + "3-dangerous" => Some((ReplicationFactor(3), ConsistencyMode::Dangerous)), + _ => None, + }; + Some(parsed_replication_mode.ok_or_message("Invalid replication_mode in config file.")?) + }, + (None, Some(replication_factor), consistency_mode) => { + let replication_factor = ReplicationFactor::new(replication_factor) + .ok_or_message("Invalid replication_factor in config file.")?; + let consistency_mode = ConsistencyMode::parse(consistency_mode) + .ok_or_message("Invalid consistency_mode in config file.")?; + Some((replication_factor, consistency_mode)) + } + _ => None, + }.ok_or_message("Either the legacy replication_mode or replication_level and consistency_mode can be set, not both.") +} diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 1c668306..54d589d2 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -112,8 +112,7 @@ pub struct System { metrics: ArcSwapOption, - replication_mode: ReplicationMode, - pub(crate) replication_factor: usize, + pub(crate) replication_factor: ReplicationFactor, /// Path to metadata directory pub metadata_dir: PathBuf, @@ -243,7 +242,8 @@ impl System { /// Create this node's membership manager pub fn new( network_key: NetworkKey, - replication_mode: ReplicationMode, + replication_factor: ReplicationFactor, + consistency_mode: ConsistencyMode, config: &Config, ) -> Result, Error> { // ---- setup netapp RPC protocol ---- @@ -274,14 +274,13 @@ impl System { let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list"); // ---- setup cluster layout and layout manager ---- - let replication_factor = replication_mode.replication_factor(); - let layout_manager = LayoutManager::new( config, netapp.id, system_endpoint.clone(), peering.clone(), - replication_mode, + replication_factor, + consistency_mode, )?; let mut local_status = NodeStatus::initial(replication_factor, &layout_manager); @@ -315,7 +314,6 @@ impl System { netapp: netapp.clone(), peering: peering.clone(), system_endpoint, - replication_mode, replication_factor, rpc_listen_addr: config.rpc_bind_addr, rpc_public_addr, @@ -427,7 +425,9 @@ impl System { } pub fn health(&self) -> ClusterHealth { - let quorum = self.replication_mode.write_quorum(); + let quorum = self + .replication_factor + .write_quorum(ConsistencyMode::Consistent); // Gather information about running nodes. // Technically, `nodes` contains currently running nodes, as well @@ -631,7 +631,7 @@ impl System { .count(); let not_configured = self.cluster_layout().check().is_err(); - let no_peers = n_connected < self.replication_factor; + let no_peers = n_connected < self.replication_factor.into(); let expected_n_nodes = self.cluster_layout().all_nodes().len(); let bad_peers = n_connected != expected_n_nodes; @@ -774,14 +774,14 @@ impl EndpointHandler for System { } impl NodeStatus { - fn initial(replication_factor: usize, layout_manager: &LayoutManager) -> Self { + fn initial(replication_factor: ReplicationFactor, layout_manager: &LayoutManager) -> Self { NodeStatus { hostname: Some( gethostname::gethostname() .into_string() .unwrap_or_else(|_| "".to_string()), ), - replication_factor, + replication_factor: replication_factor.into(), layout_digest: layout_manager.layout().digest(), meta_disk_avail: None, data_disk_avail: None, diff --git a/src/rpc/system_metrics.rs b/src/rpc/system_metrics.rs index 0bb55bf3..a64daec8 100644 --- a/src/rpc/system_metrics.rs +++ b/src/rpc/system_metrics.rs @@ -68,7 +68,7 @@ impl SystemMetrics { let replication_factor = system.replication_factor; meter .u64_value_observer("garage_replication_factor", move |observer| { - observer.observe(replication_factor as u64, &[]) + observer.observe(replication_factor.replication_factor() as u64, &[]) }) .with_description("Garage replication factor setting") .init() diff --git a/src/util/config.rs b/src/util/config.rs index 056c625d..b7f27676 100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -30,12 +30,20 @@ pub struct Config { )] pub block_size: usize, - /// Replication mode. Supported values: - /// - none, 1 -> no replication - /// - 2 -> 2-way replication - /// - 3 -> 3-way replication - // (we can add more aliases for this later) - pub replication_mode: String, + /// Number of replicas. Can be any positive integer, but uneven numbers are more favorable. + /// - 1 for single-node clusters, or to disable replication + /// - 3 is the recommended and supported setting. + #[serde(default)] + pub replication_factor: Option, + + /// Consistency mode for all for requests through this node + /// - Degraded -> Disable read quorum + /// - Dangerous -> Disable read and write quorum + #[serde(default = "default_consistency_mode")] + pub consistency_mode: String, + + /// Legacy option + pub replication_mode: Option, /// Zstd compression level used on data blocks #[serde( @@ -244,10 +252,15 @@ fn default_sled_cache_capacity() -> usize { fn default_sled_flush_every_ms() -> u64 { 2000 } + fn default_block_size() -> usize { 1048576 } +fn default_consistency_mode() -> String { + "consistent".into() +} + fn default_compression() -> Option { Some(1) } @@ -359,7 +372,7 @@ mod tests { r#" metadata_dir = "/tmp/garage/meta" data_dir = "/tmp/garage/data" - replication_mode = "3" + replication_factor = 3 rpc_bind_addr = "[::]:3901" rpc_secret = "foo" -- cgit v1.2.3 From 57acc60082089e1a24fd47588f6ff3cb20ed4eef Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 23 Feb 2024 16:49:50 +0100 Subject: [sse-c] Implement SSE-C encryption --- src/api/Cargo.toml | 3 + src/api/s3/copy.rs | 301 +++++++++++++++------ src/api/s3/encryption.rs | 595 ++++++++++++++++++++++++++++++++++++++++++ src/api/s3/error.rs | 6 + src/api/s3/get.rs | 199 +++++++++----- src/api/s3/list.rs | 8 +- src/api/s3/mod.rs | 1 + src/api/s3/multipart.rs | 56 ++-- src/api/s3/post_object.rs | 42 +-- src/api/s3/put.rs | 128 ++++++--- src/block/block.rs | 2 +- src/block/lib.rs | 2 + src/block/manager.rs | 14 +- src/model/s3/object_table.rs | 160 +++++++++++- src/model/s3/version_table.rs | 5 +- 15 files changed, 1290 insertions(+), 232 deletions(-) create mode 100644 src/api/s3/encryption.rs (limited to 'src') diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index 9b215333..bcf6a537 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -21,7 +21,9 @@ garage_net.workspace = true garage_util.workspace = true garage_rpc.workspace = true +aes-gcm.workspace = true argon2.workspace = true +async-compression.workspace = true async-trait.workspace = true base64.workspace = true bytes.workspace = true @@ -41,6 +43,7 @@ futures.workspace = true futures-util.workspace = true tokio.workspace = true tokio-stream.workspace = true +tokio-util.workspace = true form_urlencoded.workspace = true http.workspace = true diff --git a/src/api/s3/copy.rs b/src/api/s3/copy.rs index 3c2bd483..2b29ec6d 100644 --- a/src/api/s3/copy.rs +++ b/src/api/s3/copy.rs @@ -1,7 +1,7 @@ use std::pin::Pin; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use futures::{stream, stream::Stream, StreamExt}; +use futures::{stream, stream::Stream, StreamExt, TryStreamExt}; use md5::{Digest as Md5Digest, Md5}; use bytes::Bytes; @@ -9,9 +9,11 @@ use hyper::{Request, Response}; use serde::Serialize; use garage_net::bytes_buf::BytesBuf; +use garage_net::stream::read_stream_to_end; use garage_rpc::rpc_helper::OrderTag; use garage_table::*; use garage_util::data::*; +use garage_util::error::Error as GarageError; use garage_util::time::*; use garage_model::s3::block_ref_table::*; @@ -21,11 +23,15 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; +use crate::s3::get::full_object_byte_stream; use crate::s3::multipart; -use crate::s3::put::get_headers; +use crate::s3::put::{get_headers, save_stream, SaveStreamResult}; use crate::s3::xml::{self as s3_xml, xmlns_tag}; +// -------- CopyObject --------- + pub async fn handle_copy( ctx: ReqCtx, req: &Request, @@ -35,38 +41,114 @@ pub async fn handle_copy( let source_object = get_copy_source(&ctx, req).await?; - let ReqCtx { - garage, - bucket_id: dest_bucket_id, - .. - } = ctx; - let (source_version, source_version_data, source_version_meta) = extract_source_info(&source_object)?; // Check precondition, e.g. x-amz-copy-source-if-match copy_precondition.check(source_version, &source_version_meta.etag)?; + // Determine encryption parameters + let (source_encryption, source_object_headers) = + EncryptionParams::check_decrypt_for_copy_source( + &ctx.garage, + req.headers(), + &source_version_meta.encryption, + )?; + let dest_encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?; + + // Determine headers of destination object + let dest_object_headers = match req.headers().get("x-amz-metadata-directive") { + Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => { + get_headers(req.headers())? + } + _ => source_object_headers.into_owned(), + }; + + // Do actual object copying + let res = if EncryptionParams::is_same(&source_encryption, &dest_encryption) { + // If source and dest are both unencrypted, or if the encryption keys + // are the same, we can just copy the metadata and link blocks of the + // old object from the new object. + handle_copy_metaonly( + ctx, + dest_key, + dest_object_headers, + dest_encryption, + source_version, + source_version_data, + source_version_meta, + ) + .await? + } else { + // If source and dest encryption use different keys, + // we must decrypt content and re-encrypt, so rewrite all data blocks. + handle_copy_reencrypt( + ctx, + dest_key, + dest_object_headers, + dest_encryption, + source_version, + source_version_data, + source_encryption, + ) + .await? + }; + + let last_modified = msec_to_rfc3339(res.version_timestamp); + let result = CopyObjectResult { + last_modified: s3_xml::Value(last_modified), + etag: s3_xml::Value(format!("\"{}\"", res.etag)), + }; + let xml = s3_xml::to_xml_with_header(&result)?; + + let mut resp = Response::builder() + .header("Content-Type", "application/xml") + .header("x-amz-version-id", hex::encode(res.version_uuid)) + .header( + "x-amz-copy-source-version-id", + hex::encode(source_version.uuid), + ); + dest_encryption.add_response_headers(&mut resp); + Ok(resp.body(string_body(xml))?) +} + +async fn handle_copy_metaonly( + ctx: ReqCtx, + dest_key: &str, + dest_object_headers: ObjectVersionHeaders, + dest_encryption: EncryptionParams, + source_version: &ObjectVersion, + source_version_data: &ObjectVersionData, + source_version_meta: &ObjectVersionMeta, +) -> Result { + let ReqCtx { + garage, + bucket_id: dest_bucket_id, + .. + } = ctx; + // Generate parameters for copied object let new_uuid = gen_uuid(); let new_timestamp = now_msec(); - // Implement x-amz-metadata-directive: REPLACE - let new_meta = match req.headers().get("x-amz-metadata-directive") { - Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => ObjectVersionMeta { - headers: get_headers(req.headers())?, - size: source_version_meta.size, - etag: source_version_meta.etag.clone(), - }, - _ => source_version_meta.clone(), + let new_meta = ObjectVersionMeta { + encryption: dest_encryption.encrypt_headers(dest_object_headers)?, + size: source_version_meta.size, + etag: source_version_meta.etag.clone(), }; - let etag = new_meta.etag.to_string(); + let res = SaveStreamResult { + version_uuid: new_uuid, + version_timestamp: new_timestamp, + etag: new_meta.etag.clone(), + }; // Save object copy match source_version_data { ObjectVersionData::DeleteMarker => unreachable!(), ObjectVersionData::Inline(_meta, bytes) => { + // bytes is either plaintext before&after or encrypted with the + // same keys, so it's ok to just copy it as is let dest_object_version = ObjectVersion { uuid: new_uuid, timestamp: new_timestamp, @@ -97,7 +179,7 @@ pub async fn handle_copy( uuid: new_uuid, timestamp: new_timestamp, state: ObjectVersionState::Uploading { - headers: new_meta.headers.clone(), + encryption: new_meta.encryption.clone(), multipart: false, }, }; @@ -164,23 +246,42 @@ pub async fn handle_copy( } } - let last_modified = msec_to_rfc3339(new_timestamp); - let result = CopyObjectResult { - last_modified: s3_xml::Value(last_modified), - etag: s3_xml::Value(format!("\"{}\"", etag)), - }; - let xml = s3_xml::to_xml_with_header(&result)?; + Ok(res) +} - Ok(Response::builder() - .header("Content-Type", "application/xml") - .header("x-amz-version-id", hex::encode(new_uuid)) - .header( - "x-amz-copy-source-version-id", - hex::encode(source_version.uuid), - ) - .body(string_body(xml))?) +async fn handle_copy_reencrypt( + ctx: ReqCtx, + dest_key: &str, + dest_object_headers: ObjectVersionHeaders, + dest_encryption: EncryptionParams, + source_version: &ObjectVersion, + source_version_data: &ObjectVersionData, + source_encryption: EncryptionParams, +) -> Result { + // basically we will read the source data (decrypt if necessary) + // and save that in a new object (encrypt if necessary), + // by combining the code used in getobject and putobject + let source_stream = full_object_byte_stream( + ctx.garage.clone(), + source_version, + source_version_data, + source_encryption, + ); + + save_stream( + &ctx, + dest_object_headers, + dest_encryption, + source_stream.map_err(|e| Error::from(GarageError::from(e))), + &dest_key.to_string(), + None, + None, + ) + .await } +// -------- UploadPartCopy --------- + pub async fn handle_upload_part_copy( ctx: ReqCtx, req: &Request, @@ -193,7 +294,7 @@ pub async fn handle_upload_part_copy( let dest_upload_id = multipart::decode_upload_id(upload_id)?; let dest_key = dest_key.to_string(); - let (source_object, (_, _, mut dest_mpu)) = futures::try_join!( + let (source_object, (_, dest_version, mut dest_mpu)) = futures::try_join!( get_copy_source(&ctx, req), multipart::get_upload(&ctx, &dest_key, &dest_upload_id) )?; @@ -206,6 +307,20 @@ pub async fn handle_upload_part_copy( // Check precondition on source, e.g. x-amz-copy-source-if-match copy_precondition.check(source_object_version, &source_version_meta.etag)?; + // Determine encryption parameters + let (source_encryption, _) = EncryptionParams::check_decrypt_for_copy_source( + &garage, + req.headers(), + &source_version_meta.encryption, + )?; + let dest_object_encryption = match dest_version.state { + ObjectVersionState::Uploading { encryption, .. } => encryption, + _ => unreachable!(), + }; + let (dest_encryption, _) = + EncryptionParams::check_decrypt(&garage, req.headers(), &dest_object_encryption)?; + let same_encryption = EncryptionParams::is_same(&source_encryption, &dest_encryption); + // Check source range is valid let source_range = match req.headers().get("x-amz-copy-source-range") { Some(range) => { @@ -227,21 +342,16 @@ pub async fn handle_upload_part_copy( }; // Check source version is not inlined - match source_version_data { - ObjectVersionData::DeleteMarker => unreachable!(), - ObjectVersionData::Inline(_meta, _bytes) => { - // This is only for small files, we don't bother handling this. - // (in AWS UploadPartCopy works for parts at least 5MB which - // is never the case of an inline object) - return Err(Error::bad_request( - "Source object is too small (minimum part size is 5Mb)", - )); - } - ObjectVersionData::FirstBlock(_meta, _first_block_hash) => (), - }; + if matches!(source_version_data, ObjectVersionData::Inline(_, _)) { + // This is only for small files, we don't bother handling this. + // (in AWS UploadPartCopy works for parts at least 5MB which + // is never the case of an inline object) + return Err(Error::bad_request( + "Source object is too small (minimum part size is 5Mb)", + )); + } - // Fetch source versin with its block list, - // and destination version to check part hasn't yet been uploaded + // Fetch source version with its block list let source_version = garage .version_table .get(&source_object_version.uuid, &EmptyKey) @@ -251,7 +361,9 @@ pub async fn handle_upload_part_copy( // We want to reuse blocks from the source version as much as possible. // However, we still need to get the data from these blocks // because we need to know it to calculate the MD5sum of the part - // which is used as its ETag. + // which is used as its ETag. For encrypted sources or destinations, + // we must always read(+decrypt) and then write(+encrypt), so we + // can never reuse data blocks as is. // First, calculate what blocks we want to keep, // and the subrange of the block to take, if the bounds of the @@ -313,6 +425,8 @@ pub async fn handle_upload_part_copy( }, false, ); + // write an empty version now to be the parent of the block_ref entries + garage.version_table.insert(&dest_version).await?; // Now, actually copy the blocks let mut md5hasher = Md5::new(); @@ -321,24 +435,44 @@ pub async fn handle_upload_part_copy( // and extract the subrange if necessary. // The second returned value is an Option, that is Some // if and only if the block returned is a block that already existed - // in the Garage data store (thus we don't need to save it again). + // in the Garage data store and can be reused as-is instead of having + // to save it again. This excludes encrypted source blocks that we had + // to decrypt. let garage2 = garage.clone(); let order_stream = OrderTag::stream(); let source_blocks = stream::iter(blocks_to_copy) .enumerate() - .flat_map(|(i, (block_hash, range_to_copy))| { + .map(|(i, (block_hash, range_to_copy))| { let garage3 = garage2.clone(); - stream::once(async move { - let data = garage3 - .block_manager - .rpc_get_block(&block_hash, Some(order_stream.order(i as u64))) + async move { + let stream = source_encryption + .get_block(&garage3, &block_hash, Some(order_stream.order(i as u64))) .await?; + let data = read_stream_to_end(stream).await?.into_bytes(); + // For each item, we return a tuple of: + // 1. the full data block (decrypted) + // 2. an Option that indicates the hash of the block in the block store, + // only if it can be re-used as-is in the copied object match range_to_copy { - Some(r) => Ok((data.slice(r), None)), - None => Ok((data, Some(block_hash))), + Some(r) => { + // If we are taking a subslice of the data, we cannot reuse the block as-is + Ok((data.slice(r), None)) + } + None if same_encryption => { + // If the data is unencrypted before & after, or if we are using + // the same encryption key, we can reuse the stored block, no need + // to re-send it to storage nodes. + Ok((data, Some(block_hash))) + } + None => { + // If we are decrypting / (re)encrypting with different keys, + // we cannot reuse the block as-is + Ok((data, None)) + } } - }) + } }) + .buffered(2) .peekable(); // The defragmenter is a custom stream (defined below) that concatenates @@ -346,22 +480,33 @@ pub async fn handle_upload_part_copy( // It returns a series of (Vec, Option). // When it is done, it returns an empty vec. // Same as the previous iterator, the Option is Some(_) if and only if - // it's an existing block of the Garage data store. + // it's an existing block of the Garage data store that can be reused. let mut defragmenter = Defragmenter::new(garage.config.block_size, Box::pin(source_blocks)); let mut current_offset = 0; let mut next_block = defragmenter.next().await?; + // TODO this could be optimized similarly to read_and_put_blocks + // low priority because uploadpartcopy is rarely used loop { let (data, existing_block_hash) = next_block; if data.is_empty() { break; } + let data_len = data.len() as u64; md5hasher.update(&data[..]); - let must_upload = existing_block_hash.is_none(); - let final_hash = existing_block_hash.unwrap_or_else(|| blake2sum(&data[..])); + let (final_data, must_upload, final_hash) = match existing_block_hash { + Some(hash) if same_encryption => (data, false, hash), + _ => tokio::task::spawn_blocking(move || { + let data_enc = dest_encryption.encrypt_block(data)?; + let hash = blake2sum(&data_enc); + Ok::<_, Error>((data_enc, true, hash)) + }) + .await + .unwrap()?, + }; dest_version.blocks.clear(); dest_version.blocks.put( @@ -371,10 +516,10 @@ pub async fn handle_upload_part_copy( }, VersionBlock { hash: final_hash, - size: data.len() as u64, + size: data_len, }, ); - current_offset += data.len() as u64; + current_offset += data_len; let block_ref = BlockRef { block: final_hash, @@ -382,36 +527,33 @@ pub async fn handle_upload_part_copy( deleted: false.into(), }; - let garage2 = garage.clone(); - let res = futures::try_join!( + let (_, _, _, next) = futures::try_join!( // Thing 1: if the block is not exactly a block that existed before, // we need to insert that data as a new block. - async move { + async { if must_upload { - garage2 + garage .block_manager - .rpc_put_block(final_hash, data, None) + .rpc_put_block(final_hash, final_data, dest_encryption.is_encrypted(), None) .await } else { Ok(()) } }, - async { - // Thing 2: we need to insert the block in the version - garage.version_table.insert(&dest_version).await?; - // Thing 3: we need to add a block reference - garage.block_ref_table.insert(&block_ref).await - }, - // Thing 4: we need to prefetch the next block + // Thing 2: we need to insert the block in the version + garage.version_table.insert(&dest_version), + // Thing 3: we need to add a block reference + garage.block_ref_table.insert(&block_ref), + // Thing 4: we need to read the next block defragmenter.next(), )?; - next_block = res.2; + next_block = next; } assert_eq!(current_offset, source_range.length); let data_md5sum = md5hasher.finalize(); - let etag = hex::encode(data_md5sum); + let etag = dest_encryption.etag_from_md5(&data_md5sum); // Put the part's ETag in the Versiontable dest_mpu.parts.put( @@ -431,13 +573,14 @@ pub async fn handle_upload_part_copy( last_modified: s3_xml::Value(msec_to_rfc3339(source_object_version.timestamp)), })?; - Ok(Response::builder() + let mut resp = Response::builder() .header("Content-Type", "application/xml") .header( "x-amz-copy-source-version-id", hex::encode(source_object_version.uuid), - ) - .body(string_body(resp_xml))?) + ); + dest_encryption.add_response_headers(&mut resp); + Ok(resp.body(string_body(resp_xml))?) } async fn get_copy_source(ctx: &ReqCtx, req: &Request) -> Result { diff --git a/src/api/s3/encryption.rs b/src/api/s3/encryption.rs new file mode 100644 index 00000000..2b105e90 --- /dev/null +++ b/src/api/s3/encryption.rs @@ -0,0 +1,595 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::pin::Pin; + +use aes_gcm::{ + aead::stream::{DecryptorLE31, EncryptorLE31, StreamLE31}, + aead::{Aead, AeadCore, KeyInit, OsRng}, + aes::cipher::crypto_common::rand_core::RngCore, + aes::cipher::typenum::Unsigned, + Aes256Gcm, Key, Nonce, +}; +use base64::prelude::*; +use bytes::Bytes; + +use futures::stream::Stream; +use futures::task; +use tokio::io::BufReader; + +use http::header::{HeaderMap, HeaderName, HeaderValue}; + +use garage_net::bytes_buf::BytesBuf; +use garage_net::stream::{stream_asyncread, ByteStream}; +use garage_rpc::rpc_helper::OrderTag; +use garage_util::data::Hash; +use garage_util::error::Error as GarageError; +use garage_util::migrate::Migrate; + +use garage_model::garage::Garage; +use garage_model::s3::object_table::{ObjectVersionEncryption, ObjectVersionHeaders}; + +use crate::common_error::*; +use crate::s3::error::Error; + +const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM: HeaderName = + HeaderName::from_static("x-amz-server-side-encryption-customer-algorithm"); +const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY: HeaderName = + HeaderName::from_static("x-amz-server-side-encryption-customer-key"); +const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5: HeaderName = + HeaderName::from_static("x-amz-server-side-encryption-customer-key-md5"); + +const X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM: HeaderName = + HeaderName::from_static("x-amz-copy-source-server-side-encryption-customer-algorithm"); +const X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY: HeaderName = + HeaderName::from_static("x-amz-copy-source-server-side-encryption-customer-key"); +const X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5: HeaderName = + HeaderName::from_static("x-amz-copy-source-server-side-encryption-customer-key-md5"); + +const CUSTOMER_ALGORITHM_AES256: &[u8] = b"AES256"; + +type Md5Output = md5::digest::Output; + +type StreamNonceSize = aes_gcm::aead::stream::NonceSize>; + +// Data blocks are encrypted by smaller chunks of size 4096 bytes, +// so that data can be streamed when reading. +// This size has to be known and has to be constant, or data won't be +// readable anymore. DO NOT CHANGE THIS VALUE. +const STREAM_ENC_PLAIN_CHUNK_SIZE: usize = 0x1000; // 4096 bytes +const STREAM_ENC_CYPER_CHUNK_SIZE: usize = STREAM_ENC_PLAIN_CHUNK_SIZE + 16; + +#[derive(Clone, Copy)] +pub enum EncryptionParams { + Plaintext, + SseC { + client_key: Key, + client_key_md5: Md5Output, + compression_level: Option, + }, +} + +impl EncryptionParams { + pub fn is_encrypted(&self) -> bool { + !matches!(self, Self::Plaintext) + } + + pub fn is_same(a: &Self, b: &Self) -> bool { + let relevant_info = |x: &Self| match x { + Self::Plaintext => None, + Self::SseC { + client_key, + compression_level, + .. + } => Some((*client_key, compression_level.is_some())), + }; + relevant_info(a) == relevant_info(b) + } + + pub fn new_from_headers( + garage: &Garage, + headers: &HeaderMap, + ) -> Result { + let key = parse_request_headers( + headers, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, + )?; + match key { + Some((client_key, client_key_md5)) => Ok(EncryptionParams::SseC { + client_key, + client_key_md5, + compression_level: garage.config.compression_level, + }), + None => Ok(EncryptionParams::Plaintext), + } + } + + pub fn add_response_headers(&self, resp: &mut http::response::Builder) { + if let Self::SseC { client_key_md5, .. } = self { + let md5 = BASE64_STANDARD.encode(&client_key_md5); + + resp.headers_mut().unwrap().insert( + X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, + HeaderValue::from_bytes(CUSTOMER_ALGORITHM_AES256).unwrap(), + ); + resp.headers_mut().unwrap().insert( + X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, + HeaderValue::from_bytes(md5.as_bytes()).unwrap(), + ); + } + } + + pub fn check_decrypt<'a>( + garage: &Garage, + headers: &HeaderMap, + obj_enc: &'a ObjectVersionEncryption, + ) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> { + let key = parse_request_headers( + headers, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, + )?; + Self::check_decrypt_common(garage, key, obj_enc) + } + + pub fn check_decrypt_for_copy_source<'a>( + garage: &Garage, + headers: &HeaderMap, + obj_enc: &'a ObjectVersionEncryption, + ) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> { + let key = parse_request_headers( + headers, + &X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, + &X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, + &X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, + )?; + Self::check_decrypt_common(garage, key, obj_enc) + } + + fn check_decrypt_common<'a>( + garage: &Garage, + key: Option<(Key, Md5Output)>, + obj_enc: &'a ObjectVersionEncryption, + ) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> { + match (key, &obj_enc) { + ( + Some((client_key, client_key_md5)), + ObjectVersionEncryption::SseC { + headers, + compressed, + }, + ) => { + let enc = Self::SseC { + client_key, + client_key_md5, + compression_level: if *compressed { + Some(garage.config.compression_level.unwrap_or(1)) + } else { + None + }, + }; + let plaintext = enc.decrypt_blob(&headers)?; + let headers = ObjectVersionHeaders::decode(&plaintext) + .ok_or_internal_error("Could not decode encrypted headers")?; + Ok((enc, Cow::Owned(headers))) + } + (None, ObjectVersionEncryption::Plaintext { headers }) => { + Ok((Self::Plaintext, Cow::Borrowed(headers))) + } + (_, ObjectVersionEncryption::SseC { .. }) => { + Err(Error::bad_request("Object is encrypted")) + } + (Some(_), _) => { + // TODO: should this be an OK scenario? + Err(Error::bad_request("Trying to decrypt a plaintext object")) + } + } + } + + pub fn encrypt_headers( + &self, + h: ObjectVersionHeaders, + ) -> Result { + match self { + Self::SseC { + compression_level, .. + } => { + let plaintext = h.encode().map_err(GarageError::from)?; + let ciphertext = self.encrypt_blob(&plaintext)?; + Ok(ObjectVersionEncryption::SseC { + headers: ciphertext.into_owned(), + compressed: compression_level.is_some(), + }) + } + Self::Plaintext => Ok(ObjectVersionEncryption::Plaintext { headers: h }), + } + } + + // ---- generating object Etag values ---- + pub fn etag_from_md5(&self, md5sum: &[u8]) -> String { + match self { + Self::Plaintext => hex::encode(md5sum), + Self::SseC { .. } => { + // AWS specifies that for encrypted objects, the Etag is not + // the md5sum of the data, but doesn't say what it is. + // So we just put some random bytes. + let mut random = [0u8; 16]; + OsRng.fill_bytes(&mut random); + hex::encode(&random) + } + } + } + + // ---- generic function for encrypting / decrypting blobs ---- + // Prepends a randomly-generated nonce to the encrypted value. + // This is used for encrypting object headers and inlined data for small objects. + // This does not compress anything. + + pub fn encrypt_blob<'a>(&self, blob: &'a [u8]) -> Result, Error> { + match self { + Self::SseC { client_key, .. } => { + let cipher = Aes256Gcm::new(&client_key); + let nonce = Aes256Gcm::generate_nonce(&mut OsRng); + let ciphertext = cipher + .encrypt(&nonce, blob) + .ok_or_internal_error("Encryption failed")?; + Ok(Cow::Owned([nonce.to_vec(), ciphertext].concat())) + } + Self::Plaintext => Ok(Cow::Borrowed(blob)), + } + } + + pub fn decrypt_blob<'a>(&self, blob: &'a [u8]) -> Result, Error> { + match self { + Self::SseC { client_key, .. } => { + let cipher = Aes256Gcm::new(&client_key); + let nonce_size = ::NonceSize::to_usize(); + let nonce = Nonce::from_slice( + blob.get(..nonce_size) + .ok_or_internal_error("invalid encrypted data")?, + ); + let plaintext = cipher + .decrypt(nonce, &blob[nonce_size..]) + .ok_or_bad_request( + "Invalid encryption key, could not decrypt object metadata.", + )?; + Ok(Cow::Owned(plaintext)) + } + Self::Plaintext => Ok(Cow::Borrowed(blob)), + } + } + + // ---- function for encrypting / decrypting byte streams ---- + + /// Get a data block from the storage node, and decrypt+decompress it + /// if necessary. If object is plaintext, just get it without any processing. + pub async fn get_block( + &self, + garage: &Garage, + hash: &Hash, + order: Option, + ) -> Result { + let raw_block = garage + .block_manager + .rpc_get_block_streaming(hash, order) + .await?; + Ok(self.decrypt_block_stream(raw_block)) + } + + pub fn decrypt_block_stream(&self, stream: ByteStream) -> ByteStream { + match self { + Self::Plaintext => stream, + Self::SseC { + client_key, + compression_level, + .. + } => { + let plaintext = DecryptStream::new(stream, *client_key); + if compression_level.is_some() { + let reader = stream_asyncread(Box::pin(plaintext)); + let reader = BufReader::new(reader); + let reader = async_compression::tokio::bufread::ZstdDecoder::new(reader); + Box::pin(tokio_util::io::ReaderStream::new(reader)) + } else { + Box::pin(plaintext) + } + } + } + } + + /// Encrypt a data block if encryption is set, for use before + /// putting the data blocks into storage + pub fn encrypt_block(&self, block: Bytes) -> Result { + match self { + Self::Plaintext => Ok(block), + Self::SseC { + client_key, + compression_level, + .. + } => { + let block = if let Some(level) = compression_level { + Cow::Owned( + garage_block::zstd_encode(block.as_ref(), *level) + .ok_or_internal_error("failed to compress data block")?, + ) + } else { + Cow::Borrowed(block.as_ref()) + }; + + let mut ret = Vec::with_capacity(block.len() + 32 + block.len() / 64); + + let mut nonce: Nonce = Default::default(); + OsRng.fill_bytes(&mut nonce); + ret.extend_from_slice(nonce.as_slice()); + + let mut cipher = EncryptorLE31::::new(&client_key, &nonce); + let mut iter = block.chunks(STREAM_ENC_PLAIN_CHUNK_SIZE).peekable(); + + if iter.peek().is_none() { + // Empty stream: we encrypt an empty last chunk + let chunk_enc = cipher + .encrypt_last(&[][..]) + .ok_or_internal_error("failed to encrypt chunk")?; + ret.extend_from_slice(&chunk_enc); + } else { + loop { + let chunk = iter.next().unwrap(); + if iter.peek().is_some() { + let chunk_enc = cipher + .encrypt_next(chunk) + .ok_or_internal_error("failed to encrypt chunk")?; + assert_eq!(chunk.len(), STREAM_ENC_PLAIN_CHUNK_SIZE); + assert_eq!(chunk_enc.len(), STREAM_ENC_CYPER_CHUNK_SIZE); + ret.extend_from_slice(&chunk_enc); + } else { + // use encrypt_last for the last chunk + let chunk_enc = cipher + .encrypt_last(chunk) + .ok_or_internal_error("failed to encrypt chunk")?; + ret.extend_from_slice(&chunk_enc); + break; + } + } + } + + Ok(ret.into()) + } + } + } +} + +fn parse_request_headers( + headers: &HeaderMap, + alg_header: &HeaderName, + key_header: &HeaderName, + md5_header: &HeaderName, +) -> Result, Md5Output)>, Error> { + let alg = headers.get(alg_header).map(HeaderValue::as_bytes); + let key = headers.get(key_header).map(HeaderValue::as_bytes); + let md5 = headers.get(md5_header).map(HeaderValue::as_bytes); + + match alg { + Some(CUSTOMER_ALGORITHM_AES256) => { + use md5::{Digest, Md5}; + + let key_b64 = + key.ok_or_bad_request("Missing server-side-encryption-customer-key header")?; + let key_bytes: [u8; 32] = BASE64_STANDARD + .decode(&key_b64) + .ok_or_bad_request( + "Invalid server-side-encryption-customer-key header: invalid base64", + )? + .try_into() + .ok() + .ok_or_bad_request( + "Invalid server-side-encryption-customer-key header: invalid length", + )?; + + let md5_b64 = + md5.ok_or_bad_request("Missing server-side-encryption-customer-key-md5 header")?; + let md5_bytes = BASE64_STANDARD.decode(&md5_b64).ok_or_bad_request( + "Invalid server-side-encryption-customer-key-md5 header: invalid bass64", + )?; + + let mut hasher = Md5::new(); + hasher.update(&key_bytes[..]); + let our_md5 = hasher.finalize(); + if our_md5.as_slice() != md5_bytes.as_slice() { + return Err(Error::bad_request( + "Server-side encryption client key MD5 checksum does not match", + )); + } + + Ok(Some((key_bytes.into(), our_md5))) + } + Some(alg) => Err(Error::InvalidEncryptionAlgorithm( + String::from_utf8_lossy(alg).into_owned(), + )), + None => { + if key.is_some() || md5.is_some() { + Err(Error::bad_request( + "Unexpected server-side-encryption-customer-key{,-md5} header(s)", + )) + } else { + Ok(None) + } + } + } +} + +// ---- encrypt & decrypt streams ---- + +#[pin_project::pin_project] +struct DecryptStream { + #[pin] + stream: ByteStream, + done_reading: bool, + buf: BytesBuf, + key: Key, + state: DecryptStreamState, +} + +enum DecryptStreamState { + Starting, + Running(DecryptorLE31), + Done, +} + +impl DecryptStream { + fn new(stream: ByteStream, key: Key) -> Self { + Self { + stream, + done_reading: false, + buf: BytesBuf::new(), + key, + state: DecryptStreamState::Starting, + } + } +} + +impl Stream for DecryptStream { + type Item = Result; + + fn poll_next( + self: Pin<&mut Self>, + cx: &mut task::Context<'_>, + ) -> task::Poll> { + use std::task::Poll; + + let mut this = self.project(); + + // The first bytes of the stream should contain the starting nonce. + // If we don't have a Running state, it means that we haven't + // yet read the nonce. + while matches!(this.state, DecryptStreamState::Starting) { + let nonce_size = StreamNonceSize::to_usize(); + if let Some(nonce) = this.buf.take_exact(nonce_size) { + let nonce = Nonce::from_slice(nonce.as_ref()); + *this.state = DecryptStreamState::Running(DecryptorLE31::new(&this.key, nonce)); + break; + } + + match futures::ready!(this.stream.as_mut().poll_next(cx)) { + Some(Ok(bytes)) => { + this.buf.extend(bytes); + } + Some(Err(e)) => { + return Poll::Ready(Some(Err(e))); + } + None => { + return Poll::Ready(Some(Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "Decrypt: unexpected EOF, could not read nonce", + )))); + } + } + } + + // Read at least one byte more than the encrypted chunk size + // (if possible), so that we know if we are decrypting the + // last chunk or not. + while !*this.done_reading && this.buf.len() <= STREAM_ENC_CYPER_CHUNK_SIZE { + match futures::ready!(this.stream.as_mut().poll_next(cx)) { + Some(Ok(bytes)) => { + this.buf.extend(bytes); + } + Some(Err(e)) => { + return Poll::Ready(Some(Err(e))); + } + None => { + *this.done_reading = true; + break; + } + } + } + + if matches!(this.state, DecryptStreamState::Done) { + if !this.buf.is_empty() { + return Poll::Ready(Some(Err(std::io::Error::new( + std::io::ErrorKind::Other, + "Decrypt: unexpected bytes after last encrypted chunk", + )))); + } + return Poll::Ready(None); + } + + let res = if this.buf.len() > STREAM_ENC_CYPER_CHUNK_SIZE { + // we have strictly more bytes than the encrypted chunk size, + // so we know this is not the last + let DecryptStreamState::Running(ref mut cipher) = this.state else { + unreachable!() + }; + let chunk = this.buf.take_exact(STREAM_ENC_CYPER_CHUNK_SIZE).unwrap(); + let chunk_dec = cipher.decrypt_next(chunk.as_ref()); + if let Ok(c) = &chunk_dec { + assert_eq!(c.len(), STREAM_ENC_PLAIN_CHUNK_SIZE); + } + chunk_dec + } else { + // We have one encrypted chunk size or less, even though we tried + // to read more, so this is the last chunk. Decrypt using the + // appropriate decrypt_last() function that then destroys the cipher. + let state = std::mem::replace(this.state, DecryptStreamState::Done); + let DecryptStreamState::Running(cipher) = state else { + unreachable!() + }; + let chunk = this.buf.take_all(); + cipher.decrypt_last(chunk.as_ref()) + }; + + match res { + Ok(bytes) if bytes.is_empty() => Poll::Ready(None), + Ok(bytes) => Poll::Ready(Some(Ok(bytes.into()))), + Err(_) => Poll::Ready(Some(Err(std::io::Error::new( + std::io::ErrorKind::Other, + "Decryption failed", + )))), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use futures::stream::StreamExt; + use garage_net::stream::read_stream_to_end; + + fn stream() -> ByteStream { + Box::pin( + futures::stream::iter(16usize..1024) + .map(|i| Ok(Bytes::from(vec![(i % 256) as u8; (i * 37) % 1024]))), + ) + } + + async fn test_block_enc(compression_level: Option) { + let enc = EncryptionParams::SseC { + client_key: Aes256Gcm::generate_key(&mut OsRng), + client_key_md5: Default::default(), // not needed + compression_level, + }; + + let block_plain = read_stream_to_end(stream()).await.unwrap().into_bytes(); + + let block_enc = enc.encrypt_block(block_plain.clone()).unwrap(); + + let block_dec = + enc.decrypt_block_stream(Box::pin(futures::stream::once(async { Ok(block_enc) }))); + let block_dec = read_stream_to_end(block_dec).await.unwrap().into_bytes(); + + assert_eq!(block_plain, block_dec); + assert!(block_dec.len() > 128000); + } + + #[tokio::test] + async fn test_encrypt_block() { + test_block_enc(None).await + } + + #[tokio::test] + async fn test_encrypt_block_compressed() { + test_block_enc(Some(1)).await + } +} diff --git a/src/api/s3/error.rs b/src/api/s3/error.rs index f86c19a6..5cb5d04e 100644 --- a/src/api/s3/error.rs +++ b/src/api/s3/error.rs @@ -65,6 +65,10 @@ pub enum Error { #[error(display = "Invalid HTTP range: {:?}", _0)] InvalidRange(#[error(from)] (http_range::HttpRangeParseError, u64)), + /// The client sent a range header with invalid value + #[error(display = "Invalid encryption algorithm: {:?}, should be AES256", _0)] + InvalidEncryptionAlgorithm(String), + /// The client sent a request for an action not supported by garage #[error(display = "Unimplemented action: {}", _0)] NotImplemented(String), @@ -126,6 +130,7 @@ impl Error { Error::InvalidXml(_) => "MalformedXML", Error::InvalidRange(_) => "InvalidRange", Error::InvalidUtf8Str(_) | Error::InvalidUtf8String(_) => "InvalidRequest", + Error::InvalidEncryptionAlgorithm(_) => "InvalidEncryptionAlgorithmError", } } } @@ -143,6 +148,7 @@ impl ApiError for Error { | Error::InvalidPart | Error::InvalidPartOrder | Error::EntityTooSmall + | Error::InvalidEncryptionAlgorithm(_) | Error::InvalidXml(_) | Error::InvalidUtf8Str(_) | Error::InvalidUtf8String(_) => StatusCode::BAD_REQUEST, diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index ed996fb1..1bca4671 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -3,8 +3,9 @@ use std::convert::TryInto; use std::sync::Arc; use std::time::{Duration, UNIX_EPOCH}; +use bytes::Bytes; use futures::future; -use futures::stream::{self, StreamExt}; +use futures::stream::{self, Stream, StreamExt}; use http::header::{ ACCEPT_RANGES, CACHE_CONTROL, CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_LENGTH, CONTENT_RANGE, CONTENT_TYPE, ETAG, EXPIRES, IF_MODIFIED_SINCE, IF_NONE_MATCH, @@ -25,6 +26,7 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::ResBody; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; const X_AMZ_MP_PARTS_COUNT: &str = "x-amz-mp-parts-count"; @@ -42,6 +44,8 @@ pub struct GetObjectOverrides { fn object_headers( version: &ObjectVersion, version_meta: &ObjectVersionMeta, + headers: &ObjectVersionHeaders, + encryption: EncryptionParams, ) -> http::response::Builder { debug!("Version meta: {:?}", version_meta); @@ -49,7 +53,7 @@ fn object_headers( let date_str = httpdate::fmt_http_date(date); let mut resp = Response::builder() - .header(CONTENT_TYPE, version_meta.headers.content_type.to_string()) + .header(CONTENT_TYPE, headers.content_type.to_string()) .header(LAST_MODIFIED, date_str) .header(ACCEPT_RANGES, "bytes".to_string()); @@ -57,10 +61,12 @@ fn object_headers( resp = resp.header(ETAG, format!("\"{}\"", version_meta.etag)); } - for (k, v) in version_meta.headers.other.iter() { + for (k, v) in headers.other.iter() { resp = resp.header(k, v.to_string()); } + encryption.add_response_headers(&mut resp); + resp } @@ -175,21 +181,27 @@ pub async fn handle_head_without_ctx( return Ok(cached); } + let (encryption, headers) = + EncryptionParams::check_decrypt(&garage, req.headers(), &version_meta.encryption)?; + if let Some(pn) = part_number { match version_data { - ObjectVersionData::Inline(_, bytes) => { + ObjectVersionData::Inline(_, _) => { if pn != 1 { return Err(Error::InvalidPart); } - Ok(object_headers(object_version, version_meta) - .header(CONTENT_LENGTH, format!("{}", bytes.len())) - .header( - CONTENT_RANGE, - format!("bytes 0-{}/{}", bytes.len() - 1, bytes.len()), - ) - .header(X_AMZ_MP_PARTS_COUNT, "1") - .status(StatusCode::PARTIAL_CONTENT) - .body(empty_body())?) + let bytes_len = version_meta.size; + Ok( + object_headers(object_version, version_meta, &headers, encryption) + .header(CONTENT_LENGTH, format!("{}", bytes_len)) + .header( + CONTENT_RANGE, + format!("bytes 0-{}/{}", bytes_len - 1, bytes_len), + ) + .header(X_AMZ_MP_PARTS_COUNT, "1") + .status(StatusCode::PARTIAL_CONTENT) + .body(empty_body())?, + ) } ObjectVersionData::FirstBlock(_, _) => { let version = garage @@ -201,28 +213,32 @@ pub async fn handle_head_without_ctx( let (part_offset, part_end) = calculate_part_bounds(&version, pn).ok_or(Error::InvalidPart)?; - Ok(object_headers(object_version, version_meta) - .header(CONTENT_LENGTH, format!("{}", part_end - part_offset)) - .header( - CONTENT_RANGE, - format!( - "bytes {}-{}/{}", - part_offset, - part_end - 1, - version_meta.size - ), - ) - .header(X_AMZ_MP_PARTS_COUNT, format!("{}", version.n_parts()?)) - .status(StatusCode::PARTIAL_CONTENT) - .body(empty_body())?) + Ok( + object_headers(object_version, version_meta, &headers, encryption) + .header(CONTENT_LENGTH, format!("{}", part_end - part_offset)) + .header( + CONTENT_RANGE, + format!( + "bytes {}-{}/{}", + part_offset, + part_end - 1, + version_meta.size + ), + ) + .header(X_AMZ_MP_PARTS_COUNT, format!("{}", version.n_parts()?)) + .status(StatusCode::PARTIAL_CONTENT) + .body(empty_body())?, + ) } _ => unreachable!(), } } else { - Ok(object_headers(object_version, version_meta) - .header(CONTENT_LENGTH, format!("{}", version_meta.size)) - .status(StatusCode::OK) - .body(empty_body())?) + Ok( + object_headers(object_version, version_meta, &headers, encryption) + .header(CONTENT_LENGTH, format!("{}", version_meta.size)) + .status(StatusCode::OK) + .body(empty_body())?, + ) } } @@ -273,23 +289,41 @@ pub async fn handle_get_without_ctx( return Ok(cached); } + let (enc, headers) = + EncryptionParams::check_decrypt(&garage, req.headers(), &last_v_meta.encryption)?; + match (part_number, parse_range_header(req, last_v_meta.size)?) { (Some(_), Some(_)) => Err(Error::bad_request( "Cannot specify both partNumber and Range header", )), - (Some(pn), None) => handle_get_part(garage, last_v, last_v_data, last_v_meta, pn).await, + (Some(pn), None) => { + handle_get_part(garage, last_v, last_v_data, last_v_meta, enc, &headers, pn).await + } (None, Some(range)) => { handle_get_range( garage, last_v, last_v_data, last_v_meta, + enc, + &headers, range.start, range.start + range.length, ) .await } - (None, None) => handle_get_full(garage, last_v, last_v_data, last_v_meta, overrides).await, + (None, None) => { + handle_get_full( + garage, + last_v, + last_v_data, + last_v_meta, + enc, + &headers, + overrides, + ) + .await + } } } @@ -298,17 +332,36 @@ async fn handle_get_full( version: &ObjectVersion, version_data: &ObjectVersionData, version_meta: &ObjectVersionMeta, + encryption: EncryptionParams, + headers: &ObjectVersionHeaders, overrides: GetObjectOverrides, ) -> Result, Error> { - let mut resp_builder = object_headers(version, version_meta) + let mut resp_builder = object_headers(version, version_meta, &headers, encryption) .header(CONTENT_LENGTH, format!("{}", version_meta.size)) .status(StatusCode::OK); getobject_override_headers(overrides, &mut resp_builder)?; + let stream = full_object_byte_stream(garage, version, version_data, encryption); + + Ok(resp_builder.body(response_body_from_stream(stream))?) +} + +pub fn full_object_byte_stream( + garage: Arc, + version: &ObjectVersion, + version_data: &ObjectVersionData, + encryption: EncryptionParams, +) -> ByteStream { match &version_data { ObjectVersionData::DeleteMarker => unreachable!(), ObjectVersionData::Inline(_, bytes) => { - Ok(resp_builder.body(bytes_body(bytes.to_vec().into()))?) + let bytes = bytes.to_vec(); + Box::pin(futures::stream::once(async move { + encryption + .decrypt_blob(&bytes) + .map(|x| Bytes::from(x.to_vec())) + .map_err(std_error_from_read_error) + })) } ObjectVersionData::FirstBlock(_, first_block_hash) => { let (tx, rx) = mpsc::channel::(2); @@ -324,19 +377,18 @@ async fn handle_get_full( garage2.version_table.get(&version_uuid, &EmptyKey).await }); - let stream_block_0 = garage - .block_manager - .rpc_get_block_streaming(&first_block_hash, Some(order_stream.order(0))) + let stream_block_0 = encryption + .get_block(&garage, &first_block_hash, Some(order_stream.order(0))) .await?; + tx.send(stream_block_0) .await .ok_or_message("channel closed")?; let version = version_fut.await.unwrap()?.ok_or(Error::NoSuchKey)?; for (i, (_, vb)) in version.blocks.items().iter().enumerate().skip(1) { - let stream_block_i = garage - .block_manager - .rpc_get_block_streaming(&vb.hash, Some(order_stream.order(i as u64))) + let stream_block_i = encryption + .get_block(&garage, &vb.hash, Some(order_stream.order(i as u64))) .await?; tx.send(stream_block_i) .await @@ -354,8 +406,7 @@ async fn handle_get_full( } }); - let body = response_body_from_block_stream(rx); - Ok(resp_builder.body(body)?) + Box::pin(tokio_stream::wrappers::ReceiverStream::new(rx).flatten()) } } } @@ -365,13 +416,15 @@ async fn handle_get_range( version: &ObjectVersion, version_data: &ObjectVersionData, version_meta: &ObjectVersionMeta, + encryption: EncryptionParams, + headers: &ObjectVersionHeaders, begin: u64, end: u64, ) -> Result, Error> { // Here we do not use getobject_override_headers because we don't // want to add any overridden headers (those should not be added // when returning PARTIAL_CONTENT) - let resp_builder = object_headers(version, version_meta) + let resp_builder = object_headers(version, version_meta, headers, encryption) .header(CONTENT_LENGTH, format!("{}", end - begin)) .header( CONTENT_RANGE, @@ -382,6 +435,7 @@ async fn handle_get_range( match &version_data { ObjectVersionData::DeleteMarker => unreachable!(), ObjectVersionData::Inline(_meta, bytes) => { + let bytes = encryption.decrypt_blob(&bytes)?; if end as usize <= bytes.len() { let body = bytes_body(bytes[begin as usize..end as usize].to_vec().into()); Ok(resp_builder.body(body)?) @@ -398,7 +452,8 @@ async fn handle_get_range( .await? .ok_or(Error::NoSuchKey)?; - let body = body_from_blocks_range(garage, version.blocks.items(), begin, end); + let body = + body_from_blocks_range(garage, encryption, version.blocks.items(), begin, end); Ok(resp_builder.body(body)?) } } @@ -409,17 +464,21 @@ async fn handle_get_part( object_version: &ObjectVersion, version_data: &ObjectVersionData, version_meta: &ObjectVersionMeta, + encryption: EncryptionParams, + headers: &ObjectVersionHeaders, part_number: u64, ) -> Result, Error> { // Same as for get_range, no getobject_override_headers - let resp_builder = - object_headers(object_version, version_meta).status(StatusCode::PARTIAL_CONTENT); + let resp_builder = object_headers(object_version, version_meta, headers, encryption) + .status(StatusCode::PARTIAL_CONTENT); match version_data { ObjectVersionData::Inline(_, bytes) => { if part_number != 1 { return Err(Error::InvalidPart); } + let bytes = encryption.decrypt_blob(&bytes)?; + assert_eq!(bytes.len() as u64, version_meta.size); Ok(resp_builder .header(CONTENT_LENGTH, format!("{}", bytes.len())) .header( @@ -427,7 +486,7 @@ async fn handle_get_part( format!("bytes {}-{}/{}", 0, bytes.len() - 1, bytes.len()), ) .header(X_AMZ_MP_PARTS_COUNT, "1") - .body(bytes_body(bytes.to_vec().into()))?) + .body(bytes_body(bytes.into_owned().into()))?) } ObjectVersionData::FirstBlock(_, _) => { let version = garage @@ -439,7 +498,8 @@ async fn handle_get_part( let (begin, end) = calculate_part_bounds(&version, part_number).ok_or(Error::InvalidPart)?; - let body = body_from_blocks_range(garage, version.blocks.items(), begin, end); + let body = + body_from_blocks_range(garage, encryption, version.blocks.items(), begin, end); Ok(resp_builder .header(CONTENT_LENGTH, format!("{}", end - begin)) @@ -494,6 +554,7 @@ fn calculate_part_bounds(v: &Version, part_number: u64) -> Option<(u64, u64)> { fn body_from_blocks_range( garage: Arc, + encryption: EncryptionParams, all_blocks: &[(VersionBlockKey, VersionBlock)], begin: u64, end: u64, @@ -523,12 +584,11 @@ fn body_from_blocks_range( tokio::spawn(async move { match async { - let garage = garage.clone(); for (i, (block, block_offset)) in blocks.iter().enumerate() { - let block_stream = garage - .block_manager - .rpc_get_block_streaming(&block.hash, Some(order_stream.order(i as u64))) - .await? + let block_stream = encryption + .get_block(&garage, &block.hash, Some(order_stream.order(i as u64))) + .await?; + let block_stream = block_stream .scan(*block_offset, move |chunk_offset, chunk| { let r = match chunk { Ok(chunk_bytes) => { @@ -588,19 +648,30 @@ fn body_from_blocks_range( } fn response_body_from_block_stream(rx: mpsc::Receiver) -> ResBody { - let body_stream = tokio_stream::wrappers::ReceiverStream::new(rx) - .flatten() - .map(|x| { - x.map(hyper::body::Frame::data) - .map_err(|e| Error::from(garage_util::error::Error::from(e))) - }); + let body_stream = tokio_stream::wrappers::ReceiverStream::new(rx).flatten(); + response_body_from_stream(body_stream) +} + +fn response_body_from_stream(stream: S) -> ResBody +where + S: Stream> + Send + Sync + 'static, +{ + let body_stream = stream.map(|x| { + x.map(hyper::body::Frame::data) + .map_err(|e| Error::from(garage_util::error::Error::from(e))) + }); ResBody::new(http_body_util::StreamBody::new(body_stream)) } fn error_stream_item(e: E) -> ByteStream { - let err = std::io::Error::new( + Box::pin(stream::once(future::ready(Err(std_error_from_read_error( + e, + ))))) +} + +fn std_error_from_read_error(e: E) -> std::io::Error { + std::io::Error::new( std::io::ErrorKind::Other, - format!("Error while getting object data: {}", e), - ); - Box::pin(stream::once(future::ready(Err(err)))) + format!("Error while reading object data: {}", e), + ) } diff --git a/src/api/s3/list.rs b/src/api/s3/list.rs index 302c03f4..a7eebbb1 100644 --- a/src/api/s3/list.rs +++ b/src/api/s3/list.rs @@ -944,9 +944,11 @@ mod tests { timestamp: TS, state: ObjectVersionState::Uploading { multipart: true, - headers: ObjectVersionHeaders { - content_type: "text/plain".to_string(), - other: BTreeMap::::new(), + encryption: ObjectVersionEncryption::Plaintext { + headers: ObjectVersionHeaders { + content_type: "text/plain".to_string(), + other: BTreeMap::::new(), + }, }, }, } diff --git a/src/api/s3/mod.rs b/src/api/s3/mod.rs index cbdb94ab..1eb95d40 100644 --- a/src/api/s3/mod.rs +++ b/src/api/s3/mod.rs @@ -13,5 +13,6 @@ mod post_object; mod put; mod website; +mod encryption; mod router; pub mod xml; diff --git a/src/api/s3/multipart.rs b/src/api/s3/multipart.rs index 1d5aeb26..fcc5769f 100644 --- a/src/api/s3/multipart.rs +++ b/src/api/s3/multipart.rs @@ -16,6 +16,7 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; use crate::s3::put::*; use crate::s3::xml as s3_xml; @@ -41,13 +42,17 @@ pub async fn handle_create_multipart_upload( let headers = get_headers(req.headers())?; + // Determine whether object should be encrypted, and if so the key + let encryption = EncryptionParams::new_from_headers(&garage, req.headers())?; + let object_encryption = encryption.encrypt_headers(headers)?; + // Create object in object table let object_version = ObjectVersion { uuid: upload_id, timestamp, state: ObjectVersionState::Uploading { multipart: true, - headers, + encryption: object_encryption, }, }; let object = Object::new(*bucket_id, key.to_string(), vec![object_version]); @@ -68,7 +73,9 @@ pub async fn handle_create_multipart_upload( }; let xml = s3_xml::to_xml_with_header(&result)?; - Ok(Response::new(string_body(xml))) + let mut resp = Response::builder(); + encryption.add_response_headers(&mut resp); + Ok(resp.body(string_body(xml))?) } pub async fn handle_put_part( @@ -91,12 +98,21 @@ pub async fn handle_put_part( // Read first chuck, and at the same time try to get object to see if it exists let key = key.to_string(); - let stream = body_stream(req.into_body()); + let (req_head, req_body) = req.into_parts(); + let stream = body_stream(req_body); let mut chunker = StreamChunker::new(stream, garage.config.block_size); - let ((_, _, mut mpu), first_block) = + let ((_, object_version, mut mpu), first_block) = futures::try_join!(get_upload(&ctx, &key, &upload_id), chunker.next(),)?; + // Check encryption params + let object_encryption = match object_version.state { + ObjectVersionState::Uploading { encryption, .. } => encryption, + _ => unreachable!(), + }; + let (encryption, _) = + EncryptionParams::check_decrypt(&garage, &req_head.headers, &object_encryption)?; + // Check object is valid and part can be accepted let first_block = first_block.ok_or_bad_request("Empty body")?; @@ -136,24 +152,32 @@ pub async fn handle_put_part( garage.version_table.insert(&version).await?; // Copy data to version - let (total_size, data_md5sum, data_sha256sum, _) = - read_and_put_blocks(&ctx, &version, part_number, first_block, &mut chunker).await?; + let (total_size, data_md5sum, data_sha256sum, _) = read_and_put_blocks( + &ctx, + &version, + encryption, + part_number, + first_block, + &mut chunker, + ) + .await?; // Verify that checksums map ensure_checksum_matches( - data_md5sum.as_slice(), + &data_md5sum, data_sha256sum, content_md5.as_deref(), content_sha256, )?; // Store part etag in version - let data_md5sum_hex = hex::encode(data_md5sum); + let etag = encryption.etag_from_md5(&data_md5sum); + mpu.parts.put( mpu_part_key, MpuPart { version: version_uuid, - etag: Some(data_md5sum_hex.clone()), + etag: Some(etag.clone()), size: Some(total_size), }, ); @@ -163,11 +187,9 @@ pub async fn handle_put_part( // We won't have to clean up on drop. interrupted_cleanup.cancel(); - let response = Response::builder() - .header("ETag", format!("\"{}\"", data_md5sum_hex)) - .body(empty_body()) - .unwrap(); - Ok(response) + let mut resp = Response::builder().header("ETag", format!("\"{}\"", etag)); + encryption.add_response_headers(&mut resp); + Ok(resp.body(empty_body())?) } struct InterruptedCleanup(Option); @@ -241,8 +263,8 @@ pub async fn handle_complete_multipart_upload( return Err(Error::bad_request("No data was uploaded")); } - let headers = match object_version.state { - ObjectVersionState::Uploading { headers, .. } => headers, + let object_encryption = match object_version.state { + ObjectVersionState::Uploading { encryption, .. } => encryption, _ => unreachable!(), }; @@ -344,7 +366,7 @@ pub async fn handle_complete_multipart_upload( // Write final object version object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( ObjectVersionMeta { - headers, + encryption: object_encryption, size: total_size, etag: etag.clone(), }, diff --git a/src/api/s3/post_object.rs b/src/api/s3/post_object.rs index 66f8174c..7c4219a7 100644 --- a/src/api/s3/post_object.rs +++ b/src/api/s3/post_object.rs @@ -18,6 +18,7 @@ use garage_model::garage::Garage; use crate::helpers::*; use crate::s3::api_server::ResBody; use crate::s3::cors::*; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; use crate::s3::put::{get_headers, save_stream}; use crate::s3::xml as s3_xml; @@ -48,13 +49,17 @@ pub async fn handle_post_object( let mut multipart = Multipart::with_constraints(stream, boundary, constraints); let mut params = HeaderMap::new(); - let field = loop { + let file_field = loop { let field = if let Some(field) = multipart.next_field().await? { field } else { return Err(Error::bad_request("Request did not contain a file")); }; - let name: HeaderName = if let Some(Ok(name)) = field.name().map(TryInto::try_into) { + let name: HeaderName = if let Some(Ok(name)) = field + .name() + .map(str::to_ascii_lowercase) + .map(TryInto::try_into) + { name } else { continue; @@ -93,10 +98,14 @@ pub async fn handle_post_object( .ok_or_bad_request("No policy was provided")? .to_str()?; let authorization = Authorization::parse_form(¶ms)?; + let content_md5 = params + .get("content-md5") + .map(HeaderValue::to_str) + .transpose()?; let key = if key.contains("${filename}") { // if no filename is provided, don't replace. This matches the behavior of AWS. - if let Some(filename) = field.file_name() { + if let Some(filename) = file_field.file_name() { key.replace("${filename}", filename) } else { key.to_owned() @@ -143,9 +152,8 @@ pub async fn handle_post_object( let mut conditions = decoded_policy.into_conditions()?; for (param_key, value) in params.iter() { - let mut param_key = param_key.to_string(); - param_key.make_ascii_lowercase(); - match param_key.as_str() { + let param_key = param_key.as_str(); + match param_key { "policy" | "x-amz-signature" => (), // this is always accepted, as it's required to validate other fields "content-type" => { let conds = conditions.params.remove("content-type").ok_or_else(|| { @@ -190,7 +198,7 @@ pub async fn handle_post_object( // how aws seems to behave. continue; } - let conds = conditions.params.remove(¶m_key).ok_or_else(|| { + let conds = conditions.params.remove(param_key).ok_or_else(|| { Error::bad_request(format!("Key '{}' is not allowed in policy", param_key)) })?; for cond in conds { @@ -218,8 +226,9 @@ pub async fn handle_post_object( let headers = get_headers(¶ms)?; - let stream = field.map(|r| r.map_err(Into::into)); + let encryption = EncryptionParams::new_from_headers(&garage, ¶ms)?; + let stream = file_field.map(|r| r.map_err(Into::into)); let ctx = ReqCtx { garage, bucket_id, @@ -228,17 +237,18 @@ pub async fn handle_post_object( api_key, }; - let (_, md5) = save_stream( + let res = save_stream( &ctx, headers, + encryption, StreamLimiter::new(stream, conditions.content_length), &key, - None, + content_md5.map(str::to_string), None, ) .await?; - let etag = format!("\"{}\"", md5); + let etag = format!("\"{}\"", res.etag); let mut resp = if let Some(mut target) = params .get("success_action_redirect") @@ -252,11 +262,12 @@ pub async fn handle_post_object( .append_pair("key", &key) .append_pair("etag", &etag); let target = target.to_string(); - Response::builder() + let mut resp = Response::builder() .status(StatusCode::SEE_OTHER) .header(header::LOCATION, target.clone()) - .header(header::ETAG, etag) - .body(string_body(target))? + .header(header::ETAG, etag); + encryption.add_response_headers(&mut resp); + resp.body(string_body(target))? } else { let path = head .uri @@ -283,9 +294,10 @@ pub async fn handle_post_object( .get("success_action_status") .and_then(|h| h.to_str().ok()) .unwrap_or("204"); - let builder = Response::builder() + let mut builder = Response::builder() .header(header::LOCATION, location.clone()) .header(header::ETAG, etag.clone()); + encryption.add_response_headers(&mut builder); match action { "200" => builder.status(StatusCode::OK).body(empty_body())?, "201" => { diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index 2ced0580..745c2219 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -36,10 +36,18 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; const PUT_BLOCKS_MAX_PARALLEL: usize = 3; +pub struct SaveStreamResult { + pub version_uuid: Uuid, + pub version_timestamp: u64, + /// Etag WITHOUT THE QUOTES (just the hex value) + pub etag: String, +} + pub async fn handle_put( ctx: ReqCtx, req: Request, @@ -50,6 +58,9 @@ pub async fn handle_put( let headers = get_headers(req.headers())?; debug!("Object headers: {:?}", headers); + // Determine whether object should be encrypted, and if so the key + let encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?; + let content_md5 = match req.headers().get("content-md5") { Some(x) => Some(x.to_str()?.to_string()), None => None, @@ -57,19 +68,33 @@ pub async fn handle_put( let stream = body_stream(req.into_body()); - save_stream(&ctx, headers, stream, key, content_md5, content_sha256) - .await - .map(|(uuid, md5)| put_response(uuid, md5)) + let res = save_stream( + &ctx, + headers, + encryption, + stream, + key, + content_md5, + content_sha256, + ) + .await?; + + let mut resp = Response::builder() + .header("x-amz-version-id", hex::encode(res.version_uuid)) + .header("ETag", format!("\"{}\"", res.etag)); + encryption.add_response_headers(&mut resp); + Ok(resp.body(empty_body())?) } pub(crate) async fn save_stream> + Unpin>( ctx: &ReqCtx, headers: ObjectVersionHeaders, + encryption: EncryptionParams, body: S, key: &String, content_md5: Option, content_sha256: Option, -) -> Result<(Uuid, String), Error> { +) -> Result { let ReqCtx { garage, bucket_id, .. } = ctx; @@ -82,6 +107,8 @@ pub(crate) async fn save_stream> + Unpin>( let first_block = first_block_opt.unwrap_or_default(); + let object_encryption = encryption.encrypt_headers(headers)?; + // Generate identity of new version let version_uuid = gen_uuid(); let version_timestamp = next_timestamp(existing_object.as_ref()); @@ -92,37 +119,43 @@ pub(crate) async fn save_stream> + Unpin>( let mut md5sum = Md5::new(); md5sum.update(&first_block[..]); let data_md5sum = md5sum.finalize(); - let data_md5sum_hex = hex::encode(data_md5sum); let data_sha256sum = sha256sum(&first_block[..]); - let size = first_block.len() as u64; ensure_checksum_matches( - data_md5sum.as_slice(), + &data_md5sum, data_sha256sum, content_md5.as_deref(), content_sha256, )?; + let size = first_block.len() as u64; check_quotas(ctx, size, existing_object.as_ref()).await?; + let etag = encryption.etag_from_md5(&data_md5sum); + let inline_data = encryption.encrypt_blob(&first_block)?.to_vec(); + let object_version = ObjectVersion { uuid: version_uuid, timestamp: version_timestamp, state: ObjectVersionState::Complete(ObjectVersionData::Inline( ObjectVersionMeta { - headers, + encryption: object_encryption, size, - etag: data_md5sum_hex.clone(), + etag: etag.clone(), }, - first_block.to_vec(), + inline_data, )), }; let object = Object::new(*bucket_id, key.into(), vec![object_version]); garage.object_table.insert(&object).await?; - return Ok((version_uuid, data_md5sum_hex)); + return Ok(SaveStreamResult { + version_uuid, + version_timestamp, + etag, + }); } // The following consists in many steps that can each fail. @@ -142,7 +175,7 @@ pub(crate) async fn save_stream> + Unpin>( uuid: version_uuid, timestamp: version_timestamp, state: ObjectVersionState::Uploading { - headers: headers.clone(), + encryption: object_encryption.clone(), multipart: false, }, }; @@ -165,10 +198,10 @@ pub(crate) async fn save_stream> + Unpin>( // Transfer data and verify checksum let (total_size, data_md5sum, data_sha256sum, first_block_hash) = - read_and_put_blocks(ctx, &version, 1, first_block, &mut chunker).await?; + read_and_put_blocks(ctx, &version, encryption, 1, first_block, &mut chunker).await?; ensure_checksum_matches( - data_md5sum.as_slice(), + &data_md5sum, data_sha256sum, content_md5.as_deref(), content_sha256, @@ -177,12 +210,13 @@ pub(crate) async fn save_stream> + Unpin>( check_quotas(ctx, total_size, existing_object.as_ref()).await?; // Save final object state, marked as Complete - let md5sum_hex = hex::encode(data_md5sum); + let etag = encryption.etag_from_md5(&data_md5sum); + object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( ObjectVersionMeta { - headers, + encryption: object_encryption, size: total_size, - etag: md5sum_hex.clone(), + etag: etag.clone(), }, first_block_hash, )); @@ -193,7 +227,11 @@ pub(crate) async fn save_stream> + Unpin>( // We won't have to clean up on drop. interrupted_cleanup.cancel(); - Ok((version_uuid, md5sum_hex)) + Ok(SaveStreamResult { + version_uuid, + version_timestamp, + etag, + }) } /// Validate MD5 sum against content-md5 header @@ -290,6 +328,7 @@ pub(crate) async fn check_quotas( pub(crate) async fn read_and_put_blocks> + Unpin>( ctx: &ReqCtx, version: &Version, + encryption: EncryptionParams, part_number: u64, first_block: Bytes, chunker: &mut StreamChunker, @@ -349,12 +388,31 @@ pub(crate) async fn read_and_put_blocks> + )) }; - let (block_tx3, mut block_rx3) = mpsc::channel::>(1); - let hash_blocks = async { + let (block_tx3, mut block_rx3) = mpsc::channel::>(1); + let encrypt_hash_blocks = async { let mut first_block_hash = None; while let Some(next) = block_rx2.recv().await { match next { Ok(block) => { + let unencrypted_len = block.len() as u64; + let block = if encryption.is_encrypted() { + let res = + tokio::task::spawn_blocking(move || encryption.encrypt_block(block)) + .with_context(Context::current_with_span( + tracer.start("Encrypt block"), + )) + .await + .unwrap(); + match res { + Ok(b) => b, + Err(e) => { + block_tx3.send(Err(e)).await?; + break; + } + } + } else { + block + }; let hash = async_blake2sum(block.clone()) .with_context(Context::current_with_span( tracer.start("Hash block (blake2)"), @@ -363,7 +421,7 @@ pub(crate) async fn read_and_put_blocks> + if first_block_hash.is_none() { first_block_hash = Some(hash); } - block_tx3.send(Ok((block, hash))).await?; + block_tx3.send(Ok((block, unencrypted_len, hash))).await?; } Err(e) => { block_tx3.send(Err(e)).await?; @@ -398,7 +456,7 @@ pub(crate) async fn read_and_put_blocks> + block_rx3.recv().await } }; - let (block, hash) = tokio::select! { + let (block, unencrypted_len, hash) = tokio::select! { result = write_futs_next => { result?; continue; @@ -410,17 +468,18 @@ pub(crate) async fn read_and_put_blocks> + }; // For next block to be written: count its size and spawn future to write it - let offset = written_bytes; - written_bytes += block.len() as u64; write_futs.push_back(put_block_and_meta( ctx, version, part_number, - offset, + written_bytes, hash, block, + unencrypted_len, + encryption.is_encrypted(), order_stream.order(written_bytes), )); + written_bytes += unencrypted_len; } while let Some(res) = write_futs.next().await { res?; @@ -429,7 +488,7 @@ pub(crate) async fn read_and_put_blocks> + }; let (_, stream_hash_result, block_hash_result, final_result) = - futures::join!(read_blocks, hash_stream, hash_blocks, put_blocks); + futures::join!(read_blocks, hash_stream, encrypt_hash_blocks, put_blocks); let total_size = final_result?; // unwrap here is ok, because if hasher failed, it is because something failed @@ -449,6 +508,8 @@ async fn put_block_and_meta( offset: u64, hash: Hash, block: Bytes, + size: u64, + is_encrypted: bool, order_tag: OrderTag, ) -> Result<(), GarageError> { let ReqCtx { garage, .. } = ctx; @@ -459,10 +520,7 @@ async fn put_block_and_meta( part_number, offset, }, - VersionBlock { - hash, - size: block.len() as u64, - }, + VersionBlock { hash, size }, ); let block_ref = BlockRef { @@ -474,7 +532,7 @@ async fn put_block_and_meta( futures::try_join!( garage .block_manager - .rpc_put_block(hash, block, Some(order_tag)), + .rpc_put_block(hash, block, is_encrypted, Some(order_tag)), garage.version_table.insert(&version), garage.block_ref_table.insert(&block_ref), )?; @@ -517,14 +575,6 @@ impl> + Unpin> StreamChunker { } } -pub fn put_response(version_uuid: Uuid, md5sum_hex: String) -> Response { - Response::builder() - .header("x-amz-version-id", hex::encode(version_uuid)) - .header("ETag", format!("\"{}\"", md5sum_hex)) - .body(empty_body()) - .unwrap() -} - struct InterruptedCleanup(Option); struct InterruptedCleanupInner { garage: Arc, diff --git a/src/block/block.rs b/src/block/block.rs index 504d11f8..bd95680e 100644 --- a/src/block/block.rs +++ b/src/block/block.rs @@ -96,7 +96,7 @@ impl DataBlock { } } -fn zstd_encode(mut source: R, level: i32) -> std::io::Result> { +pub fn zstd_encode(mut source: R, level: i32) -> std::io::Result> { let mut result = Vec::::new(); let mut encoder = Encoder::new(&mut result, level)?; encoder.include_checksum(true)?; diff --git a/src/block/lib.rs b/src/block/lib.rs index c9ff2845..6c4711ef 100644 --- a/src/block/lib.rs +++ b/src/block/lib.rs @@ -9,3 +9,5 @@ mod block; mod layout; mod metrics; mod rc; + +pub use block::zstd_encode; diff --git a/src/block/manager.rs b/src/block/manager.rs index f4d8ee56..c7e4df17 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -337,26 +337,18 @@ impl BlockManager { } } - /// Ask nodes that might have a block for it, return it as one big Bytes - pub async fn rpc_get_block( - &self, - hash: &Hash, - order_tag: Option, - ) -> Result { - let stream = self.rpc_get_block_streaming(hash, order_tag).await?; - Ok(read_stream_to_end(stream).await?.into_bytes()) - } - /// Send block to nodes that should have it pub async fn rpc_put_block( &self, hash: Hash, data: Bytes, + prevent_compression: bool, order_tag: Option, ) -> Result<(), Error> { let who = self.replication.write_sets(&hash); - let (header, bytes) = DataBlock::from_buffer(data, self.compression_level) + let compression_level = self.compression_level.filter(|_| !prevent_compression); + let (header, bytes) = DataBlock::from_buffer(data, compression_level) .await .into_parts(); let put_block_rpc = diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index ebea04bd..7fa4b9e0 100644 --- a/src/model/s3/object_table.rs +++ b/src/model/s3/object_table.rs @@ -210,7 +210,165 @@ mod v09 { } } -pub use v09::*; +mod v010 { + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + + use super::v09; + + pub use v09::ObjectVersionHeaders; + + /// An object + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct Object { + /// The bucket in which the object is stored, used as partition key + pub bucket_id: Uuid, + + /// The key at which the object is stored in its bucket, used as sorting key + pub key: String, + + /// The list of currenty stored versions of the object + pub(super) versions: Vec, + } + + /// Informations about a version of an object + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct ObjectVersion { + /// Id of the version + pub uuid: Uuid, + /// Timestamp of when the object was created + pub timestamp: u64, + /// State of the version + pub state: ObjectVersionState, + } + + /// State of an object version + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub enum ObjectVersionState { + /// The version is being received + Uploading { + /// Indicates whether this is a multipart upload + multipart: bool, + /// Encryption params + headers to be included in the final object + encryption: ObjectVersionEncryption, + }, + /// The version is fully received + Complete(ObjectVersionData), + /// The version uploaded containded errors or the upload was explicitly aborted + Aborted, + } + + /// Data stored in object version + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub enum ObjectVersionData { + /// The object was deleted, this Version is a tombstone to mark it as such + DeleteMarker, + /// The object is short, it's stored inlined. + /// It is never compressed. For encrypted objects, it is encrypted using + /// AES256-GCM, like the encrypted headers. + Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec), + /// The object is not short, Hash of first block is stored here, next segments hashes are + /// stored in the version table + FirstBlock(ObjectVersionMeta, Hash), + } + + /// Metadata about the object version + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct ObjectVersionMeta { + /// Size of the object. If object is encrypted/compressed, + /// this is always the size of the unencrypted/uncompressed data + pub size: u64, + /// etag of the object + pub etag: String, + /// Encryption params + headers (encrypted or plaintext) + pub encryption: ObjectVersionEncryption, + } + + /// Encryption information + metadata + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub enum ObjectVersionEncryption { + SseC { + /// Encrypted serialized ObjectVersionHeaders struct. + /// This is never compressed, just encrypted using AES256-GCM. + #[serde(with = "serde_bytes")] + headers: Vec, + /// Whether data blocks are compressed in addition to being encrypted + /// (compression happens before encryption, whereas for non-encrypted + /// objects, compression is handled at the level of the block manager) + compressed: bool, + }, + Plaintext { + /// Plain-text headers + headers: ObjectVersionHeaders, + }, + } + + impl garage_util::migrate::Migrate for Object { + const VERSION_MARKER: &'static [u8] = b"G010s3ob"; + + type Previous = v09::Object; + + fn migrate(old: v09::Object) -> Object { + Object { + bucket_id: old.bucket_id, + key: old.key, + versions: old.versions.into_iter().map(migrate_version).collect(), + } + } + } + + fn migrate_version(old: v09::ObjectVersion) -> ObjectVersion { + ObjectVersion { + uuid: old.uuid, + timestamp: old.timestamp, + state: match old.state { + v09::ObjectVersionState::Uploading { multipart, headers } => { + ObjectVersionState::Uploading { + multipart, + encryption: migrate_headers(headers), + } + } + v09::ObjectVersionState::Complete(d) => { + ObjectVersionState::Complete(migrate_data(d)) + } + v09::ObjectVersionState::Aborted => ObjectVersionState::Aborted, + }, + } + } + + fn migrate_data(old: v09::ObjectVersionData) -> ObjectVersionData { + match old { + v09::ObjectVersionData::DeleteMarker => ObjectVersionData::DeleteMarker, + v09::ObjectVersionData::Inline(meta, data) => { + ObjectVersionData::Inline(migrate_meta(meta), data) + } + v09::ObjectVersionData::FirstBlock(meta, fb) => { + ObjectVersionData::FirstBlock(migrate_meta(meta), fb) + } + } + } + + fn migrate_meta(old: v09::ObjectVersionMeta) -> ObjectVersionMeta { + ObjectVersionMeta { + size: old.size, + etag: old.etag, + encryption: migrate_headers(old.headers), + } + } + + fn migrate_headers(old: v09::ObjectVersionHeaders) -> ObjectVersionEncryption { + ObjectVersionEncryption::Plaintext { headers: old } + } + + // Since ObjectVersionHeaders can now be serialized independently, for the + // purpose of being encrypted, we need it to support migrations on its own + // as well. + impl garage_util::migrate::InitialFormat for ObjectVersionHeaders { + const VERSION_MARKER: &'static [u8] = b"G010s3oh"; + } +} + +pub use v010::*; impl Object { /// Initialize an Object struct from parts diff --git a/src/model/s3/version_table.rs b/src/model/s3/version_table.rs index 5c032f9f..b4662a55 100644 --- a/src/model/s3/version_table.rs +++ b/src/model/s3/version_table.rs @@ -44,7 +44,8 @@ mod v05 { pub struct VersionBlockKey { /// Number of the part pub part_number: u64, - /// Offset of this sub-segment in its part + /// Offset of this sub-segment in its part as sent by the client + /// (before any kind of compression or encryption) pub offset: u64, } @@ -53,7 +54,7 @@ mod v05 { pub struct VersionBlock { /// Blake2 sum of the block pub hash: Hash, - /// Size of the block + /// Size of the block, before any kind of compression or encryption pub size: u64, } -- cgit v1.2.3 From fa4878bad6434f33ab9e0f663d8529e0db66d7e6 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 28 Feb 2024 14:09:41 +0100 Subject: [sse-c] Testing for SSE-C encryption --- src/garage/tests/s3/mod.rs | 1 + src/garage/tests/s3/ssec.rs | 455 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 456 insertions(+) create mode 100644 src/garage/tests/s3/ssec.rs (limited to 'src') diff --git a/src/garage/tests/s3/mod.rs b/src/garage/tests/s3/mod.rs index 4ebc4914..e75b1397 100644 --- a/src/garage/tests/s3/mod.rs +++ b/src/garage/tests/s3/mod.rs @@ -3,5 +3,6 @@ mod multipart; mod objects; mod presigned; mod simple; +mod ssec; mod streaming_signature; mod website; diff --git a/src/garage/tests/s3/ssec.rs b/src/garage/tests/s3/ssec.rs new file mode 100644 index 00000000..d8f11950 --- /dev/null +++ b/src/garage/tests/s3/ssec.rs @@ -0,0 +1,455 @@ +use crate::common::{self, Context}; +use aws_sdk_s3::primitives::ByteStream; +use aws_sdk_s3::types::{CompletedMultipartUpload, CompletedPart}; + +const SSEC_KEY: &str = "u8zCfnEyt5Imo/krN+sxA1DQXxLWtPJavU6T6gOVj1Y="; +const SSEC_KEY_MD5: &str = "jMGbs3GyZkYjJUP6q5jA7g=="; +const SSEC_KEY2: &str = "XkYVk4Z3vVDO2yJaUqCAEZX6lL10voMxtV06d8my/eU="; +const SSEC_KEY2_MD5: &str = "kedo2ab8J1MCjHwJuLTJHw=="; + +const SZ_2MB: usize = 2 * 1024 * 1024; + +#[tokio::test] +async fn test_ssec_object() { + let ctx = common::context(); + let bucket = ctx.create_bucket("sse-c"); + + let bytes1 = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz".to_vec(); + let bytes2 = (0..400000) + .map(|x| ((x * 3792) % 256) as u8) + .collect::>(); + + for data in vec![bytes1, bytes2] { + let stream = ByteStream::new(data.clone().into()); + + // Write encrypted object + let r = ctx + .client + .put_object() + .bucket(&bucket) + .key("testobj") + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY) + .sse_customer_key_md5(SSEC_KEY_MD5) + .body(stream) + .send() + .await + .unwrap(); + assert_eq!(r.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY_MD5.into())); + + test_read_encrypted( + &ctx, + &bucket, + "testobj", + &data, + SSEC_KEY, + SSEC_KEY_MD5, + SSEC_KEY2, + SSEC_KEY2_MD5, + ) + .await; + + // Test copy from encrypted to non-encrypted + let r = ctx + .client + .copy_object() + .bucket(&bucket) + .key("test-copy-enc-dec") + .copy_source(format!("{}/{}", bucket, "testobj")) + .copy_source_sse_customer_algorithm("AES256") + .copy_source_sse_customer_key(SSEC_KEY) + .copy_source_sse_customer_key_md5(SSEC_KEY_MD5) + .send() + .await + .unwrap(); + assert_eq!(r.sse_customer_algorithm, None); + assert_eq!(r.sse_customer_key_md5, None); + + // Test read decrypted file + let r = ctx + .client + .get_object() + .bucket(&bucket) + .key("test-copy-enc-dec") + .send() + .await + .unwrap(); + assert_bytes_eq!(r.body, &data); + assert_eq!(r.sse_customer_algorithm, None); + assert_eq!(r.sse_customer_key_md5, None); + + // Test copy from non-encrypted to encrypted + let r = ctx + .client + .copy_object() + .bucket(&bucket) + .key("test-copy-enc-dec-enc") + .copy_source(format!("{}/test-copy-enc-dec", bucket)) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + assert_eq!(r.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY2_MD5.into())); + + test_read_encrypted( + &ctx, + &bucket, + "test-copy-enc-dec-enc", + &data, + SSEC_KEY2, + SSEC_KEY2_MD5, + SSEC_KEY, + SSEC_KEY_MD5, + ) + .await; + + // Test copy from encrypted to encrypted with different keys + let r = ctx + .client + .copy_object() + .bucket(&bucket) + .key("test-copy-enc-enc") + .copy_source(format!("{}/{}", bucket, "testobj")) + .copy_source_sse_customer_algorithm("AES256") + .copy_source_sse_customer_key(SSEC_KEY) + .copy_source_sse_customer_key_md5(SSEC_KEY_MD5) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + assert_eq!(r.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY2_MD5.into())); + test_read_encrypted( + &ctx, + &bucket, + "test-copy-enc-enc", + &data, + SSEC_KEY2, + SSEC_KEY2_MD5, + SSEC_KEY, + SSEC_KEY_MD5, + ) + .await; + + // Test copy from encrypted to encrypted with the same key + let r = ctx + .client + .copy_object() + .bucket(&bucket) + .key("test-copy-enc-enc-same") + .copy_source(format!("{}/{}", bucket, "testobj")) + .copy_source_sse_customer_algorithm("AES256") + .copy_source_sse_customer_key(SSEC_KEY) + .copy_source_sse_customer_key_md5(SSEC_KEY_MD5) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY) + .sse_customer_key_md5(SSEC_KEY_MD5) + .send() + .await + .unwrap(); + assert_eq!(r.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY_MD5.into())); + test_read_encrypted( + &ctx, + &bucket, + "test-copy-enc-enc-same", + &data, + SSEC_KEY, + SSEC_KEY_MD5, + SSEC_KEY2, + SSEC_KEY2_MD5, + ) + .await; + } +} + +#[tokio::test] +async fn test_multipart_upload() { + let ctx = common::context(); + let bucket = ctx.create_bucket("test-ssec-mpu"); + + let u1 = vec![0x11; SZ_2MB]; + let u2 = vec![0x22; SZ_2MB]; + let u3 = vec![0x33; SZ_2MB]; + let all = [&u1[..], &u2[..], &u3[..]].concat(); + + // Test simple encrypted mpu + { + let up = ctx + .client + .create_multipart_upload() + .bucket(&bucket) + .key("a") + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY) + .sse_customer_key_md5(SSEC_KEY_MD5) + .send() + .await + .unwrap(); + assert!(up.upload_id.is_some()); + assert_eq!(up.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(up.sse_customer_key_md5, Some(SSEC_KEY_MD5.into())); + + let uid = up.upload_id.as_ref().unwrap(); + + let mut etags = vec![]; + for (i, part) in vec![&u1, &u2, &u3].into_iter().enumerate() { + let pu = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number((i + 1) as i32) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY) + .sse_customer_key_md5(SSEC_KEY_MD5) + .body(ByteStream::from(part.to_vec())) + .send() + .await + .unwrap(); + etags.push(pu.e_tag.unwrap()); + } + + let mut cmp = CompletedMultipartUpload::builder(); + for (i, etag) in etags.into_iter().enumerate() { + cmp = cmp.parts( + CompletedPart::builder() + .part_number((i + 1) as i32) + .e_tag(etag) + .build(), + ); + } + + ctx.client + .complete_multipart_upload() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .multipart_upload(cmp.build()) + .send() + .await + .unwrap(); + + test_read_encrypted( + &ctx, + &bucket, + "a", + &all, + SSEC_KEY, + SSEC_KEY_MD5, + SSEC_KEY2, + SSEC_KEY2_MD5, + ) + .await; + } + + // Test upload part copy from first object + { + // (setup) Upload a single part object + ctx.client + .put_object() + .bucket(&bucket) + .key("b") + .body(ByteStream::from(u1.clone())) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + + let up = ctx + .client + .create_multipart_upload() + .bucket(&bucket) + .key("target") + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + let uid = up.upload_id.as_ref().unwrap(); + + let p1 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(1) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .body(ByteStream::from(u3.clone())) + .send() + .await + .unwrap(); + + let p2 = ctx + .client + .upload_part_copy() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(2) + .copy_source(format!("{}/a", bucket)) + .copy_source_range("bytes=500-550000") + .copy_source_sse_customer_algorithm("AES256") + .copy_source_sse_customer_key(SSEC_KEY) + .copy_source_sse_customer_key_md5(SSEC_KEY_MD5) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + + let p3 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(3) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .body(ByteStream::from(u2.clone())) + .send() + .await + .unwrap(); + + let p4 = ctx + .client + .upload_part_copy() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(4) + .copy_source(format!("{}/b", bucket)) + .copy_source_range("bytes=1500-20500") + .copy_source_sse_customer_algorithm("AES256") + .copy_source_sse_customer_key(SSEC_KEY2) + .copy_source_sse_customer_key_md5(SSEC_KEY2_MD5) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + + let cmp = CompletedMultipartUpload::builder() + .parts( + CompletedPart::builder() + .part_number(1) + .e_tag(p1.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(2) + .e_tag(p2.copy_part_result.unwrap().e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(3) + .e_tag(p3.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(4) + .e_tag(p4.copy_part_result.unwrap().e_tag.unwrap()) + .build(), + ) + .build(); + + ctx.client + .complete_multipart_upload() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .multipart_upload(cmp) + .send() + .await + .unwrap(); + + // (check) Get object + let expected = [&u3[..], &all[500..550001], &u2[..], &u1[1500..20501]].concat(); + test_read_encrypted( + &ctx, + &bucket, + "target", + &expected, + SSEC_KEY2, + SSEC_KEY2_MD5, + SSEC_KEY, + SSEC_KEY_MD5, + ) + .await; + } +} + +async fn test_read_encrypted( + ctx: &Context, + bucket: &str, + obj_key: &str, + expected_data: &[u8], + enc_key: &str, + enc_key_md5: &str, + wrong_enc_key: &str, + wrong_enc_key_md5: &str, +) { + // Test read encrypted without key + let o = ctx + .client + .get_object() + .bucket(bucket) + .key(obj_key) + .send() + .await; + assert!( + o.is_err(), + "encrypted file could be read without encryption key" + ); + + // Test read encrypted with wrong key + let o = ctx + .client + .get_object() + .bucket(bucket) + .key(obj_key) + .sse_customer_key(wrong_enc_key) + .sse_customer_key_md5(wrong_enc_key_md5) + .send() + .await; + assert!( + o.is_err(), + "encrypted file could be read with incorrect encryption key" + ); + + // Test read encrypted with correct key + let o = ctx + .client + .get_object() + .bucket(bucket) + .key(obj_key) + .sse_customer_algorithm("AES256") + .sse_customer_key(enc_key) + .sse_customer_key_md5(enc_key_md5) + .send() + .await + .unwrap(); + assert_bytes_eq!(o.body, expected_data); + assert_eq!(o.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(o.sse_customer_key_md5, Some(enc_key_md5.to_string())); +} -- cgit v1.2.3 From 3fcb54e3cf62cdc9ed84751e1f0522ff553ea63c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 6 Mar 2024 19:23:36 +0100 Subject: [sse-c] Remove special case for Content-Type header --- src/api/s3/get.rs | 21 ++++++++++++++++--- src/api/s3/list.rs | 5 +---- src/api/s3/put.rs | 48 ++++++++++++-------------------------------- src/model/Cargo.toml | 1 + src/model/s3/object_table.rs | 20 +++++++++++++++--- 5 files changed, 50 insertions(+), 45 deletions(-) (limited to 'src') diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index 1bca4671..ec300ab7 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -1,4 +1,5 @@ //! Function related to GET and HEAD requests +use std::collections::BTreeMap; use std::convert::TryInto; use std::sync::Arc; use std::time::{Duration, UNIX_EPOCH}; @@ -53,7 +54,6 @@ fn object_headers( let date_str = httpdate::fmt_http_date(date); let mut resp = Response::builder() - .header(CONTENT_TYPE, headers.content_type.to_string()) .header(LAST_MODIFIED, date_str) .header(ACCEPT_RANGES, "bytes".to_string()); @@ -61,8 +61,23 @@ fn object_headers( resp = resp.header(ETAG, format!("\"{}\"", version_meta.etag)); } - for (k, v) in headers.other.iter() { - resp = resp.header(k, v.to_string()); + // When metadata is retrieved through the REST API, Amazon S3 combines headers that + // have the same name (ignoring case) into a comma-delimited list. + // See: https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingMetadata.html + let mut headers_by_name = BTreeMap::new(); + for (name, value) in headers.0.iter() { + match headers_by_name.get_mut(name) { + None => { + headers_by_name.insert(name, vec![value.as_str()]); + } + Some(headers) => { + headers.push(value.as_str()); + } + } + } + + for (name, values) in headers_by_name { + resp = resp.header(name, values.join(",")); } encryption.add_response_headers(&mut resp); diff --git a/src/api/s3/list.rs b/src/api/s3/list.rs index a7eebbb1..1678f1fa 100644 --- a/src/api/s3/list.rs +++ b/src/api/s3/list.rs @@ -945,10 +945,7 @@ mod tests { state: ObjectVersionState::Uploading { multipart: true, encryption: ObjectVersionEncryption::Plaintext { - headers: ObjectVersionHeaders { - content_type: "text/plain".to_string(), - other: BTreeMap::::new(), - }, + headers: ObjectVersionHeaders(vec![]), }, }, } diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index 745c2219..941e4122 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; use std::sync::Arc; use base64::prelude::*; @@ -609,57 +609,35 @@ impl Drop for InterruptedCleanup { // ============ helpers ============ -pub(crate) fn get_mime_type(headers: &HeaderMap) -> Result { - Ok(headers - .get(hyper::header::CONTENT_TYPE) - .map(|x| x.to_str()) - .unwrap_or(Ok("blob"))? - .to_string()) -} - pub(crate) fn get_headers(headers: &HeaderMap) -> Result { - let content_type = get_mime_type(headers)?; - let mut other = BTreeMap::new(); + let mut ret = Vec::new(); // Preserve standard headers let standard_header = vec![ + hyper::header::CONTENT_TYPE, hyper::header::CACHE_CONTROL, hyper::header::CONTENT_DISPOSITION, hyper::header::CONTENT_ENCODING, hyper::header::CONTENT_LANGUAGE, hyper::header::EXPIRES, ]; - for h in standard_header.iter() { - if let Some(v) = headers.get(h) { - match v.to_str() { - Ok(v_str) => { - other.insert(h.to_string(), v_str.to_string()); - } - Err(e) => { - warn!("Discarding header {}, error in .to_str(): {}", h, e); - } - } + for name in standard_header.iter() { + if let Some(value) = headers.get(name) { + ret.push((name.to_string(), value.to_str()?.to_string())); } } // Preserve x-amz-meta- headers - for (k, v) in headers.iter() { - if k.as_str().starts_with("x-amz-meta-") { - match std::str::from_utf8(v.as_bytes()) { - Ok(v_str) => { - other.insert(k.to_string(), v_str.to_string()); - } - Err(e) => { - warn!("Discarding header {}, error in .to_str(): {}", k, e); - } - } + for (name, value) in headers.iter() { + if name.as_str().starts_with("x-amz-meta-") { + ret.push(( + name.to_string(), + std::str::from_utf8(value.as_bytes())?.to_string(), + )); } } - Ok(ObjectVersionHeaders { - content_type, - other, - }) + Ok(ObjectVersionHeaders(ret)) } pub(crate) fn next_timestamp(existing_object: Option<&Object>) -> u64 { diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 33898e20..776671d0 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -27,6 +27,7 @@ blake2.workspace = true chrono.workspace = true err-derive.workspace = true hex.workspace = true +http.workspace = true base64.workspace = true tracing.workspace = true rand.workspace = true diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index 7fa4b9e0..f2d21493 100644 --- a/src/model/s3/object_table.rs +++ b/src/model/s3/object_table.rs @@ -216,8 +216,6 @@ mod v010 { use super::v09; - pub use v09::ObjectVersionHeaders; - /// An object #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct Object { @@ -303,6 +301,10 @@ mod v010 { }, } + /// Vector of headers, as tuples of the format (header name, header value) + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct ObjectVersionHeaders(pub Vec<(String, String)>); + impl garage_util::migrate::Migrate for Object { const VERSION_MARKER: &'static [u8] = b"G010s3ob"; @@ -357,7 +359,19 @@ mod v010 { } fn migrate_headers(old: v09::ObjectVersionHeaders) -> ObjectVersionEncryption { - ObjectVersionEncryption::Plaintext { headers: old } + use http::header::CONTENT_TYPE; + + let mut new_headers = Vec::with_capacity(old.other.len() + 1); + if old.content_type != "blob" { + new_headers.push((CONTENT_TYPE.as_str().to_string(), old.content_type)); + } + for (name, value) in old.other.into_iter() { + new_headers.push((name, value)); + } + + ObjectVersionEncryption::Plaintext { + headers: ObjectVersionHeaders(new_headers), + } } // Since ObjectVersionHeaders can now be serialized independently, for the -- cgit v1.2.3 From f537f76681760e9b2b3cc095a6031ebb59ca4733 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Mar 2024 13:24:47 +0100 Subject: [rm-migration] Remove migration path from Garage v0.5 --- src/garage/admin/mod.rs | 21 ------- src/garage/cli/cmd.rs | 3 - src/garage/cli/structs.rs | 22 -------- src/model/key_table.rs | 68 +---------------------- src/model/lib.rs | 4 -- src/model/migrate.rs | 108 ------------------------------------ src/model/prev/mod.rs | 1 - src/model/prev/v051/bucket_table.rs | 63 --------------------- src/model/prev/v051/mod.rs | 1 - src/model/s3/object_table.rs | 43 +------------- src/model/s3/version_table.rs | 55 +----------------- 11 files changed, 5 insertions(+), 384 deletions(-) delete mode 100644 src/model/migrate.rs delete mode 100644 src/model/prev/mod.rs delete mode 100644 src/model/prev/v051/bucket_table.rs delete mode 100644 src/model/prev/v051/mod.rs (limited to 'src') diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index de7851e1..073693ed 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -27,7 +27,6 @@ use garage_model::bucket_table::*; use garage_model::garage::Garage; use garage_model::helper::error::{Error, OkOrBadRequest}; use garage_model::key_table::*; -use garage_model::migrate::Migrate; use garage_model::s3::mpu_table::MultipartUpload; use garage_model::s3::version_table::Version; @@ -42,7 +41,6 @@ pub enum AdminRpc { BucketOperation(BucketOperation), KeyOperation(KeyOperation), LaunchRepair(RepairOpt), - Migrate(MigrateOpt), Stats(StatsOpt), Worker(WorkerOperation), BlockOperation(BlockOperation), @@ -95,24 +93,6 @@ impl AdminRpcHandler { admin } - // ================ MIGRATION COMMANDS ==================== - - async fn handle_migrate(self: &Arc, opt: MigrateOpt) -> Result { - if !opt.yes { - return Err(Error::BadRequest( - "Please provide the --yes flag to initiate migration operation.".to_string(), - )); - } - - let m = Migrate { - garage: self.garage.clone(), - }; - match opt.what { - MigrateWhat::Buckets050 => m.migrate_buckets050().await, - }?; - Ok(AdminRpc::Ok("Migration successfull.".into())) - } - // ================ REPAIR COMMANDS ==================== async fn handle_launch_repair(self: &Arc, opt: RepairOpt) -> Result { @@ -530,7 +510,6 @@ impl EndpointHandler for AdminRpcHandler { match message { AdminRpc::BucketOperation(bo) => self.handle_bucket_cmd(bo).await, AdminRpc::KeyOperation(ko) => self.handle_key_cmd(ko).await, - AdminRpc::Migrate(opt) => self.handle_migrate(opt.clone()).await, AdminRpc::LaunchRepair(opt) => self.handle_launch_repair(opt.clone()).await, AdminRpc::Stats(opt) => self.handle_stats(opt.clone()).await, AdminRpc::Worker(wo) => self.handle_worker_cmd(wo).await, diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index fb6dface..7440457f 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -33,9 +33,6 @@ pub async fn cli_command_dispatch( Command::Key(ko) => { cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::KeyOperation(ko)).await } - Command::Migrate(mo) => { - cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::Migrate(mo)).await - } Command::Repair(ro) => { cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::LaunchRepair(ro)).await } diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 40e47ee1..63014dbc 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -31,11 +31,6 @@ pub enum Command { #[structopt(name = "key", version = garage_version())] Key(KeyOperation), - /// Run migrations from previous Garage version - /// (DO NOT USE WITHOUT READING FULL DOCUMENTATION) - #[structopt(name = "migrate", version = garage_version())] - Migrate(MigrateOpt), - /// Start repair of node data on remote node #[structopt(name = "repair", version = garage_version())] Repair(RepairOpt), @@ -445,23 +440,6 @@ pub struct KeyImportOpt { pub yes: bool, } -#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)] -pub struct MigrateOpt { - /// Confirm the launch of the migrate operation - #[structopt(long = "yes")] - pub yes: bool, - - #[structopt(subcommand)] - pub what: MigrateWhat, -} - -#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] -pub enum MigrateWhat { - /// Migrate buckets and permissions from v0.5.0 - #[structopt(name = "buckets050", version = garage_version())] - Buckets050, -} - #[derive(Serialize, Deserialize, StructOpt, Debug, Clone)] pub struct RepairOpt { /// Launch repair operation on all nodes diff --git a/src/model/key_table.rs b/src/model/key_table.rs index a9762f1b..efb95f08 100644 --- a/src/model/key_table.rs +++ b/src/model/key_table.rs @@ -7,48 +7,7 @@ use garage_table::{DeletedFilter, EmptyKey, Entry, TableSchema}; use crate::permission::BucketKeyPerm; -pub(crate) mod v05 { - use garage_util::crdt; - use serde::{Deserialize, Serialize}; - - /// An api key - #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] - pub struct Key { - /// The id of the key (immutable), used as partition key - pub key_id: String, - - /// The secret_key associated - pub secret_key: String, - - /// Name for the key - pub name: crdt::Lww, - - /// Is the key deleted - pub deleted: crdt::Bool, - - /// Buckets in which the key is authorized. Empty if `Key` is deleted - // CRDT interaction: deleted implies authorized_buckets is empty - pub authorized_buckets: crdt::LwwMap, - } - - /// Permission given to a key in a bucket - #[derive(PartialOrd, Ord, PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] - pub struct PermissionSet { - /// The key can be used to read the bucket - pub allow_read: bool, - /// The key can be used to write in the bucket - pub allow_write: bool, - } - - impl crdt::AutoCrdt for PermissionSet { - const WARN_IF_DIFFERENT: bool = true; - } - - impl garage_util::migrate::InitialFormat for Key {} -} - mod v08 { - use super::v05; use crate::permission::BucketKeyPerm; use garage_util::crdt; use garage_util::data::Uuid; @@ -86,32 +45,7 @@ mod v08 { pub local_aliases: crdt::LwwMap>, } - impl garage_util::migrate::Migrate for Key { - type Previous = v05::Key; - - fn migrate(old_k: v05::Key) -> Key { - let name = crdt::Lww::raw(old_k.name.timestamp(), old_k.name.get().clone()); - - let state = if old_k.deleted.get() { - crdt::Deletable::Deleted - } else { - // Authorized buckets is ignored here, - // migration is performed in specific migration code in - // garage/migrate.rs - crdt::Deletable::Present(KeyParams { - secret_key: old_k.secret_key, - name, - allow_create_bucket: crdt::Lww::new(false), - authorized_buckets: crdt::Map::new(), - local_aliases: crdt::LwwMap::new(), - }) - }; - Key { - key_id: old_k.key_id, - state, - } - } - } + impl garage_util::migrate::InitialFormat for Key {} } pub use v08::*; diff --git a/src/model/lib.rs b/src/model/lib.rs index 4f20ea46..2166105f 100644 --- a/src/model/lib.rs +++ b/src/model/lib.rs @@ -1,9 +1,6 @@ #[macro_use] extern crate tracing; -// For migration from previous versions -pub(crate) mod prev; - pub mod permission; pub mod index_counter; @@ -18,4 +15,3 @@ pub mod s3; pub mod garage; pub mod helper; -pub mod migrate; diff --git a/src/model/migrate.rs b/src/model/migrate.rs deleted file mode 100644 index 8528382a..00000000 --- a/src/model/migrate.rs +++ /dev/null @@ -1,108 +0,0 @@ -use std::sync::Arc; - -use garage_util::crdt::*; -use garage_util::data::*; -use garage_util::encode::nonversioned_decode; -use garage_util::error::Error as GarageError; -use garage_util::time::*; - -use crate::prev::v051::bucket_table as old_bucket; - -use crate::bucket_alias_table::*; -use crate::bucket_table::*; -use crate::garage::Garage; -use crate::helper::error::*; -use crate::permission::*; - -pub struct Migrate { - pub garage: Arc, -} - -impl Migrate { - pub async fn migrate_buckets050(&self) -> Result<(), Error> { - let tree = self - .garage - .db - .open_tree("bucket:table") - .map_err(GarageError::from)?; - - let mut old_buckets = vec![]; - for res in tree.iter().map_err(GarageError::from)? { - let (_k, v) = res.map_err(GarageError::from)?; - let bucket = - nonversioned_decode::(&v[..]).map_err(GarageError::from)?; - old_buckets.push(bucket); - } - - for bucket in old_buckets { - if let old_bucket::BucketState::Present(p) = bucket.state.get() { - self.migrate_buckets050_do_bucket(&bucket, p).await?; - } - } - - Ok(()) - } - - pub async fn migrate_buckets050_do_bucket( - &self, - old_bucket: &old_bucket::Bucket, - old_bucket_p: &old_bucket::BucketParams, - ) -> Result<(), Error> { - let bucket_id = blake2sum(old_bucket.name.as_bytes()); - - let new_name = if is_valid_bucket_name(&old_bucket.name) { - old_bucket.name.clone() - } else { - // if old bucket name was not valid, replace it by - // a hex-encoded name derived from its identifier - hex::encode(&bucket_id.as_slice()[..16]) - }; - - let website = if *old_bucket_p.website.get() { - Some(WebsiteConfig { - index_document: "index.html".into(), - error_document: None, - }) - } else { - None - }; - - let helper = self.garage.locked_helper().await; - - self.garage - .bucket_table - .insert(&Bucket { - id: bucket_id, - state: Deletable::Present(BucketParams { - creation_date: now_msec(), - authorized_keys: Map::new(), - aliases: LwwMap::new(), - local_aliases: LwwMap::new(), - website_config: Lww::new(website), - cors_config: Lww::new(None), - lifecycle_config: Lww::new(None), - quotas: Lww::new(Default::default()), - }), - }) - .await?; - - helper.set_global_bucket_alias(bucket_id, &new_name).await?; - - for (k, ts, perm) in old_bucket_p.authorized_keys.items().iter() { - helper - .set_bucket_key_permissions( - bucket_id, - k, - BucketKeyPerm { - timestamp: *ts, - allow_read: perm.allow_read, - allow_write: perm.allow_write, - allow_owner: false, - }, - ) - .await?; - } - - Ok(()) - } -} diff --git a/src/model/prev/mod.rs b/src/model/prev/mod.rs deleted file mode 100644 index 68bb1502..00000000 --- a/src/model/prev/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub(crate) mod v051; diff --git a/src/model/prev/v051/bucket_table.rs b/src/model/prev/v051/bucket_table.rs deleted file mode 100644 index 19893458..00000000 --- a/src/model/prev/v051/bucket_table.rs +++ /dev/null @@ -1,63 +0,0 @@ -use serde::{Deserialize, Serialize}; - -use garage_table::crdt::Crdt; -use garage_table::*; - -use crate::key_table::v05::PermissionSet; - -/// A bucket is a collection of objects -/// -/// Its parameters are not directly accessible as: -/// - It must be possible to merge paramaters, hence the use of a LWW CRDT. -/// - A bucket has 2 states, Present or Deleted and parameters make sense only if present. -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct Bucket { - /// Name of the bucket - pub name: String, - /// State, and configuration if not deleted, of the bucket - pub state: crdt::Lww, -} - -/// State of a bucket -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub enum BucketState { - /// The bucket is deleted - Deleted, - /// The bucket exists - Present(BucketParams), -} - -impl Crdt for BucketState { - fn merge(&mut self, o: &Self) { - match o { - BucketState::Deleted => *self = BucketState::Deleted, - BucketState::Present(other_params) => { - if let BucketState::Present(params) = self { - params.merge(other_params); - } - } - } - } -} - -/// Configuration for a bucket -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct BucketParams { - /// Map of key with access to the bucket, and what kind of access they give - pub authorized_keys: crdt::LwwMap, - /// Is the bucket served as http - pub website: crdt::Lww, -} - -impl Crdt for BucketParams { - fn merge(&mut self, o: &Self) { - self.authorized_keys.merge(&o.authorized_keys); - self.website.merge(&o.website); - } -} - -impl Crdt for Bucket { - fn merge(&mut self, other: &Self) { - self.state.merge(&other.state); - } -} diff --git a/src/model/prev/v051/mod.rs b/src/model/prev/v051/mod.rs deleted file mode 100644 index 8c1335a5..00000000 --- a/src/model/prev/v051/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub(crate) mod bucket_table; diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index f2d21493..eedb9615 100644 --- a/src/model/s3/object_table.rs +++ b/src/model/s3/object_table.rs @@ -17,7 +17,7 @@ pub const OBJECTS: &str = "objects"; pub const UNFINISHED_UPLOADS: &str = "unfinished_uploads"; pub const BYTES: &str = "bytes"; -mod v05 { +mod v08 { use garage_util::data::{Hash, Uuid}; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; @@ -26,7 +26,7 @@ mod v05 { #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct Object { /// The bucket in which the object is stored, used as partition key - pub bucket: String, + pub bucket_id: Uuid, /// The key at which the object is stored in its bucket, used as sorting key pub key: String, @@ -92,45 +92,6 @@ mod v05 { impl garage_util::migrate::InitialFormat for Object {} } -mod v08 { - use garage_util::data::Uuid; - use serde::{Deserialize, Serialize}; - - use super::v05; - - pub use v05::{ - ObjectVersion, ObjectVersionData, ObjectVersionHeaders, ObjectVersionMeta, - ObjectVersionState, - }; - - /// An object - #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] - pub struct Object { - /// The bucket in which the object is stored, used as partition key - pub bucket_id: Uuid, - - /// The key at which the object is stored in its bucket, used as sorting key - pub key: String, - - /// The list of currenty stored versions of the object - pub(super) versions: Vec, - } - - impl garage_util::migrate::Migrate for Object { - type Previous = v05::Object; - - fn migrate(old: v05::Object) -> Object { - use garage_util::data::blake2sum; - - Object { - bucket_id: blake2sum(old.bucket.as_bytes()), - key: old.key, - versions: old.versions, - } - } - } -} - mod v09 { use garage_util::data::Uuid; use serde::{Deserialize, Serialize}; diff --git a/src/model/s3/version_table.rs b/src/model/s3/version_table.rs index b4662a55..d611a9e3 100644 --- a/src/model/s3/version_table.rs +++ b/src/model/s3/version_table.rs @@ -11,7 +11,7 @@ use garage_table::*; use crate::s3::block_ref_table::*; -mod v05 { +mod v08 { use garage_util::crdt; use garage_util::data::{Hash, Uuid}; use serde::{Deserialize, Serialize}; @@ -35,7 +35,7 @@ mod v05 { // Back link to bucket+key so that we can figure if // this was deleted later on /// Bucket in which the related object is stored - pub bucket: String, + pub bucket_id: Uuid, /// Key in which the related object is stored pub key: String, } @@ -61,57 +61,6 @@ mod v05 { impl garage_util::migrate::InitialFormat for Version {} } -mod v08 { - use garage_util::crdt; - use garage_util::data::Uuid; - use serde::{Deserialize, Serialize}; - - use super::v05; - - pub use v05::{VersionBlock, VersionBlockKey}; - - /// A version of an object - #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] - pub struct Version { - /// UUID of the version, used as partition key - pub uuid: Uuid, - - // Actual data: the blocks for this version - // In the case of a multipart upload, also store the etags - // of individual parts and check them when doing CompleteMultipartUpload - /// Is this version deleted - pub deleted: crdt::Bool, - /// list of blocks of data composing the version - pub blocks: crdt::Map, - /// Etag of each part in case of a multipart upload, empty otherwise - pub parts_etags: crdt::Map, - - // Back link to bucket+key so that we can figure if - // this was deleted later on - /// Bucket in which the related object is stored - pub bucket_id: Uuid, - /// Key in which the related object is stored - pub key: String, - } - - impl garage_util::migrate::Migrate for Version { - type Previous = v05::Version; - - fn migrate(old: v05::Version) -> Version { - use garage_util::data::blake2sum; - - Version { - uuid: old.uuid, - deleted: old.deleted, - blocks: old.blocks, - parts_etags: old.parts_etags, - bucket_id: blake2sum(old.bucket.as_bytes()), - key: old.key, - } - } - } -} - pub(crate) mod v09 { use garage_util::crdt; use garage_util::data::Uuid; -- cgit v1.2.3 From 44454aac012cbef9158110f2352301ffcfaf31c7 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Mar 2024 14:11:02 +0100 Subject: [rm-sled] Remove the Sled database engine --- src/block/resync.rs | 6 +- src/db/Cargo.toml | 3 +- src/db/lib.rs | 2 - src/db/open.rs | 27 +---- src/db/sled_adapter.rs | 274 ------------------------------------------- src/db/test.rs | 11 -- src/garage/Cargo.toml | 5 +- src/garage/cli/convert_db.rs | 2 +- src/garage/main.rs | 6 +- src/model/Cargo.toml | 3 +- src/model/garage.rs | 5 - src/table/gc.rs | 6 +- src/table/merkle.rs | 4 +- src/util/config.rs | 19 +-- 14 files changed, 18 insertions(+), 355 deletions(-) delete mode 100644 src/db/sled_adapter.rs (limited to 'src') diff --git a/src/block/resync.rs b/src/block/resync.rs index 15f210e4..2516ba08 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -180,7 +180,7 @@ impl BlockResyncManager { // deleted once the garbage collection delay has passed. // // Here are some explanations on how the resync queue works. - // There are two Sled trees that are used to have information + // There are two db trees that are used to have information // about the status of blocks that need to be resynchronized: // // - resync.queue: a tree that is ordered first by a timestamp @@ -541,9 +541,9 @@ impl Worker for ResyncWorker { Ok(WorkerState::Idle) } Err(e) => { - // The errors that we have here are only Sled errors + // The errors that we have here are only db errors // We don't really know how to handle them so just ¯\_(ツ)_/¯ - // (there is kind of an assumption that Sled won't error on us, + // (there is kind of an assumption that the db won't error on us, // if it does there is not much we can do -- TODO should we just panic?) // Here we just give the error to the worker manager, // it will print it to the logs and increment a counter diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml index fddc5cca..a8f6d586 100644 --- a/src/db/Cargo.toml +++ b/src/db/Cargo.toml @@ -18,13 +18,12 @@ tracing.workspace = true heed = { workspace = true, optional = true } rusqlite = { workspace = true, optional = true } -sled = { workspace = true, optional = true } [dev-dependencies] mktemp.workspace = true [features] -default = [ "sled", "lmdb", "sqlite" ] +default = [ "lmdb", "sqlite" ] bundled-libs = [ "rusqlite?/bundled" ] lmdb = [ "heed" ] sqlite = [ "rusqlite" ] diff --git a/src/db/lib.rs b/src/db/lib.rs index 0fb457ce..8975f295 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -3,8 +3,6 @@ extern crate tracing; #[cfg(feature = "lmdb")] pub mod lmdb_adapter; -#[cfg(feature = "sled")] -pub mod sled_adapter; #[cfg(feature = "sqlite")] pub mod sqlite_adapter; diff --git a/src/db/open.rs b/src/db/open.rs index ae135c4e..03476a42 100644 --- a/src/db/open.rs +++ b/src/db/open.rs @@ -11,7 +11,6 @@ use crate::{Db, Error, Result}; pub enum Engine { Lmdb, Sqlite, - Sled, } impl Engine { @@ -20,7 +19,6 @@ impl Engine { match self { Self::Lmdb => "lmdb", Self::Sqlite => "sqlite", - Self::Sled => "sled", } } } @@ -38,10 +36,10 @@ impl std::str::FromStr for Engine { match text { "lmdb" | "heed" => Ok(Self::Lmdb), "sqlite" | "sqlite3" | "rusqlite" => Ok(Self::Sqlite), - "sled" => Ok(Self::Sled), + "sled" => Err(Error("Sled is no longer supported as a database engine. Converting your old metadata db can be done using an older Garage binary (e.g. v0.9.3).".into())), kind => Err(Error( format!( - "Invalid DB engine: {} (options are: lmdb, sled, sqlite)", + "Invalid DB engine: {} (options are: lmdb, sqlite)", kind ) .into(), @@ -53,8 +51,6 @@ impl std::str::FromStr for Engine { pub struct OpenOpt { pub fsync: bool, pub lmdb_map_size: Option, - pub sled_cache_capacity: usize, - pub sled_flush_every_ms: u64, } impl Default for OpenOpt { @@ -62,31 +58,12 @@ impl Default for OpenOpt { Self { fsync: false, lmdb_map_size: None, - sled_cache_capacity: 1024 * 1024 * 1024, - sled_flush_every_ms: 2000, } } } pub fn open_db(path: &PathBuf, engine: Engine, opt: &OpenOpt) -> Result { match engine { - // ---- Sled DB ---- - #[cfg(feature = "sled")] - Engine::Sled => { - if opt.fsync { - return Err(Error( - "`metadata_fsync = true` is not supported with the Sled database engine".into(), - )); - } - info!("Opening Sled database at: {}", path.display()); - let db = crate::sled_adapter::sled::Config::default() - .path(&path) - .cache_capacity(opt.sled_cache_capacity as u64) - .flush_every_ms(Some(opt.sled_flush_every_ms)) - .open()?; - Ok(crate::sled_adapter::SledDb::init(db)) - } - // ---- Sqlite DB ---- #[cfg(feature = "sqlite")] Engine::Sqlite => { diff --git a/src/db/sled_adapter.rs b/src/db/sled_adapter.rs deleted file mode 100644 index 84f2001b..00000000 --- a/src/db/sled_adapter.rs +++ /dev/null @@ -1,274 +0,0 @@ -use core::ops::Bound; - -use std::cell::Cell; -use std::collections::HashMap; -use std::sync::{Arc, RwLock}; - -use sled::transaction::{ - ConflictableTransactionError, TransactionError, Transactional, TransactionalTree, - UnabortableTransactionError, -}; - -use crate::{ - Db, Error, IDb, ITx, ITxFn, OnCommit, Result, TxError, TxFnResult, TxOpError, TxOpResult, - TxResult, TxValueIter, Value, ValueIter, -}; - -pub use sled; - -// -- err - -impl From for Error { - fn from(e: sled::Error) -> Error { - Error(format!("Sled: {}", e).into()) - } -} - -impl From for TxOpError { - fn from(e: sled::Error) -> TxOpError { - TxOpError(e.into()) - } -} - -// -- db - -pub struct SledDb { - db: sled::Db, - trees: RwLock<(Vec, HashMap)>, -} - -impl SledDb { - #[deprecated( - since = "0.9.0", - note = "The Sled database is now deprecated and will be removed in Garage v1.0. Please migrate to LMDB or Sqlite as soon as possible." - )] - pub fn init(db: sled::Db) -> Db { - tracing::warn!("-------------------- IMPORTANT WARNING !!! ----------------------"); - tracing::warn!("The Sled database is now deprecated and will be removed in Garage v1.0."); - tracing::warn!("Please migrate to LMDB or Sqlite as soon as possible."); - tracing::warn!("-----------------------------------------------------------------------"); - let s = Self { - db, - trees: RwLock::new((Vec::new(), HashMap::new())), - }; - Db(Arc::new(s)) - } - - fn get_tree(&self, i: usize) -> Result { - self.trees - .read() - .unwrap() - .0 - .get(i) - .cloned() - .ok_or_else(|| Error("invalid tree id".into())) - } -} - -impl IDb for SledDb { - fn engine(&self) -> String { - "Sled".into() - } - - fn open_tree(&self, name: &str) -> Result { - let mut trees = self.trees.write().unwrap(); - if let Some(i) = trees.1.get(name) { - Ok(*i) - } else { - let tree = self.db.open_tree(name)?; - let i = trees.0.len(); - trees.0.push(tree); - trees.1.insert(name.to_string(), i); - Ok(i) - } - } - - fn list_trees(&self) -> Result> { - let mut trees = vec![]; - for name in self.db.tree_names() { - let name = std::str::from_utf8(&name) - .map_err(|e| Error(format!("{}", e).into()))? - .to_string(); - if name != "__sled__default" { - trees.push(name); - } - } - Ok(trees) - } - - // ---- - - fn get(&self, tree: usize, key: &[u8]) -> Result> { - let tree = self.get_tree(tree)?; - let val = tree.get(key)?; - Ok(val.map(|x| x.to_vec())) - } - - fn len(&self, tree: usize) -> Result { - let tree = self.get_tree(tree)?; - Ok(tree.len()) - } - - fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result> { - let tree = self.get_tree(tree)?; - let old_val = tree.insert(key, value)?; - Ok(old_val.map(|x| x.to_vec())) - } - - fn remove(&self, tree: usize, key: &[u8]) -> Result> { - let tree = self.get_tree(tree)?; - let old_val = tree.remove(key)?; - Ok(old_val.map(|x| x.to_vec())) - } - - fn clear(&self, tree: usize) -> Result<()> { - let tree = self.get_tree(tree)?; - tree.clear()?; - Ok(()) - } - - fn iter(&self, tree: usize) -> Result> { - let tree = self.get_tree(tree)?; - Ok(Box::new(tree.iter().map(|v| { - v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into) - }))) - } - - fn iter_rev(&self, tree: usize) -> Result> { - let tree = self.get_tree(tree)?; - Ok(Box::new(tree.iter().rev().map(|v| { - v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into) - }))) - } - - fn range<'r>( - &self, - tree: usize, - low: Bound<&'r [u8]>, - high: Bound<&'r [u8]>, - ) -> Result> { - let tree = self.get_tree(tree)?; - Ok(Box::new(tree.range::<&'r [u8], _>((low, high)).map(|v| { - v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into) - }))) - } - fn range_rev<'r>( - &self, - tree: usize, - low: Bound<&'r [u8]>, - high: Bound<&'r [u8]>, - ) -> Result> { - let tree = self.get_tree(tree)?; - Ok(Box::new(tree.range::<&'r [u8], _>((low, high)).rev().map( - |v| v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into), - ))) - } - - // ---- - - fn transaction(&self, f: &dyn ITxFn) -> TxResult { - let trees = self.trees.read().unwrap(); - let res = trees.0.transaction(|txtrees| { - let mut tx = SledTx { - trees: txtrees, - err: Cell::new(None), - }; - match f.try_on(&mut tx) { - TxFnResult::Ok(on_commit) => { - assert!(tx.err.into_inner().is_none()); - Ok(on_commit) - } - TxFnResult::Abort => { - assert!(tx.err.into_inner().is_none()); - Err(ConflictableTransactionError::Abort(())) - } - TxFnResult::DbErr => { - let e = tx.err.into_inner().expect("No DB error"); - Err(e.into()) - } - } - }); - match res { - Ok(on_commit) => Ok(on_commit), - Err(TransactionError::Abort(())) => Err(TxError::Abort(())), - Err(TransactionError::Storage(s)) => Err(TxError::Db(s.into())), - } - } -} - -// ---- - -struct SledTx<'a> { - trees: &'a [TransactionalTree], - err: Cell>, -} - -impl<'a> SledTx<'a> { - fn get_tree(&self, i: usize) -> TxOpResult<&TransactionalTree> { - self.trees.get(i).ok_or_else(|| { - TxOpError(Error( - "invalid tree id (it might have been openned after the transaction started)".into(), - )) - }) - } - - fn save_error( - &self, - v: std::result::Result, - ) -> TxOpResult { - match v { - Ok(x) => Ok(x), - Err(e) => { - let txt = format!("{}", e); - self.err.set(Some(e)); - Err(TxOpError(Error(txt.into()))) - } - } - } -} - -impl<'a> ITx for SledTx<'a> { - fn get(&self, tree: usize, key: &[u8]) -> TxOpResult> { - let tree = self.get_tree(tree)?; - let tmp = self.save_error(tree.get(key))?; - Ok(tmp.map(|x| x.to_vec())) - } - fn len(&self, _tree: usize) -> TxOpResult { - unimplemented!(".len() in transaction not supported with Sled backend") - } - - fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult> { - let tree = self.get_tree(tree)?; - let old_val = self.save_error(tree.insert(key, value))?; - Ok(old_val.map(|x| x.to_vec())) - } - fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult> { - let tree = self.get_tree(tree)?; - let old_val = self.save_error(tree.remove(key))?; - Ok(old_val.map(|x| x.to_vec())) - } - - fn iter(&self, _tree: usize) -> TxOpResult> { - unimplemented!("Iterators in transactions not supported with Sled backend"); - } - fn iter_rev(&self, _tree: usize) -> TxOpResult> { - unimplemented!("Iterators in transactions not supported with Sled backend"); - } - - fn range<'r>( - &self, - _tree: usize, - _low: Bound<&'r [u8]>, - _high: Bound<&'r [u8]>, - ) -> TxOpResult> { - unimplemented!("Iterators in transactions not supported with Sled backend"); - } - fn range_rev<'r>( - &self, - _tree: usize, - _low: Bound<&'r [u8]>, - _high: Bound<&'r [u8]>, - ) -> TxOpResult> { - unimplemented!("Iterators in transactions not supported with Sled backend"); - } -} diff --git a/src/db/test.rs b/src/db/test.rs index cd99eafa..d4c875f0 100644 --- a/src/db/test.rs +++ b/src/db/test.rs @@ -90,17 +90,6 @@ fn test_lmdb_db() { drop(path); } -#[test] -#[cfg(feature = "sled")] -fn test_sled_db() { - use crate::sled_adapter::SledDb; - - let path = mktemp::Temp::new_dir().unwrap(); - let db = SledDb::init(sled::open(path.to_path_buf()).unwrap()); - test_suite(db); - drop(path); -} - #[test] #[cfg(feature = "sqlite")] fn test_sqlite_db() { diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 00ecb35e..53449a1c 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -80,12 +80,11 @@ k2v-client.workspace = true [features] -default = [ "bundled-libs", "metrics", "sled", "lmdb", "sqlite", "k2v" ] +default = [ "bundled-libs", "metrics", "lmdb", "sqlite", "k2v" ] k2v = [ "garage_util/k2v", "garage_api/k2v" ] -# Database engines, Sled is still our default even though we don't like it -sled = [ "garage_model/sled" ] +# Database engines lmdb = [ "garage_model/lmdb" ] sqlite = [ "garage_model/sqlite" ] diff --git a/src/garage/cli/convert_db.rs b/src/garage/cli/convert_db.rs index 2aadb1d6..5346d55a 100644 --- a/src/garage/cli/convert_db.rs +++ b/src/garage/cli/convert_db.rs @@ -11,7 +11,7 @@ pub struct ConvertDbOpt { /// https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#db-engine-since-v0-8-0) #[structopt(short = "i")] input_path: PathBuf, - /// Input database engine (sled, lmdb or sqlite; limited by db engines + /// Input database engine (lmdb or sqlite; limited by db engines /// enabled in this build) #[structopt(short = "a")] input_engine: Engine, diff --git a/src/garage/main.rs b/src/garage/main.rs index e489fff0..5e9c061f 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -18,8 +18,8 @@ compile_error!("Either bundled-libs or system-libs Cargo feature must be enabled #[cfg(all(feature = "bundled-libs", feature = "system-libs"))] compile_error!("Only one of bundled-libs and system-libs Cargo features must be enabled"); -#[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))] -compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite."); +#[cfg(not(any(feature = "lmdb", feature = "sqlite")))] +compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb or sqlite."); use std::net::SocketAddr; use std::path::PathBuf; @@ -72,8 +72,6 @@ async fn main() { let features = &[ #[cfg(feature = "k2v")] "k2v", - #[cfg(feature = "sled")] - "sled", #[cfg(feature = "lmdb")] "lmdb", #[cfg(feature = "sqlite")] diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 776671d0..a6bcfbe7 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -42,8 +42,7 @@ tokio.workspace = true opentelemetry.workspace = true [features] -default = [ "sled", "lmdb", "sqlite" ] +default = [ "lmdb", "sqlite" ] k2v = [ "garage_util/k2v" ] lmdb = [ "garage_db/lmdb" ] -sled = [ "garage_db/sled" ] sqlite = [ "garage_db/sqlite" ] diff --git a/src/model/garage.rs b/src/model/garage.rs index 7ec8b22e..8987c594 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -118,9 +118,6 @@ impl Garage { .ok_or_message("Invalid `db_engine` value in configuration file")?; let mut db_path = config.metadata_dir.clone(); match db_engine { - db::Engine::Sled => { - db_path.push("db"); - } db::Engine::Sqlite => { db_path.push("db.sqlite"); } @@ -134,8 +131,6 @@ impl Garage { v if v == usize::default() => None, v => Some(v), }, - sled_cache_capacity: config.sled_cache_capacity, - sled_flush_every_ms: config.sled_flush_every_ms, }; let db = db::open_db(&db_path, db_engine, &db_opt) .ok_or_message("Unable to open metadata db")?; diff --git a/src/table/gc.rs b/src/table/gc.rs index ef788749..65ad0c42 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -334,9 +334,9 @@ impl Worker for GcWorker { } } -/// An entry stored in the gc_todo Sled tree associated with the table +/// An entry stored in the gc_todo db tree associated with the table /// Contains helper function for parsing, saving, and removing -/// such entry in Sled +/// such entry in the db /// /// Format of an entry: /// - key = 8 bytes: timestamp of tombstone @@ -353,7 +353,7 @@ pub(crate) struct GcTodoEntry { } impl GcTodoEntry { - /// Creates a new GcTodoEntry (not saved in Sled) from its components: + /// Creates a new GcTodoEntry (not saved in the db) from its components: /// the key of an entry in the table, and the hash of the associated /// serialized value pub(crate) fn new(key: Vec, value_hash: Hash) -> Self { diff --git a/src/table/merkle.rs b/src/table/merkle.rs index 01271c58..be0ae243 100644 --- a/src/table/merkle.rs +++ b/src/table/merkle.rs @@ -31,14 +31,14 @@ pub struct MerkleUpdater { // - value = the hash of the full serialized item, if present, // or an empty vec if item is absent (deleted) // Fields in data: - // pub(crate) merkle_todo: sled::Tree, + // pub(crate) merkle_todo: db::Tree, // pub(crate) merkle_todo_notify: Notify, // Content of the merkle tree: items where // - key = .bytes() for MerkleNodeKey // - value = serialization of a MerkleNode, assumed to be MerkleNode::empty if not found // Field in data: - // pub(crate) merkle_tree: sled::Tree, + // pub(crate) merkle_tree: db::Tree, empty_node_hash: Hash, } diff --git a/src/util/config.rs b/src/util/config.rs index b7f27676..e243c813 100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -87,20 +87,10 @@ pub struct Config { pub kubernetes_discovery: Option, // -- DB - /// Database engine to use for metadata (options: sled, sqlite, lmdb) + /// Database engine to use for metadata (options: sqlite, lmdb) #[serde(default = "default_db_engine")] pub db_engine: String, - /// Sled cache size, in bytes - #[serde( - deserialize_with = "deserialize_capacity", - default = "default_sled_cache_capacity" - )] - pub sled_cache_capacity: usize, - /// Sled flush interval in milliseconds - #[serde(default = "default_sled_flush_every_ms")] - pub sled_flush_every_ms: u64, - /// LMDB map size #[serde(deserialize_with = "deserialize_capacity", default)] pub lmdb_map_size: usize, @@ -246,13 +236,6 @@ fn default_db_engine() -> String { "lmdb".into() } -fn default_sled_cache_capacity() -> usize { - 128 * 1024 * 1024 -} -fn default_sled_flush_every_ms() -> u64 { - 2000 -} - fn default_block_size() -> usize { 1048576 } -- cgit v1.2.3 From 05c92204ecab87540806073ac4deedfd58519240 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Mar 2024 14:59:56 +0100 Subject: [rm-sled] Remove counted_tree_hack --- src/block/manager.rs | 7 +-- src/block/metrics.rs | 17 +++--- src/block/resync.rs | 15 ++--- src/db/counted_tree_hack.rs | 127 --------------------------------------- src/db/lib.rs | 9 --- src/db/lmdb_adapter.rs | 4 -- src/db/sqlite_adapter.rs | 4 -- src/garage/admin/mod.rs | 51 +++------------- src/garage/cli/structs.rs | 4 -- src/model/s3/lifecycle_worker.rs | 8 +-- src/table/data.rs | 6 +- src/table/gc.rs | 18 +++--- src/table/merkle.rs | 4 -- src/table/metrics.rs | 21 ++++--- 14 files changed, 48 insertions(+), 247 deletions(-) delete mode 100644 src/db/counted_tree_hack.rs (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index c7e4df17..18fadf85 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -378,11 +378,6 @@ impl BlockManager { Ok(self.rc.rc.len()?) } - /// Get number of items in the refcount table - pub fn rc_fast_len(&self) -> Result, Error> { - Ok(self.rc.rc.fast_len()?) - } - /// Send command to start/stop/manager scrub worker pub async fn send_scrub_command(&self, cmd: ScrubWorkerCommand) -> Result<(), Error> { let tx = self.tx_scrub_command.load(); @@ -398,7 +393,7 @@ impl BlockManager { /// List all resync errors pub fn list_resync_errors(&self) -> Result, Error> { - let mut blocks = Vec::with_capacity(self.resync.errors.len()); + let mut blocks = Vec::with_capacity(self.resync.errors.len()?); for ent in self.resync.errors.iter()? { let (hash, cnt) = ent?; let cnt = ErrorCounter::decode(&cnt); diff --git a/src/block/metrics.rs b/src/block/metrics.rs index 6659df32..8e10afdf 100644 --- a/src/block/metrics.rs +++ b/src/block/metrics.rs @@ -1,7 +1,6 @@ use opentelemetry::{global, metrics::*}; use garage_db as db; -use garage_db::counted_tree_hack::CountedTree; /// TableMetrics reference all counter used for metrics pub struct BlockManagerMetrics { @@ -29,8 +28,8 @@ impl BlockManagerMetrics { pub fn new( compression_level: Option, rc_tree: db::Tree, - resync_queue: CountedTree, - resync_errors: CountedTree, + resync_queue: db::Tree, + resync_errors: db::Tree, ) -> Self { let meter = global::meter("garage_model/block"); Self { @@ -45,15 +44,17 @@ impl BlockManagerMetrics { .init(), _rc_size: meter .u64_value_observer("block.rc_size", move |observer| { - if let Ok(Some(v)) = rc_tree.fast_len() { - observer.observe(v as u64, &[]) + if let Ok(value) = rc_tree.len() { + observer.observe(value as u64, &[]) } }) .with_description("Number of blocks known to the reference counter") .init(), _resync_queue_len: meter .u64_value_observer("block.resync_queue_length", move |observer| { - observer.observe(resync_queue.len() as u64, &[]) + if let Ok(value) = resync_queue.len() { + observer.observe(value as u64, &[]); + } }) .with_description( "Number of block hashes queued for local check and possible resync", @@ -61,7 +62,9 @@ impl BlockManagerMetrics { .init(), _resync_errored_blocks: meter .u64_value_observer("block.resync_errored_blocks", move |observer| { - observer.observe(resync_errors.len() as u64, &[]) + if let Ok(value) = resync_errors.len() { + observer.observe(value as u64, &[]); + } }) .with_description("Number of block hashes whose last resync resulted in an error") .init(), diff --git a/src/block/resync.rs b/src/block/resync.rs index 2516ba08..48c2cef1 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -15,7 +15,6 @@ use opentelemetry::{ }; use garage_db as db; -use garage_db::counted_tree_hack::CountedTree; use garage_util::background::*; use garage_util::data::*; @@ -47,9 +46,9 @@ pub(crate) const MAX_RESYNC_WORKERS: usize = 8; const INITIAL_RESYNC_TRANQUILITY: u32 = 2; pub struct BlockResyncManager { - pub(crate) queue: CountedTree, + pub(crate) queue: db::Tree, pub(crate) notify: Arc, - pub(crate) errors: CountedTree, + pub(crate) errors: db::Tree, busy_set: BusySet, @@ -90,12 +89,10 @@ impl BlockResyncManager { let queue = db .open_tree("block_local_resync_queue") .expect("Unable to open block_local_resync_queue tree"); - let queue = CountedTree::new(queue).expect("Could not count block_local_resync_queue"); let errors = db .open_tree("block_local_resync_errors") .expect("Unable to open block_local_resync_errors tree"); - let errors = CountedTree::new(errors).expect("Could not count block_local_resync_errors"); let persister = PersisterShared::new(&system.metadata_dir, "resync_cfg"); @@ -110,16 +107,12 @@ impl BlockResyncManager { /// Get lenght of resync queue pub fn queue_len(&self) -> Result { - // This currently can't return an error because the CountedTree hack - // doesn't error on .len(), but this will change when we remove the hack - // (hopefully someday!) - Ok(self.queue.len()) + Ok(self.queue.len()?) } /// Get number of blocks that have an error pub fn errors_len(&self) -> Result { - // (see queue_len comment) - Ok(self.errors.len()) + Ok(self.errors.len()?) } /// Clear the error counter for a block and put it in queue immediately diff --git a/src/db/counted_tree_hack.rs b/src/db/counted_tree_hack.rs deleted file mode 100644 index a4ce12e0..00000000 --- a/src/db/counted_tree_hack.rs +++ /dev/null @@ -1,127 +0,0 @@ -//! This hack allows a db tree to keep in RAM a counter of the number of entries -//! it contains, which is used to call .len() on it. This is usefull only for -//! the sled backend where .len() otherwise would have to traverse the whole -//! tree to count items. For sqlite and lmdb, this is mostly useless (but -//! hopefully not harmfull!). Note that a CountedTree cannot be part of a -//! transaction. - -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, -}; - -use crate::{Result, Tree, TxError, Value, ValueIter}; - -#[derive(Clone)] -pub struct CountedTree(Arc); - -struct CountedTreeInternal { - tree: Tree, - len: AtomicUsize, -} - -impl CountedTree { - pub fn new(tree: Tree) -> Result { - let len = tree.len()?; - Ok(Self(Arc::new(CountedTreeInternal { - tree, - len: AtomicUsize::new(len), - }))) - } - - pub fn len(&self) -> usize { - self.0.len.load(Ordering::SeqCst) - } - - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - pub fn get>(&self, key: K) -> Result> { - self.0.tree.get(key) - } - - pub fn first(&self) -> Result> { - self.0.tree.first() - } - - pub fn iter(&self) -> Result> { - self.0.tree.iter() - } - - // ---- writing functions ---- - - pub fn insert(&self, key: K, value: V) -> Result> - where - K: AsRef<[u8]>, - V: AsRef<[u8]>, - { - let old_val = self.0.tree.insert(key, value)?; - if old_val.is_none() { - self.0.len.fetch_add(1, Ordering::SeqCst); - } - Ok(old_val) - } - - pub fn remove>(&self, key: K) -> Result> { - let old_val = self.0.tree.remove(key)?; - if old_val.is_some() { - self.0.len.fetch_sub(1, Ordering::SeqCst); - } - Ok(old_val) - } - - pub fn compare_and_swap( - &self, - key: K, - expected_old: Option, - new: Option, - ) -> Result - where - K: AsRef<[u8]>, - OV: AsRef<[u8]>, - NV: AsRef<[u8]>, - { - let old_some = expected_old.is_some(); - let new_some = new.is_some(); - - let tx_res = self.0.tree.db().transaction(|tx| { - let old_val = tx.get(&self.0.tree, &key)?; - let is_same = match (&old_val, &expected_old) { - (None, None) => true, - (Some(x), Some(y)) if x == y.as_ref() => true, - _ => false, - }; - if is_same { - match &new { - Some(v) => { - tx.insert(&self.0.tree, &key, v)?; - } - None => { - tx.remove(&self.0.tree, &key)?; - } - } - Ok(()) - } else { - Err(TxError::Abort(())) - } - }); - - match tx_res { - Ok(()) => { - match (old_some, new_some) { - (false, true) => { - self.0.len.fetch_add(1, Ordering::SeqCst); - } - (true, false) => { - self.0.len.fetch_sub(1, Ordering::SeqCst); - } - _ => (), - } - Ok(true) - } - Err(TxError::Abort(())) => Ok(false), - Err(TxError::Db(e)) => Err(e), - } - } -} diff --git a/src/db/lib.rs b/src/db/lib.rs index 8975f295..e81c712c 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -6,8 +6,6 @@ pub mod lmdb_adapter; #[cfg(feature = "sqlite")] pub mod sqlite_adapter; -pub mod counted_tree_hack; - pub mod open; #[cfg(test)] @@ -187,10 +185,6 @@ impl Tree { pub fn len(&self) -> Result { self.0.len(self.1) } - #[inline] - pub fn fast_len(&self) -> Result> { - self.0.fast_len(self.1) - } #[inline] pub fn first(&self) -> Result> { @@ -326,9 +320,6 @@ pub(crate) trait IDb: Send + Sync { fn get(&self, tree: usize, key: &[u8]) -> Result>; fn len(&self, tree: usize) -> Result; - fn fast_len(&self, _tree: usize) -> Result> { - Ok(None) - } fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result>; fn remove(&self, tree: usize, key: &[u8]) -> Result>; diff --git a/src/db/lmdb_adapter.rs b/src/db/lmdb_adapter.rs index 59fa132d..99b29a74 100644 --- a/src/db/lmdb_adapter.rs +++ b/src/db/lmdb_adapter.rs @@ -121,10 +121,6 @@ impl IDb for LmdbDb { Ok(tree.len(&tx)?.try_into().unwrap()) } - fn fast_len(&self, tree: usize) -> Result> { - Ok(Some(self.len(tree)?)) - } - fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result> { let tree = self.get_tree(tree)?; let mut tx = self.db.write_txn()?; diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs index 9f967c66..1a7ae5f0 100644 --- a/src/db/sqlite_adapter.rs +++ b/src/db/sqlite_adapter.rs @@ -144,10 +144,6 @@ impl IDb for SqliteDb { } } - fn fast_len(&self, tree: usize) -> Result> { - Ok(Some(self.len(tree)?)) - } - fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result> { trace!("insert {}: lock db", tree); let this = self.0.lock().unwrap(); diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index de7851e1..896751cc 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -217,11 +217,11 @@ impl AdminRpcHandler { // Gather table statistics let mut table = vec![" Table\tItems\tMklItems\tMklTodo\tGcTodo".into()]; - table.push(self.gather_table_stats(&self.garage.bucket_table, opt.detailed)?); - table.push(self.gather_table_stats(&self.garage.key_table, opt.detailed)?); - table.push(self.gather_table_stats(&self.garage.object_table, opt.detailed)?); - table.push(self.gather_table_stats(&self.garage.version_table, opt.detailed)?); - table.push(self.gather_table_stats(&self.garage.block_ref_table, opt.detailed)?); + table.push(self.gather_table_stats(&self.garage.bucket_table)?); + table.push(self.gather_table_stats(&self.garage.key_table)?); + table.push(self.gather_table_stats(&self.garage.object_table)?); + table.push(self.gather_table_stats(&self.garage.version_table)?); + table.push(self.gather_table_stats(&self.garage.block_ref_table)?); write!( &mut ret, "\nTable stats:\n{}", @@ -231,15 +231,7 @@ impl AdminRpcHandler { // Gather block manager statistics writeln!(&mut ret, "\nBlock manager stats:").unwrap(); - let rc_len = if opt.detailed { - self.garage.block_manager.rc_len()?.to_string() - } else { - self.garage - .block_manager - .rc_fast_len()? - .map(|x| x.to_string()) - .unwrap_or_else(|| "NC".into()) - }; + let rc_len = self.garage.block_manager.rc_len()?.to_string(); writeln!( &mut ret, @@ -260,10 +252,6 @@ impl AdminRpcHandler { ) .unwrap(); - if !opt.detailed { - writeln!(&mut ret, "\nIf values are missing above (marked as NC), consider adding the --detailed flag (this will be slow).").unwrap(); - } - if !opt.skip_global { write!(&mut ret, "\n{}", self.gather_cluster_stats()).unwrap(); } @@ -365,34 +353,13 @@ impl AdminRpcHandler { ret } - fn gather_table_stats( - &self, - t: &Arc>, - detailed: bool, - ) -> Result + fn gather_table_stats(&self, t: &Arc>) -> Result where F: TableSchema + 'static, R: TableReplication + 'static, { - let (data_len, mkl_len) = if detailed { - ( - t.data.store.len().map_err(GarageError::from)?.to_string(), - t.merkle_updater.merkle_tree_len()?.to_string(), - ) - } else { - ( - t.data - .store - .fast_len() - .map_err(GarageError::from)? - .map(|x| x.to_string()) - .unwrap_or_else(|| "NC".into()), - t.merkle_updater - .merkle_tree_fast_len()? - .map(|x| x.to_string()) - .unwrap_or_else(|| "NC".into()), - ) - }; + let data_len = t.data.store.len().map_err(GarageError::from)?.to_string(); + let mkl_len = t.merkle_updater.merkle_tree_len()?.to_string(); Ok(format!( " {}\t{}\t{}\t{}\t{}", diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 40e47ee1..7e7ab71b 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -553,10 +553,6 @@ pub struct StatsOpt { #[structopt(short = "a", long = "all-nodes")] pub all_nodes: bool, - /// Gather detailed statistics (this can be long) - #[structopt(short = "d", long = "detailed")] - pub detailed: bool, - /// Don't show global cluster stats (internal use in RPC) #[structopt(skip)] #[serde(default)] diff --git a/src/model/s3/lifecycle_worker.rs b/src/model/s3/lifecycle_worker.rs index 50d4283f..9ecf168c 100644 --- a/src/model/s3/lifecycle_worker.rs +++ b/src/model/s3/lifecycle_worker.rs @@ -121,13 +121,7 @@ impl Worker for LifecycleWorker { mpu_aborted, .. } => { - let n_objects = self - .garage - .object_table - .data - .store - .fast_len() - .unwrap_or(None); + let n_objects = self.garage.object_table.data.store.len().ok(); let progress = match n_objects { None => "...".to_string(), Some(total) => format!( diff --git a/src/table/data.rs b/src/table/data.rs index 7f6b7847..09f4e008 100644 --- a/src/table/data.rs +++ b/src/table/data.rs @@ -6,7 +6,6 @@ use serde_bytes::ByteBuf; use tokio::sync::Notify; use garage_db as db; -use garage_db::counted_tree_hack::CountedTree; use garage_util::data::*; use garage_util::error::*; @@ -36,7 +35,7 @@ pub struct TableData { pub(crate) insert_queue: db::Tree, pub(crate) insert_queue_notify: Arc, - pub(crate) gc_todo: CountedTree, + pub(crate) gc_todo: db::Tree, pub(crate) metrics: TableMetrics, } @@ -61,7 +60,6 @@ impl TableData { let gc_todo = db .open_tree(format!("{}:gc_todo_v2", F::TABLE_NAME)) .expect("Unable to open GC DB tree"); - let gc_todo = CountedTree::new(gc_todo).expect("Cannot count gc_todo_v2"); let metrics = TableMetrics::new( F::TABLE_NAME, @@ -370,6 +368,6 @@ impl TableData { } pub fn gc_todo_len(&self) -> Result { - Ok(self.gc_todo.len()) + Ok(self.gc_todo.len()?) } } diff --git a/src/table/gc.rs b/src/table/gc.rs index 65ad0c42..d30a1849 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -10,7 +10,7 @@ use serde_bytes::ByteBuf; use futures::future::join_all; use tokio::sync::watch; -use garage_db::counted_tree_hack::CountedTree; +use garage_db as db; use garage_util::background::*; use garage_util::data::*; @@ -376,7 +376,7 @@ impl GcTodoEntry { } /// Saves the GcTodoEntry in the gc_todo tree - pub(crate) fn save(&self, gc_todo_tree: &CountedTree) -> Result<(), Error> { + pub(crate) fn save(&self, gc_todo_tree: &db::Tree) -> Result<(), Error> { gc_todo_tree.insert(self.todo_table_key(), self.value_hash.as_slice())?; Ok(()) } @@ -386,12 +386,14 @@ impl GcTodoEntry { /// This is usefull to remove a todo entry only under the condition /// that it has not changed since the time it was read, i.e. /// what we have to do is still the same - pub(crate) fn remove_if_equal(&self, gc_todo_tree: &CountedTree) -> Result<(), Error> { - gc_todo_tree.compare_and_swap::<_, _, &[u8]>( - &self.todo_table_key(), - Some(self.value_hash), - None, - )?; + pub(crate) fn remove_if_equal(&self, gc_todo_tree: &db::Tree) -> Result<(), Error> { + gc_todo_tree.db().transaction(|txn| { + let key = self.todo_table_key(); + if txn.get(gc_todo_tree, &key)?.as_deref() == Some(self.value_hash.as_slice()) { + txn.remove(gc_todo_tree, &key)?; + } + Ok(()) + })?; Ok(()) } diff --git a/src/table/merkle.rs b/src/table/merkle.rs index be0ae243..596d5805 100644 --- a/src/table/merkle.rs +++ b/src/table/merkle.rs @@ -291,10 +291,6 @@ impl MerkleUpdater { Ok(self.data.merkle_tree.len()?) } - pub fn merkle_tree_fast_len(&self) -> Result, Error> { - Ok(self.data.merkle_tree.fast_len()?) - } - pub fn todo_len(&self) -> Result { Ok(self.data.merkle_todo.len()?) } diff --git a/src/table/metrics.rs b/src/table/metrics.rs index 8318a84f..7bb0959a 100644 --- a/src/table/metrics.rs +++ b/src/table/metrics.rs @@ -1,7 +1,6 @@ use opentelemetry::{global, metrics::*, KeyValue}; use garage_db as db; -use garage_db::counted_tree_hack::CountedTree; /// TableMetrics reference all counter used for metrics pub struct TableMetrics { @@ -27,7 +26,7 @@ impl TableMetrics { store: db::Tree, merkle_tree: db::Tree, merkle_todo: db::Tree, - gc_todo: CountedTree, + gc_todo: db::Tree, ) -> Self { let meter = global::meter(table_name); TableMetrics { @@ -35,9 +34,9 @@ impl TableMetrics { .u64_value_observer( "table.size", move |observer| { - if let Ok(Some(v)) = store.fast_len() { + if let Ok(value) = store.len() { observer.observe( - v as u64, + value as u64, &[KeyValue::new("table_name", table_name)], ); } @@ -49,9 +48,9 @@ impl TableMetrics { .u64_value_observer( "table.merkle_tree_size", move |observer| { - if let Ok(Some(v)) = merkle_tree.fast_len() { + if let Ok(value) = merkle_tree.len() { observer.observe( - v as u64, + value as u64, &[KeyValue::new("table_name", table_name)], ); } @@ -77,10 +76,12 @@ impl TableMetrics { .u64_value_observer( "table.gc_todo_queue_length", move |observer| { - observer.observe( - gc_todo.len() as u64, - &[KeyValue::new("table_name", table_name)], - ); + if let Ok(value) = gc_todo.len() { + observer.observe( + value as u64, + &[KeyValue::new("table_name", table_name)], + ); + } }, ) .with_description("Table garbage collector TODO queue length") -- cgit v1.2.3 From 66c23890c1a6e73fd6c5246642e087cd2866451e Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Mar 2024 16:02:58 +0100 Subject: [rm-sled] Implement some missing functionality in garage_db --- src/db/lib.rs | 6 ++++++ src/db/lmdb_adapter.rs | 10 ++++++++-- src/db/sqlite_adapter.rs | 5 +++++ 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/db/lib.rs b/src/db/lib.rs index e81c712c..5881954d 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -274,6 +274,11 @@ impl<'a> Transaction<'a> { pub fn remove>(&mut self, tree: &Tree, key: T) -> TxOpResult> { self.tx.remove(tree.1, key.as_ref()) } + /// Clears all values in a tree + #[inline] + pub fn clear(&mut self, tree: &Tree) -> TxOpResult<()> { + self.tx.clear(tree.1) + } #[inline] pub fn iter(&self, tree: &Tree) -> TxOpResult> { @@ -350,6 +355,7 @@ pub(crate) trait ITx { fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult>; fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult>; + fn clear(&mut self, tree: usize) -> TxOpResult<()>; fn iter(&self, tree: usize) -> TxOpResult>; fn iter_rev(&self, tree: usize) -> TxOpResult>; diff --git a/src/db/lmdb_adapter.rs b/src/db/lmdb_adapter.rs index 99b29a74..01c360b4 100644 --- a/src/db/lmdb_adapter.rs +++ b/src/db/lmdb_adapter.rs @@ -238,8 +238,9 @@ impl<'a> ITx for LmdbTx<'a> { None => Ok(None), } } - fn len(&self, _tree: usize) -> TxOpResult { - unimplemented!(".len() in transaction not supported with LMDB backend") + fn len(&self, tree: usize) -> TxOpResult { + let tree = self.get_tree(tree)?; + Ok(tree.len(&self.tx)? as usize) } fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult> { @@ -254,6 +255,11 @@ impl<'a> ITx for LmdbTx<'a> { tree.delete(&mut self.tx, key)?; Ok(old_val) } + fn clear(&mut self, tree: usize) -> TxOpResult<()> { + let tree = *self.get_tree(tree)?; + tree.clear(&mut self.tx)?; + Ok(()) + } fn iter(&self, _tree: usize) -> TxOpResult> { unimplemented!("Iterators in transactions not supported with LMDB backend"); diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs index 1a7ae5f0..d1394355 100644 --- a/src/db/sqlite_adapter.rs +++ b/src/db/sqlite_adapter.rs @@ -363,6 +363,11 @@ impl<'a> ITx for SqliteTx<'a> { Ok(old_val) } + fn clear(&mut self, tree: usize) -> TxOpResult<()> { + let tree = self.get_tree(tree)?; + self.tx.execute(&format!("DELETE FROM {}", tree), [])?; + Ok(()) + } fn iter(&self, _tree: usize) -> TxOpResult> { unimplemented!(); -- cgit v1.2.3 From b942949940b5a0dec8e8640c44a2705a4482a2e4 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Mar 2024 16:38:01 +0100 Subject: [rm-sled] Implement iterators in sqlite & lmdb transactions with way too much unsafe code --- src/db/lib.rs | 1 + src/db/lmdb_adapter.rs | 48 ++++++++++++------ src/db/sqlite_adapter.rs | 125 +++++++++++++++++++++++++++++++++++++++++------ src/db/test.rs | 49 +++++++++++++++++++ 4 files changed, 195 insertions(+), 28 deletions(-) (limited to 'src') diff --git a/src/db/lib.rs b/src/db/lib.rs index 5881954d..ff511b5f 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -51,6 +51,7 @@ pub type Result = std::result::Result; pub struct TxOpError(pub(crate) Error); pub type TxOpResult = std::result::Result; +#[derive(Debug)] pub enum TxError { Abort(E), Db(Error), diff --git a/src/db/lmdb_adapter.rs b/src/db/lmdb_adapter.rs index 01c360b4..ddfb6ed5 100644 --- a/src/db/lmdb_adapter.rs +++ b/src/db/lmdb_adapter.rs @@ -261,32 +261,42 @@ impl<'a> ITx for LmdbTx<'a> { Ok(()) } - fn iter(&self, _tree: usize) -> TxOpResult> { - unimplemented!("Iterators in transactions not supported with LMDB backend"); + fn iter(&self, tree: usize) -> TxOpResult> { + let tree = *self.get_tree(tree)?; + Ok(Box::new(tree.iter(&self.tx)?.map(tx_iter_item))) } - fn iter_rev(&self, _tree: usize) -> TxOpResult> { - unimplemented!("Iterators in transactions not supported with LMDB backend"); + fn iter_rev(&self, tree: usize) -> TxOpResult> { + let tree = *self.get_tree(tree)?; + Ok(Box::new(tree.rev_iter(&self.tx)?.map(tx_iter_item))) } fn range<'r>( &self, - _tree: usize, - _low: Bound<&'r [u8]>, - _high: Bound<&'r [u8]>, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, ) -> TxOpResult> { - unimplemented!("Iterators in transactions not supported with LMDB backend"); + let tree = *self.get_tree(tree)?; + Ok(Box::new( + tree.range(&self.tx, &(low, high))?.map(tx_iter_item), + )) } fn range_rev<'r>( &self, - _tree: usize, - _low: Bound<&'r [u8]>, - _high: Bound<&'r [u8]>, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, ) -> TxOpResult> { - unimplemented!("Iterators in transactions not supported with LMDB backend"); + let tree = *self.get_tree(tree)?; + Ok(Box::new( + tree.rev_range(&self.tx, &(low, high))?.map(tx_iter_item), + )) } } -// ---- +// ---- iterators outside transactions ---- +// complicated, they must hold the transaction object +// therefore a bit of unsafe code (it is a self-referential struct) type IteratorItem<'a> = heed::Result<( >::DItem, @@ -323,6 +333,7 @@ where I: Iterator> + 'a, { fn drop(&mut self) { + // ensure the iterator is dropped before the RoTxn it references drop(self.iter.take()); } } @@ -342,7 +353,16 @@ where } } -// ---- +// ---- iterators within transactions ---- + +fn tx_iter_item<'a>( + item: std::result::Result<(&'a [u8], &'a [u8]), heed::Error>, +) -> TxOpResult<(Vec, Vec)> { + item.map(|(k, v)| (k.to_vec(), v.to_vec())) + .map_err(|e| TxOpError(Error::from(e))) +} + +// ---- utility ---- #[cfg(target_pointer_width = "64")] pub fn recommended_map_size() -> usize { diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs index d1394355..077c1f1b 100644 --- a/src/db/sqlite_adapter.rs +++ b/src/db/sqlite_adapter.rs @@ -369,32 +369,58 @@ impl<'a> ITx for SqliteTx<'a> { Ok(()) } - fn iter(&self, _tree: usize) -> TxOpResult> { - unimplemented!(); + fn iter(&self, tree: usize) -> TxOpResult> { + let tree = self.get_tree(tree)?; + let sql = format!("SELECT k, v FROM {} ORDER BY k ASC", tree); + TxValueIterator::make(self, &sql, []) } - fn iter_rev(&self, _tree: usize) -> TxOpResult> { - unimplemented!(); + fn iter_rev(&self, tree: usize) -> TxOpResult> { + let tree = self.get_tree(tree)?; + let sql = format!("SELECT k, v FROM {} ORDER BY k DESC", tree); + TxValueIterator::make(self, &sql, []) } fn range<'r>( &self, - _tree: usize, - _low: Bound<&'r [u8]>, - _high: Bound<&'r [u8]>, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, ) -> TxOpResult> { - unimplemented!(); + let tree = self.get_tree(tree)?; + + let (bounds_sql, params) = bounds_sql(low, high); + let sql = format!("SELECT k, v FROM {} {} ORDER BY k ASC", tree, bounds_sql); + + let params = params + .iter() + .map(|x| x as &dyn rusqlite::ToSql) + .collect::>(); + + TxValueIterator::make::<&[&dyn rusqlite::ToSql]>(self, &sql, params.as_ref()) } fn range_rev<'r>( &self, - _tree: usize, - _low: Bound<&'r [u8]>, - _high: Bound<&'r [u8]>, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, ) -> TxOpResult> { - unimplemented!(); + let tree = self.get_tree(tree)?; + + let (bounds_sql, params) = bounds_sql(low, high); + let sql = format!("SELECT k, v FROM {} {} ORDER BY k DESC", tree, bounds_sql); + + let params = params + .iter() + .map(|x| x as &dyn rusqlite::ToSql) + .collect::>(); + + TxValueIterator::make::<&[&dyn rusqlite::ToSql]>(self, &sql, params.as_ref()) } } -// ---- +// ---- iterators outside transactions ---- +// complicated, they must hold the Statement and Row objects +// therefore quite some unsafe code (it is a self-referential struct) struct DbValueIterator<'a> { db: MutexGuard<'a, SqliteDbInner>, @@ -471,7 +497,78 @@ impl<'a> Iterator for DbValueIteratorPin<'a> { } } -// ---- +// ---- iterators within transactions ---- +// it's the same except we don't hold a mutex guard, +// only a Statement and a Rows object + +struct TxValueIterator<'a> { + stmt: Statement<'a>, + iter: Option>, + _pin: PhantomPinned, +} + +impl<'a> TxValueIterator<'a> { + fn make( + tx: &'a SqliteTx<'a>, + sql: &str, + args: P, + ) -> TxOpResult> { + let stmt = tx.tx.prepare(sql)?; + let res = TxValueIterator { + stmt, + iter: None, + _pin: PhantomPinned, + }; + let mut boxed = Box::pin(res); + trace!("make iterator with sql: {}", sql); + + unsafe { + let mut stmt = NonNull::from(&boxed.stmt); + let iter = stmt.as_mut().query(args)?; + + let mut_ref: Pin<&mut TxValueIterator<'a>> = Pin::as_mut(&mut boxed); + Pin::get_unchecked_mut(mut_ref).iter = Some(iter); + } + + Ok(Box::new(TxValueIteratorPin(boxed))) + } +} + +impl<'a> Drop for TxValueIterator<'a> { + fn drop(&mut self) { + trace!("drop iter"); + drop(self.iter.take()); + } +} + +struct TxValueIteratorPin<'a>(Pin>>); + +impl<'a> Iterator for TxValueIteratorPin<'a> { + type Item = TxOpResult<(Value, Value)>; + + fn next(&mut self) -> Option { + let next = unsafe { + let mut_ref: Pin<&mut TxValueIterator<'a>> = Pin::as_mut(&mut self.0); + Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() + }; + let row = match next { + Err(e) => return Some(Err(e.into())), + Ok(None) => return None, + Ok(Some(r)) => r, + }; + let k = match row.get::<_, Vec>(0) { + Err(e) => return Some(Err(e.into())), + Ok(x) => x, + }; + let v = match row.get::<_, Vec>(1) { + Err(e) => return Some(Err(e.into())), + Ok(y) => y, + }; + Some(Ok((k, v))) + } +} + +// ---- utility ---- fn bounds_sql<'r>(low: Bound<&'r [u8]>, high: Bound<&'r [u8]>) -> (String, Vec>) { let mut sql = String::new(); diff --git a/src/db/test.rs b/src/db/test.rs index d4c875f0..3add89fb 100644 --- a/src/db/test.rs +++ b/src/db/test.rs @@ -10,8 +10,13 @@ fn test_suite(db: Db) { let vb: &[u8] = &b"plip"[..]; let vc: &[u8] = &b"plup"[..]; + // ---- test simple insert/delete ---- + assert!(tree.insert(ka, va).unwrap().is_none()); assert_eq!(tree.get(ka).unwrap().unwrap(), va); + assert_eq!(tree.len().unwrap(), 1); + + // ---- test transaction logic ---- let res = db.transaction::<_, (), _>(|tx| { assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), va); @@ -37,6 +42,8 @@ fn test_suite(db: Db) { assert!(matches!(res, Err(TxError::Abort(42)))); assert_eq!(tree.get(ka).unwrap().unwrap(), vb); + // ---- test iteration outside of transactions ---- + let mut iter = tree.iter().unwrap(); let next = iter.next().unwrap().unwrap(); assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); @@ -73,6 +80,48 @@ fn test_suite(db: Db) { assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); assert!(iter.next().is_none()); drop(iter); + + // ---- test iteration within transactions ---- + + db.transaction::<_, (), _>(|tx| { + let mut iter = tx.iter(&tree).unwrap(); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc)); + assert!(iter.next().is_none()); + Ok(()) + }) + .unwrap(); + + db.transaction::<_, (), _>(|tx| { + let mut iter = tx.range(&tree, kint..).unwrap(); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc)); + assert!(iter.next().is_none()); + Ok(()) + }) + .unwrap(); + + db.transaction::<_, (), _>(|tx| { + let mut iter = tx.range_rev(&tree, ..kint).unwrap(); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); + assert!(iter.next().is_none()); + Ok(()) + }) + .unwrap(); + + db.transaction::<_, (), _>(|tx| { + let mut iter = tx.iter_rev(&tree).unwrap(); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc)); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); + assert!(iter.next().is_none()); + Ok(()) + }) + .unwrap(); } #[test] -- cgit v1.2.3 From 32aa2463001c0af9f87633a1ff78858dd4157eb9 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Mar 2024 17:39:17 +0100 Subject: [rm-sled] Make proper use of pinning in LMDB adapter + comment unsafe --- src/db/lmdb_adapter.rs | 28 ++++++++++++++++++++++------ src/db/sqlite_adapter.rs | 44 +++++++++++++++++++++++++------------------- 2 files changed, 47 insertions(+), 25 deletions(-) (limited to 'src') diff --git a/src/db/lmdb_adapter.rs b/src/db/lmdb_adapter.rs index ddfb6ed5..5ce7d3e3 100644 --- a/src/db/lmdb_adapter.rs +++ b/src/db/lmdb_adapter.rs @@ -3,6 +3,7 @@ use core::ptr::NonNull; use std::collections::HashMap; use std::convert::TryInto; +use std::pin::Pin; use std::sync::{Arc, RwLock}; use heed::types::ByteSlice; @@ -319,12 +320,20 @@ where where F: FnOnce(&'a RoTxn<'a>) -> Result, { - let mut res = TxAndIterator { tx, iter: None }; + let res = TxAndIterator { tx, iter: None }; + let mut boxed = Box::pin(res); - let tx = unsafe { NonNull::from(&res.tx).as_ref() }; - res.iter = Some(iterfun(tx)?); + // This unsafe allows us to bypass lifetime checks + let tx = unsafe { NonNull::from(&boxed.tx).as_ref() }; + let iter = iterfun(tx)?; - Ok(Box::new(res)) + let mut_ref = Pin::as_mut(&mut boxed); + // This unsafe allows us to write in a field of the pinned struct + unsafe { + Pin::get_unchecked_mut(mut_ref).iter = Some(iter); + } + + Ok(Box::new(TxAndIteratorPin(boxed))) } } @@ -338,14 +347,21 @@ where } } -impl<'a, I> Iterator for TxAndIterator<'a, I> +struct TxAndIteratorPin<'a, I>(Pin>>) +where + I: Iterator> + 'a; + +impl<'a, I> Iterator for TxAndIteratorPin<'a, I> where I: Iterator> + 'a, { type Item = Result<(Value, Value)>; fn next(&mut self) -> Option { - match self.iter.as_mut().unwrap().next() { + let mut_ref = Pin::as_mut(&mut self.0); + // This unsafe allows us to mutably access the iterator field + let next = unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() }; + match next { None => None, Some(Err(e)) => Some(Err(e.into())), Some(Ok((k, v))) => Some(Ok((k.to_vec(), v.to_vec()))), diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs index 077c1f1b..2c6a4159 100644 --- a/src/db/sqlite_adapter.rs +++ b/src/db/sqlite_adapter.rs @@ -444,17 +444,23 @@ impl<'a> DbValueIterator<'a> { let mut boxed = Box::pin(res); trace!("make iterator with sql: {}", sql); - unsafe { - let db = NonNull::from(&boxed.db); - let stmt = db.as_ref().db.prepare(sql)?; + // This unsafe allows us to bypass lifetime checks + let db = unsafe { NonNull::from(&boxed.db).as_ref() }; + let stmt = db.db.prepare(sql)?; - let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut boxed); + let mut_ref = Pin::as_mut(&mut boxed); + // This unsafe allows us to write in a field of the pinned struct + unsafe { Pin::get_unchecked_mut(mut_ref).stmt = Some(stmt); + } - let mut stmt = NonNull::from(&boxed.stmt); - let iter = stmt.as_mut().as_mut().unwrap().query(args)?; + // This unsafe allows us to bypass lifetime checks + let stmt = unsafe { NonNull::from(&boxed.stmt).as_mut() }; + let iter = stmt.as_mut().unwrap().query(args)?; - let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut boxed); + let mut_ref = Pin::as_mut(&mut boxed); + // This unsafe allows us to write in a field of the pinned struct + unsafe { Pin::get_unchecked_mut(mut_ref).iter = Some(iter); } @@ -476,10 +482,9 @@ impl<'a> Iterator for DbValueIteratorPin<'a> { type Item = Result<(Value, Value)>; fn next(&mut self) -> Option { - let next = unsafe { - let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut self.0); - Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() - }; + let mut_ref = Pin::as_mut(&mut self.0); + // This unsafe allows us to mutably access the iterator field + let next = unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() }; let row = match next { Err(e) => return Some(Err(e.into())), Ok(None) => return None, @@ -522,11 +527,13 @@ impl<'a> TxValueIterator<'a> { let mut boxed = Box::pin(res); trace!("make iterator with sql: {}", sql); - unsafe { - let mut stmt = NonNull::from(&boxed.stmt); - let iter = stmt.as_mut().query(args)?; + // This unsafe allows us to bypass lifetime checks + let stmt = unsafe { NonNull::from(&boxed.stmt).as_mut() }; + let iter = stmt.query(args)?; - let mut_ref: Pin<&mut TxValueIterator<'a>> = Pin::as_mut(&mut boxed); + let mut_ref = Pin::as_mut(&mut boxed); + // This unsafe allows us to write in a field of the pinned struct + unsafe { Pin::get_unchecked_mut(mut_ref).iter = Some(iter); } @@ -547,10 +554,9 @@ impl<'a> Iterator for TxValueIteratorPin<'a> { type Item = TxOpResult<(Value, Value)>; fn next(&mut self) -> Option { - let next = unsafe { - let mut_ref: Pin<&mut TxValueIterator<'a>> = Pin::as_mut(&mut self.0); - Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() - }; + let mut_ref = Pin::as_mut(&mut self.0); + // This unsafe allows us to mutably access the iterator field + let next = unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() }; let row = match next { Err(e) => return Some(Err(e.into())), Ok(None) => return None, -- cgit v1.2.3 From 2795b53b8b3ebd162df6b0244b73889e72f67ce0 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 12 Mar 2024 11:15:26 +0100 Subject: [rm-sled] factorize some code in sqlite backend --- src/db/sqlite_adapter.rs | 52 ++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) (limited to 'src') diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs index 2c6a4159..6c556c97 100644 --- a/src/db/sqlite_adapter.rs +++ b/src/db/sqlite_adapter.rs @@ -485,20 +485,7 @@ impl<'a> Iterator for DbValueIteratorPin<'a> { let mut_ref = Pin::as_mut(&mut self.0); // This unsafe allows us to mutably access the iterator field let next = unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() }; - let row = match next { - Err(e) => return Some(Err(e.into())), - Ok(None) => return None, - Ok(Some(r)) => r, - }; - let k = match row.get::<_, Vec>(0) { - Err(e) => return Some(Err(e.into())), - Ok(x) => x, - }; - let v = match row.get::<_, Vec>(1) { - Err(e) => return Some(Err(e.into())), - Ok(y) => y, - }; - Some(Ok((k, v))) + iter_next_row(next) } } @@ -557,20 +544,7 @@ impl<'a> Iterator for TxValueIteratorPin<'a> { let mut_ref = Pin::as_mut(&mut self.0); // This unsafe allows us to mutably access the iterator field let next = unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() }; - let row = match next { - Err(e) => return Some(Err(e.into())), - Ok(None) => return None, - Ok(Some(r)) => r, - }; - let k = match row.get::<_, Vec>(0) { - Err(e) => return Some(Err(e.into())), - Ok(x) => x, - }; - let v = match row.get::<_, Vec>(1) { - Err(e) => return Some(Err(e.into())), - Ok(y) => y, - }; - Some(Ok((k, v))) + iter_next_row(next) } } @@ -614,3 +588,25 @@ fn bounds_sql<'r>(low: Bound<&'r [u8]>, high: Bound<&'r [u8]>) -> (String, Vec( + next_row: rusqlite::Result>, +) -> Option> +where + E: From, +{ + let row = match next_row { + Err(e) => return Some(Err(e.into())), + Ok(None) => return None, + Ok(Some(r)) => r, + }; + let k = match row.get::<_, Vec>(0) { + Err(e) => return Some(Err(e.into())), + Ok(x) => x, + }; + let v = match row.get::<_, Vec>(1) { + Err(e) => return Some(Err(e.into())), + Ok(y) => y, + }; + Some(Ok((k, v))) +} -- cgit v1.2.3 From dc0b78cdb88e9cbfd7dc1a2ee0b15333939be549 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 19 Mar 2024 11:04:20 +0100 Subject: [block-ref-repair] Block refcount recalculation and repair - We always recalculate the reference count of a block before deleting it locally, to make sure that it is indeed zero. - If we had to fetch a remote block but we were not able to get it, check that refcount is indeed > 0. - Repair procedure that checks everything --- src/block/lib.rs | 1 + src/block/manager.rs | 14 ++++-- src/block/rc.rs | 65 +++++++++++++++++++++++- src/block/resync.rs | 17 ++++++- src/garage/cli/structs.rs | 5 +- src/garage/repair/online.rs | 106 ++++++++++++++++++++++++++++++++++++++++ src/model/garage.rs | 8 +++ src/model/s3/block_ref_table.rs | 39 +++++++++++++++ src/util/data.rs | 35 +++++++++++++ src/util/error.rs | 3 ++ 10 files changed, 285 insertions(+), 8 deletions(-) (limited to 'src') diff --git a/src/block/lib.rs b/src/block/lib.rs index 6c4711ef..944f0d83 100644 --- a/src/block/lib.rs +++ b/src/block/lib.rs @@ -11,3 +11,4 @@ mod metrics; mod rc; pub use block::zstd_encode; +pub use rc::CalculateRefcount; diff --git a/src/block/manager.rs b/src/block/manager.rs index eeacf8b9..628ffc71 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -88,7 +88,7 @@ pub struct BlockManager { mutation_lock: Vec>, - pub(crate) rc: BlockRc, + pub rc: BlockRc, pub resync: BlockResyncManager, pub(crate) system: Arc, @@ -229,6 +229,12 @@ impl BlockManager { } } + /// Initialization: set how block references are recalculated + /// for repair operations + pub fn set_recalc_rc(&self, recalc: Vec) { + self.rc.recalc_rc.store(Some(Arc::new(recalc))); + } + /// Ask nodes that might have a (possibly compressed) block for it /// Return it as a stream with a header async fn rpc_get_raw_block_streaming( @@ -316,9 +322,9 @@ impl BlockManager { }; } - let msg = format!("Get block {:?}: no node returned a valid block", hash); - debug!("{}", msg); - Err(Error::Message(msg)) + let err = Error::MissingBlock(*hash); + debug!("{}", err); + Err(err) } // ---- Public interface ---- diff --git a/src/block/rc.rs b/src/block/rc.rs index b6afb277..bf5aeced 100644 --- a/src/block/rc.rs +++ b/src/block/rc.rs @@ -1,5 +1,7 @@ use std::convert::TryInto; +use arc_swap::ArcSwapOption; + use garage_db as db; use garage_util::data::*; @@ -8,13 +10,20 @@ use garage_util::time::*; use crate::manager::BLOCK_GC_DELAY; +pub type CalculateRefcount = + Box db::TxResult + Send + Sync>; + pub struct BlockRc { - pub(crate) rc: db::Tree, + pub rc: db::Tree, + pub(crate) recalc_rc: ArcSwapOption>, } impl BlockRc { pub(crate) fn new(rc: db::Tree) -> Self { - Self { rc } + Self { + rc, + recalc_rc: ArcSwapOption::new(None), + } } /// Increment the reference counter associated to a hash. @@ -68,6 +77,58 @@ impl BlockRc { })?; Ok(()) } + + /// Recalculate the reference counter of a block + /// to fix potential inconsistencies + pub fn recalculate_rc(&self, hash: &Hash) -> Result<(usize, bool), Error> { + if let Some(recalc_fns) = self.recalc_rc.load().as_ref() { + trace!("Repair block RC for {:?}", hash); + let res = self + .rc + .db() + .transaction(|tx| { + let mut cnt = 0; + for f in recalc_fns.iter() { + cnt += f(&tx, hash)?; + } + let old_rc = RcEntry::parse_opt(tx.get(&self.rc, hash)?); + trace!( + "Block RC for {:?}: stored={}, calculated={}", + hash, + old_rc.as_u64(), + cnt + ); + if cnt as u64 != old_rc.as_u64() { + warn!( + "Fixing inconsistent block RC for {:?}: was {}, should be {}", + hash, + old_rc.as_u64(), + cnt + ); + let new_rc = if cnt > 0 { + RcEntry::Present { count: cnt as u64 } + } else { + RcEntry::Deletable { + at_time: now_msec() + BLOCK_GC_DELAY.as_millis() as u64, + } + }; + tx.insert(&self.rc, hash, new_rc.serialize().unwrap())?; + Ok((cnt, true)) + } else { + Ok((cnt, false)) + } + }) + .map_err(Error::from); + if let Err(e) = &res { + error!("Failed to fix RC for block {:?}: {}", hash, e); + } + res + } else { + Err(Error::Message( + "Block RC recalculation is not available at this point".into(), + )) + } + } } /// Describes the state of the reference counter for a block diff --git a/src/block/resync.rs b/src/block/resync.rs index 48c2cef1..b4108213 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -367,6 +367,13 @@ impl BlockResyncManager { } if exists && rc.is_deletable() { + if manager.rc.recalculate_rc(hash)?.0 > 0 { + return Err(Error::Message(format!( + "Refcount for block {:?} was inconsistent, retrying later", + hash + ))); + } + info!("Resync block {:?}: offloading and deleting", hash); let existing_path = existing_path.unwrap(); @@ -453,7 +460,15 @@ impl BlockResyncManager { hash ); - let block_data = manager.rpc_get_raw_block(hash, None).await?; + let block_data = manager.rpc_get_raw_block(hash, None).await; + if matches!(block_data, Err(Error::MissingBlock(_))) { + warn!( + "Could not fetch needed block {:?}, no node returned valid data. Checking that refcount is correct.", + hash + ); + manager.rc.recalculate_rc(hash)?; + } + let block_data = block_data?; manager.metrics.resync_recv_counter.add(1); diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 1f572a9a..8380b5e2 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -473,8 +473,11 @@ pub enum RepairWhat { #[structopt(name = "mpu", version = garage_version())] MultipartUploads, /// Repropagate version deletions to the block ref table - #[structopt(name = "block_refs", version = garage_version())] + #[structopt(name = "block-refs", version = garage_version())] BlockRefs, + /// Recalculate block reference counters + #[structopt(name = "block-rc", version = garage_version())] + BlockRc, /// Verify integrity of all blocks on disc #[structopt(name = "scrub", version = garage_version())] Scrub { diff --git a/src/garage/repair/online.rs b/src/garage/repair/online.rs index 9e4de873..ecccdf6d 100644 --- a/src/garage/repair/online.rs +++ b/src/garage/repair/online.rs @@ -4,6 +4,7 @@ use std::time::Duration; use async_trait::async_trait; use tokio::sync::watch; +use garage_block::manager::BlockManager; use garage_block::repair::ScrubWorkerCommand; use garage_model::garage::Garage; @@ -16,11 +17,14 @@ use garage_table::replication::*; use garage_table::*; use garage_util::background::*; +use garage_util::data::*; use garage_util::error::Error; use garage_util::migrate::Migrate; use crate::*; +const RC_REPAIR_ITER_COUNT: usize = 64; + pub async fn launch_online_repair( garage: &Arc, bg: &BackgroundRunner, @@ -47,6 +51,13 @@ pub async fn launch_online_repair( info!("Repairing the block refs table"); bg.spawn_worker(TableRepairWorker::new(garage.clone(), RepairBlockRefs)); } + RepairWhat::BlockRc => { + info!("Repairing the block reference counters"); + bg.spawn_worker(BlockRcRepair::new( + garage.block_manager.clone(), + garage.block_ref_table.clone(), + )); + } RepairWhat::Blocks => { info!("Repairing the stored blocks"); bg.spawn_worker(garage_block::repair::RepairWorker::new( @@ -282,3 +293,98 @@ impl TableRepair for RepairMpu { Ok(false) } } + +// ===== block reference counter repair ===== + +pub struct BlockRcRepair { + block_manager: Arc, + block_ref_table: Arc>, + cursor: Hash, + counter: u64, + repairs: u64, +} + +impl BlockRcRepair { + fn new( + block_manager: Arc, + block_ref_table: Arc>, + ) -> Self { + Self { + block_manager, + block_ref_table, + cursor: [0u8; 32].into(), + counter: 0, + repairs: 0, + } + } +} + +#[async_trait] +impl Worker for BlockRcRepair { + fn name(&self) -> String { + format!("Block refcount repair worker") + } + + fn status(&self) -> WorkerStatus { + WorkerStatus { + progress: Some(format!("{} ({})", self.counter, self.repairs)), + ..Default::default() + } + } + + async fn work(&mut self, _must_exit: &mut watch::Receiver) -> Result { + for _i in 0..RC_REPAIR_ITER_COUNT { + let next1 = self + .block_manager + .rc + .rc + .range(self.cursor.as_slice()..)? + .next() + .transpose()? + .map(|(k, _)| Hash::try_from(k.as_slice()).unwrap()); + let next2 = self + .block_ref_table + .data + .store + .range(self.cursor.as_slice()..)? + .next() + .transpose()? + .map(|(k, _)| Hash::try_from(&k[..32]).unwrap()); + let next = match (next1, next2) { + (Some(k1), Some(k2)) => std::cmp::min(k1, k2), + (Some(k), None) | (None, Some(k)) => k, + (None, None) => { + info!( + "{}: finished, done {}, fixed {}", + self.name(), + self.counter, + self.repairs + ); + return Ok(WorkerState::Done); + } + }; + + if self.block_manager.rc.recalculate_rc(&next)?.1 { + self.repairs += 1; + } + self.counter += 1; + if let Some(next_incr) = next.increment() { + self.cursor = next_incr; + } else { + info!( + "{}: finished, done {}, fixed {}", + self.name(), + self.counter, + self.repairs + ); + return Ok(WorkerState::Done); + } + } + + Ok(WorkerState::Busy) + } + + async fn wait_for_work(&mut self) -> WorkerState { + unreachable!() + } +} diff --git a/src/model/garage.rs b/src/model/garage.rs index 4405d22d..273690db 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -247,6 +247,14 @@ impl Garage { #[cfg(feature = "k2v")] let k2v = GarageK2V::new(system.clone(), &db, meta_rep_param); + // ---- setup block refcount recalculation ---- + // this function can be used to fix inconsistencies in the RC table + block_manager.set_recalc_rc(vec![ + block_ref_recount_fn(&block_ref_table), + // other functions could be added here if we had other tables + // that hold references to data blocks + ]); + // -- done -- Ok(Arc::new(Self { config, diff --git a/src/model/s3/block_ref_table.rs b/src/model/s3/block_ref_table.rs index 7b023d87..57eb7b16 100644 --- a/src/model/s3/block_ref_table.rs +++ b/src/model/s3/block_ref_table.rs @@ -3,8 +3,12 @@ use std::sync::Arc; use garage_db as db; use garage_util::data::*; +use garage_util::error::*; +use garage_util::migrate::Migrate; +use garage_block::CalculateRefcount; use garage_table::crdt::Crdt; +use garage_table::replication::TableShardedReplication; use garage_table::*; use garage_block::manager::*; @@ -84,3 +88,38 @@ impl TableSchema for BlockRefTable { filter.apply(entry.deleted.get()) } } + +pub fn block_ref_recount_fn( + block_ref_table: &Arc>, +) -> CalculateRefcount { + let table = Arc::downgrade(block_ref_table); + Box::new(move |tx: &db::Transaction, block: &Hash| { + let table = table + .upgrade() + .ok_or_message("cannot upgrade weak ptr to block_ref_table") + .map_err(db::TxError::Abort)?; + Ok(calculate_refcount(&table, tx, block)?) + }) +} + +fn calculate_refcount( + block_ref_table: &Table, + tx: &db::Transaction, + block: &Hash, +) -> db::TxResult { + let mut result = 0; + for entry in tx.range(&block_ref_table.data.store, block.as_slice()..)? { + let (key, value) = entry?; + if &key[..32] != block.as_slice() { + break; + } + let value = BlockRef::decode(&value) + .ok_or_message("could not decode block_ref") + .map_err(db::TxError::Abort)?; + assert_eq!(value.block, *block); + if !value.deleted.get() { + result += 1; + } + } + Ok(result) +} diff --git a/src/util/data.rs b/src/util/data.rs index 2579fd1b..1fe7dfe0 100644 --- a/src/util/data.rs +++ b/src/util/data.rs @@ -83,6 +83,19 @@ impl FixedBytes32 { ret.copy_from_slice(by); Some(Self(ret)) } + /// Return the next hash + pub fn increment(&self) -> Option { + let mut ret = *self; + for byte in ret.0.iter_mut().rev() { + if *byte == u8::MAX { + *byte = 0; + } else { + *byte = *byte + 1; + return Some(ret); + } + } + return None; + } } impl From for FixedBytes32 { @@ -140,3 +153,25 @@ pub fn fasthash(data: &[u8]) -> FastHash { pub fn gen_uuid() -> Uuid { rand::thread_rng().gen::<[u8; 32]>().into() } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_increment() { + let zero: FixedBytes32 = [0u8; 32].into(); + let mut one: FixedBytes32 = [0u8; 32].into(); + one.0[31] = 1; + let max: FixedBytes32 = [0xFFu8; 32].into(); + assert_eq!(zero.increment(), Some(one)); + assert_eq!(max.increment(), None); + + let mut test: FixedBytes32 = [0u8; 32].into(); + let i = 0x198DF97209F8FFFFu64; + test.0[24..32].copy_from_slice(&u64::to_be_bytes(i)); + let mut test2: FixedBytes32 = [0u8; 32].into(); + test2.0[24..32].copy_from_slice(&u64::to_be_bytes(i + 1)); + assert_eq!(test.increment(), Some(test2)); + } +} diff --git a/src/util/error.rs b/src/util/error.rs index da9eda10..75fd3f9c 100644 --- a/src/util/error.rs +++ b/src/util/error.rs @@ -70,6 +70,9 @@ pub enum Error { #[error(display = "Corrupt data: does not match hash {:?}", _0)] CorruptData(Hash), + #[error(display = "Missing block {:?}: no node returned a valid block", _0)] + MissingBlock(Hash), + #[error(display = "{}", _0)] Message(String), } -- cgit v1.2.3 From 3165ab926c665b795eab7a227f65a67a0874641e Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 19 Mar 2024 16:09:47 +0100 Subject: [block-ref-repair] rename rc's rc field to rc_table --- src/block/manager.rs | 4 ++-- src/block/rc.rs | 28 ++++++++++++++-------------- src/block/repair.rs | 2 +- src/garage/repair/online.rs | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index 628ffc71..8ee33096 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -156,7 +156,7 @@ impl BlockManager { let metrics = BlockManagerMetrics::new( config.compression_level, - rc.rc.clone(), + rc.rc_table.clone(), resync.queue.clone(), resync.errors.clone(), ); @@ -387,7 +387,7 @@ impl BlockManager { /// Get number of items in the refcount table pub fn rc_len(&self) -> Result { - Ok(self.rc.rc.len()?) + Ok(self.rc.rc_table.len()?) } /// Send command to start/stop/manager scrub worker diff --git a/src/block/rc.rs b/src/block/rc.rs index bf5aeced..4a55ee29 100644 --- a/src/block/rc.rs +++ b/src/block/rc.rs @@ -14,14 +14,14 @@ pub type CalculateRefcount = Box db::TxResult + Send + Sync>; pub struct BlockRc { - pub rc: db::Tree, + pub rc_table: db::Tree, pub(crate) recalc_rc: ArcSwapOption>, } impl BlockRc { pub(crate) fn new(rc: db::Tree) -> Self { Self { - rc, + rc_table: rc, recalc_rc: ArcSwapOption::new(None), } } @@ -33,9 +33,9 @@ impl BlockRc { tx: &mut db::Transaction, hash: &Hash, ) -> db::TxOpResult { - let old_rc = RcEntry::parse_opt(tx.get(&self.rc, hash)?); + let old_rc = RcEntry::parse_opt(tx.get(&self.rc_table, hash)?); match old_rc.increment().serialize() { - Some(x) => tx.insert(&self.rc, hash, x)?, + Some(x) => tx.insert(&self.rc_table, hash, x)?, None => unreachable!(), }; Ok(old_rc.is_zero()) @@ -48,28 +48,28 @@ impl BlockRc { tx: &mut db::Transaction, hash: &Hash, ) -> db::TxOpResult { - let new_rc = RcEntry::parse_opt(tx.get(&self.rc, hash)?).decrement(); + let new_rc = RcEntry::parse_opt(tx.get(&self.rc_table, hash)?).decrement(); match new_rc.serialize() { - Some(x) => tx.insert(&self.rc, hash, x)?, - None => tx.remove(&self.rc, hash)?, + Some(x) => tx.insert(&self.rc_table, hash, x)?, + None => tx.remove(&self.rc_table, hash)?, }; Ok(matches!(new_rc, RcEntry::Deletable { .. })) } /// Read a block's reference count pub(crate) fn get_block_rc(&self, hash: &Hash) -> Result { - Ok(RcEntry::parse_opt(self.rc.get(hash.as_ref())?)) + Ok(RcEntry::parse_opt(self.rc_table.get(hash.as_ref())?)) } /// Delete an entry in the RC table if it is deletable and the /// deletion time has passed pub(crate) fn clear_deleted_block_rc(&self, hash: &Hash) -> Result<(), Error> { let now = now_msec(); - self.rc.db().transaction(|tx| { - let rcval = RcEntry::parse_opt(tx.get(&self.rc, hash)?); + self.rc_table.db().transaction(|tx| { + let rcval = RcEntry::parse_opt(tx.get(&self.rc_table, hash)?); match rcval { RcEntry::Deletable { at_time } if now > at_time => { - tx.remove(&self.rc, hash)?; + tx.remove(&self.rc_table, hash)?; } _ => (), }; @@ -84,14 +84,14 @@ impl BlockRc { if let Some(recalc_fns) = self.recalc_rc.load().as_ref() { trace!("Repair block RC for {:?}", hash); let res = self - .rc + .rc_table .db() .transaction(|tx| { let mut cnt = 0; for f in recalc_fns.iter() { cnt += f(&tx, hash)?; } - let old_rc = RcEntry::parse_opt(tx.get(&self.rc, hash)?); + let old_rc = RcEntry::parse_opt(tx.get(&self.rc_table, hash)?); trace!( "Block RC for {:?}: stored={}, calculated={}", hash, @@ -112,7 +112,7 @@ impl BlockRc { at_time: now_msec() + BLOCK_GC_DELAY.as_millis() as u64, } }; - tx.insert(&self.rc, hash, new_rc.serialize().unwrap())?; + tx.insert(&self.rc_table, hash, new_rc.serialize().unwrap())?; Ok((cnt, true)) } else { Ok((cnt, false)) diff --git a/src/block/repair.rs b/src/block/repair.rs index 2c8acbc9..ef271094 100644 --- a/src/block/repair.rs +++ b/src/block/repair.rs @@ -107,7 +107,7 @@ impl Worker for RepairWorker { for entry in self .manager .rc - .rc + .rc_table .range::<&[u8], _>((start_bound, Bound::Unbounded))? { let (hash, _) = entry?; diff --git a/src/garage/repair/online.rs b/src/garage/repair/online.rs index ecccdf6d..2c5227d2 100644 --- a/src/garage/repair/online.rs +++ b/src/garage/repair/online.rs @@ -337,7 +337,7 @@ impl Worker for BlockRcRepair { let next1 = self .block_manager .rc - .rc + .rc_table .range(self.cursor.as_slice()..)? .next() .transpose()? -- cgit v1.2.3 From 3eab639c146f67fc67534633ae26c9aec116327d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 19 Mar 2024 16:24:34 +0100 Subject: [block-ref-repair] mention `garage block repair-rc` in documentation --- src/garage/cli/util.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/garage/cli/util.rs b/src/garage/cli/util.rs index 0511e2b1..21c14f42 100644 --- a/src/garage/cli/util.rs +++ b/src/garage/cli/util.rs @@ -451,7 +451,7 @@ pub fn print_block_info( if refcount != nondeleted_count { println!(); println!( - "Warning: refcount does not match number of non-deleted versions (see issue #644)." + "Warning: refcount does not match number of non-deleted versions, you should try `garage repair block-rc`." ); } } -- cgit v1.2.3 From 5225a81dee21603950e7944cd93c40fdb1bd8feb Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 21 Mar 2024 09:47:04 +0100 Subject: [net-fixes] peering: only count node IDs and not addresses in hash --- src/net/peering.rs | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) (limited to 'src') diff --git a/src/net/peering.rs b/src/net/peering.rs index 61882a18..f4283683 100644 --- a/src/net/peering.rs +++ b/src/net/peering.rs @@ -164,29 +164,40 @@ struct KnownHosts { impl KnownHosts { fn new() -> Self { let list = HashMap::new(); - let hash = Self::calculate_hash(vec![]); - Self { list, hash } + let mut ret = Self { + list, + hash: hash::Digest::from_slice(&[0u8; 64][..]).unwrap(), + }; + ret.update_hash(); + ret } fn update_hash(&mut self) { - self.hash = Self::calculate_hash(self.connected_peers_vec()); - } - fn connected_peers_vec(&self) -> Vec<(NodeID, SocketAddr)> { - let mut list = Vec::with_capacity(self.list.len()); - for (id, peer) in self.list.iter() { - if peer.state.is_up() { - list.push((*id, peer.addr)); - } - } - list - } - fn calculate_hash(mut list: Vec<(NodeID, SocketAddr)>) -> hash::Digest { + // The hash is a value that is exchanged between nodes when they ping one + // another. Nodes compare their known hosts hash to know if they are connected + // to the same set of nodes. If the hashes differ, they are connected to + // different nodes and they trigger an exchange of the full list of active + // connections. The hash value only represents the set of node IDs and not + // their actual socket addresses, because nodes can be connected via different + // addresses and that shouldn't necessarily trigger a full peer exchange. + let mut list = self + .list + .iter() + .filter(|(_, peer)| peer.state.is_up()) + .map(|(id, _)| *id) + .collect::>(); list.sort(); let mut hash_state = hash::State::new(); - for (id, addr) in list { + for id in list { hash_state.update(&id[..]); - hash_state.update(&format!("{}\n", addr).into_bytes()[..]); } - hash_state.finalize() + self.hash = hash_state.finalize(); + } + fn connected_peers_vec(&self) -> Vec<(NodeID, SocketAddr)> { + self.list + .iter() + .filter(|(_, peer)| peer.state.is_up()) + .map(|(id, peer)| (*id, peer.addr)) + .collect::>() } } -- cgit v1.2.3 From 961b4f9af36a7fb5d3a661ac19e8f2c168bb48ae Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 21 Mar 2024 10:45:34 +0100 Subject: [net-fixes] fix issues with local peer address (fix #761) --- src/api/admin/cluster.rs | 2 +- src/garage/cli/cmd.rs | 21 ++++---- src/net/netapp.rs | 51 +++++++++---------- src/net/peering.rs | 124 +++++++++++++++++++++-------------------------- src/rpc/system.rs | 15 ++++-- 5 files changed, 100 insertions(+), 113 deletions(-) (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 8ce6c5ed..8c9cb1e5 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -27,7 +27,7 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result, rpc_host: NodeID) -> vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; for adv in status.iter().filter(|adv| adv.is_up) { let host = adv.status.hostname.as_deref().unwrap_or("?"); + let addr = match adv.addr { + Some(addr) => addr.to_string(), + None => "N/A".to_string(), + }; if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { let data_avail = match &adv.status.data_disk_avail { _ if cfg.capacity.is_none() => "N/A".into(), @@ -71,7 +75,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", id = adv.id, host = host, - addr = adv.addr, + addr = addr, tags = cfg.tags.join(","), zone = cfg.zone, capacity = cfg.capacity_string(), @@ -91,7 +95,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...", id = adv.id, host = host, - addr = adv.addr, + addr = addr, tags = cfg.tags.join(","), zone = cfg.zone, )); @@ -104,7 +108,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> "{id:?}\t{h}\t{addr}\t\t\t{new_role}", id = adv.id, h = host, - addr = adv.addr, + addr = addr, new_role = new_role, )); } @@ -120,8 +124,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let tf = timeago::Formatter::new(); let mut drain_msg = false; - let mut failed_nodes = - vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; + let mut failed_nodes = vec!["ID\tHostname\tTags\tZone\tCapacity\tLast seen".to_string()]; let mut listed = HashSet::new(); for ver in layout.versions.iter().rev() { for (node, _, role) in ver.roles.items().iter() { @@ -142,15 +145,14 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> // Node is in a layout version, is not a gateway node, and is not up: // it is in a failed state, add proper line to the output - let (host, addr, last_seen) = match adv { + let (host, last_seen) = match adv { Some(adv) => ( adv.status.hostname.as_deref().unwrap_or("?"), - adv.addr.to_string(), adv.last_seen_secs_ago .map(|s| tf.convert(Duration::from_secs(s))) .unwrap_or_else(|| "never seen".into()), ), - None => ("??", "??".into(), "never seen".into()), + None => ("??", "never seen".into()), }; let capacity = if ver.version == layout.current().version { cfg.capacity_string() @@ -159,10 +161,9 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> "draining metadata...".to_string() }; failed_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", + "{id:?}\t{host}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", id = node, host = host, - addr = addr, tags = cfg.tags.join(","), zone = cfg.zone, capacity = capacity, diff --git a/src/net/netapp.rs b/src/net/netapp.rs index faa51a99..6480a126 100644 --- a/src/net/netapp.rs +++ b/src/net/netapp.rs @@ -292,13 +292,7 @@ impl NetApp { /// the other node with `Netapp::request` pub async fn try_connect(self: Arc, ip: SocketAddr, id: NodeID) -> Result<(), Error> { // Don't connect to ourself, we don't care - // but pretend we did if id == self.id { - tokio::spawn(async move { - if let Some(h) = self.on_connected_handler.load().as_ref() { - h(id, ip, false); - } - }); return Ok(()); } @@ -327,31 +321,32 @@ impl NetApp { /// Close the outgoing connection we have to a node specified by its public key, /// if such a connection is currently open. pub fn disconnect(self: &Arc, id: &NodeID) { + let conn = self.client_conns.write().unwrap().remove(id); + // If id is ourself, we're not supposed to have a connection open - if *id != self.id { - let conn = self.client_conns.write().unwrap().remove(id); - if let Some(c) = conn { - debug!( - "Closing connection to {} ({})", - hex::encode(&c.peer_id[..8]), - c.remote_addr - ); - c.close(); - } else { - return; - } + if *id == self.id { + // sanity check + assert!(conn.is_none(), "had a connection to local node"); + return; } - // call on_disconnected_handler immediately, since the connection - // was removed - // (if id == self.id, we pretend we disconnected) - let id = *id; - let self2 = self.clone(); - tokio::spawn(async move { - if let Some(h) = self2.on_disconnected_handler.load().as_ref() { - h(id, false); - } - }); + if let Some(c) = conn { + debug!( + "Closing connection to {} ({})", + hex::encode(&c.peer_id[..8]), + c.remote_addr + ); + c.close(); + + // call on_disconnected_handler immediately, since the connection was removed + let id = *id; + let self2 = self.clone(); + tokio::spawn(async move { + if let Some(h) = self2.on_disconnected_handler.load().as_ref() { + h(id, false); + } + }); + } } // Called from conn.rs when an incoming connection is successfully established diff --git a/src/net/peering.rs b/src/net/peering.rs index f4283683..0b4fec9e 100644 --- a/src/net/peering.rs +++ b/src/net/peering.rs @@ -43,7 +43,7 @@ impl Message for PingMessage { #[derive(Serialize, Deserialize)] struct PeerListMessage { - pub list: Vec<(NodeID, SocketAddr)>, + pub list: Vec<(NodeID, Vec)>, } impl Message for PeerListMessage { @@ -54,12 +54,8 @@ impl Message for PeerListMessage { #[derive(Debug)] struct PeerInfoInternal { - // addr is the currently connected address, - // or the last address we were connected to, - // or an arbitrary address some other peer gave us - addr: SocketAddr, - // all_addrs contains all of the addresses everyone gave us - all_addrs: Vec, + // known_addrs contains all of the addresses everyone gave us + known_addrs: Vec, state: PeerConnState, last_send_ping: Option, @@ -69,10 +65,9 @@ struct PeerInfoInternal { } impl PeerInfoInternal { - fn new(addr: SocketAddr, state: PeerConnState) -> Self { + fn new(state: PeerConnState, known_addr: Option) -> Self { Self { - addr, - all_addrs: vec![addr], + known_addrs: known_addr.map(|x| vec![x]).unwrap_or_default(), state, last_send_ping: None, last_seen: None, @@ -81,8 +76,8 @@ impl PeerInfoInternal { } } fn add_addr(&mut self, addr: SocketAddr) -> bool { - if !self.all_addrs.contains(&addr) { - self.all_addrs.push(addr); + if !self.known_addrs.contains(&addr) { + self.known_addrs.push(addr); // If we are learning a new address for this node, // we want to retry connecting self.state = match self.state { @@ -90,7 +85,7 @@ impl PeerInfoInternal { PeerConnState::Waiting(_, _) | PeerConnState::Abandonned => { PeerConnState::Waiting(0, Instant::now()) } - x @ (PeerConnState::Ourself | PeerConnState::Connected) => x, + x @ (PeerConnState::Ourself | PeerConnState::Connected { .. }) => x, }; true } else { @@ -104,8 +99,6 @@ impl PeerInfoInternal { pub struct PeerInfo { /// The node's identifier (its public key) pub id: NodeID, - /// The node's network address - pub addr: SocketAddr, /// The current status of our connection to this node pub state: PeerConnState, /// The last time at which the node was seen @@ -136,7 +129,7 @@ pub enum PeerConnState { Ourself, /// We currently have a connection to this peer - Connected, + Connected { addr: SocketAddr }, /// Our next connection tentative (the nth, where n is the first value of the tuple) /// will be at given Instant @@ -152,7 +145,7 @@ pub enum PeerConnState { impl PeerConnState { /// Returns true if we can currently send requests to this peer pub fn is_up(&self) -> bool { - matches!(self, Self::Ourself | Self::Connected) + matches!(self, Self::Ourself | Self::Connected { .. }) } } @@ -192,11 +185,11 @@ impl KnownHosts { } self.hash = hash_state.finalize(); } - fn connected_peers_vec(&self) -> Vec<(NodeID, SocketAddr)> { + fn connected_peers_vec(&self) -> Vec<(NodeID, Vec)> { self.list .iter() .filter(|(_, peer)| peer.state.is_up()) - .map(|(id, peer)| (*id, peer.addr)) + .map(|(id, peer)| (*id, peer.known_addrs.clone())) .collect::>() } } @@ -231,18 +224,16 @@ impl PeeringManager { if id != netapp.id { known_hosts.list.insert( id, - PeerInfoInternal::new(addr, PeerConnState::Waiting(0, Instant::now())), + PeerInfoInternal::new(PeerConnState::Waiting(0, Instant::now()), Some(addr)), ); } } - if let Some(addr) = our_addr { - known_hosts.list.insert( - netapp.id, - PeerInfoInternal::new(addr, PeerConnState::Ourself), - ); - known_hosts.update_hash(); - } + known_hosts.list.insert( + netapp.id, + PeerInfoInternal::new(PeerConnState::Ourself, our_addr), + ); + known_hosts.update_hash(); // TODO for v0.10 / v1.0 : rename the endpoint (it will break compatibility) let strat = Arc::new(Self { @@ -287,7 +278,7 @@ impl PeeringManager { for (id, info) in known_hosts.list.iter() { trace!("{}, {:?}", hex::encode(&id[..8]), info); match info.state { - PeerConnState::Connected => { + PeerConnState::Connected { .. } => { let must_ping = match info.last_send_ping { None => true, Some(t) => Instant::now() - t > PING_INTERVAL, @@ -330,7 +321,7 @@ impl PeeringManager { info!( "Retrying connection to {} at {} ({})", hex::encode(&id[..8]), - h.all_addrs + h.known_addrs .iter() .map(|x| format!("{}", x)) .collect::>() @@ -339,13 +330,8 @@ impl PeeringManager { ); h.state = PeerConnState::Trying(i); - let alternate_addrs = h - .all_addrs - .iter() - .filter(|x| **x != h.addr) - .cloned() - .collect::>(); - tokio::spawn(self.clone().try_connect(id, h.addr, alternate_addrs)); + let addresses = h.known_addrs.clone(); + tokio::spawn(self.clone().try_connect(id, addresses)); } } } @@ -373,27 +359,24 @@ impl PeeringManager { fn update_public_peer_list(&self, known_hosts: &KnownHosts) { let mut pub_peer_list = Vec::with_capacity(known_hosts.list.len()); for (id, info) in known_hosts.list.iter() { + if *id == self.netapp.id { + // sanity check + assert!(matches!(info.state, PeerConnState::Ourself)); + } let mut pings = info.ping.iter().cloned().collect::>(); pings.sort(); if !pings.is_empty() { pub_peer_list.push(PeerInfo { id: *id, - addr: info.addr, state: info.state, last_seen: info.last_seen, - avg_ping: Some( - pings - .iter() - .fold(Duration::from_secs(0), |x, y| x + *y) - .div_f64(pings.len() as f64), - ), + avg_ping: Some(pings.iter().sum::().div_f64(pings.len() as f64)), max_ping: pings.last().cloned(), med_ping: Some(pings[pings.len() / 2]), }); } else { pub_peer_list.push(PeerInfo { id: *id, - addr: info.addr, state: info.state, last_seen: info.last_seen, avg_ping: None, @@ -485,18 +468,20 @@ impl PeeringManager { } } - fn handle_peer_list(&self, list: &[(NodeID, SocketAddr)]) { + fn handle_peer_list(&self, list: &[(NodeID, Vec)]) { let mut known_hosts = self.known_hosts.write().unwrap(); let mut changed = false; - for (id, addr) in list.iter() { - if let Some(kh) = known_hosts.list.get_mut(id) { - if kh.add_addr(*addr) { + for (id, addrs) in list.iter() { + for addr in addrs.iter() { + if let Some(kh) = known_hosts.list.get_mut(id) { + if kh.add_addr(*addr) { + changed = true; + } + } else { + known_hosts.list.insert(*id, self.new_peer(id, *addr)); changed = true; } - } else { - known_hosts.list.insert(*id, self.new_peer(id, *addr)); - changed = true; } } @@ -506,15 +491,10 @@ impl PeeringManager { } } - async fn try_connect( - self: Arc, - id: NodeID, - default_addr: SocketAddr, - alternate_addrs: Vec, - ) { + async fn try_connect(self: Arc, id: NodeID, addresses: Vec) { let conn_addr = { let mut ret = None; - for addr in [default_addr].iter().chain(alternate_addrs.iter()) { + for addr in addresses.iter() { debug!("Trying address {} for peer {}", addr, hex::encode(&id[..8])); match self.netapp.clone().try_connect(*addr, id).await { Ok(()) => { @@ -540,7 +520,7 @@ impl PeeringManager { warn!( "Could not connect to peer {} ({} addresses tried)", hex::encode(&id[..8]), - 1 + alternate_addrs.len() + addresses.len() ); let mut known_hosts = self.known_hosts.write().unwrap(); if let Some(host) = known_hosts.list.get_mut(&id) { @@ -560,6 +540,14 @@ impl PeeringManager { } fn on_connected(self: &Arc, id: NodeID, addr: SocketAddr, is_incoming: bool) { + if id == self.netapp.id { + // sanity check + panic!( + "on_connected from local node, id={:?}, addr={}, incoming={}", + id, addr, is_incoming + ); + } + let mut known_hosts = self.known_hosts.write().unwrap(); if is_incoming { if let Some(host) = known_hosts.list.get_mut(&id) { @@ -574,13 +562,13 @@ impl PeeringManager { addr ); if let Some(host) = known_hosts.list.get_mut(&id) { - host.state = PeerConnState::Connected; - host.addr = addr; + host.state = PeerConnState::Connected { addr }; host.add_addr(addr); } else { - known_hosts - .list - .insert(id, PeerInfoInternal::new(addr, PeerConnState::Connected)); + known_hosts.list.insert( + id, + PeerInfoInternal::new(PeerConnState::Connected { addr }, Some(addr)), + ); } } known_hosts.update_hash(); @@ -600,12 +588,8 @@ impl PeeringManager { } fn new_peer(&self, id: &NodeID, addr: SocketAddr) -> PeerInfoInternal { - let state = if *id == self.netapp.id { - PeerConnState::Ourself - } else { - PeerConnState::Waiting(0, Instant::now()) - }; - PeerInfoInternal::new(addr, state) + assert!(*id != self.netapp.id); + PeerInfoInternal::new(PeerConnState::Waiting(0, Instant::now()), Some(addr)) } } diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 54d589d2..9da1b176 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -16,7 +16,7 @@ use tokio::sync::{watch, Notify}; use garage_net::endpoint::{Endpoint, EndpointHandler}; use garage_net::message::*; -use garage_net::peering::PeeringManager; +use garage_net::peering::{PeerConnState, PeeringManager}; use garage_net::util::parse_and_resolve_peer_addr_async; use garage_net::{NetApp, NetworkKey, NodeID, NodeKey}; @@ -142,7 +142,7 @@ pub struct NodeStatus { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct KnownNodeInfo { pub id: Uuid, - pub addr: SocketAddr, + pub addr: Option, pub is_up: bool, pub last_seen_secs_ago: Option, pub status: NodeStatus, @@ -381,7 +381,11 @@ impl System { .iter() .map(|n| KnownNodeInfo { id: n.id.into(), - addr: n.addr, + addr: match n.state { + PeerConnState::Ourself => self.rpc_public_addr, + PeerConnState::Connected { addr } => Some(addr), + _ => None, + }, is_up: n.is_up(), last_seen_secs_ago: n .last_seen @@ -722,7 +726,10 @@ impl System { .peering .get_peer_list() .iter() - .map(|n| (n.id.into(), n.addr)) + .filter_map(|n| match n.state { + PeerConnState::Connected { addr } => Some((n.id.into(), addr)), + _ => None, + }) .collect::>(); // Before doing it, we read the current peer list file (if it exists) -- cgit v1.2.3 From 3844110cd03210a1600d57db1aab53e41cf4815f Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 21 Mar 2024 10:50:44 +0100 Subject: [net-fixes] netapp peer exchange: send only currently connected address --- src/net/peering.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'src') diff --git a/src/net/peering.rs b/src/net/peering.rs index 0b4fec9e..b4271231 100644 --- a/src/net/peering.rs +++ b/src/net/peering.rs @@ -43,7 +43,7 @@ impl Message for PingMessage { #[derive(Serialize, Deserialize)] struct PeerListMessage { - pub list: Vec<(NodeID, Vec)>, + pub list: Vec<(NodeID, SocketAddr)>, } impl Message for PeerListMessage { @@ -185,11 +185,13 @@ impl KnownHosts { } self.hash = hash_state.finalize(); } - fn connected_peers_vec(&self) -> Vec<(NodeID, Vec)> { + fn connected_peers_vec(&self) -> Vec<(NodeID, SocketAddr)> { self.list .iter() - .filter(|(_, peer)| peer.state.is_up()) - .map(|(id, peer)| (*id, peer.known_addrs.clone())) + .filter_map(|(id, peer)| match peer.state { + PeerConnState::Connected { addr } => Some((*id, addr)), + _ => None, + }) .collect::>() } } @@ -468,20 +470,18 @@ impl PeeringManager { } } - fn handle_peer_list(&self, list: &[(NodeID, Vec)]) { + fn handle_peer_list(&self, list: &[(NodeID, SocketAddr)]) { let mut known_hosts = self.known_hosts.write().unwrap(); let mut changed = false; - for (id, addrs) in list.iter() { - for addr in addrs.iter() { - if let Some(kh) = known_hosts.list.get_mut(id) { - if kh.add_addr(*addr) { - changed = true; - } - } else { - known_hosts.list.insert(*id, self.new_peer(id, *addr)); + for (id, addr) in list.iter() { + if let Some(kh) = known_hosts.list.get_mut(id) { + if kh.add_addr(*addr) { changed = true; } + } else { + known_hosts.list.insert(*id, self.new_peer(id, *addr)); + changed = true; } } -- cgit v1.2.3 From 74949c69cbf1a8222b6d10a02fcf5fe139ccb560 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 21 Mar 2024 14:06:59 +0100 Subject: [s3-checksum] implement x-amz-checksum-* headers --- src/api/Cargo.toml | 3 + src/api/s3/api_server.rs | 2 +- src/api/s3/checksum.rs | 406 +++++++++++++++++++++++++++++++++++++++ src/api/s3/copy.rs | 132 +++++++++---- src/api/s3/encryption.rs | 44 ++--- src/api/s3/error.rs | 6 + src/api/s3/get.rs | 147 ++++++++++---- src/api/s3/list.rs | 77 +++++++- src/api/s3/mod.rs | 1 + src/api/s3/multipart.rs | 159 ++++++++++++--- src/api/s3/post_object.rs | 28 ++- src/api/s3/put.rs | 221 ++++++++++----------- src/api/s3/xml.rs | 33 ++++ src/garage/Cargo.toml | 1 + src/garage/tests/s3/multipart.rs | 158 ++++++++++++++- src/model/s3/mpu_table.rs | 9 + src/model/s3/object_table.rs | 53 ++++- src/util/async_hash.rs | 61 ------ src/util/lib.rs | 1 - 19 files changed, 1209 insertions(+), 333 deletions(-) create mode 100644 src/api/s3/checksum.rs delete mode 100644 src/util/async_hash.rs (limited to 'src') diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index bcf6a537..1b87496c 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -28,6 +28,8 @@ async-trait.workspace = true base64.workspace = true bytes.workspace = true chrono.workspace = true +crc32fast.workspace = true +crc32c.workspace = true crypto-common.workspace = true err-derive.workspace = true hex.workspace = true @@ -37,6 +39,7 @@ tracing.workspace = true md-5.workspace = true nom.workspace = true pin-project.workspace = true +sha1.workspace = true sha2.workspace = true futures.workspace = true diff --git a/src/api/s3/api_server.rs b/src/api/s3/api_server.rs index 1ed30996..1737af33 100644 --- a/src/api/s3/api_server.rs +++ b/src/api/s3/api_server.rs @@ -325,7 +325,7 @@ impl ApiHandler for S3ApiServer { part_number_marker: part_number_marker.map(|p| p.min(10000)), max_parts: max_parts.unwrap_or(1000).clamp(1, 1000), }; - handle_list_parts(ctx, &query).await + handle_list_parts(ctx, req, &query).await } Endpoint::DeleteObjects {} => handle_delete_objects(ctx, req, content_sha256).await, Endpoint::GetBucketWebsite {} => handle_get_website(ctx).await, diff --git a/src/api/s3/checksum.rs b/src/api/s3/checksum.rs new file mode 100644 index 00000000..c9dc001c --- /dev/null +++ b/src/api/s3/checksum.rs @@ -0,0 +1,406 @@ +use std::convert::{TryFrom, TryInto}; +use std::hash::Hasher; + +use base64::prelude::*; +use crc32c::Crc32cHasher as Crc32c; +use crc32fast::Hasher as Crc32; +use md5::{Digest, Md5}; +use sha1::Sha1; +use sha2::Sha256; + +use http::{HeaderMap, HeaderName, HeaderValue}; + +use garage_util::data::*; +use garage_util::error::OkOrMessage; + +use garage_model::s3::object_table::*; + +use crate::s3::error::*; + +pub const X_AMZ_CHECKSUM_ALGORITHM: HeaderName = + HeaderName::from_static("x-amz-checksum-algorithm"); +pub const X_AMZ_CHECKSUM_MODE: HeaderName = HeaderName::from_static("x-amz-checksum-mode"); +pub const X_AMZ_CHECKSUM_CRC32: HeaderName = HeaderName::from_static("x-amz-checksum-crc32"); +pub const X_AMZ_CHECKSUM_CRC32C: HeaderName = HeaderName::from_static("x-amz-checksum-crc32c"); +pub const X_AMZ_CHECKSUM_SHA1: HeaderName = HeaderName::from_static("x-amz-checksum-sha1"); +pub const X_AMZ_CHECKSUM_SHA256: HeaderName = HeaderName::from_static("x-amz-checksum-sha256"); + +pub type Crc32Checksum = [u8; 4]; +pub type Crc32cChecksum = [u8; 4]; +pub type Md5Checksum = [u8; 16]; +pub type Sha1Checksum = [u8; 20]; +pub type Sha256Checksum = [u8; 32]; + +#[derive(Debug, Default)] +pub(crate) struct ExpectedChecksums { + // base64-encoded md5 (content-md5 header) + pub md5: Option, + // content_sha256 (as a Hash / FixedBytes32) + pub sha256: Option, + // extra x-amz-checksum-* header + pub extra: Option, +} + +pub(crate) struct Checksummer { + pub crc32: Option, + pub crc32c: Option, + pub md5: Option, + pub sha1: Option, + pub sha256: Option, +} + +#[derive(Default)] +pub(crate) struct Checksums { + pub crc32: Option, + pub crc32c: Option, + pub md5: Option, + pub sha1: Option, + pub sha256: Option, +} + +impl Checksummer { + pub(crate) fn init(expected: &ExpectedChecksums, require_md5: bool) -> Self { + let mut ret = Self { + crc32: None, + crc32c: None, + md5: None, + sha1: None, + sha256: None, + }; + + if expected.md5.is_some() || require_md5 { + ret.md5 = Some(Md5::new()); + } + if expected.sha256.is_some() || matches!(&expected.extra, Some(ChecksumValue::Sha256(_))) { + ret.sha256 = Some(Sha256::new()); + } + if matches!(&expected.extra, Some(ChecksumValue::Crc32(_))) { + ret.crc32 = Some(Crc32::new()); + } + if matches!(&expected.extra, Some(ChecksumValue::Crc32c(_))) { + ret.crc32c = Some(Crc32c::default()); + } + if matches!(&expected.extra, Some(ChecksumValue::Sha1(_))) { + ret.sha1 = Some(Sha1::new()); + } + ret + } + + pub(crate) fn add(mut self, algo: Option) -> Self { + match algo { + Some(ChecksumAlgorithm::Crc32) => { + self.crc32 = Some(Crc32::new()); + } + Some(ChecksumAlgorithm::Crc32c) => { + self.crc32c = Some(Crc32c::default()); + } + Some(ChecksumAlgorithm::Sha1) => { + self.sha1 = Some(Sha1::new()); + } + Some(ChecksumAlgorithm::Sha256) => { + self.sha256 = Some(Sha256::new()); + } + None => (), + } + self + } + + pub(crate) fn update(&mut self, bytes: &[u8]) { + if let Some(crc32) = &mut self.crc32 { + crc32.update(bytes); + } + if let Some(crc32c) = &mut self.crc32c { + crc32c.write(bytes); + } + if let Some(md5) = &mut self.md5 { + md5.update(bytes); + } + if let Some(sha1) = &mut self.sha1 { + sha1.update(bytes); + } + if let Some(sha256) = &mut self.sha256 { + sha256.update(bytes); + } + } + + pub(crate) fn finalize(self) -> Checksums { + Checksums { + crc32: self.crc32.map(|x| u32::to_be_bytes(x.finalize())), + crc32c: self + .crc32c + .map(|x| u32::to_be_bytes(u32::try_from(x.finish()).unwrap())), + md5: self.md5.map(|x| x.finalize()[..].try_into().unwrap()), + sha1: self.sha1.map(|x| x.finalize()[..].try_into().unwrap()), + sha256: self.sha256.map(|x| x.finalize()[..].try_into().unwrap()), + } + } +} + +impl Checksums { + pub fn verify(&self, expected: &ExpectedChecksums) -> Result<(), Error> { + if let Some(expected_md5) = &expected.md5 { + match self.md5 { + Some(md5) if BASE64_STANDARD.encode(&md5) == expected_md5.trim_matches('"') => (), + _ => { + return Err(Error::InvalidDigest( + "MD5 checksum verification failed (from content-md5)".into(), + )) + } + } + } + if let Some(expected_sha256) = &expected.sha256 { + match self.sha256 { + Some(sha256) if &sha256[..] == expected_sha256.as_slice() => (), + _ => { + return Err(Error::InvalidDigest( + "SHA256 checksum verification failed (from x-amz-content-sha256)".into(), + )) + } + } + } + if let Some(extra) = expected.extra { + let algo = extra.algorithm(); + if self.extract(Some(algo)) != Some(extra) { + return Err(Error::InvalidDigest(format!( + "Failed to validate checksum for algorithm {:?}", + algo + ))); + } + } + Ok(()) + } + + pub fn extract(&self, algo: Option) -> Option { + match algo { + None => None, + Some(ChecksumAlgorithm::Crc32) => Some(ChecksumValue::Crc32(self.crc32.unwrap())), + Some(ChecksumAlgorithm::Crc32c) => Some(ChecksumValue::Crc32c(self.crc32c.unwrap())), + Some(ChecksumAlgorithm::Sha1) => Some(ChecksumValue::Sha1(self.sha1.unwrap())), + Some(ChecksumAlgorithm::Sha256) => Some(ChecksumValue::Sha256(self.sha256.unwrap())), + } + } +} + +// ---- + +#[derive(Default)] +pub(crate) struct MultipartChecksummer { + pub md5: Md5, + pub extra: Option, +} + +pub(crate) enum MultipartExtraChecksummer { + Crc32(Crc32), + Crc32c(Crc32c), + Sha1(Sha1), + Sha256(Sha256), +} + +impl MultipartChecksummer { + pub(crate) fn init(algo: Option) -> Self { + Self { + md5: Md5::new(), + extra: match algo { + None => None, + Some(ChecksumAlgorithm::Crc32) => { + Some(MultipartExtraChecksummer::Crc32(Crc32::new())) + } + Some(ChecksumAlgorithm::Crc32c) => { + Some(MultipartExtraChecksummer::Crc32c(Crc32c::default())) + } + Some(ChecksumAlgorithm::Sha1) => Some(MultipartExtraChecksummer::Sha1(Sha1::new())), + Some(ChecksumAlgorithm::Sha256) => { + Some(MultipartExtraChecksummer::Sha256(Sha256::new())) + } + }, + } + } + + pub(crate) fn update( + &mut self, + etag: &str, + checksum: Option, + ) -> Result<(), Error> { + self.md5 + .update(&hex::decode(&etag).ok_or_message("invalid etag hex")?); + match (&mut self.extra, checksum) { + (None, _) => (), + ( + Some(MultipartExtraChecksummer::Crc32(ref mut crc32)), + Some(ChecksumValue::Crc32(x)), + ) => { + crc32.update(&x); + } + ( + Some(MultipartExtraChecksummer::Crc32c(ref mut crc32c)), + Some(ChecksumValue::Crc32c(x)), + ) => { + crc32c.write(&x); + } + (Some(MultipartExtraChecksummer::Sha1(ref mut sha1)), Some(ChecksumValue::Sha1(x))) => { + sha1.update(&x); + } + ( + Some(MultipartExtraChecksummer::Sha256(ref mut sha256)), + Some(ChecksumValue::Sha256(x)), + ) => { + sha256.update(&x); + } + (Some(_), b) => { + return Err(Error::internal_error(format!( + "part checksum was not computed correctly, got: {:?}", + b + ))) + } + } + Ok(()) + } + + pub(crate) fn finalize(self) -> (Md5Checksum, Option) { + let md5 = self.md5.finalize()[..].try_into().unwrap(); + let extra = match self.extra { + None => None, + Some(MultipartExtraChecksummer::Crc32(crc32)) => { + Some(ChecksumValue::Crc32(u32::to_be_bytes(crc32.finalize()))) + } + Some(MultipartExtraChecksummer::Crc32c(crc32c)) => Some(ChecksumValue::Crc32c( + u32::to_be_bytes(u32::try_from(crc32c.finish()).unwrap()), + )), + Some(MultipartExtraChecksummer::Sha1(sha1)) => { + Some(ChecksumValue::Sha1(sha1.finalize()[..].try_into().unwrap())) + } + Some(MultipartExtraChecksummer::Sha256(sha256)) => Some(ChecksumValue::Sha256( + sha256.finalize()[..].try_into().unwrap(), + )), + }; + (md5, extra) + } +} + +// ---- + +/// Extract the value of the x-amz-checksum-algorithm header +pub(crate) fn request_checksum_algorithm( + headers: &HeaderMap, +) -> Result, Error> { + match headers.get(X_AMZ_CHECKSUM_ALGORITHM) { + None => Ok(None), + Some(x) if x == "CRC32" => Ok(Some(ChecksumAlgorithm::Crc32)), + Some(x) if x == "CRC32C" => Ok(Some(ChecksumAlgorithm::Crc32c)), + Some(x) if x == "SHA1" => Ok(Some(ChecksumAlgorithm::Sha1)), + Some(x) if x == "SHA256" => Ok(Some(ChecksumAlgorithm::Sha256)), + _ => Err(Error::bad_request("invalid checksum algorithm")), + } +} + +/// Extract the value of any of the x-amz-checksum-* headers +pub(crate) fn request_checksum_value( + headers: &HeaderMap, +) -> Result, Error> { + let mut ret = vec![]; + + if let Some(crc32_str) = headers.get(X_AMZ_CHECKSUM_CRC32) { + let crc32 = BASE64_STANDARD + .decode(&crc32_str) + .ok() + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-crc32 header")?; + ret.push(ChecksumValue::Crc32(crc32)) + } + if let Some(crc32c_str) = headers.get(X_AMZ_CHECKSUM_CRC32C) { + let crc32c = BASE64_STANDARD + .decode(&crc32c_str) + .ok() + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-crc32c header")?; + ret.push(ChecksumValue::Crc32c(crc32c)) + } + if let Some(sha1_str) = headers.get(X_AMZ_CHECKSUM_SHA1) { + let sha1 = BASE64_STANDARD + .decode(&sha1_str) + .ok() + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-sha1 header")?; + ret.push(ChecksumValue::Sha1(sha1)) + } + if let Some(sha256_str) = headers.get(X_AMZ_CHECKSUM_SHA256) { + let sha256 = BASE64_STANDARD + .decode(&sha256_str) + .ok() + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-sha256 header")?; + ret.push(ChecksumValue::Sha256(sha256)) + } + + if ret.len() > 1 { + return Err(Error::bad_request( + "multiple x-amz-checksum-* headers given", + )); + } + Ok(ret.pop()) +} + +/// Checks for the presense of x-amz-checksum-algorithm +/// if so extract the corrseponding x-amz-checksum-* value +pub(crate) fn request_checksum_algorithm_value( + headers: &HeaderMap, +) -> Result, Error> { + match headers.get(X_AMZ_CHECKSUM_ALGORITHM) { + Some(x) if x == "CRC32" => { + let crc32 = headers + .get(X_AMZ_CHECKSUM_CRC32) + .and_then(|x| BASE64_STANDARD.decode(&x).ok()) + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-crc32 header")?; + Ok(Some(ChecksumValue::Crc32(crc32))) + } + Some(x) if x == "CRC32C" => { + let crc32c = headers + .get(X_AMZ_CHECKSUM_CRC32C) + .and_then(|x| BASE64_STANDARD.decode(&x).ok()) + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-crc32c header")?; + Ok(Some(ChecksumValue::Crc32c(crc32c))) + } + Some(x) if x == "SHA1" => { + let sha1 = headers + .get(X_AMZ_CHECKSUM_SHA1) + .and_then(|x| BASE64_STANDARD.decode(&x).ok()) + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-sha1 header")?; + Ok(Some(ChecksumValue::Sha1(sha1))) + } + Some(x) if x == "SHA256" => { + let sha256 = headers + .get(X_AMZ_CHECKSUM_SHA256) + .and_then(|x| BASE64_STANDARD.decode(&x).ok()) + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-sha256 header")?; + Ok(Some(ChecksumValue::Sha256(sha256))) + } + Some(_) => Err(Error::bad_request("invalid x-amz-checksum-algorithm")), + None => Ok(None), + } +} + +pub(crate) fn add_checksum_response_headers( + checksum: &Option, + mut resp: http::response::Builder, +) -> http::response::Builder { + match checksum { + Some(ChecksumValue::Crc32(crc32)) => { + resp = resp.header(X_AMZ_CHECKSUM_CRC32, BASE64_STANDARD.encode(&crc32)); + } + Some(ChecksumValue::Crc32c(crc32c)) => { + resp = resp.header(X_AMZ_CHECKSUM_CRC32C, BASE64_STANDARD.encode(&crc32c)); + } + Some(ChecksumValue::Sha1(sha1)) => { + resp = resp.header(X_AMZ_CHECKSUM_SHA1, BASE64_STANDARD.encode(&sha1)); + } + Some(ChecksumValue::Sha256(sha256)) => { + resp = resp.header(X_AMZ_CHECKSUM_SHA256, BASE64_STANDARD.encode(&sha256)); + } + None => (), + } + resp +} diff --git a/src/api/s3/copy.rs b/src/api/s3/copy.rs index 2b29ec6d..411a6917 100644 --- a/src/api/s3/copy.rs +++ b/src/api/s3/copy.rs @@ -2,7 +2,6 @@ use std::pin::Pin; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use futures::{stream, stream::Stream, StreamExt, TryStreamExt}; -use md5::{Digest as Md5Digest, Md5}; use bytes::Bytes; use hyper::{Request, Response}; @@ -23,11 +22,12 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::checksum::*; use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; use crate::s3::get::full_object_byte_stream; use crate::s3::multipart; -use crate::s3::put::{get_headers, save_stream, SaveStreamResult}; +use crate::s3::put::{get_headers, save_stream, ChecksumMode, SaveStreamResult}; use crate::s3::xml::{self as s3_xml, xmlns_tag}; // -------- CopyObject --------- @@ -39,6 +39,8 @@ pub async fn handle_copy( ) -> Result, Error> { let copy_precondition = CopyPreconditionHeaders::parse(req)?; + let checksum_algorithm = request_checksum_algorithm(req.headers())?; + let source_object = get_copy_source(&ctx, req).await?; let (source_version, source_version_data, source_version_meta) = @@ -48,7 +50,7 @@ pub async fn handle_copy( copy_precondition.check(source_version, &source_version_meta.etag)?; // Determine encryption parameters - let (source_encryption, source_object_headers) = + let (source_encryption, source_object_meta_inner) = EncryptionParams::check_decrypt_for_copy_source( &ctx.garage, req.headers(), @@ -56,23 +58,54 @@ pub async fn handle_copy( )?; let dest_encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?; - // Determine headers of destination object - let dest_object_headers = match req.headers().get("x-amz-metadata-directive") { - Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => { - get_headers(req.headers())? - } - _ => source_object_headers.into_owned(), + // Extract source checksum info before source_object_meta_inner is consumed + let source_checksum = source_object_meta_inner.checksum; + let source_checksum_algorithm = source_checksum.map(|x| x.algorithm()); + + // If source object has a checksum, the destination object must as well. + // The x-amz-checksum-algorihtm header allows to change that algorithm, + // but if it is absent, we must use the same as before + let checksum_algorithm = checksum_algorithm.or(source_checksum_algorithm); + + // Determine metadata of destination object + let was_multipart = source_version_meta.etag.contains('-'); + let dest_object_meta = ObjectVersionMetaInner { + headers: match req.headers().get("x-amz-metadata-directive") { + Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => { + get_headers(req.headers())? + } + _ => source_object_meta_inner.into_owned().headers, + }, + checksum: source_checksum, }; // Do actual object copying - let res = if EncryptionParams::is_same(&source_encryption, &dest_encryption) { - // If source and dest are both unencrypted, or if the encryption keys - // are the same, we can just copy the metadata and link blocks of the + // + // In any of the following scenarios, we need to read the whole object + // data and re-write it again: + // + // - the data needs to be decrypted or encrypted + // - the requested checksum algorithm requires us to recompute a checksum + // - the original object was a multipart upload and a checksum algorithm + // is defined (AWS specifies that in this case, we must recompute the + // checksum from scratch as if this was a single big object and not + // a multipart object, as the checksums are not computed in the same way) + // + // In other cases, we can just copy the metadata and reference the same blocks. + // + // See: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html + + let must_recopy = !EncryptionParams::is_same(&source_encryption, &dest_encryption) + || source_checksum_algorithm != checksum_algorithm + || (was_multipart && checksum_algorithm.is_some()); + + let res = if !must_recopy { + // In most cases, we can just copy the metadata and link blocks of the // old object from the new object. handle_copy_metaonly( ctx, dest_key, - dest_object_headers, + dest_object_meta, dest_encryption, source_version, source_version_data, @@ -80,16 +113,27 @@ pub async fn handle_copy( ) .await? } else { + let expected_checksum = ExpectedChecksums { + md5: None, + sha256: None, + extra: source_checksum, + }; + let checksum_mode = if was_multipart || source_checksum_algorithm != checksum_algorithm { + ChecksumMode::Calculate(checksum_algorithm) + } else { + ChecksumMode::Verify(&expected_checksum) + }; // If source and dest encryption use different keys, // we must decrypt content and re-encrypt, so rewrite all data blocks. handle_copy_reencrypt( ctx, dest_key, - dest_object_headers, + dest_object_meta, dest_encryption, source_version, source_version_data, source_encryption, + checksum_mode, ) .await? }; @@ -115,7 +159,7 @@ pub async fn handle_copy( async fn handle_copy_metaonly( ctx: ReqCtx, dest_key: &str, - dest_object_headers: ObjectVersionHeaders, + dest_object_meta: ObjectVersionMetaInner, dest_encryption: EncryptionParams, source_version: &ObjectVersion, source_version_data: &ObjectVersionData, @@ -132,7 +176,7 @@ async fn handle_copy_metaonly( let new_timestamp = now_msec(); let new_meta = ObjectVersionMeta { - encryption: dest_encryption.encrypt_headers(dest_object_headers)?, + encryption: dest_encryption.encrypt_meta(dest_object_meta)?, size: source_version_meta.size, etag: source_version_meta.etag.clone(), }; @@ -180,6 +224,7 @@ async fn handle_copy_metaonly( timestamp: new_timestamp, state: ObjectVersionState::Uploading { encryption: new_meta.encryption.clone(), + checksum_algorithm: None, multipart: false, }, }; @@ -252,11 +297,12 @@ async fn handle_copy_metaonly( async fn handle_copy_reencrypt( ctx: ReqCtx, dest_key: &str, - dest_object_headers: ObjectVersionHeaders, + dest_object_meta: ObjectVersionMetaInner, dest_encryption: EncryptionParams, source_version: &ObjectVersion, source_version_data: &ObjectVersionData, source_encryption: EncryptionParams, + checksum_mode: ChecksumMode<'_>, ) -> Result { // basically we will read the source data (decrypt if necessary) // and save that in a new object (encrypt if necessary), @@ -270,12 +316,11 @@ async fn handle_copy_reencrypt( save_stream( &ctx, - dest_object_headers, + dest_object_meta, dest_encryption, source_stream.map_err(|e| Error::from(GarageError::from(e))), &dest_key.to_string(), - None, - None, + checksum_mode, ) .await } @@ -313,8 +358,12 @@ pub async fn handle_upload_part_copy( req.headers(), &source_version_meta.encryption, )?; - let dest_object_encryption = match dest_version.state { - ObjectVersionState::Uploading { encryption, .. } => encryption, + let (dest_object_encryption, dest_object_checksum_algorithm) = match dest_version.state { + ObjectVersionState::Uploading { + encryption, + checksum_algorithm, + .. + } => (encryption, checksum_algorithm), _ => unreachable!(), }; let (dest_encryption, _) = @@ -412,7 +461,9 @@ pub async fn handle_upload_part_copy( dest_mpu_part_key, MpuPart { version: dest_version_id, + // These are all filled in later (bottom of this function) etag: None, + checksum: None, size: None, }, ); @@ -429,7 +480,8 @@ pub async fn handle_upload_part_copy( garage.version_table.insert(&dest_version).await?; // Now, actually copy the blocks - let mut md5hasher = Md5::new(); + let mut checksummer = Checksummer::init(&Default::default(), !dest_encryption.is_encrypted()) + .add(dest_object_checksum_algorithm); // First, create a stream that is able to read the source blocks // and extract the subrange if necessary. @@ -495,18 +547,24 @@ pub async fn handle_upload_part_copy( } let data_len = data.len() as u64; - md5hasher.update(&data[..]); - - let (final_data, must_upload, final_hash) = match existing_block_hash { - Some(hash) if same_encryption => (data, false, hash), - _ => tokio::task::spawn_blocking(move || { - let data_enc = dest_encryption.encrypt_block(data)?; - let hash = blake2sum(&data_enc); - Ok::<_, Error>((data_enc, true, hash)) + + let (checksummer_updated, (data_to_upload, final_hash)) = + tokio::task::spawn_blocking(move || { + checksummer.update(&data[..]); + + let tup = match existing_block_hash { + Some(hash) if same_encryption => (None, hash), + _ => { + let data_enc = dest_encryption.encrypt_block(data)?; + let hash = blake2sum(&data_enc); + (Some(data_enc), hash) + } + }; + Ok::<_, Error>((checksummer, tup)) }) .await - .unwrap()?, - }; + .unwrap()?; + checksummer = checksummer_updated; dest_version.blocks.clear(); dest_version.blocks.put( @@ -531,7 +589,7 @@ pub async fn handle_upload_part_copy( // Thing 1: if the block is not exactly a block that existed before, // we need to insert that data as a new block. async { - if must_upload { + if let Some(final_data) = data_to_upload { garage .block_manager .rpc_put_block(final_hash, final_data, dest_encryption.is_encrypted(), None) @@ -552,8 +610,9 @@ pub async fn handle_upload_part_copy( assert_eq!(current_offset, source_range.length); - let data_md5sum = md5hasher.finalize(); - let etag = dest_encryption.etag_from_md5(&data_md5sum); + let checksums = checksummer.finalize(); + let etag = dest_encryption.etag_from_md5(&checksums.md5); + let checksum = checksums.extract(dest_object_checksum_algorithm); // Put the part's ETag in the Versiontable dest_mpu.parts.put( @@ -561,6 +620,7 @@ pub async fn handle_upload_part_copy( MpuPart { version: dest_version_id, etag: Some(etag.clone()), + checksum, size: Some(current_offset), }, ); diff --git a/src/api/s3/encryption.rs b/src/api/s3/encryption.rs index 2b105e90..2e6ed65c 100644 --- a/src/api/s3/encryption.rs +++ b/src/api/s3/encryption.rs @@ -26,9 +26,10 @@ use garage_util::error::Error as GarageError; use garage_util::migrate::Migrate; use garage_model::garage::Garage; -use garage_model::s3::object_table::{ObjectVersionEncryption, ObjectVersionHeaders}; +use garage_model::s3::object_table::{ObjectVersionEncryption, ObjectVersionMetaInner}; use crate::common_error::*; +use crate::s3::checksum::Md5Checksum; use crate::s3::error::Error; const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM: HeaderName = @@ -124,7 +125,7 @@ impl EncryptionParams { garage: &Garage, headers: &HeaderMap, obj_enc: &'a ObjectVersionEncryption, - ) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> { + ) -> Result<(Self, Cow<'a, ObjectVersionMetaInner>), Error> { let key = parse_request_headers( headers, &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, @@ -138,7 +139,7 @@ impl EncryptionParams { garage: &Garage, headers: &HeaderMap, obj_enc: &'a ObjectVersionEncryption, - ) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> { + ) -> Result<(Self, Cow<'a, ObjectVersionMetaInner>), Error> { let key = parse_request_headers( headers, &X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, @@ -152,14 +153,11 @@ impl EncryptionParams { garage: &Garage, key: Option<(Key, Md5Output)>, obj_enc: &'a ObjectVersionEncryption, - ) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> { + ) -> Result<(Self, Cow<'a, ObjectVersionMetaInner>), Error> { match (key, &obj_enc) { ( Some((client_key, client_key_md5)), - ObjectVersionEncryption::SseC { - headers, - compressed, - }, + ObjectVersionEncryption::SseC { inner, compressed }, ) => { let enc = Self::SseC { client_key, @@ -170,13 +168,13 @@ impl EncryptionParams { None }, }; - let plaintext = enc.decrypt_blob(&headers)?; - let headers = ObjectVersionHeaders::decode(&plaintext) - .ok_or_internal_error("Could not decode encrypted headers")?; - Ok((enc, Cow::Owned(headers))) + let plaintext = enc.decrypt_blob(&inner)?; + let inner = ObjectVersionMetaInner::decode(&plaintext) + .ok_or_internal_error("Could not decode encrypted metadata")?; + Ok((enc, Cow::Owned(inner))) } - (None, ObjectVersionEncryption::Plaintext { headers }) => { - Ok((Self::Plaintext, Cow::Borrowed(headers))) + (None, ObjectVersionEncryption::Plaintext { inner }) => { + Ok((Self::Plaintext, Cow::Borrowed(inner))) } (_, ObjectVersionEncryption::SseC { .. }) => { Err(Error::bad_request("Object is encrypted")) @@ -188,29 +186,31 @@ impl EncryptionParams { } } - pub fn encrypt_headers( + pub fn encrypt_meta( &self, - h: ObjectVersionHeaders, + meta: ObjectVersionMetaInner, ) -> Result { match self { Self::SseC { compression_level, .. } => { - let plaintext = h.encode().map_err(GarageError::from)?; + let plaintext = meta.encode().map_err(GarageError::from)?; let ciphertext = self.encrypt_blob(&plaintext)?; Ok(ObjectVersionEncryption::SseC { - headers: ciphertext.into_owned(), + inner: ciphertext.into_owned(), compressed: compression_level.is_some(), }) } - Self::Plaintext => Ok(ObjectVersionEncryption::Plaintext { headers: h }), + Self::Plaintext => Ok(ObjectVersionEncryption::Plaintext { inner: meta }), } } // ---- generating object Etag values ---- - pub fn etag_from_md5(&self, md5sum: &[u8]) -> String { + pub fn etag_from_md5(&self, md5sum: &Option) -> String { match self { - Self::Plaintext => hex::encode(md5sum), + Self::Plaintext => md5sum + .map(|x| hex::encode(&x[..])) + .expect("md5 digest should have been computed"), Self::SseC { .. } => { // AWS specifies that for encrypted objects, the Etag is not // the md5sum of the data, but doesn't say what it is. @@ -224,7 +224,7 @@ impl EncryptionParams { // ---- generic function for encrypting / decrypting blobs ---- // Prepends a randomly-generated nonce to the encrypted value. - // This is used for encrypting object headers and inlined data for small objects. + // This is used for encrypting object metadata and inlined data for small objects. // This does not compress anything. pub fn encrypt_blob<'a>(&self, blob: &'a [u8]) -> Result, Error> { diff --git a/src/api/s3/error.rs b/src/api/s3/error.rs index 5cb5d04e..2855e0b3 100644 --- a/src/api/s3/error.rs +++ b/src/api/s3/error.rs @@ -69,6 +69,10 @@ pub enum Error { #[error(display = "Invalid encryption algorithm: {:?}, should be AES256", _0)] InvalidEncryptionAlgorithm(String), + /// The client sent invalid XML data + #[error(display = "Invalid digest: {}", _0)] + InvalidDigest(String), + /// The client sent a request for an action not supported by garage #[error(display = "Unimplemented action: {}", _0)] NotImplemented(String), @@ -129,6 +133,7 @@ impl Error { Error::NotImplemented(_) => "NotImplemented", Error::InvalidXml(_) => "MalformedXML", Error::InvalidRange(_) => "InvalidRange", + Error::InvalidDigest(_) => "InvalidDigest", Error::InvalidUtf8Str(_) | Error::InvalidUtf8String(_) => "InvalidRequest", Error::InvalidEncryptionAlgorithm(_) => "InvalidEncryptionAlgorithmError", } @@ -148,6 +153,7 @@ impl ApiError for Error { | Error::InvalidPart | Error::InvalidPartOrder | Error::EntityTooSmall + | Error::InvalidDigest(_) | Error::InvalidEncryptionAlgorithm(_) | Error::InvalidXml(_) | Error::InvalidUtf8Str(_) diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index ec300ab7..f5d3cf11 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -27,6 +27,7 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::ResBody; +use crate::s3::checksum::{add_checksum_response_headers, X_AMZ_CHECKSUM_MODE}; use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; @@ -45,8 +46,9 @@ pub struct GetObjectOverrides { fn object_headers( version: &ObjectVersion, version_meta: &ObjectVersionMeta, - headers: &ObjectVersionHeaders, + meta_inner: &ObjectVersionMetaInner, encryption: EncryptionParams, + checksum_mode: ChecksumMode, ) -> http::response::Builder { debug!("Version meta: {:?}", version_meta); @@ -65,7 +67,7 @@ fn object_headers( // have the same name (ignoring case) into a comma-delimited list. // See: https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingMetadata.html let mut headers_by_name = BTreeMap::new(); - for (name, value) in headers.0.iter() { + for (name, value) in meta_inner.headers.iter() { match headers_by_name.get_mut(name) { None => { headers_by_name.insert(name, vec![value.as_str()]); @@ -80,6 +82,10 @@ fn object_headers( resp = resp.header(name, values.join(",")); } + if checksum_mode.enabled { + resp = add_checksum_response_headers(&meta_inner.checksum, resp); + } + encryption.add_response_headers(&mut resp); resp @@ -199,6 +205,8 @@ pub async fn handle_head_without_ctx( let (encryption, headers) = EncryptionParams::check_decrypt(&garage, req.headers(), &version_meta.encryption)?; + let checksum_mode = checksum_mode(&req); + if let Some(pn) = part_number { match version_data { ObjectVersionData::Inline(_, _) => { @@ -206,17 +214,21 @@ pub async fn handle_head_without_ctx( return Err(Error::InvalidPart); } let bytes_len = version_meta.size; - Ok( - object_headers(object_version, version_meta, &headers, encryption) - .header(CONTENT_LENGTH, format!("{}", bytes_len)) - .header( - CONTENT_RANGE, - format!("bytes 0-{}/{}", bytes_len - 1, bytes_len), - ) - .header(X_AMZ_MP_PARTS_COUNT, "1") - .status(StatusCode::PARTIAL_CONTENT) - .body(empty_body())?, + Ok(object_headers( + object_version, + version_meta, + &headers, + encryption, + checksum_mode, + ) + .header(CONTENT_LENGTH, format!("{}", bytes_len)) + .header( + CONTENT_RANGE, + format!("bytes 0-{}/{}", bytes_len - 1, bytes_len), ) + .header(X_AMZ_MP_PARTS_COUNT, "1") + .status(StatusCode::PARTIAL_CONTENT) + .body(empty_body())?) } ObjectVersionData::FirstBlock(_, _) => { let version = garage @@ -228,32 +240,40 @@ pub async fn handle_head_without_ctx( let (part_offset, part_end) = calculate_part_bounds(&version, pn).ok_or(Error::InvalidPart)?; - Ok( - object_headers(object_version, version_meta, &headers, encryption) - .header(CONTENT_LENGTH, format!("{}", part_end - part_offset)) - .header( - CONTENT_RANGE, - format!( - "bytes {}-{}/{}", - part_offset, - part_end - 1, - version_meta.size - ), - ) - .header(X_AMZ_MP_PARTS_COUNT, format!("{}", version.n_parts()?)) - .status(StatusCode::PARTIAL_CONTENT) - .body(empty_body())?, + Ok(object_headers( + object_version, + version_meta, + &headers, + encryption, + checksum_mode, ) + .header(CONTENT_LENGTH, format!("{}", part_end - part_offset)) + .header( + CONTENT_RANGE, + format!( + "bytes {}-{}/{}", + part_offset, + part_end - 1, + version_meta.size + ), + ) + .header(X_AMZ_MP_PARTS_COUNT, format!("{}", version.n_parts()?)) + .status(StatusCode::PARTIAL_CONTENT) + .body(empty_body())?) } _ => unreachable!(), } } else { - Ok( - object_headers(object_version, version_meta, &headers, encryption) - .header(CONTENT_LENGTH, format!("{}", version_meta.size)) - .status(StatusCode::OK) - .body(empty_body())?, + Ok(object_headers( + object_version, + version_meta, + &headers, + encryption, + checksum_mode, ) + .header(CONTENT_LENGTH, format!("{}", version_meta.size)) + .status(StatusCode::OK) + .body(empty_body())?) } } @@ -307,12 +327,24 @@ pub async fn handle_get_without_ctx( let (enc, headers) = EncryptionParams::check_decrypt(&garage, req.headers(), &last_v_meta.encryption)?; + let checksum_mode = checksum_mode(&req); + match (part_number, parse_range_header(req, last_v_meta.size)?) { (Some(_), Some(_)) => Err(Error::bad_request( "Cannot specify both partNumber and Range header", )), (Some(pn), None) => { - handle_get_part(garage, last_v, last_v_data, last_v_meta, enc, &headers, pn).await + handle_get_part( + garage, + last_v, + last_v_data, + last_v_meta, + enc, + &headers, + pn, + checksum_mode, + ) + .await } (None, Some(range)) => { handle_get_range( @@ -324,6 +356,7 @@ pub async fn handle_get_without_ctx( &headers, range.start, range.start + range.length, + checksum_mode, ) .await } @@ -336,6 +369,7 @@ pub async fn handle_get_without_ctx( enc, &headers, overrides, + checksum_mode, ) .await } @@ -348,12 +382,19 @@ async fn handle_get_full( version_data: &ObjectVersionData, version_meta: &ObjectVersionMeta, encryption: EncryptionParams, - headers: &ObjectVersionHeaders, + meta_inner: &ObjectVersionMetaInner, overrides: GetObjectOverrides, + checksum_mode: ChecksumMode, ) -> Result, Error> { - let mut resp_builder = object_headers(version, version_meta, &headers, encryption) - .header(CONTENT_LENGTH, format!("{}", version_meta.size)) - .status(StatusCode::OK); + let mut resp_builder = object_headers( + version, + version_meta, + &meta_inner, + encryption, + checksum_mode, + ) + .header(CONTENT_LENGTH, format!("{}", version_meta.size)) + .status(StatusCode::OK); getobject_override_headers(overrides, &mut resp_builder)?; let stream = full_object_byte_stream(garage, version, version_data, encryption); @@ -432,14 +473,15 @@ async fn handle_get_range( version_data: &ObjectVersionData, version_meta: &ObjectVersionMeta, encryption: EncryptionParams, - headers: &ObjectVersionHeaders, + meta_inner: &ObjectVersionMetaInner, begin: u64, end: u64, + checksum_mode: ChecksumMode, ) -> Result, Error> { // Here we do not use getobject_override_headers because we don't // want to add any overridden headers (those should not be added // when returning PARTIAL_CONTENT) - let resp_builder = object_headers(version, version_meta, headers, encryption) + let resp_builder = object_headers(version, version_meta, meta_inner, encryption, checksum_mode) .header(CONTENT_LENGTH, format!("{}", end - begin)) .header( CONTENT_RANGE, @@ -480,12 +522,19 @@ async fn handle_get_part( version_data: &ObjectVersionData, version_meta: &ObjectVersionMeta, encryption: EncryptionParams, - headers: &ObjectVersionHeaders, + meta_inner: &ObjectVersionMetaInner, part_number: u64, + checksum_mode: ChecksumMode, ) -> Result, Error> { // Same as for get_range, no getobject_override_headers - let resp_builder = object_headers(object_version, version_meta, headers, encryption) - .status(StatusCode::PARTIAL_CONTENT); + let resp_builder = object_headers( + object_version, + version_meta, + meta_inner, + encryption, + checksum_mode, + ) + .status(StatusCode::PARTIAL_CONTENT); match version_data { ObjectVersionData::Inline(_, bytes) => { @@ -567,6 +616,20 @@ fn calculate_part_bounds(v: &Version, part_number: u64) -> Option<(u64, u64)> { None } +struct ChecksumMode { + enabled: bool, +} + +fn checksum_mode(req: &Request) -> ChecksumMode { + ChecksumMode { + enabled: req + .headers() + .get(X_AMZ_CHECKSUM_MODE) + .map(|x| x == "ENABLED") + .unwrap_or(false), + } +} + fn body_from_blocks_range( garage: Arc, encryption: EncryptionParams, diff --git a/src/api/s3/list.rs b/src/api/s3/list.rs index 1678f1fa..648bace2 100644 --- a/src/api/s3/list.rs +++ b/src/api/s3/list.rs @@ -2,7 +2,7 @@ use std::collections::{BTreeMap, BTreeSet}; use std::iter::{Iterator, Peekable}; use base64::prelude::*; -use hyper::Response; +use hyper::{Request, Response}; use garage_util::data::*; use garage_util::error::Error as GarageError; @@ -15,7 +15,8 @@ use garage_table::EnumerationOrder; use crate::encoding::*; use crate::helpers::*; -use crate::s3::api_server::ResBody; +use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; use crate::s3::multipart as s3_multipart; use crate::s3::xml as s3_xml; @@ -271,13 +272,21 @@ pub async fn handle_list_multipart_upload( pub async fn handle_list_parts( ctx: ReqCtx, + req: Request, query: &ListPartsQuery, ) -> Result, Error> { debug!("ListParts {:?}", query); let upload_id = s3_multipart::decode_upload_id(&query.upload_id)?; - let (_, _, mpu) = s3_multipart::get_upload(&ctx, &query.key, &upload_id).await?; + let (_, object_version, mpu) = s3_multipart::get_upload(&ctx, &query.key, &upload_id).await?; + + let object_encryption = match object_version.state { + ObjectVersionState::Uploading { encryption, .. } => encryption, + _ => unreachable!(), + }; + let encryption_res = + EncryptionParams::check_decrypt(&ctx.garage, req.headers(), &object_encryption); let (info, next) = fetch_part_info(query, &mpu)?; @@ -296,11 +305,40 @@ pub async fn handle_list_parts( is_truncated: s3_xml::Value(format!("{}", next.is_some())), parts: info .iter() - .map(|part| s3_xml::PartItem { - etag: s3_xml::Value(format!("\"{}\"", part.etag)), - last_modified: s3_xml::Value(msec_to_rfc3339(part.timestamp)), - part_number: s3_xml::IntValue(part.part_number as i64), - size: s3_xml::IntValue(part.size as i64), + .map(|part| { + // hide checksum if object is encrypted and the decryption + // keys are not provided + let checksum = part.checksum.filter(|_| encryption_res.is_ok()); + s3_xml::PartItem { + etag: s3_xml::Value(format!("\"{}\"", part.etag)), + last_modified: s3_xml::Value(msec_to_rfc3339(part.timestamp)), + part_number: s3_xml::IntValue(part.part_number as i64), + size: s3_xml::IntValue(part.size as i64), + checksum_crc32: match &checksum { + Some(ChecksumValue::Crc32(x)) => { + Some(s3_xml::Value(BASE64_STANDARD.encode(&x))) + } + _ => None, + }, + checksum_crc32c: match &checksum { + Some(ChecksumValue::Crc32c(x)) => { + Some(s3_xml::Value(BASE64_STANDARD.encode(&x))) + } + _ => None, + }, + checksum_sha1: match &checksum { + Some(ChecksumValue::Sha1(x)) => { + Some(s3_xml::Value(BASE64_STANDARD.encode(&x))) + } + _ => None, + }, + checksum_sha256: match &checksum { + Some(ChecksumValue::Sha256(x)) => { + Some(s3_xml::Value(BASE64_STANDARD.encode(&x))) + } + _ => None, + }, + } }) .collect(), @@ -346,6 +384,7 @@ struct PartInfo<'a> { timestamp: u64, part_number: u64, size: u64, + checksum: Option, } enum ExtractionResult { @@ -486,6 +525,7 @@ fn fetch_part_info<'a>( timestamp: pk.timestamp, etag, size, + checksum: p.checksum, }; match parts.last_mut() { Some(lastpart) if lastpart.part_number == pk.part_number => { @@ -945,8 +985,12 @@ mod tests { state: ObjectVersionState::Uploading { multipart: true, encryption: ObjectVersionEncryption::Plaintext { - headers: ObjectVersionHeaders(vec![]), + inner: ObjectVersionMetaInner { + headers: vec![], + checksum: None, + }, }, + checksum_algorithm: None, }, } } @@ -1135,6 +1179,7 @@ mod tests { version: uuid, size: Some(3), etag: Some("etag1".into()), + checksum: None, }, ), ( @@ -1146,6 +1191,7 @@ mod tests { version: uuid, size: None, etag: None, + checksum: None, }, ), ( @@ -1157,6 +1203,7 @@ mod tests { version: uuid, size: Some(10), etag: Some("etag2".into()), + checksum: None, }, ), ( @@ -1168,6 +1215,7 @@ mod tests { version: uuid, size: Some(7), etag: Some("etag3".into()), + checksum: None, }, ), ( @@ -1179,6 +1227,7 @@ mod tests { version: uuid, size: Some(5), etag: Some("etag4".into()), + checksum: None, }, ), ]; @@ -1217,12 +1266,14 @@ mod tests { etag: "etag1", timestamp: TS, part_number: 1, - size: 3 + size: 3, + checksum: None, }, PartInfo { etag: "etag2", timestamp: TS, part_number: 3, + checksum: None, size: 10 }, ] @@ -1238,12 +1289,14 @@ mod tests { PartInfo { etag: "etag3", timestamp: TS, + checksum: None, part_number: 5, size: 7 }, PartInfo { etag: "etag4", timestamp: TS, + checksum: None, part_number: 8, size: 5 }, @@ -1267,24 +1320,28 @@ mod tests { PartInfo { etag: "etag1", timestamp: TS, + checksum: None, part_number: 1, size: 3 }, PartInfo { etag: "etag2", timestamp: TS, + checksum: None, part_number: 3, size: 10 }, PartInfo { etag: "etag3", timestamp: TS, + checksum: None, part_number: 5, size: 7 }, PartInfo { etag: "etag4", timestamp: TS, + checksum: None, part_number: 8, size: 5 }, diff --git a/src/api/s3/mod.rs b/src/api/s3/mod.rs index 1eb95d40..b9bb1a6f 100644 --- a/src/api/s3/mod.rs +++ b/src/api/s3/mod.rs @@ -13,6 +13,7 @@ mod post_object; mod put; mod website; +mod checksum; mod encryption; mod router; pub mod xml; diff --git a/src/api/s3/multipart.rs b/src/api/s3/multipart.rs index fcc5769f..3db3e8aa 100644 --- a/src/api/s3/multipart.rs +++ b/src/api/s3/multipart.rs @@ -1,9 +1,10 @@ use std::collections::HashMap; +use std::convert::TryInto; use std::sync::Arc; +use base64::prelude::*; use futures::prelude::*; use hyper::{Request, Response}; -use md5::{Digest as Md5Digest, Md5}; use garage_table::*; use garage_util::data::*; @@ -16,6 +17,7 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::checksum::*; use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; use crate::s3::put::*; @@ -41,10 +43,16 @@ pub async fn handle_create_multipart_upload( let timestamp = next_timestamp(existing_object.as_ref()); let headers = get_headers(req.headers())?; + let meta = ObjectVersionMetaInner { + headers, + checksum: None, + }; // Determine whether object should be encrypted, and if so the key let encryption = EncryptionParams::new_from_headers(&garage, req.headers())?; - let object_encryption = encryption.encrypt_headers(headers)?; + let object_encryption = encryption.encrypt_meta(meta)?; + + let checksum_algorithm = request_checksum_algorithm(req.headers())?; // Create object in object table let object_version = ObjectVersion { @@ -53,6 +61,7 @@ pub async fn handle_create_multipart_upload( state: ObjectVersionState::Uploading { multipart: true, encryption: object_encryption, + checksum_algorithm, }, }; let object = Object::new(*bucket_id, key.to_string(), vec![object_version]); @@ -90,9 +99,13 @@ pub async fn handle_put_part( let upload_id = decode_upload_id(upload_id)?; - let content_md5 = match req.headers().get("content-md5") { - Some(x) => Some(x.to_str()?.to_string()), - None => None, + let expected_checksums = ExpectedChecksums { + md5: match req.headers().get("content-md5") { + Some(x) => Some(x.to_str()?.to_string()), + None => None, + }, + sha256: content_sha256, + extra: request_checksum_value(req.headers())?, }; // Read first chuck, and at the same time try to get object to see if it exists @@ -106,8 +119,12 @@ pub async fn handle_put_part( futures::try_join!(get_upload(&ctx, &key, &upload_id), chunker.next(),)?; // Check encryption params - let object_encryption = match object_version.state { - ObjectVersionState::Uploading { encryption, .. } => encryption, + let (object_encryption, checksum_algorithm) = match object_version.state { + ObjectVersionState::Uploading { + encryption, + checksum_algorithm, + .. + } => (encryption, checksum_algorithm), _ => unreachable!(), }; let (encryption, _) = @@ -138,7 +155,9 @@ pub async fn handle_put_part( mpu_part_key, MpuPart { version: version_uuid, + // all these are filled in later, at the end of this function etag: None, + checksum: None, size: None, }, ); @@ -152,32 +171,31 @@ pub async fn handle_put_part( garage.version_table.insert(&version).await?; // Copy data to version - let (total_size, data_md5sum, data_sha256sum, _) = read_and_put_blocks( + let checksummer = + Checksummer::init(&expected_checksums, !encryption.is_encrypted()).add(checksum_algorithm); + let (total_size, checksums, _) = read_and_put_blocks( &ctx, &version, encryption, part_number, first_block, &mut chunker, + checksummer, ) .await?; // Verify that checksums map - ensure_checksum_matches( - &data_md5sum, - data_sha256sum, - content_md5.as_deref(), - content_sha256, - )?; + checksums.verify(&expected_checksums)?; // Store part etag in version - let etag = encryption.etag_from_md5(&data_md5sum); + let etag = encryption.etag_from_md5(&checksums.md5); mpu.parts.put( mpu_part_key, MpuPart { version: version_uuid, etag: Some(etag.clone()), + checksum: checksums.extract(checksum_algorithm), size: Some(total_size), }, ); @@ -189,6 +207,7 @@ pub async fn handle_put_part( let mut resp = Response::builder().header("ETag", format!("\"{}\"", etag)); encryption.add_response_headers(&mut resp); + let resp = add_checksum_response_headers(&expected_checksums.extra, resp); Ok(resp.body(empty_body())?) } @@ -236,10 +255,11 @@ pub async fn handle_complete_multipart_upload( bucket_name, .. } = &ctx; + let (req_head, req_body) = req.into_parts(); - let body = http_body_util::BodyExt::collect(req.into_body()) - .await? - .to_bytes(); + let expected_checksum = request_checksum_value(&req_head.headers)?; + + let body = http_body_util::BodyExt::collect(req_body).await?.to_bytes(); if let Some(content_sha256) = content_sha256 { verify_signed_content(content_sha256, &body[..])?; @@ -263,8 +283,12 @@ pub async fn handle_complete_multipart_upload( return Err(Error::bad_request("No data was uploaded")); } - let object_encryption = match object_version.state { - ObjectVersionState::Uploading { encryption, .. } => encryption, + let (object_encryption, checksum_algorithm) = match object_version.state { + ObjectVersionState::Uploading { + encryption, + checksum_algorithm, + .. + } => (encryption, checksum_algorithm), _ => unreachable!(), }; @@ -292,6 +316,13 @@ pub async fn handle_complete_multipart_upload( for req_part in body_list_of_parts.iter() { match have_parts.get(&req_part.part_number) { Some(part) if part.etag.as_ref() == Some(&req_part.etag) && part.size.is_some() => { + // alternative version: if req_part.checksum.is_some() && part.checksum != req_part.checksum { + if part.checksum != req_part.checksum { + return Err(Error::InvalidDigest(format!( + "Invalid checksum for part {}: in request = {:?}, uploaded part = {:?}", + req_part.part_number, req_part.checksum, part.checksum + ))); + } parts.push(*part) } _ => return Err(Error::InvalidPart), @@ -339,18 +370,23 @@ pub async fn handle_complete_multipart_upload( }); garage.block_ref_table.insert_many(block_refs).await?; - // Calculate etag of final object + // Calculate checksum and etag of final object // To understand how etags are calculated, read more here: + // https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html // https://teppen.io/2018/06/23/aws_s3_etags/ - let mut etag_md5_hasher = Md5::new(); + let mut checksummer = MultipartChecksummer::init(checksum_algorithm); for part in parts.iter() { - etag_md5_hasher.update(part.etag.as_ref().unwrap().as_bytes()); + checksummer.update(part.etag.as_ref().unwrap(), part.checksum)?; + } + let (checksum_md5, checksum_extra) = checksummer.finalize(); + + if expected_checksum.is_some() && checksum_extra != expected_checksum { + return Err(Error::InvalidDigest( + "Failed to validate x-amz-checksum-*".into(), + )); } - let etag = format!( - "{}-{}", - hex::encode(etag_md5_hasher.finalize()), - parts.len() - ); + + let etag = format!("{}-{}", hex::encode(&checksum_md5[..]), parts.len()); // Calculate total size of final object let total_size = parts.iter().map(|x| x.size.unwrap()).sum(); @@ -363,6 +399,20 @@ pub async fn handle_complete_multipart_upload( return Err(e); } + // If there is a checksum algorithm, update metadata with checksum + let object_encryption = match checksum_algorithm { + None => object_encryption, + Some(_) => { + let (encryption, meta) = + EncryptionParams::check_decrypt(&garage, &req_head.headers, &object_encryption)?; + let new_meta = ObjectVersionMetaInner { + headers: meta.into_owned().headers, + checksum: checksum_extra, + }; + encryption.encrypt_meta(new_meta)? + } + }; + // Write final object version object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( ObjectVersionMeta { @@ -383,10 +433,28 @@ pub async fn handle_complete_multipart_upload( bucket: s3_xml::Value(bucket_name.to_string()), key: s3_xml::Value(key), etag: s3_xml::Value(format!("\"{}\"", etag)), + checksum_crc32: match &checksum_extra { + Some(ChecksumValue::Crc32(x)) => Some(s3_xml::Value(BASE64_STANDARD.encode(&x))), + _ => None, + }, + checksum_crc32c: match &checksum_extra { + Some(ChecksumValue::Crc32c(x)) => Some(s3_xml::Value(BASE64_STANDARD.encode(&x))), + _ => None, + }, + checksum_sha1: match &checksum_extra { + Some(ChecksumValue::Sha1(x)) => Some(s3_xml::Value(BASE64_STANDARD.encode(&x))), + _ => None, + }, + checksum_sha256: match &checksum_extra { + Some(ChecksumValue::Sha256(x)) => Some(s3_xml::Value(BASE64_STANDARD.encode(&x))), + _ => None, + }, }; let xml = s3_xml::to_xml_with_header(&result)?; - Ok(Response::new(string_body(xml))) + let resp = Response::builder(); + let resp = add_checksum_response_headers(&expected_checksum, resp); + Ok(resp.body(string_body(xml))?) } pub async fn handle_abort_multipart_upload( @@ -455,6 +523,7 @@ pub fn decode_upload_id(id: &str) -> Result { struct CompleteMultipartUploadPart { etag: String, part_number: u64, + checksum: Option, } fn parse_complete_multipart_upload_body( @@ -480,9 +549,41 @@ fn parse_complete_multipart_upload_body( .children() .find(|e| e.has_tag_name("PartNumber"))? .text()?; + let checksum = if let Some(crc32) = + item.children().find(|e| e.has_tag_name("ChecksumCRC32")) + { + Some(ChecksumValue::Crc32( + BASE64_STANDARD.decode(crc32.text()?).ok()?[..] + .try_into() + .ok()?, + )) + } else if let Some(crc32c) = item.children().find(|e| e.has_tag_name("ChecksumCRC32C")) + { + Some(ChecksumValue::Crc32c( + BASE64_STANDARD.decode(crc32c.text()?).ok()?[..] + .try_into() + .ok()?, + )) + } else if let Some(sha1) = item.children().find(|e| e.has_tag_name("ChecksumSHA1")) { + Some(ChecksumValue::Sha1( + BASE64_STANDARD.decode(sha1.text()?).ok()?[..] + .try_into() + .ok()?, + )) + } else if let Some(sha256) = item.children().find(|e| e.has_tag_name("ChecksumSHA256")) + { + Some(ChecksumValue::Sha256( + BASE64_STANDARD.decode(sha256.text()?).ok()?[..] + .try_into() + .ok()?, + )) + } else { + None + }; parts.push(CompleteMultipartUploadPart { etag: etag.trim_matches('"').to_string(), part_number: part_number.parse().ok()?, + checksum, }); } else { return None; diff --git a/src/api/s3/post_object.rs b/src/api/s3/post_object.rs index 7c4219a7..2c106b3b 100644 --- a/src/api/s3/post_object.rs +++ b/src/api/s3/post_object.rs @@ -14,13 +14,15 @@ use multer::{Constraints, Multipart, SizeLimit}; use serde::Deserialize; use garage_model::garage::Garage; +use garage_model::s3::object_table::*; use crate::helpers::*; use crate::s3::api_server::ResBody; +use crate::s3::checksum::*; use crate::s3::cors::*; use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; -use crate::s3::put::{get_headers, save_stream}; +use crate::s3::put::{get_headers, save_stream, ChecksumMode}; use crate::s3::xml as s3_xml; use crate::signature::payload::{verify_v4, Authorization}; @@ -98,10 +100,6 @@ pub async fn handle_post_object( .ok_or_bad_request("No policy was provided")? .to_str()?; let authorization = Authorization::parse_form(¶ms)?; - let content_md5 = params - .get("content-md5") - .map(HeaderValue::to_str) - .transpose()?; let key = if key.contains("${filename}") { // if no filename is provided, don't replace. This matches the behavior of AWS. @@ -226,6 +224,21 @@ pub async fn handle_post_object( let headers = get_headers(¶ms)?; + let expected_checksums = ExpectedChecksums { + md5: params + .get("content-md5") + .map(HeaderValue::to_str) + .transpose()? + .map(str::to_string), + sha256: None, + extra: request_checksum_algorithm_value(¶ms)?, + }; + + let meta = ObjectVersionMetaInner { + headers, + checksum: expected_checksums.extra, + }; + let encryption = EncryptionParams::new_from_headers(&garage, ¶ms)?; let stream = file_field.map(|r| r.map_err(Into::into)); @@ -239,12 +252,11 @@ pub async fn handle_post_object( let res = save_stream( &ctx, - headers, + meta, encryption, StreamLimiter::new(stream, conditions.content_length), &key, - content_md5.map(str::to_string), - None, + ChecksumMode::Verify(&expected_checksums), ) .await?; diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index 941e4122..1e3b1b44 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -1,12 +1,9 @@ use std::collections::HashMap; use std::sync::Arc; -use base64::prelude::*; use futures::prelude::*; use futures::stream::FuturesOrdered; use futures::try_join; -use md5::{digest::generic_array::*, Digest as Md5Digest, Md5}; -use sha2::Sha256; use tokio::sync::mpsc; @@ -22,7 +19,6 @@ use opentelemetry::{ use garage_net::bytes_buf::BytesBuf; use garage_rpc::rpc_helper::OrderTag; use garage_table::*; -use garage_util::async_hash::*; use garage_util::data::*; use garage_util::error::Error as GarageError; use garage_util::time::*; @@ -36,16 +32,22 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::checksum::*; use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; const PUT_BLOCKS_MAX_PARALLEL: usize = 3; -pub struct SaveStreamResult { - pub version_uuid: Uuid, - pub version_timestamp: u64, +pub(crate) struct SaveStreamResult { + pub(crate) version_uuid: Uuid, + pub(crate) version_timestamp: u64, /// Etag WITHOUT THE QUOTES (just the hex value) - pub etag: String, + pub(crate) etag: String, +} + +pub(crate) enum ChecksumMode<'a> { + Verify(&'a ExpectedChecksums), + Calculate(Option), } pub async fn handle_put( @@ -58,24 +60,32 @@ pub async fn handle_put( let headers = get_headers(req.headers())?; debug!("Object headers: {:?}", headers); - // Determine whether object should be encrypted, and if so the key - let encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?; + let expected_checksums = ExpectedChecksums { + md5: match req.headers().get("content-md5") { + Some(x) => Some(x.to_str()?.to_string()), + None => None, + }, + sha256: content_sha256, + extra: request_checksum_value(req.headers())?, + }; - let content_md5 = match req.headers().get("content-md5") { - Some(x) => Some(x.to_str()?.to_string()), - None => None, + let meta = ObjectVersionMetaInner { + headers, + checksum: expected_checksums.extra, }; + // Determine whether object should be encrypted, and if so the key + let encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?; + let stream = body_stream(req.into_body()); let res = save_stream( &ctx, - headers, + meta, encryption, stream, key, - content_md5, - content_sha256, + ChecksumMode::Verify(&expected_checksums), ) .await?; @@ -83,17 +93,17 @@ pub async fn handle_put( .header("x-amz-version-id", hex::encode(res.version_uuid)) .header("ETag", format!("\"{}\"", res.etag)); encryption.add_response_headers(&mut resp); + let resp = add_checksum_response_headers(&expected_checksums.extra, resp); Ok(resp.body(empty_body())?) } pub(crate) async fn save_stream> + Unpin>( ctx: &ReqCtx, - headers: ObjectVersionHeaders, + mut meta: ObjectVersionMetaInner, encryption: EncryptionParams, body: S, key: &String, - content_md5: Option, - content_sha256: Option, + checksum_mode: ChecksumMode<'_>, ) -> Result { let ReqCtx { garage, bucket_id, .. @@ -107,32 +117,36 @@ pub(crate) async fn save_stream> + Unpin>( let first_block = first_block_opt.unwrap_or_default(); - let object_encryption = encryption.encrypt_headers(headers)?; - // Generate identity of new version let version_uuid = gen_uuid(); let version_timestamp = next_timestamp(existing_object.as_ref()); + let mut checksummer = match checksum_mode { + ChecksumMode::Verify(expected) => Checksummer::init(expected, !encryption.is_encrypted()), + ChecksumMode::Calculate(algo) => { + Checksummer::init(&Default::default(), !encryption.is_encrypted()).add(algo) + } + }; + // If body is small enough, store it directly in the object table // as "inline data". We can then return immediately. if first_block.len() < INLINE_THRESHOLD { - let mut md5sum = Md5::new(); - md5sum.update(&first_block[..]); - let data_md5sum = md5sum.finalize(); - - let data_sha256sum = sha256sum(&first_block[..]); + checksummer.update(&first_block); + let checksums = checksummer.finalize(); - ensure_checksum_matches( - &data_md5sum, - data_sha256sum, - content_md5.as_deref(), - content_sha256, - )?; + match checksum_mode { + ChecksumMode::Verify(expected) => { + checksums.verify(&expected)?; + } + ChecksumMode::Calculate(algo) => { + meta.checksum = checksums.extract(algo); + } + }; let size = first_block.len() as u64; check_quotas(ctx, size, existing_object.as_ref()).await?; - let etag = encryption.etag_from_md5(&data_md5sum); + let etag = encryption.etag_from_md5(&checksums.md5); let inline_data = encryption.encrypt_blob(&first_block)?.to_vec(); let object_version = ObjectVersion { @@ -140,7 +154,7 @@ pub(crate) async fn save_stream> + Unpin>( timestamp: version_timestamp, state: ObjectVersionState::Complete(ObjectVersionData::Inline( ObjectVersionMeta { - encryption: object_encryption, + encryption: encryption.encrypt_meta(meta)?, size, etag: etag.clone(), }, @@ -175,7 +189,8 @@ pub(crate) async fn save_stream> + Unpin>( uuid: version_uuid, timestamp: version_timestamp, state: ObjectVersionState::Uploading { - encryption: object_encryption.clone(), + encryption: encryption.encrypt_meta(meta.clone())?, + checksum_algorithm: None, // don't care; overwritten later multipart: false, }, }; @@ -196,25 +211,37 @@ pub(crate) async fn save_stream> + Unpin>( ); garage.version_table.insert(&version).await?; - // Transfer data and verify checksum - let (total_size, data_md5sum, data_sha256sum, first_block_hash) = - read_and_put_blocks(ctx, &version, encryption, 1, first_block, &mut chunker).await?; + // Transfer data + let (total_size, checksums, first_block_hash) = read_and_put_blocks( + ctx, + &version, + encryption, + 1, + first_block, + &mut chunker, + checksummer, + ) + .await?; - ensure_checksum_matches( - &data_md5sum, - data_sha256sum, - content_md5.as_deref(), - content_sha256, - )?; + // Verify checksums are ok / add calculated checksum to metadata + match checksum_mode { + ChecksumMode::Verify(expected) => { + checksums.verify(&expected)?; + } + ChecksumMode::Calculate(algo) => { + meta.checksum = checksums.extract(algo); + } + }; + // Verify quotas are respsected check_quotas(ctx, total_size, existing_object.as_ref()).await?; // Save final object state, marked as Complete - let etag = encryption.etag_from_md5(&data_md5sum); + let etag = encryption.etag_from_md5(&checksums.md5); object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( ObjectVersionMeta { - encryption: object_encryption, + encryption: encryption.encrypt_meta(meta)?, size: total_size, etag: etag.clone(), }, @@ -234,33 +261,6 @@ pub(crate) async fn save_stream> + Unpin>( }) } -/// Validate MD5 sum against content-md5 header -/// and sha256sum against signed content-sha256 -pub(crate) fn ensure_checksum_matches( - data_md5sum: &[u8], - data_sha256sum: garage_util::data::FixedBytes32, - content_md5: Option<&str>, - content_sha256: Option, -) -> Result<(), Error> { - if let Some(expected_sha256) = content_sha256 { - if expected_sha256 != data_sha256sum { - return Err(Error::bad_request( - "Unable to validate x-amz-content-sha256", - )); - } else { - trace!("Successfully validated x-amz-content-sha256"); - } - } - if let Some(expected_md5) = content_md5 { - if expected_md5.trim_matches('"') != BASE64_STANDARD.encode(data_md5sum) { - return Err(Error::bad_request("Unable to validate content-md5")); - } else { - trace!("Successfully validated content-md5"); - } - } - Ok(()) -} - /// Check that inserting this object with this size doesn't exceed bucket quotas pub(crate) async fn check_quotas( ctx: &ReqCtx, @@ -332,7 +332,8 @@ pub(crate) async fn read_and_put_blocks> + part_number: u64, first_block: Bytes, chunker: &mut StreamChunker, -) -> Result<(u64, GenericArray, Hash, Hash), Error> { + checksummer: Checksummer, +) -> Result<(u64, Checksums, Hash), Error> { let tracer = opentelemetry::global::tracer("garage"); let (block_tx, mut block_rx) = mpsc::channel::>(2); @@ -360,20 +361,20 @@ pub(crate) async fn read_and_put_blocks> + let (block_tx2, mut block_rx2) = mpsc::channel::>(1); let hash_stream = async { - let md5hasher = AsyncHasher::::new(); - let sha256hasher = AsyncHasher::::new(); + let mut checksummer = checksummer; while let Some(next) = block_rx.recv().await { match next { Ok(block) => { block_tx2.send(Ok(block.clone())).await?; - futures::future::join( - md5hasher.update(block.clone()), - sha256hasher.update(block.clone()), - ) + checksummer = tokio::task::spawn_blocking(move || { + checksummer.update(&block); + checksummer + }) .with_context(Context::current_with_span( tracer.start("Hash block (md5, sha256)"), )) - .await; + .await + .unwrap() } Err(e) => { block_tx2.send(Err(e)).await?; @@ -382,10 +383,7 @@ pub(crate) async fn read_and_put_blocks> + } } drop(block_tx2); - Ok::<_, mpsc::error::SendError<_>>(futures::join!( - md5hasher.finalize(), - sha256hasher.finalize() - )) + Ok::<_, mpsc::error::SendError<_>>(checksummer) }; let (block_tx3, mut block_rx3) = mpsc::channel::>(1); @@ -395,33 +393,28 @@ pub(crate) async fn read_and_put_blocks> + match next { Ok(block) => { let unencrypted_len = block.len() as u64; - let block = if encryption.is_encrypted() { - let res = - tokio::task::spawn_blocking(move || encryption.encrypt_block(block)) - .with_context(Context::current_with_span( - tracer.start("Encrypt block"), - )) - .await - .unwrap(); - match res { - Ok(b) => b, - Err(e) => { - block_tx3.send(Err(e)).await?; - break; + let res = tokio::task::spawn_blocking(move || { + let block = encryption.encrypt_block(block)?; + let hash = blake2sum(&block); + Ok((block, hash)) + }) + .with_context(Context::current_with_span( + tracer.start("Encrypt and hash (blake2) block"), + )) + .await + .unwrap(); + match res { + Ok((block, hash)) => { + if first_block_hash.is_none() { + first_block_hash = Some(hash); } + block_tx3.send(Ok((block, unencrypted_len, hash))).await?; + } + Err(e) => { + block_tx3.send(Err(e)).await?; + break; } - } else { - block - }; - let hash = async_blake2sum(block.clone()) - .with_context(Context::current_with_span( - tracer.start("Hash block (blake2)"), - )) - .await; - if first_block_hash.is_none() { - first_block_hash = Some(hash); } - block_tx3.send(Ok((block, unencrypted_len, hash))).await?; } Err(e) => { block_tx3.send(Err(e)).await?; @@ -493,12 +486,10 @@ pub(crate) async fn read_and_put_blocks> + let total_size = final_result?; // unwrap here is ok, because if hasher failed, it is because something failed // later in the pipeline which already caused a return at the ? on previous line - let (data_md5sum, data_sha256sum) = stream_hash_result.unwrap(); let first_block_hash = block_hash_result.unwrap(); + let checksums = stream_hash_result.unwrap().finalize(); - let data_sha256sum = Hash::try_from(&data_sha256sum[..]).unwrap(); - - Ok((total_size, data_md5sum, data_sha256sum, first_block_hash)) + Ok((total_size, checksums, first_block_hash)) } async fn put_block_and_meta( @@ -609,7 +600,7 @@ impl Drop for InterruptedCleanup { // ============ helpers ============ -pub(crate) fn get_headers(headers: &HeaderMap) -> Result { +pub(crate) fn get_headers(headers: &HeaderMap) -> Result { let mut ret = Vec::new(); // Preserve standard headers @@ -637,7 +628,7 @@ pub(crate) fn get_headers(headers: &HeaderMap) -> Result) -> u64 { diff --git a/src/api/s3/xml.rs b/src/api/s3/xml.rs index 06f11288..1e569ade 100644 --- a/src/api/s3/xml.rs +++ b/src/api/s3/xml.rs @@ -131,6 +131,14 @@ pub struct CompleteMultipartUploadResult { pub key: Value, #[serde(rename = "ETag")] pub etag: Value, + #[serde(rename = "ChecksumCRC32")] + pub checksum_crc32: Option, + #[serde(rename = "ChecksumCRC32C")] + pub checksum_crc32c: Option, + #[serde(rename = "ChecksumSHA1")] + pub checksum_sha1: Option, + #[serde(rename = "ChecksumSHA256")] + pub checksum_sha256: Option, } #[derive(Debug, Serialize, PartialEq, Eq)] @@ -197,6 +205,14 @@ pub struct PartItem { pub part_number: IntValue, #[serde(rename = "Size")] pub size: IntValue, + #[serde(rename = "ChecksumCRC32")] + pub checksum_crc32: Option, + #[serde(rename = "ChecksumCRC32C")] + pub checksum_crc32c: Option, + #[serde(rename = "ChecksumSHA1")] + pub checksum_sha1: Option, + #[serde(rename = "ChecksumSHA256")] + pub checksum_sha256: Option, } #[derive(Debug, Serialize, PartialEq, Eq)] @@ -500,6 +516,10 @@ mod tests { bucket: Value("mybucket".to_string()), key: Value("a/plop".to_string()), etag: Value("\"3858f62230ac3c915f300c664312c11f-9\"".to_string()), + checksum_crc32: None, + checksum_crc32c: None, + checksum_sha1: Some(Value("ZJAnHyG8PeKz9tI8UTcHrJos39A=".into())), + checksum_sha256: None, }; assert_eq!( to_xml_with_header(&result)?, @@ -509,6 +529,7 @@ mod tests { mybucket\ a/plop\ "3858f62230ac3c915f300c664312c11f-9"\ + ZJAnHyG8PeKz9tI8UTcHrJos39A=\ " ); Ok(()) @@ -780,12 +801,22 @@ mod tests { last_modified: Value("2010-11-10T20:48:34.000Z".to_string()), part_number: IntValue(2), size: IntValue(10485760), + checksum_crc32: None, + checksum_crc32c: None, + checksum_sha256: Some(Value( + "5RQ3A5uk0w7ojNjvegohch4JRBBGN/cLhsNrPzfv/hA=".into(), + )), + checksum_sha1: None, }, PartItem { etag: Value("\"aaaa18db4cc2f85cedef654fccc4a4x8\"".to_string()), last_modified: Value("2010-11-10T20:48:33.000Z".to_string()), part_number: IntValue(3), size: IntValue(10485760), + checksum_sha256: None, + checksum_crc32c: None, + checksum_crc32: Some(Value("ZJAnHyG8=".into())), + checksum_sha1: None, }, ], initiator: Initiator { @@ -820,12 +851,14 @@ mod tests { 2010-11-10T20:48:34.000Z\ 2\ 10485760\ + 5RQ3A5uk0w7ojNjvegohch4JRBBGN/cLhsNrPzfv/hA=\ \ \ "aaaa18db4cc2f85cedef654fccc4a4x8"\ 2010-11-10T20:48:33.000Z\ 3\ 10485760\ + ZJAnHyG8=\ \ \ umat-user-11116a31-17b5-4fb7-9df5-b288870f11xx\ diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 53449a1c..17da68f8 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -42,6 +42,7 @@ tracing.workspace = true tracing-subscriber.workspace = true rand.workspace = true async-trait.workspace = true +sha1.workspace = true sodiumoxide.workspace = true structopt.workspace = true git-version.workspace = true diff --git a/src/garage/tests/s3/multipart.rs b/src/garage/tests/s3/multipart.rs index 51c9df74..cc424f59 100644 --- a/src/garage/tests/s3/multipart.rs +++ b/src/garage/tests/s3/multipart.rs @@ -1,6 +1,7 @@ use crate::common; use aws_sdk_s3::primitives::ByteStream; -use aws_sdk_s3::types::{CompletedMultipartUpload, CompletedPart}; +use aws_sdk_s3::types::{ChecksumAlgorithm, CompletedMultipartUpload, CompletedPart}; +use base64::prelude::*; const SZ_5MB: usize = 5 * 1024 * 1024; const SZ_10MB: usize = 10 * 1024 * 1024; @@ -189,6 +190,153 @@ async fn test_multipart_upload() { } } +#[tokio::test] +async fn test_multipart_with_checksum() { + let ctx = common::context(); + let bucket = ctx.create_bucket("testmpu-cksum"); + + let u1 = vec![0x11; SZ_5MB]; + let u2 = vec![0x22; SZ_5MB]; + let u3 = vec![0x33; SZ_5MB]; + + let ck1 = calculate_sha1(&u1); + let ck2 = calculate_sha1(&u2); + let ck3 = calculate_sha1(&u3); + + let up = ctx + .client + .create_multipart_upload() + .bucket(&bucket) + .checksum_algorithm(ChecksumAlgorithm::Sha1) + .key("a") + .send() + .await + .unwrap(); + assert!(up.upload_id.is_some()); + + let uid = up.upload_id.as_ref().unwrap(); + + let p1 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number(1) + .checksum_sha1(&ck1) + .body(ByteStream::from(u1.clone())) + .send() + .await + .unwrap(); + + // wrong checksum value should return an error + let err1 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number(2) + .checksum_sha1(&ck1) + .body(ByteStream::from(u2.clone())) + .send() + .await; + assert!(err1.is_err()); + + let p2 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number(2) + .checksum_sha1(&ck2) + .body(ByteStream::from(u2)) + .send() + .await + .unwrap(); + + let p3 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number(3) + .checksum_sha1(&ck3) + .body(ByteStream::from(u3.clone())) + .send() + .await + .unwrap(); + + { + let r = ctx + .client + .list_parts() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .send() + .await + .unwrap(); + let parts = r.parts.unwrap(); + assert_eq!(parts.len(), 3); + assert!(parts[0].checksum_crc32.is_none()); + assert!(parts[0].checksum_crc32_c.is_none()); + assert!(parts[0].checksum_sha256.is_none()); + assert_eq!(parts[0].checksum_sha1.as_deref().unwrap(), ck1); + assert_eq!(parts[1].checksum_sha1.as_deref().unwrap(), ck2); + assert_eq!(parts[2].checksum_sha1.as_deref().unwrap(), ck3); + } + + let cmp = CompletedMultipartUpload::builder() + .parts( + CompletedPart::builder() + .part_number(1) + .checksum_sha1(&ck1) + .e_tag(p1.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(2) + .checksum_sha1(&ck2) + .e_tag(p2.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(3) + .checksum_sha1(&ck3) + .e_tag(p3.e_tag.unwrap()) + .build(), + ) + .build(); + + let expected_checksum = calculate_sha1( + &vec![ + BASE64_STANDARD.decode(&ck1).unwrap(), + BASE64_STANDARD.decode(&ck2).unwrap(), + BASE64_STANDARD.decode(&ck3).unwrap(), + ] + .concat(), + ); + + let res = ctx + .client + .complete_multipart_upload() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .checksum_sha1(expected_checksum.clone()) + .multipart_upload(cmp) + .send() + .await + .unwrap(); + + assert_eq!(res.checksum_sha1, Some(expected_checksum)); +} + #[tokio::test] async fn test_uploadlistpart() { let ctx = common::context(); @@ -624,3 +772,11 @@ async fn test_uploadpartcopy() { assert_eq!(real_obj.len(), exp_obj.len()); assert_eq!(real_obj, exp_obj); } + +fn calculate_sha1(bytes: &[u8]) -> String { + use sha1::{Digest, Sha1}; + + let mut hasher = Sha1::new(); + hasher.update(bytes); + BASE64_STANDARD.encode(&hasher.finalize()[..]) +} diff --git a/src/model/s3/mpu_table.rs b/src/model/s3/mpu_table.rs index 238cbf11..c9f79caf 100644 --- a/src/model/s3/mpu_table.rs +++ b/src/model/s3/mpu_table.rs @@ -17,6 +17,7 @@ pub const PARTS: &str = "parts"; pub const BYTES: &str = "bytes"; mod v09 { + use crate::s3::object_table::ChecksumValue; use garage_util::crdt; use garage_util::data::Uuid; use serde::{Deserialize, Serialize}; @@ -61,6 +62,9 @@ mod v09 { pub version: Uuid, /// ETag of the content of this part (known only once done uploading) pub etag: Option, + /// Checksum requested by x-amz-checksum-algorithm + #[serde(default)] + pub checksum: Option, /// Size of this part (known only once done uploading) pub size: Option, } @@ -155,6 +159,11 @@ impl Crdt for MpuPart { (Some(x), Some(y)) if x < y => other.size, (x, _) => x, }; + self.checksum = match (self.checksum.take(), &other.checksum) { + (None, Some(_)) => other.checksum.clone(), + (Some(x), Some(y)) if x < *y => other.checksum.clone(), + (x, _) => x, + }; } } diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index eedb9615..b2f25803 100644 --- a/src/model/s3/object_table.rs +++ b/src/model/s3/object_table.rs @@ -208,6 +208,8 @@ mod v010 { Uploading { /// Indicates whether this is a multipart upload multipart: bool, + /// Checksum algorithm to use + checksum_algorithm: Option, /// Encryption params + headers to be included in the final object encryption: ObjectVersionEncryption, }, @@ -247,10 +249,10 @@ mod v010 { #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub enum ObjectVersionEncryption { SseC { - /// Encrypted serialized ObjectVersionHeaders struct. + /// Encrypted serialized ObjectVersionInner struct. /// This is never compressed, just encrypted using AES256-GCM. #[serde(with = "serde_bytes")] - headers: Vec, + inner: Vec, /// Whether data blocks are compressed in addition to being encrypted /// (compression happens before encryption, whereas for non-encrypted /// objects, compression is handled at the level of the block manager) @@ -258,13 +260,35 @@ mod v010 { }, Plaintext { /// Plain-text headers - headers: ObjectVersionHeaders, + inner: ObjectVersionMetaInner, }, } /// Vector of headers, as tuples of the format (header name, header value) #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] - pub struct ObjectVersionHeaders(pub Vec<(String, String)>); + pub struct ObjectVersionMetaInner { + pub headers: HeaderList, + pub checksum: Option, + } + + pub type HeaderList = Vec<(String, String)>; + + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub enum ChecksumAlgorithm { + Crc32, + Crc32c, + Sha1, + Sha256, + } + + /// Checksum value for x-amz-checksum-algorithm + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub enum ChecksumValue { + Crc32(#[serde(with = "serde_bytes")] [u8; 4]), + Crc32c(#[serde(with = "serde_bytes")] [u8; 4]), + Sha1(#[serde(with = "serde_bytes")] [u8; 20]), + Sha256(#[serde(with = "serde_bytes")] [u8; 32]), + } impl garage_util::migrate::Migrate for Object { const VERSION_MARKER: &'static [u8] = b"G010s3ob"; @@ -288,6 +312,7 @@ mod v010 { v09::ObjectVersionState::Uploading { multipart, headers } => { ObjectVersionState::Uploading { multipart, + checksum_algorithm: None, encryption: migrate_headers(headers), } } @@ -331,15 +356,18 @@ mod v010 { } ObjectVersionEncryption::Plaintext { - headers: ObjectVersionHeaders(new_headers), + inner: ObjectVersionMetaInner { + headers: new_headers, + checksum: None, + }, } } // Since ObjectVersionHeaders can now be serialized independently, for the // purpose of being encrypted, we need it to support migrations on its own // as well. - impl garage_util::migrate::InitialFormat for ObjectVersionHeaders { - const VERSION_MARKER: &'static [u8] = b"G010s3oh"; + impl garage_util::migrate::InitialFormat for ObjectVersionMetaInner { + const VERSION_MARKER: &'static [u8] = b"G010s3om"; } } @@ -454,6 +482,17 @@ impl Entry for Object { } } +impl ChecksumValue { + pub fn algorithm(&self) -> ChecksumAlgorithm { + match self { + ChecksumValue::Crc32(_) => ChecksumAlgorithm::Crc32, + ChecksumValue::Crc32c(_) => ChecksumAlgorithm::Crc32c, + ChecksumValue::Sha1(_) => ChecksumAlgorithm::Sha1, + ChecksumValue::Sha256(_) => ChecksumAlgorithm::Sha256, + } + } +} + impl Crdt for Object { fn merge(&mut self, other: &Self) { // Merge versions from other into here diff --git a/src/util/async_hash.rs b/src/util/async_hash.rs deleted file mode 100644 index 5631ea6b..00000000 --- a/src/util/async_hash.rs +++ /dev/null @@ -1,61 +0,0 @@ -use bytes::Bytes; -use digest::Digest; - -use tokio::sync::mpsc; -use tokio::task::JoinHandle; - -use crate::data::*; - -/// Compute the sha256 of a slice, -/// spawning on a tokio thread for CPU-intensive processing -/// The argument has to be an owned Bytes, as it is moved out to a new thread. -pub async fn async_sha256sum(data: Bytes) -> Hash { - tokio::task::spawn_blocking(move || sha256sum(&data)) - .await - .unwrap() -} - -/// Compute the blake2sum of a slice, -/// spawning on a tokio thread for CPU-intensive processing. -/// The argument has to be an owned Bytes, as it is moved out to a new thread. -pub async fn async_blake2sum(data: Bytes) -> Hash { - tokio::task::spawn_blocking(move || blake2sum(&data)) - .await - .unwrap() -} - -// ---- - -pub struct AsyncHasher { - sendblk: mpsc::Sender, - task: JoinHandle>, -} - -impl AsyncHasher { - pub fn new() -> Self { - let (sendblk, mut recvblk) = mpsc::channel::(1); - let task = tokio::task::spawn_blocking(move || { - let mut digest = D::new(); - while let Some(blk) = recvblk.blocking_recv() { - digest.update(&blk[..]); - } - digest.finalize() - }); - Self { sendblk, task } - } - - pub async fn update(&self, b: Bytes) { - self.sendblk.send(b).await.unwrap(); - } - - pub async fn finalize(self) -> digest::Output { - drop(self.sendblk); - self.task.await.unwrap() - } -} - -impl Default for AsyncHasher { - fn default() -> Self { - Self::new() - } -} diff --git a/src/util/lib.rs b/src/util/lib.rs index 7df77959..8b035ff0 100644 --- a/src/util/lib.rs +++ b/src/util/lib.rs @@ -3,7 +3,6 @@ #[macro_use] extern crate tracing; -pub mod async_hash; pub mod background; pub mod config; pub mod crdt; -- cgit v1.2.3 From c0eeb0b0f32ed0a27cfdf9297d0e71e1b9948b73 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 27 Mar 2024 10:44:03 +0100 Subject: [next-0.10] fixes to k2v rpc + comment fixes --- src/model/k2v/rpc.rs | 13 ++++--------- src/model/s3/object_table.rs | 2 +- src/rpc/layout/helper.rs | 4 ++++ src/rpc/layout/history.rs | 6 ++++++ 4 files changed, 15 insertions(+), 10 deletions(-) (limited to 'src') diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs index e15f2df8..95ff2d18 100644 --- a/src/model/k2v/rpc.rs +++ b/src/model/k2v/rpc.rs @@ -219,12 +219,11 @@ impl K2VRpcHandler { }, sort_key, }; - // TODO figure this out with write sets, is it still appropriate??? let nodes = self .item_table .data .replication - .read_nodes(&poll_key.partition.hash()); + .storage_nodes(&poll_key.partition.hash()); let rpc = self.system.rpc_helper().try_call_many( &self.endpoint, @@ -239,8 +238,7 @@ impl K2VRpcHandler { .send_all_at_once(true) .without_timeout(), ); - let timeout_duration = - Duration::from_millis(timeout_msec) + self.system.rpc_helper().rpc_timeout(); + let timeout_duration = Duration::from_millis(timeout_msec); let resps = select! { r = rpc => r?, _ = tokio::time::sleep(timeout_duration) => return Ok(None), @@ -282,12 +280,11 @@ impl K2VRpcHandler { seen.restrict(&range); // Prepare PollRange RPC to send to the storage nodes responsible for the parititon - // TODO figure this out with write sets, does it still work???? let nodes = self .item_table .data .replication - .read_nodes(&range.partition.hash()); + .storage_nodes(&range.partition.hash()); let quorum = self.item_table.data.replication.read_quorum(); let msg = K2VRpc::PollRange { range, @@ -320,9 +317,7 @@ impl K2VRpcHandler { // kind: all items produced by that node until time ts have been returned, so we can // bump the entry in the global vector clock and possibly remove some item-specific // vector clocks) - let mut deadline = Instant::now() - + Duration::from_millis(timeout_msec) - + self.system.rpc_helper().rpc_timeout(); + let mut deadline = Instant::now() + Duration::from_millis(timeout_msec); let mut resps = vec![]; let mut errors = vec![]; loop { diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index b2f25803..5c721148 100644 --- a/src/model/s3/object_table.rs +++ b/src/model/s3/object_table.rs @@ -363,7 +363,7 @@ mod v010 { } } - // Since ObjectVersionHeaders can now be serialized independently, for the + // Since ObjectVersionMetaInner can now be serialized independently, for the // purpose of being encrypted, we need it to support migrations on its own // as well. impl garage_util::migrate::InitialFormat for ObjectVersionMetaInner { diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 2835347a..e3096945 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -153,10 +153,14 @@ impl LayoutHelper { // ------------------ read helpers --------------- + /// Return all nodes that have a role (gateway or storage) + /// in one of the currently active layout versions pub fn all_nodes(&self) -> &[Uuid] { &self.all_nodes } + /// Return all nodes that are configured to store data + /// in one of the currently active layout versions pub fn all_nongateway_nodes(&self) -> &[Uuid] { &self.all_nongateway_nodes } diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 290f058d..af2cbc63 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -27,14 +27,18 @@ impl LayoutHistory { // ------------------ who stores what now? --------------- + /// Returns the layout version with the highest number pub fn current(&self) -> &LayoutVersion { self.versions.last().as_ref().unwrap() } + /// Returns the version number of the oldest layout version still active pub fn min_stored(&self) -> u64 { self.versions.first().as_ref().unwrap().version } + /// Calculate the set of all nodes that have a role (gateway or storage) + /// in one of the currently active layout versions pub fn get_all_nodes(&self) -> Vec { if self.versions.len() == 1 { self.versions[0].all_nodes().to_vec() @@ -48,6 +52,8 @@ impl LayoutHistory { } } + /// Calculate the set of all nodes that are configured to store data + /// in one of the currently active layout versions pub(crate) fn get_all_nongateway_nodes(&self) -> Vec { if self.versions.len() == 1 { self.versions[0].nongateway_nodes().to_vec() -- cgit v1.2.3 From 01a0bd54106941156ca998be1a44b8ac2c3aa74a Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 27 Mar 2024 13:32:13 +0100 Subject: [next-0.10] remove impl Deref for LayoutHelper --- src/api/admin/cluster.rs | 10 ++++----- src/rpc/layout/helper.rs | 54 +++++++++++++++++++++++------------------------ src/rpc/layout/manager.rs | 22 +++++++++---------- src/rpc/rpc_helper.rs | 20 ++++++++---------- src/rpc/system.rs | 6 +++--- 5 files changed, 55 insertions(+), 57 deletions(-) (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 8c9cb1e5..e5877fcd 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -78,7 +78,7 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result) -> Result, Error> { - let res = format_cluster_layout(&garage.system.cluster_layout()); + let res = format_cluster_layout(garage.system.cluster_layout().inner()); Ok(json_ok_response(&res)?) } @@ -295,7 +295,7 @@ pub async fn handle_update_cluster_layout( ) -> Result, Error> { let updates = parse_json_body::(req).await?; - let mut layout = garage.system.cluster_layout().clone(); + let mut layout = garage.system.cluster_layout().inner().clone(); let mut roles = layout.current().roles.clone(); roles.merge(&layout.staging.get().roles); @@ -341,7 +341,7 @@ pub async fn handle_apply_cluster_layout( ) -> Result, Error> { let param = parse_json_body::(req).await?; - let layout = garage.system.cluster_layout().clone(); + let layout = garage.system.cluster_layout().inner().clone(); let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; garage @@ -360,7 +360,7 @@ pub async fn handle_apply_cluster_layout( pub async fn handle_revert_cluster_layout( garage: &Arc, ) -> Result, Error> { - let layout = garage.system.cluster_layout().clone(); + let layout = garage.system.cluster_layout().inner().clone(); let layout = layout.revert_staged_changes()?; garage .system diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index e3096945..ddf8fd44 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -1,5 +1,4 @@ use std::collections::HashMap; -use std::ops::Deref; use std::sync::atomic::{AtomicUsize, Ordering}; use serde::{Deserialize, Serialize}; @@ -49,13 +48,6 @@ pub struct LayoutHelper { pub(crate) ack_lock: HashMap, } -impl Deref for LayoutHelper { - type Target = LayoutHistory; - fn deref(&self) -> &LayoutHistory { - self.layout() - } -} - impl LayoutHelper { pub fn new( replication_factor: ReplicationFactor, @@ -131,10 +123,6 @@ impl LayoutHelper { // ------------------ single updating function -------------- - fn layout(&self) -> &LayoutHistory { - self.layout.as_ref().unwrap() - } - pub(crate) fn update(&mut self, f: F) -> bool where F: FnOnce(&mut LayoutHistory) -> bool, @@ -153,6 +141,18 @@ impl LayoutHelper { // ------------------ read helpers --------------- + pub fn inner(&self) -> &LayoutHistory { + self.layout.as_ref().unwrap() + } + + pub fn current(&self) -> &LayoutVersion { + self.inner().current() + } + + pub fn versions(&self) -> &[LayoutVersion] { + &self.inner().versions + } + /// Return all nodes that have a role (gateway or storage) /// in one of the currently active layout versions pub fn all_nodes(&self) -> &[Uuid] { @@ -175,20 +175,19 @@ impl LayoutHelper { pub fn sync_digest(&self) -> SyncLayoutDigest { SyncLayoutDigest { - current: self.layout().current().version, + current: self.current().version, ack_map_min: self.ack_map_min(), - min_stored: self.layout().min_stored(), + min_stored: self.inner().min_stored(), } } pub fn read_nodes_of(&self, position: &Hash) -> Vec { let sync_min = self.sync_map_min; let version = self - .layout() - .versions + .versions() .iter() .find(|x| x.version == sync_min) - .or(self.layout().versions.last()) + .or(self.versions().last()) .unwrap(); version .nodes_of(position, version.replication_factor) @@ -196,8 +195,7 @@ impl LayoutHelper { } pub fn storage_sets_of(&self, position: &Hash) -> Vec> { - self.layout() - .versions + self.versions() .iter() .map(|x| x.nodes_of(position, x.replication_factor).collect()) .collect() @@ -205,7 +203,7 @@ impl LayoutHelper { pub fn storage_nodes_of(&self, position: &Hash) -> Vec { let mut ret = vec![]; - for version in self.layout().versions.iter() { + for version in self.versions().iter() { ret.extend(version.nodes_of(position, version.replication_factor)); } ret.sort(); @@ -224,7 +222,7 @@ impl LayoutHelper { pub fn digest(&self) -> RpcLayoutDigest { RpcLayoutDigest { current_version: self.current().version, - active_versions: self.versions.len(), + active_versions: self.versions().len(), trackers_hash: self.trackers_hash, staging_hash: self.staging_hash, } @@ -246,13 +244,16 @@ impl LayoutHelper { // 3. Acknowledge everyone has synced up to min(self.sync_map) self.sync_ack(local_node_id); - debug!("ack_map: {:?}", self.update_trackers.ack_map); - debug!("sync_map: {:?}", self.update_trackers.sync_map); - debug!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); + debug!("ack_map: {:?}", self.inner().update_trackers.ack_map); + debug!("sync_map: {:?}", self.inner().update_trackers.sync_map); + debug!( + "sync_ack_map: {:?}", + self.inner().update_trackers.sync_ack_map + ); } fn sync_first(&mut self, local_node_id: Uuid) { - let first_version = self.min_stored(); + let first_version = self.inner().min_stored(); self.update(|layout| { layout .update_trackers @@ -286,8 +287,7 @@ impl LayoutHelper { } pub(crate) fn max_free_ack(&self) -> u64 { - self.layout() - .versions + self.versions() .iter() .map(|x| x.version) .skip_while(|v| { diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 8a6eb1c3..3866f867 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -109,7 +109,7 @@ impl LayoutManager { } pub fn add_table(&self, table_name: &'static str) { - let first_version = self.layout().versions.first().unwrap().version; + let first_version = self.layout().versions().first().unwrap().version; self.table_sync_version .lock() @@ -127,7 +127,7 @@ impl LayoutManager { if layout.update(|l| l.update_trackers.sync_map.set_max(self.node_id, sync_until)) { info!("sync_until updated to {}", sync_until); self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( - layout.update_trackers.clone(), + layout.inner().update_trackers.clone(), )); } } @@ -136,7 +136,7 @@ impl LayoutManager { let mut layout = self.layout.write().unwrap(); if layout.ack_max_free(self.node_id) { self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( - layout.update_trackers.clone(), + layout.inner().update_trackers.clone(), )); } } @@ -160,16 +160,16 @@ impl LayoutManager { fn merge_layout(&self, adv: &LayoutHistory) -> Option { let mut layout = self.layout.write().unwrap(); let prev_digest = layout.digest(); - let prev_layout_check = layout.check().is_ok(); + let prev_layout_check = layout.inner().check().is_ok(); if !prev_layout_check || adv.check().is_ok() { if layout.update(|l| l.merge(adv)) { layout.update_trackers(self.node_id); - if prev_layout_check && layout.check().is_err() { + if prev_layout_check && layout.inner().check().is_err() { panic!("Merged two correct layouts and got an incorrect layout."); } assert!(layout.digest() != prev_digest); - return Some(layout.clone()); + return Some(layout.inner().clone()); } } @@ -180,11 +180,11 @@ impl LayoutManager { let mut layout = self.layout.write().unwrap(); let prev_digest = layout.digest(); - if layout.update_trackers != *adv { + if layout.inner().update_trackers != *adv { if layout.update(|l| l.update_trackers.merge(adv)) { layout.update_trackers(self.node_id); assert!(layout.digest() != prev_digest); - return Some(layout.update_trackers.clone()); + return Some(layout.inner().update_trackers.clone()); } } @@ -230,7 +230,7 @@ impl LayoutManager { /// Save cluster layout data to disk async fn save_cluster_layout(&self) -> Result<(), Error> { - let layout = self.layout.read().unwrap().clone(); + let layout = self.layout.read().unwrap().inner().clone(); self.persist_cluster_layout .save_async(&layout) .await @@ -278,13 +278,13 @@ impl LayoutManager { } pub(crate) fn handle_pull_cluster_layout(&self) -> SystemRpc { - let layout = self.layout.read().unwrap().clone(); + let layout = self.layout.read().unwrap().inner().clone(); SystemRpc::AdvertiseClusterLayout(layout) } pub(crate) fn handle_pull_cluster_layout_trackers(&self) -> SystemRpc { let layout = self.layout.read().unwrap(); - SystemRpc::AdvertiseClusterLayoutTrackers(layout.update_trackers.clone()) + SystemRpc::AdvertiseClusterLayoutTrackers(layout.inner().update_trackers.clone()) } pub(crate) async fn handle_advertise_cluster_layout( diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 977c6ed8..05fdcce4 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -26,7 +26,7 @@ use garage_util::data::*; use garage_util::error::Error; use garage_util::metrics::RecordDuration; -use crate::layout::{LayoutHelper, LayoutHistory}; +use crate::layout::{LayoutHelper, LayoutVersion}; use crate::metrics::RpcMetrics; // Default RPC timeout = 5 minutes @@ -304,7 +304,8 @@ impl RpcHelper { // preemptively send an additional request to any remaining nodes. // Reorder requests to priorize closeness / low latency - let request_order = self.request_order(&self.0.layout.read().unwrap(), to.iter().copied()); + let request_order = + self.request_order(&self.0.layout.read().unwrap().current(), to.iter().copied()); let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false); // Build future for each request @@ -497,16 +498,16 @@ impl RpcHelper { let mut ret = Vec::with_capacity(12); let ver_iter = layout - .versions + .versions() .iter() .rev() - .chain(layout.old_versions.iter().rev()); + .chain(layout.inner().old_versions.iter().rev()); for ver in ver_iter { if ver.version > layout.sync_map_min() { continue; } let nodes = ver.nodes_of(position, ver.replication_factor); - for node in rpc_helper.request_order(&layout, nodes) { + for node in rpc_helper.request_order(layout.current(), nodes) { if !ret.contains(&node) { ret.push(node); } @@ -517,15 +518,12 @@ impl RpcHelper { fn request_order( &self, - layout: &LayoutHistory, + layout: &LayoutVersion, nodes: impl Iterator, ) -> Vec { // Retrieve some status variables that we will use to sort requests let peer_list = self.0.peering.get_peer_list(); - let our_zone = layout - .current() - .get_node_zone(&self.0.our_node_id) - .unwrap_or(""); + let our_zone = layout.get_node_zone(&self.0.our_node_id).unwrap_or(""); // Augment requests with some information used to sort them. // The tuples are as follows: @@ -535,7 +533,7 @@ impl RpcHelper { // and within a same zone we priorize nodes with the lowest latency. let mut nodes = nodes .map(|to| { - let peer_zone = layout.current().get_node_zone(&to).unwrap_or(""); + let peer_zone = layout.get_node_zone(&to).unwrap_or(""); let peer_avg_ping = peer_list .iter() .find(|x| x.id.as_ref() == to.as_slice()) diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 9da1b176..b38e2e01 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -451,7 +451,7 @@ impl System { // Obtain information about nodes that have a role as storage nodes // in one of the active layout versions let mut storage_nodes = HashSet::::with_capacity(16); - for ver in layout.versions.iter() { + for ver in layout.versions().iter() { storage_nodes.extend( ver.roles .items() @@ -470,7 +470,7 @@ impl System { let mut partitions_all_ok = 0; for (_, hash) in partitions.iter() { let mut write_sets = layout - .versions + .versions() .iter() .map(|x| x.nodes_of(hash, x.replication_factor)); let has_quorum = write_sets @@ -634,7 +634,7 @@ impl System { .filter(|p| p.is_up()) .count(); - let not_configured = self.cluster_layout().check().is_err(); + let not_configured = self.cluster_layout().inner().check().is_err(); let no_peers = n_connected < self.replication_factor.into(); let expected_n_nodes = self.cluster_layout().all_nodes().len(); let bad_peers = n_connected != expected_n_nodes; -- cgit v1.2.3 From 32f1786f9ff17f12911f5f3f37e2d1c35d534f59 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 27 Mar 2024 13:37:20 +0100 Subject: [next-0.10] cache layout check result --- src/rpc/layout/helper.rs | 8 ++++++++ src/rpc/layout/manager.rs | 4 ++-- src/rpc/system.rs | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index ddf8fd44..b15f7540 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -41,6 +41,7 @@ pub struct LayoutHelper { trackers_hash: Hash, staging_hash: Hash, + is_check_ok: bool, // ack lock: counts in-progress write operations for each // layout version ; we don't increase the ack update tracker @@ -107,6 +108,8 @@ impl LayoutHelper { .entry(layout.current().version) .or_insert(AtomicUsize::new(0)); + let is_check_ok = layout.check().is_ok(); + LayoutHelper { replication_factor, consistency_mode, @@ -118,6 +121,7 @@ impl LayoutHelper { trackers_hash, staging_hash, ack_lock, + is_check_ok, } } @@ -153,6 +157,10 @@ impl LayoutHelper { &self.inner().versions } + pub fn is_check_ok(&self) -> bool { + self.is_check_ok + } + /// Return all nodes that have a role (gateway or storage) /// in one of the currently active layout versions pub fn all_nodes(&self) -> &[Uuid] { diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 3866f867..0ca532ba 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -160,12 +160,12 @@ impl LayoutManager { fn merge_layout(&self, adv: &LayoutHistory) -> Option { let mut layout = self.layout.write().unwrap(); let prev_digest = layout.digest(); - let prev_layout_check = layout.inner().check().is_ok(); + let prev_layout_check = layout.is_check_ok(); if !prev_layout_check || adv.check().is_ok() { if layout.update(|l| l.merge(adv)) { layout.update_trackers(self.node_id); - if prev_layout_check && layout.inner().check().is_err() { + if prev_layout_check && !layout.is_check_ok() { panic!("Merged two correct layouts and got an incorrect layout."); } assert!(layout.digest() != prev_digest); diff --git a/src/rpc/system.rs b/src/rpc/system.rs index b38e2e01..91a42415 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -634,7 +634,7 @@ impl System { .filter(|p| p.is_up()) .count(); - let not_configured = self.cluster_layout().inner().check().is_err(); + let not_configured = !self.cluster_layout().is_check_ok(); let no_peers = n_connected < self.replication_factor.into(); let expected_n_nodes = self.cluster_layout().all_nodes().len(); let bad_peers = n_connected != expected_n_nodes; -- cgit v1.2.3 From 4eba32f29fceea5ab19e44900f8d3a6864989d55 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 27 Mar 2024 13:47:06 +0100 Subject: [next-0.10] layout helper: rename & clarify updates to update trackers --- src/rpc/layout/helper.rs | 57 +++++++++++++++++++---------------------------- src/rpc/layout/manager.rs | 8 +++---- 2 files changed, 27 insertions(+), 38 deletions(-) (limited to 'src') diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index b15f7540..3a033ab2 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -238,29 +238,15 @@ impl LayoutHelper { // ------------------ helpers for update tracking --------------- - pub(crate) fn update_trackers(&mut self, local_node_id: Uuid) { + pub(crate) fn update_update_trackers(&mut self, local_node_id: Uuid) { // Ensure trackers for this node's values are up-to-date // 1. Acknowledge the last layout version which is not currently // locked by an in-progress write operation - self.ack_max_free(local_node_id); + self.update_ack_to_max_free(local_node_id); // 2. Assume the data on this node is sync'ed up at least to // the first layout version in the history - self.sync_first(local_node_id); - - // 3. Acknowledge everyone has synced up to min(self.sync_map) - self.sync_ack(local_node_id); - - debug!("ack_map: {:?}", self.inner().update_trackers.ack_map); - debug!("sync_map: {:?}", self.inner().update_trackers.sync_map); - debug!( - "sync_ack_map: {:?}", - self.inner().update_trackers.sync_ack_map - ); - } - - fn sync_first(&mut self, local_node_id: Uuid) { let first_version = self.inner().min_stored(); self.update(|layout| { layout @@ -268,9 +254,8 @@ impl LayoutHelper { .sync_map .set_max(local_node_id, first_version) }); - } - fn sync_ack(&mut self, local_node_id: Uuid) { + // 3. Acknowledge everyone has synced up to min(self.sync_map) let sync_map_min = self.sync_map_min; self.update(|layout| { layout @@ -278,24 +263,18 @@ impl LayoutHelper { .sync_ack_map .set_max(local_node_id, sync_map_min) }); - } - pub(crate) fn ack_max_free(&mut self, local_node_id: Uuid) -> bool { - let max_ack = self.max_free_ack(); - let changed = self.update(|layout| { - layout - .update_trackers - .ack_map - .set_max(local_node_id, max_ack) - }); - if changed { - info!("ack_until updated to {}", max_ack); - } - changed + debug!("ack_map: {:?}", self.inner().update_trackers.ack_map); + debug!("sync_map: {:?}", self.inner().update_trackers.sync_map); + debug!( + "sync_ack_map: {:?}", + self.inner().update_trackers.sync_ack_map + ); } - pub(crate) fn max_free_ack(&self) -> u64 { - self.versions() + pub(crate) fn update_ack_to_max_free(&mut self, local_node_id: Uuid) -> bool { + let max_free = self + .versions() .iter() .map(|x| x.version) .skip_while(|v| { @@ -305,6 +284,16 @@ impl LayoutHelper { .unwrap_or(true) }) .next() - .unwrap_or(self.current().version) + .unwrap_or(self.current().version); + let changed = self.update(|layout| { + layout + .update_trackers + .ack_map + .set_max(local_node_id, max_free) + }); + if changed { + info!("ack_until updated to {}", max_free); + } + changed } } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 0ca532ba..a0dcf50e 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -70,7 +70,7 @@ impl LayoutManager { cluster_layout, Default::default(), ); - cluster_layout.update_trackers(node_id.into()); + cluster_layout.update_update_trackers(node_id.into()); let layout = Arc::new(RwLock::new(cluster_layout)); let change_notify = Arc::new(Notify::new()); @@ -134,7 +134,7 @@ impl LayoutManager { fn ack_new_version(self: &Arc) { let mut layout = self.layout.write().unwrap(); - if layout.ack_max_free(self.node_id) { + if layout.update_ack_to_max_free(self.node_id) { self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( layout.inner().update_trackers.clone(), )); @@ -164,7 +164,7 @@ impl LayoutManager { if !prev_layout_check || adv.check().is_ok() { if layout.update(|l| l.merge(adv)) { - layout.update_trackers(self.node_id); + layout.update_update_trackers(self.node_id); if prev_layout_check && !layout.is_check_ok() { panic!("Merged two correct layouts and got an incorrect layout."); } @@ -182,7 +182,7 @@ impl LayoutManager { if layout.inner().update_trackers != *adv { if layout.update(|l| l.update_trackers.merge(adv)) { - layout.update_trackers(self.node_id); + layout.update_update_trackers(self.node_id); assert!(layout.digest() != prev_digest); return Some(layout.inner().update_trackers.clone()); } -- cgit v1.2.3 From 25c196f34d958f4f61d50c89a1c5d40b96d7cd24 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 27 Mar 2024 13:55:49 +0100 Subject: [next-0.10] admin api: fix logic in get cluster status --- src/api/admin/cluster.rs | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index e5877fcd..357ac600 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -70,9 +70,7 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result { - if n.role.is_none() { - n.role = Some(role); - } + n.role = Some(role); } } } @@ -81,15 +79,21 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result Date: Thu, 28 Mar 2024 15:19:44 +0100 Subject: [next-0.10] bump version number to 1.0 --- src/api/Cargo.toml | 2 +- src/block/Cargo.toml | 2 +- src/db/Cargo.toml | 2 +- src/garage/Cargo.toml | 2 +- src/model/Cargo.toml | 2 +- src/net/Cargo.toml | 2 +- src/net/netapp.rs | 8 +++++--- src/net/peering.rs | 5 ++--- src/rpc/Cargo.toml | 2 +- src/rpc/system.rs | 2 +- src/table/Cargo.toml | 2 +- src/util/Cargo.toml | 2 +- src/web/Cargo.toml | 2 +- 13 files changed, 18 insertions(+), 17 deletions(-) (limited to 'src') diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index 1b87496c..a5645c26 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_api" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/block/Cargo.toml b/src/block/Cargo.toml index b5763120..7eb6bca8 100644 --- a/src/block/Cargo.toml +++ b/src/block/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_block" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml index b88298ee..ef5a8659 100644 --- a/src/db/Cargo.toml +++ b/src/db/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_db" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index a4acbb1f..9cc71abd 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 1e1ce0f7..25926080 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_model" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/net/Cargo.toml b/src/net/Cargo.toml index 4bd0d2e5..c12b39a4 100644 --- a/src/net/Cargo.toml +++ b/src/net/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_net" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/net/netapp.rs b/src/net/netapp.rs index 6480a126..f1e9f1ae 100644 --- a/src/net/netapp.rs +++ b/src/net/netapp.rs @@ -35,8 +35,10 @@ pub type NetworkKey = sodiumoxide::crypto::auth::Key; /// composed of 8 bytes for Netapp version and 8 bytes for client version pub(crate) type VersionTag = [u8; 16]; -/// Value of the Netapp version used in the version tag -pub(crate) const NETAPP_VERSION_TAG: u64 = 0x6e65746170700005; // netapp 0x0005 +/// Value of garage_net version used in the version tag +/// We are no longer using prefix `netapp` as garage_net is forked from the netapp crate. +/// Since Garage v1.0, we have replaced the prefix by `grgnet` (shorthand for garage_net). +pub(crate) const NETAPP_VERSION_TAG: u64 = 0x6772676e65740010; // grgnet 0x0010 (1.0) /// HelloMessage is sent by the client on a Netapp connection to indicate /// that they are also a server and ready to recieve incoming connections @@ -123,7 +125,7 @@ impl NetApp { netapp .hello_endpoint - .swap(Some(netapp.endpoint("__netapp/netapp.rs/Hello".into()))); + .swap(Some(netapp.endpoint("garage_net/netapp.rs/Hello".into()))); netapp .hello_endpoint .load_full() diff --git a/src/net/peering.rs b/src/net/peering.rs index b4271231..168162d9 100644 --- a/src/net/peering.rs +++ b/src/net/peering.rs @@ -237,14 +237,13 @@ impl PeeringManager { ); known_hosts.update_hash(); - // TODO for v0.10 / v1.0 : rename the endpoint (it will break compatibility) let strat = Arc::new(Self { netapp: netapp.clone(), known_hosts: RwLock::new(known_hosts), public_peer_list: ArcSwap::new(Arc::new(Vec::new())), next_ping_id: AtomicU64::new(42), - ping_endpoint: netapp.endpoint("__netapp/peering/fullmesh.rs/Ping".into()), - peer_list_endpoint: netapp.endpoint("__netapp/peering/fullmesh.rs/PeerList".into()), + ping_endpoint: netapp.endpoint("garage_net/peering.rs/Ping".into()), + peer_list_endpoint: netapp.endpoint("garage_net/peering.rs/PeerList".into()), ping_timeout_millis: DEFAULT_PING_TIMEOUT_MILLIS.into(), }); diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index 3e7ac635..43d5568e 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_rpc" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 91a42415..0e78060b 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -46,7 +46,7 @@ const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10); /// Version tag used for version check upon Netapp connection. /// Cluster nodes with different version tags are deemed /// incompatible and will refuse to connect. -pub const GARAGE_VERSION_TAG: u64 = 0x676172616765000A; // garage 0x000A +pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650010; // garage 0x0010 (1.0) /// RPC endpoint used for calls related to membership pub const SYSTEM_RPC_PATH: &str = "garage_rpc/system.rs/SystemRpc"; diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml index cac17da6..171118ea 100644 --- a/src/table/Cargo.toml +++ b/src/table/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_table" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index e4c31460..883c0aa4 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_util" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/web/Cargo.toml b/src/web/Cargo.toml index 49549c9b..f097755c 100644 --- a/src/web/Cargo.toml +++ b/src/web/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_web" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat ", "Quentin Dufour "] edition = "2018" license = "AGPL-3.0" -- cgit v1.2.3 From 90e3c2af915251720a4253f78f3f1b4ba844800d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 10 Apr 2024 14:35:30 +0200 Subject: [next-0.10] small updates to mention Garage v0.9.4 --- src/db/open.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/db/open.rs b/src/db/open.rs index 19bc96cc..b8de3cd7 100644 --- a/src/db/open.rs +++ b/src/db/open.rs @@ -36,7 +36,7 @@ impl std::str::FromStr for Engine { match text { "lmdb" | "heed" => Ok(Self::Lmdb), "sqlite" | "sqlite3" | "rusqlite" => Ok(Self::Sqlite), - "sled" => Err(Error("Sled is no longer supported as a database engine. Converting your old metadata db can be done using an older Garage binary (e.g. v0.9.3).".into())), + "sled" => Err(Error("Sled is no longer supported as a database engine. Converting your old metadata db can be done using an older Garage binary (e.g. v0.9.4).".into())), kind => Err(Error( format!( "Invalid DB engine: {} (options are: lmdb, sqlite)", -- cgit v1.2.3