From 0962313ebd45abb116d6ad2ee0eb754f587fc299 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 8 Nov 2023 13:11:13 +0100 Subject: garage_rpc: reorder functions in layout.rs --- src/rpc/layout.rs | 223 +++++++++++++++++++++++++++--------------------------- 1 file changed, 113 insertions(+), 110 deletions(-) diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index e02a180b..368a9d2c 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -278,86 +278,7 @@ impl ClusterLayout { ret } - fn calculate_staging_hash(&self) -> Hash { - let hashed_tuple = (&self.staging_roles, &self.staging_parameters); - blake2sum(&nonversioned_encode(&hashed_tuple).unwrap()[..]) - } - - pub fn merge(&mut self, other: &ClusterLayout) -> bool { - match other.version.cmp(&self.version) { - Ordering::Greater => { - *self = other.clone(); - true - } - Ordering::Equal => { - self.staging_parameters.merge(&other.staging_parameters); - self.staging_roles.merge(&other.staging_roles); - - let new_staging_hash = self.calculate_staging_hash(); - let changed = new_staging_hash != self.staging_hash; - - self.staging_hash = new_staging_hash; - - changed - } - Ordering::Less => false, - } - } - - pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self, Message), Error> { - match version { - None => { - let error = r#" -Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. -To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. 
- "#; - return Err(Error::Message(error.into())); - } - Some(v) => { - if v != self.version + 1 { - return Err(Error::Message("Invalid new layout version".into())); - } - } - } - - self.roles.merge(&self.staging_roles); - self.roles.retain(|(_, _, v)| v.0.is_some()); - self.parameters = *self.staging_parameters.get(); - - self.staging_roles.clear(); - self.staging_hash = self.calculate_staging_hash(); - - let msg = self.calculate_partition_assignment()?; - - self.version += 1; - - Ok((self, msg)) - } - - pub fn revert_staged_changes(mut self, version: Option) -> Result { - match version { - None => { - let error = r#" -Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. -To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. - "#; - return Err(Error::Message(error.into())); - } - Some(v) => { - if v != self.version + 1 { - return Err(Error::Message("Invalid new layout version".into())); - } - } - } - - self.staging_roles.clear(); - self.staging_parameters.update(self.parameters); - self.staging_hash = self.calculate_staging_hash(); - - self.version += 1; - - Ok(self) - } + // ===================== accessors ====================== /// Returns a list of IDs of nodes that currently have /// a role in the cluster @@ -377,28 +298,6 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - /// Returns the uuids of the non_gateway nodes in self.node_id_vec. 
- fn nongateway_nodes(&self) -> Vec { - let mut result = Vec::::new(); - for uuid in self.node_id_vec.iter() { - match self.node_role(uuid) { - Some(role) if role.capacity.is_some() => result.push(*uuid), - _ => (), - } - } - result - } - - /// Given a node uuids, this function returns the label of its zone - fn get_node_zone(&self, uuid: &Uuid) -> Result { - match self.node_role(uuid) { - Some(role) => Ok(role.zone.clone()), - _ => Err(Error::Message( - "The Uuid does not correspond to a node present in the cluster.".into(), - )), - } - } - /// Given a node uuids, this function returns its capacity or fails if it does not have any pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { match self.node_role(uuid) { @@ -435,6 +334,30 @@ To know the correct value of the new layout version, invoke `garage layout show` )) } + // ===================== internal information extractors ====================== + + /// Returns the uuids of the non_gateway nodes in self.node_id_vec. + fn nongateway_nodes(&self) -> Vec { + let mut result = Vec::::new(); + for uuid in self.node_id_vec.iter() { + match self.node_role(uuid) { + Some(role) if role.capacity.is_some() => result.push(*uuid), + _ => (), + } + } + result + } + + /// Given a node uuids, this function returns the label of its zone + fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { + match self.node_role(uuid) { + Some(role) => Ok(&role.zone), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the cluster.".into(), + )), + } + } + /// Returns the sum of capacities of non gateway nodes in the cluster fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; @@ -461,6 +384,89 @@ To know the correct value of the new layout version, invoke `garage layout show` } } + fn calculate_staging_hash(&self) -> Hash { + let hashed_tuple = (&self.staging_roles, &self.staging_parameters); + blake2sum(&nonversioned_encode(&hashed_tuple).unwrap()[..]) + } + + // ================== 
updates to layout, public interface =================== + + pub fn merge(&mut self, other: &ClusterLayout) -> bool { + match other.version.cmp(&self.version) { + Ordering::Greater => { + *self = other.clone(); + true + } + Ordering::Equal => { + self.staging_parameters.merge(&other.staging_parameters); + self.staging_roles.merge(&other.staging_roles); + + let new_staging_hash = self.calculate_staging_hash(); + let changed = new_staging_hash != self.staging_hash; + + self.staging_hash = new_staging_hash; + + changed + } + Ordering::Less => false, + } + } + + pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self, Message), Error> { + match version { + None => { + let error = r#" +Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. +To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. + "#; + return Err(Error::Message(error.into())); + } + Some(v) => { + if v != self.version + 1 { + return Err(Error::Message("Invalid new layout version".into())); + } + } + } + + self.roles.merge(&self.staging_roles); + self.roles.retain(|(_, _, v)| v.0.is_some()); + self.parameters = *self.staging_parameters.get(); + + self.staging_roles.clear(); + self.staging_hash = self.calculate_staging_hash(); + + let msg = self.calculate_partition_assignment()?; + + self.version += 1; + + Ok((self, msg)) + } + + pub fn revert_staged_changes(mut self, version: Option) -> Result { + match version { + None => { + let error = r#" +Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. +To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. 
+ "#; + return Err(Error::Message(error.into())); + } + Some(v) => { + if v != self.version + 1 { + return Err(Error::Message("Invalid new layout version".into())); + } + } + } + + self.staging_roles.clear(); + self.staging_parameters.update(self.parameters); + self.staging_hash = self.calculate_staging_hash(); + + self.version += 1; + + Ok(self) + } + /// Check a cluster layout for internal consistency /// (assignment, roles, parameters, partition size) /// returns true if consistent, false if error @@ -574,12 +580,9 @@ To know the correct value of the new layout version, invoke `garage layout show` Ok(()) } -} -// ==================================================================================== + // ================== updates to layout, internals =================== -// Implementation of the ClusterLayout methods related to the assignment algorithm. -impl ClusterLayout { /// This function calculates a new partition-to-node assignment. /// The computed assignment respects the node replication factor /// and the zone redundancy parameter It maximizes the capacity of a @@ -867,7 +870,7 @@ impl ClusterLayout { } for n in 0..self.nongateway_nodes().len() { let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; + let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; for p in 0..NB_PARTITIONS { if !exclude_assoc.contains(&(p, n)) { @@ -913,7 +916,7 @@ impl ClusterLayout { // The algorithm is such that it will start with the flow that we just computed // and find ameliorating paths from that. 
for (p, n) in exclude_edge.iter() { - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; } g.compute_maximal_flow()?; @@ -933,7 +936,7 @@ impl ClusterLayout { let mut cost = CostFunction::new(); for (p, assoc_p) in prev_assign.iter().enumerate() { for n in assoc_p.iter() { - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); } } @@ -1035,7 +1038,7 @@ impl ClusterLayout { let mut old_zones_of_p = Vec::::new(); for n in prev_assign[p].iter() { old_zones_of_p - .push(zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); + .push(zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]); } if !old_zones_of_p.contains(&z) { new_partitions_zone[z] += 1; -- cgit v1.2.3 From 12d1dbfc6b884be488e2d79c0b9e3c47490f5442 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 8 Nov 2023 15:41:24 +0100 Subject: remove Ring and use ClusterLayout everywhere --- src/api/admin/bucket.rs | 4 +- src/api/k2v/index.rs | 8 +- src/api/s3/put.rs | 2 +- src/garage/admin/bucket.rs | 4 +- src/garage/admin/mod.rs | 20 ++--- src/model/helper/bucket.rs | 6 +- src/model/index_counter.rs | 6 +- src/rpc/layout.rs | 72 ++++++++++++++-- src/rpc/lib.rs | 1 - src/rpc/ring.rs | 164 ------------------------------------ src/rpc/rpc_helper.rs | 14 +-- src/rpc/system.rs | 55 ++++++------ src/table/merkle.rs | 2 +- src/table/replication/fullcopy.rs | 8 +- src/table/replication/parameters.rs | 2 +- src/table/replication/sharded.rs | 14 +-- src/table/sync.rs | 20 ++--- 17 files changed, 148 insertions(+), 254 deletions(-) delete mode 100644 src/rpc/ring.rs diff --git a/src/api/admin/bucket.rs b/src/api/admin/bucket.rs index 17f46c30..6bff7e9f 100644 --- a/src/api/admin/bucket.rs +++ 
b/src/api/admin/bucket.rs @@ -122,7 +122,7 @@ async fn bucket_info_results( .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&garage.system.ring.borrow())) + .map(|x| x.filtered_values(&garage.system.layout_watch.borrow())) .unwrap_or_default(); let mpu_counters = garage @@ -130,7 +130,7 @@ async fn bucket_info_results( .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&garage.system.ring.borrow())) + .map(|x| x.filtered_values(&garage.system.layout_watch.borrow())) .unwrap_or_default(); let mut relevant_keys = HashMap::new(); diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs index 6c1d4a91..ff8beda3 100644 --- a/src/api/k2v/index.rs +++ b/src/api/k2v/index.rs @@ -5,7 +5,7 @@ use serde::Serialize; use garage_util::data::*; -use garage_rpc::ring::Ring; +use garage_rpc::layout::ClusterLayout; use garage_table::util::*; use garage_model::garage::Garage; @@ -26,7 +26,7 @@ pub async fn handle_read_index( ) -> Result, Error> { let reverse = reverse.unwrap_or(false); - let ring: Arc = garage.system.ring.borrow().clone(); + let layout: Arc = garage.system.layout_watch.borrow().clone(); let (partition_keys, more, next_start) = read_range( &garage.k2v.counter_table.table, @@ -35,7 +35,7 @@ pub async fn handle_read_index( &start, &end, limit, - Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())), + Some((DeletedFilter::NotDeleted, layout.node_id_vec.clone())), EnumerationOrder::from_reverse(reverse), ) .await?; @@ -54,7 +54,7 @@ pub async fn handle_read_index( partition_keys: partition_keys .into_iter() .map(|part| { - let vals = part.filtered_values(&ring); + let vals = part.filtered_values(&layout); ReadIndexResponseEntry { pk: part.sk, entries: *vals.get(&s_entries).unwrap_or(&0), diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index 606facc4..fc17ed03 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -253,7 +253,7 @@ pub(crate) async fn check_quotas( .await?; let counters = counters - .map(|x| 
x.filtered_values(&garage.system.ring.borrow())) + .map(|x| x.filtered_values(&garage.system.layout_watch.borrow())) .unwrap_or_default(); let (prev_cnt_obj, prev_cnt_size) = match prev_object { diff --git a/src/garage/admin/bucket.rs b/src/garage/admin/bucket.rs index 0781cb8b..34e48292 100644 --- a/src/garage/admin/bucket.rs +++ b/src/garage/admin/bucket.rs @@ -70,7 +70,7 @@ impl AdminRpcHandler { .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&self.garage.system.ring.borrow())) + .map(|x| x.filtered_values(&self.garage.system.layout_watch.borrow())) .unwrap_or_default(); let mpu_counters = self @@ -79,7 +79,7 @@ impl AdminRpcHandler { .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&self.garage.system.ring.borrow())) + .map(|x| x.filtered_values(&self.garage.system.layout_watch.borrow())) .unwrap_or_default(); let mut relevant_keys = HashMap::new(); diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index b6f9c426..006f71cd 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -18,7 +18,7 @@ use garage_util::error::Error as GarageError; use garage_table::replication::*; use garage_table::*; -use garage_rpc::ring::PARTITION_BITS; +use garage_rpc::layout::PARTITION_BITS; use garage_rpc::*; use garage_block::manager::BlockResyncErrorInfo; @@ -126,8 +126,8 @@ impl AdminRpcHandler { opt_to_send.all_nodes = false; let mut failures = vec![]; - let ring = self.garage.system.ring.borrow().clone(); - for node in ring.layout.node_ids().iter() { + let layout = self.garage.system.layout_watch.borrow().clone(); + for node in layout.node_ids().iter() { let node = (*node).into(); let resp = self .endpoint @@ -163,9 +163,9 @@ impl AdminRpcHandler { async fn handle_stats(&self, opt: StatsOpt) -> Result { if opt.all_nodes { let mut ret = String::new(); - let ring = self.garage.system.ring.borrow().clone(); + let layout = self.garage.system.layout_watch.borrow().clone(); - for node in 
ring.layout.node_ids().iter() { + for node in layout.node_ids().iter() { let mut opt = opt.clone(); opt.all_nodes = false; opt.skip_global = true; @@ -275,7 +275,7 @@ impl AdminRpcHandler { let mut ret = String::new(); // Gather storage node and free space statistics - let layout = &self.garage.system.ring.borrow().layout; + let layout = &self.garage.system.layout_watch.borrow(); let mut node_partition_count = HashMap::::new(); for short_id in layout.ring_assignment_data.iter() { let id = layout.node_id_vec[*short_id as usize]; @@ -440,8 +440,8 @@ impl AdminRpcHandler { ) -> Result { if all_nodes { let mut ret = vec![]; - let ring = self.garage.system.ring.borrow().clone(); - for node in ring.layout.node_ids().iter() { + let layout = self.garage.system.layout_watch.borrow().clone(); + for node in layout.node_ids().iter() { let node = (*node).into(); match self .endpoint @@ -488,8 +488,8 @@ impl AdminRpcHandler { ) -> Result { if all_nodes { let mut ret = vec![]; - let ring = self.garage.system.ring.borrow().clone(); - for node in ring.layout.node_ids().iter() { + let layout = self.garage.system.layout_watch.borrow().clone(); + for node in layout.node_ids().iter() { let node = (*node).into(); match self .endpoint diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index 576d03f3..d43d7e96 100644 --- a/src/model/helper/bucket.rs +++ b/src/model/helper/bucket.rs @@ -450,10 +450,10 @@ impl<'a> BucketHelper<'a> { #[cfg(feature = "k2v")] { - use garage_rpc::ring::Ring; + use garage_rpc::layout::ClusterLayout; use std::sync::Arc; - let ring: Arc = self.0.system.ring.borrow().clone(); + let layout: Arc = self.0.system.layout_watch.borrow().clone(); let k2vindexes = self .0 .k2v @@ -462,7 +462,7 @@ impl<'a> BucketHelper<'a> { .get_range( &bucket_id, None, - Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())), + Some((DeletedFilter::NotDeleted, layout.node_id_vec.clone())), 10, EnumerationOrder::Forward, ) diff --git 
a/src/model/index_counter.rs b/src/model/index_counter.rs index a46c165f..d514cb06 100644 --- a/src/model/index_counter.rs +++ b/src/model/index_counter.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use garage_db as db; -use garage_rpc::ring::Ring; +use garage_rpc::layout::ClusterLayout; use garage_rpc::system::System; use garage_util::background::BackgroundRunner; use garage_util::data::*; @@ -83,8 +83,8 @@ impl Entry for CounterEntry { } impl CounterEntry { - pub fn filtered_values(&self, ring: &Ring) -> HashMap { - let nodes = &ring.layout.node_id_vec[..]; + pub fn filtered_values(&self, layout: &ClusterLayout) -> HashMap { + let nodes = &layout.node_id_vec[..]; self.filtered_values_with_nodes(nodes) } diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 368a9d2c..2b5b6606 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -13,17 +13,39 @@ use garage_util::error::*; use crate::graph_algo::*; -use crate::ring::*; - use std::convert::TryInto; +// ---- defines: partitions ---- + +/// A partition id, which is stored on 16 bits +/// i.e. we have up to 2**16 partitions. +/// (in practice we have exactly 2**PARTITION_BITS partitions) +pub type Partition = u16; + +// TODO: make this constant parametrizable in the config file +// For deployments with many nodes it might make sense to bump +// it up to 10. +// Maximum value : 16 +/// How many bits from the hash are used to make partitions. Higher numbers means more fairness in +/// presence of numerous nodes, but exponentially bigger ring. Max 16 +pub const PARTITION_BITS: usize = 8; + const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; +// ---- defines: nodes ---- + +// Type to store compactly the id of a node in the system +// Change this to u16 the day we want to have more than 256 nodes in a cluster +pub type CompactNodeType = u8; +pub const MAX_NODE_NUMBER: usize = 256; + +// ---- defines: other ---- + // The Message type will be used to collect information on the algorithm. 
-type Message = Vec; +pub type Message = Vec; mod v08 { - use crate::ring::CompactNodeType; + use super::CompactNodeType; use garage_util::crdt::LwwMap; use garage_util::data::{Hash, Uuid}; use serde::{Deserialize, Serialize}; @@ -76,7 +98,7 @@ mod v08 { mod v09 { use super::v08; - use crate::ring::CompactNodeType; + use super::CompactNodeType; use garage_util::crdt::{Lww, LwwMap}; use garage_util::data::{Hash, Uuid}; use serde::{Deserialize, Serialize}; @@ -334,6 +356,46 @@ impl ClusterLayout { )) } + /// Get the partition in which data would fall on + pub fn partition_of(&self, position: &Hash) -> Partition { + let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap()); + top >> (16 - PARTITION_BITS) + } + + /// Get the list of partitions and the first hash of a partition key that would fall in it + pub fn partitions(&self) -> Vec<(Partition, Hash)> { + (0..(1 << PARTITION_BITS)) + .map(|i| { + let top = (i as u16) << (16 - PARTITION_BITS); + let mut location = [0u8; 32]; + location[..2].copy_from_slice(&u16::to_be_bytes(top)[..]); + (i as u16, Hash::from(location)) + }) + .collect::>() + } + + /// Walk the ring to find the n servers in which data should be replicated + pub fn nodes_of(&self, position: &Hash, n: usize) -> Vec { + assert_eq!(n, self.replication_factor); + + let data = &self.ring_assignment_data; + + if data.len() != self.replication_factor * (1 << PARTITION_BITS) { + warn!("Ring not yet ready, read/writes will be lost!"); + return vec![]; + } + + let partition_idx = self.partition_of(position) as usize; + let partition_start = partition_idx * self.replication_factor; + let partition_end = (partition_idx + 1) * self.replication_factor; + let partition_nodes = &data[partition_start..partition_end]; + + partition_nodes + .iter() + .map(|i| self.node_id_vec[*i as usize]) + .collect::>() + } + // ===================== internal information extractors ====================== /// Returns the uuids of the non_gateway nodes in 
self.node_id_vec. diff --git a/src/rpc/lib.rs b/src/rpc/lib.rs index a5f8fc6e..1af8b78e 100644 --- a/src/rpc/lib.rs +++ b/src/rpc/lib.rs @@ -14,7 +14,6 @@ mod kubernetes; pub mod graph_algo; pub mod layout; pub mod replication_mode; -pub mod ring; pub mod system; pub mod rpc_helper; diff --git a/src/rpc/ring.rs b/src/rpc/ring.rs deleted file mode 100644 index 6a2e5c72..00000000 --- a/src/rpc/ring.rs +++ /dev/null @@ -1,164 +0,0 @@ -//! Module containing types related to computing nodes which should receive a copy of data blocks -//! and metadata -use std::convert::TryInto; - -use garage_util::data::*; - -use crate::layout::ClusterLayout; - -/// A partition id, which is stored on 16 bits -/// i.e. we have up to 2**16 partitions. -/// (in practice we have exactly 2**PARTITION_BITS partitions) -pub type Partition = u16; - -// TODO: make this constant parametrizable in the config file -// For deployments with many nodes it might make sense to bump -// it up to 10. -// Maximum value : 16 -/// How many bits from the hash are used to make partitions. Higher numbers means more fairness in -/// presence of numerous nodes, but exponentially bigger ring. 
Max 16 -pub const PARTITION_BITS: usize = 8; - -const PARTITION_MASK_U16: u16 = ((1 << PARTITION_BITS) - 1) << (16 - PARTITION_BITS); - -/// A ring distributing fairly objects to nodes -#[derive(Clone)] -pub struct Ring { - /// The replication factor for this ring - pub replication_factor: usize, - - /// The network configuration used to generate this ring - pub layout: ClusterLayout, - - // Internal order of nodes used to make a more compact representation of the ring - nodes: Vec, - - // The list of entries in the ring - ring: Vec, -} - -// Type to store compactly the id of a node in the system -// Change this to u16 the day we want to have more than 256 nodes in a cluster -pub type CompactNodeType = u8; -pub const MAX_NODE_NUMBER: usize = 256; - -// The maximum number of times an object might get replicated -// This must be at least 3 because Garage supports 3-way replication -// Here we use 6 so that the size of a ring entry is 8 bytes -// (2 bytes partition id, 6 bytes node numbers as u8s) -const MAX_REPLICATION: usize = 6; - -/// An entry in the ring -#[derive(Clone, Debug)] -struct RingEntry { - // The two first bytes of the first hash that goes in this partition - // (the next bytes are zeroes) - hash_prefix: u16, - // The nodes that store this partition, stored as a list of positions in the `nodes` - // field of the Ring structure - // Only items 0 up to ring.replication_factor - 1 are used, others are zeros - nodes_buf: [CompactNodeType; MAX_REPLICATION], -} - -impl Ring { - pub(crate) fn new(layout: ClusterLayout, replication_factor: usize) -> Self { - if replication_factor != layout.replication_factor { - warn!("Could not build ring: replication factor does not match between local configuration and network role assignment."); - return Self::empty(layout, replication_factor); - } - - if layout.ring_assignment_data.len() != replication_factor * (1 << PARTITION_BITS) { - warn!("Could not build ring: network role assignment data has invalid length"); - 
return Self::empty(layout, replication_factor); - } - - let nodes = layout.node_id_vec.clone(); - let ring = (0..(1 << PARTITION_BITS)) - .map(|i| { - let top = (i as u16) << (16 - PARTITION_BITS); - let mut nodes_buf = [0u8; MAX_REPLICATION]; - nodes_buf[..replication_factor].copy_from_slice( - &layout.ring_assignment_data - [replication_factor * i..replication_factor * (i + 1)], - ); - RingEntry { - hash_prefix: top, - nodes_buf, - } - }) - .collect::>(); - - Self { - replication_factor, - layout, - nodes, - ring, - } - } - - fn empty(layout: ClusterLayout, replication_factor: usize) -> Self { - Self { - replication_factor, - layout, - nodes: vec![], - ring: vec![], - } - } - - /// Get the partition in which data would fall on - pub fn partition_of(&self, position: &Hash) -> Partition { - let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap()); - top >> (16 - PARTITION_BITS) - } - - /// Get the list of partitions and the first hash of a partition key that would fall in it - pub fn partitions(&self) -> Vec<(Partition, Hash)> { - let mut ret = vec![]; - - for (i, entry) in self.ring.iter().enumerate() { - let mut location = [0u8; 32]; - location[..2].copy_from_slice(&u16::to_be_bytes(entry.hash_prefix)[..]); - ret.push((i as u16, location.into())); - } - if !ret.is_empty() { - assert_eq!(ret[0].1, [0u8; 32].into()); - } - - ret - } - - /// Walk the ring to find the n servers in which data should be replicated - pub fn get_nodes(&self, position: &Hash, n: usize) -> Vec { - if self.ring.len() != 1 << PARTITION_BITS { - warn!("Ring not yet ready, read/writes will be lost!"); - return vec![]; - } - - let partition_idx = self.partition_of(position) as usize; - let partition = &self.ring[partition_idx]; - - let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap()); - // Check that we haven't messed up our partition table, i.e. 
that this partition - // table entrey indeed corresponds to the item we are storing - assert_eq!( - partition.hash_prefix & PARTITION_MASK_U16, - top & PARTITION_MASK_U16 - ); - - assert!(n <= self.replication_factor); - partition.nodes_buf[..n] - .iter() - .map(|i| self.nodes[*i as usize]) - .collect::>() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_ring_entry_size() { - assert_eq!(std::mem::size_of::(), 8); - } -} diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index e59c372a..56bef2f3 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -26,8 +26,8 @@ use garage_util::data::*; use garage_util::error::Error; use garage_util::metrics::RecordDuration; +use crate::layout::ClusterLayout; use crate::metrics::RpcMetrics; -use crate::ring::Ring; // Default RPC timeout = 5 minutes const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300); @@ -91,7 +91,7 @@ pub struct RpcHelper(Arc); struct RpcHelperInner { our_node_id: Uuid, fullmesh: Arc, - ring: watch::Receiver>, + layout_watch: watch::Receiver>, metrics: RpcMetrics, rpc_timeout: Duration, } @@ -100,7 +100,7 @@ impl RpcHelper { pub(crate) fn new( our_node_id: Uuid, fullmesh: Arc, - ring: watch::Receiver>, + layout_watch: watch::Receiver>, rpc_timeout: Option, ) -> Self { let metrics = RpcMetrics::new(); @@ -108,7 +108,7 @@ impl RpcHelper { Self(Arc::new(RpcHelperInner { our_node_id, fullmesh, - ring, + layout_watch, metrics, rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT), })) @@ -392,8 +392,8 @@ impl RpcHelper { pub fn request_order(&self, nodes: &[Uuid]) -> Vec { // Retrieve some status variables that we will use to sort requests let peer_list = self.0.fullmesh.get_peer_list(); - let ring: Arc = self.0.ring.borrow().clone(); - let our_zone = match ring.layout.node_role(&self.0.our_node_id) { + let layout: Arc = self.0.layout_watch.borrow().clone(); + let our_zone = match layout.node_role(&self.0.our_node_id) { Some(pc) => &pc.zone, None => "", }; @@ -407,7 
+407,7 @@ impl RpcHelper { let mut nodes = nodes .iter() .map(|to| { - let peer_zone = match ring.layout.node_role(to) { + let peer_zone = match layout.node_role(to) { Some(pc) => &pc.zone, None => "", }; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 4b40bec4..106e9f8c 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -36,7 +36,6 @@ use crate::consul::ConsulDiscovery; use crate::kubernetes::*; use crate::layout::*; use crate::replication_mode::*; -use crate::ring::*; use crate::rpc_helper::*; use crate::system_metrics::*; @@ -112,9 +111,9 @@ pub struct System { replication_mode: ReplicationMode, replication_factor: usize, - /// The ring - pub ring: watch::Receiver>, - update_ring: Mutex>>, + /// The layout + pub layout_watch: watch::Receiver>, + update_layout: Mutex>>, /// Path to metadata directory pub metadata_dir: PathBuf, @@ -286,8 +285,7 @@ impl System { let mut local_status = NodeStatus::initial(replication_factor, &cluster_layout); local_status.update_disk_usage(&config.metadata_dir, &config.data_dir, &metrics); - let ring = Ring::new(cluster_layout, replication_factor); - let (update_ring, ring) = watch::channel(Arc::new(ring)); + let (update_layout, layout_watch) = watch::channel(Arc::new(cluster_layout)); let rpc_public_addr = match &config.rpc_public_addr { Some(a_str) => { @@ -362,7 +360,7 @@ impl System { rpc: RpcHelper::new( netapp.id.into(), fullmesh, - ring.clone(), + layout_watch.clone(), config.rpc_timeout_msec.map(Duration::from_millis), ), system_endpoint, @@ -378,8 +376,8 @@ impl System { kubernetes_discovery: config.kubernetes_discovery.clone(), metrics, - ring, - update_ring: Mutex::new(update_ring), + layout_watch, + update_layout: Mutex::new(update_layout), metadata_dir: config.metadata_dir.clone(), data_dir: config.data_dir.clone(), }); @@ -426,7 +424,7 @@ impl System { } pub fn get_cluster_layout(&self) -> ClusterLayout { - self.ring.borrow().layout.clone() + self.layout_watch.borrow().as_ref().clone() } pub async fn 
update_cluster_layout( @@ -466,7 +464,7 @@ impl System { } pub fn health(&self) -> ClusterHealth { - let ring: Arc<_> = self.ring.borrow().clone(); + let layout: Arc<_> = self.layout_watch.borrow().clone(); let quorum = self.replication_mode.write_quorum(); let replication_factor = self.replication_factor; @@ -477,8 +475,7 @@ impl System { .collect::>(); let connected_nodes = nodes.iter().filter(|(_, n)| n.is_up).count(); - let storage_nodes = ring - .layout + let storage_nodes = layout .roles .items() .iter() @@ -489,11 +486,11 @@ impl System { .filter(|(x, _, _)| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) .count(); - let partitions = ring.partitions(); + let partitions = layout.partitions(); let partitions_n_up = partitions .iter() .map(|(_, h)| { - let pn = ring.get_nodes(h, ring.replication_factor); + let pn = layout.nodes_of(h, layout.replication_factor); pn.iter() .filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) .count() @@ -584,9 +581,9 @@ impl System { /// Save network configuration to disc async fn save_cluster_layout(&self) -> Result<(), Error> { - let ring: Arc = self.ring.borrow().clone(); + let layout: Arc = self.layout_watch.borrow().clone(); self.persist_cluster_layout - .save_async(&ring.layout) + .save_async(&layout) .await .expect("Cannot save current cluster layout"); Ok(()) @@ -595,9 +592,9 @@ impl System { fn update_local_status(&self) { let mut new_si: NodeStatus = self.local_status.load().as_ref().clone(); - let ring = self.ring.borrow(); - new_si.cluster_layout_version = ring.layout.version; - new_si.cluster_layout_staging_hash = ring.layout.staging_hash; + let layout = self.layout_watch.borrow(); + new_si.cluster_layout_version = layout.version; + new_si.cluster_layout_staging_hash = layout.staging_hash; new_si.update_disk_usage(&self.metadata_dir, &self.data_dir, &self.metrics); @@ -612,8 +609,8 @@ impl System { } fn handle_pull_cluster_layout(&self) -> SystemRpc { - let ring = self.ring.borrow().clone(); - 
SystemRpc::AdvertiseClusterLayout(ring.layout.clone()) + let layout = self.layout_watch.borrow().as_ref().clone(); + SystemRpc::AdvertiseClusterLayout(layout) } fn handle_get_known_nodes(&self) -> SystemRpc { @@ -663,8 +660,9 @@ impl System { return Err(Error::Message(msg)); } - let update_ring = self.update_ring.lock().await; - let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); + let update_layout = self.update_layout.lock().await; + // TODO: don't clone each time an AdvertiseClusterLayout is received + let mut layout: ClusterLayout = self.layout_watch.borrow().as_ref().clone(); let prev_layout_check = layout.check().is_ok(); if layout.merge(adv) { @@ -675,9 +673,8 @@ impl System { )); } - let ring = Ring::new(layout.clone(), self.replication_factor); - update_ring.send(Arc::new(ring))?; - drop(update_ring); + update_layout.send(Arc::new(layout.clone()))?; + drop(update_layout); let self2 = self.clone(); tokio::spawn(async move { @@ -725,9 +722,9 @@ impl System { async fn discovery_loop(self: &Arc, mut stop_signal: watch::Receiver) { while !*stop_signal.borrow() { - let not_configured = self.ring.borrow().layout.check().is_err(); + let not_configured = self.layout_watch.borrow().check().is_err(); let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor; - let expected_n_nodes = self.ring.borrow().layout.num_nodes(); + let expected_n_nodes = self.layout_watch.borrow().num_nodes(); let bad_peers = self .fullmesh .get_peer_list() diff --git a/src/table/merkle.rs b/src/table/merkle.rs index 4577f872..01271c58 100644 --- a/src/table/merkle.rs +++ b/src/table/merkle.rs @@ -13,7 +13,7 @@ use garage_util::data::*; use garage_util::encode::{nonversioned_decode, nonversioned_encode}; use garage_util::error::Error; -use garage_rpc::ring::*; +use garage_rpc::layout::*; use crate::data::*; use crate::replication::*; diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index 18682ace..f8b7cacc 100644 --- 
a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use garage_rpc::ring::*; +use garage_rpc::layout::*; use garage_rpc::system::System; use garage_util::data::*; @@ -27,11 +27,11 @@ impl TableReplication for TableFullReplication { } fn write_nodes(&self, _hash: &Hash) -> Vec { - let ring = self.system.ring.borrow(); - ring.layout.node_ids().to_vec() + let layout = self.system.layout_watch.borrow(); + layout.node_ids().to_vec() } fn write_quorum(&self) -> usize { - let nmembers = self.system.ring.borrow().layout.node_ids().len(); + let nmembers = self.system.layout_watch.borrow().node_ids().len(); if nmembers > self.max_faults { nmembers - self.max_faults } else { diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index f00815a2..19b306f2 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -1,4 +1,4 @@ -use garage_rpc::ring::*; +use garage_rpc::layout::*; use garage_util::data::*; /// Trait to describe how a table shall be replicated diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 1cf964af..95901a5a 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use garage_rpc::ring::*; +use garage_rpc::layout::*; use garage_rpc::system::System; use garage_util::data::*; @@ -26,16 +26,16 @@ pub struct TableShardedReplication { impl TableReplication for TableShardedReplication { fn read_nodes(&self, hash: &Hash) -> Vec { - let ring = self.system.ring.borrow(); - ring.get_nodes(hash, self.replication_factor) + let layout = self.system.layout_watch.borrow(); + layout.nodes_of(hash, self.replication_factor) } fn read_quorum(&self) -> usize { self.read_quorum } fn write_nodes(&self, hash: &Hash) -> Vec { - let ring = self.system.ring.borrow(); - ring.get_nodes(hash, self.replication_factor) + let layout = 
self.system.layout_watch.borrow(); + layout.nodes_of(hash, self.replication_factor) } fn write_quorum(&self) -> usize { self.write_quorum @@ -45,9 +45,9 @@ impl TableReplication for TableShardedReplication { } fn partition_of(&self, hash: &Hash) -> Partition { - self.system.ring.borrow().partition_of(hash) + self.system.layout_watch.borrow().partition_of(hash) } fn partitions(&self) -> Vec<(Partition, Hash)> { - self.system.ring.borrow().partitions() + self.system.layout_watch.borrow().partitions() } } diff --git a/src/table/sync.rs b/src/table/sync.rs index 92a353c6..b2600013 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -17,7 +17,7 @@ use garage_util::data::*; use garage_util::encode::{debug_serialize, nonversioned_encode}; use garage_util::error::{Error, OkOrMessage}; -use garage_rpc::ring::*; +use garage_rpc::layout::*; use garage_rpc::system::System; use garage_rpc::*; @@ -91,8 +91,8 @@ impl TableSyncer { bg.spawn_worker(SyncWorker { syncer: self.clone(), - ring_recv: self.system.ring.clone(), - ring: self.system.ring.borrow().clone(), + layout_watch: self.system.layout_watch.clone(), + layout: self.system.layout_watch.borrow().clone(), add_full_sync_rx, todo: vec![], next_full_sync: Instant::now() + Duration::from_secs(20), @@ -492,8 +492,8 @@ impl EndpointHandler for TableSync struct SyncWorker { syncer: Arc>, - ring_recv: watch::Receiver>, - ring: Arc, + layout_watch: watch::Receiver>, + layout: Arc, add_full_sync_rx: mpsc::UnboundedReceiver<()>, todo: Vec, next_full_sync: Instant, @@ -593,11 +593,11 @@ impl Worker for SyncWorker { self.add_full_sync(); } }, - _ = self.ring_recv.changed() => { - let new_ring = self.ring_recv.borrow(); - if !Arc::ptr_eq(&new_ring, &self.ring) { - self.ring = new_ring.clone(); - drop(new_ring); + _ = self.layout_watch.changed() => { + let new_layout = self.layout_watch.borrow(); + if !Arc::ptr_eq(&new_layout, &self.layout) { + self.layout = new_layout.clone(); + drop(new_layout); debug!("({}) Ring changed, adding 
full sync to syncer todo list", F::TABLE_NAME); self.add_full_sync(); } -- cgit v1.2.3 From 4a9c94514f49aa4e9880a8e0f5cf5a52d11ae993 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 8 Nov 2023 16:41:00 +0100 Subject: avoid using layout_watch in System directly --- src/api/admin/bucket.rs | 4 ++-- src/api/admin/cluster.rs | 10 +++++----- src/api/k2v/index.rs | 2 +- src/api/s3/put.rs | 2 +- src/garage/admin/bucket.rs | 4 ++-- src/garage/admin/mod.rs | 10 +++++----- src/model/helper/bucket.rs | 2 +- src/rpc/system.rs | 4 ++-- src/table/replication/fullcopy.rs | 4 ++-- src/table/replication/sharded.rs | 8 ++++---- src/table/sync.rs | 2 +- 11 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/api/admin/bucket.rs b/src/api/admin/bucket.rs index 6bff7e9f..65929d61 100644 --- a/src/api/admin/bucket.rs +++ b/src/api/admin/bucket.rs @@ -122,7 +122,7 @@ async fn bucket_info_results( .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&garage.system.layout_watch.borrow())) + .map(|x| x.filtered_values(&garage.system.cluster_layout())) .unwrap_or_default(); let mpu_counters = garage @@ -130,7 +130,7 @@ async fn bucket_info_results( .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&garage.system.layout_watch.borrow())) + .map(|x| x.filtered_values(&garage.system.cluster_layout())) .unwrap_or_default(); let mut relevant_keys = HashMap::new(); diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index c8107b82..01ff9885 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -33,7 +33,7 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result) -> Result, Error> { - let res = format_cluster_layout(&garage.system.get_cluster_layout()); + let res = format_cluster_layout(&garage.system.cluster_layout()); Ok(json_ok_response(&res)?) 
} @@ -207,7 +207,7 @@ pub async fn handle_update_cluster_layout( ) -> Result, Error> { let updates = parse_json_body::(req).await?; - let mut layout = garage.system.get_cluster_layout(); + let mut layout = garage.system.cluster_layout().as_ref().clone(); let mut roles = layout.roles.clone(); roles.merge(&layout.staging_roles); @@ -247,7 +247,7 @@ pub async fn handle_apply_cluster_layout( ) -> Result, Error> { let param = parse_json_body::(req).await?; - let layout = garage.system.get_cluster_layout(); + let layout = garage.system.cluster_layout().as_ref().clone(); let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; garage.system.update_cluster_layout(&layout).await?; @@ -265,7 +265,7 @@ pub async fn handle_revert_cluster_layout( ) -> Result, Error> { let param = parse_json_body::(req).await?; - let layout = garage.system.get_cluster_layout(); + let layout = garage.system.cluster_layout().as_ref().clone(); let layout = layout.revert_staged_changes(Some(param.version))?; garage.system.update_cluster_layout(&layout).await?; diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs index ff8beda3..3fc39de6 100644 --- a/src/api/k2v/index.rs +++ b/src/api/k2v/index.rs @@ -26,7 +26,7 @@ pub async fn handle_read_index( ) -> Result, Error> { let reverse = reverse.unwrap_or(false); - let layout: Arc = garage.system.layout_watch.borrow().clone(); + let layout: Arc = garage.system.cluster_layout().clone(); let (partition_keys, more, next_start) = read_range( &garage.k2v.counter_table.table, diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index fc17ed03..d1c88a76 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -253,7 +253,7 @@ pub(crate) async fn check_quotas( .await?; let counters = counters - .map(|x| x.filtered_values(&garage.system.layout_watch.borrow())) + .map(|x| x.filtered_values(&garage.system.cluster_layout())) .unwrap_or_default(); let (prev_cnt_obj, prev_cnt_size) = match prev_object { diff --git a/src/garage/admin/bucket.rs 
b/src/garage/admin/bucket.rs index 34e48292..9e642f57 100644 --- a/src/garage/admin/bucket.rs +++ b/src/garage/admin/bucket.rs @@ -70,7 +70,7 @@ impl AdminRpcHandler { .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&self.garage.system.layout_watch.borrow())) + .map(|x| x.filtered_values(&self.garage.system.cluster_layout())) .unwrap_or_default(); let mpu_counters = self @@ -79,7 +79,7 @@ impl AdminRpcHandler { .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&self.garage.system.layout_watch.borrow())) + .map(|x| x.filtered_values(&self.garage.system.cluster_layout())) .unwrap_or_default(); let mut relevant_keys = HashMap::new(); diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index 006f71cd..c3fa801a 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -126,7 +126,7 @@ impl AdminRpcHandler { opt_to_send.all_nodes = false; let mut failures = vec![]; - let layout = self.garage.system.layout_watch.borrow().clone(); + let layout = self.garage.system.cluster_layout().clone(); for node in layout.node_ids().iter() { let node = (*node).into(); let resp = self @@ -163,7 +163,7 @@ impl AdminRpcHandler { async fn handle_stats(&self, opt: StatsOpt) -> Result { if opt.all_nodes { let mut ret = String::new(); - let layout = self.garage.system.layout_watch.borrow().clone(); + let layout = self.garage.system.cluster_layout().clone(); for node in layout.node_ids().iter() { let mut opt = opt.clone(); @@ -275,7 +275,7 @@ impl AdminRpcHandler { let mut ret = String::new(); // Gather storage node and free space statistics - let layout = &self.garage.system.layout_watch.borrow(); + let layout = &self.garage.system.cluster_layout(); let mut node_partition_count = HashMap::::new(); for short_id in layout.ring_assignment_data.iter() { let id = layout.node_id_vec[*short_id as usize]; @@ -440,7 +440,7 @@ impl AdminRpcHandler { ) -> Result { if all_nodes { let mut ret = vec![]; - let layout = 
self.garage.system.layout_watch.borrow().clone(); + let layout = self.garage.system.cluster_layout().clone(); for node in layout.node_ids().iter() { let node = (*node).into(); match self @@ -488,7 +488,7 @@ impl AdminRpcHandler { ) -> Result { if all_nodes { let mut ret = vec![]; - let layout = self.garage.system.layout_watch.borrow().clone(); + let layout = self.garage.system.cluster_layout().clone(); for node in layout.node_ids().iter() { let node = (*node).into(); match self diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index d43d7e96..8cd5b27b 100644 --- a/src/model/helper/bucket.rs +++ b/src/model/helper/bucket.rs @@ -453,7 +453,7 @@ impl<'a> BucketHelper<'a> { use garage_rpc::layout::ClusterLayout; use std::sync::Arc; - let layout: Arc = self.0.system.layout_watch.borrow().clone(); + let layout: Arc = self.0.system.cluster_layout().clone(); let k2vindexes = self .0 .k2v diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 106e9f8c..93144e39 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -423,8 +423,8 @@ impl System { known_nodes } - pub fn get_cluster_layout(&self) -> ClusterLayout { - self.layout_watch.borrow().as_ref().clone() + pub fn cluster_layout(&self) -> watch::Ref> { + self.layout_watch.borrow() } pub async fn update_cluster_layout( diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index f8b7cacc..34807e3d 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -27,11 +27,11 @@ impl TableReplication for TableFullReplication { } fn write_nodes(&self, _hash: &Hash) -> Vec { - let layout = self.system.layout_watch.borrow(); + let layout = self.system.cluster_layout(); layout.node_ids().to_vec() } fn write_quorum(&self) -> usize { - let nmembers = self.system.layout_watch.borrow().node_ids().len(); + let nmembers = self.system.cluster_layout().node_ids().len(); if nmembers > self.max_faults { nmembers - self.max_faults } else { diff --git 
a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 95901a5a..60c95cb4 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -26,7 +26,7 @@ pub struct TableShardedReplication { impl TableReplication for TableShardedReplication { fn read_nodes(&self, hash: &Hash) -> Vec { - let layout = self.system.layout_watch.borrow(); + let layout = self.system.cluster_layout(); layout.nodes_of(hash, self.replication_factor) } fn read_quorum(&self) -> usize { @@ -34,7 +34,7 @@ impl TableReplication for TableShardedReplication { } fn write_nodes(&self, hash: &Hash) -> Vec { - let layout = self.system.layout_watch.borrow(); + let layout = self.system.cluster_layout(); layout.nodes_of(hash, self.replication_factor) } fn write_quorum(&self) -> usize { @@ -45,9 +45,9 @@ impl TableReplication for TableShardedReplication { } fn partition_of(&self, hash: &Hash) -> Partition { - self.system.layout_watch.borrow().partition_of(hash) + self.system.cluster_layout().partition_of(hash) } fn partitions(&self) -> Vec<(Partition, Hash)> { - self.system.layout_watch.borrow().partitions() + self.system.cluster_layout().partitions() } } diff --git a/src/table/sync.rs b/src/table/sync.rs index b2600013..65eff7cd 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -92,7 +92,7 @@ impl TableSyncer { bg.spawn_worker(SyncWorker { syncer: self.clone(), layout_watch: self.system.layout_watch.clone(), - layout: self.system.layout_watch.borrow().clone(), + layout: self.system.cluster_layout().clone(), add_full_sync_rx, todo: vec![], next_full_sync: Instant::now() + Duration::from_secs(20), -- cgit v1.2.3 From fe9af1dcaae31a117528a9cfa10c422c9a850201 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 8 Nov 2023 17:49:06 +0100 Subject: WIP: garage_rpc: store layout version history --- src/rpc/layout.rs | 1358 --------------------------------------------- src/rpc/layout/history.rs | 170 ++++++ src/rpc/layout/mod.rs | 32 ++ 
src/rpc/layout/schema.rs | 286 ++++++++++ src/rpc/layout/tracker.rs | 21 + src/rpc/layout/version.rs | 1052 +++++++++++++++++++++++++++++++++++ src/rpc/rpc_helper.rs | 12 +- src/rpc/system.rs | 44 +- 8 files changed, 1590 insertions(+), 1385 deletions(-) delete mode 100644 src/rpc/layout.rs create mode 100644 src/rpc/layout/history.rs create mode 100644 src/rpc/layout/mod.rs create mode 100644 src/rpc/layout/schema.rs create mode 100644 src/rpc/layout/tracker.rs create mode 100644 src/rpc/layout/version.rs diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs deleted file mode 100644 index 2b5b6606..00000000 --- a/src/rpc/layout.rs +++ /dev/null @@ -1,1358 +0,0 @@ -use std::cmp::Ordering; -use std::collections::HashMap; -use std::collections::HashSet; -use std::fmt; - -use bytesize::ByteSize; -use itertools::Itertools; - -use garage_util::crdt::{AutoCrdt, Crdt, Lww, LwwMap}; -use garage_util::data::*; -use garage_util::encode::nonversioned_encode; -use garage_util::error::*; - -use crate::graph_algo::*; - -use std::convert::TryInto; - -// ---- defines: partitions ---- - -/// A partition id, which is stored on 16 bits -/// i.e. we have up to 2**16 partitions. -/// (in practice we have exactly 2**PARTITION_BITS partitions) -pub type Partition = u16; - -// TODO: make this constant parametrizable in the config file -// For deployments with many nodes it might make sense to bump -// it up to 10. -// Maximum value : 16 -/// How many bits from the hash are used to make partitions. Higher numbers means more fairness in -/// presence of numerous nodes, but exponentially bigger ring. 
Max 16 -pub const PARTITION_BITS: usize = 8; - -const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; - -// ---- defines: nodes ---- - -// Type to store compactly the id of a node in the system -// Change this to u16 the day we want to have more than 256 nodes in a cluster -pub type CompactNodeType = u8; -pub const MAX_NODE_NUMBER: usize = 256; - -// ---- defines: other ---- - -// The Message type will be used to collect information on the algorithm. -pub type Message = Vec; - -mod v08 { - use super::CompactNodeType; - use garage_util::crdt::LwwMap; - use garage_util::data::{Hash, Uuid}; - use serde::{Deserialize, Serialize}; - - /// The layout of the cluster, i.e. the list of roles - /// which are assigned to each cluster node - #[derive(Clone, Debug, Serialize, Deserialize)] - pub struct ClusterLayout { - pub version: u64, - - pub replication_factor: usize, - pub roles: LwwMap, - - /// node_id_vec: a vector of node IDs with a role assigned - /// in the system (this includes gateway nodes). - /// The order here is different than the vec stored by `roles`, because: - /// 1. non-gateway nodes are first so that they have lower numbers - /// 2. nodes that don't have a role are excluded (but they need to - /// stay in the CRDT as tombstones) - pub node_id_vec: Vec, - /// the assignation of data partitions to node, the values - /// are indices in node_id_vec - #[serde(with = "serde_bytes")] - pub ring_assignation_data: Vec, - - /// Role changes which are staged for the next version of the layout - pub staging: LwwMap, - pub staging_hash: Hash, - } - - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] - pub struct NodeRoleV(pub Option); - - /// The user-assigned roles of cluster nodes - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] - pub struct NodeRole { - /// Datacenter at which this entry belong. 
This information is used to - /// perform a better geodistribution - pub zone: String, - /// The capacity of the node - /// If this is set to None, the node does not participate in storing data for the system - /// and is only active as an API gateway to other nodes - pub capacity: Option, - /// A set of tags to recognize the node - pub tags: Vec, - } - - impl garage_util::migrate::InitialFormat for ClusterLayout {} -} - -mod v09 { - use super::v08; - use super::CompactNodeType; - use garage_util::crdt::{Lww, LwwMap}; - use garage_util::data::{Hash, Uuid}; - use serde::{Deserialize, Serialize}; - pub use v08::{NodeRole, NodeRoleV}; - - /// The layout of the cluster, i.e. the list of roles - /// which are assigned to each cluster node - #[derive(Clone, Debug, Serialize, Deserialize)] - pub struct ClusterLayout { - pub version: u64, - - pub replication_factor: usize, - - /// This attribute is only used to retain the previously computed partition size, - /// to know to what extent does it change with the layout update. - pub partition_size: u64, - /// Parameters used to compute the assignment currently given by - /// ring_assignment_data - pub parameters: LayoutParameters, - - pub roles: LwwMap, - - /// see comment in v08::ClusterLayout - pub node_id_vec: Vec, - /// see comment in v08::ClusterLayout - #[serde(with = "serde_bytes")] - pub ring_assignment_data: Vec, - - /// Parameters to be used in the next partition assignment computation. - pub staging_parameters: Lww, - /// Role changes which are staged for the next version of the layout - pub staging_roles: LwwMap, - pub staging_hash: Hash, - } - - /// This struct is used to set the parameters to be used in the assignment computation - /// algorithm. It is stored as a Crdt. 
- #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] - pub struct LayoutParameters { - pub zone_redundancy: ZoneRedundancy, - } - - /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies - /// of each partition on at least that number of different zones. - /// Otherwise, copies will be stored on the maximum possible number of zones. - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] - pub enum ZoneRedundancy { - AtLeast(usize), - Maximum, - } - - impl garage_util::migrate::Migrate for ClusterLayout { - const VERSION_MARKER: &'static [u8] = b"G09layout"; - - type Previous = v08::ClusterLayout; - - fn migrate(previous: Self::Previous) -> Self { - use itertools::Itertools; - - // In the old layout, capacities are in an arbitrary unit, - // but in the new layout they are in bytes. - // Here we arbitrarily multiply everything by 1G, - // such that 1 old capacity unit = 1GB in the new units. - // This is totally arbitrary and won't work for most users. 
- let cap_mul = 1024 * 1024 * 1024; - let roles = multiply_all_capacities(previous.roles, cap_mul); - let staging_roles = multiply_all_capacities(previous.staging, cap_mul); - let node_id_vec = previous.node_id_vec; - - // Determine partition size - let mut tmp = previous.ring_assignation_data.clone(); - tmp.sort(); - let partition_size = tmp - .into_iter() - .dedup_with_count() - .map(|(npart, node)| { - roles - .get(&node_id_vec[node as usize]) - .and_then(|p| p.0.as_ref().and_then(|r| r.capacity)) - .unwrap_or(0) / npart as u64 - }) - .min() - .unwrap_or(0); - - // By default, zone_redundancy is maximum possible value - let parameters = LayoutParameters { - zone_redundancy: ZoneRedundancy::Maximum, - }; - - let mut res = Self { - version: previous.version, - replication_factor: previous.replication_factor, - partition_size, - parameters, - roles, - node_id_vec, - ring_assignment_data: previous.ring_assignation_data, - staging_parameters: Lww::new(parameters), - staging_roles, - staging_hash: [0u8; 32].into(), - }; - res.staging_hash = res.calculate_staging_hash(); - res - } - } - - fn multiply_all_capacities( - old_roles: LwwMap, - mul: u64, - ) -> LwwMap { - let mut new_roles = LwwMap::new(); - for (node, ts, role) in old_roles.items() { - let mut role = role.clone(); - if let NodeRoleV(Some(NodeRole { - capacity: Some(ref mut cap), - .. 
- })) = role - { - *cap *= mul; - } - new_roles.merge_raw(node, *ts, &role); - } - new_roles - } -} - -pub use v09::*; - -impl AutoCrdt for LayoutParameters { - const WARN_IF_DIFFERENT: bool = true; -} - -impl AutoCrdt for NodeRoleV { - const WARN_IF_DIFFERENT: bool = true; -} - -impl NodeRole { - pub fn capacity_string(&self) -> String { - match self.capacity { - Some(c) => ByteSize::b(c).to_string_as(false), - None => "gateway".to_string(), - } - } - - pub fn tags_string(&self) -> String { - self.tags.join(",") - } -} - -impl fmt::Display for ZoneRedundancy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - ZoneRedundancy::Maximum => write!(f, "maximum"), - ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), - } - } -} - -impl core::str::FromStr for ZoneRedundancy { - type Err = &'static str; - fn from_str(s: &str) -> Result { - match s { - "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), - x => { - let v = x - .parse::() - .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; - Ok(ZoneRedundancy::AtLeast(v)) - } - } - } -} - -// Implementation of the ClusterLayout methods unrelated to the assignment algorithm. 
-impl ClusterLayout { - pub fn new(replication_factor: usize) -> Self { - // We set the default zone redundancy to be Maximum, meaning that the maximum - // possible value will be used depending on the cluster topology - let parameters = LayoutParameters { - zone_redundancy: ZoneRedundancy::Maximum, - }; - let staging_parameters = Lww::::new(parameters); - - let empty_lwwmap = LwwMap::new(); - - let mut ret = ClusterLayout { - version: 0, - replication_factor, - partition_size: 0, - roles: LwwMap::new(), - node_id_vec: Vec::new(), - ring_assignment_data: Vec::new(), - parameters, - staging_parameters, - staging_roles: empty_lwwmap, - staging_hash: [0u8; 32].into(), - }; - ret.staging_hash = ret.calculate_staging_hash(); - ret - } - - // ===================== accessors ====================== - - /// Returns a list of IDs of nodes that currently have - /// a role in the cluster - pub fn node_ids(&self) -> &[Uuid] { - &self.node_id_vec[..] - } - - pub fn num_nodes(&self) -> usize { - self.node_id_vec.len() - } - - /// Returns the role of a node in the layout - pub fn node_role(&self, node: &Uuid) -> Option<&NodeRole> { - match self.roles.get(node) { - Some(NodeRoleV(Some(v))) => Some(v), - _ => None, - } - } - - /// Given a node uuids, this function returns its capacity or fails if it does not have any - pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { - match self.node_role(uuid) { - Some(NodeRole { - capacity: Some(cap), - zone: _, - tags: _, - }) => Ok(*cap), - _ => Err(Error::Message( - "The Uuid does not correspond to a node present in the \ - cluster or this node does not have a positive capacity." 
- .into(), - )), - } - } - - /// Returns the number of partitions associated to this node in the ring - pub fn get_node_usage(&self, uuid: &Uuid) -> Result { - for (i, id) in self.node_id_vec.iter().enumerate() { - if id == uuid { - let mut count = 0; - for nod in self.ring_assignment_data.iter() { - if i as u8 == *nod { - count += 1 - } - } - return Ok(count); - } - } - Err(Error::Message( - "The Uuid does not correspond to a node present in the \ - cluster or this node does not have a positive capacity." - .into(), - )) - } - - /// Get the partition in which data would fall on - pub fn partition_of(&self, position: &Hash) -> Partition { - let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap()); - top >> (16 - PARTITION_BITS) - } - - /// Get the list of partitions and the first hash of a partition key that would fall in it - pub fn partitions(&self) -> Vec<(Partition, Hash)> { - (0..(1 << PARTITION_BITS)) - .map(|i| { - let top = (i as u16) << (16 - PARTITION_BITS); - let mut location = [0u8; 32]; - location[..2].copy_from_slice(&u16::to_be_bytes(top)[..]); - (i as u16, Hash::from(location)) - }) - .collect::>() - } - - /// Walk the ring to find the n servers in which data should be replicated - pub fn nodes_of(&self, position: &Hash, n: usize) -> Vec { - assert_eq!(n, self.replication_factor); - - let data = &self.ring_assignment_data; - - if data.len() != self.replication_factor * (1 << PARTITION_BITS) { - warn!("Ring not yet ready, read/writes will be lost!"); - return vec![]; - } - - let partition_idx = self.partition_of(position) as usize; - let partition_start = partition_idx * self.replication_factor; - let partition_end = (partition_idx + 1) * self.replication_factor; - let partition_nodes = &data[partition_start..partition_end]; - - partition_nodes - .iter() - .map(|i| self.node_id_vec[*i as usize]) - .collect::>() - } - - // ===================== internal information extractors ====================== - - /// Returns the uuids of the 
non_gateway nodes in self.node_id_vec. - fn nongateway_nodes(&self) -> Vec { - let mut result = Vec::::new(); - for uuid in self.node_id_vec.iter() { - match self.node_role(uuid) { - Some(role) if role.capacity.is_some() => result.push(*uuid), - _ => (), - } - } - result - } - - /// Given a node uuids, this function returns the label of its zone - fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { - match self.node_role(uuid) { - Some(role) => Ok(&role.zone), - _ => Err(Error::Message( - "The Uuid does not correspond to a node present in the cluster.".into(), - )), - } - } - - /// Returns the sum of capacities of non gateway nodes in the cluster - fn get_total_capacity(&self) -> Result { - let mut total_capacity = 0; - for uuid in self.nongateway_nodes().iter() { - total_capacity += self.get_node_capacity(uuid)?; - } - Ok(total_capacity) - } - - /// Returns the effective value of the zone_redundancy parameter - fn effective_zone_redundancy(&self) -> usize { - match self.parameters.zone_redundancy { - ZoneRedundancy::AtLeast(v) => v, - ZoneRedundancy::Maximum => { - let n_zones = self - .roles - .items() - .iter() - .filter_map(|(_, _, role)| role.0.as_ref().map(|x| x.zone.as_str())) - .collect::>() - .len(); - std::cmp::min(n_zones, self.replication_factor) - } - } - } - - fn calculate_staging_hash(&self) -> Hash { - let hashed_tuple = (&self.staging_roles, &self.staging_parameters); - blake2sum(&nonversioned_encode(&hashed_tuple).unwrap()[..]) - } - - // ================== updates to layout, public interface =================== - - pub fn merge(&mut self, other: &ClusterLayout) -> bool { - match other.version.cmp(&self.version) { - Ordering::Greater => { - *self = other.clone(); - true - } - Ordering::Equal => { - self.staging_parameters.merge(&other.staging_parameters); - self.staging_roles.merge(&other.staging_roles); - - let new_staging_hash = self.calculate_staging_hash(); - let changed = new_staging_hash != self.staging_hash; - - self.staging_hash 
= new_staging_hash; - - changed - } - Ordering::Less => false, - } - } - - pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self, Message), Error> { - match version { - None => { - let error = r#" -Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. -To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. - "#; - return Err(Error::Message(error.into())); - } - Some(v) => { - if v != self.version + 1 { - return Err(Error::Message("Invalid new layout version".into())); - } - } - } - - self.roles.merge(&self.staging_roles); - self.roles.retain(|(_, _, v)| v.0.is_some()); - self.parameters = *self.staging_parameters.get(); - - self.staging_roles.clear(); - self.staging_hash = self.calculate_staging_hash(); - - let msg = self.calculate_partition_assignment()?; - - self.version += 1; - - Ok((self, msg)) - } - - pub fn revert_staged_changes(mut self, version: Option) -> Result { - match version { - None => { - let error = r#" -Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. -To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. 
- "#; - return Err(Error::Message(error.into())); - } - Some(v) => { - if v != self.version + 1 { - return Err(Error::Message("Invalid new layout version".into())); - } - } - } - - self.staging_roles.clear(); - self.staging_parameters.update(self.parameters); - self.staging_hash = self.calculate_staging_hash(); - - self.version += 1; - - Ok(self) - } - - /// Check a cluster layout for internal consistency - /// (assignment, roles, parameters, partition size) - /// returns true if consistent, false if error - pub fn check(&self) -> Result<(), String> { - // Check that the hash of the staging data is correct - let staging_hash = self.calculate_staging_hash(); - if staging_hash != self.staging_hash { - return Err("staging_hash is incorrect".into()); - } - - // Check that node_id_vec contains the correct list of nodes - let mut expected_nodes = self - .roles - .items() - .iter() - .filter(|(_, _, v)| v.0.is_some()) - .map(|(id, _, _)| *id) - .collect::>(); - expected_nodes.sort(); - let mut node_id_vec = self.node_id_vec.clone(); - node_id_vec.sort(); - if expected_nodes != node_id_vec { - return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes)); - } - - // Check that the assignment data has the correct length - let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor; - if self.ring_assignment_data.len() != expected_assignment_data_len { - return Err(format!( - "ring_assignment_data has incorrect length {} instead of {}", - self.ring_assignment_data.len(), - expected_assignment_data_len - )); - } - - // Check that the assigned nodes are correct identifiers - // of nodes that are assigned a role - // and that role is not the role of a gateway nodes - for x in self.ring_assignment_data.iter() { - if *x as usize >= self.node_id_vec.len() { - return Err(format!( - "ring_assignment_data contains invalid node id {}", - *x - )); - } - let node = self.node_id_vec[*x 
as usize]; - match self.roles.get(&node) { - Some(NodeRoleV(Some(x))) if x.capacity.is_some() => (), - _ => return Err("ring_assignment_data contains id of a gateway node".into()), - } - } - - // Check that every partition is associated to distinct nodes - let zone_redundancy = self.effective_zone_redundancy(); - let rf = self.replication_factor; - for p in 0..(1 << PARTITION_BITS) { - let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec(); - if nodes_of_p.iter().unique().count() != rf { - return Err(format!("partition does not contain {} unique node ids", rf)); - } - // Check that every partition is spread over at least zone_redundancy zones. - let zones_of_p = nodes_of_p - .iter() - .map(|n| { - self.get_node_zone(&self.node_id_vec[*n as usize]) - .expect("Zone not found.") - }) - .collect::>(); - if zones_of_p.iter().unique().count() < zone_redundancy { - return Err(format!( - "nodes of partition are in less than {} distinct zones", - zone_redundancy - )); - } - } - - // Check that the nodes capacities is consistent with the stored partitions - let mut node_usage = vec![0; MAX_NODE_NUMBER]; - for n in self.ring_assignment_data.iter() { - node_usage[*n as usize] += 1; - } - for (n, usage) in node_usage.iter().enumerate() { - if *usage > 0 { - let uuid = self.node_id_vec[n]; - let partusage = usage * self.partition_size; - let nodecap = self.get_node_capacity(&uuid).unwrap(); - if partusage > nodecap { - return Err(format!( - "node usage ({}) is bigger than node capacity ({})", - usage * self.partition_size, - nodecap - )); - } - } - } - - // Check that the partition size stored is the one computed by the asignation - // algorithm. 
- let cl2 = self.clone(); - let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); - match cl2.compute_optimal_partition_size(&zone_to_id, zone_redundancy) { - Ok(s) if s != self.partition_size => { - return Err(format!( - "partition_size ({}) is different than optimal value ({})", - self.partition_size, s - )) - } - Err(e) => return Err(format!("could not calculate optimal partition size: {}", e)), - _ => (), - } - - Ok(()) - } - - // ================== updates to layout, internals =================== - - /// This function calculates a new partition-to-node assignment. - /// The computed assignment respects the node replication factor - /// and the zone redundancy parameter It maximizes the capacity of a - /// partition (assuming all partitions have the same size). - /// Among such optimal assignment, it minimizes the distance to - /// the former assignment (if any) to minimize the amount of - /// data to be moved. - /// Staged role changes must be merged with nodes roles before calling this function, - /// hence it must only be called from apply_staged_changes() and hence is not public. - fn calculate_partition_assignment(&mut self) -> Result { - // We update the node ids, since the node role list might have changed with the - // changes in the layout. We retrieve the old_assignment reframed with new ids - let old_assignment_opt = self.update_node_id_vec()?; - - let zone_redundancy = self.effective_zone_redundancy(); - - let mut msg = Message::new(); - msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); - msg.push("".into()); - msg.push(format!( - "Partitions are \ - replicated {} times on at least {} distinct zones.", - self.replication_factor, zone_redundancy - )); - - // We generate for once numerical ids for the zones of non gateway nodes, - // to use them as indices in the flow graphs. 
- let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; - - let nb_nongateway_nodes = self.nongateway_nodes().len(); - if nb_nongateway_nodes < self.replication_factor { - return Err(Error::Message(format!( - "The number of nodes with positive \ - capacity ({}) is smaller than the replication factor ({}).", - nb_nongateway_nodes, self.replication_factor - ))); - } - if id_to_zone.len() < zone_redundancy { - return Err(Error::Message(format!( - "The number of zones with non-gateway \ - nodes ({}) is smaller than the redundancy parameter ({})", - id_to_zone.len(), - zone_redundancy - ))); - } - - // We compute the optimal partition size - // Capacities should be given in a unit so that partition size is at least 100. - // In this case, integer rounding plays a marginal role in the percentages of - // optimality. - let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?; - - msg.push("".into()); - if old_assignment_opt.is_some() { - msg.push(format!( - "Optimal partition size: {} ({} in previous layout)", - ByteSize::b(partition_size).to_string_as(false), - ByteSize::b(self.partition_size).to_string_as(false) - )); - } else { - msg.push(format!( - "Optimal partition size: {}", - ByteSize::b(partition_size).to_string_as(false) - )); - } - // We write the partition size. - self.partition_size = partition_size; - - if partition_size < 100 { - msg.push( - "WARNING: The partition size is low (< 100), make sure the capacities of your nodes are correct and are of at least a few MB" - .into(), - ); - } - - // We compute a first flow/assignment that is heuristically close to the previous - // assignment - let mut gflow = - self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt, zone_redundancy)?; - if let Some(assoc) = &old_assignment_opt { - // We minimize the distance to the previous assignment. 
- self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; - } - - // We display statistics of the computation - msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?); - - // We update the layout structure - self.update_ring_from_flow(id_to_zone.len(), &gflow)?; - - if let Err(e) = self.check() { - return Err(Error::Message( - format!("Layout check returned an error: {}\nOriginal result of computation: <<<<\n{}\n>>>>", e, msg.join("\n")) - )); - } - - Ok(msg) - } - - /// The LwwMap of node roles might have changed. This function updates the node_id_vec - /// and returns the assignment given by ring, with the new indices of the nodes, and - /// None if the node is not present anymore. - /// We work with the assumption that only this function and calculate_new_assignment - /// do modify assignment_ring and node_id_vec. - fn update_node_id_vec(&mut self) -> Result>>, Error> { - // (1) We compute the new node list - // Non gateway nodes should be coded on 8bits, hence they must be first in the list - // We build the new node ids - let new_non_gateway_nodes: Vec = self - .roles - .items() - .iter() - .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity.is_some())) - .map(|(k, _, _)| *k) - .collect(); - - if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { - return Err(Error::Message(format!( - "There are more than {} non-gateway nodes in the new \ - layout. This is not allowed.", - MAX_NODE_NUMBER - ))); - } - - let new_gateway_nodes: Vec = self - .roles - .items() - .iter() - .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_none())) - .map(|(k, _, _)| *k) - .collect(); - - let mut new_node_id_vec = Vec::::new(); - new_node_id_vec.extend(new_non_gateway_nodes); - new_node_id_vec.extend(new_gateway_nodes); - - let old_node_id_vec = self.node_id_vec.clone(); - self.node_id_vec = new_node_id_vec.clone(); - - // (2) We retrieve the old association - // We rewrite the old association with the new indices. 
We only consider partition - // to node assignments where the node is still in use. - if self.ring_assignment_data.is_empty() { - // This is a new association - return Ok(None); - } - - if self.ring_assignment_data.len() != NB_PARTITIONS * self.replication_factor { - return Err(Error::Message( - "The old assignment does not have a size corresponding to \ - the old replication factor or the number of partitions." - .into(), - )); - } - - // We build a translation table between the uuid and new ids - let mut uuid_to_new_id = HashMap::::new(); - - // We add the indices of only the new non-gateway nodes that can be used in the - // association ring - for (i, uuid) in new_node_id_vec.iter().enumerate() { - uuid_to_new_id.insert(*uuid, i); - } - - let mut old_assignment = vec![Vec::::new(); NB_PARTITIONS]; - let rf = self.replication_factor; - - for (p, old_assign_p) in old_assignment.iter_mut().enumerate() { - for old_id in &self.ring_assignment_data[p * rf..(p + 1) * rf] { - let uuid = old_node_id_vec[*old_id as usize]; - if uuid_to_new_id.contains_key(&uuid) { - old_assign_p.push(uuid_to_new_id[&uuid]); - } - } - } - - // We write the ring - self.ring_assignment_data = Vec::::new(); - - Ok(Some(old_assignment)) - } - - /// This function generates ids for the zone of the nodes appearing in - /// self.node_id_vec. - fn generate_nongateway_zone_ids(&self) -> Result<(Vec, HashMap), Error> { - let mut id_to_zone = Vec::::new(); - let mut zone_to_id = HashMap::::new(); - - for uuid in self.nongateway_nodes().iter() { - let r = self.node_role(uuid).unwrap(); - if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() { - zone_to_id.insert(r.zone.clone(), id_to_zone.len()); - id_to_zone.push(r.zone.clone()); - } - } - Ok((id_to_zone, zone_to_id)) - } - - /// This function computes by dichotomy the largest realizable partition size, given - /// the layout roles and parameters. 
- fn compute_optimal_partition_size( - &self, - zone_to_id: &HashMap, - zone_redundancy: usize, - ) -> Result { - let empty_set = HashSet::<(usize, usize)>::new(); - let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set, zone_redundancy)?; - g.compute_maximal_flow()?; - if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { - return Err(Error::Message( - "The storage capacity of he cluster is to small. It is \ - impossible to store partitions of size 1." - .into(), - )); - } - - let mut s_down = 1; - let mut s_up = self.get_total_capacity()?; - while s_down + 1 < s_up { - g = self.generate_flow_graph( - (s_down + s_up) / 2, - zone_to_id, - &empty_set, - zone_redundancy, - )?; - g.compute_maximal_flow()?; - if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { - s_up = (s_down + s_up) / 2; - } else { - s_down = (s_down + s_up) / 2; - } - } - - Ok(s_down) - } - - fn generate_graph_vertices(nb_zones: usize, nb_nodes: usize) -> Vec { - let mut vertices = vec![Vertex::Source, Vertex::Sink]; - for p in 0..NB_PARTITIONS { - vertices.push(Vertex::Pup(p)); - vertices.push(Vertex::Pdown(p)); - for z in 0..nb_zones { - vertices.push(Vertex::PZ(p, z)); - } - } - for n in 0..nb_nodes { - vertices.push(Vertex::N(n)); - } - vertices - } - - /// Generates the graph to compute the maximal flow corresponding to the optimal - /// partition assignment. - /// exclude_assoc is the set of (partition, node) association that we are forbidden - /// to use (hence we do not add the corresponding edge to the graph). This parameter - /// is used to compute a first flow that uses only edges appearing in the previous - /// assignment. This produces a solution that heuristically should be close to the - /// previous one. 
- fn generate_flow_graph( - &self, - partition_size: u64, - zone_to_id: &HashMap, - exclude_assoc: &HashSet<(usize, usize)>, - zone_redundancy: usize, - ) -> Result, Error> { - let vertices = - ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); - let mut g = Graph::::new(&vertices); - let nb_zones = zone_to_id.len(); - for p in 0..NB_PARTITIONS { - g.add_edge(Vertex::Source, Vertex::Pup(p), zone_redundancy as u64)?; - g.add_edge( - Vertex::Source, - Vertex::Pdown(p), - (self.replication_factor - zone_redundancy) as u64, - )?; - for z in 0..nb_zones { - g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; - g.add_edge( - Vertex::Pdown(p), - Vertex::PZ(p, z), - self.replication_factor as u64, - )?; - } - } - for n in 0..self.nongateway_nodes().len() { - let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; - let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; - g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; - for p in 0..NB_PARTITIONS { - if !exclude_assoc.contains(&(p, n)) { - g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; - } - } - } - Ok(g) - } - - /// This function computes a first optimal assignment (in the form of a flow graph). 
- fn compute_candidate_assignment( - &self, - zone_to_id: &HashMap, - prev_assign_opt: &Option>>, - zone_redundancy: usize, - ) -> Result, Error> { - // We list the (partition,node) associations that are not used in the - // previous assignment - let mut exclude_edge = HashSet::<(usize, usize)>::new(); - if let Some(prev_assign) = prev_assign_opt { - let nb_nodes = self.nongateway_nodes().len(); - for (p, prev_assign_p) in prev_assign.iter().enumerate() { - for n in 0..nb_nodes { - exclude_edge.insert((p, n)); - } - for n in prev_assign_p.iter() { - exclude_edge.remove(&(p, *n)); - } - } - } - - // We compute the best flow using only the edges used in the previous assignment - let mut g = self.generate_flow_graph( - self.partition_size, - zone_to_id, - &exclude_edge, - zone_redundancy, - )?; - g.compute_maximal_flow()?; - - // We add the excluded edges and compute the maximal flow with the full graph. - // The algorithm is such that it will start with the flow that we just computed - // and find ameliorating paths from that. - for (p, n) in exclude_edge.iter() { - let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; - g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; - } - g.compute_maximal_flow()?; - Ok(g) - } - - /// This function updates the flow graph gflow to minimize the distance between - /// its corresponding assignment and the previous one - fn minimize_rebalance_load( - &self, - gflow: &mut Graph, - zone_to_id: &HashMap, - prev_assign: &[Vec], - ) -> Result<(), Error> { - // We define a cost function on the edges (pairs of vertices) corresponding - // to the distance between the two assignments. - let mut cost = CostFunction::new(); - for (p, assoc_p) in prev_assign.iter().enumerate() { - for n in assoc_p.iter() { - let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; - cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); - } - } - - // We compute the maximal length of a simple path in gflow. 
It is used in the - // Bellman-Ford algorithm in optimize_flow_with_cost to set the number - // of iterations. - let nb_nodes = self.nongateway_nodes().len(); - let path_length = 4 * nb_nodes; - gflow.optimize_flow_with_cost(&cost, path_length)?; - - Ok(()) - } - - /// This function updates the assignment ring from the flow graph. - fn update_ring_from_flow( - &mut self, - nb_zones: usize, - gflow: &Graph, - ) -> Result<(), Error> { - self.ring_assignment_data = Vec::::new(); - for p in 0..NB_PARTITIONS { - for z in 0..nb_zones { - let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; - for vertex in assoc_vertex.iter() { - if let Vertex::N(n) = vertex { - self.ring_assignment_data.push((*n).try_into().unwrap()); - } - } - } - } - - if self.ring_assignment_data.len() != NB_PARTITIONS * self.replication_factor { - return Err(Error::Message( - "Critical Error : the association ring we produced does not \ - have the right size." - .into(), - )); - } - Ok(()) - } - - /// This function returns a message summing up the partition repartition of the new - /// layout, and other statistics of the partition assignment computation. - fn output_stat( - &self, - gflow: &Graph, - prev_assign_opt: &Option>>, - zone_to_id: &HashMap, - id_to_zone: &[String], - ) -> Result { - let mut msg = Message::new(); - - let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; - let total_cap = self.get_total_capacity()?; - let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); - msg.push(format!( - "Usable capacity / total cluster capacity: {} / {} ({:.1} %)", - ByteSize::b(used_cap).to_string_as(false), - ByteSize::b(total_cap).to_string_as(false), - percent_cap - )); - msg.push(format!( - "Effective capacity (replication factor {}): {}", - self.replication_factor, - ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false) - )); - if percent_cap < 80. 
{ - msg.push("".into()); - msg.push( - "If the percentage is too low, it might be that the \ - cluster topology and redundancy constraints are forcing the use of nodes/zones with small \ - storage capacities." - .into(), - ); - msg.push( - "You might want to move storage capacity between zones or relax the redundancy constraint." - .into(), - ); - msg.push( - "See the detailed statistics below and look for saturated nodes/zones.".into(), - ); - } - - // We define and fill in the following tables - let storing_nodes = self.nongateway_nodes(); - let mut new_partitions = vec![0; storing_nodes.len()]; - let mut stored_partitions = vec![0; storing_nodes.len()]; - - let mut new_partitions_zone = vec![0; id_to_zone.len()]; - let mut stored_partitions_zone = vec![0; id_to_zone.len()]; - - for p in 0..NB_PARTITIONS { - for z in 0..id_to_zone.len() { - let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; - if !pz_nodes.is_empty() { - stored_partitions_zone[z] += 1; - if let Some(prev_assign) = prev_assign_opt { - let mut old_zones_of_p = Vec::::new(); - for n in prev_assign[p].iter() { - old_zones_of_p - .push(zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]); - } - if !old_zones_of_p.contains(&z) { - new_partitions_zone[z] += 1; - } - } - } - for vert in pz_nodes.iter() { - if let Vertex::N(n) = *vert { - stored_partitions[n] += 1; - if let Some(prev_assign) = prev_assign_opt { - if !prev_assign[p].contains(&n) { - new_partitions[n] += 1; - } - } - } - } - } - } - - if prev_assign_opt.is_none() { - new_partitions = stored_partitions.clone(); - //new_partitions_zone = stored_partitions_zone.clone(); - } - - // We display the statistics - - msg.push("".into()); - if prev_assign_opt.is_some() { - let total_new_partitions: usize = new_partitions.iter().sum(); - msg.push(format!( - "A total of {} new copies of partitions need to be \ - transferred.", - total_new_partitions - )); - msg.push("".into()); - } - - let mut table = vec![]; - for z in 
0..id_to_zone.len() { - let mut nodes_of_z = Vec::::new(); - for n in 0..storing_nodes.len() { - if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { - nodes_of_z.push(n); - } - } - let replicated_partitions: usize = - nodes_of_z.iter().map(|n| stored_partitions[*n]).sum(); - table.push(format!( - "{}\tTags\tPartitions\tCapacity\tUsable capacity", - id_to_zone[z] - )); - - let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; - let mut total_cap_z = 0; - for n in nodes_of_z.iter() { - total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; - } - let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); - - for n in nodes_of_z.iter() { - let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; - let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; - let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or(""))?.tags_string(); - table.push(format!( - " {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)", - self.node_id_vec[*n], - tags_n, - stored_partitions[*n], - new_partitions[*n], - ByteSize::b(total_cap_n).to_string_as(false), - ByteSize::b(available_cap_n).to_string_as(false), - (available_cap_n as f32) / (total_cap_n as f32) * 100.0, - )); - } - - table.push(format!( - " TOTAL\t\t{} ({} unique)\t{}\t{} ({:.1}%)", - replicated_partitions, - stored_partitions_zone[z], - //new_partitions_zone[z], - ByteSize::b(total_cap_z).to_string_as(false), - ByteSize::b(available_cap_z).to_string_as(false), - percent_cap_z - )); - table.push("".into()); - } - msg.push(format_table::format_table_to_string(table)); - - Ok(msg) - } -} - -// ==================================================================================== - -#[cfg(test)] -mod tests { - use super::{Error, *}; - use std::cmp::min; - - // This function checks that the partition size S computed is at least better than the - // one given by a very naive algorithm. 
To do so, we try to run the naive algorithm - // assuming a partion size of S+1. If we succed, it means that the optimal assignment - // was not optimal. The naive algorithm is the following : - // - we compute the max number of partitions associated to every node, capped at the - // partition number. It gives the number of tokens of every node. - // - every zone has a number of tokens equal to the sum of the tokens of its nodes. - // - we cycle over the partitions and associate zone tokens while respecting the - // zone redundancy constraint. - // NOTE: the naive algorithm is not optimal. Counter example: - // take nb_partition = 3 ; replication_factor = 5; redundancy = 4; - // number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) - // With these parameters, the naive algo fails, whereas there is a solution: - // (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) - fn check_against_naive(cl: &ClusterLayout) -> Result { - let over_size = cl.partition_size + 1; - let mut zone_token = HashMap::::new(); - - let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; - - if zones.is_empty() { - return Ok(false); - } - - for z in zones.iter() { - zone_token.insert(z.clone(), 0); - } - for uuid in cl.nongateway_nodes().iter() { - let z = cl.get_node_zone(uuid)?; - let c = cl.get_node_capacity(uuid)?; - zone_token.insert( - z.clone(), - zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), - ); - } - - // For every partition, we count the number of zone already associated and - // the name of the last zone associated - - let mut id_zone_token = vec![0; zones.len()]; - for (z, t) in zone_token.iter() { - id_zone_token[zone_to_id[z]] = *t; - } - - let mut nb_token = vec![0; NB_PARTITIONS]; - let mut last_zone = vec![zones.len(); NB_PARTITIONS]; - - let mut curr_zone = 0; - - let redundancy = cl.effective_zone_redundancy(); - - for replic in 0..cl.replication_factor { - for p in 0..NB_PARTITIONS { - while id_zone_token[curr_zone] == 0 - || (last_zone[p] == curr_zone 
- && redundancy - nb_token[p] <= cl.replication_factor - replic) - { - curr_zone += 1; - if curr_zone >= zones.len() { - return Ok(true); - } - } - id_zone_token[curr_zone] -= 1; - if last_zone[p] != curr_zone { - nb_token[p] += 1; - last_zone[p] = curr_zone; - } - } - } - - return Ok(false); - } - - fn show_msg(msg: &Message) { - for s in msg.iter() { - println!("{}", s); - } - } - - fn update_layout( - cl: &mut ClusterLayout, - node_id_vec: &Vec, - node_capacity_vec: &Vec, - node_zone_vec: &Vec, - zone_redundancy: usize, - ) { - for i in 0..node_id_vec.len() { - if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { - cl.node_id_vec.push(x); - } - - let update = cl.staging_roles.update_mutator( - cl.node_id_vec[i], - NodeRoleV(Some(NodeRole { - zone: (node_zone_vec[i].to_string()), - capacity: (Some(node_capacity_vec[i])), - tags: (vec![]), - })), - ); - cl.staging_roles.merge(&update); - } - cl.staging_parameters.update(LayoutParameters { - zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), - }); - cl.staging_hash = cl.calculate_staging_hash(); - } - - #[test] - fn test_assignment() { - let mut node_id_vec = vec![1, 2, 3]; - let mut node_capacity_vec = vec![4000, 1000, 2000]; - let mut node_zone_vec = vec!["A", "B", "C"] - .into_iter() - .map(|x| x.to_string()) - .collect(); - - let mut cl = ClusterLayout::new(3); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; - node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; - node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"] - .into_iter() - .map(|x| x.to_string()) - .collect(); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); - let v = cl.version; - let (mut 
cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_capacity_vec = vec![ - 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, - ]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); - let v = cl.version; - let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - } -} diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs new file mode 100644 index 00000000..b3019f58 --- /dev/null +++ b/src/rpc/layout/history.rs @@ -0,0 +1,170 @@ +use std::cmp::Ordering; +use std::sync::Arc; + +use garage_util::crdt::{Crdt, Lww, LwwMap}; +use garage_util::data::*; +use garage_util::encode::nonversioned_encode; +use garage_util::error::*; + +use super::schema::*; +use super::*; + +impl LayoutHistory { + pub fn new(replication_factor: usize) -> Self { + let version = LayoutVersion::new(replication_factor); + + let staging_parameters = Lww::::new(version.parameters); + let empty_lwwmap = LwwMap::new(); + + let mut ret = LayoutHistory { + versions: vec![version].into_boxed_slice().into(), + update_trackers: Default::default(), + staging_parameters, + staging_roles: empty_lwwmap, + staging_hash: [0u8; 32].into(), + }; + ret.staging_hash = ret.calculate_staging_hash(); + ret + } + + pub fn current(&self) -> &LayoutVersion { + self.versions.last().as_ref().unwrap() + } + + pub(crate) fn calculate_staging_hash(&self) -> Hash { + 
let hashed_tuple = (&self.staging_roles, &self.staging_parameters); + blake2sum(&nonversioned_encode(&hashed_tuple).unwrap()[..]) + } + + // ================== updates to layout, public interface =================== + + pub fn merge(&mut self, other: &LayoutHistory) -> bool { + let mut changed = false; + + // Merge staged layout changes + match other.current().version.cmp(&self.current().version) { + Ordering::Greater => { + self.staging_parameters = other.staging_parameters.clone(); + self.staging_roles = other.staging_roles.clone(); + self.staging_hash = other.staging_hash; + changed = true; + } + Ordering::Equal => { + self.staging_parameters.merge(&other.staging_parameters); + self.staging_roles.merge(&other.staging_roles); + + let new_staging_hash = self.calculate_staging_hash(); + if new_staging_hash != self.staging_hash { + changed = true; + } + + self.staging_hash = new_staging_hash; + } + Ordering::Less => (), + } + + // Add any new versions to history + let mut versions = self.versions.to_vec(); + for v2 in other.versions.iter() { + if let Some(v1) = versions.iter().find(|v| v.version == v2.version) { + if v1 != v2 { + error!("Inconsistent layout histories: different layout compositions for version {}. Your cluster will be broken as long as this layout version is not replaced.", v2.version); + } + } else if versions.iter().all(|v| v.version != v2.version - 1) { + error!( + "Cannot receive new layout version {}, version {} is missing", + v2.version, + v2.version - 1 + ); + } else { + versions.push(v2.clone()); + changed = true; + } + } + self.versions = Arc::from(versions.into_boxed_slice()); + + // Merge trackers + self.update_trackers.merge(&other.update_trackers); + + changed + } + + pub fn apply_staged_changes(mut self, version: Option) -> Result<(Self, Message), Error> { + match version { + None => { + let error = r#" +Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. 
+To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. + "#; + return Err(Error::Message(error.into())); + } + Some(v) => { + if v != self.current().version + 1 { + return Err(Error::Message("Invalid new layout version".into())); + } + } + } + + let mut new_version = self.current().clone(); + new_version.version += 1; + + new_version.roles.merge(&self.staging_roles); + new_version.roles.retain(|(_, _, v)| v.0.is_some()); + new_version.parameters = *self.staging_parameters.get(); + + self.staging_roles.clear(); + self.staging_hash = self.calculate_staging_hash(); + + let msg = new_version.calculate_partition_assignment()?; + + let mut versions = self.versions.to_vec(); + versions.push(new_version); + self.versions = Arc::from(versions.into_boxed_slice()); + + Ok((self, msg)) + } + + pub fn revert_staged_changes(mut self, version: Option) -> Result { + match version { + None => { + let error = r#" +Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. +To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. 
+ "#; + return Err(Error::Message(error.into())); + } + Some(v) => { + if v != self.current().version + 1 { + return Err(Error::Message("Invalid new layout version".into())); + } + } + } + + self.staging_roles.clear(); + self.staging_parameters.update(self.current().parameters); + self.staging_hash = self.calculate_staging_hash(); + + // TODO this is stupid, we should have a separate version counter/LWW + // for the staging params + let mut new_version = self.current().clone(); + new_version.version += 1; + + let mut versions = self.versions.to_vec(); + versions.push(new_version); + self.versions = Arc::from(versions.into_boxed_slice()); + + Ok(self) + } + + pub fn check(&self) -> Result<(), String> { + // Check that the hash of the staging data is correct + let staging_hash = self.calculate_staging_hash(); + if staging_hash != self.staging_hash { + return Err("staging_hash is incorrect".into()); + } + + // TODO: anythign more ? + + self.current().check() + } +} diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs new file mode 100644 index 00000000..122d4b65 --- /dev/null +++ b/src/rpc/layout/mod.rs @@ -0,0 +1,32 @@ +mod history; +mod schema; +mod tracker; +mod version; + +pub use history::*; +pub use schema::*; +pub use version::*; + +// ---- defines: partitions ---- + +/// A partition id, which is stored on 16 bits +/// i.e. we have up to 2**16 partitions. +/// (in practice we have exactly 2**PARTITION_BITS partitions) +pub type Partition = u16; + +// TODO: make this constant parametrizable in the config file +// For deployments with many nodes it might make sense to bump +// it up to 10. +// Maximum value : 16 +/// How many bits from the hash are used to make partitions. Higher numbers means more fairness in +/// presence of numerous nodes, but exponentially bigger ring. 
Max 16 +pub const PARTITION_BITS: usize = 8; + +const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; + +// ---- defines: nodes ---- + +// Type to store compactly the id of a node in the system +// Change this to u16 the day we want to have more than 256 nodes in a cluster +pub type CompactNodeType = u8; +pub const MAX_NODE_NUMBER: usize = 256; diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs new file mode 100644 index 00000000..fa0822fa --- /dev/null +++ b/src/rpc/layout/schema.rs @@ -0,0 +1,286 @@ +mod v08 { + use crate::layout::CompactNodeType; + use garage_util::crdt::LwwMap; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + + /// The layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct ClusterLayout { + pub version: u64, + + pub replication_factor: usize, + pub roles: LwwMap, + + /// node_id_vec: a vector of node IDs with a role assigned + /// in the system (this includes gateway nodes). + /// The order here is different than the vec stored by `roles`, because: + /// 1. non-gateway nodes are first so that they have lower numbers + /// 2. nodes that don't have a role are excluded (but they need to + /// stay in the CRDT as tombstones) + pub node_id_vec: Vec, + /// the assignation of data partitions to node, the values + /// are indices in node_id_vec + #[serde(with = "serde_bytes")] + pub ring_assignation_data: Vec, + + /// Role changes which are staged for the next version of the layout + pub staging: LwwMap, + pub staging_hash: Hash, + } + + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct NodeRoleV(pub Option); + + /// The user-assigned roles of cluster nodes + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct NodeRole { + /// Datacenter at which this entry belong. 
This information is used to + /// perform a better geodistribution + pub zone: String, + /// The capacity of the node + /// If this is set to None, the node does not participate in storing data for the system + /// and is only active as an API gateway to other nodes + pub capacity: Option, + /// A set of tags to recognize the node + pub tags: Vec, + } + + impl garage_util::migrate::InitialFormat for ClusterLayout {} +} + +mod v09 { + use super::v08; + use crate::layout::CompactNodeType; + use garage_util::crdt::{Lww, LwwMap}; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + pub use v08::{NodeRole, NodeRoleV}; + + /// The layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct ClusterLayout { + pub version: u64, + + pub replication_factor: usize, + + /// This attribute is only used to retain the previously computed partition size, + /// to know to what extent does it change with the layout update. + pub partition_size: u64, + /// Parameters used to compute the assignment currently given by + /// ring_assignment_data + pub parameters: LayoutParameters, + + pub roles: LwwMap, + + /// see comment in v08::ClusterLayout + pub node_id_vec: Vec, + /// see comment in v08::ClusterLayout + #[serde(with = "serde_bytes")] + pub ring_assignment_data: Vec, + + /// Parameters to be used in the next partition assignment computation. + pub staging_parameters: Lww, + /// Role changes which are staged for the next version of the layout + pub staging_roles: LwwMap, + pub staging_hash: Hash, + } + + /// This struct is used to set the parameters to be used in the assignment computation + /// algorithm. It is stored as a Crdt. 
+ #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub struct LayoutParameters { + pub zone_redundancy: ZoneRedundancy, + } + + /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies + /// of each partition on at least that number of different zones. + /// Otherwise, copies will be stored on the maximum possible number of zones. + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub enum ZoneRedundancy { + AtLeast(usize), + Maximum, + } + + impl garage_util::migrate::Migrate for ClusterLayout { + const VERSION_MARKER: &'static [u8] = b"G09layout"; + + type Previous = v08::ClusterLayout; + + fn migrate(previous: Self::Previous) -> Self { + use itertools::Itertools; + + // In the old layout, capacities are in an arbitrary unit, + // but in the new layout they are in bytes. + // Here we arbitrarily multiply everything by 1G, + // such that 1 old capacity unit = 1GB in the new units. + // This is totally arbitrary and won't work for most users. 
+ let cap_mul = 1024 * 1024 * 1024; + let roles = multiply_all_capacities(previous.roles, cap_mul); + let staging_roles = multiply_all_capacities(previous.staging, cap_mul); + let node_id_vec = previous.node_id_vec; + + // Determine partition size + let mut tmp = previous.ring_assignation_data.clone(); + tmp.sort(); + let partition_size = tmp + .into_iter() + .dedup_with_count() + .map(|(npart, node)| { + roles + .get(&node_id_vec[node as usize]) + .and_then(|p| p.0.as_ref().and_then(|r| r.capacity)) + .unwrap_or(0) / npart as u64 + }) + .min() + .unwrap_or(0); + + // By default, zone_redundancy is maximum possible value + let parameters = LayoutParameters { + zone_redundancy: ZoneRedundancy::Maximum, + }; + + Self { + version: previous.version, + replication_factor: previous.replication_factor, + partition_size, + parameters, + roles, + node_id_vec, + ring_assignment_data: previous.ring_assignation_data, + staging_parameters: Lww::new(parameters), + staging_roles, + staging_hash: [0u8; 32].into(), // will be set in the next migration + } + } + } + + fn multiply_all_capacities( + old_roles: LwwMap, + mul: u64, + ) -> LwwMap { + let mut new_roles = LwwMap::new(); + for (node, ts, role) in old_roles.items() { + let mut role = role.clone(); + if let NodeRoleV(Some(NodeRole { + capacity: Some(ref mut cap), + .. + })) = role + { + *cap *= mul; + } + new_roles.merge_raw(node, *ts, &role); + } + new_roles + } +} + +mod v010 { + use super::v09; + use crate::layout::CompactNodeType; + use garage_util::crdt::{Lww, LwwMap}; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + use std::collections::HashMap; + use std::sync::Arc; + pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; + + /// The layout of the cluster, i.e. 
the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] + pub struct LayoutVersion { + pub version: u64, + + pub replication_factor: usize, + + /// This attribute is only used to retain the previously computed partition size, + /// to know to what extent does it change with the layout update. + pub partition_size: u64, + /// Parameters used to compute the assignment currently given by + /// ring_assignment_data + pub parameters: LayoutParameters, + + pub roles: LwwMap, + + /// see comment in v08::ClusterLayout + pub node_id_vec: Vec, + /// see comment in v08::ClusterLayout + #[serde(with = "serde_bytes")] + pub ring_assignment_data: Vec, + } + + /// The history of cluster layouts + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct LayoutHistory { + /// The versions currently in use in the cluster + pub versions: Arc<[LayoutVersion]>, + + /// Update trackers + pub update_trackers: UpdateTrackers, + + /// Parameters to be used in the next partition assignment computation. 
+ pub staging_parameters: Lww, + /// Role changes which are staged for the next version of the layout + pub staging_roles: LwwMap, + /// Hash of the serialized staging_parameters + staging_roles + pub staging_hash: Hash, + } + + /// The tracker of acknowlegments and data syncs around the cluster + #[derive(Clone, Debug, Serialize, Deserialize, Default)] + pub struct UpdateTrackers { + /// The highest layout version number each node has ack'ed + pub ack_map: UpdateTracker, + /// The highest layout version number each node has synced data for + pub sync_map: UpdateTracker, + /// The highest layout version number each node has + /// ack'ed that all other nodes have synced data for + pub sync_ack_map: UpdateTracker, + } + + /// The history of cluster layouts + #[derive(Clone, Debug, Serialize, Deserialize, Default)] + pub struct UpdateTracker(pub HashMap); + + impl garage_util::migrate::Migrate for LayoutHistory { + const VERSION_MARKER: &'static [u8] = b"G010lh"; + + type Previous = v09::ClusterLayout; + + fn migrate(previous: Self::Previous) -> Self { + let version = LayoutVersion { + version: previous.version, + replication_factor: previous.replication_factor, + partition_size: previous.partition_size, + parameters: previous.parameters, + roles: previous.roles, + node_id_vec: previous.node_id_vec, + ring_assignment_data: previous.ring_assignment_data, + }; + let update_tracker = UpdateTracker( + version + .nongateway_nodes() + .iter() + .map(|x| (*x, version.version)) + .collect::>(), + ); + let mut ret = Self { + versions: Arc::from(vec![version].into_boxed_slice()), + update_trackers: UpdateTrackers { + ack_map: update_tracker.clone(), + sync_map: update_tracker.clone(), + sync_ack_map: update_tracker.clone(), + }, + staging_parameters: previous.staging_parameters, + staging_roles: previous.staging_roles, + staging_hash: [0u8; 32].into(), + }; + ret.staging_hash = ret.calculate_staging_hash(); + ret + } + } +} + +pub use v010::*; diff --git 
a/src/rpc/layout/tracker.rs b/src/rpc/layout/tracker.rs new file mode 100644 index 00000000..778121e4 --- /dev/null +++ b/src/rpc/layout/tracker.rs @@ -0,0 +1,21 @@ +use super::*; + +impl UpdateTracker { + fn merge(&mut self, other: &UpdateTracker) { + for (k, v) in other.0.iter() { + if let Some(v_mut) = self.0.get_mut(k) { + *v_mut = std::cmp::max(*v_mut, *v); + } else { + self.0.insert(*k, *v); + } + } + } +} + +impl UpdateTrackers { + pub(crate) fn merge(&mut self, other: &UpdateTrackers) { + self.ack_map.merge(&other.ack_map); + self.sync_map.merge(&other.sync_map); + self.sync_ack_map.merge(&other.sync_ack_map); + } +} diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs new file mode 100644 index 00000000..363bc204 --- /dev/null +++ b/src/rpc/layout/version.rs @@ -0,0 +1,1052 @@ +use std::collections::HashMap; +use std::collections::HashSet; +use std::fmt; + +use bytesize::ByteSize; +use itertools::Itertools; + +use garage_util::crdt::{AutoCrdt, LwwMap}; +use garage_util::data::*; +use garage_util::error::*; + +use crate::graph_algo::*; + +use std::convert::TryInto; + +use super::schema::*; +use super::*; + +// The Message type will be used to collect information on the algorithm. 
+pub type Message = Vec; + +impl AutoCrdt for LayoutParameters { + const WARN_IF_DIFFERENT: bool = true; +} + +impl AutoCrdt for NodeRoleV { + const WARN_IF_DIFFERENT: bool = true; +} + +impl NodeRole { + pub fn capacity_string(&self) -> String { + match self.capacity { + Some(c) => ByteSize::b(c).to_string_as(false), + None => "gateway".to_string(), + } + } + + pub fn tags_string(&self) -> String { + self.tags.join(",") + } +} + +impl fmt::Display for ZoneRedundancy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ZoneRedundancy::Maximum => write!(f, "maximum"), + ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), + } + } +} + +impl core::str::FromStr for ZoneRedundancy { + type Err = &'static str; + fn from_str(s: &str) -> Result { + match s { + "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), + x => { + let v = x + .parse::() + .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; + Ok(ZoneRedundancy::AtLeast(v)) + } + } + } +} + +impl LayoutVersion { + pub fn new(replication_factor: usize) -> Self { + // We set the default zone redundancy to be Maximum, meaning that the maximum + // possible value will be used depending on the cluster topology + let parameters = LayoutParameters { + zone_redundancy: ZoneRedundancy::Maximum, + }; + + LayoutVersion { + version: 0, + replication_factor, + partition_size: 0, + roles: LwwMap::new(), + node_id_vec: Vec::new(), + ring_assignment_data: Vec::new(), + parameters, + } + } + + // ===================== accessors ====================== + + /// Returns a list of IDs of nodes that currently have + /// a role in the cluster + pub fn node_ids(&self) -> &[Uuid] { + &self.node_id_vec[..] 
+ } + + pub fn num_nodes(&self) -> usize { + self.node_id_vec.len() + } + + /// Returns the role of a node in the layout + pub fn node_role(&self, node: &Uuid) -> Option<&NodeRole> { + match self.roles.get(node) { + Some(NodeRoleV(Some(v))) => Some(v), + _ => None, + } + } + + /// Given a node uuids, this function returns its capacity or fails if it does not have any + pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { + match self.node_role(uuid) { + Some(NodeRole { + capacity: Some(cap), + zone: _, + tags: _, + }) => Ok(*cap), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity." + .into(), + )), + } + } + + /// Returns the number of partitions associated to this node in the ring + pub fn get_node_usage(&self, uuid: &Uuid) -> Result { + for (i, id) in self.node_id_vec.iter().enumerate() { + if id == uuid { + let mut count = 0; + for nod in self.ring_assignment_data.iter() { + if i as u8 == *nod { + count += 1 + } + } + return Ok(count); + } + } + Err(Error::Message( + "The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity." 
+ .into(), + )) + } + + /// Get the partition in which data would fall on + pub fn partition_of(&self, position: &Hash) -> Partition { + let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap()); + top >> (16 - PARTITION_BITS) + } + + /// Get the list of partitions and the first hash of a partition key that would fall in it + pub fn partitions(&self) -> Vec<(Partition, Hash)> { + (0..(1 << PARTITION_BITS)) + .map(|i| { + let top = (i as u16) << (16 - PARTITION_BITS); + let mut location = [0u8; 32]; + location[..2].copy_from_slice(&u16::to_be_bytes(top)[..]); + (i as u16, Hash::from(location)) + }) + .collect::>() + } + + /// Walk the ring to find the n servers in which data should be replicated + pub fn nodes_of(&self, position: &Hash, n: usize) -> Vec { + assert_eq!(n, self.replication_factor); + + let data = &self.ring_assignment_data; + + if data.len() != self.replication_factor * (1 << PARTITION_BITS) { + warn!("Ring not yet ready, read/writes will be lost!"); + return vec![]; + } + + let partition_idx = self.partition_of(position) as usize; + let partition_start = partition_idx * self.replication_factor; + let partition_end = (partition_idx + 1) * self.replication_factor; + let partition_nodes = &data[partition_start..partition_end]; + + partition_nodes + .iter() + .map(|i| self.node_id_vec[*i as usize]) + .collect::>() + } + + // ===================== internal information extractors ====================== + + /// Returns the uuids of the non_gateway nodes in self.node_id_vec. 
+ pub(crate) fn nongateway_nodes(&self) -> Vec { + let mut result = Vec::::new(); + for uuid in self.node_id_vec.iter() { + match self.node_role(uuid) { + Some(role) if role.capacity.is_some() => result.push(*uuid), + _ => (), + } + } + result + } + + /// Given a node uuids, this function returns the label of its zone + fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { + match self.node_role(uuid) { + Some(role) => Ok(&role.zone), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the cluster.".into(), + )), + } + } + + /// Returns the sum of capacities of non gateway nodes in the cluster + fn get_total_capacity(&self) -> Result { + let mut total_capacity = 0; + for uuid in self.nongateway_nodes().iter() { + total_capacity += self.get_node_capacity(uuid)?; + } + Ok(total_capacity) + } + + /// Returns the effective value of the zone_redundancy parameter + fn effective_zone_redundancy(&self) -> usize { + match self.parameters.zone_redundancy { + ZoneRedundancy::AtLeast(v) => v, + ZoneRedundancy::Maximum => { + let n_zones = self + .roles + .items() + .iter() + .filter_map(|(_, _, role)| role.0.as_ref().map(|x| x.zone.as_str())) + .collect::>() + .len(); + std::cmp::min(n_zones, self.replication_factor) + } + } + } + + /// Check a cluster layout for internal consistency + /// (assignment, roles, parameters, partition size) + /// returns true if consistent, false if error + pub fn check(&self) -> Result<(), String> { + // Check that node_id_vec contains the correct list of nodes + let mut expected_nodes = self + .roles + .items() + .iter() + .filter(|(_, _, v)| v.0.is_some()) + .map(|(id, _, _)| *id) + .collect::>(); + expected_nodes.sort(); + let mut node_id_vec = self.node_id_vec.clone(); + node_id_vec.sort(); + if expected_nodes != node_id_vec { + return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes)); + } + + // Check that the assignment 
data has the correct length + let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor; + if self.ring_assignment_data.len() != expected_assignment_data_len { + return Err(format!( + "ring_assignment_data has incorrect length {} instead of {}", + self.ring_assignment_data.len(), + expected_assignment_data_len + )); + } + + // Check that the assigned nodes are correct identifiers + // of nodes that are assigned a role + // and that role is not the role of a gateway nodes + for x in self.ring_assignment_data.iter() { + if *x as usize >= self.node_id_vec.len() { + return Err(format!( + "ring_assignment_data contains invalid node id {}", + *x + )); + } + let node = self.node_id_vec[*x as usize]; + match self.roles.get(&node) { + Some(NodeRoleV(Some(x))) if x.capacity.is_some() => (), + _ => return Err("ring_assignment_data contains id of a gateway node".into()), + } + } + + // Check that every partition is associated to distinct nodes + let zone_redundancy = self.effective_zone_redundancy(); + let rf = self.replication_factor; + for p in 0..(1 << PARTITION_BITS) { + let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec(); + if nodes_of_p.iter().unique().count() != rf { + return Err(format!("partition does not contain {} unique node ids", rf)); + } + // Check that every partition is spread over at least zone_redundancy zones. 
+ let zones_of_p = nodes_of_p + .iter() + .map(|n| { + self.get_node_zone(&self.node_id_vec[*n as usize]) + .expect("Zone not found.") + }) + .collect::>(); + if zones_of_p.iter().unique().count() < zone_redundancy { + return Err(format!( + "nodes of partition are in less than {} distinct zones", + zone_redundancy + )); + } + } + + // Check that the nodes capacities is consistent with the stored partitions + let mut node_usage = vec![0; MAX_NODE_NUMBER]; + for n in self.ring_assignment_data.iter() { + node_usage[*n as usize] += 1; + } + for (n, usage) in node_usage.iter().enumerate() { + if *usage > 0 { + let uuid = self.node_id_vec[n]; + let partusage = usage * self.partition_size; + let nodecap = self.get_node_capacity(&uuid).unwrap(); + if partusage > nodecap { + return Err(format!( + "node usage ({}) is bigger than node capacity ({})", + usage * self.partition_size, + nodecap + )); + } + } + } + + // Check that the partition size stored is the one computed by the asignation + // algorithm. + let cl2 = self.clone(); + let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); + match cl2.compute_optimal_partition_size(&zone_to_id, zone_redundancy) { + Ok(s) if s != self.partition_size => { + return Err(format!( + "partition_size ({}) is different than optimal value ({})", + self.partition_size, s + )) + } + Err(e) => return Err(format!("could not calculate optimal partition size: {}", e)), + _ => (), + } + + Ok(()) + } + + // ================== updates to layout, internals =================== + + /// This function calculates a new partition-to-node assignment. + /// The computed assignment respects the node replication factor + /// and the zone redundancy parameter It maximizes the capacity of a + /// partition (assuming all partitions have the same size). + /// Among such optimal assignment, it minimizes the distance to + /// the former assignment (if any) to minimize the amount of + /// data to be moved. 
+ /// Staged role changes must be merged with nodes roles before calling this function, + /// hence it must only be called from apply_staged_changes() and hence is not public. + pub(crate) fn calculate_partition_assignment(&mut self) -> Result { + // We update the node ids, since the node role list might have changed with the + // changes in the layout. We retrieve the old_assignment reframed with new ids + let old_assignment_opt = self.update_node_id_vec()?; + + let zone_redundancy = self.effective_zone_redundancy(); + + let mut msg = Message::new(); + msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); + msg.push("".into()); + msg.push(format!( + "Partitions are \ + replicated {} times on at least {} distinct zones.", + self.replication_factor, zone_redundancy + )); + + // We generate for once numerical ids for the zones of non gateway nodes, + // to use them as indices in the flow graphs. + let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; + + let nb_nongateway_nodes = self.nongateway_nodes().len(); + if nb_nongateway_nodes < self.replication_factor { + return Err(Error::Message(format!( + "The number of nodes with positive \ + capacity ({}) is smaller than the replication factor ({}).", + nb_nongateway_nodes, self.replication_factor + ))); + } + if id_to_zone.len() < zone_redundancy { + return Err(Error::Message(format!( + "The number of zones with non-gateway \ + nodes ({}) is smaller than the redundancy parameter ({})", + id_to_zone.len(), + zone_redundancy + ))); + } + + // We compute the optimal partition size + // Capacities should be given in a unit so that partition size is at least 100. + // In this case, integer rounding plays a marginal role in the percentages of + // optimality. 
+ let partition_size = self.compute_optimal_partition_size(&zone_to_id, zone_redundancy)?; + + msg.push("".into()); + if old_assignment_opt.is_some() { + msg.push(format!( + "Optimal partition size: {} ({} in previous layout)", + ByteSize::b(partition_size).to_string_as(false), + ByteSize::b(self.partition_size).to_string_as(false) + )); + } else { + msg.push(format!( + "Optimal partition size: {}", + ByteSize::b(partition_size).to_string_as(false) + )); + } + // We write the partition size. + self.partition_size = partition_size; + + if partition_size < 100 { + msg.push( + "WARNING: The partition size is low (< 100), make sure the capacities of your nodes are correct and are of at least a few MB" + .into(), + ); + } + + // We compute a first flow/assignment that is heuristically close to the previous + // assignment + let mut gflow = + self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt, zone_redundancy)?; + if let Some(assoc) = &old_assignment_opt { + // We minimize the distance to the previous assignment. + self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; + } + + // We display statistics of the computation + msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?); + + // We update the layout structure + self.update_ring_from_flow(id_to_zone.len(), &gflow)?; + + if let Err(e) = self.check() { + return Err(Error::Message( + format!("Layout check returned an error: {}\nOriginal result of computation: <<<<\n{}\n>>>>", e, msg.join("\n")) + )); + } + + Ok(msg) + } + + /// The LwwMap of node roles might have changed. This function updates the node_id_vec + /// and returns the assignment given by ring, with the new indices of the nodes, and + /// None if the node is not present anymore. + /// We work with the assumption that only this function and calculate_new_assignment + /// do modify assignment_ring and node_id_vec. 
+ fn update_node_id_vec(&mut self) -> Result>>, Error> { + // (1) We compute the new node list + // Non gateway nodes should be coded on 8bits, hence they must be first in the list + // We build the new node ids + let new_non_gateway_nodes: Vec = self + .roles + .items() + .iter() + .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity.is_some())) + .map(|(k, _, _)| *k) + .collect(); + + if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { + return Err(Error::Message(format!( + "There are more than {} non-gateway nodes in the new \ + layout. This is not allowed.", + MAX_NODE_NUMBER + ))); + } + + let new_gateway_nodes: Vec = self + .roles + .items() + .iter() + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_none())) + .map(|(k, _, _)| *k) + .collect(); + + let mut new_node_id_vec = Vec::::new(); + new_node_id_vec.extend(new_non_gateway_nodes); + new_node_id_vec.extend(new_gateway_nodes); + + let old_node_id_vec = self.node_id_vec.clone(); + self.node_id_vec = new_node_id_vec.clone(); + + // (2) We retrieve the old association + // We rewrite the old association with the new indices. We only consider partition + // to node assignments where the node is still in use. + if self.ring_assignment_data.is_empty() { + // This is a new association + return Ok(None); + } + + if self.ring_assignment_data.len() != NB_PARTITIONS * self.replication_factor { + return Err(Error::Message( + "The old assignment does not have a size corresponding to \ + the old replication factor or the number of partitions." 
+ .into(), + )); + } + + // We build a translation table between the uuid and new ids + let mut uuid_to_new_id = HashMap::::new(); + + // We add the indices of only the new non-gateway nodes that can be used in the + // association ring + for (i, uuid) in new_node_id_vec.iter().enumerate() { + uuid_to_new_id.insert(*uuid, i); + } + + let mut old_assignment = vec![Vec::::new(); NB_PARTITIONS]; + let rf = self.replication_factor; + + for (p, old_assign_p) in old_assignment.iter_mut().enumerate() { + for old_id in &self.ring_assignment_data[p * rf..(p + 1) * rf] { + let uuid = old_node_id_vec[*old_id as usize]; + if uuid_to_new_id.contains_key(&uuid) { + old_assign_p.push(uuid_to_new_id[&uuid]); + } + } + } + + // We write the ring + self.ring_assignment_data = Vec::::new(); + + Ok(Some(old_assignment)) + } + + /// This function generates ids for the zone of the nodes appearing in + /// self.node_id_vec. + fn generate_nongateway_zone_ids(&self) -> Result<(Vec, HashMap), Error> { + let mut id_to_zone = Vec::::new(); + let mut zone_to_id = HashMap::::new(); + + for uuid in self.nongateway_nodes().iter() { + let r = self.node_role(uuid).unwrap(); + if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() { + zone_to_id.insert(r.zone.clone(), id_to_zone.len()); + id_to_zone.push(r.zone.clone()); + } + } + Ok((id_to_zone, zone_to_id)) + } + + /// This function computes by dichotomy the largest realizable partition size, given + /// the layout roles and parameters. + fn compute_optimal_partition_size( + &self, + zone_to_id: &HashMap, + zone_redundancy: usize, + ) -> Result { + let empty_set = HashSet::<(usize, usize)>::new(); + let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set, zone_redundancy)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { + return Err(Error::Message( + "The storage capacity of he cluster is to small. It is \ + impossible to store partitions of size 1." 
+ .into(), + )); + } + + let mut s_down = 1; + let mut s_up = self.get_total_capacity()?; + while s_down + 1 < s_up { + g = self.generate_flow_graph( + (s_down + s_up) / 2, + zone_to_id, + &empty_set, + zone_redundancy, + )?; + g.compute_maximal_flow()?; + if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { + s_up = (s_down + s_up) / 2; + } else { + s_down = (s_down + s_up) / 2; + } + } + + Ok(s_down) + } + + fn generate_graph_vertices(nb_zones: usize, nb_nodes: usize) -> Vec { + let mut vertices = vec![Vertex::Source, Vertex::Sink]; + for p in 0..NB_PARTITIONS { + vertices.push(Vertex::Pup(p)); + vertices.push(Vertex::Pdown(p)); + for z in 0..nb_zones { + vertices.push(Vertex::PZ(p, z)); + } + } + for n in 0..nb_nodes { + vertices.push(Vertex::N(n)); + } + vertices + } + + /// Generates the graph to compute the maximal flow corresponding to the optimal + /// partition assignment. + /// exclude_assoc is the set of (partition, node) association that we are forbidden + /// to use (hence we do not add the corresponding edge to the graph). This parameter + /// is used to compute a first flow that uses only edges appearing in the previous + /// assignment. This produces a solution that heuristically should be close to the + /// previous one. 
+ fn generate_flow_graph( + &self, + partition_size: u64, + zone_to_id: &HashMap, + exclude_assoc: &HashSet<(usize, usize)>, + zone_redundancy: usize, + ) -> Result, Error> { + let vertices = + LayoutVersion::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); + let mut g = Graph::::new(&vertices); + let nb_zones = zone_to_id.len(); + for p in 0..NB_PARTITIONS { + g.add_edge(Vertex::Source, Vertex::Pup(p), zone_redundancy as u64)?; + g.add_edge( + Vertex::Source, + Vertex::Pdown(p), + (self.replication_factor - zone_redundancy) as u64, + )?; + for z in 0..nb_zones { + g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; + g.add_edge( + Vertex::Pdown(p), + Vertex::PZ(p, z), + self.replication_factor as u64, + )?; + } + } + for n in 0..self.nongateway_nodes().len() { + let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; + let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; + g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; + for p in 0..NB_PARTITIONS { + if !exclude_assoc.contains(&(p, n)) { + g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; + } + } + } + Ok(g) + } + + /// This function computes a first optimal assignment (in the form of a flow graph). 
+ fn compute_candidate_assignment( + &self, + zone_to_id: &HashMap, + prev_assign_opt: &Option>>, + zone_redundancy: usize, + ) -> Result, Error> { + // We list the (partition,node) associations that are not used in the + // previous assignment + let mut exclude_edge = HashSet::<(usize, usize)>::new(); + if let Some(prev_assign) = prev_assign_opt { + let nb_nodes = self.nongateway_nodes().len(); + for (p, prev_assign_p) in prev_assign.iter().enumerate() { + for n in 0..nb_nodes { + exclude_edge.insert((p, n)); + } + for n in prev_assign_p.iter() { + exclude_edge.remove(&(p, *n)); + } + } + } + + // We compute the best flow using only the edges used in the previous assignment + let mut g = self.generate_flow_graph( + self.partition_size, + zone_to_id, + &exclude_edge, + zone_redundancy, + )?; + g.compute_maximal_flow()?; + + // We add the excluded edges and compute the maximal flow with the full graph. + // The algorithm is such that it will start with the flow that we just computed + // and find ameliorating paths from that. + for (p, n) in exclude_edge.iter() { + let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; + g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; + } + g.compute_maximal_flow()?; + Ok(g) + } + + /// This function updates the flow graph gflow to minimize the distance between + /// its corresponding assignment and the previous one + fn minimize_rebalance_load( + &self, + gflow: &mut Graph, + zone_to_id: &HashMap, + prev_assign: &[Vec], + ) -> Result<(), Error> { + // We define a cost function on the edges (pairs of vertices) corresponding + // to the distance between the two assignments. + let mut cost = CostFunction::new(); + for (p, assoc_p) in prev_assign.iter().enumerate() { + for n in assoc_p.iter() { + let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; + cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); + } + } + + // We compute the maximal length of a simple path in gflow. 
It is used in the + // Bellman-Ford algorithm in optimize_flow_with_cost to set the number + // of iterations. + let nb_nodes = self.nongateway_nodes().len(); + let path_length = 4 * nb_nodes; + gflow.optimize_flow_with_cost(&cost, path_length)?; + + Ok(()) + } + + /// This function updates the assignment ring from the flow graph. + fn update_ring_from_flow( + &mut self, + nb_zones: usize, + gflow: &Graph, + ) -> Result<(), Error> { + self.ring_assignment_data = Vec::::new(); + for p in 0..NB_PARTITIONS { + for z in 0..nb_zones { + let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; + for vertex in assoc_vertex.iter() { + if let Vertex::N(n) = vertex { + self.ring_assignment_data.push((*n).try_into().unwrap()); + } + } + } + } + + if self.ring_assignment_data.len() != NB_PARTITIONS * self.replication_factor { + return Err(Error::Message( + "Critical Error : the association ring we produced does not \ + have the right size." + .into(), + )); + } + Ok(()) + } + + /// This function returns a message summing up the partition repartition of the new + /// layout, and other statistics of the partition assignment computation. + fn output_stat( + &self, + gflow: &Graph, + prev_assign_opt: &Option>>, + zone_to_id: &HashMap, + id_to_zone: &[String], + ) -> Result { + let mut msg = Message::new(); + + let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; + let total_cap = self.get_total_capacity()?; + let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); + msg.push(format!( + "Usable capacity / total cluster capacity: {} / {} ({:.1} %)", + ByteSize::b(used_cap).to_string_as(false), + ByteSize::b(total_cap).to_string_as(false), + percent_cap + )); + msg.push(format!( + "Effective capacity (replication factor {}): {}", + self.replication_factor, + ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false) + )); + if percent_cap < 80. 
{ + msg.push("".into()); + msg.push( + "If the percentage is too low, it might be that the \ + cluster topology and redundancy constraints are forcing the use of nodes/zones with small \ + storage capacities." + .into(), + ); + msg.push( + "You might want to move storage capacity between zones or relax the redundancy constraint." + .into(), + ); + msg.push( + "See the detailed statistics below and look for saturated nodes/zones.".into(), + ); + } + + // We define and fill in the following tables + let storing_nodes = self.nongateway_nodes(); + let mut new_partitions = vec![0; storing_nodes.len()]; + let mut stored_partitions = vec![0; storing_nodes.len()]; + + let mut new_partitions_zone = vec![0; id_to_zone.len()]; + let mut stored_partitions_zone = vec![0; id_to_zone.len()]; + + for p in 0..NB_PARTITIONS { + for z in 0..id_to_zone.len() { + let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; + if !pz_nodes.is_empty() { + stored_partitions_zone[z] += 1; + if let Some(prev_assign) = prev_assign_opt { + let mut old_zones_of_p = Vec::::new(); + for n in prev_assign[p].iter() { + old_zones_of_p + .push(zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]); + } + if !old_zones_of_p.contains(&z) { + new_partitions_zone[z] += 1; + } + } + } + for vert in pz_nodes.iter() { + if let Vertex::N(n) = *vert { + stored_partitions[n] += 1; + if let Some(prev_assign) = prev_assign_opt { + if !prev_assign[p].contains(&n) { + new_partitions[n] += 1; + } + } + } + } + } + } + + if prev_assign_opt.is_none() { + new_partitions = stored_partitions.clone(); + //new_partitions_zone = stored_partitions_zone.clone(); + } + + // We display the statistics + + msg.push("".into()); + if prev_assign_opt.is_some() { + let total_new_partitions: usize = new_partitions.iter().sum(); + msg.push(format!( + "A total of {} new copies of partitions need to be \ + transferred.", + total_new_partitions + )); + msg.push("".into()); + } + + let mut table = vec![]; + for z in 
0..id_to_zone.len() { + let mut nodes_of_z = Vec::::new(); + for n in 0..storing_nodes.len() { + if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { + nodes_of_z.push(n); + } + } + let replicated_partitions: usize = + nodes_of_z.iter().map(|n| stored_partitions[*n]).sum(); + table.push(format!( + "{}\tTags\tPartitions\tCapacity\tUsable capacity", + id_to_zone[z] + )); + + let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; + let mut total_cap_z = 0; + for n in nodes_of_z.iter() { + total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; + } + let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); + + for n in nodes_of_z.iter() { + let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; + let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; + let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or(""))?.tags_string(); + table.push(format!( + " {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)", + self.node_id_vec[*n], + tags_n, + stored_partitions[*n], + new_partitions[*n], + ByteSize::b(total_cap_n).to_string_as(false), + ByteSize::b(available_cap_n).to_string_as(false), + (available_cap_n as f32) / (total_cap_n as f32) * 100.0, + )); + } + + table.push(format!( + " TOTAL\t\t{} ({} unique)\t{}\t{} ({:.1}%)", + replicated_partitions, + stored_partitions_zone[z], + //new_partitions_zone[z], + ByteSize::b(total_cap_z).to_string_as(false), + ByteSize::b(available_cap_z).to_string_as(false), + percent_cap_z + )); + table.push("".into()); + } + msg.push(format_table::format_table_to_string(table)); + + Ok(msg) + } +} + +// ==================================================================================== + +#[cfg(test)] +mod tests { + use super::{Error, *}; + use std::cmp::min; + + // This function checks that the partition size S computed is at least better than the + // one given by a very naive algorithm. 
To do so, we try to run the naive algorithm + // assuming a partion size of S+1. If we succed, it means that the optimal assignment + // was not optimal. The naive algorithm is the following : + // - we compute the max number of partitions associated to every node, capped at the + // partition number. It gives the number of tokens of every node. + // - every zone has a number of tokens equal to the sum of the tokens of its nodes. + // - we cycle over the partitions and associate zone tokens while respecting the + // zone redundancy constraint. + // NOTE: the naive algorithm is not optimal. Counter example: + // take nb_partition = 3 ; replication_factor = 5; redundancy = 4; + // number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) + // With these parameters, the naive algo fails, whereas there is a solution: + // (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) + fn check_against_naive(cl: &LayoutVersion) -> Result { + let over_size = cl.partition_size + 1; + let mut zone_token = HashMap::::new(); + + let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; + + if zones.is_empty() { + return Ok(false); + } + + for z in zones.iter() { + zone_token.insert(z.clone(), 0); + } + for uuid in cl.nongateway_nodes().iter() { + let z = cl.get_node_zone(uuid)?; + let c = cl.get_node_capacity(uuid)?; + zone_token.insert( + z.clone(), + zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), + ); + } + + // For every partition, we count the number of zone already associated and + // the name of the last zone associated + + let mut id_zone_token = vec![0; zones.len()]; + for (z, t) in zone_token.iter() { + id_zone_token[zone_to_id[z]] = *t; + } + + let mut nb_token = vec![0; NB_PARTITIONS]; + let mut last_zone = vec![zones.len(); NB_PARTITIONS]; + + let mut curr_zone = 0; + + let redundancy = cl.effective_zone_redundancy(); + + for replic in 0..cl.replication_factor { + for p in 0..NB_PARTITIONS { + while id_zone_token[curr_zone] == 0 + || (last_zone[p] == curr_zone 
+ && redundancy - nb_token[p] <= cl.replication_factor - replic) + { + curr_zone += 1; + if curr_zone >= zones.len() { + return Ok(true); + } + } + id_zone_token[curr_zone] -= 1; + if last_zone[p] != curr_zone { + nb_token[p] += 1; + last_zone[p] = curr_zone; + } + } + } + + return Ok(false); + } + + fn show_msg(msg: &Message) { + for s in msg.iter() { + println!("{}", s); + } + } + + fn update_layout( + cl: &mut LayoutVersion, + node_id_vec: &Vec, + node_capacity_vec: &Vec, + node_zone_vec: &Vec, + zone_redundancy: usize, + ) { + for i in 0..node_id_vec.len() { + if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { + cl.node_id_vec.push(x); + } + + let update = cl.staging_roles.update_mutator( + cl.node_id_vec[i], + NodeRoleV(Some(NodeRole { + zone: (node_zone_vec[i].to_string()), + capacity: (Some(node_capacity_vec[i])), + tags: (vec![]), + })), + ); + cl.staging_roles.merge(&update); + } + cl.staging_parameters.update(LayoutParameters { + zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), + }); + cl.staging_hash = cl.calculate_staging_hash(); + } + + #[test] + fn test_assignment() { + let mut node_id_vec = vec![1, 2, 3]; + let mut node_capacity_vec = vec![4000, 1000, 2000]; + let mut node_zone_vec = vec!["A", "B", "C"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + + let mut cl = LayoutVersion::new(3); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); + + node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; + node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; + node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); + let v = cl.version; + let (mut 
cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); + + node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); + + node_capacity_vec = vec![ + 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, + ]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); + let v = cl.version; + let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); + } +} diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 56bef2f3..3fdb4acd 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -26,7 +26,7 @@ use garage_util::data::*; use garage_util::error::Error; use garage_util::metrics::RecordDuration; -use crate::layout::ClusterLayout; +use crate::layout::LayoutHistory; use crate::metrics::RpcMetrics; // Default RPC timeout = 5 minutes @@ -91,7 +91,7 @@ pub struct RpcHelper(Arc); struct RpcHelperInner { our_node_id: Uuid, fullmesh: Arc, - layout_watch: watch::Receiver>, + layout_watch: watch::Receiver>, metrics: RpcMetrics, rpc_timeout: Duration, } @@ -100,7 +100,7 @@ impl RpcHelper { pub(crate) fn new( our_node_id: Uuid, fullmesh: Arc, - layout_watch: watch::Receiver>, + layout_watch: watch::Receiver>, rpc_timeout: Option, ) -> Self { let metrics = RpcMetrics::new(); @@ -392,8 +392,8 @@ impl RpcHelper { pub fn request_order(&self, nodes: &[Uuid]) -> Vec { // Retrieve some status variables that we will use to sort requests let peer_list = self.0.fullmesh.get_peer_list(); - let layout: Arc = 
self.0.layout_watch.borrow().clone(); - let our_zone = match layout.node_role(&self.0.our_node_id) { + let layout: Arc = self.0.layout_watch.borrow().clone(); + let our_zone = match layout.current().node_role(&self.0.our_node_id) { Some(pc) => &pc.zone, None => "", }; @@ -407,7 +407,7 @@ impl RpcHelper { let mut nodes = nodes .iter() .map(|to| { - let peer_zone = match layout.node_role(to) { + let peer_zone = match layout.current().node_role(to) { Some(pc) => &pc.zone, None => "", }; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 93144e39..86d724f1 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -64,7 +64,7 @@ pub enum SystemRpc { /// Exchanged with every node on a regular basis. AdvertiseStatus(NodeStatus), /// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout - AdvertiseClusterLayout(ClusterLayout), + AdvertiseClusterLayout(LayoutHistory), /// Get known nodes states GetKnownNodes, /// Return known nodes @@ -84,7 +84,7 @@ pub struct System { /// The id of this node pub id: Uuid, - persist_cluster_layout: Persister, + persist_cluster_layout: Persister, persist_peer_list: Persister, local_status: ArcSwap, @@ -112,8 +112,8 @@ pub struct System { replication_factor: usize, /// The layout - pub layout_watch: watch::Receiver>, - update_layout: Mutex>>, + pub layout_watch: watch::Receiver>, + update_layout: Mutex>>, /// Path to metadata directory pub metadata_dir: PathBuf, @@ -256,16 +256,16 @@ impl System { hex::encode(&node_key.public_key()[..8]) ); - let persist_cluster_layout: Persister = + let persist_cluster_layout: Persister = Persister::new(&config.metadata_dir, "cluster_layout"); let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list"); let cluster_layout = match persist_cluster_layout.load() { Ok(x) => { - if x.replication_factor != replication_factor { + if x.current().replication_factor != replication_factor { return Err(Error::Message(format!( "Prevous cluster layout has replication 
factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", - x.replication_factor, + x.current().replication_factor, replication_factor ))); } @@ -276,7 +276,7 @@ impl System { "No valid previous cluster layout stored ({}), starting fresh.", e ); - ClusterLayout::new(replication_factor) + LayoutHistory::new(replication_factor) } }; @@ -423,13 +423,13 @@ impl System { known_nodes } - pub fn cluster_layout(&self) -> watch::Ref> { + pub fn cluster_layout(&self) -> watch::Ref> { self.layout_watch.borrow() } pub async fn update_cluster_layout( self: &Arc, - layout: &ClusterLayout, + layout: &LayoutHistory, ) -> Result<(), Error> { self.handle_advertise_cluster_layout(layout).await?; Ok(()) @@ -475,7 +475,9 @@ impl System { .collect::>(); let connected_nodes = nodes.iter().filter(|(_, n)| n.is_up).count(); + // TODO: not only layout.current() let storage_nodes = layout + .current() .roles .items() .iter() @@ -486,11 +488,11 @@ impl System { .filter(|(x, _, _)| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) .count(); - let partitions = layout.partitions(); + let partitions = layout.current().partitions(); let partitions_n_up = partitions .iter() .map(|(_, h)| { - let pn = layout.nodes_of(h, layout.replication_factor); + let pn = layout.current().nodes_of(h, replication_factor); pn.iter() .filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) .count() @@ -581,7 +583,7 @@ impl System { /// Save network configuration to disc async fn save_cluster_layout(&self) -> Result<(), Error> { - let layout: Arc = self.layout_watch.borrow().clone(); + let layout: Arc = self.layout_watch.borrow().clone(); self.persist_cluster_layout .save_async(&layout) .await @@ -593,7 +595,7 @@ impl System { let mut new_si: NodeStatus = self.local_status.load().as_ref().clone(); let layout = self.layout_watch.borrow(); - 
new_si.cluster_layout_version = layout.version; + new_si.cluster_layout_version = layout.current().version; new_si.cluster_layout_staging_hash = layout.staging_hash; new_si.update_disk_usage(&self.metadata_dir, &self.data_dir, &self.metrics); @@ -648,12 +650,12 @@ impl System { async fn handle_advertise_cluster_layout( self: &Arc, - adv: &ClusterLayout, + adv: &LayoutHistory, ) -> Result { - if adv.replication_factor != self.replication_factor { + if adv.current().replication_factor != self.replication_factor { let msg = format!( "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.", - adv.replication_factor, + adv.current().replication_factor, self.replication_factor ); error!("{}", msg); @@ -662,7 +664,7 @@ impl System { let update_layout = self.update_layout.lock().await; // TODO: don't clone each time an AdvertiseClusterLayout is received - let mut layout: ClusterLayout = self.layout_watch.borrow().as_ref().clone(); + let mut layout: LayoutHistory = self.layout_watch.borrow().as_ref().clone(); let prev_layout_check = layout.check().is_ok(); if layout.merge(adv) { @@ -724,7 +726,7 @@ impl System { while !*stop_signal.borrow() { let not_configured = self.layout_watch.borrow().check().is_err(); let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor; - let expected_n_nodes = self.layout_watch.borrow().num_nodes(); + let expected_n_nodes = self.layout_watch.borrow().current().num_nodes(); let bad_peers = self .fullmesh .get_peer_list() @@ -863,13 +865,13 @@ impl EndpointHandler for System { } impl NodeStatus { - fn initial(replication_factor: usize, layout: &ClusterLayout) -> Self { + fn initial(replication_factor: usize, layout: &LayoutHistory) -> Self { NodeStatus { hostname: gethostname::gethostname() .into_string() .unwrap_or_else(|_| "".to_string()), replication_factor, - cluster_layout_version: layout.version, + 
cluster_layout_version: layout.current().version, cluster_layout_staging_hash: layout.staging_hash, meta_disk_avail: None, data_disk_avail: None, -- cgit v1.2.3 From 8dccee3ccfe7793c42203f28c1e91c6f989b6899 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 8 Nov 2023 19:28:36 +0100 Subject: cluster layout: adapt all uses of ClusterLayout to LayoutHistory --- src/api/admin/cluster.rs | 9 +++++---- src/api/k2v/index.rs | 9 ++++++--- src/garage/admin/mod.rs | 14 +++++++------- src/garage/cli/cmd.rs | 17 +++++++++++------ src/garage/cli/layout.rs | 38 +++++++++++++++++++++----------------- src/model/helper/bucket.rs | 9 ++++++--- src/model/index_counter.rs | 6 +++--- src/rpc/layout/history.rs | 17 +++++------------ src/rpc/layout/schema.rs | 5 ++--- src/table/replication/fullcopy.rs | 5 ++--- src/table/replication/sharded.rs | 16 ++++++++++------ src/table/sync.rs | 4 ++-- 12 files changed, 80 insertions(+), 69 deletions(-) diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 01ff9885..6dd2e8da 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -89,8 +89,9 @@ pub async fn handle_get_cluster_layout(garage: &Arc) -> Result GetClusterLayoutResponse { +fn format_cluster_layout(layout: &layout::LayoutHistory) -> GetClusterLayoutResponse { let roles = layout + .current() .roles .items() .iter() @@ -107,7 +108,7 @@ fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResp .staging_roles .items() .iter() - .filter(|(k, _, v)| layout.roles.get(k) != Some(v)) + .filter(|(k, _, v)| layout.current().roles.get(k) != Some(v)) .map(|(k, _, v)| match &v.0 { None => NodeRoleChange { id: hex::encode(k), @@ -125,7 +126,7 @@ fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResp .collect::>(); GetClusterLayoutResponse { - version: layout.version, + version: layout.current().version, roles, staged_role_changes, } @@ -209,7 +210,7 @@ pub async fn handle_update_cluster_layout( let mut layout = 
garage.system.cluster_layout().as_ref().clone(); - let mut roles = layout.roles.clone(); + let mut roles = layout.current().roles.clone(); roles.merge(&layout.staging_roles); for change in updates { diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs index 3fc39de6..a9bc3826 100644 --- a/src/api/k2v/index.rs +++ b/src/api/k2v/index.rs @@ -5,7 +5,7 @@ use serde::Serialize; use garage_util::data::*; -use garage_rpc::layout::ClusterLayout; +use garage_rpc::layout::LayoutHistory; use garage_table::util::*; use garage_model::garage::Garage; @@ -26,7 +26,7 @@ pub async fn handle_read_index( ) -> Result, Error> { let reverse = reverse.unwrap_or(false); - let layout: Arc = garage.system.cluster_layout().clone(); + let layout: Arc = garage.system.cluster_layout().clone(); let (partition_keys, more, next_start) = read_range( &garage.k2v.counter_table.table, @@ -35,7 +35,10 @@ pub async fn handle_read_index( &start, &end, limit, - Some((DeletedFilter::NotDeleted, layout.node_id_vec.clone())), + Some(( + DeletedFilter::NotDeleted, + layout.current().node_id_vec.clone(), + )), EnumerationOrder::from_reverse(reverse), ) .await?; diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index c3fa801a..e3ba6d35 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -127,7 +127,7 @@ impl AdminRpcHandler { let mut failures = vec![]; let layout = self.garage.system.cluster_layout().clone(); - for node in layout.node_ids().iter() { + for node in layout.current().node_ids().iter() { let node = (*node).into(); let resp = self .endpoint @@ -165,7 +165,7 @@ impl AdminRpcHandler { let mut ret = String::new(); let layout = self.garage.system.cluster_layout().clone(); - for node in layout.node_ids().iter() { + for node in layout.current().node_ids().iter() { let mut opt = opt.clone(); opt.all_nodes = false; opt.skip_global = true; @@ -277,8 +277,8 @@ impl AdminRpcHandler { // Gather storage node and free space statistics let layout = 
&self.garage.system.cluster_layout(); let mut node_partition_count = HashMap::::new(); - for short_id in layout.ring_assignment_data.iter() { - let id = layout.node_id_vec[*short_id as usize]; + for short_id in layout.current().ring_assignment_data.iter() { + let id = layout.current().node_id_vec[*short_id as usize]; *node_partition_count.entry(id).or_default() += 1; } let node_info = self @@ -293,7 +293,7 @@ impl AdminRpcHandler { for (id, parts) in node_partition_count.iter() { let info = node_info.get(id); let status = info.map(|x| &x.status); - let role = layout.roles.get(id).and_then(|x| x.0.as_ref()); + let role = layout.current().roles.get(id).and_then(|x| x.0.as_ref()); let hostname = status.map(|x| x.hostname.as_str()).unwrap_or("?"); let zone = role.map(|x| x.zone.as_str()).unwrap_or("?"); let capacity = role @@ -441,7 +441,7 @@ impl AdminRpcHandler { if all_nodes { let mut ret = vec![]; let layout = self.garage.system.cluster_layout().clone(); - for node in layout.node_ids().iter() { + for node in layout.current().node_ids().iter() { let node = (*node).into(); match self .endpoint @@ -489,7 +489,7 @@ impl AdminRpcHandler { if all_nodes { let mut ret = vec![]; let layout = self.garage.system.cluster_layout().clone(); - for node in layout.node_ids().iter() { + for node in layout.current().node_ids().iter() { let node = (*node).into(); match self .endpoint diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 48359614..8be43873 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -62,7 +62,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let mut healthy_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; for adv in status.iter().filter(|adv| adv.is_up) { - match layout.roles.get(&adv.id) { + match layout.current().roles.get(&adv.id) { Some(NodeRoleV(Some(cfg))) => { let data_avail = match &adv.status.data_disk_avail { _ if cfg.capacity.is_none() => "N/A".into(), @@ -102,10 +102,15 
@@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> format_table(healthy_nodes); let status_keys = status.iter().map(|adv| adv.id).collect::>(); - let failure_case_1 = status - .iter() - .any(|adv| !adv.is_up && matches!(layout.roles.get(&adv.id), Some(NodeRoleV(Some(_))))); + let failure_case_1 = status.iter().any(|adv| { + !adv.is_up + && matches!( + layout.current().roles.get(&adv.id), + Some(NodeRoleV(Some(_))) + ) + }); let failure_case_2 = layout + .current() .roles .items() .iter() @@ -115,7 +120,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let mut failed_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; for adv in status.iter().filter(|adv| !adv.is_up) { - if let Some(NodeRoleV(Some(cfg))) = layout.roles.get(&adv.id) { + if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { let tf = timeago::Formatter::new(); failed_nodes.push(format!( "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", @@ -132,7 +137,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> )); } } - for (id, _, role_v) in layout.roles.items().iter() { + for (id, _, role_v) in layout.current().roles.items().iter() { if let NodeRoleV(Some(cfg)) = role_v { if !status_keys.contains(id) { failed_nodes.push(format!( diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index ce2b11e0..4a617337 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -58,17 +58,18 @@ pub async fn cmd_assign_role( status .iter() .map(|adv| adv.id) - .chain(layout.node_ids().iter().cloned()), + .chain(layout.current().node_ids().iter().cloned()), node_id, ) }) .collect::, _>>()?; - let mut roles = layout.roles.clone(); + let mut roles = layout.current().roles.clone(); roles.merge(&layout.staging_roles); for replaced in args.replace.iter() { - let replaced_node = find_matching_node(layout.node_ids().iter().cloned(), replaced)?; + let replaced_node = + 
find_matching_node(layout.current().node_ids().iter().cloned(), replaced)?; match roles.get(&replaced_node) { Some(NodeRoleV(Some(_))) => { layout @@ -149,7 +150,7 @@ pub async fn cmd_remove_role( ) -> Result<(), Error> { let mut layout = fetch_layout(rpc_cli, rpc_host).await?; - let mut roles = layout.roles.clone(); + let mut roles = layout.current().roles.clone(); roles.merge(&layout.staging_roles); let deleted_node = @@ -174,13 +175,16 @@ pub async fn cmd_show_layout( let layout = fetch_layout(rpc_cli, rpc_host).await?; println!("==== CURRENT CLUSTER LAYOUT ===="); - print_cluster_layout(&layout, "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes."); + print_cluster_layout(layout.current(), "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes."); println!(); - println!("Current cluster layout version: {}", layout.version); + println!( + "Current cluster layout version: {}", + layout.current().version + ); let has_role_changes = print_staging_role_changes(&layout); if has_role_changes { - let v = layout.version; + let v = layout.current().version; let res_apply = layout.apply_staged_changes(Some(v + 1)); // this will print the stats of what partitions @@ -189,7 +193,7 @@ pub async fn cmd_show_layout( Ok((layout, msg)) => { println!(); println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ===="); - print_cluster_layout(&layout, "No nodes have a role in the new layout."); + print_cluster_layout(layout.current(), "No nodes have a role in the new layout."); println!(); for line in msg.iter() { @@ -266,11 +270,11 @@ pub async fn cmd_config_layout( .parse::() .ok_or_message("invalid zone redundancy value")?; if let ZoneRedundancy::AtLeast(r_int) = r { - if r_int > layout.replication_factor { + if r_int > layout.current().replication_factor { return Err(Error::Message(format!( "The zone redundancy must be smaller or equal to the \ replication factor ({}).", - layout.replication_factor + 
layout.current().replication_factor ))); } else if r_int < 1 { return Err(Error::Message( @@ -302,7 +306,7 @@ pub async fn cmd_config_layout( pub async fn fetch_layout( rpc_cli: &Endpoint, rpc_host: NodeID, -) -> Result { +) -> Result { match rpc_cli .call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL) .await?? @@ -315,7 +319,7 @@ pub async fn fetch_layout( pub async fn send_layout( rpc_cli: &Endpoint, rpc_host: NodeID, - layout: ClusterLayout, + layout: LayoutHistory, ) -> Result<(), Error> { rpc_cli .call( @@ -327,7 +331,7 @@ pub async fn send_layout( Ok(()) } -pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) { +pub fn print_cluster_layout(layout: &LayoutVersion, empty_msg: &str) { let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()]; for (id, _, role) in layout.roles.items().iter() { let role = match &role.0 { @@ -366,13 +370,13 @@ pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) { } } -pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { +pub fn print_staging_role_changes(layout: &LayoutHistory) -> bool { let has_role_changes = layout .staging_roles .items() .iter() - .any(|(k, _, v)| layout.roles.get(k) != Some(v)); - let has_layout_changes = *layout.staging_parameters.get() != layout.parameters; + .any(|(k, _, v)| layout.current().roles.get(k) != Some(v)); + let has_layout_changes = *layout.staging_parameters.get() != layout.current().parameters; if has_role_changes || has_layout_changes { println!(); @@ -380,7 +384,7 @@ pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { if has_role_changes { let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; for (id, _, role) in layout.staging_roles.items().iter() { - if layout.roles.get(id) == Some(role) { + if layout.current().roles.get(id) == Some(role) { continue; } if let Some(role) = &role.0 { diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index 8cd5b27b..18904c8d 100644 --- 
a/src/model/helper/bucket.rs +++ b/src/model/helper/bucket.rs @@ -450,10 +450,10 @@ impl<'a> BucketHelper<'a> { #[cfg(feature = "k2v")] { - use garage_rpc::layout::ClusterLayout; + use garage_rpc::layout::LayoutHistory; use std::sync::Arc; - let layout: Arc = self.0.system.cluster_layout().clone(); + let layout: Arc = self.0.system.cluster_layout().clone(); let k2vindexes = self .0 .k2v @@ -462,7 +462,10 @@ impl<'a> BucketHelper<'a> { .get_range( &bucket_id, None, - Some((DeletedFilter::NotDeleted, layout.node_id_vec.clone())), + Some(( + DeletedFilter::NotDeleted, + layout.current().node_id_vec.clone(), + )), 10, EnumerationOrder::Forward, ) diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs index d514cb06..9637cc4c 100644 --- a/src/model/index_counter.rs +++ b/src/model/index_counter.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use garage_db as db; -use garage_rpc::layout::ClusterLayout; +use garage_rpc::layout::LayoutHistory; use garage_rpc::system::System; use garage_util::background::BackgroundRunner; use garage_util::data::*; @@ -83,8 +83,8 @@ impl Entry for CounterEntry { } impl CounterEntry { - pub fn filtered_values(&self, layout: &ClusterLayout) -> HashMap { - let nodes = &layout.node_id_vec[..]; + pub fn filtered_values(&self, layout: &LayoutHistory) -> HashMap { + let nodes = &layout.current().node_id_vec[..]; self.filtered_values_with_nodes(nodes) } diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index b3019f58..e59c9e9c 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -1,5 +1,4 @@ use std::cmp::Ordering; -use std::sync::Arc; use garage_util::crdt::{Crdt, Lww, LwwMap}; use garage_util::data::*; @@ -64,24 +63,22 @@ impl LayoutHistory { } // Add any new versions to history - let mut versions = self.versions.to_vec(); for v2 in other.versions.iter() { - if let Some(v1) = versions.iter().find(|v| v.version == v2.version) { + if let Some(v1) = self.versions.iter().find(|v| v.version == 
v2.version) { if v1 != v2 { error!("Inconsistent layout histories: different layout compositions for version {}. Your cluster will be broken as long as this layout version is not replaced.", v2.version); } - } else if versions.iter().all(|v| v.version != v2.version - 1) { + } else if self.versions.iter().all(|v| v.version != v2.version - 1) { error!( "Cannot receive new layout version {}, version {} is missing", v2.version, v2.version - 1 ); } else { - versions.push(v2.clone()); + self.versions.push(v2.clone()); changed = true; } } - self.versions = Arc::from(versions.into_boxed_slice()); // Merge trackers self.update_trackers.merge(&other.update_trackers); @@ -117,9 +114,7 @@ To know the correct value of the new layout version, invoke `garage layout show` let msg = new_version.calculate_partition_assignment()?; - let mut versions = self.versions.to_vec(); - versions.push(new_version); - self.versions = Arc::from(versions.into_boxed_slice()); + self.versions.push(new_version); Ok((self, msg)) } @@ -149,9 +144,7 @@ To know the correct value of the new layout version, invoke `garage layout show` let mut new_version = self.current().clone(); new_version.version += 1; - let mut versions = self.versions.to_vec(); - versions.push(new_version); - self.versions = Arc::from(versions.into_boxed_slice()); + self.versions.push(new_version); Ok(self) } diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index fa0822fa..14e797be 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -184,7 +184,6 @@ mod v010 { use garage_util::data::{Hash, Uuid}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; - use std::sync::Arc; pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; /// The layout of the cluster, i.e. 
the list of roles @@ -215,7 +214,7 @@ mod v010 { #[derive(Clone, Debug, Serialize, Deserialize)] pub struct LayoutHistory { /// The versions currently in use in the cluster - pub versions: Arc<[LayoutVersion]>, + pub versions: Vec, /// Update trackers pub update_trackers: UpdateTrackers, @@ -267,7 +266,7 @@ mod v010 { .collect::>(), ); let mut ret = Self { - versions: Arc::from(vec![version].into_boxed_slice()), + versions: vec![version], update_trackers: UpdateTrackers { ack_map: update_tracker.clone(), sync_map: update_tracker.clone(), diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index 34807e3d..a5c83d0f 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -27,11 +27,10 @@ impl TableReplication for TableFullReplication { } fn write_nodes(&self, _hash: &Hash) -> Vec { - let layout = self.system.cluster_layout(); - layout.node_ids().to_vec() + self.system.cluster_layout().current().node_ids().to_vec() } fn write_quorum(&self) -> usize { - let nmembers = self.system.cluster_layout().node_ids().len(); + let nmembers = self.system.cluster_layout().current().node_ids().len(); if nmembers > self.max_faults { nmembers - self.max_faults } else { diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 60c95cb4..793d87fd 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -26,16 +26,20 @@ pub struct TableShardedReplication { impl TableReplication for TableShardedReplication { fn read_nodes(&self, hash: &Hash) -> Vec { - let layout = self.system.cluster_layout(); - layout.nodes_of(hash, self.replication_factor) + self.system + .cluster_layout() + .current() + .nodes_of(hash, self.replication_factor) } fn read_quorum(&self) -> usize { self.read_quorum } fn write_nodes(&self, hash: &Hash) -> Vec { - let layout = self.system.cluster_layout(); - layout.nodes_of(hash, self.replication_factor) + self.system + .cluster_layout() + .current() + 
.nodes_of(hash, self.replication_factor) } fn write_quorum(&self) -> usize { self.write_quorum @@ -45,9 +49,9 @@ impl TableReplication for TableShardedReplication { } fn partition_of(&self, hash: &Hash) -> Partition { - self.system.cluster_layout().partition_of(hash) + self.system.cluster_layout().current().partition_of(hash) } fn partitions(&self) -> Vec<(Partition, Hash)> { - self.system.cluster_layout().partitions() + self.system.cluster_layout().current().partitions() } } diff --git a/src/table/sync.rs b/src/table/sync.rs index 65eff7cd..620d83b9 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -492,8 +492,8 @@ impl EndpointHandler for TableSync struct SyncWorker { syncer: Arc>, - layout_watch: watch::Receiver>, - layout: Arc, + layout_watch: watch::Receiver>, + layout: Arc, add_full_sync_rx: mpsc::UnboundedReceiver<()>, todo: Vec, next_full_sync: Instant, -- cgit v1.2.3 From 1da0a5676edcd20fc5c7412596edb5772da9f606 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 8 Nov 2023 19:30:58 +0100 Subject: bump garage protocol version tag to 0x000A (0.10) --- src/rpc/system.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 86d724f1..a7433b68 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -46,7 +46,7 @@ const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10); /// Version tag used for version check upon Netapp connection. /// Cluster nodes with different version tags are deemed /// incompatible and will refuse to connect. 
-pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650008; // garage 0x0008 +pub const GARAGE_VERSION_TAG: u64 = 0x676172616765000A; // garage 0x000A /// RPC endpoint used for calls related to membership pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc"; -- cgit v1.2.3 From 523d2ecb9511f74e144cd116b942d6c1bf0f546d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 11:19:43 +0100 Subject: layout: use separate CRDT for staged layout changes --- src/api/admin/api_server.rs | 2 +- src/api/admin/cluster.rs | 23 ++- src/garage/cli/cmd.rs | 2 +- src/garage/cli/layout.rs | 47 +++-- src/garage/cli/structs.rs | 6 +- src/rpc/graph_algo.rs | 415 ------------------------------------------- src/rpc/layout/graph_algo.rs | 405 +++++++++++++++++++++++++++++++++++++++++ src/rpc/layout/history.rs | 82 +++------ src/rpc/layout/mod.rs | 4 +- src/rpc/layout/schema.rs | 106 ++++++++++- src/rpc/layout/tracker.rs | 21 --- src/rpc/layout/version.rs | 54 +----- src/rpc/lib.rs | 1 - 13 files changed, 580 insertions(+), 588 deletions(-) delete mode 100644 src/rpc/graph_algo.rs create mode 100644 src/rpc/layout/graph_algo.rs delete mode 100644 src/rpc/layout/tracker.rs diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs index 4779f924..d9bd600e 100644 --- a/src/api/admin/api_server.rs +++ b/src/api/admin/api_server.rs @@ -279,7 +279,7 @@ impl ApiHandler for AdminApiServer { Endpoint::GetClusterLayout => handle_get_cluster_layout(&self.garage).await, Endpoint::UpdateClusterLayout => handle_update_cluster_layout(&self.garage, req).await, Endpoint::ApplyClusterLayout => handle_apply_cluster_layout(&self.garage, req).await, - Endpoint::RevertClusterLayout => handle_revert_cluster_layout(&self.garage, req).await, + Endpoint::RevertClusterLayout => handle_revert_cluster_layout(&self.garage).await, // Keys Endpoint::ListKeys => handle_list_keys(&self.garage).await, Endpoint::GetKeyInfo { diff --git a/src/api/admin/cluster.rs 
b/src/api/admin/cluster.rs index 6dd2e8da..fe8e8764 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -105,7 +105,9 @@ fn format_cluster_layout(layout: &layout::LayoutHistory) -> GetClusterLayoutResp .collect::>(); let staged_role_changes = layout - .staging_roles + .staging + .get() + .roles .items() .iter() .filter(|(k, _, v)| layout.current().roles.get(k) != Some(v)) @@ -211,7 +213,7 @@ pub async fn handle_update_cluster_layout( let mut layout = garage.system.cluster_layout().as_ref().clone(); let mut roles = layout.current().roles.clone(); - roles.merge(&layout.staging_roles); + roles.merge(&layout.staging.get().roles); for change in updates { let node = hex::decode(&change.id).ok_or_bad_request("Invalid node identifier")?; @@ -232,7 +234,9 @@ pub async fn handle_update_cluster_layout( }; layout - .staging_roles + .staging + .get_mut() + .roles .merge(&roles.update_mutator(node, layout::NodeRoleV(new_role))); } @@ -246,7 +250,7 @@ pub async fn handle_apply_cluster_layout( garage: &Arc, req: Request, ) -> Result, Error> { - let param = parse_json_body::(req).await?; + let param = parse_json_body::(req).await?; let layout = garage.system.cluster_layout().as_ref().clone(); let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; @@ -260,14 +264,9 @@ pub async fn handle_apply_cluster_layout( Ok(json_ok_response(&res)?) 
} -pub async fn handle_revert_cluster_layout( - garage: &Arc, - req: Request, -) -> Result, Error> { - let param = parse_json_body::(req).await?; - +pub async fn handle_revert_cluster_layout(garage: &Arc) -> Result, Error> { let layout = garage.system.cluster_layout().as_ref().clone(); - let layout = layout.revert_staged_changes(Some(param.version))?; + let layout = layout.revert_staged_changes()?; garage.system.update_cluster_layout(&layout).await?; let res = format_cluster_layout(&layout); @@ -280,7 +279,7 @@ type UpdateClusterLayoutRequest = Vec; #[derive(Deserialize)] #[serde(rename_all = "camelCase")] -struct ApplyRevertLayoutRequest { +struct ApplyLayoutRequest { version: u64, } diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 8be43873..1a054025 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -85,7 +85,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> )); } _ => { - let new_role = match layout.staging_roles.get(&adv.id) { + let new_role = match layout.staging.get().roles.get(&adv.id) { Some(NodeRoleV(Some(_))) => "(pending)", _ => "NO ROLE ASSIGNED", }; diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 4a617337..269d92f4 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -65,7 +65,7 @@ pub async fn cmd_assign_role( .collect::, _>>()?; let mut roles = layout.current().roles.clone(); - roles.merge(&layout.staging_roles); + roles.merge(&layout.staging.get().roles); for replaced in args.replace.iter() { let replaced_node = @@ -73,7 +73,9 @@ pub async fn cmd_assign_role( match roles.get(&replaced_node) { Some(NodeRoleV(Some(_))) => { layout - .staging_roles + .staging + .get_mut() + .roles .merge(&roles.update_mutator(replaced_node, NodeRoleV(None))); } _ => { @@ -131,7 +133,9 @@ pub async fn cmd_assign_role( }; layout - .staging_roles + .staging + .get_mut() + .roles .merge(&roles.update_mutator(added_node, NodeRoleV(Some(new_entry)))); } @@ -151,13 +155,15 @@ pub 
async fn cmd_remove_role( let mut layout = fetch_layout(rpc_cli, rpc_host).await?; let mut roles = layout.current().roles.clone(); - roles.merge(&layout.staging_roles); + roles.merge(&layout.staging.get().roles); let deleted_node = find_matching_node(roles.items().iter().map(|(id, _, _)| *id), &args.node_id)?; layout - .staging_roles + .staging + .get_mut() + .roles .merge(&roles.update_mutator(deleted_node, NodeRoleV(None))); send_layout(rpc_cli, rpc_host, layout).await?; @@ -203,16 +209,12 @@ pub async fn cmd_show_layout( println!(); println!(" garage layout apply --version {}", v + 1); println!(); - println!( - "You can also revert all proposed changes with: garage layout revert --version {}", - v + 1) + println!("You can also revert all proposed changes with: garage layout revert"); } Err(e) => { println!("Error while trying to compute the assignment: {}", e); println!("This new layout cannot yet be applied."); - println!( - "You can also revert all proposed changes with: garage layout revert --version {}", - v + 1) + println!("You can also revert all proposed changes with: garage layout revert"); } } } @@ -245,9 +247,15 @@ pub async fn cmd_revert_layout( rpc_host: NodeID, revert_opt: RevertLayoutOpt, ) -> Result<(), Error> { + if !revert_opt.yes { + return Err(Error::Message( + "Please add the --yes flag to run the layout revert operation".into(), + )); + } + let layout = fetch_layout(rpc_cli, rpc_host).await?; - let layout = layout.revert_staged_changes(revert_opt.version)?; + let layout = layout.revert_staged_changes()?; send_layout(rpc_cli, rpc_host, layout).await?; @@ -284,7 +292,9 @@ pub async fn cmd_config_layout( } layout - .staging_parameters + .staging + .get_mut() + .parameters .update(LayoutParameters { zone_redundancy: r }); println!("The zone redundancy parameter has been set to '{}'.", r); did_something = true; @@ -371,19 +381,20 @@ pub fn print_cluster_layout(layout: &LayoutVersion, empty_msg: &str) { } pub fn print_staging_role_changes(layout: 
&LayoutHistory) -> bool { - let has_role_changes = layout - .staging_roles + let staging = layout.staging.get(); + let has_role_changes = staging + .roles .items() .iter() .any(|(k, _, v)| layout.current().roles.get(k) != Some(v)); - let has_layout_changes = *layout.staging_parameters.get() != layout.current().parameters; + let has_layout_changes = *staging.parameters.get() != layout.current().parameters; if has_role_changes || has_layout_changes { println!(); println!("==== STAGED ROLE CHANGES ===="); if has_role_changes { let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; - for (id, _, role) in layout.staging_roles.items().iter() { + for (id, _, role) in staging.roles.items().iter() { if layout.current().roles.get(id) == Some(role) { continue; } @@ -406,7 +417,7 @@ pub fn print_staging_role_changes(layout: &LayoutHistory) -> bool { if has_layout_changes { println!( "Zone redundancy: {}", - layout.staging_parameters.get().zone_redundancy + staging.parameters.get().zone_redundancy ); } true diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index aba57551..3badc447 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -164,9 +164,9 @@ pub struct ApplyLayoutOpt { #[derive(StructOpt, Debug)] pub struct RevertLayoutOpt { - /// Version number of old configuration to which to revert - #[structopt(long = "version")] - pub(crate) version: Option, + /// The revert operation will not be ran unless this flag is added + #[structopt(long = "yes")] + pub(crate) yes: bool, } #[derive(Serialize, Deserialize, StructOpt, Debug)] diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs deleted file mode 100644 index d8c6c9b9..00000000 --- a/src/rpc/graph_algo.rs +++ /dev/null @@ -1,415 +0,0 @@ -//! This module deals with graph algorithms. -//! It is used in layout.rs to build the partition to node assignment. 
- -use rand::prelude::{SeedableRng, SliceRandom}; -use std::cmp::{max, min}; -use std::collections::HashMap; -use std::collections::VecDeque; - -/// Vertex data structures used in all the graphs used in layout.rs. -/// usize parameters correspond to node/zone/partitions ids. -/// To understand the vertex roles below, please refer to the formal description -/// of the layout computation algorithm. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub enum Vertex { - Source, - Pup(usize), // The vertex p+ of partition p - Pdown(usize), // The vertex p- of partition p - PZ(usize, usize), // The vertex corresponding to x_(partition p, zone z) - N(usize), // The vertex corresponding to node n - Sink, -} - -/// Edge data structure for the flow algorithm. -#[derive(Clone, Copy, Debug)] -pub struct FlowEdge { - cap: u64, // flow maximal capacity of the edge - flow: i64, // flow value on the edge - dest: usize, // destination vertex id - rev: usize, // index of the reversed edge (v, self) in the edge list of vertex v -} - -/// Edge data structure for the detection of negative cycles. -#[derive(Clone, Copy, Debug)] -pub struct WeightedEdge { - w: i64, // weight of the edge - dest: usize, -} - -pub trait Edge: Clone + Copy {} -impl Edge for FlowEdge {} -impl Edge for WeightedEdge {} - -/// Struct for the graph structure. We do encapsulation here to be able to both -/// provide user friendly Vertex enum to address vertices, and to use internally usize -/// indices and Vec instead of HashMap in the graph algorithm to optimize execution speed. 
-pub struct Graph { - vertex_to_id: HashMap, - id_to_vertex: Vec, - - // The graph is stored as an adjacency list - graph: Vec>, -} - -pub type CostFunction = HashMap<(Vertex, Vertex), i64>; - -impl Graph { - pub fn new(vertices: &[Vertex]) -> Self { - let mut map = HashMap::::new(); - for (i, vert) in vertices.iter().enumerate() { - map.insert(*vert, i); - } - Graph:: { - vertex_to_id: map, - id_to_vertex: vertices.to_vec(), - graph: vec![Vec::::new(); vertices.len()], - } - } - - fn get_vertex_id(&self, v: &Vertex) -> Result { - self.vertex_to_id - .get(v) - .cloned() - .ok_or_else(|| format!("The graph does not contain vertex {:?}", v)) - } -} - -impl Graph { - /// This function adds a directed edge to the graph with capacity c, and the - /// corresponding reversed edge with capacity 0. - pub fn add_edge(&mut self, u: Vertex, v: Vertex, c: u64) -> Result<(), String> { - let idu = self.get_vertex_id(&u)?; - let idv = self.get_vertex_id(&v)?; - if idu == idv { - return Err("Cannot add edge from vertex to itself in flow graph".into()); - } - - let rev_u = self.graph[idu].len(); - let rev_v = self.graph[idv].len(); - self.graph[idu].push(FlowEdge { - cap: c, - dest: idv, - flow: 0, - rev: rev_v, - }); - self.graph[idv].push(FlowEdge { - cap: 0, - dest: idu, - flow: 0, - rev: rev_u, - }); - Ok(()) - } - - /// This function returns the list of vertices that receive a positive flow from - /// vertex v. - pub fn get_positive_flow_from(&self, v: Vertex) -> Result, String> { - let idv = self.get_vertex_id(&v)?; - let mut result = Vec::::new(); - for edge in self.graph[idv].iter() { - if edge.flow > 0 { - result.push(self.id_to_vertex[edge.dest]); - } - } - Ok(result) - } - - /// This function returns the value of the flow incoming to v. 
- pub fn get_inflow(&self, v: Vertex) -> Result { - let idv = self.get_vertex_id(&v)?; - let mut result = 0; - for edge in self.graph[idv].iter() { - result += max(0, self.graph[edge.dest][edge.rev].flow); - } - Ok(result) - } - - /// This function returns the value of the flow outgoing from v. - pub fn get_outflow(&self, v: Vertex) -> Result { - let idv = self.get_vertex_id(&v)?; - let mut result = 0; - for edge in self.graph[idv].iter() { - result += max(0, edge.flow); - } - Ok(result) - } - - /// This function computes the flow total value by computing the outgoing flow - /// from the source. - pub fn get_flow_value(&mut self) -> Result { - self.get_outflow(Vertex::Source) - } - - /// This function shuffles the order of the edge lists. It keeps the ids of the - /// reversed edges consistent. - fn shuffle_edges(&mut self) { - // We use deterministic randomness so that the layout calculation algorihtm - // will output the same thing every time it is run. This way, the results - // pre-calculated in `garage layout show` will match exactly those used - // in practice with `garage layout apply` - let mut rng = rand::rngs::StdRng::from_seed([0x12u8; 32]); - for i in 0..self.graph.len() { - self.graph[i].shuffle(&mut rng); - // We need to update the ids of the reverse edges. - for j in 0..self.graph[i].len() { - let target_v = self.graph[i][j].dest; - let target_rev = self.graph[i][j].rev; - self.graph[target_v][target_rev].rev = j; - } - } - } - - /// Computes an upper bound of the flow on the graph - pub fn flow_upper_bound(&self) -> Result { - let idsource = self.get_vertex_id(&Vertex::Source)?; - let mut flow_upper_bound = 0; - for edge in self.graph[idsource].iter() { - flow_upper_bound += edge.cap; - } - Ok(flow_upper_bound) - } - - /// This function computes the maximal flow using Dinic's algorithm. It starts with - /// the flow values already present in the graph. 
So it is possible to add some edge to - /// the graph, compute a flow, add other edges, update the flow. - pub fn compute_maximal_flow(&mut self) -> Result<(), String> { - let idsource = self.get_vertex_id(&Vertex::Source)?; - let idsink = self.get_vertex_id(&Vertex::Sink)?; - - let nb_vertices = self.graph.len(); - - let flow_upper_bound = self.flow_upper_bound()?; - - // To ensure the dispersion of the associations generated by the - // assignment, we shuffle the neighbours of the nodes. Hence, - // the vertices do not consider their neighbours in the same order. - self.shuffle_edges(); - - // We run Dinic's max flow algorithm - loop { - // We build the level array from Dinic's algorithm. - let mut level = vec![None; nb_vertices]; - - let mut fifo = VecDeque::new(); - fifo.push_back((idsource, 0)); - while let Some((id, lvl)) = fifo.pop_front() { - if level[id].is_none() { - // it means id has not yet been reached - level[id] = Some(lvl); - for edge in self.graph[id].iter() { - if edge.cap as i64 - edge.flow > 0 { - fifo.push_back((edge.dest, lvl + 1)); - } - } - } - } - if level[idsink].is_none() { - // There is no residual flow - break; - } - // Now we run DFS respecting the level array - let mut next_nbd = vec![0; nb_vertices]; - let mut lifo = Vec::new(); - - lifo.push((idsource, flow_upper_bound)); - - while let Some((id, f)) = lifo.last().cloned() { - if id == idsink { - // The DFS reached the sink, we can add a - // residual flow. 
- lifo.pop(); - while let Some((id, _)) = lifo.pop() { - let nbd = next_nbd[id]; - self.graph[id][nbd].flow += f as i64; - let id_rev = self.graph[id][nbd].dest; - let nbd_rev = self.graph[id][nbd].rev; - self.graph[id_rev][nbd_rev].flow -= f as i64; - } - lifo.push((idsource, flow_upper_bound)); - continue; - } - // else we did not reach the sink - let nbd = next_nbd[id]; - if nbd >= self.graph[id].len() { - // There is nothing to explore from id anymore - lifo.pop(); - if let Some((parent, _)) = lifo.last() { - next_nbd[*parent] += 1; - } - continue; - } - // else we can try to send flow from id to its nbd - let new_flow = min( - f as i64, - self.graph[id][nbd].cap as i64 - self.graph[id][nbd].flow, - ) as u64; - if new_flow == 0 { - next_nbd[id] += 1; - continue; - } - if let (Some(lvldest), Some(lvlid)) = (level[self.graph[id][nbd].dest], level[id]) { - if lvldest <= lvlid { - // We cannot send flow to nbd. - next_nbd[id] += 1; - continue; - } - } - // otherwise, we send flow to nbd. - lifo.push((self.graph[id][nbd].dest, new_flow)); - } - } - Ok(()) - } - - /// This function takes a flow, and a cost function on the edges, and tries to find an - /// equivalent flow with a better cost, by finding improving overflow cycles. It uses - /// as subroutine the Bellman Ford algorithm run up to path_length. - /// We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and - /// only one needs to be present in the cost function. 
- pub fn optimize_flow_with_cost( - &mut self, - cost: &CostFunction, - path_length: usize, - ) -> Result<(), String> { - // We build the weighted graph g where we will look for negative cycle - let mut gf = self.build_cost_graph(cost)?; - let mut cycles = gf.list_negative_cycles(path_length); - while !cycles.is_empty() { - // we enumerate negative cycles - for c in cycles.iter() { - for i in 0..c.len() { - // We add one flow unit to the edge (u,v) of cycle c - let idu = self.vertex_to_id[&c[i]]; - let idv = self.vertex_to_id[&c[(i + 1) % c.len()]]; - for j in 0..self.graph[idu].len() { - // since idu appears at most once in the cycles, we enumerate every - // edge at most once. - let edge = self.graph[idu][j]; - if edge.dest == idv { - self.graph[idu][j].flow += 1; - self.graph[idv][edge.rev].flow -= 1; - break; - } - } - } - } - - gf = self.build_cost_graph(cost)?; - cycles = gf.list_negative_cycles(path_length); - } - Ok(()) - } - - /// Construct the weighted graph G_f from the flow and the cost function - fn build_cost_graph(&self, cost: &CostFunction) -> Result, String> { - let mut g = Graph::::new(&self.id_to_vertex); - let nb_vertices = self.id_to_vertex.len(); - for i in 0..nb_vertices { - for edge in self.graph[i].iter() { - if edge.cap as i64 - edge.flow > 0 { - // It is possible to send overflow through this edge - let u = self.id_to_vertex[i]; - let v = self.id_to_vertex[edge.dest]; - if cost.contains_key(&(u, v)) { - g.add_edge(u, v, cost[&(u, v)])?; - } else if cost.contains_key(&(v, u)) { - g.add_edge(u, v, -cost[&(v, u)])?; - } else { - g.add_edge(u, v, 0)?; - } - } - } - } - Ok(g) - } -} - -impl Graph { - /// This function adds a single directed weighted edge to the graph. 
- pub fn add_edge(&mut self, u: Vertex, v: Vertex, w: i64) -> Result<(), String> { - let idu = self.get_vertex_id(&u)?; - let idv = self.get_vertex_id(&v)?; - self.graph[idu].push(WeightedEdge { w, dest: idv }); - Ok(()) - } - - /// This function lists the negative cycles it manages to find after path_length - /// iterations of the main loop of the Bellman-Ford algorithm. For the classical - /// algorithm, path_length needs to be equal to the number of vertices. However, - /// for particular graph structures like in our case, the algorithm is still correct - /// when path_length is the length of the longest possible simple path. - /// See the formal description of the algorithm for more details. - fn list_negative_cycles(&self, path_length: usize) -> Vec> { - let nb_vertices = self.graph.len(); - - // We start with every vertex at distance 0 of some imaginary extra -1 vertex. - let mut distance = vec![0; nb_vertices]; - // The prev vector collects for every vertex from where does the shortest path come - let mut prev = vec![None; nb_vertices]; - - for _ in 0..path_length + 1 { - for id in 0..nb_vertices { - for e in self.graph[id].iter() { - if distance[id] + e.w < distance[e.dest] { - distance[e.dest] = distance[id] + e.w; - prev[e.dest] = Some(id); - } - } - } - } - - // If self.graph contains a negative cycle, then at this point the graph described - // by prev (which is a directed 1-forest/functional graph) - // must contain a cycle. We list the cycles of prev. - let cycles_prev = cycles_of_1_forest(&prev); - - // Remark that the cycle in prev is in the reverse order compared to the cycle - // in the graph. Thus the .rev(). - return cycles_prev - .iter() - .map(|cycle| { - cycle - .iter() - .rev() - .map(|id| self.id_to_vertex[*id]) - .collect() - }) - .collect(); - } -} - -/// This function returns the list of cycles of a directed 1 forest. It does not -/// check for the consistency of the input. 
-fn cycles_of_1_forest(forest: &[Option]) -> Vec> { - let mut cycles = Vec::>::new(); - let mut time_of_discovery = vec![None; forest.len()]; - - for t in 0..forest.len() { - let mut id = t; - // while we are on a valid undiscovered node - while time_of_discovery[id].is_none() { - time_of_discovery[id] = Some(t); - if let Some(i) = forest[id] { - id = i; - } else { - break; - } - } - if forest[id].is_some() && time_of_discovery[id] == Some(t) { - // We discovered an id that we explored at this iteration t. - // It means we are on a cycle - let mut cy = vec![id; 1]; - let mut id2 = id; - while let Some(id_next) = forest[id2] { - id2 = id_next; - if id2 != id { - cy.push(id2); - } else { - break; - } - } - cycles.push(cy); - } - } - cycles -} diff --git a/src/rpc/layout/graph_algo.rs b/src/rpc/layout/graph_algo.rs new file mode 100644 index 00000000..bd33e97f --- /dev/null +++ b/src/rpc/layout/graph_algo.rs @@ -0,0 +1,405 @@ +//! This module deals with graph algorithms. +//! It is used in layout.rs to build the partition to node assignment. + +use rand::prelude::{SeedableRng, SliceRandom}; +use std::cmp::{max, min}; +use std::collections::HashMap; +use std::collections::VecDeque; + +/// Vertex data structures used in all the graphs used in layout.rs. +/// usize parameters correspond to node/zone/partitions ids. +/// To understand the vertex roles below, please refer to the formal description +/// of the layout computation algorithm. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum Vertex { + Source, + Pup(usize), // The vertex p+ of partition p + Pdown(usize), // The vertex p- of partition p + PZ(usize, usize), // The vertex corresponding to x_(partition p, zone z) + N(usize), // The vertex corresponding to node n + Sink, +} + +/// Edge data structure for the flow algorithm. 
+#[derive(Clone, Copy, Debug)] +pub struct FlowEdge { + cap: u64, // flow maximal capacity of the edge + flow: i64, // flow value on the edge + dest: usize, // destination vertex id + rev: usize, // index of the reversed edge (v, self) in the edge list of vertex v +} + +/// Edge data structure for the detection of negative cycles. +#[derive(Clone, Copy, Debug)] +pub struct WeightedEdge { + w: i64, // weight of the edge + dest: usize, +} + +pub trait Edge: Clone + Copy {} +impl Edge for FlowEdge {} +impl Edge for WeightedEdge {} + +/// Struct for the graph structure. We do encapsulation here to be able to both +/// provide user friendly Vertex enum to address vertices, and to use internally usize +/// indices and Vec instead of HashMap in the graph algorithm to optimize execution speed. +pub struct Graph { + vertex_to_id: HashMap, + id_to_vertex: Vec, + + // The graph is stored as an adjacency list + graph: Vec>, +} + +pub type CostFunction = HashMap<(Vertex, Vertex), i64>; + +impl Graph { + pub fn new(vertices: &[Vertex]) -> Self { + let mut map = HashMap::::new(); + for (i, vert) in vertices.iter().enumerate() { + map.insert(*vert, i); + } + Graph:: { + vertex_to_id: map, + id_to_vertex: vertices.to_vec(), + graph: vec![Vec::::new(); vertices.len()], + } + } + + fn get_vertex_id(&self, v: &Vertex) -> Result { + self.vertex_to_id + .get(v) + .cloned() + .ok_or_else(|| format!("The graph does not contain vertex {:?}", v)) + } +} + +impl Graph { + /// This function adds a directed edge to the graph with capacity c, and the + /// corresponding reversed edge with capacity 0. 
+ pub fn add_edge(&mut self, u: Vertex, v: Vertex, c: u64) -> Result<(), String> { + let idu = self.get_vertex_id(&u)?; + let idv = self.get_vertex_id(&v)?; + if idu == idv { + return Err("Cannot add edge from vertex to itself in flow graph".into()); + } + + let rev_u = self.graph[idu].len(); + let rev_v = self.graph[idv].len(); + self.graph[idu].push(FlowEdge { + cap: c, + dest: idv, + flow: 0, + rev: rev_v, + }); + self.graph[idv].push(FlowEdge { + cap: 0, + dest: idu, + flow: 0, + rev: rev_u, + }); + Ok(()) + } + + /// This function returns the list of vertices that receive a positive flow from + /// vertex v. + pub fn get_positive_flow_from(&self, v: Vertex) -> Result, String> { + let idv = self.get_vertex_id(&v)?; + let mut result = Vec::::new(); + for edge in self.graph[idv].iter() { + if edge.flow > 0 { + result.push(self.id_to_vertex[edge.dest]); + } + } + Ok(result) + } + + /// This function returns the value of the flow outgoing from v. + pub fn get_outflow(&self, v: Vertex) -> Result { + let idv = self.get_vertex_id(&v)?; + let mut result = 0; + for edge in self.graph[idv].iter() { + result += max(0, edge.flow); + } + Ok(result) + } + + /// This function computes the flow total value by computing the outgoing flow + /// from the source. + pub fn get_flow_value(&mut self) -> Result { + self.get_outflow(Vertex::Source) + } + + /// This function shuffles the order of the edge lists. It keeps the ids of the + /// reversed edges consistent. + fn shuffle_edges(&mut self) { + // We use deterministic randomness so that the layout calculation algorihtm + // will output the same thing every time it is run. This way, the results + // pre-calculated in `garage layout show` will match exactly those used + // in practice with `garage layout apply` + let mut rng = rand::rngs::StdRng::from_seed([0x12u8; 32]); + for i in 0..self.graph.len() { + self.graph[i].shuffle(&mut rng); + // We need to update the ids of the reverse edges. 
+ for j in 0..self.graph[i].len() { + let target_v = self.graph[i][j].dest; + let target_rev = self.graph[i][j].rev; + self.graph[target_v][target_rev].rev = j; + } + } + } + + /// Computes an upper bound of the flow on the graph + pub fn flow_upper_bound(&self) -> Result { + let idsource = self.get_vertex_id(&Vertex::Source)?; + let mut flow_upper_bound = 0; + for edge in self.graph[idsource].iter() { + flow_upper_bound += edge.cap; + } + Ok(flow_upper_bound) + } + + /// This function computes the maximal flow using Dinic's algorithm. It starts with + /// the flow values already present in the graph. So it is possible to add some edge to + /// the graph, compute a flow, add other edges, update the flow. + pub fn compute_maximal_flow(&mut self) -> Result<(), String> { + let idsource = self.get_vertex_id(&Vertex::Source)?; + let idsink = self.get_vertex_id(&Vertex::Sink)?; + + let nb_vertices = self.graph.len(); + + let flow_upper_bound = self.flow_upper_bound()?; + + // To ensure the dispersion of the associations generated by the + // assignment, we shuffle the neighbours of the nodes. Hence, + // the vertices do not consider their neighbours in the same order. + self.shuffle_edges(); + + // We run Dinic's max flow algorithm + loop { + // We build the level array from Dinic's algorithm. 
+ let mut level = vec![None; nb_vertices]; + + let mut fifo = VecDeque::new(); + fifo.push_back((idsource, 0)); + while let Some((id, lvl)) = fifo.pop_front() { + if level[id].is_none() { + // it means id has not yet been reached + level[id] = Some(lvl); + for edge in self.graph[id].iter() { + if edge.cap as i64 - edge.flow > 0 { + fifo.push_back((edge.dest, lvl + 1)); + } + } + } + } + if level[idsink].is_none() { + // There is no residual flow + break; + } + // Now we run DFS respecting the level array + let mut next_nbd = vec![0; nb_vertices]; + let mut lifo = Vec::new(); + + lifo.push((idsource, flow_upper_bound)); + + while let Some((id, f)) = lifo.last().cloned() { + if id == idsink { + // The DFS reached the sink, we can add a + // residual flow. + lifo.pop(); + while let Some((id, _)) = lifo.pop() { + let nbd = next_nbd[id]; + self.graph[id][nbd].flow += f as i64; + let id_rev = self.graph[id][nbd].dest; + let nbd_rev = self.graph[id][nbd].rev; + self.graph[id_rev][nbd_rev].flow -= f as i64; + } + lifo.push((idsource, flow_upper_bound)); + continue; + } + // else we did not reach the sink + let nbd = next_nbd[id]; + if nbd >= self.graph[id].len() { + // There is nothing to explore from id anymore + lifo.pop(); + if let Some((parent, _)) = lifo.last() { + next_nbd[*parent] += 1; + } + continue; + } + // else we can try to send flow from id to its nbd + let new_flow = min( + f as i64, + self.graph[id][nbd].cap as i64 - self.graph[id][nbd].flow, + ) as u64; + if new_flow == 0 { + next_nbd[id] += 1; + continue; + } + if let (Some(lvldest), Some(lvlid)) = (level[self.graph[id][nbd].dest], level[id]) { + if lvldest <= lvlid { + // We cannot send flow to nbd. + next_nbd[id] += 1; + continue; + } + } + // otherwise, we send flow to nbd. 
+ lifo.push((self.graph[id][nbd].dest, new_flow)); + } + } + Ok(()) + } + + /// This function takes a flow, and a cost function on the edges, and tries to find an + /// equivalent flow with a better cost, by finding improving overflow cycles. It uses + /// as subroutine the Bellman Ford algorithm run up to path_length. + /// We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and + /// only one needs to be present in the cost function. + pub fn optimize_flow_with_cost( + &mut self, + cost: &CostFunction, + path_length: usize, + ) -> Result<(), String> { + // We build the weighted graph g where we will look for negative cycle + let mut gf = self.build_cost_graph(cost)?; + let mut cycles = gf.list_negative_cycles(path_length); + while !cycles.is_empty() { + // we enumerate negative cycles + for c in cycles.iter() { + for i in 0..c.len() { + // We add one flow unit to the edge (u,v) of cycle c + let idu = self.vertex_to_id[&c[i]]; + let idv = self.vertex_to_id[&c[(i + 1) % c.len()]]; + for j in 0..self.graph[idu].len() { + // since idu appears at most once in the cycles, we enumerate every + // edge at most once. 
+ let edge = self.graph[idu][j]; + if edge.dest == idv { + self.graph[idu][j].flow += 1; + self.graph[idv][edge.rev].flow -= 1; + break; + } + } + } + } + + gf = self.build_cost_graph(cost)?; + cycles = gf.list_negative_cycles(path_length); + } + Ok(()) + } + + /// Construct the weighted graph G_f from the flow and the cost function + fn build_cost_graph(&self, cost: &CostFunction) -> Result, String> { + let mut g = Graph::::new(&self.id_to_vertex); + let nb_vertices = self.id_to_vertex.len(); + for i in 0..nb_vertices { + for edge in self.graph[i].iter() { + if edge.cap as i64 - edge.flow > 0 { + // It is possible to send overflow through this edge + let u = self.id_to_vertex[i]; + let v = self.id_to_vertex[edge.dest]; + if cost.contains_key(&(u, v)) { + g.add_edge(u, v, cost[&(u, v)])?; + } else if cost.contains_key(&(v, u)) { + g.add_edge(u, v, -cost[&(v, u)])?; + } else { + g.add_edge(u, v, 0)?; + } + } + } + } + Ok(g) + } +} + +impl Graph { + /// This function adds a single directed weighted edge to the graph. + pub fn add_edge(&mut self, u: Vertex, v: Vertex, w: i64) -> Result<(), String> { + let idu = self.get_vertex_id(&u)?; + let idv = self.get_vertex_id(&v)?; + self.graph[idu].push(WeightedEdge { w, dest: idv }); + Ok(()) + } + + /// This function lists the negative cycles it manages to find after path_length + /// iterations of the main loop of the Bellman-Ford algorithm. For the classical + /// algorithm, path_length needs to be equal to the number of vertices. However, + /// for particular graph structures like in our case, the algorithm is still correct + /// when path_length is the length of the longest possible simple path. + /// See the formal description of the algorithm for more details. + fn list_negative_cycles(&self, path_length: usize) -> Vec> { + let nb_vertices = self.graph.len(); + + // We start with every vertex at distance 0 of some imaginary extra -1 vertex. 
+ let mut distance = vec![0; nb_vertices]; + // The prev vector collects for every vertex from where does the shortest path come + let mut prev = vec![None; nb_vertices]; + + for _ in 0..path_length + 1 { + for id in 0..nb_vertices { + for e in self.graph[id].iter() { + if distance[id] + e.w < distance[e.dest] { + distance[e.dest] = distance[id] + e.w; + prev[e.dest] = Some(id); + } + } + } + } + + // If self.graph contains a negative cycle, then at this point the graph described + // by prev (which is a directed 1-forest/functional graph) + // must contain a cycle. We list the cycles of prev. + let cycles_prev = cycles_of_1_forest(&prev); + + // Remark that the cycle in prev is in the reverse order compared to the cycle + // in the graph. Thus the .rev(). + return cycles_prev + .iter() + .map(|cycle| { + cycle + .iter() + .rev() + .map(|id| self.id_to_vertex[*id]) + .collect() + }) + .collect(); + } +} + +/// This function returns the list of cycles of a directed 1 forest. It does not +/// check for the consistency of the input. +fn cycles_of_1_forest(forest: &[Option]) -> Vec> { + let mut cycles = Vec::>::new(); + let mut time_of_discovery = vec![None; forest.len()]; + + for t in 0..forest.len() { + let mut id = t; + // while we are on a valid undiscovered node + while time_of_discovery[id].is_none() { + time_of_discovery[id] = Some(t); + if let Some(i) = forest[id] { + id = i; + } else { + break; + } + } + if forest[id].is_some() && time_of_discovery[id] == Some(t) { + // We discovered an id that we explored at this iteration t. 
+ // It means we are on a cycle + let mut cy = vec![id; 1]; + let mut id2 = id; + while let Some(id_next) = forest[id2] { + id2 = id_next; + if id2 != id { + cy.push(id2); + } else { + break; + } + } + cycles.push(cy); + } + } + cycles +} diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index e59c9e9c..9ae28887 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -1,5 +1,3 @@ -use std::cmp::Ordering; - use garage_util::crdt::{Crdt, Lww, LwwMap}; use garage_util::data::*; use garage_util::encode::nonversioned_encode; @@ -12,14 +10,15 @@ impl LayoutHistory { pub fn new(replication_factor: usize) -> Self { let version = LayoutVersion::new(replication_factor); - let staging_parameters = Lww::::new(version.parameters); - let empty_lwwmap = LwwMap::new(); + let staging = LayoutStaging { + parameters: Lww::::new(version.parameters), + roles: LwwMap::new(), + }; let mut ret = LayoutHistory { versions: vec![version].into_boxed_slice().into(), update_trackers: Default::default(), - staging_parameters, - staging_roles: empty_lwwmap, + staging: Lww::raw(0, staging), staging_hash: [0u8; 32].into(), }; ret.staging_hash = ret.calculate_staging_hash(); @@ -31,8 +30,7 @@ impl LayoutHistory { } pub(crate) fn calculate_staging_hash(&self) -> Hash { - let hashed_tuple = (&self.staging_roles, &self.staging_parameters); - blake2sum(&nonversioned_encode(&hashed_tuple).unwrap()[..]) + blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) } // ================== updates to layout, public interface =================== @@ -41,26 +39,10 @@ impl LayoutHistory { let mut changed = false; // Merge staged layout changes - match other.current().version.cmp(&self.current().version) { - Ordering::Greater => { - self.staging_parameters = other.staging_parameters.clone(); - self.staging_roles = other.staging_roles.clone(); - self.staging_hash = other.staging_hash; - changed = true; - } - Ordering::Equal => { - 
self.staging_parameters.merge(&other.staging_parameters); - self.staging_roles.merge(&other.staging_roles); - - let new_staging_hash = self.calculate_staging_hash(); - if new_staging_hash != self.staging_hash { - changed = true; - } - - self.staging_hash = new_staging_hash; - } - Ordering::Less => (), + if self.staging != other.staging { + changed = true; } + self.staging.merge(&other.staging); // Add any new versions to history for v2 in other.versions.iter() { @@ -102,50 +84,34 @@ To know the correct value of the new layout version, invoke `garage layout show` } } + // Compute new version and add it to history let mut new_version = self.current().clone(); new_version.version += 1; - new_version.roles.merge(&self.staging_roles); + new_version.roles.merge(&self.staging.get().roles); new_version.roles.retain(|(_, _, v)| v.0.is_some()); - new_version.parameters = *self.staging_parameters.get(); - - self.staging_roles.clear(); - self.staging_hash = self.calculate_staging_hash(); + new_version.parameters = *self.staging.get().parameters.get(); let msg = new_version.calculate_partition_assignment()?; - self.versions.push(new_version); + // Reset the staged layout changes + self.staging.update(LayoutStaging { + parameters: self.staging.get().parameters.clone(), + roles: LwwMap::new(), + }); + self.staging_hash = self.calculate_staging_hash(); + Ok((self, msg)) } - pub fn revert_staged_changes(mut self, version: Option) -> Result { - match version { - None => { - let error = r#" -Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. -To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. 
- "#; - return Err(Error::Message(error.into())); - } - Some(v) => { - if v != self.current().version + 1 { - return Err(Error::Message("Invalid new layout version".into())); - } - } - } - - self.staging_roles.clear(); - self.staging_parameters.update(self.current().parameters); + pub fn revert_staged_changes(mut self) -> Result { + self.staging.update(LayoutStaging { + parameters: Lww::new(self.current().parameters.clone()), + roles: LwwMap::new(), + }); self.staging_hash = self.calculate_staging_hash(); - // TODO this is stupid, we should have a separate version counter/LWW - // for the staging params - let mut new_version = self.current().clone(); - new_version.version += 1; - - self.versions.push(new_version); - Ok(self) } diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index 122d4b65..7c15988a 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -1,8 +1,10 @@ +mod graph_algo; mod history; mod schema; -mod tracker; mod version; +// ---- re-exports ---- + pub use history::*; pub use schema::*; pub use version::*; diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 14e797be..c5b9b1d3 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -1,3 +1,9 @@ +use std::fmt; + +use bytesize::ByteSize; + +use garage_util::crdt::{AutoCrdt, Crdt}; + mod v08 { use crate::layout::CompactNodeType; use garage_util::crdt::LwwMap; @@ -210,6 +216,15 @@ mod v010 { pub ring_assignment_data: Vec, } + /// The staged changes for the next layout version + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] + pub struct LayoutStaging { + /// Parameters to be used in the next partition assignment computation. 
+ pub parameters: Lww, + /// Role changes which are staged for the next version of the layout + pub roles: LwwMap, + } + /// The history of cluster layouts #[derive(Clone, Debug, Serialize, Deserialize)] pub struct LayoutHistory { @@ -219,10 +234,8 @@ mod v010 { /// Update trackers pub update_trackers: UpdateTrackers, - /// Parameters to be used in the next partition assignment computation. - pub staging_parameters: Lww, - /// Role changes which are staged for the next version of the layout - pub staging_roles: LwwMap, + /// Staged changes for the next version + pub staging: Lww, /// Hash of the serialized staging_parameters + staging_roles pub staging_hash: Hash, } @@ -265,6 +278,10 @@ mod v010 { .map(|x| (*x, version.version)) .collect::>(), ); + let staging = LayoutStaging { + parameters: previous.staging_parameters, + roles: previous.staging_roles, + }; let mut ret = Self { versions: vec![version], update_trackers: UpdateTrackers { @@ -272,8 +289,7 @@ mod v010 { sync_map: update_tracker.clone(), sync_ack_map: update_tracker.clone(), }, - staging_parameters: previous.staging_parameters, - staging_roles: previous.staging_roles, + staging: Lww::raw(previous.version, staging), staging_hash: [0u8; 32].into(), }; ret.staging_hash = ret.calculate_staging_hash(); @@ -283,3 +299,81 @@ mod v010 { } pub use v010::*; + +// ---- utility functions ---- + +impl AutoCrdt for LayoutParameters { + const WARN_IF_DIFFERENT: bool = true; +} + +impl AutoCrdt for NodeRoleV { + const WARN_IF_DIFFERENT: bool = true; +} + +impl Crdt for LayoutStaging { + fn merge(&mut self, other: &LayoutStaging) { + self.parameters.merge(&other.parameters); + self.roles.merge(&other.roles); + } +} + +impl NodeRole { + pub fn capacity_string(&self) -> String { + match self.capacity { + Some(c) => ByteSize::b(c).to_string_as(false), + None => "gateway".to_string(), + } + } + + pub fn tags_string(&self) -> String { + self.tags.join(",") + } +} + +impl fmt::Display for ZoneRedundancy { + fn fmt(&self, f: 
&mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ZoneRedundancy::Maximum => write!(f, "maximum"), + ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), + } + } +} + +impl core::str::FromStr for ZoneRedundancy { + type Err = &'static str; + fn from_str(s: &str) -> Result { + match s { + "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), + x => { + let v = x + .parse::() + .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; + Ok(ZoneRedundancy::AtLeast(v)) + } + } + } +} + +impl UpdateTracker { + fn merge(&mut self, other: &UpdateTracker) { + for (k, v) in other.0.iter() { + if let Some(v_mut) = self.0.get_mut(k) { + *v_mut = std::cmp::max(*v_mut, *v); + } else { + self.0.insert(*k, *v); + } + } + } + + pub(crate) fn min(&self) -> u64 { + self.0.iter().map(|(_, v)| *v).min().unwrap_or(0) + } +} + +impl UpdateTrackers { + pub(crate) fn merge(&mut self, other: &UpdateTrackers) { + self.ack_map.merge(&other.ack_map); + self.sync_map.merge(&other.sync_map); + self.sync_ack_map.merge(&other.sync_ack_map); + } +} diff --git a/src/rpc/layout/tracker.rs b/src/rpc/layout/tracker.rs deleted file mode 100644 index 778121e4..00000000 --- a/src/rpc/layout/tracker.rs +++ /dev/null @@ -1,21 +0,0 @@ -use super::*; - -impl UpdateTracker { - fn merge(&mut self, other: &UpdateTracker) { - for (k, v) in other.0.iter() { - if let Some(v_mut) = self.0.get_mut(k) { - *v_mut = std::cmp::max(*v_mut, *v); - } else { - self.0.insert(*k, *v); - } - } - } -} - -impl UpdateTrackers { - pub(crate) fn merge(&mut self, other: &UpdateTrackers) { - self.ack_map.merge(&other.ack_map); - self.sync_map.merge(&other.sync_map); - self.sync_ack_map.merge(&other.sync_ack_map); - } -} diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 363bc204..6918fdf9 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -1,69 +1,21 @@ use std::collections::HashMap; use std::collections::HashSet; -use std::fmt; +use std::convert::TryInto; use 
bytesize::ByteSize; use itertools::Itertools; -use garage_util::crdt::{AutoCrdt, LwwMap}; +use garage_util::crdt::LwwMap; use garage_util::data::*; use garage_util::error::*; -use crate::graph_algo::*; - -use std::convert::TryInto; - +use super::graph_algo::*; use super::schema::*; use super::*; // The Message type will be used to collect information on the algorithm. pub type Message = Vec; -impl AutoCrdt for LayoutParameters { - const WARN_IF_DIFFERENT: bool = true; -} - -impl AutoCrdt for NodeRoleV { - const WARN_IF_DIFFERENT: bool = true; -} - -impl NodeRole { - pub fn capacity_string(&self) -> String { - match self.capacity { - Some(c) => ByteSize::b(c).to_string_as(false), - None => "gateway".to_string(), - } - } - - pub fn tags_string(&self) -> String { - self.tags.join(",") - } -} - -impl fmt::Display for ZoneRedundancy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - ZoneRedundancy::Maximum => write!(f, "maximum"), - ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), - } - } -} - -impl core::str::FromStr for ZoneRedundancy { - type Err = &'static str; - fn from_str(s: &str) -> Result { - match s { - "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), - x => { - let v = x - .parse::() - .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; - Ok(ZoneRedundancy::AtLeast(v)) - } - } - } -} - impl LayoutVersion { pub fn new(replication_factor: usize) -> Self { // We set the default zone redundancy to be Maximum, meaning that the maximum diff --git a/src/rpc/lib.rs b/src/rpc/lib.rs index 1af8b78e..b5b31c05 100644 --- a/src/rpc/lib.rs +++ b/src/rpc/lib.rs @@ -11,7 +11,6 @@ mod consul; #[cfg(feature = "kubernetes-discovery")] mod kubernetes; -pub mod graph_algo; pub mod layout; pub mod replication_mode; pub mod system; -- cgit v1.2.3 From 8a2b1dd422fb57abe611d8c1cf3cb0b55f487189 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 12:55:36 +0100 Subject: wip: split out layout management from System 
into separate LayoutManager --- src/api/admin/cluster.rs | 18 ++- src/block/manager.rs | 10 +- src/block/resync.rs | 4 +- src/model/k2v/rpc.rs | 20 ++-- src/rpc/layout/manager.rs | 177 ++++++++++++++++++++++++++++ src/rpc/layout/mod.rs | 2 + src/rpc/system.rs | 295 +++++++++++++++++----------------------------- src/table/gc.rs | 4 +- src/table/sync.rs | 10 +- src/table/table.rs | 10 +- 10 files changed, 331 insertions(+), 219 deletions(-) create mode 100644 src/rpc/layout/manager.rs diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index fe8e8764..f5483451 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -240,7 +240,11 @@ pub async fn handle_update_cluster_layout( .merge(&roles.update_mutator(node, layout::NodeRoleV(new_role))); } - garage.system.update_cluster_layout(&layout).await?; + garage + .system + .layout_manager + .update_cluster_layout(&layout) + .await?; let res = format_cluster_layout(&layout); Ok(json_ok_response(&res)?) @@ -255,7 +259,11 @@ pub async fn handle_apply_cluster_layout( let layout = garage.system.cluster_layout().as_ref().clone(); let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; - garage.system.update_cluster_layout(&layout).await?; + garage + .system + .layout_manager + .update_cluster_layout(&layout) + .await?; let res = ApplyClusterLayoutResponse { message: msg, @@ -267,7 +275,11 @@ pub async fn handle_apply_cluster_layout( pub async fn handle_revert_cluster_layout(garage: &Arc) -> Result, Error> { let layout = garage.system.cluster_layout().as_ref().clone(); let layout = layout.revert_staged_changes()?; - garage.system.update_cluster_layout(&layout).await?; + garage + .system + .layout_manager + .update_cluster_layout(&layout) + .await?; let res = format_cluster_layout(&layout); Ok(json_ok_response(&res)?) 
diff --git a/src/block/manager.rs b/src/block/manager.rs index 2d1b5c67..72b4ea66 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -265,7 +265,7 @@ impl BlockManager { Fut: futures::Future>, { let who = self.replication.read_nodes(hash); - let who = self.system.rpc.request_order(&who); + let who = self.system.rpc_helper().request_order(&who); for node in who.iter() { let node_id = NodeID::from(*node); @@ -305,7 +305,7 @@ impl BlockManager { // if the first one doesn't succeed rapidly // TODO: keep first request running when initiating a new one and take the // one that finishes earlier - _ = tokio::time::sleep(self.system.rpc.rpc_timeout()) => { + _ = tokio::time::sleep(self.system.rpc_helper().rpc_timeout()) => { debug!("Get block {:?}: node {:?} didn't return block in time, trying next.", hash, node); } }; @@ -363,7 +363,7 @@ impl BlockManager { Req::new(BlockRpc::PutBlock { hash, header })?.with_stream_from_buffer(bytes); self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &who[..], @@ -439,7 +439,7 @@ impl BlockManager { tokio::spawn(async move { if let Err(e) = this .resync - .put_to_resync(&hash, 2 * this.system.rpc.rpc_timeout()) + .put_to_resync(&hash, 2 * this.system.rpc_helper().rpc_timeout()) { error!("Block {:?} could not be put in resync queue: {}.", hash, e); } @@ -533,7 +533,7 @@ impl BlockManager { None => { // Not found but maybe we should have had it ?? 
self.resync - .put_to_resync(hash, 2 * self.system.rpc.rpc_timeout())?; + .put_to_resync(hash, 2 * self.system.rpc_helper().rpc_timeout())?; return Err(Error::Message(format!( "block {:?} not found on node", hash diff --git a/src/block/resync.rs b/src/block/resync.rs index 9c1da4a7..fedcd6f5 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -385,7 +385,7 @@ impl BlockResyncManager { let who_needs_resps = manager .system - .rpc + .rpc_helper() .call_many( &manager.endpoint, &who, @@ -431,7 +431,7 @@ impl BlockResyncManager { .with_stream_from_buffer(bytes); manager .system - .rpc + .rpc_helper() .try_call_many( &manager.endpoint, &need_nodes[..], diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs index 37e142f6..2f548ad7 100644 --- a/src/model/k2v/rpc.rs +++ b/src/model/k2v/rpc.rs @@ -131,7 +131,7 @@ impl K2VRpcHandler { who.sort(); self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &who[..], @@ -187,7 +187,7 @@ impl K2VRpcHandler { let call_futures = call_list.into_iter().map(|(nodes, items)| async move { let resp = self .system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &nodes[..], @@ -229,7 +229,7 @@ impl K2VRpcHandler { .replication .write_nodes(&poll_key.partition.hash()); - let rpc = self.system.rpc.try_call_many( + let rpc = self.system.rpc_helper().try_call_many( &self.endpoint, &nodes[..], K2VRpc::PollItem { @@ -241,7 +241,8 @@ impl K2VRpcHandler { .with_quorum(self.item_table.data.replication.read_quorum()) .without_timeout(), ); - let timeout_duration = Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout(); + let timeout_duration = + Duration::from_millis(timeout_msec) + self.system.rpc_helper().rpc_timeout(); let resps = select! 
{ r = rpc => r?, _ = tokio::time::sleep(timeout_duration) => return Ok(None), @@ -300,7 +301,11 @@ impl K2VRpcHandler { let rs = RequestStrategy::with_priority(PRIO_NORMAL).without_timeout(); let mut requests = nodes .iter() - .map(|node| self.system.rpc.call(&self.endpoint, *node, msg.clone(), rs)) + .map(|node| { + self.system + .rpc_helper() + .call(&self.endpoint, *node, msg.clone(), rs) + }) .collect::>(); // Fetch responses. This procedure stops fetching responses when any of the following @@ -316,8 +321,9 @@ impl K2VRpcHandler { // kind: all items produced by that node until time ts have been returned, so we can // bump the entry in the global vector clock and possibly remove some item-specific // vector clocks) - let mut deadline = - Instant::now() + Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout(); + let mut deadline = Instant::now() + + Duration::from_millis(timeout_msec) + + self.system.rpc_helper().rpc_timeout(); let mut resps = vec![]; let mut errors = vec![]; loop { diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs new file mode 100644 index 00000000..a8a77139 --- /dev/null +++ b/src/rpc/layout/manager.rs @@ -0,0 +1,177 @@ +use std::sync::Arc; +use std::time::Duration; + +use tokio::sync::watch; +use tokio::sync::Mutex; + +use netapp::endpoint::Endpoint; +use netapp::peering::fullmesh::FullMeshPeeringStrategy; +use netapp::NodeID; + +use garage_util::config::Config; +use garage_util::data::*; +use garage_util::error::*; +use garage_util::persister::Persister; + +use super::*; +use crate::rpc_helper::*; +use crate::system::*; + +pub struct LayoutManager { + replication_factor: usize, + persist_cluster_layout: Persister, + + pub layout_watch: watch::Receiver>, + update_layout: Mutex>>, + + pub(crate) rpc_helper: RpcHelper, + system_endpoint: Arc>, +} + +impl LayoutManager { + pub fn new( + config: &Config, + node_id: NodeID, + system_endpoint: Arc>, + fullmesh: Arc, + replication_factor: usize, + ) -> Result { + let 
persist_cluster_layout: Persister = + Persister::new(&config.metadata_dir, "cluster_layout"); + + let cluster_layout = match persist_cluster_layout.load() { + Ok(x) => { + if x.current().replication_factor != replication_factor { + return Err(Error::Message(format!( + "Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", + x.current().replication_factor, + replication_factor + ))); + } + x + } + Err(e) => { + info!( + "No valid previous cluster layout stored ({}), starting fresh.", + e + ); + LayoutHistory::new(replication_factor) + } + }; + + let (update_layout, layout_watch) = watch::channel(Arc::new(cluster_layout)); + + let rpc_helper = RpcHelper::new( + node_id.into(), + fullmesh, + layout_watch.clone(), + config.rpc_timeout_msec.map(Duration::from_millis), + ); + + Ok(Self { + replication_factor, + persist_cluster_layout, + layout_watch, + update_layout: Mutex::new(update_layout), + system_endpoint, + rpc_helper, + }) + } + + // ---- PUBLIC INTERFACE ---- + + pub async fn update_cluster_layout(&self, layout: &LayoutHistory) -> Result<(), Error> { + self.handle_advertise_cluster_layout(layout).await?; + Ok(()) + } + + pub fn history(&self) -> watch::Ref> { + self.layout_watch.borrow() + } + + pub(crate) async fn pull_cluster_layout(&self, peer: Uuid) { + let resp = self + .rpc_helper + .call( + &self.system_endpoint, + peer, + SystemRpc::PullClusterLayout, + RequestStrategy::with_priority(PRIO_HIGH), + ) + .await; + if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp { + let _: Result<_, _> = self.handle_advertise_cluster_layout(&layout).await; + } + } + + // ---- INTERNALS --- + + /// Save network configuration to disc + async fn save_cluster_layout(&self) -> Result<(), Error> { + let layout: Arc = self.layout_watch.borrow().clone(); + 
self.persist_cluster_layout + .save_async(&layout) + .await + .expect("Cannot save current cluster layout"); + Ok(()) + } + + // ---- RPC HANDLERS ---- + + pub(crate) fn handle_pull_cluster_layout(&self) -> SystemRpc { + let layout = self.layout_watch.borrow().as_ref().clone(); + SystemRpc::AdvertiseClusterLayout(layout) + } + + pub(crate) async fn handle_advertise_cluster_layout( + &self, + adv: &LayoutHistory, + ) -> Result { + if adv.current().replication_factor != self.replication_factor { + let msg = format!( + "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.", + adv.current().replication_factor, + self.replication_factor + ); + error!("{}", msg); + return Err(Error::Message(msg)); + } + + let update_layout = self.update_layout.lock().await; + // TODO: don't clone each time an AdvertiseClusterLayout is received + let mut layout: LayoutHistory = self.layout_watch.borrow().as_ref().clone(); + + let prev_layout_check = layout.check().is_ok(); + if layout.merge(adv) { + if prev_layout_check && layout.check().is_err() { + error!("New cluster layout is invalid, discarding."); + return Err(Error::Message( + "New cluster layout is invalid, discarding.".into(), + )); + } + + update_layout.send(Arc::new(layout.clone()))?; + drop(update_layout); + + /* TODO + tokio::spawn(async move { + if let Err(e) = system + .rpc_helper() + .broadcast( + &system.system_endpoint, + SystemRpc::AdvertiseClusterLayout(layout), + RequestStrategy::with_priority(PRIO_HIGH), + ) + .await + { + warn!("Error while broadcasting new cluster layout: {}", e); + } + }); + */ + + self.save_cluster_layout().await?; + } + + Ok(SystemRpc::Ok) + } +} diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index 7c15988a..cd3764bc 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -3,6 +3,8 @@ mod history; mod schema; mod version; +pub mod manager; + // ---- 
re-exports ---- pub use history::*; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index a7433b68..a8e88425 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -14,7 +14,6 @@ use serde::{Deserialize, Serialize}; use sodiumoxide::crypto::sign::ed25519; use tokio::select; use tokio::sync::watch; -use tokio::sync::Mutex; use netapp::endpoint::{Endpoint, EndpointHandler}; use netapp::message::*; @@ -34,6 +33,7 @@ use garage_util::time::*; use crate::consul::ConsulDiscovery; #[cfg(feature = "kubernetes-discovery")] use crate::kubernetes::*; +use crate::layout::manager::LayoutManager; use crate::layout::*; use crate::replication_mode::*; use crate::rpc_helper::*; @@ -49,7 +49,7 @@ const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10); pub const GARAGE_VERSION_TAG: u64 = 0x676172616765000A; // garage 0x000A /// RPC endpoint used for calls related to membership -pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc"; +pub const SYSTEM_RPC_PATH: &str = "garage_rpc/system.rs/SystemRpc"; /// RPC messages related to membership #[derive(Debug, Serialize, Deserialize, Clone)] @@ -58,17 +58,17 @@ pub enum SystemRpc { Ok, /// Request to connect to a specific node (in @: format) Connect(String), - /// Ask other node its cluster layout. Answered with AdvertiseClusterLayout - PullClusterLayout, /// Advertise Garage status. Answered with another AdvertiseStatus. /// Exchanged with every node on a regular basis. AdvertiseStatus(NodeStatus), - /// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout - AdvertiseClusterLayout(LayoutHistory), /// Get known nodes states GetKnownNodes, /// Return known nodes ReturnKnownNodes(Vec), + /// Ask other node its cluster layout. Answered with AdvertiseClusterLayout + PullClusterLayout, + /// Advertisement of cluster layout. 
Sent spontanously or in response to PullClusterLayout + AdvertiseClusterLayout(LayoutHistory), } impl Rpc for SystemRpc { @@ -84,7 +84,6 @@ pub struct System { /// The id of this node pub id: Uuid, - persist_cluster_layout: Persister, persist_peer_list: Persister, local_status: ArcSwap, @@ -92,9 +91,8 @@ pub struct System { pub netapp: Arc, fullmesh: Arc, - pub rpc: RpcHelper, - system_endpoint: Arc>, + pub(crate) system_endpoint: Arc>, rpc_listen_addr: SocketAddr, #[cfg(any(feature = "consul-discovery", feature = "kubernetes-discovery"))] @@ -106,15 +104,13 @@ pub struct System { #[cfg(feature = "kubernetes-discovery")] kubernetes_discovery: Option, + pub layout_manager: LayoutManager, + metrics: SystemMetrics, replication_mode: ReplicationMode, replication_factor: usize, - /// The layout - pub layout_watch: watch::Receiver>, - update_layout: Mutex>>, - /// Path to metadata directory pub metadata_dir: PathBuf, /// Path to data directory @@ -128,8 +124,11 @@ pub struct NodeStatus { /// Replication factor configured on the node pub replication_factor: usize, + /// Cluster layout version pub cluster_layout_version: u64, + /// Hash of cluster layout update trackers + // (TODO) pub cluster_layout_trackers_hash: Hash, /// Hash of cluster layout staging data pub cluster_layout_staging_hash: Hash, @@ -247,8 +246,7 @@ impl System { replication_mode: ReplicationMode, config: &Config, ) -> Result, Error> { - let replication_factor = replication_mode.replication_factor(); - + // ---- setup netapp RPC protocol ---- let node_key = gen_node_key(&config.metadata_dir).expect("Unable to read or generate node ID"); info!( @@ -256,81 +254,40 @@ impl System { hex::encode(&node_key.public_key()[..8]) ); - let persist_cluster_layout: Persister = - Persister::new(&config.metadata_dir, "cluster_layout"); - let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list"); - - let cluster_layout = match persist_cluster_layout.load() { - Ok(x) => { - if 
x.current().replication_factor != replication_factor { - return Err(Error::Message(format!( - "Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", - x.current().replication_factor, - replication_factor - ))); - } - x - } - Err(e) => { - info!( - "No valid previous cluster layout stored ({}), starting fresh.", - e - ); - LayoutHistory::new(replication_factor) - } - }; - - let metrics = SystemMetrics::new(replication_factor); - - let mut local_status = NodeStatus::initial(replication_factor, &cluster_layout); - local_status.update_disk_usage(&config.metadata_dir, &config.data_dir, &metrics); + let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key); + let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into()); - let (update_layout, layout_watch) = watch::channel(Arc::new(cluster_layout)); - - let rpc_public_addr = match &config.rpc_public_addr { - Some(a_str) => { - use std::net::ToSocketAddrs; - match a_str.to_socket_addrs() { - Err(e) => { - error!( - "Cannot resolve rpc_public_addr {} from config file: {}.", - a_str, e - ); - None - } - Ok(a) => { - let a = a.collect::>(); - if a.is_empty() { - error!("rpc_public_addr {} resolve to no known IP address", a_str); - } - if a.len() > 1 { - warn!("Multiple possible resolutions for rpc_public_addr: {:?}. Taking the first one.", a); - } - a.into_iter().next() - } - } - } - None => { - let addr = - get_default_ip().map(|ip| SocketAddr::new(ip, config.rpc_bind_addr.port())); - if let Some(a) = addr { - warn!("Using autodetected rpc_public_addr: {}. 
Consider specifying it explicitly in configuration file if possible.", a); - } - addr - } - }; + // ---- setup netapp public listener and full mesh peering strategy ---- + let rpc_public_addr = get_rpc_public_addr(config); if rpc_public_addr.is_none() { warn!("This Garage node does not know its publicly reachable RPC address, this might hamper intra-cluster communication."); } - let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key); let fullmesh = FullMeshPeeringStrategy::new(netapp.clone(), vec![], rpc_public_addr); if let Some(ping_timeout) = config.rpc_ping_timeout_msec { fullmesh.set_ping_timeout_millis(ping_timeout); } - let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into()); + let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list"); + // ---- setup cluster layout and layout manager ---- + let replication_factor = replication_mode.replication_factor(); + + let layout_manager = LayoutManager::new( + config, + netapp.id, + system_endpoint.clone(), + fullmesh.clone(), + replication_factor, + )?; + + // ---- set up metrics and status exchange ---- + let metrics = SystemMetrics::new(replication_factor); + + let mut local_status = NodeStatus::initial(replication_factor, &layout_manager.history()); + local_status.update_disk_usage(&config.metadata_dir, &config.data_dir, &metrics); + + // ---- if enabled, set up additionnal peer discovery methods ---- #[cfg(feature = "consul-discovery")] let consul_discovery = match &config.consul_discovery { Some(cfg) => Some( @@ -349,20 +306,14 @@ impl System { warn!("Kubernetes discovery is not enabled in this build."); } + // ---- done ---- let sys = Arc::new(System { id: netapp.id.into(), - persist_cluster_layout, persist_peer_list, local_status: ArcSwap::new(Arc::new(local_status)), node_status: RwLock::new(HashMap::new()), netapp: netapp.clone(), fullmesh: fullmesh.clone(), - rpc: RpcHelper::new( - netapp.id.into(), - fullmesh, - layout_watch.clone(), - 
config.rpc_timeout_msec.map(Duration::from_millis), - ), system_endpoint, replication_mode, replication_factor, @@ -374,10 +325,9 @@ impl System { consul_discovery, #[cfg(feature = "kubernetes-discovery")] kubernetes_discovery: config.kubernetes_discovery.clone(), + layout_manager, metrics, - layout_watch, - update_layout: Mutex::new(update_layout), metadata_dir: config.metadata_dir.clone(), data_dir: config.data_dir.clone(), }); @@ -397,6 +347,20 @@ impl System { ); } + // ---- Public utilities / accessors ---- + + pub fn cluster_layout(&self) -> watch::Ref> { + self.layout_manager.history() + } + + pub fn layout_watch(&self) -> watch::Receiver> { + self.layout_manager.layout_watch.clone() + } + + pub fn rpc_helper(&self) -> &RpcHelper { + &self.layout_manager.rpc_helper + } + // ---- Administrative operations (directly available and // also available through RPC) ---- @@ -423,18 +387,6 @@ impl System { known_nodes } - pub fn cluster_layout(&self) -> watch::Ref> { - self.layout_watch.borrow() - } - - pub async fn update_cluster_layout( - self: &Arc, - layout: &LayoutHistory, - ) -> Result<(), Error> { - self.handle_advertise_cluster_layout(layout).await?; - Ok(()) - } - pub async fn connect(&self, node: &str) -> Result<(), Error> { let (pubkey, addrs) = parse_and_resolve_peer_addr_async(node) .await @@ -464,7 +416,7 @@ impl System { } pub fn health(&self) -> ClusterHealth { - let layout: Arc<_> = self.layout_watch.borrow().clone(); + let layout: Arc<_> = self.cluster_layout().clone(); let quorum = self.replication_mode.write_quorum(); let replication_factor = self.replication_factor; @@ -581,20 +533,10 @@ impl System { } } - /// Save network configuration to disc - async fn save_cluster_layout(&self) -> Result<(), Error> { - let layout: Arc = self.layout_watch.borrow().clone(); - self.persist_cluster_layout - .save_async(&layout) - .await - .expect("Cannot save current cluster layout"); - Ok(()) - } - fn update_local_status(&self) { let mut new_si: NodeStatus = 
self.local_status.load().as_ref().clone(); - let layout = self.layout_watch.borrow(); + let layout = self.cluster_layout(); new_si.cluster_layout_version = layout.current().version; new_si.cluster_layout_staging_hash = layout.staging_hash; @@ -610,11 +552,6 @@ impl System { Ok(SystemRpc::Ok) } - fn handle_pull_cluster_layout(&self) -> SystemRpc { - let layout = self.layout_watch.borrow().as_ref().clone(); - SystemRpc::AdvertiseClusterLayout(layout) - } - fn handle_get_known_nodes(&self) -> SystemRpc { let known_nodes = self.get_known_nodes(); SystemRpc::ReturnKnownNodes(known_nodes) @@ -637,7 +574,10 @@ impl System { if info.cluster_layout_version > local_info.cluster_layout_version || info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash { - tokio::spawn(self.clone().pull_cluster_layout(from)); + tokio::spawn({ + let system = self.clone(); + async move { system.layout_manager.pull_cluster_layout(from).await } + }); } self.node_status @@ -648,57 +588,6 @@ impl System { Ok(SystemRpc::Ok) } - async fn handle_advertise_cluster_layout( - self: &Arc, - adv: &LayoutHistory, - ) -> Result { - if adv.current().replication_factor != self.replication_factor { - let msg = format!( - "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). 
Discarding the cluster layout we received.", - adv.current().replication_factor, - self.replication_factor - ); - error!("{}", msg); - return Err(Error::Message(msg)); - } - - let update_layout = self.update_layout.lock().await; - // TODO: don't clone each time an AdvertiseClusterLayout is received - let mut layout: LayoutHistory = self.layout_watch.borrow().as_ref().clone(); - - let prev_layout_check = layout.check().is_ok(); - if layout.merge(adv) { - if prev_layout_check && layout.check().is_err() { - error!("New cluster layout is invalid, discarding."); - return Err(Error::Message( - "New cluster layout is invalid, discarding.".into(), - )); - } - - update_layout.send(Arc::new(layout.clone()))?; - drop(update_layout); - - let self2 = self.clone(); - tokio::spawn(async move { - if let Err(e) = self2 - .rpc - .broadcast( - &self2.system_endpoint, - SystemRpc::AdvertiseClusterLayout(layout), - RequestStrategy::with_priority(PRIO_HIGH), - ) - .await - { - warn!("Error while broadcasting new cluster layout: {}", e); - } - }); - - self.save_cluster_layout().await?; - } - - Ok(SystemRpc::Ok) - } - async fn status_exchange_loop(&self, mut stop_signal: watch::Receiver) { while !*stop_signal.borrow() { let restart_at = Instant::now() + STATUS_EXCHANGE_INTERVAL; @@ -706,7 +595,7 @@ impl System { self.update_local_status(); let local_status: NodeStatus = self.local_status.load().as_ref().clone(); let _ = self - .rpc + .rpc_helper() .broadcast( &self.system_endpoint, SystemRpc::AdvertiseStatus(local_status), @@ -724,9 +613,9 @@ impl System { async fn discovery_loop(self: &Arc, mut stop_signal: watch::Receiver) { while !*stop_signal.borrow() { - let not_configured = self.layout_watch.borrow().check().is_err(); + let not_configured = self.cluster_layout().check().is_err(); let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor; - let expected_n_nodes = self.layout_watch.borrow().current().num_nodes(); + let expected_n_nodes = 
self.cluster_layout().current().num_nodes(); let bad_peers = self .fullmesh .get_peer_list() @@ -831,34 +720,26 @@ impl System { .save_async(&PeerList(peer_list)) .await } - - async fn pull_cluster_layout(self: Arc, peer: Uuid) { - let resp = self - .rpc - .call( - &self.system_endpoint, - peer, - SystemRpc::PullClusterLayout, - RequestStrategy::with_priority(PRIO_HIGH), - ) - .await; - if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp { - let _: Result<_, _> = self.handle_advertise_cluster_layout(&layout).await; - } - } } #[async_trait] impl EndpointHandler for System { async fn handle(self: &Arc, msg: &SystemRpc, from: NodeID) -> Result { match msg { + // ---- system functions -> System ---- SystemRpc::Connect(node) => self.handle_connect(node).await, - SystemRpc::PullClusterLayout => Ok(self.handle_pull_cluster_layout()), SystemRpc::AdvertiseStatus(adv) => self.handle_advertise_status(from.into(), adv).await, + SystemRpc::GetKnownNodes => Ok(self.handle_get_known_nodes()), + + // ---- layout functions -> LayoutManager ---- + SystemRpc::PullClusterLayout => Ok(self.layout_manager.handle_pull_cluster_layout()), SystemRpc::AdvertiseClusterLayout(adv) => { - self.clone().handle_advertise_cluster_layout(adv).await + self.layout_manager + .handle_advertise_cluster_layout(adv) + .await } - SystemRpc::GetKnownNodes => Ok(self.handle_get_known_nodes()), + + // ---- other -> Error ---- m => Err(Error::unexpected_rpc_message(m)), } } @@ -962,6 +843,40 @@ fn get_default_ip() -> Option { .map(|a| a.ip()) } +fn get_rpc_public_addr(config: &Config) -> Option { + match &config.rpc_public_addr { + Some(a_str) => { + use std::net::ToSocketAddrs; + match a_str.to_socket_addrs() { + Err(e) => { + error!( + "Cannot resolve rpc_public_addr {} from config file: {}.", + a_str, e + ); + None + } + Ok(a) => { + let a = a.collect::>(); + if a.is_empty() { + error!("rpc_public_addr {} resolve to no known IP address", a_str); + } + if a.len() > 1 { + warn!("Multiple possible 
resolutions for rpc_public_addr: {:?}. Taking the first one.", a); + } + a.into_iter().next() + } + } + } + None => { + let addr = get_default_ip().map(|ip| SocketAddr::new(ip, config.rpc_bind_addr.port())); + if let Some(a) = addr { + warn!("Using autodetected rpc_public_addr: {}. Consider specifying it explicitly in configuration file if possible.", a); + } + addr + } + } +} + async fn resolve_peers(peers: &[String]) -> Vec<(NodeID, SocketAddr)> { let mut ret = vec![]; diff --git a/src/table/gc.rs b/src/table/gc.rs index 5b9124a7..2135a358 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -227,7 +227,7 @@ impl TableGc { // GC'ing is not a critical function of the system, so it's not a big // deal if we can't do it right now. self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &nodes[..], @@ -248,7 +248,7 @@ impl TableGc { // it means that the garbage collection wasn't completed and has // to be retried later. self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &nodes[..], diff --git a/src/table/sync.rs b/src/table/sync.rs index 620d83b9..2da1bfe7 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -91,7 +91,7 @@ impl TableSyncer { bg.spawn_worker(SyncWorker { syncer: self.clone(), - layout_watch: self.system.layout_watch.clone(), + layout_watch: self.system.layout_watch(), layout: self.system.cluster_layout().clone(), add_full_sync_rx, todo: vec![], @@ -244,7 +244,7 @@ impl TableSyncer { } self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, nodes, @@ -305,7 +305,7 @@ impl TableSyncer { // If so, do nothing. 
let root_resp = self .system - .rpc + .rpc_helper() .call( &self.endpoint, who, @@ -361,7 +361,7 @@ impl TableSyncer { // and compare it with local node let remote_node = match self .system - .rpc + .rpc_helper() .call( &self.endpoint, who, @@ -437,7 +437,7 @@ impl TableSyncer { let rpc_resp = self .system - .rpc + .rpc_helper() .call( &self.endpoint, who, diff --git a/src/table/table.rs b/src/table/table.rs index 7ad79677..3e3fd138 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -123,7 +123,7 @@ impl Table { let rpc = TableRpc::::Update(vec![e_enc]); self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &who[..], @@ -181,7 +181,7 @@ impl Table { let resp = self .system - .rpc + .rpc_helper() .call( &self.endpoint, node, @@ -236,7 +236,7 @@ impl Table { let rpc = TableRpc::::ReadEntry(partition_key.clone(), sort_key.clone()); let resps = self .system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &who[..], @@ -332,7 +332,7 @@ impl Table { let resps = self .system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &who[..], @@ -411,7 +411,7 @@ impl Table { async fn repair_on_read(&self, who: &[Uuid], what: F::E) -> Result<(), Error> { let what_enc = Arc::new(ByteBuf::from(what.encode()?)); self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, who, -- cgit v1.2.3 From 19ef1ec8e7fee3a6c670e6e35dfcc83f0801e604 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 13:34:14 +0100 Subject: layout: more refactoring --- src/garage/cli/layout.rs | 6 ++- src/rpc/layout/manager.rs | 116 ++++++++++++++++++++++++++++++---------------- src/rpc/layout/schema.rs | 6 +-- src/rpc/system.rs | 40 ++++++---------- 4 files changed, 97 insertions(+), 71 deletions(-) diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 269d92f4..bffc81d3 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use bytesize::ByteSize; use format_table::format_table; @@ -321,7 
+323,7 @@ pub async fn fetch_layout( .call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL) .await?? { - SystemRpc::AdvertiseClusterLayout(t) => Ok(t), + SystemRpc::AdvertiseClusterLayout(t) => Ok(Arc::try_unwrap(t).unwrap()), resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), } } @@ -334,7 +336,7 @@ pub async fn send_layout( rpc_cli .call( &rpc_host, - SystemRpc::AdvertiseClusterLayout(layout), + SystemRpc::AdvertiseClusterLayout(Arc::new(layout)), PRIO_NORMAL, ) .await??; diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index a8a77139..351e0959 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -1,6 +1,8 @@ use std::sync::Arc; use std::time::Duration; +use serde::{Deserialize, Serialize}; + use tokio::sync::watch; use tokio::sync::Mutex; @@ -28,6 +30,16 @@ pub struct LayoutManager { system_endpoint: Arc>, } +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct LayoutStatus { + /// Cluster layout version + pub cluster_layout_version: u64, + /// Hash of cluster layout update trackers + // (TODO) pub cluster_layout_trackers_hash: Hash, + /// Hash of cluster layout staging data + pub cluster_layout_staging_hash: Hash, +} + impl LayoutManager { pub fn new( config: &Config, @@ -35,7 +47,7 @@ impl LayoutManager { system_endpoint: Arc>, fullmesh: Arc, replication_factor: usize, - ) -> Result { + ) -> Result, Error> { let persist_cluster_layout: Persister = Persister::new(&config.metadata_dir, "cluster_layout"); @@ -68,28 +80,39 @@ impl LayoutManager { config.rpc_timeout_msec.map(Duration::from_millis), ); - Ok(Self { + Ok(Arc::new(Self { replication_factor, persist_cluster_layout, layout_watch, update_layout: Mutex::new(update_layout), system_endpoint, rpc_helper, - }) + })) } // ---- PUBLIC INTERFACE ---- - pub async fn update_cluster_layout(&self, layout: &LayoutHistory) -> Result<(), Error> { + pub fn status(&self) -> LayoutStatus { + let layout = self.layout(); + LayoutStatus { + 
cluster_layout_version: layout.current().version, + cluster_layout_staging_hash: layout.staging_hash, + } + } + + pub async fn update_cluster_layout( + self: &Arc, + layout: &LayoutHistory, + ) -> Result<(), Error> { self.handle_advertise_cluster_layout(layout).await?; Ok(()) } - pub fn history(&self) -> watch::Ref> { + pub fn layout(&self) -> watch::Ref> { self.layout_watch.borrow() } - pub(crate) async fn pull_cluster_layout(&self, peer: Uuid) { + pub(crate) async fn pull_cluster_layout(self: &Arc, peer: Uuid) { let resp = self .rpc_helper .call( @@ -118,13 +141,25 @@ impl LayoutManager { // ---- RPC HANDLERS ---- + pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, status: &LayoutStatus) { + let local_status = self.status(); + if status.cluster_layout_version > local_status.cluster_layout_version + || status.cluster_layout_staging_hash != local_status.cluster_layout_staging_hash + { + tokio::spawn({ + let this = self.clone(); + async move { this.pull_cluster_layout(from).await } + }); + } + } + pub(crate) fn handle_pull_cluster_layout(&self) -> SystemRpc { - let layout = self.layout_watch.borrow().as_ref().clone(); + let layout = self.layout_watch.borrow().clone(); SystemRpc::AdvertiseClusterLayout(layout) } pub(crate) async fn handle_advertise_cluster_layout( - &self, + self: &Arc, adv: &LayoutHistory, ) -> Result { if adv.current().replication_factor != self.replication_factor { @@ -137,39 +172,42 @@ impl LayoutManager { return Err(Error::Message(msg)); } - let update_layout = self.update_layout.lock().await; - // TODO: don't clone each time an AdvertiseClusterLayout is received - let mut layout: LayoutHistory = self.layout_watch.borrow().as_ref().clone(); - - let prev_layout_check = layout.check().is_ok(); - if layout.merge(adv) { - if prev_layout_check && layout.check().is_err() { - error!("New cluster layout is invalid, discarding."); - return Err(Error::Message( - "New cluster layout is invalid, discarding.".into(), - )); - } - - 
update_layout.send(Arc::new(layout.clone()))?; - drop(update_layout); - - /* TODO - tokio::spawn(async move { - if let Err(e) = system - .rpc_helper() - .broadcast( - &system.system_endpoint, - SystemRpc::AdvertiseClusterLayout(layout), - RequestStrategy::with_priority(PRIO_HIGH), - ) - .await - { - warn!("Error while broadcasting new cluster layout: {}", e); + if *adv != **self.layout_watch.borrow() { + let update_layout = self.update_layout.lock().await; + let mut layout: LayoutHistory = self.layout_watch.borrow().as_ref().clone(); + + let prev_layout_check = layout.check().is_ok(); + if layout.merge(adv) { + if prev_layout_check && layout.check().is_err() { + error!("New cluster layout is invalid, discarding."); + return Err(Error::Message( + "New cluster layout is invalid, discarding.".into(), + )); } - }); - */ - self.save_cluster_layout().await?; + let layout = Arc::new(layout); + update_layout.send(layout.clone())?; + drop(update_layout); // release mutex + + tokio::spawn({ + let this = self.clone(); + async move { + if let Err(e) = this + .rpc_helper + .broadcast( + &this.system_endpoint, + SystemRpc::AdvertiseClusterLayout(layout), + RequestStrategy::with_priority(PRIO_HIGH), + ) + .await + { + warn!("Error while broadcasting new cluster layout: {}", e); + } + } + }); + + self.save_cluster_layout().await?; + } } Ok(SystemRpc::Ok) diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index c5b9b1d3..d587a6cb 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -226,7 +226,7 @@ mod v010 { } /// The history of cluster layouts - #[derive(Clone, Debug, Serialize, Deserialize)] + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] pub struct LayoutHistory { /// The versions currently in use in the cluster pub versions: Vec, @@ -241,7 +241,7 @@ mod v010 { } /// The tracker of acknowlegments and data syncs around the cluster - #[derive(Clone, Debug, Serialize, Deserialize, Default)] + #[derive(Clone, Debug, Serialize, 
Deserialize, Default, PartialEq)] pub struct UpdateTrackers { /// The highest layout version number each node has ack'ed pub ack_map: UpdateTracker, @@ -253,7 +253,7 @@ mod v010 { } /// The history of cluster layouts - #[derive(Clone, Debug, Serialize, Deserialize, Default)] + #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] pub struct UpdateTracker(pub HashMap); impl garage_util::migrate::Migrate for LayoutHistory { diff --git a/src/rpc/system.rs b/src/rpc/system.rs index a8e88425..88c4d443 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -33,7 +33,7 @@ use garage_util::time::*; use crate::consul::ConsulDiscovery; #[cfg(feature = "kubernetes-discovery")] use crate::kubernetes::*; -use crate::layout::manager::LayoutManager; +use crate::layout::manager::{LayoutManager, LayoutStatus}; use crate::layout::*; use crate::replication_mode::*; use crate::rpc_helper::*; @@ -68,7 +68,7 @@ pub enum SystemRpc { /// Ask other node its cluster layout. Answered with AdvertiseClusterLayout PullClusterLayout, /// Advertisement of cluster layout. 
Sent spontanously or in response to PullClusterLayout - AdvertiseClusterLayout(LayoutHistory), + AdvertiseClusterLayout(Arc), } impl Rpc for SystemRpc { @@ -104,7 +104,7 @@ pub struct System { #[cfg(feature = "kubernetes-discovery")] kubernetes_discovery: Option, - pub layout_manager: LayoutManager, + pub layout_manager: Arc, metrics: SystemMetrics, @@ -125,12 +125,8 @@ pub struct NodeStatus { /// Replication factor configured on the node pub replication_factor: usize, - /// Cluster layout version - pub cluster_layout_version: u64, - /// Hash of cluster layout update trackers - // (TODO) pub cluster_layout_trackers_hash: Hash, - /// Hash of cluster layout staging data - pub cluster_layout_staging_hash: Hash, + /// Layout status + pub layout_status: LayoutStatus, /// Disk usage on partition containing metadata directory (tuple: `(avail, total)`) #[serde(default)] @@ -284,7 +280,7 @@ impl System { // ---- set up metrics and status exchange ---- let metrics = SystemMetrics::new(replication_factor); - let mut local_status = NodeStatus::initial(replication_factor, &layout_manager.history()); + let mut local_status = NodeStatus::initial(replication_factor, &layout_manager); local_status.update_disk_usage(&config.metadata_dir, &config.data_dir, &metrics); // ---- if enabled, set up additionnal peer discovery methods ---- @@ -350,7 +346,7 @@ impl System { // ---- Public utilities / accessors ---- pub fn cluster_layout(&self) -> watch::Ref> { - self.layout_manager.history() + self.layout_manager.layout() } pub fn layout_watch(&self) -> watch::Receiver> { @@ -536,9 +532,7 @@ impl System { fn update_local_status(&self) { let mut new_si: NodeStatus = self.local_status.load().as_ref().clone(); - let layout = self.cluster_layout(); - new_si.cluster_layout_version = layout.current().version; - new_si.cluster_layout_staging_hash = layout.staging_hash; + new_si.layout_status = self.layout_manager.status(); new_si.update_disk_usage(&self.metadata_dir, &self.data_dir, &self.metrics); 
@@ -571,14 +565,8 @@ impl System { std::process::exit(1); } - if info.cluster_layout_version > local_info.cluster_layout_version - || info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash - { - tokio::spawn({ - let system = self.clone(); - async move { system.layout_manager.pull_cluster_layout(from).await } - }); - } + self.layout_manager + .handle_advertise_status(from, &info.layout_status); self.node_status .write() @@ -746,14 +734,13 @@ impl EndpointHandler for System { } impl NodeStatus { - fn initial(replication_factor: usize, layout: &LayoutHistory) -> Self { + fn initial(replication_factor: usize, layout_manager: &LayoutManager) -> Self { NodeStatus { hostname: gethostname::gethostname() .into_string() .unwrap_or_else(|_| "".to_string()), replication_factor, - cluster_layout_version: layout.current().version, - cluster_layout_staging_hash: layout.staging_hash, + layout_status: layout_manager.status(), meta_disk_avail: None, data_disk_avail: None, } @@ -763,8 +750,7 @@ impl NodeStatus { NodeStatus { hostname: "?".to_string(), replication_factor: 0, - cluster_layout_version: 0, - cluster_layout_staging_hash: Hash::from([0u8; 32]), + layout_status: Default::default(), meta_disk_avail: None, data_disk_avail: None, } -- cgit v1.2.3 From bfb1845fdc981a370539d641a5d80f438f184f07 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 14:12:05 +0100 Subject: layout: refactor to use a RwLock on LayoutHistory --- src/api/admin/cluster.rs | 6 +-- src/api/k2v/index.rs | 11 ++---- src/garage/cli/layout.rs | 6 +-- src/model/helper/bucket.rs | 11 ++---- src/rpc/layout/manager.rs | 93 +++++++++++++++++++++++----------------------- src/rpc/rpc_helper.rs | 11 +++--- src/rpc/system.rs | 15 ++++---- src/table/sync.rs | 21 +++++------ 8 files changed, 82 insertions(+), 92 deletions(-) diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index f5483451..593bd778 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ 
-210,7 +210,7 @@ pub async fn handle_update_cluster_layout( ) -> Result, Error> { let updates = parse_json_body::(req).await?; - let mut layout = garage.system.cluster_layout().as_ref().clone(); + let mut layout = garage.system.cluster_layout().clone(); let mut roles = layout.current().roles.clone(); roles.merge(&layout.staging.get().roles); @@ -256,7 +256,7 @@ pub async fn handle_apply_cluster_layout( ) -> Result, Error> { let param = parse_json_body::(req).await?; - let layout = garage.system.cluster_layout().as_ref().clone(); + let layout = garage.system.cluster_layout().clone(); let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; garage @@ -273,7 +273,7 @@ pub async fn handle_apply_cluster_layout( } pub async fn handle_revert_cluster_layout(garage: &Arc) -> Result, Error> { - let layout = garage.system.cluster_layout().as_ref().clone(); + let layout = garage.system.cluster_layout().clone(); let layout = layout.revert_staged_changes()?; garage .system diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs index a9bc3826..3c2f51a9 100644 --- a/src/api/k2v/index.rs +++ b/src/api/k2v/index.rs @@ -5,7 +5,6 @@ use serde::Serialize; use garage_util::data::*; -use garage_rpc::layout::LayoutHistory; use garage_table::util::*; use garage_model::garage::Garage; @@ -26,7 +25,8 @@ pub async fn handle_read_index( ) -> Result, Error> { let reverse = reverse.unwrap_or(false); - let layout: Arc = garage.system.cluster_layout().clone(); + // TODO: not only current + let node_id_vec = garage.system.cluster_layout().current().node_ids().to_vec(); let (partition_keys, more, next_start) = read_range( &garage.k2v.counter_table.table, @@ -35,10 +35,7 @@ pub async fn handle_read_index( &start, &end, limit, - Some(( - DeletedFilter::NotDeleted, - layout.current().node_id_vec.clone(), - )), + Some((DeletedFilter::NotDeleted, node_id_vec)), EnumerationOrder::from_reverse(reverse), ) .await?; @@ -57,7 +54,7 @@ pub async fn handle_read_index( partition_keys: 
partition_keys .into_iter() .map(|part| { - let vals = part.filtered_values(&layout); + let vals = part.filtered_values(&garage.system.cluster_layout()); ReadIndexResponseEntry { pk: part.sk, entries: *vals.get(&s_entries).unwrap_or(&0), diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index bffc81d3..269d92f4 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -1,5 +1,3 @@ -use std::sync::Arc; - use bytesize::ByteSize; use format_table::format_table; @@ -323,7 +321,7 @@ pub async fn fetch_layout( .call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL) .await?? { - SystemRpc::AdvertiseClusterLayout(t) => Ok(Arc::try_unwrap(t).unwrap()), + SystemRpc::AdvertiseClusterLayout(t) => Ok(t), resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), } } @@ -336,7 +334,7 @@ pub async fn send_layout( rpc_cli .call( &rpc_host, - SystemRpc::AdvertiseClusterLayout(Arc::new(layout)), + SystemRpc::AdvertiseClusterLayout(layout), PRIO_NORMAL, ) .await??; diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index 18904c8d..2a9c0fb1 100644 --- a/src/model/helper/bucket.rs +++ b/src/model/helper/bucket.rs @@ -450,10 +450,8 @@ impl<'a> BucketHelper<'a> { #[cfg(feature = "k2v")] { - use garage_rpc::layout::LayoutHistory; - use std::sync::Arc; - - let layout: Arc = self.0.system.cluster_layout().clone(); + // TODO: not only current + let node_id_vec = self.0.system.cluster_layout().current().node_ids().to_vec(); let k2vindexes = self .0 .k2v @@ -462,10 +460,7 @@ impl<'a> BucketHelper<'a> { .get_range( &bucket_id, None, - Some(( - DeletedFilter::NotDeleted, - layout.current().node_id_vec.clone(), - )), + Some((DeletedFilter::NotDeleted, node_id_vec)), 10, EnumerationOrder::Forward, ) diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 351e0959..c021039b 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -1,10 +1,9 @@ -use std::sync::Arc; +use std::sync::{Arc, RwLock, 
RwLockReadGuard}; use std::time::Duration; use serde::{Deserialize, Serialize}; -use tokio::sync::watch; -use tokio::sync::Mutex; +use tokio::sync::Notify; use netapp::endpoint::Endpoint; use netapp::peering::fullmesh::FullMeshPeeringStrategy; @@ -23,8 +22,8 @@ pub struct LayoutManager { replication_factor: usize, persist_cluster_layout: Persister, - pub layout_watch: watch::Receiver>, - update_layout: Mutex>>, + layout: Arc>, + pub(crate) change_notify: Arc, pub(crate) rpc_helper: RpcHelper, system_endpoint: Arc>, @@ -71,20 +70,21 @@ impl LayoutManager { } }; - let (update_layout, layout_watch) = watch::channel(Arc::new(cluster_layout)); + let layout = Arc::new(RwLock::new(cluster_layout)); + let change_notify = Arc::new(Notify::new()); let rpc_helper = RpcHelper::new( node_id.into(), fullmesh, - layout_watch.clone(), + layout.clone(), config.rpc_timeout_msec.map(Duration::from_millis), ); Ok(Arc::new(Self { replication_factor, persist_cluster_layout, - layout_watch, - update_layout: Mutex::new(update_layout), + layout, + change_notify, system_endpoint, rpc_helper, })) @@ -108,8 +108,8 @@ impl LayoutManager { Ok(()) } - pub fn layout(&self) -> watch::Ref> { - self.layout_watch.borrow() + pub fn layout(&self) -> RwLockReadGuard<'_, LayoutHistory> { + self.layout.read().unwrap() } pub(crate) async fn pull_cluster_layout(self: &Arc, peer: Uuid) { @@ -131,7 +131,7 @@ impl LayoutManager { /// Save network configuration to disc async fn save_cluster_layout(&self) -> Result<(), Error> { - let layout: Arc = self.layout_watch.borrow().clone(); + let layout = self.layout.read().unwrap().clone(); // TODO: avoid cloning self.persist_cluster_layout .save_async(&layout) .await @@ -139,6 +139,22 @@ impl LayoutManager { Ok(()) } + fn merge_layout(&self, adv: &LayoutHistory) -> Option { + let mut layout = self.layout.write().unwrap(); + let prev_layout_check = layout.check().is_ok(); + + if !prev_layout_check || adv.check().is_ok() { + if layout.merge(adv) { + if prev_layout_check 
&& layout.check().is_err() { + panic!("Merged two correct layouts and got an incorrect layout."); + } + + return Some(layout.clone()); + } + } + None + } + // ---- RPC HANDLERS ---- pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, status: &LayoutStatus) { @@ -154,7 +170,7 @@ impl LayoutManager { } pub(crate) fn handle_pull_cluster_layout(&self) -> SystemRpc { - let layout = self.layout_watch.borrow().clone(); + let layout = self.layout.read().unwrap().clone(); // TODO: avoid cloning SystemRpc::AdvertiseClusterLayout(layout) } @@ -172,42 +188,27 @@ impl LayoutManager { return Err(Error::Message(msg)); } - if *adv != **self.layout_watch.borrow() { - let update_layout = self.update_layout.lock().await; - let mut layout: LayoutHistory = self.layout_watch.borrow().as_ref().clone(); + if let Some(new_layout) = self.merge_layout(adv) { + self.change_notify.notify_waiters(); - let prev_layout_check = layout.check().is_ok(); - if layout.merge(adv) { - if prev_layout_check && layout.check().is_err() { - error!("New cluster layout is invalid, discarding."); - return Err(Error::Message( - "New cluster layout is invalid, discarding.".into(), - )); - } - - let layout = Arc::new(layout); - update_layout.send(layout.clone())?; - drop(update_layout); // release mutex - - tokio::spawn({ - let this = self.clone(); - async move { - if let Err(e) = this - .rpc_helper - .broadcast( - &this.system_endpoint, - SystemRpc::AdvertiseClusterLayout(layout), - RequestStrategy::with_priority(PRIO_HIGH), - ) - .await - { - warn!("Error while broadcasting new cluster layout: {}", e); - } + tokio::spawn({ + let this = self.clone(); + async move { + if let Err(e) = this + .rpc_helper + .broadcast( + &this.system_endpoint, + SystemRpc::AdvertiseClusterLayout(new_layout), + RequestStrategy::with_priority(PRIO_HIGH), + ) + .await + { + warn!("Error while broadcasting new cluster layout: {}", e); } - }); + } + }); - self.save_cluster_layout().await?; - } + self.save_cluster_layout().await?; 
} Ok(SystemRpc::Ok) diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 3fdb4acd..ce291068 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -1,12 +1,11 @@ //! Contain structs related to making RPCs -use std::sync::Arc; +use std::sync::{Arc, RwLock}; use std::time::Duration; use futures::future::join_all; use futures::stream::futures_unordered::FuturesUnordered; use futures::stream::StreamExt; use tokio::select; -use tokio::sync::watch; use opentelemetry::KeyValue; use opentelemetry::{ @@ -91,7 +90,7 @@ pub struct RpcHelper(Arc); struct RpcHelperInner { our_node_id: Uuid, fullmesh: Arc, - layout_watch: watch::Receiver>, + layout: Arc>, metrics: RpcMetrics, rpc_timeout: Duration, } @@ -100,7 +99,7 @@ impl RpcHelper { pub(crate) fn new( our_node_id: Uuid, fullmesh: Arc, - layout_watch: watch::Receiver>, + layout: Arc>, rpc_timeout: Option, ) -> Self { let metrics = RpcMetrics::new(); @@ -108,7 +107,7 @@ impl RpcHelper { Self(Arc::new(RpcHelperInner { our_node_id, fullmesh, - layout_watch, + layout, metrics, rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT), })) @@ -392,7 +391,7 @@ impl RpcHelper { pub fn request_order(&self, nodes: &[Uuid]) -> Vec { // Retrieve some status variables that we will use to sort requests let peer_list = self.0.fullmesh.get_peer_list(); - let layout: Arc = self.0.layout_watch.borrow().clone(); + let layout = self.0.layout.read().unwrap(); let our_zone = match layout.current().node_role(&self.0.our_node_id) { Some(pc) => &pc.zone, None => "", diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 88c4d443..cb3af3fe 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -4,7 +4,7 @@ use std::io::{Read, Write}; use std::net::{IpAddr, SocketAddr}; use std::path::{Path, PathBuf}; use std::sync::atomic::Ordering; -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, RwLock, RwLockReadGuard}; use std::time::{Duration, Instant}; use arc_swap::ArcSwap; @@ -13,7 +13,7 @@ use futures::join; use serde::{Deserialize, 
Serialize}; use sodiumoxide::crypto::sign::ed25519; use tokio::select; -use tokio::sync::watch; +use tokio::sync::{watch, Notify}; use netapp::endpoint::{Endpoint, EndpointHandler}; use netapp::message::*; @@ -68,7 +68,7 @@ pub enum SystemRpc { /// Ask other node its cluster layout. Answered with AdvertiseClusterLayout PullClusterLayout, /// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout - AdvertiseClusterLayout(Arc), + AdvertiseClusterLayout(LayoutHistory), } impl Rpc for SystemRpc { @@ -345,12 +345,12 @@ impl System { // ---- Public utilities / accessors ---- - pub fn cluster_layout(&self) -> watch::Ref> { + pub fn cluster_layout(&self) -> RwLockReadGuard<'_, LayoutHistory> { self.layout_manager.layout() } - pub fn layout_watch(&self) -> watch::Receiver> { - self.layout_manager.layout_watch.clone() + pub fn layout_notify(&self) -> Arc { + self.layout_manager.change_notify.clone() } pub fn rpc_helper(&self) -> &RpcHelper { @@ -412,7 +412,6 @@ impl System { } pub fn health(&self) -> ClusterHealth { - let layout: Arc<_> = self.cluster_layout().clone(); let quorum = self.replication_mode.write_quorum(); let replication_factor = self.replication_factor; @@ -423,6 +422,8 @@ impl System { .collect::>(); let connected_nodes = nodes.iter().filter(|(_, n)| n.is_up).count(); + let layout = self.cluster_layout(); // acquires a rwlock + // TODO: not only layout.current() let storage_nodes = layout .current() diff --git a/src/table/sync.rs b/src/table/sync.rs index 2da1bfe7..4355bd9e 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -10,7 +10,7 @@ use rand::Rng; use serde::{Deserialize, Serialize}; use serde_bytes::ByteBuf; use tokio::select; -use tokio::sync::{mpsc, watch}; +use tokio::sync::{mpsc, watch, Notify}; use garage_util::background::*; use garage_util::data::*; @@ -91,8 +91,8 @@ impl TableSyncer { bg.spawn_worker(SyncWorker { syncer: self.clone(), - layout_watch: self.system.layout_watch(), - layout: 
self.system.cluster_layout().clone(), + layout_notify: self.system.layout_notify(), + layout_version: self.system.cluster_layout().current().version, add_full_sync_rx, todo: vec![], next_full_sync: Instant::now() + Duration::from_secs(20), @@ -492,8 +492,8 @@ impl EndpointHandler for TableSync struct SyncWorker { syncer: Arc>, - layout_watch: watch::Receiver>, - layout: Arc, + layout_notify: Arc, + layout_version: u64, add_full_sync_rx: mpsc::UnboundedReceiver<()>, todo: Vec, next_full_sync: Instant, @@ -593,12 +593,11 @@ impl Worker for SyncWorker { self.add_full_sync(); } }, - _ = self.layout_watch.changed() => { - let new_layout = self.layout_watch.borrow(); - if !Arc::ptr_eq(&new_layout, &self.layout) { - self.layout = new_layout.clone(); - drop(new_layout); - debug!("({}) Ring changed, adding full sync to syncer todo list", F::TABLE_NAME); + _ = self.layout_notify.notified() => { + let new_version = self.syncer.system.cluster_layout().current().version; + if new_version > self.layout_version { + self.layout_version = new_version; + debug!("({}) Layout changed, adding full sync to syncer todo list", F::TABLE_NAME); self.add_full_sync(); } }, -- cgit v1.2.3 From 94caf9c0c1342ce1d2ba3ac7af39fb133721ee83 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 14:53:34 +0100 Subject: layout: separate code path for synchronizing update trackers only --- src/rpc/layout/history.rs | 51 ++++++++++++----- src/rpc/layout/manager.rs | 140 +++++++++++++++++++++++++++++++++------------- src/rpc/layout/schema.rs | 23 ++++++-- src/rpc/system.rs | 15 ++++- 4 files changed, 168 insertions(+), 61 deletions(-) diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 9ae28887..357b9d62 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -18,10 +18,11 @@ impl LayoutHistory { let mut ret = LayoutHistory { versions: vec![version].into_boxed_slice().into(), update_trackers: Default::default(), + trackers_hash: [0u8; 32].into(), staging: 
Lww::raw(0, staging), staging_hash: [0u8; 32].into(), }; - ret.staging_hash = ret.calculate_staging_hash(); + ret.update_hashes(); ret } @@ -29,6 +30,15 @@ impl LayoutHistory { self.versions.last().as_ref().unwrap() } + pub(crate) fn update_hashes(&mut self) { + self.trackers_hash = self.calculate_trackers_hash(); + self.staging_hash = self.calculate_staging_hash(); + } + + pub(crate) fn calculate_trackers_hash(&self) -> Hash { + blake2sum(&nonversioned_encode(&self.update_trackers).unwrap()[..]) + } + pub(crate) fn calculate_staging_hash(&self) -> Hash { blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) } @@ -38,12 +48,6 @@ impl LayoutHistory { pub fn merge(&mut self, other: &LayoutHistory) -> bool { let mut changed = false; - // Merge staged layout changes - if self.staging != other.staging { - changed = true; - } - self.staging.merge(&other.staging); - // Add any new versions to history for v2 in other.versions.iter() { if let Some(v1) = self.versions.iter().find(|v| v.version == v2.version) { @@ -63,7 +67,21 @@ impl LayoutHistory { } // Merge trackers - self.update_trackers.merge(&other.update_trackers); + if self.update_trackers != other.update_trackers { + let c = self.update_trackers.merge(&other.update_trackers); + changed = changed || c; + } + + // Merge staged layout changes + if self.staging != other.staging { + self.staging.merge(&other.staging); + changed = true; + } + + // Update hashes if there are changes + if changed { + self.update_hashes(); + } changed } @@ -100,7 +118,7 @@ To know the correct value of the new layout version, invoke `garage layout show` parameters: self.staging.get().parameters.clone(), roles: LwwMap::new(), }); - self.staging_hash = self.calculate_staging_hash(); + self.update_hashes(); Ok((self, msg)) } @@ -110,20 +128,25 @@ To know the correct value of the new layout version, invoke `garage layout show` parameters: Lww::new(self.current().parameters.clone()), roles: LwwMap::new(), }); - self.staging_hash = 
self.calculate_staging_hash(); + self.update_hashes(); Ok(self) } pub fn check(&self) -> Result<(), String> { // Check that the hash of the staging data is correct - let staging_hash = self.calculate_staging_hash(); - if staging_hash != self.staging_hash { + if self.trackers_hash != self.calculate_trackers_hash() { + return Err("trackers_hash is incorrect".into()); + } + if self.staging_hash != self.calculate_staging_hash() { return Err("staging_hash is incorrect".into()); } - // TODO: anythign more ? + for version in self.versions.iter() { + version.check()?; + } - self.current().check() + // TODO: anythign more ? + Ok(()) } } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index c021039b..a2502f58 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -19,6 +19,7 @@ use crate::rpc_helper::*; use crate::system::*; pub struct LayoutManager { + node_id: Uuid, replication_factor: usize, persist_cluster_layout: Persister, @@ -34,7 +35,7 @@ pub struct LayoutStatus { /// Cluster layout version pub cluster_layout_version: u64, /// Hash of cluster layout update trackers - // (TODO) pub cluster_layout_trackers_hash: Hash, + pub cluster_layout_trackers_hash: Hash, /// Hash of cluster layout staging data pub cluster_layout_staging_hash: Hash, } @@ -81,6 +82,7 @@ impl LayoutManager { ); Ok(Arc::new(Self { + node_id: node_id.into(), replication_factor, persist_cluster_layout, layout, @@ -92,10 +94,15 @@ impl LayoutManager { // ---- PUBLIC INTERFACE ---- + pub fn layout(&self) -> RwLockReadGuard<'_, LayoutHistory> { + self.layout.read().unwrap() + } + pub fn status(&self) -> LayoutStatus { let layout = self.layout(); LayoutStatus { cluster_layout_version: layout.current().version, + cluster_layout_trackers_hash: layout.trackers_hash, cluster_layout_staging_hash: layout.staging_hash, } } @@ -108,11 +115,35 @@ impl LayoutManager { Ok(()) } - pub fn layout(&self) -> RwLockReadGuard<'_, LayoutHistory> { - self.layout.read().unwrap() + // ---- 
INTERNALS --- + + fn merge_layout(&self, adv: &LayoutHistory) -> Option { + let mut layout = self.layout.write().unwrap(); + let prev_layout_check = layout.check().is_ok(); + + if !prev_layout_check || adv.check().is_ok() { + if layout.merge(adv) { + if prev_layout_check && layout.check().is_err() { + panic!("Merged two correct layouts and got an incorrect layout."); + } + + return Some(layout.clone()); + } + } + None } - pub(crate) async fn pull_cluster_layout(self: &Arc, peer: Uuid) { + fn merge_layout_trackers(&self, adv: &UpdateTrackers) -> Option { + let mut layout = self.layout.write().unwrap(); + if layout.update_trackers != *adv { + if layout.update_trackers.merge(adv) { + return Some(layout.update_trackers.clone()); + } + } + None + } + + async fn pull_cluster_layout(self: &Arc, peer: Uuid) { let resp = self .rpc_helper .call( @@ -123,15 +154,35 @@ impl LayoutManager { ) .await; if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp { - let _: Result<_, _> = self.handle_advertise_cluster_layout(&layout).await; + if let Err(e) = self.handle_advertise_cluster_layout(&layout).await { + warn!("In pull_cluster_layout: {}", e); + } } } - // ---- INTERNALS --- + async fn pull_cluster_layout_trackers(self: &Arc, peer: Uuid) { + let resp = self + .rpc_helper + .call( + &self.system_endpoint, + peer, + SystemRpc::PullClusterLayoutTrackers, + RequestStrategy::with_priority(PRIO_HIGH), + ) + .await; + if let Ok(SystemRpc::AdvertiseClusterLayoutTrackers(trackers)) = resp { + if let Err(e) = self + .handle_advertise_cluster_layout_trackers(&trackers) + .await + { + warn!("In pull_cluster_layout_trackers: {}", e); + } + } + } - /// Save network configuration to disc + /// Save cluster layout data to disk async fn save_cluster_layout(&self) -> Result<(), Error> { - let layout = self.layout.read().unwrap().clone(); // TODO: avoid cloning + let layout = self.layout.read().unwrap().clone(); self.persist_cluster_layout .save_async(&layout) .await @@ -139,33 +190,41 @@ 
impl LayoutManager { Ok(()) } - fn merge_layout(&self, adv: &LayoutHistory) -> Option { - let mut layout = self.layout.write().unwrap(); - let prev_layout_check = layout.check().is_ok(); - - if !prev_layout_check || adv.check().is_ok() { - if layout.merge(adv) { - if prev_layout_check && layout.check().is_err() { - panic!("Merged two correct layouts and got an incorrect layout."); + fn broadcast_update(self: &Arc, rpc: SystemRpc) { + tokio::spawn({ + let this = self.clone(); + async move { + if let Err(e) = this + .rpc_helper + .broadcast( + &this.system_endpoint, + rpc, + RequestStrategy::with_priority(PRIO_HIGH), + ) + .await + { + warn!("Error while broadcasting new cluster layout: {}", e); } - - return Some(layout.clone()); } - } - None + }); } // ---- RPC HANDLERS ---- - pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, status: &LayoutStatus) { - let local_status = self.status(); - if status.cluster_layout_version > local_status.cluster_layout_version - || status.cluster_layout_staging_hash != local_status.cluster_layout_staging_hash + pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, remote: &LayoutStatus) { + let local = self.status(); + if remote.cluster_layout_version > local.cluster_layout_version + || remote.cluster_layout_staging_hash != local.cluster_layout_staging_hash { tokio::spawn({ let this = self.clone(); async move { this.pull_cluster_layout(from).await } }); + } else if remote.cluster_layout_trackers_hash != local.cluster_layout_trackers_hash { + tokio::spawn({ + let this = self.clone(); + async move { this.pull_cluster_layout_trackers(from).await } + }); } } @@ -174,6 +233,11 @@ impl LayoutManager { SystemRpc::AdvertiseClusterLayout(layout) } + pub(crate) fn handle_pull_cluster_layout_trackers(&self) -> SystemRpc { + let layout = self.layout.read().unwrap(); + SystemRpc::AdvertiseClusterLayoutTrackers(layout.update_trackers.clone()) + } + pub(crate) async fn handle_advertise_cluster_layout( self: &Arc, adv: 
&LayoutHistory, @@ -190,24 +254,20 @@ impl LayoutManager { if let Some(new_layout) = self.merge_layout(adv) { self.change_notify.notify_waiters(); + self.broadcast_update(SystemRpc::AdvertiseClusterLayout(new_layout)); + self.save_cluster_layout().await?; + } - tokio::spawn({ - let this = self.clone(); - async move { - if let Err(e) = this - .rpc_helper - .broadcast( - &this.system_endpoint, - SystemRpc::AdvertiseClusterLayout(new_layout), - RequestStrategy::with_priority(PRIO_HIGH), - ) - .await - { - warn!("Error while broadcasting new cluster layout: {}", e); - } - } - }); + Ok(SystemRpc::Ok) + } + pub(crate) async fn handle_advertise_cluster_layout_trackers( + self: &Arc, + trackers: &UpdateTrackers, + ) -> Result { + if let Some(new_trackers) = self.merge_layout_trackers(trackers) { + self.change_notify.notify_waiters(); + self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(new_trackers)); self.save_cluster_layout().await?; } diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index d587a6cb..abae5bd8 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -233,6 +233,8 @@ mod v010 { /// Update trackers pub update_trackers: UpdateTrackers, + /// Hash of the update trackers + pub trackers_hash: Hash, /// Staged changes for the next version pub staging: Lww, @@ -289,10 +291,12 @@ mod v010 { sync_map: update_tracker.clone(), sync_ack_map: update_tracker.clone(), }, + trackers_hash: [0u8; 32].into(), staging: Lww::raw(previous.version, staging), staging_hash: [0u8; 32].into(), }; ret.staging_hash = ret.calculate_staging_hash(); + ret.trackers_hash = ret.calculate_trackers_hash(); ret } } @@ -355,14 +359,20 @@ impl core::str::FromStr for ZoneRedundancy { } impl UpdateTracker { - fn merge(&mut self, other: &UpdateTracker) { + fn merge(&mut self, other: &UpdateTracker) -> bool { + let mut changed = false; for (k, v) in other.0.iter() { if let Some(v_mut) = self.0.get_mut(k) { - *v_mut = std::cmp::max(*v_mut, *v); + if *v > 
*v_mut { + *v_mut = *v; + changed = true; + } } else { self.0.insert(*k, *v); + changed = true; } } + changed } pub(crate) fn min(&self) -> u64 { @@ -371,9 +381,10 @@ impl UpdateTracker { } impl UpdateTrackers { - pub(crate) fn merge(&mut self, other: &UpdateTrackers) { - self.ack_map.merge(&other.ack_map); - self.sync_map.merge(&other.sync_map); - self.sync_ack_map.merge(&other.sync_ack_map); + pub(crate) fn merge(&mut self, other: &UpdateTrackers) -> bool { + let c1 = self.ack_map.merge(&other.ack_map); + let c2 = self.sync_map.merge(&other.sync_map); + let c3 = self.sync_ack_map.merge(&other.sync_ack_map); + c1 || c2 || c3 } } diff --git a/src/rpc/system.rs b/src/rpc/system.rs index cb3af3fe..6ce13d0d 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -34,7 +34,7 @@ use crate::consul::ConsulDiscovery; #[cfg(feature = "kubernetes-discovery")] use crate::kubernetes::*; use crate::layout::manager::{LayoutManager, LayoutStatus}; -use crate::layout::*; +use crate::layout::{self, LayoutHistory, NodeRoleV}; use crate::replication_mode::*; use crate::rpc_helper::*; @@ -65,10 +65,15 @@ pub enum SystemRpc { GetKnownNodes, /// Return known nodes ReturnKnownNodes(Vec), + /// Ask other node its cluster layout. Answered with AdvertiseClusterLayout PullClusterLayout, /// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout AdvertiseClusterLayout(LayoutHistory), + /// Ask other node its cluster layout update trackers. + PullClusterLayoutTrackers, + /// Advertisement of cluster layout update trackers. 
+ AdvertiseClusterLayoutTrackers(layout::UpdateTrackers), } impl Rpc for SystemRpc { @@ -727,6 +732,14 @@ impl EndpointHandler for System { .handle_advertise_cluster_layout(adv) .await } + SystemRpc::PullClusterLayoutTrackers => { + Ok(self.layout_manager.handle_pull_cluster_layout_trackers()) + } + SystemRpc::AdvertiseClusterLayoutTrackers(adv) => { + self.layout_manager + .handle_advertise_cluster_layout_trackers(adv) + .await + } // ---- other -> Error ---- m => Err(Error::unexpected_rpc_message(m)), -- cgit v1.2.3 From 03ebf18830dff1983f09abe6ecb8d8d26daeb446 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 15:31:59 +0100 Subject: layout: begin managing the update tracker values --- src/rpc/layout/history.rs | 74 +++++++++++++++++++++++++++++++++++++++++++---- src/rpc/layout/manager.rs | 7 +++-- src/rpc/layout/schema.rs | 15 +++++++--- src/rpc/layout/version.rs | 46 +++++++++++++++-------------- 4 files changed, 109 insertions(+), 33 deletions(-) diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 357b9d62..347f03db 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -1,3 +1,5 @@ +use std::collections::HashSet; + use garage_util::crdt::{Crdt, Lww, LwwMap}; use garage_util::data::*; use garage_util::encode::nonversioned_encode; @@ -30,6 +32,14 @@ impl LayoutHistory { self.versions.last().as_ref().unwrap() } + pub fn all_storage_nodes(&self) -> HashSet { + self.versions + .iter() + .map(|x| x.nongateway_nodes()) + .flatten() + .collect::>() + } + pub(crate) fn update_hashes(&mut self) { self.trackers_hash = self.calculate_trackers_hash(); self.staging_hash = self.calculate_staging_hash(); @@ -43,6 +53,65 @@ impl LayoutHistory { blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) } + // ------------------ update tracking --------------- + + pub(crate) fn update_trackers(&mut self, node_id: Uuid) { + // Ensure trackers for this node's values are up-to-date + + // 1. 
Acknowledge the last layout version in the history + self.ack_last(node_id); + + // 2. Assume the data on this node is sync'ed up at least to + // the first layout version in the history + self.sync_first(node_id); + + // 3. Acknowledge everyone has synced up to min(self.sync_map) + self.sync_ack(node_id); + + // 4. Cleanup layout versions that are not needed anymore + self.cleanup_old_versions(); + + info!("ack_map: {:?}", self.update_trackers.ack_map); + info!("sync_map: {:?}", self.update_trackers.sync_map); + info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); + + // Finally, update hashes + self.update_hashes(); + } + + pub(crate) fn ack_last(&mut self, node: Uuid) { + let last_version = self.current().version; + self.update_trackers.ack_map.set_max(node, last_version); + } + + pub(crate) fn sync_first(&mut self, node: Uuid) { + let first_version = self.versions.first().as_ref().unwrap().version; + self.update_trackers.sync_map.set_max(node, first_version); + } + + pub(crate) fn sync_ack(&mut self, node: Uuid) { + self.update_trackers.sync_ack_map.set_max( + node, + self.calculate_global_min(&self.update_trackers.sync_map), + ); + } + + pub(crate) fn cleanup_old_versions(&mut self) { + let min_sync_ack = self.calculate_global_min(&self.update_trackers.sync_ack_map); + while self.versions.first().as_ref().unwrap().version < min_sync_ack { + self.versions.remove(0); + } + } + + pub(crate) fn calculate_global_min(&self, tracker: &UpdateTracker) -> u64 { + let storage_nodes = self.all_storage_nodes(); + storage_nodes + .iter() + .map(|x| tracker.0.get(x).copied().unwrap_or(0)) + .min() + .unwrap_or(0) + } + // ================== updates to layout, public interface =================== pub fn merge(&mut self, other: &LayoutHistory) -> bool { @@ -78,11 +147,6 @@ impl LayoutHistory { changed = true; } - // Update hashes if there are changes - if changed { - self.update_hashes(); - } - changed } diff --git a/src/rpc/layout/manager.rs 
b/src/rpc/layout/manager.rs index a2502f58..ffcc938b 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -51,7 +51,7 @@ impl LayoutManager { let persist_cluster_layout: Persister = Persister::new(&config.metadata_dir, "cluster_layout"); - let cluster_layout = match persist_cluster_layout.load() { + let mut cluster_layout = match persist_cluster_layout.load() { Ok(x) => { if x.current().replication_factor != replication_factor { return Err(Error::Message(format!( @@ -71,6 +71,8 @@ impl LayoutManager { } }; + cluster_layout.update_trackers(node_id.into()); + let layout = Arc::new(RwLock::new(cluster_layout)); let change_notify = Arc::new(Notify::new()); @@ -126,7 +128,7 @@ impl LayoutManager { if prev_layout_check && layout.check().is_err() { panic!("Merged two correct layouts and got an incorrect layout."); } - + layout.update_trackers(self.node_id); return Some(layout.clone()); } } @@ -137,6 +139,7 @@ impl LayoutManager { let mut layout = self.layout.write().unwrap(); if layout.update_trackers != *adv { if layout.update_trackers.merge(adv) { + layout.update_trackers(self.node_id); return Some(layout.update_trackers.clone()); } } diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index abae5bd8..9f5d6f62 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -3,6 +3,7 @@ use std::fmt; use bytesize::ByteSize; use garage_util::crdt::{AutoCrdt, Crdt}; +use garage_util::data::Uuid; mod v08 { use crate::layout::CompactNodeType; @@ -276,8 +277,7 @@ mod v010 { let update_tracker = UpdateTracker( version .nongateway_nodes() - .iter() - .map(|x| (*x, version.version)) + .map(|x| (x, version.version)) .collect::>(), ); let staging = LayoutStaging { @@ -375,8 +375,15 @@ impl UpdateTracker { changed } - pub(crate) fn min(&self) -> u64 { - self.0.iter().map(|(_, v)| *v).min().unwrap_or(0) + pub(crate) fn set_max(&mut self, peer: Uuid, value: u64) { + match self.0.get_mut(&peer) { + Some(e) => { + *e = std::cmp::max(*e, value); 
+ } + None => { + self.0.insert(peer, value); + } + } } } diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 6918fdf9..65c62f63 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -134,15 +134,14 @@ impl LayoutVersion { // ===================== internal information extractors ====================== /// Returns the uuids of the non_gateway nodes in self.node_id_vec. - pub(crate) fn nongateway_nodes(&self) -> Vec { - let mut result = Vec::::new(); - for uuid in self.node_id_vec.iter() { - match self.node_role(uuid) { - Some(role) if role.capacity.is_some() => result.push(*uuid), - _ => (), - } - } - result + pub(crate) fn nongateway_nodes(&self) -> impl Iterator + '_ { + self.node_id_vec + .iter() + .copied() + .filter(move |uuid| match self.node_role(uuid) { + Some(role) if role.capacity.is_some() => true, + _ => false, + }) } /// Given a node uuids, this function returns the label of its zone @@ -158,8 +157,8 @@ impl LayoutVersion { /// Returns the sum of capacities of non gateway nodes in the cluster fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; - for uuid in self.nongateway_nodes().iter() { - total_capacity += self.get_node_capacity(uuid)?; + for uuid in self.nongateway_nodes() { + total_capacity += self.get_node_capacity(&uuid)?; } Ok(total_capacity) } @@ -320,7 +319,7 @@ impl LayoutVersion { // to use them as indices in the flow graphs. 
let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; - let nb_nongateway_nodes = self.nongateway_nodes().len(); + let nb_nongateway_nodes = self.nongateway_nodes().count(); if nb_nongateway_nodes < self.replication_factor { return Err(Error::Message(format!( "The number of nodes with positive \ @@ -479,7 +478,8 @@ impl LayoutVersion { let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); - for uuid in self.nongateway_nodes().iter() { + let nongateway_nodes = self.nongateway_nodes().collect::>(); + for uuid in nongateway_nodes.iter() { let r = self.node_role(uuid).unwrap(); if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() { zone_to_id.insert(r.zone.clone(), id_to_zone.len()); @@ -556,8 +556,10 @@ impl LayoutVersion { exclude_assoc: &HashSet<(usize, usize)>, zone_redundancy: usize, ) -> Result, Error> { - let vertices = - LayoutVersion::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); + let vertices = LayoutVersion::generate_graph_vertices( + zone_to_id.len(), + self.nongateway_nodes().count(), + ); let mut g = Graph::::new(&vertices); let nb_zones = zone_to_id.len(); for p in 0..NB_PARTITIONS { @@ -576,7 +578,7 @@ impl LayoutVersion { )?; } } - for n in 0..self.nongateway_nodes().len() { + for n in 0..self.nongateway_nodes().count() { let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; @@ -600,7 +602,7 @@ impl LayoutVersion { // previous assignment let mut exclude_edge = HashSet::<(usize, usize)>::new(); if let Some(prev_assign) = prev_assign_opt { - let nb_nodes = self.nongateway_nodes().len(); + let nb_nodes = self.nongateway_nodes().count(); for (p, prev_assign_p) in prev_assign.iter().enumerate() { for n in 0..nb_nodes { exclude_edge.insert((p, n)); @@ -652,7 +654,7 @@ impl LayoutVersion { // We compute the maximal length of a simple 
path in gflow. It is used in the // Bellman-Ford algorithm in optimize_flow_with_cost to set the number // of iterations. - let nb_nodes = self.nongateway_nodes().len(); + let nb_nodes = self.nongateway_nodes().count(); let path_length = 4 * nb_nodes; gflow.optimize_flow_with_cost(&cost, path_length)?; @@ -730,7 +732,7 @@ impl LayoutVersion { } // We define and fill in the following tables - let storing_nodes = self.nongateway_nodes(); + let storing_nodes = self.nongateway_nodes().collect::>(); let mut new_partitions = vec![0; storing_nodes.len()]; let mut stored_partitions = vec![0; storing_nodes.len()]; @@ -873,9 +875,9 @@ mod tests { for z in zones.iter() { zone_token.insert(z.clone(), 0); } - for uuid in cl.nongateway_nodes().iter() { - let z = cl.get_node_zone(uuid)?; - let c = cl.get_node_capacity(uuid)?; + for uuid in cl.nongateway_nodes() { + let z = cl.get_node_zone(&uuid)?; + let c = cl.get_node_capacity(&uuid)?; zone_token.insert( z.clone(), zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), -- cgit v1.2.3 From bad7cc812ead88e9f334405c5c082d79c14c8898 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 15:42:10 +0100 Subject: layout admin: add missing calls to update_hash --- src/api/admin/cluster.rs | 1 + src/garage/cli/layout.rs | 3 ++- src/rpc/layout/history.rs | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 593bd778..d912b58f 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -240,6 +240,7 @@ pub async fn handle_update_cluster_layout( .merge(&roles.update_mutator(node, layout::NodeRoleV(new_role))); } + layout.update_hashes(); garage .system .layout_manager diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 269d92f4..15727448 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -329,8 +329,9 @@ pub async fn fetch_layout( pub async fn send_layout( rpc_cli: &Endpoint, rpc_host: NodeID, - 
layout: LayoutHistory, + mut layout: LayoutHistory, ) -> Result<(), Error> { + layout.update_hashes(); rpc_cli .call( &rpc_host, diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 347f03db..e17a1c77 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -40,7 +40,7 @@ impl LayoutHistory { .collect::>() } - pub(crate) fn update_hashes(&mut self) { + pub fn update_hashes(&mut self) { self.trackers_hash = self.calculate_trackers_hash(); self.staging_hash = self.calculate_staging_hash(); } -- cgit v1.2.3 From 9d95f6f7040c1899715ae4f984313427b1432758 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 15:52:45 +0100 Subject: layout: fix tracker bugs --- src/rpc/layout/manager.rs | 8 +++++++- src/rpc/layout/schema.rs | 6 +++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index ffcc938b..c1417dac 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -125,10 +125,10 @@ impl LayoutManager { if !prev_layout_check || adv.check().is_ok() { if layout.merge(adv) { + layout.update_trackers(self.node_id); if prev_layout_check && layout.check().is_err() { panic!("Merged two correct layouts and got an incorrect layout."); } - layout.update_trackers(self.node_id); return Some(layout.clone()); } } @@ -245,6 +245,8 @@ impl LayoutManager { self: &Arc, adv: &LayoutHistory, ) -> Result { + debug!("handle_advertise_cluster_layout: {:?}", adv); + if adv.current().replication_factor != self.replication_factor { let msg = format!( "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). 
Discarding the cluster layout we received.", @@ -256,6 +258,8 @@ impl LayoutManager { } if let Some(new_layout) = self.merge_layout(adv) { + debug!("handle_advertise_cluster_layout: some changes were added to the current stuff"); + self.change_notify.notify_waiters(); self.broadcast_update(SystemRpc::AdvertiseClusterLayout(new_layout)); self.save_cluster_layout().await?; @@ -268,6 +272,8 @@ impl LayoutManager { self: &Arc, trackers: &UpdateTrackers, ) -> Result { + debug!("handle_advertise_cluster_layout_trackers: {:?}", trackers); + if let Some(new_trackers) = self.merge_layout_trackers(trackers) { self.change_notify.notify_waiters(); self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(new_trackers)); diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 9f5d6f62..db60c806 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -190,7 +190,7 @@ mod v010 { use garage_util::crdt::{Lww, LwwMap}; use garage_util::data::{Hash, Uuid}; use serde::{Deserialize, Serialize}; - use std::collections::HashMap; + use std::collections::BTreeMap; pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; /// The layout of the cluster, i.e. 
the list of roles @@ -257,7 +257,7 @@ mod v010 { /// The history of cluster layouts #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] - pub struct UpdateTracker(pub HashMap); + pub struct UpdateTracker(pub BTreeMap); impl garage_util::migrate::Migrate for LayoutHistory { const VERSION_MARKER: &'static [u8] = b"G010lh"; @@ -278,7 +278,7 @@ mod v010 { version .nongateway_nodes() .map(|x| (x, version.version)) - .collect::>(), + .collect::>(), ); let staging = LayoutStaging { parameters: previous.staging_parameters, -- cgit v1.2.3 From df36cf3099f6010c4fc62109b85d4d1e62f160cc Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Nov 2023 16:32:31 +0100 Subject: layout: add helpers to LayoutHistory and prepare integration with Table --- src/rpc/layout/history.rs | 41 +++++++++++++++++++++++++++++++++-------- src/rpc/layout/manager.rs | 33 ++++++++++++++++++++++++++++++++- src/rpc/layout/schema.rs | 9 ++++++--- src/rpc/layout/version.rs | 2 +- src/table/table.rs | 2 ++ 5 files changed, 74 insertions(+), 13 deletions(-) diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index e17a1c77..dbb02269 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -32,14 +32,6 @@ impl LayoutHistory { self.versions.last().as_ref().unwrap() } - pub fn all_storage_nodes(&self) -> HashSet { - self.versions - .iter() - .map(|x| x.nongateway_nodes()) - .flatten() - .collect::>() - } - pub fn update_hashes(&mut self) { self.trackers_hash = self.calculate_trackers_hash(); self.staging_hash = self.calculate_staging_hash(); @@ -53,6 +45,39 @@ impl LayoutHistory { blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) } + // ------------------ who stores what now? 
--------------- + + pub fn max_ack(&self) -> u64 { + self.calculate_global_min(&self.update_trackers.ack_map) + } + + pub fn all_storage_nodes(&self) -> HashSet { + // TODO: cache this + self.versions + .iter() + .map(|x| x.nongateway_nodes()) + .flatten() + .collect::>() + } + + pub fn read_nodes_of(&self, position: &Hash) -> Vec { + let sync_min = self.calculate_global_min(&self.update_trackers.sync_map); + let version = self + .versions + .iter() + .find(|x| x.version == sync_min) + .or(self.versions.last()) + .unwrap(); + version.nodes_of(position, version.replication_factor) + } + + pub fn write_sets_of(&self, position: &Hash) -> Vec> { + self.versions + .iter() + .map(|x| x.nodes_of(position, x.replication_factor)) + .collect::>() + } + // ------------------ update tracking --------------- pub(crate) fn update_trackers(&mut self, node_id: Uuid) { diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index c1417dac..b0302b12 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -1,4 +1,5 @@ -use std::sync::{Arc, RwLock, RwLockReadGuard}; +use std::collections::HashMap; +use std::sync::{Arc, Mutex, RwLock, RwLockReadGuard}; use std::time::Duration; use serde::{Deserialize, Serialize}; @@ -26,6 +27,8 @@ pub struct LayoutManager { layout: Arc>, pub(crate) change_notify: Arc, + table_sync_version: Mutex>, + pub(crate) rpc_helper: RpcHelper, system_endpoint: Arc>, } @@ -117,6 +120,34 @@ impl LayoutManager { Ok(()) } + pub fn add_table(&self, table_name: &'static str) { + let first_version = self.layout().versions.first().unwrap().version; + + self.table_sync_version + .lock() + .unwrap() + .insert(table_name.to_string(), first_version); + } + + pub fn sync_table_until(self: &Arc, table_name: &'static str, version: u64) { + let mut table_sync_version = self.table_sync_version.lock().unwrap(); + *table_sync_version.get_mut(table_name).unwrap() = version; + let sync_until = table_sync_version.iter().map(|(_, v)| *v).max().unwrap(); + 
drop(table_sync_version); + + let mut layout = self.layout.write().unwrap(); + if layout + .update_trackers + .sync_map + .set_max(self.node_id, sync_until) + { + layout.update_hashes(); + self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( + layout.update_trackers.clone(), + )); + } + } + // ---- INTERNALS --- fn merge_layout(&self, adv: &LayoutHistory) -> Option { diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index db60c806..89f5c361 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -375,14 +375,17 @@ impl UpdateTracker { changed } - pub(crate) fn set_max(&mut self, peer: Uuid, value: u64) { + pub(crate) fn set_max(&mut self, peer: Uuid, value: u64) -> bool { match self.0.get_mut(&peer) { - Some(e) => { - *e = std::cmp::max(*e, value); + Some(e) if *e < value => { + *e = value; + true } None => { self.0.insert(peer, value); + true } + _ => false, } } } diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 65c62f63..8133672a 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -109,7 +109,7 @@ impl LayoutVersion { .collect::>() } - /// Walk the ring to find the n servers in which data should be replicated + /// Return the n servers in which data for this hash should be replicated pub fn nodes_of(&self, position: &Hash, n: usize) -> Vec { assert_eq!(n, self.replication_factor); diff --git a/src/table/table.rs b/src/table/table.rs index 3e3fd138..997fd7dc 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -80,6 +80,8 @@ impl Table { let syncer = TableSyncer::new(system.clone(), data.clone(), merkle_updater.clone()); let gc = TableGc::new(system.clone(), data.clone()); + system.layout_manager.add_table(F::TABLE_NAME); + let table = Arc::new(Self { system, data, -- cgit v1.2.3 From ce89d1ddabe3b9e638b0173949726522ae9a0311 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sat, 11 Nov 2023 12:08:32 +0100 Subject: table sync: adapt to new layout history --- 
src/rpc/layout/history.rs | 21 +++-- src/rpc/layout/manager.rs | 1 + src/rpc/layout/version.rs | 16 ++-- src/rpc/system.rs | 2 +- src/table/replication/fullcopy.rs | 25 ++++- src/table/replication/parameters.rs | 19 +++- src/table/replication/sharded.rs | 39 +++++++- src/table/sync.rs | 178 +++++++++++++++--------------------- 8 files changed, 172 insertions(+), 129 deletions(-) diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index dbb02269..185dbb27 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -47,11 +47,19 @@ impl LayoutHistory { // ------------------ who stores what now? --------------- - pub fn max_ack(&self) -> u64 { + pub fn all_ack(&self) -> u64 { self.calculate_global_min(&self.update_trackers.ack_map) } - pub fn all_storage_nodes(&self) -> HashSet { + pub fn min_stored(&self) -> u64 { + self.versions.first().as_ref().unwrap().version + } + + pub fn sync_versions(&self) -> (u64, u64, u64) { + (self.current().version, self.all_ack(), self.min_stored()) + } + + pub fn all_nongateway_nodes(&self) -> HashSet { // TODO: cache this self.versions .iter() @@ -71,11 +79,10 @@ impl LayoutHistory { version.nodes_of(position, version.replication_factor) } - pub fn write_sets_of(&self, position: &Hash) -> Vec> { + pub fn write_sets_of<'a>(&'a self, position: &'a Hash) -> impl Iterator> + 'a { self.versions .iter() - .map(|x| x.nodes_of(position, x.replication_factor)) - .collect::>() + .map(move |x| x.nodes_of(position, x.replication_factor)) } // ------------------ update tracking --------------- @@ -129,7 +136,9 @@ impl LayoutHistory { } pub(crate) fn calculate_global_min(&self, tracker: &UpdateTracker) -> u64 { - let storage_nodes = self.all_storage_nodes(); + // TODO: for TableFullReplication, counting gateway nodes might be + // necessary? Think about this more. 
+ let storage_nodes = self.all_nongateway_nodes(); storage_nodes .iter() .map(|x| tracker.0.get(x).copied().unwrap_or(0)) diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index b0302b12..7d60bae6 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -92,6 +92,7 @@ impl LayoutManager { persist_cluster_layout, layout, change_notify, + table_sync_version: Mutex::new(HashMap::new()), system_endpoint, rpc_helper, })) diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 8133672a..f45a3c35 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -98,15 +98,13 @@ impl LayoutVersion { } /// Get the list of partitions and the first hash of a partition key that would fall in it - pub fn partitions(&self) -> Vec<(Partition, Hash)> { - (0..(1 << PARTITION_BITS)) - .map(|i| { - let top = (i as u16) << (16 - PARTITION_BITS); - let mut location = [0u8; 32]; - location[..2].copy_from_slice(&u16::to_be_bytes(top)[..]); - (i as u16, Hash::from(location)) - }) - .collect::>() + pub fn partitions(&self) -> impl Iterator + '_ { + (0..(1 << PARTITION_BITS)).map(|i| { + let top = (i as u16) << (16 - PARTITION_BITS); + let mut location = [0u8; 32]; + location[..2].copy_from_slice(&u16::to_be_bytes(top)[..]); + (i as u16, Hash::from(location)) + }) } /// Return the n servers in which data for this hash should be replicated diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 6ce13d0d..3418600b 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -442,7 +442,7 @@ impl System { .filter(|(x, _, _)| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) .count(); - let partitions = layout.current().partitions(); + let partitions = layout.current().partitions().collect::>(); let partitions_n_up = partitions .iter() .map(|(_, h)| { diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index a5c83d0f..5653a229 100644 --- a/src/table/replication/fullcopy.rs +++ 
b/src/table/replication/fullcopy.rs @@ -1,3 +1,4 @@ +use std::iter::FromIterator; use std::sync::Arc; use garage_rpc::layout::*; @@ -6,10 +7,17 @@ use garage_util::data::*; use crate::replication::*; +// TODO: find a way to track layout changes for this as well +// The hard thing is that this data is stored also on gateway nodes, +// whereas sharded data is stored only on non-Gateway nodes (storage nodes) +// Also we want to be more tolerant to failures of gateways so we don't +// want to do too much holding back of data when progress of gateway +// nodes is not reported in the layout history's ack/sync/sync_ack maps. + /// Full replication schema: all nodes store everything -/// Writes are disseminated in an epidemic manner in the network /// Advantage: do all reads locally, extremely fast /// Inconvenient: only suitable to reasonably small tables +/// Inconvenient: if some writes fail, nodes will read outdated data #[derive(Clone)] pub struct TableFullReplication { /// The membership manager of this node @@ -44,7 +52,18 @@ impl TableReplication for TableFullReplication { fn partition_of(&self, _hash: &Hash) -> Partition { 0u16 } - fn partitions(&self) -> Vec<(Partition, Hash)> { - vec![(0u16, [0u8; 32].into())] + + fn sync_partitions(&self) -> SyncPartitions { + let layout = self.system.cluster_layout(); + let layout_version = layout.current().version; + SyncPartitions { + layout_version, + partitions: vec![SyncPartition { + partition: 0u16, + first_hash: [0u8; 32].into(), + last_hash: [0xff; 32].into(), + storage_nodes: Vec::from_iter(layout.current().node_ids().to_vec()), + }], + } } } diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index 19b306f2..2a7d3585 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -20,6 +20,21 @@ pub trait TableReplication: Send + Sync + 'static { // Accessing partitions, for Merkle tree & sync /// Get partition for data with given hash fn 
partition_of(&self, hash: &Hash) -> Partition; - /// List of existing partitions - fn partitions(&self) -> Vec<(Partition, Hash)>; + + /// List of partitions and nodes to sync with in current layout + fn sync_partitions(&self) -> SyncPartitions; +} + +#[derive(Debug)] +pub struct SyncPartitions { + pub layout_version: u64, + pub partitions: Vec, +} + +#[derive(Debug)] +pub struct SyncPartition { + pub partition: Partition, + pub first_hash: Hash, + pub last_hash: Hash, + pub storage_nodes: Vec, } diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 793d87fd..f02b1d66 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -51,7 +51,42 @@ impl TableReplication for TableShardedReplication { fn partition_of(&self, hash: &Hash) -> Partition { self.system.cluster_layout().current().partition_of(hash) } - fn partitions(&self) -> Vec<(Partition, Hash)> { - self.system.cluster_layout().current().partitions() + + fn sync_partitions(&self) -> SyncPartitions { + let layout = self.system.cluster_layout(); + let layout_version = layout.all_ack(); + + let mut partitions = layout + .current() + .partitions() + .map(|(partition, first_hash)| { + let mut storage_nodes = layout + .write_sets_of(&first_hash) + .map(|x| x.into_iter()) + .flatten() + .collect::>(); + storage_nodes.sort(); + storage_nodes.dedup(); + SyncPartition { + partition, + first_hash, + last_hash: [0u8; 32].into(), // filled in just after + storage_nodes, + } + }) + .collect::>(); + + for i in 0..partitions.len() { + partitions[i].last_hash = if i + 1 < partitions.len() { + partitions[i + 1].first_hash + } else { + [0xFFu8; 32].into() + }; + } + + SyncPartitions { + layout_version, + partitions, + } } } diff --git a/src/table/sync.rs b/src/table/sync.rs index 4355bd9e..43636faa 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -6,7 +6,7 @@ use arc_swap::ArcSwapOption; use async_trait::async_trait; use futures_util::stream::*; use 
opentelemetry::KeyValue; -use rand::Rng; +use rand::prelude::*; use serde::{Deserialize, Serialize}; use serde_bytes::ByteBuf; use tokio::select; @@ -52,16 +52,6 @@ impl Rpc for SyncRpc { type Response = Result; } -#[derive(Debug, Clone)] -struct TodoPartition { - partition: Partition, - begin: Hash, - end: Hash, - - // Are we a node that stores this partition or not? - retain: bool, -} - impl TableSyncer { pub(crate) fn new( system: Arc, @@ -92,9 +82,9 @@ impl TableSyncer { bg.spawn_worker(SyncWorker { syncer: self.clone(), layout_notify: self.system.layout_notify(), - layout_version: self.system.cluster_layout().current().version, + layout_versions: self.system.cluster_layout().sync_versions(), add_full_sync_rx, - todo: vec![], + todo: None, next_full_sync: Instant::now() + Duration::from_secs(20), }); } @@ -112,31 +102,26 @@ impl TableSyncer { async fn sync_partition( self: &Arc, - partition: &TodoPartition, + partition: &SyncPartition, must_exit: &mut watch::Receiver, ) -> Result<(), Error> { - if partition.retain { - let my_id = self.system.id; - - let nodes = self - .data - .replication - .write_nodes(&partition.begin) - .into_iter() - .filter(|node| *node != my_id) - .collect::>(); + let my_id = self.system.id; + let retain = partition.storage_nodes.contains(&my_id); + if retain { debug!( "({}) Syncing {:?} with {:?}...", F::TABLE_NAME, partition, - nodes + partition.storage_nodes ); - let mut sync_futures = nodes + let mut sync_futures = partition + .storage_nodes .iter() + .filter(|node| **node != my_id) .map(|node| { self.clone() - .do_sync_with(partition.clone(), *node, must_exit.clone()) + .do_sync_with(&partition, *node, must_exit.clone()) }) .collect::>(); @@ -147,14 +132,14 @@ impl TableSyncer { warn!("({}) Sync error: {}", F::TABLE_NAME, e); } } - if n_errors > self.data.replication.max_write_errors() { + if n_errors > 0 { return Err(Error::Message(format!( - "Sync failed with too many nodes (should have been: {:?}).", - nodes + "Sync failed with {} 
nodes.", + n_errors ))); } } else { - self.offload_partition(&partition.begin, &partition.end, must_exit) + self.offload_partition(&partition.first_hash, &partition.last_hash, must_exit) .await?; } @@ -285,7 +270,7 @@ impl TableSyncer { async fn do_sync_with( self: Arc, - partition: TodoPartition, + partition: &SyncPartition, who: Uuid, must_exit: watch::Receiver, ) -> Result<(), Error> { @@ -492,76 +477,23 @@ impl EndpointHandler for TableSync struct SyncWorker { syncer: Arc>, + layout_notify: Arc, - layout_version: u64, + layout_versions: (u64, u64, u64), + add_full_sync_rx: mpsc::UnboundedReceiver<()>, - todo: Vec, next_full_sync: Instant, + + todo: Option, } impl SyncWorker { fn add_full_sync(&mut self) { - let system = &self.syncer.system; - let data = &self.syncer.data; - - let my_id = system.id; - - self.todo.clear(); - - let partitions = data.replication.partitions(); - - for i in 0..partitions.len() { - let begin = partitions[i].1; - - let end = if i + 1 < partitions.len() { - partitions[i + 1].1 - } else { - [0xFFu8; 32].into() - }; - - let nodes = data.replication.write_nodes(&begin); - - let retain = nodes.contains(&my_id); - if !retain { - // Check if we have some data to send, otherwise skip - match data.store.range(begin..end) { - Ok(mut iter) => { - if iter.next().is_none() { - continue; - } - } - Err(e) => { - warn!("DB error in add_full_sync: {}", e); - continue; - } - } - } - - self.todo.push(TodoPartition { - partition: partitions[i].0, - begin, - end, - retain, - }); - } - + let mut partitions = self.syncer.data.replication.sync_partitions(); + partitions.partitions.shuffle(&mut thread_rng()); + self.todo = Some(partitions); self.next_full_sync = Instant::now() + ANTI_ENTROPY_INTERVAL; } - - fn pop_task(&mut self) -> Option { - if self.todo.is_empty() { - return None; - } - - let i = rand::thread_rng().gen_range(0..self.todo.len()); - if i == self.todo.len() - 1 { - self.todo.pop() - } else { - let replacement = self.todo.pop().unwrap(); - let 
ret = std::mem::replace(&mut self.todo[i], replacement); - Some(ret) - } - } } #[async_trait] @@ -572,18 +504,46 @@ impl Worker for SyncWorker { fn status(&self) -> WorkerStatus { WorkerStatus { - queue_length: Some(self.todo.len() as u64), + queue_length: Some(self.todo.as_ref().map(|x| x.partitions.len()).unwrap_or(0) as u64), ..Default::default() } } async fn work(&mut self, must_exit: &mut watch::Receiver) -> Result { - if let Some(partition) = self.pop_task() { - self.syncer.sync_partition(&partition, must_exit).await?; - Ok(WorkerState::Busy) - } else { - Ok(WorkerState::Idle) + if let Some(todo) = &mut self.todo { + let partition = todo.partitions.pop().unwrap(); + + // process partition + if let Err(e) = self.syncer.sync_partition(&partition, must_exit).await { + error!( + "{}: Failed to sync partition {:?}: {}", + F::TABLE_NAME, + partition, + e + ); + // if error, put partition back at the other side of the queue, + // so that other partitions will be tried in the meantime + todo.partitions.insert(0, partition); + // TODO: returning an error here will cause the background job worker + // to delay this task for some time, but maybe we don't want to + // delay it if there are lots of failures from nodes that are gone + // (we also don't want zero delays as that will cause lots of useless retries) + return Err(e); + } + + // done + if !todo.partitions.is_empty() { + return Ok(WorkerState::Busy); + } + + self.syncer + .system + .layout_manager + .sync_table_until(F::TABLE_NAME, todo.layout_version); } + + self.todo = None; + Ok(WorkerState::Idle) } async fn wait_for_work(&mut self) -> WorkerState { @@ -594,10 +554,16 @@ impl Worker for SyncWorker { } }, _ = self.layout_notify.notified() => { - let new_version = self.syncer.system.cluster_layout().current().version; - if new_version > self.layout_version { - self.layout_version = new_version; - debug!("({}) Layout changed, adding full sync to syncer todo list", F::TABLE_NAME); + let layout_versions = 
self.syncer.system.cluster_layout().sync_versions(); + if layout_versions != self.layout_versions { + self.layout_versions = layout_versions; + debug!( + "({}) Layout versions changed (max={}, ack={}, min stored={}), adding full sync to syncer todo list", + F::TABLE_NAME, + layout_versions.0, + layout_versions.1, + layout_versions.2 + ); self.add_full_sync(); } }, @@ -605,9 +571,9 @@ impl Worker for SyncWorker { self.add_full_sync(); } } - match self.todo.is_empty() { - false => WorkerState::Busy, - true => WorkerState::Idle, + match self.todo.is_some() { + true => WorkerState::Busy, + false => WorkerState::Idle, } } } -- cgit v1.2.3 From df24bb806d64d5d5e748c35efe3f49ad3dda709e Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sat, 11 Nov 2023 12:37:33 +0100 Subject: layout/sync: fix bugs and add tracing --- src/rpc/layout/history.rs | 3 ++- src/rpc/layout/manager.rs | 10 ++++++-- src/table/sync.rs | 60 ++++++++++++++++++++++++++++++----------------- 3 files changed, 48 insertions(+), 25 deletions(-) diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 185dbb27..cef56647 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -131,7 +131,8 @@ impl LayoutHistory { pub(crate) fn cleanup_old_versions(&mut self) { let min_sync_ack = self.calculate_global_min(&self.update_trackers.sync_ack_map); while self.versions.first().as_ref().unwrap().version < min_sync_ack { - self.versions.remove(0); + let removed = self.versions.remove(0); + info!("Layout history: pruning old version {}", removed.version); } } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 7d60bae6..ce8b6f61 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -133,7 +133,7 @@ impl LayoutManager { pub fn sync_table_until(self: &Arc, table_name: &'static str, version: u64) { let mut table_sync_version = self.table_sync_version.lock().unwrap(); *table_sync_version.get_mut(table_name).unwrap() = version; - let sync_until = 
table_sync_version.iter().map(|(_, v)| *v).max().unwrap(); + let sync_until = table_sync_version.iter().map(|(_, v)| *v).min().unwrap(); drop(table_sync_version); let mut layout = self.layout.write().unwrap(); @@ -142,6 +142,7 @@ impl LayoutManager { .sync_map .set_max(self.node_id, sync_until) { + debug!("sync_until updated to {}", sync_until); layout.update_hashes(); self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( layout.update_trackers.clone(), @@ -277,7 +278,12 @@ impl LayoutManager { self: &Arc, adv: &LayoutHistory, ) -> Result { - debug!("handle_advertise_cluster_layout: {:?}", adv); + debug!( + "handle_advertise_cluster_layout: {} versions, last={}, trackers={:?}", + adv.versions.len(), + adv.current().version, + adv.update_trackers + ); if adv.current().replication_factor != self.replication_factor { let msg = format!( diff --git a/src/table/sync.rs b/src/table/sync.rs index 43636faa..8c21db8b 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -488,8 +488,29 @@ struct SyncWorker { } impl SyncWorker { + fn check_add_full_sync(&mut self) { + let layout_versions = self.syncer.system.cluster_layout().sync_versions(); + if layout_versions != self.layout_versions { + self.layout_versions = layout_versions; + info!( + "({}) Layout versions changed (max={}, ack={}, min stored={}), adding full sync to syncer todo list", + F::TABLE_NAME, + layout_versions.0, + layout_versions.1, + layout_versions.2 + ); + self.add_full_sync(); + } + } + fn add_full_sync(&mut self) { let mut partitions = self.syncer.data.replication.sync_partitions(); + info!( + "{}: Adding full sync for ack layout version {}", + F::TABLE_NAME, + partitions.layout_version + ); + partitions.partitions.shuffle(&mut thread_rng()); self.todo = Some(partitions); self.next_full_sync = Instant::now() + ANTI_ENTROPY_INTERVAL; @@ -510,6 +531,8 @@ impl Worker for SyncWorker { } async fn work(&mut self, must_exit: &mut watch::Receiver) -> Result { + self.check_add_full_sync(); + if let 
Some(todo) = &mut self.todo { let partition = todo.partitions.pop().unwrap(); @@ -531,19 +554,23 @@ impl Worker for SyncWorker { return Err(e); } - // done - if !todo.partitions.is_empty() { - return Ok(WorkerState::Busy); + if todo.partitions.is_empty() { + info!( + "{}: Completed full sync for ack layout version {}", + F::TABLE_NAME, + todo.layout_version + ); + self.syncer + .system + .layout_manager + .sync_table_until(F::TABLE_NAME, todo.layout_version); + self.todo = None; } - self.syncer - .system - .layout_manager - .sync_table_until(F::TABLE_NAME, todo.layout_version); + Ok(WorkerState::Busy) + } else { + Ok(WorkerState::Idle) } - - self.todo = None; - Ok(WorkerState::Idle) } async fn wait_for_work(&mut self) -> WorkerState { @@ -554,18 +581,7 @@ impl Worker for SyncWorker { } }, _ = self.layout_notify.notified() => { - let layout_versions = self.syncer.system.cluster_layout().sync_versions(); - if layout_versions != self.layout_versions { - self.layout_versions = layout_versions; - debug!( - "({}) Layout versions changed (max={}, ack={}, min stored={}), adding full sync to syncer todo list", - F::TABLE_NAME, - layout_versions.0, - layout_versions.1, - layout_versions.2 - ); - self.add_full_sync(); - } + self.check_add_full_sync(); }, _ = tokio::time::sleep_until(self.next_full_sync.into()) => { self.add_full_sync(); -- cgit v1.2.3 From 9a491fa1372a23e91c793ee1d2b313607752826a Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Sat, 11 Nov 2023 13:10:59 +0100 Subject: layout: fix test --- src/rpc/layout/history.rs | 9 ++- src/rpc/layout/mod.rs | 3 + src/rpc/layout/test.rs | 159 ++++++++++++++++++++++++++++++++++++++++++ src/rpc/layout/version.rs | 172 ++-------------------------------------------- 4 files changed, 174 insertions(+), 169 deletions(-) create mode 100644 src/rpc/layout/test.rs diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index cef56647..050f5d0a 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ 
-18,7 +18,7 @@ impl LayoutHistory { }; let mut ret = LayoutHistory { - versions: vec![version].into_boxed_slice().into(), + versions: vec![version], update_trackers: Default::default(), trackers_hash: [0u8; 32].into(), staging: Lww::raw(0, staging), @@ -211,6 +211,11 @@ To know the correct value of the new layout version, invoke `garage layout show` let msg = new_version.calculate_partition_assignment()?; self.versions.push(new_version); + if self.current().check().is_ok() { + while self.versions.first().unwrap().check().is_err() { + self.versions.remove(0); + } + } // Reset the staged layout changes self.staging.update(LayoutStaging { @@ -245,7 +250,7 @@ To know the correct value of the new layout version, invoke `garage layout show` version.check()?; } - // TODO: anythign more ? + // TODO: anything more ? Ok(()) } } diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index cd3764bc..577b32fb 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -3,6 +3,9 @@ mod history; mod schema; mod version; +#[cfg(test)] +mod test; + pub mod manager; // ---- re-exports ---- diff --git a/src/rpc/layout/test.rs b/src/rpc/layout/test.rs new file mode 100644 index 00000000..0ce090d2 --- /dev/null +++ b/src/rpc/layout/test.rs @@ -0,0 +1,159 @@ +use std::cmp::min; +use std::collections::HashMap; + +use garage_util::crdt::Crdt; +use garage_util::error::*; + +use crate::layout::*; + +// This function checks that the partition size S computed is at least better than the +// one given by a very naive algorithm. To do so, we try to run the naive algorithm +// assuming a partion size of S+1. If we succed, it means that the optimal assignment +// was not optimal. The naive algorithm is the following : +// - we compute the max number of partitions associated to every node, capped at the +// partition number. It gives the number of tokens of every node. +// - every zone has a number of tokens equal to the sum of the tokens of its nodes. 
+// - we cycle over the partitions and associate zone tokens while respecting the +// zone redundancy constraint. +// NOTE: the naive algorithm is not optimal. Counter example: +// take nb_partition = 3 ; replication_factor = 5; redundancy = 4; +// number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) +// With these parameters, the naive algo fails, whereas there is a solution: +// (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) +fn check_against_naive(cl: &LayoutVersion) -> Result { + let over_size = cl.partition_size + 1; + let mut zone_token = HashMap::::new(); + + let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; + + if zones.is_empty() { + return Ok(false); + } + + for z in zones.iter() { + zone_token.insert(z.clone(), 0); + } + for uuid in cl.nongateway_nodes() { + let z = cl.get_node_zone(&uuid)?; + let c = cl.get_node_capacity(&uuid)?; + zone_token.insert( + z.to_string(), + zone_token[z] + min(NB_PARTITIONS, (c / over_size) as usize), + ); + } + + // For every partition, we count the number of zone already associated and + // the name of the last zone associated + + let mut id_zone_token = vec![0; zones.len()]; + for (z, t) in zone_token.iter() { + id_zone_token[zone_to_id[z]] = *t; + } + + let mut nb_token = vec![0; NB_PARTITIONS]; + let mut last_zone = vec![zones.len(); NB_PARTITIONS]; + + let mut curr_zone = 0; + + let redundancy = cl.effective_zone_redundancy(); + + for replic in 0..cl.replication_factor { + for p in 0..NB_PARTITIONS { + while id_zone_token[curr_zone] == 0 + || (last_zone[p] == curr_zone + && redundancy - nb_token[p] <= cl.replication_factor - replic) + { + curr_zone += 1; + if curr_zone >= zones.len() { + return Ok(true); + } + } + id_zone_token[curr_zone] -= 1; + if last_zone[p] != curr_zone { + nb_token[p] += 1; + last_zone[p] = curr_zone; + } + } + } + + return Ok(false); +} + +fn show_msg(msg: &Message) { + for s in msg.iter() { + println!("{}", s); + } +} + +fn update_layout( + cl: &mut LayoutHistory, + 
node_capacity_vec: &[u64], + node_zone_vec: &[&'static str], + zone_redundancy: usize, +) { + let staging = cl.staging.get_mut(); + + for (i, (capacity, zone)) in node_capacity_vec + .iter() + .zip(node_zone_vec.iter()) + .enumerate() + { + let node_id = [i as u8; 32].into(); + + let update = staging.roles.update_mutator( + node_id, + NodeRoleV(Some(NodeRole { + zone: zone.to_string(), + capacity: Some(*capacity), + tags: (vec![]), + })), + ); + staging.roles.merge(&update); + } + staging.parameters.update(LayoutParameters { + zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), + }); + + cl.update_hashes(); +} + +#[test] +fn test_assignment() { + let mut node_capacity_vec = vec![4000, 1000, 2000]; + let mut node_zone_vec = vec!["A", "B", "C"]; + + let mut cl = LayoutHistory::new(3); + update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3); + let v = cl.current().version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(check_against_naive(cl.current()).unwrap()); + + node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; + node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"]; + update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 2); + let v = cl.current().version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(check_against_naive(cl.current()).unwrap()); + + node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; + update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3); + let v = cl.current().version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(check_against_naive(cl.current()).unwrap()); + + node_capacity_vec = vec![ + 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, + ]; + update_layout(&mut cl, 
&node_capacity_vec, &node_zone_vec, 1); + let v = cl.current().version; + let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(check_against_naive(cl.current()).unwrap()); +} diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index f45a3c35..ffbdf277 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -143,7 +143,7 @@ impl LayoutVersion { } /// Given a node uuids, this function returns the label of its zone - fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { + pub(crate) fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { match self.node_role(uuid) { Some(role) => Ok(&role.zone), _ => Err(Error::Message( @@ -162,7 +162,7 @@ impl LayoutVersion { } /// Returns the effective value of the zone_redundancy parameter - fn effective_zone_redundancy(&self) -> usize { + pub(crate) fn effective_zone_redundancy(&self) -> usize { match self.parameters.zone_redundancy { ZoneRedundancy::AtLeast(v) => v, ZoneRedundancy::Maximum => { @@ -472,7 +472,9 @@ impl LayoutVersion { /// This function generates ids for the zone of the nodes appearing in /// self.node_id_vec. - fn generate_nongateway_zone_ids(&self) -> Result<(Vec, HashMap), Error> { + pub(crate) fn generate_nongateway_zone_ids( + &self, + ) -> Result<(Vec, HashMap), Error> { let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); @@ -838,167 +840,3 @@ impl LayoutVersion { Ok(msg) } } - -// ==================================================================================== - -#[cfg(test)] -mod tests { - use super::{Error, *}; - use std::cmp::min; - - // This function checks that the partition size S computed is at least better than the - // one given by a very naive algorithm. To do so, we try to run the naive algorithm - // assuming a partion size of S+1. If we succed, it means that the optimal assignment - // was not optimal. 
The naive algorithm is the following : - // - we compute the max number of partitions associated to every node, capped at the - // partition number. It gives the number of tokens of every node. - // - every zone has a number of tokens equal to the sum of the tokens of its nodes. - // - we cycle over the partitions and associate zone tokens while respecting the - // zone redundancy constraint. - // NOTE: the naive algorithm is not optimal. Counter example: - // take nb_partition = 3 ; replication_factor = 5; redundancy = 4; - // number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) - // With these parameters, the naive algo fails, whereas there is a solution: - // (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) - fn check_against_naive(cl: &LayoutVersion) -> Result { - let over_size = cl.partition_size + 1; - let mut zone_token = HashMap::::new(); - - let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; - - if zones.is_empty() { - return Ok(false); - } - - for z in zones.iter() { - zone_token.insert(z.clone(), 0); - } - for uuid in cl.nongateway_nodes() { - let z = cl.get_node_zone(&uuid)?; - let c = cl.get_node_capacity(&uuid)?; - zone_token.insert( - z.clone(), - zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), - ); - } - - // For every partition, we count the number of zone already associated and - // the name of the last zone associated - - let mut id_zone_token = vec![0; zones.len()]; - for (z, t) in zone_token.iter() { - id_zone_token[zone_to_id[z]] = *t; - } - - let mut nb_token = vec![0; NB_PARTITIONS]; - let mut last_zone = vec![zones.len(); NB_PARTITIONS]; - - let mut curr_zone = 0; - - let redundancy = cl.effective_zone_redundancy(); - - for replic in 0..cl.replication_factor { - for p in 0..NB_PARTITIONS { - while id_zone_token[curr_zone] == 0 - || (last_zone[p] == curr_zone - && redundancy - nb_token[p] <= cl.replication_factor - replic) - { - curr_zone += 1; - if curr_zone >= zones.len() { - return Ok(true); - } - } - 
id_zone_token[curr_zone] -= 1; - if last_zone[p] != curr_zone { - nb_token[p] += 1; - last_zone[p] = curr_zone; - } - } - } - - return Ok(false); - } - - fn show_msg(msg: &Message) { - for s in msg.iter() { - println!("{}", s); - } - } - - fn update_layout( - cl: &mut LayoutVersion, - node_id_vec: &Vec, - node_capacity_vec: &Vec, - node_zone_vec: &Vec, - zone_redundancy: usize, - ) { - for i in 0..node_id_vec.len() { - if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { - cl.node_id_vec.push(x); - } - - let update = cl.staging_roles.update_mutator( - cl.node_id_vec[i], - NodeRoleV(Some(NodeRole { - zone: (node_zone_vec[i].to_string()), - capacity: (Some(node_capacity_vec[i])), - tags: (vec![]), - })), - ); - cl.staging_roles.merge(&update); - } - cl.staging_parameters.update(LayoutParameters { - zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), - }); - cl.staging_hash = cl.calculate_staging_hash(); - } - - #[test] - fn test_assignment() { - let mut node_id_vec = vec![1, 2, 3]; - let mut node_capacity_vec = vec![4000, 1000, 2000]; - let mut node_zone_vec = vec!["A", "B", "C"] - .into_iter() - .map(|x| x.to_string()) - .collect(); - - let mut cl = LayoutVersion::new(3); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; - node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; - node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"] - .into_iter() - .map(|x| x.to_string()) - .collect(); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - 
assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_capacity_vec = vec![ - 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, - ]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); - let v = cl.version; - let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - } -} -- cgit v1.2.3 From 8e292e06b3fde1d3b5b019a26eabd4f0d9ac22c3 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 12:48:38 +0100 Subject: layout: some refactoring of nongateway nodes --- src/api/k2v/index.rs | 7 +++- src/model/index_counter.rs | 4 +- src/rpc/layout/history.rs | 30 ++++++++------ src/rpc/layout/schema.rs | 17 ++++++++ src/rpc/layout/version.rs | 98 +++++++++++++++++++++++++--------------------- 5 files changed, 95 insertions(+), 61 deletions(-) diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs index 3c2f51a9..c189232a 100644 --- a/src/api/k2v/index.rs +++ b/src/api/k2v/index.rs @@ -25,8 +25,11 @@ pub async fn handle_read_index( ) -> Result, Error> { let reverse = reverse.unwrap_or(false); - // TODO: not only current - let node_id_vec = garage.system.cluster_layout().current().node_ids().to_vec(); + let node_id_vec = garage + .system + .cluster_layout() + .all_nongateway_nodes() + .into_owned(); let (partition_keys, more, next_start) = read_range( &garage.k2v.counter_table.table, diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs index 9637cc4c..2d968733 100644 --- a/src/model/index_counter.rs +++ 
b/src/model/index_counter.rs @@ -84,8 +84,8 @@ impl Entry for CounterEntry { impl CounterEntry { pub fn filtered_values(&self, layout: &LayoutHistory) -> HashMap { - let nodes = &layout.current().node_id_vec[..]; - self.filtered_values_with_nodes(nodes) + let nodes = layout.all_nongateway_nodes(); + self.filtered_values_with_nodes(&nodes) } pub fn filtered_values_with_nodes(&self, nodes: &[Uuid]) -> HashMap { diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 050f5d0a..877ad3a7 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::collections::HashSet; use garage_util::crdt::{Crdt, Lww, LwwMap}; @@ -59,13 +60,19 @@ impl LayoutHistory { (self.current().version, self.all_ack(), self.min_stored()) } - pub fn all_nongateway_nodes(&self) -> HashSet { + pub fn all_nongateway_nodes(&self) -> Cow<'_, [Uuid]> { // TODO: cache this - self.versions - .iter() - .map(|x| x.nongateway_nodes()) - .flatten() - .collect::>() + if self.versions.len() == 1 { + self.versions[0].nongateway_nodes().into() + } else { + let set = self + .versions + .iter() + .map(|x| x.nongateway_nodes()) + .flatten() + .collect::>(); + set.into_iter().copied().collect::>().into() + } } pub fn read_nodes_of(&self, position: &Hash) -> Vec { @@ -202,14 +209,11 @@ To know the correct value of the new layout version, invoke `garage layout show` } // Compute new version and add it to history - let mut new_version = self.current().clone(); - new_version.version += 1; - - new_version.roles.merge(&self.staging.get().roles); - new_version.roles.retain(|(_, _, v)| v.0.is_some()); - new_version.parameters = *self.staging.get().parameters.get(); + let (new_version, msg) = self + .current() + .clone() + .calculate_next_version(&self.staging.get())?; - let msg = new_version.calculate_partition_assignment()?; self.versions.push(new_version); if self.current().check().is_ok() { while self.versions.first().unwrap().check().is_err() { 
diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 89f5c361..db298ee6 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -212,6 +212,8 @@ mod v010 { /// see comment in v08::ClusterLayout pub node_id_vec: Vec, + /// number of non-gateway nodes, which are the first ids in node_id_vec + pub nongateway_node_count: usize, /// see comment in v08::ClusterLayout #[serde(with = "serde_bytes")] pub ring_assignment_data: Vec, @@ -265,6 +267,18 @@ mod v010 { type Previous = v09::ClusterLayout; fn migrate(previous: Self::Previous) -> Self { + let nongateway_node_count = previous + .node_id_vec + .iter() + .enumerate() + .filter(|(_, uuid)| { + let role = previous.roles.get(uuid); + matches!(role, Some(NodeRoleV(Some(role))) if role.capacity.is_some()) + }) + .map(|(i, _)| i) + .max() + .unwrap_or(0); + let version = LayoutVersion { version: previous.version, replication_factor: previous.replication_factor, @@ -272,11 +286,14 @@ mod v010 { parameters: previous.parameters, roles: previous.roles, node_id_vec: previous.node_id_vec, + nongateway_node_count, ring_assignment_data: previous.ring_assignment_data, }; let update_tracker = UpdateTracker( version .nongateway_nodes() + .iter() + .copied() .map(|x| (x, version.version)) .collect::>(), ); diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index ffbdf277..a7f387b6 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -5,7 +5,7 @@ use std::convert::TryInto; use bytesize::ByteSize; use itertools::Itertools; -use garage_util::crdt::LwwMap; +use garage_util::crdt::{Crdt, LwwMap}; use garage_util::data::*; use garage_util::error::*; @@ -30,6 +30,7 @@ impl LayoutVersion { partition_size: 0, roles: LwwMap::new(), node_id_vec: Vec::new(), + nongateway_node_count: 0, ring_assignment_data: Vec::new(), parameters, } @@ -43,6 +44,11 @@ impl LayoutVersion { &self.node_id_vec[..] } + /// Returns the uuids of the non_gateway nodes in self.node_id_vec. 
+ pub fn nongateway_nodes(&self) -> &[Uuid] { + &self.node_id_vec[..self.nongateway_node_count] + } + pub fn num_nodes(&self) -> usize { self.node_id_vec.len() } @@ -56,18 +62,14 @@ impl LayoutVersion { } /// Given a node uuids, this function returns its capacity or fails if it does not have any - pub fn get_node_capacity(&self, uuid: &Uuid) -> Result { + pub fn get_node_capacity(&self, uuid: &Uuid) -> Option { match self.node_role(uuid) { Some(NodeRole { capacity: Some(cap), zone: _, tags: _, - }) => Ok(*cap), - _ => Err(Error::Message( - "The Uuid does not correspond to a node present in the \ - cluster or this node does not have a positive capacity." - .into(), - )), + }) => Some(*cap), + _ => None, } } @@ -131,17 +133,6 @@ impl LayoutVersion { // ===================== internal information extractors ====================== - /// Returns the uuids of the non_gateway nodes in self.node_id_vec. - pub(crate) fn nongateway_nodes(&self) -> impl Iterator + '_ { - self.node_id_vec - .iter() - .copied() - .filter(move |uuid| match self.node_role(uuid) { - Some(role) if role.capacity.is_some() => true, - _ => false, - }) - } - /// Given a node uuids, this function returns the label of its zone pub(crate) fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { match self.node_role(uuid) { @@ -152,11 +143,16 @@ impl LayoutVersion { } } + fn expect_get_node_capacity(&self, uuid: &Uuid) -> u64 { + self.get_node_capacity(&uuid) + .expect("non-gateway node with zero capacity") + } + /// Returns the sum of capacities of non gateway nodes in the cluster fn get_total_capacity(&self) -> Result { let mut total_capacity = 0; for uuid in self.nongateway_nodes() { - total_capacity += self.get_node_capacity(&uuid)?; + total_capacity += self.expect_get_node_capacity(&uuid); } Ok(total_capacity) } @@ -257,7 +253,7 @@ impl LayoutVersion { if *usage > 0 { let uuid = self.node_id_vec[n]; let partusage = usage * self.partition_size; - let nodecap = 
self.get_node_capacity(&uuid).unwrap(); + let nodecap = self.expect_get_node_capacity(&uuid); if partusage > nodecap { return Err(format!( "node usage ({}) is bigger than node capacity ({})", @@ -288,6 +284,21 @@ impl LayoutVersion { // ================== updates to layout, internals =================== + pub(crate) fn calculate_next_version( + mut self, + staging: &LayoutStaging, + ) -> Result<(Self, Message), Error> { + self.version += 1; + + self.roles.merge(&staging.roles); + self.roles.retain(|(_, _, v)| v.0.is_some()); + self.parameters = *staging.parameters.get(); + + let msg = self.calculate_partition_assignment()?; + + Ok((self, msg)) + } + /// This function calculates a new partition-to-node assignment. /// The computed assignment respects the node replication factor /// and the zone redundancy parameter It maximizes the capacity of a @@ -297,7 +308,7 @@ impl LayoutVersion { /// data to be moved. /// Staged role changes must be merged with nodes roles before calling this function, /// hence it must only be called from apply_staged_changes() and hence is not public. - pub(crate) fn calculate_partition_assignment(&mut self) -> Result { + fn calculate_partition_assignment(&mut self) -> Result { // We update the node ids, since the node role list might have changed with the // changes in the layout. We retrieve the old_assignment reframed with new ids let old_assignment_opt = self.update_node_id_vec()?; @@ -317,12 +328,12 @@ impl LayoutVersion { // to use them as indices in the flow graphs. 
let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; - let nb_nongateway_nodes = self.nongateway_nodes().count(); - if nb_nongateway_nodes < self.replication_factor { + if self.nongateway_nodes().len() < self.replication_factor { return Err(Error::Message(format!( "The number of nodes with positive \ capacity ({}) is smaller than the replication factor ({}).", - nb_nongateway_nodes, self.replication_factor + self.nongateway_nodes().len(), + self.replication_factor ))); } if id_to_zone.len() < zone_redundancy { @@ -420,12 +431,14 @@ impl LayoutVersion { .map(|(k, _, _)| *k) .collect(); - let mut new_node_id_vec = Vec::::new(); - new_node_id_vec.extend(new_non_gateway_nodes); - new_node_id_vec.extend(new_gateway_nodes); + let old_node_id_vec = std::mem::take(&mut self.node_id_vec); + + self.nongateway_node_count = new_non_gateway_nodes.len(); + self.node_id_vec.clear(); + self.node_id_vec.extend(new_non_gateway_nodes); + self.node_id_vec.extend(new_gateway_nodes); - let old_node_id_vec = self.node_id_vec.clone(); - self.node_id_vec = new_node_id_vec.clone(); + let new_node_id_vec = &self.node_id_vec; // (2) We retrieve the old association // We rewrite the old association with the new indices. 
We only consider partition @@ -464,7 +477,7 @@ impl LayoutVersion { } } - // We write the ring + // We clear the ring assignemnt data self.ring_assignment_data = Vec::::new(); Ok(Some(old_assignment)) @@ -478,8 +491,7 @@ impl LayoutVersion { let mut id_to_zone = Vec::::new(); let mut zone_to_id = HashMap::::new(); - let nongateway_nodes = self.nongateway_nodes().collect::>(); - for uuid in nongateway_nodes.iter() { + for uuid in self.nongateway_nodes().iter() { let r = self.node_role(uuid).unwrap(); if !zone_to_id.contains_key(&r.zone) && r.capacity.is_some() { zone_to_id.insert(r.zone.clone(), id_to_zone.len()); @@ -556,10 +568,8 @@ impl LayoutVersion { exclude_assoc: &HashSet<(usize, usize)>, zone_redundancy: usize, ) -> Result, Error> { - let vertices = LayoutVersion::generate_graph_vertices( - zone_to_id.len(), - self.nongateway_nodes().count(), - ); + let vertices = + LayoutVersion::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); let mut g = Graph::::new(&vertices); let nb_zones = zone_to_id.len(); for p in 0..NB_PARTITIONS { @@ -578,8 +588,8 @@ impl LayoutVersion { )?; } } - for n in 0..self.nongateway_nodes().count() { - let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; + for n in 0..self.nongateway_nodes().len() { + let node_capacity = self.expect_get_node_capacity(&self.node_id_vec[n]); let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; for p in 0..NB_PARTITIONS { @@ -602,7 +612,7 @@ impl LayoutVersion { // previous assignment let mut exclude_edge = HashSet::<(usize, usize)>::new(); if let Some(prev_assign) = prev_assign_opt { - let nb_nodes = self.nongateway_nodes().count(); + let nb_nodes = self.nongateway_nodes().len(); for (p, prev_assign_p) in prev_assign.iter().enumerate() { for n in 0..nb_nodes { exclude_edge.insert((p, n)); @@ -654,7 +664,7 @@ impl LayoutVersion { // We compute the maximal length of a simple path 
in gflow. It is used in the // Bellman-Ford algorithm in optimize_flow_with_cost to set the number // of iterations. - let nb_nodes = self.nongateway_nodes().count(); + let nb_nodes = self.nongateway_nodes().len(); let path_length = 4 * nb_nodes; gflow.optimize_flow_with_cost(&cost, path_length)?; @@ -732,7 +742,7 @@ impl LayoutVersion { } // We define and fill in the following tables - let storing_nodes = self.nongateway_nodes().collect::>(); + let storing_nodes = self.nongateway_nodes(); let mut new_partitions = vec![0; storing_nodes.len()]; let mut stored_partitions = vec![0; storing_nodes.len()]; @@ -804,13 +814,13 @@ impl LayoutVersion { let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; let mut total_cap_z = 0; for n in nodes_of_z.iter() { - total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; + total_cap_z += self.expect_get_node_capacity(&self.node_id_vec[*n]); } let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); for n in nodes_of_z.iter() { let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; - let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; + let total_cap_n = self.expect_get_node_capacity(&self.node_id_vec[*n]); let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or(""))?.tags_string(); table.push(format!( " {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)", -- cgit v1.2.3 From 1aab1f4e688ebc3f3adcb41c817c16c688a3291c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 13:06:16 +0100 Subject: layout: refactoring of all_nodes --- src/garage/admin/mod.rs | 17 +++++++++-------- src/garage/cli/layout.rs | 6 +++--- src/model/helper/bucket.rs | 8 ++++++-- src/rpc/layout/history.rs | 15 +++++++++++++++ src/rpc/layout/version.rs | 17 +++++++---------- src/rpc/system.rs | 2 +- src/table/replication/fullcopy.rs | 6 +++--- 7 files changed, 44 insertions(+), 27 deletions(-) diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index 
e3ba6d35..77918a0f 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -126,8 +126,8 @@ impl AdminRpcHandler { opt_to_send.all_nodes = false; let mut failures = vec![]; - let layout = self.garage.system.cluster_layout().clone(); - for node in layout.current().node_ids().iter() { + let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec(); + for node in all_nodes.iter() { let node = (*node).into(); let resp = self .endpoint @@ -163,9 +163,9 @@ impl AdminRpcHandler { async fn handle_stats(&self, opt: StatsOpt) -> Result { if opt.all_nodes { let mut ret = String::new(); - let layout = self.garage.system.cluster_layout().clone(); + let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec(); - for node in layout.current().node_ids().iter() { + for node in all_nodes.iter() { let mut opt = opt.clone(); opt.all_nodes = false; opt.skip_global = true; @@ -275,6 +275,7 @@ impl AdminRpcHandler { let mut ret = String::new(); // Gather storage node and free space statistics + // TODO: not only layout.current() ??? 
let layout = &self.garage.system.cluster_layout(); let mut node_partition_count = HashMap::::new(); for short_id in layout.current().ring_assignment_data.iter() { @@ -440,8 +441,8 @@ impl AdminRpcHandler { ) -> Result { if all_nodes { let mut ret = vec![]; - let layout = self.garage.system.cluster_layout().clone(); - for node in layout.current().node_ids().iter() { + let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec(); + for node in all_nodes.iter() { let node = (*node).into(); match self .endpoint @@ -488,8 +489,8 @@ impl AdminRpcHandler { ) -> Result { if all_nodes { let mut ret = vec![]; - let layout = self.garage.system.cluster_layout().clone(); - for node in layout.current().node_ids().iter() { + let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec(); + for node in all_nodes.iter() { let node = (*node).into(); match self .endpoint diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 15727448..0f01a37a 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -49,6 +49,7 @@ pub async fn cmd_assign_role( }; let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + let all_nodes = layout.all_nodes().into_owned(); let added_nodes = args .node_ids @@ -58,7 +59,7 @@ pub async fn cmd_assign_role( status .iter() .map(|adv| adv.id) - .chain(layout.current().node_ids().iter().cloned()), + .chain(all_nodes.iter().cloned()), node_id, ) }) @@ -68,8 +69,7 @@ pub async fn cmd_assign_role( roles.merge(&layout.staging.get().roles); for replaced in args.replace.iter() { - let replaced_node = - find_matching_node(layout.current().node_ids().iter().cloned(), replaced)?; + let replaced_node = find_matching_node(all_nodes.iter().cloned(), replaced)?; match roles.get(&replaced_node) { Some(NodeRoleV(Some(_))) => { layout diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index 2a9c0fb1..2cb53424 100644 --- a/src/model/helper/bucket.rs +++ b/src/model/helper/bucket.rs @@ -450,8 +450,12 @@ 
impl<'a> BucketHelper<'a> { #[cfg(feature = "k2v")] { - // TODO: not only current - let node_id_vec = self.0.system.cluster_layout().current().node_ids().to_vec(); + let node_id_vec = self + .0 + .system + .cluster_layout() + .all_nongateway_nodes() + .into_owned(); let k2vindexes = self .0 .k2v diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 877ad3a7..69348873 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -60,6 +60,21 @@ impl LayoutHistory { (self.current().version, self.all_ack(), self.min_stored()) } + pub fn all_nodes(&self) -> Cow<'_, [Uuid]> { + // TODO: cache this + if self.versions.len() == 1 { + self.versions[0].all_nodes().into() + } else { + let set = self + .versions + .iter() + .map(|x| x.all_nodes()) + .flatten() + .collect::>(); + set.into_iter().copied().collect::>().into() + } + } + pub fn all_nongateway_nodes(&self) -> Cow<'_, [Uuid]> { // TODO: cache this if self.versions.len() == 1 { diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index a7f387b6..2cbdcee2 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -38,22 +38,19 @@ impl LayoutVersion { // ===================== accessors ====================== - /// Returns a list of IDs of nodes that currently have - /// a role in the cluster - pub fn node_ids(&self) -> &[Uuid] { + /// Returns a list of IDs of nodes that have a role in this + /// version of the cluster layout, including gateway nodes + pub fn all_nodes(&self) -> &[Uuid] { &self.node_id_vec[..] } - /// Returns the uuids of the non_gateway nodes in self.node_id_vec. 
+ /// Returns a list of IDs of nodes that have a storage capacity + /// assigned in this version of the cluster layout pub fn nongateway_nodes(&self) -> &[Uuid] { &self.node_id_vec[..self.nongateway_node_count] } - pub fn num_nodes(&self) -> usize { - self.node_id_vec.len() - } - - /// Returns the role of a node in the layout + /// Returns the role of a node in the layout, if it has one pub fn node_role(&self, node: &Uuid) -> Option<&NodeRole> { match self.roles.get(node) { Some(NodeRoleV(Some(v))) => Some(v), @@ -61,7 +58,7 @@ impl LayoutVersion { } } - /// Given a node uuids, this function returns its capacity or fails if it does not have any + /// Returns the capacity of a node in the layout, if it has one pub fn get_node_capacity(&self, uuid: &Uuid) -> Option { match self.node_role(uuid) { Some(NodeRole { diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 3418600b..ab3c96b8 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -609,7 +609,7 @@ impl System { while !*stop_signal.borrow() { let not_configured = self.cluster_layout().check().is_err(); let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor; - let expected_n_nodes = self.cluster_layout().current().num_nodes(); + let expected_n_nodes = self.cluster_layout().all_nodes().len(); let bad_peers = self .fullmesh .get_peer_list() diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index 5653a229..beaacc2b 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -35,10 +35,10 @@ impl TableReplication for TableFullReplication { } fn write_nodes(&self, _hash: &Hash) -> Vec { - self.system.cluster_layout().current().node_ids().to_vec() + self.system.cluster_layout().current().all_nodes().to_vec() } fn write_quorum(&self) -> usize { - let nmembers = self.system.cluster_layout().current().node_ids().len(); + let nmembers = self.system.cluster_layout().current().all_nodes().len(); if nmembers > self.max_faults { nmembers 
- self.max_faults } else { @@ -62,7 +62,7 @@ impl TableReplication for TableFullReplication { partition: 0u16, first_hash: [0u8; 32].into(), last_hash: [0xff; 32].into(), - storage_nodes: Vec::from_iter(layout.current().node_ids().to_vec()), + storage_nodes: Vec::from_iter(layout.current().all_nodes().to_vec()), }], } } -- cgit v1.2.3 From 83a11374ca45831a6f54928dfe726fac65493b00 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 13:29:26 +0100 Subject: layout: fixes in schema --- src/rpc/layout/schema.rs | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index db298ee6..79440a47 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -193,7 +193,25 @@ mod v010 { use std::collections::BTreeMap; pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; - /// The layout of the cluster, i.e. the list of roles + /// The history of cluster layouts, with trackers to keep a record + /// of which nodes are up-to-date to current cluster data + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] + pub struct LayoutHistory { + /// The versions currently in use in the cluster + pub versions: Vec, + + /// Update trackers + pub update_trackers: UpdateTrackers, + /// Hash of the update trackers + pub trackers_hash: Hash, + + /// Staged changes for the next version + pub staging: Lww, + /// Hash of the serialized staging_parameters + staging_roles + pub staging_hash: Hash, + } + + /// A version of the layout of the cluster, i.e. 
the list of roles /// which are assigned to each cluster node #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] pub struct LayoutVersion { @@ -228,23 +246,6 @@ mod v010 { pub roles: LwwMap, } - /// The history of cluster layouts - #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] - pub struct LayoutHistory { - /// The versions currently in use in the cluster - pub versions: Vec, - - /// Update trackers - pub update_trackers: UpdateTrackers, - /// Hash of the update trackers - pub trackers_hash: Hash, - - /// Staged changes for the next version - pub staging: Lww, - /// Hash of the serialized staging_parameters + staging_roles - pub staging_hash: Hash, - } - /// The tracker of acknowlegments and data syncs around the cluster #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] pub struct UpdateTrackers { @@ -275,7 +276,7 @@ mod v010 { let role = previous.roles.get(uuid); matches!(role, Some(NodeRoleV(Some(role))) if role.capacity.is_some()) }) - .map(|(i, _)| i) + .map(|(i, _)| i + 1) .max() .unwrap_or(0); @@ -312,8 +313,7 @@ mod v010 { staging: Lww::raw(previous.version, staging), staging_hash: [0u8; 32].into(), }; - ret.staging_hash = ret.calculate_staging_hash(); - ret.trackers_hash = ret.calculate_trackers_hash(); + ret.update_hashes(); ret } } -- cgit v1.2.3 From 866196750fca74c1911ade2a90611f3663e60046 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 13:36:58 +0100 Subject: system: add todo wrt new layout --- src/rpc/system.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/rpc/system.rs b/src/rpc/system.rs index ab3c96b8..86c02e86 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -417,6 +417,9 @@ impl System { } pub fn health(&self) -> ClusterHealth { + // TODO: adapt this function to take into account layout history + // when estimating cluster health, and not just use current layout + let quorum = self.replication_mode.write_quorum(); let replication_factor = 
self.replication_factor; @@ -429,7 +432,6 @@ impl System { let layout = self.cluster_layout(); // acquires a rwlock - // TODO: not only layout.current() let storage_nodes = layout .current() .roles -- cgit v1.2.3 From 3b361d2959e3d577bdae6f8a5ccb0c9d5526b7ea Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 14:28:16 +0100 Subject: layout: prepare for write sets --- src/block/manager.rs | 3 ++- src/block/resync.rs | 2 +- src/model/k2v/rpc.rs | 10 ++++++---- src/rpc/layout/history.rs | 19 ++++++++++++++++--- src/rpc/layout/version.rs | 21 ++++++++++----------- src/rpc/system.rs | 3 +-- src/table/data.rs | 3 ++- src/table/gc.rs | 2 +- src/table/replication/fullcopy.rs | 9 +++++++-- src/table/replication/parameters.rs | 8 +++++--- src/table/replication/sharded.rs | 24 ++++++++---------------- src/table/sync.rs | 2 +- src/table/table.rs | 6 ++++-- 13 files changed, 64 insertions(+), 48 deletions(-) diff --git a/src/block/manager.rs b/src/block/manager.rs index 72b4ea66..2bb9c23d 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -354,7 +354,8 @@ impl BlockManager { /// Send block to nodes that should have it pub async fn rpc_put_block(&self, hash: Hash, data: Bytes) -> Result<(), Error> { - let who = self.replication.write_nodes(&hash); + // TODO: use quorums among latest write set + let who = self.replication.storage_nodes(&hash); let (header, bytes) = DataBlock::from_buffer(data, self.compression_level) .await diff --git a/src/block/resync.rs b/src/block/resync.rs index fedcd6f5..122d0142 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -377,7 +377,7 @@ impl BlockResyncManager { info!("Resync block {:?}: offloading and deleting", hash); let existing_path = existing_path.unwrap(); - let mut who = manager.replication.write_nodes(hash); + let mut who = manager.replication.storage_nodes(hash); if who.len() < manager.replication.write_quorum() { return Err(Error::Message("Not trying to offload block because we don't have a quorum 
of nodes to write to".to_string())); } diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs index 2f548ad7..aa3323d5 100644 --- a/src/model/k2v/rpc.rs +++ b/src/model/k2v/rpc.rs @@ -127,7 +127,7 @@ impl K2VRpcHandler { .item_table .data .replication - .write_nodes(&partition.hash()); + .storage_nodes(&partition.hash()); who.sort(); self.system @@ -168,7 +168,7 @@ impl K2VRpcHandler { .item_table .data .replication - .write_nodes(&partition.hash()); + .storage_nodes(&partition.hash()); who.sort(); call_list.entry(who).or_default().push(InsertedItem { @@ -223,11 +223,12 @@ impl K2VRpcHandler { }, sort_key, }; + // TODO figure this out with write sets, does it still work???? let nodes = self .item_table .data .replication - .write_nodes(&poll_key.partition.hash()); + .read_nodes(&poll_key.partition.hash()); let rpc = self.system.rpc_helper().try_call_many( &self.endpoint, @@ -284,11 +285,12 @@ impl K2VRpcHandler { seen.restrict(&range); // Prepare PollRange RPC to send to the storage nodes responsible for the parititon + // TODO figure this out with write sets, does it still work???? 
let nodes = self .item_table .data .replication - .write_nodes(&range.partition.hash()); + .read_nodes(&range.partition.hash()); let quorum = self.item_table.data.replication.read_quorum(); let msg = K2VRpc::PollRange { range, diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 69348873..dce492c9 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -98,13 +98,26 @@ impl LayoutHistory { .find(|x| x.version == sync_min) .or(self.versions.last()) .unwrap(); - version.nodes_of(position, version.replication_factor) + version + .nodes_of(position, version.replication_factor) + .collect() } - pub fn write_sets_of<'a>(&'a self, position: &'a Hash) -> impl Iterator> + 'a { + pub fn write_sets_of(&self, position: &Hash) -> Vec> { self.versions .iter() - .map(move |x| x.nodes_of(position, x.replication_factor)) + .map(|x| x.nodes_of(position, x.replication_factor).collect()) + .collect() + } + + pub fn storage_nodes_of(&self, position: &Hash) -> Vec { + let mut ret = vec![]; + for version in self.versions.iter() { + ret.extend(version.nodes_of(position, version.replication_factor)); + } + ret.sort(); + ret.dedup(); + ret } // ------------------ update tracking --------------- diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 2cbdcee2..912ee538 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -107,25 +107,24 @@ impl LayoutVersion { } /// Return the n servers in which data for this hash should be replicated - pub fn nodes_of(&self, position: &Hash, n: usize) -> Vec { + pub fn nodes_of(&self, position: &Hash, n: usize) -> impl Iterator + '_ { assert_eq!(n, self.replication_factor); let data = &self.ring_assignment_data; - if data.len() != self.replication_factor * (1 << PARTITION_BITS) { + let partition_nodes = if data.len() == self.replication_factor * (1 << PARTITION_BITS) { + let partition_idx = self.partition_of(position) as usize; + let partition_start = partition_idx * 
self.replication_factor; + let partition_end = (partition_idx + 1) * self.replication_factor; + &data[partition_start..partition_end] + } else { warn!("Ring not yet ready, read/writes will be lost!"); - return vec![]; - } - - let partition_idx = self.partition_of(position) as usize; - let partition_start = partition_idx * self.replication_factor; - let partition_end = (partition_idx + 1) * self.replication_factor; - let partition_nodes = &data[partition_start..partition_end]; + &[] + }; partition_nodes .iter() - .map(|i| self.node_id_vec[*i as usize]) - .collect::>() + .map(move |i| self.node_id_vec[*i as usize]) } // ===================== internal information extractors ====================== diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 86c02e86..31d78bf6 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -449,8 +449,7 @@ impl System { .iter() .map(|(_, h)| { let pn = layout.current().nodes_of(h, replication_factor); - pn.iter() - .filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) + pn.filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) .count() }) .collect::>(); diff --git a/src/table/data.rs b/src/table/data.rs index bbfdf58b..7f6b7847 100644 --- a/src/table/data.rs +++ b/src/table/data.rs @@ -254,7 +254,8 @@ impl TableData { // of the GC algorithm, as in all cases GC is suspended if // any node of the partition is unavailable. 
let pk_hash = Hash::try_from(&tree_key[..32]).unwrap(); - let nodes = self.replication.write_nodes(&pk_hash); + // TODO: this probably breaks when the layout changes + let nodes = self.replication.storage_nodes(&pk_hash); if nodes.first() == Some(&self.system.id) { GcTodoEntry::new(tree_key, new_bytes_hash).save(&self.gc_todo)?; } diff --git a/src/table/gc.rs b/src/table/gc.rs index 2135a358..002cfbf4 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -152,7 +152,7 @@ impl TableGc { let mut partitions = HashMap::new(); for entry in entries { let pkh = Hash::try_from(&entry.key[..32]).unwrap(); - let mut nodes = self.data.replication.write_nodes(&pkh); + let mut nodes = self.data.replication.storage_nodes(&pkh); nodes.retain(|x| *x != self.system.id); nodes.sort(); diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index beaacc2b..cb5471af 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -27,6 +27,11 @@ pub struct TableFullReplication { } impl TableReplication for TableFullReplication { + fn storage_nodes(&self, _hash: &Hash) -> Vec { + let layout = self.system.cluster_layout(); + layout.current().all_nodes().to_vec() + } + fn read_nodes(&self, _hash: &Hash) -> Vec { vec![self.system.id] } @@ -34,8 +39,8 @@ impl TableReplication for TableFullReplication { 1 } - fn write_nodes(&self, _hash: &Hash) -> Vec { - self.system.cluster_layout().current().all_nodes().to_vec() + fn write_sets(&self, hash: &Hash) -> Vec> { + vec![self.storage_nodes(hash)] } fn write_quorum(&self) -> usize { let nmembers = self.system.cluster_layout().current().all_nodes().len(); diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index 2a7d3585..2f842409 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -6,21 +6,23 @@ pub trait TableReplication: Send + Sync + 'static { // See examples in table_sharded.rs and table_fullcopy.rs // To understand 
various replication methods + /// The entire list of all nodes that store a partition + fn storage_nodes(&self, hash: &Hash) -> Vec; + /// Which nodes to send read requests to fn read_nodes(&self, hash: &Hash) -> Vec; /// Responses needed to consider a read succesfull fn read_quorum(&self) -> usize; /// Which nodes to send writes to - fn write_nodes(&self, hash: &Hash) -> Vec; - /// Responses needed to consider a write succesfull + fn write_sets(&self, hash: &Hash) -> Vec>; + /// Responses needed to consider a write succesfull in each set fn write_quorum(&self) -> usize; fn max_write_errors(&self) -> usize; // Accessing partitions, for Merkle tree & sync /// Get partition for data with given hash fn partition_of(&self, hash: &Hash) -> Partition; - /// List of partitions and nodes to sync with in current layout fn sync_partitions(&self) -> SyncPartitions; } diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index f02b1d66..1320a189 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -25,21 +25,19 @@ pub struct TableShardedReplication { } impl TableReplication for TableShardedReplication { + fn storage_nodes(&self, hash: &Hash) -> Vec { + self.system.cluster_layout().storage_nodes_of(hash) + } + fn read_nodes(&self, hash: &Hash) -> Vec { - self.system - .cluster_layout() - .current() - .nodes_of(hash, self.replication_factor) + self.system.cluster_layout().read_nodes_of(hash) } fn read_quorum(&self) -> usize { self.read_quorum } - fn write_nodes(&self, hash: &Hash) -> Vec { - self.system - .cluster_layout() - .current() - .nodes_of(hash, self.replication_factor) + fn write_sets(&self, hash: &Hash) -> Vec> { + self.system.cluster_layout().write_sets_of(hash) } fn write_quorum(&self) -> usize { self.write_quorum @@ -60,13 +58,7 @@ impl TableReplication for TableShardedReplication { .current() .partitions() .map(|(partition, first_hash)| { - let mut storage_nodes = layout - .write_sets_of(&first_hash) - 
.map(|x| x.into_iter()) - .flatten() - .collect::>(); - storage_nodes.sort(); - storage_nodes.dedup(); + let storage_nodes = layout.storage_nodes_of(&first_hash); SyncPartition { partition, first_hash, diff --git a/src/table/sync.rs b/src/table/sync.rs index 8c21db8b..b67cdd79 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -176,7 +176,7 @@ impl TableSyncer { let nodes = self .data .replication - .write_nodes(begin) + .storage_nodes(begin) .into_iter() .collect::>(); if nodes.contains(&self.system.id) { diff --git a/src/table/table.rs b/src/table/table.rs index 997fd7dc..bf08d5a0 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -119,7 +119,8 @@ impl Table { async fn insert_internal(&self, e: &F::E) -> Result<(), Error> { let hash = e.partition_key().hash(); - let who = self.data.replication.write_nodes(&hash); + // TODO: use write sets + let who = self.data.replication.storage_nodes(&hash); let e_enc = Arc::new(ByteBuf::from(e.encode()?)); let rpc = TableRpc::::Update(vec![e_enc]); @@ -171,7 +172,8 @@ impl Table { for entry in entries.into_iter() { let entry = entry.borrow(); let hash = entry.partition_key().hash(); - let who = self.data.replication.write_nodes(&hash); + // TODO: use write sets + let who = self.data.replication.storage_nodes(&hash); let e_enc = Arc::new(ByteBuf::from(entry.encode()?)); for node in who { call_list.entry(node).or_default().push(e_enc.clone()); -- cgit v1.2.3 From 90e1619b1e9f5d81e59da371f04717f0c4fe5afc Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 15:40:46 +0100 Subject: table: take into account multiple write sets in inserts --- src/block/manager.rs | 7 +- src/block/resync.rs | 2 +- src/model/k2v/rpc.rs | 15 ++- src/rpc/rpc_helper.rs | 278 ++++++++++++++++++++++++++++++-------------------- src/table/gc.rs | 4 +- src/table/table.rs | 17 ++- 6 files changed, 189 insertions(+), 134 deletions(-) diff --git a/src/block/manager.rs b/src/block/manager.rs index 2bb9c23d..0ca8bc31 100644 --- 
a/src/block/manager.rs +++ b/src/block/manager.rs @@ -354,8 +354,7 @@ impl BlockManager { /// Send block to nodes that should have it pub async fn rpc_put_block(&self, hash: Hash, data: Bytes) -> Result<(), Error> { - // TODO: use quorums among latest write set - let who = self.replication.storage_nodes(&hash); + let who = self.replication.write_sets(&hash); let (header, bytes) = DataBlock::from_buffer(data, self.compression_level) .await @@ -365,9 +364,9 @@ impl BlockManager { self.system .rpc_helper() - .try_call_many( + .try_write_many_sets( &self.endpoint, - &who[..], + &who, put_block_rpc, RequestStrategy::with_priority(PRIO_NORMAL | PRIO_SECONDARY) .with_quorum(self.replication.write_quorum()), diff --git a/src/block/resync.rs b/src/block/resync.rs index 122d0142..15f210e4 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -434,7 +434,7 @@ impl BlockResyncManager { .rpc_helper() .try_call_many( &manager.endpoint, - &need_nodes[..], + &need_nodes, put_block_message, RequestStrategy::with_priority(PRIO_BACKGROUND) .with_quorum(need_nodes.len()), diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs index aa3323d5..863a068a 100644 --- a/src/model/k2v/rpc.rs +++ b/src/model/k2v/rpc.rs @@ -134,16 +134,14 @@ impl K2VRpcHandler { .rpc_helper() .try_call_many( &self.endpoint, - &who[..], + &who, K2VRpc::InsertItem(InsertedItem { partition, sort_key, causal_context, value, }), - RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(1) - .interrupt_after_quorum(true), + RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(1), ) .await?; @@ -192,9 +190,7 @@ impl K2VRpcHandler { &self.endpoint, &nodes[..], K2VRpc::InsertManyItems(items), - RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(1) - .interrupt_after_quorum(true), + RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(1), ) .await?; Ok::<_, Error>((nodes, resp)) @@ -223,7 +219,7 @@ impl K2VRpcHandler { }, sort_key, }; - // TODO figure this out with write sets, does it still 
work???? + // TODO figure this out with write sets, is it still appropriate??? let nodes = self .item_table .data @@ -232,7 +228,7 @@ impl K2VRpcHandler { let rpc = self.system.rpc_helper().try_call_many( &self.endpoint, - &nodes[..], + &nodes, K2VRpc::PollItem { key: poll_key, causal_context, @@ -240,6 +236,7 @@ impl K2VRpcHandler { }, RequestStrategy::with_priority(PRIO_NORMAL) .with_quorum(self.item_table.data.replication.read_quorum()) + .send_all_at_once(true) .without_timeout(), ); let timeout_duration = diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index ce291068..12d073b6 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -1,4 +1,5 @@ //! Contain structs related to making RPCs +use std::collections::HashMap; use std::sync::{Arc, RwLock}; use std::time::Duration; @@ -35,11 +36,11 @@ const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300); #[derive(Copy, Clone)] pub struct RequestStrategy { /// Min number of response to consider the request successful - pub rs_quorum: Option, - /// Should requests be dropped after enough response are received - pub rs_interrupt_after_quorum: bool, + rs_quorum: Option, + /// Send all requests at once + rs_send_all_at_once: Option, /// Request priority - pub rs_priority: RequestPriority, + rs_priority: RequestPriority, /// Custom timeout for this request rs_timeout: Timeout, } @@ -56,7 +57,7 @@ impl RequestStrategy { pub fn with_priority(prio: RequestPriority) -> Self { RequestStrategy { rs_quorum: None, - rs_interrupt_after_quorum: false, + rs_send_all_at_once: None, rs_priority: prio, rs_timeout: Timeout::Default, } @@ -66,10 +67,9 @@ impl RequestStrategy { self.rs_quorum = Some(quorum); self } - /// Set if requests can be dropped after quorum has been reached - /// In general true for read requests, and false for write - pub fn interrupt_after_quorum(mut self, interrupt: bool) -> Self { - self.rs_interrupt_after_quorum = interrupt; + /// Set quorum to be reached for request + pub fn 
send_all_at_once(mut self, value: bool) -> Self { + self.rs_send_all_at_once = Some(value); self } /// Deactivate timeout for this request @@ -235,31 +235,19 @@ impl RpcHelper { let quorum = strategy.rs_quorum.unwrap_or(to.len()); let tracer = opentelemetry::global::tracer("garage"); - let span_name = if strategy.rs_interrupt_after_quorum { - format!("RPC {} to {} of {}", endpoint.path(), quorum, to.len()) - } else { - format!( - "RPC {} to {} (quorum {})", - endpoint.path(), - to.len(), - quorum - ) - }; + let span_name = format!("Read RPC {} to {} of {}", endpoint.path(), quorum, to.len()); + let mut span = tracer.start(span_name); span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id))); span.set_attribute(KeyValue::new("to", format!("{:?}", to))); span.set_attribute(KeyValue::new("quorum", quorum as i64)); - span.set_attribute(KeyValue::new( - "interrupt_after_quorum", - strategy.rs_interrupt_after_quorum.to_string(), - )); - self.try_call_many_internal(endpoint, to, msg, strategy, quorum) + self.try_call_many_inner(endpoint, to, msg, strategy, quorum) .with_context(Context::current_with_span(span)) .await } - async fn try_call_many_internal( + async fn try_call_many_inner( &self, endpoint: &Arc>, to: &[Uuid], @@ -273,12 +261,20 @@ impl RpcHelper { H: StreamingEndpointHandler + 'static, S: Send + 'static, { - let msg = msg.into_req().map_err(netapp::error::Error::from)?; + // Once quorum is reached, other requests don't matter. + // What we do here is only send the required number of requests + // to reach a quorum, priorizing nodes with the lowest latency. + // When there are errors, we start new requests to compensate. 
+ + // Reorder requests to priorize closeness / low latency + let request_order = self.request_order(to); + let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false); // Build future for each request // They are not started now: they are added below in a FuturesUnordered // object that will take care of polling them (see below) - let requests = to.iter().cloned().map(|to| { + let msg = msg.into_req().map_err(netapp::error::Error::from)?; + let mut requests = request_order.into_iter().map(|to| { let self2 = self.clone(); let msg = msg.clone(); let endpoint2 = endpoint.clone(); @@ -291,93 +287,40 @@ impl RpcHelper { let mut successes = vec![]; let mut errors = vec![]; - if strategy.rs_interrupt_after_quorum { - // Case 1: once quorum is reached, other requests don't matter. - // What we do here is only send the required number of requests - // to reach a quorum, priorizing nodes with the lowest latency. - // When there are errors, we start new requests to compensate. - - // Reorder requests to priorize closeness / low latency - let request_order = self.request_order(to); - let mut ord_requests = vec![(); request_order.len()] - .into_iter() - .map(|_| None) - .collect::>(); - for (to, fut) in requests { - let i = request_order.iter().position(|x| *x == to).unwrap(); - ord_requests[i] = Some((to, fut)); + // resp_stream will contain all of the requests that are currently in flight. + // (for the moment none, they will be added in the loop below) + let mut resp_stream = FuturesUnordered::new(); + + // Do some requests and collect results + while successes.len() < quorum { + // If the current set of requests that are running is not enough to possibly + // reach quorum, start some new requests. 
+ while send_all_at_once || successes.len() + resp_stream.len() < quorum { + if let Some((req_to, fut)) = requests.next() { + let tracer = opentelemetry::global::tracer("garage"); + let span = tracer.start(format!("RPC to {:?}", req_to)); + resp_stream.push(tokio::spawn( + fut.with_context(Context::current_with_span(span)), + )); + } else { + break; + } } - // Make an iterator to take requests in their sorted order - let mut requests = ord_requests.into_iter().map(Option::unwrap); - - // resp_stream will contain all of the requests that are currently in flight. - // (for the moment none, they will be added in the loop below) - let mut resp_stream = FuturesUnordered::new(); - - // Do some requests and collect results - 'request_loop: while successes.len() < quorum { - // If the current set of requests that are running is not enough to possibly - // reach quorum, start some new requests. - while successes.len() + resp_stream.len() < quorum { - if let Some((req_to, fut)) = requests.next() { - let tracer = opentelemetry::global::tracer("garage"); - let span = tracer.start(format!("RPC to {:?}", req_to)); - resp_stream.push(tokio::spawn( - fut.with_context(Context::current_with_span(span)), - )); - } else { - // If we have no request to add, we know that we won't ever - // reach quorum: bail out now. - break 'request_loop; - } - } - assert!(!resp_stream.is_empty()); // because of loop invariants + if successes.len() + resp_stream.len() < quorum { + // We know we won't ever reach quorum + break; + } - // Wait for one request to terminate - match resp_stream.next().await.unwrap().unwrap() { - Ok(msg) => { - successes.push(msg); - } - Err(e) => { - errors.push(e); - } + // Wait for one request to terminate + match resp_stream.next().await.unwrap().unwrap() { + Ok(msg) => { + successes.push(msg); } - } - } else { - // Case 2: all of the requests need to be sent in all cases, - // and need to terminate. 
(this is the case for writes that - // must be spread to n nodes) - // Just start all the requests in parallel and return as soon - // as the quorum is reached. - let mut resp_stream = requests - .map(|(_, fut)| fut) - .collect::>(); - - while let Some(resp) = resp_stream.next().await { - match resp { - Ok(msg) => { - successes.push(msg); - if successes.len() >= quorum { - break; - } - } - Err(e) => { - errors.push(e); - } + Err(e) => { + errors.push(e); } } - - if !resp_stream.is_empty() { - // Continue remaining requests in background. - // Note: these requests can get interrupted on process shutdown, - // we must not count on them being executed for certain. - // For all background things that have to happen with certainty, - // they have to be put in a proper queue that is persisted to disk. - tokio::spawn(async move { - resp_stream.collect::>>().await; - }); - } } if successes.len() >= quorum { @@ -432,4 +375,123 @@ impl RpcHelper { .map(|(_, _, _, to)| to) .collect::>() } + + pub async fn try_write_many_sets( + &self, + endpoint: &Arc>, + to_sets: &[Vec], + msg: N, + strategy: RequestStrategy, + ) -> Result, Error> + where + M: Rpc> + 'static, + N: IntoReq, + H: StreamingEndpointHandler + 'static, + S: Send + 'static, + { + let quorum = strategy + .rs_quorum + .expect("internal error: missing quroum in try_write_many_sets"); + + let tracer = opentelemetry::global::tracer("garage"); + let span_name = format!( + "Write RPC {} (quorum {} in {} sets)", + endpoint.path(), + quorum, + to_sets.len() + ); + + let mut span = tracer.start(span_name); + span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id))); + span.set_attribute(KeyValue::new("to", format!("{:?}", to_sets))); + span.set_attribute(KeyValue::new("quorum", quorum as i64)); + + self.try_write_many_sets_inner(endpoint, to_sets, msg, strategy, quorum) + .with_context(Context::current_with_span(span)) + .await + } + + async fn try_write_many_sets_inner( + &self, + endpoint: &Arc>, + 
to_sets: &[Vec], + msg: N, + strategy: RequestStrategy, + quorum: usize, + ) -> Result, Error> + where + M: Rpc> + 'static, + N: IntoReq, + H: StreamingEndpointHandler + 'static, + S: Send + 'static, + { + let msg = msg.into_req().map_err(netapp::error::Error::from)?; + + let mut peers = HashMap::>::new(); + for (i, set) in to_sets.iter().enumerate() { + for peer in set.iter() { + peers.entry(*peer).or_default().push(i); + } + } + + let requests = peers.iter().map(|(peer, _)| { + let self2 = self.clone(); + let msg = msg.clone(); + let endpoint2 = endpoint.clone(); + let to = *peer; + let tracer = opentelemetry::global::tracer("garage"); + let span = tracer.start(format!("RPC to {:?}", to)); + let fut = async move { (to, self2.call(&endpoint2, to, msg, strategy).await) }; + tokio::spawn(fut.with_context(Context::current_with_span(span))) + }); + let mut resp_stream = requests.collect::>(); + + let mut successes = vec![]; + let mut errors = vec![]; + + let mut set_counters = vec![(0, 0); to_sets.len()]; + + while !resp_stream.is_empty() { + let (node, resp) = resp_stream.next().await.unwrap().unwrap(); + + match resp { + Ok(msg) => { + for set in peers.get(&node).unwrap().iter() { + set_counters[*set].0 += 1; + } + successes.push(msg); + } + Err(e) => { + for set in peers.get(&node).unwrap().iter() { + set_counters[*set].1 += 1; + } + errors.push(e); + } + } + + if set_counters.iter().all(|x| x.0 > quorum) { + // Success + + // Continue all other requets in background + tokio::spawn(async move { + resp_stream.collect::>>().await; + }); + + return Ok(successes); + } + + if set_counters + .iter() + .enumerate() + .any(|(i, x)| x.1 + quorum > to_sets[i].len()) + { + // Too many errors in this set, we know we won't get a quorum + break; + } + } + + // Failure, could not get quorum + let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); + Err(Error::Quorum(quorum, successes.len(), peers.len(), errors)) + } } diff --git a/src/table/gc.rs b/src/table/gc.rs 
index 002cfbf4..ef788749 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -230,7 +230,7 @@ impl TableGc { .rpc_helper() .try_call_many( &self.endpoint, - &nodes[..], + &nodes, GcRpc::Update(updates), RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()), ) @@ -251,7 +251,7 @@ impl TableGc { .rpc_helper() .try_call_many( &self.endpoint, - &nodes[..], + &nodes, GcRpc::DeleteIfEqualHash(deletes), RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()), ) diff --git a/src/table/table.rs b/src/table/table.rs index bf08d5a0..c2efaeaf 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -119,17 +119,16 @@ impl Table { async fn insert_internal(&self, e: &F::E) -> Result<(), Error> { let hash = e.partition_key().hash(); - // TODO: use write sets - let who = self.data.replication.storage_nodes(&hash); + let who = self.data.replication.write_sets(&hash); let e_enc = Arc::new(ByteBuf::from(e.encode()?)); let rpc = TableRpc::::Update(vec![e_enc]); self.system .rpc_helper() - .try_call_many( + .try_write_many_sets( &self.endpoint, - &who[..], + &who, rpc, RequestStrategy::with_priority(PRIO_NORMAL) .with_quorum(self.data.replication.write_quorum()), @@ -243,11 +242,10 @@ impl Table { .rpc_helper() .try_call_many( &self.endpoint, - &who[..], + &who, rpc, RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(self.data.replication.read_quorum()) - .interrupt_after_quorum(true), + .with_quorum(self.data.replication.read_quorum()), ) .await?; @@ -339,11 +337,10 @@ impl Table { .rpc_helper() .try_call_many( &self.endpoint, - &who[..], + &who, rpc, RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(self.data.replication.read_quorum()) - .interrupt_after_quorum(true), + .with_quorum(self.data.replication.read_quorum()), ) .await?; -- cgit v1.2.3 From 7ef2c231208073db5a0a0a8674e2dd2d2ecb2222 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 14 Nov 2023 15:45:01 +0100 Subject: layout: fix test --- src/rpc/layout/test.rs | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rpc/layout/test.rs b/src/rpc/layout/test.rs index 0ce090d2..e9639073 100644 --- a/src/rpc/layout/test.rs +++ b/src/rpc/layout/test.rs @@ -35,7 +35,7 @@ fn check_against_naive(cl: &LayoutVersion) -> Result { } for uuid in cl.nongateway_nodes() { let z = cl.get_node_zone(&uuid)?; - let c = cl.get_node_capacity(&uuid)?; + let c = cl.get_node_capacity(&uuid).unwrap(); zone_token.insert( z.to_string(), zone_token[z] + min(NB_PARTITIONS, (c / over_size) as usize), -- cgit v1.2.3 From b3e729f4b8ec3b06593f8d3b161c76b1263d9f13 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 12:15:58 +0100 Subject: layout history merge: rm invalid versions when valid versions are added --- src/rpc/layout/history.rs | 18 ++++++++++++++++++ src/rpc/layout/version.rs | 20 ++++++++++---------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index dce492c9..2346b14a 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -211,6 +211,24 @@ impl LayoutHistory { changed = changed || c; } + // If there are invalid versions before valid versions, remove them, + // and increment update trackers + if self.versions.len() > 1 && self.current().check().is_ok() { + while self.versions.first().unwrap().check().is_err() { + self.versions.remove(0); + changed = true; + } + if changed { + let min_v = self.versions.first().unwrap().version; + let nodes = self.all_nongateway_nodes().into_owned(); + for node in nodes { + self.update_trackers.ack_map.set_max(node, min_v); + self.update_trackers.sync_map.set_max(node, min_v); + self.update_trackers.sync_ack_map.set_max(node, min_v); + } + } + } + // Merge staged layout changes if self.staging != other.staging { self.staging.merge(&other.staging); diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 912ee538..947fab56 100644 --- a/src/rpc/layout/version.rs +++ 
b/src/rpc/layout/version.rs @@ -174,6 +174,16 @@ impl LayoutVersion { /// (assignment, roles, parameters, partition size) /// returns true if consistent, false if error pub fn check(&self) -> Result<(), String> { + // Check that the assignment data has the correct length + let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor; + if self.ring_assignment_data.len() != expected_assignment_data_len { + return Err(format!( + "ring_assignment_data has incorrect length {} instead of {}", + self.ring_assignment_data.len(), + expected_assignment_data_len + )); + } + // Check that node_id_vec contains the correct list of nodes let mut expected_nodes = self .roles @@ -189,16 +199,6 @@ impl LayoutVersion { return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes)); } - // Check that the assignment data has the correct length - let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor; - if self.ring_assignment_data.len() != expected_assignment_data_len { - return Err(format!( - "ring_assignment_data has incorrect length {} instead of {}", - self.ring_assignment_data.len(), - expected_assignment_data_len - )); - } - // Check that the assigned nodes are correct identifiers // of nodes that are assigned a role // and that role is not the role of a gateway nodes -- cgit v1.2.3 From 46007bf01dd2e5b489d145ca0a5499ffa7257b96 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 12:56:52 +0100 Subject: integration test: print stdout and stderr on subcommand crash --- src/garage/tests/common/ext/process.rs | 44 +++++++++------------------------- src/garage/tests/common/garage.rs | 2 +- 2 files changed, 12 insertions(+), 34 deletions(-) diff --git a/src/garage/tests/common/ext/process.rs b/src/garage/tests/common/ext/process.rs index ba533b6c..8e20bf7c 100644 --- a/src/garage/tests/common/ext/process.rs +++ 
b/src/garage/tests/common/ext/process.rs @@ -14,42 +14,20 @@ impl CommandExt for process::Command { } fn expect_success_status(&mut self, msg: &str) -> process::ExitStatus { - let status = self.status().expect(msg); - status.expect_success(msg); - status + self.expect_success_output(msg).status } fn expect_success_output(&mut self, msg: &str) -> process::Output { let output = self.output().expect(msg); - output.expect_success(msg); - output - } -} - -pub trait OutputExt { - fn expect_success(&self, msg: &str); -} - -impl OutputExt for process::Output { - fn expect_success(&self, msg: &str) { - self.status.expect_success(msg) - } -} - -pub trait ExitStatusExt { - fn expect_success(&self, msg: &str); -} - -impl ExitStatusExt for process::ExitStatus { - fn expect_success(&self, msg: &str) { - if !self.success() { - match self.code() { - Some(code) => panic!( - "Command exited with code {code}: {msg}", - code = code, - msg = msg - ), - None => panic!("Command exited with signal: {msg}", msg = msg), - } + if !output.status.success() { + panic!( + "{}: command {:?} exited with error {:?}\nSTDOUT: {}\nSTDERR: {}", + msg, + self, + output.status.code(), + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); } + output } } diff --git a/src/garage/tests/common/garage.rs b/src/garage/tests/common/garage.rs index d1f0867a..ebc82f37 100644 --- a/src/garage/tests/common/garage.rs +++ b/src/garage/tests/common/garage.rs @@ -96,7 +96,7 @@ api_bind_addr = "127.0.0.1:{admin_port}" .arg("server") .stdout(stdout) .stderr(stderr) - .env("RUST_LOG", "garage=info,garage_api=trace") + .env("RUST_LOG", "garage=debug,garage_api=trace") .spawn() .expect("Could not start garage"); -- cgit v1.2.3 From acd49de9f97bd27409232691262bd5827983388d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 13:07:42 +0100 Subject: rpc: fix write set quorums --- src/api/common_error.rs | 8 ++------ src/model/k2v/rpc.rs | 2 +- src/rpc/rpc_helper.rs | 18 
+++++++++++++++--- src/util/error.rs | 7 ++++--- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/api/common_error.rs b/src/api/common_error.rs index 20f9f266..ecb22fd8 100644 --- a/src/api/common_error.rs +++ b/src/api/common_error.rs @@ -53,9 +53,7 @@ impl CommonError { pub fn http_status_code(&self) -> StatusCode { match self { CommonError::InternalError( - GarageError::Timeout - | GarageError::RemoteError(_) - | GarageError::Quorum(_, _, _, _), + GarageError::Timeout | GarageError::RemoteError(_) | GarageError::Quorum(..), ) => StatusCode::SERVICE_UNAVAILABLE, CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => { StatusCode::INTERNAL_SERVER_ERROR @@ -72,9 +70,7 @@ impl CommonError { match self { CommonError::Forbidden(_) => "AccessDenied", CommonError::InternalError( - GarageError::Timeout - | GarageError::RemoteError(_) - | GarageError::Quorum(_, _, _, _), + GarageError::Timeout | GarageError::RemoteError(_) | GarageError::Quorum(..), ) => "ServiceUnavailable", CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => { "InternalError" diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs index 863a068a..3c759181 100644 --- a/src/model/k2v/rpc.rs +++ b/src/model/k2v/rpc.rs @@ -344,7 +344,7 @@ impl K2VRpcHandler { } if errors.len() > nodes.len() - quorum { let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); - return Err(Error::Quorum(quorum, resps.len(), nodes.len(), errors).into()); + return Err(Error::Quorum(quorum, None, resps.len(), nodes.len(), errors).into()); } // Take all returned items into account to produce the response. 
diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 12d073b6..1bad495b 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -327,7 +327,13 @@ impl RpcHelper { Ok(successes) } else { let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); - Err(Error::Quorum(quorum, successes.len(), to.len(), errors)) + Err(Error::Quorum( + quorum, + None, + successes.len(), + to.len(), + errors, + )) } } @@ -469,7 +475,7 @@ impl RpcHelper { } } - if set_counters.iter().all(|x| x.0 > quorum) { + if set_counters.iter().all(|x| x.0 >= quorum) { // Success // Continue all other requets in background @@ -492,6 +498,12 @@ impl RpcHelper { // Failure, could not get quorum let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); - Err(Error::Quorum(quorum, successes.len(), peers.len(), errors)) + Err(Error::Quorum( + quorum, + Some(to_sets.len()), + successes.len(), + peers.len(), + errors, + )) } } diff --git a/src/util/error.rs b/src/util/error.rs index 3fcee71d..be7cdfdb 100644 --- a/src/util/error.rs +++ b/src/util/error.rs @@ -55,13 +55,14 @@ pub enum Error { Timeout, #[error( - display = "Could not reach quorum of {}. {} of {} request succeeded, others returned errors: {:?}", + display = "Could not reach quorum of {} (sets={:?}). 
{} of {} request succeeded, others returned errors: {:?}", _0, _1, _2, - _3 + _3, + _4 )] - Quorum(usize, usize, usize, Vec), + Quorum(usize, Option, usize, usize, Vec), #[error(display = "Unexpected RPC message: {}", _0)] UnexpectedRpcMessage(String), -- cgit v1.2.3 From 65066c70640371cc318faddfb4c05c96de18e86d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 13:28:30 +0100 Subject: layout: wip cache global mins --- src/rpc/layout/history.rs | 46 +++++++++++++++++++++++++++------------------- src/rpc/layout/manager.rs | 6 +++--- src/rpc/layout/schema.rs | 36 +++++++++++++++++++++++++++--------- 3 files changed, 57 insertions(+), 31 deletions(-) diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 2346b14a..1684918e 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -9,6 +9,7 @@ use garage_util::error::*; use super::schema::*; use super::*; + impl LayoutHistory { pub fn new(replication_factor: usize) -> Self { let version = LayoutVersion::new(replication_factor); @@ -49,7 +50,7 @@ impl LayoutHistory { // ------------------ who stores what now? --------------- pub fn all_ack(&self) -> u64 { - self.calculate_global_min(&self.update_trackers.ack_map) + self.update_trackers.ack_map.current_min } pub fn min_stored(&self) -> u64 { @@ -91,7 +92,7 @@ impl LayoutHistory { } pub fn read_nodes_of(&self, position: &Hash) -> Vec { - let sync_min = self.calculate_global_min(&self.update_trackers.sync_map); + let sync_min = self.update_trackers.sync_map.current_min; let version = self .versions .iter() @@ -122,7 +123,7 @@ impl LayoutHistory { // ------------------ update tracking --------------- - pub(crate) fn update_trackers(&mut self, node_id: Uuid) { + pub(crate) fn update_trackers_of(&mut self, node_id: Uuid) { // Ensure trackers for this node's values are up-to-date // 1. Acknowledge the last layout version in the history @@ -138,6 +139,9 @@ impl LayoutHistory { // 4. 
Cleanup layout versions that are not needed anymore self.cleanup_old_versions(); + // 5. Recalculate global minima + self.update_trackers_min(); + info!("ack_map: {:?}", self.update_trackers.ack_map); info!("sync_map: {:?}", self.update_trackers.sync_map); info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); @@ -146,42 +150,41 @@ impl LayoutHistory { self.update_hashes(); } + fn update_trackers_min(&mut self) { + // TODO: for TableFullReplication, counting gateway nodes might be + // necessary? Think about this more. + let storage_nodes = self.all_nongateway_nodes().into_owned(); + let min_version = self.versions.first().unwrap().version; + self.update_trackers.update_min(&storage_nodes, min_version); + } + pub(crate) fn ack_last(&mut self, node: Uuid) { let last_version = self.current().version; self.update_trackers.ack_map.set_max(node, last_version); + self.update_trackers_min(); } pub(crate) fn sync_first(&mut self, node: Uuid) { let first_version = self.versions.first().as_ref().unwrap().version; self.update_trackers.sync_map.set_max(node, first_version); + self.update_trackers_min(); } pub(crate) fn sync_ack(&mut self, node: Uuid) { - self.update_trackers.sync_ack_map.set_max( - node, - self.calculate_global_min(&self.update_trackers.sync_map), - ); + self.update_trackers + .sync_ack_map + .set_max(node, self.update_trackers.sync_map.current_min); + self.update_trackers_min(); } pub(crate) fn cleanup_old_versions(&mut self) { - let min_sync_ack = self.calculate_global_min(&self.update_trackers.sync_ack_map); + let min_sync_ack = self.update_trackers.sync_ack_map.current_min; while self.versions.first().as_ref().unwrap().version < min_sync_ack { let removed = self.versions.remove(0); info!("Layout history: pruning old version {}", removed.version); } } - pub(crate) fn calculate_global_min(&self, tracker: &UpdateTracker) -> u64 { - // TODO: for TableFullReplication, counting gateway nodes might be - // necessary? Think about this more. 
- let storage_nodes = self.all_nongateway_nodes(); - storage_nodes - .iter() - .map(|x| tracker.0.get(x).copied().unwrap_or(0)) - .min() - .unwrap_or(0) - } - // ================== updates to layout, public interface =================== pub fn merge(&mut self, other: &LayoutHistory) -> bool { @@ -229,6 +232,11 @@ impl LayoutHistory { } } + // Update the current_min value in trackers if anything changed + if changed { + self.update_trackers_min(); + } + // Merge staged layout changes if self.staging != other.staging { self.staging.merge(&other.staging); diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index ce8b6f61..21ec2d8d 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -74,7 +74,7 @@ impl LayoutManager { } }; - cluster_layout.update_trackers(node_id.into()); + cluster_layout.update_trackers_of(node_id.into()); let layout = Arc::new(RwLock::new(cluster_layout)); let change_notify = Arc::new(Notify::new()); @@ -158,7 +158,7 @@ impl LayoutManager { if !prev_layout_check || adv.check().is_ok() { if layout.merge(adv) { - layout.update_trackers(self.node_id); + layout.update_trackers_of(self.node_id); if prev_layout_check && layout.check().is_err() { panic!("Merged two correct layouts and got an incorrect layout."); } @@ -172,7 +172,7 @@ impl LayoutManager { let mut layout = self.layout.write().unwrap(); if layout.update_trackers != *adv { if layout.update_trackers.merge(adv) { - layout.update_trackers(self.node_id); + layout.update_trackers_of(self.node_id); return Some(layout.update_trackers.clone()); } } diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 79440a47..969f5a0b 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -260,7 +260,10 @@ mod v010 { /// The history of cluster layouts #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] - pub struct UpdateTracker(pub BTreeMap); + pub struct UpdateTracker { + pub values: BTreeMap, + pub current_min: u64, + } impl 
garage_util::migrate::Migrate for LayoutHistory { const VERSION_MARKER: &'static [u8] = b"G010lh"; @@ -290,14 +293,15 @@ mod v010 { nongateway_node_count, ring_assignment_data: previous.ring_assignment_data, }; - let update_tracker = UpdateTracker( - version + let update_tracker = UpdateTracker { + values: version .nongateway_nodes() .iter() .copied() .map(|x| (x, version.version)) .collect::>(), - ); + current_min: 0, + }; let staging = LayoutStaging { parameters: previous.staging_parameters, roles: previous.staging_roles, @@ -378,14 +382,14 @@ impl core::str::FromStr for ZoneRedundancy { impl UpdateTracker { fn merge(&mut self, other: &UpdateTracker) -> bool { let mut changed = false; - for (k, v) in other.0.iter() { - if let Some(v_mut) = self.0.get_mut(k) { + for (k, v) in other.values.iter() { + if let Some(v_mut) = self.values.get_mut(k) { if *v > *v_mut { *v_mut = *v; changed = true; } } else { - self.0.insert(*k, *v); + self.values.insert(*k, *v); changed = true; } } @@ -393,18 +397,26 @@ impl UpdateTracker { } pub(crate) fn set_max(&mut self, peer: Uuid, value: u64) -> bool { - match self.0.get_mut(&peer) { + match self.values.get_mut(&peer) { Some(e) if *e < value => { *e = value; true } None => { - self.0.insert(peer, value); + self.values.insert(peer, value); true } _ => false, } } + + fn update_min(&mut self, storage_nodes: &[Uuid], min_version: u64) { + self.current_min = storage_nodes + .iter() + .map(|x| self.values.get(x).copied().unwrap_or(min_version)) + .min() + .unwrap_or(min_version) + } } impl UpdateTrackers { @@ -414,4 +426,10 @@ impl UpdateTrackers { let c3 = self.sync_ack_map.merge(&other.sync_ack_map); c1 || c2 || c3 } + + pub(crate) fn update_min(&mut self, storage_nodes: &[Uuid], min_version: u64) { + self.ack_map.update_min(&storage_nodes, min_version); + self.sync_map.update_min(&storage_nodes, min_version); + self.sync_ack_map.update_min(&storage_nodes, min_version); + } } -- cgit v1.2.3 From 393c4d4515e0cdadadc8de8ae2df12e4371cff88 
Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 14:20:50 +0100 Subject: layout: add helper for cached/external values to centralize recomputation --- src/api/admin/cluster.rs | 1 - src/api/k2v/index.rs | 2 +- src/garage/cli/layout.rs | 3 +- src/model/helper/bucket.rs | 2 +- src/model/index_counter.rs | 4 +- src/rpc/layout/history.rs | 311 +++++++++++++++++++++++++++------------------ src/rpc/layout/manager.rs | 22 ++-- src/rpc/layout/schema.rs | 48 +++---- src/rpc/rpc_helper.rs | 6 +- src/rpc/system.rs | 4 +- 10 files changed, 222 insertions(+), 181 deletions(-) diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index d912b58f..593bd778 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -240,7 +240,6 @@ pub async fn handle_update_cluster_layout( .merge(&roles.update_mutator(node, layout::NodeRoleV(new_role))); } - layout.update_hashes(); garage .system .layout_manager diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs index c189232a..e8cd1fba 100644 --- a/src/api/k2v/index.rs +++ b/src/api/k2v/index.rs @@ -29,7 +29,7 @@ pub async fn handle_read_index( .system .cluster_layout() .all_nongateway_nodes() - .into_owned(); + .to_vec(); let (partition_keys, more, next_start) = read_range( &garage.k2v.counter_table.table, diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 0f01a37a..51774314 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -49,7 +49,7 @@ pub async fn cmd_assign_role( }; let mut layout = fetch_layout(rpc_cli, rpc_host).await?; - let all_nodes = layout.all_nodes().into_owned(); + let all_nodes = layout.get_all_nodes(); let added_nodes = args .node_ids @@ -331,7 +331,6 @@ pub async fn send_layout( rpc_host: NodeID, mut layout: LayoutHistory, ) -> Result<(), Error> { - layout.update_hashes(); rpc_cli .call( &rpc_host, diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index 2cb53424..efa3e27b 100644 --- a/src/model/helper/bucket.rs +++ 
b/src/model/helper/bucket.rs @@ -455,7 +455,7 @@ impl<'a> BucketHelper<'a> { .system .cluster_layout() .all_nongateway_nodes() - .into_owned(); + .to_vec(); let k2vindexes = self .0 .k2v diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs index 2d968733..e8702bf1 100644 --- a/src/model/index_counter.rs +++ b/src/model/index_counter.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use garage_db as db; -use garage_rpc::layout::LayoutHistory; +use garage_rpc::layout::LayoutHelper; use garage_rpc::system::System; use garage_util::background::BackgroundRunner; use garage_util::data::*; @@ -83,7 +83,7 @@ impl Entry for CounterEntry { } impl CounterEntry { - pub fn filtered_values(&self, layout: &LayoutHistory) -> HashMap { + pub fn filtered_values(&self, layout: &LayoutHelper) -> HashMap { let nodes = layout.all_nongateway_nodes(); self.filtered_values_with_nodes(&nodes) } diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 1684918e..b6f0e495 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -1,5 +1,5 @@ -use std::borrow::Cow; use std::collections::HashSet; +use std::ops::Deref; use garage_util::crdt::{Crdt, Lww, LwwMap}; use garage_util::data::*; @@ -9,95 +9,106 @@ use garage_util::error::*; use super::schema::*; use super::*; +pub struct LayoutHelper { + layout: Option, -impl LayoutHistory { - pub fn new(replication_factor: usize) -> Self { - let version = LayoutVersion::new(replication_factor); + // cached values + ack_map_min: u64, + sync_map_min: u64, - let staging = LayoutStaging { - parameters: Lww::::new(version.parameters), - roles: LwwMap::new(), - }; + all_nodes: Vec, + all_nongateway_nodes: Vec, - let mut ret = LayoutHistory { - versions: vec![version], - update_trackers: Default::default(), - trackers_hash: [0u8; 32].into(), - staging: Lww::raw(0, staging), - staging_hash: [0u8; 32].into(), - }; - ret.update_hashes(); - ret - } + trackers_hash: Hash, + staging_hash: Hash, +} - pub fn 
current(&self) -> &LayoutVersion { - self.versions.last().as_ref().unwrap() +impl Deref for LayoutHelper { + type Target = LayoutHistory; + fn deref(&self) -> &LayoutHistory { + self.layout() } +} - pub fn update_hashes(&mut self) { - self.trackers_hash = self.calculate_trackers_hash(); - self.staging_hash = self.calculate_staging_hash(); +impl LayoutHelper { + pub fn new(mut layout: LayoutHistory) -> Self { + layout.cleanup_old_versions(); + + let all_nongateway_nodes = layout.get_all_nongateway_nodes(); + layout.clamp_update_trackers(&all_nongateway_nodes); + + let min_version = layout.min_stored(); + let ack_map_min = layout + .update_trackers + .ack_map + .min(&all_nongateway_nodes, min_version); + let sync_map_min = layout + .update_trackers + .sync_map + .min(&all_nongateway_nodes, min_version); + + let all_nodes = layout.get_all_nodes(); + let trackers_hash = layout.calculate_trackers_hash(); + let staging_hash = layout.calculate_staging_hash(); + + LayoutHelper { + layout: Some(layout), + ack_map_min, + sync_map_min, + all_nodes, + all_nongateway_nodes, + trackers_hash, + staging_hash, + } } - pub(crate) fn calculate_trackers_hash(&self) -> Hash { - blake2sum(&nonversioned_encode(&self.update_trackers).unwrap()[..]) - } + // ------------------ single updating function -------------- - pub(crate) fn calculate_staging_hash(&self) -> Hash { - blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) + fn layout(&self) -> &LayoutHistory { + self.layout.as_ref().unwrap() } - // ------------------ who stores what now? 
--------------- - - pub fn all_ack(&self) -> u64 { - self.update_trackers.ack_map.current_min + pub(crate) fn update(&mut self, f: F) -> bool + where + F: FnOnce(&mut LayoutHistory) -> bool, + { + let changed = f(&mut self.layout.as_mut().unwrap()); + if changed { + *self = Self::new(self.layout.take().unwrap()); + } + changed } - pub fn min_stored(&self) -> u64 { - self.versions.first().as_ref().unwrap().version + // ------------------ read helpers --------------- + + pub fn all_nodes(&self) -> &[Uuid] { + &self.all_nodes } - pub fn sync_versions(&self) -> (u64, u64, u64) { - (self.current().version, self.all_ack(), self.min_stored()) + pub fn all_nongateway_nodes(&self) -> &[Uuid] { + &self.all_nongateway_nodes } - pub fn all_nodes(&self) -> Cow<'_, [Uuid]> { - // TODO: cache this - if self.versions.len() == 1 { - self.versions[0].all_nodes().into() - } else { - let set = self - .versions - .iter() - .map(|x| x.all_nodes()) - .flatten() - .collect::>(); - set.into_iter().copied().collect::>().into() - } + pub fn all_ack(&self) -> u64 { + self.ack_map_min } - pub fn all_nongateway_nodes(&self) -> Cow<'_, [Uuid]> { - // TODO: cache this - if self.versions.len() == 1 { - self.versions[0].nongateway_nodes().into() - } else { - let set = self - .versions - .iter() - .map(|x| x.nongateway_nodes()) - .flatten() - .collect::>(); - set.into_iter().copied().collect::>().into() - } + pub fn sync_versions(&self) -> (u64, u64, u64) { + ( + self.layout().current().version, + self.all_ack(), + self.layout().min_stored(), + ) } pub fn read_nodes_of(&self, position: &Hash) -> Vec { - let sync_min = self.update_trackers.sync_map.current_min; + let sync_min = self.sync_map_min; let version = self + .layout() .versions .iter() .find(|x| x.version == sync_min) - .or(self.versions.last()) + .or(self.layout().versions.last()) .unwrap(); version .nodes_of(position, version.replication_factor) @@ -105,7 +116,8 @@ impl LayoutHistory { } pub fn write_sets_of(&self, position: &Hash) -> Vec> 
{ - self.versions + self.layout() + .versions .iter() .map(|x| x.nodes_of(position, x.replication_factor).collect()) .collect() @@ -113,7 +125,7 @@ impl LayoutHistory { pub fn storage_nodes_of(&self, position: &Hash) -> Vec { let mut ret = vec![]; - for version in self.versions.iter() { + for version in self.layout().versions.iter() { ret.extend(version.nodes_of(position, version.replication_factor)); } ret.sort(); @@ -121,7 +133,35 @@ impl LayoutHistory { ret } - // ------------------ update tracking --------------- + pub fn trackers_hash(&self) -> Hash { + self.trackers_hash + } + + pub fn staging_hash(&self) -> Hash { + self.staging_hash + } + + // ------------------ helpers for update tracking --------------- + + pub(crate) fn sync_first(&mut self, node: Uuid) { + let first_version = self.versions.first().as_ref().unwrap().version; + self.update(|layout| layout.update_trackers.sync_map.set_max(node, first_version)); + } + + pub(crate) fn sync_ack(&mut self, node: Uuid) { + let sync_map_min = self.sync_map_min; + self.update(|layout| { + layout + .update_trackers + .sync_ack_map + .set_max(node, sync_map_min) + }); + } + + pub(crate) fn ack_last(&mut self, node: Uuid) { + let last_version = self.current().version; + self.update(|layout| layout.update_trackers.ack_map.set_max(node, last_version)); + } pub(crate) fn update_trackers_of(&mut self, node_id: Uuid) { // Ensure trackers for this node's values are up-to-date @@ -136,55 +176,104 @@ impl LayoutHistory { // 3. Acknowledge everyone has synced up to min(self.sync_map) self.sync_ack(node_id); - // 4. Cleanup layout versions that are not needed anymore - self.cleanup_old_versions(); - - // 5. 
Recalculate global minima - self.update_trackers_min(); - info!("ack_map: {:?}", self.update_trackers.ack_map); info!("sync_map: {:?}", self.update_trackers.sync_map); info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); + } +} - // Finally, update hashes - self.update_hashes(); +// ---- + +impl LayoutHistory { + pub fn new(replication_factor: usize) -> Self { + let version = LayoutVersion::new(replication_factor); + + let staging = LayoutStaging { + parameters: Lww::::new(version.parameters), + roles: LwwMap::new(), + }; + + LayoutHistory { + versions: vec![version], + update_trackers: Default::default(), + staging: Lww::raw(0, staging), + } } - fn update_trackers_min(&mut self) { - // TODO: for TableFullReplication, counting gateway nodes might be - // necessary? Think about this more. - let storage_nodes = self.all_nongateway_nodes().into_owned(); - let min_version = self.versions.first().unwrap().version; - self.update_trackers.update_min(&storage_nodes, min_version); + // ------------------ who stores what now? 
--------------- + + pub fn current(&self) -> &LayoutVersion { + self.versions.last().as_ref().unwrap() } - pub(crate) fn ack_last(&mut self, node: Uuid) { - let last_version = self.current().version; - self.update_trackers.ack_map.set_max(node, last_version); - self.update_trackers_min(); + pub fn min_stored(&self) -> u64 { + self.versions.first().as_ref().unwrap().version } - pub(crate) fn sync_first(&mut self, node: Uuid) { - let first_version = self.versions.first().as_ref().unwrap().version; - self.update_trackers.sync_map.set_max(node, first_version); - self.update_trackers_min(); + pub fn get_all_nodes(&self) -> Vec { + if self.versions.len() == 1 { + self.versions[0].all_nodes().to_vec() + } else { + let set = self + .versions + .iter() + .map(|x| x.all_nodes()) + .flatten() + .collect::>(); + set.into_iter().copied().collect::>() + } } - pub(crate) fn sync_ack(&mut self, node: Uuid) { - self.update_trackers - .sync_ack_map - .set_max(node, self.update_trackers.sync_map.current_min); - self.update_trackers_min(); + fn get_all_nongateway_nodes(&self) -> Vec { + if self.versions.len() == 1 { + self.versions[0].nongateway_nodes().to_vec() + } else { + let set = self + .versions + .iter() + .map(|x| x.nongateway_nodes()) + .flatten() + .collect::>(); + set.into_iter().copied().collect::>() + } } - pub(crate) fn cleanup_old_versions(&mut self) { - let min_sync_ack = self.update_trackers.sync_ack_map.current_min; - while self.versions.first().as_ref().unwrap().version < min_sync_ack { - let removed = self.versions.remove(0); - info!("Layout history: pruning old version {}", removed.version); + // ---- housekeeping (all invoked by LayoutHelper) ---- + + fn cleanup_old_versions(&mut self) { + loop { + let all_nongateway_nodes = self.get_all_nongateway_nodes(); + let min_version = self.min_stored(); + let sync_ack_map_min = self + .update_trackers + .sync_ack_map + .min(&all_nongateway_nodes, min_version); + if self.min_stored() < sync_ack_map_min { + let removed = 
self.versions.remove(0); + info!("Layout history: pruning old version {}", removed.version); + } else { + break; + } } } + fn clamp_update_trackers(&mut self, nodes: &[Uuid]) { + let min_v = self.min_stored(); + for node in nodes { + self.update_trackers.ack_map.set_max(*node, min_v); + self.update_trackers.sync_map.set_max(*node, min_v); + self.update_trackers.sync_ack_map.set_max(*node, min_v); + } + } + + fn calculate_trackers_hash(&self) -> Hash { + blake2sum(&nonversioned_encode(&self.update_trackers).unwrap()[..]) + } + + fn calculate_staging_hash(&self) -> Hash { + blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) + } + // ================== updates to layout, public interface =================== pub fn merge(&mut self, other: &LayoutHistory) -> bool { @@ -221,20 +310,6 @@ impl LayoutHistory { self.versions.remove(0); changed = true; } - if changed { - let min_v = self.versions.first().unwrap().version; - let nodes = self.all_nongateway_nodes().into_owned(); - for node in nodes { - self.update_trackers.ack_map.set_max(node, min_v); - self.update_trackers.sync_map.set_max(node, min_v); - self.update_trackers.sync_ack_map.set_max(node, min_v); - } - } - } - - // Update the current_min value in trackers if anything changed - if changed { - self.update_trackers_min(); } // Merge staged layout changes @@ -280,7 +355,6 @@ To know the correct value of the new layout version, invoke `garage layout show` parameters: self.staging.get().parameters.clone(), roles: LwwMap::new(), }); - self.update_hashes(); Ok((self, msg)) } @@ -290,20 +364,11 @@ To know the correct value of the new layout version, invoke `garage layout show` parameters: Lww::new(self.current().parameters.clone()), roles: LwwMap::new(), }); - self.update_hashes(); Ok(self) } pub fn check(&self) -> Result<(), String> { - // Check that the hash of the staging data is correct - if self.trackers_hash != self.calculate_trackers_hash() { - return Err("trackers_hash is incorrect".into()); - } - if 
self.staging_hash != self.calculate_staging_hash() { - return Err("staging_hash is incorrect".into()); - } - for version in self.versions.iter() { version.check()?; } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 21ec2d8d..e270ad21 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -24,7 +24,7 @@ pub struct LayoutManager { replication_factor: usize, persist_cluster_layout: Persister, - layout: Arc>, + layout: Arc>, pub(crate) change_notify: Arc, table_sync_version: Mutex>, @@ -54,7 +54,7 @@ impl LayoutManager { let persist_cluster_layout: Persister = Persister::new(&config.metadata_dir, "cluster_layout"); - let mut cluster_layout = match persist_cluster_layout.load() { + let cluster_layout = match persist_cluster_layout.load() { Ok(x) => { if x.current().replication_factor != replication_factor { return Err(Error::Message(format!( @@ -74,6 +74,7 @@ impl LayoutManager { } }; + let mut cluster_layout = LayoutHelper::new(cluster_layout); cluster_layout.update_trackers_of(node_id.into()); let layout = Arc::new(RwLock::new(cluster_layout)); @@ -100,7 +101,7 @@ impl LayoutManager { // ---- PUBLIC INTERFACE ---- - pub fn layout(&self) -> RwLockReadGuard<'_, LayoutHistory> { + pub fn layout(&self) -> RwLockReadGuard<'_, LayoutHelper> { self.layout.read().unwrap() } @@ -108,8 +109,8 @@ impl LayoutManager { let layout = self.layout(); LayoutStatus { cluster_layout_version: layout.current().version, - cluster_layout_trackers_hash: layout.trackers_hash, - cluster_layout_staging_hash: layout.staging_hash, + cluster_layout_trackers_hash: layout.trackers_hash(), + cluster_layout_staging_hash: layout.staging_hash(), } } @@ -137,13 +138,8 @@ impl LayoutManager { drop(table_sync_version); let mut layout = self.layout.write().unwrap(); - if layout - .update_trackers - .sync_map - .set_max(self.node_id, sync_until) - { + if layout.update(|l| l.update_trackers.sync_map.set_max(self.node_id, sync_until)) { debug!("sync_until updated to 
{}", sync_until); - layout.update_hashes(); self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( layout.update_trackers.clone(), )); @@ -157,7 +153,7 @@ impl LayoutManager { let prev_layout_check = layout.check().is_ok(); if !prev_layout_check || adv.check().is_ok() { - if layout.merge(adv) { + if layout.update(|l| l.merge(adv)) { layout.update_trackers_of(self.node_id); if prev_layout_check && layout.check().is_err() { panic!("Merged two correct layouts and got an incorrect layout."); @@ -171,7 +167,7 @@ impl LayoutManager { fn merge_layout_trackers(&self, adv: &UpdateTrackers) -> Option { let mut layout = self.layout.write().unwrap(); if layout.update_trackers != *adv { - if layout.update_trackers.merge(adv) { + if layout.update(|l| l.update_trackers.merge(adv)) { layout.update_trackers_of(self.node_id); return Some(layout.update_trackers.clone()); } diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 969f5a0b..00a2c017 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -188,7 +188,7 @@ mod v010 { use super::v09; use crate::layout::CompactNodeType; use garage_util::crdt::{Lww, LwwMap}; - use garage_util::data::{Hash, Uuid}; + use garage_util::data::Uuid; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; @@ -202,13 +202,9 @@ mod v010 { /// Update trackers pub update_trackers: UpdateTrackers, - /// Hash of the update trackers - pub trackers_hash: Hash, /// Staged changes for the next version pub staging: Lww, - /// Hash of the serialized staging_parameters + staging_roles - pub staging_hash: Hash, } /// A version of the layout of the cluster, i.e. 
the list of roles @@ -260,10 +256,7 @@ mod v010 { /// The history of cluster layouts #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] - pub struct UpdateTracker { - pub values: BTreeMap, - pub current_min: u64, - } + pub struct UpdateTracker(pub BTreeMap); impl garage_util::migrate::Migrate for LayoutHistory { const VERSION_MARKER: &'static [u8] = b"G010lh"; @@ -293,32 +286,27 @@ mod v010 { nongateway_node_count, ring_assignment_data: previous.ring_assignment_data, }; - let update_tracker = UpdateTracker { - values: version + let update_tracker = UpdateTracker( + version .nongateway_nodes() .iter() .copied() .map(|x| (x, version.version)) .collect::>(), - current_min: 0, - }; + ); let staging = LayoutStaging { parameters: previous.staging_parameters, roles: previous.staging_roles, }; - let mut ret = Self { + Self { versions: vec![version], update_trackers: UpdateTrackers { ack_map: update_tracker.clone(), sync_map: update_tracker.clone(), sync_ack_map: update_tracker.clone(), }, - trackers_hash: [0u8; 32].into(), staging: Lww::raw(previous.version, staging), - staging_hash: [0u8; 32].into(), - }; - ret.update_hashes(); - ret + } } } } @@ -382,14 +370,14 @@ impl core::str::FromStr for ZoneRedundancy { impl UpdateTracker { fn merge(&mut self, other: &UpdateTracker) -> bool { let mut changed = false; - for (k, v) in other.values.iter() { - if let Some(v_mut) = self.values.get_mut(k) { + for (k, v) in other.0.iter() { + if let Some(v_mut) = self.0.get_mut(k) { if *v > *v_mut { *v_mut = *v; changed = true; } } else { - self.values.insert(*k, *v); + self.0.insert(*k, *v); changed = true; } } @@ -397,23 +385,23 @@ impl UpdateTracker { } pub(crate) fn set_max(&mut self, peer: Uuid, value: u64) -> bool { - match self.values.get_mut(&peer) { + match self.0.get_mut(&peer) { Some(e) if *e < value => { *e = value; true } None => { - self.values.insert(peer, value); + self.0.insert(peer, value); true } _ => false, } } - fn update_min(&mut self, storage_nodes: 
&[Uuid], min_version: u64) { - self.current_min = storage_nodes + pub(crate) fn min(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 { + storage_nodes .iter() - .map(|x| self.values.get(x).copied().unwrap_or(min_version)) + .map(|x| self.0.get(x).copied().unwrap_or(min_version)) .min() .unwrap_or(min_version) } @@ -426,10 +414,4 @@ impl UpdateTrackers { let c3 = self.sync_ack_map.merge(&other.sync_ack_map); c1 || c2 || c3 } - - pub(crate) fn update_min(&mut self, storage_nodes: &[Uuid], min_version: u64) { - self.ack_map.update_min(&storage_nodes, min_version); - self.sync_map.update_min(&storage_nodes, min_version); - self.sync_ack_map.update_min(&storage_nodes, min_version); - } } diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 1bad495b..e269ddaa 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -26,7 +26,7 @@ use garage_util::data::*; use garage_util::error::Error; use garage_util::metrics::RecordDuration; -use crate::layout::LayoutHistory; +use crate::layout::LayoutHelper; use crate::metrics::RpcMetrics; // Default RPC timeout = 5 minutes @@ -90,7 +90,7 @@ pub struct RpcHelper(Arc); struct RpcHelperInner { our_node_id: Uuid, fullmesh: Arc, - layout: Arc>, + layout: Arc>, metrics: RpcMetrics, rpc_timeout: Duration, } @@ -99,7 +99,7 @@ impl RpcHelper { pub(crate) fn new( our_node_id: Uuid, fullmesh: Arc, - layout: Arc>, + layout: Arc>, rpc_timeout: Option, ) -> Self { let metrics = RpcMetrics::new(); diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 31d78bf6..d74dc2a1 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -34,7 +34,7 @@ use crate::consul::ConsulDiscovery; #[cfg(feature = "kubernetes-discovery")] use crate::kubernetes::*; use crate::layout::manager::{LayoutManager, LayoutStatus}; -use crate::layout::{self, LayoutHistory, NodeRoleV}; +use crate::layout::{self, LayoutHelper, LayoutHistory, NodeRoleV}; use crate::replication_mode::*; use crate::rpc_helper::*; @@ -350,7 +350,7 @@ impl System { // ---- 
Public utilities / accessors ---- - pub fn cluster_layout(&self) -> RwLockReadGuard<'_, LayoutHistory> { + pub fn cluster_layout(&self) -> RwLockReadGuard<'_, LayoutHelper> { self.layout_manager.layout() } -- cgit v1.2.3 From 33c8a489b0a9c0e869282bfc19c548f5a3e02e8c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 15:40:44 +0100 Subject: layou: implement ack locking --- src/block/manager.rs | 2 +- src/garage/cli/layout.rs | 2 +- src/rpc/layout/history.rs | 98 +++++++++++++++++++++++++++---------- src/rpc/layout/manager.rs | 74 +++++++++++++++++++++++++--- src/rpc/layout/mod.rs | 1 + src/table/replication/fullcopy.rs | 4 +- src/table/replication/parameters.rs | 4 +- src/table/replication/sharded.rs | 6 ++- src/table/sync.rs | 9 +--- src/table/table.rs | 2 +- 10 files changed, 156 insertions(+), 46 deletions(-) diff --git a/src/block/manager.rs b/src/block/manager.rs index 0ca8bc31..be2e4951 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -366,7 +366,7 @@ impl BlockManager { .rpc_helper() .try_write_many_sets( &self.endpoint, - &who, + who.as_ref(), put_block_rpc, RequestStrategy::with_priority(PRIO_NORMAL | PRIO_SECONDARY) .with_quorum(self.replication.write_quorum()), diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 51774314..0be8278f 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -329,7 +329,7 @@ pub async fn fetch_layout( pub async fn send_layout( rpc_cli: &Endpoint, rpc_host: NodeID, - mut layout: LayoutHistory, + layout: LayoutHistory, ) -> Result<(), Error> { rpc_cli .call( diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index b6f0e495..dd38efa7 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -1,5 +1,7 @@ +use std::collections::HashMap; use std::collections::HashSet; use std::ops::Deref; +use std::sync::atomic::{AtomicUsize, Ordering}; use garage_util::crdt::{Crdt, Lww, LwwMap}; use garage_util::data::*; @@ -21,6 +23,11 @@ pub struct 
LayoutHelper { trackers_hash: Hash, staging_hash: Hash, + + // ack lock: counts in-progress write operations for each + // layout version ; we don't increase the ack update tracker + // while this lock is nonzero + pub(crate) ack_lock: HashMap, } impl Deref for LayoutHelper { @@ -31,7 +38,7 @@ impl Deref for LayoutHelper { } impl LayoutHelper { - pub fn new(mut layout: LayoutHistory) -> Self { + pub fn new(mut layout: LayoutHistory, mut ack_lock: HashMap) -> Self { layout.cleanup_old_versions(); let all_nongateway_nodes = layout.get_all_nongateway_nodes(); @@ -51,6 +58,11 @@ impl LayoutHelper { let trackers_hash = layout.calculate_trackers_hash(); let staging_hash = layout.calculate_staging_hash(); + ack_lock.retain(|_, cnt| *cnt.get_mut() > 0); + ack_lock + .entry(layout.current().version) + .or_insert(AtomicUsize::new(0)); + LayoutHelper { layout: Some(layout), ack_map_min, @@ -59,6 +71,7 @@ impl LayoutHelper { all_nongateway_nodes, trackers_hash, staging_hash, + ack_lock, } } @@ -74,7 +87,10 @@ impl LayoutHelper { { let changed = f(&mut self.layout.as_mut().unwrap()); if changed { - *self = Self::new(self.layout.take().unwrap()); + *self = Self::new( + self.layout.take().unwrap(), + std::mem::take(&mut self.ack_lock), + ); } changed } @@ -115,7 +131,7 @@ impl LayoutHelper { .collect() } - pub fn write_sets_of(&self, position: &Hash) -> Vec> { + pub(crate) fn write_sets_of(&self, position: &Hash) -> Vec> { self.layout() .versions .iter() @@ -143,42 +159,72 @@ impl LayoutHelper { // ------------------ helpers for update tracking --------------- - pub(crate) fn sync_first(&mut self, node: Uuid) { + pub(crate) fn update_trackers(&mut self, local_node_id: Uuid) { + // Ensure trackers for this node's values are up-to-date + + // 1. Acknowledge the last layout version which is not currently + // locked by an in-progress write operation + self.ack_max_free(local_node_id); + + // 2. 
Assume the data on this node is sync'ed up at least to + // the first layout version in the history + self.sync_first(local_node_id); + + // 3. Acknowledge everyone has synced up to min(self.sync_map) + self.sync_ack(local_node_id); + + info!("ack_map: {:?}", self.update_trackers.ack_map); + info!("sync_map: {:?}", self.update_trackers.sync_map); + info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); + } + + fn sync_first(&mut self, local_node_id: Uuid) { let first_version = self.versions.first().as_ref().unwrap().version; - self.update(|layout| layout.update_trackers.sync_map.set_max(node, first_version)); + self.update(|layout| { + layout + .update_trackers + .sync_map + .set_max(local_node_id, first_version) + }); } - pub(crate) fn sync_ack(&mut self, node: Uuid) { + fn sync_ack(&mut self, local_node_id: Uuid) { let sync_map_min = self.sync_map_min; self.update(|layout| { layout .update_trackers .sync_ack_map - .set_max(node, sync_map_min) + .set_max(local_node_id, sync_map_min) }); } - pub(crate) fn ack_last(&mut self, node: Uuid) { - let last_version = self.current().version; - self.update(|layout| layout.update_trackers.ack_map.set_max(node, last_version)); + pub(crate) fn ack_max_free(&mut self, local_node_id: Uuid) -> bool { + let max_ack = self.max_free_ack(); + let changed = self.update(|layout| { + layout + .update_trackers + .ack_map + .set_max(local_node_id, max_ack) + }); + if changed { + info!("ack_until updated to {}", max_ack); + } + changed } - pub(crate) fn update_trackers_of(&mut self, node_id: Uuid) { - // Ensure trackers for this node's values are up-to-date - - // 1. Acknowledge the last layout version in the history - self.ack_last(node_id); - - // 2. Assume the data on this node is sync'ed up at least to - // the first layout version in the history - self.sync_first(node_id); - - // 3. 
Acknowledge everyone has synced up to min(self.sync_map) - self.sync_ack(node_id); - - info!("ack_map: {:?}", self.update_trackers.ack_map); - info!("sync_map: {:?}", self.update_trackers.sync_map); - info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); + pub(crate) fn max_free_ack(&self) -> u64 { + self.layout() + .versions + .iter() + .map(|x| x.version) + .take_while(|v| { + self.ack_lock + .get(v) + .map(|x| x.load(Ordering::Relaxed) == 0) + .unwrap_or(true) + }) + .max() + .unwrap_or(self.min_stored()) } } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index e270ad21..4e073d1f 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -1,5 +1,5 @@ use std::collections::HashMap; -use std::sync::{Arc, Mutex, RwLock, RwLockReadGuard}; +use std::sync::{atomic::Ordering, Arc, Mutex, RwLock, RwLockReadGuard}; use std::time::Duration; use serde::{Deserialize, Serialize}; @@ -74,8 +74,8 @@ impl LayoutManager { } }; - let mut cluster_layout = LayoutHelper::new(cluster_layout); - cluster_layout.update_trackers_of(node_id.into()); + let mut cluster_layout = LayoutHelper::new(cluster_layout, Default::default()); + cluster_layout.update_trackers(node_id.into()); let layout = Arc::new(RwLock::new(cluster_layout)); let change_notify = Arc::new(Notify::new()); @@ -139,13 +139,36 @@ impl LayoutManager { let mut layout = self.layout.write().unwrap(); if layout.update(|l| l.update_trackers.sync_map.set_max(self.node_id, sync_until)) { - debug!("sync_until updated to {}", sync_until); + info!("sync_until updated to {}", sync_until); self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( layout.update_trackers.clone(), )); } } + fn ack_new_version(self: &Arc) { + let mut layout = self.layout.write().unwrap(); + if layout.ack_max_free(self.node_id) { + self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( + layout.update_trackers.clone(), + )); + } + } + + // ---- ACK LOCKING ---- + + pub fn write_sets_of(self: 
&Arc, position: &Hash) -> WriteLock>> { + let layout = self.layout(); + let version = layout.current().version; + let nodes = layout.write_sets_of(position); + layout + .ack_lock + .get(&version) + .unwrap() + .fetch_add(1, Ordering::Relaxed); + WriteLock::new(version, self, nodes) + } + // ---- INTERNALS --- fn merge_layout(&self, adv: &LayoutHistory) -> Option { @@ -154,7 +177,7 @@ impl LayoutManager { if !prev_layout_check || adv.check().is_ok() { if layout.update(|l| l.merge(adv)) { - layout.update_trackers_of(self.node_id); + layout.update_trackers(self.node_id); if prev_layout_check && layout.check().is_err() { panic!("Merged two correct layouts and got an incorrect layout."); } @@ -168,7 +191,7 @@ impl LayoutManager { let mut layout = self.layout.write().unwrap(); if layout.update_trackers != *adv { if layout.update(|l| l.update_trackers.merge(adv)) { - layout.update_trackers_of(self.node_id); + layout.update_trackers(self.node_id); return Some(layout.update_trackers.clone()); } } @@ -317,3 +340,42 @@ impl LayoutManager { Ok(SystemRpc::Ok) } } + +// ---- ack lock ---- + +pub struct WriteLock { + layout_version: u64, + layout_manager: Arc, + value: T, +} + +impl WriteLock { + fn new(version: u64, layout_manager: &Arc, value: T) -> Self { + Self { + layout_version: version, + layout_manager: layout_manager.clone(), + value, + } + } +} + +impl AsRef for WriteLock { + fn as_ref(&self) -> &T { + &self.value + } +} + +impl Drop for WriteLock { + fn drop(&mut self) { + let layout = self.layout_manager.layout(); // acquire read lock + if let Some(counter) = layout.ack_lock.get(&self.layout_version) { + let prev_lock = counter.fetch_sub(1, Ordering::Relaxed); + if prev_lock == 1 && layout.current().version > self.layout_version { + drop(layout); // release read lock, write lock will be acquired + self.layout_manager.ack_new_version(); + } + } else { + error!("Could not find ack lock counter for layout version {}. 
This probably indicates a bug in Garage.", self.layout_version); + } + } +} diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index 577b32fb..859287c8 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -11,6 +11,7 @@ pub mod manager; // ---- re-exports ---- pub use history::*; +pub use manager::WriteLock; pub use schema::*; pub use version::*; diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index cb5471af..df930224 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -27,6 +27,8 @@ pub struct TableFullReplication { } impl TableReplication for TableFullReplication { + type WriteSets = Vec>; + fn storage_nodes(&self, _hash: &Hash) -> Vec { let layout = self.system.cluster_layout(); layout.current().all_nodes().to_vec() @@ -39,7 +41,7 @@ impl TableReplication for TableFullReplication { 1 } - fn write_sets(&self, hash: &Hash) -> Vec> { + fn write_sets(&self, hash: &Hash) -> Self::WriteSets { vec![self.storage_nodes(hash)] } fn write_quorum(&self) -> usize { diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index 2f842409..a4e701bb 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -3,6 +3,8 @@ use garage_util::data::*; /// Trait to describe how a table shall be replicated pub trait TableReplication: Send + Sync + 'static { + type WriteSets: AsRef>> + Send + Sync + 'static; + // See examples in table_sharded.rs and table_fullcopy.rs // To understand various replication methods @@ -15,7 +17,7 @@ pub trait TableReplication: Send + Sync + 'static { fn read_quorum(&self) -> usize; /// Which nodes to send writes to - fn write_sets(&self, hash: &Hash) -> Vec>; + fn write_sets(&self, hash: &Hash) -> Self::WriteSets; /// Responses needed to consider a write succesfull in each set fn write_quorum(&self) -> usize; fn max_write_errors(&self) -> usize; diff --git a/src/table/replication/sharded.rs 
b/src/table/replication/sharded.rs index 1320a189..2a16bc0c 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -25,6 +25,8 @@ pub struct TableShardedReplication { } impl TableReplication for TableShardedReplication { + type WriteSets = WriteLock>>; + fn storage_nodes(&self, hash: &Hash) -> Vec { self.system.cluster_layout().storage_nodes_of(hash) } @@ -36,8 +38,8 @@ impl TableReplication for TableShardedReplication { self.read_quorum } - fn write_sets(&self, hash: &Hash) -> Vec> { - self.system.cluster_layout().write_sets_of(hash) + fn write_sets(&self, hash: &Hash) -> Self::WriteSets { + self.system.layout_manager.write_sets_of(hash) } fn write_quorum(&self) -> usize { self.write_quorum diff --git a/src/table/sync.rs b/src/table/sync.rs index b67cdd79..efeac402 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -173,12 +173,7 @@ impl TableSyncer { } if !items.is_empty() { - let nodes = self - .data - .replication - .storage_nodes(begin) - .into_iter() - .collect::>(); + let nodes = self.data.replication.storage_nodes(begin); if nodes.contains(&self.system.id) { warn!( "({}) Interrupting offload as partitions seem to have changed", @@ -202,7 +197,7 @@ impl TableSyncer { end, counter ); - self.offload_items(&items, &nodes[..]).await?; + self.offload_items(&items, &nodes).await?; } else { break; } diff --git a/src/table/table.rs b/src/table/table.rs index c2efaeaf..5ec9eb0a 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -128,7 +128,7 @@ impl Table { .rpc_helper() .try_write_many_sets( &self.endpoint, - &who, + who.as_ref(), rpc, RequestStrategy::with_priority(PRIO_NORMAL) .with_quorum(self.data.replication.write_quorum()), -- cgit v1.2.3 From d4df03424f1c7f3cc1eaba9e16d2e1d049131b97 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 15:56:57 +0100 Subject: layout: fix test --- src/rpc/layout/test.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/rpc/layout/test.rs 
b/src/rpc/layout/test.rs index e9639073..bb072c97 100644 --- a/src/rpc/layout/test.rs +++ b/src/rpc/layout/test.rs @@ -113,8 +113,6 @@ fn update_layout( staging.parameters.update(LayoutParameters { zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), }); - - cl.update_hashes(); } #[test] -- cgit v1.2.3 From ad5c6f779f7fdfdc0569920c830c59197023515a Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 16 Nov 2023 13:26:43 +0100 Subject: layout: split helper in separate file; more precise difference tracking --- src/rpc/layout/helper.rs | 224 +++++++++++++++++++++++++++++++++++++ src/rpc/layout/history.rs | 278 +++++----------------------------------------- src/rpc/layout/manager.rs | 5 +- src/rpc/layout/mod.rs | 3 +- 4 files changed, 256 insertions(+), 254 deletions(-) create mode 100644 src/rpc/layout/helper.rs diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs new file mode 100644 index 00000000..ed3da498 --- /dev/null +++ b/src/rpc/layout/helper.rs @@ -0,0 +1,224 @@ +use std::collections::HashMap; +use std::ops::Deref; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use garage_util::data::*; + +use super::schema::*; + +pub struct LayoutHelper { + layout: Option, + + // cached values + ack_map_min: u64, + sync_map_min: u64, + + all_nodes: Vec, + all_nongateway_nodes: Vec, + + pub(crate) trackers_hash: Hash, + pub(crate) staging_hash: Hash, + + // ack lock: counts in-progress write operations for each + // layout version ; we don't increase the ack update tracker + // while this lock is nonzero + pub(crate) ack_lock: HashMap, +} + +impl Deref for LayoutHelper { + type Target = LayoutHistory; + fn deref(&self) -> &LayoutHistory { + self.layout() + } +} + +impl LayoutHelper { + pub fn new(mut layout: LayoutHistory, mut ack_lock: HashMap) -> Self { + layout.cleanup_old_versions(); + + let all_nongateway_nodes = layout.get_all_nongateway_nodes(); + layout.clamp_update_trackers(&all_nongateway_nodes); + + let min_version = layout.min_stored(); + 
let ack_map_min = layout + .update_trackers + .ack_map + .min(&all_nongateway_nodes, min_version); + let sync_map_min = layout + .update_trackers + .sync_map + .min(&all_nongateway_nodes, min_version); + + let all_nodes = layout.get_all_nodes(); + let trackers_hash = layout.calculate_trackers_hash(); + let staging_hash = layout.calculate_staging_hash(); + + ack_lock.retain(|_, cnt| *cnt.get_mut() > 0); + ack_lock + .entry(layout.current().version) + .or_insert(AtomicUsize::new(0)); + + LayoutHelper { + layout: Some(layout), + ack_map_min, + sync_map_min, + all_nodes, + all_nongateway_nodes, + trackers_hash, + staging_hash, + ack_lock, + } + } + + // ------------------ single updating function -------------- + + fn layout(&self) -> &LayoutHistory { + self.layout.as_ref().unwrap() + } + + pub(crate) fn update(&mut self, f: F) -> bool + where + F: FnOnce(&mut LayoutHistory) -> bool, + { + let changed = f(&mut self.layout.as_mut().unwrap()); + if changed { + *self = Self::new( + self.layout.take().unwrap(), + std::mem::take(&mut self.ack_lock), + ); + } + changed + } + + // ------------------ read helpers --------------- + + pub fn all_nodes(&self) -> &[Uuid] { + &self.all_nodes + } + + pub fn all_nongateway_nodes(&self) -> &[Uuid] { + &self.all_nongateway_nodes + } + + pub fn all_ack(&self) -> u64 { + self.ack_map_min + } + + pub fn sync_versions(&self) -> (u64, u64, u64) { + ( + self.layout().current().version, + self.all_ack(), + self.layout().min_stored(), + ) + } + + pub fn read_nodes_of(&self, position: &Hash) -> Vec { + let sync_min = self.sync_map_min; + let version = self + .layout() + .versions + .iter() + .find(|x| x.version == sync_min) + .or(self.layout().versions.last()) + .unwrap(); + version + .nodes_of(position, version.replication_factor) + .collect() + } + + pub(crate) fn write_sets_of(&self, position: &Hash) -> Vec> { + self.layout() + .versions + .iter() + .map(|x| x.nodes_of(position, x.replication_factor).collect()) + .collect() + } + + pub fn 
storage_nodes_of(&self, position: &Hash) -> Vec { + let mut ret = vec![]; + for version in self.layout().versions.iter() { + ret.extend(version.nodes_of(position, version.replication_factor)); + } + ret.sort(); + ret.dedup(); + ret + } + + pub fn trackers_hash(&self) -> Hash { + self.trackers_hash + } + + pub fn staging_hash(&self) -> Hash { + self.staging_hash + } + + // ------------------ helpers for update tracking --------------- + + pub(crate) fn update_trackers(&mut self, local_node_id: Uuid) { + // Ensure trackers for this node's values are up-to-date + + // 1. Acknowledge the last layout version which is not currently + // locked by an in-progress write operation + self.ack_max_free(local_node_id); + + // 2. Assume the data on this node is sync'ed up at least to + // the first layout version in the history + self.sync_first(local_node_id); + + // 3. Acknowledge everyone has synced up to min(self.sync_map) + self.sync_ack(local_node_id); + + info!("ack_map: {:?}", self.update_trackers.ack_map); + info!("sync_map: {:?}", self.update_trackers.sync_map); + info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); + } + + fn sync_first(&mut self, local_node_id: Uuid) { + let first_version = self.versions.first().as_ref().unwrap().version; + self.update(|layout| { + layout + .update_trackers + .sync_map + .set_max(local_node_id, first_version) + }); + } + + fn sync_ack(&mut self, local_node_id: Uuid) { + let sync_map_min = self.sync_map_min; + self.update(|layout| { + layout + .update_trackers + .sync_ack_map + .set_max(local_node_id, sync_map_min) + }); + } + + pub(crate) fn ack_max_free(&mut self, local_node_id: Uuid) -> bool { + let max_ack = self.max_free_ack(); + let changed = self.update(|layout| { + layout + .update_trackers + .ack_map + .set_max(local_node_id, max_ack) + }); + if changed { + info!("ack_until updated to {}", max_ack); + } + changed + } + + pub(crate) fn max_free_ack(&self) -> u64 { + self.layout() + .versions + .iter() + .map(|x| 
x.version) + .take_while(|v| { + self.ack_lock + .get(v) + .map(|x| x.load(Ordering::Relaxed) == 0) + .unwrap_or(true) + }) + .max() + .unwrap_or(self.min_stored()) + } +} diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index dd38efa7..0a139549 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -1,7 +1,4 @@ -use std::collections::HashMap; use std::collections::HashSet; -use std::ops::Deref; -use std::sync::atomic::{AtomicUsize, Ordering}; use garage_util::crdt::{Crdt, Lww, LwwMap}; use garage_util::data::*; @@ -11,225 +8,6 @@ use garage_util::error::*; use super::schema::*; use super::*; -pub struct LayoutHelper { - layout: Option, - - // cached values - ack_map_min: u64, - sync_map_min: u64, - - all_nodes: Vec, - all_nongateway_nodes: Vec, - - trackers_hash: Hash, - staging_hash: Hash, - - // ack lock: counts in-progress write operations for each - // layout version ; we don't increase the ack update tracker - // while this lock is nonzero - pub(crate) ack_lock: HashMap, -} - -impl Deref for LayoutHelper { - type Target = LayoutHistory; - fn deref(&self) -> &LayoutHistory { - self.layout() - } -} - -impl LayoutHelper { - pub fn new(mut layout: LayoutHistory, mut ack_lock: HashMap) -> Self { - layout.cleanup_old_versions(); - - let all_nongateway_nodes = layout.get_all_nongateway_nodes(); - layout.clamp_update_trackers(&all_nongateway_nodes); - - let min_version = layout.min_stored(); - let ack_map_min = layout - .update_trackers - .ack_map - .min(&all_nongateway_nodes, min_version); - let sync_map_min = layout - .update_trackers - .sync_map - .min(&all_nongateway_nodes, min_version); - - let all_nodes = layout.get_all_nodes(); - let trackers_hash = layout.calculate_trackers_hash(); - let staging_hash = layout.calculate_staging_hash(); - - ack_lock.retain(|_, cnt| *cnt.get_mut() > 0); - ack_lock - .entry(layout.current().version) - .or_insert(AtomicUsize::new(0)); - - LayoutHelper { - layout: Some(layout), - ack_map_min, - 
sync_map_min, - all_nodes, - all_nongateway_nodes, - trackers_hash, - staging_hash, - ack_lock, - } - } - - // ------------------ single updating function -------------- - - fn layout(&self) -> &LayoutHistory { - self.layout.as_ref().unwrap() - } - - pub(crate) fn update(&mut self, f: F) -> bool - where - F: FnOnce(&mut LayoutHistory) -> bool, - { - let changed = f(&mut self.layout.as_mut().unwrap()); - if changed { - *self = Self::new( - self.layout.take().unwrap(), - std::mem::take(&mut self.ack_lock), - ); - } - changed - } - - // ------------------ read helpers --------------- - - pub fn all_nodes(&self) -> &[Uuid] { - &self.all_nodes - } - - pub fn all_nongateway_nodes(&self) -> &[Uuid] { - &self.all_nongateway_nodes - } - - pub fn all_ack(&self) -> u64 { - self.ack_map_min - } - - pub fn sync_versions(&self) -> (u64, u64, u64) { - ( - self.layout().current().version, - self.all_ack(), - self.layout().min_stored(), - ) - } - - pub fn read_nodes_of(&self, position: &Hash) -> Vec { - let sync_min = self.sync_map_min; - let version = self - .layout() - .versions - .iter() - .find(|x| x.version == sync_min) - .or(self.layout().versions.last()) - .unwrap(); - version - .nodes_of(position, version.replication_factor) - .collect() - } - - pub(crate) fn write_sets_of(&self, position: &Hash) -> Vec> { - self.layout() - .versions - .iter() - .map(|x| x.nodes_of(position, x.replication_factor).collect()) - .collect() - } - - pub fn storage_nodes_of(&self, position: &Hash) -> Vec { - let mut ret = vec![]; - for version in self.layout().versions.iter() { - ret.extend(version.nodes_of(position, version.replication_factor)); - } - ret.sort(); - ret.dedup(); - ret - } - - pub fn trackers_hash(&self) -> Hash { - self.trackers_hash - } - - pub fn staging_hash(&self) -> Hash { - self.staging_hash - } - - // ------------------ helpers for update tracking --------------- - - pub(crate) fn update_trackers(&mut self, local_node_id: Uuid) { - // Ensure trackers for this node's values 
are up-to-date - - // 1. Acknowledge the last layout version which is not currently - // locked by an in-progress write operation - self.ack_max_free(local_node_id); - - // 2. Assume the data on this node is sync'ed up at least to - // the first layout version in the history - self.sync_first(local_node_id); - - // 3. Acknowledge everyone has synced up to min(self.sync_map) - self.sync_ack(local_node_id); - - info!("ack_map: {:?}", self.update_trackers.ack_map); - info!("sync_map: {:?}", self.update_trackers.sync_map); - info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); - } - - fn sync_first(&mut self, local_node_id: Uuid) { - let first_version = self.versions.first().as_ref().unwrap().version; - self.update(|layout| { - layout - .update_trackers - .sync_map - .set_max(local_node_id, first_version) - }); - } - - fn sync_ack(&mut self, local_node_id: Uuid) { - let sync_map_min = self.sync_map_min; - self.update(|layout| { - layout - .update_trackers - .sync_ack_map - .set_max(local_node_id, sync_map_min) - }); - } - - pub(crate) fn ack_max_free(&mut self, local_node_id: Uuid) -> bool { - let max_ack = self.max_free_ack(); - let changed = self.update(|layout| { - layout - .update_trackers - .ack_map - .set_max(local_node_id, max_ack) - }); - if changed { - info!("ack_until updated to {}", max_ack); - } - changed - } - - pub(crate) fn max_free_ack(&self) -> u64 { - self.layout() - .versions - .iter() - .map(|x| x.version) - .take_while(|v| { - self.ack_lock - .get(v) - .map(|x| x.load(Ordering::Relaxed) == 0) - .unwrap_or(true) - }) - .max() - .unwrap_or(self.min_stored()) - } -} - -// ---- - impl LayoutHistory { pub fn new(replication_factor: usize) -> Self { let version = LayoutVersion::new(replication_factor); @@ -270,7 +48,7 @@ impl LayoutHistory { } } - fn get_all_nongateway_nodes(&self) -> Vec { + pub(crate) fn get_all_nongateway_nodes(&self) -> Vec { if self.versions.len() == 1 { self.versions[0].nongateway_nodes().to_vec() } else { @@ -286,8 
+64,21 @@ impl LayoutHistory { // ---- housekeeping (all invoked by LayoutHelper) ---- - fn cleanup_old_versions(&mut self) { - loop { + pub(crate) fn cleanup_old_versions(&mut self) { + // If there are invalid versions before valid versions, remove them + if self.versions.len() > 1 && self.current().check().is_ok() { + while self.versions.len() > 1 && self.versions.first().unwrap().check().is_err() { + let removed = self.versions.remove(0); + info!( + "Layout history: pruning old invalid version {}", + removed.version + ); + } + } + + // If there are old versions that no one is reading from anymore, + // remove them + while self.versions.len() > 1 { let all_nongateway_nodes = self.get_all_nongateway_nodes(); let min_version = self.min_stored(); let sync_ack_map_min = self @@ -303,7 +94,7 @@ impl LayoutHistory { } } - fn clamp_update_trackers(&mut self, nodes: &[Uuid]) { + pub(crate) fn clamp_update_trackers(&mut self, nodes: &[Uuid]) { let min_v = self.min_stored(); for node in nodes { self.update_trackers.ack_map.set_max(*node, min_v); @@ -312,11 +103,11 @@ impl LayoutHistory { } } - fn calculate_trackers_hash(&self) -> Hash { + pub(crate) fn calculate_trackers_hash(&self) -> Hash { blake2sum(&nonversioned_encode(&self.update_trackers).unwrap()[..]) } - fn calculate_staging_hash(&self) -> Hash { + pub(crate) fn calculate_staging_hash(&self) -> Hash { blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) } @@ -328,6 +119,7 @@ impl LayoutHistory { // Add any new versions to history for v2 in other.versions.iter() { if let Some(v1) = self.versions.iter().find(|v| v.version == v2.version) { + // Version is already present, check consistency if v1 != v2 { error!("Inconsistent layout histories: different layout compositions for version {}. 
Your cluster will be broken as long as this layout version is not replaced.", v2.version); } @@ -344,24 +136,14 @@ impl LayoutHistory { } // Merge trackers - if self.update_trackers != other.update_trackers { - let c = self.update_trackers.merge(&other.update_trackers); - changed = changed || c; - } - - // If there are invalid versions before valid versions, remove them, - // and increment update trackers - if self.versions.len() > 1 && self.current().check().is_ok() { - while self.versions.first().unwrap().check().is_err() { - self.versions.remove(0); - changed = true; - } - } + let c = self.update_trackers.merge(&other.update_trackers); + changed = changed || c; // Merge staged layout changes if self.staging != other.staging { + let prev_staging = self.staging.clone(); self.staging.merge(&other.staging); - changed = true; + changed = changed || self.staging != prev_staging; } changed @@ -390,11 +172,7 @@ To know the correct value of the new layout version, invoke `garage layout show` .calculate_next_version(&self.staging.get())?; self.versions.push(new_version); - if self.current().check().is_ok() { - while self.versions.first().unwrap().check().is_err() { - self.versions.remove(0); - } - } + self.cleanup_old_versions(); // Reset the staged layout changes self.staging.update(LayoutStaging { @@ -415,11 +193,7 @@ To know the correct value of the new layout version, invoke `garage layout show` } pub fn check(&self) -> Result<(), String> { - for version in self.versions.iter() { - version.check()?; - } - // TODO: anything more ? 
- Ok(()) + self.current().check() } } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 4e073d1f..85d94ffa 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -184,17 +184,20 @@ impl LayoutManager { return Some(layout.clone()); } } + None } fn merge_layout_trackers(&self, adv: &UpdateTrackers) -> Option { let mut layout = self.layout.write().unwrap(); + if layout.update_trackers != *adv { if layout.update(|l| l.update_trackers.merge(adv)) { layout.update_trackers(self.node_id); return Some(layout.update_trackers.clone()); } } + None } @@ -284,7 +287,7 @@ impl LayoutManager { } pub(crate) fn handle_pull_cluster_layout(&self) -> SystemRpc { - let layout = self.layout.read().unwrap().clone(); // TODO: avoid cloning + let layout = self.layout.read().unwrap().clone(); SystemRpc::AdvertiseClusterLayout(layout) } diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index 859287c8..91151ab4 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -1,4 +1,5 @@ mod graph_algo; +mod helper; mod history; mod schema; mod version; @@ -10,7 +11,7 @@ pub mod manager; // ---- re-exports ---- -pub use history::*; +pub use helper::LayoutHelper; pub use manager::WriteLock; pub use schema::*; pub use version::*; -- cgit v1.2.3 From 707442f5de416fdbed4681a33b739f0a787b7834 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 16 Nov 2023 13:51:40 +0100 Subject: layout: refactor digests and add "!=" assertions before epidemic bcast --- src/rpc/layout/helper.rs | 27 +++++++++++++++++++++++++-- src/rpc/layout/history.rs | 1 - src/rpc/layout/manager.rs | 36 ++++++++++-------------------------- src/rpc/layout/mod.rs | 2 +- src/rpc/system.rs | 17 +++++++++-------- 5 files changed, 45 insertions(+), 38 deletions(-) diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index ed3da498..0d746ea3 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -2,10 +2,24 @@ use std::collections::HashMap; use 
std::ops::Deref; use std::sync::atomic::{AtomicUsize, Ordering}; +use serde::{Deserialize, Serialize}; + use garage_util::data::*; use super::schema::*; +#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] +pub struct LayoutDigest { + /// Cluster layout version + pub current_version: u64, + /// Number of active layout versions + pub active_versions: usize, + /// Hash of cluster layout update trackers + pub trackers_hash: Hash, + /// Hash of cluster layout staging data + pub staging_hash: Hash, +} + pub struct LayoutHelper { layout: Option, @@ -16,8 +30,8 @@ pub struct LayoutHelper { all_nodes: Vec, all_nongateway_nodes: Vec, - pub(crate) trackers_hash: Hash, - pub(crate) staging_hash: Hash, + trackers_hash: Hash, + staging_hash: Hash, // ack lock: counts in-progress write operations for each // layout version ; we don't increase the ack update tracker @@ -152,6 +166,15 @@ impl LayoutHelper { self.staging_hash } + pub fn digest(&self) -> LayoutDigest { + LayoutDigest { + current_version: self.current().version, + active_versions: self.versions.len(), + trackers_hash: self.trackers_hash, + staging_hash: self.staging_hash, + } + } + // ------------------ helpers for update tracking --------------- pub(crate) fn update_trackers(&mut self, local_node_id: Uuid) { diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 0a139549..653d2a48 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -5,7 +5,6 @@ use garage_util::data::*; use garage_util::encode::nonversioned_encode; use garage_util::error::*; -use super::schema::*; use super::*; impl LayoutHistory { diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 85d94ffa..c65831a2 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -2,8 +2,6 @@ use std::collections::HashMap; use std::sync::{atomic::Ordering, Arc, Mutex, RwLock, RwLockReadGuard}; use std::time::Duration; -use serde::{Deserialize, Serialize}; - use 
tokio::sync::Notify; use netapp::endpoint::Endpoint; @@ -33,16 +31,6 @@ pub struct LayoutManager { system_endpoint: Arc>, } -#[derive(Debug, Clone, Serialize, Deserialize, Default)] -pub struct LayoutStatus { - /// Cluster layout version - pub cluster_layout_version: u64, - /// Hash of cluster layout update trackers - pub cluster_layout_trackers_hash: Hash, - /// Hash of cluster layout staging data - pub cluster_layout_staging_hash: Hash, -} - impl LayoutManager { pub fn new( config: &Config, @@ -105,15 +93,6 @@ impl LayoutManager { self.layout.read().unwrap() } - pub fn status(&self) -> LayoutStatus { - let layout = self.layout(); - LayoutStatus { - cluster_layout_version: layout.current().version, - cluster_layout_trackers_hash: layout.trackers_hash(), - cluster_layout_staging_hash: layout.staging_hash(), - } - } - pub async fn update_cluster_layout( self: &Arc, layout: &LayoutHistory, @@ -173,6 +152,7 @@ impl LayoutManager { fn merge_layout(&self, adv: &LayoutHistory) -> Option { let mut layout = self.layout.write().unwrap(); + let prev_digest = layout.digest(); let prev_layout_check = layout.check().is_ok(); if !prev_layout_check || adv.check().is_ok() { @@ -181,6 +161,7 @@ impl LayoutManager { if prev_layout_check && layout.check().is_err() { panic!("Merged two correct layouts and got an incorrect layout."); } + assert!(layout.digest() != prev_digest); return Some(layout.clone()); } } @@ -190,10 +171,12 @@ impl LayoutManager { fn merge_layout_trackers(&self, adv: &UpdateTrackers) -> Option { let mut layout = self.layout.write().unwrap(); + let prev_digest = layout.digest(); if layout.update_trackers != *adv { if layout.update(|l| l.update_trackers.merge(adv)) { layout.update_trackers(self.node_id); + assert!(layout.digest() != prev_digest); return Some(layout.update_trackers.clone()); } } @@ -269,16 +252,17 @@ impl LayoutManager { // ---- RPC HANDLERS ---- - pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, remote: &LayoutStatus) { - let local = 
self.status(); - if remote.cluster_layout_version > local.cluster_layout_version - || remote.cluster_layout_staging_hash != local.cluster_layout_staging_hash + pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, remote: &LayoutDigest) { + let local = self.layout().digest(); + if remote.current_version > local.current_version + || remote.active_versions != local.active_versions + || remote.staging_hash != local.staging_hash { tokio::spawn({ let this = self.clone(); async move { this.pull_cluster_layout(from).await } }); - } else if remote.cluster_layout_trackers_hash != local.cluster_layout_trackers_hash { + } else if remote.trackers_hash != local.trackers_hash { tokio::spawn({ let this = self.clone(); async move { this.pull_cluster_layout_trackers(from).await } diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index 91151ab4..eb127fda 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -11,7 +11,7 @@ pub mod manager; // ---- re-exports ---- -pub use helper::LayoutHelper; +pub use helper::{LayoutDigest, LayoutHelper}; pub use manager::WriteLock; pub use schema::*; pub use version::*; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index d74dc2a1..dc127afb 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -33,8 +33,9 @@ use garage_util::time::*; use crate::consul::ConsulDiscovery; #[cfg(feature = "kubernetes-discovery")] use crate::kubernetes::*; -use crate::layout::manager::{LayoutManager, LayoutStatus}; -use crate::layout::{self, LayoutHelper, LayoutHistory, NodeRoleV}; +use crate::layout::{ + self, manager::LayoutManager, LayoutDigest, LayoutHelper, LayoutHistory, NodeRoleV, +}; use crate::replication_mode::*; use crate::rpc_helper::*; @@ -130,8 +131,8 @@ pub struct NodeStatus { /// Replication factor configured on the node pub replication_factor: usize, - /// Layout status - pub layout_status: LayoutStatus, + /// Cluster layout digest + pub layout_digest: LayoutDigest, /// Disk usage on partition containing metadata 
directory (tuple: `(avail, total)`) #[serde(default)] @@ -539,7 +540,7 @@ impl System { fn update_local_status(&self) { let mut new_si: NodeStatus = self.local_status.load().as_ref().clone(); - new_si.layout_status = self.layout_manager.status(); + new_si.layout_digest = self.layout_manager.layout().digest(); new_si.update_disk_usage(&self.metadata_dir, &self.data_dir, &self.metrics); @@ -573,7 +574,7 @@ impl System { } self.layout_manager - .handle_advertise_status(from, &info.layout_status); + .handle_advertise_status(from, &info.layout_digest); self.node_status .write() @@ -755,7 +756,7 @@ impl NodeStatus { .into_string() .unwrap_or_else(|_| "".to_string()), replication_factor, - layout_status: layout_manager.status(), + layout_digest: layout_manager.layout().digest(), meta_disk_avail: None, data_disk_avail: None, } @@ -765,7 +766,7 @@ impl NodeStatus { NodeStatus { hostname: "?".to_string(), replication_factor: 0, - layout_status: Default::default(), + layout_digest: Default::default(), meta_disk_avail: None, data_disk_avail: None, } -- cgit v1.2.3 From 22f38808e744ea5b30ad771fcb344a29579b56d4 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 16 Nov 2023 16:34:01 +0100 Subject: rpc_helper: don't use tokio::spawn for individual requests --- src/rpc/rpc_helper.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index e269ddaa..7e9fabd7 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -299,9 +299,7 @@ impl RpcHelper { if let Some((req_to, fut)) = requests.next() { let tracer = opentelemetry::global::tracer("garage"); let span = tracer.start(format!("RPC to {:?}", req_to)); - resp_stream.push(tokio::spawn( - fut.with_context(Context::current_with_span(span)), - )); + resp_stream.push(fut.with_context(Context::current_with_span(span))); } else { break; } @@ -313,7 +311,7 @@ impl RpcHelper { } // Wait for one request to terminate - match 
resp_stream.next().await.unwrap().unwrap() { + match resp_stream.next().await.unwrap() { Ok(msg) => { successes.push(msg); } @@ -448,7 +446,7 @@ impl RpcHelper { let tracer = opentelemetry::global::tracer("garage"); let span = tracer.start(format!("RPC to {:?}", to)); let fut = async move { (to, self2.call(&endpoint2, to, msg, strategy).await) }; - tokio::spawn(fut.with_context(Context::current_with_span(span))) + fut.with_context(Context::current_with_span(span)) }); let mut resp_stream = requests.collect::>(); @@ -457,9 +455,7 @@ impl RpcHelper { let mut set_counters = vec![(0, 0); to_sets.len()]; - while !resp_stream.is_empty() { - let (node, resp) = resp_stream.next().await.unwrap().unwrap(); - + while let Some((node, resp)) = resp_stream.next().await { match resp { Ok(msg) => { for set in peers.get(&node).unwrap().iter() { @@ -475,12 +471,12 @@ impl RpcHelper { } } - if set_counters.iter().all(|x| x.0 >= quorum) { + if set_counters.iter().all(|(ok_cnt, _)| *ok_cnt >= quorum) { // Success // Continue all other requets in background tokio::spawn(async move { - resp_stream.collect::>>().await; + resp_stream.collect::)>>().await; }); return Ok(successes); @@ -489,7 +485,7 @@ impl RpcHelper { if set_counters .iter() .enumerate() - .any(|(i, x)| x.1 + quorum > to_sets[i].len()) + .any(|(i, (_, err_cnt))| err_cnt + quorum > to_sets[i].len()) { // Too many errors in this set, we know we won't get a quorum break; -- cgit v1.2.3 From 3ecd14b9f6202ad3c5513c6ad7422bd408134002 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 16 Nov 2023 16:41:45 +0100 Subject: table: implement write sets for insert_many --- src/table/table.rs | 157 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 127 insertions(+), 30 deletions(-) diff --git a/src/table/table.rs b/src/table/table.rs index 5ec9eb0a..7d1ff31c 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -143,7 +143,7 @@ impl Table { self.data.queue_insert(tx, e) } - pub async fn insert_many(&self, 
entries: I) -> Result<(), Error> + pub async fn insert_many(self: &Arc, entries: I) -> Result<(), Error> where I: IntoIterator + Send + Sync, IE: Borrow + Send + Sync, @@ -161,52 +161,149 @@ impl Table { Ok(()) } - async fn insert_many_internal(&self, entries: I) -> Result<(), Error> + async fn insert_many_internal(self: &Arc, entries: I) -> Result<(), Error> where I: IntoIterator + Send + Sync, IE: Borrow + Send + Sync, { - let mut call_list: HashMap<_, Vec<_>> = HashMap::new(); - + // The different items will have to be stored on possibly different nodes. + // We will here batch all items into a single request for each concerned + // node, with all of the entries it must store within that request. + // Each entry has to be saved to a specific list of "write sets", i.e. a set + // of node within wich a quorum must be achieved. In normal operation, there + // is a single write set which corresponds to the quorum in the current + // cluster layout, but when the layout is updated, multiple write sets might + // have to be handled at once. Here, since we are sending many entries, we + // will have to handle many write sets in all cases. The algorihtm is thus + // to send one request to each node with all the items it must save, + // and keep track of the OK responses within each write set: if for all sets + // a quorum of nodes has answered OK, then the insert has succeeded and + // consistency properties (read-after-write) are preserved. + + // Some code here might feel redundant with RpcHelper::try_write_many_sets, + // but I think deduplicating could lead to more spaghetti instead of + // improving the readability, so I'm leaving as is. + + let quorum = self.data.replication.write_quorum(); + + // Serialize all entries and compute the write sets for each of them. 
+ // In the case of sharded table replication, this also takes an "ack lock" + // to the layout manager to avoid ack'ing newer versions which are not + // taken into account by writes in progress (the ack can happen later, once + // all writes that didn't take the new layout into account are finished). + // These locks are released when entries_vec is dropped, i.e. when this + // function returns. + let mut entries_vec = Vec::new(); for entry in entries.into_iter() { let entry = entry.borrow(); let hash = entry.partition_key().hash(); - // TODO: use write sets - let who = self.data.replication.storage_nodes(&hash); + let write_sets = self.data.replication.write_sets(&hash); let e_enc = Arc::new(ByteBuf::from(entry.encode()?)); - for node in who { - call_list.entry(node).or_default().push(e_enc.clone()); + entries_vec.push((write_sets, e_enc)); + } + + // Compute a deduplicated list of all of the write sets, + // and compute an index from each node to the position of the sets in which + // it takes part, to optimize the detection of a quorum. + let mut write_sets = entries_vec + .iter() + .map(|(wss, _)| wss.as_ref().iter().map(|ws| ws.as_slice())) + .flatten() + .collect::>(); + write_sets.sort(); + write_sets.dedup(); + let mut write_set_index = HashMap::<&Uuid, Vec>::new(); + for (i, write_set) in write_sets.iter().enumerate() { + for node in write_set.iter() { + write_set_index.entry(node).or_default().push(i); } } - let call_futures = call_list.drain().map(|(node, entries)| async move { - let rpc = TableRpc::::Update(entries); - - let resp = self - .system - .rpc_helper() - .call( - &self.endpoint, - node, - rpc, - RequestStrategy::with_priority(PRIO_NORMAL), - ) - .await?; - Ok::<_, Error>((node, resp)) + // Build a map of all nodes to the entries that must be sent to that node. 
+ let mut call_list: HashMap> = HashMap::new(); + for (write_sets, entry_enc) in entries_vec.iter() { + for write_set in write_sets.as_ref().iter() { + for node in write_set.iter() { + call_list.entry(*node).or_default().push(entry_enc.clone()) + } + } + } + + // Build futures to actually perform each of the corresponding RPC calls + let call_count = call_list.len(); + let call_futures = call_list.into_iter().map(|(node, entries)| { + let this = self.clone(); + let tracer = opentelemetry::global::tracer("garage"); + let span = tracer.start(format!("RPC to {:?}", node)); + let fut = async move { + let rpc = TableRpc::::Update(entries); + let resp = this + .system + .rpc_helper() + .call( + &this.endpoint, + node, + rpc, + RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(quorum), + ) + .await; + (node, resp) + }; + fut.with_context(Context::current_with_span(span)) }); + + // Run all requests in parallel thanks to FuturesUnordered, and collect results. let mut resps = call_futures.collect::>(); + let mut set_counters = vec![(0, 0); write_sets.len()]; + let mut successes = 0; let mut errors = vec![]; - while let Some(resp) = resps.next().await { - if let Err(e) = resp { - errors.push(e); + while let Some((node, resp)) = resps.next().await { + match resp { + Ok(_) => { + successes += 1; + for set in write_set_index.get(&node).unwrap().iter() { + set_counters[*set].0 += 1; + } + } + Err(e) => { + errors.push(e); + for set in write_set_index.get(&node).unwrap().iter() { + set_counters[*set].1 += 1; + } + } + } + + if set_counters.iter().all(|(ok_cnt, _)| *ok_cnt >= quorum) { + // Success + + // Continue all other requests in background + tokio::spawn(async move { + resps.collect::)>>().await; + }); + + return Ok(()); + } + + if set_counters + .iter() + .enumerate() + .any(|(i, (_, err_cnt))| err_cnt + quorum > write_sets[i].len()) + { + // Too many errors in this set, we know we won't get a quorum + break; } } - if errors.len() > 
self.data.replication.max_write_errors() { - Err(Error::Message("Too many errors".into())) - } else { - Ok(()) - } + + // Failure, could not get quorum within at least one set + let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); + Err(Error::Quorum( + quorum, + Some(write_sets.len()), + successes, + call_count, + errors, + )) } pub async fn get( -- cgit v1.2.3 From d6d239fc7909cbd017da6ea35cceb3d561a87cca Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 27 Nov 2023 11:52:57 +0100 Subject: block manager: read_block using old layout versions if necessary --- src/block/manager.rs | 6 ++++-- src/rpc/layout/helper.rs | 23 +++++++++++++++++++++++ src/rpc/layout/history.rs | 12 +++++++++++- src/rpc/layout/schema.rs | 7 +++++++ src/rpc/rpc_helper.rs | 11 +++++------ 5 files changed, 50 insertions(+), 9 deletions(-) diff --git a/src/block/manager.rs b/src/block/manager.rs index be2e4951..47111160 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -264,8 +264,10 @@ impl BlockManager { F: Fn(DataBlockHeader, ByteStream) -> Fut, Fut: futures::Future>, { - let who = self.replication.read_nodes(hash); - let who = self.system.rpc_helper().request_order(&who); + let who = self + .system + .cluster_layout() + .block_read_nodes_of(hash, self.system.rpc_helper()); for node in who.iter() { let node_id = NodeID::from(*node); diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 0d746ea3..5d159f3e 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize}; use garage_util::data::*; use super::schema::*; +use crate::rpc_helper::RpcHelper; #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] pub struct LayoutDigest { @@ -140,6 +141,28 @@ impl LayoutHelper { .collect() } + pub fn block_read_nodes_of(&self, position: &Hash, rpc_helper: &RpcHelper) -> Vec { + let mut ret = Vec::with_capacity(12); + let ver_iter = self + .layout() + .versions + .iter() + .rev() 
+ .chain(self.layout().old_versions.iter().rev()); + for ver in ver_iter { + if ver.version > self.sync_map_min { + continue; + } + let nodes = ver.nodes_of(position, ver.replication_factor); + for node in rpc_helper.request_order(nodes) { + if !ret.contains(&node) { + ret.push(node); + } + } + } + ret + } + pub(crate) fn write_sets_of(&self, position: &Hash) -> Vec> { self.layout() .versions diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 653d2a48..7d4a1b48 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -18,6 +18,7 @@ impl LayoutHistory { LayoutHistory { versions: vec![version], + old_versions: vec![], update_trackers: Default::default(), staging: Lww::raw(0, staging), } @@ -86,11 +87,20 @@ impl LayoutHistory { .min(&all_nongateway_nodes, min_version); if self.min_stored() < sync_ack_map_min { let removed = self.versions.remove(0); - info!("Layout history: pruning old version {}", removed.version); + info!( + "Layout history: moving version {} to old_versions", + removed.version + ); + self.old_versions.push(removed); } else { break; } } + + while self.old_versions.len() > OLD_VERSION_COUNT { + let removed = self.old_versions.remove(0); + info!("Layout history: removing old_version {}", removed.version); + } } pub(crate) fn clamp_update_trackers(&mut self, nodes: &[Uuid]) { diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 00a2c017..08db44ca 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -193,12 +193,18 @@ mod v010 { use std::collections::BTreeMap; pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; + pub const OLD_VERSION_COUNT: usize = 5; + /// The history of cluster layouts, with trackers to keep a record /// of which nodes are up-to-date to current cluster data #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] pub struct LayoutHistory { /// The versions currently in use in the cluster pub versions: Vec, + /// At most 5 of the previous 
versions, not used by the garage_table + /// module, but usefull for the garage_block module to find data blocks + /// that have not yet been moved + pub old_versions: Vec, /// Update trackers pub update_trackers: UpdateTrackers, @@ -300,6 +306,7 @@ mod v010 { }; Self { versions: vec![version], + old_versions: vec![], update_trackers: UpdateTrackers { ack_map: update_tracker.clone(), sync_map: update_tracker.clone(), diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 7e9fabd7..e9a9143f 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -267,7 +267,7 @@ impl RpcHelper { // When there are errors, we start new requests to compensate. // Reorder requests to priorize closeness / low latency - let request_order = self.request_order(to); + let request_order = self.request_order(to.iter().copied()); let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false); // Build future for each request @@ -335,7 +335,7 @@ impl RpcHelper { } } - pub fn request_order(&self, nodes: &[Uuid]) -> Vec { + pub fn request_order(&self, nodes: impl Iterator) -> Vec { // Retrieve some status variables that we will use to sort requests let peer_list = self.0.fullmesh.get_peer_list(); let layout = self.0.layout.read().unwrap(); @@ -351,9 +351,8 @@ impl RpcHelper { // By sorting this vec, we priorize ourself, then nodes in the same zone, // and within a same zone we priorize nodes with the lowest latency. 
let mut nodes = nodes - .iter() .map(|to| { - let peer_zone = match layout.current().node_role(to) { + let peer_zone = match layout.current().node_role(&to) { Some(pc) => &pc.zone, None => "", }; @@ -363,10 +362,10 @@ impl RpcHelper { .and_then(|pi| pi.avg_ping) .unwrap_or_else(|| Duration::from_secs(10)); ( - *to != self.0.our_node_id, + to != self.0.our_node_id, peer_zone != our_zone, peer_avg_ping, - *to, + to, ) }) .collect::>(); -- cgit v1.2.3 From 78362140f5a177340a06690d9c9ea98bd831e7a4 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 27 Nov 2023 12:10:21 +0100 Subject: rpc: update system::health to take into account write sets for all partitions --- src/rpc/system.rs | 77 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 32 deletions(-) diff --git a/src/rpc/system.rs b/src/rpc/system.rs index dc127afb..c7d41ee4 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -1,5 +1,5 @@ //! Module containing structs related to membership management -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::io::{Read, Write}; use std::net::{IpAddr, SocketAddr}; use std::path::{Path, PathBuf}; @@ -418,48 +418,61 @@ impl System { } pub fn health(&self) -> ClusterHealth { - // TODO: adapt this function to take into account layout history - // when estimating cluster health, and not just use current layout - let quorum = self.replication_mode.write_quorum(); - let replication_factor = self.replication_factor; + // Gather information about running nodes. + // Technically, `nodes` contains currently running nodes, as well + // as nodes that this Garage process has been connected to at least + // once since it started. 
let nodes = self .get_known_nodes() .into_iter() .map(|n| (n.id, n)) .collect::>(); let connected_nodes = nodes.iter().filter(|(_, n)| n.is_up).count(); + let node_up = |x: &Uuid| nodes.get(x).map(|n| n.is_up).unwrap_or(false); + + // Acquire a rwlock read-lock to the current cluster layout + let layout = self.cluster_layout(); + + // Obtain information about nodes that have a role as storage nodes + // in one of the active layout versions + let mut storage_nodes = HashSet::::with_capacity(16); + for ver in layout.versions.iter() { + storage_nodes.extend( + ver.roles + .items() + .iter() + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_some())) + .map(|(n, _, _)| *n), + ) + } + let storage_nodes_ok = storage_nodes.iter().filter(|x| node_up(x)).count(); - let layout = self.cluster_layout(); // acquires a rwlock - - let storage_nodes = layout - .current() - .roles - .items() - .iter() - .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_some())) - .collect::>(); - let storage_nodes_ok = storage_nodes - .iter() - .filter(|(x, _, _)| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) - .count(); - + // Determine the number of partitions that have: + // - a quorum of up nodes for all write sets (i.e. are available) + // - for which all nodes in all write sets are up (i.e. 
are fully healthy) let partitions = layout.current().partitions().collect::>(); - let partitions_n_up = partitions - .iter() - .map(|(_, h)| { - let pn = layout.current().nodes_of(h, replication_factor); - pn.filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) - .count() - }) - .collect::>(); - let partitions_all_ok = partitions_n_up - .iter() - .filter(|c| **c == replication_factor) - .count(); - let partitions_quorum = partitions_n_up.iter().filter(|c| **c >= quorum).count(); + let mut partitions_quorum = 0; + let mut partitions_all_ok = 0; + for (_, hash) in partitions.iter() { + let write_sets = layout + .versions + .iter() + .map(|x| x.nodes_of(hash, x.replication_factor)); + let has_quorum = write_sets + .clone() + .all(|set| set.filter(|x| node_up(x)).count() >= quorum); + let all_ok = write_sets.clone().all(|mut set| set.all(|x| node_up(&x))); + if has_quorum { + partitions_quorum += 1; + } + if all_ok { + partitions_all_ok += 1; + } + } + // Determine overall cluster status let status = if partitions_quorum == partitions.len() && storage_nodes_ok == storage_nodes.len() { ClusterHealthStatus::Healthy -- cgit v1.2.3 From 539a920313fff010b8a4291aeef58ec9a14ee635 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 27 Nov 2023 13:18:59 +0100 Subject: cli: show when nodes are draining metadata --- src/garage/cli/cmd.rs | 172 +++++++++++++++++++++++++++++++------------------- 1 file changed, 108 insertions(+), 64 deletions(-) diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 1a054025..c99243b9 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::time::Duration; use format_table::format_table; @@ -62,35 +62,69 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let mut healthy_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; for adv in status.iter().filter(|adv| adv.is_up) { - 
match layout.current().roles.get(&adv.id) { - Some(NodeRoleV(Some(cfg))) => { - let data_avail = match &adv.status.data_disk_avail { - _ if cfg.capacity.is_none() => "N/A".into(), - Some((avail, total)) => { - let pct = (*avail as f64) / (*total as f64) * 100.; - let avail = bytesize::ByteSize::b(*avail); - format!("{} ({:.1}%)", avail, pct) - } - None => "?".into(), - }; + if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { + let data_avail = match &adv.status.data_disk_avail { + _ if cfg.capacity.is_none() => "N/A".into(), + Some((avail, total)) => { + let pct = (*avail as f64) / (*total as f64) * 100.; + let avail = bytesize::ByteSize::b(*avail); + format!("{} ({:.1}%)", avail, pct) + } + None => "?".into(), + }; + healthy_nodes.push(format!( + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", + id = adv.id, + host = adv.status.hostname, + addr = adv.addr, + tags = cfg.tags.join(","), + zone = cfg.zone, + capacity = cfg.capacity_string(), + data_avail = data_avail, + )); + } else { + let prev_role = layout + .versions + .iter() + .rev() + .find_map(|x| match x.roles.get(&adv.id) { + Some(NodeRoleV(Some(cfg))) => Some(cfg), + _ => None, + }); + let historic_role = + layout + .old_versions + .iter() + .rev() + .find_map(|x| match x.roles.get(&adv.id) { + Some(NodeRoleV(Some(cfg))) => Some(cfg), + _ => None, + }); + if let Some(cfg) = prev_role { healthy_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...", id = adv.id, host = adv.status.hostname, addr = adv.addr, tags = cfg.tags.join(","), zone = cfg.zone, - capacity = cfg.capacity_string(), - data_avail = data_avail, )); - } - _ => { + } else if let Some(cfg) = historic_role { + healthy_nodes.push(format!( + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tremoved, metadata drained", + id = adv.id, + host = adv.status.hostname, + addr = adv.addr, + tags = 
cfg.tags.join(","), + zone = cfg.zone, + )); + } else { let new_role = match layout.staging.get().roles.get(&adv.id) { - Some(NodeRoleV(Some(_))) => "(pending)", + Some(NodeRoleV(Some(_))) => "pending...", _ => "NO ROLE ASSIGNED", }; healthy_nodes.push(format!( - "{id:?}\t{h}\t{addr}\t{new_role}", + "{id:?}\t{h}\t{addr}\t\t\t{new_role}", id = adv.id, h = adv.status.hostname, addr = adv.addr, @@ -101,55 +135,65 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> } format_table(healthy_nodes); - let status_keys = status.iter().map(|adv| adv.id).collect::>(); - let failure_case_1 = status.iter().any(|adv| { - !adv.is_up - && matches!( - layout.current().roles.get(&adv.id), - Some(NodeRoleV(Some(_))) - ) - }); - let failure_case_2 = layout - .current() - .roles - .items() + // Determine which nodes are unhealthy and print that to stdout + let status_map = status .iter() - .any(|(id, _, v)| !status_keys.contains(id) && v.0.is_some()); - if failure_case_1 || failure_case_2 { - println!("\n==== FAILED NODES ===="); - let mut failed_nodes = - vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; - for adv in status.iter().filter(|adv| !adv.is_up) { - if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { - let tf = timeago::Formatter::new(); - failed_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", - id = adv.id, - host = adv.status.hostname, - addr = adv.addr, - tags = cfg.tags.join(","), - zone = cfg.zone, - capacity = cfg.capacity_string(), - last_seen = adv - .last_seen_secs_ago - .map(|s| tf.convert(Duration::from_secs(s))) - .unwrap_or_else(|| "never seen".into()), - )); + .map(|adv| (adv.id, adv)) + .collect::>(); + + let tf = timeago::Formatter::new(); + let mut failed_nodes = + vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; + let mut listed = HashSet::new(); + for ver in layout.versions.iter().rev() { + for (node, _, role) in 
ver.roles.items().iter() { + let cfg = match role { + NodeRoleV(Some(role)) if role.capacity.is_some() => role, + _ => continue, + }; + + if listed.contains(node) { + continue; } - } - for (id, _, role_v) in layout.current().roles.items().iter() { - if let NodeRoleV(Some(cfg)) = role_v { - if !status_keys.contains(id) { - failed_nodes.push(format!( - "{id:?}\t??\t??\t[{tags}]\t{zone}\t{capacity}\tnever seen", - id = id, - tags = cfg.tags.join(","), - zone = cfg.zone, - capacity = cfg.capacity_string(), - )); - } + listed.insert(*node); + + let adv = status_map.get(node); + if adv.map(|x| x.is_up).unwrap_or(false) { + continue; } + + // Node is in a layout version, is not a gateway node, and is not up: + // it is in a failed state, add proper line to the output + let (host, addr, last_seen) = match adv { + Some(adv) => ( + adv.status.hostname.as_str(), + adv.addr.to_string(), + adv.last_seen_secs_ago + .map(|s| tf.convert(Duration::from_secs(s))) + .unwrap_or_else(|| "never seen".into()), + ), + None => ("??", "??".into(), "never seen".into()), + }; + let capacity = if ver.version == layout.current().version { + cfg.capacity_string() + } else { + "draining metadata...".to_string() + }; + failed_nodes.push(format!( + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", + id = node, + host = host, + addr = addr, + tags = cfg.tags.join(","), + zone = cfg.zone, + capacity = capacity, + last_seen = last_seen, + )); } + } + + if failed_nodes.len() > 1 { + println!("\n==== FAILED NODES ===="); format_table(failed_nodes); } -- cgit v1.2.3 From 11e6fef93ce3ca56584fc99223b71da77d320dd7 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 27 Nov 2023 16:17:41 +0100 Subject: cli: add layout history and layout assume-sync commands --- src/garage/cli/cmd.rs | 14 +++++- src/garage/cli/layout.rs | 111 ++++++++++++++++++++++++++++++++++++++++++++++ src/garage/cli/structs.rs | 16 +++++++ src/rpc/layout/schema.rs | 9 +++- 4 files changed, 147 insertions(+), 3 
deletions(-) diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index c99243b9..08ed00cf 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -135,13 +135,14 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> } format_table(healthy_nodes); - // Determine which nodes are unhealthy and print that to stdout + // Determine which nodes are unhealthy and print that to stdout let status_map = status .iter() .map(|adv| (adv.id, adv)) .collect::>(); let tf = timeago::Formatter::new(); + let mut drain_msg = false; let mut failed_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; let mut listed = HashSet::new(); @@ -163,7 +164,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> } // Node is in a layout version, is not a gateway node, and is not up: - // it is in a failed state, add proper line to the output + // it is in a failed state, add proper line to the output let (host, addr, last_seen) = match adv { Some(adv) => ( adv.status.hostname.as_str(), @@ -177,6 +178,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let capacity = if ver.version == layout.current().version { cfg.capacity_string() } else { + drain_msg = true; "draining metadata...".to_string() }; failed_nodes.push(format!( @@ -195,6 +197,14 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> if failed_nodes.len() > 1 { println!("\n==== FAILED NODES ===="); format_table(failed_nodes); + if drain_msg { + println!(); + println!("Your cluster is expecting to drain data from nodes that are currently unavailable."); + println!("If these nodes are definitely dead, please review the layout history with"); + println!( + "`garage layout history` and use `garage layout assume-sync` to force progress." 
+ ); + } } if print_staging_role_changes(&layout) { diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 0be8278f..3c7843bd 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -32,6 +32,10 @@ pub async fn cli_layout_command_dispatch( LayoutOperation::Config(config_opt) => { cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await } + LayoutOperation::History => cmd_layout_history(system_rpc_endpoint, rpc_host).await, + LayoutOperation::AssumeSync(assume_sync_opt) => { + cmd_layout_assume_sync(system_rpc_endpoint, rpc_host, assume_sync_opt).await + } } } @@ -311,6 +315,113 @@ pub async fn cmd_config_layout( Ok(()) } +pub async fn cmd_layout_history( + rpc_cli: &Endpoint, + rpc_host: NodeID, +) -> Result<(), Error> { + let layout = fetch_layout(rpc_cli, rpc_host).await?; + let min_stored = layout.min_stored(); + + println!("==== LAYOUT HISTORY ===="); + let mut table = vec!["Version\tStatus\tStorage nodes\tGateway nodes".to_string()]; + for ver in layout + .versions + .iter() + .rev() + .chain(layout.old_versions.iter().rev()) + { + let status = if ver.version == layout.current().version { + "current" + } else if ver.version >= min_stored { + "draining" + } else { + "historical" + }; + table.push(format!( + "#{}\t{}\t{}\t{}", + ver.version, + status, + ver.roles + .items() + .iter() + .filter(|(_, _, x)| matches!(x, NodeRoleV(Some(c)) if c.capacity.is_some())) + .count(), + ver.roles + .items() + .iter() + .filter(|(_, _, x)| matches!(x, NodeRoleV(Some(c)) if c.capacity.is_none())) + .count(), + )); + } + format_table(table); + + println!(); + println!("==== UPDATE TRACKERS ===="); + println!("This is the internal data that Garage stores to know which nodes have what data."); + println!(); + let mut table = vec!["Node\tAck\tSync\tSync_ack".to_string()]; + let all_nodes = layout.get_all_nodes(); + for node in all_nodes.iter() { + table.push(format!( + "{:?}\t#{}\t#{}\t#{}", + node, + 
layout.update_trackers.ack_map.get(node), + layout.update_trackers.sync_map.get(node), + layout.update_trackers.sync_ack_map.get(node), + )); + } + table[1..].sort(); + format_table(table); + + if layout.versions.len() > 1 { + println!(); + println!( + "If some nodes are not catching up to the latest layout version in the update tracker," + ); + println!("it might be because they are offline or unable to complete a sync successfully."); + println!( + "You may force progress using `garage layout assume-sync --version {}`", + layout.current().version + ); + } + + Ok(()) +} + +pub async fn cmd_layout_assume_sync( + rpc_cli: &Endpoint, + rpc_host: NodeID, + opt: AssumeSyncOpt, +) -> Result<(), Error> { + let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + + let min_v = layout.min_stored(); + if opt.version <= min_v || opt.version > layout.current().version { + return Err(Error::Message(format!( + "Invalid version, you may use the following version numbers: {}", + (min_v + 1..=layout.current().version) + .map(|x| x.to_string()) + .collect::>() + .join(" ") + ))); + } + + let all_nodes = layout.get_all_nodes(); + for node in all_nodes.iter() { + layout.update_trackers.ack_map.set_max(*node, opt.version); + layout.update_trackers.sync_map.set_max(*node, opt.version); + layout + .update_trackers + .sync_ack_map + .set_max(*node, opt.version); + } + + send_layout(rpc_cli, rpc_host, layout).await?; + println!("Success."); + + Ok(()) +} + // --- utility --- pub async fn fetch_layout( diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 3badc447..c4b400f4 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -112,6 +112,14 @@ pub enum LayoutOperation { /// Revert staged changes to cluster layout #[structopt(name = "revert", version = garage_version())] Revert(RevertLayoutOpt), + + /// View the history of layouts in the cluster + #[structopt(name = "history", version = garage_version())] + History, + + /// Assume all nodes are 
synchronized up to a certain layout version + #[structopt(name = "assume-sync", version = garage_version())] + AssumeSync(AssumeSyncOpt), } #[derive(StructOpt, Debug)] @@ -169,6 +177,14 @@ pub struct RevertLayoutOpt { pub(crate) yes: bool, } +#[derive(StructOpt, Debug)] +pub struct AssumeSyncOpt { + /// Version number of the layout to assume is currently up-to-date. + /// This will generally be the current layout version. + #[structopt(long = "version")] + pub(crate) version: u64, +} + #[derive(Serialize, Deserialize, StructOpt, Debug)] pub enum BucketOperation { /// List buckets diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 08db44ca..cb36297d 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -391,7 +391,10 @@ impl UpdateTracker { changed } - pub(crate) fn set_max(&mut self, peer: Uuid, value: u64) -> bool { + /// This bumps the update tracker for a given node up to the specified value. + /// This has potential impacts on the correctness of Garage and should only + /// be used in very specific circumstances. 
+ pub fn set_max(&mut self, peer: Uuid, value: u64) -> bool { match self.0.get_mut(&peer) { Some(e) if *e < value => { *e = value; @@ -412,6 +415,10 @@ impl UpdateTracker { .min() .unwrap_or(min_version) } + + pub fn get(&self, node: &Uuid) -> u64 { + self.0.get(node).copied().unwrap_or(0) + } } impl UpdateTrackers { -- cgit v1.2.3 From c539077d30809c9d2232aa0fe107a9652dcb7c26 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 27 Nov 2023 16:20:19 +0100 Subject: cli: remove historic layout info from status --- src/garage/cli/cmd.rs | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 08ed00cf..4d1306b6 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -91,15 +91,6 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> Some(NodeRoleV(Some(cfg))) => Some(cfg), _ => None, }); - let historic_role = - layout - .old_versions - .iter() - .rev() - .find_map(|x| match x.roles.get(&adv.id) { - Some(NodeRoleV(Some(cfg))) => Some(cfg), - _ => None, - }); if let Some(cfg) = prev_role { healthy_nodes.push(format!( "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...", @@ -109,15 +100,6 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> tags = cfg.tags.join(","), zone = cfg.zone, )); - } else if let Some(cfg) = historic_role { - healthy_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tremoved, metadata drained", - id = adv.id, - host = adv.status.hostname, - addr = adv.addr, - tags = cfg.tags.join(","), - zone = cfg.zone, - )); } else { let new_role = match layout.staging.get().roles.get(&adv.id) { Some(NodeRoleV(Some(_))) => "pending...", -- cgit v1.2.3 From 539af6eac434bd94acbcabcc5bb5c10450b71c5d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 28 Nov 2023 11:12:39 +0100 Subject: rpc helper: write comments + small refactoring of tracing --- src/rpc/rpc_helper.rs | 105 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file 
changed, 88 insertions(+), 17 deletions(-) diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index e9a9143f..f71f5ae7 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -129,6 +129,12 @@ impl RpcHelper { N: IntoReq + Send, H: StreamingEndpointHandler, { + let tracer = opentelemetry::global::tracer("garage"); + let span_name = format!("RPC [{}] to {:?}", endpoint.path(), to); + let mut span = tracer.start(span_name); + span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id))); + span.set_attribute(KeyValue::new("to", format!("{:?}", to))); + let metric_tags = [ KeyValue::new("rpc_endpoint", endpoint.path().to_string()), KeyValue::new("from", format!("{:?}", self.0.our_node_id)), @@ -140,6 +146,7 @@ impl RpcHelper { let node_id = to.into(); let rpc_call = endpoint .call_streaming(&node_id, msg, strat.rs_priority) + .with_context(Context::current_with_span(span)) .record_duration(&self.0.metrics.rpc_duration, &metric_tags); let timeout = async { @@ -182,12 +189,17 @@ impl RpcHelper { N: IntoReq, H: StreamingEndpointHandler, { + let tracer = opentelemetry::global::tracer("garage"); + let span_name = format!("RPC [{}] call_many {} nodes", endpoint.path(), to.len()); + let span = tracer.start(span_name); + let msg = msg.into_req().map_err(netapp::error::Error::from)?; let resps = join_all( to.iter() .map(|to| self.call(endpoint, *to, msg.clone(), strat)), ) + .with_context(Context::current_with_span(span)) .await; Ok(to .iter() @@ -219,6 +231,22 @@ impl RpcHelper { /// Make a RPC call to multiple servers, returning either a Vec of responses, /// or an error if quorum could not be reached due to too many errors + /// + /// If RequestStrategy has send_all_at_once set, then all requests will be + /// sent at once, and `try_call_many` will return as soon as a quorum of + /// responses is achieved, dropping and cancelling the remaining requests. 
+ /// + /// Otherwise, `quorum` requests will be sent at the same time, and if an + /// error response is received, a new request will be sent to replace it. + /// The ordering of nodes to which requests are sent is determined by + /// the `RpcHelper::request_order` function, which takes into account + /// parameters such as node zones and measured ping values. + /// + /// In both cases, the basic contract of this function is that even in the + /// absence of failures, the RPC call might not be driven to completion + /// on all of the specified nodes. It is therefore unfit for broadcast + /// write operations where we expect all nodes to successfully store + /// the written data. pub async fn try_call_many( &self, endpoint: &Arc>, @@ -235,7 +263,12 @@ impl RpcHelper { let quorum = strategy.rs_quorum.unwrap_or(to.len()); let tracer = opentelemetry::global::tracer("garage"); - let span_name = format!("Read RPC {} to {} of {}", endpoint.path(), quorum, to.len()); + let span_name = format!( + "RPC [{}] try_call_many (quorum {}/{})", + endpoint.path(), + quorum, + to.len() + ); let mut span = tracer.start(span_name); span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id))); @@ -266,6 +299,10 @@ impl RpcHelper { // to reach a quorum, priorizing nodes with the lowest latency. // When there are errors, we start new requests to compensate. + // TODO: this could be made more aggressive, e.g. if after 2x the + // average ping of a given request, the response is not yet received, + // preemptively send an additional request to any remaining nodes.
+ // Reorder requests to priorize closeness / low latency let request_order = self.request_order(to.iter().copied()); let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false); @@ -278,9 +315,7 @@ impl RpcHelper { let self2 = self.clone(); let msg = msg.clone(); let endpoint2 = endpoint.clone(); - (to, async move { - self2.call(&endpoint2, to, msg, strategy).await - }) + async move { self2.call(&endpoint2, to, msg, strategy).await } }); // Vectors in which success results and errors will be collected @@ -296,10 +331,8 @@ impl RpcHelper { // If the current set of requests that are running is not enough to possibly // reach quorum, start some new requests. while send_all_at_once || successes.len() + resp_stream.len() < quorum { - if let Some((req_to, fut)) = requests.next() { - let tracer = opentelemetry::global::tracer("garage"); - let span = tracer.start(format!("RPC to {:?}", req_to)); - resp_stream.push(fut.with_context(Context::current_with_span(span))); + if let Some(fut) = requests.next() { + resp_stream.push(fut) } else { break; } @@ -379,6 +412,25 @@ impl RpcHelper { .collect::>() } + /// Make a RPC call to multiple servers, returning either a Vec of responses, + /// or an error if quorum could not be reached due to too many errors + /// + /// Contrary to try_call_many, this function is especially made for broadcast + /// write operations. In particular: + /// + /// - The requests are sent to all specified nodes as soon as `try_write_many_sets` + /// is invoked. + /// + /// - When `try_write_many_sets` returns, all remaining requests that haven't + /// completed move to a background task so that they have a chance to + /// complete successfully if there are no failures. + /// + /// In addition, the nodes to which requests should be sent are divided in + /// "quorum sets", and `try_write_many_sets` only returns once a quorum + /// has been validated in each set.
This is used in the case of cluster layout + /// changes, where data has to be written both in the old layout and in the + /// new one as long as all nodes have not successfully transitioned and + /// moved all data to the new layout. pub async fn try_write_many_sets( &self, endpoint: &Arc>, @@ -394,11 +446,11 @@ impl RpcHelper { { let quorum = strategy .rs_quorum - .expect("internal error: missing quroum in try_write_many_sets"); + .expect("internal error: missing quorum value in try_write_many_sets"); let tracer = opentelemetry::global::tracer("garage"); let span_name = format!( - "Write RPC {} (quorum {} in {} sets)", + "RPC [{}] try_write_many_sets (quorum {} in {} sets)", endpoint.path(), quorum, to_sets.len() @@ -430,6 +482,8 @@ impl RpcHelper { { let msg = msg.into_req().map_err(netapp::error::Error::from)?; + // Peers may appear in many quorum sets. Here, build a list of peers, + // mapping to the index of the quorum sets in which they appear. let mut peers = HashMap::>::new(); for (i, set) in to_sets.iter().enumerate() { for peer in set.iter() { @@ -437,24 +491,30 @@ impl RpcHelper { } } + // Send one request to each peer of the quorum sets let requests = peers.iter().map(|(peer, _)| { let self2 = self.clone(); let msg = msg.clone(); let endpoint2 = endpoint.clone(); let to = *peer; - let tracer = opentelemetry::global::tracer("garage"); - let span = tracer.start(format!("RPC to {:?}", to)); - let fut = async move { (to, self2.call(&endpoint2, to, msg, strategy).await) }; - fut.with_context(Context::current_with_span(span)) + async move { (to, self2.call(&endpoint2, to, msg, strategy).await) } }); let mut resp_stream = requests.collect::>(); + // Success and error responses will be collected in these two vectors let mut successes = vec![]; let mut errors = vec![]; + // `set_counters` is used to keep track of how many success and error + // responses are received within each quorum set.
When a node returns + // its response, it counts as a success/an error for all of the quorum + // sets which it is part of. let mut set_counters = vec![(0, 0); to_sets.len()]; + // Drive requests to completion while let Some((node, resp)) = resp_stream.next().await { + // Store the response in the correct vector and increment the + // appropriate counters match resp { Ok(msg) => { for set in peers.get(&node).unwrap().iter() { @@ -470,9 +530,8 @@ impl RpcHelper { } } + // If we have a quorum of ok in all quorum sets, then it's a success! if set_counters.iter().all(|(ok_cnt, _)| *ok_cnt >= quorum) { - // Success - // Continue all other requets in background tokio::spawn(async move { resp_stream.collect::)>>().await; @@ -481,16 +540,28 @@ impl RpcHelper { return Ok(successes); } + // If there is a quorum set for which too many errors were received, + // we know it's impossible to get a quorum, so return immediately. if set_counters .iter() .enumerate() .any(|(i, (_, err_cnt))| err_cnt + quorum > to_sets[i].len()) { - // Too many errors in this set, we know we won't get a quorum break; } } + // At this point, there is no quorum and we know that a quorum + // will never be achieved. Currently, we drop all remaining requests. + // Should we still move them to background so that they can continue + // for non-failed nodes? Not doing so has no impact on correctness, + // but it means that more cancellation messages will be sent. Idk. + // (When an in-progress request future is dropped, Netapp automatically + // sends a cancellation message to the remote node to inform it that + // the result is no longer needed. In turn, if the remote node receives + // the cancellation message in time, it interrupts the task of the + // running request handler.)
+ // Failure, could not get quorum let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); Err(Error::Quorum( -- cgit v1.2.3 From c04dd8788a3764da2f307b1d10c2d56b7b0e4a61 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 28 Nov 2023 14:25:04 +0100 Subject: admin: more info in admin GetClusterStatus --- doc/drafts/admin-api.md | 139 +++++++++++++++++++++++++---------------------- src/api/admin/cluster.rs | 122 ++++++++++++++++++++++++++++++++++------- src/garage/admin/mod.rs | 2 +- src/garage/cli/cmd.rs | 9 +-- src/rpc/system.rs | 12 ++-- 5 files changed, 190 insertions(+), 94 deletions(-) diff --git a/doc/drafts/admin-api.md b/doc/drafts/admin-api.md index 411f6418..274bd5c4 100644 --- a/doc/drafts/admin-api.md +++ b/doc/drafts/admin-api.md @@ -69,8 +69,8 @@ Example response body: ```json { - "node": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f", - "garageVersion": "git:v0.9.0-dev", + "node": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df", + "garageVersion": "v0.10.0", "garageFeatures": [ "k2v", "sled", @@ -81,83 +81,92 @@ Example response body: ], "rustVersion": "1.68.0", "dbEngine": "LMDB (using Heed crate)", - "knownNodes": [ + "layoutVersion": 5, + "nodes": [ { - "id": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f", - "addr": "10.0.0.11:3901", + "id": "62b218d848e86a64f7fe1909735f29a4350547b54c4b204f91246a14eb0a1a8c", + "role": { + "id": "62b218d848e86a64f7fe1909735f29a4350547b54c4b204f91246a14eb0a1a8c", + "zone": "dc1", + "capacity": 100000000000, + "tags": [] + }, + "addr": "10.0.0.3:3901", + "hostname": "node3", "isUp": true, - "lastSeenSecsAgo": 9, - "hostname": "node1" + "lastSeenSecsAgo": 12, + "draining": false, + "dataPartition": { + "available": 660270088192, + "total": 873862266880 + }, + "metadataPartition": { + "available": 660270088192, + "total": 873862266880 + } }, { - "id": "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff", - "addr": "10.0.0.12:3901", + 
"id": "a11c7cf18af297379eff8688360155fe68d9061654449ba0ce239252f5a7487f", + "role": null, + "addr": "10.0.0.2:3901", + "hostname": "node2", "isUp": true, - "lastSeenSecsAgo": 1, - "hostname": "node2" + "lastSeenSecsAgo": 11, + "draining": true, + "dataPartition": { + "available": 660270088192, + "total": 873862266880 + }, + "metadataPartition": { + "available": 660270088192, + "total": 873862266880 + } }, { - "id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27", - "addr": "10.0.0.21:3901", + "id": "a235ac7695e0c54d7b403943025f57504d500fdcc5c3e42c71c5212faca040a2", + "role": { + "id": "a235ac7695e0c54d7b403943025f57504d500fdcc5c3e42c71c5212faca040a2", + "zone": "dc1", + "capacity": 100000000000, + "tags": [] + }, + "addr": "127.0.0.1:3904", + "hostname": "lindy", "isUp": true, - "lastSeenSecsAgo": 7, - "hostname": "node3" + "lastSeenSecsAgo": 2, + "draining": false, + "dataPartition": { + "available": 660270088192, + "total": 873862266880 + }, + "metadataPartition": { + "available": 660270088192, + "total": 873862266880 + } }, { - "id": "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b", - "addr": "10.0.0.22:3901", - "isUp": true, - "lastSeenSecsAgo": 1, - "hostname": "node4" - } - ], - "layout": { - "version": 12, - "roles": [ - { - "id": "ec79480e0ce52ae26fd00c9da684e4fa56658d9c64cdcecb094e936de0bfe71f", + "id": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df", + "role": { + "id": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df", "zone": "dc1", - "capacity": 10737418240, - "tags": [ - "node1" - ] + "capacity": 100000000000, + "tags": [] }, - { - "id": "4a6ae5a1d0d33bf895f5bb4f0a418b7dc94c47c0dd2eb108d1158f3c8f60b0ff", - "zone": "dc1", - "capacity": 10737418240, - "tags": [ - "node2" - ] + "addr": "10.0.0.1:3901", + "hostname": "node1", + "isUp": true, + "lastSeenSecsAgo": 3, + "draining": false, + "dataPartition": { + "available": 660270088192, + "total": 873862266880 }, - { - "id": 
"23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27", - "zone": "dc2", - "capacity": 10737418240, - "tags": [ - "node3" - ] + "metadataPartition": { + "available": 660270088192, + "total": 873862266880 } - ], - "stagedRoleChanges": [ - { - "id": "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b", - "remove": false, - "zone": "dc2", - "capacity": 10737418240, - "tags": [ - "node4" - ] - } - { - "id": "23ffd0cdd375ebff573b20cc5cef38996b51c1a7d6dbcf2c6e619876e507cf27", - "remove": true, - "zone": null, - "capacity": null, - "tags": null, - } - ] - } + } + ] } ``` diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 593bd778..3ce1b254 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::net::SocketAddr; use std::sync::Arc; @@ -15,25 +16,95 @@ use crate::admin::error::*; use crate::helpers::{json_ok_response, parse_json_body}; pub async fn handle_get_cluster_status(garage: &Arc) -> Result, Error> { + let layout = garage.system.cluster_layout(); + let mut nodes = garage + .system + .get_known_nodes() + .into_iter() + .map(|i| { + ( + i.id, + NodeResp { + id: hex::encode(i.id), + addr: Some(i.addr), + hostname: i.status.hostname, + is_up: i.is_up, + last_seen_secs_ago: i.last_seen_secs_ago, + data_partition: i + .status + .data_disk_avail + .map(|(avail, total)| FreeSpaceResp { + available: avail, + total, + }), + metadata_partition: i.status.meta_disk_avail.map(|(avail, total)| { + FreeSpaceResp { + available: avail, + total, + } + }), + ..Default::default() + }, + ) + }) + .collect::>(); + + for (id, _, role) in layout.current().roles.items().iter() { + if let layout::NodeRoleV(Some(r)) = role { + let role = NodeRoleResp { + id: hex::encode(id), + zone: r.zone.to_string(), + capacity: r.capacity, + tags: r.tags.clone(), + }; + match nodes.get_mut(id) { + None => { + nodes.insert( + *id, + NodeResp { + id: hex::encode(id), + role: Some(role), + 
..Default::default() + }, + ); + } + Some(n) => { + if n.role.is_none() { + n.role = Some(role); + } + } + } + } + } + + for ver in layout.versions.iter().rev().skip(1) { + for (id, _, role) in ver.roles.items().iter() { + if let layout::NodeRoleV(Some(r)) = role { + if !nodes.contains_key(id) && r.capacity.is_some() { + nodes.insert( + *id, + NodeResp { + id: hex::encode(id), + draining: true, + ..Default::default() + }, + ); + } + } + } + } + + let mut nodes = nodes.into_iter().map(|(_, v)| v).collect::>(); + nodes.sort_by(|x, y| x.id.cmp(&y.id)); + let res = GetClusterStatusResponse { node: hex::encode(garage.system.id), garage_version: garage_util::version::garage_version(), garage_features: garage_util::version::garage_features(), rust_version: garage_util::version::rust_version(), db_engine: garage.db.engine(), - known_nodes: garage - .system - .get_known_nodes() - .into_iter() - .map(|i| KnownNodeResp { - id: hex::encode(i.id), - addr: i.addr, - is_up: i.is_up, - last_seen_secs_ago: i.last_seen_secs_ago, - hostname: i.status.hostname, - }) - .collect(), - layout: format_cluster_layout(&garage.system.cluster_layout()), + layout_version: layout.current().version, + nodes, }; Ok(json_ok_response(&res)?) 
@@ -157,8 +228,8 @@ struct GetClusterStatusResponse { garage_features: Option<&'static [&'static str]>, rust_version: &'static str, db_engine: String, - known_nodes: Vec, - layout: GetClusterLayoutResponse, + layout_version: u64, + nodes: Vec, } #[derive(Serialize)] @@ -192,14 +263,27 @@ struct NodeRoleResp { tags: Vec, } -#[derive(Serialize)] +#[derive(Serialize, Default)] +#[serde(rename_all = "camelCase")] +struct FreeSpaceResp { + available: u64, + total: u64, +} + +#[derive(Serialize, Default)] #[serde(rename_all = "camelCase")] -struct KnownNodeResp { +struct NodeResp { id: String, - addr: SocketAddr, + role: Option, + addr: Option, + hostname: Option, is_up: bool, last_seen_secs_ago: Option, - hostname: String, + draining: bool, + #[serde(skip_serializing_if = "Option::is_none")] + data_partition: Option, + #[serde(skip_serializing_if = "Option::is_none")] + metadata_partition: Option, } // ---- update functions ---- diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index 77918a0f..da4226cf 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -295,7 +295,7 @@ impl AdminRpcHandler { let info = node_info.get(id); let status = info.map(|x| &x.status); let role = layout.current().roles.get(id).and_then(|x| x.0.as_ref()); - let hostname = status.map(|x| x.hostname.as_str()).unwrap_or("?"); + let hostname = status.and_then(|x| x.hostname.as_deref()).unwrap_or("?"); let zone = role.map(|x| x.zone.as_str()).unwrap_or("?"); let capacity = role .map(|x| x.capacity_string()) diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 4d1306b6..c7f0ad2b 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -62,6 +62,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let mut healthy_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; for adv in status.iter().filter(|adv| adv.is_up) { + let host = adv.status.hostname.as_deref().unwrap_or("?"); if let 
Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { let data_avail = match &adv.status.data_disk_avail { _ if cfg.capacity.is_none() => "N/A".into(), @@ -75,7 +76,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> healthy_nodes.push(format!( "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", id = adv.id, - host = adv.status.hostname, + host = host, addr = adv.addr, tags = cfg.tags.join(","), zone = cfg.zone, @@ -95,7 +96,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> healthy_nodes.push(format!( "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...", id = adv.id, - host = adv.status.hostname, + host = host, addr = adv.addr, tags = cfg.tags.join(","), zone = cfg.zone, @@ -108,7 +109,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> healthy_nodes.push(format!( "{id:?}\t{h}\t{addr}\t\t\t{new_role}", id = adv.id, - h = adv.status.hostname, + h = host, addr = adv.addr, new_role = new_role, )); @@ -149,7 +150,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> // it is in a failed state, add proper line to the output let (host, addr, last_seen) = match adv { Some(adv) => ( - adv.status.hostname.as_str(), + adv.status.hostname.as_deref().unwrap_or("?"), adv.addr.to_string(), adv.last_seen_secs_ago .map(|s| tf.convert(Duration::from_secs(s))) diff --git a/src/rpc/system.rs b/src/rpc/system.rs index c7d41ee4..be4aefa2 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -126,7 +126,7 @@ pub struct System { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct NodeStatus { /// Hostname of the node - pub hostname: String, + pub hostname: Option, /// Replication factor configured on the node pub replication_factor: usize, @@ -765,9 +765,11 @@ impl EndpointHandler for System { impl NodeStatus { fn initial(replication_factor: usize, layout_manager: &LayoutManager) -> Self { NodeStatus { - hostname: gethostname::gethostname() - .into_string() - 
.unwrap_or_else(|_| "".to_string()), + hostname: Some( + gethostname::gethostname() + .into_string() + .unwrap_or_else(|_| "".to_string()), + ), replication_factor, layout_digest: layout_manager.layout().digest(), meta_disk_avail: None, @@ -777,7 +779,7 @@ impl NodeStatus { fn unknown() -> Self { NodeStatus { - hostname: "?".to_string(), + hostname: None, replication_factor: 0, layout_digest: Default::default(), meta_disk_avail: None, -- cgit v1.2.3 From c8356a91d9bf1d1488ec288099f2a55a1019918f Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 7 Dec 2023 10:30:26 +0100 Subject: layout updates: fix the set of nodes among which minima are calculated --- src/rpc/layout/helper.rs | 25 +++++++++++++++++++++---- src/rpc/layout/history.rs | 8 +++++--- src/rpc/layout/schema.rs | 2 +- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 5d159f3e..881a039e 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -51,20 +51,37 @@ impl LayoutHelper { pub fn new(mut layout: LayoutHistory, mut ack_lock: HashMap) -> Self { layout.cleanup_old_versions(); + let all_nodes = layout.get_all_nodes(); let all_nongateway_nodes = layout.get_all_nongateway_nodes(); - layout.clamp_update_trackers(&all_nongateway_nodes); + + layout.clamp_update_trackers(&all_nodes); let min_version = layout.min_stored(); + + // ack_map_min is the minimum value of ack_map among all nodes + // in the cluster (gateway, non-gateway, current and previous layouts). + // It is the highest layout version which all of these nodes have + // acknowledged, indicating that they are aware of it and are no + // longer processing write operations that did not take it into account. 
let ack_map_min = layout .update_trackers .ack_map - .min(&all_nongateway_nodes, min_version); + .min_among(&all_nodes, min_version); + + // sync_map_min is the minimum value of sync_map among all storage nodes + // in the cluster (non-gateway nodes only, current and previous layouts). + // It is the highest layout version for which we know that all relevant + // storage nodes have fulfilled a sync, and therefore it is safe to + // use a read quorum within that layout to ensure consistency. + // Gateway nodes are excluded here because they hold no relevant data + // (they store the bucket and access key tables, but we don't have + // consistency on those). + // TODO: this value could take quorums into account instead. let sync_map_min = layout .update_trackers .sync_map - .min(&all_nongateway_nodes, min_version); + .min_among(&all_nongateway_nodes, min_version); - let all_nodes = layout.get_all_nodes(); let trackers_hash = layout.calculate_trackers_hash(); let staging_hash = layout.calculate_staging_hash(); diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 7d4a1b48..c448ac24 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -77,14 +77,16 @@ impl LayoutHistory { } // If there are old versions that no one is reading from anymore, - // remove them + // remove them (keep them in self.old_versions). + // ASSUMPTION: we only care about where nodes in the current layout version + // are reading from, as we assume older nodes are being discarded.
while self.versions.len() > 1 { - let all_nongateway_nodes = self.get_all_nongateway_nodes(); + let current_nodes = &self.current().node_id_vec; let min_version = self.min_stored(); let sync_ack_map_min = self .update_trackers .sync_ack_map - .min(&all_nongateway_nodes, min_version); + .min_among(&current_nodes, min_version); if self.min_stored() < sync_ack_map_min { let removed = self.versions.remove(0); info!( diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index cb36297d..49e84420 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -408,7 +408,7 @@ impl UpdateTracker { } } - pub(crate) fn min(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 { + pub(crate) fn min_among(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 { storage_nodes .iter() .map(|x| self.0.get(x).copied().unwrap_or(min_version)) -- cgit v1.2.3 From 95eb13eb08d517d328e3c8aeb222440a27211ee9 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 7 Dec 2023 10:55:15 +0100 Subject: rpc: refactor result tracking for quorum sets --- src/rpc/layout/manager.rs | 6 ++ src/rpc/rpc_helper.rs | 147 +++++++++++++++++++++++++----------- src/table/replication/parameters.rs | 2 +- src/table/table.rs | 54 +++---------- 4 files changed, 121 insertions(+), 88 deletions(-) diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index c65831a2..17465019 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -352,6 +352,12 @@ impl AsRef for WriteLock { } } +impl AsMut for WriteLock { + fn as_mut(&mut self) -> &mut T { + &mut self.value + } +} + impl Drop for WriteLock { fn drop(&mut self) { let layout = self.layout_manager.layout(); // acquire read lock diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index f71f5ae7..c6dcbe75 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -484,15 +484,10 @@ impl RpcHelper { // Peers may appear in many quorum sets.
Here, build a list of peers, // mapping to the index of the quorum sets in which they appear. - let mut peers = HashMap::>::new(); - for (i, set) in to_sets.iter().enumerate() { - for peer in set.iter() { - peers.entry(*peer).or_default().push(i); - } - } + let mut result_tracker = QuorumSetResultTracker::new(to_sets, quorum); // Send one request to each peer of the quorum sets - let requests = peers.iter().map(|(peer, _)| { + let requests = result_tracker.nodes.iter().map(|(peer, _)| { let self2 = self.clone(); let msg = msg.clone(); let endpoint2 = endpoint.clone(); @@ -501,52 +496,25 @@ impl RpcHelper { }); let mut resp_stream = requests.collect::>(); - // Success and error responses will be collected in these two vectors - let mut successes = vec![]; - let mut errors = vec![]; - - // `set_counters` is used to keep track of how many success and error - // responses are received within each quorum set. When a node returns - // its response, it counts as a sucess/an error for all of the quorum - // sets which it is part of. - let mut set_counters = vec![(0, 0); to_sets.len()]; - // Drive requests to completion while let Some((node, resp)) = resp_stream.next().await { // Store the response in the correct vector and increment the // appropriate counters - match resp { - Ok(msg) => { - for set in peers.get(&node).unwrap().iter() { - set_counters[*set].0 += 1; - } - successes.push(msg); - } - Err(e) => { - for set in peers.get(&node).unwrap().iter() { - set_counters[*set].1 += 1; - } - errors.push(e); - } - } + result_tracker.register_result(node, resp); // If we have a quorum of ok in all quorum sets, then it's a success! 
- if set_counters.iter().all(|(ok_cnt, _)| *ok_cnt >= quorum) { + if result_tracker.all_quorums_ok() { // Continue all other requets in background tokio::spawn(async move { resp_stream.collect::)>>().await; }); - return Ok(successes); + return Ok(result_tracker.success_values()); } // If there is a quorum set for which too many errors were received, // we know it's impossible to get a quorum, so return immediately. - if set_counters - .iter() - .enumerate() - .any(|(i, (_, err_cnt))| err_cnt + quorum > to_sets[i].len()) - { + if result_tracker.too_many_failures() { break; } } @@ -563,13 +531,104 @@ impl RpcHelper { // running request handler.) // Failure, could not get quorum - let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); - Err(Error::Quorum( + Err(result_tracker.quorum_error()) + } +} + +// ------- utility for tracking successes/errors among write sets -------- + +pub struct QuorumSetResultTracker { + // The set of nodes and the quorum sets they belong to + pub nodes: HashMap>, + pub quorum: usize, + + // The success and error responses received + pub successes: Vec<(Uuid, S)>, + pub failures: Vec<(Uuid, E)>, + + // The counters for successes and failures in each set + pub success_counters: Box<[usize]>, + pub failure_counters: Box<[usize]>, + pub set_lens: Box<[usize]>, +} + +impl QuorumSetResultTracker { + pub fn new(sets: &[A], quorum: usize) -> Self + where + A: AsRef<[Uuid]>, + { + let mut nodes = HashMap::>::new(); + for (i, set) in sets.iter().enumerate() { + for node in set.as_ref().iter() { + nodes.entry(*node).or_default().push(i); + } + } + + let num_nodes = nodes.len(); + Self { + nodes, quorum, - Some(to_sets.len()), - successes.len(), - peers.len(), + successes: Vec::with_capacity(num_nodes), + failures: vec![], + success_counters: vec![0; sets.len()].into_boxed_slice(), + failure_counters: vec![0; sets.len()].into_boxed_slice(), + set_lens: sets + .iter() + .map(|x| x.as_ref().len()) + .collect::>() + .into_boxed_slice(), + } + 
} + + pub fn register_result(&mut self, node: Uuid, result: Result) { + match result { + Ok(s) => { + self.successes.push((node, s)); + for set in self.nodes.get(&node).unwrap().iter() { + self.success_counters[*set] += 1; + } + } + Err(e) => { + self.failures.push((node, e)); + for set in self.nodes.get(&node).unwrap().iter() { + self.failure_counters[*set] += 1; + } + } + } + } + + pub fn all_quorums_ok(&self) -> bool { + self.success_counters + .iter() + .all(|ok_cnt| *ok_cnt >= self.quorum) + } + + pub fn too_many_failures(&self) -> bool { + self.failure_counters + .iter() + .zip(self.set_lens.iter()) + .any(|(err_cnt, set_len)| *err_cnt + self.quorum > *set_len) + } + + pub fn success_values(self) -> Vec { + self.successes + .into_iter() + .map(|(_, x)| x) + .collect::>() + } + + pub fn quorum_error(self) -> Error { + let errors = self + .failures + .iter() + .map(|(n, e)| format!("{:?}: {}", n, e)) + .collect::>(); + Error::Quorum( + self.quorum, + Some(self.set_lens.len()), + self.successes.len(), + self.nodes.len(), errors, - )) + ) } } diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index a4e701bb..db11ff5f 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -3,7 +3,7 @@ use garage_util::data::*; /// Trait to describe how a table shall be replicated pub trait TableReplication: Send + Sync + 'static { - type WriteSets: AsRef>> + Send + Sync + 'static; + type WriteSets: AsRef>> + AsMut>> + Send + Sync + 'static; // See examples in table_sharded.rs and table_fullcopy.rs // To understand various replication methods diff --git a/src/table/table.rs b/src/table/table.rs index 7d1ff31c..6508cf5d 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -20,6 +20,7 @@ use garage_util::error::Error; use garage_util::metrics::RecordDuration; use garage_util::migrate::Migrate; +use garage_rpc::rpc_helper::QuorumSetResultTracker; use garage_rpc::system::System; use garage_rpc::*; @@ 
-180,10 +181,6 @@ impl Table { // a quorum of nodes has answered OK, then the insert has succeeded and // consistency properties (read-after-write) are preserved. - // Some code here might feel redundant with RpcHelper::try_write_many_sets, - // but I think deduplicating could lead to more spaghetti instead of - // improving the readability, so I'm leaving as is. - let quorum = self.data.replication.write_quorum(); // Serialize all entries and compute the write sets for each of them. @@ -197,7 +194,10 @@ impl Table { for entry in entries.into_iter() { let entry = entry.borrow(); let hash = entry.partition_key().hash(); - let write_sets = self.data.replication.write_sets(&hash); + let mut write_sets = self.data.replication.write_sets(&hash); + for set in write_sets.as_mut().iter_mut() { + set.sort(); + } let e_enc = Arc::new(ByteBuf::from(entry.encode()?)); entries_vec.push((write_sets, e_enc)); } @@ -212,12 +212,8 @@ impl Table { .collect::>(); write_sets.sort(); write_sets.dedup(); - let mut write_set_index = HashMap::<&Uuid, Vec>::new(); - for (i, write_set) in write_sets.iter().enumerate() { - for node in write_set.iter() { - write_set_index.entry(node).or_default().push(i); - } - } + + let mut result_tracker = QuorumSetResultTracker::new(&write_sets, quorum); // Build a map of all nodes to the entries that must be sent to that node. let mut call_list: HashMap> = HashMap::new(); @@ -230,7 +226,6 @@ impl Table { } // Build futures to actually perform each of the corresponding RPC calls - let call_count = call_list.len(); let call_futures = call_list.into_iter().map(|(node, entries)| { let this = self.clone(); let tracer = opentelemetry::global::tracer("garage"); @@ -254,27 +249,11 @@ impl Table { // Run all requests in parallel thanks to FuturesUnordered, and collect results. 
let mut resps = call_futures.collect::>(); - let mut set_counters = vec![(0, 0); write_sets.len()]; - let mut successes = 0; - let mut errors = vec![]; while let Some((node, resp)) = resps.next().await { - match resp { - Ok(_) => { - successes += 1; - for set in write_set_index.get(&node).unwrap().iter() { - set_counters[*set].0 += 1; - } - } - Err(e) => { - errors.push(e); - for set in write_set_index.get(&node).unwrap().iter() { - set_counters[*set].1 += 1; - } - } - } + result_tracker.register_result(node, resp.map(|_| ())); - if set_counters.iter().all(|(ok_cnt, _)| *ok_cnt >= quorum) { + if result_tracker.all_quorums_ok() { // Success // Continue all other requests in background @@ -285,25 +264,14 @@ impl Table { return Ok(()); } - if set_counters - .iter() - .enumerate() - .any(|(i, (_, err_cnt))| err_cnt + quorum > write_sets[i].len()) - { + if result_tracker.too_many_failures() { // Too many errors in this set, we know we won't get a quorum break; } } // Failure, could not get quorum within at least one set - let errors = errors.iter().map(|e| format!("{}", e)).collect::>(); - Err(Error::Quorum( - quorum, - Some(write_sets.len()), - successes, - call_count, - errors, - )) + Err(result_tracker.quorum_error()) } pub async fn get( -- cgit v1.2.3 From d90de365b3b30cb631b22fcd62c98bddb5a91549 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 7 Dec 2023 11:16:10 +0100 Subject: table sync: use write quorums to report global success or failure of sync --- src/rpc/layout/helper.rs | 2 +- src/rpc/layout/manager.rs | 2 +- src/table/replication/fullcopy.rs | 3 +-- src/table/replication/parameters.rs | 2 +- src/table/replication/sharded.rs | 4 +-- src/table/sync.rs | 51 ++++++++++++++++++++++--------------- 6 files changed, 36 insertions(+), 28 deletions(-) diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 881a039e..0aa7c6aa 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -180,7 +180,7 @@ impl LayoutHelper { ret } - 
pub(crate) fn write_sets_of(&self, position: &Hash) -> Vec> { + pub fn storage_sets_of(&self, position: &Hash) -> Vec> { self.layout() .versions .iter() diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 17465019..dc963ba0 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -139,7 +139,7 @@ impl LayoutManager { pub fn write_sets_of(self: &Arc, position: &Hash) -> WriteLock>> { let layout = self.layout(); let version = layout.current().version; - let nodes = layout.write_sets_of(position); + let nodes = layout.storage_sets_of(position); layout .ack_lock .get(&version) diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index df930224..30122f39 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -1,4 +1,3 @@ -use std::iter::FromIterator; use std::sync::Arc; use garage_rpc::layout::*; @@ -69,7 +68,7 @@ impl TableReplication for TableFullReplication { partition: 0u16, first_hash: [0u8; 32].into(), last_hash: [0xff; 32].into(), - storage_nodes: Vec::from_iter(layout.current().all_nodes().to_vec()), + storage_sets: vec![layout.current().all_nodes().to_vec()], }], } } diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index db11ff5f..78470f35 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -40,5 +40,5 @@ pub struct SyncPartition { pub partition: Partition, pub first_hash: Hash, pub last_hash: Hash, - pub storage_nodes: Vec, + pub storage_sets: Vec>, } diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 2a16bc0c..55d0029d 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -60,12 +60,12 @@ impl TableReplication for TableShardedReplication { .current() .partitions() .map(|(partition, first_hash)| { - let storage_nodes = layout.storage_nodes_of(&first_hash); + let storage_sets = 
layout.storage_sets_of(&first_hash); SyncPartition { partition, first_hash, last_hash: [0u8; 32].into(), // filled in just after - storage_nodes, + storage_sets, } }) .collect::>(); diff --git a/src/table/sync.rs b/src/table/sync.rs index efeac402..cfcbc4b5 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -18,6 +18,7 @@ use garage_util::encode::{debug_serialize, nonversioned_encode}; use garage_util::error::{Error, OkOrMessage}; use garage_rpc::layout::*; +use garage_rpc::rpc_helper::QuorumSetResultTracker; use garage_rpc::system::System; use garage_rpc::*; @@ -106,44 +107,52 @@ impl TableSyncer { must_exit: &mut watch::Receiver, ) -> Result<(), Error> { let my_id = self.system.id; - let retain = partition.storage_nodes.contains(&my_id); + let retain = partition.storage_sets.iter().any(|x| x.contains(&my_id)); if retain { debug!( "({}) Syncing {:?} with {:?}...", F::TABLE_NAME, partition, - partition.storage_nodes + partition.storage_sets ); - let mut sync_futures = partition - .storage_nodes + let mut result_tracker = QuorumSetResultTracker::new( + &partition.storage_sets, + self.data.replication.write_quorum(), + ); + + let mut sync_futures = result_tracker + .nodes .iter() - .filter(|node| **node != my_id) + .map(|(node, _)| *node) .map(|node| { - self.clone() - .do_sync_with(&partition, *node, must_exit.clone()) + let must_exit = must_exit.clone(); + async move { + if node == my_id { + (node, Ok(())) + } else { + (node, self.do_sync_with(&partition, node, must_exit).await) + } + } }) .collect::>(); - let mut n_errors = 0; - while let Some(r) = sync_futures.next().await { - if let Err(e) = r { - n_errors += 1; - warn!("({}) Sync error: {}", F::TABLE_NAME, e); + while let Some((node, res)) = sync_futures.next().await { + if let Err(e) = &res { + warn!("({}) Sync error with {:?}: {}", F::TABLE_NAME, node, e); } + result_tracker.register_result(node, res); } - if n_errors > 0 { - return Err(Error::Message(format!( - "Sync failed with {} nodes.", - n_errors 
- ))); + + if result_tracker.too_many_failures() { + return Err(result_tracker.quorum_error()); + } else { + Ok(()) } } else { self.offload_partition(&partition.first_hash, &partition.last_hash, must_exit) - .await?; + .await } - - Ok(()) } // Offload partition: this partition is not something we are storing, @@ -264,7 +273,7 @@ impl TableSyncer { } async fn do_sync_with( - self: Arc, + self: &Arc, partition: &SyncPartition, who: Uuid, must_exit: watch::Receiver, -- cgit v1.2.3 From aa59059a910eb6e1e824b84413a66909d697ef8a Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 7 Dec 2023 11:50:00 +0100 Subject: layout cli: safer skip-dead-nodes command --- src/garage/cli/cmd.rs | 23 ++++++++++++++++------- src/garage/cli/layout.rs | 35 +++++++++++++++++++++++++---------- src/garage/cli/structs.rs | 12 ++++++++---- 3 files changed, 49 insertions(+), 21 deletions(-) diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index c7f0ad2b..196c0cb3 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -49,13 +49,7 @@ pub async fn cli_command_dispatch( } pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> Result<(), Error> { - let status = match rpc_cli - .call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL) - .await?? - { - SystemRpc::ReturnKnownNodes(nodes) => nodes, - resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), - }; + let status = fetch_status(rpc_cli, rpc_host).await?; let layout = fetch_layout(rpc_cli, rpc_host).await?; println!("==== HEALTHY NODES ===="); @@ -268,3 +262,18 @@ pub async fn cmd_admin( } Ok(()) } + +// ---- utility ---- + +pub async fn fetch_status( + rpc_cli: &Endpoint, + rpc_host: NodeID, +) -> Result, Error> { + match rpc_cli + .call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL) + .await?? 
+ { + SystemRpc::ReturnKnownNodes(nodes) => Ok(nodes), + resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), + } +} diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 3c7843bd..cdf77c04 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -33,8 +33,8 @@ pub async fn cli_layout_command_dispatch( cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await } LayoutOperation::History => cmd_layout_history(system_rpc_endpoint, rpc_host).await, - LayoutOperation::AssumeSync(assume_sync_opt) => { - cmd_layout_assume_sync(system_rpc_endpoint, rpc_host, assume_sync_opt).await + LayoutOperation::SkipDeadNodes(assume_sync_opt) => { + cmd_layout_skip_dead_nodes(system_rpc_endpoint, rpc_host, assume_sync_opt).await } } } @@ -388,13 +388,21 @@ pub async fn cmd_layout_history( Ok(()) } -pub async fn cmd_layout_assume_sync( +pub async fn cmd_layout_skip_dead_nodes( rpc_cli: &Endpoint, rpc_host: NodeID, - opt: AssumeSyncOpt, + opt: SkipDeadNodesOpt, ) -> Result<(), Error> { + let status = fetch_status(rpc_cli, rpc_host).await?; let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + if layout.versions.len() == 1 { + return Err(Error::Message( + "This command cannot be called when there is only one live cluster layout version" + .into(), + )); + } + let min_v = layout.min_stored(); if opt.version <= min_v || opt.version > layout.current().version { return Err(Error::Message(format!( @@ -408,12 +416,19 @@ pub async fn cmd_layout_assume_sync( let all_nodes = layout.get_all_nodes(); for node in all_nodes.iter() { - layout.update_trackers.ack_map.set_max(*node, opt.version); - layout.update_trackers.sync_map.set_max(*node, opt.version); - layout - .update_trackers - .sync_ack_map - .set_max(*node, opt.version); + if status.iter().any(|x| x.id == *node && x.is_up) { + continue; + } + + if layout.update_trackers.ack_map.set_max(*node, opt.version) { + println!("Increased the ACK tracker for node {:?}", node); + } + + 
if opt.allow_missing_data { + if layout.update_trackers.sync_map.set_max(*node, opt.version) { + println!("Increased the SYNC tracker for node {:?}", node); + } + } } send_layout(rpc_cli, rpc_host, layout).await?; diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index c4b400f4..6bc3da22 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -117,9 +117,9 @@ pub enum LayoutOperation { #[structopt(name = "history", version = garage_version())] History, - /// Assume all nodes are synchronized up to a certain layout version - #[structopt(name = "assume-sync", version = garage_version())] - AssumeSync(AssumeSyncOpt), + /// Skip dead nodes when awaiting for a new layout version to be synchronized + #[structopt(name = "skip-dead-nodes", version = garage_version())] + SkipDeadNodes(SkipDeadNodesOpt), } #[derive(StructOpt, Debug)] @@ -178,11 +178,15 @@ pub struct RevertLayoutOpt { } #[derive(StructOpt, Debug)] -pub struct AssumeSyncOpt { +pub struct SkipDeadNodesOpt { /// Version number of the layout to assume is currently up-to-date. /// This will generally be the current layout version. 
#[structopt(long = "version")] pub(crate) version: u64, + /// Allow the skip even if a quorum of ndoes could not be found for + /// the data among the remaining nodes + #[structopt(long = "allow-missing-data")] + pub(crate) allow_missing_data: bool, } #[derive(Serialize, Deserialize, StructOpt, Debug)] -- cgit v1.2.3 From 9cecea64d4509e95ac9793b29c947e2ecf9bb0b8 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 7 Dec 2023 14:27:53 +0100 Subject: layout: allow sync update tracker to progress with only quorums --- src/garage/cli/layout.rs | 6 +-- src/rpc/layout/helper.rs | 33 ++++++++++++--- src/rpc/layout/history.rs | 101 ++++++++++++++++++++++++++++++++++++++++++++ src/rpc/layout/manager.rs | 18 +++++--- src/rpc/layout/schema.rs | 6 +-- src/rpc/replication_mode.rs | 7 +++ src/rpc/system.rs | 2 +- 7 files changed, 152 insertions(+), 21 deletions(-) diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index cdf77c04..fac826f5 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -365,9 +365,9 @@ pub async fn cmd_layout_history( table.push(format!( "{:?}\t#{}\t#{}\t#{}", node, - layout.update_trackers.ack_map.get(node), - layout.update_trackers.sync_map.get(node), - layout.update_trackers.sync_ack_map.get(node), + layout.update_trackers.ack_map.get(node, min_stored), + layout.update_trackers.sync_map.get(node, min_stored), + layout.update_trackers.sync_ack_map.get(node, min_stored), )); } table[1..].sort(); diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 0aa7c6aa..eeaf4ffa 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize}; use garage_util::data::*; use super::schema::*; +use crate::replication_mode::ReplicationMode; use crate::rpc_helper::RpcHelper; #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] @@ -22,6 +23,7 @@ pub struct LayoutDigest { } pub struct LayoutHelper { + replication_mode: ReplicationMode, layout: 
Option, // cached values @@ -48,7 +50,23 @@ impl Deref for LayoutHelper { } impl LayoutHelper { - pub fn new(mut layout: LayoutHistory, mut ack_lock: HashMap) -> Self { + pub fn new( + replication_mode: ReplicationMode, + mut layout: LayoutHistory, + mut ack_lock: HashMap, + ) -> Self { + // In the new() function of the helper, we do a bunch of cleanup + // and calculations on the layout history to make sure things are + // correct and we have rapid access to important values such as + // the layout versions to use when reading to ensure consistency. + + if !replication_mode.is_read_after_write_consistent() { + // Fast path for when no consistency is required. + // In this case we only need to keep the last version of the layout, + // we don't care about coordinating stuff in the cluster. + layout.keep_current_version_only(); + } + layout.cleanup_old_versions(); let all_nodes = layout.get_all_nodes(); @@ -68,7 +86,7 @@ impl LayoutHelper { .ack_map .min_among(&all_nodes, min_version); - // sync_map_min is the minimum value of sync_map among all storage nodes + // sync_map_min is the minimum value of sync_map among storage nodes // in the cluster (non-gateway nodes only, current and previous layouts). // It is the highest layout version for which we know that all relevant // storage nodes have fullfilled a sync, and therefore it is safe to @@ -76,11 +94,10 @@ impl LayoutHelper { // Gateway nodes are excluded here because they hold no relevant data // (they store the bucket and access key tables, but we don't have // consistency on those). - // TODO: this value could take quorums into account instead. - let sync_map_min = layout - .update_trackers - .sync_map - .min_among(&all_nongateway_nodes, min_version); + // This value is calculated using quorums to allow progress even + // if not all nodes have successfully completed a sync. 
+ let sync_map_min = + layout.calculate_sync_map_min_with_quorum(replication_mode, &all_nongateway_nodes); let trackers_hash = layout.calculate_trackers_hash(); let staging_hash = layout.calculate_staging_hash(); @@ -91,6 +108,7 @@ impl LayoutHelper { .or_insert(AtomicUsize::new(0)); LayoutHelper { + replication_mode, layout: Some(layout), ack_map_min, sync_map_min, @@ -115,6 +133,7 @@ impl LayoutHelper { let changed = f(&mut self.layout.as_mut().unwrap()); if changed { *self = Self::new( + self.replication_mode, self.layout.take().unwrap(), std::mem::take(&mut self.ack_lock), ); diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index c448ac24..a53256cc 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -6,6 +6,7 @@ use garage_util::encode::nonversioned_encode; use garage_util::error::*; use super::*; +use crate::replication_mode::ReplicationMode; impl LayoutHistory { pub fn new(replication_factor: usize) -> Self { @@ -64,6 +65,13 @@ impl LayoutHistory { // ---- housekeeping (all invoked by LayoutHelper) ---- + pub(crate) fn keep_current_version_only(&mut self) { + while self.versions.len() > 1 { + let removed = self.versions.remove(0); + self.old_versions.push(removed); + } + } + pub(crate) fn cleanup_old_versions(&mut self) { // If there are invalid versions before valid versions, remove them if self.versions.len() > 1 && self.current().check().is_ok() { @@ -114,6 +122,99 @@ impl LayoutHistory { } } + pub(crate) fn calculate_sync_map_min_with_quorum( + &self, + replication_mode: ReplicationMode, + all_nongateway_nodes: &[Uuid], + ) -> u64 { + // This function calculates the minimum layout version from which + // it is safe to read if we want to maintain read-after-write consistency. + // In the general case the computation can be a bit expensive so + // we try to optimize it in several ways. + + // If there is only one layout version, we know that's the one + // we need to read from. 
+ if self.versions.len() == 1 { + return self.current().version; + } + + let quorum = replication_mode.write_quorum(); + + let min_version = self.min_stored(); + let global_min = self + .update_trackers + .sync_map + .min_among(&all_nongateway_nodes, min_version); + + // If the write quorums are equal to the total number of nodes, + // i.e. no writes can succeed while they are not written to all nodes, + // then we must in all case wait for all nodes to complete a sync. + // This is represented by reading from the layout with version + // number global_min, the smallest layout version for which all nodes + // have completed a sync. + if quorum == self.current().replication_factor { + return global_min; + } + + // In the general case, we need to look at all write sets for all partitions, + // and find a safe layout version to read for that partition. We then + // take the minimum value among all partition as the safe layout version + // to read in all cases (the layout version to which all reads are directed). + let mut current_min = self.current().version; + let mut sets_done = HashSet::>::new(); + + for (_, p_hash) in self.current().partitions() { + for v in self.versions.iter() { + if v.version == self.current().version { + // We don't care about whether nodes in the latest layout version + // have completed a sync or not, as the sync is push-only + // and by definition nodes in the latest layout version do not + // hold data that must be pushed to nodes in the latest layout + // version, since that's the same version (any data that's + // already in the latest version is assumed to have been written + // by an operation that ensured a quorum of writes within + // that version). + continue; + } + + // Determine set of nodes for partition p in layout version v. + // Sort the node set to avoid duplicate computations. + let mut set = v + .nodes_of(&p_hash, v.replication_factor) + .collect::>(); + set.sort(); + + // If this set was already processed, skip it. 
+ if sets_done.contains(&set) { + continue; + } + + // Find the value of the sync update trackers that is the + // highest possible minimum within a quorum of nodes. + let mut sync_values = set + .iter() + .map(|x| self.update_trackers.sync_map.get(x, min_version)) + .collect::>(); + sync_values.sort(); + let set_min = sync_values[sync_values.len() - quorum]; + if set_min < current_min { + current_min = set_min; + } + // defavorable case, we know we are at the smallest possible version, + // so we can stop early + assert!(current_min >= global_min); + if current_min == global_min { + return current_min; + } + + // Add set to already processed sets + sets_done.insert(set); + } + } + + current_min + } + pub(crate) fn calculate_trackers_hash(&self) -> Hash { blake2sum(&nonversioned_encode(&self.update_trackers).unwrap()[..]) } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index dc963ba0..ec8a2a15 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -14,12 +14,13 @@ use garage_util::error::*; use garage_util::persister::Persister; use super::*; +use crate::replication_mode::ReplicationMode; use crate::rpc_helper::*; use crate::system::*; pub struct LayoutManager { node_id: Uuid, - replication_factor: usize, + replication_mode: ReplicationMode, persist_cluster_layout: Persister, layout: Arc>, @@ -37,14 +38,16 @@ impl LayoutManager { node_id: NodeID, system_endpoint: Arc>, fullmesh: Arc, - replication_factor: usize, + replication_mode: ReplicationMode, ) -> Result, Error> { + let replication_factor = replication_mode.replication_factor(); + let persist_cluster_layout: Persister = Persister::new(&config.metadata_dir, "cluster_layout"); let cluster_layout = match persist_cluster_layout.load() { Ok(x) => { - if x.current().replication_factor != replication_factor { + if x.current().replication_factor != replication_mode.replication_factor() { return Err(Error::Message(format!( "Prevous cluster layout has replication factor {}, which 
is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", x.current().replication_factor, @@ -62,7 +65,8 @@ impl LayoutManager { } }; - let mut cluster_layout = LayoutHelper::new(cluster_layout, Default::default()); + let mut cluster_layout = + LayoutHelper::new(replication_mode, cluster_layout, Default::default()); cluster_layout.update_trackers(node_id.into()); let layout = Arc::new(RwLock::new(cluster_layout)); @@ -77,7 +81,7 @@ impl LayoutManager { Ok(Arc::new(Self { node_id: node_id.into(), - replication_factor, + replication_mode, persist_cluster_layout, layout, change_notify, @@ -291,11 +295,11 @@ impl LayoutManager { adv.update_trackers ); - if adv.current().replication_factor != self.replication_factor { + if adv.current().replication_factor != self.replication_mode.replication_factor() { let msg = format!( "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). 
Discarding the cluster layout we received.", adv.current().replication_factor, - self.replication_factor + self.replication_mode.replication_factor() ); error!("{}", msg); return Err(Error::Message(msg)); diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs index 49e84420..df949906 100644 --- a/src/rpc/layout/schema.rs +++ b/src/rpc/layout/schema.rs @@ -411,13 +411,13 @@ impl UpdateTracker { pub(crate) fn min_among(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 { storage_nodes .iter() - .map(|x| self.0.get(x).copied().unwrap_or(min_version)) + .map(|x| self.get(x, min_version)) .min() .unwrap_or(min_version) } - pub fn get(&self, node: &Uuid) -> u64 { - self.0.get(node).copied().unwrap_or(0) + pub fn get(&self, node: &Uuid, min_version: u64) -> u64 { + self.0.get(node).copied().unwrap_or(min_version) } } diff --git a/src/rpc/replication_mode.rs b/src/rpc/replication_mode.rs index e244e063..2f7e2fec 100644 --- a/src/rpc/replication_mode.rs +++ b/src/rpc/replication_mode.rs @@ -54,4 +54,11 @@ impl ReplicationMode { Self::ThreeWayDangerous => 1, } } + + pub fn is_read_after_write_consistent(&self) -> bool { + match self { + Self::None | Self::TwoWay | Self::ThreeWay => true, + _ => false, + } + } } diff --git a/src/rpc/system.rs b/src/rpc/system.rs index be4aefa2..81a47ff3 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -280,7 +280,7 @@ impl System { netapp.id, system_endpoint.clone(), fullmesh.clone(), - replication_factor, + replication_mode, )?; // ---- set up metrics and status exchange ---- -- cgit v1.2.3 From 431b28e0cfdc9cac6c649193cf602108a8b02997 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 7 Dec 2023 15:15:59 +0100 Subject: fix build with discovery features --- src/rpc/system.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 81a47ff3..adfef6b6 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -514,7 +514,7 @@ impl System { if let Err(e) = c 
.publish_consul_service( self.netapp.id, - &self.local_status.load_full().hostname, + &self.local_status.load_full().hostname.as_deref().unwrap(), rpc_public_addr, ) .await @@ -541,7 +541,7 @@ impl System { if let Err(e) = publish_kubernetes_node( k, self.netapp.id, - &self.local_status.load_full().hostname, + &self.local_status.load_full().hostname.as_deref().unwrap(), rpc_public_addr, ) .await -- cgit v1.2.3 From 91b874c4efa40e64663368369a712e0a5a389e53 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 10:36:37 +0100 Subject: rpc: fix system::health --- src/rpc/system.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/rpc/system.rs b/src/rpc/system.rs index adfef6b6..a8f12852 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -456,14 +456,14 @@ impl System { let mut partitions_quorum = 0; let mut partitions_all_ok = 0; for (_, hash) in partitions.iter() { - let write_sets = layout + let mut write_sets = layout .versions .iter() .map(|x| x.nodes_of(hash, x.replication_factor)); let has_quorum = write_sets .clone() .all(|set| set.filter(|x| node_up(x)).count() >= quorum); - let all_ok = write_sets.clone().all(|mut set| set.all(|x| node_up(&x))); + let all_ok = write_sets.all(|mut set| set.all(|x| node_up(&x))); if has_quorum { partitions_quorum += 1; } @@ -474,7 +474,7 @@ impl System { // Determine overall cluster status let status = - if partitions_quorum == partitions.len() && storage_nodes_ok == storage_nodes.len() { + if partitions_all_ok == partitions.len() && storage_nodes_ok == storage_nodes.len() { ClusterHealthStatus::Healthy } else if partitions_quorum == partitions.len() { ClusterHealthStatus::Degraded -- cgit v1.2.3 From 7f2541101f15614c79020b35d3d7dab767c32676 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 11:24:23 +0100 Subject: cli: improvements to the layout commands when multiple layouts are live --- src/garage/admin/mod.rs | 3 +-- src/garage/cli/cmd.rs | 4 +-- 
src/garage/cli/layout.rs | 67 +++++++++++++++++++++++++++++++----------------- src/garage/cli/util.rs | 4 ++- 4 files changed, 49 insertions(+), 29 deletions(-) diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index da4226cf..de7851e1 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -274,8 +274,7 @@ impl AdminRpcHandler { fn gather_cluster_stats(&self) -> String { let mut ret = String::new(); - // Gather storage node and free space statistics - // TODO: not only layout.current() ??? + // Gather storage node and free space statistics for current nodes let layout = &self.garage.system.cluster_layout(); let mut node_partition_count = HashMap::::new(); for short_id in layout.current().ring_assignment_data.iter() { diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 196c0cb3..fb6dface 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -179,7 +179,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> println!("Your cluster is expecting to drain data from nodes that are currently unavailable."); println!("If these nodes are definitely dead, please review the layout history with"); println!( - "`garage layout history` and use `garage layout assume-sync` to force progress." + "`garage layout history` and use `garage layout skip-dead-nodes` to force progress." ); } } @@ -274,6 +274,6 @@ pub async fn fetch_status( .await?? 
{ SystemRpc::ReturnKnownNodes(nodes) => Ok(nodes), - resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), + resp => Err(Error::unexpected_rpc_message(resp)), } } diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index fac826f5..f76e33c5 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -354,35 +354,44 @@ pub async fn cmd_layout_history( )); } format_table(table); - - println!(); - println!("==== UPDATE TRACKERS ===="); - println!("This is the internal data that Garage stores to know which nodes have what data."); println!(); - let mut table = vec!["Node\tAck\tSync\tSync_ack".to_string()]; - let all_nodes = layout.get_all_nodes(); - for node in all_nodes.iter() { - table.push(format!( - "{:?}\t#{}\t#{}\t#{}", - node, - layout.update_trackers.ack_map.get(node, min_stored), - layout.update_trackers.sync_map.get(node, min_stored), - layout.update_trackers.sync_ack_map.get(node, min_stored), - )); - } - table[1..].sort(); - format_table(table); if layout.versions.len() > 1 { + println!("==== UPDATE TRACKERS ===="); + println!("Several layout versions are currently live in the version, and data is being migrated."); + println!( + "This is the internal data that Garage stores to know which nodes have what data." 
+ ); + println!(); + let mut table = vec!["Node\tAck\tSync\tSync_ack".to_string()]; + let all_nodes = layout.get_all_nodes(); + for node in all_nodes.iter() { + table.push(format!( + "{:?}\t#{}\t#{}\t#{}", + node, + layout.update_trackers.ack_map.get(node, min_stored), + layout.update_trackers.sync_map.get(node, min_stored), + layout.update_trackers.sync_ack_map.get(node, min_stored), + )); + } + table[1..].sort(); + format_table(table); + println!(); println!( - "If some nodes are not catching up to the latest layout version in the update tracker," + "If some nodes are not catching up to the latest layout version in the update trackers," ); println!("it might be because they are offline or unable to complete a sync successfully."); println!( - "You may force progress using `garage layout assume-sync --version {}`", + "You may force progress using `garage layout skip-dead-nodes --version {}`", layout.current().version ); + } else { + println!("Your cluster is currently in a stable state with a single live layout version."); + println!("No metadata migration is in progress. Note that the migration of data blocks is not tracked,"); + println!( + "so you might want to keep old nodes online until their data directories become empty." 
+ ); } Ok(()) @@ -415,6 +424,7 @@ pub async fn cmd_layout_skip_dead_nodes( } let all_nodes = layout.get_all_nodes(); + let mut did_something = false; for node in all_nodes.iter() { if status.iter().any(|x| x.id == *node && x.is_up) { continue; @@ -422,19 +432,28 @@ pub async fn cmd_layout_skip_dead_nodes( if layout.update_trackers.ack_map.set_max(*node, opt.version) { println!("Increased the ACK tracker for node {:?}", node); + did_something = true; } if opt.allow_missing_data { if layout.update_trackers.sync_map.set_max(*node, opt.version) { println!("Increased the SYNC tracker for node {:?}", node); + did_something = true; } } } - send_layout(rpc_cli, rpc_host, layout).await?; - println!("Success."); - - Ok(()) + if did_something { + send_layout(rpc_cli, rpc_host, layout).await?; + println!("Success."); + Ok(()) + } else if !opt.allow_missing_data { + Err(Error::Message("Nothing was done, try passing the `--allow-missing-data` flag to force progress even when not enough nodes can complete a metadata sync.".into())) + } else { + Err(Error::Message( + "Sorry, there is nothing I can do for you. Please wait patiently. If you ask for help, please send the output of the `garage layout history` command.".into(), + )) + } } // --- utility --- @@ -448,7 +467,7 @@ pub async fn fetch_layout( .await?? { SystemRpc::AdvertiseClusterLayout(t) => Ok(t), - resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), + resp => Err(Error::unexpected_rpc_message(resp)), } } diff --git a/src/garage/cli/util.rs b/src/garage/cli/util.rs index 2232d395..0511e2b1 100644 --- a/src/garage/cli/util.rs +++ b/src/garage/cli/util.rs @@ -450,6 +450,8 @@ pub fn print_block_info( if refcount != nondeleted_count { println!(); - println!("Warning: refcount does not match number of non-deleted versions"); + println!( + "Warning: refcount does not match number of non-deleted versions (see issue #644)." 
+ ); } } -- cgit v1.2.3 From 063294dd569e10c6d85e29eb6507249eece00956 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 11:50:58 +0100 Subject: layout version: refactor get_node_zone --- src/rpc/layout/test.rs | 4 ++-- src/rpc/layout/version.rs | 47 +++++++++++++++++++++++------------------------ 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/rpc/layout/test.rs b/src/rpc/layout/test.rs index bb072c97..88eb518e 100644 --- a/src/rpc/layout/test.rs +++ b/src/rpc/layout/test.rs @@ -34,8 +34,8 @@ fn check_against_naive(cl: &LayoutVersion) -> Result { zone_token.insert(z.clone(), 0); } for uuid in cl.nongateway_nodes() { - let z = cl.get_node_zone(&uuid)?; - let c = cl.get_node_capacity(&uuid).unwrap(); + let z = cl.expect_get_node_zone(&uuid); + let c = cl.expect_get_node_capacity(&uuid); zone_token.insert( z.to_string(), zone_token[z] + min(NB_PARTITIONS, (c / over_size) as usize), diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 947fab56..cbfbee94 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -70,6 +70,14 @@ impl LayoutVersion { } } + /// Given a node uuids, this function returns the label of its zone if it has one + pub fn get_node_zone(&self, uuid: &Uuid) -> Option<&str> { + match self.node_role(uuid) { + Some(role) => Some(&role.zone), + _ => None, + } + } + /// Returns the number of partitions associated to this node in the ring pub fn get_node_usage(&self, uuid: &Uuid) -> Result { for (i, id) in self.node_id_vec.iter().enumerate() { @@ -129,28 +137,22 @@ impl LayoutVersion { // ===================== internal information extractors ====================== - /// Given a node uuids, this function returns the label of its zone - pub(crate) fn get_node_zone(&self, uuid: &Uuid) -> Result<&str, Error> { - match self.node_role(uuid) { - Some(role) => Ok(&role.zone), - _ => Err(Error::Message( - "The Uuid does not correspond to a node present in the cluster.".into(), - )), - } - } - - 
fn expect_get_node_capacity(&self, uuid: &Uuid) -> u64 { + pub(crate) fn expect_get_node_capacity(&self, uuid: &Uuid) -> u64 { self.get_node_capacity(&uuid) .expect("non-gateway node with zero capacity") } + pub(crate) fn expect_get_node_zone(&self, uuid: &Uuid) -> &str { + self.get_node_zone(&uuid).expect("node without a zone") + } + /// Returns the sum of capacities of non gateway nodes in the cluster - fn get_total_capacity(&self) -> Result { + fn get_total_capacity(&self) -> u64 { let mut total_capacity = 0; for uuid in self.nongateway_nodes() { total_capacity += self.expect_get_node_capacity(&uuid); } - Ok(total_capacity) + total_capacity } /// Returns the effective value of the zone_redundancy parameter @@ -227,10 +229,7 @@ impl LayoutVersion { // Check that every partition is spread over at least zone_redundancy zones. let zones_of_p = nodes_of_p .iter() - .map(|n| { - self.get_node_zone(&self.node_id_vec[*n as usize]) - .expect("Zone not found.") - }) + .map(|n| self.expect_get_node_zone(&self.node_id_vec[*n as usize])) .collect::>(); if zones_of_p.iter().unique().count() < zone_redundancy { return Err(format!( @@ -516,7 +515,7 @@ impl LayoutVersion { } let mut s_down = 1; - let mut s_up = self.get_total_capacity()?; + let mut s_up = self.get_total_capacity(); while s_down + 1 < s_up { g = self.generate_flow_graph( (s_down + s_up) / 2, @@ -586,7 +585,7 @@ impl LayoutVersion { } for n in 0..self.nongateway_nodes().len() { let node_capacity = self.expect_get_node_capacity(&self.node_id_vec[n]); - let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[n])?]; + let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[n])]; g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; for p in 0..NB_PARTITIONS { if !exclude_assoc.contains(&(p, n)) { @@ -632,7 +631,7 @@ impl LayoutVersion { // The algorithm is such that it will start with the flow that we just computed // and find ameliorating paths from that. 
for (p, n) in exclude_edge.iter() { - let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; + let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])]; g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; } g.compute_maximal_flow()?; @@ -652,7 +651,7 @@ impl LayoutVersion { let mut cost = CostFunction::new(); for (p, assoc_p) in prev_assign.iter().enumerate() { for n in assoc_p.iter() { - let node_zone = zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]; + let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])]; cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); } } @@ -707,7 +706,7 @@ impl LayoutVersion { let mut msg = Message::new(); let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; - let total_cap = self.get_total_capacity()?; + let total_cap = self.get_total_capacity(); let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); msg.push(format!( "Usable capacity / total cluster capacity: {} / {} ({:.1} %)", @@ -754,7 +753,7 @@ impl LayoutVersion { let mut old_zones_of_p = Vec::::new(); for n in prev_assign[p].iter() { old_zones_of_p - .push(zone_to_id[self.get_node_zone(&self.node_id_vec[*n])?]); + .push(zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])]); } if !old_zones_of_p.contains(&z) { new_partitions_zone[z] += 1; @@ -796,7 +795,7 @@ impl LayoutVersion { for z in 0..id_to_zone.len() { let mut nodes_of_z = Vec::::new(); for n in 0..storing_nodes.len() { - if self.get_node_zone(&self.node_id_vec[n])? 
== id_to_zone[z] { + if self.expect_get_node_zone(&self.node_id_vec[n]) == id_to_zone[z] { nodes_of_z.push(n); } } -- cgit v1.2.3 From 5dd200c015aed786173f0e11541b0505f95dd6d1 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 12:02:24 +0100 Subject: layout: move block_read_nodes_of to rpc_helper to avoid double-locking (in theory, this could have caused a deadlock) --- src/block/manager.rs | 2 +- src/rpc/layout/helper.rs | 27 ++--------- src/rpc/rpc_helper.rs | 121 +++++++++++++++++++++++++++++------------------ 3 files changed, 80 insertions(+), 70 deletions(-) diff --git a/src/block/manager.rs b/src/block/manager.rs index 47111160..bfd390ee 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -266,7 +266,7 @@ impl BlockManager { { let who = self .system - .cluster_layout() + .rpc_helper() .block_read_nodes_of(hash, self.system.rpc_helper()); for node in who.iter() { diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index eeaf4ffa..147c8b4f 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -8,7 +8,6 @@ use garage_util::data::*; use super::schema::*; use crate::replication_mode::ReplicationMode; -use crate::rpc_helper::RpcHelper; #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] pub struct LayoutDigest { @@ -155,6 +154,10 @@ impl LayoutHelper { self.ack_map_min } + pub fn all_sync(&self) -> u64 { + self.sync_map_min + } + pub fn sync_versions(&self) -> (u64, u64, u64) { ( self.layout().current().version, @@ -177,28 +180,6 @@ impl LayoutHelper { .collect() } - pub fn block_read_nodes_of(&self, position: &Hash, rpc_helper: &RpcHelper) -> Vec { - let mut ret = Vec::with_capacity(12); - let ver_iter = self - .layout() - .versions - .iter() - .rev() - .chain(self.layout().old_versions.iter().rev()); - for ver in ver_iter { - if ver.version > self.sync_map_min { - continue; - } - let nodes = ver.nodes_of(position, ver.replication_factor); - for node in rpc_helper.request_order(nodes) { 
- if !ret.contains(&node) { - ret.push(node); - } - } - } - ret - } - pub fn storage_sets_of(&self, position: &Hash) -> Vec> { self.layout() .versions diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index c6dcbe75..7e1387ed 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -26,7 +26,7 @@ use garage_util::data::*; use garage_util::error::Error; use garage_util::metrics::RecordDuration; -use crate::layout::LayoutHelper; +use crate::layout::{LayoutHelper, LayoutHistory}; use crate::metrics::RpcMetrics; // Default RPC timeout = 5 minutes @@ -304,7 +304,7 @@ impl RpcHelper { // preemptively send an additional request to any remaining nodes. // Reorder requests to priorize closeness / low latency - let request_order = self.request_order(to.iter().copied()); + let request_order = self.request_order(&self.0.layout.read().unwrap(), to.iter().copied()); let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false); // Build future for each request @@ -368,50 +368,6 @@ impl RpcHelper { } } - pub fn request_order(&self, nodes: impl Iterator) -> Vec { - // Retrieve some status variables that we will use to sort requests - let peer_list = self.0.fullmesh.get_peer_list(); - let layout = self.0.layout.read().unwrap(); - let our_zone = match layout.current().node_role(&self.0.our_node_id) { - Some(pc) => &pc.zone, - None => "", - }; - - // Augment requests with some information used to sort them. - // The tuples are as follows: - // (is another node?, is another zone?, latency, node ID, request future) - // We store all of these tuples in a vec that we can sort. - // By sorting this vec, we priorize ourself, then nodes in the same zone, - // and within a same zone we priorize nodes with the lowest latency. 
- let mut nodes = nodes - .map(|to| { - let peer_zone = match layout.current().node_role(&to) { - Some(pc) => &pc.zone, - None => "", - }; - let peer_avg_ping = peer_list - .iter() - .find(|x| x.id.as_ref() == to.as_slice()) - .and_then(|pi| pi.avg_ping) - .unwrap_or_else(|| Duration::from_secs(10)); - ( - to != self.0.our_node_id, - peer_zone != our_zone, - peer_avg_ping, - to, - ) - }) - .collect::>(); - - // Sort requests by (priorize ourself, priorize same zone, priorize low latency) - nodes.sort_by_key(|(diffnode, diffzone, ping, _to)| (*diffnode, *diffzone, *ping)); - - nodes - .into_iter() - .map(|(_, _, _, to)| to) - .collect::>() - } - /// Make a RPC call to multiple servers, returning either a Vec of responses, /// or an error if quorum could not be reached due to too many errors /// @@ -533,6 +489,79 @@ impl RpcHelper { // Failure, could not get quorum Err(result_tracker.quorum_error()) } + + // ---- functions not related to MAKING RPCs, but just determining to what nodes + // they should be made and in which order ---- + + pub fn block_read_nodes_of(&self, position: &Hash, rpc_helper: &RpcHelper) -> Vec { + let layout = self.0.layout.read().unwrap(); + + let mut ret = Vec::with_capacity(12); + let ver_iter = layout + .versions + .iter() + .rev() + .chain(layout.old_versions.iter().rev()); + for ver in ver_iter { + if ver.version > layout.all_sync() { + continue; + } + let nodes = ver.nodes_of(position, ver.replication_factor); + for node in rpc_helper.request_order(&layout, nodes) { + if !ret.contains(&node) { + ret.push(node); + } + } + } + ret + } + + fn request_order( + &self, + layout: &LayoutHistory, + nodes: impl Iterator, + ) -> Vec { + // Retrieve some status variables that we will use to sort requests + let peer_list = self.0.fullmesh.get_peer_list(); + let our_zone = match layout.current().node_role(&self.0.our_node_id) { + Some(pc) => &pc.zone, + None => "", + }; + + // Augment requests with some information used to sort them. 
+ // The tuples are as follows: + // (is another node?, is another zone?, latency, node ID, request future) + // We store all of these tuples in a vec that we can sort. + // By sorting this vec, we priorize ourself, then nodes in the same zone, + // and within a same zone we priorize nodes with the lowest latency. + let mut nodes = nodes + .map(|to| { + let peer_zone = match layout.current().node_role(&to) { + Some(pc) => &pc.zone, + None => "", + }; + let peer_avg_ping = peer_list + .iter() + .find(|x| x.id.as_ref() == to.as_slice()) + .and_then(|pi| pi.avg_ping) + .unwrap_or_else(|| Duration::from_secs(10)); + ( + to != self.0.our_node_id, + peer_zone != our_zone, + peer_avg_ping, + to, + ) + }) + .collect::>(); + + // Sort requests by (priorize ourself, priorize same zone, priorize low latency) + nodes.sort_by_key(|(diffnode, diffzone, ping, _to)| (*diffnode, *diffzone, *ping)); + + nodes + .into_iter() + .map(|(_, _, _, to)| to) + .collect::>() + } } // ------- utility for tracking successes/errors among write sets -------- -- cgit v1.2.3 From 64a6e557a4ff6aa1ad833a1b25ef8c85cf9ee3f3 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 12:18:12 +0100 Subject: rpc helper: small refactorings --- src/rpc/rpc_helper.rs | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 7e1387ed..65af8901 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -436,13 +436,12 @@ impl RpcHelper { H: StreamingEndpointHandler + 'static, S: Send + 'static, { - let msg = msg.into_req().map_err(netapp::error::Error::from)?; - // Peers may appear in many quorum sets. Here, build a list of peers, // mapping to the index of the quorum sets in which they appear. 
let mut result_tracker = QuorumSetResultTracker::new(to_sets, quorum); // Send one request to each peer of the quorum sets + let msg = msg.into_req().map_err(netapp::error::Error::from)?; let requests = result_tracker.nodes.iter().map(|(peer, _)| { let self2 = self.clone(); let msg = msg.clone(); @@ -523,10 +522,10 @@ impl RpcHelper { ) -> Vec { // Retrieve some status variables that we will use to sort requests let peer_list = self.0.fullmesh.get_peer_list(); - let our_zone = match layout.current().node_role(&self.0.our_node_id) { - Some(pc) => &pc.zone, - None => "", - }; + let our_zone = layout + .current() + .get_node_zone(&self.0.our_node_id) + .unwrap_or(""); // Augment requests with some information used to sort them. // The tuples are as follows: @@ -536,10 +535,7 @@ impl RpcHelper { // and within a same zone we priorize nodes with the lowest latency. let mut nodes = nodes .map(|to| { - let peer_zone = match layout.current().node_role(&to) { - Some(pc) => &pc.zone, - None => "", - }; + let peer_zone = layout.current().get_node_zone(&to).unwrap_or(""); let peer_avg_ping = peer_list .iter() .find(|x| x.id.as_ref() == to.as_slice()) @@ -567,21 +563,28 @@ impl RpcHelper { // ------- utility for tracking successes/errors among write sets -------- pub struct QuorumSetResultTracker { - // The set of nodes and the quorum sets they belong to + /// The set of nodes and the index of the quorum sets they belong to pub nodes: HashMap>, + /// The quorum value, i.e. 
number of success responses to await in each set pub quorum: usize, - // The success and error responses received + /// The success responses received pub successes: Vec<(Uuid, S)>, + /// The error responses received pub failures: Vec<(Uuid, E)>, - // The counters for successes and failures in each set + /// The counters for successes in each set pub success_counters: Box<[usize]>, + /// The counters for failures in each set pub failure_counters: Box<[usize]>, + /// The total number of nodes in each set pub set_lens: Box<[usize]>, } -impl QuorumSetResultTracker { +impl QuorumSetResultTracker +where + E: std::fmt::Display, +{ pub fn new(sets: &[A], quorum: usize) -> Self where A: AsRef<[Uuid]>, -- cgit v1.2.3 From 4dbf254512327ef4e7abbd5525b89bfa5b7ecb6f Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 14:15:52 +0100 Subject: layout: refactoring, merge two files --- src/rpc/layout/helper.rs | 2 +- src/rpc/layout/mod.rs | 441 +++++++++++++++++++++++++++++++++++++++++++++- src/rpc/layout/schema.rs | 431 -------------------------------------------- src/rpc/layout/version.rs | 1 - 4 files changed, 440 insertions(+), 435 deletions(-) delete mode 100644 src/rpc/layout/schema.rs diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 147c8b4f..2ba010b8 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize}; use garage_util::data::*; -use super::schema::*; +use super::*; use crate::replication_mode::ReplicationMode; #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index eb127fda..facdb2ce 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -1,7 +1,13 @@ +use std::fmt; + +use bytesize::ByteSize; + +use garage_util::crdt::{AutoCrdt, Crdt}; +use garage_util::data::Uuid; + mod graph_algo; mod helper; mod history; -mod schema; mod version; #[cfg(test)] @@ -13,7 +19,6 @@ pub mod manager; 
pub use helper::{LayoutDigest, LayoutHelper}; pub use manager::WriteLock; -pub use schema::*; pub use version::*; // ---- defines: partitions ---- @@ -39,3 +44,435 @@ const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; // Change this to u16 the day we want to have more than 256 nodes in a cluster pub type CompactNodeType = u8; pub const MAX_NODE_NUMBER: usize = 256; + +// ======== actual data structures for the layout data ======== +// ======== that is persisted to disk ======== +// some small utility impls are at the end of this file, +// but most of the code that actually computes stuff is in +// version.rs, history.rs and helper.rs + +mod v08 { + use crate::layout::CompactNodeType; + use garage_util::crdt::LwwMap; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + + /// The layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct ClusterLayout { + pub version: u64, + + pub replication_factor: usize, + pub roles: LwwMap, + + // see comments in v010::ClusterLayout + pub node_id_vec: Vec, + #[serde(with = "serde_bytes")] + pub ring_assignation_data: Vec, + + /// Role changes which are staged for the next version of the layout + pub staging: LwwMap, + pub staging_hash: Hash, + } + + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct NodeRoleV(pub Option); + + /// The user-assigned roles of cluster nodes + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct NodeRole { + /// Datacenter at which this entry belong. 
This information is used to + /// perform a better geodistribution + pub zone: String, + /// The capacity of the node + /// If this is set to None, the node does not participate in storing data for the system + /// and is only active as an API gateway to other nodes + pub capacity: Option, + /// A set of tags to recognize the node + pub tags: Vec, + } + + impl garage_util::migrate::InitialFormat for ClusterLayout {} +} + +mod v09 { + use super::v08; + use crate::layout::CompactNodeType; + use garage_util::crdt::{Lww, LwwMap}; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + pub use v08::{NodeRole, NodeRoleV}; + + /// The layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct ClusterLayout { + pub version: u64, + + pub replication_factor: usize, + + /// This attribute is only used to retain the previously computed partition size, + /// to know to what extent does it change with the layout update. + pub partition_size: u64, + /// Parameters used to compute the assignment currently given by + /// ring_assignment_data + pub parameters: LayoutParameters, + + pub roles: LwwMap, + + // see comments in v010::ClusterLayout + pub node_id_vec: Vec, + #[serde(with = "serde_bytes")] + pub ring_assignment_data: Vec, + + /// Parameters to be used in the next partition assignment computation. + pub staging_parameters: Lww, + /// Role changes which are staged for the next version of the layout + pub staging_roles: LwwMap, + pub staging_hash: Hash, + } + + /// This struct is used to set the parameters to be used in the assignment computation + /// algorithm. It is stored as a Crdt. 
+ #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub struct LayoutParameters { + pub zone_redundancy: ZoneRedundancy, + } + + /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies + /// of each partition on at least that number of different zones. + /// Otherwise, copies will be stored on the maximum possible number of zones. + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub enum ZoneRedundancy { + AtLeast(usize), + Maximum, + } + + impl garage_util::migrate::Migrate for ClusterLayout { + const VERSION_MARKER: &'static [u8] = b"G09layout"; + + type Previous = v08::ClusterLayout; + + fn migrate(previous: Self::Previous) -> Self { + use itertools::Itertools; + + // In the old layout, capacities are in an arbitrary unit, + // but in the new layout they are in bytes. + // Here we arbitrarily multiply everything by 1G, + // such that 1 old capacity unit = 1GB in the new units. + // This is totally arbitrary and won't work for most users. 
+ let cap_mul = 1024 * 1024 * 1024; + let roles = multiply_all_capacities(previous.roles, cap_mul); + let staging_roles = multiply_all_capacities(previous.staging, cap_mul); + let node_id_vec = previous.node_id_vec; + + // Determine partition size + let mut tmp = previous.ring_assignation_data.clone(); + tmp.sort(); + let partition_size = tmp + .into_iter() + .dedup_with_count() + .map(|(npart, node)| { + roles + .get(&node_id_vec[node as usize]) + .and_then(|p| p.0.as_ref().and_then(|r| r.capacity)) + .unwrap_or(0) / npart as u64 + }) + .min() + .unwrap_or(0); + + // By default, zone_redundancy is maximum possible value + let parameters = LayoutParameters { + zone_redundancy: ZoneRedundancy::Maximum, + }; + + Self { + version: previous.version, + replication_factor: previous.replication_factor, + partition_size, + parameters, + roles, + node_id_vec, + ring_assignment_data: previous.ring_assignation_data, + staging_parameters: Lww::new(parameters), + staging_roles, + staging_hash: [0u8; 32].into(), // will be set in the next migration + } + } + } + + fn multiply_all_capacities( + old_roles: LwwMap, + mul: u64, + ) -> LwwMap { + let mut new_roles = LwwMap::new(); + for (node, ts, role) in old_roles.items() { + let mut role = role.clone(); + if let NodeRoleV(Some(NodeRole { + capacity: Some(ref mut cap), + .. 
+ })) = role + { + *cap *= mul; + } + new_roles.merge_raw(node, *ts, &role); + } + new_roles + } +} + +mod v010 { + use super::v09; + use crate::layout::CompactNodeType; + use garage_util::crdt::{Lww, LwwMap}; + use garage_util::data::Uuid; + use serde::{Deserialize, Serialize}; + use std::collections::BTreeMap; + pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; + + /// Number of old (non-live) versions to keep, see LayoutHistory::old_versions + pub const OLD_VERSION_COUNT: usize = 5; + + /// The history of cluster layouts, with trackers to keep a record + /// of which nodes are up-to-date to current cluster data + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] + pub struct LayoutHistory { + /// The versions currently in use in the cluster + pub versions: Vec, + /// At most 5 of the previous versions, not used by the garage_table + /// module, but usefull for the garage_block module to find data blocks + /// that have not yet been moved + pub old_versions: Vec, + + /// Update trackers + pub update_trackers: UpdateTrackers, + + /// Staged changes for the next version + pub staging: Lww, + } + + /// A version of the layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] + pub struct LayoutVersion { + /// The number of this version + pub version: u64, + + /// Roles assigned to nodes in this version + pub roles: LwwMap, + /// Parameters used to compute the assignment currently given by + /// ring_assignment_data + pub parameters: LayoutParameters, + + /// The number of replicas for each data partition + pub replication_factor: usize, + /// This attribute is only used to retain the previously computed partition size, + /// to know to what extent does it change with the layout update. + pub partition_size: u64, + + /// node_id_vec: a vector of node IDs with a role assigned + /// in the system (this includes gateway nodes). 
+ /// The order here is different than the vec stored by `roles`, because: + /// 1. non-gateway nodes are first so that they have lower numbers + /// 2. nodes that don't have a role are excluded (but they need to + /// stay in the CRDT as tombstones) + pub node_id_vec: Vec, + /// number of non-gateway nodes, which are the first ids in node_id_vec + pub nongateway_node_count: usize, + /// The assignation of data partitions to nodes, the values + /// are indices in node_id_vec + #[serde(with = "serde_bytes")] + pub ring_assignment_data: Vec, + } + + /// The staged changes for the next layout version + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] + pub struct LayoutStaging { + /// Parameters to be used in the next partition assignment computation. + pub parameters: Lww, + /// Role changes which are staged for the next version of the layout + pub roles: LwwMap, + } + + /// The tracker of acknowlegments and data syncs around the cluster + #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] + pub struct UpdateTrackers { + /// The highest layout version number each node has ack'ed + pub ack_map: UpdateTracker, + /// The highest layout version number each node has synced data for + pub sync_map: UpdateTracker, + /// The highest layout version number each node has + /// ack'ed that all other nodes have synced data for + pub sync_ack_map: UpdateTracker, + } + + /// Generic update tracker struct + #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] + pub struct UpdateTracker(pub BTreeMap); + + impl garage_util::migrate::Migrate for LayoutHistory { + const VERSION_MARKER: &'static [u8] = b"G010lh"; + + type Previous = v09::ClusterLayout; + + fn migrate(previous: Self::Previous) -> Self { + let nongateway_node_count = previous + .node_id_vec + .iter() + .enumerate() + .filter(|(_, uuid)| { + let role = previous.roles.get(uuid); + matches!(role, Some(NodeRoleV(Some(role))) if role.capacity.is_some()) + }) + .map(|(i, _)| i + 1) + 
.max() + .unwrap_or(0); + + let version = LayoutVersion { + version: previous.version, + replication_factor: previous.replication_factor, + partition_size: previous.partition_size, + parameters: previous.parameters, + roles: previous.roles, + node_id_vec: previous.node_id_vec, + nongateway_node_count, + ring_assignment_data: previous.ring_assignment_data, + }; + let update_tracker = UpdateTracker( + version + .nongateway_nodes() + .iter() + .copied() + .map(|x| (x, version.version)) + .collect::>(), + ); + let staging = LayoutStaging { + parameters: previous.staging_parameters, + roles: previous.staging_roles, + }; + Self { + versions: vec![version], + old_versions: vec![], + update_trackers: UpdateTrackers { + ack_map: update_tracker.clone(), + sync_map: update_tracker.clone(), + sync_ack_map: update_tracker.clone(), + }, + staging: Lww::raw(previous.version, staging), + } + } + } +} + +pub use v010::*; + +// ---- utility functions ---- + +impl AutoCrdt for LayoutParameters { + const WARN_IF_DIFFERENT: bool = true; +} + +impl AutoCrdt for NodeRoleV { + const WARN_IF_DIFFERENT: bool = true; +} + +impl Crdt for LayoutStaging { + fn merge(&mut self, other: &LayoutStaging) { + self.parameters.merge(&other.parameters); + self.roles.merge(&other.roles); + } +} + +impl NodeRole { + pub fn capacity_string(&self) -> String { + match self.capacity { + Some(c) => ByteSize::b(c).to_string_as(false), + None => "gateway".to_string(), + } + } + + pub fn tags_string(&self) -> String { + self.tags.join(",") + } +} + +impl fmt::Display for ZoneRedundancy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ZoneRedundancy::Maximum => write!(f, "maximum"), + ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), + } + } +} + +impl core::str::FromStr for ZoneRedundancy { + type Err = &'static str; + fn from_str(s: &str) -> Result { + match s { + "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), + x => { + let v = x + .parse::() + .map_err(|_| "zone 
redundancy must be 'none'/'max' or an integer")?; + Ok(ZoneRedundancy::AtLeast(v)) + } + } + } +} + +impl UpdateTracker { + fn merge(&mut self, other: &UpdateTracker) -> bool { + let mut changed = false; + for (k, v) in other.0.iter() { + if let Some(v_mut) = self.0.get_mut(k) { + if *v > *v_mut { + *v_mut = *v; + changed = true; + } + } else { + self.0.insert(*k, *v); + changed = true; + } + } + changed + } + + /// This bumps the update tracker for a given node up to the specified value. + /// This has potential impacts on the correctness of Garage and should only + /// be used in very specific circumstances. + pub fn set_max(&mut self, peer: Uuid, value: u64) -> bool { + match self.0.get_mut(&peer) { + Some(e) if *e < value => { + *e = value; + true + } + None => { + self.0.insert(peer, value); + true + } + _ => false, + } + } + + pub(crate) fn min_among(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 { + storage_nodes + .iter() + .map(|x| self.get(x, min_version)) + .min() + .unwrap_or(min_version) + } + + pub fn get(&self, node: &Uuid, min_version: u64) -> u64 { + self.0.get(node).copied().unwrap_or(min_version) + } +} + +impl UpdateTrackers { + pub(crate) fn merge(&mut self, other: &UpdateTrackers) -> bool { + let c1 = self.ack_map.merge(&other.ack_map); + let c2 = self.sync_map.merge(&other.sync_map); + let c3 = self.sync_ack_map.merge(&other.sync_ack_map); + c1 || c2 || c3 + } +} diff --git a/src/rpc/layout/schema.rs b/src/rpc/layout/schema.rs deleted file mode 100644 index df949906..00000000 --- a/src/rpc/layout/schema.rs +++ /dev/null @@ -1,431 +0,0 @@ -use std::fmt; - -use bytesize::ByteSize; - -use garage_util::crdt::{AutoCrdt, Crdt}; -use garage_util::data::Uuid; - -mod v08 { - use crate::layout::CompactNodeType; - use garage_util::crdt::LwwMap; - use garage_util::data::{Hash, Uuid}; - use serde::{Deserialize, Serialize}; - - /// The layout of the cluster, i.e. 
the list of roles - /// which are assigned to each cluster node - #[derive(Clone, Debug, Serialize, Deserialize)] - pub struct ClusterLayout { - pub version: u64, - - pub replication_factor: usize, - pub roles: LwwMap, - - /// node_id_vec: a vector of node IDs with a role assigned - /// in the system (this includes gateway nodes). - /// The order here is different than the vec stored by `roles`, because: - /// 1. non-gateway nodes are first so that they have lower numbers - /// 2. nodes that don't have a role are excluded (but they need to - /// stay in the CRDT as tombstones) - pub node_id_vec: Vec, - /// the assignation of data partitions to node, the values - /// are indices in node_id_vec - #[serde(with = "serde_bytes")] - pub ring_assignation_data: Vec, - - /// Role changes which are staged for the next version of the layout - pub staging: LwwMap, - pub staging_hash: Hash, - } - - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] - pub struct NodeRoleV(pub Option); - - /// The user-assigned roles of cluster nodes - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] - pub struct NodeRole { - /// Datacenter at which this entry belong. This information is used to - /// perform a better geodistribution - pub zone: String, - /// The capacity of the node - /// If this is set to None, the node does not participate in storing data for the system - /// and is only active as an API gateway to other nodes - pub capacity: Option, - /// A set of tags to recognize the node - pub tags: Vec, - } - - impl garage_util::migrate::InitialFormat for ClusterLayout {} -} - -mod v09 { - use super::v08; - use crate::layout::CompactNodeType; - use garage_util::crdt::{Lww, LwwMap}; - use garage_util::data::{Hash, Uuid}; - use serde::{Deserialize, Serialize}; - pub use v08::{NodeRole, NodeRoleV}; - - /// The layout of the cluster, i.e. 
the list of roles - /// which are assigned to each cluster node - #[derive(Clone, Debug, Serialize, Deserialize)] - pub struct ClusterLayout { - pub version: u64, - - pub replication_factor: usize, - - /// This attribute is only used to retain the previously computed partition size, - /// to know to what extent does it change with the layout update. - pub partition_size: u64, - /// Parameters used to compute the assignment currently given by - /// ring_assignment_data - pub parameters: LayoutParameters, - - pub roles: LwwMap, - - /// see comment in v08::ClusterLayout - pub node_id_vec: Vec, - /// see comment in v08::ClusterLayout - #[serde(with = "serde_bytes")] - pub ring_assignment_data: Vec, - - /// Parameters to be used in the next partition assignment computation. - pub staging_parameters: Lww, - /// Role changes which are staged for the next version of the layout - pub staging_roles: LwwMap, - pub staging_hash: Hash, - } - - /// This struct is used to set the parameters to be used in the assignment computation - /// algorithm. It is stored as a Crdt. - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] - pub struct LayoutParameters { - pub zone_redundancy: ZoneRedundancy, - } - - /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies - /// of each partition on at least that number of different zones. - /// Otherwise, copies will be stored on the maximum possible number of zones. - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] - pub enum ZoneRedundancy { - AtLeast(usize), - Maximum, - } - - impl garage_util::migrate::Migrate for ClusterLayout { - const VERSION_MARKER: &'static [u8] = b"G09layout"; - - type Previous = v08::ClusterLayout; - - fn migrate(previous: Self::Previous) -> Self { - use itertools::Itertools; - - // In the old layout, capacities are in an arbitrary unit, - // but in the new layout they are in bytes. 
- // Here we arbitrarily multiply everything by 1G, - // such that 1 old capacity unit = 1GB in the new units. - // This is totally arbitrary and won't work for most users. - let cap_mul = 1024 * 1024 * 1024; - let roles = multiply_all_capacities(previous.roles, cap_mul); - let staging_roles = multiply_all_capacities(previous.staging, cap_mul); - let node_id_vec = previous.node_id_vec; - - // Determine partition size - let mut tmp = previous.ring_assignation_data.clone(); - tmp.sort(); - let partition_size = tmp - .into_iter() - .dedup_with_count() - .map(|(npart, node)| { - roles - .get(&node_id_vec[node as usize]) - .and_then(|p| p.0.as_ref().and_then(|r| r.capacity)) - .unwrap_or(0) / npart as u64 - }) - .min() - .unwrap_or(0); - - // By default, zone_redundancy is maximum possible value - let parameters = LayoutParameters { - zone_redundancy: ZoneRedundancy::Maximum, - }; - - Self { - version: previous.version, - replication_factor: previous.replication_factor, - partition_size, - parameters, - roles, - node_id_vec, - ring_assignment_data: previous.ring_assignation_data, - staging_parameters: Lww::new(parameters), - staging_roles, - staging_hash: [0u8; 32].into(), // will be set in the next migration - } - } - } - - fn multiply_all_capacities( - old_roles: LwwMap, - mul: u64, - ) -> LwwMap { - let mut new_roles = LwwMap::new(); - for (node, ts, role) in old_roles.items() { - let mut role = role.clone(); - if let NodeRoleV(Some(NodeRole { - capacity: Some(ref mut cap), - .. 
- })) = role - { - *cap *= mul; - } - new_roles.merge_raw(node, *ts, &role); - } - new_roles - } -} - -mod v010 { - use super::v09; - use crate::layout::CompactNodeType; - use garage_util::crdt::{Lww, LwwMap}; - use garage_util::data::Uuid; - use serde::{Deserialize, Serialize}; - use std::collections::BTreeMap; - pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; - - pub const OLD_VERSION_COUNT: usize = 5; - - /// The history of cluster layouts, with trackers to keep a record - /// of which nodes are up-to-date to current cluster data - #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] - pub struct LayoutHistory { - /// The versions currently in use in the cluster - pub versions: Vec, - /// At most 5 of the previous versions, not used by the garage_table - /// module, but usefull for the garage_block module to find data blocks - /// that have not yet been moved - pub old_versions: Vec, - - /// Update trackers - pub update_trackers: UpdateTrackers, - - /// Staged changes for the next version - pub staging: Lww, - } - - /// A version of the layout of the cluster, i.e. the list of roles - /// which are assigned to each cluster node - #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] - pub struct LayoutVersion { - pub version: u64, - - pub replication_factor: usize, - - /// This attribute is only used to retain the previously computed partition size, - /// to know to what extent does it change with the layout update. 
- pub partition_size: u64, - /// Parameters used to compute the assignment currently given by - /// ring_assignment_data - pub parameters: LayoutParameters, - - pub roles: LwwMap, - - /// see comment in v08::ClusterLayout - pub node_id_vec: Vec, - /// number of non-gateway nodes, which are the first ids in node_id_vec - pub nongateway_node_count: usize, - /// see comment in v08::ClusterLayout - #[serde(with = "serde_bytes")] - pub ring_assignment_data: Vec, - } - - /// The staged changes for the next layout version - #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] - pub struct LayoutStaging { - /// Parameters to be used in the next partition assignment computation. - pub parameters: Lww, - /// Role changes which are staged for the next version of the layout - pub roles: LwwMap, - } - - /// The tracker of acknowlegments and data syncs around the cluster - #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] - pub struct UpdateTrackers { - /// The highest layout version number each node has ack'ed - pub ack_map: UpdateTracker, - /// The highest layout version number each node has synced data for - pub sync_map: UpdateTracker, - /// The highest layout version number each node has - /// ack'ed that all other nodes have synced data for - pub sync_ack_map: UpdateTracker, - } - - /// The history of cluster layouts - #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] - pub struct UpdateTracker(pub BTreeMap); - - impl garage_util::migrate::Migrate for LayoutHistory { - const VERSION_MARKER: &'static [u8] = b"G010lh"; - - type Previous = v09::ClusterLayout; - - fn migrate(previous: Self::Previous) -> Self { - let nongateway_node_count = previous - .node_id_vec - .iter() - .enumerate() - .filter(|(_, uuid)| { - let role = previous.roles.get(uuid); - matches!(role, Some(NodeRoleV(Some(role))) if role.capacity.is_some()) - }) - .map(|(i, _)| i + 1) - .max() - .unwrap_or(0); - - let version = LayoutVersion { - version: previous.version, 
- replication_factor: previous.replication_factor, - partition_size: previous.partition_size, - parameters: previous.parameters, - roles: previous.roles, - node_id_vec: previous.node_id_vec, - nongateway_node_count, - ring_assignment_data: previous.ring_assignment_data, - }; - let update_tracker = UpdateTracker( - version - .nongateway_nodes() - .iter() - .copied() - .map(|x| (x, version.version)) - .collect::>(), - ); - let staging = LayoutStaging { - parameters: previous.staging_parameters, - roles: previous.staging_roles, - }; - Self { - versions: vec![version], - old_versions: vec![], - update_trackers: UpdateTrackers { - ack_map: update_tracker.clone(), - sync_map: update_tracker.clone(), - sync_ack_map: update_tracker.clone(), - }, - staging: Lww::raw(previous.version, staging), - } - } - } -} - -pub use v010::*; - -// ---- utility functions ---- - -impl AutoCrdt for LayoutParameters { - const WARN_IF_DIFFERENT: bool = true; -} - -impl AutoCrdt for NodeRoleV { - const WARN_IF_DIFFERENT: bool = true; -} - -impl Crdt for LayoutStaging { - fn merge(&mut self, other: &LayoutStaging) { - self.parameters.merge(&other.parameters); - self.roles.merge(&other.roles); - } -} - -impl NodeRole { - pub fn capacity_string(&self) -> String { - match self.capacity { - Some(c) => ByteSize::b(c).to_string_as(false), - None => "gateway".to_string(), - } - } - - pub fn tags_string(&self) -> String { - self.tags.join(",") - } -} - -impl fmt::Display for ZoneRedundancy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - ZoneRedundancy::Maximum => write!(f, "maximum"), - ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), - } - } -} - -impl core::str::FromStr for ZoneRedundancy { - type Err = &'static str; - fn from_str(s: &str) -> Result { - match s { - "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), - x => { - let v = x - .parse::() - .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; - Ok(ZoneRedundancy::AtLeast(v)) - } - } - 
} -} - -impl UpdateTracker { - fn merge(&mut self, other: &UpdateTracker) -> bool { - let mut changed = false; - for (k, v) in other.0.iter() { - if let Some(v_mut) = self.0.get_mut(k) { - if *v > *v_mut { - *v_mut = *v; - changed = true; - } - } else { - self.0.insert(*k, *v); - changed = true; - } - } - changed - } - - /// This bumps the update tracker for a given node up to the specified value. - /// This has potential impacts on the correctness of Garage and should only - /// be used in very specific circumstances. - pub fn set_max(&mut self, peer: Uuid, value: u64) -> bool { - match self.0.get_mut(&peer) { - Some(e) if *e < value => { - *e = value; - true - } - None => { - self.0.insert(peer, value); - true - } - _ => false, - } - } - - pub(crate) fn min_among(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 { - storage_nodes - .iter() - .map(|x| self.get(x, min_version)) - .min() - .unwrap_or(min_version) - } - - pub fn get(&self, node: &Uuid, min_version: u64) -> u64 { - self.0.get(node).copied().unwrap_or(min_version) - } -} - -impl UpdateTrackers { - pub(crate) fn merge(&mut self, other: &UpdateTrackers) -> bool { - let c1 = self.ack_map.merge(&other.ack_map); - let c2 = self.sync_map.merge(&other.sync_map); - let c3 = self.sync_ack_map.merge(&other.sync_ack_map); - c1 || c2 || c3 - } -} diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index cbfbee94..5b307156 100644 --- a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -10,7 +10,6 @@ use garage_util::data::*; use garage_util::error::*; use super::graph_algo::*; -use super::schema::*; use super::*; // The Message type will be used to collect information on the algorithm. 
-- cgit v1.2.3 From f8df90b79b93e4a1391839435718bad8c697246d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Dec 2023 14:54:11 +0100 Subject: table: fix insert_many to not send duplicates --- src/table/table.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/table/table.rs b/src/table/table.rs index 6508cf5d..59cfdd07 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -196,6 +196,8 @@ impl Table { let hash = entry.partition_key().hash(); let mut write_sets = self.data.replication.write_sets(&hash); for set in write_sets.as_mut().iter_mut() { + // Sort nodes in each write sets to merge write sets with same + // nodes but in possibly different orders set.sort(); } let e_enc = Arc::new(ByteBuf::from(entry.encode()?)); @@ -220,7 +222,16 @@ impl Table { for (write_sets, entry_enc) in entries_vec.iter() { for write_set in write_sets.as_ref().iter() { for node in write_set.iter() { - call_list.entry(*node).or_default().push(entry_enc.clone()) + let node_entries = call_list.entry(*node).or_default(); + match node_entries.last() { + Some(x) if Arc::ptr_eq(x, entry_enc) => { + // skip if entry already in list to send to this node + // (could happen if node is in several write sets for this entry) + } + _ => { + node_entries.push(entry_enc.clone()); + } + } } } } -- cgit v1.2.3 From e4f493b48156e6e30f16fba10f300f6cb5fe0b0d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 11 Dec 2023 14:57:42 +0100 Subject: table: remove redundant tracing in insert_many --- src/table/table.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/table/table.rs b/src/table/table.rs index 59cfdd07..05a0dab1 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -239,9 +239,7 @@ impl Table { // Build futures to actually perform each of the corresponding RPC calls let call_futures = call_list.into_iter().map(|(node, entries)| { let this = self.clone(); - let tracer = opentelemetry::global::tracer("garage"); - 
let span = tracer.start(format!("RPC to {:?}", node)); - let fut = async move { + async move { let rpc = TableRpc::::Update(entries); let resp = this .system @@ -254,8 +252,7 @@ impl Table { ) .await; (node, resp) - }; - fut.with_context(Context::current_with_span(span)) + } }); // Run all requests in parallel thanks to FuturesUnordered, and collect results. -- cgit v1.2.3 From 85b5a6bcd11c0a7651e4c589569e1935a3d18e46 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 11 Dec 2023 15:31:47 +0100 Subject: fix some clippy lints --- src/api/admin/cluster.rs | 2 +- src/rpc/layout/helper.rs | 2 +- src/rpc/layout/history.rs | 14 ++++++-------- src/rpc/layout/mod.rs | 2 +- src/rpc/layout/version.rs | 6 +++--- src/rpc/rpc_helper.rs | 2 +- src/rpc/system.rs | 2 +- src/table/sync.rs | 8 ++++---- src/table/table.rs | 3 +-- 9 files changed, 19 insertions(+), 22 deletions(-) diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 3ce1b254..8677257d 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -94,7 +94,7 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result>(); + let mut nodes = nodes.into_values().collect::>(); nodes.sort_by(|x, y| x.id.cmp(&y.id)); let res = GetClusterStatusResponse { diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 2ba010b8..7e5d37e9 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -129,7 +129,7 @@ impl LayoutHelper { where F: FnOnce(&mut LayoutHistory) -> bool, { - let changed = f(&mut self.layout.as_mut().unwrap()); + let changed = f(self.layout.as_mut().unwrap()); if changed { *self = Self::new( self.replication_mode, diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index a53256cc..23196aee 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -42,8 +42,7 @@ impl LayoutHistory { let set = self .versions .iter() - .map(|x| x.all_nodes()) - .flatten() + .flat_map(|x| x.all_nodes()) .collect::>(); 
set.into_iter().copied().collect::>() } @@ -56,8 +55,7 @@ impl LayoutHistory { let set = self .versions .iter() - .map(|x| x.nongateway_nodes()) - .flatten() + .flat_map(|x| x.nongateway_nodes()) .collect::>(); set.into_iter().copied().collect::>() } @@ -94,7 +92,7 @@ impl LayoutHistory { let sync_ack_map_min = self .update_trackers .sync_ack_map - .min_among(¤t_nodes, min_version); + .min_among(current_nodes, min_version); if self.min_stored() < sync_ack_map_min { let removed = self.versions.remove(0); info!( @@ -144,7 +142,7 @@ impl LayoutHistory { let global_min = self .update_trackers .sync_map - .min_among(&all_nongateway_nodes, min_version); + .min_among(all_nongateway_nodes, min_version); // If the write quorums are equal to the total number of nodes, // i.e. no writes can succeed while they are not written to all nodes, @@ -281,7 +279,7 @@ To know the correct value of the new layout version, invoke `garage layout show` let (new_version, msg) = self .current() .clone() - .calculate_next_version(&self.staging.get())?; + .calculate_next_version(self.staging.get())?; self.versions.push(new_version); self.cleanup_old_versions(); @@ -297,7 +295,7 @@ To know the correct value of the new layout version, invoke `garage layout show` pub fn revert_staged_changes(mut self) -> Result { self.staging.update(LayoutStaging { - parameters: Lww::new(self.current().parameters.clone()), + parameters: Lww::new(self.current().parameters), roles: LwwMap::new(), }); diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index facdb2ce..162e3c6e 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -357,7 +357,7 @@ mod v010 { update_trackers: UpdateTrackers { ack_map: update_tracker.clone(), sync_map: update_tracker.clone(), - sync_ack_map: update_tracker.clone(), + sync_ack_map: update_tracker, }, staging: Lww::raw(previous.version, staging), } diff --git a/src/rpc/layout/version.rs b/src/rpc/layout/version.rs index 5b307156..ee4b2821 100644 --- 
a/src/rpc/layout/version.rs +++ b/src/rpc/layout/version.rs @@ -137,19 +137,19 @@ impl LayoutVersion { // ===================== internal information extractors ====================== pub(crate) fn expect_get_node_capacity(&self, uuid: &Uuid) -> u64 { - self.get_node_capacity(&uuid) + self.get_node_capacity(uuid) .expect("non-gateway node with zero capacity") } pub(crate) fn expect_get_node_zone(&self, uuid: &Uuid) -> &str { - self.get_node_zone(&uuid).expect("node without a zone") + self.get_node_zone(uuid).expect("node without a zone") } /// Returns the sum of capacities of non gateway nodes in the cluster fn get_total_capacity(&self) -> u64 { let mut total_capacity = 0; for uuid in self.nongateway_nodes() { - total_capacity += self.expect_get_node_capacity(&uuid); + total_capacity += self.expect_get_node_capacity(uuid); } total_capacity } diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 65af8901..77a36ca1 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -442,7 +442,7 @@ impl RpcHelper { // Send one request to each peer of the quorum sets let msg = msg.into_req().map_err(netapp::error::Error::from)?; - let requests = result_tracker.nodes.iter().map(|(peer, _)| { + let requests = result_tracker.nodes.keys().map(|peer| { let self2 = self.clone(); let msg = msg.clone(); let endpoint2 = endpoint.clone(); diff --git a/src/rpc/system.rs b/src/rpc/system.rs index a8f12852..41d76177 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -315,7 +315,7 @@ impl System { local_status: ArcSwap::new(Arc::new(local_status)), node_status: RwLock::new(HashMap::new()), netapp: netapp.clone(), - fullmesh: fullmesh.clone(), + fullmesh, system_endpoint, replication_mode, replication_factor, diff --git a/src/table/sync.rs b/src/table/sync.rs index cfcbc4b5..1561a2e5 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -123,15 +123,15 @@ impl TableSyncer { let mut sync_futures = result_tracker .nodes - .iter() - .map(|(node, _)| *node) + .keys() 
+ .copied() .map(|node| { let must_exit = must_exit.clone(); async move { if node == my_id { (node, Ok(())) } else { - (node, self.do_sync_with(&partition, node, must_exit).await) + (node, self.do_sync_with(partition, node, must_exit).await) } } }) @@ -145,7 +145,7 @@ impl TableSyncer { } if result_tracker.too_many_failures() { - return Err(result_tracker.quorum_error()); + Err(result_tracker.quorum_error()) } else { Ok(()) } diff --git a/src/table/table.rs b/src/table/table.rs index 05a0dab1..a5be2910 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -209,8 +209,7 @@ impl Table { // it takes part, to optimize the detection of a quorum. let mut write_sets = entries_vec .iter() - .map(|(wss, _)| wss.as_ref().iter().map(|ws| ws.as_slice())) - .flatten() + .flat_map(|(wss, _)| wss.as_ref().iter().map(|ws| ws.as_slice())) .collect::>(); write_sets.sort(); write_sets.dedup(); -- cgit v1.2.3 From adccce1145d5d82581e4a5da707be35badb2d5a6 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 11 Dec 2023 15:45:14 +0100 Subject: layout: refactor/fix bad while loop --- src/rpc/layout/history.rs | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 23196aee..b8cc27da 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -86,23 +86,20 @@ impl LayoutHistory { // remove them (keep them in self.old_versions). // ASSUMPTION: we only care about where nodes in the current layout version // are reading from, as we assume older nodes are being discarded. 
- while self.versions.len() > 1 { - let current_nodes = &self.current().node_id_vec; - let min_version = self.min_stored(); - let sync_ack_map_min = self - .update_trackers - .sync_ack_map - .min_among(current_nodes, min_version); - if self.min_stored() < sync_ack_map_min { - let removed = self.versions.remove(0); - info!( - "Layout history: moving version {} to old_versions", - removed.version - ); - self.old_versions.push(removed); - } else { - break; - } + let current_nodes = &self.current().node_id_vec; + let min_version = self.min_stored(); + let sync_ack_map_min = self + .update_trackers + .sync_ack_map + .min_among(current_nodes, min_version); + while self.min_stored() < sync_ack_map_min { + assert!(self.versions.len() > 1); + let removed = self.versions.remove(0); + info!( + "Layout history: moving version {} to old_versions", + removed.version + ); + self.old_versions.push(removed); } while self.old_versions.len() > OLD_VERSION_COUNT { -- cgit v1.2.3 From 0041b013a473e3ae72f50209d8f79db75a72848b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 11 Dec 2023 16:09:22 +0100 Subject: layout: refactoring and fix in layout helper --- src/rpc/layout/helper.rs | 43 +++++++++++++++++++++++----------------- src/rpc/layout/manager.rs | 2 +- src/rpc/layout/mod.rs | 2 +- src/rpc/rpc_helper.rs | 2 +- src/rpc/system.rs | 4 ++-- src/table/replication/sharded.rs | 2 +- src/table/sync.rs | 16 +++++++-------- 7 files changed, 38 insertions(+), 33 deletions(-) diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 7e5d37e9..9fb738ea 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -10,7 +10,7 @@ use super::*; use crate::replication_mode::ReplicationMode; #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] -pub struct LayoutDigest { +pub struct RpcLayoutDigest { /// Cluster layout version pub current_version: u64, /// Number of active layout versions @@ -21,6 +21,13 @@ pub struct LayoutDigest { pub staging_hash: 
Hash, } +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub struct SyncLayoutDigest { + current: u64, + ack_map_min: u64, + min_stored: u64, +} + pub struct LayoutHelper { replication_mode: ReplicationMode, layout: Option, @@ -150,20 +157,20 @@ impl LayoutHelper { &self.all_nongateway_nodes } - pub fn all_ack(&self) -> u64 { + pub fn ack_map_min(&self) -> u64 { self.ack_map_min } - pub fn all_sync(&self) -> u64 { + pub fn sync_map_min(&self) -> u64 { self.sync_map_min } - pub fn sync_versions(&self) -> (u64, u64, u64) { - ( - self.layout().current().version, - self.all_ack(), - self.layout().min_stored(), - ) + pub fn sync_digest(&self) -> SyncLayoutDigest { + SyncLayoutDigest { + current: self.layout().current().version, + ack_map_min: self.ack_map_min(), + min_stored: self.layout().min_stored(), + } } pub fn read_nodes_of(&self, position: &Hash) -> Vec { @@ -206,8 +213,8 @@ impl LayoutHelper { self.staging_hash } - pub fn digest(&self) -> LayoutDigest { - LayoutDigest { + pub fn digest(&self) -> RpcLayoutDigest { + RpcLayoutDigest { current_version: self.current().version, active_versions: self.versions.len(), trackers_hash: self.trackers_hash, @@ -231,13 +238,13 @@ impl LayoutHelper { // 3. 
Acknowledge everyone has synced up to min(self.sync_map) self.sync_ack(local_node_id); - info!("ack_map: {:?}", self.update_trackers.ack_map); - info!("sync_map: {:?}", self.update_trackers.sync_map); - info!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); + debug!("ack_map: {:?}", self.update_trackers.ack_map); + debug!("sync_map: {:?}", self.update_trackers.sync_map); + debug!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); } fn sync_first(&mut self, local_node_id: Uuid) { - let first_version = self.versions.first().as_ref().unwrap().version; + let first_version = self.min_stored(); self.update(|layout| { layout .update_trackers @@ -275,13 +282,13 @@ impl LayoutHelper { .versions .iter() .map(|x| x.version) - .take_while(|v| { + .skip_while(|v| { self.ack_lock .get(v) .map(|x| x.load(Ordering::Relaxed) == 0) .unwrap_or(true) }) - .max() - .unwrap_or(self.min_stored()) + .next() + .unwrap_or(self.current().version) } } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index ec8a2a15..6747b79d 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -256,7 +256,7 @@ impl LayoutManager { // ---- RPC HANDLERS ---- - pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, remote: &LayoutDigest) { + pub(crate) fn handle_advertise_status(self: &Arc, from: Uuid, remote: &RpcLayoutDigest) { let local = self.layout().digest(); if remote.current_version > local.current_version || remote.active_versions != local.active_versions diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs index 162e3c6e..33676c37 100644 --- a/src/rpc/layout/mod.rs +++ b/src/rpc/layout/mod.rs @@ -17,7 +17,7 @@ pub mod manager; // ---- re-exports ---- -pub use helper::{LayoutDigest, LayoutHelper}; +pub use helper::{LayoutHelper, RpcLayoutDigest, SyncLayoutDigest}; pub use manager::WriteLock; pub use version::*; diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 77a36ca1..ae3a19c4 100644 --- a/src/rpc/rpc_helper.rs +++ 
b/src/rpc/rpc_helper.rs @@ -502,7 +502,7 @@ impl RpcHelper { .rev() .chain(layout.old_versions.iter().rev()); for ver in ver_iter { - if ver.version > layout.all_sync() { + if ver.version > layout.sync_map_min() { continue; } let nodes = ver.nodes_of(position, ver.replication_factor); diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 41d76177..83cc6816 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -34,7 +34,7 @@ use crate::consul::ConsulDiscovery; #[cfg(feature = "kubernetes-discovery")] use crate::kubernetes::*; use crate::layout::{ - self, manager::LayoutManager, LayoutDigest, LayoutHelper, LayoutHistory, NodeRoleV, + self, manager::LayoutManager, LayoutHelper, LayoutHistory, NodeRoleV, RpcLayoutDigest, }; use crate::replication_mode::*; use crate::rpc_helper::*; @@ -132,7 +132,7 @@ pub struct NodeStatus { pub replication_factor: usize, /// Cluster layout digest - pub layout_digest: LayoutDigest, + pub layout_digest: RpcLayoutDigest, /// Disk usage on partition containing metadata directory (tuple: `(avail, total)`) #[serde(default)] diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 55d0029d..8ba3700f 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -54,7 +54,7 @@ impl TableReplication for TableShardedReplication { fn sync_partitions(&self) -> SyncPartitions { let layout = self.system.cluster_layout(); - let layout_version = layout.all_ack(); + let layout_version = layout.ack_map_min(); let mut partitions = layout .current() diff --git a/src/table/sync.rs b/src/table/sync.rs index 1561a2e5..cd080df0 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -83,7 +83,7 @@ impl TableSyncer { bg.spawn_worker(SyncWorker { syncer: self.clone(), layout_notify: self.system.layout_notify(), - layout_versions: self.system.cluster_layout().sync_versions(), + layout_digest: self.system.cluster_layout().sync_digest(), add_full_sync_rx, todo: None, next_full_sync: Instant::now() + 
Duration::from_secs(20), @@ -483,7 +483,7 @@ struct SyncWorker { syncer: Arc>, layout_notify: Arc, - layout_versions: (u64, u64, u64), + layout_digest: SyncLayoutDigest, add_full_sync_rx: mpsc::UnboundedReceiver<()>, next_full_sync: Instant, @@ -493,15 +493,13 @@ struct SyncWorker { impl SyncWorker { fn check_add_full_sync(&mut self) { - let layout_versions = self.syncer.system.cluster_layout().sync_versions(); - if layout_versions != self.layout_versions { - self.layout_versions = layout_versions; + let layout_digest = self.syncer.system.cluster_layout().sync_digest(); + if layout_digest != self.layout_digest { + self.layout_digest = layout_digest; info!( - "({}) Layout versions changed (max={}, ack={}, min stored={}), adding full sync to syncer todo list", + "({}) Layout versions changed ({:?}), adding full sync to syncer todo list", F::TABLE_NAME, - layout_versions.0, - layout_versions.1, - layout_versions.2 + layout_digest, ); self.add_full_sync(); } -- cgit v1.2.3 From db48dd3d6c1f9e86a62e9b8edfce2c1620bcd5f3 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 11 Jan 2024 12:05:51 +0100 Subject: bump crate versions to 0.10.0 --- Cargo.lock | 18 ++++---- Cargo.nix | 120 +++++++++++++++++++++++++------------------------- Cargo.toml | 16 +++---- src/api/Cargo.toml | 2 +- src/block/Cargo.toml | 2 +- src/db/Cargo.toml | 2 +- src/garage/Cargo.toml | 2 +- src/model/Cargo.toml | 2 +- src/rpc/Cargo.toml | 2 +- src/table/Cargo.toml | 2 +- src/util/Cargo.toml | 2 +- src/web/Cargo.toml | 2 +- 12 files changed, 86 insertions(+), 86 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7135a4de..e9a82d04 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1198,7 +1198,7 @@ dependencies = [ [[package]] name = "garage" -version = "0.9.0" +version = "0.10.0" dependencies = [ "assert-json-diff", "async-trait", @@ -1249,7 +1249,7 @@ dependencies = [ [[package]] name = "garage_api" -version = "0.9.0" +version = "0.10.0" dependencies = [ "async-trait", "base64 0.21.3", @@ -1295,7 
+1295,7 @@ dependencies = [ [[package]] name = "garage_block" -version = "0.9.0" +version = "0.10.0" dependencies = [ "arc-swap", "async-compression", @@ -1321,7 +1321,7 @@ dependencies = [ [[package]] name = "garage_db" -version = "0.9.0" +version = "0.10.0" dependencies = [ "clap 4.4.0", "err-derive", @@ -1336,7 +1336,7 @@ dependencies = [ [[package]] name = "garage_model" -version = "0.9.0" +version = "0.10.0" dependencies = [ "arc-swap", "async-trait", @@ -1364,7 +1364,7 @@ dependencies = [ [[package]] name = "garage_rpc" -version = "0.9.0" +version = "0.10.0" dependencies = [ "arc-swap", "async-trait", @@ -1399,7 +1399,7 @@ dependencies = [ [[package]] name = "garage_table" -version = "0.9.0" +version = "0.10.0" dependencies = [ "arc-swap", "async-trait", @@ -1421,7 +1421,7 @@ dependencies = [ [[package]] name = "garage_util" -version = "0.9.0" +version = "0.10.0" dependencies = [ "arc-swap", "async-trait", @@ -1455,7 +1455,7 @@ dependencies = [ [[package]] name = "garage_web" -version = "0.9.0" +version = "0.10.0" dependencies = [ "err-derive", "futures", diff --git a/Cargo.nix b/Cargo.nix index cd8270b6..ab983987 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -33,7 +33,7 @@ args@{ ignoreLockHash, }: let - nixifiedLockHash = "1a87886681a3ef0b83c95addc26674a538b8a93d35bc80db8998e1fcd0821f6c"; + nixifiedLockHash = "9946c45969e70c13413d3474740963f4cdb2a8a00199daa23a1bd152c186b319"; workspaceSrc = if args.workspaceSrc == null then ./. 
else args.workspaceSrc; currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock); lockHashIgnored = if ignoreLockHash @@ -57,16 +57,16 @@ in { cargo2nixVersion = "0.11.0"; workspace = { - garage_db = rustPackages.unknown.garage_db."0.9.0"; - garage_util = rustPackages.unknown.garage_util."0.9.0"; - garage_rpc = rustPackages.unknown.garage_rpc."0.9.0"; + garage_db = rustPackages.unknown.garage_db."0.10.0"; + garage_util = rustPackages.unknown.garage_util."0.10.0"; + garage_rpc = rustPackages.unknown.garage_rpc."0.10.0"; format_table = rustPackages.unknown.format_table."0.1.1"; - garage_table = rustPackages.unknown.garage_table."0.9.0"; - garage_block = rustPackages.unknown.garage_block."0.9.0"; - garage_model = rustPackages.unknown.garage_model."0.9.0"; - garage_api = rustPackages.unknown.garage_api."0.9.0"; - garage_web = rustPackages.unknown.garage_web."0.9.0"; - garage = rustPackages.unknown.garage."0.9.0"; + garage_table = rustPackages.unknown.garage_table."0.10.0"; + garage_block = rustPackages.unknown.garage_block."0.10.0"; + garage_model = rustPackages.unknown.garage_model."0.10.0"; + garage_api = rustPackages.unknown.garage_api."0.10.0"; + garage_web = rustPackages.unknown.garage_web."0.10.0"; + garage = rustPackages.unknown.garage."0.10.0"; k2v-client = rustPackages.unknown.k2v-client."0.0.4"; }; "registry+https://github.com/rust-lang/crates.io-index".addr2line."0.21.0" = overridableMkRustCrate (profileName: rec { @@ -1705,9 +1705,9 @@ in }; }); - "unknown".garage."0.9.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage."0.10.0" = overridableMkRustCrate (profileName: rec { name = "garage"; - version = "0.9.0"; + version = "0.10.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/garage"); features = builtins.concatLists [ @@ -1734,14 +1734,14 @@ in format_table = (rustPackages."unknown".format_table."0.1.1" { inherit profileName; }).out; futures = 
(rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.28" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.28" { inherit profileName; }).out; - garage_api = (rustPackages."unknown".garage_api."0.9.0" { inherit profileName; }).out; - garage_block = (rustPackages."unknown".garage_block."0.9.0" { inherit profileName; }).out; - garage_db = (rustPackages."unknown".garage_db."0.9.0" { inherit profileName; }).out; - garage_model = (rustPackages."unknown".garage_model."0.9.0" { inherit profileName; }).out; - garage_rpc = (rustPackages."unknown".garage_rpc."0.9.0" { inherit profileName; }).out; - garage_table = (rustPackages."unknown".garage_table."0.9.0" { inherit profileName; }).out; - garage_util = (rustPackages."unknown".garage_util."0.9.0" { inherit profileName; }).out; - garage_web = (rustPackages."unknown".garage_web."0.9.0" { inherit profileName; }).out; + garage_api = (rustPackages."unknown".garage_api."0.10.0" { inherit profileName; }).out; + garage_block = (rustPackages."unknown".garage_block."0.10.0" { inherit profileName; }).out; + garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out; + garage_model = (rustPackages."unknown".garage_model."0.10.0" { inherit profileName; }).out; + garage_rpc = (rustPackages."unknown".garage_rpc."0.10.0" { inherit profileName; }).out; + garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out; + garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out; + garage_web = (rustPackages."unknown".garage_web."0.10.0" { inherit profileName; }).out; git_version = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".git-version."0.3.5" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; sodiumoxide = 
(rustPackages."registry+https://github.com/rust-lang/crates.io-index".kuska-sodiumoxide."0.2.5-0" { inherit profileName; }).out; @@ -1777,9 +1777,9 @@ in }; }); - "unknown".garage_api."0.9.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_api."0.10.0" = overridableMkRustCrate (profileName: rec { name = "garage_api"; - version = "0.9.0"; + version = "0.10.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/api"); features = builtins.concatLists [ @@ -1798,11 +1798,11 @@ in form_urlencoded = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".form_urlencoded."1.2.0" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.28" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.28" { inherit profileName; }).out; - garage_block = (rustPackages."unknown".garage_block."0.9.0" { inherit profileName; }).out; - garage_model = (rustPackages."unknown".garage_model."0.9.0" { inherit profileName; }).out; - garage_rpc = (rustPackages."unknown".garage_rpc."0.9.0" { inherit profileName; }).out; - garage_table = (rustPackages."unknown".garage_table."0.9.0" { inherit profileName; }).out; - garage_util = (rustPackages."unknown".garage_util."0.9.0" { inherit profileName; }).out; + garage_block = (rustPackages."unknown".garage_block."0.10.0" { inherit profileName; }).out; + garage_model = (rustPackages."unknown".garage_model."0.10.0" { inherit profileName; }).out; + garage_rpc = (rustPackages."unknown".garage_rpc."0.10.0" { inherit profileName; }).out; + garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out; + garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; hmac = 
(rustPackages."registry+https://github.com/rust-lang/crates.io-index".hmac."0.12.1" { inherit profileName; }).out; http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."0.2.9" { inherit profileName; }).out; @@ -1832,9 +1832,9 @@ in }; }); - "unknown".garage_block."0.9.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_block."0.10.0" = overridableMkRustCrate (profileName: rec { name = "garage_block"; - version = "0.9.0"; + version = "0.10.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/block"); features = builtins.concatLists [ @@ -1848,10 +1848,10 @@ in bytesize = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytesize."1.3.0" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.28" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.28" { inherit profileName; }).out; - garage_db = (rustPackages."unknown".garage_db."0.9.0" { inherit profileName; }).out; - garage_rpc = (rustPackages."unknown".garage_rpc."0.9.0" { inherit profileName; }).out; - garage_table = (rustPackages."unknown".garage_table."0.9.0" { inherit profileName; }).out; - garage_util = (rustPackages."unknown".garage_util."0.9.0" { inherit profileName; }).out; + garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out; + garage_rpc = (rustPackages."unknown".garage_rpc."0.10.0" { inherit profileName; }).out; + garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out; + garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; 
}).out; rand = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.5" { inherit profileName; }).out; @@ -1864,9 +1864,9 @@ in }; }); - "unknown".garage_db."0.9.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_db."0.10.0" = overridableMkRustCrate (profileName: rec { name = "garage_db"; - version = "0.9.0"; + version = "0.10.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/db"); features = builtins.concatLists [ @@ -1896,9 +1896,9 @@ in }; }); - "unknown".garage_model."0.9.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_model."0.10.0" = overridableMkRustCrate (profileName: rec { name = "garage_model"; - version = "0.9.0"; + version = "0.10.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/model"); features = builtins.concatLists [ @@ -1917,11 +1917,11 @@ in err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.28" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.28" { inherit profileName; }).out; - garage_block = (rustPackages."unknown".garage_block."0.9.0" { inherit profileName; }).out; - garage_db = (rustPackages."unknown".garage_db."0.9.0" { inherit profileName; }).out; - garage_rpc = (rustPackages."unknown".garage_rpc."0.9.0" { inherit profileName; }).out; - garage_table = (rustPackages."unknown".garage_table."0.9.0" { inherit profileName; }).out; - garage_util = (rustPackages."unknown".garage_util."0.9.0" { inherit profileName; }).out; + garage_block = (rustPackages."unknown".garage_block."0.10.0" { inherit profileName; }).out; + garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out; + garage_rpc = (rustPackages."unknown".garage_rpc."0.10.0" { inherit 
profileName; }).out; + garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out; + garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; netapp = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".netapp."0.10.0" { inherit profileName; }).out; opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out; @@ -1934,9 +1934,9 @@ in }; }); - "unknown".garage_rpc."0.9.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_rpc."0.10.0" = overridableMkRustCrate (profileName: rec { name = "garage_rpc"; - version = "0.9.0"; + version = "0.10.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/rpc"); features = builtins.concatLists [ @@ -1958,8 +1958,8 @@ in format_table = (rustPackages."unknown".format_table."0.1.1" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.28" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.28" { inherit profileName; }).out; - garage_db = (rustPackages."unknown".garage_db."0.9.0" { inherit profileName; }).out; - garage_util = (rustPackages."unknown".garage_util."0.9.0" { inherit profileName; }).out; + garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out; + garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out; gethostname = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".gethostname."0.4.3" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; itertools = 
(rustPackages."registry+https://github.com/rust-lang/crates.io-index".itertools."0.10.5" { inherit profileName; }).out; @@ -1982,9 +1982,9 @@ in }; }); - "unknown".garage_table."0.9.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_table."0.10.0" = overridableMkRustCrate (profileName: rec { name = "garage_table"; - version = "0.9.0"; + version = "0.10.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/table"); dependencies = { @@ -1993,9 +1993,9 @@ in bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.4.0" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.28" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.28" { inherit profileName; }).out; - garage_db = (rustPackages."unknown".garage_db."0.9.0" { inherit profileName; }).out; - garage_rpc = (rustPackages."unknown".garage_rpc."0.9.0" { inherit profileName; }).out; - garage_util = (rustPackages."unknown".garage_util."0.9.0" { inherit profileName; }).out; + garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out; + garage_rpc = (rustPackages."unknown".garage_rpc."0.10.0" { inherit profileName; }).out; + garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; hexdump = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hexdump."0.1.1" { inherit profileName; }).out; opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out; @@ -2007,9 +2007,9 @@ in }; }); - "unknown".garage_util."0.9.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_util."0.10.0" = overridableMkRustCrate (profileName: rec { name = 
"garage_util"; - version = "0.9.0"; + version = "0.10.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/util"); features = builtins.concatLists [ @@ -2025,7 +2025,7 @@ in digest = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".digest."0.10.7" { inherit profileName; }).out; err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.28" { inherit profileName; }).out; - garage_db = (rustPackages."unknown".garage_db."0.9.0" { inherit profileName; }).out; + garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; hexdump = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hexdump."0.1.1" { inherit profileName; }).out; http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."0.2.9" { inherit profileName; }).out; @@ -2051,18 +2051,18 @@ in }; }); - "unknown".garage_web."0.9.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_web."0.10.0" = overridableMkRustCrate (profileName: rec { name = "garage_web"; - version = "0.9.0"; + version = "0.10.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/web"); dependencies = { err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.28" { inherit profileName; }).out; - garage_api = (rustPackages."unknown".garage_api."0.9.0" { inherit profileName; }).out; - garage_model = (rustPackages."unknown".garage_model."0.9.0" { inherit profileName; }).out; - garage_table = (rustPackages."unknown".garage_table."0.9.0" { 
inherit profileName; }).out; - garage_util = (rustPackages."unknown".garage_util."0.9.0" { inherit profileName; }).out; + garage_api = (rustPackages."unknown".garage_api."0.10.0" { inherit profileName; }).out; + garage_model = (rustPackages."unknown".garage_model."0.10.0" { inherit profileName; }).out; + garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out; + garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out; http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."0.2.9" { inherit profileName; }).out; hyper = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hyper."0.14.27" { inherit profileName; }).out; hyperlocal = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hyperlocal."0.8.0" { inherit profileName; }).out; diff --git a/Cargo.toml b/Cargo.toml index e3d111c3..5982859c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,14 +18,14 @@ default-members = ["src/garage"] [workspace.dependencies] format_table = { version = "0.1.1", path = "src/format-table" } -garage_api = { version = "0.9.0", path = "src/api" } -garage_block = { version = "0.9.0", path = "src/block" } -garage_db = { version = "0.9.0", path = "src/db", default-features = false } -garage_model = { version = "0.9.0", path = "src/model", default-features = false } -garage_rpc = { version = "0.9.0", path = "src/rpc" } -garage_table = { version = "0.9.0", path = "src/table" } -garage_util = { version = "0.9.0", path = "src/util" } -garage_web = { version = "0.9.0", path = "src/web" } +garage_api = { version = "0.10.0", path = "src/api" } +garage_block = { version = "0.10.0", path = "src/block" } +garage_db = { version = "0.10.0", path = "src/db", default-features = false } +garage_model = { version = "0.10.0", path = "src/model", default-features = false } +garage_rpc = { version = "0.10.0", path = "src/rpc" } +garage_table = { version = "0.10.0", path = "src/table" 
} +garage_util = { version = "0.10.0", path = "src/util" } +garage_web = { version = "0.10.0", path = "src/web" } k2v-client = { version = "0.0.4", path = "src/k2v-client" } [profile.dev] diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index e8cbc1c8..15bf757e 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_api" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/block/Cargo.toml b/src/block/Cargo.toml index f6aa5f64..e4265cbe 100644 --- a/src/block/Cargo.toml +++ b/src/block/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_block" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml index 67af4a7c..530f1966 100644 --- a/src/db/Cargo.toml +++ b/src/db/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_db" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 7c3a79cb..dce7ea73 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 42b7ffdb..124b84b0 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_model" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index f450718f..e19f80a8 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_rpc" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml index 
08ccec72..62cffac7 100644 --- a/src/table/Cargo.toml +++ b/src/table/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_table" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index 6554ac13..afc4d3c3 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_util" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/web/Cargo.toml b/src/web/Cargo.toml index 88cf1486..9f7720da 100644 --- a/src/web/Cargo.toml +++ b/src/web/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_web" -version = "0.9.0" +version = "0.10.0" authors = ["Alex Auvolat ", "Quentin Dufour "] edition = "2018" license = "AGPL-3.0" -- cgit v1.2.3 From 75e591727d9cfda0133200604872a38419c178a1 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 20 Feb 2024 17:08:31 +0100 Subject: [next-0.10] cluster node status metrics: report nodes of all active layout versions --- src/rpc/system_metrics.rs | 68 +++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/src/rpc/system_metrics.rs b/src/rpc/system_metrics.rs index fb3c983c..0bb55bf3 100644 --- a/src/rpc/system_metrics.rs +++ b/src/rpc/system_metrics.rs @@ -216,12 +216,12 @@ impl SystemMetrics { .u64_value_observer("cluster_layout_node_connected", move |observer| { let layout = system.cluster_layout(); let nodes = system.get_known_nodes(); - for (id, _, config) in layout.current().roles.items().iter() { - if let Some(role) = &config.0 { - let mut kv = vec![ - KeyValue::new("id", format!("{:?}", id)), - KeyValue::new("role_zone", role.zone.clone()), - ]; + for id in layout.all_nodes().iter() { + let mut kv = vec![KeyValue::new("id", format!("{:?}", id))]; + if let Some(role) = + layout.current().roles.get(id).and_then(|r| r.0.as_ref()) + { + kv.push(KeyValue::new("role_zone", 
role.zone.clone())); match role.capacity { Some(cap) => { kv.push(KeyValue::new("role_capacity", cap as i64)); @@ -231,24 +231,24 @@ impl SystemMetrics { kv.push(KeyValue::new("role_gateway", 1)); } } + } - let value; - if let Some(node) = nodes.iter().find(|n| n.id == *id) { - value = if node.is_up { 1 } else { 0 }; + let value; + if let Some(node) = nodes.iter().find(|n| n.id == *id) { // TODO: if we add address and hostname, and those change, we // get duplicate metrics, due to bad otel aggregation :( // Can probably be fixed when we upgrade opentelemetry // kv.push(KeyValue::new("address", node.addr.to_string())); // kv.push(KeyValue::new( - // "hostname", - // node.status.hostname.clone(), + // "hostname", + // node.status.hostname.clone(), // )); - } else { - value = 0; - } - - observer.observe(value, &kv); + value = if node.is_up { 1 } else { 0 }; + } else { + value = 0; } + + observer.observe(value, &kv); } }) .with_description("Connection status for nodes in the cluster layout") @@ -260,12 +260,12 @@ impl SystemMetrics { .u64_value_observer("cluster_layout_node_disconnected_time", move |observer| { let layout = system.cluster_layout(); let nodes = system.get_known_nodes(); - for (id, _, config) in layout.current().roles.items().iter() { - if let Some(role) = &config.0 { - let mut kv = vec![ - KeyValue::new("id", format!("{:?}", id)), - KeyValue::new("role_zone", role.zone.clone()), - ]; + for id in layout.all_nodes().iter() { + let mut kv = vec![KeyValue::new("id", format!("{:?}", id))]; + if let Some(role) = + layout.current().roles.get(id).and_then(|r| r.0.as_ref()) + { + kv.push(KeyValue::new("role_zone", role.zone.clone())); match role.capacity { Some(cap) => { kv.push(KeyValue::new("role_capacity", cap as i64)); @@ -275,19 +275,19 @@ impl SystemMetrics { kv.push(KeyValue::new("role_gateway", 1)); } } + } - if let Some(node) = nodes.iter().find(|n| n.id == *id) { - // TODO: see comment above - // kv.push(KeyValue::new("address", 
node.addr.to_string())); - // kv.push(KeyValue::new( - // "hostname", - // node.status.hostname.clone(), - // )); - if node.is_up { - observer.observe(0, &kv); - } else if let Some(secs) = node.last_seen_secs_ago { - observer.observe(secs, &kv); - } + if let Some(node) = nodes.iter().find(|n| n.id == *id) { + // TODO: see comment above + // kv.push(KeyValue::new("address", node.addr.to_string())); + // kv.push(KeyValue::new( + // "hostname", + // node.status.hostname.clone(), + // )); + if node.is_up { + observer.observe(0, &kv); + } else if let Some(secs) = node.last_seen_secs_ago { + observer.observe(secs, &kv); } } } -- cgit v1.2.3 From 81cebdd12415381f67747e96591e83b1a4a8cc0b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 22 Feb 2024 15:53:47 +0100 Subject: [next-0.10] fix build --- src/rpc/system.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rpc/system.rs b/src/rpc/system.rs index e8844f29..1c668306 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -520,7 +520,7 @@ impl System { } }; - let hostname = self.local_status.read().unwrap().hostname.clone(); + let hostname = self.local_status.read().unwrap().hostname.clone().unwrap(); if let Err(e) = c .publish_consul_service(self.netapp.id, &hostname, rpc_public_addr) .await @@ -544,7 +544,7 @@ impl System { } }; - let hostname = self.local_status.read().unwrap().hostname.clone(); + let hostname = self.local_status.read().unwrap().hostname.clone().unwrap(); if let Err(e) = publish_kubernetes_node(k, self.netapp.id, &hostname, rpc_public_addr).await { error!("Error while publishing node to Kubernetes: {}", e); -- cgit v1.2.3 From d0d95fd53f3d4a6fd5adcfbb4cbb031826fd64a4 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 27 Feb 2024 10:13:09 +0100 Subject: [next-0.10] woodpecker: run debug pipeline on manual trigger --- .woodpecker/debug.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.woodpecker/debug.yaml b/.woodpecker/debug.yaml index 59ae8c06..3a3ab7f6 
100644 --- a/.woodpecker/debug.yaml +++ b/.woodpecker/debug.yaml @@ -5,6 +5,7 @@ when: - pull_request - deployment - cron + - manual steps: - name: check formatting -- cgit v1.2.3 From 6760895926c23112583dfc53a01ecbcfad02a276 Mon Sep 17 00:00:00 2001 From: Yureka Date: Mon, 4 Mar 2024 18:37:00 +0100 Subject: refactor: remove max_write_errors and max_faults --- src/model/garage.rs | 1 - src/rpc/replication_mode.rs | 7 ------- src/table/replication/fullcopy.rs | 12 +++++------- src/table/replication/parameters.rs | 1 - src/table/replication/sharded.rs | 3 --- 5 files changed, 5 insertions(+), 19 deletions(-) diff --git a/src/model/garage.rs b/src/model/garage.rs index fe38a760..561aca8f 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -247,7 +247,6 @@ impl Garage { let control_rep_param = TableFullReplication { system: system.clone(), - max_faults: replication_mode.control_write_max_faults(), }; info!("Initialize block manager..."); diff --git a/src/rpc/replication_mode.rs b/src/rpc/replication_mode.rs index 2f7e2fec..b142ea10 100644 --- a/src/rpc/replication_mode.rs +++ b/src/rpc/replication_mode.rs @@ -21,13 +21,6 @@ impl ReplicationMode { } } - pub fn control_write_max_faults(&self) -> usize { - match self { - Self::None => 0, - _ => 1, - } - } - pub fn replication_factor(&self) -> usize { match self { Self::None => 1, diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index 30122f39..1e52bb47 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -21,8 +21,6 @@ use crate::replication::*; pub struct TableFullReplication { /// The membership manager of this node pub system: Arc, - /// Max number of faults allowed while replicating a record - pub max_faults: usize, } impl TableReplication for TableFullReplication { @@ -45,15 +43,15 @@ impl TableReplication for TableFullReplication { } fn write_quorum(&self) -> usize { let nmembers = 
self.system.cluster_layout().current().all_nodes().len(); - if nmembers > self.max_faults { - nmembers - self.max_faults + + let max_faults = if nmembers > 1 { 1 } else { 0 }; + + if nmembers > max_faults { + nmembers - max_faults } else { 1 } } - fn max_write_errors(&self) -> usize { - self.max_faults - } fn partition_of(&self, _hash: &Hash) -> Partition { 0u16 diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index 78470f35..682c1ea6 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -20,7 +20,6 @@ pub trait TableReplication: Send + Sync + 'static { fn write_sets(&self, hash: &Hash) -> Self::WriteSets; /// Responses needed to consider a write succesfull in each set fn write_quorum(&self) -> usize; - fn max_write_errors(&self) -> usize; // Accessing partitions, for Merkle tree & sync /// Get partition for data with given hash diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 8ba3700f..e0245949 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -44,9 +44,6 @@ impl TableReplication for TableShardedReplication { fn write_quorum(&self) -> usize { self.write_quorum } - fn max_write_errors(&self) -> usize { - self.replication_factor - self.write_quorum - } fn partition_of(&self, hash: &Hash) -> Partition { self.system.cluster_layout().current().partition_of(hash) -- cgit v1.2.3 From 8f86af52ed917bce506989ae1f378d977aa6c3ef Mon Sep 17 00:00:00 2001 From: Yureka Date: Mon, 4 Mar 2024 18:42:17 +0100 Subject: adjust docs for replication factor --- doc/book/cookbook/real-world.md | 2 +- doc/book/operations/layout.md | 2 +- doc/book/quick-start/_index.md | 2 +- doc/book/reference-manual/configuration.md | 129 +++++++++++++++++------------ doc/book/reference-manual/features.md | 6 +- 5 files changed, 81 insertions(+), 60 deletions(-) diff --git a/doc/book/cookbook/real-world.md b/doc/book/cookbook/real-world.md index 
c15ea384..cb10b550 100644 --- a/doc/book/cookbook/real-world.md +++ b/doc/book/cookbook/real-world.md @@ -116,7 +116,7 @@ metadata_dir = "/var/lib/garage/meta" data_dir = "/var/lib/garage/data" db_engine = "lmdb" -replication_mode = "3" +replication_factor = 3 compression_level = 2 diff --git a/doc/book/operations/layout.md b/doc/book/operations/layout.md index cf1372b0..667e89d2 100644 --- a/doc/book/operations/layout.md +++ b/doc/book/operations/layout.md @@ -12,7 +12,7 @@ An introduction to building cluster layouts can be found in the [production depl In Garage, all of the data that can be stored in a given cluster is divided into slices which we call *partitions*. Each partition is stored by one or several nodes in the cluster -(see [`replication_mode`](@/documentation/reference-manual/configuration.md#replication_mode)). +(see [`replication_factor`](@/documentation/reference-manual/configuration.md#replication_factor)). The layout determines the correspondence between these partitions, which exist on a logical level, and actual storage nodes. 
diff --git a/doc/book/quick-start/_index.md b/doc/book/quick-start/_index.md index f359843d..be9fe329 100644 --- a/doc/book/quick-start/_index.md +++ b/doc/book/quick-start/_index.md @@ -59,7 +59,7 @@ metadata_dir = "/tmp/meta" data_dir = "/tmp/data" db_engine = "lmdb" -replication_mode = "none" +replication_factor = 1 rpc_bind_addr = "[::]:3901" rpc_public_addr = "127.0.0.1:3901" diff --git a/doc/book/reference-manual/configuration.md b/doc/book/reference-manual/configuration.md index af7690f4..580e9fbc 100644 --- a/doc/book/reference-manual/configuration.md +++ b/doc/book/reference-manual/configuration.md @@ -8,7 +8,8 @@ weight = 20 Here is an example `garage.toml` configuration file that illustrates all of the possible options: ```toml -replication_mode = "3" +replication_factor = 3 +consistency_mode = "consistent" metadata_dir = "/var/lib/garage/meta" data_dir = "/var/lib/garage/data" @@ -90,7 +91,8 @@ Top-level configuration options: [`lmdb_map_size`](#lmdb_map_size), [`metadata_dir`](#metadata_dir), [`metadata_fsync`](#metadata_fsync), -[`replication_mode`](#replication_mode), +[`replication_factor`](#replication_factor), +[`consistency_mode`](#consistency_mode), [`rpc_bind_addr`](#rpc_bind_addr), [`rpc_bind_outgoing`](#rpc_bind_outgoing), [`rpc_public_addr`](#rpc_public_addr), @@ -133,11 +135,12 @@ The `[admin]` section: ### Top-level configuration options -#### `replication_mode` {#replication_mode} +#### `replication_factor` {#replication_factor} -Garage supports the following replication modes: +The replication factor can be any positive integer smaller or equal the node count in your cluster. +The chosen replication factor has a big impact on the cluster's failure tolerancy and performance characteristics. -- `none` or `1`: data stored on Garage is stored on a single node. There is no +- `1`: data stored on Garage is stored on a single node. There is no redundancy, and data will be unavailable as soon as one node fails or its network is disconnected. 
Do not use this for anything else than test deployments. @@ -148,17 +151,6 @@ Garage supports the following replication modes: before losing data. Data remains available in read-only mode when one node is down, but write operations will fail. - - `2-dangerous`: a variant of mode `2`, where written objects are written to - the second replica asynchronously. This means that Garage will return `200 - OK` to a PutObject request before the second copy is fully written (or even - before it even starts being written). This means that data can more easily - be lost if the node crashes before a second copy can be completed. This - also means that written objects might not be visible immediately in read - operations. In other words, this mode severely breaks the consistency and - durability guarantees of standard Garage cluster operation. Benefits of - this mode: you can still write to your cluster when one node is - unavailable. - - `3`: data stored on Garage will be stored on three different nodes, if possible each in a different zones. Garage tolerates two node failure, or several node failures but in no more than two zones (in a deployment with at @@ -166,55 +158,84 @@ Garage supports the following replication modes: or node failures are only in a single zone, reading and writing data to Garage can continue normally. - - `3-degraded`: a variant of replication mode `3`, that lowers the read +- `5`, `7`, ...: When setting the replication factor above 3, it is most useful to + choose an uneven value, since for every two copies added, one more node can fail + before losing the ability to write and read to the cluster. + +Note that in modes `2` and `3`, +if at least the same number of zones are available, an arbitrary number of failures in +any given zone is tolerated as copies of data will be spread over several zones. + +**Make sure `replication_factor` is the same in the configuration files of all nodes. 
+Never run a Garage cluster where that is not the case.** + +It is technically possible to change the replication factor although it's a +dangerous operation that is not officially supported. This requires you to +delete the existing cluster layout and create a new layout from scratch, +meaning that a full rebalancing of your cluster's data will be needed. To do +it, shut down your cluster entirely, delete the `custer_layout` files in the +meta directories of all your nodes, update all your configuration files with +the new `replication_factor` parameter, restart your cluster, and then create a +new layout with all the nodes you want to keep. Rebalancing data will take +some time, and data might temporarily appear unavailable to your users. +It is recommended to shut down public access to the cluster while rebalancing +is in progress. In theory, no data should be lost as rebalancing is a +routine operation for Garage, although we cannot guarantee you that everything + will go right in such an extreme scenario. + +#### `consistency_mode` {#consistency_mode} + +The consistency mode setting determines the read and write behaviour of your cluster. + + - `consistent`: The default setting. This is what the paragraph above describes. + The read and write quorum will be determined so that read-after-write consistency + is guaranteed. + - `degraded`: Lowers the read quorum to `1`, to allow you to read data from your cluster when several nodes (or nodes in several zones) are unavailable. In this mode, Garage - does not provide read-after-write consistency anymore. The write quorum is - still 2, ensuring that data successfully written to Garage is stored on at - least two nodes. - - - `3-dangerous`: a variant of replication mode `3` that lowers both the read + does not provide read-after-write consistency anymore. 
+ The write quorum stays the same as in the `consistent` mode, ensuring that + data successfully written to Garage is stored on multiple nodes (depending on + the replication factor). + - `dangerous`: This mode lowers both the read and write quorums to `1`, to allow you to both read and write to your cluster when several nodes (or nodes in several zones) are unavailable. It is the least consistent mode of operation proposed by Garage, and also one that should probably never be used. -Note that in modes `2` and `3`, -if at least the same number of zones are available, an arbitrary number of failures in -any given zone is tolerated as copies of data will be spread over several zones. +Changing the `consistency_mode` between modes while leaving the `replication_factor` untouched +(e.g. setting your node's `consistency_mode` to `degraded` when it was previously unset, or from +`dangerous` to `consistent`), can be done easily by just changing the `consistency_mode` +parameter in your config files and restarting all your Garage nodes. -**Make sure `replication_mode` is the same in the configuration files of all nodes. -Never run a Garage cluster where that is not the case.** +The consistency mode can be used together with various replication factors, to achieve +a wide range of read and write characteristics. Some examples: + + - Replication factor `2`, consistency mode `degraded`: While this mode + technically exists, its properties are the same as with consistency mode `consistent`, + since the read quorum with replication factor `2`, consistency mode `consistent` is already 1. + + - Replication factor `2`, consistency mode `dangerous`: written objects are written to + the second replica asynchronously. This means that Garage will return `200 + OK` to a PutObject request before the second copy is fully written (or even + before it even starts being written). This means that data can more easily + be lost if the node crashes before a second copy can be completed.
This + also means that written objects might not be visible immediately in read + operations. In other words, this configuration severely breaks the consistency and + durability guarantees of standard Garage cluster operation. Benefits of + this configuration: you can still write to your cluster when one node is + unavailable. The quorums associated with each replication mode are described below: -| `replication_mode` | Number of replicas | Write quorum | Read quorum | Read-after-write consistency? | -| ------------------ | ------------------ | ------------ | ----------- | ----------------------------- | -| `none` or `1` | 1 | 1 | 1 | yes | -| `2` | 2 | 2 | 1 | yes | -| `2-dangerous` | 2 | 1 | 1 | NO | -| `3` | 3 | 2 | 2 | yes | -| `3-degraded` | 3 | 2 | 1 | NO | -| `3-dangerous` | 3 | 1 | 1 | NO | - -Changing the `replication_mode` between modes with the same number of replicas -(e.g. from `3` to `3-degraded`, or from `2-dangerous` to `2`), can be done easily by -just changing the `replication_mode` parameter in your config files and restarting all your -Garage nodes. - -It is also technically possible to change the replication mode to a mode with a -different numbers of replicas, although it's a dangerous operation that is not -officially supported. This requires you to delete the existing cluster layout -and create a new layout from scratch, meaning that a full rebalancing of your -cluster's data will be needed. To do it, shut down your cluster entirely, -delete the `custer_layout` files in the meta directories of all your nodes, -update all your configuration files with the new `replication_mode` parameter, -restart your cluster, and then create a new layout with all the nodes you want -to keep. Rebalancing data will take some time, and data might temporarily -appear unavailable to your users. It is recommended to shut down public access -to the cluster while rebalancing is in progress. 
In theory, no data should be -lost as rebalancing is a routine operation for Garage, although we cannot -guarantee you that everything will go right in such an extreme scenario. +| `consistency_mode` | `replication_factor` | Write quorum | Read quorum | Read-after-write consistency? | +| ------------------ | -------------------- | ------------ | ----------- | ----------------------------- | +| `consistent` | 1 | 1 | 1 | yes | +| `consistent` | 2 | 2 | 1 | yes | +| `dangerous` | 2 | 1 | 1 | NO | +| `consistent` | 3 | 2 | 2 | yes | +| `degraded` | 3 | 2 | 1 | NO | +| `dangerous` | 3 | 1 | 1 | NO | #### `metadata_dir` {#metadata_dir} diff --git a/doc/book/reference-manual/features.md b/doc/book/reference-manual/features.md index f7014b26..34f692cc 100644 --- a/doc/book/reference-manual/features.md +++ b/doc/book/reference-manual/features.md @@ -39,10 +39,10 @@ Read about cluster layout management [here](@/documentation/operations/layout.md ### Several replication modes -Garage supports a variety of replication modes, with 1 copy, 2 copies or 3 copies of your data, +Garage supports a variety of replication modes, with configurable replica count, and with various levels of consistency, in order to adapt to a variety of usage scenarios. -Read our reference page on [supported replication modes](@/documentation/reference-manual/configuration.md#replication_mode) -to select the replication mode best suited to your use case (hint: in most cases, `replication_mode = "3"` is what you want). +Read our reference page on [supported replication modes](@/documentation/reference-manual/configuration.md#replication_factor) +to select the replication mode best suited to your use case (hint: in most cases, `replication_factor = 3` is what you want). 
### Compression and deduplication -- cgit v1.2.3 From c1769bbe69f723fb3980cf4fdac7615cfb782720 Mon Sep 17 00:00:00 2001 From: Yureka Date: Mon, 4 Mar 2024 19:58:32 +0100 Subject: ReplicationMode -> ConsistencyMode+ReplicationFactor --- src/garage/secrets.rs | 6 +- src/garage/tests/common/garage.rs | 2 +- src/model/garage.rs | 23 ++++---- src/rpc/layout/helper.rs | 18 +++--- src/rpc/layout/history.rs | 10 ++-- src/rpc/layout/manager.rs | 27 +++++---- src/rpc/layout/test.rs | 3 +- src/rpc/replication_mode.rs | 119 +++++++++++++++++++++++++------------- src/rpc/system.rs | 22 +++---- src/rpc/system_metrics.rs | 2 +- src/util/config.rs | 27 ++++++--- 11 files changed, 158 insertions(+), 101 deletions(-) diff --git a/src/garage/secrets.rs b/src/garage/secrets.rs index c3d704aa..8d2ff475 100644 --- a/src/garage/secrets.rs +++ b/src/garage/secrets.rs @@ -163,7 +163,7 @@ mod tests { r#" metadata_dir = "/tmp/garage/meta" data_dir = "/tmp/garage/data" - replication_mode = "3" + replication_factor = 3 rpc_bind_addr = "[::]:3901" rpc_secret_file = "{}" @@ -185,7 +185,7 @@ mod tests { r#" metadata_dir = "/tmp/garage/meta" data_dir = "/tmp/garage/data" - replication_mode = "3" + replication_factor = 3 rpc_bind_addr = "[::]:3901" rpc_secret_file = "{}" allow_world_readable_secrets = true @@ -296,7 +296,7 @@ mod tests { r#" metadata_dir = "/tmp/garage/meta" data_dir = "/tmp/garage/data" - replication_mode = "3" + replication_factor = 3 rpc_bind_addr = "[::]:3901" rpc_secret= "dummy" rpc_secret_file = "dummy" diff --git a/src/garage/tests/common/garage.rs b/src/garage/tests/common/garage.rs index ebc82f37..f1c1efc8 100644 --- a/src/garage/tests/common/garage.rs +++ b/src/garage/tests/common/garage.rs @@ -54,7 +54,7 @@ metadata_dir = "{path}/meta" data_dir = "{path}/data" db_engine = "lmdb" -replication_mode = "1" +replication_factor = 1 rpc_bind_addr = "127.0.0.1:{rpc_port}" rpc_public_addr = "127.0.0.1:{rpc_port}" diff --git a/src/model/garage.rs b/src/model/garage.rs index 
561aca8f..19f58077 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -9,7 +9,7 @@ use garage_util::config::*; use garage_util::error::*; use garage_util::persister::PersisterShared; -use garage_rpc::replication_mode::ReplicationMode; +use garage_rpc::replication_mode::*; use garage_rpc::system::System; use garage_block::manager::*; @@ -39,8 +39,8 @@ pub struct Garage { /// The set of background variables that can be viewed/modified at runtime pub bg_vars: vars::BgVars, - /// The replication mode of this cluster - pub replication_mode: ReplicationMode, + /// The replication factor of this cluster + pub replication_factor: ReplicationFactor, /// The local database pub db: db::Db, @@ -222,27 +222,26 @@ impl Garage { .and_then(|x| NetworkKey::from_slice(&x)) .ok_or_message("Invalid RPC secret key")?; - let replication_mode = ReplicationMode::parse(&config.replication_mode) - .ok_or_message("Invalid replication_mode in config file.")?; + let (replication_factor, consistency_mode) = parse_replication_mode(&config)?; info!("Initialize background variable system..."); let mut bg_vars = vars::BgVars::new(); info!("Initialize membership management system..."); - let system = System::new(network_key, replication_mode, &config)?; + let system = System::new(network_key, replication_factor, consistency_mode, &config)?; let data_rep_param = TableShardedReplication { system: system.clone(), - replication_factor: replication_mode.replication_factor(), - write_quorum: replication_mode.write_quorum(), + replication_factor: replication_factor.into(), + write_quorum: replication_factor.write_quorum(consistency_mode), read_quorum: 1, }; let meta_rep_param = TableShardedReplication { system: system.clone(), - replication_factor: replication_mode.replication_factor(), - write_quorum: replication_mode.write_quorum(), - read_quorum: replication_mode.read_quorum(), + replication_factor: replication_factor.into(), + write_quorum: replication_factor.write_quorum(consistency_mode), 
+ read_quorum: replication_factor.read_quorum(consistency_mode), }; let control_rep_param = TableFullReplication { @@ -338,7 +337,7 @@ impl Garage { Ok(Arc::new(Self { config, bg_vars, - replication_mode, + replication_factor, db, system, block_manager, diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 9fb738ea..2835347a 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use garage_util::data::*; use super::*; -use crate::replication_mode::ReplicationMode; +use crate::replication_mode::*; #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] pub struct RpcLayoutDigest { @@ -29,7 +29,8 @@ pub struct SyncLayoutDigest { } pub struct LayoutHelper { - replication_mode: ReplicationMode, + replication_factor: ReplicationFactor, + consistency_mode: ConsistencyMode, layout: Option, // cached values @@ -57,7 +58,8 @@ impl Deref for LayoutHelper { impl LayoutHelper { pub fn new( - replication_mode: ReplicationMode, + replication_factor: ReplicationFactor, + consistency_mode: ConsistencyMode, mut layout: LayoutHistory, mut ack_lock: HashMap, ) -> Self { @@ -66,7 +68,7 @@ impl LayoutHelper { // correct and we have rapid access to important values such as // the layout versions to use when reading to ensure consistency. - if !replication_mode.is_read_after_write_consistent() { + if consistency_mode != ConsistencyMode::Consistent { // Fast path for when no consistency is required. // In this case we only need to keep the last version of the layout, // we don't care about coordinating stuff in the cluster. @@ -103,7 +105,7 @@ impl LayoutHelper { // This value is calculated using quorums to allow progress even // if not all nodes have successfully completed a sync. 
let sync_map_min = - layout.calculate_sync_map_min_with_quorum(replication_mode, &all_nongateway_nodes); + layout.calculate_sync_map_min_with_quorum(replication_factor, &all_nongateway_nodes); let trackers_hash = layout.calculate_trackers_hash(); let staging_hash = layout.calculate_staging_hash(); @@ -114,7 +116,8 @@ impl LayoutHelper { .or_insert(AtomicUsize::new(0)); LayoutHelper { - replication_mode, + replication_factor, + consistency_mode, layout: Some(layout), ack_map_min, sync_map_min, @@ -139,7 +142,8 @@ impl LayoutHelper { let changed = f(self.layout.as_mut().unwrap()); if changed { *self = Self::new( - self.replication_mode, + self.replication_factor, + self.consistency_mode, self.layout.take().unwrap(), std::mem::take(&mut self.ack_lock), ); diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index b8cc27da..290f058d 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -6,11 +6,11 @@ use garage_util::encode::nonversioned_encode; use garage_util::error::*; use super::*; -use crate::replication_mode::ReplicationMode; +use crate::replication_mode::*; impl LayoutHistory { - pub fn new(replication_factor: usize) -> Self { - let version = LayoutVersion::new(replication_factor); + pub fn new(replication_factor: ReplicationFactor) -> Self { + let version = LayoutVersion::new(replication_factor.into()); let staging = LayoutStaging { parameters: Lww::::new(version.parameters), @@ -119,7 +119,7 @@ impl LayoutHistory { pub(crate) fn calculate_sync_map_min_with_quorum( &self, - replication_mode: ReplicationMode, + replication_factor: ReplicationFactor, all_nongateway_nodes: &[Uuid], ) -> u64 { // This function calculates the minimum layout version from which @@ -133,7 +133,7 @@ impl LayoutHistory { return self.current().version; } - let quorum = replication_mode.write_quorum(); + let quorum = replication_factor.write_quorum(ConsistencyMode::Consistent); let min_version = self.min_stored(); let global_min = self diff --git 
a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 0b6c7e63..8a6eb1c3 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -14,13 +14,13 @@ use garage_util::error::*; use garage_util::persister::Persister; use super::*; -use crate::replication_mode::ReplicationMode; +use crate::replication_mode::*; use crate::rpc_helper::*; use crate::system::*; pub struct LayoutManager { node_id: Uuid, - replication_mode: ReplicationMode, + replication_factor: ReplicationFactor, persist_cluster_layout: Persister, layout: Arc>, @@ -38,20 +38,19 @@ impl LayoutManager { node_id: NodeID, system_endpoint: Arc>, peering: Arc, - replication_mode: ReplicationMode, + replication_factor: ReplicationFactor, + consistency_mode: ConsistencyMode, ) -> Result, Error> { - let replication_factor = replication_mode.replication_factor(); - let persist_cluster_layout: Persister = Persister::new(&config.metadata_dir, "cluster_layout"); let cluster_layout = match persist_cluster_layout.load() { Ok(x) => { - if x.current().replication_factor != replication_mode.replication_factor() { + if x.current().replication_factor != replication_factor.replication_factor() { return Err(Error::Message(format!( "Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). 
The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", x.current().replication_factor, - replication_factor + replication_factor.replication_factor() ))); } x @@ -65,8 +64,12 @@ impl LayoutManager { } }; - let mut cluster_layout = - LayoutHelper::new(replication_mode, cluster_layout, Default::default()); + let mut cluster_layout = LayoutHelper::new( + replication_factor, + consistency_mode, + cluster_layout, + Default::default(), + ); cluster_layout.update_trackers(node_id.into()); let layout = Arc::new(RwLock::new(cluster_layout)); @@ -81,7 +84,7 @@ impl LayoutManager { Ok(Arc::new(Self { node_id: node_id.into(), - replication_mode, + replication_factor, persist_cluster_layout, layout, change_notify, @@ -295,11 +298,11 @@ impl LayoutManager { adv.update_trackers ); - if adv.current().replication_factor != self.replication_mode.replication_factor() { + if adv.current().replication_factor != self.replication_factor.replication_factor() { let msg = format!( "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.", adv.current().replication_factor, - self.replication_mode.replication_factor() + self.replication_factor.replication_factor() ); error!("{}", msg); return Err(Error::Message(msg)); diff --git a/src/rpc/layout/test.rs b/src/rpc/layout/test.rs index 88eb518e..fcbb9dfc 100644 --- a/src/rpc/layout/test.rs +++ b/src/rpc/layout/test.rs @@ -5,6 +5,7 @@ use garage_util::crdt::Crdt; use garage_util::error::*; use crate::layout::*; +use crate::replication_mode::ReplicationFactor; // This function checks that the partition size S computed is at least better than the // one given by a very naive algorithm. 
To do so, we try to run the naive algorithm @@ -120,7 +121,7 @@ fn test_assignment() { let mut node_capacity_vec = vec![4000, 1000, 2000]; let mut node_zone_vec = vec!["A", "B", "C"]; - let mut cl = LayoutHistory::new(3); + let mut cl = LayoutHistory::new(ReplicationFactor::new(3).unwrap()); update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3); let v = cl.current().version; let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); diff --git a/src/rpc/replication_mode.rs b/src/rpc/replication_mode.rs index b142ea10..a3a94085 100644 --- a/src/rpc/replication_mode.rs +++ b/src/rpc/replication_mode.rs @@ -1,57 +1,94 @@ -#[derive(Clone, Copy)] -pub enum ReplicationMode { - None, - TwoWay, - TwoWayDangerous, - ThreeWay, - ThreeWayDegraded, - ThreeWayDangerous, +use garage_util::config::Config; +use garage_util::crdt::AutoCrdt; +use garage_util::error::*; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[serde(transparent)] +pub struct ReplicationFactor(usize); + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Default, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ConsistencyMode { + /// Read- and Write-quorum are 1 + Dangerous, + /// Read-quorum is 1 + Degraded, + /// Read- and Write-quorum are determined for read-after-write-consistency + #[default] + Consistent, +} + +impl ConsistencyMode { + pub fn parse(s: &str) -> Option { + serde_json::from_value(serde_json::Value::String(s.to_string())).ok() + } +} + +impl AutoCrdt for ConsistencyMode { + const WARN_IF_DIFFERENT: bool = true; } -impl ReplicationMode { - pub fn parse(v: &str) -> Option { - match v { - "none" | "1" => Some(Self::None), - "2" => Some(Self::TwoWay), - "2-dangerous" => Some(Self::TwoWayDangerous), - "3" => Some(Self::ThreeWay), - "3-degraded" => Some(Self::ThreeWayDegraded), - "3-dangerous" => Some(Self::ThreeWayDangerous), - _ => None, +impl ReplicationFactor { + pub fn 
new(replication_factor: usize) -> Option { + if replication_factor < 1 { + None + } else { + Some(Self(replication_factor)) } } pub fn replication_factor(&self) -> usize { - match self { - Self::None => 1, - Self::TwoWay | Self::TwoWayDangerous => 2, - Self::ThreeWay | Self::ThreeWayDegraded | Self::ThreeWayDangerous => 3, - } + self.0 } - pub fn read_quorum(&self) -> usize { - match self { - Self::None => 1, - Self::TwoWay | Self::TwoWayDangerous => 1, - Self::ThreeWay => 2, - Self::ThreeWayDegraded | Self::ThreeWayDangerous => 1, + pub fn read_quorum(&self, consistency_mode: ConsistencyMode) -> usize { + match consistency_mode { + ConsistencyMode::Dangerous | ConsistencyMode::Degraded => 1, + ConsistencyMode::Consistent => self.replication_factor().div_ceil(2), } } - pub fn write_quorum(&self) -> usize { - match self { - Self::None => 1, - Self::TwoWay => 2, - Self::TwoWayDangerous => 1, - Self::ThreeWay | Self::ThreeWayDegraded => 2, - Self::ThreeWayDangerous => 1, + pub fn write_quorum(&self, consistency_mode: ConsistencyMode) -> usize { + match consistency_mode { + ConsistencyMode::Dangerous => 1, + ConsistencyMode::Degraded | ConsistencyMode::Consistent => { + (self.replication_factor() + 1) - self.read_quorum(ConsistencyMode::Consistent) + } } } +} - pub fn is_read_after_write_consistent(&self) -> bool { - match self { - Self::None | Self::TwoWay | Self::ThreeWay => true, - _ => false, - } +impl std::convert::From for usize { + fn from(replication_factor: ReplicationFactor) -> usize { + replication_factor.0 } } + +pub fn parse_replication_mode( + config: &Config, +) -> Result<(ReplicationFactor, ConsistencyMode), Error> { + match (&config.replication_mode, config.replication_factor, config.consistency_mode.as_str()) { + (Some(replication_mode), None, "consistent") => { + tracing::warn!("Legacy config option replication_mode in use. 
Please migrate to replication_factor and consistency_mode"); + let parsed_replication_mode = match replication_mode.as_str() { + "1" | "none" => Some((ReplicationFactor(1), ConsistencyMode::Consistent)), + "2" => Some((ReplicationFactor(2), ConsistencyMode::Consistent)), + "2-dangerous" => Some((ReplicationFactor(2), ConsistencyMode::Dangerous)), + "3" => Some((ReplicationFactor(3), ConsistencyMode::Consistent)), + "3-degraded" => Some((ReplicationFactor(3), ConsistencyMode::Degraded)), + "3-dangerous" => Some((ReplicationFactor(3), ConsistencyMode::Dangerous)), + _ => None, + }; + Some(parsed_replication_mode.ok_or_message("Invalid replication_mode in config file.")?) + }, + (None, Some(replication_factor), consistency_mode) => { + let replication_factor = ReplicationFactor::new(replication_factor) + .ok_or_message("Invalid replication_factor in config file.")?; + let consistency_mode = ConsistencyMode::parse(consistency_mode) + .ok_or_message("Invalid consistency_mode in config file.")?; + Some((replication_factor, consistency_mode)) + } + _ => None, + }.ok_or_message("Either the legacy replication_mode or replication_level and consistency_mode can be set, not both.") +} diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 1c668306..54d589d2 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -112,8 +112,7 @@ pub struct System { metrics: ArcSwapOption, - replication_mode: ReplicationMode, - pub(crate) replication_factor: usize, + pub(crate) replication_factor: ReplicationFactor, /// Path to metadata directory pub metadata_dir: PathBuf, @@ -243,7 +242,8 @@ impl System { /// Create this node's membership manager pub fn new( network_key: NetworkKey, - replication_mode: ReplicationMode, + replication_factor: ReplicationFactor, + consistency_mode: ConsistencyMode, config: &Config, ) -> Result, Error> { // ---- setup netapp RPC protocol ---- @@ -274,14 +274,13 @@ impl System { let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list"); // ---- 
setup cluster layout and layout manager ---- - let replication_factor = replication_mode.replication_factor(); - let layout_manager = LayoutManager::new( config, netapp.id, system_endpoint.clone(), peering.clone(), - replication_mode, + replication_factor, + consistency_mode, )?; let mut local_status = NodeStatus::initial(replication_factor, &layout_manager); @@ -315,7 +314,6 @@ impl System { netapp: netapp.clone(), peering: peering.clone(), system_endpoint, - replication_mode, replication_factor, rpc_listen_addr: config.rpc_bind_addr, rpc_public_addr, @@ -427,7 +425,9 @@ impl System { } pub fn health(&self) -> ClusterHealth { - let quorum = self.replication_mode.write_quorum(); + let quorum = self + .replication_factor + .write_quorum(ConsistencyMode::Consistent); // Gather information about running nodes. // Technically, `nodes` contains currently running nodes, as well @@ -631,7 +631,7 @@ impl System { .count(); let not_configured = self.cluster_layout().check().is_err(); - let no_peers = n_connected < self.replication_factor; + let no_peers = n_connected < self.replication_factor.into(); let expected_n_nodes = self.cluster_layout().all_nodes().len(); let bad_peers = n_connected != expected_n_nodes; @@ -774,14 +774,14 @@ impl EndpointHandler for System { } impl NodeStatus { - fn initial(replication_factor: usize, layout_manager: &LayoutManager) -> Self { + fn initial(replication_factor: ReplicationFactor, layout_manager: &LayoutManager) -> Self { NodeStatus { hostname: Some( gethostname::gethostname() .into_string() .unwrap_or_else(|_| "".to_string()), ), - replication_factor, + replication_factor: replication_factor.into(), layout_digest: layout_manager.layout().digest(), meta_disk_avail: None, data_disk_avail: None, diff --git a/src/rpc/system_metrics.rs b/src/rpc/system_metrics.rs index 0bb55bf3..a64daec8 100644 --- a/src/rpc/system_metrics.rs +++ b/src/rpc/system_metrics.rs @@ -68,7 +68,7 @@ impl SystemMetrics { let replication_factor = 
system.replication_factor; meter .u64_value_observer("garage_replication_factor", move |observer| { - observer.observe(replication_factor as u64, &[]) + observer.observe(replication_factor.replication_factor() as u64, &[]) }) .with_description("Garage replication factor setting") .init() diff --git a/src/util/config.rs b/src/util/config.rs index 056c625d..b7f27676 100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -30,12 +30,20 @@ pub struct Config { )] pub block_size: usize, - /// Replication mode. Supported values: - /// - none, 1 -> no replication - /// - 2 -> 2-way replication - /// - 3 -> 3-way replication - // (we can add more aliases for this later) - pub replication_mode: String, + /// Number of replicas. Can be any positive integer, but uneven numbers are more favorable. + /// - 1 for single-node clusters, or to disable replication + /// - 3 is the recommended and supported setting. + #[serde(default)] + pub replication_factor: Option, + + /// Consistency mode for all for requests through this node + /// - Degraded -> Disable read quorum + /// - Dangerous -> Disable read and write quorum + #[serde(default = "default_consistency_mode")] + pub consistency_mode: String, + + /// Legacy option + pub replication_mode: Option, /// Zstd compression level used on data blocks #[serde( @@ -244,10 +252,15 @@ fn default_sled_cache_capacity() -> usize { fn default_sled_flush_every_ms() -> u64 { 2000 } + fn default_block_size() -> usize { 1048576 } +fn default_consistency_mode() -> String { + "consistent".into() +} + fn default_compression() -> Option { Some(1) } @@ -359,7 +372,7 @@ mod tests { r#" metadata_dir = "/tmp/garage/meta" data_dir = "/tmp/garage/data" - replication_mode = "3" + replication_factor = 3 rpc_bind_addr = "[::]:3901" rpc_secret = "foo" -- cgit v1.2.3 From 57acc60082089e1a24fd47588f6ff3cb20ed4eef Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 23 Feb 2024 16:49:50 +0100 Subject: [sse-c] Implement SSE-C encryption --- Cargo.lock | 105 
++++++++ Cargo.nix | 133 +++++++++- Cargo.toml | 1 + src/api/Cargo.toml | 3 + src/api/s3/copy.rs | 301 +++++++++++++++------ src/api/s3/encryption.rs | 595 ++++++++++++++++++++++++++++++++++++++++++ src/api/s3/error.rs | 6 + src/api/s3/get.rs | 199 +++++++++----- src/api/s3/list.rs | 8 +- src/api/s3/mod.rs | 1 + src/api/s3/multipart.rs | 56 ++-- src/api/s3/post_object.rs | 42 +-- src/api/s3/put.rs | 128 ++++++--- src/block/block.rs | 2 +- src/block/lib.rs | 2 + src/block/manager.rs | 14 +- src/model/s3/object_table.rs | 160 +++++++++++- src/model/s3/version_table.rs | 5 +- 18 files changed, 1528 insertions(+), 233 deletions(-) create mode 100644 src/api/s3/encryption.rs diff --git a/Cargo.lock b/Cargo.lock index 50f5d4c4..5aeef747 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,41 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "aead" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0" +dependencies = [ + "crypto-common", + "generic-array", +] + +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + +[[package]] +name = "aes-gcm" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "831010a0f742e1209b3bcea8fab6a8e149051ba6099432c8cb2cc117dec3ead1" +dependencies = [ + "aead", + "aes", + "cipher", + "ctr", + "ghash", + "subtle", +] + [[package]] name = "ahash" version = "0.8.7" @@ -761,6 +796,16 @@ dependencies = [ "windows-targets 0.52.0", ] +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + [[package]] name = "clap" version = "2.34.0" @@ -929,9 +974,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" dependencies = [ "generic-array", + "rand_core", "typenum", ] +[[package]] +name = "ctr" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0369ee1ad671834580515889b80f2ea915f23b8be8d0daa4bbaf2ac5c7590835" +dependencies = [ + "cipher", +] + [[package]] name = "darling" version = "0.20.5" @@ -1333,7 +1388,9 @@ dependencies = [ name = "garage_api" version = "0.10.0" dependencies = [ + "aes-gcm", "argon2", + "async-compression", "async-trait", "base64 0.21.7", "bytes", @@ -1374,6 +1431,7 @@ dependencies = [ "sha2", "tokio", "tokio-stream", + "tokio-util 0.7.10", "tracing", "url", ] @@ -1614,6 +1672,16 @@ dependencies = [ "wasi", ] +[[package]] +name = "ghash" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0d8a4362ccb29cb0b265253fb0a2728f592895ee6854fd9bc13f2ffda266ff1" +dependencies = [ + "opaque-debug", + "polyval", +] + [[package]] name = "gimli" version = "0.28.1" @@ -2063,6 +2131,15 @@ dependencies = [ "hashbrown 0.14.3", ] +[[package]] +name = "inout" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5" +dependencies = [ + "generic-array", +] + [[package]] name = "instant" version = "0.1.12" @@ -2643,6 +2720,12 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "opaque-debug" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" + [[package]] name = "openssl-probe" version = "0.1.5" @@ -2980,6 +3063,18 @@ dependencies = [ "winapi", ] +[[package]] +name = "polyval" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d1fe60d06143b2430aa532c94cfe9e29783047f06c0d7fd359a9a51b729fa25" +dependencies = [ + "cfg-if", + "cpufeatures", + "opaque-debug", + "universal-hash", +] + [[package]] name = "powerfmt" version = "0.2.0" @@ -4399,6 +4494,16 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" +[[package]] +name = "universal-hash" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea" +dependencies = [ + "crypto-common", + "subtle", +] + [[package]] name = "unsafe-libyaml" version = "0.2.10" diff --git a/Cargo.nix b/Cargo.nix index b2081891..22e7d387 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -34,7 +34,7 @@ args@{ ignoreLockHash, }: let - nixifiedLockHash = "263873397c8aa960f9ef6a815187218ab9c58b5ab35bbeb9c3dc70d032dcc963"; + nixifiedLockHash = "170b83bf5f94d624b1caf773805f52b36970c99f4db21088c4ac794dad02c53b"; workspaceSrc = if args.workspaceSrc == null then ./. 
else args.workspaceSrc; currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock); lockHashIgnored = if ignoreLockHash @@ -88,6 +88,58 @@ in src = fetchCratesIo { inherit name version; sha256 = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"; }; }); + "registry+https://github.com/rust-lang/crates.io-index".aead."0.5.2" = overridableMkRustCrate (profileName: rec { + name = "aead"; + version = "0.5.2"; + registry = "registry+https://github.com/rust-lang/crates.io-index"; + src = fetchCratesIo { inherit name version; sha256 = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0"; }; + features = builtins.concatLists [ + [ "alloc" ] + [ "getrandom" ] + [ "rand_core" ] + [ "stream" ] + ]; + dependencies = { + crypto_common = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crypto-common."0.1.6" { inherit profileName; }).out; + generic_array = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".generic-array."0.14.7" { inherit profileName; }).out; + }; + }); + + "registry+https://github.com/rust-lang/crates.io-index".aes."0.8.4" = overridableMkRustCrate (profileName: rec { + name = "aes"; + version = "0.8.4"; + registry = "registry+https://github.com/rust-lang/crates.io-index"; + src = fetchCratesIo { inherit name version; sha256 = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"; }; + dependencies = { + cfg_if = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cfg-if."1.0.0" { inherit profileName; }).out; + cipher = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cipher."0.4.4" { inherit profileName; }).out; + ${ if hostPlatform.parsed.cpu.name == "aarch64" || hostPlatform.parsed.cpu.name == "x86_64" || hostPlatform.parsed.cpu.name == "i686" then "cpufeatures" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cpufeatures."0.2.12" { inherit profileName; }).out; + }; + }); + + 
"registry+https://github.com/rust-lang/crates.io-index".aes-gcm."0.10.3" = overridableMkRustCrate (profileName: rec { + name = "aes-gcm"; + version = "0.10.3"; + registry = "registry+https://github.com/rust-lang/crates.io-index"; + src = fetchCratesIo { inherit name version; sha256 = "831010a0f742e1209b3bcea8fab6a8e149051ba6099432c8cb2cc117dec3ead1"; }; + features = builtins.concatLists [ + [ "aes" ] + [ "alloc" ] + [ "default" ] + [ "getrandom" ] + [ "rand_core" ] + [ "stream" ] + ]; + dependencies = { + aead = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".aead."0.5.2" { inherit profileName; }).out; + aes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".aes."0.8.4" { inherit profileName; }).out; + cipher = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cipher."0.4.4" { inherit profileName; }).out; + ctr = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".ctr."0.9.2" { inherit profileName; }).out; + ghash = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".ghash."0.5.1" { inherit profileName; }).out; + subtle = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".subtle."2.5.0" { inherit profileName; }).out; + }; + }); + "registry+https://github.com/rust-lang/crates.io-index".ahash."0.8.7" = overridableMkRustCrate (profileName: rec { name = "ahash"; version = "0.8.7"; @@ -1085,6 +1137,17 @@ in }; }); + "registry+https://github.com/rust-lang/crates.io-index".cipher."0.4.4" = overridableMkRustCrate (profileName: rec { + name = "cipher"; + version = "0.4.4"; + registry = "registry+https://github.com/rust-lang/crates.io-index"; + src = fetchCratesIo { inherit name version; sha256 = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"; }; + dependencies = { + crypto_common = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crypto-common."0.1.6" { inherit profileName; }).out; + inout = 
(rustPackages."registry+https://github.com/rust-lang/crates.io-index".inout."0.1.3" { inherit profileName; }).out; + }; + }); + "registry+https://github.com/rust-lang/crates.io-index".clap."2.34.0" = overridableMkRustCrate (profileName: rec { name = "clap"; version = "2.34.0"; @@ -1333,14 +1396,27 @@ in registry = "registry+https://github.com/rust-lang/crates.io-index"; src = fetchCratesIo { inherit name version; sha256 = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"; }; features = builtins.concatLists [ + [ "getrandom" ] + [ "rand_core" ] [ "std" ] ]; dependencies = { generic_array = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".generic-array."0.14.7" { inherit profileName; }).out; + rand_core = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand_core."0.6.4" { inherit profileName; }).out; typenum = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".typenum."1.17.0" { inherit profileName; }).out; }; }); + "registry+https://github.com/rust-lang/crates.io-index".ctr."0.9.2" = overridableMkRustCrate (profileName: rec { + name = "ctr"; + version = "0.9.2"; + registry = "registry+https://github.com/rust-lang/crates.io-index"; + src = fetchCratesIo { inherit name version; sha256 = "0369ee1ad671834580515889b80f2ea915f23b8be8d0daa4bbaf2ac5c7590835"; }; + dependencies = { + cipher = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cipher."0.4.4" { inherit profileName; }).out; + }; + }); + "registry+https://github.com/rust-lang/crates.io-index".darling."0.20.5" = overridableMkRustCrate (profileName: rec { name = "darling"; version = "0.20.5"; @@ -1958,7 +2034,9 @@ in (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/metrics" || rootFeatures' ? "garage_api/metrics" || rootFeatures' ? 
"garage_api/prometheus") "prometheus") ]; dependencies = { + aes_gcm = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".aes-gcm."0.10.3" { inherit profileName; }).out; argon2 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".argon2."0.5.3" { inherit profileName; }).out; + async_compression = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".async-compression."0.4.6" { inherit profileName; }).out; async_trait = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".async-trait."0.1.77" { profileName = "__noProfile"; }).out; base64 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".base64."0.21.7" { inherit profileName; }).out; bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.5.0" { inherit profileName; }).out; @@ -1999,6 +2077,7 @@ in sha2 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".sha2."0.10.8" { inherit profileName; }).out; tokio = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio."1.36.0" { inherit profileName; }).out; tokio_stream = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio-stream."0.1.14" { inherit profileName; }).out; + tokio_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio-util."0.7.10" { inherit profileName; }).out; tracing = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tracing."0.1.40" { inherit profileName; }).out; url = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".url."2.5.0" { inherit profileName; }).out; }; @@ -2321,6 +2400,17 @@ in }; }); + "registry+https://github.com/rust-lang/crates.io-index".ghash."0.5.1" = overridableMkRustCrate (profileName: rec { + name = "ghash"; + version = "0.5.1"; + registry = "registry+https://github.com/rust-lang/crates.io-index"; + src = fetchCratesIo { inherit name version; sha256 = 
"f0d8a4362ccb29cb0b265253fb0a2728f592895ee6854fd9bc13f2ffda266ff1"; }; + dependencies = { + opaque_debug = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opaque-debug."0.3.1" { inherit profileName; }).out; + polyval = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".polyval."0.6.2" { inherit profileName; }).out; + }; + }); + "registry+https://github.com/rust-lang/crates.io-index".gimli."0.28.1" = overridableMkRustCrate (profileName: rec { name = "gimli"; version = "0.28.1"; @@ -2928,6 +3018,16 @@ in }; }); + "registry+https://github.com/rust-lang/crates.io-index".inout."0.1.3" = overridableMkRustCrate (profileName: rec { + name = "inout"; + version = "0.1.3"; + registry = "registry+https://github.com/rust-lang/crates.io-index"; + src = fetchCratesIo { inherit name version; sha256 = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5"; }; + dependencies = { + generic_array = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".generic-array."0.14.7" { inherit profileName; }).out; + }; + }); + "registry+https://github.com/rust-lang/crates.io-index".instant."0.1.12" = overridableMkRustCrate (profileName: rec { name = "instant"; version = "0.1.12"; @@ -3777,6 +3877,13 @@ in ]; }); + "registry+https://github.com/rust-lang/crates.io-index".opaque-debug."0.3.1" = overridableMkRustCrate (profileName: rec { + name = "opaque-debug"; + version = "0.3.1"; + registry = "registry+https://github.com/rust-lang/crates.io-index"; + src = fetchCratesIo { inherit name version; sha256 = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381"; }; + }); + "registry+https://github.com/rust-lang/crates.io-index".openssl-probe."0.1.5" = overridableMkRustCrate (profileName: rec { name = "openssl-probe"; version = "0.1.5"; @@ -4236,6 +4343,19 @@ in }; }); + "registry+https://github.com/rust-lang/crates.io-index".polyval."0.6.2" = overridableMkRustCrate (profileName: rec { + name = "polyval"; + version = 
"0.6.2"; + registry = "registry+https://github.com/rust-lang/crates.io-index"; + src = fetchCratesIo { inherit name version; sha256 = "9d1fe60d06143b2430aa532c94cfe9e29783047f06c0d7fd359a9a51b729fa25"; }; + dependencies = { + cfg_if = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cfg-if."1.0.0" { inherit profileName; }).out; + ${ if hostPlatform.parsed.cpu.name == "aarch64" || hostPlatform.parsed.cpu.name == "x86_64" || hostPlatform.parsed.cpu.name == "i686" then "cpufeatures" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".cpufeatures."0.2.12" { inherit profileName; }).out; + opaque_debug = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opaque-debug."0.3.1" { inherit profileName; }).out; + universal_hash = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".universal-hash."0.5.1" { inherit profileName; }).out; + }; + }); + "registry+https://github.com/rust-lang/crates.io-index".powerfmt."0.2.0" = overridableMkRustCrate (profileName: rec { name = "powerfmt"; version = "0.2.0"; @@ -6314,6 +6434,17 @@ in ]; }); + "registry+https://github.com/rust-lang/crates.io-index".universal-hash."0.5.1" = overridableMkRustCrate (profileName: rec { + name = "universal-hash"; + version = "0.5.1"; + registry = "registry+https://github.com/rust-lang/crates.io-index"; + src = fetchCratesIo { inherit name version; sha256 = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea"; }; + dependencies = { + crypto_common = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crypto-common."0.1.6" { inherit profileName; }).out; + subtle = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".subtle."2.5.0" { inherit profileName; }).out; + }; + }); + "registry+https://github.com/rust-lang/crates.io-index".unsafe-libyaml."0.2.10" = overridableMkRustCrate (profileName: rec { name = "unsafe-libyaml"; version = "0.2.10"; diff --git a/Cargo.toml 
b/Cargo.toml index 8e9187ca..c259c7f2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,6 +66,7 @@ sha2 = "0.10" timeago = { version = "0.4", default-features = false } xxhash-rust = { version = "0.8", default-features = false, features = ["xxh3"] } +aes-gcm = { version = "0.10", features = ["aes", "stream"] } sodiumoxide = { version = "0.2.5-0", package = "kuska-sodiumoxide" } kuska-handshake = { version = "0.2.0", features = ["default", "async_std"] } diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index 9b215333..bcf6a537 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -21,7 +21,9 @@ garage_net.workspace = true garage_util.workspace = true garage_rpc.workspace = true +aes-gcm.workspace = true argon2.workspace = true +async-compression.workspace = true async-trait.workspace = true base64.workspace = true bytes.workspace = true @@ -41,6 +43,7 @@ futures.workspace = true futures-util.workspace = true tokio.workspace = true tokio-stream.workspace = true +tokio-util.workspace = true form_urlencoded.workspace = true http.workspace = true diff --git a/src/api/s3/copy.rs b/src/api/s3/copy.rs index 3c2bd483..2b29ec6d 100644 --- a/src/api/s3/copy.rs +++ b/src/api/s3/copy.rs @@ -1,7 +1,7 @@ use std::pin::Pin; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use futures::{stream, stream::Stream, StreamExt}; +use futures::{stream, stream::Stream, StreamExt, TryStreamExt}; use md5::{Digest as Md5Digest, Md5}; use bytes::Bytes; @@ -9,9 +9,11 @@ use hyper::{Request, Response}; use serde::Serialize; use garage_net::bytes_buf::BytesBuf; +use garage_net::stream::read_stream_to_end; use garage_rpc::rpc_helper::OrderTag; use garage_table::*; use garage_util::data::*; +use garage_util::error::Error as GarageError; use garage_util::time::*; use garage_model::s3::block_ref_table::*; @@ -21,11 +23,15 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::encryption::EncryptionParams; use 
crate::s3::error::*; +use crate::s3::get::full_object_byte_stream; use crate::s3::multipart; -use crate::s3::put::get_headers; +use crate::s3::put::{get_headers, save_stream, SaveStreamResult}; use crate::s3::xml::{self as s3_xml, xmlns_tag}; +// -------- CopyObject --------- + pub async fn handle_copy( ctx: ReqCtx, req: &Request, @@ -35,38 +41,114 @@ pub async fn handle_copy( let source_object = get_copy_source(&ctx, req).await?; - let ReqCtx { - garage, - bucket_id: dest_bucket_id, - .. - } = ctx; - let (source_version, source_version_data, source_version_meta) = extract_source_info(&source_object)?; // Check precondition, e.g. x-amz-copy-source-if-match copy_precondition.check(source_version, &source_version_meta.etag)?; + // Determine encryption parameters + let (source_encryption, source_object_headers) = + EncryptionParams::check_decrypt_for_copy_source( + &ctx.garage, + req.headers(), + &source_version_meta.encryption, + )?; + let dest_encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?; + + // Determine headers of destination object + let dest_object_headers = match req.headers().get("x-amz-metadata-directive") { + Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => { + get_headers(req.headers())? + } + _ => source_object_headers.into_owned(), + }; + + // Do actual object copying + let res = if EncryptionParams::is_same(&source_encryption, &dest_encryption) { + // If source and dest are both unencrypted, or if the encryption keys + // are the same, we can just copy the metadata and link blocks of the + // old object from the new object. + handle_copy_metaonly( + ctx, + dest_key, + dest_object_headers, + dest_encryption, + source_version, + source_version_data, + source_version_meta, + ) + .await? + } else { + // If source and dest encryption use different keys, + // we must decrypt content and re-encrypt, so rewrite all data blocks. 
+ handle_copy_reencrypt( + ctx, + dest_key, + dest_object_headers, + dest_encryption, + source_version, + source_version_data, + source_encryption, + ) + .await? + }; + + let last_modified = msec_to_rfc3339(res.version_timestamp); + let result = CopyObjectResult { + last_modified: s3_xml::Value(last_modified), + etag: s3_xml::Value(format!("\"{}\"", res.etag)), + }; + let xml = s3_xml::to_xml_with_header(&result)?; + + let mut resp = Response::builder() + .header("Content-Type", "application/xml") + .header("x-amz-version-id", hex::encode(res.version_uuid)) + .header( + "x-amz-copy-source-version-id", + hex::encode(source_version.uuid), + ); + dest_encryption.add_response_headers(&mut resp); + Ok(resp.body(string_body(xml))?) +} + +async fn handle_copy_metaonly( + ctx: ReqCtx, + dest_key: &str, + dest_object_headers: ObjectVersionHeaders, + dest_encryption: EncryptionParams, + source_version: &ObjectVersion, + source_version_data: &ObjectVersionData, + source_version_meta: &ObjectVersionMeta, +) -> Result { + let ReqCtx { + garage, + bucket_id: dest_bucket_id, + .. 
+ } = ctx; + // Generate parameters for copied object let new_uuid = gen_uuid(); let new_timestamp = now_msec(); - // Implement x-amz-metadata-directive: REPLACE - let new_meta = match req.headers().get("x-amz-metadata-directive") { - Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => ObjectVersionMeta { - headers: get_headers(req.headers())?, - size: source_version_meta.size, - etag: source_version_meta.etag.clone(), - }, - _ => source_version_meta.clone(), + let new_meta = ObjectVersionMeta { + encryption: dest_encryption.encrypt_headers(dest_object_headers)?, + size: source_version_meta.size, + etag: source_version_meta.etag.clone(), }; - let etag = new_meta.etag.to_string(); + let res = SaveStreamResult { + version_uuid: new_uuid, + version_timestamp: new_timestamp, + etag: new_meta.etag.clone(), + }; // Save object copy match source_version_data { ObjectVersionData::DeleteMarker => unreachable!(), ObjectVersionData::Inline(_meta, bytes) => { + // bytes is either plaintext before&after or encrypted with the + // same keys, so it's ok to just copy it as is let dest_object_version = ObjectVersion { uuid: new_uuid, timestamp: new_timestamp, @@ -97,7 +179,7 @@ pub async fn handle_copy( uuid: new_uuid, timestamp: new_timestamp, state: ObjectVersionState::Uploading { - headers: new_meta.headers.clone(), + encryption: new_meta.encryption.clone(), multipart: false, }, }; @@ -164,23 +246,42 @@ pub async fn handle_copy( } } - let last_modified = msec_to_rfc3339(new_timestamp); - let result = CopyObjectResult { - last_modified: s3_xml::Value(last_modified), - etag: s3_xml::Value(format!("\"{}\"", etag)), - }; - let xml = s3_xml::to_xml_with_header(&result)?; + Ok(res) +} - Ok(Response::builder() - .header("Content-Type", "application/xml") - .header("x-amz-version-id", hex::encode(new_uuid)) - .header( - "x-amz-copy-source-version-id", - hex::encode(source_version.uuid), - ) - .body(string_body(xml))?) 
+async fn handle_copy_reencrypt( + ctx: ReqCtx, + dest_key: &str, + dest_object_headers: ObjectVersionHeaders, + dest_encryption: EncryptionParams, + source_version: &ObjectVersion, + source_version_data: &ObjectVersionData, + source_encryption: EncryptionParams, +) -> Result { + // basically we will read the source data (decrypt if necessary) + // and save that in a new object (encrypt if necessary), + // by combining the code used in getobject and putobject + let source_stream = full_object_byte_stream( + ctx.garage.clone(), + source_version, + source_version_data, + source_encryption, + ); + + save_stream( + &ctx, + dest_object_headers, + dest_encryption, + source_stream.map_err(|e| Error::from(GarageError::from(e))), + &dest_key.to_string(), + None, + None, + ) + .await } +// -------- UploadPartCopy --------- + pub async fn handle_upload_part_copy( ctx: ReqCtx, req: &Request, @@ -193,7 +294,7 @@ pub async fn handle_upload_part_copy( let dest_upload_id = multipart::decode_upload_id(upload_id)?; let dest_key = dest_key.to_string(); - let (source_object, (_, _, mut dest_mpu)) = futures::try_join!( + let (source_object, (_, dest_version, mut dest_mpu)) = futures::try_join!( get_copy_source(&ctx, req), multipart::get_upload(&ctx, &dest_key, &dest_upload_id) )?; @@ -206,6 +307,20 @@ pub async fn handle_upload_part_copy( // Check precondition on source, e.g. x-amz-copy-source-if-match copy_precondition.check(source_object_version, &source_version_meta.etag)?; + // Determine encryption parameters + let (source_encryption, _) = EncryptionParams::check_decrypt_for_copy_source( + &garage, + req.headers(), + &source_version_meta.encryption, + )?; + let dest_object_encryption = match dest_version.state { + ObjectVersionState::Uploading { encryption, .. 
} => encryption, + _ => unreachable!(), + }; + let (dest_encryption, _) = + EncryptionParams::check_decrypt(&garage, req.headers(), &dest_object_encryption)?; + let same_encryption = EncryptionParams::is_same(&source_encryption, &dest_encryption); + // Check source range is valid let source_range = match req.headers().get("x-amz-copy-source-range") { Some(range) => { @@ -227,21 +342,16 @@ pub async fn handle_upload_part_copy( }; // Check source version is not inlined - match source_version_data { - ObjectVersionData::DeleteMarker => unreachable!(), - ObjectVersionData::Inline(_meta, _bytes) => { - // This is only for small files, we don't bother handling this. - // (in AWS UploadPartCopy works for parts at least 5MB which - // is never the case of an inline object) - return Err(Error::bad_request( - "Source object is too small (minimum part size is 5Mb)", - )); - } - ObjectVersionData::FirstBlock(_meta, _first_block_hash) => (), - }; + if matches!(source_version_data, ObjectVersionData::Inline(_, _)) { + // This is only for small files, we don't bother handling this. + // (in AWS UploadPartCopy works for parts at least 5MB which + // is never the case of an inline object) + return Err(Error::bad_request( + "Source object is too small (minimum part size is 5Mb)", + )); + } - // Fetch source versin with its block list, - // and destination version to check part hasn't yet been uploaded + // Fetch source version with its block list let source_version = garage .version_table .get(&source_object_version.uuid, &EmptyKey) @@ -251,7 +361,9 @@ pub async fn handle_upload_part_copy( // We want to reuse blocks from the source version as much as possible. // However, we still need to get the data from these blocks // because we need to know it to calculate the MD5sum of the part - // which is used as its ETag. + // which is used as its ETag. 
For encrypted sources or destinations, + // we must always read(+decrypt) and then write(+encrypt), so we + // can never reuse data blocks as is. // First, calculate what blocks we want to keep, // and the subrange of the block to take, if the bounds of the @@ -313,6 +425,8 @@ pub async fn handle_upload_part_copy( }, false, ); + // write an empty version now to be the parent of the block_ref entries + garage.version_table.insert(&dest_version).await?; // Now, actually copy the blocks let mut md5hasher = Md5::new(); @@ -321,24 +435,44 @@ pub async fn handle_upload_part_copy( // and extract the subrange if necessary. // The second returned value is an Option, that is Some // if and only if the block returned is a block that already existed - // in the Garage data store (thus we don't need to save it again). + // in the Garage data store and can be reused as-is instead of having + // to save it again. This excludes encrypted source blocks that we had + // to decrypt. let garage2 = garage.clone(); let order_stream = OrderTag::stream(); let source_blocks = stream::iter(blocks_to_copy) .enumerate() - .flat_map(|(i, (block_hash, range_to_copy))| { + .map(|(i, (block_hash, range_to_copy))| { let garage3 = garage2.clone(); - stream::once(async move { - let data = garage3 - .block_manager - .rpc_get_block(&block_hash, Some(order_stream.order(i as u64))) + async move { + let stream = source_encryption + .get_block(&garage3, &block_hash, Some(order_stream.order(i as u64))) .await?; + let data = read_stream_to_end(stream).await?.into_bytes(); + // For each item, we return a tuple of: + // 1. the full data block (decrypted) + // 2. 
an Option that indicates the hash of the block in the block store, + // only if it can be re-used as-is in the copied object match range_to_copy { - Some(r) => Ok((data.slice(r), None)), - None => Ok((data, Some(block_hash))), + Some(r) => { + // If we are taking a subslice of the data, we cannot reuse the block as-is + Ok((data.slice(r), None)) + } + None if same_encryption => { + // If the data is unencrypted before & after, or if we are using + // the same encryption key, we can reuse the stored block, no need + // to re-send it to storage nodes. + Ok((data, Some(block_hash))) + } + None => { + // If we are decrypting / (re)encrypting with different keys, + // we cannot reuse the block as-is + Ok((data, None)) + } } - }) + } }) + .buffered(2) .peekable(); // The defragmenter is a custom stream (defined below) that concatenates @@ -346,22 +480,33 @@ pub async fn handle_upload_part_copy( // It returns a series of (Vec, Option). // When it is done, it returns an empty vec. // Same as the previous iterator, the Option is Some(_) if and only if - // it's an existing block of the Garage data store. + // it's an existing block of the Garage data store that can be reused. 
let mut defragmenter = Defragmenter::new(garage.config.block_size, Box::pin(source_blocks)); let mut current_offset = 0; let mut next_block = defragmenter.next().await?; + // TODO this could be optimized similarly to read_and_put_blocks + // low priority because uploadpartcopy is rarely used loop { let (data, existing_block_hash) = next_block; if data.is_empty() { break; } + let data_len = data.len() as u64; md5hasher.update(&data[..]); - let must_upload = existing_block_hash.is_none(); - let final_hash = existing_block_hash.unwrap_or_else(|| blake2sum(&data[..])); + let (final_data, must_upload, final_hash) = match existing_block_hash { + Some(hash) if same_encryption => (data, false, hash), + _ => tokio::task::spawn_blocking(move || { + let data_enc = dest_encryption.encrypt_block(data)?; + let hash = blake2sum(&data_enc); + Ok::<_, Error>((data_enc, true, hash)) + }) + .await + .unwrap()?, + }; dest_version.blocks.clear(); dest_version.blocks.put( @@ -371,10 +516,10 @@ pub async fn handle_upload_part_copy( }, VersionBlock { hash: final_hash, - size: data.len() as u64, + size: data_len, }, ); - current_offset += data.len() as u64; + current_offset += data_len; let block_ref = BlockRef { block: final_hash, @@ -382,36 +527,33 @@ pub async fn handle_upload_part_copy( deleted: false.into(), }; - let garage2 = garage.clone(); - let res = futures::try_join!( + let (_, _, _, next) = futures::try_join!( // Thing 1: if the block is not exactly a block that existed before, // we need to insert that data as a new block. 
- async move { + async { if must_upload { - garage2 + garage .block_manager - .rpc_put_block(final_hash, data, None) + .rpc_put_block(final_hash, final_data, dest_encryption.is_encrypted(), None) .await } else { Ok(()) } }, - async { - // Thing 2: we need to insert the block in the version - garage.version_table.insert(&dest_version).await?; - // Thing 3: we need to add a block reference - garage.block_ref_table.insert(&block_ref).await - }, - // Thing 4: we need to prefetch the next block + // Thing 2: we need to insert the block in the version + garage.version_table.insert(&dest_version), + // Thing 3: we need to add a block reference + garage.block_ref_table.insert(&block_ref), + // Thing 4: we need to read the next block defragmenter.next(), )?; - next_block = res.2; + next_block = next; } assert_eq!(current_offset, source_range.length); let data_md5sum = md5hasher.finalize(); - let etag = hex::encode(data_md5sum); + let etag = dest_encryption.etag_from_md5(&data_md5sum); // Put the part's ETag in the Versiontable dest_mpu.parts.put( @@ -431,13 +573,14 @@ pub async fn handle_upload_part_copy( last_modified: s3_xml::Value(msec_to_rfc3339(source_object_version.timestamp)), })?; - Ok(Response::builder() + let mut resp = Response::builder() .header("Content-Type", "application/xml") .header( "x-amz-copy-source-version-id", hex::encode(source_object_version.uuid), - ) - .body(string_body(resp_xml))?) + ); + dest_encryption.add_response_headers(&mut resp); + Ok(resp.body(string_body(resp_xml))?) 
} async fn get_copy_source(ctx: &ReqCtx, req: &Request) -> Result { diff --git a/src/api/s3/encryption.rs b/src/api/s3/encryption.rs new file mode 100644 index 00000000..2b105e90 --- /dev/null +++ b/src/api/s3/encryption.rs @@ -0,0 +1,595 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::pin::Pin; + +use aes_gcm::{ + aead::stream::{DecryptorLE31, EncryptorLE31, StreamLE31}, + aead::{Aead, AeadCore, KeyInit, OsRng}, + aes::cipher::crypto_common::rand_core::RngCore, + aes::cipher::typenum::Unsigned, + Aes256Gcm, Key, Nonce, +}; +use base64::prelude::*; +use bytes::Bytes; + +use futures::stream::Stream; +use futures::task; +use tokio::io::BufReader; + +use http::header::{HeaderMap, HeaderName, HeaderValue}; + +use garage_net::bytes_buf::BytesBuf; +use garage_net::stream::{stream_asyncread, ByteStream}; +use garage_rpc::rpc_helper::OrderTag; +use garage_util::data::Hash; +use garage_util::error::Error as GarageError; +use garage_util::migrate::Migrate; + +use garage_model::garage::Garage; +use garage_model::s3::object_table::{ObjectVersionEncryption, ObjectVersionHeaders}; + +use crate::common_error::*; +use crate::s3::error::Error; + +const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM: HeaderName = + HeaderName::from_static("x-amz-server-side-encryption-customer-algorithm"); +const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY: HeaderName = + HeaderName::from_static("x-amz-server-side-encryption-customer-key"); +const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5: HeaderName = + HeaderName::from_static("x-amz-server-side-encryption-customer-key-md5"); + +const X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM: HeaderName = + HeaderName::from_static("x-amz-copy-source-server-side-encryption-customer-algorithm"); +const X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY: HeaderName = + HeaderName::from_static("x-amz-copy-source-server-side-encryption-customer-key"); +const X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5: HeaderName = 
+ HeaderName::from_static("x-amz-copy-source-server-side-encryption-customer-key-md5"); + +const CUSTOMER_ALGORITHM_AES256: &[u8] = b"AES256"; + +type Md5Output = md5::digest::Output; + +type StreamNonceSize = aes_gcm::aead::stream::NonceSize>; + +// Data blocks are encrypted by smaller chunks of size 4096 bytes, +// so that data can be streamed when reading. +// This size has to be known and has to be constant, or data won't be +// readable anymore. DO NOT CHANGE THIS VALUE. +const STREAM_ENC_PLAIN_CHUNK_SIZE: usize = 0x1000; // 4096 bytes +const STREAM_ENC_CYPER_CHUNK_SIZE: usize = STREAM_ENC_PLAIN_CHUNK_SIZE + 16; + +#[derive(Clone, Copy)] +pub enum EncryptionParams { + Plaintext, + SseC { + client_key: Key, + client_key_md5: Md5Output, + compression_level: Option, + }, +} + +impl EncryptionParams { + pub fn is_encrypted(&self) -> bool { + !matches!(self, Self::Plaintext) + } + + pub fn is_same(a: &Self, b: &Self) -> bool { + let relevant_info = |x: &Self| match x { + Self::Plaintext => None, + Self::SseC { + client_key, + compression_level, + .. + } => Some((*client_key, compression_level.is_some())), + }; + relevant_info(a) == relevant_info(b) + } + + pub fn new_from_headers( + garage: &Garage, + headers: &HeaderMap, + ) -> Result { + let key = parse_request_headers( + headers, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, + )?; + match key { + Some((client_key, client_key_md5)) => Ok(EncryptionParams::SseC { + client_key, + client_key_md5, + compression_level: garage.config.compression_level, + }), + None => Ok(EncryptionParams::Plaintext), + } + } + + pub fn add_response_headers(&self, resp: &mut http::response::Builder) { + if let Self::SseC { client_key_md5, .. 
} = self { + let md5 = BASE64_STANDARD.encode(&client_key_md5); + + resp.headers_mut().unwrap().insert( + X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, + HeaderValue::from_bytes(CUSTOMER_ALGORITHM_AES256).unwrap(), + ); + resp.headers_mut().unwrap().insert( + X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, + HeaderValue::from_bytes(md5.as_bytes()).unwrap(), + ); + } + } + + pub fn check_decrypt<'a>( + garage: &Garage, + headers: &HeaderMap, + obj_enc: &'a ObjectVersionEncryption, + ) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> { + let key = parse_request_headers( + headers, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, + )?; + Self::check_decrypt_common(garage, key, obj_enc) + } + + pub fn check_decrypt_for_copy_source<'a>( + garage: &Garage, + headers: &HeaderMap, + obj_enc: &'a ObjectVersionEncryption, + ) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> { + let key = parse_request_headers( + headers, + &X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, + &X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, + &X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, + )?; + Self::check_decrypt_common(garage, key, obj_enc) + } + + fn check_decrypt_common<'a>( + garage: &Garage, + key: Option<(Key, Md5Output)>, + obj_enc: &'a ObjectVersionEncryption, + ) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> { + match (key, &obj_enc) { + ( + Some((client_key, client_key_md5)), + ObjectVersionEncryption::SseC { + headers, + compressed, + }, + ) => { + let enc = Self::SseC { + client_key, + client_key_md5, + compression_level: if *compressed { + Some(garage.config.compression_level.unwrap_or(1)) + } else { + None + }, + }; + let plaintext = enc.decrypt_blob(&headers)?; + let headers = ObjectVersionHeaders::decode(&plaintext) + .ok_or_internal_error("Could not decode encrypted headers")?; + Ok((enc, Cow::Owned(headers))) + } + 
(None, ObjectVersionEncryption::Plaintext { headers }) => { + Ok((Self::Plaintext, Cow::Borrowed(headers))) + } + (_, ObjectVersionEncryption::SseC { .. }) => { + Err(Error::bad_request("Object is encrypted")) + } + (Some(_), _) => { + // TODO: should this be an OK scenario? + Err(Error::bad_request("Trying to decrypt a plaintext object")) + } + } + } + + pub fn encrypt_headers( + &self, + h: ObjectVersionHeaders, + ) -> Result { + match self { + Self::SseC { + compression_level, .. + } => { + let plaintext = h.encode().map_err(GarageError::from)?; + let ciphertext = self.encrypt_blob(&plaintext)?; + Ok(ObjectVersionEncryption::SseC { + headers: ciphertext.into_owned(), + compressed: compression_level.is_some(), + }) + } + Self::Plaintext => Ok(ObjectVersionEncryption::Plaintext { headers: h }), + } + } + + // ---- generating object Etag values ---- + pub fn etag_from_md5(&self, md5sum: &[u8]) -> String { + match self { + Self::Plaintext => hex::encode(md5sum), + Self::SseC { .. } => { + // AWS specifies that for encrypted objects, the Etag is not + // the md5sum of the data, but doesn't say what it is. + // So we just put some random bytes. + let mut random = [0u8; 16]; + OsRng.fill_bytes(&mut random); + hex::encode(&random) + } + } + } + + // ---- generic function for encrypting / decrypting blobs ---- + // Prepends a randomly-generated nonce to the encrypted value. + // This is used for encrypting object headers and inlined data for small objects. + // This does not compress anything. + + pub fn encrypt_blob<'a>(&self, blob: &'a [u8]) -> Result, Error> { + match self { + Self::SseC { client_key, .. 
} => { + let cipher = Aes256Gcm::new(&client_key); + let nonce = Aes256Gcm::generate_nonce(&mut OsRng); + let ciphertext = cipher + .encrypt(&nonce, blob) + .ok_or_internal_error("Encryption failed")?; + Ok(Cow::Owned([nonce.to_vec(), ciphertext].concat())) + } + Self::Plaintext => Ok(Cow::Borrowed(blob)), + } + } + + pub fn decrypt_blob<'a>(&self, blob: &'a [u8]) -> Result, Error> { + match self { + Self::SseC { client_key, .. } => { + let cipher = Aes256Gcm::new(&client_key); + let nonce_size = ::NonceSize::to_usize(); + let nonce = Nonce::from_slice( + blob.get(..nonce_size) + .ok_or_internal_error("invalid encrypted data")?, + ); + let plaintext = cipher + .decrypt(nonce, &blob[nonce_size..]) + .ok_or_bad_request( + "Invalid encryption key, could not decrypt object metadata.", + )?; + Ok(Cow::Owned(plaintext)) + } + Self::Plaintext => Ok(Cow::Borrowed(blob)), + } + } + + // ---- function for encrypting / decrypting byte streams ---- + + /// Get a data block from the storage node, and decrypt+decompress it + /// if necessary. If object is plaintext, just get it without any processing. + pub async fn get_block( + &self, + garage: &Garage, + hash: &Hash, + order: Option, + ) -> Result { + let raw_block = garage + .block_manager + .rpc_get_block_streaming(hash, order) + .await?; + Ok(self.decrypt_block_stream(raw_block)) + } + + pub fn decrypt_block_stream(&self, stream: ByteStream) -> ByteStream { + match self { + Self::Plaintext => stream, + Self::SseC { + client_key, + compression_level, + .. 
+ } => { + let plaintext = DecryptStream::new(stream, *client_key); + if compression_level.is_some() { + let reader = stream_asyncread(Box::pin(plaintext)); + let reader = BufReader::new(reader); + let reader = async_compression::tokio::bufread::ZstdDecoder::new(reader); + Box::pin(tokio_util::io::ReaderStream::new(reader)) + } else { + Box::pin(plaintext) + } + } + } + } + + /// Encrypt a data block if encryption is set, for use before + /// putting the data blocks into storage + pub fn encrypt_block(&self, block: Bytes) -> Result { + match self { + Self::Plaintext => Ok(block), + Self::SseC { + client_key, + compression_level, + .. + } => { + let block = if let Some(level) = compression_level { + Cow::Owned( + garage_block::zstd_encode(block.as_ref(), *level) + .ok_or_internal_error("failed to compress data block")?, + ) + } else { + Cow::Borrowed(block.as_ref()) + }; + + let mut ret = Vec::with_capacity(block.len() + 32 + block.len() / 64); + + let mut nonce: Nonce = Default::default(); + OsRng.fill_bytes(&mut nonce); + ret.extend_from_slice(nonce.as_slice()); + + let mut cipher = EncryptorLE31::::new(&client_key, &nonce); + let mut iter = block.chunks(STREAM_ENC_PLAIN_CHUNK_SIZE).peekable(); + + if iter.peek().is_none() { + // Empty stream: we encrypt an empty last chunk + let chunk_enc = cipher + .encrypt_last(&[][..]) + .ok_or_internal_error("failed to encrypt chunk")?; + ret.extend_from_slice(&chunk_enc); + } else { + loop { + let chunk = iter.next().unwrap(); + if iter.peek().is_some() { + let chunk_enc = cipher + .encrypt_next(chunk) + .ok_or_internal_error("failed to encrypt chunk")?; + assert_eq!(chunk.len(), STREAM_ENC_PLAIN_CHUNK_SIZE); + assert_eq!(chunk_enc.len(), STREAM_ENC_CYPER_CHUNK_SIZE); + ret.extend_from_slice(&chunk_enc); + } else { + // use encrypt_last for the last chunk + let chunk_enc = cipher + .encrypt_last(chunk) + .ok_or_internal_error("failed to encrypt chunk")?; + ret.extend_from_slice(&chunk_enc); + break; + } + } + } + + 
Ok(ret.into()) + } + } + } +} + +fn parse_request_headers( + headers: &HeaderMap, + alg_header: &HeaderName, + key_header: &HeaderName, + md5_header: &HeaderName, +) -> Result, Md5Output)>, Error> { + let alg = headers.get(alg_header).map(HeaderValue::as_bytes); + let key = headers.get(key_header).map(HeaderValue::as_bytes); + let md5 = headers.get(md5_header).map(HeaderValue::as_bytes); + + match alg { + Some(CUSTOMER_ALGORITHM_AES256) => { + use md5::{Digest, Md5}; + + let key_b64 = + key.ok_or_bad_request("Missing server-side-encryption-customer-key header")?; + let key_bytes: [u8; 32] = BASE64_STANDARD + .decode(&key_b64) + .ok_or_bad_request( + "Invalid server-side-encryption-customer-key header: invalid base64", + )? + .try_into() + .ok() + .ok_or_bad_request( + "Invalid server-side-encryption-customer-key header: invalid length", + )?; + + let md5_b64 = + md5.ok_or_bad_request("Missing server-side-encryption-customer-key-md5 header")?; + let md5_bytes = BASE64_STANDARD.decode(&md5_b64).ok_or_bad_request( + "Invalid server-side-encryption-customer-key-md5 header: invalid bass64", + )?; + + let mut hasher = Md5::new(); + hasher.update(&key_bytes[..]); + let our_md5 = hasher.finalize(); + if our_md5.as_slice() != md5_bytes.as_slice() { + return Err(Error::bad_request( + "Server-side encryption client key MD5 checksum does not match", + )); + } + + Ok(Some((key_bytes.into(), our_md5))) + } + Some(alg) => Err(Error::InvalidEncryptionAlgorithm( + String::from_utf8_lossy(alg).into_owned(), + )), + None => { + if key.is_some() || md5.is_some() { + Err(Error::bad_request( + "Unexpected server-side-encryption-customer-key{,-md5} header(s)", + )) + } else { + Ok(None) + } + } + } +} + +// ---- encrypt & decrypt streams ---- + +#[pin_project::pin_project] +struct DecryptStream { + #[pin] + stream: ByteStream, + done_reading: bool, + buf: BytesBuf, + key: Key, + state: DecryptStreamState, +} + +enum DecryptStreamState { + Starting, + Running(DecryptorLE31), + Done, +} + 
+impl DecryptStream { + fn new(stream: ByteStream, key: Key) -> Self { + Self { + stream, + done_reading: false, + buf: BytesBuf::new(), + key, + state: DecryptStreamState::Starting, + } + } +} + +impl Stream for DecryptStream { + type Item = Result; + + fn poll_next( + self: Pin<&mut Self>, + cx: &mut task::Context<'_>, + ) -> task::Poll> { + use std::task::Poll; + + let mut this = self.project(); + + // The first bytes of the stream should contain the starting nonce. + // If we don't have a Running state, it means that we haven't + // yet read the nonce. + while matches!(this.state, DecryptStreamState::Starting) { + let nonce_size = StreamNonceSize::to_usize(); + if let Some(nonce) = this.buf.take_exact(nonce_size) { + let nonce = Nonce::from_slice(nonce.as_ref()); + *this.state = DecryptStreamState::Running(DecryptorLE31::new(&this.key, nonce)); + break; + } + + match futures::ready!(this.stream.as_mut().poll_next(cx)) { + Some(Ok(bytes)) => { + this.buf.extend(bytes); + } + Some(Err(e)) => { + return Poll::Ready(Some(Err(e))); + } + None => { + return Poll::Ready(Some(Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "Decrypt: unexpected EOF, could not read nonce", + )))); + } + } + } + + // Read at least one byte more than the encrypted chunk size + // (if possible), so that we know if we are decrypting the + // last chunk or not. 
+ while !*this.done_reading && this.buf.len() <= STREAM_ENC_CYPER_CHUNK_SIZE { + match futures::ready!(this.stream.as_mut().poll_next(cx)) { + Some(Ok(bytes)) => { + this.buf.extend(bytes); + } + Some(Err(e)) => { + return Poll::Ready(Some(Err(e))); + } + None => { + *this.done_reading = true; + break; + } + } + } + + if matches!(this.state, DecryptStreamState::Done) { + if !this.buf.is_empty() { + return Poll::Ready(Some(Err(std::io::Error::new( + std::io::ErrorKind::Other, + "Decrypt: unexpected bytes after last encrypted chunk", + )))); + } + return Poll::Ready(None); + } + + let res = if this.buf.len() > STREAM_ENC_CYPER_CHUNK_SIZE { + // we have strictly more bytes than the encrypted chunk size, + // so we know this is not the last + let DecryptStreamState::Running(ref mut cipher) = this.state else { + unreachable!() + }; + let chunk = this.buf.take_exact(STREAM_ENC_CYPER_CHUNK_SIZE).unwrap(); + let chunk_dec = cipher.decrypt_next(chunk.as_ref()); + if let Ok(c) = &chunk_dec { + assert_eq!(c.len(), STREAM_ENC_PLAIN_CHUNK_SIZE); + } + chunk_dec + } else { + // We have one encrypted chunk size or less, even though we tried + // to read more, so this is the last chunk. Decrypt using the + // appropriate decrypt_last() function that then destroys the cipher. 
+ let state = std::mem::replace(this.state, DecryptStreamState::Done); + let DecryptStreamState::Running(cipher) = state else { + unreachable!() + }; + let chunk = this.buf.take_all(); + cipher.decrypt_last(chunk.as_ref()) + }; + + match res { + Ok(bytes) if bytes.is_empty() => Poll::Ready(None), + Ok(bytes) => Poll::Ready(Some(Ok(bytes.into()))), + Err(_) => Poll::Ready(Some(Err(std::io::Error::new( + std::io::ErrorKind::Other, + "Decryption failed", + )))), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use futures::stream::StreamExt; + use garage_net::stream::read_stream_to_end; + + fn stream() -> ByteStream { + Box::pin( + futures::stream::iter(16usize..1024) + .map(|i| Ok(Bytes::from(vec![(i % 256) as u8; (i * 37) % 1024]))), + ) + } + + async fn test_block_enc(compression_level: Option) { + let enc = EncryptionParams::SseC { + client_key: Aes256Gcm::generate_key(&mut OsRng), + client_key_md5: Default::default(), // not needed + compression_level, + }; + + let block_plain = read_stream_to_end(stream()).await.unwrap().into_bytes(); + + let block_enc = enc.encrypt_block(block_plain.clone()).unwrap(); + + let block_dec = + enc.decrypt_block_stream(Box::pin(futures::stream::once(async { Ok(block_enc) }))); + let block_dec = read_stream_to_end(block_dec).await.unwrap().into_bytes(); + + assert_eq!(block_plain, block_dec); + assert!(block_dec.len() > 128000); + } + + #[tokio::test] + async fn test_encrypt_block() { + test_block_enc(None).await + } + + #[tokio::test] + async fn test_encrypt_block_compressed() { + test_block_enc(Some(1)).await + } +} diff --git a/src/api/s3/error.rs b/src/api/s3/error.rs index f86c19a6..5cb5d04e 100644 --- a/src/api/s3/error.rs +++ b/src/api/s3/error.rs @@ -65,6 +65,10 @@ pub enum Error { #[error(display = "Invalid HTTP range: {:?}", _0)] InvalidRange(#[error(from)] (http_range::HttpRangeParseError, u64)), + /// The client sent a range header with invalid value + #[error(display = "Invalid encryption algorithm: {:?}, 
should be AES256", _0)] + InvalidEncryptionAlgorithm(String), + /// The client sent a request for an action not supported by garage #[error(display = "Unimplemented action: {}", _0)] NotImplemented(String), @@ -126,6 +130,7 @@ impl Error { Error::InvalidXml(_) => "MalformedXML", Error::InvalidRange(_) => "InvalidRange", Error::InvalidUtf8Str(_) | Error::InvalidUtf8String(_) => "InvalidRequest", + Error::InvalidEncryptionAlgorithm(_) => "InvalidEncryptionAlgorithmError", } } } @@ -143,6 +148,7 @@ impl ApiError for Error { | Error::InvalidPart | Error::InvalidPartOrder | Error::EntityTooSmall + | Error::InvalidEncryptionAlgorithm(_) | Error::InvalidXml(_) | Error::InvalidUtf8Str(_) | Error::InvalidUtf8String(_) => StatusCode::BAD_REQUEST, diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index ed996fb1..1bca4671 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -3,8 +3,9 @@ use std::convert::TryInto; use std::sync::Arc; use std::time::{Duration, UNIX_EPOCH}; +use bytes::Bytes; use futures::future; -use futures::stream::{self, StreamExt}; +use futures::stream::{self, Stream, StreamExt}; use http::header::{ ACCEPT_RANGES, CACHE_CONTROL, CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_LENGTH, CONTENT_RANGE, CONTENT_TYPE, ETAG, EXPIRES, IF_MODIFIED_SINCE, IF_NONE_MATCH, @@ -25,6 +26,7 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::ResBody; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; const X_AMZ_MP_PARTS_COUNT: &str = "x-amz-mp-parts-count"; @@ -42,6 +44,8 @@ pub struct GetObjectOverrides { fn object_headers( version: &ObjectVersion, version_meta: &ObjectVersionMeta, + headers: &ObjectVersionHeaders, + encryption: EncryptionParams, ) -> http::response::Builder { debug!("Version meta: {:?}", version_meta); @@ -49,7 +53,7 @@ fn object_headers( let date_str = httpdate::fmt_http_date(date); let mut resp = Response::builder() - .header(CONTENT_TYPE, 
version_meta.headers.content_type.to_string()) + .header(CONTENT_TYPE, headers.content_type.to_string()) .header(LAST_MODIFIED, date_str) .header(ACCEPT_RANGES, "bytes".to_string()); @@ -57,10 +61,12 @@ fn object_headers( resp = resp.header(ETAG, format!("\"{}\"", version_meta.etag)); } - for (k, v) in version_meta.headers.other.iter() { + for (k, v) in headers.other.iter() { resp = resp.header(k, v.to_string()); } + encryption.add_response_headers(&mut resp); + resp } @@ -175,21 +181,27 @@ pub async fn handle_head_without_ctx( return Ok(cached); } + let (encryption, headers) = + EncryptionParams::check_decrypt(&garage, req.headers(), &version_meta.encryption)?; + if let Some(pn) = part_number { match version_data { - ObjectVersionData::Inline(_, bytes) => { + ObjectVersionData::Inline(_, _) => { if pn != 1 { return Err(Error::InvalidPart); } - Ok(object_headers(object_version, version_meta) - .header(CONTENT_LENGTH, format!("{}", bytes.len())) - .header( - CONTENT_RANGE, - format!("bytes 0-{}/{}", bytes.len() - 1, bytes.len()), - ) - .header(X_AMZ_MP_PARTS_COUNT, "1") - .status(StatusCode::PARTIAL_CONTENT) - .body(empty_body())?) 
+ let bytes_len = version_meta.size; + Ok( + object_headers(object_version, version_meta, &headers, encryption) + .header(CONTENT_LENGTH, format!("{}", bytes_len)) + .header( + CONTENT_RANGE, + format!("bytes 0-{}/{}", bytes_len - 1, bytes_len), + ) + .header(X_AMZ_MP_PARTS_COUNT, "1") + .status(StatusCode::PARTIAL_CONTENT) + .body(empty_body())?, + ) } ObjectVersionData::FirstBlock(_, _) => { let version = garage @@ -201,28 +213,32 @@ pub async fn handle_head_without_ctx( let (part_offset, part_end) = calculate_part_bounds(&version, pn).ok_or(Error::InvalidPart)?; - Ok(object_headers(object_version, version_meta) - .header(CONTENT_LENGTH, format!("{}", part_end - part_offset)) - .header( - CONTENT_RANGE, - format!( - "bytes {}-{}/{}", - part_offset, - part_end - 1, - version_meta.size - ), - ) - .header(X_AMZ_MP_PARTS_COUNT, format!("{}", version.n_parts()?)) - .status(StatusCode::PARTIAL_CONTENT) - .body(empty_body())?) + Ok( + object_headers(object_version, version_meta, &headers, encryption) + .header(CONTENT_LENGTH, format!("{}", part_end - part_offset)) + .header( + CONTENT_RANGE, + format!( + "bytes {}-{}/{}", + part_offset, + part_end - 1, + version_meta.size + ), + ) + .header(X_AMZ_MP_PARTS_COUNT, format!("{}", version.n_parts()?)) + .status(StatusCode::PARTIAL_CONTENT) + .body(empty_body())?, + ) } _ => unreachable!(), } } else { - Ok(object_headers(object_version, version_meta) - .header(CONTENT_LENGTH, format!("{}", version_meta.size)) - .status(StatusCode::OK) - .body(empty_body())?) + Ok( + object_headers(object_version, version_meta, &headers, encryption) + .header(CONTENT_LENGTH, format!("{}", version_meta.size)) + .status(StatusCode::OK) + .body(empty_body())?, + ) } } @@ -273,23 +289,41 @@ pub async fn handle_get_without_ctx( return Ok(cached); } + let (enc, headers) = + EncryptionParams::check_decrypt(&garage, req.headers(), &last_v_meta.encryption)?; + match (part_number, parse_range_header(req, last_v_meta.size)?) 
{ (Some(_), Some(_)) => Err(Error::bad_request( "Cannot specify both partNumber and Range header", )), - (Some(pn), None) => handle_get_part(garage, last_v, last_v_data, last_v_meta, pn).await, + (Some(pn), None) => { + handle_get_part(garage, last_v, last_v_data, last_v_meta, enc, &headers, pn).await + } (None, Some(range)) => { handle_get_range( garage, last_v, last_v_data, last_v_meta, + enc, + &headers, range.start, range.start + range.length, ) .await } - (None, None) => handle_get_full(garage, last_v, last_v_data, last_v_meta, overrides).await, + (None, None) => { + handle_get_full( + garage, + last_v, + last_v_data, + last_v_meta, + enc, + &headers, + overrides, + ) + .await + } } } @@ -298,17 +332,36 @@ async fn handle_get_full( version: &ObjectVersion, version_data: &ObjectVersionData, version_meta: &ObjectVersionMeta, + encryption: EncryptionParams, + headers: &ObjectVersionHeaders, overrides: GetObjectOverrides, ) -> Result, Error> { - let mut resp_builder = object_headers(version, version_meta) + let mut resp_builder = object_headers(version, version_meta, &headers, encryption) .header(CONTENT_LENGTH, format!("{}", version_meta.size)) .status(StatusCode::OK); getobject_override_headers(overrides, &mut resp_builder)?; + let stream = full_object_byte_stream(garage, version, version_data, encryption); + + Ok(resp_builder.body(response_body_from_stream(stream))?) +} + +pub fn full_object_byte_stream( + garage: Arc, + version: &ObjectVersion, + version_data: &ObjectVersionData, + encryption: EncryptionParams, +) -> ByteStream { match &version_data { ObjectVersionData::DeleteMarker => unreachable!(), ObjectVersionData::Inline(_, bytes) => { - Ok(resp_builder.body(bytes_body(bytes.to_vec().into()))?) 
+ let bytes = bytes.to_vec(); + Box::pin(futures::stream::once(async move { + encryption + .decrypt_blob(&bytes) + .map(|x| Bytes::from(x.to_vec())) + .map_err(std_error_from_read_error) + })) } ObjectVersionData::FirstBlock(_, first_block_hash) => { let (tx, rx) = mpsc::channel::(2); @@ -324,19 +377,18 @@ async fn handle_get_full( garage2.version_table.get(&version_uuid, &EmptyKey).await }); - let stream_block_0 = garage - .block_manager - .rpc_get_block_streaming(&first_block_hash, Some(order_stream.order(0))) + let stream_block_0 = encryption + .get_block(&garage, &first_block_hash, Some(order_stream.order(0))) .await?; + tx.send(stream_block_0) .await .ok_or_message("channel closed")?; let version = version_fut.await.unwrap()?.ok_or(Error::NoSuchKey)?; for (i, (_, vb)) in version.blocks.items().iter().enumerate().skip(1) { - let stream_block_i = garage - .block_manager - .rpc_get_block_streaming(&vb.hash, Some(order_stream.order(i as u64))) + let stream_block_i = encryption + .get_block(&garage, &vb.hash, Some(order_stream.order(i as u64))) .await?; tx.send(stream_block_i) .await @@ -354,8 +406,7 @@ async fn handle_get_full( } }); - let body = response_body_from_block_stream(rx); - Ok(resp_builder.body(body)?) 
+ Box::pin(tokio_stream::wrappers::ReceiverStream::new(rx).flatten()) } } } @@ -365,13 +416,15 @@ async fn handle_get_range( version: &ObjectVersion, version_data: &ObjectVersionData, version_meta: &ObjectVersionMeta, + encryption: EncryptionParams, + headers: &ObjectVersionHeaders, begin: u64, end: u64, ) -> Result, Error> { // Here we do not use getobject_override_headers because we don't // want to add any overridden headers (those should not be added // when returning PARTIAL_CONTENT) - let resp_builder = object_headers(version, version_meta) + let resp_builder = object_headers(version, version_meta, headers, encryption) .header(CONTENT_LENGTH, format!("{}", end - begin)) .header( CONTENT_RANGE, @@ -382,6 +435,7 @@ async fn handle_get_range( match &version_data { ObjectVersionData::DeleteMarker => unreachable!(), ObjectVersionData::Inline(_meta, bytes) => { + let bytes = encryption.decrypt_blob(&bytes)?; if end as usize <= bytes.len() { let body = bytes_body(bytes[begin as usize..end as usize].to_vec().into()); Ok(resp_builder.body(body)?) @@ -398,7 +452,8 @@ async fn handle_get_range( .await? .ok_or(Error::NoSuchKey)?; - let body = body_from_blocks_range(garage, version.blocks.items(), begin, end); + let body = + body_from_blocks_range(garage, encryption, version.blocks.items(), begin, end); Ok(resp_builder.body(body)?) 
} } @@ -409,17 +464,21 @@ async fn handle_get_part( object_version: &ObjectVersion, version_data: &ObjectVersionData, version_meta: &ObjectVersionMeta, + encryption: EncryptionParams, + headers: &ObjectVersionHeaders, part_number: u64, ) -> Result, Error> { // Same as for get_range, no getobject_override_headers - let resp_builder = - object_headers(object_version, version_meta).status(StatusCode::PARTIAL_CONTENT); + let resp_builder = object_headers(object_version, version_meta, headers, encryption) + .status(StatusCode::PARTIAL_CONTENT); match version_data { ObjectVersionData::Inline(_, bytes) => { if part_number != 1 { return Err(Error::InvalidPart); } + let bytes = encryption.decrypt_blob(&bytes)?; + assert_eq!(bytes.len() as u64, version_meta.size); Ok(resp_builder .header(CONTENT_LENGTH, format!("{}", bytes.len())) .header( @@ -427,7 +486,7 @@ async fn handle_get_part( format!("bytes {}-{}/{}", 0, bytes.len() - 1, bytes.len()), ) .header(X_AMZ_MP_PARTS_COUNT, "1") - .body(bytes_body(bytes.to_vec().into()))?) + .body(bytes_body(bytes.into_owned().into()))?) 
} ObjectVersionData::FirstBlock(_, _) => { let version = garage @@ -439,7 +498,8 @@ async fn handle_get_part( let (begin, end) = calculate_part_bounds(&version, part_number).ok_or(Error::InvalidPart)?; - let body = body_from_blocks_range(garage, version.blocks.items(), begin, end); + let body = + body_from_blocks_range(garage, encryption, version.blocks.items(), begin, end); Ok(resp_builder .header(CONTENT_LENGTH, format!("{}", end - begin)) @@ -494,6 +554,7 @@ fn calculate_part_bounds(v: &Version, part_number: u64) -> Option<(u64, u64)> { fn body_from_blocks_range( garage: Arc, + encryption: EncryptionParams, all_blocks: &[(VersionBlockKey, VersionBlock)], begin: u64, end: u64, @@ -523,12 +584,11 @@ fn body_from_blocks_range( tokio::spawn(async move { match async { - let garage = garage.clone(); for (i, (block, block_offset)) in blocks.iter().enumerate() { - let block_stream = garage - .block_manager - .rpc_get_block_streaming(&block.hash, Some(order_stream.order(i as u64))) - .await? + let block_stream = encryption + .get_block(&garage, &block.hash, Some(order_stream.order(i as u64))) + .await?; + let block_stream = block_stream .scan(*block_offset, move |chunk_offset, chunk| { let r = match chunk { Ok(chunk_bytes) => { @@ -588,19 +648,30 @@ fn body_from_blocks_range( } fn response_body_from_block_stream(rx: mpsc::Receiver) -> ResBody { - let body_stream = tokio_stream::wrappers::ReceiverStream::new(rx) - .flatten() - .map(|x| { - x.map(hyper::body::Frame::data) - .map_err(|e| Error::from(garage_util::error::Error::from(e))) - }); + let body_stream = tokio_stream::wrappers::ReceiverStream::new(rx).flatten(); + response_body_from_stream(body_stream) +} + +fn response_body_from_stream(stream: S) -> ResBody +where + S: Stream> + Send + Sync + 'static, +{ + let body_stream = stream.map(|x| { + x.map(hyper::body::Frame::data) + .map_err(|e| Error::from(garage_util::error::Error::from(e))) + }); ResBody::new(http_body_util::StreamBody::new(body_stream)) } fn 
error_stream_item(e: E) -> ByteStream { - let err = std::io::Error::new( + Box::pin(stream::once(future::ready(Err(std_error_from_read_error( + e, + ))))) +} + +fn std_error_from_read_error(e: E) -> std::io::Error { + std::io::Error::new( std::io::ErrorKind::Other, - format!("Error while getting object data: {}", e), - ); - Box::pin(stream::once(future::ready(Err(err)))) + format!("Error while reading object data: {}", e), + ) } diff --git a/src/api/s3/list.rs b/src/api/s3/list.rs index 302c03f4..a7eebbb1 100644 --- a/src/api/s3/list.rs +++ b/src/api/s3/list.rs @@ -944,9 +944,11 @@ mod tests { timestamp: TS, state: ObjectVersionState::Uploading { multipart: true, - headers: ObjectVersionHeaders { - content_type: "text/plain".to_string(), - other: BTreeMap::::new(), + encryption: ObjectVersionEncryption::Plaintext { + headers: ObjectVersionHeaders { + content_type: "text/plain".to_string(), + other: BTreeMap::::new(), + }, }, }, } diff --git a/src/api/s3/mod.rs b/src/api/s3/mod.rs index cbdb94ab..1eb95d40 100644 --- a/src/api/s3/mod.rs +++ b/src/api/s3/mod.rs @@ -13,5 +13,6 @@ mod post_object; mod put; mod website; +mod encryption; mod router; pub mod xml; diff --git a/src/api/s3/multipart.rs b/src/api/s3/multipart.rs index 1d5aeb26..fcc5769f 100644 --- a/src/api/s3/multipart.rs +++ b/src/api/s3/multipart.rs @@ -16,6 +16,7 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; use crate::s3::put::*; use crate::s3::xml as s3_xml; @@ -41,13 +42,17 @@ pub async fn handle_create_multipart_upload( let headers = get_headers(req.headers())?; + // Determine whether object should be encrypted, and if so the key + let encryption = EncryptionParams::new_from_headers(&garage, req.headers())?; + let object_encryption = encryption.encrypt_headers(headers)?; + // Create object in object table let object_version = ObjectVersion { uuid: upload_id, 
timestamp, state: ObjectVersionState::Uploading { multipart: true, - headers, + encryption: object_encryption, }, }; let object = Object::new(*bucket_id, key.to_string(), vec![object_version]); @@ -68,7 +73,9 @@ pub async fn handle_create_multipart_upload( }; let xml = s3_xml::to_xml_with_header(&result)?; - Ok(Response::new(string_body(xml))) + let mut resp = Response::builder(); + encryption.add_response_headers(&mut resp); + Ok(resp.body(string_body(xml))?) } pub async fn handle_put_part( @@ -91,12 +98,21 @@ pub async fn handle_put_part( // Read first chuck, and at the same time try to get object to see if it exists let key = key.to_string(); - let stream = body_stream(req.into_body()); + let (req_head, req_body) = req.into_parts(); + let stream = body_stream(req_body); let mut chunker = StreamChunker::new(stream, garage.config.block_size); - let ((_, _, mut mpu), first_block) = + let ((_, object_version, mut mpu), first_block) = futures::try_join!(get_upload(&ctx, &key, &upload_id), chunker.next(),)?; + // Check encryption params + let object_encryption = match object_version.state { + ObjectVersionState::Uploading { encryption, .. 
} => encryption, + _ => unreachable!(), + }; + let (encryption, _) = + EncryptionParams::check_decrypt(&garage, &req_head.headers, &object_encryption)?; + // Check object is valid and part can be accepted let first_block = first_block.ok_or_bad_request("Empty body")?; @@ -136,24 +152,32 @@ pub async fn handle_put_part( garage.version_table.insert(&version).await?; // Copy data to version - let (total_size, data_md5sum, data_sha256sum, _) = - read_and_put_blocks(&ctx, &version, part_number, first_block, &mut chunker).await?; + let (total_size, data_md5sum, data_sha256sum, _) = read_and_put_blocks( + &ctx, + &version, + encryption, + part_number, + first_block, + &mut chunker, + ) + .await?; // Verify that checksums map ensure_checksum_matches( - data_md5sum.as_slice(), + &data_md5sum, data_sha256sum, content_md5.as_deref(), content_sha256, )?; // Store part etag in version - let data_md5sum_hex = hex::encode(data_md5sum); + let etag = encryption.etag_from_md5(&data_md5sum); + mpu.parts.put( mpu_part_key, MpuPart { version: version_uuid, - etag: Some(data_md5sum_hex.clone()), + etag: Some(etag.clone()), size: Some(total_size), }, ); @@ -163,11 +187,9 @@ pub async fn handle_put_part( // We won't have to clean up on drop. interrupted_cleanup.cancel(); - let response = Response::builder() - .header("ETag", format!("\"{}\"", data_md5sum_hex)) - .body(empty_body()) - .unwrap(); - Ok(response) + let mut resp = Response::builder().header("ETag", format!("\"{}\"", etag)); + encryption.add_response_headers(&mut resp); + Ok(resp.body(empty_body())?) } struct InterruptedCleanup(Option); @@ -241,8 +263,8 @@ pub async fn handle_complete_multipart_upload( return Err(Error::bad_request("No data was uploaded")); } - let headers = match object_version.state { - ObjectVersionState::Uploading { headers, .. } => headers, + let object_encryption = match object_version.state { + ObjectVersionState::Uploading { encryption, .. 
} => encryption, _ => unreachable!(), }; @@ -344,7 +366,7 @@ pub async fn handle_complete_multipart_upload( // Write final object version object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( ObjectVersionMeta { - headers, + encryption: object_encryption, size: total_size, etag: etag.clone(), }, diff --git a/src/api/s3/post_object.rs b/src/api/s3/post_object.rs index 66f8174c..7c4219a7 100644 --- a/src/api/s3/post_object.rs +++ b/src/api/s3/post_object.rs @@ -18,6 +18,7 @@ use garage_model::garage::Garage; use crate::helpers::*; use crate::s3::api_server::ResBody; use crate::s3::cors::*; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; use crate::s3::put::{get_headers, save_stream}; use crate::s3::xml as s3_xml; @@ -48,13 +49,17 @@ pub async fn handle_post_object( let mut multipart = Multipart::with_constraints(stream, boundary, constraints); let mut params = HeaderMap::new(); - let field = loop { + let file_field = loop { let field = if let Some(field) = multipart.next_field().await? { field } else { return Err(Error::bad_request("Request did not contain a file")); }; - let name: HeaderName = if let Some(Ok(name)) = field.name().map(TryInto::try_into) { + let name: HeaderName = if let Some(Ok(name)) = field + .name() + .map(str::to_ascii_lowercase) + .map(TryInto::try_into) + { name } else { continue; @@ -93,10 +98,14 @@ pub async fn handle_post_object( .ok_or_bad_request("No policy was provided")? .to_str()?; let authorization = Authorization::parse_form(¶ms)?; + let content_md5 = params + .get("content-md5") + .map(HeaderValue::to_str) + .transpose()?; let key = if key.contains("${filename}") { // if no filename is provided, don't replace. This matches the behavior of AWS. 
- if let Some(filename) = field.file_name() { + if let Some(filename) = file_field.file_name() { key.replace("${filename}", filename) } else { key.to_owned() @@ -143,9 +152,8 @@ pub async fn handle_post_object( let mut conditions = decoded_policy.into_conditions()?; for (param_key, value) in params.iter() { - let mut param_key = param_key.to_string(); - param_key.make_ascii_lowercase(); - match param_key.as_str() { + let param_key = param_key.as_str(); + match param_key { "policy" | "x-amz-signature" => (), // this is always accepted, as it's required to validate other fields "content-type" => { let conds = conditions.params.remove("content-type").ok_or_else(|| { @@ -190,7 +198,7 @@ pub async fn handle_post_object( // how aws seems to behave. continue; } - let conds = conditions.params.remove(¶m_key).ok_or_else(|| { + let conds = conditions.params.remove(param_key).ok_or_else(|| { Error::bad_request(format!("Key '{}' is not allowed in policy", param_key)) })?; for cond in conds { @@ -218,8 +226,9 @@ pub async fn handle_post_object( let headers = get_headers(¶ms)?; - let stream = field.map(|r| r.map_err(Into::into)); + let encryption = EncryptionParams::new_from_headers(&garage, ¶ms)?; + let stream = file_field.map(|r| r.map_err(Into::into)); let ctx = ReqCtx { garage, bucket_id, @@ -228,17 +237,18 @@ pub async fn handle_post_object( api_key, }; - let (_, md5) = save_stream( + let res = save_stream( &ctx, headers, + encryption, StreamLimiter::new(stream, conditions.content_length), &key, - None, + content_md5.map(str::to_string), None, ) .await?; - let etag = format!("\"{}\"", md5); + let etag = format!("\"{}\"", res.etag); let mut resp = if let Some(mut target) = params .get("success_action_redirect") @@ -252,11 +262,12 @@ pub async fn handle_post_object( .append_pair("key", &key) .append_pair("etag", &etag); let target = target.to_string(); - Response::builder() + let mut resp = Response::builder() .status(StatusCode::SEE_OTHER) .header(header::LOCATION, 
target.clone()) - .header(header::ETAG, etag) - .body(string_body(target))? + .header(header::ETAG, etag); + encryption.add_response_headers(&mut resp); + resp.body(string_body(target))? } else { let path = head .uri @@ -283,9 +294,10 @@ pub async fn handle_post_object( .get("success_action_status") .and_then(|h| h.to_str().ok()) .unwrap_or("204"); - let builder = Response::builder() + let mut builder = Response::builder() .header(header::LOCATION, location.clone()) .header(header::ETAG, etag.clone()); + encryption.add_response_headers(&mut builder); match action { "200" => builder.status(StatusCode::OK).body(empty_body())?, "201" => { diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index 2ced0580..745c2219 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -36,10 +36,18 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; const PUT_BLOCKS_MAX_PARALLEL: usize = 3; +pub struct SaveStreamResult { + pub version_uuid: Uuid, + pub version_timestamp: u64, + /// Etag WITHOUT THE QUOTES (just the hex value) + pub etag: String, +} + pub async fn handle_put( ctx: ReqCtx, req: Request, @@ -50,6 +58,9 @@ pub async fn handle_put( let headers = get_headers(req.headers())?; debug!("Object headers: {:?}", headers); + // Determine whether object should be encrypted, and if so the key + let encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?; + let content_md5 = match req.headers().get("content-md5") { Some(x) => Some(x.to_str()?.to_string()), None => None, @@ -57,19 +68,33 @@ pub async fn handle_put( let stream = body_stream(req.into_body()); - save_stream(&ctx, headers, stream, key, content_md5, content_sha256) - .await - .map(|(uuid, md5)| put_response(uuid, md5)) + let res = save_stream( + &ctx, + headers, + encryption, + stream, + key, + content_md5, + content_sha256, + ) + .await?; + + let mut resp = 
Response::builder() + .header("x-amz-version-id", hex::encode(res.version_uuid)) + .header("ETag", format!("\"{}\"", res.etag)); + encryption.add_response_headers(&mut resp); + Ok(resp.body(empty_body())?) } pub(crate) async fn save_stream> + Unpin>( ctx: &ReqCtx, headers: ObjectVersionHeaders, + encryption: EncryptionParams, body: S, key: &String, content_md5: Option, content_sha256: Option, -) -> Result<(Uuid, String), Error> { +) -> Result { let ReqCtx { garage, bucket_id, .. } = ctx; @@ -82,6 +107,8 @@ pub(crate) async fn save_stream> + Unpin>( let first_block = first_block_opt.unwrap_or_default(); + let object_encryption = encryption.encrypt_headers(headers)?; + // Generate identity of new version let version_uuid = gen_uuid(); let version_timestamp = next_timestamp(existing_object.as_ref()); @@ -92,37 +119,43 @@ pub(crate) async fn save_stream> + Unpin>( let mut md5sum = Md5::new(); md5sum.update(&first_block[..]); let data_md5sum = md5sum.finalize(); - let data_md5sum_hex = hex::encode(data_md5sum); let data_sha256sum = sha256sum(&first_block[..]); - let size = first_block.len() as u64; ensure_checksum_matches( - data_md5sum.as_slice(), + &data_md5sum, data_sha256sum, content_md5.as_deref(), content_sha256, )?; + let size = first_block.len() as u64; check_quotas(ctx, size, existing_object.as_ref()).await?; + let etag = encryption.etag_from_md5(&data_md5sum); + let inline_data = encryption.encrypt_blob(&first_block)?.to_vec(); + let object_version = ObjectVersion { uuid: version_uuid, timestamp: version_timestamp, state: ObjectVersionState::Complete(ObjectVersionData::Inline( ObjectVersionMeta { - headers, + encryption: object_encryption, size, - etag: data_md5sum_hex.clone(), + etag: etag.clone(), }, - first_block.to_vec(), + inline_data, )), }; let object = Object::new(*bucket_id, key.into(), vec![object_version]); garage.object_table.insert(&object).await?; - return Ok((version_uuid, data_md5sum_hex)); + return Ok(SaveStreamResult { + version_uuid, + 
version_timestamp, + etag, + }); } // The following consists in many steps that can each fail. @@ -142,7 +175,7 @@ pub(crate) async fn save_stream> + Unpin>( uuid: version_uuid, timestamp: version_timestamp, state: ObjectVersionState::Uploading { - headers: headers.clone(), + encryption: object_encryption.clone(), multipart: false, }, }; @@ -165,10 +198,10 @@ pub(crate) async fn save_stream> + Unpin>( // Transfer data and verify checksum let (total_size, data_md5sum, data_sha256sum, first_block_hash) = - read_and_put_blocks(ctx, &version, 1, first_block, &mut chunker).await?; + read_and_put_blocks(ctx, &version, encryption, 1, first_block, &mut chunker).await?; ensure_checksum_matches( - data_md5sum.as_slice(), + &data_md5sum, data_sha256sum, content_md5.as_deref(), content_sha256, @@ -177,12 +210,13 @@ pub(crate) async fn save_stream> + Unpin>( check_quotas(ctx, total_size, existing_object.as_ref()).await?; // Save final object state, marked as Complete - let md5sum_hex = hex::encode(data_md5sum); + let etag = encryption.etag_from_md5(&data_md5sum); + object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( ObjectVersionMeta { - headers, + encryption: object_encryption, size: total_size, - etag: md5sum_hex.clone(), + etag: etag.clone(), }, first_block_hash, )); @@ -193,7 +227,11 @@ pub(crate) async fn save_stream> + Unpin>( // We won't have to clean up on drop. 
interrupted_cleanup.cancel(); - Ok((version_uuid, md5sum_hex)) + Ok(SaveStreamResult { + version_uuid, + version_timestamp, + etag, + }) } /// Validate MD5 sum against content-md5 header @@ -290,6 +328,7 @@ pub(crate) async fn check_quotas( pub(crate) async fn read_and_put_blocks> + Unpin>( ctx: &ReqCtx, version: &Version, + encryption: EncryptionParams, part_number: u64, first_block: Bytes, chunker: &mut StreamChunker, @@ -349,12 +388,31 @@ pub(crate) async fn read_and_put_blocks> + )) }; - let (block_tx3, mut block_rx3) = mpsc::channel::>(1); - let hash_blocks = async { + let (block_tx3, mut block_rx3) = mpsc::channel::>(1); + let encrypt_hash_blocks = async { let mut first_block_hash = None; while let Some(next) = block_rx2.recv().await { match next { Ok(block) => { + let unencrypted_len = block.len() as u64; + let block = if encryption.is_encrypted() { + let res = + tokio::task::spawn_blocking(move || encryption.encrypt_block(block)) + .with_context(Context::current_with_span( + tracer.start("Encrypt block"), + )) + .await + .unwrap(); + match res { + Ok(b) => b, + Err(e) => { + block_tx3.send(Err(e)).await?; + break; + } + } + } else { + block + }; let hash = async_blake2sum(block.clone()) .with_context(Context::current_with_span( tracer.start("Hash block (blake2)"), @@ -363,7 +421,7 @@ pub(crate) async fn read_and_put_blocks> + if first_block_hash.is_none() { first_block_hash = Some(hash); } - block_tx3.send(Ok((block, hash))).await?; + block_tx3.send(Ok((block, unencrypted_len, hash))).await?; } Err(e) => { block_tx3.send(Err(e)).await?; @@ -398,7 +456,7 @@ pub(crate) async fn read_and_put_blocks> + block_rx3.recv().await } }; - let (block, hash) = tokio::select! { + let (block, unencrypted_len, hash) = tokio::select! 
{ result = write_futs_next => { result?; continue; @@ -410,17 +468,18 @@ pub(crate) async fn read_and_put_blocks> + }; // For next block to be written: count its size and spawn future to write it - let offset = written_bytes; - written_bytes += block.len() as u64; write_futs.push_back(put_block_and_meta( ctx, version, part_number, - offset, + written_bytes, hash, block, + unencrypted_len, + encryption.is_encrypted(), order_stream.order(written_bytes), )); + written_bytes += unencrypted_len; } while let Some(res) = write_futs.next().await { res?; @@ -429,7 +488,7 @@ pub(crate) async fn read_and_put_blocks> + }; let (_, stream_hash_result, block_hash_result, final_result) = - futures::join!(read_blocks, hash_stream, hash_blocks, put_blocks); + futures::join!(read_blocks, hash_stream, encrypt_hash_blocks, put_blocks); let total_size = final_result?; // unwrap here is ok, because if hasher failed, it is because something failed @@ -449,6 +508,8 @@ async fn put_block_and_meta( offset: u64, hash: Hash, block: Bytes, + size: u64, + is_encrypted: bool, order_tag: OrderTag, ) -> Result<(), GarageError> { let ReqCtx { garage, .. 
} = ctx; @@ -459,10 +520,7 @@ async fn put_block_and_meta( part_number, offset, }, - VersionBlock { - hash, - size: block.len() as u64, - }, + VersionBlock { hash, size }, ); let block_ref = BlockRef { @@ -474,7 +532,7 @@ async fn put_block_and_meta( futures::try_join!( garage .block_manager - .rpc_put_block(hash, block, Some(order_tag)), + .rpc_put_block(hash, block, is_encrypted, Some(order_tag)), garage.version_table.insert(&version), garage.block_ref_table.insert(&block_ref), )?; @@ -517,14 +575,6 @@ impl> + Unpin> StreamChunker { } } -pub fn put_response(version_uuid: Uuid, md5sum_hex: String) -> Response { - Response::builder() - .header("x-amz-version-id", hex::encode(version_uuid)) - .header("ETag", format!("\"{}\"", md5sum_hex)) - .body(empty_body()) - .unwrap() -} - struct InterruptedCleanup(Option); struct InterruptedCleanupInner { garage: Arc, diff --git a/src/block/block.rs b/src/block/block.rs index 504d11f8..bd95680e 100644 --- a/src/block/block.rs +++ b/src/block/block.rs @@ -96,7 +96,7 @@ impl DataBlock { } } -fn zstd_encode(mut source: R, level: i32) -> std::io::Result> { +pub fn zstd_encode(mut source: R, level: i32) -> std::io::Result> { let mut result = Vec::::new(); let mut encoder = Encoder::new(&mut result, level)?; encoder.include_checksum(true)?; diff --git a/src/block/lib.rs b/src/block/lib.rs index c9ff2845..6c4711ef 100644 --- a/src/block/lib.rs +++ b/src/block/lib.rs @@ -9,3 +9,5 @@ mod block; mod layout; mod metrics; mod rc; + +pub use block::zstd_encode; diff --git a/src/block/manager.rs b/src/block/manager.rs index f4d8ee56..c7e4df17 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -337,26 +337,18 @@ impl BlockManager { } } - /// Ask nodes that might have a block for it, return it as one big Bytes - pub async fn rpc_get_block( - &self, - hash: &Hash, - order_tag: Option, - ) -> Result { - let stream = self.rpc_get_block_streaming(hash, order_tag).await?; - Ok(read_stream_to_end(stream).await?.into_bytes()) - } - /// 
Send block to nodes that should have it pub async fn rpc_put_block( &self, hash: Hash, data: Bytes, + prevent_compression: bool, order_tag: Option<OrderTag>, ) -> Result<(), Error> { let who = self.replication.write_sets(&hash); - let (header, bytes) = DataBlock::from_buffer(data, self.compression_level) + let compression_level = self.compression_level.filter(|_| !prevent_compression); + let (header, bytes) = DataBlock::from_buffer(data, compression_level) .await .into_parts(); let put_block_rpc = diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index ebea04bd..7fa4b9e0 100644 --- a/src/model/s3/object_table.rs +++ b/src/model/s3/object_table.rs @@ -210,7 +210,165 @@ mod v09 { } } -pub use v09::*; +mod v010 { + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + + use super::v09; + + pub use v09::ObjectVersionHeaders; + + /// An object + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct Object { + /// The bucket in which the object is stored, used as partition key + pub bucket_id: Uuid, + + /// The key at which the object is stored in its bucket, used as sorting key + pub key: String, + + /// The list of currently stored versions of the object + pub(super) versions: Vec<ObjectVersion>, + } + + /// Information about a version of an object + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct ObjectVersion { + /// Id of the version + pub uuid: Uuid, + /// Timestamp of when the object was created + pub timestamp: u64, + /// State of the version + pub state: ObjectVersionState, + } + + /// State of an object version + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub enum ObjectVersionState { + /// The version is being received + Uploading { + /// Indicates whether this is a multipart upload + multipart: bool, + /// Encryption params + headers to be included in the final object + encryption: ObjectVersionEncryption, + }, + /// The version is fully received + 
Complete(ObjectVersionData), + /// The version uploaded contained errors or the upload was explicitly aborted + Aborted, + } + + /// Data stored in object version + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub enum ObjectVersionData { + /// The object was deleted, this Version is a tombstone to mark it as such + DeleteMarker, + /// The object is short, it's stored inlined. + /// It is never compressed. For encrypted objects, it is encrypted using + /// AES256-GCM, like the encrypted headers. + Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec<u8>), + /// The object is not short, Hash of first block is stored here, next segments hashes are + /// stored in the version table + FirstBlock(ObjectVersionMeta, Hash), + } + + /// Metadata about the object version + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct ObjectVersionMeta { + /// Size of the object. If object is encrypted/compressed, + /// this is always the size of the unencrypted/uncompressed data + pub size: u64, + /// etag of the object + pub etag: String, + /// Encryption params + headers (encrypted or plaintext) + pub encryption: ObjectVersionEncryption, + } + + /// Encryption information + metadata + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub enum ObjectVersionEncryption { + SseC { + /// Encrypted serialized ObjectVersionHeaders struct. + /// This is never compressed, just encrypted using AES256-GCM. 
+ #[serde(with = "serde_bytes")] + headers: Vec<u8>, + /// Whether data blocks are compressed in addition to being encrypted + /// (compression happens before encryption, whereas for non-encrypted + /// objects, compression is handled at the level of the block manager) + compressed: bool, + }, + Plaintext { + /// Plain-text headers + headers: ObjectVersionHeaders, + }, + } + + impl garage_util::migrate::Migrate for Object { + const VERSION_MARKER: &'static [u8] = b"G010s3ob"; + + type Previous = v09::Object; + + fn migrate(old: v09::Object) -> Object { + Object { + bucket_id: old.bucket_id, + key: old.key, + versions: old.versions.into_iter().map(migrate_version).collect(), + } + } + } + + fn migrate_version(old: v09::ObjectVersion) -> ObjectVersion { + ObjectVersion { + uuid: old.uuid, + timestamp: old.timestamp, + state: match old.state { + v09::ObjectVersionState::Uploading { multipart, headers } => { + ObjectVersionState::Uploading { + multipart, + encryption: migrate_headers(headers), + } + } + v09::ObjectVersionState::Complete(d) => { + ObjectVersionState::Complete(migrate_data(d)) + } + v09::ObjectVersionState::Aborted => ObjectVersionState::Aborted, + }, + } + } + + fn migrate_data(old: v09::ObjectVersionData) -> ObjectVersionData { + match old { + v09::ObjectVersionData::DeleteMarker => ObjectVersionData::DeleteMarker, + v09::ObjectVersionData::Inline(meta, data) => { + ObjectVersionData::Inline(migrate_meta(meta), data) + } + v09::ObjectVersionData::FirstBlock(meta, fb) => { + ObjectVersionData::FirstBlock(migrate_meta(meta), fb) + } + } + } + + fn migrate_meta(old: v09::ObjectVersionMeta) -> ObjectVersionMeta { + ObjectVersionMeta { + size: old.size, + etag: old.etag, + encryption: migrate_headers(old.headers), + } + } + + fn migrate_headers(old: v09::ObjectVersionHeaders) -> ObjectVersionEncryption { + ObjectVersionEncryption::Plaintext { headers: old } + } + + // Since ObjectVersionHeaders can now be serialized independently, for the + // purpose of being 
encrypted, we need it to support migrations on its own + // as well. + impl garage_util::migrate::InitialFormat for ObjectVersionHeaders { + const VERSION_MARKER: &'static [u8] = b"G010s3oh"; + } +} + +pub use v010::*; impl Object { /// Initialize an Object struct from parts diff --git a/src/model/s3/version_table.rs b/src/model/s3/version_table.rs index 5c032f9f..b4662a55 100644 --- a/src/model/s3/version_table.rs +++ b/src/model/s3/version_table.rs @@ -44,7 +44,8 @@ mod v05 { pub struct VersionBlockKey { /// Number of the part pub part_number: u64, - /// Offset of this sub-segment in its part + /// Offset of this sub-segment in its part as sent by the client + /// (before any kind of compression or encryption) pub offset: u64, } @@ -53,7 +54,7 @@ mod v05 { pub struct VersionBlock { /// Blake2 sum of the block pub hash: Hash, - /// Size of the block + /// Size of the block, before any kind of compression or encryption pub size: u64, } -- cgit v1.2.3 From fa4878bad6434f33ab9e0f663d8529e0db66d7e6 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 28 Feb 2024 14:09:41 +0100 Subject: [sse-c] Testing for SSE-C encryption --- script/test-smoke.sh | 13 ++ src/garage/tests/s3/mod.rs | 1 + src/garage/tests/s3/ssec.rs | 455 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 469 insertions(+) create mode 100644 src/garage/tests/s3/ssec.rs diff --git a/script/test-smoke.sh b/script/test-smoke.sh index 9f9ea50c..acf56a90 100755 --- a/script/test-smoke.sh +++ b/script/test-smoke.sh @@ -82,6 +82,19 @@ if [ -z "$SKIP_AWS" ]; then exit 1 fi aws s3api delete-object --bucket eprouvette --key upload + + echo "🛠️ Test SSE-C with awscli (aws s3)" + SSEC_KEY="u8zCfnEyt5Imo/krN+sxA1DQXxLWtPJavU6T6gOVj1Y=" + SSEC_KEY_MD5="jMGbs3GyZkYjJUP6q5jA7g==" + echo "$SSEC_KEY" | base64 -d > /tmp/garage.ssec-key + for idx in {1,2}.rnd; do + aws s3 cp --sse-c AES256 --sse-c-key fileb:///tmp/garage.ssec-key \ + "/tmp/garage.$idx" "s3://eprouvette/garage.$idx.aws.sse-c" + aws s3 cp 
--sse-c AES256 --sse-c-key fileb:///tmp/garage.ssec-key \ + "s3://eprouvette/garage.$idx.aws.sse-c" "/tmp/garage.$idx.dl.sse-c" + diff "/tmp/garage.$idx" "/tmp/garage.$idx.dl.sse-c" + aws s3api delete-object --bucket eprouvette --key "garage.$idx.aws.sse-c" + done fi # S3CMD diff --git a/src/garage/tests/s3/mod.rs b/src/garage/tests/s3/mod.rs index 4ebc4914..e75b1397 100644 --- a/src/garage/tests/s3/mod.rs +++ b/src/garage/tests/s3/mod.rs @@ -3,5 +3,6 @@ mod multipart; mod objects; mod presigned; mod simple; +mod ssec; mod streaming_signature; mod website; diff --git a/src/garage/tests/s3/ssec.rs b/src/garage/tests/s3/ssec.rs new file mode 100644 index 00000000..d8f11950 --- /dev/null +++ b/src/garage/tests/s3/ssec.rs @@ -0,0 +1,455 @@ +use crate::common::{self, Context}; +use aws_sdk_s3::primitives::ByteStream; +use aws_sdk_s3::types::{CompletedMultipartUpload, CompletedPart}; + +const SSEC_KEY: &str = "u8zCfnEyt5Imo/krN+sxA1DQXxLWtPJavU6T6gOVj1Y="; +const SSEC_KEY_MD5: &str = "jMGbs3GyZkYjJUP6q5jA7g=="; +const SSEC_KEY2: &str = "XkYVk4Z3vVDO2yJaUqCAEZX6lL10voMxtV06d8my/eU="; +const SSEC_KEY2_MD5: &str = "kedo2ab8J1MCjHwJuLTJHw=="; + +const SZ_2MB: usize = 2 * 1024 * 1024; + +#[tokio::test] +async fn test_ssec_object() { + let ctx = common::context(); + let bucket = ctx.create_bucket("sse-c"); + + let bytes1 = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz".to_vec(); + let bytes2 = (0..400000) + .map(|x| ((x * 3792) % 256) as u8) + .collect::>(); + + for data in vec![bytes1, bytes2] { + let stream = ByteStream::new(data.clone().into()); + + // Write encrypted object + let r = ctx + .client + .put_object() + .bucket(&bucket) + .key("testobj") + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY) + .sse_customer_key_md5(SSEC_KEY_MD5) + .body(stream) + .send() + .await + .unwrap(); + assert_eq!(r.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY_MD5.into())); + + test_read_encrypted( + 
&ctx, + &bucket, + "testobj", + &data, + SSEC_KEY, + SSEC_KEY_MD5, + SSEC_KEY2, + SSEC_KEY2_MD5, + ) + .await; + + // Test copy from encrypted to non-encrypted + let r = ctx + .client + .copy_object() + .bucket(&bucket) + .key("test-copy-enc-dec") + .copy_source(format!("{}/{}", bucket, "testobj")) + .copy_source_sse_customer_algorithm("AES256") + .copy_source_sse_customer_key(SSEC_KEY) + .copy_source_sse_customer_key_md5(SSEC_KEY_MD5) + .send() + .await + .unwrap(); + assert_eq!(r.sse_customer_algorithm, None); + assert_eq!(r.sse_customer_key_md5, None); + + // Test read decrypted file + let r = ctx + .client + .get_object() + .bucket(&bucket) + .key("test-copy-enc-dec") + .send() + .await + .unwrap(); + assert_bytes_eq!(r.body, &data); + assert_eq!(r.sse_customer_algorithm, None); + assert_eq!(r.sse_customer_key_md5, None); + + // Test copy from non-encrypted to encrypted + let r = ctx + .client + .copy_object() + .bucket(&bucket) + .key("test-copy-enc-dec-enc") + .copy_source(format!("{}/test-copy-enc-dec", bucket)) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + assert_eq!(r.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY2_MD5.into())); + + test_read_encrypted( + &ctx, + &bucket, + "test-copy-enc-dec-enc", + &data, + SSEC_KEY2, + SSEC_KEY2_MD5, + SSEC_KEY, + SSEC_KEY_MD5, + ) + .await; + + // Test copy from encrypted to encrypted with different keys + let r = ctx + .client + .copy_object() + .bucket(&bucket) + .key("test-copy-enc-enc") + .copy_source(format!("{}/{}", bucket, "testobj")) + .copy_source_sse_customer_algorithm("AES256") + .copy_source_sse_customer_key(SSEC_KEY) + .copy_source_sse_customer_key_md5(SSEC_KEY_MD5) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + assert_eq!(r.sse_customer_algorithm, Some("AES256".into())); 
+ assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY2_MD5.into())); + test_read_encrypted( + &ctx, + &bucket, + "test-copy-enc-enc", + &data, + SSEC_KEY2, + SSEC_KEY2_MD5, + SSEC_KEY, + SSEC_KEY_MD5, + ) + .await; + + // Test copy from encrypted to encrypted with the same key + let r = ctx + .client + .copy_object() + .bucket(&bucket) + .key("test-copy-enc-enc-same") + .copy_source(format!("{}/{}", bucket, "testobj")) + .copy_source_sse_customer_algorithm("AES256") + .copy_source_sse_customer_key(SSEC_KEY) + .copy_source_sse_customer_key_md5(SSEC_KEY_MD5) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY) + .sse_customer_key_md5(SSEC_KEY_MD5) + .send() + .await + .unwrap(); + assert_eq!(r.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY_MD5.into())); + test_read_encrypted( + &ctx, + &bucket, + "test-copy-enc-enc-same", + &data, + SSEC_KEY, + SSEC_KEY_MD5, + SSEC_KEY2, + SSEC_KEY2_MD5, + ) + .await; + } +} + +#[tokio::test] +async fn test_multipart_upload() { + let ctx = common::context(); + let bucket = ctx.create_bucket("test-ssec-mpu"); + + let u1 = vec![0x11; SZ_2MB]; + let u2 = vec![0x22; SZ_2MB]; + let u3 = vec![0x33; SZ_2MB]; + let all = [&u1[..], &u2[..], &u3[..]].concat(); + + // Test simple encrypted mpu + { + let up = ctx + .client + .create_multipart_upload() + .bucket(&bucket) + .key("a") + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY) + .sse_customer_key_md5(SSEC_KEY_MD5) + .send() + .await + .unwrap(); + assert!(up.upload_id.is_some()); + assert_eq!(up.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(up.sse_customer_key_md5, Some(SSEC_KEY_MD5.into())); + + let uid = up.upload_id.as_ref().unwrap(); + + let mut etags = vec![]; + for (i, part) in vec![&u1, &u2, &u3].into_iter().enumerate() { + let pu = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number((i + 1) as i32) + .sse_customer_algorithm("AES256") + 
.sse_customer_key(SSEC_KEY) + .sse_customer_key_md5(SSEC_KEY_MD5) + .body(ByteStream::from(part.to_vec())) + .send() + .await + .unwrap(); + etags.push(pu.e_tag.unwrap()); + } + + let mut cmp = CompletedMultipartUpload::builder(); + for (i, etag) in etags.into_iter().enumerate() { + cmp = cmp.parts( + CompletedPart::builder() + .part_number((i + 1) as i32) + .e_tag(etag) + .build(), + ); + } + + ctx.client + .complete_multipart_upload() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .multipart_upload(cmp.build()) + .send() + .await + .unwrap(); + + test_read_encrypted( + &ctx, + &bucket, + "a", + &all, + SSEC_KEY, + SSEC_KEY_MD5, + SSEC_KEY2, + SSEC_KEY2_MD5, + ) + .await; + } + + // Test upload part copy from first object + { + // (setup) Upload a single part object + ctx.client + .put_object() + .bucket(&bucket) + .key("b") + .body(ByteStream::from(u1.clone())) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + + let up = ctx + .client + .create_multipart_upload() + .bucket(&bucket) + .key("target") + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + let uid = up.upload_id.as_ref().unwrap(); + + let p1 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(1) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .body(ByteStream::from(u3.clone())) + .send() + .await + .unwrap(); + + let p2 = ctx + .client + .upload_part_copy() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(2) + .copy_source(format!("{}/a", bucket)) + .copy_source_range("bytes=500-550000") + .copy_source_sse_customer_algorithm("AES256") + .copy_source_sse_customer_key(SSEC_KEY) + .copy_source_sse_customer_key_md5(SSEC_KEY_MD5) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + 
.sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + + let p3 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(3) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .body(ByteStream::from(u2.clone())) + .send() + .await + .unwrap(); + + let p4 = ctx + .client + .upload_part_copy() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(4) + .copy_source(format!("{}/b", bucket)) + .copy_source_range("bytes=1500-20500") + .copy_source_sse_customer_algorithm("AES256") + .copy_source_sse_customer_key(SSEC_KEY2) + .copy_source_sse_customer_key_md5(SSEC_KEY2_MD5) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + + let cmp = CompletedMultipartUpload::builder() + .parts( + CompletedPart::builder() + .part_number(1) + .e_tag(p1.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(2) + .e_tag(p2.copy_part_result.unwrap().e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(3) + .e_tag(p3.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(4) + .e_tag(p4.copy_part_result.unwrap().e_tag.unwrap()) + .build(), + ) + .build(); + + ctx.client + .complete_multipart_upload() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .multipart_upload(cmp) + .send() + .await + .unwrap(); + + // (check) Get object + let expected = [&u3[..], &all[500..550001], &u2[..], &u1[1500..20501]].concat(); + test_read_encrypted( + &ctx, + &bucket, + "target", + &expected, + SSEC_KEY2, + SSEC_KEY2_MD5, + SSEC_KEY, + SSEC_KEY_MD5, + ) + .await; + } +} + +async fn test_read_encrypted( + ctx: &Context, + bucket: &str, + obj_key: &str, + expected_data: &[u8], + enc_key: &str, + enc_key_md5: &str, + wrong_enc_key: &str, + wrong_enc_key_md5: &str, +) { + // Test read encrypted without 
key + let o = ctx + .client + .get_object() + .bucket(bucket) + .key(obj_key) + .send() + .await; + assert!( + o.is_err(), + "encrypted file could be read without encryption key" + ); + + // Test read encrypted with wrong key + let o = ctx + .client + .get_object() + .bucket(bucket) + .key(obj_key) + .sse_customer_key(wrong_enc_key) + .sse_customer_key_md5(wrong_enc_key_md5) + .send() + .await; + assert!( + o.is_err(), + "encrypted file could be read with incorrect encryption key" + ); + + // Test read encrypted with correct key + let o = ctx + .client + .get_object() + .bucket(bucket) + .key(obj_key) + .sse_customer_algorithm("AES256") + .sse_customer_key(enc_key) + .sse_customer_key_md5(enc_key_md5) + .send() + .await + .unwrap(); + assert_bytes_eq!(o.body, expected_data); + assert_eq!(o.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(o.sse_customer_key_md5, Some(enc_key_md5.to_string())); +} -- cgit v1.2.3 From e3333f2ac5d142b6faddc6d54bcf35a0465be4bb Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 6 Mar 2024 15:40:11 +0100 Subject: [sse-c] Documentation for SSE-C --- doc/book/connect/apps/index.md | 47 +++++++++++++++++++++++++++ doc/book/reference-manual/s3-compatibility.md | 1 + 2 files changed, 48 insertions(+) diff --git a/doc/book/connect/apps/index.md b/doc/book/connect/apps/index.md index c8571fac..5def3851 100644 --- a/doc/book/connect/apps/index.md +++ b/doc/book/connect/apps/index.md @@ -80,6 +80,53 @@ To test your new configuration, just reload your Nextcloud webpage and start sen *External link:* [Nextcloud Documentation > Primary Storage](https://docs.nextcloud.com/server/latest/admin_manual/configuration_files/primary_storage.html) +#### SSE-C encryption (since Garage v1.0) + +Since version 1.0, Garage supports server-side encryption with customer keys +(SSE-C). In this mode, Garage is responsible for encrypting and decrypting +objects, but it does not store the encryption key itself. 
The encryption key +should be provided by Nextcloud upon each request. This mode of operation is +supported by Nextcloud and it has successfully been tested together with +Garage. + +To enable SSE-C encryption: + +1. Make sure your Garage server is accessible via SSL through a reverse proxy + such as Nginx, and that it is using a valid public certificate (Nextcloud + might be able to connect to an S3 server that is using a self-signed + certificate, but you will lose many hours while trying, so don't). + Configure values for `use_ssl` and `port` accordingly in your `config.php` + file. + +2. Generate an encryption key using the following command: + + ``` + openssl rand -base64 32 + ``` + + Make sure to keep this key **secret**! + +3. Add the encryption key in your `config.php` file as follows: + + + ```php + [ + 'class' => '\\OC\\Files\\ObjectStore\\S3', + 'arguments' => [ + ... + 'sse_c_key' => 'exampleencryptionkeyLbU+5fKYQcVoqnn+RaIOXgo=', + ... + ], + ], + ``` + +Nextcloud will now make Garage encrypt files at rest in the storage bucket. +These files will not be readable by an S3 client that has credentials to the +bucket but doesn't also know the secret encryption key. + + ### External Storage **From the GUI.** Activate the "External storage support" app from the "Applications" page (click on your account icon on the top right corner of your screen to display the menu). Go to your parameters page (also located below your account icon). Click on external storage (or the corresponding translation in your language). diff --git a/doc/book/reference-manual/s3-compatibility.md b/doc/book/reference-manual/s3-compatibility.md index 1bcfd123..d2c47f3e 100644 --- a/doc/book/reference-manual/s3-compatibility.md +++ b/doc/book/reference-manual/s3-compatibility.md @@ -33,6 +33,7 @@ Feel free to open a PR to suggest fixes this table. 
Minio is missing because the | [URL path-style](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#path-style-access) (eg. `host.tld/bucket/key`) | ✅ Implemented | ✅ | ✅ | ❓| ✅ | | [URL vhost-style](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#virtual-hosted-style-access) URL (eg. `bucket.host.tld/key`) | ✅ Implemented | ❌| ✅| ✅ | ✅ | | [Presigned URLs](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ShareObjectPreSignedURL.html) | ✅ Implemented | ❌| ✅ | ✅ | ✅(❓) | +| [SSE-C encryption](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html) | ✅ Implemented | ❓ | ✅ | ❌ | ✅ | *Note:* OpenIO does not says if it supports presigned URLs. Because it is part of signature v4 and they claim they support it without additional precisions, -- cgit v1.2.3 From 3fcb54e3cf62cdc9ed84751e1f0522ff553ea63c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 6 Mar 2024 19:23:36 +0100 Subject: [sse-c] Remove special case for Content-Type header --- Cargo.lock | 1 + Cargo.nix | 3 ++- src/api/s3/get.rs | 21 ++++++++++++++++--- src/api/s3/list.rs | 5 +---- src/api/s3/put.rs | 48 ++++++++++++-------------------------------- src/model/Cargo.toml | 1 + src/model/s3/object_table.rs | 20 +++++++++++++++--- 7 files changed, 53 insertions(+), 46 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5aeef747..f1aa93bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1495,6 +1495,7 @@ dependencies = [ "garage_table", "garage_util", "hex", + "http 1.0.0", "opentelemetry", "rand", "serde", diff --git a/Cargo.nix b/Cargo.nix index 22e7d387..cd485416 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -34,7 +34,7 @@ args@{ ignoreLockHash, }: let - nixifiedLockHash = "170b83bf5f94d624b1caf773805f52b36970c99f4db21088c4ac794dad02c53b"; + nixifiedLockHash = "c3296a54f1c6f385e0d4a4a937734f1fe0fee4405b44d7462249d72675f7ac40"; workspaceSrc = if args.workspaceSrc == null then ./. 
else args.workspaceSrc; currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock); lockHashIgnored = if ignoreLockHash @@ -2171,6 +2171,7 @@ in garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out; garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; + http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out; opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out; rand = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.5" { inherit profileName; }).out; serde = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde."1.0.196" { inherit profileName; }).out; diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index 1bca4671..ec300ab7 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -1,4 +1,5 @@ //! Function related to GET and HEAD requests +use std::collections::BTreeMap; use std::convert::TryInto; use std::sync::Arc; use std::time::{Duration, UNIX_EPOCH}; @@ -53,7 +54,6 @@ fn object_headers( let date_str = httpdate::fmt_http_date(date); let mut resp = Response::builder() - .header(CONTENT_TYPE, headers.content_type.to_string()) .header(LAST_MODIFIED, date_str) .header(ACCEPT_RANGES, "bytes".to_string()); @@ -61,8 +61,23 @@ fn object_headers( resp = resp.header(ETAG, format!("\"{}\"", version_meta.etag)); } - for (k, v) in headers.other.iter() { - resp = resp.header(k, v.to_string()); + // When metadata is retrieved through the REST API, Amazon S3 combines headers that + // have the same name (ignoring case) into a comma-delimited list. 
+ // See: https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingMetadata.html + let mut headers_by_name = BTreeMap::new(); + for (name, value) in headers.0.iter() { + match headers_by_name.get_mut(name) { + None => { + headers_by_name.insert(name, vec![value.as_str()]); + } + Some(headers) => { + headers.push(value.as_str()); + } + } + } + + for (name, values) in headers_by_name { + resp = resp.header(name, values.join(",")); } encryption.add_response_headers(&mut resp); diff --git a/src/api/s3/list.rs b/src/api/s3/list.rs index a7eebbb1..1678f1fa 100644 --- a/src/api/s3/list.rs +++ b/src/api/s3/list.rs @@ -945,10 +945,7 @@ mod tests { state: ObjectVersionState::Uploading { multipart: true, encryption: ObjectVersionEncryption::Plaintext { - headers: ObjectVersionHeaders { - content_type: "text/plain".to_string(), - other: BTreeMap::::new(), - }, + headers: ObjectVersionHeaders(vec![]), }, }, } diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index 745c2219..941e4122 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; use std::sync::Arc; use base64::prelude::*; @@ -609,57 +609,35 @@ impl Drop for InterruptedCleanup { // ============ helpers ============ -pub(crate) fn get_mime_type(headers: &HeaderMap) -> Result { - Ok(headers - .get(hyper::header::CONTENT_TYPE) - .map(|x| x.to_str()) - .unwrap_or(Ok("blob"))? 
- .to_string()) -} - pub(crate) fn get_headers(headers: &HeaderMap) -> Result { - let content_type = get_mime_type(headers)?; - let mut other = BTreeMap::new(); + let mut ret = Vec::new(); // Preserve standard headers let standard_header = vec![ + hyper::header::CONTENT_TYPE, hyper::header::CACHE_CONTROL, hyper::header::CONTENT_DISPOSITION, hyper::header::CONTENT_ENCODING, hyper::header::CONTENT_LANGUAGE, hyper::header::EXPIRES, ]; - for h in standard_header.iter() { - if let Some(v) = headers.get(h) { - match v.to_str() { - Ok(v_str) => { - other.insert(h.to_string(), v_str.to_string()); - } - Err(e) => { - warn!("Discarding header {}, error in .to_str(): {}", h, e); - } - } + for name in standard_header.iter() { + if let Some(value) = headers.get(name) { + ret.push((name.to_string(), value.to_str()?.to_string())); } } // Preserve x-amz-meta- headers - for (k, v) in headers.iter() { - if k.as_str().starts_with("x-amz-meta-") { - match std::str::from_utf8(v.as_bytes()) { - Ok(v_str) => { - other.insert(k.to_string(), v_str.to_string()); - } - Err(e) => { - warn!("Discarding header {}, error in .to_str(): {}", k, e); - } - } + for (name, value) in headers.iter() { + if name.as_str().starts_with("x-amz-meta-") { + ret.push(( + name.to_string(), + std::str::from_utf8(value.as_bytes())?.to_string(), + )); } } - Ok(ObjectVersionHeaders { - content_type, - other, - }) + Ok(ObjectVersionHeaders(ret)) } pub(crate) fn next_timestamp(existing_object: Option<&Object>) -> u64 { diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 33898e20..776671d0 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -27,6 +27,7 @@ blake2.workspace = true chrono.workspace = true err-derive.workspace = true hex.workspace = true +http.workspace = true base64.workspace = true tracing.workspace = true rand.workspace = true diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index 7fa4b9e0..f2d21493 100644 --- a/src/model/s3/object_table.rs +++ 
b/src/model/s3/object_table.rs @@ -216,8 +216,6 @@ mod v010 { use super::v09; - pub use v09::ObjectVersionHeaders; - /// An object #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct Object { @@ -303,6 +301,10 @@ mod v010 { }, } + /// Vector of headers, as tuples of the format (header name, header value) + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct ObjectVersionHeaders(pub Vec<(String, String)>); + impl garage_util::migrate::Migrate for Object { const VERSION_MARKER: &'static [u8] = b"G010s3ob"; @@ -357,7 +359,19 @@ mod v010 { } fn migrate_headers(old: v09::ObjectVersionHeaders) -> ObjectVersionEncryption { - ObjectVersionEncryption::Plaintext { headers: old } + use http::header::CONTENT_TYPE; + + let mut new_headers = Vec::with_capacity(old.other.len() + 1); + if old.content_type != "blob" { + new_headers.push((CONTENT_TYPE.as_str().to_string(), old.content_type)); + } + for (name, value) in old.other.into_iter() { + new_headers.push((name, value)); + } + + ObjectVersionEncryption::Plaintext { + headers: ObjectVersionHeaders(new_headers), + } } // Since ObjectVersionHeaders can now be serialized independently, for the -- cgit v1.2.3 From f537f76681760e9b2b3cc095a6031ebb59ca4733 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Mar 2024 13:24:47 +0100 Subject: [rm-migration] Remove migration path from Garage v0.5 --- src/garage/admin/mod.rs | 21 ------- src/garage/cli/cmd.rs | 3 - src/garage/cli/structs.rs | 22 -------- src/model/key_table.rs | 68 +---------------------- src/model/lib.rs | 4 -- src/model/migrate.rs | 108 ------------------------------------ src/model/prev/mod.rs | 1 - src/model/prev/v051/bucket_table.rs | 63 --------------------- src/model/prev/v051/mod.rs | 1 - src/model/s3/object_table.rs | 43 +------------- src/model/s3/version_table.rs | 55 +----------------- 11 files changed, 5 insertions(+), 384 deletions(-) delete mode 100644 src/model/migrate.rs delete mode 
100644 src/model/prev/mod.rs delete mode 100644 src/model/prev/v051/bucket_table.rs delete mode 100644 src/model/prev/v051/mod.rs diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index de7851e1..073693ed 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -27,7 +27,6 @@ use garage_model::bucket_table::*; use garage_model::garage::Garage; use garage_model::helper::error::{Error, OkOrBadRequest}; use garage_model::key_table::*; -use garage_model::migrate::Migrate; use garage_model::s3::mpu_table::MultipartUpload; use garage_model::s3::version_table::Version; @@ -42,7 +41,6 @@ pub enum AdminRpc { BucketOperation(BucketOperation), KeyOperation(KeyOperation), LaunchRepair(RepairOpt), - Migrate(MigrateOpt), Stats(StatsOpt), Worker(WorkerOperation), BlockOperation(BlockOperation), @@ -95,24 +93,6 @@ impl AdminRpcHandler { admin } - // ================ MIGRATION COMMANDS ==================== - - async fn handle_migrate(self: &Arc, opt: MigrateOpt) -> Result { - if !opt.yes { - return Err(Error::BadRequest( - "Please provide the --yes flag to initiate migration operation.".to_string(), - )); - } - - let m = Migrate { - garage: self.garage.clone(), - }; - match opt.what { - MigrateWhat::Buckets050 => m.migrate_buckets050().await, - }?; - Ok(AdminRpc::Ok("Migration successfull.".into())) - } - // ================ REPAIR COMMANDS ==================== async fn handle_launch_repair(self: &Arc, opt: RepairOpt) -> Result { @@ -530,7 +510,6 @@ impl EndpointHandler for AdminRpcHandler { match message { AdminRpc::BucketOperation(bo) => self.handle_bucket_cmd(bo).await, AdminRpc::KeyOperation(ko) => self.handle_key_cmd(ko).await, - AdminRpc::Migrate(opt) => self.handle_migrate(opt.clone()).await, AdminRpc::LaunchRepair(opt) => self.handle_launch_repair(opt.clone()).await, AdminRpc::Stats(opt) => self.handle_stats(opt.clone()).await, AdminRpc::Worker(wo) => self.handle_worker_cmd(wo).await, diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 
fb6dface..7440457f 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -33,9 +33,6 @@ pub async fn cli_command_dispatch( Command::Key(ko) => { cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::KeyOperation(ko)).await } - Command::Migrate(mo) => { - cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::Migrate(mo)).await - } Command::Repair(ro) => { cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::LaunchRepair(ro)).await } diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 40e47ee1..63014dbc 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -31,11 +31,6 @@ pub enum Command { #[structopt(name = "key", version = garage_version())] Key(KeyOperation), - /// Run migrations from previous Garage version - /// (DO NOT USE WITHOUT READING FULL DOCUMENTATION) - #[structopt(name = "migrate", version = garage_version())] - Migrate(MigrateOpt), - /// Start repair of node data on remote node #[structopt(name = "repair", version = garage_version())] Repair(RepairOpt), @@ -445,23 +440,6 @@ pub struct KeyImportOpt { pub yes: bool, } -#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)] -pub struct MigrateOpt { - /// Confirm the launch of the migrate operation - #[structopt(long = "yes")] - pub yes: bool, - - #[structopt(subcommand)] - pub what: MigrateWhat, -} - -#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] -pub enum MigrateWhat { - /// Migrate buckets and permissions from v0.5.0 - #[structopt(name = "buckets050", version = garage_version())] - Buckets050, -} - #[derive(Serialize, Deserialize, StructOpt, Debug, Clone)] pub struct RepairOpt { /// Launch repair operation on all nodes diff --git a/src/model/key_table.rs b/src/model/key_table.rs index a9762f1b..efb95f08 100644 --- a/src/model/key_table.rs +++ b/src/model/key_table.rs @@ -7,48 +7,7 @@ use garage_table::{DeletedFilter, EmptyKey, Entry, TableSchema}; use crate::permission::BucketKeyPerm; -pub(crate) mod v05 { - use 
garage_util::crdt; - use serde::{Deserialize, Serialize}; - - /// An api key - #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] - pub struct Key { - /// The id of the key (immutable), used as partition key - pub key_id: String, - - /// The secret_key associated - pub secret_key: String, - - /// Name for the key - pub name: crdt::Lww, - - /// Is the key deleted - pub deleted: crdt::Bool, - - /// Buckets in which the key is authorized. Empty if `Key` is deleted - // CRDT interaction: deleted implies authorized_buckets is empty - pub authorized_buckets: crdt::LwwMap, - } - - /// Permission given to a key in a bucket - #[derive(PartialOrd, Ord, PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] - pub struct PermissionSet { - /// The key can be used to read the bucket - pub allow_read: bool, - /// The key can be used to write in the bucket - pub allow_write: bool, - } - - impl crdt::AutoCrdt for PermissionSet { - const WARN_IF_DIFFERENT: bool = true; - } - - impl garage_util::migrate::InitialFormat for Key {} -} - mod v08 { - use super::v05; use crate::permission::BucketKeyPerm; use garage_util::crdt; use garage_util::data::Uuid; @@ -86,32 +45,7 @@ mod v08 { pub local_aliases: crdt::LwwMap>, } - impl garage_util::migrate::Migrate for Key { - type Previous = v05::Key; - - fn migrate(old_k: v05::Key) -> Key { - let name = crdt::Lww::raw(old_k.name.timestamp(), old_k.name.get().clone()); - - let state = if old_k.deleted.get() { - crdt::Deletable::Deleted - } else { - // Authorized buckets is ignored here, - // migration is performed in specific migration code in - // garage/migrate.rs - crdt::Deletable::Present(KeyParams { - secret_key: old_k.secret_key, - name, - allow_create_bucket: crdt::Lww::new(false), - authorized_buckets: crdt::Map::new(), - local_aliases: crdt::LwwMap::new(), - }) - }; - Key { - key_id: old_k.key_id, - state, - } - } - } + impl garage_util::migrate::InitialFormat for Key {} } pub use v08::*; diff --git a/src/model/lib.rs 
b/src/model/lib.rs index 4f20ea46..2166105f 100644 --- a/src/model/lib.rs +++ b/src/model/lib.rs @@ -1,9 +1,6 @@ #[macro_use] extern crate tracing; -// For migration from previous versions -pub(crate) mod prev; - pub mod permission; pub mod index_counter; @@ -18,4 +15,3 @@ pub mod s3; pub mod garage; pub mod helper; -pub mod migrate; diff --git a/src/model/migrate.rs b/src/model/migrate.rs deleted file mode 100644 index 8528382a..00000000 --- a/src/model/migrate.rs +++ /dev/null @@ -1,108 +0,0 @@ -use std::sync::Arc; - -use garage_util::crdt::*; -use garage_util::data::*; -use garage_util::encode::nonversioned_decode; -use garage_util::error::Error as GarageError; -use garage_util::time::*; - -use crate::prev::v051::bucket_table as old_bucket; - -use crate::bucket_alias_table::*; -use crate::bucket_table::*; -use crate::garage::Garage; -use crate::helper::error::*; -use crate::permission::*; - -pub struct Migrate { - pub garage: Arc, -} - -impl Migrate { - pub async fn migrate_buckets050(&self) -> Result<(), Error> { - let tree = self - .garage - .db - .open_tree("bucket:table") - .map_err(GarageError::from)?; - - let mut old_buckets = vec![]; - for res in tree.iter().map_err(GarageError::from)? 
{ - let (_k, v) = res.map_err(GarageError::from)?; - let bucket = - nonversioned_decode::(&v[..]).map_err(GarageError::from)?; - old_buckets.push(bucket); - } - - for bucket in old_buckets { - if let old_bucket::BucketState::Present(p) = bucket.state.get() { - self.migrate_buckets050_do_bucket(&bucket, p).await?; - } - } - - Ok(()) - } - - pub async fn migrate_buckets050_do_bucket( - &self, - old_bucket: &old_bucket::Bucket, - old_bucket_p: &old_bucket::BucketParams, - ) -> Result<(), Error> { - let bucket_id = blake2sum(old_bucket.name.as_bytes()); - - let new_name = if is_valid_bucket_name(&old_bucket.name) { - old_bucket.name.clone() - } else { - // if old bucket name was not valid, replace it by - // a hex-encoded name derived from its identifier - hex::encode(&bucket_id.as_slice()[..16]) - }; - - let website = if *old_bucket_p.website.get() { - Some(WebsiteConfig { - index_document: "index.html".into(), - error_document: None, - }) - } else { - None - }; - - let helper = self.garage.locked_helper().await; - - self.garage - .bucket_table - .insert(&Bucket { - id: bucket_id, - state: Deletable::Present(BucketParams { - creation_date: now_msec(), - authorized_keys: Map::new(), - aliases: LwwMap::new(), - local_aliases: LwwMap::new(), - website_config: Lww::new(website), - cors_config: Lww::new(None), - lifecycle_config: Lww::new(None), - quotas: Lww::new(Default::default()), - }), - }) - .await?; - - helper.set_global_bucket_alias(bucket_id, &new_name).await?; - - for (k, ts, perm) in old_bucket_p.authorized_keys.items().iter() { - helper - .set_bucket_key_permissions( - bucket_id, - k, - BucketKeyPerm { - timestamp: *ts, - allow_read: perm.allow_read, - allow_write: perm.allow_write, - allow_owner: false, - }, - ) - .await?; - } - - Ok(()) - } -} diff --git a/src/model/prev/mod.rs b/src/model/prev/mod.rs deleted file mode 100644 index 68bb1502..00000000 --- a/src/model/prev/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub(crate) mod v051; diff --git 
a/src/model/prev/v051/bucket_table.rs b/src/model/prev/v051/bucket_table.rs deleted file mode 100644 index 19893458..00000000 --- a/src/model/prev/v051/bucket_table.rs +++ /dev/null @@ -1,63 +0,0 @@ -use serde::{Deserialize, Serialize}; - -use garage_table::crdt::Crdt; -use garage_table::*; - -use crate::key_table::v05::PermissionSet; - -/// A bucket is a collection of objects -/// -/// Its parameters are not directly accessible as: -/// - It must be possible to merge paramaters, hence the use of a LWW CRDT. -/// - A bucket has 2 states, Present or Deleted and parameters make sense only if present. -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct Bucket { - /// Name of the bucket - pub name: String, - /// State, and configuration if not deleted, of the bucket - pub state: crdt::Lww, -} - -/// State of a bucket -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub enum BucketState { - /// The bucket is deleted - Deleted, - /// The bucket exists - Present(BucketParams), -} - -impl Crdt for BucketState { - fn merge(&mut self, o: &Self) { - match o { - BucketState::Deleted => *self = BucketState::Deleted, - BucketState::Present(other_params) => { - if let BucketState::Present(params) = self { - params.merge(other_params); - } - } - } - } -} - -/// Configuration for a bucket -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct BucketParams { - /// Map of key with access to the bucket, and what kind of access they give - pub authorized_keys: crdt::LwwMap, - /// Is the bucket served as http - pub website: crdt::Lww, -} - -impl Crdt for BucketParams { - fn merge(&mut self, o: &Self) { - self.authorized_keys.merge(&o.authorized_keys); - self.website.merge(&o.website); - } -} - -impl Crdt for Bucket { - fn merge(&mut self, other: &Self) { - self.state.merge(&other.state); - } -} diff --git a/src/model/prev/v051/mod.rs b/src/model/prev/v051/mod.rs deleted file mode 100644 index 8c1335a5..00000000 --- 
a/src/model/prev/v051/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub(crate) mod bucket_table; diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index f2d21493..eedb9615 100644 --- a/src/model/s3/object_table.rs +++ b/src/model/s3/object_table.rs @@ -17,7 +17,7 @@ pub const OBJECTS: &str = "objects"; pub const UNFINISHED_UPLOADS: &str = "unfinished_uploads"; pub const BYTES: &str = "bytes"; -mod v05 { +mod v08 { use garage_util::data::{Hash, Uuid}; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; @@ -26,7 +26,7 @@ mod v05 { #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct Object { /// The bucket in which the object is stored, used as partition key - pub bucket: String, + pub bucket_id: Uuid, /// The key at which the object is stored in its bucket, used as sorting key pub key: String, @@ -92,45 +92,6 @@ mod v05 { impl garage_util::migrate::InitialFormat for Object {} } -mod v08 { - use garage_util::data::Uuid; - use serde::{Deserialize, Serialize}; - - use super::v05; - - pub use v05::{ - ObjectVersion, ObjectVersionData, ObjectVersionHeaders, ObjectVersionMeta, - ObjectVersionState, - }; - - /// An object - #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] - pub struct Object { - /// The bucket in which the object is stored, used as partition key - pub bucket_id: Uuid, - - /// The key at which the object is stored in its bucket, used as sorting key - pub key: String, - - /// The list of currenty stored versions of the object - pub(super) versions: Vec, - } - - impl garage_util::migrate::Migrate for Object { - type Previous = v05::Object; - - fn migrate(old: v05::Object) -> Object { - use garage_util::data::blake2sum; - - Object { - bucket_id: blake2sum(old.bucket.as_bytes()), - key: old.key, - versions: old.versions, - } - } - } -} - mod v09 { use garage_util::data::Uuid; use serde::{Deserialize, Serialize}; diff --git a/src/model/s3/version_table.rs b/src/model/s3/version_table.rs index 
b4662a55..d611a9e3 100644 --- a/src/model/s3/version_table.rs +++ b/src/model/s3/version_table.rs @@ -11,7 +11,7 @@ use garage_table::*; use crate::s3::block_ref_table::*; -mod v05 { +mod v08 { use garage_util::crdt; use garage_util::data::{Hash, Uuid}; use serde::{Deserialize, Serialize}; @@ -35,7 +35,7 @@ mod v05 { // Back link to bucket+key so that we can figure if // this was deleted later on /// Bucket in which the related object is stored - pub bucket: String, + pub bucket_id: Uuid, /// Key in which the related object is stored pub key: String, } @@ -61,57 +61,6 @@ mod v05 { impl garage_util::migrate::InitialFormat for Version {} } -mod v08 { - use garage_util::crdt; - use garage_util::data::Uuid; - use serde::{Deserialize, Serialize}; - - use super::v05; - - pub use v05::{VersionBlock, VersionBlockKey}; - - /// A version of an object - #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] - pub struct Version { - /// UUID of the version, used as partition key - pub uuid: Uuid, - - // Actual data: the blocks for this version - // In the case of a multipart upload, also store the etags - // of individual parts and check them when doing CompleteMultipartUpload - /// Is this version deleted - pub deleted: crdt::Bool, - /// list of blocks of data composing the version - pub blocks: crdt::Map, - /// Etag of each part in case of a multipart upload, empty otherwise - pub parts_etags: crdt::Map, - - // Back link to bucket+key so that we can figure if - // this was deleted later on - /// Bucket in which the related object is stored - pub bucket_id: Uuid, - /// Key in which the related object is stored - pub key: String, - } - - impl garage_util::migrate::Migrate for Version { - type Previous = v05::Version; - - fn migrate(old: v05::Version) -> Version { - use garage_util::data::blake2sum; - - Version { - uuid: old.uuid, - deleted: old.deleted, - blocks: old.blocks, - parts_etags: old.parts_etags, - bucket_id: blake2sum(old.bucket.as_bytes()), - key: old.key, 
- } - } - } -} - pub(crate) mod v09 { use garage_util::crdt; use garage_util::data::Uuid; -- cgit v1.2.3 From 44454aac012cbef9158110f2352301ffcfaf31c7 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Mar 2024 14:11:02 +0100 Subject: [rm-sled] Remove the Sled database engine --- Cargo.lock | 45 ----- Cargo.nix | 65 +------ Cargo.toml | 1 - default.nix | 1 - doc/api/garage-admin-v1.yml | 1 - doc/book/connect/apps/index.md | 2 +- doc/book/cookbook/from-source.md | 3 +- doc/book/cookbook/real-world.md | 5 +- doc/book/design/internals.md | 2 +- doc/book/reference-manual/configuration.md | 38 +--- doc/drafts/admin-api.md | 1 - doc/drafts/k2v-spec.md | 2 +- nix/compile.nix | 2 +- script/helm/garage/values.yaml | 12 +- src/block/resync.rs | 6 +- src/db/Cargo.toml | 3 +- src/db/lib.rs | 2 - src/db/open.rs | 27 +-- src/db/sled_adapter.rs | 274 ----------------------------- src/db/test.rs | 11 -- src/garage/Cargo.toml | 5 +- src/garage/cli/convert_db.rs | 2 +- src/garage/main.rs | 6 +- src/model/Cargo.toml | 3 +- src/model/garage.rs | 5 - src/table/gc.rs | 6 +- src/table/merkle.rs | 4 +- src/util/config.rs | 19 +- 28 files changed, 34 insertions(+), 519 deletions(-) delete mode 100644 src/db/sled_adapter.rs diff --git a/Cargo.lock b/Cargo.lock index f1aa93bf..284e2276 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -921,15 +921,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "crossbeam-epoch" -version = "0.9.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" -dependencies = [ - "crossbeam-utils", -] - [[package]] name = "crossbeam-queue" version = "0.3.11" @@ -1222,16 +1213,6 @@ dependencies = [ name = "format_table" version = "0.1.1" -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - 
"winapi", -] - [[package]] name = "futures" version = "0.3.30" @@ -1321,15 +1302,6 @@ dependencies = [ "slab", ] -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - [[package]] name = "garage" version = "0.10.0" @@ -1472,7 +1444,6 @@ dependencies = [ "hexdump", "mktemp", "rusqlite", - "sled", "tracing", ] @@ -3831,22 +3802,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "sled" -version = "0.34.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f96b4737c2ce5987354855aed3797279def4ebf734436c6aa4552cf8e169935" -dependencies = [ - "crc32fast", - "crossbeam-epoch", - "crossbeam-utils", - "fs2", - "fxhash", - "libc", - "log", - "parking_lot 0.11.2", -] - [[package]] name = "smallvec" version = "1.13.1" diff --git a/Cargo.nix b/Cargo.nix index cd485416..0bc28ecd 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -34,7 +34,7 @@ args@{ ignoreLockHash, }: let - nixifiedLockHash = "c3296a54f1c6f385e0d4a4a937734f1fe0fee4405b44d7462249d72675f7ac40"; + nixifiedLockHash = "23e1504df44ec18cfc5c872c858154304c16da2a6c1f7c9f06608ef833815f30"; workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc; currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock); lockHashIgnored = if ignoreLockHash @@ -1315,21 +1315,6 @@ in }; }); - "registry+https://github.com/rust-lang/crates.io-index".crossbeam-epoch."0.9.18" = overridableMkRustCrate (profileName: rec { - name = "crossbeam-epoch"; - version = "0.9.18"; - registry = "registry+https://github.com/rust-lang/crates.io-index"; - src = fetchCratesIo { inherit name version; sha256 = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"; }; - features = builtins.concatLists [ - (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? 
"garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "alloc") - (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "default") - (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "std") - ]; - dependencies = { - ${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "crossbeam_utils" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crossbeam-utils."0.8.19" { inherit profileName; }).out; - }; - }); - "registry+https://github.com/rust-lang/crates.io-index".crossbeam-queue."0.3.11" = overridableMkRustCrate (profileName: rec { name = "crossbeam-queue"; version = "0.3.11"; @@ -1351,7 +1336,6 @@ in registry = "registry+https://github.com/rust-lang/crates.io-index"; src = fetchCratesIo { inherit name version; sha256 = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"; }; features = builtins.concatLists [ - (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? 
"garage_model/sled") "default") [ "std" ] ]; }); @@ -1775,17 +1759,6 @@ in src = fetchCrateLocal (workspaceSrc + "/src/format-table"); }); - "registry+https://github.com/rust-lang/crates.io-index".fs2."0.4.3" = overridableMkRustCrate (profileName: rec { - name = "fs2"; - version = "0.4.3"; - registry = "registry+https://github.com/rust-lang/crates.io-index"; - src = fetchCratesIo { inherit name version; sha256 = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"; }; - dependencies = { - ${ if (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") && hostPlatform.isUnix then "libc" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".libc."0.2.153" { inherit profileName; }).out; - ${ if (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") && hostPlatform.isWindows then "winapi" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".winapi."0.3.9" { inherit profileName; }).out; - }; - }); - "registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" = overridableMkRustCrate (profileName: rec { name = "futures"; version = "0.3.30"; @@ -1937,16 +1910,6 @@ in }; }); - "registry+https://github.com/rust-lang/crates.io-index".fxhash."0.2.1" = overridableMkRustCrate (profileName: rec { - name = "fxhash"; - version = "0.2.1"; - registry = "registry+https://github.com/rust-lang/crates.io-index"; - src = fetchCratesIo { inherit name version; sha256 = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"; }; - dependencies = { - ${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? 
"garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "byteorder" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".byteorder."1.5.0" { inherit profileName; }).out; - }; - }); - "unknown".garage."0.10.0" = overridableMkRustCrate (profileName: rec { name = "garage"; version = "0.10.0"; @@ -1963,7 +1926,6 @@ in (lib.optional (rootFeatures' ? "garage/opentelemetry-otlp" || rootFeatures' ? "garage/telemetry-otlp") "opentelemetry-otlp") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/metrics" || rootFeatures' ? "garage/opentelemetry-prometheus") "opentelemetry-prometheus") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/metrics" || rootFeatures' ? "garage/prometheus") "prometheus") - (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled") "sled") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite") "sqlite") (lib.optional (rootFeatures' ? "garage/system-libs") "system-libs") (lib.optional (rootFeatures' ? "garage/telemetry-otlp") "telemetry-otlp") @@ -2127,7 +2089,6 @@ in (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/lmdb" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/heed" || rootFeatures' ? "garage_db/lmdb" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/lmdb") "heed") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/lmdb" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/lmdb" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/lmdb") "lmdb") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/rusqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? 
"garage_model/default" || rootFeatures' ? "garage_model/sqlite") "rusqlite") - (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "sled") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "sqlite") ]; dependencies = { @@ -2135,7 +2096,6 @@ in ${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/lmdb" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/heed" || rootFeatures' ? "garage_db/lmdb" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/lmdb" then "heed" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".heed."0.11.0" { inherit profileName; }).out; hexdump = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hexdump."0.1.1" { inherit profileName; }).out; ${ if rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/bundled-libs" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/rusqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite" then "rusqlite" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rusqlite."0.30.0" { inherit profileName; }).out; - ${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? 
"garage_model/sled" then "sled" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".sled."0.34.7" { inherit profileName; }).out; tracing = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tracing."0.1.40" { inherit profileName; }).out; }; devDependencies = { @@ -2152,7 +2112,6 @@ in (lib.optional (rootFeatures' ? "garage_model/default") "default") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/k2v" || rootFeatures' ? "garage_api/k2v" || rootFeatures' ? "garage_model/k2v") "k2v") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/lmdb" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/lmdb") "lmdb") - (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "sled") (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "sqlite") ]; dependencies = { @@ -5459,27 +5418,6 @@ in }; }); - "registry+https://github.com/rust-lang/crates.io-index".sled."0.34.7" = overridableMkRustCrate (profileName: rec { - name = "sled"; - version = "0.34.7"; - registry = "registry+https://github.com/rust-lang/crates.io-index"; - src = fetchCratesIo { inherit name version; sha256 = "7f96b4737c2ce5987354855aed3797279def4ebf734436c6aa4552cf8e169935"; }; - features = builtins.concatLists [ - (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") "default") - (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? 
"garage_model/sled") "no_metrics") - ]; - dependencies = { - ${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "crc32fast" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.3.2" { inherit profileName; }).out; - ${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "crossbeam_epoch" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crossbeam-epoch."0.9.18" { inherit profileName; }).out; - ${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "crossbeam_utils" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crossbeam-utils."0.8.19" { inherit profileName; }).out; - ${ if (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled") && (hostPlatform.parsed.kernel.name == "linux" || hostPlatform.parsed.kernel.name == "darwin" || hostPlatform.parsed.kernel.name == "windows") then "fs2" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".fs2."0.4.3" { inherit profileName; }).out; - ${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? 
"garage_model/sled" then "fxhash" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".fxhash."0.2.1" { inherit profileName; }).out; - ${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "libc" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".libc."0.2.153" { inherit profileName; }).out; - ${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "log" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".log."0.4.20" { inherit profileName; }).out; - ${ if rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sled" then "parking_lot" else null } = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".parking_lot."0.11.2" { inherit profileName; }).out; - }; - }); - "registry+https://github.com/rust-lang/crates.io-index".smallvec."1.13.1" = overridableMkRustCrate (profileName: rec { name = "smallvec"; version = "1.13.1"; @@ -6723,7 +6661,6 @@ in [ "minwindef" ] [ "ntstatus" ] [ "processenv" ] - (lib.optional (rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sled" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/sled" || rootFeatures' ? "garage_model/default" || rootFeatures' ? 
"garage_model/sled") "processthreadsapi") [ "std" ] [ "synchapi" ] [ "sysinfoapi" ] diff --git a/Cargo.toml b/Cargo.toml index c259c7f2..f40e8738 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -78,7 +78,6 @@ tracing-subscriber = { version = "0.3", features = ["env-filter"] } heed = { version = "0.11", default-features = false, features = ["lmdb"] } rusqlite = "0.30.0" -sled = "0.34" async-compression = { version = "0.4", features = ["tokio", "zstd"] } zstd = { version = "0.13", default-features = false } diff --git a/default.nix b/default.nix index ecdf6436..57bc24a5 100644 --- a/default.nix +++ b/default.nix @@ -40,7 +40,6 @@ in { features = [ "garage/bundled-libs" "garage/k2v" - "garage/sled" "garage/lmdb" "garage/sqlite" ]; diff --git a/doc/api/garage-admin-v1.yml b/doc/api/garage-admin-v1.yml index fd78feb1..1ea77b2e 100644 --- a/doc/api/garage-admin-v1.yml +++ b/doc/api/garage-admin-v1.yml @@ -98,7 +98,6 @@ paths: type: string example: - "k2v" - - "sled" - "lmdb" - "sqlite" - "consul-discovery" diff --git a/doc/book/connect/apps/index.md b/doc/book/connect/apps/index.md index 5def3851..9a678275 100644 --- a/doc/book/connect/apps/index.md +++ b/doc/book/connect/apps/index.md @@ -292,7 +292,7 @@ with average object size ranging from 50 KB to 150 KB. As such, your Garage cluster should be configured appropriately for good performance: - use Garage v0.8.0 or higher with the [LMDB database engine](@documentation/reference-manual/configuration.md#db-engine-since-v0-8-0). - With the default Sled database engine, your database could quickly end up taking tens of GB of disk space. + Older versions of Garage used the Sled database engine which had issues, such as databases quickly ending up taking tens of GB of disk space. 
- the Garage database should be stored on a SSD ### Creating your bucket diff --git a/doc/book/cookbook/from-source.md b/doc/book/cookbook/from-source.md index bacf93ab..f7fd17ce 100644 --- a/doc/book/cookbook/from-source.md +++ b/doc/book/cookbook/from-source.md @@ -90,6 +90,5 @@ The following feature flags are available in v0.8.0: | `kubernetes-discovery` | optional | Enable automatic registration and discovery
of cluster nodes through the Kubernetes API | | `metrics` | *by default* | Enable collection of metrics in Prometheus format on the admin API | | `telemetry-otlp` | optional | Enable collection of execution traces using OpenTelemetry | -| `sled` | *by default* | Enable using Sled to store Garage's metadata | -| `lmdb` | optional | Enable using LMDB to store Garage's metadata | +| `lmdb` | *by default* | Enable using LMDB to store Garage's metadata | | `sqlite` | optional | Enable using Sqlite3 to store Garage's metadata | diff --git a/doc/book/cookbook/real-world.md b/doc/book/cookbook/real-world.md index cb10b550..30be4907 100644 --- a/doc/book/cookbook/real-world.md +++ b/doc/book/cookbook/real-world.md @@ -70,9 +70,8 @@ to store 2 TB of data in total. - If you only have an HDD and no SSD, it's fine to put your metadata alongside the data on the same drive. Having lots of RAM for your kernel to cache the metadata will - help a lot with performance. Make sure to use the LMDB database engine, - instead of Sled, which suffers from quite bad performance degradation on HDDs. - Sled is still the default for legacy reasons, but is not recommended anymore. + help a lot with performance. The default LMDB database engine is the most tested + and has good performance. - For the metadata storage, Garage does not do checksumming and integrity verification on its own. If you are afraid of bitrot/data corruption, diff --git a/doc/book/design/internals.md b/doc/book/design/internals.md index cefb7acc..8e3c214e 100644 --- a/doc/book/design/internals.md +++ b/doc/book/design/internals.md @@ -97,7 +97,7 @@ delete a tombstone, the following condition has to be met: superseeded by the tombstone. This ensures that deleting the tombstone is safe and that no deleted value will come back in the system. 
-Garage makes use of Sled's atomic operations (such as compare-and-swap and +Garage uses atomic database operations (such as compare-and-swap and transactions) to ensure that only tombstones that have been correctly propagated to other nodes are ever deleted from the local entry tree. diff --git a/doc/book/reference-manual/configuration.md b/doc/book/reference-manual/configuration.md index 580e9fbc..4df2d0df 100644 --- a/doc/book/reference-manual/configuration.md +++ b/doc/book/reference-manual/configuration.md @@ -20,8 +20,6 @@ db_engine = "lmdb" block_size = "1M" -sled_cache_capacity = "128MiB" -sled_flush_every_ms = 2000 lmdb_map_size = "1T" compression_level = 1 @@ -96,9 +94,7 @@ Top-level configuration options: [`rpc_bind_addr`](#rpc_bind_addr), [`rpc_bind_outgoing`](#rpc_bind_outgoing), [`rpc_public_addr`](#rpc_public_addr), -[`rpc_secret`/`rpc_secret_file`](#rpc_secret), -[`sled_cache_capacity`](#sled_cache_capacity), -[`sled_flush_every_ms`](#sled_flush_every_ms). +[`rpc_secret`/`rpc_secret_file`](#rpc_secret). The `[consul_discovery]` section: [`api`](#consul_api), @@ -271,20 +267,16 @@ Since `v0.8.0`, Garage can use alternative storage backends as follows: | DB engine | `db_engine` value | Database path | | --------- | ----------------- | ------------- | -| [LMDB](https://www.lmdb.tech) (default since `v0.9.0`) | `"lmdb"` | `/db.lmdb/` | -| [Sled](https://sled.rs) (default up to `v0.8.0`) | `"sled"` | `/db/` | -| [Sqlite](https://sqlite.org) | `"sqlite"` | `/db.sqlite` | +| [LMDB](https://www.lmdb.tech) (since `v0.8.0`, default since `v0.9.0`) | `"lmdb"` | `/db.lmdb/` | +| [Sqlite](https://sqlite.org) (since `v0.8.0`) | `"sqlite"` | `/db.sqlite` | +| [Sled](https://sled.rs) (old default, removed since `v1.0`) | `"sled"` | `/db/` | -Sled was the only database engine up to Garage v0.7.0. Performance issues and -API limitations of Sled prompted the addition of alternative engines in v0.8.0. 
-Since v0.9.0, LMDB is the default engine instead of Sled, and Sled is -deprecated. We plan to remove Sled in Garage v1.0. +Sled was supported until Garage v0.9.x, and was removed in Garage v1.0. +You can still use an older binary of Garage (e.g. v0.9.3) to migrate +old Sled metadata databases to another engine. Performance characteristics of the different DB engines are as follows: -- Sled: tends to produce large data files and also has performance issues, - especially when the metadata folder is on a traditional HDD and not on SSD. - - LMDB: the recommended database engine on 64-bit systems, much more space-efficient and slightly faster. Note that the data format of LMDB is not portable between architectures, so for instance the Garage database of an @@ -333,7 +325,6 @@ Here is how this option impacts the different database engines: | Database | `metadata_fsync = false` (default) | `metadata_fsync = true` | |----------|------------------------------------|-------------------------------| -| Sled | default options | *unsupported* | | Sqlite | `PRAGMA synchronous = OFF` | `PRAGMA synchronous = NORMAL` | | LMDB | `MDB_NOMETASYNC` + `MDB_NOSYNC` | `MDB_NOMETASYNC` | @@ -367,21 +358,6 @@ files will remain available. This however means that chunks from existing files will not be deduplicated with chunks from newly uploaded files, meaning you might use more storage space that is optimally possible. -#### `sled_cache_capacity` {#sled_cache_capacity} - -This parameter can be used to tune the capacity of the cache used by -[sled](https://sled.rs), the database Garage uses internally to store metadata. -Tune this to fit the RAM you wish to make available to your Garage instance. -This value has a conservative default (128MB) so that Garage doesn't use too much -RAM by default, but feel free to increase this for higher performance. - -#### `sled_flush_every_ms` {#sled_flush_every_ms} - -This parameters can be used to tune the flushing interval of sled. 
-Increase this if sled is thrashing your SSD, at the risk of losing more data in case -of a power outage (though this should not matter much as data is replicated on other -nodes). The default value, 2000ms, should be appropriate for most use cases. - #### `lmdb_map_size` {#lmdb_map_size} This parameters can be used to set the map size used by LMDB, diff --git a/doc/drafts/admin-api.md b/doc/drafts/admin-api.md index e7851ab1..40c82f5a 100644 --- a/doc/drafts/admin-api.md +++ b/doc/drafts/admin-api.md @@ -73,7 +73,6 @@ Example response body: "garageVersion": "v0.10.0", "garageFeatures": [ "k2v", - "sled", "lmdb", "sqlite", "metrics", diff --git a/doc/drafts/k2v-spec.md b/doc/drafts/k2v-spec.md index faa1a247..3956fa31 100644 --- a/doc/drafts/k2v-spec.md +++ b/doc/drafts/k2v-spec.md @@ -146,7 +146,7 @@ in a bucket, as the partition key becomes the sort key in the index. How indexing works: - Each node keeps a local count of how many items it stores for each partition, - in a local Sled tree that is updated atomically when an item is modified. + in a local database tree that is updated atomically when an item is modified. - These local counters are asynchronously stored in the index table which is a regular Garage table spread in the network. 
Counters are stored as LWW values, so basically the final table will have the following structure: diff --git a/nix/compile.nix b/nix/compile.nix index 1e712710..78cbd80e 100644 --- a/nix/compile.nix +++ b/nix/compile.nix @@ -168,7 +168,7 @@ let rootFeatures = if features != null then features else - ([ "garage/bundled-libs" "garage/sled" "garage/lmdb" "garage/k2v" ] ++ (if release then [ + ([ "garage/bundled-libs" "garage/lmdb" "garage/k2v" ] ++ (if release then [ "garage/consul-discovery" "garage/kubernetes-discovery" "garage/metrics" diff --git a/script/helm/garage/values.yaml b/script/helm/garage/values.yaml index 02a6651b..56afa2b2 100644 --- a/script/helm/garage/values.yaml +++ b/script/helm/garage/values.yaml @@ -6,18 +6,13 @@ garage: # Can be changed for better performance on certain systems # https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#db-engine-since-v0-8-0 - dbEngine: "sled" + dbEngine: "lmdb" # Defaults is 1MB # An increase can result in better performance in certain scenarios # https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#block-size blockSize: "1048576" - # Tuning parameters for the sled DB engine - # https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#sled-cache-capacity - sledCacheCapacity: "134217728" - sledFlushEveryMs: "2000" - # Default to 3 replicas, see the replication_mode section at # https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#replication-mode replicationMode: "3" @@ -50,11 +45,6 @@ garage: block_size = {{ .Values.garage.blockSize }} - {{- if eq .Values.garage.dbEngine "sled"}} - sled_cache_capacity = {{ .Values.garage.sledCacheCapacity }} - sled_flush_every_ms = {{ .Values.garage.sledFlushEveryMs }} - {{- end }} - replication_mode = "{{ .Values.garage.replicationMode }}" compression_level = {{ .Values.garage.compressionLevel }} diff --git a/src/block/resync.rs b/src/block/resync.rs index 15f210e4..2516ba08 100644 --- 
a/src/block/resync.rs +++ b/src/block/resync.rs @@ -180,7 +180,7 @@ impl BlockResyncManager { // deleted once the garbage collection delay has passed. // // Here are some explanations on how the resync queue works. - // There are two Sled trees that are used to have information + // There are two db trees that are used to have information // about the status of blocks that need to be resynchronized: // // - resync.queue: a tree that is ordered first by a timestamp @@ -541,9 +541,9 @@ impl Worker for ResyncWorker { Ok(WorkerState::Idle) } Err(e) => { - // The errors that we have here are only Sled errors + // The errors that we have here are only db errors // We don't really know how to handle them so just ¯\_(ツ)_/¯ - // (there is kind of an assumption that Sled won't error on us, + // (there is kind of an assumption that the db won't error on us, // if it does there is not much we can do -- TODO should we just panic?) // Here we just give the error to the worker manager, // it will print it to the logs and increment a counter diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml index fddc5cca..a8f6d586 100644 --- a/src/db/Cargo.toml +++ b/src/db/Cargo.toml @@ -18,13 +18,12 @@ tracing.workspace = true heed = { workspace = true, optional = true } rusqlite = { workspace = true, optional = true } -sled = { workspace = true, optional = true } [dev-dependencies] mktemp.workspace = true [features] -default = [ "sled", "lmdb", "sqlite" ] +default = [ "lmdb", "sqlite" ] bundled-libs = [ "rusqlite?/bundled" ] lmdb = [ "heed" ] sqlite = [ "rusqlite" ] diff --git a/src/db/lib.rs b/src/db/lib.rs index 0fb457ce..8975f295 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -3,8 +3,6 @@ extern crate tracing; #[cfg(feature = "lmdb")] pub mod lmdb_adapter; -#[cfg(feature = "sled")] -pub mod sled_adapter; #[cfg(feature = "sqlite")] pub mod sqlite_adapter; diff --git a/src/db/open.rs b/src/db/open.rs index ae135c4e..03476a42 100644 --- a/src/db/open.rs +++ b/src/db/open.rs @@ -11,7 +11,6 
@@ use crate::{Db, Error, Result}; pub enum Engine { Lmdb, Sqlite, - Sled, } impl Engine { @@ -20,7 +19,6 @@ impl Engine { match self { Self::Lmdb => "lmdb", Self::Sqlite => "sqlite", - Self::Sled => "sled", } } } @@ -38,10 +36,10 @@ impl std::str::FromStr for Engine { match text { "lmdb" | "heed" => Ok(Self::Lmdb), "sqlite" | "sqlite3" | "rusqlite" => Ok(Self::Sqlite), - "sled" => Ok(Self::Sled), + "sled" => Err(Error("Sled is no longer supported as a database engine. Converting your old metadata db can be done using an older Garage binary (e.g. v0.9.3).".into())), kind => Err(Error( format!( - "Invalid DB engine: {} (options are: lmdb, sled, sqlite)", + "Invalid DB engine: {} (options are: lmdb, sqlite)", kind ) .into(), @@ -53,8 +51,6 @@ impl std::str::FromStr for Engine { pub struct OpenOpt { pub fsync: bool, pub lmdb_map_size: Option, - pub sled_cache_capacity: usize, - pub sled_flush_every_ms: u64, } impl Default for OpenOpt { @@ -62,31 +58,12 @@ impl Default for OpenOpt { Self { fsync: false, lmdb_map_size: None, - sled_cache_capacity: 1024 * 1024 * 1024, - sled_flush_every_ms: 2000, } } } pub fn open_db(path: &PathBuf, engine: Engine, opt: &OpenOpt) -> Result { match engine { - // ---- Sled DB ---- - #[cfg(feature = "sled")] - Engine::Sled => { - if opt.fsync { - return Err(Error( - "`metadata_fsync = true` is not supported with the Sled database engine".into(), - )); - } - info!("Opening Sled database at: {}", path.display()); - let db = crate::sled_adapter::sled::Config::default() - .path(&path) - .cache_capacity(opt.sled_cache_capacity as u64) - .flush_every_ms(Some(opt.sled_flush_every_ms)) - .open()?; - Ok(crate::sled_adapter::SledDb::init(db)) - } - // ---- Sqlite DB ---- #[cfg(feature = "sqlite")] Engine::Sqlite => { diff --git a/src/db/sled_adapter.rs b/src/db/sled_adapter.rs deleted file mode 100644 index 84f2001b..00000000 --- a/src/db/sled_adapter.rs +++ /dev/null @@ -1,274 +0,0 @@ -use core::ops::Bound; - -use std::cell::Cell; -use 
std::collections::HashMap; -use std::sync::{Arc, RwLock}; - -use sled::transaction::{ - ConflictableTransactionError, TransactionError, Transactional, TransactionalTree, - UnabortableTransactionError, -}; - -use crate::{ - Db, Error, IDb, ITx, ITxFn, OnCommit, Result, TxError, TxFnResult, TxOpError, TxOpResult, - TxResult, TxValueIter, Value, ValueIter, -}; - -pub use sled; - -// -- err - -impl From for Error { - fn from(e: sled::Error) -> Error { - Error(format!("Sled: {}", e).into()) - } -} - -impl From for TxOpError { - fn from(e: sled::Error) -> TxOpError { - TxOpError(e.into()) - } -} - -// -- db - -pub struct SledDb { - db: sled::Db, - trees: RwLock<(Vec, HashMap)>, -} - -impl SledDb { - #[deprecated( - since = "0.9.0", - note = "The Sled database is now deprecated and will be removed in Garage v1.0. Please migrate to LMDB or Sqlite as soon as possible." - )] - pub fn init(db: sled::Db) -> Db { - tracing::warn!("-------------------- IMPORTANT WARNING !!! ----------------------"); - tracing::warn!("The Sled database is now deprecated and will be removed in Garage v1.0."); - tracing::warn!("Please migrate to LMDB or Sqlite as soon as possible."); - tracing::warn!("-----------------------------------------------------------------------"); - let s = Self { - db, - trees: RwLock::new((Vec::new(), HashMap::new())), - }; - Db(Arc::new(s)) - } - - fn get_tree(&self, i: usize) -> Result { - self.trees - .read() - .unwrap() - .0 - .get(i) - .cloned() - .ok_or_else(|| Error("invalid tree id".into())) - } -} - -impl IDb for SledDb { - fn engine(&self) -> String { - "Sled".into() - } - - fn open_tree(&self, name: &str) -> Result { - let mut trees = self.trees.write().unwrap(); - if let Some(i) = trees.1.get(name) { - Ok(*i) - } else { - let tree = self.db.open_tree(name)?; - let i = trees.0.len(); - trees.0.push(tree); - trees.1.insert(name.to_string(), i); - Ok(i) - } - } - - fn list_trees(&self) -> Result> { - let mut trees = vec![]; - for name in self.db.tree_names() { 
- let name = std::str::from_utf8(&name) - .map_err(|e| Error(format!("{}", e).into()))? - .to_string(); - if name != "__sled__default" { - trees.push(name); - } - } - Ok(trees) - } - - // ---- - - fn get(&self, tree: usize, key: &[u8]) -> Result> { - let tree = self.get_tree(tree)?; - let val = tree.get(key)?; - Ok(val.map(|x| x.to_vec())) - } - - fn len(&self, tree: usize) -> Result { - let tree = self.get_tree(tree)?; - Ok(tree.len()) - } - - fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result> { - let tree = self.get_tree(tree)?; - let old_val = tree.insert(key, value)?; - Ok(old_val.map(|x| x.to_vec())) - } - - fn remove(&self, tree: usize, key: &[u8]) -> Result> { - let tree = self.get_tree(tree)?; - let old_val = tree.remove(key)?; - Ok(old_val.map(|x| x.to_vec())) - } - - fn clear(&self, tree: usize) -> Result<()> { - let tree = self.get_tree(tree)?; - tree.clear()?; - Ok(()) - } - - fn iter(&self, tree: usize) -> Result> { - let tree = self.get_tree(tree)?; - Ok(Box::new(tree.iter().map(|v| { - v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into) - }))) - } - - fn iter_rev(&self, tree: usize) -> Result> { - let tree = self.get_tree(tree)?; - Ok(Box::new(tree.iter().rev().map(|v| { - v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into) - }))) - } - - fn range<'r>( - &self, - tree: usize, - low: Bound<&'r [u8]>, - high: Bound<&'r [u8]>, - ) -> Result> { - let tree = self.get_tree(tree)?; - Ok(Box::new(tree.range::<&'r [u8], _>((low, high)).map(|v| { - v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into) - }))) - } - fn range_rev<'r>( - &self, - tree: usize, - low: Bound<&'r [u8]>, - high: Bound<&'r [u8]>, - ) -> Result> { - let tree = self.get_tree(tree)?; - Ok(Box::new(tree.range::<&'r [u8], _>((low, high)).rev().map( - |v| v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into), - ))) - } - - // ---- - - fn transaction(&self, f: &dyn ITxFn) -> TxResult { - let trees = self.trees.read().unwrap(); - let res = 
trees.0.transaction(|txtrees| { - let mut tx = SledTx { - trees: txtrees, - err: Cell::new(None), - }; - match f.try_on(&mut tx) { - TxFnResult::Ok(on_commit) => { - assert!(tx.err.into_inner().is_none()); - Ok(on_commit) - } - TxFnResult::Abort => { - assert!(tx.err.into_inner().is_none()); - Err(ConflictableTransactionError::Abort(())) - } - TxFnResult::DbErr => { - let e = tx.err.into_inner().expect("No DB error"); - Err(e.into()) - } - } - }); - match res { - Ok(on_commit) => Ok(on_commit), - Err(TransactionError::Abort(())) => Err(TxError::Abort(())), - Err(TransactionError::Storage(s)) => Err(TxError::Db(s.into())), - } - } -} - -// ---- - -struct SledTx<'a> { - trees: &'a [TransactionalTree], - err: Cell>, -} - -impl<'a> SledTx<'a> { - fn get_tree(&self, i: usize) -> TxOpResult<&TransactionalTree> { - self.trees.get(i).ok_or_else(|| { - TxOpError(Error( - "invalid tree id (it might have been openned after the transaction started)".into(), - )) - }) - } - - fn save_error( - &self, - v: std::result::Result, - ) -> TxOpResult { - match v { - Ok(x) => Ok(x), - Err(e) => { - let txt = format!("{}", e); - self.err.set(Some(e)); - Err(TxOpError(Error(txt.into()))) - } - } - } -} - -impl<'a> ITx for SledTx<'a> { - fn get(&self, tree: usize, key: &[u8]) -> TxOpResult> { - let tree = self.get_tree(tree)?; - let tmp = self.save_error(tree.get(key))?; - Ok(tmp.map(|x| x.to_vec())) - } - fn len(&self, _tree: usize) -> TxOpResult { - unimplemented!(".len() in transaction not supported with Sled backend") - } - - fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult> { - let tree = self.get_tree(tree)?; - let old_val = self.save_error(tree.insert(key, value))?; - Ok(old_val.map(|x| x.to_vec())) - } - fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult> { - let tree = self.get_tree(tree)?; - let old_val = self.save_error(tree.remove(key))?; - Ok(old_val.map(|x| x.to_vec())) - } - - fn iter(&self, _tree: usize) -> TxOpResult> { - 
unimplemented!("Iterators in transactions not supported with Sled backend"); - } - fn iter_rev(&self, _tree: usize) -> TxOpResult> { - unimplemented!("Iterators in transactions not supported with Sled backend"); - } - - fn range<'r>( - &self, - _tree: usize, - _low: Bound<&'r [u8]>, - _high: Bound<&'r [u8]>, - ) -> TxOpResult> { - unimplemented!("Iterators in transactions not supported with Sled backend"); - } - fn range_rev<'r>( - &self, - _tree: usize, - _low: Bound<&'r [u8]>, - _high: Bound<&'r [u8]>, - ) -> TxOpResult> { - unimplemented!("Iterators in transactions not supported with Sled backend"); - } -} diff --git a/src/db/test.rs b/src/db/test.rs index cd99eafa..d4c875f0 100644 --- a/src/db/test.rs +++ b/src/db/test.rs @@ -90,17 +90,6 @@ fn test_lmdb_db() { drop(path); } -#[test] -#[cfg(feature = "sled")] -fn test_sled_db() { - use crate::sled_adapter::SledDb; - - let path = mktemp::Temp::new_dir().unwrap(); - let db = SledDb::init(sled::open(path.to_path_buf()).unwrap()); - test_suite(db); - drop(path); -} - #[test] #[cfg(feature = "sqlite")] fn test_sqlite_db() { diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 00ecb35e..53449a1c 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -80,12 +80,11 @@ k2v-client.workspace = true [features] -default = [ "bundled-libs", "metrics", "sled", "lmdb", "sqlite", "k2v" ] +default = [ "bundled-libs", "metrics", "lmdb", "sqlite", "k2v" ] k2v = [ "garage_util/k2v", "garage_api/k2v" ] -# Database engines, Sled is still our default even though we don't like it -sled = [ "garage_model/sled" ] +# Database engines lmdb = [ "garage_model/lmdb" ] sqlite = [ "garage_model/sqlite" ] diff --git a/src/garage/cli/convert_db.rs b/src/garage/cli/convert_db.rs index 2aadb1d6..5346d55a 100644 --- a/src/garage/cli/convert_db.rs +++ b/src/garage/cli/convert_db.rs @@ -11,7 +11,7 @@ pub struct ConvertDbOpt { /// https://garagehq.deuxfleurs.fr/documentation/reference-manual/configuration/#db-engine-since-v0-8-0) 
#[structopt(short = "i")] input_path: PathBuf, - /// Input database engine (sled, lmdb or sqlite; limited by db engines + /// Input database engine (lmdb or sqlite; limited by db engines /// enabled in this build) #[structopt(short = "a")] input_engine: Engine, diff --git a/src/garage/main.rs b/src/garage/main.rs index e489fff0..5e9c061f 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -18,8 +18,8 @@ compile_error!("Either bundled-libs or system-libs Cargo feature must be enabled #[cfg(all(feature = "bundled-libs", feature = "system-libs"))] compile_error!("Only one of bundled-libs and system-libs Cargo features must be enabled"); -#[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))] -compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite."); +#[cfg(not(any(feature = "lmdb", feature = "sqlite")))] +compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb or sqlite."); use std::net::SocketAddr; use std::path::PathBuf; @@ -72,8 +72,6 @@ async fn main() { let features = &[ #[cfg(feature = "k2v")] "k2v", - #[cfg(feature = "sled")] - "sled", #[cfg(feature = "lmdb")] "lmdb", #[cfg(feature = "sqlite")] diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 776671d0..a6bcfbe7 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -42,8 +42,7 @@ tokio.workspace = true opentelemetry.workspace = true [features] -default = [ "sled", "lmdb", "sqlite" ] +default = [ "lmdb", "sqlite" ] k2v = [ "garage_util/k2v" ] lmdb = [ "garage_db/lmdb" ] -sled = [ "garage_db/sled" ] sqlite = [ "garage_db/sqlite" ] diff --git a/src/model/garage.rs b/src/model/garage.rs index 7ec8b22e..8987c594 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -118,9 +118,6 @@ impl Garage { .ok_or_message("Invalid `db_engine` value in configuration file")?; let mut db_path = config.metadata_dir.clone(); match db_engine { - db::Engine::Sled => { - db_path.push("db"); - } db::Engine::Sqlite 
=> { db_path.push("db.sqlite"); } @@ -134,8 +131,6 @@ impl Garage { v if v == usize::default() => None, v => Some(v), }, - sled_cache_capacity: config.sled_cache_capacity, - sled_flush_every_ms: config.sled_flush_every_ms, }; let db = db::open_db(&db_path, db_engine, &db_opt) .ok_or_message("Unable to open metadata db")?; diff --git a/src/table/gc.rs b/src/table/gc.rs index ef788749..65ad0c42 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -334,9 +334,9 @@ impl Worker for GcWorker { } } -/// An entry stored in the gc_todo Sled tree associated with the table +/// An entry stored in the gc_todo db tree associated with the table /// Contains helper function for parsing, saving, and removing -/// such entry in Sled +/// such entry in the db /// /// Format of an entry: /// - key = 8 bytes: timestamp of tombstone @@ -353,7 +353,7 @@ pub(crate) struct GcTodoEntry { } impl GcTodoEntry { - /// Creates a new GcTodoEntry (not saved in Sled) from its components: + /// Creates a new GcTodoEntry (not saved in the db) from its components: /// the key of an entry in the table, and the hash of the associated /// serialized value pub(crate) fn new(key: Vec, value_hash: Hash) -> Self { diff --git a/src/table/merkle.rs b/src/table/merkle.rs index 01271c58..be0ae243 100644 --- a/src/table/merkle.rs +++ b/src/table/merkle.rs @@ -31,14 +31,14 @@ pub struct MerkleUpdater { // - value = the hash of the full serialized item, if present, // or an empty vec if item is absent (deleted) // Fields in data: - // pub(crate) merkle_todo: sled::Tree, + // pub(crate) merkle_todo: db::Tree, // pub(crate) merkle_todo_notify: Notify, // Content of the merkle tree: items where // - key = .bytes() for MerkleNodeKey // - value = serialization of a MerkleNode, assumed to be MerkleNode::empty if not found // Field in data: - // pub(crate) merkle_tree: sled::Tree, + // pub(crate) merkle_tree: db::Tree, empty_node_hash: Hash, } diff --git a/src/util/config.rs b/src/util/config.rs index b7f27676..e243c813 
100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -87,20 +87,10 @@ pub struct Config { pub kubernetes_discovery: Option, // -- DB - /// Database engine to use for metadata (options: sled, sqlite, lmdb) + /// Database engine to use for metadata (options: sqlite, lmdb) #[serde(default = "default_db_engine")] pub db_engine: String, - /// Sled cache size, in bytes - #[serde( - deserialize_with = "deserialize_capacity", - default = "default_sled_cache_capacity" - )] - pub sled_cache_capacity: usize, - /// Sled flush interval in milliseconds - #[serde(default = "default_sled_flush_every_ms")] - pub sled_flush_every_ms: u64, - /// LMDB map size #[serde(deserialize_with = "deserialize_capacity", default)] pub lmdb_map_size: usize, @@ -246,13 +236,6 @@ fn default_db_engine() -> String { "lmdb".into() } -fn default_sled_cache_capacity() -> usize { - 128 * 1024 * 1024 -} -fn default_sled_flush_every_ms() -> u64 { - 2000 -} - fn default_block_size() -> usize { 1048576 } -- cgit v1.2.3 From 05c92204ecab87540806073ac4deedfd58519240 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Mar 2024 14:59:56 +0100 Subject: [rm-sled] Remove counted_tree_hack --- src/block/manager.rs | 7 +-- src/block/metrics.rs | 17 +++--- src/block/resync.rs | 15 ++--- src/db/counted_tree_hack.rs | 127 --------------------------------------- src/db/lib.rs | 9 --- src/db/lmdb_adapter.rs | 4 -- src/db/sqlite_adapter.rs | 4 -- src/garage/admin/mod.rs | 51 +++------------- src/garage/cli/structs.rs | 4 -- src/model/s3/lifecycle_worker.rs | 8 +-- src/table/data.rs | 6 +- src/table/gc.rs | 18 +++--- src/table/merkle.rs | 4 -- src/table/metrics.rs | 21 ++++--- 14 files changed, 48 insertions(+), 247 deletions(-) delete mode 100644 src/db/counted_tree_hack.rs diff --git a/src/block/manager.rs b/src/block/manager.rs index c7e4df17..18fadf85 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -378,11 +378,6 @@ impl BlockManager { Ok(self.rc.rc.len()?) 
} - /// Get number of items in the refcount table - pub fn rc_fast_len(&self) -> Result, Error> { - Ok(self.rc.rc.fast_len()?) - } - /// Send command to start/stop/manager scrub worker pub async fn send_scrub_command(&self, cmd: ScrubWorkerCommand) -> Result<(), Error> { let tx = self.tx_scrub_command.load(); @@ -398,7 +393,7 @@ impl BlockManager { /// List all resync errors pub fn list_resync_errors(&self) -> Result, Error> { - let mut blocks = Vec::with_capacity(self.resync.errors.len()); + let mut blocks = Vec::with_capacity(self.resync.errors.len()?); for ent in self.resync.errors.iter()? { let (hash, cnt) = ent?; let cnt = ErrorCounter::decode(&cnt); diff --git a/src/block/metrics.rs b/src/block/metrics.rs index 6659df32..8e10afdf 100644 --- a/src/block/metrics.rs +++ b/src/block/metrics.rs @@ -1,7 +1,6 @@ use opentelemetry::{global, metrics::*}; use garage_db as db; -use garage_db::counted_tree_hack::CountedTree; /// TableMetrics reference all counter used for metrics pub struct BlockManagerMetrics { @@ -29,8 +28,8 @@ impl BlockManagerMetrics { pub fn new( compression_level: Option, rc_tree: db::Tree, - resync_queue: CountedTree, - resync_errors: CountedTree, + resync_queue: db::Tree, + resync_errors: db::Tree, ) -> Self { let meter = global::meter("garage_model/block"); Self { @@ -45,15 +44,17 @@ impl BlockManagerMetrics { .init(), _rc_size: meter .u64_value_observer("block.rc_size", move |observer| { - if let Ok(Some(v)) = rc_tree.fast_len() { - observer.observe(v as u64, &[]) + if let Ok(value) = rc_tree.len() { + observer.observe(value as u64, &[]) } }) .with_description("Number of blocks known to the reference counter") .init(), _resync_queue_len: meter .u64_value_observer("block.resync_queue_length", move |observer| { - observer.observe(resync_queue.len() as u64, &[]) + if let Ok(value) = resync_queue.len() { + observer.observe(value as u64, &[]); + } }) .with_description( "Number of block hashes queued for local check and possible resync", @@ -61,7 
+62,9 @@ impl BlockManagerMetrics { .init(), _resync_errored_blocks: meter .u64_value_observer("block.resync_errored_blocks", move |observer| { - observer.observe(resync_errors.len() as u64, &[]) + if let Ok(value) = resync_errors.len() { + observer.observe(value as u64, &[]); + } }) .with_description("Number of block hashes whose last resync resulted in an error") .init(), diff --git a/src/block/resync.rs b/src/block/resync.rs index 2516ba08..48c2cef1 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -15,7 +15,6 @@ use opentelemetry::{ }; use garage_db as db; -use garage_db::counted_tree_hack::CountedTree; use garage_util::background::*; use garage_util::data::*; @@ -47,9 +46,9 @@ pub(crate) const MAX_RESYNC_WORKERS: usize = 8; const INITIAL_RESYNC_TRANQUILITY: u32 = 2; pub struct BlockResyncManager { - pub(crate) queue: CountedTree, + pub(crate) queue: db::Tree, pub(crate) notify: Arc, - pub(crate) errors: CountedTree, + pub(crate) errors: db::Tree, busy_set: BusySet, @@ -90,12 +89,10 @@ impl BlockResyncManager { let queue = db .open_tree("block_local_resync_queue") .expect("Unable to open block_local_resync_queue tree"); - let queue = CountedTree::new(queue).expect("Could not count block_local_resync_queue"); let errors = db .open_tree("block_local_resync_errors") .expect("Unable to open block_local_resync_errors tree"); - let errors = CountedTree::new(errors).expect("Could not count block_local_resync_errors"); let persister = PersisterShared::new(&system.metadata_dir, "resync_cfg"); @@ -110,16 +107,12 @@ impl BlockResyncManager { /// Get lenght of resync queue pub fn queue_len(&self) -> Result { - // This currently can't return an error because the CountedTree hack - // doesn't error on .len(), but this will change when we remove the hack - // (hopefully someday!) - Ok(self.queue.len()) + Ok(self.queue.len()?) 
} /// Get number of blocks that have an error pub fn errors_len(&self) -> Result { - // (see queue_len comment) - Ok(self.errors.len()) + Ok(self.errors.len()?) } /// Clear the error counter for a block and put it in queue immediately diff --git a/src/db/counted_tree_hack.rs b/src/db/counted_tree_hack.rs deleted file mode 100644 index a4ce12e0..00000000 --- a/src/db/counted_tree_hack.rs +++ /dev/null @@ -1,127 +0,0 @@ -//! This hack allows a db tree to keep in RAM a counter of the number of entries -//! it contains, which is used to call .len() on it. This is usefull only for -//! the sled backend where .len() otherwise would have to traverse the whole -//! tree to count items. For sqlite and lmdb, this is mostly useless (but -//! hopefully not harmfull!). Note that a CountedTree cannot be part of a -//! transaction. - -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, -}; - -use crate::{Result, Tree, TxError, Value, ValueIter}; - -#[derive(Clone)] -pub struct CountedTree(Arc); - -struct CountedTreeInternal { - tree: Tree, - len: AtomicUsize, -} - -impl CountedTree { - pub fn new(tree: Tree) -> Result { - let len = tree.len()?; - Ok(Self(Arc::new(CountedTreeInternal { - tree, - len: AtomicUsize::new(len), - }))) - } - - pub fn len(&self) -> usize { - self.0.len.load(Ordering::SeqCst) - } - - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - pub fn get>(&self, key: K) -> Result> { - self.0.tree.get(key) - } - - pub fn first(&self) -> Result> { - self.0.tree.first() - } - - pub fn iter(&self) -> Result> { - self.0.tree.iter() - } - - // ---- writing functions ---- - - pub fn insert(&self, key: K, value: V) -> Result> - where - K: AsRef<[u8]>, - V: AsRef<[u8]>, - { - let old_val = self.0.tree.insert(key, value)?; - if old_val.is_none() { - self.0.len.fetch_add(1, Ordering::SeqCst); - } - Ok(old_val) - } - - pub fn remove>(&self, key: K) -> Result> { - let old_val = self.0.tree.remove(key)?; - if old_val.is_some() { - self.0.len.fetch_sub(1, 
Ordering::SeqCst); - } - Ok(old_val) - } - - pub fn compare_and_swap( - &self, - key: K, - expected_old: Option, - new: Option, - ) -> Result - where - K: AsRef<[u8]>, - OV: AsRef<[u8]>, - NV: AsRef<[u8]>, - { - let old_some = expected_old.is_some(); - let new_some = new.is_some(); - - let tx_res = self.0.tree.db().transaction(|tx| { - let old_val = tx.get(&self.0.tree, &key)?; - let is_same = match (&old_val, &expected_old) { - (None, None) => true, - (Some(x), Some(y)) if x == y.as_ref() => true, - _ => false, - }; - if is_same { - match &new { - Some(v) => { - tx.insert(&self.0.tree, &key, v)?; - } - None => { - tx.remove(&self.0.tree, &key)?; - } - } - Ok(()) - } else { - Err(TxError::Abort(())) - } - }); - - match tx_res { - Ok(()) => { - match (old_some, new_some) { - (false, true) => { - self.0.len.fetch_add(1, Ordering::SeqCst); - } - (true, false) => { - self.0.len.fetch_sub(1, Ordering::SeqCst); - } - _ => (), - } - Ok(true) - } - Err(TxError::Abort(())) => Ok(false), - Err(TxError::Db(e)) => Err(e), - } - } -} diff --git a/src/db/lib.rs b/src/db/lib.rs index 8975f295..e81c712c 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -6,8 +6,6 @@ pub mod lmdb_adapter; #[cfg(feature = "sqlite")] pub mod sqlite_adapter; -pub mod counted_tree_hack; - pub mod open; #[cfg(test)] @@ -187,10 +185,6 @@ impl Tree { pub fn len(&self) -> Result { self.0.len(self.1) } - #[inline] - pub fn fast_len(&self) -> Result> { - self.0.fast_len(self.1) - } #[inline] pub fn first(&self) -> Result> { @@ -326,9 +320,6 @@ pub(crate) trait IDb: Send + Sync { fn get(&self, tree: usize, key: &[u8]) -> Result>; fn len(&self, tree: usize) -> Result; - fn fast_len(&self, _tree: usize) -> Result> { - Ok(None) - } fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result>; fn remove(&self, tree: usize, key: &[u8]) -> Result>; diff --git a/src/db/lmdb_adapter.rs b/src/db/lmdb_adapter.rs index 59fa132d..99b29a74 100644 --- a/src/db/lmdb_adapter.rs +++ b/src/db/lmdb_adapter.rs @@ -121,10 
+121,6 @@ impl IDb for LmdbDb { Ok(tree.len(&tx)?.try_into().unwrap()) } - fn fast_len(&self, tree: usize) -> Result> { - Ok(Some(self.len(tree)?)) - } - fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result> { let tree = self.get_tree(tree)?; let mut tx = self.db.write_txn()?; diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs index 9f967c66..1a7ae5f0 100644 --- a/src/db/sqlite_adapter.rs +++ b/src/db/sqlite_adapter.rs @@ -144,10 +144,6 @@ impl IDb for SqliteDb { } } - fn fast_len(&self, tree: usize) -> Result> { - Ok(Some(self.len(tree)?)) - } - fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result> { trace!("insert {}: lock db", tree); let this = self.0.lock().unwrap(); diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index de7851e1..896751cc 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -217,11 +217,11 @@ impl AdminRpcHandler { // Gather table statistics let mut table = vec![" Table\tItems\tMklItems\tMklTodo\tGcTodo".into()]; - table.push(self.gather_table_stats(&self.garage.bucket_table, opt.detailed)?); - table.push(self.gather_table_stats(&self.garage.key_table, opt.detailed)?); - table.push(self.gather_table_stats(&self.garage.object_table, opt.detailed)?); - table.push(self.gather_table_stats(&self.garage.version_table, opt.detailed)?); - table.push(self.gather_table_stats(&self.garage.block_ref_table, opt.detailed)?); + table.push(self.gather_table_stats(&self.garage.bucket_table)?); + table.push(self.gather_table_stats(&self.garage.key_table)?); + table.push(self.gather_table_stats(&self.garage.object_table)?); + table.push(self.gather_table_stats(&self.garage.version_table)?); + table.push(self.gather_table_stats(&self.garage.block_ref_table)?); write!( &mut ret, "\nTable stats:\n{}", @@ -231,15 +231,7 @@ impl AdminRpcHandler { // Gather block manager statistics writeln!(&mut ret, "\nBlock manager stats:").unwrap(); - let rc_len = if opt.detailed { - 
self.garage.block_manager.rc_len()?.to_string() - } else { - self.garage - .block_manager - .rc_fast_len()? - .map(|x| x.to_string()) - .unwrap_or_else(|| "NC".into()) - }; + let rc_len = self.garage.block_manager.rc_len()?.to_string(); writeln!( &mut ret, @@ -260,10 +252,6 @@ impl AdminRpcHandler { ) .unwrap(); - if !opt.detailed { - writeln!(&mut ret, "\nIf values are missing above (marked as NC), consider adding the --detailed flag (this will be slow).").unwrap(); - } - if !opt.skip_global { write!(&mut ret, "\n{}", self.gather_cluster_stats()).unwrap(); } @@ -365,34 +353,13 @@ impl AdminRpcHandler { ret } - fn gather_table_stats( - &self, - t: &Arc>, - detailed: bool, - ) -> Result + fn gather_table_stats(&self, t: &Arc>) -> Result where F: TableSchema + 'static, R: TableReplication + 'static, { - let (data_len, mkl_len) = if detailed { - ( - t.data.store.len().map_err(GarageError::from)?.to_string(), - t.merkle_updater.merkle_tree_len()?.to_string(), - ) - } else { - ( - t.data - .store - .fast_len() - .map_err(GarageError::from)? - .map(|x| x.to_string()) - .unwrap_or_else(|| "NC".into()), - t.merkle_updater - .merkle_tree_fast_len()? 
- .map(|x| x.to_string()) - .unwrap_or_else(|| "NC".into()), - ) - }; + let data_len = t.data.store.len().map_err(GarageError::from)?.to_string(); + let mkl_len = t.merkle_updater.merkle_tree_len()?.to_string(); Ok(format!( " {}\t{}\t{}\t{}\t{}", diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 40e47ee1..7e7ab71b 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -553,10 +553,6 @@ pub struct StatsOpt { #[structopt(short = "a", long = "all-nodes")] pub all_nodes: bool, - /// Gather detailed statistics (this can be long) - #[structopt(short = "d", long = "detailed")] - pub detailed: bool, - /// Don't show global cluster stats (internal use in RPC) #[structopt(skip)] #[serde(default)] diff --git a/src/model/s3/lifecycle_worker.rs b/src/model/s3/lifecycle_worker.rs index 50d4283f..9ecf168c 100644 --- a/src/model/s3/lifecycle_worker.rs +++ b/src/model/s3/lifecycle_worker.rs @@ -121,13 +121,7 @@ impl Worker for LifecycleWorker { mpu_aborted, .. } => { - let n_objects = self - .garage - .object_table - .data - .store - .fast_len() - .unwrap_or(None); + let n_objects = self.garage.object_table.data.store.len().ok(); let progress = match n_objects { None => "...".to_string(), Some(total) => format!( diff --git a/src/table/data.rs b/src/table/data.rs index 7f6b7847..09f4e008 100644 --- a/src/table/data.rs +++ b/src/table/data.rs @@ -6,7 +6,6 @@ use serde_bytes::ByteBuf; use tokio::sync::Notify; use garage_db as db; -use garage_db::counted_tree_hack::CountedTree; use garage_util::data::*; use garage_util::error::*; @@ -36,7 +35,7 @@ pub struct TableData { pub(crate) insert_queue: db::Tree, pub(crate) insert_queue_notify: Arc, - pub(crate) gc_todo: CountedTree, + pub(crate) gc_todo: db::Tree, pub(crate) metrics: TableMetrics, } @@ -61,7 +60,6 @@ impl TableData { let gc_todo = db .open_tree(format!("{}:gc_todo_v2", F::TABLE_NAME)) .expect("Unable to open GC DB tree"); - let gc_todo = CountedTree::new(gc_todo).expect("Cannot count 
gc_todo_v2"); let metrics = TableMetrics::new( F::TABLE_NAME, @@ -370,6 +368,6 @@ impl TableData { } pub fn gc_todo_len(&self) -> Result { - Ok(self.gc_todo.len()) + Ok(self.gc_todo.len()?) } } diff --git a/src/table/gc.rs b/src/table/gc.rs index 65ad0c42..d30a1849 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -10,7 +10,7 @@ use serde_bytes::ByteBuf; use futures::future::join_all; use tokio::sync::watch; -use garage_db::counted_tree_hack::CountedTree; +use garage_db as db; use garage_util::background::*; use garage_util::data::*; @@ -376,7 +376,7 @@ impl GcTodoEntry { } /// Saves the GcTodoEntry in the gc_todo tree - pub(crate) fn save(&self, gc_todo_tree: &CountedTree) -> Result<(), Error> { + pub(crate) fn save(&self, gc_todo_tree: &db::Tree) -> Result<(), Error> { gc_todo_tree.insert(self.todo_table_key(), self.value_hash.as_slice())?; Ok(()) } @@ -386,12 +386,14 @@ impl GcTodoEntry { /// This is usefull to remove a todo entry only under the condition /// that it has not changed since the time it was read, i.e. /// what we have to do is still the same - pub(crate) fn remove_if_equal(&self, gc_todo_tree: &CountedTree) -> Result<(), Error> { - gc_todo_tree.compare_and_swap::<_, _, &[u8]>( - &self.todo_table_key(), - Some(self.value_hash), - None, - )?; + pub(crate) fn remove_if_equal(&self, gc_todo_tree: &db::Tree) -> Result<(), Error> { + gc_todo_tree.db().transaction(|txn| { + let key = self.todo_table_key(); + if txn.get(gc_todo_tree, &key)?.as_deref() == Some(self.value_hash.as_slice()) { + txn.remove(gc_todo_tree, &key)?; + } + Ok(()) + })?; Ok(()) } diff --git a/src/table/merkle.rs b/src/table/merkle.rs index be0ae243..596d5805 100644 --- a/src/table/merkle.rs +++ b/src/table/merkle.rs @@ -291,10 +291,6 @@ impl MerkleUpdater { Ok(self.data.merkle_tree.len()?) } - pub fn merkle_tree_fast_len(&self) -> Result, Error> { - Ok(self.data.merkle_tree.fast_len()?) - } - pub fn todo_len(&self) -> Result { Ok(self.data.merkle_todo.len()?) 
} diff --git a/src/table/metrics.rs b/src/table/metrics.rs index 8318a84f..7bb0959a 100644 --- a/src/table/metrics.rs +++ b/src/table/metrics.rs @@ -1,7 +1,6 @@ use opentelemetry::{global, metrics::*, KeyValue}; use garage_db as db; -use garage_db::counted_tree_hack::CountedTree; /// TableMetrics reference all counter used for metrics pub struct TableMetrics { @@ -27,7 +26,7 @@ impl TableMetrics { store: db::Tree, merkle_tree: db::Tree, merkle_todo: db::Tree, - gc_todo: CountedTree, + gc_todo: db::Tree, ) -> Self { let meter = global::meter(table_name); TableMetrics { @@ -35,9 +34,9 @@ impl TableMetrics { .u64_value_observer( "table.size", move |observer| { - if let Ok(Some(v)) = store.fast_len() { + if let Ok(value) = store.len() { observer.observe( - v as u64, + value as u64, &[KeyValue::new("table_name", table_name)], ); } @@ -49,9 +48,9 @@ impl TableMetrics { .u64_value_observer( "table.merkle_tree_size", move |observer| { - if let Ok(Some(v)) = merkle_tree.fast_len() { + if let Ok(value) = merkle_tree.len() { observer.observe( - v as u64, + value as u64, &[KeyValue::new("table_name", table_name)], ); } @@ -77,10 +76,12 @@ impl TableMetrics { .u64_value_observer( "table.gc_todo_queue_length", move |observer| { - observer.observe( - gc_todo.len() as u64, - &[KeyValue::new("table_name", table_name)], - ); + if let Ok(value) = gc_todo.len() { + observer.observe( + value as u64, + &[KeyValue::new("table_name", table_name)], + ); + } }, ) .with_description("Table garbage collector TODO queue length") -- cgit v1.2.3 From 66c23890c1a6e73fd6c5246642e087cd2866451e Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Mar 2024 16:02:58 +0100 Subject: [rm-sled] Implement some missing functionality in garage_db --- src/db/lib.rs | 6 ++++++ src/db/lmdb_adapter.rs | 10 ++++++++-- src/db/sqlite_adapter.rs | 5 +++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/db/lib.rs b/src/db/lib.rs index e81c712c..5881954d 100644 --- a/src/db/lib.rs +++ 
b/src/db/lib.rs @@ -274,6 +274,11 @@ impl<'a> Transaction<'a> { pub fn remove>(&mut self, tree: &Tree, key: T) -> TxOpResult> { self.tx.remove(tree.1, key.as_ref()) } + /// Clears all values in a tree + #[inline] + pub fn clear(&mut self, tree: &Tree) -> TxOpResult<()> { + self.tx.clear(tree.1) + } #[inline] pub fn iter(&self, tree: &Tree) -> TxOpResult> { @@ -350,6 +355,7 @@ pub(crate) trait ITx { fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult>; fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult>; + fn clear(&mut self, tree: usize) -> TxOpResult<()>; fn iter(&self, tree: usize) -> TxOpResult>; fn iter_rev(&self, tree: usize) -> TxOpResult>; diff --git a/src/db/lmdb_adapter.rs b/src/db/lmdb_adapter.rs index 99b29a74..01c360b4 100644 --- a/src/db/lmdb_adapter.rs +++ b/src/db/lmdb_adapter.rs @@ -238,8 +238,9 @@ impl<'a> ITx for LmdbTx<'a> { None => Ok(None), } } - fn len(&self, _tree: usize) -> TxOpResult { - unimplemented!(".len() in transaction not supported with LMDB backend") + fn len(&self, tree: usize) -> TxOpResult { + let tree = self.get_tree(tree)?; + Ok(tree.len(&self.tx)? 
as usize) } fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult> { @@ -254,6 +255,11 @@ impl<'a> ITx for LmdbTx<'a> { tree.delete(&mut self.tx, key)?; Ok(old_val) } + fn clear(&mut self, tree: usize) -> TxOpResult<()> { + let tree = *self.get_tree(tree)?; + tree.clear(&mut self.tx)?; + Ok(()) + } fn iter(&self, _tree: usize) -> TxOpResult> { unimplemented!("Iterators in transactions not supported with LMDB backend"); diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs index 1a7ae5f0..d1394355 100644 --- a/src/db/sqlite_adapter.rs +++ b/src/db/sqlite_adapter.rs @@ -363,6 +363,11 @@ impl<'a> ITx for SqliteTx<'a> { Ok(old_val) } + fn clear(&mut self, tree: usize) -> TxOpResult<()> { + let tree = self.get_tree(tree)?; + self.tx.execute(&format!("DELETE FROM {}", tree), [])?; + Ok(()) + } fn iter(&self, _tree: usize) -> TxOpResult> { unimplemented!(); -- cgit v1.2.3 From b942949940b5a0dec8e8640c44a2705a4482a2e4 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Mar 2024 16:38:01 +0100 Subject: [rm-sled] Implement iterators in sqlite & lmdb transactions with way too much unsafe code --- src/db/lib.rs | 1 + src/db/lmdb_adapter.rs | 48 ++++++++++++------ src/db/sqlite_adapter.rs | 125 +++++++++++++++++++++++++++++++++++++++++------ src/db/test.rs | 49 +++++++++++++++++++ 4 files changed, 195 insertions(+), 28 deletions(-) diff --git a/src/db/lib.rs b/src/db/lib.rs index 5881954d..ff511b5f 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -51,6 +51,7 @@ pub type Result = std::result::Result; pub struct TxOpError(pub(crate) Error); pub type TxOpResult = std::result::Result; +#[derive(Debug)] pub enum TxError { Abort(E), Db(Error), diff --git a/src/db/lmdb_adapter.rs b/src/db/lmdb_adapter.rs index 01c360b4..ddfb6ed5 100644 --- a/src/db/lmdb_adapter.rs +++ b/src/db/lmdb_adapter.rs @@ -261,32 +261,42 @@ impl<'a> ITx for LmdbTx<'a> { Ok(()) } - fn iter(&self, _tree: usize) -> TxOpResult> { - unimplemented!("Iterators in transactions not 
supported with LMDB backend"); + fn iter(&self, tree: usize) -> TxOpResult> { + let tree = *self.get_tree(tree)?; + Ok(Box::new(tree.iter(&self.tx)?.map(tx_iter_item))) } - fn iter_rev(&self, _tree: usize) -> TxOpResult> { - unimplemented!("Iterators in transactions not supported with LMDB backend"); + fn iter_rev(&self, tree: usize) -> TxOpResult> { + let tree = *self.get_tree(tree)?; + Ok(Box::new(tree.rev_iter(&self.tx)?.map(tx_iter_item))) } fn range<'r>( &self, - _tree: usize, - _low: Bound<&'r [u8]>, - _high: Bound<&'r [u8]>, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, ) -> TxOpResult> { - unimplemented!("Iterators in transactions not supported with LMDB backend"); + let tree = *self.get_tree(tree)?; + Ok(Box::new( + tree.range(&self.tx, &(low, high))?.map(tx_iter_item), + )) } fn range_rev<'r>( &self, - _tree: usize, - _low: Bound<&'r [u8]>, - _high: Bound<&'r [u8]>, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, ) -> TxOpResult> { - unimplemented!("Iterators in transactions not supported with LMDB backend"); + let tree = *self.get_tree(tree)?; + Ok(Box::new( + tree.rev_range(&self.tx, &(low, high))?.map(tx_iter_item), + )) } } -// ---- +// ---- iterators outside transactions ---- +// complicated, they must hold the transaction object +// therefore a bit of unsafe code (it is a self-referential struct) type IteratorItem<'a> = heed::Result<( >::DItem, @@ -323,6 +333,7 @@ where I: Iterator> + 'a, { fn drop(&mut self) { + // ensure the iterator is dropped before the RoTxn it references drop(self.iter.take()); } } @@ -342,7 +353,16 @@ where } } -// ---- +// ---- iterators within transactions ---- + +fn tx_iter_item<'a>( + item: std::result::Result<(&'a [u8], &'a [u8]), heed::Error>, +) -> TxOpResult<(Vec, Vec)> { + item.map(|(k, v)| (k.to_vec(), v.to_vec())) + .map_err(|e| TxOpError(Error::from(e))) +} + +// ---- utility ---- #[cfg(target_pointer_width = "64")] pub fn recommended_map_size() -> usize { diff --git 
a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs index d1394355..077c1f1b 100644 --- a/src/db/sqlite_adapter.rs +++ b/src/db/sqlite_adapter.rs @@ -369,32 +369,58 @@ impl<'a> ITx for SqliteTx<'a> { Ok(()) } - fn iter(&self, _tree: usize) -> TxOpResult> { - unimplemented!(); + fn iter(&self, tree: usize) -> TxOpResult> { + let tree = self.get_tree(tree)?; + let sql = format!("SELECT k, v FROM {} ORDER BY k ASC", tree); + TxValueIterator::make(self, &sql, []) } - fn iter_rev(&self, _tree: usize) -> TxOpResult> { - unimplemented!(); + fn iter_rev(&self, tree: usize) -> TxOpResult> { + let tree = self.get_tree(tree)?; + let sql = format!("SELECT k, v FROM {} ORDER BY k DESC", tree); + TxValueIterator::make(self, &sql, []) } fn range<'r>( &self, - _tree: usize, - _low: Bound<&'r [u8]>, - _high: Bound<&'r [u8]>, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, ) -> TxOpResult> { - unimplemented!(); + let tree = self.get_tree(tree)?; + + let (bounds_sql, params) = bounds_sql(low, high); + let sql = format!("SELECT k, v FROM {} {} ORDER BY k ASC", tree, bounds_sql); + + let params = params + .iter() + .map(|x| x as &dyn rusqlite::ToSql) + .collect::>(); + + TxValueIterator::make::<&[&dyn rusqlite::ToSql]>(self, &sql, params.as_ref()) } fn range_rev<'r>( &self, - _tree: usize, - _low: Bound<&'r [u8]>, - _high: Bound<&'r [u8]>, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, ) -> TxOpResult> { - unimplemented!(); + let tree = self.get_tree(tree)?; + + let (bounds_sql, params) = bounds_sql(low, high); + let sql = format!("SELECT k, v FROM {} {} ORDER BY k DESC", tree, bounds_sql); + + let params = params + .iter() + .map(|x| x as &dyn rusqlite::ToSql) + .collect::>(); + + TxValueIterator::make::<&[&dyn rusqlite::ToSql]>(self, &sql, params.as_ref()) } } -// ---- +// ---- iterators outside transactions ---- +// complicated, they must hold the Statement and Row objects +// therefore quite some unsafe code (it is a self-referential struct) 
struct DbValueIterator<'a> { db: MutexGuard<'a, SqliteDbInner>, @@ -471,7 +497,78 @@ impl<'a> Iterator for DbValueIteratorPin<'a> { } } -// ---- +// ---- iterators within transactions ---- +// it's the same except we don't hold a mutex guard, +// only a Statement and a Rows object + +struct TxValueIterator<'a> { + stmt: Statement<'a>, + iter: Option>, + _pin: PhantomPinned, +} + +impl<'a> TxValueIterator<'a> { + fn make( + tx: &'a SqliteTx<'a>, + sql: &str, + args: P, + ) -> TxOpResult> { + let stmt = tx.tx.prepare(sql)?; + let res = TxValueIterator { + stmt, + iter: None, + _pin: PhantomPinned, + }; + let mut boxed = Box::pin(res); + trace!("make iterator with sql: {}", sql); + + unsafe { + let mut stmt = NonNull::from(&boxed.stmt); + let iter = stmt.as_mut().query(args)?; + + let mut_ref: Pin<&mut TxValueIterator<'a>> = Pin::as_mut(&mut boxed); + Pin::get_unchecked_mut(mut_ref).iter = Some(iter); + } + + Ok(Box::new(TxValueIteratorPin(boxed))) + } +} + +impl<'a> Drop for TxValueIterator<'a> { + fn drop(&mut self) { + trace!("drop iter"); + drop(self.iter.take()); + } +} + +struct TxValueIteratorPin<'a>(Pin>>); + +impl<'a> Iterator for TxValueIteratorPin<'a> { + type Item = TxOpResult<(Value, Value)>; + + fn next(&mut self) -> Option { + let next = unsafe { + let mut_ref: Pin<&mut TxValueIterator<'a>> = Pin::as_mut(&mut self.0); + Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() + }; + let row = match next { + Err(e) => return Some(Err(e.into())), + Ok(None) => return None, + Ok(Some(r)) => r, + }; + let k = match row.get::<_, Vec>(0) { + Err(e) => return Some(Err(e.into())), + Ok(x) => x, + }; + let v = match row.get::<_, Vec>(1) { + Err(e) => return Some(Err(e.into())), + Ok(y) => y, + }; + Some(Ok((k, v))) + } +} + +// ---- utility ---- fn bounds_sql<'r>(low: Bound<&'r [u8]>, high: Bound<&'r [u8]>) -> (String, Vec>) { let mut sql = String::new(); diff --git a/src/db/test.rs b/src/db/test.rs index d4c875f0..3add89fb 100644 --- a/src/db/test.rs +++ 
b/src/db/test.rs @@ -10,8 +10,13 @@ fn test_suite(db: Db) { let vb: &[u8] = &b"plip"[..]; let vc: &[u8] = &b"plup"[..]; + // ---- test simple insert/delete ---- + assert!(tree.insert(ka, va).unwrap().is_none()); assert_eq!(tree.get(ka).unwrap().unwrap(), va); + assert_eq!(tree.len().unwrap(), 1); + + // ---- test transaction logic ---- let res = db.transaction::<_, (), _>(|tx| { assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), va); @@ -37,6 +42,8 @@ fn test_suite(db: Db) { assert!(matches!(res, Err(TxError::Abort(42)))); assert_eq!(tree.get(ka).unwrap().unwrap(), vb); + // ---- test iteration outside of transactions ---- + let mut iter = tree.iter().unwrap(); let next = iter.next().unwrap().unwrap(); assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); @@ -73,6 +80,48 @@ fn test_suite(db: Db) { assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); assert!(iter.next().is_none()); drop(iter); + + // ---- test iteration within transactions ---- + + db.transaction::<_, (), _>(|tx| { + let mut iter = tx.iter(&tree).unwrap(); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc)); + assert!(iter.next().is_none()); + Ok(()) + }) + .unwrap(); + + db.transaction::<_, (), _>(|tx| { + let mut iter = tx.range(&tree, kint..).unwrap(); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc)); + assert!(iter.next().is_none()); + Ok(()) + }) + .unwrap(); + + db.transaction::<_, (), _>(|tx| { + let mut iter = tx.range_rev(&tree, ..kint).unwrap(); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); + assert!(iter.next().is_none()); + Ok(()) + }) + .unwrap(); + + db.transaction::<_, (), _>(|tx| { + let mut iter = tx.iter_rev(&tree).unwrap(); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), 
next.1.as_ref()), (kb, vc)); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); + assert!(iter.next().is_none()); + Ok(()) + }) + .unwrap(); } #[test] -- cgit v1.2.3 From 32aa2463001c0af9f87633a1ff78858dd4157eb9 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 8 Mar 2024 17:39:17 +0100 Subject: [rm-sled] Make proper use of pinning in LMDB adapter + comment unsafe --- src/db/lmdb_adapter.rs | 28 ++++++++++++++++++++++------ src/db/sqlite_adapter.rs | 44 +++++++++++++++++++++++++------------------- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/src/db/lmdb_adapter.rs b/src/db/lmdb_adapter.rs index ddfb6ed5..5ce7d3e3 100644 --- a/src/db/lmdb_adapter.rs +++ b/src/db/lmdb_adapter.rs @@ -3,6 +3,7 @@ use core::ptr::NonNull; use std::collections::HashMap; use std::convert::TryInto; +use std::pin::Pin; use std::sync::{Arc, RwLock}; use heed::types::ByteSlice; @@ -319,12 +320,20 @@ where where F: FnOnce(&'a RoTxn<'a>) -> Result, { - let mut res = TxAndIterator { tx, iter: None }; + let res = TxAndIterator { tx, iter: None }; + let mut boxed = Box::pin(res); - let tx = unsafe { NonNull::from(&res.tx).as_ref() }; - res.iter = Some(iterfun(tx)?); + // This unsafe allows us to bypass lifetime checks + let tx = unsafe { NonNull::from(&boxed.tx).as_ref() }; + let iter = iterfun(tx)?; - Ok(Box::new(res)) + let mut_ref = Pin::as_mut(&mut boxed); + // This unsafe allows us to write in a field of the pinned struct + unsafe { + Pin::get_unchecked_mut(mut_ref).iter = Some(iter); + } + + Ok(Box::new(TxAndIteratorPin(boxed))) } } @@ -338,14 +347,21 @@ where } } -impl<'a, I> Iterator for TxAndIterator<'a, I> +struct TxAndIteratorPin<'a, I>(Pin>>) +where + I: Iterator> + 'a; + +impl<'a, I> Iterator for TxAndIteratorPin<'a, I> where I: Iterator> + 'a, { type Item = Result<(Value, Value)>; fn next(&mut self) -> Option { - match self.iter.as_mut().unwrap().next() { + let mut_ref = Pin::as_mut(&mut self.0); + // This 
unsafe allows us to mutably access the iterator field + let next = unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() }; + match next { None => None, Some(Err(e)) => Some(Err(e.into())), Some(Ok((k, v))) => Some(Ok((k.to_vec(), v.to_vec()))), diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs index 077c1f1b..2c6a4159 100644 --- a/src/db/sqlite_adapter.rs +++ b/src/db/sqlite_adapter.rs @@ -444,17 +444,23 @@ impl<'a> DbValueIterator<'a> { let mut boxed = Box::pin(res); trace!("make iterator with sql: {}", sql); - unsafe { - let db = NonNull::from(&boxed.db); - let stmt = db.as_ref().db.prepare(sql)?; + // This unsafe allows us to bypass lifetime checks + let db = unsafe { NonNull::from(&boxed.db).as_ref() }; + let stmt = db.db.prepare(sql)?; - let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut boxed); + let mut_ref = Pin::as_mut(&mut boxed); + // This unsafe allows us to write in a field of the pinned struct + unsafe { Pin::get_unchecked_mut(mut_ref).stmt = Some(stmt); + } - let mut stmt = NonNull::from(&boxed.stmt); - let iter = stmt.as_mut().as_mut().unwrap().query(args)?; + // This unsafe allows us to bypass lifetime checks + let stmt = unsafe { NonNull::from(&boxed.stmt).as_mut() }; + let iter = stmt.as_mut().unwrap().query(args)?; - let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut boxed); + let mut_ref = Pin::as_mut(&mut boxed); + // This unsafe allows us to write in a field of the pinned struct + unsafe { Pin::get_unchecked_mut(mut_ref).iter = Some(iter); } @@ -476,10 +482,9 @@ impl<'a> Iterator for DbValueIteratorPin<'a> { type Item = Result<(Value, Value)>; fn next(&mut self) -> Option { - let next = unsafe { - let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut self.0); - Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() - }; + let mut_ref = Pin::as_mut(&mut self.0); + // This unsafe allows us to mutably access the iterator field + let next = unsafe { 
Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() }; let row = match next { Err(e) => return Some(Err(e.into())), Ok(None) => return None, @@ -522,11 +527,13 @@ impl<'a> TxValueIterator<'a> { let mut boxed = Box::pin(res); trace!("make iterator with sql: {}", sql); - unsafe { - let mut stmt = NonNull::from(&boxed.stmt); - let iter = stmt.as_mut().query(args)?; + // This unsafe allows us to bypass lifetime checks + let stmt = unsafe { NonNull::from(&boxed.stmt).as_mut() }; + let iter = stmt.query(args)?; - let mut_ref: Pin<&mut TxValueIterator<'a>> = Pin::as_mut(&mut boxed); + let mut_ref = Pin::as_mut(&mut boxed); + // This unsafe allows us to write in a field of the pinned struct + unsafe { Pin::get_unchecked_mut(mut_ref).iter = Some(iter); } @@ -547,10 +554,9 @@ impl<'a> Iterator for TxValueIteratorPin<'a> { type Item = TxOpResult<(Value, Value)>; fn next(&mut self) -> Option { - let next = unsafe { - let mut_ref: Pin<&mut TxValueIterator<'a>> = Pin::as_mut(&mut self.0); - Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() - }; + let mut_ref = Pin::as_mut(&mut self.0); + // This unsafe allows us to mutably access the iterator field + let next = unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() }; let row = match next { Err(e) => return Some(Err(e.into())), Ok(None) => return None, -- cgit v1.2.3 From 2795b53b8b3ebd162df6b0244b73889e72f67ce0 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 12 Mar 2024 11:15:26 +0100 Subject: [rm-sled] factorize some code in sqlite backend --- src/db/sqlite_adapter.rs | 52 ++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs index 2c6a4159..6c556c97 100644 --- a/src/db/sqlite_adapter.rs +++ b/src/db/sqlite_adapter.rs @@ -485,20 +485,7 @@ impl<'a> Iterator for DbValueIteratorPin<'a> { let mut_ref = Pin::as_mut(&mut self.0); // This unsafe allows us to mutably access the iterator field let next = 
unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() }; - let row = match next { - Err(e) => return Some(Err(e.into())), - Ok(None) => return None, - Ok(Some(r)) => r, - }; - let k = match row.get::<_, Vec>(0) { - Err(e) => return Some(Err(e.into())), - Ok(x) => x, - }; - let v = match row.get::<_, Vec>(1) { - Err(e) => return Some(Err(e.into())), - Ok(y) => y, - }; - Some(Ok((k, v))) + iter_next_row(next) } } @@ -557,20 +544,7 @@ impl<'a> Iterator for TxValueIteratorPin<'a> { let mut_ref = Pin::as_mut(&mut self.0); // This unsafe allows us to mutably access the iterator field let next = unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() }; - let row = match next { - Err(e) => return Some(Err(e.into())), - Ok(None) => return None, - Ok(Some(r)) => r, - }; - let k = match row.get::<_, Vec>(0) { - Err(e) => return Some(Err(e.into())), - Ok(x) => x, - }; - let v = match row.get::<_, Vec>(1) { - Err(e) => return Some(Err(e.into())), - Ok(y) => y, - }; - Some(Ok((k, v))) + iter_next_row(next) } } @@ -614,3 +588,25 @@ fn bounds_sql<'r>(low: Bound<&'r [u8]>, high: Bound<&'r [u8]>) -> (String, Vec( + next_row: rusqlite::Result>, +) -> Option> +where + E: From, +{ + let row = match next_row { + Err(e) => return Some(Err(e.into())), + Ok(None) => return None, + Ok(Some(r)) => r, + }; + let k = match row.get::<_, Vec>(0) { + Err(e) => return Some(Err(e.into())), + Ok(x) => x, + }; + let v = match row.get::<_, Vec>(1) { + Err(e) => return Some(Err(e.into())), + Ok(y) => y, + }; + Some(Ok((k, v))) +} -- cgit v1.2.3 From dc0b78cdb88e9cbfd7dc1a2ee0b15333939be549 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 19 Mar 2024 11:04:20 +0100 Subject: [block-ref-repair] Block refcount recalculation and repair - We always recalculate the reference count of a block before deleting it locally, to make sure that it is indeed zero. - If we had to fetch a remote block but we were not able to get it, check that refcount is indeed > 0. 
- Repair procedure that checks everything --- src/block/lib.rs | 1 + src/block/manager.rs | 14 ++++-- src/block/rc.rs | 65 +++++++++++++++++++++++- src/block/resync.rs | 17 ++++++- src/garage/cli/structs.rs | 5 +- src/garage/repair/online.rs | 106 ++++++++++++++++++++++++++++++++++++++++ src/model/garage.rs | 8 +++ src/model/s3/block_ref_table.rs | 39 +++++++++++++++ src/util/data.rs | 35 +++++++++++++ src/util/error.rs | 3 ++ 10 files changed, 285 insertions(+), 8 deletions(-) diff --git a/src/block/lib.rs b/src/block/lib.rs index 6c4711ef..944f0d83 100644 --- a/src/block/lib.rs +++ b/src/block/lib.rs @@ -11,3 +11,4 @@ mod metrics; mod rc; pub use block::zstd_encode; +pub use rc::CalculateRefcount; diff --git a/src/block/manager.rs b/src/block/manager.rs index eeacf8b9..628ffc71 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -88,7 +88,7 @@ pub struct BlockManager { mutation_lock: Vec>, - pub(crate) rc: BlockRc, + pub rc: BlockRc, pub resync: BlockResyncManager, pub(crate) system: Arc, @@ -229,6 +229,12 @@ impl BlockManager { } } + /// Initialization: set how block references are recalculated + /// for repair operations + pub fn set_recalc_rc(&self, recalc: Vec) { + self.rc.recalc_rc.store(Some(Arc::new(recalc))); + } + /// Ask nodes that might have a (possibly compressed) block for it /// Return it as a stream with a header async fn rpc_get_raw_block_streaming( @@ -316,9 +322,9 @@ impl BlockManager { }; } - let msg = format!("Get block {:?}: no node returned a valid block", hash); - debug!("{}", msg); - Err(Error::Message(msg)) + let err = Error::MissingBlock(*hash); + debug!("{}", err); + Err(err) } // ---- Public interface ---- diff --git a/src/block/rc.rs b/src/block/rc.rs index b6afb277..bf5aeced 100644 --- a/src/block/rc.rs +++ b/src/block/rc.rs @@ -1,5 +1,7 @@ use std::convert::TryInto; +use arc_swap::ArcSwapOption; + use garage_db as db; use garage_util::data::*; @@ -8,13 +10,20 @@ use garage_util::time::*; use 
crate::manager::BLOCK_GC_DELAY; +pub type CalculateRefcount = + Box db::TxResult + Send + Sync>; + pub struct BlockRc { - pub(crate) rc: db::Tree, + pub rc: db::Tree, + pub(crate) recalc_rc: ArcSwapOption>, } impl BlockRc { pub(crate) fn new(rc: db::Tree) -> Self { - Self { rc } + Self { + rc, + recalc_rc: ArcSwapOption::new(None), + } } /// Increment the reference counter associated to a hash. @@ -68,6 +77,58 @@ impl BlockRc { })?; Ok(()) } + + /// Recalculate the reference counter of a block + /// to fix potential inconsistencies + pub fn recalculate_rc(&self, hash: &Hash) -> Result<(usize, bool), Error> { + if let Some(recalc_fns) = self.recalc_rc.load().as_ref() { + trace!("Repair block RC for {:?}", hash); + let res = self + .rc + .db() + .transaction(|tx| { + let mut cnt = 0; + for f in recalc_fns.iter() { + cnt += f(&tx, hash)?; + } + let old_rc = RcEntry::parse_opt(tx.get(&self.rc, hash)?); + trace!( + "Block RC for {:?}: stored={}, calculated={}", + hash, + old_rc.as_u64(), + cnt + ); + if cnt as u64 != old_rc.as_u64() { + warn!( + "Fixing inconsistent block RC for {:?}: was {}, should be {}", + hash, + old_rc.as_u64(), + cnt + ); + let new_rc = if cnt > 0 { + RcEntry::Present { count: cnt as u64 } + } else { + RcEntry::Deletable { + at_time: now_msec() + BLOCK_GC_DELAY.as_millis() as u64, + } + }; + tx.insert(&self.rc, hash, new_rc.serialize().unwrap())?; + Ok((cnt, true)) + } else { + Ok((cnt, false)) + } + }) + .map_err(Error::from); + if let Err(e) = &res { + error!("Failed to fix RC for block {:?}: {}", hash, e); + } + res + } else { + Err(Error::Message( + "Block RC recalculation is not available at this point".into(), + )) + } + } } /// Describes the state of the reference counter for a block diff --git a/src/block/resync.rs b/src/block/resync.rs index 48c2cef1..b4108213 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -367,6 +367,13 @@ impl BlockResyncManager { } if exists && rc.is_deletable() { + if manager.rc.recalculate_rc(hash)?.0 
> 0 { + return Err(Error::Message(format!( + "Refcount for block {:?} was inconsistent, retrying later", + hash + ))); + } + info!("Resync block {:?}: offloading and deleting", hash); let existing_path = existing_path.unwrap(); @@ -453,7 +460,15 @@ impl BlockResyncManager { hash ); - let block_data = manager.rpc_get_raw_block(hash, None).await?; + let block_data = manager.rpc_get_raw_block(hash, None).await; + if matches!(block_data, Err(Error::MissingBlock(_))) { + warn!( + "Could not fetch needed block {:?}, no node returned valid data. Checking that refcount is correct.", + hash + ); + manager.rc.recalculate_rc(hash)?; + } + let block_data = block_data?; manager.metrics.resync_recv_counter.add(1); diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 1f572a9a..8380b5e2 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -473,8 +473,11 @@ pub enum RepairWhat { #[structopt(name = "mpu", version = garage_version())] MultipartUploads, /// Repropagate version deletions to the block ref table - #[structopt(name = "block_refs", version = garage_version())] + #[structopt(name = "block-refs", version = garage_version())] BlockRefs, + /// Recalculate block reference counters + #[structopt(name = "block-rc", version = garage_version())] + BlockRc, /// Verify integrity of all blocks on disc #[structopt(name = "scrub", version = garage_version())] Scrub { diff --git a/src/garage/repair/online.rs b/src/garage/repair/online.rs index 9e4de873..ecccdf6d 100644 --- a/src/garage/repair/online.rs +++ b/src/garage/repair/online.rs @@ -4,6 +4,7 @@ use std::time::Duration; use async_trait::async_trait; use tokio::sync::watch; +use garage_block::manager::BlockManager; use garage_block::repair::ScrubWorkerCommand; use garage_model::garage::Garage; @@ -16,11 +17,14 @@ use garage_table::replication::*; use garage_table::*; use garage_util::background::*; +use garage_util::data::*; use garage_util::error::Error; use garage_util::migrate::Migrate; 
use crate::*; +const RC_REPAIR_ITER_COUNT: usize = 64; + pub async fn launch_online_repair( garage: &Arc, bg: &BackgroundRunner, @@ -47,6 +51,13 @@ pub async fn launch_online_repair( info!("Repairing the block refs table"); bg.spawn_worker(TableRepairWorker::new(garage.clone(), RepairBlockRefs)); } + RepairWhat::BlockRc => { + info!("Repairing the block reference counters"); + bg.spawn_worker(BlockRcRepair::new( + garage.block_manager.clone(), + garage.block_ref_table.clone(), + )); + } RepairWhat::Blocks => { info!("Repairing the stored blocks"); bg.spawn_worker(garage_block::repair::RepairWorker::new( @@ -282,3 +293,98 @@ impl TableRepair for RepairMpu { Ok(false) } } + +// ===== block reference counter repair ===== + +pub struct BlockRcRepair { + block_manager: Arc, + block_ref_table: Arc>, + cursor: Hash, + counter: u64, + repairs: u64, +} + +impl BlockRcRepair { + fn new( + block_manager: Arc, + block_ref_table: Arc>, + ) -> Self { + Self { + block_manager, + block_ref_table, + cursor: [0u8; 32].into(), + counter: 0, + repairs: 0, + } + } +} + +#[async_trait] +impl Worker for BlockRcRepair { + fn name(&self) -> String { + format!("Block refcount repair worker") + } + + fn status(&self) -> WorkerStatus { + WorkerStatus { + progress: Some(format!("{} ({})", self.counter, self.repairs)), + ..Default::default() + } + } + + async fn work(&mut self, _must_exit: &mut watch::Receiver) -> Result { + for _i in 0..RC_REPAIR_ITER_COUNT { + let next1 = self + .block_manager + .rc + .rc + .range(self.cursor.as_slice()..)? + .next() + .transpose()? + .map(|(k, _)| Hash::try_from(k.as_slice()).unwrap()); + let next2 = self + .block_ref_table + .data + .store + .range(self.cursor.as_slice()..)? + .next() + .transpose()? 
+ .map(|(k, _)| Hash::try_from(&k[..32]).unwrap()); + let next = match (next1, next2) { + (Some(k1), Some(k2)) => std::cmp::min(k1, k2), + (Some(k), None) | (None, Some(k)) => k, + (None, None) => { + info!( + "{}: finished, done {}, fixed {}", + self.name(), + self.counter, + self.repairs + ); + return Ok(WorkerState::Done); + } + }; + + if self.block_manager.rc.recalculate_rc(&next)?.1 { + self.repairs += 1; + } + self.counter += 1; + if let Some(next_incr) = next.increment() { + self.cursor = next_incr; + } else { + info!( + "{}: finished, done {}, fixed {}", + self.name(), + self.counter, + self.repairs + ); + return Ok(WorkerState::Done); + } + } + + Ok(WorkerState::Busy) + } + + async fn wait_for_work(&mut self) -> WorkerState { + unreachable!() + } +} diff --git a/src/model/garage.rs b/src/model/garage.rs index 4405d22d..273690db 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -247,6 +247,14 @@ impl Garage { #[cfg(feature = "k2v")] let k2v = GarageK2V::new(system.clone(), &db, meta_rep_param); + // ---- setup block refcount recalculation ---- + // this function can be used to fix inconsistencies in the RC table + block_manager.set_recalc_rc(vec![ + block_ref_recount_fn(&block_ref_table), + // other functions could be added here if we had other tables + // that hold references to data blocks + ]); + // -- done -- Ok(Arc::new(Self { config, diff --git a/src/model/s3/block_ref_table.rs b/src/model/s3/block_ref_table.rs index 7b023d87..57eb7b16 100644 --- a/src/model/s3/block_ref_table.rs +++ b/src/model/s3/block_ref_table.rs @@ -3,8 +3,12 @@ use std::sync::Arc; use garage_db as db; use garage_util::data::*; +use garage_util::error::*; +use garage_util::migrate::Migrate; +use garage_block::CalculateRefcount; use garage_table::crdt::Crdt; +use garage_table::replication::TableShardedReplication; use garage_table::*; use garage_block::manager::*; @@ -84,3 +88,38 @@ impl TableSchema for BlockRefTable { filter.apply(entry.deleted.get()) } } + +pub fn 
block_ref_recount_fn( + block_ref_table: &Arc>, +) -> CalculateRefcount { + let table = Arc::downgrade(block_ref_table); + Box::new(move |tx: &db::Transaction, block: &Hash| { + let table = table + .upgrade() + .ok_or_message("cannot upgrade weak ptr to block_ref_table") + .map_err(db::TxError::Abort)?; + Ok(calculate_refcount(&table, tx, block)?) + }) +} + +fn calculate_refcount( + block_ref_table: &Table, + tx: &db::Transaction, + block: &Hash, +) -> db::TxResult { + let mut result = 0; + for entry in tx.range(&block_ref_table.data.store, block.as_slice()..)? { + let (key, value) = entry?; + if &key[..32] != block.as_slice() { + break; + } + let value = BlockRef::decode(&value) + .ok_or_message("could not decode block_ref") + .map_err(db::TxError::Abort)?; + assert_eq!(value.block, *block); + if !value.deleted.get() { + result += 1; + } + } + Ok(result) +} diff --git a/src/util/data.rs b/src/util/data.rs index 2579fd1b..1fe7dfe0 100644 --- a/src/util/data.rs +++ b/src/util/data.rs @@ -83,6 +83,19 @@ impl FixedBytes32 { ret.copy_from_slice(by); Some(Self(ret)) } + /// Return the next hash + pub fn increment(&self) -> Option { + let mut ret = *self; + for byte in ret.0.iter_mut().rev() { + if *byte == u8::MAX { + *byte = 0; + } else { + *byte = *byte + 1; + return Some(ret); + } + } + return None; + } } impl From for FixedBytes32 { @@ -140,3 +153,25 @@ pub fn fasthash(data: &[u8]) -> FastHash { pub fn gen_uuid() -> Uuid { rand::thread_rng().gen::<[u8; 32]>().into() } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_increment() { + let zero: FixedBytes32 = [0u8; 32].into(); + let mut one: FixedBytes32 = [0u8; 32].into(); + one.0[31] = 1; + let max: FixedBytes32 = [0xFFu8; 32].into(); + assert_eq!(zero.increment(), Some(one)); + assert_eq!(max.increment(), None); + + let mut test: FixedBytes32 = [0u8; 32].into(); + let i = 0x198DF97209F8FFFFu64; + test.0[24..32].copy_from_slice(&u64::to_be_bytes(i)); + let mut test2: FixedBytes32 = [0u8; 32].into(); 
+ test2.0[24..32].copy_from_slice(&u64::to_be_bytes(i + 1)); + assert_eq!(test.increment(), Some(test2)); + } +} diff --git a/src/util/error.rs b/src/util/error.rs index da9eda10..75fd3f9c 100644 --- a/src/util/error.rs +++ b/src/util/error.rs @@ -70,6 +70,9 @@ pub enum Error { #[error(display = "Corrupt data: does not match hash {:?}", _0)] CorruptData(Hash), + #[error(display = "Missing block {:?}: no node returned a valid block", _0)] + MissingBlock(Hash), + #[error(display = "{}", _0)] Message(String), } -- cgit v1.2.3 From 3165ab926c665b795eab7a227f65a67a0874641e Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 19 Mar 2024 16:09:47 +0100 Subject: [block-ref-repair] rename rc's rc field to rc_table --- src/block/manager.rs | 4 ++-- src/block/rc.rs | 28 ++++++++++++++-------------- src/block/repair.rs | 2 +- src/garage/repair/online.rs | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/block/manager.rs b/src/block/manager.rs index 628ffc71..8ee33096 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -156,7 +156,7 @@ impl BlockManager { let metrics = BlockManagerMetrics::new( config.compression_level, - rc.rc.clone(), + rc.rc_table.clone(), resync.queue.clone(), resync.errors.clone(), ); @@ -387,7 +387,7 @@ impl BlockManager { /// Get number of items in the refcount table pub fn rc_len(&self) -> Result { - Ok(self.rc.rc.len()?) + Ok(self.rc.rc_table.len()?) 
} /// Send command to start/stop/manager scrub worker diff --git a/src/block/rc.rs b/src/block/rc.rs index bf5aeced..4a55ee29 100644 --- a/src/block/rc.rs +++ b/src/block/rc.rs @@ -14,14 +14,14 @@ pub type CalculateRefcount = Box db::TxResult + Send + Sync>; pub struct BlockRc { - pub rc: db::Tree, + pub rc_table: db::Tree, pub(crate) recalc_rc: ArcSwapOption>, } impl BlockRc { pub(crate) fn new(rc: db::Tree) -> Self { Self { - rc, + rc_table: rc, recalc_rc: ArcSwapOption::new(None), } } @@ -33,9 +33,9 @@ impl BlockRc { tx: &mut db::Transaction, hash: &Hash, ) -> db::TxOpResult { - let old_rc = RcEntry::parse_opt(tx.get(&self.rc, hash)?); + let old_rc = RcEntry::parse_opt(tx.get(&self.rc_table, hash)?); match old_rc.increment().serialize() { - Some(x) => tx.insert(&self.rc, hash, x)?, + Some(x) => tx.insert(&self.rc_table, hash, x)?, None => unreachable!(), }; Ok(old_rc.is_zero()) @@ -48,28 +48,28 @@ impl BlockRc { tx: &mut db::Transaction, hash: &Hash, ) -> db::TxOpResult { - let new_rc = RcEntry::parse_opt(tx.get(&self.rc, hash)?).decrement(); + let new_rc = RcEntry::parse_opt(tx.get(&self.rc_table, hash)?).decrement(); match new_rc.serialize() { - Some(x) => tx.insert(&self.rc, hash, x)?, - None => tx.remove(&self.rc, hash)?, + Some(x) => tx.insert(&self.rc_table, hash, x)?, + None => tx.remove(&self.rc_table, hash)?, }; Ok(matches!(new_rc, RcEntry::Deletable { .. 
})) } /// Read a block's reference count pub(crate) fn get_block_rc(&self, hash: &Hash) -> Result { - Ok(RcEntry::parse_opt(self.rc.get(hash.as_ref())?)) + Ok(RcEntry::parse_opt(self.rc_table.get(hash.as_ref())?)) } /// Delete an entry in the RC table if it is deletable and the /// deletion time has passed pub(crate) fn clear_deleted_block_rc(&self, hash: &Hash) -> Result<(), Error> { let now = now_msec(); - self.rc.db().transaction(|tx| { - let rcval = RcEntry::parse_opt(tx.get(&self.rc, hash)?); + self.rc_table.db().transaction(|tx| { + let rcval = RcEntry::parse_opt(tx.get(&self.rc_table, hash)?); match rcval { RcEntry::Deletable { at_time } if now > at_time => { - tx.remove(&self.rc, hash)?; + tx.remove(&self.rc_table, hash)?; } _ => (), }; @@ -84,14 +84,14 @@ impl BlockRc { if let Some(recalc_fns) = self.recalc_rc.load().as_ref() { trace!("Repair block RC for {:?}", hash); let res = self - .rc + .rc_table .db() .transaction(|tx| { let mut cnt = 0; for f in recalc_fns.iter() { cnt += f(&tx, hash)?; } - let old_rc = RcEntry::parse_opt(tx.get(&self.rc, hash)?); + let old_rc = RcEntry::parse_opt(tx.get(&self.rc_table, hash)?); trace!( "Block RC for {:?}: stored={}, calculated={}", hash, @@ -112,7 +112,7 @@ impl BlockRc { at_time: now_msec() + BLOCK_GC_DELAY.as_millis() as u64, } }; - tx.insert(&self.rc, hash, new_rc.serialize().unwrap())?; + tx.insert(&self.rc_table, hash, new_rc.serialize().unwrap())?; Ok((cnt, true)) } else { Ok((cnt, false)) diff --git a/src/block/repair.rs b/src/block/repair.rs index 2c8acbc9..ef271094 100644 --- a/src/block/repair.rs +++ b/src/block/repair.rs @@ -107,7 +107,7 @@ impl Worker for RepairWorker { for entry in self .manager .rc - .rc + .rc_table .range::<&[u8], _>((start_bound, Bound::Unbounded))? 
{ let (hash, _) = entry?; diff --git a/src/garage/repair/online.rs b/src/garage/repair/online.rs index ecccdf6d..2c5227d2 100644 --- a/src/garage/repair/online.rs +++ b/src/garage/repair/online.rs @@ -337,7 +337,7 @@ impl Worker for BlockRcRepair { let next1 = self .block_manager .rc - .rc + .rc_table .range(self.cursor.as_slice()..)? .next() .transpose()? -- cgit v1.2.3 From 3eab639c146f67fc67534633ae26c9aec116327d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 19 Mar 2024 16:24:34 +0100 Subject: [block-ref-repair] mention `garage block repair-rc` in documentation --- doc/book/operations/durability-repairs.md | 5 ++++- src/garage/cli/util.rs | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/book/operations/durability-repairs.md b/doc/book/operations/durability-repairs.md index c76dc39e..fdf163e2 100644 --- a/doc/book/operations/durability-repairs.md +++ b/doc/book/operations/durability-repairs.md @@ -141,4 +141,7 @@ blocks may still be held by Garage. If you suspect that such corruption has occu in your cluster, you can run one of the following repair procedures: - `garage repair versions`: checks that all versions belong to a non-deleted object, and purges any orphan version -- `garage repair block_refs`: checks that all block references belong to a non-deleted object version, and purges any orphan block reference (this will then allow the blocks to be garbage-collected) + +- `garage repair block-refs`: checks that all block references belong to a non-deleted object version, and purges any orphan block reference (this will then allow the blocks to be garbage-collected) + +- `garage repair block-rc`: checks that the reference counters for blocks are in sync with the actual number of non-deleted entries in the block reference table diff --git a/src/garage/cli/util.rs b/src/garage/cli/util.rs index 0511e2b1..21c14f42 100644 --- a/src/garage/cli/util.rs +++ b/src/garage/cli/util.rs @@ -451,7 +451,7 @@ pub fn print_block_info( if 
refcount != nondeleted_count { println!(); println!( - "Warning: refcount does not match number of non-deleted versions (see issue #644)." + "Warning: refcount does not match number of non-deleted versions, you should try `garage repair block-rc`." ); } } -- cgit v1.2.3 From 5225a81dee21603950e7944cd93c40fdb1bd8feb Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 21 Mar 2024 09:47:04 +0100 Subject: [net-fixes] peering: only count node IDs and not addresses in hash --- src/net/peering.rs | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/src/net/peering.rs b/src/net/peering.rs index 61882a18..f4283683 100644 --- a/src/net/peering.rs +++ b/src/net/peering.rs @@ -164,29 +164,40 @@ struct KnownHosts { impl KnownHosts { fn new() -> Self { let list = HashMap::new(); - let hash = Self::calculate_hash(vec![]); - Self { list, hash } + let mut ret = Self { + list, + hash: hash::Digest::from_slice(&[0u8; 64][..]).unwrap(), + }; + ret.update_hash(); + ret } fn update_hash(&mut self) { - self.hash = Self::calculate_hash(self.connected_peers_vec()); - } - fn connected_peers_vec(&self) -> Vec<(NodeID, SocketAddr)> { - let mut list = Vec::with_capacity(self.list.len()); - for (id, peer) in self.list.iter() { - if peer.state.is_up() { - list.push((*id, peer.addr)); - } - } - list - } - fn calculate_hash(mut list: Vec<(NodeID, SocketAddr)>) -> hash::Digest { + // The hash is a value that is exchanged between nodes when they ping one + // another. Nodes compare their known hosts hash to know if they are connected + // to the same set of nodes. If the hashes differ, they are connected to + // different nodes and they trigger an exchange of the full list of active + // connections. The hash value only represents the set of node IDs and not + // their actual socket addresses, because nodes can be connected via different + // addresses and that shouldn't necessarily trigger a full peer exchange. 
+ let mut list = self + .list + .iter() + .filter(|(_, peer)| peer.state.is_up()) + .map(|(id, _)| *id) + .collect::>(); list.sort(); let mut hash_state = hash::State::new(); - for (id, addr) in list { + for id in list { hash_state.update(&id[..]); - hash_state.update(&format!("{}\n", addr).into_bytes()[..]); } - hash_state.finalize() + self.hash = hash_state.finalize(); + } + fn connected_peers_vec(&self) -> Vec<(NodeID, SocketAddr)> { + self.list + .iter() + .filter(|(_, peer)| peer.state.is_up()) + .map(|(id, peer)| (*id, peer.addr)) + .collect::>() } } -- cgit v1.2.3 From 961b4f9af36a7fb5d3a661ac19e8f2c168bb48ae Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 21 Mar 2024 10:45:34 +0100 Subject: [net-fixes] fix issues with local peer address (fix #761) --- src/api/admin/cluster.rs | 2 +- src/garage/cli/cmd.rs | 21 ++++---- src/net/netapp.rs | 51 +++++++++---------- src/net/peering.rs | 124 +++++++++++++++++++++-------------------------- src/rpc/system.rs | 15 ++++-- 5 files changed, 100 insertions(+), 113 deletions(-) diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 8ce6c5ed..8c9cb1e5 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -27,7 +27,7 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result, rpc_host: NodeID) -> vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; for adv in status.iter().filter(|adv| adv.is_up) { let host = adv.status.hostname.as_deref().unwrap_or("?"); + let addr = match adv.addr { + Some(addr) => addr.to_string(), + None => "N/A".to_string(), + }; if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { let data_avail = match &adv.status.data_disk_avail { _ if cfg.capacity.is_none() => "N/A".into(), @@ -71,7 +75,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", id = adv.id, host = host, - addr = adv.addr, + addr = addr, tags = 
cfg.tags.join(","), zone = cfg.zone, capacity = cfg.capacity_string(), @@ -91,7 +95,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...", id = adv.id, host = host, - addr = adv.addr, + addr = addr, tags = cfg.tags.join(","), zone = cfg.zone, )); @@ -104,7 +108,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> "{id:?}\t{h}\t{addr}\t\t\t{new_role}", id = adv.id, h = host, - addr = adv.addr, + addr = addr, new_role = new_role, )); } @@ -120,8 +124,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> let tf = timeago::Formatter::new(); let mut drain_msg = false; - let mut failed_nodes = - vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; + let mut failed_nodes = vec!["ID\tHostname\tTags\tZone\tCapacity\tLast seen".to_string()]; let mut listed = HashSet::new(); for ver in layout.versions.iter().rev() { for (node, _, role) in ver.roles.items().iter() { @@ -142,15 +145,14 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> // Node is in a layout version, is not a gateway node, and is not up: // it is in a failed state, add proper line to the output - let (host, addr, last_seen) = match adv { + let (host, last_seen) = match adv { Some(adv) => ( adv.status.hostname.as_deref().unwrap_or("?"), - adv.addr.to_string(), adv.last_seen_secs_ago .map(|s| tf.convert(Duration::from_secs(s))) .unwrap_or_else(|| "never seen".into()), ), - None => ("??", "??".into(), "never seen".into()), + None => ("??", "never seen".into()), }; let capacity = if ver.version == layout.current().version { cfg.capacity_string() @@ -159,10 +161,9 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> "draining metadata...".to_string() }; failed_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", + "{id:?}\t{host}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", id = node, host = host, - addr = addr, 
tags = cfg.tags.join(","), zone = cfg.zone, capacity = capacity, diff --git a/src/net/netapp.rs b/src/net/netapp.rs index faa51a99..6480a126 100644 --- a/src/net/netapp.rs +++ b/src/net/netapp.rs @@ -292,13 +292,7 @@ impl NetApp { /// the other node with `Netapp::request` pub async fn try_connect(self: Arc, ip: SocketAddr, id: NodeID) -> Result<(), Error> { // Don't connect to ourself, we don't care - // but pretend we did if id == self.id { - tokio::spawn(async move { - if let Some(h) = self.on_connected_handler.load().as_ref() { - h(id, ip, false); - } - }); return Ok(()); } @@ -327,31 +321,32 @@ impl NetApp { /// Close the outgoing connection we have to a node specified by its public key, /// if such a connection is currently open. pub fn disconnect(self: &Arc, id: &NodeID) { + let conn = self.client_conns.write().unwrap().remove(id); + // If id is ourself, we're not supposed to have a connection open - if *id != self.id { - let conn = self.client_conns.write().unwrap().remove(id); - if let Some(c) = conn { - debug!( - "Closing connection to {} ({})", - hex::encode(&c.peer_id[..8]), - c.remote_addr - ); - c.close(); - } else { - return; - } + if *id == self.id { + // sanity check + assert!(conn.is_none(), "had a connection to local node"); + return; } - // call on_disconnected_handler immediately, since the connection - // was removed - // (if id == self.id, we pretend we disconnected) - let id = *id; - let self2 = self.clone(); - tokio::spawn(async move { - if let Some(h) = self2.on_disconnected_handler.load().as_ref() { - h(id, false); - } - }); + if let Some(c) = conn { + debug!( + "Closing connection to {} ({})", + hex::encode(&c.peer_id[..8]), + c.remote_addr + ); + c.close(); + + // call on_disconnected_handler immediately, since the connection was removed + let id = *id; + let self2 = self.clone(); + tokio::spawn(async move { + if let Some(h) = self2.on_disconnected_handler.load().as_ref() { + h(id, false); + } + }); + } } // Called from conn.rs when an 
incoming connection is successfully established diff --git a/src/net/peering.rs b/src/net/peering.rs index f4283683..0b4fec9e 100644 --- a/src/net/peering.rs +++ b/src/net/peering.rs @@ -43,7 +43,7 @@ impl Message for PingMessage { #[derive(Serialize, Deserialize)] struct PeerListMessage { - pub list: Vec<(NodeID, SocketAddr)>, + pub list: Vec<(NodeID, Vec)>, } impl Message for PeerListMessage { @@ -54,12 +54,8 @@ impl Message for PeerListMessage { #[derive(Debug)] struct PeerInfoInternal { - // addr is the currently connected address, - // or the last address we were connected to, - // or an arbitrary address some other peer gave us - addr: SocketAddr, - // all_addrs contains all of the addresses everyone gave us - all_addrs: Vec, + // known_addrs contains all of the addresses everyone gave us + known_addrs: Vec, state: PeerConnState, last_send_ping: Option, @@ -69,10 +65,9 @@ struct PeerInfoInternal { } impl PeerInfoInternal { - fn new(addr: SocketAddr, state: PeerConnState) -> Self { + fn new(state: PeerConnState, known_addr: Option) -> Self { Self { - addr, - all_addrs: vec![addr], + known_addrs: known_addr.map(|x| vec![x]).unwrap_or_default(), state, last_send_ping: None, last_seen: None, @@ -81,8 +76,8 @@ impl PeerInfoInternal { } } fn add_addr(&mut self, addr: SocketAddr) -> bool { - if !self.all_addrs.contains(&addr) { - self.all_addrs.push(addr); + if !self.known_addrs.contains(&addr) { + self.known_addrs.push(addr); // If we are learning a new address for this node, // we want to retry connecting self.state = match self.state { @@ -90,7 +85,7 @@ impl PeerInfoInternal { PeerConnState::Waiting(_, _) | PeerConnState::Abandonned => { PeerConnState::Waiting(0, Instant::now()) } - x @ (PeerConnState::Ourself | PeerConnState::Connected) => x, + x @ (PeerConnState::Ourself | PeerConnState::Connected { .. 
}) => x, }; true } else { @@ -104,8 +99,6 @@ impl PeerInfoInternal { pub struct PeerInfo { /// The node's identifier (its public key) pub id: NodeID, - /// The node's network address - pub addr: SocketAddr, /// The current status of our connection to this node pub state: PeerConnState, /// The last time at which the node was seen @@ -136,7 +129,7 @@ pub enum PeerConnState { Ourself, /// We currently have a connection to this peer - Connected, + Connected { addr: SocketAddr }, /// Our next connection tentative (the nth, where n is the first value of the tuple) /// will be at given Instant @@ -152,7 +145,7 @@ pub enum PeerConnState { impl PeerConnState { /// Returns true if we can currently send requests to this peer pub fn is_up(&self) -> bool { - matches!(self, Self::Ourself | Self::Connected) + matches!(self, Self::Ourself | Self::Connected { .. }) } } @@ -192,11 +185,11 @@ impl KnownHosts { } self.hash = hash_state.finalize(); } - fn connected_peers_vec(&self) -> Vec<(NodeID, SocketAddr)> { + fn connected_peers_vec(&self) -> Vec<(NodeID, Vec)> { self.list .iter() .filter(|(_, peer)| peer.state.is_up()) - .map(|(id, peer)| (*id, peer.addr)) + .map(|(id, peer)| (*id, peer.known_addrs.clone())) .collect::>() } } @@ -231,18 +224,16 @@ impl PeeringManager { if id != netapp.id { known_hosts.list.insert( id, - PeerInfoInternal::new(addr, PeerConnState::Waiting(0, Instant::now())), + PeerInfoInternal::new(PeerConnState::Waiting(0, Instant::now()), Some(addr)), ); } } - if let Some(addr) = our_addr { - known_hosts.list.insert( - netapp.id, - PeerInfoInternal::new(addr, PeerConnState::Ourself), - ); - known_hosts.update_hash(); - } + known_hosts.list.insert( + netapp.id, + PeerInfoInternal::new(PeerConnState::Ourself, our_addr), + ); + known_hosts.update_hash(); // TODO for v0.10 / v1.0 : rename the endpoint (it will break compatibility) let strat = Arc::new(Self { @@ -287,7 +278,7 @@ impl PeeringManager { for (id, info) in known_hosts.list.iter() { trace!("{}, {:?}", 
hex::encode(&id[..8]), info); match info.state { - PeerConnState::Connected => { + PeerConnState::Connected { .. } => { let must_ping = match info.last_send_ping { None => true, Some(t) => Instant::now() - t > PING_INTERVAL, @@ -330,7 +321,7 @@ impl PeeringManager { info!( "Retrying connection to {} at {} ({})", hex::encode(&id[..8]), - h.all_addrs + h.known_addrs .iter() .map(|x| format!("{}", x)) .collect::>() @@ -339,13 +330,8 @@ impl PeeringManager { ); h.state = PeerConnState::Trying(i); - let alternate_addrs = h - .all_addrs - .iter() - .filter(|x| **x != h.addr) - .cloned() - .collect::>(); - tokio::spawn(self.clone().try_connect(id, h.addr, alternate_addrs)); + let addresses = h.known_addrs.clone(); + tokio::spawn(self.clone().try_connect(id, addresses)); } } } @@ -373,27 +359,24 @@ impl PeeringManager { fn update_public_peer_list(&self, known_hosts: &KnownHosts) { let mut pub_peer_list = Vec::with_capacity(known_hosts.list.len()); for (id, info) in known_hosts.list.iter() { + if *id == self.netapp.id { + // sanity check + assert!(matches!(info.state, PeerConnState::Ourself)); + } let mut pings = info.ping.iter().cloned().collect::>(); pings.sort(); if !pings.is_empty() { pub_peer_list.push(PeerInfo { id: *id, - addr: info.addr, state: info.state, last_seen: info.last_seen, - avg_ping: Some( - pings - .iter() - .fold(Duration::from_secs(0), |x, y| x + *y) - .div_f64(pings.len() as f64), - ), + avg_ping: Some(pings.iter().sum::().div_f64(pings.len() as f64)), max_ping: pings.last().cloned(), med_ping: Some(pings[pings.len() / 2]), }); } else { pub_peer_list.push(PeerInfo { id: *id, - addr: info.addr, state: info.state, last_seen: info.last_seen, avg_ping: None, @@ -485,18 +468,20 @@ impl PeeringManager { } } - fn handle_peer_list(&self, list: &[(NodeID, SocketAddr)]) { + fn handle_peer_list(&self, list: &[(NodeID, Vec)]) { let mut known_hosts = self.known_hosts.write().unwrap(); let mut changed = false; - for (id, addr) in list.iter() { - if let Some(kh) = 
known_hosts.list.get_mut(id) { - if kh.add_addr(*addr) { + for (id, addrs) in list.iter() { + for addr in addrs.iter() { + if let Some(kh) = known_hosts.list.get_mut(id) { + if kh.add_addr(*addr) { + changed = true; + } + } else { + known_hosts.list.insert(*id, self.new_peer(id, *addr)); changed = true; } - } else { - known_hosts.list.insert(*id, self.new_peer(id, *addr)); - changed = true; } } @@ -506,15 +491,10 @@ impl PeeringManager { } } - async fn try_connect( - self: Arc, - id: NodeID, - default_addr: SocketAddr, - alternate_addrs: Vec, - ) { + async fn try_connect(self: Arc, id: NodeID, addresses: Vec) { let conn_addr = { let mut ret = None; - for addr in [default_addr].iter().chain(alternate_addrs.iter()) { + for addr in addresses.iter() { debug!("Trying address {} for peer {}", addr, hex::encode(&id[..8])); match self.netapp.clone().try_connect(*addr, id).await { Ok(()) => { @@ -540,7 +520,7 @@ impl PeeringManager { warn!( "Could not connect to peer {} ({} addresses tried)", hex::encode(&id[..8]), - 1 + alternate_addrs.len() + addresses.len() ); let mut known_hosts = self.known_hosts.write().unwrap(); if let Some(host) = known_hosts.list.get_mut(&id) { @@ -560,6 +540,14 @@ impl PeeringManager { } fn on_connected(self: &Arc, id: NodeID, addr: SocketAddr, is_incoming: bool) { + if id == self.netapp.id { + // sanity check + panic!( + "on_connected from local node, id={:?}, addr={}, incoming={}", + id, addr, is_incoming + ); + } + let mut known_hosts = self.known_hosts.write().unwrap(); if is_incoming { if let Some(host) = known_hosts.list.get_mut(&id) { @@ -574,13 +562,13 @@ impl PeeringManager { addr ); if let Some(host) = known_hosts.list.get_mut(&id) { - host.state = PeerConnState::Connected; - host.addr = addr; + host.state = PeerConnState::Connected { addr }; host.add_addr(addr); } else { - known_hosts - .list - .insert(id, PeerInfoInternal::new(addr, PeerConnState::Connected)); + known_hosts.list.insert( + id, + 
PeerInfoInternal::new(PeerConnState::Connected { addr }, Some(addr)), + ); } } known_hosts.update_hash(); @@ -600,12 +588,8 @@ impl PeeringManager { } fn new_peer(&self, id: &NodeID, addr: SocketAddr) -> PeerInfoInternal { - let state = if *id == self.netapp.id { - PeerConnState::Ourself - } else { - PeerConnState::Waiting(0, Instant::now()) - }; - PeerInfoInternal::new(addr, state) + assert!(*id != self.netapp.id); + PeerInfoInternal::new(PeerConnState::Waiting(0, Instant::now()), Some(addr)) } } diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 54d589d2..9da1b176 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -16,7 +16,7 @@ use tokio::sync::{watch, Notify}; use garage_net::endpoint::{Endpoint, EndpointHandler}; use garage_net::message::*; -use garage_net::peering::PeeringManager; +use garage_net::peering::{PeerConnState, PeeringManager}; use garage_net::util::parse_and_resolve_peer_addr_async; use garage_net::{NetApp, NetworkKey, NodeID, NodeKey}; @@ -142,7 +142,7 @@ pub struct NodeStatus { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct KnownNodeInfo { pub id: Uuid, - pub addr: SocketAddr, + pub addr: Option, pub is_up: bool, pub last_seen_secs_ago: Option, pub status: NodeStatus, @@ -381,7 +381,11 @@ impl System { .iter() .map(|n| KnownNodeInfo { id: n.id.into(), - addr: n.addr, + addr: match n.state { + PeerConnState::Ourself => self.rpc_public_addr, + PeerConnState::Connected { addr } => Some(addr), + _ => None, + }, is_up: n.is_up(), last_seen_secs_ago: n .last_seen @@ -722,7 +726,10 @@ impl System { .peering .get_peer_list() .iter() - .map(|n| (n.id.into(), n.addr)) + .filter_map(|n| match n.state { + PeerConnState::Connected { addr } => Some((n.id.into(), addr)), + _ => None, + }) .collect::>(); // Before doing it, we read the current peer list file (if it exists) -- cgit v1.2.3 From 3844110cd03210a1600d57db1aab53e41cf4815f Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 21 Mar 2024 10:50:44 +0100 Subject: [net-fixes] 
netapp peer exchange: send only currently connected address --- src/net/peering.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/net/peering.rs b/src/net/peering.rs index 0b4fec9e..b4271231 100644 --- a/src/net/peering.rs +++ b/src/net/peering.rs @@ -43,7 +43,7 @@ impl Message for PingMessage { #[derive(Serialize, Deserialize)] struct PeerListMessage { - pub list: Vec<(NodeID, Vec)>, + pub list: Vec<(NodeID, SocketAddr)>, } impl Message for PeerListMessage { @@ -185,11 +185,13 @@ impl KnownHosts { } self.hash = hash_state.finalize(); } - fn connected_peers_vec(&self) -> Vec<(NodeID, Vec)> { + fn connected_peers_vec(&self) -> Vec<(NodeID, SocketAddr)> { self.list .iter() - .filter(|(_, peer)| peer.state.is_up()) - .map(|(id, peer)| (*id, peer.known_addrs.clone())) + .filter_map(|(id, peer)| match peer.state { + PeerConnState::Connected { addr } => Some((*id, addr)), + _ => None, + }) .collect::>() } } @@ -468,20 +470,18 @@ impl PeeringManager { } } - fn handle_peer_list(&self, list: &[(NodeID, Vec)]) { + fn handle_peer_list(&self, list: &[(NodeID, SocketAddr)]) { let mut known_hosts = self.known_hosts.write().unwrap(); let mut changed = false; - for (id, addrs) in list.iter() { - for addr in addrs.iter() { - if let Some(kh) = known_hosts.list.get_mut(id) { - if kh.add_addr(*addr) { - changed = true; - } - } else { - known_hosts.list.insert(*id, self.new_peer(id, *addr)); + for (id, addr) in list.iter() { + if let Some(kh) = known_hosts.list.get_mut(id) { + if kh.add_addr(*addr) { changed = true; } + } else { + known_hosts.list.insert(*id, self.new_peer(id, *addr)); + changed = true; } } -- cgit v1.2.3 From 74949c69cbf1a8222b6d10a02fcf5fe139ccb560 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 21 Mar 2024 14:06:59 +0100 Subject: [s3-checksum] implement x-amz-checksum-* headers --- Cargo.lock | 8 +- Cargo.nix | 16 +- Cargo.toml | 3 + src/api/Cargo.toml | 3 + src/api/s3/api_server.rs | 2 +- 
src/api/s3/checksum.rs | 406 +++++++++++++++++++++++++++++++++++++++ src/api/s3/copy.rs | 132 +++++++++---- src/api/s3/encryption.rs | 44 ++--- src/api/s3/error.rs | 6 + src/api/s3/get.rs | 147 ++++++++++---- src/api/s3/list.rs | 77 +++++++- src/api/s3/mod.rs | 1 + src/api/s3/multipart.rs | 159 ++++++++++++--- src/api/s3/post_object.rs | 28 ++- src/api/s3/put.rs | 221 ++++++++++----------- src/api/s3/xml.rs | 33 ++++ src/garage/Cargo.toml | 1 + src/garage/tests/s3/multipart.rs | 158 ++++++++++++++- src/model/s3/mpu_table.rs | 9 + src/model/s3/object_table.rs | 53 ++++- src/util/async_hash.rs | 61 ------ src/util/lib.rs | 1 - 22 files changed, 1228 insertions(+), 341 deletions(-) create mode 100644 src/api/s3/checksum.rs delete mode 100644 src/util/async_hash.rs diff --git a/Cargo.lock b/Cargo.lock index d806c253..c3dee8c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -905,9 +905,9 @@ dependencies = [ [[package]] name = "crc32fast" -version = "1.3.2" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa" dependencies = [ "cfg-if", ] @@ -1346,6 +1346,7 @@ dependencies = [ "serde", "serde_bytes", "serde_json", + "sha1", "sha2", "static_init", "structopt", @@ -1367,6 +1368,8 @@ dependencies = [ "base64 0.21.7", "bytes", "chrono", + "crc32c", + "crc32fast", "crypto-common", "err-derive", "form_urlencoded", @@ -1400,6 +1403,7 @@ dependencies = [ "serde", "serde_bytes", "serde_json", + "sha1", "sha2", "tokio", "tokio-stream", diff --git a/Cargo.nix b/Cargo.nix index fb55dc4b..e5b975b8 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -34,7 +34,7 @@ args@{ ignoreLockHash, }: let - nixifiedLockHash = "f73523af24b5164222da0a1c326ba65fa4a01b55751dd9ddab251334cfe20d13"; + nixifiedLockHash = "a49da9d5ef560672a34c1e004c0122e706a74fac512300f20858f136cd00582e"; workspaceSrc = if args.workspaceSrc == null 
then ./. else args.workspaceSrc; currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock); lockHashIgnored = if ignoreLockHash @@ -674,7 +674,7 @@ in aws_smithy_types = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".aws-smithy-types."1.1.4" { inherit profileName; }).out; bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.5.0" { inherit profileName; }).out; crc32c = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32c."0.6.4" { inherit profileName; }).out; - crc32fast = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.3.2" { inherit profileName; }).out; + crc32fast = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.4.0" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."0.2.11" { inherit profileName; }).out; http_body = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http-body."0.4.6" { inherit profileName; }).out; @@ -694,7 +694,7 @@ in dependencies = { aws_smithy_types = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".aws-smithy-types."1.1.4" { inherit profileName; }).out; bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.5.0" { inherit profileName; }).out; - crc32fast = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.3.2" { inherit profileName; }).out; + crc32fast = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.4.0" { inherit profileName; }).out; }; }); @@ -1287,11 +1287,11 @@ in }; }); - "registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.3.2" = overridableMkRustCrate (profileName: rec { + 
"registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.4.0" = overridableMkRustCrate (profileName: rec { name = "crc32fast"; - version = "1.3.2"; + version = "1.4.0"; registry = "registry+https://github.com/rust-lang/crates.io-index"; - src = fetchCratesIo { inherit name version; sha256 = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"; }; + src = fetchCratesIo { inherit name version; sha256 = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa"; }; features = builtins.concatLists [ [ "default" ] [ "std" ] @@ -1958,6 +1958,7 @@ in rand = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.5" { inherit profileName; }).out; serde = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde."1.0.196" { inherit profileName; }).out; serde_bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde_bytes."0.11.14" { inherit profileName; }).out; + sha1 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".sha1."0.10.6" { inherit profileName; }).out; structopt = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".structopt."0.3.26" { inherit profileName; }).out; timeago = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".timeago."0.4.2" { inherit profileName; }).out; tokio = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio."1.36.0" { inherit profileName; }).out; @@ -2003,6 +2004,8 @@ in base64 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".base64."0.21.7" { inherit profileName; }).out; bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.5.0" { inherit profileName; }).out; chrono = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".chrono."0.4.33" { inherit profileName; }).out; + crc32c = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32c."0.6.4" { inherit profileName; 
}).out; + crc32fast = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crc32fast."1.4.0" { inherit profileName; }).out; crypto_common = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".crypto-common."0.1.6" { inherit profileName; }).out; err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out; form_urlencoded = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".form_urlencoded."1.2.1" { inherit profileName; }).out; @@ -2036,6 +2039,7 @@ in serde = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde."1.0.196" { inherit profileName; }).out; serde_bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde_bytes."0.11.14" { inherit profileName; }).out; serde_json = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde_json."1.0.113" { inherit profileName; }).out; + sha1 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".sha1."0.10.6" { inherit profileName; }).out; sha2 = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".sha2."0.10.8" { inherit profileName; }).out; tokio = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio."1.36.0" { inherit profileName; }).out; tokio_stream = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".tokio-stream."0.1.14" { inherit profileName; }).out; diff --git a/Cargo.toml b/Cargo.toml index cda3d0fc..16d3df4c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,8 @@ bytes = "1.0" bytesize = "1.1" cfg-if = "1.0" chrono = "0.4" +crc32fast = "1.4" +crc32c = "0.6" crypto-common = "0.1" digest = "0.10" err-derive = "0.3" @@ -62,6 +64,7 @@ parse_duration = "2.1" pin-project = "1.0.12" pnet_datalink = "0.34" rand = "0.8" +sha1 = "0.10" sha2 = "0.10" timeago = { version = "0.4", default-features = false } xxhash-rust = { version = "0.8", 
default-features = false, features = ["xxh3"] } diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index bcf6a537..1b87496c 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -28,6 +28,8 @@ async-trait.workspace = true base64.workspace = true bytes.workspace = true chrono.workspace = true +crc32fast.workspace = true +crc32c.workspace = true crypto-common.workspace = true err-derive.workspace = true hex.workspace = true @@ -37,6 +39,7 @@ tracing.workspace = true md-5.workspace = true nom.workspace = true pin-project.workspace = true +sha1.workspace = true sha2.workspace = true futures.workspace = true diff --git a/src/api/s3/api_server.rs b/src/api/s3/api_server.rs index 1ed30996..1737af33 100644 --- a/src/api/s3/api_server.rs +++ b/src/api/s3/api_server.rs @@ -325,7 +325,7 @@ impl ApiHandler for S3ApiServer { part_number_marker: part_number_marker.map(|p| p.min(10000)), max_parts: max_parts.unwrap_or(1000).clamp(1, 1000), }; - handle_list_parts(ctx, &query).await + handle_list_parts(ctx, req, &query).await } Endpoint::DeleteObjects {} => handle_delete_objects(ctx, req, content_sha256).await, Endpoint::GetBucketWebsite {} => handle_get_website(ctx).await, diff --git a/src/api/s3/checksum.rs b/src/api/s3/checksum.rs new file mode 100644 index 00000000..c9dc001c --- /dev/null +++ b/src/api/s3/checksum.rs @@ -0,0 +1,406 @@ +use std::convert::{TryFrom, TryInto}; +use std::hash::Hasher; + +use base64::prelude::*; +use crc32c::Crc32cHasher as Crc32c; +use crc32fast::Hasher as Crc32; +use md5::{Digest, Md5}; +use sha1::Sha1; +use sha2::Sha256; + +use http::{HeaderMap, HeaderName, HeaderValue}; + +use garage_util::data::*; +use garage_util::error::OkOrMessage; + +use garage_model::s3::object_table::*; + +use crate::s3::error::*; + +pub const X_AMZ_CHECKSUM_ALGORITHM: HeaderName = + HeaderName::from_static("x-amz-checksum-algorithm"); +pub const X_AMZ_CHECKSUM_MODE: HeaderName = HeaderName::from_static("x-amz-checksum-mode"); +pub const X_AMZ_CHECKSUM_CRC32: 
HeaderName = HeaderName::from_static("x-amz-checksum-crc32"); +pub const X_AMZ_CHECKSUM_CRC32C: HeaderName = HeaderName::from_static("x-amz-checksum-crc32c"); +pub const X_AMZ_CHECKSUM_SHA1: HeaderName = HeaderName::from_static("x-amz-checksum-sha1"); +pub const X_AMZ_CHECKSUM_SHA256: HeaderName = HeaderName::from_static("x-amz-checksum-sha256"); + +pub type Crc32Checksum = [u8; 4]; +pub type Crc32cChecksum = [u8; 4]; +pub type Md5Checksum = [u8; 16]; +pub type Sha1Checksum = [u8; 20]; +pub type Sha256Checksum = [u8; 32]; + +#[derive(Debug, Default)] +pub(crate) struct ExpectedChecksums { + // base64-encoded md5 (content-md5 header) + pub md5: Option, + // content_sha256 (as a Hash / FixedBytes32) + pub sha256: Option, + // extra x-amz-checksum-* header + pub extra: Option, +} + +pub(crate) struct Checksummer { + pub crc32: Option, + pub crc32c: Option, + pub md5: Option, + pub sha1: Option, + pub sha256: Option, +} + +#[derive(Default)] +pub(crate) struct Checksums { + pub crc32: Option, + pub crc32c: Option, + pub md5: Option, + pub sha1: Option, + pub sha256: Option, +} + +impl Checksummer { + pub(crate) fn init(expected: &ExpectedChecksums, require_md5: bool) -> Self { + let mut ret = Self { + crc32: None, + crc32c: None, + md5: None, + sha1: None, + sha256: None, + }; + + if expected.md5.is_some() || require_md5 { + ret.md5 = Some(Md5::new()); + } + if expected.sha256.is_some() || matches!(&expected.extra, Some(ChecksumValue::Sha256(_))) { + ret.sha256 = Some(Sha256::new()); + } + if matches!(&expected.extra, Some(ChecksumValue::Crc32(_))) { + ret.crc32 = Some(Crc32::new()); + } + if matches!(&expected.extra, Some(ChecksumValue::Crc32c(_))) { + ret.crc32c = Some(Crc32c::default()); + } + if matches!(&expected.extra, Some(ChecksumValue::Sha1(_))) { + ret.sha1 = Some(Sha1::new()); + } + ret + } + + pub(crate) fn add(mut self, algo: Option) -> Self { + match algo { + Some(ChecksumAlgorithm::Crc32) => { + self.crc32 = Some(Crc32::new()); + } + 
Some(ChecksumAlgorithm::Crc32c) => { + self.crc32c = Some(Crc32c::default()); + } + Some(ChecksumAlgorithm::Sha1) => { + self.sha1 = Some(Sha1::new()); + } + Some(ChecksumAlgorithm::Sha256) => { + self.sha256 = Some(Sha256::new()); + } + None => (), + } + self + } + + pub(crate) fn update(&mut self, bytes: &[u8]) { + if let Some(crc32) = &mut self.crc32 { + crc32.update(bytes); + } + if let Some(crc32c) = &mut self.crc32c { + crc32c.write(bytes); + } + if let Some(md5) = &mut self.md5 { + md5.update(bytes); + } + if let Some(sha1) = &mut self.sha1 { + sha1.update(bytes); + } + if let Some(sha256) = &mut self.sha256 { + sha256.update(bytes); + } + } + + pub(crate) fn finalize(self) -> Checksums { + Checksums { + crc32: self.crc32.map(|x| u32::to_be_bytes(x.finalize())), + crc32c: self + .crc32c + .map(|x| u32::to_be_bytes(u32::try_from(x.finish()).unwrap())), + md5: self.md5.map(|x| x.finalize()[..].try_into().unwrap()), + sha1: self.sha1.map(|x| x.finalize()[..].try_into().unwrap()), + sha256: self.sha256.map(|x| x.finalize()[..].try_into().unwrap()), + } + } +} + +impl Checksums { + pub fn verify(&self, expected: &ExpectedChecksums) -> Result<(), Error> { + if let Some(expected_md5) = &expected.md5 { + match self.md5 { + Some(md5) if BASE64_STANDARD.encode(&md5) == expected_md5.trim_matches('"') => (), + _ => { + return Err(Error::InvalidDigest( + "MD5 checksum verification failed (from content-md5)".into(), + )) + } + } + } + if let Some(expected_sha256) = &expected.sha256 { + match self.sha256 { + Some(sha256) if &sha256[..] 
== expected_sha256.as_slice() => (), + _ => { + return Err(Error::InvalidDigest( + "SHA256 checksum verification failed (from x-amz-content-sha256)".into(), + )) + } + } + } + if let Some(extra) = expected.extra { + let algo = extra.algorithm(); + if self.extract(Some(algo)) != Some(extra) { + return Err(Error::InvalidDigest(format!( + "Failed to validate checksum for algorithm {:?}", + algo + ))); + } + } + Ok(()) + } + + pub fn extract(&self, algo: Option) -> Option { + match algo { + None => None, + Some(ChecksumAlgorithm::Crc32) => Some(ChecksumValue::Crc32(self.crc32.unwrap())), + Some(ChecksumAlgorithm::Crc32c) => Some(ChecksumValue::Crc32c(self.crc32c.unwrap())), + Some(ChecksumAlgorithm::Sha1) => Some(ChecksumValue::Sha1(self.sha1.unwrap())), + Some(ChecksumAlgorithm::Sha256) => Some(ChecksumValue::Sha256(self.sha256.unwrap())), + } + } +} + +// ---- + +#[derive(Default)] +pub(crate) struct MultipartChecksummer { + pub md5: Md5, + pub extra: Option, +} + +pub(crate) enum MultipartExtraChecksummer { + Crc32(Crc32), + Crc32c(Crc32c), + Sha1(Sha1), + Sha256(Sha256), +} + +impl MultipartChecksummer { + pub(crate) fn init(algo: Option) -> Self { + Self { + md5: Md5::new(), + extra: match algo { + None => None, + Some(ChecksumAlgorithm::Crc32) => { + Some(MultipartExtraChecksummer::Crc32(Crc32::new())) + } + Some(ChecksumAlgorithm::Crc32c) => { + Some(MultipartExtraChecksummer::Crc32c(Crc32c::default())) + } + Some(ChecksumAlgorithm::Sha1) => Some(MultipartExtraChecksummer::Sha1(Sha1::new())), + Some(ChecksumAlgorithm::Sha256) => { + Some(MultipartExtraChecksummer::Sha256(Sha256::new())) + } + }, + } + } + + pub(crate) fn update( + &mut self, + etag: &str, + checksum: Option, + ) -> Result<(), Error> { + self.md5 + .update(&hex::decode(&etag).ok_or_message("invalid etag hex")?); + match (&mut self.extra, checksum) { + (None, _) => (), + ( + Some(MultipartExtraChecksummer::Crc32(ref mut crc32)), + Some(ChecksumValue::Crc32(x)), + ) => { + crc32.update(&x); + } + ( 
+ Some(MultipartExtraChecksummer::Crc32c(ref mut crc32c)), + Some(ChecksumValue::Crc32c(x)), + ) => { + crc32c.write(&x); + } + (Some(MultipartExtraChecksummer::Sha1(ref mut sha1)), Some(ChecksumValue::Sha1(x))) => { + sha1.update(&x); + } + ( + Some(MultipartExtraChecksummer::Sha256(ref mut sha256)), + Some(ChecksumValue::Sha256(x)), + ) => { + sha256.update(&x); + } + (Some(_), b) => { + return Err(Error::internal_error(format!( + "part checksum was not computed correctly, got: {:?}", + b + ))) + } + } + Ok(()) + } + + pub(crate) fn finalize(self) -> (Md5Checksum, Option) { + let md5 = self.md5.finalize()[..].try_into().unwrap(); + let extra = match self.extra { + None => None, + Some(MultipartExtraChecksummer::Crc32(crc32)) => { + Some(ChecksumValue::Crc32(u32::to_be_bytes(crc32.finalize()))) + } + Some(MultipartExtraChecksummer::Crc32c(crc32c)) => Some(ChecksumValue::Crc32c( + u32::to_be_bytes(u32::try_from(crc32c.finish()).unwrap()), + )), + Some(MultipartExtraChecksummer::Sha1(sha1)) => { + Some(ChecksumValue::Sha1(sha1.finalize()[..].try_into().unwrap())) + } + Some(MultipartExtraChecksummer::Sha256(sha256)) => Some(ChecksumValue::Sha256( + sha256.finalize()[..].try_into().unwrap(), + )), + }; + (md5, extra) + } +} + +// ---- + +/// Extract the value of the x-amz-checksum-algorithm header +pub(crate) fn request_checksum_algorithm( + headers: &HeaderMap, +) -> Result, Error> { + match headers.get(X_AMZ_CHECKSUM_ALGORITHM) { + None => Ok(None), + Some(x) if x == "CRC32" => Ok(Some(ChecksumAlgorithm::Crc32)), + Some(x) if x == "CRC32C" => Ok(Some(ChecksumAlgorithm::Crc32c)), + Some(x) if x == "SHA1" => Ok(Some(ChecksumAlgorithm::Sha1)), + Some(x) if x == "SHA256" => Ok(Some(ChecksumAlgorithm::Sha256)), + _ => Err(Error::bad_request("invalid checksum algorithm")), + } +} + +/// Extract the value of any of the x-amz-checksum-* headers +pub(crate) fn request_checksum_value( + headers: &HeaderMap, +) -> Result, Error> { + let mut ret = vec![]; + + if let 
Some(crc32_str) = headers.get(X_AMZ_CHECKSUM_CRC32) { + let crc32 = BASE64_STANDARD + .decode(&crc32_str) + .ok() + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-crc32 header")?; + ret.push(ChecksumValue::Crc32(crc32)) + } + if let Some(crc32c_str) = headers.get(X_AMZ_CHECKSUM_CRC32C) { + let crc32c = BASE64_STANDARD + .decode(&crc32c_str) + .ok() + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-crc32c header")?; + ret.push(ChecksumValue::Crc32c(crc32c)) + } + if let Some(sha1_str) = headers.get(X_AMZ_CHECKSUM_SHA1) { + let sha1 = BASE64_STANDARD + .decode(&sha1_str) + .ok() + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-sha1 header")?; + ret.push(ChecksumValue::Sha1(sha1)) + } + if let Some(sha256_str) = headers.get(X_AMZ_CHECKSUM_SHA256) { + let sha256 = BASE64_STANDARD + .decode(&sha256_str) + .ok() + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-sha256 header")?; + ret.push(ChecksumValue::Sha256(sha256)) + } + + if ret.len() > 1 { + return Err(Error::bad_request( + "multiple x-amz-checksum-* headers given", + )); + } + Ok(ret.pop()) +} + +/// Checks for the presense of x-amz-checksum-algorithm +/// if so extract the corrseponding x-amz-checksum-* value +pub(crate) fn request_checksum_algorithm_value( + headers: &HeaderMap, +) -> Result, Error> { + match headers.get(X_AMZ_CHECKSUM_ALGORITHM) { + Some(x) if x == "CRC32" => { + let crc32 = headers + .get(X_AMZ_CHECKSUM_CRC32) + .and_then(|x| BASE64_STANDARD.decode(&x).ok()) + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-crc32 header")?; + Ok(Some(ChecksumValue::Crc32(crc32))) + } + Some(x) if x == "CRC32C" => { + let crc32c = headers + .get(X_AMZ_CHECKSUM_CRC32C) + .and_then(|x| BASE64_STANDARD.decode(&x).ok()) + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-crc32c header")?; + Ok(Some(ChecksumValue::Crc32c(crc32c))) + } + 
Some(x) if x == "SHA1" => { + let sha1 = headers + .get(X_AMZ_CHECKSUM_SHA1) + .and_then(|x| BASE64_STANDARD.decode(&x).ok()) + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-sha1 header")?; + Ok(Some(ChecksumValue::Sha1(sha1))) + } + Some(x) if x == "SHA256" => { + let sha256 = headers + .get(X_AMZ_CHECKSUM_SHA256) + .and_then(|x| BASE64_STANDARD.decode(&x).ok()) + .and_then(|x| x.try_into().ok()) + .ok_or_bad_request("invalid x-amz-checksum-sha256 header")?; + Ok(Some(ChecksumValue::Sha256(sha256))) + } + Some(_) => Err(Error::bad_request("invalid x-amz-checksum-algorithm")), + None => Ok(None), + } +} + +pub(crate) fn add_checksum_response_headers( + checksum: &Option, + mut resp: http::response::Builder, +) -> http::response::Builder { + match checksum { + Some(ChecksumValue::Crc32(crc32)) => { + resp = resp.header(X_AMZ_CHECKSUM_CRC32, BASE64_STANDARD.encode(&crc32)); + } + Some(ChecksumValue::Crc32c(crc32c)) => { + resp = resp.header(X_AMZ_CHECKSUM_CRC32C, BASE64_STANDARD.encode(&crc32c)); + } + Some(ChecksumValue::Sha1(sha1)) => { + resp = resp.header(X_AMZ_CHECKSUM_SHA1, BASE64_STANDARD.encode(&sha1)); + } + Some(ChecksumValue::Sha256(sha256)) => { + resp = resp.header(X_AMZ_CHECKSUM_SHA256, BASE64_STANDARD.encode(&sha256)); + } + None => (), + } + resp +} diff --git a/src/api/s3/copy.rs b/src/api/s3/copy.rs index 2b29ec6d..411a6917 100644 --- a/src/api/s3/copy.rs +++ b/src/api/s3/copy.rs @@ -2,7 +2,6 @@ use std::pin::Pin; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use futures::{stream, stream::Stream, StreamExt, TryStreamExt}; -use md5::{Digest as Md5Digest, Md5}; use bytes::Bytes; use hyper::{Request, Response}; @@ -23,11 +22,12 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::checksum::*; use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; use crate::s3::get::full_object_byte_stream; use crate::s3::multipart; -use 
crate::s3::put::{get_headers, save_stream, SaveStreamResult}; +use crate::s3::put::{get_headers, save_stream, ChecksumMode, SaveStreamResult}; use crate::s3::xml::{self as s3_xml, xmlns_tag}; // -------- CopyObject --------- @@ -39,6 +39,8 @@ pub async fn handle_copy( ) -> Result, Error> { let copy_precondition = CopyPreconditionHeaders::parse(req)?; + let checksum_algorithm = request_checksum_algorithm(req.headers())?; + let source_object = get_copy_source(&ctx, req).await?; let (source_version, source_version_data, source_version_meta) = @@ -48,7 +50,7 @@ pub async fn handle_copy( copy_precondition.check(source_version, &source_version_meta.etag)?; // Determine encryption parameters - let (source_encryption, source_object_headers) = + let (source_encryption, source_object_meta_inner) = EncryptionParams::check_decrypt_for_copy_source( &ctx.garage, req.headers(), @@ -56,23 +58,54 @@ pub async fn handle_copy( )?; let dest_encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?; - // Determine headers of destination object - let dest_object_headers = match req.headers().get("x-amz-metadata-directive") { - Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => { - get_headers(req.headers())? - } - _ => source_object_headers.into_owned(), + // Extract source checksum info before source_object_meta_inner is consumed + let source_checksum = source_object_meta_inner.checksum; + let source_checksum_algorithm = source_checksum.map(|x| x.algorithm()); + + // If source object has a checksum, the destination object must as well. 
+ // The x-amz-checksum-algorihtm header allows to change that algorithm, + // but if it is absent, we must use the same as before + let checksum_algorithm = checksum_algorithm.or(source_checksum_algorithm); + + // Determine metadata of destination object + let was_multipart = source_version_meta.etag.contains('-'); + let dest_object_meta = ObjectVersionMetaInner { + headers: match req.headers().get("x-amz-metadata-directive") { + Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => { + get_headers(req.headers())? + } + _ => source_object_meta_inner.into_owned().headers, + }, + checksum: source_checksum, }; // Do actual object copying - let res = if EncryptionParams::is_same(&source_encryption, &dest_encryption) { - // If source and dest are both unencrypted, or if the encryption keys - // are the same, we can just copy the metadata and link blocks of the + // + // In any of the following scenarios, we need to read the whole object + // data and re-write it again: + // + // - the data needs to be decrypted or encrypted + // - the requested checksum algorithm requires us to recompute a checksum + // - the original object was a multipart upload and a checksum algorithm + // is defined (AWS specifies that in this case, we must recompute the + // checksum from scratch as if this was a single big object and not + // a multipart object, as the checksums are not computed in the same way) + // + // In other cases, we can just copy the metadata and reference the same blocks. + // + // See: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html + + let must_recopy = !EncryptionParams::is_same(&source_encryption, &dest_encryption) + || source_checksum_algorithm != checksum_algorithm + || (was_multipart && checksum_algorithm.is_some()); + + let res = if !must_recopy { + // In most cases, we can just copy the metadata and link blocks of the // old object from the new object. 
handle_copy_metaonly( ctx, dest_key, - dest_object_headers, + dest_object_meta, dest_encryption, source_version, source_version_data, @@ -80,16 +113,27 @@ pub async fn handle_copy( ) .await? } else { + let expected_checksum = ExpectedChecksums { + md5: None, + sha256: None, + extra: source_checksum, + }; + let checksum_mode = if was_multipart || source_checksum_algorithm != checksum_algorithm { + ChecksumMode::Calculate(checksum_algorithm) + } else { + ChecksumMode::Verify(&expected_checksum) + }; // If source and dest encryption use different keys, // we must decrypt content and re-encrypt, so rewrite all data blocks. handle_copy_reencrypt( ctx, dest_key, - dest_object_headers, + dest_object_meta, dest_encryption, source_version, source_version_data, source_encryption, + checksum_mode, ) .await? }; @@ -115,7 +159,7 @@ pub async fn handle_copy( async fn handle_copy_metaonly( ctx: ReqCtx, dest_key: &str, - dest_object_headers: ObjectVersionHeaders, + dest_object_meta: ObjectVersionMetaInner, dest_encryption: EncryptionParams, source_version: &ObjectVersion, source_version_data: &ObjectVersionData, @@ -132,7 +176,7 @@ async fn handle_copy_metaonly( let new_timestamp = now_msec(); let new_meta = ObjectVersionMeta { - encryption: dest_encryption.encrypt_headers(dest_object_headers)?, + encryption: dest_encryption.encrypt_meta(dest_object_meta)?, size: source_version_meta.size, etag: source_version_meta.etag.clone(), }; @@ -180,6 +224,7 @@ async fn handle_copy_metaonly( timestamp: new_timestamp, state: ObjectVersionState::Uploading { encryption: new_meta.encryption.clone(), + checksum_algorithm: None, multipart: false, }, }; @@ -252,11 +297,12 @@ async fn handle_copy_metaonly( async fn handle_copy_reencrypt( ctx: ReqCtx, dest_key: &str, - dest_object_headers: ObjectVersionHeaders, + dest_object_meta: ObjectVersionMetaInner, dest_encryption: EncryptionParams, source_version: &ObjectVersion, source_version_data: &ObjectVersionData, source_encryption: EncryptionParams, + 
checksum_mode: ChecksumMode<'_>, ) -> Result { // basically we will read the source data (decrypt if necessary) // and save that in a new object (encrypt if necessary), @@ -270,12 +316,11 @@ async fn handle_copy_reencrypt( save_stream( &ctx, - dest_object_headers, + dest_object_meta, dest_encryption, source_stream.map_err(|e| Error::from(GarageError::from(e))), &dest_key.to_string(), - None, - None, + checksum_mode, ) .await } @@ -313,8 +358,12 @@ pub async fn handle_upload_part_copy( req.headers(), &source_version_meta.encryption, )?; - let dest_object_encryption = match dest_version.state { - ObjectVersionState::Uploading { encryption, .. } => encryption, + let (dest_object_encryption, dest_object_checksum_algorithm) = match dest_version.state { + ObjectVersionState::Uploading { + encryption, + checksum_algorithm, + .. + } => (encryption, checksum_algorithm), _ => unreachable!(), }; let (dest_encryption, _) = @@ -412,7 +461,9 @@ pub async fn handle_upload_part_copy( dest_mpu_part_key, MpuPart { version: dest_version_id, + // These are all filled in later (bottom of this function) etag: None, + checksum: None, size: None, }, ); @@ -429,7 +480,8 @@ pub async fn handle_upload_part_copy( garage.version_table.insert(&dest_version).await?; // Now, actually copy the blocks - let mut md5hasher = Md5::new(); + let mut checksummer = Checksummer::init(&Default::default(), !dest_encryption.is_encrypted()) + .add(dest_object_checksum_algorithm); // First, create a stream that is able to read the source blocks // and extract the subrange if necessary. 
@@ -495,18 +547,24 @@ pub async fn handle_upload_part_copy( } let data_len = data.len() as u64; - md5hasher.update(&data[..]); - - let (final_data, must_upload, final_hash) = match existing_block_hash { - Some(hash) if same_encryption => (data, false, hash), - _ => tokio::task::spawn_blocking(move || { - let data_enc = dest_encryption.encrypt_block(data)?; - let hash = blake2sum(&data_enc); - Ok::<_, Error>((data_enc, true, hash)) + + let (checksummer_updated, (data_to_upload, final_hash)) = + tokio::task::spawn_blocking(move || { + checksummer.update(&data[..]); + + let tup = match existing_block_hash { + Some(hash) if same_encryption => (None, hash), + _ => { + let data_enc = dest_encryption.encrypt_block(data)?; + let hash = blake2sum(&data_enc); + (Some(data_enc), hash) + } + }; + Ok::<_, Error>((checksummer, tup)) }) .await - .unwrap()?, - }; + .unwrap()?; + checksummer = checksummer_updated; dest_version.blocks.clear(); dest_version.blocks.put( @@ -531,7 +589,7 @@ pub async fn handle_upload_part_copy( // Thing 1: if the block is not exactly a block that existed before, // we need to insert that data as a new block. 
async { - if must_upload { + if let Some(final_data) = data_to_upload { garage .block_manager .rpc_put_block(final_hash, final_data, dest_encryption.is_encrypted(), None) @@ -552,8 +610,9 @@ pub async fn handle_upload_part_copy( assert_eq!(current_offset, source_range.length); - let data_md5sum = md5hasher.finalize(); - let etag = dest_encryption.etag_from_md5(&data_md5sum); + let checksums = checksummer.finalize(); + let etag = dest_encryption.etag_from_md5(&checksums.md5); + let checksum = checksums.extract(dest_object_checksum_algorithm); // Put the part's ETag in the Versiontable dest_mpu.parts.put( @@ -561,6 +620,7 @@ pub async fn handle_upload_part_copy( MpuPart { version: dest_version_id, etag: Some(etag.clone()), + checksum, size: Some(current_offset), }, ); diff --git a/src/api/s3/encryption.rs b/src/api/s3/encryption.rs index 2b105e90..2e6ed65c 100644 --- a/src/api/s3/encryption.rs +++ b/src/api/s3/encryption.rs @@ -26,9 +26,10 @@ use garage_util::error::Error as GarageError; use garage_util::migrate::Migrate; use garage_model::garage::Garage; -use garage_model::s3::object_table::{ObjectVersionEncryption, ObjectVersionHeaders}; +use garage_model::s3::object_table::{ObjectVersionEncryption, ObjectVersionMetaInner}; use crate::common_error::*; +use crate::s3::checksum::Md5Checksum; use crate::s3::error::Error; const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM: HeaderName = @@ -124,7 +125,7 @@ impl EncryptionParams { garage: &Garage, headers: &HeaderMap, obj_enc: &'a ObjectVersionEncryption, - ) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> { + ) -> Result<(Self, Cow<'a, ObjectVersionMetaInner>), Error> { let key = parse_request_headers( headers, &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, @@ -138,7 +139,7 @@ impl EncryptionParams { garage: &Garage, headers: &HeaderMap, obj_enc: &'a ObjectVersionEncryption, - ) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> { + ) -> Result<(Self, Cow<'a, ObjectVersionMetaInner>), Error> { let key 
= parse_request_headers( headers, &X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, @@ -152,14 +153,11 @@ impl EncryptionParams { garage: &Garage, key: Option<(Key, Md5Output)>, obj_enc: &'a ObjectVersionEncryption, - ) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> { + ) -> Result<(Self, Cow<'a, ObjectVersionMetaInner>), Error> { match (key, &obj_enc) { ( Some((client_key, client_key_md5)), - ObjectVersionEncryption::SseC { - headers, - compressed, - }, + ObjectVersionEncryption::SseC { inner, compressed }, ) => { let enc = Self::SseC { client_key, @@ -170,13 +168,13 @@ impl EncryptionParams { None }, }; - let plaintext = enc.decrypt_blob(&headers)?; - let headers = ObjectVersionHeaders::decode(&plaintext) - .ok_or_internal_error("Could not decode encrypted headers")?; - Ok((enc, Cow::Owned(headers))) + let plaintext = enc.decrypt_blob(&inner)?; + let inner = ObjectVersionMetaInner::decode(&plaintext) + .ok_or_internal_error("Could not decode encrypted metadata")?; + Ok((enc, Cow::Owned(inner))) } - (None, ObjectVersionEncryption::Plaintext { headers }) => { - Ok((Self::Plaintext, Cow::Borrowed(headers))) + (None, ObjectVersionEncryption::Plaintext { inner }) => { + Ok((Self::Plaintext, Cow::Borrowed(inner))) } (_, ObjectVersionEncryption::SseC { .. }) => { Err(Error::bad_request("Object is encrypted")) @@ -188,29 +186,31 @@ impl EncryptionParams { } } - pub fn encrypt_headers( + pub fn encrypt_meta( &self, - h: ObjectVersionHeaders, + meta: ObjectVersionMetaInner, ) -> Result { match self { Self::SseC { compression_level, .. 
} => { - let plaintext = h.encode().map_err(GarageError::from)?; + let plaintext = meta.encode().map_err(GarageError::from)?; let ciphertext = self.encrypt_blob(&plaintext)?; Ok(ObjectVersionEncryption::SseC { - headers: ciphertext.into_owned(), + inner: ciphertext.into_owned(), compressed: compression_level.is_some(), }) } - Self::Plaintext => Ok(ObjectVersionEncryption::Plaintext { headers: h }), + Self::Plaintext => Ok(ObjectVersionEncryption::Plaintext { inner: meta }), } } // ---- generating object Etag values ---- - pub fn etag_from_md5(&self, md5sum: &[u8]) -> String { + pub fn etag_from_md5(&self, md5sum: &Option) -> String { match self { - Self::Plaintext => hex::encode(md5sum), + Self::Plaintext => md5sum + .map(|x| hex::encode(&x[..])) + .expect("md5 digest should have been computed"), Self::SseC { .. } => { // AWS specifies that for encrypted objects, the Etag is not // the md5sum of the data, but doesn't say what it is. @@ -224,7 +224,7 @@ impl EncryptionParams { // ---- generic function for encrypting / decrypting blobs ---- // Prepends a randomly-generated nonce to the encrypted value. - // This is used for encrypting object headers and inlined data for small objects. + // This is used for encrypting object metadata and inlined data for small objects. // This does not compress anything. 
pub fn encrypt_blob<'a>(&self, blob: &'a [u8]) -> Result, Error> { diff --git a/src/api/s3/error.rs b/src/api/s3/error.rs index 5cb5d04e..2855e0b3 100644 --- a/src/api/s3/error.rs +++ b/src/api/s3/error.rs @@ -69,6 +69,10 @@ pub enum Error { #[error(display = "Invalid encryption algorithm: {:?}, should be AES256", _0)] InvalidEncryptionAlgorithm(String), + /// The client sent invalid XML data + #[error(display = "Invalid digest: {}", _0)] + InvalidDigest(String), + /// The client sent a request for an action not supported by garage #[error(display = "Unimplemented action: {}", _0)] NotImplemented(String), @@ -129,6 +133,7 @@ impl Error { Error::NotImplemented(_) => "NotImplemented", Error::InvalidXml(_) => "MalformedXML", Error::InvalidRange(_) => "InvalidRange", + Error::InvalidDigest(_) => "InvalidDigest", Error::InvalidUtf8Str(_) | Error::InvalidUtf8String(_) => "InvalidRequest", Error::InvalidEncryptionAlgorithm(_) => "InvalidEncryptionAlgorithmError", } @@ -148,6 +153,7 @@ impl ApiError for Error { | Error::InvalidPart | Error::InvalidPartOrder | Error::EntityTooSmall + | Error::InvalidDigest(_) | Error::InvalidEncryptionAlgorithm(_) | Error::InvalidXml(_) | Error::InvalidUtf8Str(_) diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index ec300ab7..f5d3cf11 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -27,6 +27,7 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::ResBody; +use crate::s3::checksum::{add_checksum_response_headers, X_AMZ_CHECKSUM_MODE}; use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; @@ -45,8 +46,9 @@ pub struct GetObjectOverrides { fn object_headers( version: &ObjectVersion, version_meta: &ObjectVersionMeta, - headers: &ObjectVersionHeaders, + meta_inner: &ObjectVersionMetaInner, encryption: EncryptionParams, + checksum_mode: ChecksumMode, ) -> http::response::Builder { debug!("Version meta: {:?}", version_meta); @@ -65,7 +67,7 @@ fn object_headers( // have the same 
name (ignoring case) into a comma-delimited list. // See: https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingMetadata.html let mut headers_by_name = BTreeMap::new(); - for (name, value) in headers.0.iter() { + for (name, value) in meta_inner.headers.iter() { match headers_by_name.get_mut(name) { None => { headers_by_name.insert(name, vec![value.as_str()]); @@ -80,6 +82,10 @@ fn object_headers( resp = resp.header(name, values.join(",")); } + if checksum_mode.enabled { + resp = add_checksum_response_headers(&meta_inner.checksum, resp); + } + encryption.add_response_headers(&mut resp); resp @@ -199,6 +205,8 @@ pub async fn handle_head_without_ctx( let (encryption, headers) = EncryptionParams::check_decrypt(&garage, req.headers(), &version_meta.encryption)?; + let checksum_mode = checksum_mode(&req); + if let Some(pn) = part_number { match version_data { ObjectVersionData::Inline(_, _) => { @@ -206,17 +214,21 @@ pub async fn handle_head_without_ctx( return Err(Error::InvalidPart); } let bytes_len = version_meta.size; - Ok( - object_headers(object_version, version_meta, &headers, encryption) - .header(CONTENT_LENGTH, format!("{}", bytes_len)) - .header( - CONTENT_RANGE, - format!("bytes 0-{}/{}", bytes_len - 1, bytes_len), - ) - .header(X_AMZ_MP_PARTS_COUNT, "1") - .status(StatusCode::PARTIAL_CONTENT) - .body(empty_body())?, + Ok(object_headers( + object_version, + version_meta, + &headers, + encryption, + checksum_mode, + ) + .header(CONTENT_LENGTH, format!("{}", bytes_len)) + .header( + CONTENT_RANGE, + format!("bytes 0-{}/{}", bytes_len - 1, bytes_len), ) + .header(X_AMZ_MP_PARTS_COUNT, "1") + .status(StatusCode::PARTIAL_CONTENT) + .body(empty_body())?) 
} ObjectVersionData::FirstBlock(_, _) => { let version = garage @@ -228,32 +240,40 @@ pub async fn handle_head_without_ctx( let (part_offset, part_end) = calculate_part_bounds(&version, pn).ok_or(Error::InvalidPart)?; - Ok( - object_headers(object_version, version_meta, &headers, encryption) - .header(CONTENT_LENGTH, format!("{}", part_end - part_offset)) - .header( - CONTENT_RANGE, - format!( - "bytes {}-{}/{}", - part_offset, - part_end - 1, - version_meta.size - ), - ) - .header(X_AMZ_MP_PARTS_COUNT, format!("{}", version.n_parts()?)) - .status(StatusCode::PARTIAL_CONTENT) - .body(empty_body())?, + Ok(object_headers( + object_version, + version_meta, + &headers, + encryption, + checksum_mode, ) + .header(CONTENT_LENGTH, format!("{}", part_end - part_offset)) + .header( + CONTENT_RANGE, + format!( + "bytes {}-{}/{}", + part_offset, + part_end - 1, + version_meta.size + ), + ) + .header(X_AMZ_MP_PARTS_COUNT, format!("{}", version.n_parts()?)) + .status(StatusCode::PARTIAL_CONTENT) + .body(empty_body())?) } _ => unreachable!(), } } else { - Ok( - object_headers(object_version, version_meta, &headers, encryption) - .header(CONTENT_LENGTH, format!("{}", version_meta.size)) - .status(StatusCode::OK) - .body(empty_body())?, + Ok(object_headers( + object_version, + version_meta, + &headers, + encryption, + checksum_mode, ) + .header(CONTENT_LENGTH, format!("{}", version_meta.size)) + .status(StatusCode::OK) + .body(empty_body())?) } } @@ -307,12 +327,24 @@ pub async fn handle_get_without_ctx( let (enc, headers) = EncryptionParams::check_decrypt(&garage, req.headers(), &last_v_meta.encryption)?; + let checksum_mode = checksum_mode(&req); + match (part_number, parse_range_header(req, last_v_meta.size)?) 
{ (Some(_), Some(_)) => Err(Error::bad_request( "Cannot specify both partNumber and Range header", )), (Some(pn), None) => { - handle_get_part(garage, last_v, last_v_data, last_v_meta, enc, &headers, pn).await + handle_get_part( + garage, + last_v, + last_v_data, + last_v_meta, + enc, + &headers, + pn, + checksum_mode, + ) + .await } (None, Some(range)) => { handle_get_range( @@ -324,6 +356,7 @@ pub async fn handle_get_without_ctx( &headers, range.start, range.start + range.length, + checksum_mode, ) .await } @@ -336,6 +369,7 @@ pub async fn handle_get_without_ctx( enc, &headers, overrides, + checksum_mode, ) .await } @@ -348,12 +382,19 @@ async fn handle_get_full( version_data: &ObjectVersionData, version_meta: &ObjectVersionMeta, encryption: EncryptionParams, - headers: &ObjectVersionHeaders, + meta_inner: &ObjectVersionMetaInner, overrides: GetObjectOverrides, + checksum_mode: ChecksumMode, ) -> Result, Error> { - let mut resp_builder = object_headers(version, version_meta, &headers, encryption) - .header(CONTENT_LENGTH, format!("{}", version_meta.size)) - .status(StatusCode::OK); + let mut resp_builder = object_headers( + version, + version_meta, + &meta_inner, + encryption, + checksum_mode, + ) + .header(CONTENT_LENGTH, format!("{}", version_meta.size)) + .status(StatusCode::OK); getobject_override_headers(overrides, &mut resp_builder)?; let stream = full_object_byte_stream(garage, version, version_data, encryption); @@ -432,14 +473,15 @@ async fn handle_get_range( version_data: &ObjectVersionData, version_meta: &ObjectVersionMeta, encryption: EncryptionParams, - headers: &ObjectVersionHeaders, + meta_inner: &ObjectVersionMetaInner, begin: u64, end: u64, + checksum_mode: ChecksumMode, ) -> Result, Error> { // Here we do not use getobject_override_headers because we don't // want to add any overridden headers (those should not be added // when returning PARTIAL_CONTENT) - let resp_builder = object_headers(version, version_meta, headers, encryption) + let 
resp_builder = object_headers(version, version_meta, meta_inner, encryption, checksum_mode) .header(CONTENT_LENGTH, format!("{}", end - begin)) .header( CONTENT_RANGE, @@ -480,12 +522,19 @@ async fn handle_get_part( version_data: &ObjectVersionData, version_meta: &ObjectVersionMeta, encryption: EncryptionParams, - headers: &ObjectVersionHeaders, + meta_inner: &ObjectVersionMetaInner, part_number: u64, + checksum_mode: ChecksumMode, ) -> Result, Error> { // Same as for get_range, no getobject_override_headers - let resp_builder = object_headers(object_version, version_meta, headers, encryption) - .status(StatusCode::PARTIAL_CONTENT); + let resp_builder = object_headers( + object_version, + version_meta, + meta_inner, + encryption, + checksum_mode, + ) + .status(StatusCode::PARTIAL_CONTENT); match version_data { ObjectVersionData::Inline(_, bytes) => { @@ -567,6 +616,20 @@ fn calculate_part_bounds(v: &Version, part_number: u64) -> Option<(u64, u64)> { None } +struct ChecksumMode { + enabled: bool, +} + +fn checksum_mode(req: &Request) -> ChecksumMode { + ChecksumMode { + enabled: req + .headers() + .get(X_AMZ_CHECKSUM_MODE) + .map(|x| x == "ENABLED") + .unwrap_or(false), + } +} + fn body_from_blocks_range( garage: Arc, encryption: EncryptionParams, diff --git a/src/api/s3/list.rs b/src/api/s3/list.rs index 1678f1fa..648bace2 100644 --- a/src/api/s3/list.rs +++ b/src/api/s3/list.rs @@ -2,7 +2,7 @@ use std::collections::{BTreeMap, BTreeSet}; use std::iter::{Iterator, Peekable}; use base64::prelude::*; -use hyper::Response; +use hyper::{Request, Response}; use garage_util::data::*; use garage_util::error::Error as GarageError; @@ -15,7 +15,8 @@ use garage_table::EnumerationOrder; use crate::encoding::*; use crate::helpers::*; -use crate::s3::api_server::ResBody; +use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; use crate::s3::multipart as s3_multipart; use crate::s3::xml as s3_xml; @@ -271,13 +272,21 
@@ pub async fn handle_list_multipart_upload( pub async fn handle_list_parts( ctx: ReqCtx, + req: Request, query: &ListPartsQuery, ) -> Result, Error> { debug!("ListParts {:?}", query); let upload_id = s3_multipart::decode_upload_id(&query.upload_id)?; - let (_, _, mpu) = s3_multipart::get_upload(&ctx, &query.key, &upload_id).await?; + let (_, object_version, mpu) = s3_multipart::get_upload(&ctx, &query.key, &upload_id).await?; + + let object_encryption = match object_version.state { + ObjectVersionState::Uploading { encryption, .. } => encryption, + _ => unreachable!(), + }; + let encryption_res = + EncryptionParams::check_decrypt(&ctx.garage, req.headers(), &object_encryption); let (info, next) = fetch_part_info(query, &mpu)?; @@ -296,11 +305,40 @@ pub async fn handle_list_parts( is_truncated: s3_xml::Value(format!("{}", next.is_some())), parts: info .iter() - .map(|part| s3_xml::PartItem { - etag: s3_xml::Value(format!("\"{}\"", part.etag)), - last_modified: s3_xml::Value(msec_to_rfc3339(part.timestamp)), - part_number: s3_xml::IntValue(part.part_number as i64), - size: s3_xml::IntValue(part.size as i64), + .map(|part| { + // hide checksum if object is encrypted and the decryption + // keys are not provided + let checksum = part.checksum.filter(|_| encryption_res.is_ok()); + s3_xml::PartItem { + etag: s3_xml::Value(format!("\"{}\"", part.etag)), + last_modified: s3_xml::Value(msec_to_rfc3339(part.timestamp)), + part_number: s3_xml::IntValue(part.part_number as i64), + size: s3_xml::IntValue(part.size as i64), + checksum_crc32: match &checksum { + Some(ChecksumValue::Crc32(x)) => { + Some(s3_xml::Value(BASE64_STANDARD.encode(&x))) + } + _ => None, + }, + checksum_crc32c: match &checksum { + Some(ChecksumValue::Crc32c(x)) => { + Some(s3_xml::Value(BASE64_STANDARD.encode(&x))) + } + _ => None, + }, + checksum_sha1: match &checksum { + Some(ChecksumValue::Sha1(x)) => { + Some(s3_xml::Value(BASE64_STANDARD.encode(&x))) + } + _ => None, + }, + checksum_sha256: match 
&checksum { + Some(ChecksumValue::Sha256(x)) => { + Some(s3_xml::Value(BASE64_STANDARD.encode(&x))) + } + _ => None, + }, + } }) .collect(), @@ -346,6 +384,7 @@ struct PartInfo<'a> { timestamp: u64, part_number: u64, size: u64, + checksum: Option, } enum ExtractionResult { @@ -486,6 +525,7 @@ fn fetch_part_info<'a>( timestamp: pk.timestamp, etag, size, + checksum: p.checksum, }; match parts.last_mut() { Some(lastpart) if lastpart.part_number == pk.part_number => { @@ -945,8 +985,12 @@ mod tests { state: ObjectVersionState::Uploading { multipart: true, encryption: ObjectVersionEncryption::Plaintext { - headers: ObjectVersionHeaders(vec![]), + inner: ObjectVersionMetaInner { + headers: vec![], + checksum: None, + }, }, + checksum_algorithm: None, }, } } @@ -1135,6 +1179,7 @@ mod tests { version: uuid, size: Some(3), etag: Some("etag1".into()), + checksum: None, }, ), ( @@ -1146,6 +1191,7 @@ mod tests { version: uuid, size: None, etag: None, + checksum: None, }, ), ( @@ -1157,6 +1203,7 @@ mod tests { version: uuid, size: Some(10), etag: Some("etag2".into()), + checksum: None, }, ), ( @@ -1168,6 +1215,7 @@ mod tests { version: uuid, size: Some(7), etag: Some("etag3".into()), + checksum: None, }, ), ( @@ -1179,6 +1227,7 @@ mod tests { version: uuid, size: Some(5), etag: Some("etag4".into()), + checksum: None, }, ), ]; @@ -1217,12 +1266,14 @@ mod tests { etag: "etag1", timestamp: TS, part_number: 1, - size: 3 + size: 3, + checksum: None, }, PartInfo { etag: "etag2", timestamp: TS, part_number: 3, + checksum: None, size: 10 }, ] @@ -1238,12 +1289,14 @@ mod tests { PartInfo { etag: "etag3", timestamp: TS, + checksum: None, part_number: 5, size: 7 }, PartInfo { etag: "etag4", timestamp: TS, + checksum: None, part_number: 8, size: 5 }, @@ -1267,24 +1320,28 @@ mod tests { PartInfo { etag: "etag1", timestamp: TS, + checksum: None, part_number: 1, size: 3 }, PartInfo { etag: "etag2", timestamp: TS, + checksum: None, part_number: 3, size: 10 }, PartInfo { etag: "etag3", 
timestamp: TS, + checksum: None, part_number: 5, size: 7 }, PartInfo { etag: "etag4", timestamp: TS, + checksum: None, part_number: 8, size: 5 }, diff --git a/src/api/s3/mod.rs b/src/api/s3/mod.rs index 1eb95d40..b9bb1a6f 100644 --- a/src/api/s3/mod.rs +++ b/src/api/s3/mod.rs @@ -13,6 +13,7 @@ mod post_object; mod put; mod website; +mod checksum; mod encryption; mod router; pub mod xml; diff --git a/src/api/s3/multipart.rs b/src/api/s3/multipart.rs index fcc5769f..3db3e8aa 100644 --- a/src/api/s3/multipart.rs +++ b/src/api/s3/multipart.rs @@ -1,9 +1,10 @@ use std::collections::HashMap; +use std::convert::TryInto; use std::sync::Arc; +use base64::prelude::*; use futures::prelude::*; use hyper::{Request, Response}; -use md5::{Digest as Md5Digest, Md5}; use garage_table::*; use garage_util::data::*; @@ -16,6 +17,7 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::checksum::*; use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; use crate::s3::put::*; @@ -41,10 +43,16 @@ pub async fn handle_create_multipart_upload( let timestamp = next_timestamp(existing_object.as_ref()); let headers = get_headers(req.headers())?; + let meta = ObjectVersionMetaInner { + headers, + checksum: None, + }; // Determine whether object should be encrypted, and if so the key let encryption = EncryptionParams::new_from_headers(&garage, req.headers())?; - let object_encryption = encryption.encrypt_headers(headers)?; + let object_encryption = encryption.encrypt_meta(meta)?; + + let checksum_algorithm = request_checksum_algorithm(req.headers())?; // Create object in object table let object_version = ObjectVersion { @@ -53,6 +61,7 @@ pub async fn handle_create_multipart_upload( state: ObjectVersionState::Uploading { multipart: true, encryption: object_encryption, + checksum_algorithm, }, }; let object = Object::new(*bucket_id, key.to_string(), vec![object_version]); @@ -90,9 +99,13 @@ pub async fn 
handle_put_part( let upload_id = decode_upload_id(upload_id)?; - let content_md5 = match req.headers().get("content-md5") { - Some(x) => Some(x.to_str()?.to_string()), - None => None, + let expected_checksums = ExpectedChecksums { + md5: match req.headers().get("content-md5") { + Some(x) => Some(x.to_str()?.to_string()), + None => None, + }, + sha256: content_sha256, + extra: request_checksum_value(req.headers())?, }; // Read first chuck, and at the same time try to get object to see if it exists @@ -106,8 +119,12 @@ pub async fn handle_put_part( futures::try_join!(get_upload(&ctx, &key, &upload_id), chunker.next(),)?; // Check encryption params - let object_encryption = match object_version.state { - ObjectVersionState::Uploading { encryption, .. } => encryption, + let (object_encryption, checksum_algorithm) = match object_version.state { + ObjectVersionState::Uploading { + encryption, + checksum_algorithm, + .. + } => (encryption, checksum_algorithm), _ => unreachable!(), }; let (encryption, _) = @@ -138,7 +155,9 @@ pub async fn handle_put_part( mpu_part_key, MpuPart { version: version_uuid, + // all these are filled in later, at the end of this function etag: None, + checksum: None, size: None, }, ); @@ -152,32 +171,31 @@ pub async fn handle_put_part( garage.version_table.insert(&version).await?; // Copy data to version - let (total_size, data_md5sum, data_sha256sum, _) = read_and_put_blocks( + let checksummer = + Checksummer::init(&expected_checksums, !encryption.is_encrypted()).add(checksum_algorithm); + let (total_size, checksums, _) = read_and_put_blocks( &ctx, &version, encryption, part_number, first_block, &mut chunker, + checksummer, ) .await?; // Verify that checksums map - ensure_checksum_matches( - &data_md5sum, - data_sha256sum, - content_md5.as_deref(), - content_sha256, - )?; + checksums.verify(&expected_checksums)?; // Store part etag in version - let etag = encryption.etag_from_md5(&data_md5sum); + let etag = 
encryption.etag_from_md5(&checksums.md5); mpu.parts.put( mpu_part_key, MpuPart { version: version_uuid, etag: Some(etag.clone()), + checksum: checksums.extract(checksum_algorithm), size: Some(total_size), }, ); @@ -189,6 +207,7 @@ pub async fn handle_put_part( let mut resp = Response::builder().header("ETag", format!("\"{}\"", etag)); encryption.add_response_headers(&mut resp); + let resp = add_checksum_response_headers(&expected_checksums.extra, resp); Ok(resp.body(empty_body())?) } @@ -236,10 +255,11 @@ pub async fn handle_complete_multipart_upload( bucket_name, .. } = &ctx; + let (req_head, req_body) = req.into_parts(); - let body = http_body_util::BodyExt::collect(req.into_body()) - .await? - .to_bytes(); + let expected_checksum = request_checksum_value(&req_head.headers)?; + + let body = http_body_util::BodyExt::collect(req_body).await?.to_bytes(); if let Some(content_sha256) = content_sha256 { verify_signed_content(content_sha256, &body[..])?; @@ -263,8 +283,12 @@ pub async fn handle_complete_multipart_upload( return Err(Error::bad_request("No data was uploaded")); } - let object_encryption = match object_version.state { - ObjectVersionState::Uploading { encryption, .. } => encryption, + let (object_encryption, checksum_algorithm) = match object_version.state { + ObjectVersionState::Uploading { + encryption, + checksum_algorithm, + .. 
+ } => (encryption, checksum_algorithm), _ => unreachable!(), }; @@ -292,6 +316,13 @@ pub async fn handle_complete_multipart_upload( for req_part in body_list_of_parts.iter() { match have_parts.get(&req_part.part_number) { Some(part) if part.etag.as_ref() == Some(&req_part.etag) && part.size.is_some() => { + // alternative version: if req_part.checksum.is_some() && part.checksum != req_part.checksum { + if part.checksum != req_part.checksum { + return Err(Error::InvalidDigest(format!( + "Invalid checksum for part {}: in request = {:?}, uploaded part = {:?}", + req_part.part_number, req_part.checksum, part.checksum + ))); + } parts.push(*part) } _ => return Err(Error::InvalidPart), @@ -339,18 +370,23 @@ pub async fn handle_complete_multipart_upload( }); garage.block_ref_table.insert_many(block_refs).await?; - // Calculate etag of final object + // Calculate checksum and etag of final object // To understand how etags are calculated, read more here: + // https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html // https://teppen.io/2018/06/23/aws_s3_etags/ - let mut etag_md5_hasher = Md5::new(); + let mut checksummer = MultipartChecksummer::init(checksum_algorithm); for part in parts.iter() { - etag_md5_hasher.update(part.etag.as_ref().unwrap().as_bytes()); + checksummer.update(part.etag.as_ref().unwrap(), part.checksum)?; + } + let (checksum_md5, checksum_extra) = checksummer.finalize(); + + if expected_checksum.is_some() && checksum_extra != expected_checksum { + return Err(Error::InvalidDigest( + "Failed to validate x-amz-checksum-*".into(), + )); } - let etag = format!( - "{}-{}", - hex::encode(etag_md5_hasher.finalize()), - parts.len() - ); + + let etag = format!("{}-{}", hex::encode(&checksum_md5[..]), parts.len()); // Calculate total size of final object let total_size = parts.iter().map(|x| x.size.unwrap()).sum(); @@ -363,6 +399,20 @@ pub async fn handle_complete_multipart_upload( return Err(e); } + // If there is a checksum 
algorithm, update metadata with checksum + let object_encryption = match checksum_algorithm { + None => object_encryption, + Some(_) => { + let (encryption, meta) = + EncryptionParams::check_decrypt(&garage, &req_head.headers, &object_encryption)?; + let new_meta = ObjectVersionMetaInner { + headers: meta.into_owned().headers, + checksum: checksum_extra, + }; + encryption.encrypt_meta(new_meta)? + } + }; + // Write final object version object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( ObjectVersionMeta { @@ -383,10 +433,28 @@ pub async fn handle_complete_multipart_upload( bucket: s3_xml::Value(bucket_name.to_string()), key: s3_xml::Value(key), etag: s3_xml::Value(format!("\"{}\"", etag)), + checksum_crc32: match &checksum_extra { + Some(ChecksumValue::Crc32(x)) => Some(s3_xml::Value(BASE64_STANDARD.encode(&x))), + _ => None, + }, + checksum_crc32c: match &checksum_extra { + Some(ChecksumValue::Crc32c(x)) => Some(s3_xml::Value(BASE64_STANDARD.encode(&x))), + _ => None, + }, + checksum_sha1: match &checksum_extra { + Some(ChecksumValue::Sha1(x)) => Some(s3_xml::Value(BASE64_STANDARD.encode(&x))), + _ => None, + }, + checksum_sha256: match &checksum_extra { + Some(ChecksumValue::Sha256(x)) => Some(s3_xml::Value(BASE64_STANDARD.encode(&x))), + _ => None, + }, }; let xml = s3_xml::to_xml_with_header(&result)?; - Ok(Response::new(string_body(xml))) + let resp = Response::builder(); + let resp = add_checksum_response_headers(&expected_checksum, resp); + Ok(resp.body(string_body(xml))?) } pub async fn handle_abort_multipart_upload( @@ -455,6 +523,7 @@ pub fn decode_upload_id(id: &str) -> Result { struct CompleteMultipartUploadPart { etag: String, part_number: u64, + checksum: Option, } fn parse_complete_multipart_upload_body( @@ -480,9 +549,41 @@ fn parse_complete_multipart_upload_body( .children() .find(|e| e.has_tag_name("PartNumber"))? 
.text()?; + let checksum = if let Some(crc32) = + item.children().find(|e| e.has_tag_name("ChecksumCRC32")) + { + Some(ChecksumValue::Crc32( + BASE64_STANDARD.decode(crc32.text()?).ok()?[..] + .try_into() + .ok()?, + )) + } else if let Some(crc32c) = item.children().find(|e| e.has_tag_name("ChecksumCRC32C")) + { + Some(ChecksumValue::Crc32c( + BASE64_STANDARD.decode(crc32c.text()?).ok()?[..] + .try_into() + .ok()?, + )) + } else if let Some(sha1) = item.children().find(|e| e.has_tag_name("ChecksumSHA1")) { + Some(ChecksumValue::Sha1( + BASE64_STANDARD.decode(sha1.text()?).ok()?[..] + .try_into() + .ok()?, + )) + } else if let Some(sha256) = item.children().find(|e| e.has_tag_name("ChecksumSHA256")) + { + Some(ChecksumValue::Sha256( + BASE64_STANDARD.decode(sha256.text()?).ok()?[..] + .try_into() + .ok()?, + )) + } else { + None + }; parts.push(CompleteMultipartUploadPart { etag: etag.trim_matches('"').to_string(), part_number: part_number.parse().ok()?, + checksum, }); } else { return None; diff --git a/src/api/s3/post_object.rs b/src/api/s3/post_object.rs index 7c4219a7..2c106b3b 100644 --- a/src/api/s3/post_object.rs +++ b/src/api/s3/post_object.rs @@ -14,13 +14,15 @@ use multer::{Constraints, Multipart, SizeLimit}; use serde::Deserialize; use garage_model::garage::Garage; +use garage_model::s3::object_table::*; use crate::helpers::*; use crate::s3::api_server::ResBody; +use crate::s3::checksum::*; use crate::s3::cors::*; use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; -use crate::s3::put::{get_headers, save_stream}; +use crate::s3::put::{get_headers, save_stream, ChecksumMode}; use crate::s3::xml as s3_xml; use crate::signature::payload::{verify_v4, Authorization}; @@ -98,10 +100,6 @@ pub async fn handle_post_object( .ok_or_bad_request("No policy was provided")? 
.to_str()?; let authorization = Authorization::parse_form(&params)?; - let content_md5 = params - .get("content-md5") - .map(HeaderValue::to_str) - .transpose()?; let key = if key.contains("${filename}") { // if no filename is provided, don't replace. This matches the behavior of AWS. @@ -226,6 +224,21 @@ pub async fn handle_post_object( let headers = get_headers(&params)?; + let expected_checksums = ExpectedChecksums { + md5: params + .get("content-md5") + .map(HeaderValue::to_str) + .transpose()? + .map(str::to_string), + sha256: None, + extra: request_checksum_algorithm_value(&params)?, + }; + + let meta = ObjectVersionMetaInner { + headers, + checksum: expected_checksums.extra, + }; + let encryption = EncryptionParams::new_from_headers(&garage, &params)?; let stream = file_field.map(|r| r.map_err(Into::into)); @@ -239,12 +252,11 @@ pub async fn handle_post_object( let res = save_stream( &ctx, - headers, + meta, encryption, StreamLimiter::new(stream, conditions.content_length), &key, - content_md5.map(str::to_string), - None, + ChecksumMode::Verify(&expected_checksums), ) .await?; diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index 941e4122..1e3b1b44 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -1,12 +1,9 @@ use std::collections::HashMap; use std::sync::Arc; -use base64::prelude::*; use futures::prelude::*; use futures::stream::FuturesOrdered; use futures::try_join; -use md5::{digest::generic_array::*, Digest as Md5Digest, Md5}; -use sha2::Sha256; use tokio::sync::mpsc; @@ -22,7 +19,6 @@ use opentelemetry::{ use garage_net::bytes_buf::BytesBuf; use garage_rpc::rpc_helper::OrderTag; use garage_table::*; -use garage_util::async_hash::*; use garage_util::data::*; use garage_util::error::Error as GarageError; use garage_util::time::*; @@ -36,16 +32,22 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::checksum::*; use crate::s3::encryption::EncryptionParams; use crate::s3::error::*;
const PUT_BLOCKS_MAX_PARALLEL: usize = 3; -pub struct SaveStreamResult { - pub version_uuid: Uuid, - pub version_timestamp: u64, +pub(crate) struct SaveStreamResult { + pub(crate) version_uuid: Uuid, + pub(crate) version_timestamp: u64, /// Etag WITHOUT THE QUOTES (just the hex value) - pub etag: String, + pub(crate) etag: String, +} + +pub(crate) enum ChecksumMode<'a> { + Verify(&'a ExpectedChecksums), + Calculate(Option), } pub async fn handle_put( @@ -58,24 +60,32 @@ pub async fn handle_put( let headers = get_headers(req.headers())?; debug!("Object headers: {:?}", headers); - // Determine whether object should be encrypted, and if so the key - let encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?; + let expected_checksums = ExpectedChecksums { + md5: match req.headers().get("content-md5") { + Some(x) => Some(x.to_str()?.to_string()), + None => None, + }, + sha256: content_sha256, + extra: request_checksum_value(req.headers())?, + }; - let content_md5 = match req.headers().get("content-md5") { - Some(x) => Some(x.to_str()?.to_string()), - None => None, + let meta = ObjectVersionMetaInner { + headers, + checksum: expected_checksums.extra, }; + // Determine whether object should be encrypted, and if so the key + let encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?; + let stream = body_stream(req.into_body()); let res = save_stream( &ctx, - headers, + meta, encryption, stream, key, - content_md5, - content_sha256, + ChecksumMode::Verify(&expected_checksums), ) .await?; @@ -83,17 +93,17 @@ pub async fn handle_put( .header("x-amz-version-id", hex::encode(res.version_uuid)) .header("ETag", format!("\"{}\"", res.etag)); encryption.add_response_headers(&mut resp); + let resp = add_checksum_response_headers(&expected_checksums.extra, resp); Ok(resp.body(empty_body())?) 
} pub(crate) async fn save_stream> + Unpin>( ctx: &ReqCtx, - headers: ObjectVersionHeaders, + mut meta: ObjectVersionMetaInner, encryption: EncryptionParams, body: S, key: &String, - content_md5: Option, - content_sha256: Option, + checksum_mode: ChecksumMode<'_>, ) -> Result { let ReqCtx { garage, bucket_id, .. @@ -107,32 +117,36 @@ pub(crate) async fn save_stream> + Unpin>( let first_block = first_block_opt.unwrap_or_default(); - let object_encryption = encryption.encrypt_headers(headers)?; - // Generate identity of new version let version_uuid = gen_uuid(); let version_timestamp = next_timestamp(existing_object.as_ref()); + let mut checksummer = match checksum_mode { + ChecksumMode::Verify(expected) => Checksummer::init(expected, !encryption.is_encrypted()), + ChecksumMode::Calculate(algo) => { + Checksummer::init(&Default::default(), !encryption.is_encrypted()).add(algo) + } + }; + // If body is small enough, store it directly in the object table // as "inline data". We can then return immediately. 
if first_block.len() < INLINE_THRESHOLD { - let mut md5sum = Md5::new(); - md5sum.update(&first_block[..]); - let data_md5sum = md5sum.finalize(); - - let data_sha256sum = sha256sum(&first_block[..]); + checksummer.update(&first_block); + let checksums = checksummer.finalize(); - ensure_checksum_matches( - &data_md5sum, - data_sha256sum, - content_md5.as_deref(), - content_sha256, - )?; + match checksum_mode { + ChecksumMode::Verify(expected) => { + checksums.verify(&expected)?; + } + ChecksumMode::Calculate(algo) => { + meta.checksum = checksums.extract(algo); + } + }; let size = first_block.len() as u64; check_quotas(ctx, size, existing_object.as_ref()).await?; - let etag = encryption.etag_from_md5(&data_md5sum); + let etag = encryption.etag_from_md5(&checksums.md5); let inline_data = encryption.encrypt_blob(&first_block)?.to_vec(); let object_version = ObjectVersion { @@ -140,7 +154,7 @@ pub(crate) async fn save_stream> + Unpin>( timestamp: version_timestamp, state: ObjectVersionState::Complete(ObjectVersionData::Inline( ObjectVersionMeta { - encryption: object_encryption, + encryption: encryption.encrypt_meta(meta)?, size, etag: etag.clone(), }, @@ -175,7 +189,8 @@ pub(crate) async fn save_stream> + Unpin>( uuid: version_uuid, timestamp: version_timestamp, state: ObjectVersionState::Uploading { - encryption: object_encryption.clone(), + encryption: encryption.encrypt_meta(meta.clone())?, + checksum_algorithm: None, // don't care; overwritten later multipart: false, }, }; @@ -196,25 +211,37 @@ pub(crate) async fn save_stream> + Unpin>( ); garage.version_table.insert(&version).await?; - // Transfer data and verify checksum - let (total_size, data_md5sum, data_sha256sum, first_block_hash) = - read_and_put_blocks(ctx, &version, encryption, 1, first_block, &mut chunker).await?; + // Transfer data + let (total_size, checksums, first_block_hash) = read_and_put_blocks( + ctx, + &version, + encryption, + 1, + first_block, + &mut chunker, + checksummer, + ) + .await?; - 
ensure_checksum_matches( - &data_md5sum, - data_sha256sum, - content_md5.as_deref(), - content_sha256, - )?; + // Verify checksums are ok / add calculated checksum to metadata + match checksum_mode { + ChecksumMode::Verify(expected) => { + checksums.verify(&expected)?; + } + ChecksumMode::Calculate(algo) => { + meta.checksum = checksums.extract(algo); + } + }; + // Verify quotas are respsected check_quotas(ctx, total_size, existing_object.as_ref()).await?; // Save final object state, marked as Complete - let etag = encryption.etag_from_md5(&data_md5sum); + let etag = encryption.etag_from_md5(&checksums.md5); object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( ObjectVersionMeta { - encryption: object_encryption, + encryption: encryption.encrypt_meta(meta)?, size: total_size, etag: etag.clone(), }, @@ -234,33 +261,6 @@ pub(crate) async fn save_stream> + Unpin>( }) } -/// Validate MD5 sum against content-md5 header -/// and sha256sum against signed content-sha256 -pub(crate) fn ensure_checksum_matches( - data_md5sum: &[u8], - data_sha256sum: garage_util::data::FixedBytes32, - content_md5: Option<&str>, - content_sha256: Option, -) -> Result<(), Error> { - if let Some(expected_sha256) = content_sha256 { - if expected_sha256 != data_sha256sum { - return Err(Error::bad_request( - "Unable to validate x-amz-content-sha256", - )); - } else { - trace!("Successfully validated x-amz-content-sha256"); - } - } - if let Some(expected_md5) = content_md5 { - if expected_md5.trim_matches('"') != BASE64_STANDARD.encode(data_md5sum) { - return Err(Error::bad_request("Unable to validate content-md5")); - } else { - trace!("Successfully validated content-md5"); - } - } - Ok(()) -} - /// Check that inserting this object with this size doesn't exceed bucket quotas pub(crate) async fn check_quotas( ctx: &ReqCtx, @@ -332,7 +332,8 @@ pub(crate) async fn read_and_put_blocks> + part_number: u64, first_block: Bytes, chunker: &mut StreamChunker, -) -> Result<(u64, 
GenericArray, Hash, Hash), Error> { + checksummer: Checksummer, +) -> Result<(u64, Checksums, Hash), Error> { let tracer = opentelemetry::global::tracer("garage"); let (block_tx, mut block_rx) = mpsc::channel::>(2); @@ -360,20 +361,20 @@ pub(crate) async fn read_and_put_blocks> + let (block_tx2, mut block_rx2) = mpsc::channel::>(1); let hash_stream = async { - let md5hasher = AsyncHasher::::new(); - let sha256hasher = AsyncHasher::::new(); + let mut checksummer = checksummer; while let Some(next) = block_rx.recv().await { match next { Ok(block) => { block_tx2.send(Ok(block.clone())).await?; - futures::future::join( - md5hasher.update(block.clone()), - sha256hasher.update(block.clone()), - ) + checksummer = tokio::task::spawn_blocking(move || { + checksummer.update(&block); + checksummer + }) .with_context(Context::current_with_span( tracer.start("Hash block (md5, sha256)"), )) - .await; + .await + .unwrap() } Err(e) => { block_tx2.send(Err(e)).await?; @@ -382,10 +383,7 @@ pub(crate) async fn read_and_put_blocks> + } } drop(block_tx2); - Ok::<_, mpsc::error::SendError<_>>(futures::join!( - md5hasher.finalize(), - sha256hasher.finalize() - )) + Ok::<_, mpsc::error::SendError<_>>(checksummer) }; let (block_tx3, mut block_rx3) = mpsc::channel::>(1); @@ -395,33 +393,28 @@ pub(crate) async fn read_and_put_blocks> + match next { Ok(block) => { let unencrypted_len = block.len() as u64; - let block = if encryption.is_encrypted() { - let res = - tokio::task::spawn_blocking(move || encryption.encrypt_block(block)) - .with_context(Context::current_with_span( - tracer.start("Encrypt block"), - )) - .await - .unwrap(); - match res { - Ok(b) => b, - Err(e) => { - block_tx3.send(Err(e)).await?; - break; + let res = tokio::task::spawn_blocking(move || { + let block = encryption.encrypt_block(block)?; + let hash = blake2sum(&block); + Ok((block, hash)) + }) + .with_context(Context::current_with_span( + tracer.start("Encrypt and hash (blake2) block"), + )) + .await + .unwrap(); + 
match res { + Ok((block, hash)) => { + if first_block_hash.is_none() { + first_block_hash = Some(hash); } + block_tx3.send(Ok((block, unencrypted_len, hash))).await?; + } + Err(e) => { + block_tx3.send(Err(e)).await?; + break; } - } else { - block - }; - let hash = async_blake2sum(block.clone()) - .with_context(Context::current_with_span( - tracer.start("Hash block (blake2)"), - )) - .await; - if first_block_hash.is_none() { - first_block_hash = Some(hash); } - block_tx3.send(Ok((block, unencrypted_len, hash))).await?; } Err(e) => { block_tx3.send(Err(e)).await?; @@ -493,12 +486,10 @@ pub(crate) async fn read_and_put_blocks> + let total_size = final_result?; // unwrap here is ok, because if hasher failed, it is because something failed // later in the pipeline which already caused a return at the ? on previous line - let (data_md5sum, data_sha256sum) = stream_hash_result.unwrap(); let first_block_hash = block_hash_result.unwrap(); + let checksums = stream_hash_result.unwrap().finalize(); - let data_sha256sum = Hash::try_from(&data_sha256sum[..]).unwrap(); - - Ok((total_size, data_md5sum, data_sha256sum, first_block_hash)) + Ok((total_size, checksums, first_block_hash)) } async fn put_block_and_meta( @@ -609,7 +600,7 @@ impl Drop for InterruptedCleanup { // ============ helpers ============ -pub(crate) fn get_headers(headers: &HeaderMap) -> Result { +pub(crate) fn get_headers(headers: &HeaderMap) -> Result { let mut ret = Vec::new(); // Preserve standard headers @@ -637,7 +628,7 @@ pub(crate) fn get_headers(headers: &HeaderMap) -> Result) -> u64 { diff --git a/src/api/s3/xml.rs b/src/api/s3/xml.rs index 06f11288..1e569ade 100644 --- a/src/api/s3/xml.rs +++ b/src/api/s3/xml.rs @@ -131,6 +131,14 @@ pub struct CompleteMultipartUploadResult { pub key: Value, #[serde(rename = "ETag")] pub etag: Value, + #[serde(rename = "ChecksumCRC32")] + pub checksum_crc32: Option, + #[serde(rename = "ChecksumCRC32C")] + pub checksum_crc32c: Option, + #[serde(rename = "ChecksumSHA1")] 
+ pub checksum_sha1: Option, + #[serde(rename = "ChecksumSHA256")] + pub checksum_sha256: Option, } #[derive(Debug, Serialize, PartialEq, Eq)] @@ -197,6 +205,14 @@ pub struct PartItem { pub part_number: IntValue, #[serde(rename = "Size")] pub size: IntValue, + #[serde(rename = "ChecksumCRC32")] + pub checksum_crc32: Option, + #[serde(rename = "ChecksumCRC32C")] + pub checksum_crc32c: Option, + #[serde(rename = "ChecksumSHA1")] + pub checksum_sha1: Option, + #[serde(rename = "ChecksumSHA256")] + pub checksum_sha256: Option, } #[derive(Debug, Serialize, PartialEq, Eq)] @@ -500,6 +516,10 @@ mod tests { bucket: Value("mybucket".to_string()), key: Value("a/plop".to_string()), etag: Value("\"3858f62230ac3c915f300c664312c11f-9\"".to_string()), + checksum_crc32: None, + checksum_crc32c: None, + checksum_sha1: Some(Value("ZJAnHyG8PeKz9tI8UTcHrJos39A=".into())), + checksum_sha256: None, }; assert_eq!( to_xml_with_header(&result)?, @@ -509,6 +529,7 @@ mod tests { mybucket\ a/plop\ "3858f62230ac3c915f300c664312c11f-9"\ + ZJAnHyG8PeKz9tI8UTcHrJos39A=\ " ); Ok(()) @@ -780,12 +801,22 @@ mod tests { last_modified: Value("2010-11-10T20:48:34.000Z".to_string()), part_number: IntValue(2), size: IntValue(10485760), + checksum_crc32: None, + checksum_crc32c: None, + checksum_sha256: Some(Value( + "5RQ3A5uk0w7ojNjvegohch4JRBBGN/cLhsNrPzfv/hA=".into(), + )), + checksum_sha1: None, }, PartItem { etag: Value("\"aaaa18db4cc2f85cedef654fccc4a4x8\"".to_string()), last_modified: Value("2010-11-10T20:48:33.000Z".to_string()), part_number: IntValue(3), size: IntValue(10485760), + checksum_sha256: None, + checksum_crc32c: None, + checksum_crc32: Some(Value("ZJAnHyG8=".into())), + checksum_sha1: None, }, ], initiator: Initiator { @@ -820,12 +851,14 @@ mod tests { 2010-11-10T20:48:34.000Z\ 2\ 10485760\ + 5RQ3A5uk0w7ojNjvegohch4JRBBGN/cLhsNrPzfv/hA=\ \ \ "aaaa18db4cc2f85cedef654fccc4a4x8"\ 2010-11-10T20:48:33.000Z\ 3\ 10485760\ + ZJAnHyG8=\ \ \ umat-user-11116a31-17b5-4fb7-9df5-b288870f11xx\ diff 
--git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 53449a1c..17da68f8 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -42,6 +42,7 @@ tracing.workspace = true tracing-subscriber.workspace = true rand.workspace = true async-trait.workspace = true +sha1.workspace = true sodiumoxide.workspace = true structopt.workspace = true git-version.workspace = true diff --git a/src/garage/tests/s3/multipart.rs b/src/garage/tests/s3/multipart.rs index 51c9df74..cc424f59 100644 --- a/src/garage/tests/s3/multipart.rs +++ b/src/garage/tests/s3/multipart.rs @@ -1,6 +1,7 @@ use crate::common; use aws_sdk_s3::primitives::ByteStream; -use aws_sdk_s3::types::{CompletedMultipartUpload, CompletedPart}; +use aws_sdk_s3::types::{ChecksumAlgorithm, CompletedMultipartUpload, CompletedPart}; +use base64::prelude::*; const SZ_5MB: usize = 5 * 1024 * 1024; const SZ_10MB: usize = 10 * 1024 * 1024; @@ -189,6 +190,153 @@ async fn test_multipart_upload() { } } +#[tokio::test] +async fn test_multipart_with_checksum() { + let ctx = common::context(); + let bucket = ctx.create_bucket("testmpu-cksum"); + + let u1 = vec![0x11; SZ_5MB]; + let u2 = vec![0x22; SZ_5MB]; + let u3 = vec![0x33; SZ_5MB]; + + let ck1 = calculate_sha1(&u1); + let ck2 = calculate_sha1(&u2); + let ck3 = calculate_sha1(&u3); + + let up = ctx + .client + .create_multipart_upload() + .bucket(&bucket) + .checksum_algorithm(ChecksumAlgorithm::Sha1) + .key("a") + .send() + .await + .unwrap(); + assert!(up.upload_id.is_some()); + + let uid = up.upload_id.as_ref().unwrap(); + + let p1 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number(1) + .checksum_sha1(&ck1) + .body(ByteStream::from(u1.clone())) + .send() + .await + .unwrap(); + + // wrong checksum value should return an error + let err1 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number(2) + .checksum_sha1(&ck1) + .body(ByteStream::from(u2.clone())) + .send() + .await; + 
assert!(err1.is_err()); + + let p2 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number(2) + .checksum_sha1(&ck2) + .body(ByteStream::from(u2)) + .send() + .await + .unwrap(); + + let p3 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number(3) + .checksum_sha1(&ck3) + .body(ByteStream::from(u3.clone())) + .send() + .await + .unwrap(); + + { + let r = ctx + .client + .list_parts() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .send() + .await + .unwrap(); + let parts = r.parts.unwrap(); + assert_eq!(parts.len(), 3); + assert!(parts[0].checksum_crc32.is_none()); + assert!(parts[0].checksum_crc32_c.is_none()); + assert!(parts[0].checksum_sha256.is_none()); + assert_eq!(parts[0].checksum_sha1.as_deref().unwrap(), ck1); + assert_eq!(parts[1].checksum_sha1.as_deref().unwrap(), ck2); + assert_eq!(parts[2].checksum_sha1.as_deref().unwrap(), ck3); + } + + let cmp = CompletedMultipartUpload::builder() + .parts( + CompletedPart::builder() + .part_number(1) + .checksum_sha1(&ck1) + .e_tag(p1.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(2) + .checksum_sha1(&ck2) + .e_tag(p2.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(3) + .checksum_sha1(&ck3) + .e_tag(p3.e_tag.unwrap()) + .build(), + ) + .build(); + + let expected_checksum = calculate_sha1( + &vec![ + BASE64_STANDARD.decode(&ck1).unwrap(), + BASE64_STANDARD.decode(&ck2).unwrap(), + BASE64_STANDARD.decode(&ck3).unwrap(), + ] + .concat(), + ); + + let res = ctx + .client + .complete_multipart_upload() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .checksum_sha1(expected_checksum.clone()) + .multipart_upload(cmp) + .send() + .await + .unwrap(); + + assert_eq!(res.checksum_sha1, Some(expected_checksum)); +} + #[tokio::test] async fn test_uploadlistpart() { let ctx = common::context(); @@ -624,3 +772,11 @@ async fn test_uploadpartcopy() { 
assert_eq!(real_obj.len(), exp_obj.len()); assert_eq!(real_obj, exp_obj); } + +fn calculate_sha1(bytes: &[u8]) -> String { + use sha1::{Digest, Sha1}; + + let mut hasher = Sha1::new(); + hasher.update(bytes); + BASE64_STANDARD.encode(&hasher.finalize()[..]) +} diff --git a/src/model/s3/mpu_table.rs b/src/model/s3/mpu_table.rs index 238cbf11..c9f79caf 100644 --- a/src/model/s3/mpu_table.rs +++ b/src/model/s3/mpu_table.rs @@ -17,6 +17,7 @@ pub const PARTS: &str = "parts"; pub const BYTES: &str = "bytes"; mod v09 { + use crate::s3::object_table::ChecksumValue; use garage_util::crdt; use garage_util::data::Uuid; use serde::{Deserialize, Serialize}; @@ -61,6 +62,9 @@ mod v09 { pub version: Uuid, /// ETag of the content of this part (known only once done uploading) pub etag: Option, + /// Checksum requested by x-amz-checksum-algorithm + #[serde(default)] + pub checksum: Option, /// Size of this part (known only once done uploading) pub size: Option, } @@ -155,6 +159,11 @@ impl Crdt for MpuPart { (Some(x), Some(y)) if x < y => other.size, (x, _) => x, }; + self.checksum = match (self.checksum.take(), &other.checksum) { + (None, Some(_)) => other.checksum.clone(), + (Some(x), Some(y)) if x < *y => other.checksum.clone(), + (x, _) => x, + }; } } diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index eedb9615..b2f25803 100644 --- a/src/model/s3/object_table.rs +++ b/src/model/s3/object_table.rs @@ -208,6 +208,8 @@ mod v010 { Uploading { /// Indicates whether this is a multipart upload multipart: bool, + /// Checksum algorithm to use + checksum_algorithm: Option, /// Encryption params + headers to be included in the final object encryption: ObjectVersionEncryption, }, @@ -247,10 +249,10 @@ mod v010 { #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] pub enum ObjectVersionEncryption { SseC { - /// Encrypted serialized ObjectVersionHeaders struct. + /// Encrypted serialized ObjectVersionInner struct. 
/// This is never compressed, just encrypted using AES256-GCM. #[serde(with = "serde_bytes")] - headers: Vec, + inner: Vec, /// Whether data blocks are compressed in addition to being encrypted /// (compression happens before encryption, whereas for non-encrypted /// objects, compression is handled at the level of the block manager) @@ -258,13 +260,35 @@ mod v010 { }, Plaintext { /// Plain-text headers - headers: ObjectVersionHeaders, + inner: ObjectVersionMetaInner, }, } /// Vector of headers, as tuples of the format (header name, header value) #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] - pub struct ObjectVersionHeaders(pub Vec<(String, String)>); + pub struct ObjectVersionMetaInner { + pub headers: HeaderList, + pub checksum: Option, + } + + pub type HeaderList = Vec<(String, String)>; + + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub enum ChecksumAlgorithm { + Crc32, + Crc32c, + Sha1, + Sha256, + } + + /// Checksum value for x-amz-checksum-algorithm + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub enum ChecksumValue { + Crc32(#[serde(with = "serde_bytes")] [u8; 4]), + Crc32c(#[serde(with = "serde_bytes")] [u8; 4]), + Sha1(#[serde(with = "serde_bytes")] [u8; 20]), + Sha256(#[serde(with = "serde_bytes")] [u8; 32]), + } impl garage_util::migrate::Migrate for Object { const VERSION_MARKER: &'static [u8] = b"G010s3ob"; @@ -288,6 +312,7 @@ mod v010 { v09::ObjectVersionState::Uploading { multipart, headers } => { ObjectVersionState::Uploading { multipart, + checksum_algorithm: None, encryption: migrate_headers(headers), } } @@ -331,15 +356,18 @@ mod v010 { } ObjectVersionEncryption::Plaintext { - headers: ObjectVersionHeaders(new_headers), + inner: ObjectVersionMetaInner { + headers: new_headers, + checksum: None, + }, } } // Since ObjectVersionHeaders can now be serialized independently, for the // purpose of being encrypted, we need it to 
support migrations on its own // as well. - impl garage_util::migrate::InitialFormat for ObjectVersionHeaders { - const VERSION_MARKER: &'static [u8] = b"G010s3oh"; + impl garage_util::migrate::InitialFormat for ObjectVersionMetaInner { + const VERSION_MARKER: &'static [u8] = b"G010s3om"; } } @@ -454,6 +482,17 @@ impl Entry for Object { } } +impl ChecksumValue { + pub fn algorithm(&self) -> ChecksumAlgorithm { + match self { + ChecksumValue::Crc32(_) => ChecksumAlgorithm::Crc32, + ChecksumValue::Crc32c(_) => ChecksumAlgorithm::Crc32c, + ChecksumValue::Sha1(_) => ChecksumAlgorithm::Sha1, + ChecksumValue::Sha256(_) => ChecksumAlgorithm::Sha256, + } + } +} + impl Crdt for Object { fn merge(&mut self, other: &Self) { // Merge versions from other into here diff --git a/src/util/async_hash.rs b/src/util/async_hash.rs deleted file mode 100644 index 5631ea6b..00000000 --- a/src/util/async_hash.rs +++ /dev/null @@ -1,61 +0,0 @@ -use bytes::Bytes; -use digest::Digest; - -use tokio::sync::mpsc; -use tokio::task::JoinHandle; - -use crate::data::*; - -/// Compute the sha256 of a slice, -/// spawning on a tokio thread for CPU-intensive processing -/// The argument has to be an owned Bytes, as it is moved out to a new thread. -pub async fn async_sha256sum(data: Bytes) -> Hash { - tokio::task::spawn_blocking(move || sha256sum(&data)) - .await - .unwrap() -} - -/// Compute the blake2sum of a slice, -/// spawning on a tokio thread for CPU-intensive processing. -/// The argument has to be an owned Bytes, as it is moved out to a new thread. 
-pub async fn async_blake2sum(data: Bytes) -> Hash { - tokio::task::spawn_blocking(move || blake2sum(&data)) - .await - .unwrap() -} - -// ---- - -pub struct AsyncHasher { - sendblk: mpsc::Sender, - task: JoinHandle>, -} - -impl AsyncHasher { - pub fn new() -> Self { - let (sendblk, mut recvblk) = mpsc::channel::(1); - let task = tokio::task::spawn_blocking(move || { - let mut digest = D::new(); - while let Some(blk) = recvblk.blocking_recv() { - digest.update(&blk[..]); - } - digest.finalize() - }); - Self { sendblk, task } - } - - pub async fn update(&self, b: Bytes) { - self.sendblk.send(b).await.unwrap(); - } - - pub async fn finalize(self) -> digest::Output { - drop(self.sendblk); - self.task.await.unwrap() - } -} - -impl Default for AsyncHasher { - fn default() -> Self { - Self::new() - } -} diff --git a/src/util/lib.rs b/src/util/lib.rs index 7df77959..8b035ff0 100644 --- a/src/util/lib.rs +++ b/src/util/lib.rs @@ -3,7 +3,6 @@ #[macro_use] extern crate tracing; -pub mod async_hash; pub mod background; pub mod config; pub mod crdt; -- cgit v1.2.3 From 51d11b4b269dbe0dd207a307ddac3811a4cd5079 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 27 Mar 2024 10:04:35 +0100 Subject: [next-0.10] doc: 2 changes - rewrite section on encryption to mention SSE-C - change to real-world to make it closer to main branch --- doc/book/cookbook/encryption.md | 51 ++++++++++++++++++++++++++++++----------- doc/book/cookbook/real-world.md | 11 +++++---- 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/doc/book/cookbook/encryption.md b/doc/book/cookbook/encryption.md index 21a5cbc6..bfbea0ec 100644 --- a/doc/book/cookbook/encryption.md +++ b/doc/book/cookbook/encryption.md @@ -53,20 +53,43 @@ and that's also why your nodes have super long identifiers. Adding TLS support built into Garage is not currently planned. 
-## Garage stores data in plain text on the filesystem - -Garage does not handle data encryption at rest by itself, and instead delegates -to the user to add encryption, either at the storage layer (LUKS, etc) or on -the client side (or both). There are no current plans to add data encryption -directly in Garage. - -Implementing data encryption directly in Garage might make things simpler for -end users, but also raises many more questions, especially around key -management: for encryption of data, where could Garage get the encryption keys -from ? If we encrypt data but keep the keys in a plaintext file next to them, -it's useless. We probably don't want to have to manage secrets in garage as it -would be very hard to do in a secure way. Maybe integrate with an external -system such as Hashicorp Vault? +## Garage stores data in plain text on the filesystem or encrypted using customer keys (SSE-C) + +For standard S3 API requests, Garage does not encrypt data at rest by itself. +For the most generic at rest encryption of data, we recommend setting up your +storage partitions on encrypted LUKS devices. + +If you are developing your own client software that makes use of S3 storage, +we recommend implementing data encryption directly on the client side and never +transmitting plaintext data to Garage. This makes it easy to use an external +untrusted storage provider if necessary. + +Garage does support [SSE-C +encryption](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html), +an encryption mode of Amazon S3 where data is encrypted at rest using +encryption keys given by the client. The encryption keys are passed to the +server in a header in each request, to encrypt or decrypt data at the moment of +reading or writing. The server discards the key as soon as it has finished +using it for the request. This mode allows the data to be encrypted at rest by +Garage itself, but it requires support in the client software.
It is also not +adapted to a model where the server is not trusted or assumed to be +compromised, as the server can easily know the encryption keys. Note however +that when using SSE-C encryption, the only Garage node that knows the +encryption key passed in a given request is the node to which the request is +directed (which can be a gateway node), so it is easy to have untrusted nodes +in the cluster as long as S3 API requests containing SSE-C encryption keys are +not directed to them. + +Implementing automatic data encryption directly in Garage without client-side +management of keys (something like +[SSE-S3](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingServerSideEncryption.html)) +could make things simpler for end users that don't want to set up LUKS, but also +raises many more questions, especially around key management: for encryption of +data, where could Garage get the encryption keys from? If we encrypt data but +keep the keys in a plaintext file next to them, it's useless. We probably don't +want to have to manage secrets in Garage as it would be very hard to do in a +secure way. At the time of writing, there are no plans to implement this in +Garage. # Adding data encryption using external tools diff --git a/doc/book/cookbook/real-world.md b/doc/book/cookbook/real-world.md index cd42bb0c..59a5a7b6 100644 --- a/doc/book/cookbook/real-world.md +++ b/doc/book/cookbook/real-world.md @@ -68,11 +68,6 @@ to store 2 TB of data in total. EXT4 is not recommended as it has more strict limitations on the number of inodes, which might cause issues with Garage when large numbers of objects are stored. -- If you only have an HDD and no SSD, it's fine to put your metadata alongside the data - on the same drive. Having lots of RAM for your kernel to cache the metadata will - help a lot with performance. The default LMDB database engine is the most tested - and has good performance.
- - Servers with multiple HDDs are supported natively by Garage without resorting to RAID, see [our dedicated documentation page](@/documentation/operations/multi-hdd.md). @@ -92,6 +87,12 @@ to store 2 TB of data in total. and 2/ LMDB is not suited for 32-bit platforms. Sqlite is a viable alternative if any of these are of concern. +- If you only have an HDD and no SSD, it's fine to put your metadata alongside + the data on the same drive, but then consider your filesystem choice wisely + (see above). Having lots of RAM for your kernel to cache the metadata will + help a lot with performance. The default LMDB database engine is the most + tested and has good performance. + ## Get a Docker image Our docker image is currently named `dxflrs/garage` and is stored on the [Docker Hub](https://hub.docker.com/r/dxflrs/garage/tags?page=1&ordering=last_updated). -- cgit v1.2.3 From c0eeb0b0f32ed0a27cfdf9297d0e71e1b9948b73 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 27 Mar 2024 10:44:03 +0100 Subject: [next-0.10] fixes to k2v rpc + comment fixes --- src/model/k2v/rpc.rs | 13 ++++--------- src/model/s3/object_table.rs | 2 +- src/rpc/layout/helper.rs | 4 ++++ src/rpc/layout/history.rs | 6 ++++++ 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs index e15f2df8..95ff2d18 100644 --- a/src/model/k2v/rpc.rs +++ b/src/model/k2v/rpc.rs @@ -219,12 +219,11 @@ impl K2VRpcHandler { }, sort_key, }; - // TODO figure this out with write sets, is it still appropriate??? 
let nodes = self .item_table .data .replication - .read_nodes(&poll_key.partition.hash()); + .storage_nodes(&poll_key.partition.hash()); let rpc = self.system.rpc_helper().try_call_many( &self.endpoint, @@ -239,8 +238,7 @@ impl K2VRpcHandler { .send_all_at_once(true) .without_timeout(), ); - let timeout_duration = - Duration::from_millis(timeout_msec) + self.system.rpc_helper().rpc_timeout(); + let timeout_duration = Duration::from_millis(timeout_msec); let resps = select! { r = rpc => r?, _ = tokio::time::sleep(timeout_duration) => return Ok(None), @@ -282,12 +280,11 @@ impl K2VRpcHandler { seen.restrict(&range); // Prepare PollRange RPC to send to the storage nodes responsible for the parititon - // TODO figure this out with write sets, does it still work???? let nodes = self .item_table .data .replication - .read_nodes(&range.partition.hash()); + .storage_nodes(&range.partition.hash()); let quorum = self.item_table.data.replication.read_quorum(); let msg = K2VRpc::PollRange { range, @@ -320,9 +317,7 @@ impl K2VRpcHandler { // kind: all items produced by that node until time ts have been returned, so we can // bump the entry in the global vector clock and possibly remove some item-specific // vector clocks) - let mut deadline = Instant::now() - + Duration::from_millis(timeout_msec) - + self.system.rpc_helper().rpc_timeout(); + let mut deadline = Instant::now() + Duration::from_millis(timeout_msec); let mut resps = vec![]; let mut errors = vec![]; loop { diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index b2f25803..5c721148 100644 --- a/src/model/s3/object_table.rs +++ b/src/model/s3/object_table.rs @@ -363,7 +363,7 @@ mod v010 { } } - // Since ObjectVersionHeaders can now be serialized independently, for the + // Since ObjectVersionMetaInner can now be serialized independently, for the // purpose of being encrypted, we need it to support migrations on its own // as well. 
impl garage_util::migrate::InitialFormat for ObjectVersionMetaInner { diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index 2835347a..e3096945 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -153,10 +153,14 @@ impl LayoutHelper { // ------------------ read helpers --------------- + /// Return all nodes that have a role (gateway or storage) + /// in one of the currently active layout versions pub fn all_nodes(&self) -> &[Uuid] { &self.all_nodes } + /// Return all nodes that are configured to store data + /// in one of the currently active layout versions pub fn all_nongateway_nodes(&self) -> &[Uuid] { &self.all_nongateway_nodes } diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs index 290f058d..af2cbc63 100644 --- a/src/rpc/layout/history.rs +++ b/src/rpc/layout/history.rs @@ -27,14 +27,18 @@ impl LayoutHistory { // ------------------ who stores what now? --------------- + /// Returns the layout version with the highest number pub fn current(&self) -> &LayoutVersion { self.versions.last().as_ref().unwrap() } + /// Returns the version number of the oldest layout version still active pub fn min_stored(&self) -> u64 { self.versions.first().as_ref().unwrap().version } + /// Calculate the set of all nodes that have a role (gateway or storage) + /// in one of the currently active layout versions pub fn get_all_nodes(&self) -> Vec { if self.versions.len() == 1 { self.versions[0].all_nodes().to_vec() @@ -48,6 +52,8 @@ impl LayoutHistory { } } + /// Calculate the set of all nodes that are configured to store data + /// in one of the currently active layout versions pub(crate) fn get_all_nongateway_nodes(&self) -> Vec { if self.versions.len() == 1 { self.versions[0].nongateway_nodes().to_vec() -- cgit v1.2.3 From 01a0bd54106941156ca998be1a44b8ac2c3aa74a Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 27 Mar 2024 13:32:13 +0100 Subject: [next-0.10] remove impl Deref for LayoutHelper --- src/api/admin/cluster.rs | 
10 ++++----- src/rpc/layout/helper.rs | 54 +++++++++++++++++++++++------------------------ src/rpc/layout/manager.rs | 22 +++++++++---------- src/rpc/rpc_helper.rs | 20 ++++++++---------- src/rpc/system.rs | 6 +++--- 5 files changed, 55 insertions(+), 57 deletions(-) diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 8c9cb1e5..e5877fcd 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -78,7 +78,7 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result) -> Result, Error> { - let res = format_cluster_layout(&garage.system.cluster_layout()); + let res = format_cluster_layout(garage.system.cluster_layout().inner()); Ok(json_ok_response(&res)?) } @@ -295,7 +295,7 @@ pub async fn handle_update_cluster_layout( ) -> Result, Error> { let updates = parse_json_body::(req).await?; - let mut layout = garage.system.cluster_layout().clone(); + let mut layout = garage.system.cluster_layout().inner().clone(); let mut roles = layout.current().roles.clone(); roles.merge(&layout.staging.get().roles); @@ -341,7 +341,7 @@ pub async fn handle_apply_cluster_layout( ) -> Result, Error> { let param = parse_json_body::(req).await?; - let layout = garage.system.cluster_layout().clone(); + let layout = garage.system.cluster_layout().inner().clone(); let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; garage @@ -360,7 +360,7 @@ pub async fn handle_apply_cluster_layout( pub async fn handle_revert_cluster_layout( garage: &Arc, ) -> Result, Error> { - let layout = garage.system.cluster_layout().clone(); + let layout = garage.system.cluster_layout().inner().clone(); let layout = layout.revert_staged_changes()?; garage .system diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index e3096945..ddf8fd44 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -1,5 +1,4 @@ use std::collections::HashMap; -use std::ops::Deref; use std::sync::atomic::{AtomicUsize, Ordering}; use serde::{Deserialize, 
Serialize}; @@ -49,13 +48,6 @@ pub struct LayoutHelper { pub(crate) ack_lock: HashMap, } -impl Deref for LayoutHelper { - type Target = LayoutHistory; - fn deref(&self) -> &LayoutHistory { - self.layout() - } -} - impl LayoutHelper { pub fn new( replication_factor: ReplicationFactor, @@ -131,10 +123,6 @@ impl LayoutHelper { // ------------------ single updating function -------------- - fn layout(&self) -> &LayoutHistory { - self.layout.as_ref().unwrap() - } - pub(crate) fn update(&mut self, f: F) -> bool where F: FnOnce(&mut LayoutHistory) -> bool, @@ -153,6 +141,18 @@ impl LayoutHelper { // ------------------ read helpers --------------- + pub fn inner(&self) -> &LayoutHistory { + self.layout.as_ref().unwrap() + } + + pub fn current(&self) -> &LayoutVersion { + self.inner().current() + } + + pub fn versions(&self) -> &[LayoutVersion] { + &self.inner().versions + } + /// Return all nodes that have a role (gateway or storage) /// in one of the currently active layout versions pub fn all_nodes(&self) -> &[Uuid] { @@ -175,20 +175,19 @@ impl LayoutHelper { pub fn sync_digest(&self) -> SyncLayoutDigest { SyncLayoutDigest { - current: self.layout().current().version, + current: self.current().version, ack_map_min: self.ack_map_min(), - min_stored: self.layout().min_stored(), + min_stored: self.inner().min_stored(), } } pub fn read_nodes_of(&self, position: &Hash) -> Vec { let sync_min = self.sync_map_min; let version = self - .layout() - .versions + .versions() .iter() .find(|x| x.version == sync_min) - .or(self.layout().versions.last()) + .or(self.versions().last()) .unwrap(); version .nodes_of(position, version.replication_factor) @@ -196,8 +195,7 @@ impl LayoutHelper { } pub fn storage_sets_of(&self, position: &Hash) -> Vec> { - self.layout() - .versions + self.versions() .iter() .map(|x| x.nodes_of(position, x.replication_factor).collect()) .collect() @@ -205,7 +203,7 @@ impl LayoutHelper { pub fn storage_nodes_of(&self, position: &Hash) -> Vec { let mut ret = 
vec![]; - for version in self.layout().versions.iter() { + for version in self.versions().iter() { ret.extend(version.nodes_of(position, version.replication_factor)); } ret.sort(); @@ -224,7 +222,7 @@ impl LayoutHelper { pub fn digest(&self) -> RpcLayoutDigest { RpcLayoutDigest { current_version: self.current().version, - active_versions: self.versions.len(), + active_versions: self.versions().len(), trackers_hash: self.trackers_hash, staging_hash: self.staging_hash, } @@ -246,13 +244,16 @@ impl LayoutHelper { // 3. Acknowledge everyone has synced up to min(self.sync_map) self.sync_ack(local_node_id); - debug!("ack_map: {:?}", self.update_trackers.ack_map); - debug!("sync_map: {:?}", self.update_trackers.sync_map); - debug!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); + debug!("ack_map: {:?}", self.inner().update_trackers.ack_map); + debug!("sync_map: {:?}", self.inner().update_trackers.sync_map); + debug!( + "sync_ack_map: {:?}", + self.inner().update_trackers.sync_ack_map + ); } fn sync_first(&mut self, local_node_id: Uuid) { - let first_version = self.min_stored(); + let first_version = self.inner().min_stored(); self.update(|layout| { layout .update_trackers @@ -286,8 +287,7 @@ impl LayoutHelper { } pub(crate) fn max_free_ack(&self) -> u64 { - self.layout() - .versions + self.versions() .iter() .map(|x| x.version) .skip_while(|v| { diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 8a6eb1c3..3866f867 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -109,7 +109,7 @@ impl LayoutManager { } pub fn add_table(&self, table_name: &'static str) { - let first_version = self.layout().versions.first().unwrap().version; + let first_version = self.layout().versions().first().unwrap().version; self.table_sync_version .lock() @@ -127,7 +127,7 @@ impl LayoutManager { if layout.update(|l| l.update_trackers.sync_map.set_max(self.node_id, sync_until)) { info!("sync_until updated to {}", sync_until); 
self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( - layout.update_trackers.clone(), + layout.inner().update_trackers.clone(), )); } } @@ -136,7 +136,7 @@ impl LayoutManager { let mut layout = self.layout.write().unwrap(); if layout.ack_max_free(self.node_id) { self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( - layout.update_trackers.clone(), + layout.inner().update_trackers.clone(), )); } } @@ -160,16 +160,16 @@ impl LayoutManager { fn merge_layout(&self, adv: &LayoutHistory) -> Option { let mut layout = self.layout.write().unwrap(); let prev_digest = layout.digest(); - let prev_layout_check = layout.check().is_ok(); + let prev_layout_check = layout.inner().check().is_ok(); if !prev_layout_check || adv.check().is_ok() { if layout.update(|l| l.merge(adv)) { layout.update_trackers(self.node_id); - if prev_layout_check && layout.check().is_err() { + if prev_layout_check && layout.inner().check().is_err() { panic!("Merged two correct layouts and got an incorrect layout."); } assert!(layout.digest() != prev_digest); - return Some(layout.clone()); + return Some(layout.inner().clone()); } } @@ -180,11 +180,11 @@ impl LayoutManager { let mut layout = self.layout.write().unwrap(); let prev_digest = layout.digest(); - if layout.update_trackers != *adv { + if layout.inner().update_trackers != *adv { if layout.update(|l| l.update_trackers.merge(adv)) { layout.update_trackers(self.node_id); assert!(layout.digest() != prev_digest); - return Some(layout.update_trackers.clone()); + return Some(layout.inner().update_trackers.clone()); } } @@ -230,7 +230,7 @@ impl LayoutManager { /// Save cluster layout data to disk async fn save_cluster_layout(&self) -> Result<(), Error> { - let layout = self.layout.read().unwrap().clone(); + let layout = self.layout.read().unwrap().inner().clone(); self.persist_cluster_layout .save_async(&layout) .await @@ -278,13 +278,13 @@ impl LayoutManager { } pub(crate) fn handle_pull_cluster_layout(&self) -> SystemRpc { - let 
layout = self.layout.read().unwrap().clone(); + let layout = self.layout.read().unwrap().inner().clone(); SystemRpc::AdvertiseClusterLayout(layout) } pub(crate) fn handle_pull_cluster_layout_trackers(&self) -> SystemRpc { let layout = self.layout.read().unwrap(); - SystemRpc::AdvertiseClusterLayoutTrackers(layout.update_trackers.clone()) + SystemRpc::AdvertiseClusterLayoutTrackers(layout.inner().update_trackers.clone()) } pub(crate) async fn handle_advertise_cluster_layout( diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 977c6ed8..05fdcce4 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -26,7 +26,7 @@ use garage_util::data::*; use garage_util::error::Error; use garage_util::metrics::RecordDuration; -use crate::layout::{LayoutHelper, LayoutHistory}; +use crate::layout::{LayoutHelper, LayoutVersion}; use crate::metrics::RpcMetrics; // Default RPC timeout = 5 minutes @@ -304,7 +304,8 @@ impl RpcHelper { // preemptively send an additional request to any remaining nodes. 
// Reorder requests to priorize closeness / low latency - let request_order = self.request_order(&self.0.layout.read().unwrap(), to.iter().copied()); + let request_order = + self.request_order(&self.0.layout.read().unwrap().current(), to.iter().copied()); let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false); // Build future for each request @@ -497,16 +498,16 @@ impl RpcHelper { let mut ret = Vec::with_capacity(12); let ver_iter = layout - .versions + .versions() .iter() .rev() - .chain(layout.old_versions.iter().rev()); + .chain(layout.inner().old_versions.iter().rev()); for ver in ver_iter { if ver.version > layout.sync_map_min() { continue; } let nodes = ver.nodes_of(position, ver.replication_factor); - for node in rpc_helper.request_order(&layout, nodes) { + for node in rpc_helper.request_order(layout.current(), nodes) { if !ret.contains(&node) { ret.push(node); } @@ -517,15 +518,12 @@ impl RpcHelper { fn request_order( &self, - layout: &LayoutHistory, + layout: &LayoutVersion, nodes: impl Iterator, ) -> Vec { // Retrieve some status variables that we will use to sort requests let peer_list = self.0.peering.get_peer_list(); - let our_zone = layout - .current() - .get_node_zone(&self.0.our_node_id) - .unwrap_or(""); + let our_zone = layout.get_node_zone(&self.0.our_node_id).unwrap_or(""); // Augment requests with some information used to sort them. // The tuples are as follows: @@ -535,7 +533,7 @@ impl RpcHelper { // and within a same zone we priorize nodes with the lowest latency. 
let mut nodes = nodes .map(|to| { - let peer_zone = layout.current().get_node_zone(&to).unwrap_or(""); + let peer_zone = layout.get_node_zone(&to).unwrap_or(""); let peer_avg_ping = peer_list .iter() .find(|x| x.id.as_ref() == to.as_slice()) diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 9da1b176..b38e2e01 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -451,7 +451,7 @@ impl System { // Obtain information about nodes that have a role as storage nodes // in one of the active layout versions let mut storage_nodes = HashSet::::with_capacity(16); - for ver in layout.versions.iter() { + for ver in layout.versions().iter() { storage_nodes.extend( ver.roles .items() @@ -470,7 +470,7 @@ impl System { let mut partitions_all_ok = 0; for (_, hash) in partitions.iter() { let mut write_sets = layout - .versions + .versions() .iter() .map(|x| x.nodes_of(hash, x.replication_factor)); let has_quorum = write_sets @@ -634,7 +634,7 @@ impl System { .filter(|p| p.is_up()) .count(); - let not_configured = self.cluster_layout().check().is_err(); + let not_configured = self.cluster_layout().inner().check().is_err(); let no_peers = n_connected < self.replication_factor.into(); let expected_n_nodes = self.cluster_layout().all_nodes().len(); let bad_peers = n_connected != expected_n_nodes; -- cgit v1.2.3 From 32f1786f9ff17f12911f5f3f37e2d1c35d534f59 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 27 Mar 2024 13:37:20 +0100 Subject: [next-0.10] cache layout check result --- src/rpc/layout/helper.rs | 8 ++++++++ src/rpc/layout/manager.rs | 4 ++-- src/rpc/system.rs | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index ddf8fd44..b15f7540 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -41,6 +41,7 @@ pub struct LayoutHelper { trackers_hash: Hash, staging_hash: Hash, + is_check_ok: bool, // ack lock: counts in-progress write operations for each // layout version ; we don't 
increase the ack update tracker @@ -107,6 +108,8 @@ impl LayoutHelper { .entry(layout.current().version) .or_insert(AtomicUsize::new(0)); + let is_check_ok = layout.check().is_ok(); + LayoutHelper { replication_factor, consistency_mode, @@ -118,6 +121,7 @@ impl LayoutHelper { trackers_hash, staging_hash, ack_lock, + is_check_ok, } } @@ -153,6 +157,10 @@ impl LayoutHelper { &self.inner().versions } + pub fn is_check_ok(&self) -> bool { + self.is_check_ok + } + /// Return all nodes that have a role (gateway or storage) /// in one of the currently active layout versions pub fn all_nodes(&self) -> &[Uuid] { diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 3866f867..0ca532ba 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -160,12 +160,12 @@ impl LayoutManager { fn merge_layout(&self, adv: &LayoutHistory) -> Option { let mut layout = self.layout.write().unwrap(); let prev_digest = layout.digest(); - let prev_layout_check = layout.inner().check().is_ok(); + let prev_layout_check = layout.is_check_ok(); if !prev_layout_check || adv.check().is_ok() { if layout.update(|l| l.merge(adv)) { layout.update_trackers(self.node_id); - if prev_layout_check && layout.inner().check().is_err() { + if prev_layout_check && !layout.is_check_ok() { panic!("Merged two correct layouts and got an incorrect layout."); } assert!(layout.digest() != prev_digest); diff --git a/src/rpc/system.rs b/src/rpc/system.rs index b38e2e01..91a42415 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -634,7 +634,7 @@ impl System { .filter(|p| p.is_up()) .count(); - let not_configured = self.cluster_layout().inner().check().is_err(); + let not_configured = !self.cluster_layout().is_check_ok(); let no_peers = n_connected < self.replication_factor.into(); let expected_n_nodes = self.cluster_layout().all_nodes().len(); let bad_peers = n_connected != expected_n_nodes; -- cgit v1.2.3 From 4eba32f29fceea5ab19e44900f8d3a6864989d55 Mon Sep 17 00:00:00 2001 From: 
Alex Auvolat Date: Wed, 27 Mar 2024 13:47:06 +0100 Subject: [next-0.10] layout helper: rename & clarify updates to update trackers --- src/rpc/layout/helper.rs | 57 +++++++++++++++++++---------------------------- src/rpc/layout/manager.rs | 8 +++---- 2 files changed, 27 insertions(+), 38 deletions(-) diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs index b15f7540..3a033ab2 100644 --- a/src/rpc/layout/helper.rs +++ b/src/rpc/layout/helper.rs @@ -238,29 +238,15 @@ impl LayoutHelper { // ------------------ helpers for update tracking --------------- - pub(crate) fn update_trackers(&mut self, local_node_id: Uuid) { + pub(crate) fn update_update_trackers(&mut self, local_node_id: Uuid) { // Ensure trackers for this node's values are up-to-date // 1. Acknowledge the last layout version which is not currently // locked by an in-progress write operation - self.ack_max_free(local_node_id); + self.update_ack_to_max_free(local_node_id); // 2. Assume the data on this node is sync'ed up at least to // the first layout version in the history - self.sync_first(local_node_id); - - // 3. Acknowledge everyone has synced up to min(self.sync_map) - self.sync_ack(local_node_id); - - debug!("ack_map: {:?}", self.inner().update_trackers.ack_map); - debug!("sync_map: {:?}", self.inner().update_trackers.sync_map); - debug!( - "sync_ack_map: {:?}", - self.inner().update_trackers.sync_ack_map - ); - } - - fn sync_first(&mut self, local_node_id: Uuid) { let first_version = self.inner().min_stored(); self.update(|layout| { layout @@ -268,9 +254,8 @@ impl LayoutHelper { .sync_map .set_max(local_node_id, first_version) }); - } - fn sync_ack(&mut self, local_node_id: Uuid) { + // 3. 
Acknowledge everyone has synced up to min(self.sync_map) let sync_map_min = self.sync_map_min; self.update(|layout| { layout @@ -278,24 +263,18 @@ impl LayoutHelper { .sync_ack_map .set_max(local_node_id, sync_map_min) }); - } - pub(crate) fn ack_max_free(&mut self, local_node_id: Uuid) -> bool { - let max_ack = self.max_free_ack(); - let changed = self.update(|layout| { - layout - .update_trackers - .ack_map - .set_max(local_node_id, max_ack) - }); - if changed { - info!("ack_until updated to {}", max_ack); - } - changed + debug!("ack_map: {:?}", self.inner().update_trackers.ack_map); + debug!("sync_map: {:?}", self.inner().update_trackers.sync_map); + debug!( + "sync_ack_map: {:?}", + self.inner().update_trackers.sync_ack_map + ); } - pub(crate) fn max_free_ack(&self) -> u64 { - self.versions() + pub(crate) fn update_ack_to_max_free(&mut self, local_node_id: Uuid) -> bool { + let max_free = self + .versions() .iter() .map(|x| x.version) .skip_while(|v| { @@ -305,6 +284,16 @@ impl LayoutHelper { .unwrap_or(true) }) .next() - .unwrap_or(self.current().version) + .unwrap_or(self.current().version); + let changed = self.update(|layout| { + layout + .update_trackers + .ack_map + .set_max(local_node_id, max_free) + }); + if changed { + info!("ack_until updated to {}", max_free); + } + changed } } diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs index 0ca532ba..a0dcf50e 100644 --- a/src/rpc/layout/manager.rs +++ b/src/rpc/layout/manager.rs @@ -70,7 +70,7 @@ impl LayoutManager { cluster_layout, Default::default(), ); - cluster_layout.update_trackers(node_id.into()); + cluster_layout.update_update_trackers(node_id.into()); let layout = Arc::new(RwLock::new(cluster_layout)); let change_notify = Arc::new(Notify::new()); @@ -134,7 +134,7 @@ impl LayoutManager { fn ack_new_version(self: &Arc) { let mut layout = self.layout.write().unwrap(); - if layout.ack_max_free(self.node_id) { + if layout.update_ack_to_max_free(self.node_id) { 
self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( layout.inner().update_trackers.clone(), )); @@ -164,7 +164,7 @@ impl LayoutManager { if !prev_layout_check || adv.check().is_ok() { if layout.update(|l| l.merge(adv)) { - layout.update_trackers(self.node_id); + layout.update_update_trackers(self.node_id); if prev_layout_check && !layout.is_check_ok() { panic!("Merged two correct layouts and got an incorrect layout."); } @@ -182,7 +182,7 @@ impl LayoutManager { if layout.inner().update_trackers != *adv { if layout.update(|l| l.update_trackers.merge(adv)) { - layout.update_trackers(self.node_id); + layout.update_update_trackers(self.node_id); assert!(layout.digest() != prev_digest); return Some(layout.inner().update_trackers.clone()); } -- cgit v1.2.3 From 25c196f34d958f4f61d50c89a1c5d40b96d7cd24 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 27 Mar 2024 13:55:49 +0100 Subject: [next-0.10] admin api: fix logic in get cluster status --- src/api/admin/cluster.rs | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index e5877fcd..357ac600 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -70,9 +70,7 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result { - if n.role.is_none() { - n.role = Some(role); - } + n.role = Some(role); } } } @@ -81,15 +79,21 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result Date: Thu, 28 Mar 2024 15:19:44 +0100 Subject: [next-0.10] bump version number to 1.0 --- Cargo.lock | 20 +++--- Cargo.nix | 138 ++++++++++++++++++++-------------------- Cargo.toml | 18 +++--- doc/book/cookbook/real-world.md | 10 +-- doc/drafts/admin-api.md | 2 +- script/helm/garage/Chart.yaml | 2 +- src/api/Cargo.toml | 2 +- src/block/Cargo.toml | 2 +- src/db/Cargo.toml | 2 +- src/garage/Cargo.toml | 2 +- src/model/Cargo.toml | 2 +- src/net/Cargo.toml | 2 +- src/net/netapp.rs | 8 ++- src/net/peering.rs | 5 +- 
src/rpc/Cargo.toml | 2 +- src/rpc/system.rs | 2 +- src/table/Cargo.toml | 2 +- src/util/Cargo.toml | 2 +- src/web/Cargo.toml | 2 +- 19 files changed, 113 insertions(+), 112 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c41e0564..b96483dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1304,7 +1304,7 @@ dependencies = [ [[package]] name = "garage" -version = "0.10.0" +version = "1.0.0" dependencies = [ "assert-json-diff", "async-trait", @@ -1360,7 +1360,7 @@ dependencies = [ [[package]] name = "garage_api" -version = "0.10.0" +version = "1.0.0" dependencies = [ "aes-gcm", "argon2", @@ -1415,7 +1415,7 @@ dependencies = [ [[package]] name = "garage_block" -version = "0.10.0" +version = "1.0.0" dependencies = [ "arc-swap", "async-compression", @@ -1442,7 +1442,7 @@ dependencies = [ [[package]] name = "garage_db" -version = "0.10.0" +version = "1.0.0" dependencies = [ "err-derive", "heed", @@ -1456,7 +1456,7 @@ dependencies = [ [[package]] name = "garage_model" -version = "0.10.0" +version = "1.0.0" dependencies = [ "arc-swap", "async-trait", @@ -1486,7 +1486,7 @@ dependencies = [ [[package]] name = "garage_net" -version = "0.10.0" +version = "1.0.0" dependencies = [ "arc-swap", "async-trait", @@ -1512,7 +1512,7 @@ dependencies = [ [[package]] name = "garage_rpc" -version = "0.10.0" +version = "1.0.0" dependencies = [ "arc-swap", "async-trait", @@ -1547,7 +1547,7 @@ dependencies = [ [[package]] name = "garage_table" -version = "0.10.0" +version = "1.0.0" dependencies = [ "arc-swap", "async-trait", @@ -1569,7 +1569,7 @@ dependencies = [ [[package]] name = "garage_util" -version = "0.10.0" +version = "1.0.0" dependencies = [ "arc-swap", "async-trait", @@ -1603,7 +1603,7 @@ dependencies = [ [[package]] name = "garage_web" -version = "0.10.0" +version = "1.0.0" dependencies = [ "err-derive", "futures", diff --git a/Cargo.nix b/Cargo.nix index bb852806..e0189cdd 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -34,7 +34,7 @@ args@{ ignoreLockHash, }: let - nixifiedLockHash = 
"1ef5e578c148e63bdc6491d497aba66b38dcf011779d417228906ce7b19d55f4"; + nixifiedLockHash = "1ccd5eb25a83962821e0e9da4ce6df31717b2b97a5b3a0c80c9e0e0759710143"; workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc; currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock); lockHashIgnored = if ignoreLockHash @@ -58,17 +58,17 @@ in { cargo2nixVersion = "0.11.0"; workspace = { - garage_db = rustPackages.unknown.garage_db."0.10.0"; - garage_util = rustPackages.unknown.garage_util."0.10.0"; - garage_net = rustPackages.unknown.garage_net."0.10.0"; - garage_rpc = rustPackages.unknown.garage_rpc."0.10.0"; + garage_db = rustPackages.unknown.garage_db."1.0.0"; + garage_util = rustPackages.unknown.garage_util."1.0.0"; + garage_net = rustPackages.unknown.garage_net."1.0.0"; + garage_rpc = rustPackages.unknown.garage_rpc."1.0.0"; format_table = rustPackages.unknown.format_table."0.1.1"; - garage_table = rustPackages.unknown.garage_table."0.10.0"; - garage_block = rustPackages.unknown.garage_block."0.10.0"; - garage_model = rustPackages.unknown.garage_model."0.10.0"; - garage_api = rustPackages.unknown.garage_api."0.10.0"; - garage_web = rustPackages.unknown.garage_web."0.10.0"; - garage = rustPackages.unknown.garage."0.10.0"; + garage_table = rustPackages.unknown.garage_table."1.0.0"; + garage_block = rustPackages.unknown.garage_block."1.0.0"; + garage_model = rustPackages.unknown.garage_model."1.0.0"; + garage_api = rustPackages.unknown.garage_api."1.0.0"; + garage_web = rustPackages.unknown.garage_web."1.0.0"; + garage = rustPackages.unknown.garage."1.0.0"; k2v-client = rustPackages.unknown.k2v-client."0.0.4"; }; "registry+https://github.com/rust-lang/crates.io-index".addr2line."0.21.0" = overridableMkRustCrate (profileName: rec { @@ -1910,9 +1910,9 @@ in }; }); - "unknown".garage."0.10.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage."1.0.0" = overridableMkRustCrate (profileName: rec { name = "garage"; - version = 
"0.10.0"; + version = "1.0.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/garage"); features = builtins.concatLists [ @@ -1940,15 +1940,15 @@ in format_table = (rustPackages."unknown".format_table."0.1.1" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out; - garage_api = (rustPackages."unknown".garage_api."0.10.0" { inherit profileName; }).out; - garage_block = (rustPackages."unknown".garage_block."0.10.0" { inherit profileName; }).out; - garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out; - garage_model = (rustPackages."unknown".garage_model."0.10.0" { inherit profileName; }).out; - garage_net = (rustPackages."unknown".garage_net."0.10.0" { inherit profileName; }).out; - garage_rpc = (rustPackages."unknown".garage_rpc."0.10.0" { inherit profileName; }).out; - garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out; - garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out; - garage_web = (rustPackages."unknown".garage_web."0.10.0" { inherit profileName; }).out; + garage_api = (rustPackages."unknown".garage_api."1.0.0" { inherit profileName; }).out; + garage_block = (rustPackages."unknown".garage_block."1.0.0" { inherit profileName; }).out; + garage_db = (rustPackages."unknown".garage_db."1.0.0" { inherit profileName; }).out; + garage_model = (rustPackages."unknown".garage_model."1.0.0" { inherit profileName; }).out; + garage_net = (rustPackages."unknown".garage_net."1.0.0" { inherit profileName; }).out; + garage_rpc = (rustPackages."unknown".garage_rpc."1.0.0" { inherit profileName; }).out; + garage_table = (rustPackages."unknown".garage_table."1.0.0" { inherit profileName; }).out; + garage_util = 
(rustPackages."unknown".garage_util."1.0.0" { inherit profileName; }).out; + garage_web = (rustPackages."unknown".garage_web."1.0.0" { inherit profileName; }).out; git_version = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".git-version."0.3.9" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; sodiumoxide = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".kuska-sodiumoxide."0.2.5-0" { inherit profileName; }).out; @@ -1988,9 +1988,9 @@ in }; }); - "unknown".garage_api."0.10.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_api."1.0.0" = overridableMkRustCrate (profileName: rec { name = "garage_api"; - version = "0.10.0"; + version = "1.0.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/api"); features = builtins.concatLists [ @@ -2014,12 +2014,12 @@ in form_urlencoded = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".form_urlencoded."1.2.1" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out; - garage_block = (rustPackages."unknown".garage_block."0.10.0" { inherit profileName; }).out; - garage_model = (rustPackages."unknown".garage_model."0.10.0" { inherit profileName; }).out; - garage_net = (rustPackages."unknown".garage_net."0.10.0" { inherit profileName; }).out; - garage_rpc = (rustPackages."unknown".garage_rpc."0.10.0" { inherit profileName; }).out; - garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out; - garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out; + garage_block = (rustPackages."unknown".garage_block."1.0.0" { inherit profileName; }).out; 
+ garage_model = (rustPackages."unknown".garage_model."1.0.0" { inherit profileName; }).out; + garage_net = (rustPackages."unknown".garage_net."1.0.0" { inherit profileName; }).out; + garage_rpc = (rustPackages."unknown".garage_rpc."1.0.0" { inherit profileName; }).out; + garage_table = (rustPackages."unknown".garage_table."1.0.0" { inherit profileName; }).out; + garage_util = (rustPackages."unknown".garage_util."1.0.0" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; hmac = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hmac."0.12.1" { inherit profileName; }).out; http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out; @@ -2052,9 +2052,9 @@ in }; }); - "unknown".garage_block."0.10.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_block."1.0.0" = overridableMkRustCrate (profileName: rec { name = "garage_block"; - version = "0.10.0"; + version = "1.0.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/block"); features = builtins.concatLists [ @@ -2068,11 +2068,11 @@ in bytesize = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytesize."1.3.0" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out; - garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out; - garage_net = (rustPackages."unknown".garage_net."0.10.0" { inherit profileName; }).out; - garage_rpc = (rustPackages."unknown".garage_rpc."0.10.0" { inherit profileName; }).out; - garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out; - garage_util = 
(rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out; + garage_db = (rustPackages."unknown".garage_db."1.0.0" { inherit profileName; }).out; + garage_net = (rustPackages."unknown".garage_net."1.0.0" { inherit profileName; }).out; + garage_rpc = (rustPackages."unknown".garage_rpc."1.0.0" { inherit profileName; }).out; + garage_table = (rustPackages."unknown".garage_table."1.0.0" { inherit profileName; }).out; + garage_util = (rustPackages."unknown".garage_util."1.0.0" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out; rand = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.5" { inherit profileName; }).out; @@ -2085,9 +2085,9 @@ in }; }); - "unknown".garage_db."0.10.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_db."1.0.0" = overridableMkRustCrate (profileName: rec { name = "garage_db"; - version = "0.10.0"; + version = "1.0.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/db"); features = builtins.concatLists [ @@ -2114,9 +2114,9 @@ in }; }); - "unknown".garage_model."0.10.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_model."1.0.0" = overridableMkRustCrate (profileName: rec { name = "garage_model"; - version = "0.10.0"; + version = "1.0.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/model"); features = builtins.concatLists [ @@ -2134,12 +2134,12 @@ in err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; futures_util = 
(rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out; - garage_block = (rustPackages."unknown".garage_block."0.10.0" { inherit profileName; }).out; - garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out; - garage_net = (rustPackages."unknown".garage_net."0.10.0" { inherit profileName; }).out; - garage_rpc = (rustPackages."unknown".garage_rpc."0.10.0" { inherit profileName; }).out; - garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out; - garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out; + garage_block = (rustPackages."unknown".garage_block."1.0.0" { inherit profileName; }).out; + garage_db = (rustPackages."unknown".garage_db."1.0.0" { inherit profileName; }).out; + garage_net = (rustPackages."unknown".garage_net."1.0.0" { inherit profileName; }).out; + garage_rpc = (rustPackages."unknown".garage_rpc."1.0.0" { inherit profileName; }).out; + garage_table = (rustPackages."unknown".garage_table."1.0.0" { inherit profileName; }).out; + garage_util = (rustPackages."unknown".garage_util."1.0.0" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out; opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out; @@ -2153,9 +2153,9 @@ in }; }); - "unknown".garage_net."0.10.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_net."1.0.0" = overridableMkRustCrate (profileName: rec { name = "garage_net"; - version = "0.10.0"; + version = "1.0.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/net"); features = builtins.concatLists [ @@ -2190,9 +2190,9 @@ in }; }); - 
"unknown".garage_rpc."0.10.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_rpc."1.0.0" = overridableMkRustCrate (profileName: rec { name = "garage_rpc"; - version = "0.10.0"; + version = "1.0.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/rpc"); features = builtins.concatLists [ @@ -2214,9 +2214,9 @@ in format_table = (rustPackages."unknown".format_table."0.1.1" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out; - garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out; - garage_net = (rustPackages."unknown".garage_net."0.10.0" { inherit profileName; }).out; - garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out; + garage_db = (rustPackages."unknown".garage_db."1.0.0" { inherit profileName; }).out; + garage_net = (rustPackages."unknown".garage_net."1.0.0" { inherit profileName; }).out; + garage_util = (rustPackages."unknown".garage_util."1.0.0" { inherit profileName; }).out; gethostname = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".gethostname."0.4.3" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; itertools = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".itertools."0.12.1" { inherit profileName; }).out; @@ -2238,9 +2238,9 @@ in }; }); - "unknown".garage_table."0.10.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_table."1.0.0" = overridableMkRustCrate (profileName: rec { name = "garage_table"; - version = "0.10.0"; + version = "1.0.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/table"); dependencies = { @@ -2249,9 +2249,9 @@ in 
bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".bytes."1.5.0" { inherit profileName; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; futures_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures-util."0.3.30" { inherit profileName; }).out; - garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out; - garage_rpc = (rustPackages."unknown".garage_rpc."0.10.0" { inherit profileName; }).out; - garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out; + garage_db = (rustPackages."unknown".garage_db."1.0.0" { inherit profileName; }).out; + garage_rpc = (rustPackages."unknown".garage_rpc."1.0.0" { inherit profileName; }).out; + garage_util = (rustPackages."unknown".garage_util."1.0.0" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; hexdump = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hexdump."0.1.1" { inherit profileName; }).out; opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out; @@ -2263,9 +2263,9 @@ in }; }); - "unknown".garage_util."0.10.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_util."1.0.0" = overridableMkRustCrate (profileName: rec { name = "garage_util"; - version = "0.10.0"; + version = "1.0.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/util"); features = builtins.concatLists [ @@ -2281,8 +2281,8 @@ in digest = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".digest."0.10.7" { inherit profileName; }).out; err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out; futures = 
(rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; - garage_db = (rustPackages."unknown".garage_db."0.10.0" { inherit profileName; }).out; - garage_net = (rustPackages."unknown".garage_net."0.10.0" { inherit profileName; }).out; + garage_db = (rustPackages."unknown".garage_db."1.0.0" { inherit profileName; }).out; + garage_net = (rustPackages."unknown".garage_net."1.0.0" { inherit profileName; }).out; hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out; hexdump = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hexdump."0.1.1" { inherit profileName; }).out; http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out; @@ -2307,18 +2307,18 @@ in }; }); - "unknown".garage_web."0.10.0" = overridableMkRustCrate (profileName: rec { + "unknown".garage_web."1.0.0" = overridableMkRustCrate (profileName: rec { name = "garage_web"; - version = "0.10.0"; + version = "1.0.0"; registry = "unknown"; src = fetchCrateLocal (workspaceSrc + "/src/web"); dependencies = { err_derive = (buildRustPackages."registry+https://github.com/rust-lang/crates.io-index".err-derive."0.3.1" { profileName = "__noProfile"; }).out; futures = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".futures."0.3.30" { inherit profileName; }).out; - garage_api = (rustPackages."unknown".garage_api."0.10.0" { inherit profileName; }).out; - garage_model = (rustPackages."unknown".garage_model."0.10.0" { inherit profileName; }).out; - garage_table = (rustPackages."unknown".garage_table."0.10.0" { inherit profileName; }).out; - garage_util = (rustPackages."unknown".garage_util."0.10.0" { inherit profileName; }).out; + garage_api = (rustPackages."unknown".garage_api."1.0.0" { inherit profileName; }).out; + garage_model = (rustPackages."unknown".garage_model."1.0.0" { inherit profileName; 
}).out; + garage_table = (rustPackages."unknown".garage_table."1.0.0" { inherit profileName; }).out; + garage_util = (rustPackages."unknown".garage_util."1.0.0" { inherit profileName; }).out; http = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http."1.0.0" { inherit profileName; }).out; http_body_util = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".http-body-util."0.1.0" { inherit profileName; }).out; hyper = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hyper."1.1.0" { inherit profileName; }).out; diff --git a/Cargo.toml b/Cargo.toml index c0aad2d0..b8840a91 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,15 +21,15 @@ default-members = ["src/garage"] # Internal Garage crates format_table = { version = "0.1.1", path = "src/format-table" } -garage_api = { version = "0.10.0", path = "src/api" } -garage_block = { version = "0.10.0", path = "src/block" } -garage_db = { version = "0.10.0", path = "src/db", default-features = false } -garage_model = { version = "0.10.0", path = "src/model", default-features = false } -garage_net = { version = "0.10.0", path = "src/net" } -garage_rpc = { version = "0.10.0", path = "src/rpc" } -garage_table = { version = "0.10.0", path = "src/table" } -garage_util = { version = "0.10.0", path = "src/util" } -garage_web = { version = "0.10.0", path = "src/web" } +garage_api = { version = "1.0.0", path = "src/api" } +garage_block = { version = "1.0.0", path = "src/block" } +garage_db = { version = "1.0.0", path = "src/db", default-features = false } +garage_model = { version = "1.0.0", path = "src/model", default-features = false } +garage_net = { version = "1.0.0", path = "src/net" } +garage_rpc = { version = "1.0.0", path = "src/rpc" } +garage_table = { version = "1.0.0", path = "src/table" } +garage_util = { version = "1.0.0", path = "src/util" } +garage_web = { version = "1.0.0", path = "src/web" } k2v-client = { version = "0.0.4", path = "src/k2v-client" } # 
External crates from crates.io diff --git a/doc/book/cookbook/real-world.md b/doc/book/cookbook/real-world.md index 59a5a7b6..7dba784d 100644 --- a/doc/book/cookbook/real-world.md +++ b/doc/book/cookbook/real-world.md @@ -96,14 +96,14 @@ to store 2 TB of data in total. ## Get a Docker image Our docker image is currently named `dxflrs/garage` and is stored on the [Docker Hub](https://hub.docker.com/r/dxflrs/garage/tags?page=1&ordering=last_updated). -We encourage you to use a fixed tag (eg. `v0.9.3`) and not the `latest` tag. -For this example, we will use the latest published version at the time of the writing which is `v0.9.3` but it's up to you +We encourage you to use a fixed tag (eg. `v1.0.0`) and not the `latest` tag. +For this example, we will use the latest published version at the time of the writing which is `v1.0.0` but it's up to you to check [the most recent versions on the Docker Hub](https://hub.docker.com/r/dxflrs/garage/tags?page=1&ordering=last_updated). For example: ``` -sudo docker pull dxflrs/garage:v0.9.3 +sudo docker pull dxflrs/garage:v1.0.0 ``` ## Deploying and configuring Garage @@ -169,7 +169,7 @@ docker run \ -v /etc/garage.toml:/etc/garage.toml \ -v /var/lib/garage/meta:/var/lib/garage/meta \ -v /var/lib/garage/data:/var/lib/garage/data \ - dxflrs/garage:v0.9.3 + dxflrs/garage:v1.0.0 ``` With this command line, Garage should be started automatically at each boot. 
@@ -183,7 +183,7 @@ If you want to use `docker-compose`, you may use the following `docker-compose.y version: "3" services: garage: - image: dxflrs/garage:v0.9.3 + image: dxflrs/garage:v1.0.0 network_mode: "host" restart: unless-stopped volumes: diff --git a/doc/drafts/admin-api.md b/doc/drafts/admin-api.md index 40c82f5a..16338194 100644 --- a/doc/drafts/admin-api.md +++ b/doc/drafts/admin-api.md @@ -70,7 +70,7 @@ Example response body: ```json { "node": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df", - "garageVersion": "v0.10.0", + "garageVersion": "v1.0.0", "garageFeatures": [ "k2v", "lmdb", diff --git a/script/helm/garage/Chart.yaml b/script/helm/garage/Chart.yaml index 71906cfb..d5449f28 100644 --- a/script/helm/garage/Chart.yaml +++ b/script/helm/garage/Chart.yaml @@ -21,4 +21,4 @@ version: 0.4.1 # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. 
-appVersion: "v0.9.3" +appVersion: "v1.0.0" diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index 1b87496c..a5645c26 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_api" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/block/Cargo.toml b/src/block/Cargo.toml index b5763120..7eb6bca8 100644 --- a/src/block/Cargo.toml +++ b/src/block/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_block" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml index b88298ee..ef5a8659 100644 --- a/src/db/Cargo.toml +++ b/src/db/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_db" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index a4acbb1f..9cc71abd 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 1e1ce0f7..25926080 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_model" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/net/Cargo.toml b/src/net/Cargo.toml index 4bd0d2e5..c12b39a4 100644 --- a/src/net/Cargo.toml +++ b/src/net/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_net" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/net/netapp.rs b/src/net/netapp.rs index 6480a126..f1e9f1ae 100644 --- a/src/net/netapp.rs +++ b/src/net/netapp.rs @@ -35,8 +35,10 @@ pub type NetworkKey = sodiumoxide::crypto::auth::Key; /// composed 
of 8 bytes for Netapp version and 8 bytes for client version pub(crate) type VersionTag = [u8; 16]; -/// Value of the Netapp version used in the version tag -pub(crate) const NETAPP_VERSION_TAG: u64 = 0x6e65746170700005; // netapp 0x0005 +/// Value of garage_net version used in the version tag +/// We are no longer using prefix `netapp` as garage_net is forked from the netapp crate. +/// Since Garage v1.0, we have replaced the prefix by `grgnet` (shorthand for garage_net). +pub(crate) const NETAPP_VERSION_TAG: u64 = 0x6772676e65740010; // grgnet 0x0010 (1.0) /// HelloMessage is sent by the client on a Netapp connection to indicate /// that they are also a server and ready to recieve incoming connections @@ -123,7 +125,7 @@ impl NetApp { netapp .hello_endpoint - .swap(Some(netapp.endpoint("__netapp/netapp.rs/Hello".into()))); + .swap(Some(netapp.endpoint("garage_net/netapp.rs/Hello".into()))); netapp .hello_endpoint .load_full() diff --git a/src/net/peering.rs b/src/net/peering.rs index b4271231..168162d9 100644 --- a/src/net/peering.rs +++ b/src/net/peering.rs @@ -237,14 +237,13 @@ impl PeeringManager { ); known_hosts.update_hash(); - // TODO for v0.10 / v1.0 : rename the endpoint (it will break compatibility) let strat = Arc::new(Self { netapp: netapp.clone(), known_hosts: RwLock::new(known_hosts), public_peer_list: ArcSwap::new(Arc::new(Vec::new())), next_ping_id: AtomicU64::new(42), - ping_endpoint: netapp.endpoint("__netapp/peering/fullmesh.rs/Ping".into()), - peer_list_endpoint: netapp.endpoint("__netapp/peering/fullmesh.rs/PeerList".into()), + ping_endpoint: netapp.endpoint("garage_net/peering.rs/Ping".into()), + peer_list_endpoint: netapp.endpoint("garage_net/peering.rs/PeerList".into()), ping_timeout_millis: DEFAULT_PING_TIMEOUT_MILLIS.into(), }); diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index 3e7ac635..43d5568e 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_rpc" -version = "0.10.0" +version = 
"1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 91a42415..0e78060b 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -46,7 +46,7 @@ const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10); /// Version tag used for version check upon Netapp connection. /// Cluster nodes with different version tags are deemed /// incompatible and will refuse to connect. -pub const GARAGE_VERSION_TAG: u64 = 0x676172616765000A; // garage 0x000A +pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650010; // garage 0x0010 (1.0) /// RPC endpoint used for calls related to membership pub const SYSTEM_RPC_PATH: &str = "garage_rpc/system.rs/SystemRpc"; diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml index cac17da6..171118ea 100644 --- a/src/table/Cargo.toml +++ b/src/table/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_table" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index e4c31460..883c0aa4 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_util" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/web/Cargo.toml b/src/web/Cargo.toml index 49549c9b..f097755c 100644 --- a/src/web/Cargo.toml +++ b/src/web/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_web" -version = "0.10.0" +version = "1.0.0" authors = ["Alex Auvolat ", "Quentin Dufour "] edition = "2018" license = "AGPL-3.0" -- cgit v1.2.3 From 554437254e8c7e9760a838523a80fb316574e607 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 28 Mar 2024 18:38:35 +0100 Subject: [next-0.10] Add migration guide for v1.0 --- doc/book/working-documents/migration-1.md | 77 +++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 doc/book/working-documents/migration-1.md diff 
--git a/doc/book/working-documents/migration-1.md b/doc/book/working-documents/migration-1.md new file mode 100644 index 00000000..7b650b6c --- /dev/null +++ b/doc/book/working-documents/migration-1.md @@ -0,0 +1,77 @@ ++++ +title = "Migrating from 0.9 to 1.0" +weight = 11 ++++ + +**This guide explains how to migrate to 1.0 if you have an existing 0.9 cluster. +We don't recommend trying to migrate to 1.0 directly from 0.8 or older.** + +This migration procedure has been tested on several clusters without issues. +However, it is still a *critical procedure* that might cause issues. +**Make sure to back up all your data before attempting it!** + +You might also want to read our [general documentation on upgrading Garage](@/documentation/operations/upgrading.md). + +## Changes introduced in v1.0 + +The following are **breaking changes** in Garage v1.0 that require your attention when migrating: + +- The Sled metadata db engine has been **removed**. If your cluster was still + using Sled, you will need to **use a Garage v0.9.x binary** to convert the + database using the `garage convert-db` subcommand. See + [here](@/documentation/reference-manual/configuration/#db_engine) for the + details of the procedure. + +The following syntax changes have been made to the configuration file: + +- The `replication_mode` parameter has been split into two parameters: + [`replication_factor`](@/documentation/reference-manual/configuration/#replication_factor) + and + [`consistency_mode`](@/documentation/reference-manual/configuration/#consistency_mode). + The old syntax using `replication_mode` is still supported for legacy + reasons and can still be used. + +- The parameters `sled_cache_capacity` and `sled_flush_every_ms` have been removed. + +## Migration procedure + +The migration to Garage v1.0 can be done with almost no downtime, +by restarting all nodes at once in the new version. + +The migration steps are as follows: + +1. 
Do a `garage repair --all-nodes --yes tables`, check the logs and check that + all data seems to be synced correctly between nodes. If you have time, do + additional `garage repair` procedures (`blocks`, `versions`, `block_refs`, + etc.) + +2. Ensure you have a snapshot of your Garage installation that you can restore + to in case the upgrade goes wrong: + + - If you are running Garage v0.9.4 or later, use the `garage meta snapshot + --all` command to make a snapshot of the metadata directories of your nodes + for backup purposes, and save a copy of the following files in the + metadata directories of your nodes: `cluster_layout`, `data_layout`, + `node_key`, `node_key.pub`. + + - If you are running a filesystem such as ZFS or BTRFS that supports + snapshotting, you can create a filesystem-level snapshot to be used as a + restoration point if needed. + + - In other cases, make a backup using the old procedure: turn off each node + individually; back up its metadata folder (for instance, use the following + command if your metadata directory is `/var/lib/garage/meta`: `cd + /var/lib/garage ; tar -acf meta-v0.9.tar.zst meta/`); turn it back on + again. This will allow you to take a backup of all nodes without + impacting global cluster availability. You can do all nodes of a single + zone at once as this does not impact the availability of Garage. + +3. Prepare your updated binaries and configuration files for Garage v1.0. + +4. Shut down all v0.9 nodes simultaneously, and restart them all simultaneously + in v1.0. Use your favorite deployment tool (Ansible, Kubernetes, Nomad) to + achieve this as fast as possible. Garage v1.0 should be in a working state + as soon as enough nodes have started. + +5. Monitor your cluster in the following hours to see if it works well under + your production load.
-- cgit v1.2.3 From 50669b3e768f9c3570db0dae60927c16fdba3592 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 3 Apr 2024 14:19:59 +0200 Subject: [next-0.10] bump helm chart version --- script/helm/garage/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/helm/garage/Chart.yaml b/script/helm/garage/Chart.yaml index d5449f28..f3834fdc 100644 --- a/script/helm/garage/Chart.yaml +++ b/script/helm/garage/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.4.1 +version: 0.5.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to -- cgit v1.2.3 From 90e3c2af915251720a4253f78f3f1b4ba844800d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 10 Apr 2024 14:35:30 +0200 Subject: [next-0.10] small updates to mention Garage v0.9.4 --- doc/book/reference-manual/configuration.md | 2 +- doc/book/working-documents/migration-1.md | 2 +- src/db/open.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/book/reference-manual/configuration.md b/doc/book/reference-manual/configuration.md index 3202daf6..49bff05b 100644 --- a/doc/book/reference-manual/configuration.md +++ b/doc/book/reference-manual/configuration.md @@ -300,7 +300,7 @@ Since `v0.8.0`, Garage can use alternative storage backends as follows: | [Sled](https://sled.rs) (old default, removed since `v1.0`) | `"sled"` | `/db/` | Sled was supported until Garage v0.9.x, and was removed in Garage v1.0. -You can still use an older binary of Garage (e.g. v0.9.3) to migrate +You can still use an older binary of Garage (e.g. v0.9.4) to migrate old Sled metadata databases to another engine. 
Performance characteristics of the different DB engines are as follows: diff --git a/doc/book/working-documents/migration-1.md b/doc/book/working-documents/migration-1.md index 7b650b6c..2fb14ef9 100644 --- a/doc/book/working-documents/migration-1.md +++ b/doc/book/working-documents/migration-1.md @@ -44,7 +44,7 @@ The migration steps are as follows: all data seems to be synced correctly between nodes. If you have time, do additional `garage repair` procedures (`blocks`, `versions`, `block_refs`, etc.) - + 2. Ensure you have a snapshot of your Garage installation that you can restore to in case the upgrade goes wrong: diff --git a/src/db/open.rs b/src/db/open.rs index 19bc96cc..b8de3cd7 100644 --- a/src/db/open.rs +++ b/src/db/open.rs @@ -36,7 +36,7 @@ impl std::str::FromStr for Engine { match text { "lmdb" | "heed" => Ok(Self::Lmdb), "sqlite" | "sqlite3" | "rusqlite" => Ok(Self::Sqlite), - "sled" => Err(Error("Sled is no longer supported as a database engine. Converting your old metadata db can be done using an older Garage binary (e.g. v0.9.3).".into())), + "sled" => Err(Error("Sled is no longer supported as a database engine. Converting your old metadata db can be done using an older Garage binary (e.g. v0.9.4).".into())), kind => Err(Error( format!( "Invalid DB engine: {} (options are: lmdb, sqlite)", -- cgit v1.2.3