From c94406f4282d48e2e2ac82ffb57eafaad23f7edc Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 9 Nov 2021 12:24:04 +0100 Subject: Improve how node roles are assigned in Garage - change the terminology: the network configuration becomes the role table, the configuration of a node becomes a node's role - the modification of the role table takes place in two steps: first, changes are staged in a CRDT data structure. Then, once the user is happy with the changes, they can commit them all at once (or revert them). - update documentation - fix tests - implement smarter partition assignment algorithm This patch breaks the format of the network configuration: when migrating, the cluster will be in a state where no roles are assigned. All roles must be re-assigned and committed at once. This migration should not pose an issue. --- src/table/Cargo.toml | 7 +- src/table/crdt/bool.rs | 34 -------- src/table/crdt/crdt.rs | 71 ----------------- src/table/crdt/lww.rs | 114 --------------------------- src/table/crdt/lww_map.rs | 161 -------------------------------------- src/table/crdt/map.rs | 99 ----------------------- src/table/crdt/mod.rs | 23 ------ src/table/lib.rs | 5 +- src/table/replication/fullcopy.rs | 4 +- 9 files changed, 10 insertions(+), 508 deletions(-) delete mode 100644 src/table/crdt/bool.rs delete mode 100644 src/table/crdt/crdt.rs delete mode 100644 src/table/crdt/lww.rs delete mode 100644 src/table/crdt/lww_map.rs delete mode 100644 src/table/crdt/map.rs delete mode 100644 src/table/crdt/mod.rs (limited to 'src/table') diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml index 616bf275..dc37f12c 100644 --- a/src/table/Cargo.toml +++ b/src/table/Cargo.toml @@ -1,11 +1,12 @@ [package] name = "garage_table" -version = "0.4.0" +version = "0.5.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" description = "Table sharding and replication engine (DynamoDB-like) for the Garage object store" repository = 
"https://git.deuxfleurs.fr/Deuxfleurs/garage" +readme = "../../README.md" [lib] path = "lib.rs" @@ -13,8 +14,8 @@ path = "lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -garage_rpc = { version = "0.4.0", path = "../rpc" } -garage_util = { version = "0.4.0", path = "../util" } +garage_rpc = { version = "0.5.0", path = "../rpc" } +garage_util = { version = "0.5.0", path = "../util" } async-trait = "0.1.7" bytes = "1.0" diff --git a/src/table/crdt/bool.rs b/src/table/crdt/bool.rs deleted file mode 100644 index 53af8f82..00000000 --- a/src/table/crdt/bool.rs +++ /dev/null @@ -1,34 +0,0 @@ -use serde::{Deserialize, Serialize}; - -use crate::crdt::crdt::*; - -/// Boolean, where `true` is an absorbing state -#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq)] -pub struct Bool(bool); - -impl Bool { - /// Create a new boolean with the specified value - pub fn new(b: bool) -> Self { - Self(b) - } - /// Set the boolean to true - pub fn set(&mut self) { - self.0 = true; - } - /// Get the boolean value - pub fn get(&self) -> bool { - self.0 - } -} - -impl From for Bool { - fn from(b: bool) -> Bool { - Bool::new(b) - } -} - -impl Crdt for Bool { - fn merge(&mut self, other: &Self) { - self.0 = self.0 || other.0; - } -} diff --git a/src/table/crdt/crdt.rs b/src/table/crdt/crdt.rs deleted file mode 100644 index a8f1b9aa..00000000 --- a/src/table/crdt/crdt.rs +++ /dev/null @@ -1,71 +0,0 @@ -use garage_util::data::*; - -/// Definition of a CRDT - all CRDT Rust types implement this. -/// -/// A CRDT is defined as a merge operator that respects a certain set of axioms. -/// -/// In particular, the merge operator must be commutative, associative, -/// idempotent, and monotonic. 
-/// In other words, if `a`, `b` and `c` are CRDTs, and `⊔` denotes the merge operator, -/// the following axioms must apply: -/// -/// ```text -/// a ⊔ b = b ⊔ a (commutativity) -/// (a ⊔ b) ⊔ c = a ⊔ (b ⊔ c) (associativity) -/// (a ⊔ b) ⊔ b = a ⊔ b (idempotence) -/// ``` -/// -/// Moreover, the relationship `≥` defined by `a ≥ b ⇔ ∃c. a = b ⊔ c` must be a partial order. -/// This implies a few properties such as: if `a ⊔ b ≠ a`, then there is no `c` such that `(a ⊔ b) ⊔ c = a`, -/// as this would imply a cycle in the partial order. -pub trait Crdt { - /// Merge the two datastructures according to the CRDT rules. - /// `self` is modified to contain the merged CRDT value. `other` is not modified. - /// - /// # Arguments - /// - /// * `other` - the other CRDT we wish to merge with - fn merge(&mut self, other: &Self); -} - -/// All types that implement `Ord` (a total order) can also implement a trivial CRDT -/// defined by the merge rule: `a ⊔ b = max(a, b)`. Implement this trait for your type -/// to enable this behavior. -pub trait AutoCrdt: Ord + Clone + std::fmt::Debug { - /// WARN_IF_DIFFERENT: emit a warning when values differ. Set this to true if - /// different values in your application should never happen. Set this to false - /// if you are actually relying on the semantics of `a ⊔ b = max(a, b)`. 
- const WARN_IF_DIFFERENT: bool; -} - -impl Crdt for T -where - T: AutoCrdt, -{ - fn merge(&mut self, other: &Self) { - if Self::WARN_IF_DIFFERENT && self != other { - warn!( - "Different CRDT values should be the same (logic error!): {:?} vs {:?}", - self, other - ); - if other > self { - *self = other.clone(); - } - warn!("Making an arbitrary choice: {:?}", self); - } else if other > self { - *self = other.clone(); - } - } -} - -impl AutoCrdt for String { - const WARN_IF_DIFFERENT: bool = true; -} - -impl AutoCrdt for bool { - const WARN_IF_DIFFERENT: bool = true; -} - -impl AutoCrdt for FixedBytes32 { - const WARN_IF_DIFFERENT: bool = true; -} diff --git a/src/table/crdt/lww.rs b/src/table/crdt/lww.rs deleted file mode 100644 index be197d88..00000000 --- a/src/table/crdt/lww.rs +++ /dev/null @@ -1,114 +0,0 @@ -use serde::{Deserialize, Serialize}; - -use garage_util::time::now_msec; - -use crate::crdt::crdt::*; - -/// Last Write Win (LWW) -/// -/// An LWW CRDT associates a timestamp with a value, in order to implement a -/// time-based reconciliation rule: the most recent write wins. -/// For completeness, the LWW reconciliation rule must also be defined for two LWW CRDTs -/// with the same timestamp but different values. -/// -/// In our case, we add the constraint that the value that is wrapped inside the LWW CRDT must -/// itself be a CRDT: in the case when the timestamp does not allow us to decide on which value to -/// keep, the merge rule of the inner CRDT is applied on the wrapped values. (Note that all types -/// that implement the `Ord` trait get a default CRDT implemetnation that keeps the maximum value. -/// This enables us to use LWW directly with primitive data types such as numbers or strings. It is -/// generally desirable in this case to never explicitly produce LWW values with the same timestamp -/// but different inner values, as the rule to keep the maximum value isn't generally the desired -/// semantics.) 
-/// -/// As multiple computers clocks are always desynchronized, -/// when operations are close enough, it is equivalent to -/// take one copy and drop the other one. -/// -/// Given that clocks are not too desynchronized, this assumption -/// is enough for most cases, as there is few chance that two humans -/// coordonate themself faster than the time difference between two NTP servers. -/// -/// As a more concret example, let's suppose you want to upload a file -/// with the same key (path) in the same bucket at the very same time. -/// For each request, the file will be timestamped by the receiving server -/// and may differ from what you observed with your atomic clock! -/// -/// This scheme is used by AWS S3 or Soundcloud and often without knowing -/// in enterprise when reconciliating databases with ad-hoc scripts. -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] -pub struct Lww { - ts: u64, - v: T, -} - -impl Lww -where - T: Crdt, -{ - /// Creates a new CRDT - /// - /// CRDT's internal timestamp is set with current node's clock. - pub fn new(value: T) -> Self { - Self { - ts: now_msec(), - v: value, - } - } - - /// Build a new CRDT from a previous non-compatible one - /// - /// Compared to new, the CRDT's timestamp is not set to now - /// but must be set to the previous, non-compatible, CRDT's timestamp. - pub fn migrate_from_raw(ts: u64, value: T) -> Self { - Self { ts, v: value } - } - - /// Update the LWW CRDT while keeping some causal ordering. - /// - /// The timestamp of the LWW CRDT is updated to be the current node's clock - /// at time of update, or the previous timestamp + 1 if that's bigger, - /// so that the new timestamp is always strictly larger than the previous one. - /// This ensures that merging the update with the old value will result in keeping - /// the updated value. 
- pub fn update(&mut self, new_value: T) { - self.ts = std::cmp::max(self.ts + 1, now_msec()); - self.v = new_value; - } - - /// Get the CRDT value - pub fn get(&self) -> &T { - &self.v - } - - /// Get a mutable reference to the CRDT's value - /// - /// This is usefull to mutate the inside value without changing the LWW timestamp. - /// When such mutation is done, the merge between two LWW values is done using the inner - /// CRDT's merge operation. This is usefull in the case where the inner CRDT is a large - /// data type, such as a map, and we only want to change a single item in the map. - /// To do this, we can produce a "CRDT delta", i.e. a LWW that contains only the modification. - /// This delta consists in a LWW with the same timestamp, and the map - /// inside only contains the updated value. - /// The advantage of such a delta is that it is much smaller than the whole map. - /// - /// Avoid using this if the inner data type is a primitive type such as a number or a string, - /// as you will then rely on the merge function defined on `Ord` types by keeping the maximum - /// of both values. - pub fn get_mut(&mut self) -> &mut T { - &mut self.v - } -} - -impl Crdt for Lww -where - T: Clone + Crdt, -{ - fn merge(&mut self, other: &Self) { - if other.ts > self.ts { - self.ts = other.ts; - self.v = other.v.clone(); - } else if other.ts == self.ts { - self.v.merge(&other.v); - } - } -} diff --git a/src/table/crdt/lww_map.rs b/src/table/crdt/lww_map.rs deleted file mode 100644 index fb25fd46..00000000 --- a/src/table/crdt/lww_map.rs +++ /dev/null @@ -1,161 +0,0 @@ -use serde::{Deserialize, Serialize}; - -use garage_util::time::now_msec; - -use crate::crdt::crdt::*; - -/// Last Write Win Map -/// -/// This types defines a CRDT for a map from keys to values. -/// The values have an associated timestamp, such that the last written value -/// takes precedence over previous ones. 
As for the simpler `LWW` type, the value -/// type `V` is also required to implement the CRDT trait. -/// We do not encourage mutating the values associated with a given key -/// without updating the timestamp, in fact at the moment we do not provide a `.get_mut()` -/// method that would allow that. -/// -/// Internally, the map is stored as a vector of keys and values, sorted by ascending key order. -/// This is why the key type `K` must implement `Ord` (and also to ensure a unique serialization, -/// such that two values can be compared for equality based on their hashes). As a consequence, -/// insertions take `O(n)` time. This means that LWWMap should be used for reasonably small maps. -/// However, note that even if we were using a more efficient data structure such as a `BTreeMap`, -/// the serialization cost `O(n)` would still have to be paid at each modification, so we are -/// actually not losing anything here. -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] -pub struct LwwMap { - vals: Vec<(K, u64, V)>, -} - -impl LwwMap -where - K: Ord, - V: Crdt, -{ - /// Create a new empty map CRDT - pub fn new() -> Self { - Self { vals: vec![] } - } - /// Used to migrate from a map defined in an incompatible format. This produces - /// a map that contains a single item with the specified timestamp (copied from - /// the incompatible format). Do this as many times as you have items to migrate, - /// and put them all together using the CRDT merge operator. - pub fn migrate_from_raw_item(k: K, ts: u64, v: V) -> Self { - Self { - vals: vec![(k, ts, v)], - } - } - /// Returns a map that contains a single mapping from the specified key to the specified value. - /// This map is a mutator, or a delta-CRDT, such that when it is merged with the original map, - /// the previous value will be replaced with the one specified here. 
- /// The timestamp in the provided mutator is set to the maximum of the current system's clock - /// and 1 + the previous value's timestamp (if there is one), so that the new value will always - /// take precedence (LWW rule). - /// - /// Typically, to update the value associated to a key in the map, you would do the following: - /// - /// ```ignore - /// let my_update = my_crdt.update_mutator(key_to_modify, new_value); - /// my_crdt.merge(&my_update); - /// ``` - /// - /// However extracting the mutator on its own and only sending that on the network is very - /// interesting as it is much smaller than the whole map. - pub fn update_mutator(&self, k: K, new_v: V) -> Self { - let new_vals = match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(&k)) { - Ok(i) => { - let (_, old_ts, _) = self.vals[i]; - let new_ts = std::cmp::max(old_ts + 1, now_msec()); - vec![(k, new_ts, new_v)] - } - Err(_) => vec![(k, now_msec(), new_v)], - }; - Self { vals: new_vals } - } - /// Takes all of the values of the map and returns them. The current map is reset to the - /// empty map. 
This is very usefull to produce in-place a new map that contains only a delta - /// that modifies a certain value: - /// - /// ```ignore - /// let mut a = get_my_crdt_value(); - /// let old_a = a.take_and_clear(); - /// a.merge(&old_a.update_mutator(key_to_modify, new_value)); - /// put_my_crdt_value(a); - /// ``` - /// - /// Of course in this simple example we could have written simply - /// `pyt_my_crdt_value(a.update_mutator(key_to_modify, new_value))`, - /// but in the case where the map is a field in a struct for instance (as is always the case), - /// this becomes very handy: - /// - /// ```ignore - /// let mut a = get_my_crdt_value(); - /// let old_a_map = a.map_field.take_and_clear(); - /// a.map_field.merge(&old_a_map.update_mutator(key_to_modify, new_value)); - /// put_my_crdt_value(a); - /// ``` - pub fn take_and_clear(&mut self) -> Self { - let vals = std::mem::take(&mut self.vals); - Self { vals } - } - /// Removes all values from the map - pub fn clear(&mut self) { - self.vals.clear(); - } - /// Get a reference to the value assigned to a key - pub fn get(&self, k: &K) -> Option<&V> { - match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(k)) { - Ok(i) => Some(&self.vals[i].2), - Err(_) => None, - } - } - /// Gets a reference to all of the items, as a slice. Usefull to iterate on all map values. - /// In most case you will want to ignore the timestamp (second item of the tuple). - pub fn items(&self) -> &[(K, u64, V)] { - &self.vals[..] 
- } - - /// Returns the number of items in the map - pub fn len(&self) -> usize { - self.vals.len() - } - - /// Returns true if the map is empty - pub fn is_empty(&self) -> bool { - self.len() == 0 - } -} - -impl Crdt for LwwMap -where - K: Clone + Ord, - V: Clone + Crdt, -{ - fn merge(&mut self, other: &Self) { - for (k, ts2, v2) in other.vals.iter() { - match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(k)) { - Ok(i) => { - let (_, ts1, _v1) = &self.vals[i]; - if ts2 > ts1 { - self.vals[i].1 = *ts2; - self.vals[i].2 = v2.clone(); - } else if ts1 == ts2 { - self.vals[i].2.merge(v2); - } - } - Err(i) => { - self.vals.insert(i, (k.clone(), *ts2, v2.clone())); - } - } - } - } -} - -impl Default for LwwMap -where - K: Ord, - V: Crdt, -{ - fn default() -> Self { - Self::new() - } -} diff --git a/src/table/crdt/map.rs b/src/table/crdt/map.rs deleted file mode 100644 index 7553cd50..00000000 --- a/src/table/crdt/map.rs +++ /dev/null @@ -1,99 +0,0 @@ -use serde::{Deserialize, Serialize}; - -use crate::crdt::crdt::*; - -/// Simple CRDT Map -/// -/// This types defines a CRDT for a map from keys to values. Values are CRDT types which -/// can have their own updating logic. -/// -/// Internally, the map is stored as a vector of keys and values, sorted by ascending key order. -/// This is why the key type `K` must implement `Ord` (and also to ensure a unique serialization, -/// such that two values can be compared for equality based on their hashes). As a consequence, -/// insertions take `O(n)` time. This means that Map should be used for reasonably small maps. -/// However, note that even if we were using a more efficient data structure such as a `BTreeMap`, -/// the serialization cost `O(n)` would still have to be paid at each modification, so we are -/// actually not losing anything here. 
-#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] -pub struct Map { - vals: Vec<(K, V)>, -} - -impl Map -where - K: Clone + Ord, - V: Clone + Crdt, -{ - /// Create a new empty map CRDT - pub fn new() -> Self { - Self { vals: vec![] } - } - - /// Returns a map that contains a single mapping from the specified key to the specified value. - /// This can be used to build a delta-mutator: - /// when merged with another map, the value will be added or CRDT-merged if a previous - /// value already exists. - pub fn put_mutator(k: K, v: V) -> Self { - Self { vals: vec![(k, v)] } - } - - /// Add a value to the map - pub fn put(&mut self, k: K, v: V) { - self.merge(&Self::put_mutator(k, v)); - } - - /// Removes all values from the map - pub fn clear(&mut self) { - self.vals.clear(); - } - - /// Get a reference to the value assigned to a key - pub fn get(&self, k: &K) -> Option<&V> { - match self.vals.binary_search_by(|(k2, _)| k2.cmp(k)) { - Ok(i) => Some(&self.vals[i].1), - Err(_) => None, - } - } - /// Gets a reference to all of the items, as a slice. Usefull to iterate on all map values. - pub fn items(&self) -> &[(K, V)] { - &self.vals[..] - } - /// Returns the number of items in the map - pub fn len(&self) -> usize { - self.vals.len() - } - - /// Returns true if the map is empty - pub fn is_empty(&self) -> bool { - self.len() == 0 - } -} - -impl Crdt for Map -where - K: Clone + Ord, - V: Clone + Crdt, -{ - fn merge(&mut self, other: &Self) { - for (k, v2) in other.vals.iter() { - match self.vals.binary_search_by(|(k2, _)| k2.cmp(k)) { - Ok(i) => { - self.vals[i].1.merge(v2); - } - Err(i) => { - self.vals.insert(i, (k.clone(), v2.clone())); - } - } - } - } -} - -impl Default for Map -where - K: Clone + Ord, - V: Clone + Crdt, -{ - fn default() -> Self { - Self::new() - } -} diff --git a/src/table/crdt/mod.rs b/src/table/crdt/mod.rs deleted file mode 100644 index 9663a5a5..00000000 --- a/src/table/crdt/mod.rs +++ /dev/null @@ -1,23 +0,0 @@ -//! 
This package provides a simple implementation of conflict-free replicated data types (CRDTs) -//! -//! CRDTs are a type of data structures that do not require coordination. In other words, we can -//! edit them in parallel, we will always find a way to merge it. -//! -//! A general example is a counter. Its initial value is 0. Alice and Bob get a copy of the -//! counter. Alice does +1 on her copy, she reads 1. Bob does +3 on his copy, he reads 3. Now, -//! it is easy to merge their counters, order does not count: we always get 4. -//! -//! Learn more about CRDT [on Wikipedia](https://en.wikipedia.org/wiki/Conflict-free_replicated_data_type) - -mod bool; -#[allow(clippy::module_inception)] -mod crdt; -mod lww; -mod lww_map; -mod map; - -pub use self::bool::*; -pub use crdt::*; -pub use lww::*; -pub use lww_map::*; -pub use map::*; diff --git a/src/table/lib.rs b/src/table/lib.rs index 53d2c93b..d6c19f1b 100644 --- a/src/table/lib.rs +++ b/src/table/lib.rs @@ -4,7 +4,6 @@ #[macro_use] extern crate log; -pub mod crdt; pub mod schema; pub mod util; @@ -18,3 +17,7 @@ pub mod table; pub use schema::*; pub use table::*; pub use util::*; + +pub mod crdt { + pub use garage_util::crdt::*; +} diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index 8f01fbdd..18682ace 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -28,10 +28,10 @@ impl TableReplication for TableFullReplication { fn write_nodes(&self, _hash: &Hash) -> Vec { let ring = self.system.ring.borrow(); - ring.config.members.keys().cloned().collect::>() + ring.layout.node_ids().to_vec() } fn write_quorum(&self) -> usize { - let nmembers = self.system.ring.borrow().config.members.len(); + let nmembers = self.system.ring.borrow().layout.node_ids().len(); if nmembers > self.max_faults { nmembers - self.max_faults } else { -- cgit v1.2.3