aboutsummaryrefslogtreecommitdiff
path: root/src/table
diff options
context:
space:
mode:
author: Alex Auvolat <alex@adnab.me> 2021-11-09 12:24:04 +0100
committer: Alex Auvolat <alex@adnab.me> 2021-11-16 16:05:53 +0100
commit: c94406f4282d48e2e2ac82ffb57eafaad23f7edc (patch)
tree: 01fe1b272e18fdae993e2207d8d3aea4a301ec56 /src/table
parent: 53888995bdd7c672d2e3ab8bb6a3529195c127a9 (diff)
download: garage-0.5-beta1.tar.gz
download: garage-0.5-beta1.zip
Improve how node roles are assigned in Garage (tag: v0.5-beta1)
- Change the terminology: the network configuration becomes the role table, and the configuration of a node becomes a node's role.
- The modification of the role table takes place in two steps: first, changes are staged in a CRDT data structure. Then, once the user is happy with the changes, they can commit them all at once (or revert them).
- Update documentation.
- Fix tests.
- Implement a smarter partition assignment algorithm.

This patch breaks the format of the network configuration: when migrating, the cluster will be in a state where no roles are assigned. All roles must be re-assigned and committed at once. This migration should not pose an issue.
Diffstat (limited to 'src/table')
-rw-r--r-- src/table/Cargo.toml 7
-rw-r--r-- src/table/crdt/bool.rs 34
-rw-r--r-- src/table/crdt/crdt.rs 71
-rw-r--r-- src/table/crdt/lww.rs 114
-rw-r--r-- src/table/crdt/lww_map.rs 161
-rw-r--r-- src/table/crdt/map.rs 99
-rw-r--r-- src/table/crdt/mod.rs 23
-rw-r--r-- src/table/lib.rs 5
-rw-r--r-- src/table/replication/fullcopy.rs 4
9 files changed, 10 insertions, 508 deletions
diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml
index 616bf275..dc37f12c 100644
--- a/src/table/Cargo.toml
+++ b/src/table/Cargo.toml
@@ -1,11 +1,12 @@
[package]
name = "garage_table"
-version = "0.4.0"
+version = "0.5.0"
authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018"
license = "AGPL-3.0"
description = "Table sharding and replication engine (DynamoDB-like) for the Garage object store"
repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
+readme = "../../README.md"
[lib]
path = "lib.rs"
@@ -13,8 +14,8 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
-garage_rpc = { version = "0.4.0", path = "../rpc" }
-garage_util = { version = "0.4.0", path = "../util" }
+garage_rpc = { version = "0.5.0", path = "../rpc" }
+garage_util = { version = "0.5.0", path = "../util" }
async-trait = "0.1.7"
bytes = "1.0"
diff --git a/src/table/crdt/bool.rs b/src/table/crdt/bool.rs
deleted file mode 100644
index 53af8f82..00000000
--- a/src/table/crdt/bool.rs
+++ /dev/null
@@ -1,34 +0,0 @@
-use serde::{Deserialize, Serialize};
-
-use crate::crdt::crdt::*;
-
-/// Boolean, where `true` is an absorbing state
-#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq)]
-pub struct Bool(bool);
-
-impl Bool {
- /// Create a new boolean with the specified value
- pub fn new(b: bool) -> Self {
- Self(b)
- }
- /// Set the boolean to true
- pub fn set(&mut self) {
- self.0 = true;
- }
- /// Get the boolean value
- pub fn get(&self) -> bool {
- self.0
- }
-}
-
-impl From<bool> for Bool {
- fn from(b: bool) -> Bool {
- Bool::new(b)
- }
-}
-
-impl Crdt for Bool {
- fn merge(&mut self, other: &Self) {
- self.0 = self.0 || other.0;
- }
-}
diff --git a/src/table/crdt/crdt.rs b/src/table/crdt/crdt.rs
deleted file mode 100644
index a8f1b9aa..00000000
--- a/src/table/crdt/crdt.rs
+++ /dev/null
@@ -1,71 +0,0 @@
-use garage_util::data::*;
-
-/// Definition of a CRDT - all CRDT Rust types implement this.
-///
-/// A CRDT is defined as a merge operator that respects a certain set of axioms.
-///
-/// In particular, the merge operator must be commutative, associative,
-/// idempotent, and monotonic.
-/// In other words, if `a`, `b` and `c` are CRDTs, and `⊔` denotes the merge operator,
-/// the following axioms must apply:
-///
-/// ```text
-/// a ⊔ b = b ⊔ a (commutativity)
-/// (a ⊔ b) ⊔ c = a ⊔ (b ⊔ c) (associativity)
-/// (a ⊔ b) ⊔ b = a ⊔ b (idempotence)
-/// ```
-///
-/// Moreover, the relationship `≥` defined by `a ≥ b ⇔ ∃c. a = b ⊔ c` must be a partial order.
-/// This implies a few properties such as: if `a ⊔ b ≠ a`, then there is no `c` such that `(a ⊔ b) ⊔ c = a`,
-/// as this would imply a cycle in the partial order.
-pub trait Crdt {
- /// Merge the two datastructures according to the CRDT rules.
- /// `self` is modified to contain the merged CRDT value. `other` is not modified.
- ///
- /// # Arguments
- ///
- /// * `other` - the other CRDT we wish to merge with
- fn merge(&mut self, other: &Self);
-}
-
-/// All types that implement `Ord` (a total order) can also implement a trivial CRDT
-/// defined by the merge rule: `a ⊔ b = max(a, b)`. Implement this trait for your type
-/// to enable this behavior.
-pub trait AutoCrdt: Ord + Clone + std::fmt::Debug {
- /// WARN_IF_DIFFERENT: emit a warning when values differ. Set this to true if
- /// different values in your application should never happen. Set this to false
- /// if you are actually relying on the semantics of `a ⊔ b = max(a, b)`.
- const WARN_IF_DIFFERENT: bool;
-}
-
-impl<T> Crdt for T
-where
- T: AutoCrdt,
-{
- fn merge(&mut self, other: &Self) {
- if Self::WARN_IF_DIFFERENT && self != other {
- warn!(
- "Different CRDT values should be the same (logic error!): {:?} vs {:?}",
- self, other
- );
- if other > self {
- *self = other.clone();
- }
- warn!("Making an arbitrary choice: {:?}", self);
- } else if other > self {
- *self = other.clone();
- }
- }
-}
-
-impl AutoCrdt for String {
- const WARN_IF_DIFFERENT: bool = true;
-}
-
-impl AutoCrdt for bool {
- const WARN_IF_DIFFERENT: bool = true;
-}
-
-impl AutoCrdt for FixedBytes32 {
- const WARN_IF_DIFFERENT: bool = true;
-}
diff --git a/src/table/crdt/lww.rs b/src/table/crdt/lww.rs
deleted file mode 100644
index be197d88..00000000
--- a/src/table/crdt/lww.rs
+++ /dev/null
@@ -1,114 +0,0 @@
-use serde::{Deserialize, Serialize};
-
-use garage_util::time::now_msec;
-
-use crate::crdt::crdt::*;
-
-/// Last Write Win (LWW)
-///
-/// An LWW CRDT associates a timestamp with a value, in order to implement a
-/// time-based reconciliation rule: the most recent write wins.
-/// For completeness, the LWW reconciliation rule must also be defined for two LWW CRDTs
-/// with the same timestamp but different values.
-///
-/// In our case, we add the constraint that the value that is wrapped inside the LWW CRDT must
-/// itself be a CRDT: in the case when the timestamp does not allow us to decide on which value to
-/// keep, the merge rule of the inner CRDT is applied on the wrapped values. (Note that all types
-/// that implement the `Ord` trait get a default CRDT implemetnation that keeps the maximum value.
-/// This enables us to use LWW directly with primitive data types such as numbers or strings. It is
-/// generally desirable in this case to never explicitly produce LWW values with the same timestamp
-/// but different inner values, as the rule to keep the maximum value isn't generally the desired
-/// semantics.)
-///
-/// As multiple computers clocks are always desynchronized,
-/// when operations are close enough, it is equivalent to
-/// take one copy and drop the other one.
-///
-/// Given that clocks are not too desynchronized, this assumption
-/// is enough for most cases, as there is few chance that two humans
-/// coordonate themself faster than the time difference between two NTP servers.
-///
-/// As a more concret example, let's suppose you want to upload a file
-/// with the same key (path) in the same bucket at the very same time.
-/// For each request, the file will be timestamped by the receiving server
-/// and may differ from what you observed with your atomic clock!
-///
-/// This scheme is used by AWS S3 or Soundcloud and often without knowing
-/// in enterprise when reconciliating databases with ad-hoc scripts.
-#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
-pub struct Lww<T> {
- ts: u64,
- v: T,
-}
-
-impl<T> Lww<T>
-where
- T: Crdt,
-{
- /// Creates a new CRDT
- ///
- /// CRDT's internal timestamp is set with current node's clock.
- pub fn new(value: T) -> Self {
- Self {
- ts: now_msec(),
- v: value,
- }
- }
-
- /// Build a new CRDT from a previous non-compatible one
- ///
- /// Compared to new, the CRDT's timestamp is not set to now
- /// but must be set to the previous, non-compatible, CRDT's timestamp.
- pub fn migrate_from_raw(ts: u64, value: T) -> Self {
- Self { ts, v: value }
- }
-
- /// Update the LWW CRDT while keeping some causal ordering.
- ///
- /// The timestamp of the LWW CRDT is updated to be the current node's clock
- /// at time of update, or the previous timestamp + 1 if that's bigger,
- /// so that the new timestamp is always strictly larger than the previous one.
- /// This ensures that merging the update with the old value will result in keeping
- /// the updated value.
- pub fn update(&mut self, new_value: T) {
- self.ts = std::cmp::max(self.ts + 1, now_msec());
- self.v = new_value;
- }
-
- /// Get the CRDT value
- pub fn get(&self) -> &T {
- &self.v
- }
-
- /// Get a mutable reference to the CRDT's value
- ///
- /// This is usefull to mutate the inside value without changing the LWW timestamp.
- /// When such mutation is done, the merge between two LWW values is done using the inner
- /// CRDT's merge operation. This is usefull in the case where the inner CRDT is a large
- /// data type, such as a map, and we only want to change a single item in the map.
- /// To do this, we can produce a "CRDT delta", i.e. a LWW that contains only the modification.
- /// This delta consists in a LWW with the same timestamp, and the map
- /// inside only contains the updated value.
- /// The advantage of such a delta is that it is much smaller than the whole map.
- ///
- /// Avoid using this if the inner data type is a primitive type such as a number or a string,
- /// as you will then rely on the merge function defined on `Ord` types by keeping the maximum
- /// of both values.
- pub fn get_mut(&mut self) -> &mut T {
- &mut self.v
- }
-}
-
-impl<T> Crdt for Lww<T>
-where
- T: Clone + Crdt,
-{
- fn merge(&mut self, other: &Self) {
- if other.ts > self.ts {
- self.ts = other.ts;
- self.v = other.v.clone();
- } else if other.ts == self.ts {
- self.v.merge(&other.v);
- }
- }
-}
diff --git a/src/table/crdt/lww_map.rs b/src/table/crdt/lww_map.rs
deleted file mode 100644
index fb25fd46..00000000
--- a/src/table/crdt/lww_map.rs
+++ /dev/null
@@ -1,161 +0,0 @@
-use serde::{Deserialize, Serialize};
-
-use garage_util::time::now_msec;
-
-use crate::crdt::crdt::*;
-
-/// Last Write Win Map
-///
-/// This types defines a CRDT for a map from keys to values.
-/// The values have an associated timestamp, such that the last written value
-/// takes precedence over previous ones. As for the simpler `LWW` type, the value
-/// type `V` is also required to implement the CRDT trait.
-/// We do not encourage mutating the values associated with a given key
-/// without updating the timestamp, in fact at the moment we do not provide a `.get_mut()`
-/// method that would allow that.
-///
-/// Internally, the map is stored as a vector of keys and values, sorted by ascending key order.
-/// This is why the key type `K` must implement `Ord` (and also to ensure a unique serialization,
-/// such that two values can be compared for equality based on their hashes). As a consequence,
-/// insertions take `O(n)` time. This means that LWWMap should be used for reasonably small maps.
-/// However, note that even if we were using a more efficient data structure such as a `BTreeMap`,
-/// the serialization cost `O(n)` would still have to be paid at each modification, so we are
-/// actually not losing anything here.
-#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
-pub struct LwwMap<K, V> {
- vals: Vec<(K, u64, V)>,
-}
-
-impl<K, V> LwwMap<K, V>
-where
- K: Ord,
- V: Crdt,
-{
- /// Create a new empty map CRDT
- pub fn new() -> Self {
- Self { vals: vec![] }
- }
- /// Used to migrate from a map defined in an incompatible format. This produces
- /// a map that contains a single item with the specified timestamp (copied from
- /// the incompatible format). Do this as many times as you have items to migrate,
- /// and put them all together using the CRDT merge operator.
- pub fn migrate_from_raw_item(k: K, ts: u64, v: V) -> Self {
- Self {
- vals: vec![(k, ts, v)],
- }
- }
- /// Returns a map that contains a single mapping from the specified key to the specified value.
- /// This map is a mutator, or a delta-CRDT, such that when it is merged with the original map,
- /// the previous value will be replaced with the one specified here.
- /// The timestamp in the provided mutator is set to the maximum of the current system's clock
- /// and 1 + the previous value's timestamp (if there is one), so that the new value will always
- /// take precedence (LWW rule).
- ///
- /// Typically, to update the value associated to a key in the map, you would do the following:
- ///
- /// ```ignore
- /// let my_update = my_crdt.update_mutator(key_to_modify, new_value);
- /// my_crdt.merge(&my_update);
- /// ```
- ///
- /// However extracting the mutator on its own and only sending that on the network is very
- /// interesting as it is much smaller than the whole map.
- pub fn update_mutator(&self, k: K, new_v: V) -> Self {
- let new_vals = match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(&k)) {
- Ok(i) => {
- let (_, old_ts, _) = self.vals[i];
- let new_ts = std::cmp::max(old_ts + 1, now_msec());
- vec![(k, new_ts, new_v)]
- }
- Err(_) => vec![(k, now_msec(), new_v)],
- };
- Self { vals: new_vals }
- }
- /// Takes all of the values of the map and returns them. The current map is reset to the
- /// empty map. This is very usefull to produce in-place a new map that contains only a delta
- /// that modifies a certain value:
- ///
- /// ```ignore
- /// let mut a = get_my_crdt_value();
- /// let old_a = a.take_and_clear();
- /// a.merge(&old_a.update_mutator(key_to_modify, new_value));
- /// put_my_crdt_value(a);
- /// ```
- ///
- /// Of course in this simple example we could have written simply
- /// `pyt_my_crdt_value(a.update_mutator(key_to_modify, new_value))`,
- /// but in the case where the map is a field in a struct for instance (as is always the case),
- /// this becomes very handy:
- ///
- /// ```ignore
- /// let mut a = get_my_crdt_value();
- /// let old_a_map = a.map_field.take_and_clear();
- /// a.map_field.merge(&old_a_map.update_mutator(key_to_modify, new_value));
- /// put_my_crdt_value(a);
- /// ```
- pub fn take_and_clear(&mut self) -> Self {
- let vals = std::mem::take(&mut self.vals);
- Self { vals }
- }
- /// Removes all values from the map
- pub fn clear(&mut self) {
- self.vals.clear();
- }
- /// Get a reference to the value assigned to a key
- pub fn get(&self, k: &K) -> Option<&V> {
- match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(k)) {
- Ok(i) => Some(&self.vals[i].2),
- Err(_) => None,
- }
- }
- /// Gets a reference to all of the items, as a slice. Usefull to iterate on all map values.
- /// In most case you will want to ignore the timestamp (second item of the tuple).
- pub fn items(&self) -> &[(K, u64, V)] {
- &self.vals[..]
- }
-
- /// Returns the number of items in the map
- pub fn len(&self) -> usize {
- self.vals.len()
- }
-
- /// Returns true if the map is empty
- pub fn is_empty(&self) -> bool {
- self.len() == 0
- }
-}
-
-impl<K, V> Crdt for LwwMap<K, V>
-where
- K: Clone + Ord,
- V: Clone + Crdt,
-{
- fn merge(&mut self, other: &Self) {
- for (k, ts2, v2) in other.vals.iter() {
- match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(k)) {
- Ok(i) => {
- let (_, ts1, _v1) = &self.vals[i];
- if ts2 > ts1 {
- self.vals[i].1 = *ts2;
- self.vals[i].2 = v2.clone();
- } else if ts1 == ts2 {
- self.vals[i].2.merge(v2);
- }
- }
- Err(i) => {
- self.vals.insert(i, (k.clone(), *ts2, v2.clone()));
- }
- }
- }
- }
-}
-
-impl<K, V> Default for LwwMap<K, V>
-where
- K: Ord,
- V: Crdt,
-{
- fn default() -> Self {
- Self::new()
- }
-}
diff --git a/src/table/crdt/map.rs b/src/table/crdt/map.rs
deleted file mode 100644
index 7553cd50..00000000
--- a/src/table/crdt/map.rs
+++ /dev/null
@@ -1,99 +0,0 @@
-use serde::{Deserialize, Serialize};
-
-use crate::crdt::crdt::*;
-
-/// Simple CRDT Map
-///
-/// This types defines a CRDT for a map from keys to values. Values are CRDT types which
-/// can have their own updating logic.
-///
-/// Internally, the map is stored as a vector of keys and values, sorted by ascending key order.
-/// This is why the key type `K` must implement `Ord` (and also to ensure a unique serialization,
-/// such that two values can be compared for equality based on their hashes). As a consequence,
-/// insertions take `O(n)` time. This means that Map should be used for reasonably small maps.
-/// However, note that even if we were using a more efficient data structure such as a `BTreeMap`,
-/// the serialization cost `O(n)` would still have to be paid at each modification, so we are
-/// actually not losing anything here.
-#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
-pub struct Map<K, V> {
- vals: Vec<(K, V)>,
-}
-
-impl<K, V> Map<K, V>
-where
- K: Clone + Ord,
- V: Clone + Crdt,
-{
- /// Create a new empty map CRDT
- pub fn new() -> Self {
- Self { vals: vec![] }
- }
-
- /// Returns a map that contains a single mapping from the specified key to the specified value.
- /// This can be used to build a delta-mutator:
- /// when merged with another map, the value will be added or CRDT-merged if a previous
- /// value already exists.
- pub fn put_mutator(k: K, v: V) -> Self {
- Self { vals: vec![(k, v)] }
- }
-
- /// Add a value to the map
- pub fn put(&mut self, k: K, v: V) {
- self.merge(&Self::put_mutator(k, v));
- }
-
- /// Removes all values from the map
- pub fn clear(&mut self) {
- self.vals.clear();
- }
-
- /// Get a reference to the value assigned to a key
- pub fn get(&self, k: &K) -> Option<&V> {
- match self.vals.binary_search_by(|(k2, _)| k2.cmp(k)) {
- Ok(i) => Some(&self.vals[i].1),
- Err(_) => None,
- }
- }
- /// Gets a reference to all of the items, as a slice. Usefull to iterate on all map values.
- pub fn items(&self) -> &[(K, V)] {
- &self.vals[..]
- }
- /// Returns the number of items in the map
- pub fn len(&self) -> usize {
- self.vals.len()
- }
-
- /// Returns true if the map is empty
- pub fn is_empty(&self) -> bool {
- self.len() == 0
- }
-}
-
-impl<K, V> Crdt for Map<K, V>
-where
- K: Clone + Ord,
- V: Clone + Crdt,
-{
- fn merge(&mut self, other: &Self) {
- for (k, v2) in other.vals.iter() {
- match self.vals.binary_search_by(|(k2, _)| k2.cmp(k)) {
- Ok(i) => {
- self.vals[i].1.merge(v2);
- }
- Err(i) => {
- self.vals.insert(i, (k.clone(), v2.clone()));
- }
- }
- }
- }
-}
-
-impl<K, V> Default for Map<K, V>
-where
- K: Clone + Ord,
- V: Clone + Crdt,
-{
- fn default() -> Self {
- Self::new()
- }
-}
diff --git a/src/table/crdt/mod.rs b/src/table/crdt/mod.rs
deleted file mode 100644
index 9663a5a5..00000000
--- a/src/table/crdt/mod.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-//! This package provides a simple implementation of conflict-free replicated data types (CRDTs)
-//!
-//! CRDTs are a type of data structures that do not require coordination. In other words, we can
-//! edit them in parallel, we will always find a way to merge it.
-//!
-//! A general example is a counter. Its initial value is 0. Alice and Bob get a copy of the
-//! counter. Alice does +1 on her copy, she reads 1. Bob does +3 on his copy, he reads 3. Now,
-//! it is easy to merge their counters, order does not count: we always get 4.
-//!
-//! Learn more about CRDT [on Wikipedia](https://en.wikipedia.org/wiki/Conflict-free_replicated_data_type)
-
-mod bool;
-#[allow(clippy::module_inception)]
-mod crdt;
-mod lww;
-mod lww_map;
-mod map;
-
-pub use self::bool::*;
-pub use crdt::*;
-pub use lww::*;
-pub use lww_map::*;
-pub use map::*;
diff --git a/src/table/lib.rs b/src/table/lib.rs
index 53d2c93b..d6c19f1b 100644
--- a/src/table/lib.rs
+++ b/src/table/lib.rs
@@ -4,7 +4,6 @@
#[macro_use]
extern crate log;
-pub mod crdt;
pub mod schema;
pub mod util;
@@ -18,3 +17,7 @@ pub mod table;
pub use schema::*;
pub use table::*;
pub use util::*;
+
+pub mod crdt {
+ pub use garage_util::crdt::*;
+}
diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs
index 8f01fbdd..18682ace 100644
--- a/src/table/replication/fullcopy.rs
+++ b/src/table/replication/fullcopy.rs
@@ -28,10 +28,10 @@ impl TableReplication for TableFullReplication {
fn write_nodes(&self, _hash: &Hash) -> Vec<Uuid> {
let ring = self.system.ring.borrow();
- ring.config.members.keys().cloned().collect::<Vec<_>>()
+ ring.layout.node_ids().to_vec()
}
fn write_quorum(&self) -> usize {
- let nmembers = self.system.ring.borrow().config.members.len();
+ let nmembers = self.system.ring.borrow().layout.node_ids().len();
if nmembers > self.max_faults {
nmembers - self.max_faults
} else {