Diffstat (limited to 'src/model')
-rw-r--r-- | src/model/Cargo.toml | 21
-rw-r--r-- | src/model/bucket_alias_table.rs | 24
-rw-r--r-- | src/model/bucket_table.rs | 133
-rw-r--r-- | src/model/garage.rs | 47
-rw-r--r-- | src/model/index_counter.rs | 232
-rw-r--r-- | src/model/k2v/causality.rs | 65
-rw-r--r-- | src/model/k2v/item_table.rs | 69
-rw-r--r-- | src/model/k2v/mod.rs | 4
-rw-r--r-- | src/model/k2v/poll.rs | 50
-rw-r--r-- | src/model/k2v/rpc.rs | 278
-rw-r--r-- | src/model/k2v/seen.rs | 105
-rw-r--r-- | src/model/k2v/sub.rs | 110
-rw-r--r-- | src/model/key_table.rs | 154
-rw-r--r-- | src/model/migrate.rs | 5
-rw-r--r-- | src/model/prev/v051/bucket_table.rs | 2
-rw-r--r-- | src/model/prev/v051/key_table.rs | 50
-rw-r--r-- | src/model/prev/v051/mod.rs | 3
-rw-r--r-- | src/model/prev/v051/object_table.rs | 149
-rw-r--r-- | src/model/prev/v051/version_table.rs | 79
-rw-r--r-- | src/model/s3/block_ref_table.rs | 29
-rw-r--r-- | src/model/s3/object_table.rs | 296
-rw-r--r-- | src/model/s3/version_table.rs | 219
22 files changed, 1138 insertions, 986 deletions
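
The hunks below repeatedly apply one refactoring pattern: each table's serialized types move into a version module (mod v08, with a v05 module in key_table.rs for the legacy format) and are tagged with garage_util::migrate::InitialFormat, so that later format changes can be introduced as new modules implementing garage_util::migrate::Migrate. A minimal sketch of the pattern, where Thing is a hypothetical stand-in for a real table entry:

mod v08 {
    use serde::{Deserialize, Serialize};

    /// Hypothetical stand-in for a real table entry.
    #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
    pub struct Thing {
        pub name: String,
    }

    // Mark this as the first on-disk format for Thing; a future format
    // would implement garage_util::migrate::Migrate instead.
    impl garage_util::migrate::InitialFormat for Thing {}
}

// Callers always see the latest format under the original name.
pub use v08::*;
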
diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 08baf81f..d1c7cd29 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_model" -version = "0.8.0" +version = "0.8.1" authors = ["Alex Auvolat <alex@adnab.me>"] edition = "2018" license = "AGPL-3.0" @@ -14,23 +14,22 @@ path = "lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -garage_db = { version = "0.8.0", default-features = false, path = "../db" } -garage_rpc = { version = "0.8.0", path = "../rpc" } -garage_table = { version = "0.8.0", path = "../table" } -garage_block = { version = "0.8.0", path = "../block" } -garage_util = { version = "0.8.0", path = "../util" } +garage_db = { version = "0.8.1", default-features = false, path = "../db" } +garage_rpc = { version = "0.8.1", path = "../rpc" } +garage_table = { version = "0.8.1", path = "../table" } +garage_block = { version = "0.8.1", path = "../block" } +garage_util = { version = "0.8.1", path = "../util" } async-trait = "0.1.7" arc-swap = "1.0" -blake2 = "0.9" +blake2 = "0.10" err-derive = "0.3" hex = "0.4" -base64 = "0.13" -tracing = "0.1.30" +base64 = "0.21" +tracing = "0.1" rand = "0.8" -zstd = { version = "0.9", default-features = false } +zstd = { version = "0.12", default-features = false } -rmp-serde = "0.15" serde = { version = "1.0", default-features = false, features = ["derive", "rc"] } serde_bytes = "0.11" diff --git a/src/model/bucket_alias_table.rs b/src/model/bucket_alias_table.rs index fcd1536e..54d7fbad 100644 --- a/src/model/bucket_alias_table.rs +++ b/src/model/bucket_alias_table.rs @@ -1,18 +1,26 @@ -use serde::{Deserialize, Serialize}; - use garage_util::data::*; use garage_table::crdt::*; use garage_table::*; -/// The bucket alias table holds the names given to buckets -/// in the global namespace. -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct BucketAlias { - name: String, - pub state: crdt::Lww<Option<Uuid>>, +mod v08 { + use garage_util::crdt; + use garage_util::data::Uuid; + use serde::{Deserialize, Serialize}; + + /// The bucket alias table holds the names given to buckets + /// in the global namespace. + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct BucketAlias { + pub(super) name: String, + pub state: crdt::Lww<Option<Uuid>>, + } + + impl garage_util::migrate::InitialFormat for BucketAlias {} } +pub use v08::*; + impl BucketAlias { pub fn new(name: String, ts: u64, bucket_id: Option<Uuid>) -> Option<Self> { if !is_valid_bucket_name(&name) { diff --git a/src/model/bucket_table.rs b/src/model/bucket_table.rs index 7be42702..ac163736 100644 --- a/src/model/bucket_table.rs +++ b/src/model/bucket_table.rs @@ -1,5 +1,3 @@ -use serde::{Deserialize, Serialize}; - use garage_table::crdt::*; use garage_table::*; use garage_util::data::*; @@ -7,72 +5,83 @@ use garage_util::time::*; use crate::permission::BucketKeyPerm; -/// A bucket is a collection of objects -/// -/// Its parameters are not directly accessible as: -/// - It must be possible to merge paramaters, hence the use of a LWW CRDT. -/// - A bucket has 2 states, Present or Deleted and parameters make sense only if present. 
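// As a reference for the LWW merge rule invoked in the doc comment above --
// a minimal hedged sketch, using only crdt API calls that appear elsewhere
// in this commit (Lww::raw, get, merge):
//
//     use garage_util::crdt::{self, Crdt};
//
//     let mut a = crdt::Lww::raw(1, "old".to_string());
//     let b = crdt::Lww::raw(2, "new".to_string());
//     a.merge(&b); // the write with the larger timestamp wins
//     assert_eq!(a.get().as_str(), "new");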
-#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct Bucket { - /// ID of the bucket - pub id: Uuid, - /// State, and configuration if not deleted, of the bucket - pub state: crdt::Deletable<BucketParams>, -} +mod v08 { + use crate::permission::BucketKeyPerm; + use garage_util::crdt; + use garage_util::data::Uuid; + use serde::{Deserialize, Serialize}; + + /// A bucket is a collection of objects + /// + /// Its parameters are not directly accessible as: + /// - It must be possible to merge paramaters, hence the use of a LWW CRDT. + /// - A bucket has 2 states, Present or Deleted and parameters make sense only if present. + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct Bucket { + /// ID of the bucket + pub id: Uuid, + /// State, and configuration if not deleted, of the bucket + pub state: crdt::Deletable<BucketParams>, + } -/// Configuration for a bucket -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct BucketParams { - /// Bucket's creation date - pub creation_date: u64, - /// Map of key with access to the bucket, and what kind of access they give - pub authorized_keys: crdt::Map<String, BucketKeyPerm>, - - /// Map of aliases that are or have been given to this bucket - /// in the global namespace - /// (not authoritative: this is just used as an indication to - /// map back to aliases when doing ListBuckets) - pub aliases: crdt::LwwMap<String, bool>, - /// Map of aliases that are or have been given to this bucket - /// in namespaces local to keys - /// key = (access key id, alias name) - pub local_aliases: crdt::LwwMap<(String, String), bool>, - - /// Whether this bucket is allowed for website access - /// (under all of its global alias names), - /// and if so, the website configuration XML document - pub website_config: crdt::Lww<Option<WebsiteConfig>>, - /// CORS rules - pub cors_config: crdt::Lww<Option<Vec<CorsRule>>>, - /// Bucket quotas - #[serde(default)] - pub quotas: crdt::Lww<BucketQuotas>, -} + /// Configuration for a bucket + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct BucketParams { + /// Bucket's creation date + pub creation_date: u64, + /// Map of key with access to the bucket, and what kind of access they give + pub authorized_keys: crdt::Map<String, BucketKeyPerm>, + + /// Map of aliases that are or have been given to this bucket + /// in the global namespace + /// (not authoritative: this is just used as an indication to + /// map back to aliases when doing ListBuckets) + pub aliases: crdt::LwwMap<String, bool>, + /// Map of aliases that are or have been given to this bucket + /// in namespaces local to keys + /// key = (access key id, alias name) + pub local_aliases: crdt::LwwMap<(String, String), bool>, + + /// Whether this bucket is allowed for website access + /// (under all of its global alias names), + /// and if so, the website configuration XML document + pub website_config: crdt::Lww<Option<WebsiteConfig>>, + /// CORS rules + pub cors_config: crdt::Lww<Option<Vec<CorsRule>>>, + /// Bucket quotas + #[serde(default)] + pub quotas: crdt::Lww<BucketQuotas>, + } -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct WebsiteConfig { - pub index_document: String, - pub error_document: Option<String>, -} + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct WebsiteConfig { + pub index_document: String, + pub error_document: Option<String>, + } -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub 
struct CorsRule { - pub id: Option<String>, - pub max_age_seconds: Option<u64>, - pub allow_origins: Vec<String>, - pub allow_methods: Vec<String>, - pub allow_headers: Vec<String>, - pub expose_headers: Vec<String>, -} + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct CorsRule { + pub id: Option<String>, + pub max_age_seconds: Option<u64>, + pub allow_origins: Vec<String>, + pub allow_methods: Vec<String>, + pub allow_headers: Vec<String>, + pub expose_headers: Vec<String>, + } -#[derive(Default, PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] -pub struct BucketQuotas { - /// Maximum size in bytes (bucket size = sum of sizes of objects in the bucket) - pub max_size: Option<u64>, - /// Maximum number of non-deleted objects in the bucket - pub max_objects: Option<u64>, + #[derive(Default, PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct BucketQuotas { + /// Maximum size in bytes (bucket size = sum of sizes of objects in the bucket) + pub max_size: Option<u64>, + /// Maximum number of non-deleted objects in the bucket + pub max_objects: Option<u64>, + } + + impl garage_util::migrate::InitialFormat for Bucket {} } +pub use v08::*; + impl AutoCrdt for BucketQuotas { const WARN_IF_DIFFERENT: bool = true; } diff --git a/src/model/garage.rs b/src/model/garage.rs index e34d034f..4716954a 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -27,20 +27,20 @@ use crate::index_counter::*; use crate::key_table::*; #[cfg(feature = "k2v")] -use crate::k2v::{item_table::*, poll::*, rpc::*}; +use crate::k2v::{item_table::*, rpc::*, sub::*}; /// An entire Garage full of data pub struct Garage { /// The parsed configuration Garage is running pub config: Config, + /// The set of background variables that can be viewed/modified at runtime + pub bg_vars: vars::BgVars, /// The replication mode of this cluster pub replication_mode: ReplicationMode, /// The local database pub db: db::Db, - /// A background job runner - pub background: Arc<BackgroundRunner>, /// The membership manager pub system: Arc<System>, /// The block manager @@ -78,7 +78,7 @@ pub struct GarageK2V { impl Garage { /// Create and run garage - pub fn new(config: Config, background: Arc<BackgroundRunner>) -> Result<Arc<Self>, Error> { + pub fn new(config: Config) -> Result<Arc<Self>, Error> { // Create meta dir and data dir if they don't exist already std::fs::create_dir_all(&config.metadata_dir) .ok_or_message("Unable to create Garage metadata directory")?; @@ -159,7 +159,7 @@ impl Garage { }; let network_key = NetworkKey::from_slice( - &hex::decode(&config.rpc_secret).expect("Invalid RPC secret key")[..], + &hex::decode(&config.rpc_secret.as_ref().unwrap()).expect("Invalid RPC secret key")[..], ) .expect("Invalid RPC secret key"); @@ -167,7 +167,7 @@ impl Garage { .expect("Invalid replication_mode in config file."); info!("Initialize membership management system..."); - let system = System::new(network_key, background.clone(), replication_mode, &config)?; + let system = System::new(network_key, replication_mode, &config)?; let data_rep_param = TableShardedReplication { system: system.clone(), @@ -225,7 +225,6 @@ impl Garage { info!("Initialize version_table..."); let version_table = Table::new( VersionTable { - background: background.clone(), block_ref_table: block_ref_table.clone(), }, meta_rep_param.clone(), @@ -240,7 +239,6 @@ impl Garage { #[allow(clippy::redundant_clone)] let object_table = Table::new( ObjectTable { - background: 
background.clone(), version_table: version_table.clone(), object_counter_table: object_counter_table.clone(), }, @@ -253,12 +251,16 @@ impl Garage { #[cfg(feature = "k2v")] let k2v = GarageK2V::new(system.clone(), &db, meta_rep_param); + // Initialize bg vars + let mut bg_vars = vars::BgVars::new(); + block_manager.register_bg_vars(&mut bg_vars); + // -- done -- Ok(Arc::new(Self { config, + bg_vars, replication_mode, db, - background, system, block_manager, bucket_table, @@ -273,6 +275,22 @@ impl Garage { })) } + pub fn spawn_workers(&self, bg: &BackgroundRunner) { + self.block_manager.spawn_workers(bg); + + self.bucket_table.spawn_workers(bg); + self.bucket_alias_table.spawn_workers(bg); + self.key_table.spawn_workers(bg); + + self.object_table.spawn_workers(bg); + self.object_counter_table.spawn_workers(bg); + self.version_table.spawn_workers(bg); + self.block_ref_table.spawn_workers(bg); + + #[cfg(feature = "k2v")] + self.k2v.spawn_workers(bg); + } + pub fn bucket_helper(&self) -> helper::bucket::BucketHelper { helper::bucket::BucketHelper(self) } @@ -287,8 +305,10 @@ impl GarageK2V { fn new(system: Arc<System>, db: &db::Db, meta_rep_param: TableShardedReplication) -> Self { info!("Initialize K2V counter table..."); let counter_table = IndexCounter::new(system.clone(), meta_rep_param.clone(), db); + info!("Initialize K2V subscription manager..."); let subscriptions = Arc::new(SubscriptionManager::new()); + info!("Initialize K2V item table..."); let item_table = Table::new( K2VItemTable { @@ -299,7 +319,9 @@ impl GarageK2V { system.clone(), db, ); - let rpc = K2VRpcHandler::new(system, item_table.clone(), subscriptions); + + info!("Initialize K2V RPC handler..."); + let rpc = K2VRpcHandler::new(system, db, item_table.clone(), subscriptions); Self { item_table, @@ -307,4 +329,9 @@ impl GarageK2V { rpc, } } + + pub fn spawn_workers(&self, bg: &BackgroundRunner) { + self.item_table.spawn_workers(bg); + self.counter_table.spawn_workers(bg); + } } diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs index e6394f0c..35d6596d 100644 --- a/src/model/index_counter.rs +++ b/src/model/index_counter.rs @@ -1,19 +1,18 @@ use core::ops::Bound; -use std::collections::{hash_map, BTreeMap, HashMap}; +use std::collections::{BTreeMap, HashMap}; use std::marker::PhantomData; use std::sync::Arc; -use async_trait::async_trait; use serde::{Deserialize, Serialize}; -use tokio::sync::{mpsc, watch}; use garage_db as db; use garage_rpc::ring::Ring; use garage_rpc::system::System; -use garage_util::background::*; +use garage_util::background::BackgroundRunner; use garage_util::data::*; use garage_util::error::*; +use garage_util::migrate::Migrate; use garage_util::time::*; use garage_table::crdt::*; @@ -31,14 +30,44 @@ pub trait CountedItem: Clone + PartialEq + Send + Sync + 'static { fn counts(&self) -> Vec<(&'static str, i64)>; } -/// A counter entry in the global table -#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)] -pub struct CounterEntry<T: CountedItem> { - pub pk: T::CP, - pub sk: T::CS, - pub values: BTreeMap<String, CounterValue>, +mod v08 { + use super::CountedItem; + use garage_util::data::Uuid; + use serde::{Deserialize, Serialize}; + use std::collections::BTreeMap; + + // ---- Global part (the table everyone queries) ---- + + /// A counter entry in the global table + #[derive(Clone, PartialEq, Debug, Serialize, Deserialize)] + pub struct CounterEntry<T: CountedItem> { + pub pk: T::CP, + pub sk: T::CS, + pub values: BTreeMap<String, CounterValue>, + } + + /// A counter entry 
in the global table + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct CounterValue { + pub node_values: BTreeMap<Uuid, (u64, i64)>, + } + + impl<T: CountedItem> garage_util::migrate::InitialFormat for CounterEntry<T> {} + + // ---- Local part (the counter we maintain transactionnaly on each node) ---- + + #[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] + pub(super) struct LocalCounterEntry<T: CountedItem> { + pub(super) pk: T::CP, + pub(super) sk: T::CS, + pub(super) values: BTreeMap<String, (u64, i64)>, + } + + impl<T: CountedItem> garage_util::migrate::InitialFormat for LocalCounterEntry<T> {} } +pub use v08::*; + impl<T: CountedItem> Entry<T::CP, T::CS> for CounterEntry<T> { fn partition_key(&self) -> &T::CP { &self.pk @@ -80,12 +109,6 @@ impl<T: CountedItem> CounterEntry<T> { } } -/// A counter entry in the global table -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct CounterValue { - pub node_values: BTreeMap<Uuid, (u64, i64)>, -} - impl<T: CountedItem> Crdt for CounterEntry<T> { fn merge(&mut self, other: &Self) { for (name, e2) in other.values.iter() { @@ -142,7 +165,6 @@ impl<T: CountedItem> TableSchema for CounterTable<T> { pub struct IndexCounter<T: CountedItem> { this_node: Uuid, local_counter: db::Tree, - propagate_tx: mpsc::UnboundedSender<(T::CP, T::CS, LocalCounterEntry<T>)>, pub table: Arc<Table<CounterTable<T>, TableShardedReplication>>, } @@ -152,16 +174,11 @@ impl<T: CountedItem> IndexCounter<T> { replication: TableShardedReplication, db: &db::Db, ) -> Arc<Self> { - let background = system.background.clone(); - - let (propagate_tx, propagate_rx) = mpsc::unbounded_channel(); - - let this = Arc::new(Self { + Arc::new(Self { this_node: system.id, local_counter: db .open_tree(format!("local_counter_v2:{}", T::COUNTER_TABLE_NAME)) .expect("Unable to open local counter tree"), - propagate_tx, table: Table::new( CounterTable { _phantom_t: Default::default(), @@ -170,16 +187,11 @@ impl<T: CountedItem> IndexCounter<T> { system, db, ), - }); - - background.spawn_worker(IndexPropagatorWorker { - index_counter: this.clone(), - propagate_rx, - buf: HashMap::new(), - errors: 0, - }); + }) + } - this + pub fn spawn_workers(&self, bg: &BackgroundRunner) { + self.table.spawn_workers(bg); } pub fn count( @@ -208,11 +220,9 @@ impl<T: CountedItem> IndexCounter<T> { let tree_key = self.table.data.tree_key(pk, sk); let mut entry = match tx.get(&self.local_counter, &tree_key[..])? { - Some(old_bytes) => { - rmp_serde::decode::from_read_ref::<_, LocalCounterEntry<T>>(&old_bytes) - .map_err(Error::RmpDecode) - .map_err(db::TxError::Abort)? 
- } + Some(old_bytes) => LocalCounterEntry::<T>::decode(&old_bytes) + .ok_or_message("Cannot decode local counter entry") + .map_err(db::TxError::Abort)?, None => LocalCounterEntry { pk: pk.clone(), sk: sk.clone(), @@ -227,17 +237,14 @@ impl<T: CountedItem> IndexCounter<T> { ent.1 += *inc; } - let new_entry_bytes = rmp_to_vec_all_named(&entry) + let new_entry_bytes = entry + .encode() .map_err(Error::RmpEncode) .map_err(db::TxError::Abort)?; tx.insert(&self.local_counter, &tree_key[..], new_entry_bytes)?; - if let Err(e) = self.propagate_tx.send((pk.clone(), sk.clone(), entry)) { - error!( - "Could not propagate updated counter values, failed to send to channel: {}", - e - ); - } + let dist_entry = entry.into_counter_entry(self.this_node); + self.table.queue_insert(tx, &dist_entry)?; Ok(()) } @@ -250,23 +257,6 @@ impl<T: CountedItem> IndexCounter<T> { TS: TableSchema<E = T>, TR: TableReplication, { - let save_counter_entry = |entry: CounterEntry<T>| -> Result<(), Error> { - let entry_k = self - .table - .data - .tree_key(entry.partition_key(), entry.sort_key()); - self.table - .data - .update_entry_with(&entry_k, |ent| match ent { - Some(mut ent) => { - ent.merge(&entry); - ent - } - None => entry.clone(), - })?; - Ok(()) - }; - // 1. Set all old local counters to zero let now = now_msec(); let mut next_start: Option<Vec<u8>> = None; @@ -289,20 +279,22 @@ impl<T: CountedItem> IndexCounter<T> { info!("zeroing old counters... ({})", hex::encode(&batch[0].0)); for (local_counter_k, local_counter) in batch { - let mut local_counter = - rmp_serde::decode::from_read_ref::<_, LocalCounterEntry<T>>(&local_counter)?; + let mut local_counter = LocalCounterEntry::<T>::decode(&local_counter) + .ok_or_message("Cannot decode local counter entry")?; for (_, tv) in local_counter.values.iter_mut() { tv.0 = std::cmp::max(tv.0 + 1, now); tv.1 = 0; } - let local_counter_bytes = rmp_to_vec_all_named(&local_counter)?; + let local_counter_bytes = local_counter.encode()?; self.local_counter .insert(&local_counter_k, &local_counter_bytes)?; let counter_entry = local_counter.into_counter_entry(self.this_node); - save_counter_entry(counter_entry)?; + self.local_counter + .db() + .transaction(|mut tx| self.table.queue_insert(&mut tx, &counter_entry))?; next_start = Some(local_counter_k); } @@ -343,9 +335,8 @@ impl<T: CountedItem> IndexCounter<T> { let local_counter_key = self.table.data.tree_key(pk, sk); let mut local_counter = match self.local_counter.get(&local_counter_key)? 
{ Some(old_bytes) => { - let ent = rmp_serde::decode::from_read_ref::<_, LocalCounterEntry<T>>( - &old_bytes, - )?; + let ent = LocalCounterEntry::<T>::decode(&old_bytes) + .ok_or_message("Cannot decode local counter entry")?; assert!(ent.pk == *pk); assert!(ent.sk == *sk); ent @@ -362,12 +353,14 @@ impl<T: CountedItem> IndexCounter<T> { tv.1 += v; } - let local_counter_bytes = rmp_to_vec_all_named(&local_counter)?; + let local_counter_bytes = local_counter.encode()?; self.local_counter .insert(&local_counter_key, local_counter_bytes)?; let counter_entry = local_counter.into_counter_entry(self.this_node); - save_counter_entry(counter_entry)?; + self.local_counter + .db() + .transaction(|mut tx| self.table.queue_insert(&mut tx, &counter_entry))?; next_start = Some(counted_entry_k); } @@ -378,104 +371,7 @@ impl<T: CountedItem> IndexCounter<T> { } } -struct IndexPropagatorWorker<T: CountedItem> { - index_counter: Arc<IndexCounter<T>>, - propagate_rx: mpsc::UnboundedReceiver<(T::CP, T::CS, LocalCounterEntry<T>)>, - - buf: HashMap<Vec<u8>, CounterEntry<T>>, - errors: usize, -} - -impl<T: CountedItem> IndexPropagatorWorker<T> { - fn add_ent(&mut self, pk: T::CP, sk: T::CS, counters: LocalCounterEntry<T>) { - let tree_key = self.index_counter.table.data.tree_key(&pk, &sk); - let dist_entry = counters.into_counter_entry(self.index_counter.this_node); - match self.buf.entry(tree_key) { - hash_map::Entry::Vacant(e) => { - e.insert(dist_entry); - } - hash_map::Entry::Occupied(mut e) => { - e.get_mut().merge(&dist_entry); - } - } - } -} - -#[async_trait] -impl<T: CountedItem> Worker for IndexPropagatorWorker<T> { - fn name(&self) -> String { - format!("{} index counter propagator", T::COUNTER_TABLE_NAME) - } - - fn info(&self) -> Option<String> { - if !self.buf.is_empty() { - Some(format!("{} items in queue", self.buf.len())) - } else { - None - } - } - - async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> { - // This loop batches updates to counters to be sent all at once. - // They are sent once the propagate_rx channel has been emptied (or is closed). 
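// In the new design, this batching worker disappears entirely: counter
// propagation goes through the table's transactional insert queue, as the
// hunks above show -- sketched:
//
//     self.local_counter.db().transaction(|mut tx| {
//         self.table.queue_insert(&mut tx, &counter_entry)
//     })?;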
- let closed = loop { - match self.propagate_rx.try_recv() { - Ok((pk, sk, counters)) => { - self.add_ent(pk, sk, counters); - } - Err(mpsc::error::TryRecvError::Empty) => break false, - Err(mpsc::error::TryRecvError::Disconnected) => break true, - } - }; - - if !self.buf.is_empty() { - let entries_k = self.buf.keys().take(100).cloned().collect::<Vec<_>>(); - let entries = entries_k.iter().map(|k| self.buf.get(k).unwrap()); - if let Err(e) = self.index_counter.table.insert_many(entries).await { - self.errors += 1; - if self.errors >= 2 && *must_exit.borrow() { - error!("({}) Could not propagate {} counter values: {}, these counters will not be updated correctly.", T::COUNTER_TABLE_NAME, self.buf.len(), e); - return Ok(WorkerState::Done); - } - // Propagate error up to worker manager, it will log it, increment a counter, - // and sleep for a certain delay (with exponential backoff), waiting for - // things to go back to normal - return Err(e); - } else { - for k in entries_k { - self.buf.remove(&k); - } - self.errors = 0; - } - - return Ok(WorkerState::Busy); - } else if closed { - return Ok(WorkerState::Done); - } else { - return Ok(WorkerState::Idle); - } - } - - async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState { - match self.propagate_rx.recv().await { - Some((pk, sk, counters)) => { - self.add_ent(pk, sk, counters); - WorkerState::Busy - } - None => match self.buf.is_empty() { - false => WorkerState::Busy, - true => WorkerState::Done, - }, - } - } -} - -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] -struct LocalCounterEntry<T: CountedItem> { - pk: T::CP, - sk: T::CS, - values: BTreeMap<String, (u64, i64)>, -} +// ---- impl<T: CountedItem> LocalCounterEntry<T> { fn into_counter_entry(self, this_node: Uuid) -> CounterEntry<T> { diff --git a/src/model/k2v/causality.rs b/src/model/k2v/causality.rs index 9a692870..c80ebd39 100644 --- a/src/model/k2v/causality.rs +++ b/src/model/k2v/causality.rs @@ -1,3 +1,14 @@ +//! Implements a CausalContext, which is a set of timestamps for each +//! node -- a vector clock --, indicating that the versions with +//! timestamps <= these numbers have been seen and can be +//! overwritten by a subsequent write. +//! +//! The textual representation of a CausalContext, which we call a +//! "causality token", is used in the API and must be sent along with +//! each write or delete operation to indicate the previously seen +//! versions that we want to overwrite or delete. +use base64::prelude::*; + use std::collections::BTreeMap; use std::convert::TryInto; @@ -5,28 +16,44 @@ use serde::{Deserialize, Serialize}; use garage_util::data::*; +use crate::helper::error::{Error as HelperError, OkOrBadRequest}; + /// Node IDs used in K2V are u64 integers that are the abbreviation /// of full Garage node IDs which are 256-bit UUIDs. 
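// A hedged sketch of a causality-token round trip using the API defined
// below in this file (serialize to base64; parse re-checks the length and
// the XOR checksum, and returns None on corruption):
//
//     let mut cc = CausalContext::new();
//     cc.vector_clock.insert(0x1122334455667788u64, 12);
//     let token = cc.serialize();
//     assert_eq!(CausalContext::parse(&token), Some(cc));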
pub type K2VNodeId = u64; +pub type VectorClock = BTreeMap<K2VNodeId, u64>; + pub fn make_node_id(node_id: Uuid) -> K2VNodeId { let mut tmp = [0u8; 8]; tmp.copy_from_slice(&node_id.as_slice()[..8]); u64::from_be_bytes(tmp) } -#[derive(PartialEq, Eq, Debug, Serialize, Deserialize)] +pub fn vclock_gt(a: &VectorClock, b: &VectorClock) -> bool { + a.iter().any(|(n, ts)| ts > b.get(n).unwrap_or(&0)) +} + +pub fn vclock_max(a: &VectorClock, b: &VectorClock) -> VectorClock { + let mut ret = a.clone(); + for (n, ts) in b.iter() { + let ent = ret.entry(*n).or_insert(0); + *ent = std::cmp::max(*ts, *ent); + } + ret +} + +#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Default)] pub struct CausalContext { - pub vector_clock: BTreeMap<K2VNodeId, u64>, + pub vector_clock: VectorClock, } impl CausalContext { /// Empty causality context - pub fn new_empty() -> Self { - Self { - vector_clock: BTreeMap::new(), - } + pub fn new() -> Self { + Self::default() } + /// Make binary representation and encode in base64 pub fn serialize(&self) -> String { let mut ints = Vec::with_capacity(2 * self.vector_clock.len()); @@ -41,14 +68,15 @@ impl CausalContext { bytes.extend(u64::to_be_bytes(i)); } - base64::encode_config(bytes, base64::URL_SAFE_NO_PAD) + BASE64_URL_SAFE_NO_PAD.encode(bytes) } - /// Parse from base64-encoded binary representation - pub fn parse(s: &str) -> Result<Self, String> { - let bytes = base64::decode_config(s, base64::URL_SAFE_NO_PAD) - .map_err(|e| format!("bad causality token base64: {}", e))?; + + /// Parse from base64-encoded binary representation. + /// Returns None on error. + pub fn parse(s: &str) -> Option<Self> { + let bytes = BASE64_URL_SAFE_NO_PAD.decode(s).ok()?; if bytes.len() % 16 != 8 || bytes.len() < 8 { - return Err("bad causality token length".into()); + return None; } let checksum = u64::from_be_bytes(bytes[..8].try_into().unwrap()); @@ -65,16 +93,19 @@ impl CausalContext { let check = ret.vector_clock.iter().fold(0, |acc, (n, t)| acc ^ *n ^ *t); if check != checksum { - return Err("bad causality token checksum".into()); + return None; } - Ok(ret) + Some(ret) } + + pub fn parse_helper(s: &str) -> Result<Self, HelperError> { + Self::parse(s).ok_or_bad_request("Invalid causality token") + } + /// Check if this causal context contains newer items than another one pub fn is_newer_than(&self, other: &Self) -> bool { - self.vector_clock - .iter() - .any(|(k, v)| v > other.vector_clock.get(k).unwrap_or(&0)) + vclock_gt(&self.vector_clock, &other.vector_clock) } } diff --git a/src/model/k2v/item_table.rs b/src/model/k2v/item_table.rs index 7860cb17..28646f37 100644 --- a/src/model/k2v/item_table.rs +++ b/src/model/k2v/item_table.rs @@ -1,7 +1,8 @@ -use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; use std::sync::Arc; +use serde::{Deserialize, Serialize}; + use garage_db as db; use garage_util::data::*; @@ -10,39 +11,50 @@ use garage_table::*; use crate::index_counter::*; use crate::k2v::causality::*; -use crate::k2v::poll::*; +use crate::k2v::sub::*; pub const ENTRIES: &str = "entries"; pub const CONFLICTS: &str = "conflicts"; pub const VALUES: &str = "values"; pub const BYTES: &str = "bytes"; -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct K2VItem { - pub partition: K2VItemPartition, - pub sort_key: String, +mod v08 { + use crate::k2v::causality::K2VNodeId; + use garage_util::data::Uuid; + use serde::{Deserialize, Serialize}; + use std::collections::BTreeMap; - items: BTreeMap<K2VNodeId, DvvsEntry>, -} + #[derive(PartialEq, Eq, 
Clone, Debug, Serialize, Deserialize)] + pub struct K2VItem { + pub partition: K2VItemPartition, + pub sort_key: String, -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize, Hash)] -pub struct K2VItemPartition { - pub bucket_id: Uuid, - pub partition_key: String, -} + pub(super) items: BTreeMap<K2VNodeId, DvvsEntry>, + } -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -struct DvvsEntry { - t_discard: u64, - values: Vec<(u64, DvvsValue)>, -} + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize, Hash)] + pub struct K2VItemPartition { + pub bucket_id: Uuid, + pub partition_key: String, + } + + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct DvvsEntry { + pub(super) t_discard: u64, + pub(super) values: Vec<(u64, DvvsValue)>, + } -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub enum DvvsValue { - Value(#[serde(with = "serde_bytes")] Vec<u8>), - Deleted, + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub enum DvvsValue { + Value(#[serde(with = "serde_bytes")] Vec<u8>), + Deleted, + } + + impl garage_util::migrate::InitialFormat for K2VItem {} } +pub use v08::*; + impl K2VItem { /// Creates a new K2VItem when no previous entry existed in the db pub fn new(bucket_id: Uuid, partition_key: String, sort_key: String) -> Self { @@ -61,7 +73,8 @@ impl K2VItem { this_node: Uuid, context: &Option<CausalContext>, new_value: DvvsValue, - ) { + node_ts: u64, + ) -> u64 { if let Some(context) = context { for (node, t_discard) in context.vector_clock.iter() { if let Some(e) = self.items.get_mut(node) { @@ -86,12 +99,14 @@ impl K2VItem { values: vec![], }); let t_prev = e.max_time(); - e.values.push((t_prev + 1, new_value)); + let t_new = std::cmp::max(t_prev + 1, node_ts + 1); + e.values.push((t_new, new_value)); + t_new } /// Extract the causality context of a K2V Item pub fn causal_context(&self) -> CausalContext { - let mut cc = CausalContext::new_empty(); + let mut cc = CausalContext::new(); for (node, ent) in self.items.iter() { cc.vector_clock.insert(*node, ent.max_time()); } @@ -161,9 +176,9 @@ impl Crdt for DvvsEntry { impl PartitionKey for K2VItemPartition { fn hash(&self) -> Hash { - use blake2::{Blake2b, Digest}; + use blake2::{Blake2b512, Digest}; - let mut hasher = Blake2b::new(); + let mut hasher = Blake2b512::new(); hasher.update(self.bucket_id.as_slice()); hasher.update(self.partition_key.as_bytes()); let mut hash = [0u8; 32]; diff --git a/src/model/k2v/mod.rs b/src/model/k2v/mod.rs index f6a96151..acc1fcdc 100644 --- a/src/model/k2v/mod.rs +++ b/src/model/k2v/mod.rs @@ -1,6 +1,8 @@ pub mod causality; +pub mod seen; pub mod item_table; -pub mod poll; pub mod rpc; + +pub mod sub; diff --git a/src/model/k2v/poll.rs b/src/model/k2v/poll.rs deleted file mode 100644 index 93105207..00000000 --- a/src/model/k2v/poll.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::collections::HashMap; -use std::sync::Mutex; - -use serde::{Deserialize, Serialize}; -use tokio::sync::broadcast; - -use crate::k2v::item_table::*; - -#[derive(Debug, Hash, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct PollKey { - pub partition: K2VItemPartition, - pub sort_key: String, -} - -#[derive(Default)] -pub struct SubscriptionManager { - subscriptions: Mutex<HashMap<PollKey, broadcast::Sender<K2VItem>>>, -} - -impl SubscriptionManager { - pub fn new() -> Self { - Self::default() - } - - pub fn subscribe(&self, key: &PollKey) -> broadcast::Receiver<K2VItem> { - let mut subs = self.subscriptions.lock().unwrap(); - if let 
Some(s) = subs.get(key) { - s.subscribe() - } else { - let (tx, rx) = broadcast::channel(8); - subs.insert(key.clone(), tx); - rx - } - } - - pub fn notify(&self, item: &K2VItem) { - let key = PollKey { - partition: item.partition.clone(), - sort_key: item.sort_key.clone(), - }; - let mut subs = self.subscriptions.lock().unwrap(); - if let Some(s) = subs.get(&key) { - if s.send(item.clone()).is_err() { - // no more subscribers, remove channel from here - // (we will re-create it later if we need to subscribe again) - subs.remove(&key); - } - } - } -} diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs index a74df277..117103b6 100644 --- a/src/model/k2v/rpc.rs +++ b/src/model/k2v/rpc.rs @@ -5,9 +5,10 @@ //! node does not process the entry directly, as this would //! mean the vector clock gets much larger than needed). -use std::collections::HashMap; -use std::sync::Arc; -use std::time::Duration; +use std::collections::{BTreeMap, HashMap}; +use std::convert::TryInto; +use std::sync::{Arc, Mutex, MutexGuard}; +use std::time::{Duration, Instant}; use async_trait::async_trait; use futures::stream::FuturesUnordered; @@ -15,9 +16,12 @@ use futures::StreamExt; use serde::{Deserialize, Serialize}; use tokio::select; +use garage_db as db; + use garage_util::crdt::*; use garage_util::data::*; use garage_util::error::*; +use garage_util::time::now_msec; use garage_rpc::system::System; use garage_rpc::*; @@ -25,9 +29,15 @@ use garage_rpc::*; use garage_table::replication::{TableReplication, TableShardedReplication}; use garage_table::{PartitionKey, Table}; +use crate::helper::error::Error as HelperError; use crate::k2v::causality::*; use crate::k2v::item_table::*; -use crate::k2v::poll::*; +use crate::k2v::seen::*; +use crate::k2v::sub::*; + +const POLL_RANGE_EXTRA_DELAY: Duration = Duration::from_millis(200); + +const TIMESTAMP_KEY: &'static [u8] = b"timestamp"; /// RPC messages for K2V #[derive(Debug, Serialize, Deserialize)] @@ -40,7 +50,13 @@ enum K2VRpc { causal_context: CausalContext, timeout_msec: u64, }, + PollRange { + range: PollRange, + seen_str: Option<String>, + timeout_msec: u64, + }, PollItemResponse(Option<K2VItem>), + PollRangeResponse(Uuid, Vec<K2VItem>), } #[derive(Debug, Serialize, Deserialize)] @@ -59,6 +75,12 @@ impl Rpc for K2VRpc { pub struct K2VRpcHandler { system: Arc<System>, item_table: Arc<Table<K2VItemTable, TableShardedReplication>>, + + // Using a mutex on the local_timestamp_tree is not strictly necessary, + // but it helps to not try to do several inserts at the same time, + // which would create transaction conflicts and force many useless retries. 
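// A hedged sketch of the monotonicity rule this timestamp tree enforces
// (mirroring K2VItem::update in item_table.rs): a new version's timestamp
// must exceed both the item's own latest time and the node-wide timestamp:
//
//     fn next_timestamp(t_prev_item: u64, node_local_ts: u64) -> u64 {
//         std::cmp::max(t_prev_item + 1, node_local_ts + 1)
//     }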
+ local_timestamp_tree: Mutex<db::Tree>, + endpoint: Arc<Endpoint<K2VRpc, Self>>, subscriptions: Arc<SubscriptionManager>, } @@ -66,14 +88,19 @@ pub struct K2VRpcHandler { impl K2VRpcHandler { pub fn new( system: Arc<System>, + db: &db::Db, item_table: Arc<Table<K2VItemTable, TableShardedReplication>>, subscriptions: Arc<SubscriptionManager>, ) -> Arc<Self> { + let local_timestamp_tree = db + .open_tree("k2v_local_timestamp") + .expect("Unable to open DB tree for k2v local timestamp"); let endpoint = system.netapp.endpoint("garage_model/k2v/Rpc".to_string()); let rpc_handler = Arc::new(Self { system, item_table, + local_timestamp_tree: Mutex::new(local_timestamp_tree), endpoint, subscriptions, }); @@ -181,7 +208,7 @@ impl K2VRpcHandler { Ok(()) } - pub async fn poll( + pub async fn poll_item( &self, bucket_id: Uuid, partition_key: String, @@ -230,9 +257,7 @@ impl K2VRpcHandler { resp = Some(x); } } - K2VRpc::PollItemResponse(None) => { - return Ok(None); - } + K2VRpc::PollItemResponse(None) => (), v => return Err(Error::unexpected_rpc_message(v)), } } @@ -240,10 +265,117 @@ impl K2VRpcHandler { Ok(resp) } + pub async fn poll_range( + &self, + range: PollRange, + seen_str: Option<String>, + timeout_msec: u64, + ) -> Result<Option<(BTreeMap<String, K2VItem>, String)>, HelperError> { + let has_seen_marker = seen_str.is_some(); + + // Parse seen marker, we will use it below. This is also the first check + // that it is valid, which returns a bad request error if not. + let mut seen = seen_str + .as_deref() + .map(RangeSeenMarker::decode_helper) + .transpose()? + .unwrap_or_default(); + seen.restrict(&range); + + // Prepare PollRange RPC to send to the storage nodes responsible for the parititon + let nodes = self + .item_table + .data + .replication + .write_nodes(&range.partition.hash()); + let quorum = self.item_table.data.replication.read_quorum(); + let msg = K2VRpc::PollRange { + range, + seen_str, + timeout_msec, + }; + + // Send the request to all nodes, use FuturesUnordered to get the responses in any order + let msg = msg.into_req().map_err(netapp::error::Error::from)?; + let rs = RequestStrategy::with_priority(PRIO_NORMAL).without_timeout(); + let mut requests = nodes + .iter() + .map(|node| self.system.rpc.call(&self.endpoint, *node, msg.clone(), rs)) + .collect::<FuturesUnordered<_>>(); + + // Fetch responses. This procedure stops fetching responses when any of the following + // conditions arise: + // - we have a response to all requests + // - we have a response to a read quorum of requests (e.g. 2/3), and an extra delay + // has passed since the quorum was achieved + // - a global RPC timeout expired + // The extra delay after a quorum was received is usefull if the third response was to + // arrive during this short interval: this would allow us to consider all the data seen + // by that last node in the response we produce, and would likely help reduce the + // size of the seen marker that we will return (because we would have an info of the + // kind: all items produced by that node until time ts have been returned, so we can + // bump the entry in the global vector clock and possibly remove some item-specific + // vector clocks) + let mut deadline = + Instant::now() + Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout(); + let mut resps = vec![]; + let mut errors = vec![]; + loop { + select! 
{ + _ = tokio::time::sleep_until(deadline.into()) => { + break; + } + res = requests.next() => match res { + None => break, + Some(Err(e)) => errors.push(e), + Some(Ok(r)) => { + resps.push(r); + if resps.len() >= quorum { + deadline = std::cmp::min(deadline, Instant::now() + POLL_RANGE_EXTRA_DELAY); + } + } + } + } + } + if errors.len() > nodes.len() - quorum { + let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>(); + return Err(Error::Quorum(quorum, resps.len(), nodes.len(), errors).into()); + } + + // Take all returned items into account to produce the response. + let mut new_items = BTreeMap::<String, K2VItem>::new(); + for v in resps { + if let K2VRpc::PollRangeResponse(node, items) = v { + seen.mark_seen_node_items(node, items.iter()); + for item in items.into_iter() { + match new_items.get_mut(&item.sort_key) { + Some(ent) => { + ent.merge(&item); + } + None => { + new_items.insert(item.sort_key.clone(), item); + } + } + } + } else { + return Err(Error::unexpected_rpc_message(v).into()); + } + } + + if new_items.is_empty() && has_seen_marker { + Ok(None) + } else { + Ok(Some((new_items, seen.encode()?))) + } + } + // ---- internal handlers ---- async fn handle_insert(&self, item: &InsertedItem) -> Result<K2VRpc, Error> { - let new = self.local_insert(item)?; + let new = { + let local_timestamp_tree = self.local_timestamp_tree.lock().unwrap(); + self.local_insert(&local_timestamp_tree, item)? + }; // Propagate to rest of network if let Some(updated) = new { @@ -256,11 +388,14 @@ impl K2VRpcHandler { async fn handle_insert_many(&self, items: &[InsertedItem]) -> Result<K2VRpc, Error> { let mut updated_vec = vec![]; - for item in items { - let new = self.local_insert(item)?; + { + let local_timestamp_tree = self.local_timestamp_tree.lock().unwrap(); + for item in items { + let new = self.local_insert(&local_timestamp_tree, item)?; - if let Some(updated) = new { - updated_vec.push(updated); + if let Some(updated) = new { + updated_vec.push(updated); + } } } @@ -272,15 +407,22 @@ impl K2VRpcHandler { Ok(K2VRpc::Ok) } - fn local_insert(&self, item: &InsertedItem) -> Result<Option<K2VItem>, Error> { - let tree_key = self - .item_table - .data - .tree_key(&item.partition, &item.sort_key); + fn local_insert( + &self, + local_timestamp_tree: &MutexGuard<'_, db::Tree>, + item: &InsertedItem, + ) -> Result<Option<K2VItem>, Error> { + let now = now_msec(); self.item_table .data - .update_entry_with(&tree_key[..], |ent| { + .update_entry_with(&item.partition, &item.sort_key, |tx, ent| { + let old_local_timestamp = tx + .get(&local_timestamp_tree, TIMESTAMP_KEY)? 
+ .and_then(|x| x.try_into().ok()) + .map(u64::from_be_bytes) + .unwrap_or_default(); + let mut ent = ent.unwrap_or_else(|| { K2VItem::new( item.partition.bucket_id, @@ -288,13 +430,25 @@ impl K2VRpcHandler { item.sort_key.clone(), ) }); - ent.update(self.system.id, &item.causal_context, item.value.clone()); - ent + let new_local_timestamp = ent.update( + self.system.id, + &item.causal_context, + item.value.clone(), + std::cmp::max(old_local_timestamp, now), + ); + + tx.insert( + &local_timestamp_tree, + TIMESTAMP_KEY, + u64::to_be_bytes(new_local_timestamp), + )?; + + Ok(ent) }) } - async fn handle_poll(&self, key: &PollKey, ct: &CausalContext) -> Result<K2VItem, Error> { - let mut chan = self.subscriptions.subscribe(key); + async fn handle_poll_item(&self, key: &PollKey, ct: &CausalContext) -> Result<K2VItem, Error> { + let mut chan = self.subscriptions.subscribe_item(key); let mut value = self .item_table @@ -316,6 +470,71 @@ impl K2VRpcHandler { Ok(value) } + + async fn handle_poll_range( + &self, + range: &PollRange, + seen_str: &Option<String>, + ) -> Result<Vec<K2VItem>, Error> { + if let Some(seen_str) = seen_str { + let seen = RangeSeenMarker::decode(seen_str).ok_or_message("Invalid seenMarker")?; + + // Subscribe now to all changes on that partition, + // so that new items that are inserted while we are reading the range + // will be seen in the loop below + let mut chan = self.subscriptions.subscribe_partition(&range.partition); + + // Check for the presence of any new items already stored in the item table + let mut new_items = self.poll_range_read_range(range, &seen)?; + + // If we found no new items, wait for a matching item to arrive + // on the channel + while new_items.is_empty() { + let item = chan.recv().await?; + if range.matches(&item) && seen.is_new_item(&item) { + new_items.push(item); + } + } + + Ok(new_items) + } else { + // If no seen marker was specified, we do not poll for anything. + // We return immediately with the set of known items (even if + // it is empty), which will give the client an inital view of + // the dataset and an initial seen marker for further + // PollRange calls. + self.poll_range_read_range(range, &RangeSeenMarker::default()) + } + } + + fn poll_range_read_range( + &self, + range: &PollRange, + seen: &RangeSeenMarker, + ) -> Result<Vec<K2VItem>, Error> { + let mut new_items = vec![]; + + let partition_hash = range.partition.hash(); + let first_key = match &range.start { + None => partition_hash.to_vec(), + Some(sk) => self.item_table.data.tree_key(&range.partition, sk), + }; + for item in self.item_table.data.store.range(first_key..)? { + let (key, value) = item?; + if &key[..32] != partition_hash.as_slice() { + break; + } + let item = self.item_table.data.decode_entry(&value)?; + if !range.matches(&item) { + break; + } + if seen.is_new_item(&item) { + new_items.push(item); + } + } + + Ok(new_items) + } } #[async_trait] @@ -331,10 +550,21 @@ impl EndpointHandler<K2VRpc> for K2VRpcHandler { } => { let delay = tokio::time::sleep(Duration::from_millis(*timeout_msec)); select! { - ret = self.handle_poll(key, causal_context) => ret.map(Some).map(K2VRpc::PollItemResponse), + ret = self.handle_poll_item(key, causal_context) => ret.map(Some).map(K2VRpc::PollItemResponse), _ = delay => Ok(K2VRpc::PollItemResponse(None)), } } + K2VRpc::PollRange { + range, + seen_str, + timeout_msec, + } => { + let delay = tokio::time::sleep(Duration::from_millis(*timeout_msec)); + select! 
{ + ret = self.handle_poll_range(range, seen_str) => ret.map(|items| K2VRpc::PollRangeResponse(self.system.id, items)), + _ = delay => Ok(K2VRpc::PollRangeResponse(self.system.id, vec![])), + } + } m => Err(Error::unexpected_rpc_message(m)), } } diff --git a/src/model/k2v/seen.rs b/src/model/k2v/seen.rs new file mode 100644 index 00000000..51098710 --- /dev/null +++ b/src/model/k2v/seen.rs @@ -0,0 +1,105 @@ +//! Implements a RangeSeenMarker, a data type used in the PollRange API +//! to indicate which items in the range have already been seen +//! and which have not been seen yet. +//! +//! It consists of a vector clock that indicates that for each node, +//! all items produced by that node with timestamps <= the value in the +//! vector clock has been seen, as well as a set of causal contexts for +//! individual items. + +use std::collections::BTreeMap; + +use base64::prelude::*; +use serde::{Deserialize, Serialize}; + +use garage_util::data::Uuid; +use garage_util::encode::{nonversioned_decode, nonversioned_encode}; +use garage_util::error::Error; + +use crate::helper::error::{Error as HelperError, OkOrBadRequest}; +use crate::k2v::causality::*; +use crate::k2v::item_table::*; +use crate::k2v::sub::*; + +#[derive(Debug, Serialize, Deserialize, Default)] +pub struct RangeSeenMarker { + vector_clock: VectorClock, + items: BTreeMap<String, VectorClock>, +} + +impl RangeSeenMarker { + pub fn new() -> Self { + Self::default() + } + + pub fn restrict(&mut self, range: &PollRange) { + if let Some(start) = &range.start { + self.items = self.items.split_off(start); + } + if let Some(end) = &range.end { + self.items.split_off(end); + } + if let Some(pfx) = &range.prefix { + self.items.retain(|k, _v| k.starts_with(pfx)); + } + } + + pub fn mark_seen_node_items<'a, I: IntoIterator<Item = &'a K2VItem>>( + &mut self, + node: Uuid, + items: I, + ) { + let node = make_node_id(node); + for item in items.into_iter() { + let cc = item.causal_context(); + + if let Some(ts) = cc.vector_clock.get(&node) { + let ent = self.vector_clock.entry(node).or_insert(0); + *ent = std::cmp::max(*ent, *ts); + } + + if vclock_gt(&cc.vector_clock, &self.vector_clock) { + match self.items.get_mut(&item.sort_key) { + None => { + self.items.insert(item.sort_key.clone(), cc.vector_clock); + } + Some(ent) => *ent = vclock_max(&ent, &cc.vector_clock), + } + } + } + } + + pub fn canonicalize(&mut self) { + let self_vc = &self.vector_clock; + self.items.retain(|_sk, vc| vclock_gt(&vc, self_vc)) + } + + pub fn encode(&mut self) -> Result<String, Error> { + self.canonicalize(); + + let bytes = nonversioned_encode(&self)?; + let bytes = zstd::stream::encode_all(&mut &bytes[..], zstd::DEFAULT_COMPRESSION_LEVEL)?; + Ok(BASE64_STANDARD.encode(&bytes)) + } + + /// Decode from msgpack+zstd+b64 representation, returns None on error. 
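// A hedged round-trip sketch of the marker encoding implemented here
// (canonicalize, then msgpack, zstd and base64; decode below undoes it):
//
//     let mut marker = RangeSeenMarker::new();
//     let s = marker.encode()?; // inside a fn returning Result<_, Error>
//     assert!(RangeSeenMarker::decode(&s).is_some());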
+ pub fn decode(s: &str) -> Option<Self> { + let bytes = BASE64_STANDARD.decode(&s).ok()?; + let bytes = zstd::stream::decode_all(&mut &bytes[..]).ok()?; + nonversioned_decode(&bytes).ok() + } + + pub fn decode_helper(s: &str) -> Result<Self, HelperError> { + Self::decode(s).ok_or_bad_request("Invalid causality token") + } + + pub fn is_new_item(&self, item: &K2VItem) -> bool { + let cc = item.causal_context(); + vclock_gt(&cc.vector_clock, &self.vector_clock) + && self + .items + .get(&item.sort_key) + .map(|vc| vclock_gt(&cc.vector_clock, &vc)) + .unwrap_or(true) + } +} diff --git a/src/model/k2v/sub.rs b/src/model/k2v/sub.rs new file mode 100644 index 00000000..b1daa271 --- /dev/null +++ b/src/model/k2v/sub.rs @@ -0,0 +1,110 @@ +use std::collections::HashMap; +use std::sync::Mutex; + +use serde::{Deserialize, Serialize}; +use tokio::sync::broadcast; + +use crate::k2v::item_table::*; + +#[derive(Debug, Hash, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct PollKey { + pub partition: K2VItemPartition, + pub sort_key: String, +} + +#[derive(Debug, Hash, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct PollRange { + pub partition: K2VItemPartition, + pub prefix: Option<String>, + pub start: Option<String>, + pub end: Option<String>, +} + +#[derive(Default)] +pub struct SubscriptionManager(Mutex<SubscriptionManagerInner>); + +#[derive(Default)] +pub struct SubscriptionManagerInner { + item_subscriptions: HashMap<PollKey, broadcast::Sender<K2VItem>>, + part_subscriptions: HashMap<K2VItemPartition, broadcast::Sender<K2VItem>>, +} + +impl SubscriptionManager { + pub fn new() -> Self { + Self::default() + } + + pub(crate) fn subscribe_item(&self, key: &PollKey) -> broadcast::Receiver<K2VItem> { + let mut inner = self.0.lock().unwrap(); + if let Some(s) = inner.item_subscriptions.get(key) { + s.subscribe() + } else { + let (tx, rx) = broadcast::channel(8); + inner.item_subscriptions.insert(key.clone(), tx); + rx + } + } + + pub(crate) fn subscribe_partition( + &self, + part: &K2VItemPartition, + ) -> broadcast::Receiver<K2VItem> { + let mut inner = self.0.lock().unwrap(); + if let Some(s) = inner.part_subscriptions.get(part) { + s.subscribe() + } else { + let (tx, rx) = broadcast::channel(8); + inner.part_subscriptions.insert(part.clone(), tx); + rx + } + } + + pub(crate) fn notify(&self, item: &K2VItem) { + let mut inner = self.0.lock().unwrap(); + + // 1. Notify single item subscribers, + // removing subscriptions with no more listeners if any + let key = PollKey { + partition: item.partition.clone(), + sort_key: item.sort_key.clone(), + }; + if let Some(s) = inner.item_subscriptions.get(&key) { + if s.send(item.clone()).is_err() { + // no more subscribers, remove channel from here + // (we will re-create it later if we need to subscribe again) + inner.item_subscriptions.remove(&key); + } + } + + // 2. 
Notify partition subscribers, + // removing subscriptions with no more listeners if any + if let Some(s) = inner.part_subscriptions.get(&item.partition) { + if s.send(item.clone()).is_err() { + // no more subscribers, remove channel from here + // (we will re-create it later if we need to subscribe again) + inner.part_subscriptions.remove(&item.partition); + } + } + } +} + +impl PollRange { + pub fn matches(&self, item: &K2VItem) -> bool { + item.partition == self.partition + && self + .prefix + .as_ref() + .map(|x| item.sort_key.starts_with(x)) + .unwrap_or(true) + && self + .start + .as_ref() + .map(|x| item.sort_key >= *x) + .unwrap_or(true) + && self + .end + .as_ref() + .map(|x| item.sort_key < *x) + .unwrap_or(true) + } +} diff --git a/src/model/key_table.rs b/src/model/key_table.rs index 9d2fc783..bb5334a3 100644 --- a/src/model/key_table.rs +++ b/src/model/key_table.rs @@ -1,45 +1,121 @@ use serde::{Deserialize, Serialize}; -use garage_table::crdt::*; -use garage_table::*; +use garage_util::crdt::{self, Crdt}; use garage_util::data::*; +use garage_table::{DeletedFilter, EmptyKey, Entry, TableSchema}; + use crate::permission::BucketKeyPerm; -use crate::prev::v051::key_table as old; +pub(crate) mod v05 { + use garage_util::crdt; + use serde::{Deserialize, Serialize}; -/// An api key -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct Key { - /// The id of the key (immutable), used as partition key - pub key_id: String, + /// An api key + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct Key { + /// The id of the key (immutable), used as partition key + pub key_id: String, - /// Internal state of the key - pub state: crdt::Deletable<KeyParams>, -} + /// The secret_key associated + pub secret_key: String, -/// Configuration for a key -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct KeyParams { - /// The secret_key associated (immutable) - pub secret_key: String, + /// Name for the key + pub name: crdt::Lww<String>, - /// Name for the key - pub name: crdt::Lww<String>, + /// Is the key deleted + pub deleted: crdt::Bool, + + /// Buckets in which the key is authorized. Empty if `Key` is deleted + // CRDT interaction: deleted implies authorized_buckets is empty + pub authorized_buckets: crdt::LwwMap<String, PermissionSet>, + } - /// Flag to allow users having this key to create buckets - pub allow_create_bucket: crdt::Lww<bool>, + /// Permission given to a key in a bucket + #[derive(PartialOrd, Ord, PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct PermissionSet { + /// The key can be used to read the bucket + pub allow_read: bool, + /// The key can be used to write in the bucket + pub allow_write: bool, + } + + impl crdt::AutoCrdt for PermissionSet { + const WARN_IF_DIFFERENT: bool = true; + } - /// If the key is present: it gives some permissions, - /// a map of bucket IDs (uuids) to permissions. 
- /// Otherwise no permissions are granted to key - pub authorized_buckets: crdt::Map<Uuid, BucketKeyPerm>, + impl garage_util::migrate::InitialFormat for Key {} +} - /// A key can have a local view of buckets names it is - /// the only one to see, this is the namespace for these aliases - pub local_aliases: crdt::LwwMap<String, Option<Uuid>>, +mod v08 { + use super::v05; + use crate::permission::BucketKeyPerm; + use garage_util::crdt; + use garage_util::data::Uuid; + use serde::{Deserialize, Serialize}; + + /// An api key + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct Key { + /// The id of the key (immutable), used as partition key + pub key_id: String, + + /// Internal state of the key + pub state: crdt::Deletable<KeyParams>, + } + + /// Configuration for a key + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct KeyParams { + /// The secret_key associated (immutable) + pub secret_key: String, + + /// Name for the key + pub name: crdt::Lww<String>, + + /// Flag to allow users having this key to create buckets + pub allow_create_bucket: crdt::Lww<bool>, + + /// If the key is present: it gives some permissions, + /// a map of bucket IDs (uuids) to permissions. + /// Otherwise no permissions are granted to key + pub authorized_buckets: crdt::Map<Uuid, BucketKeyPerm>, + + /// A key can have a local view of buckets names it is + /// the only one to see, this is the namespace for these aliases + pub local_aliases: crdt::LwwMap<String, Option<Uuid>>, + } + + impl garage_util::migrate::Migrate for Key { + type Previous = v05::Key; + + fn migrate(old_k: v05::Key) -> Key { + let name = crdt::Lww::raw(old_k.name.timestamp(), old_k.name.get().clone()); + + let state = if old_k.deleted.get() { + crdt::Deletable::Deleted + } else { + // Authorized buckets is ignored here, + // migration is performed in specific migration code in + // garage/migrate.rs + crdt::Deletable::Present(KeyParams { + secret_key: old_k.secret_key, + name, + allow_create_bucket: crdt::Lww::new(false), + authorized_buckets: crdt::Map::new(), + local_aliases: crdt::LwwMap::new(), + }) + }; + Key { + key_id: old_k.key_id, + state, + } + } + } } +pub use v08::*; + impl KeyParams { fn new(secret_key: &str, name: &str) -> Self { KeyParams { @@ -173,28 +249,4 @@ impl TableSchema for KeyTable { } } } - - fn try_migrate(bytes: &[u8]) -> Option<Self::E> { - let old_k = rmp_serde::decode::from_read_ref::<_, old::Key>(bytes).ok()?; - let name = crdt::Lww::raw(old_k.name.timestamp(), old_k.name.get().clone()); - - let state = if old_k.deleted.get() { - crdt::Deletable::Deleted - } else { - // Authorized buckets is ignored here, - // migration is performed in specific migration code in - // garage/migrate.rs - crdt::Deletable::Present(KeyParams { - secret_key: old_k.secret_key, - name, - allow_create_bucket: crdt::Lww::new(false), - authorized_buckets: crdt::Map::new(), - local_aliases: crdt::LwwMap::new(), - }) - }; - Some(Key { - key_id: old_k.key_id, - state, - }) - } } diff --git a/src/model/migrate.rs b/src/model/migrate.rs index cd6ad26a..6b4c3eed 100644 --- a/src/model/migrate.rs +++ b/src/model/migrate.rs @@ -2,6 +2,7 @@ use std::sync::Arc; use garage_util::crdt::*; use garage_util::data::*; +use garage_util::encode::nonversioned_decode; use garage_util::error::Error as GarageError; use garage_util::time::*; @@ -28,8 +29,8 @@ impl Migrate { let mut old_buckets = vec![]; for res in tree.iter().map_err(GarageError::from)? 
{ let (_k, v) = res.map_err(GarageError::from)?; - let bucket = rmp_serde::decode::from_read_ref::<_, old_bucket::Bucket>(&v[..]) - .map_err(GarageError::from)?; + let bucket = + nonversioned_decode::<old_bucket::Bucket>(&v[..]).map_err(GarageError::from)?; old_buckets.push(bucket); } diff --git a/src/model/prev/v051/bucket_table.rs b/src/model/prev/v051/bucket_table.rs index 628a49dd..19893458 100644 --- a/src/model/prev/v051/bucket_table.rs +++ b/src/model/prev/v051/bucket_table.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use garage_table::crdt::Crdt; use garage_table::*; -use super::key_table::PermissionSet; +use crate::key_table::v05::PermissionSet; /// A bucket is a collection of objects /// diff --git a/src/model/prev/v051/key_table.rs b/src/model/prev/v051/key_table.rs deleted file mode 100644 index 37516b1c..00000000 --- a/src/model/prev/v051/key_table.rs +++ /dev/null @@ -1,50 +0,0 @@ -use serde::{Deserialize, Serialize}; - -use garage_table::crdt::*; -use garage_table::*; - -/// An api key -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct Key { - /// The id of the key (immutable), used as partition key - pub key_id: String, - - /// The secret_key associated - pub secret_key: String, - - /// Name for the key - pub name: crdt::Lww<String>, - - /// Is the key deleted - pub deleted: crdt::Bool, - - /// Buckets in which the key is authorized. Empty if `Key` is deleted - // CRDT interaction: deleted implies authorized_buckets is empty - pub authorized_buckets: crdt::LwwMap<String, PermissionSet>, -} - -/// Permission given to a key in a bucket -#[derive(PartialOrd, Ord, PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct PermissionSet { - /// The key can be used to read the bucket - pub allow_read: bool, - /// The key can be used to write in the bucket - pub allow_write: bool, -} - -impl AutoCrdt for PermissionSet { - const WARN_IF_DIFFERENT: bool = true; -} - -impl Crdt for Key { - fn merge(&mut self, other: &Self) { - self.name.merge(&other.name); - self.deleted.merge(&other.deleted); - - if self.deleted.get() { - self.authorized_buckets.clear(); - } else { - self.authorized_buckets.merge(&other.authorized_buckets); - } - } -} diff --git a/src/model/prev/v051/mod.rs b/src/model/prev/v051/mod.rs index 7a954752..8c1335a5 100644 --- a/src/model/prev/v051/mod.rs +++ b/src/model/prev/v051/mod.rs @@ -1,4 +1 @@ pub(crate) mod bucket_table; -pub(crate) mod key_table; -pub(crate) mod object_table; -pub(crate) mod version_table; diff --git a/src/model/prev/v051/object_table.rs b/src/model/prev/v051/object_table.rs deleted file mode 100644 index e79e5787..00000000 --- a/src/model/prev/v051/object_table.rs +++ /dev/null @@ -1,149 +0,0 @@ -use serde::{Deserialize, Serialize}; -use std::collections::BTreeMap; - -use garage_util::data::*; - -use garage_table::crdt::*; - -/// An object -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct Object { - /// The bucket in which the object is stored, used as partition key - pub bucket: String, - - /// The key at which the object is stored in its bucket, used as sorting key - pub key: String, - - /// The list of currenty stored versions of the object - versions: Vec<ObjectVersion>, -} - -impl Object { - /// Get a list of currently stored versions of `Object` - pub fn versions(&self) -> &[ObjectVersion] { - &self.versions[..] 
- } -} - -/// Informations about a version of an object -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct ObjectVersion { - /// Id of the version - pub uuid: Uuid, - /// Timestamp of when the object was created - pub timestamp: u64, - /// State of the version - pub state: ObjectVersionState, -} - -/// State of an object version -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub enum ObjectVersionState { - /// The version is being received - Uploading(ObjectVersionHeaders), - /// The version is fully received - Complete(ObjectVersionData), - /// The version uploaded containded errors or the upload was explicitly aborted - Aborted, -} - -impl Crdt for ObjectVersionState { - fn merge(&mut self, other: &Self) { - use ObjectVersionState::*; - match other { - Aborted => { - *self = Aborted; - } - Complete(b) => match self { - Aborted => {} - Complete(a) => { - a.merge(b); - } - Uploading(_) => { - *self = Complete(b.clone()); - } - }, - Uploading(_) => {} - } - } -} - -/// Data stored in object version -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] -pub enum ObjectVersionData { - /// The object was deleted, this Version is a tombstone to mark it as such - DeleteMarker, - /// The object is short, it's stored inlined - Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec<u8>), - /// The object is not short, Hash of first block is stored here, next segments hashes are - /// stored in the version table - FirstBlock(ObjectVersionMeta, Hash), -} - -impl AutoCrdt for ObjectVersionData { - const WARN_IF_DIFFERENT: bool = true; -} - -/// Metadata about the object version -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] -pub struct ObjectVersionMeta { - /// Headers to send to the client - pub headers: ObjectVersionHeaders, - /// Size of the object - pub size: u64, - /// etag of the object - pub etag: String, -} - -/// Additional headers for an object -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] -pub struct ObjectVersionHeaders { - /// Content type of the object - pub content_type: String, - /// Any other http headers to send - pub other: BTreeMap<String, String>, -} - -impl ObjectVersion { - fn cmp_key(&self) -> (u64, Uuid) { - (self.timestamp, self.uuid) - } - - /// Is the object version completely received - pub fn is_complete(&self) -> bool { - matches!(self.state, ObjectVersionState::Complete(_)) - } -} - -impl Crdt for Object { - fn merge(&mut self, other: &Self) { - // Merge versions from other into here - for other_v in other.versions.iter() { - match self - .versions - .binary_search_by(|v| v.cmp_key().cmp(&other_v.cmp_key())) - { - Ok(i) => { - self.versions[i].state.merge(&other_v.state); - } - Err(i) => { - self.versions.insert(i, other_v.clone()); - } - } - } - - // Remove versions which are obsolete, i.e. those that come - // before the last version which .is_complete(). 
- let last_complete = self - .versions - .iter() - .enumerate() - .rev() - .find(|(_, v)| v.is_complete()) - .map(|(vi, _)| vi); - - if let Some(last_vi) = last_complete { - self.versions = self.versions.drain(last_vi..).collect::<Vec<_>>(); - } - } -} diff --git a/src/model/prev/v051/version_table.rs b/src/model/prev/v051/version_table.rs deleted file mode 100644 index c11c62d5..00000000 --- a/src/model/prev/v051/version_table.rs +++ /dev/null @@ -1,79 +0,0 @@ -use serde::{Deserialize, Serialize}; - -use garage_util::data::*; - -use garage_table::crdt::*; -use garage_table::*; - -/// A version of an object -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct Version { - /// UUID of the version, used as partition key - pub uuid: Uuid, - - // Actual data: the blocks for this version - // In the case of a multipart upload, also store the etags - // of individual parts and check them when doing CompleteMultipartUpload - /// Is this version deleted - pub deleted: crdt::Bool, - /// list of blocks of data composing the version - pub blocks: crdt::Map<VersionBlockKey, VersionBlock>, - /// Etag of each part in case of a multipart upload, empty otherwise - pub parts_etags: crdt::Map<u64, String>, - - // Back link to bucket+key so that we can figure if - // this was deleted later on - /// Bucket in which the related object is stored - pub bucket: String, - /// Key in which the related object is stored - pub key: String, -} - -#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)] -pub struct VersionBlockKey { - /// Number of the part - pub part_number: u64, - /// Offset of this sub-segment in its part - pub offset: u64, -} - -impl Ord for VersionBlockKey { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.part_number - .cmp(&other.part_number) - .then(self.offset.cmp(&other.offset)) - } -} - -impl PartialOrd for VersionBlockKey { - fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { - Some(self.cmp(other)) - } -} - -/// Informations about a single block -#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize, Deserialize)] -pub struct VersionBlock { - /// Blake2 sum of the block - pub hash: Hash, - /// Size of the block - pub size: u64, -} - -impl AutoCrdt for VersionBlock { - const WARN_IF_DIFFERENT: bool = true; -} - -impl Crdt for Version { - fn merge(&mut self, other: &Self) { - self.deleted.merge(&other.deleted); - - if self.deleted.get() { - self.blocks.clear(); - self.parts_etags.clear(); - } else { - self.blocks.merge(&other.blocks); - self.parts_etags.merge(&other.parts_etags); - } - } -} diff --git a/src/model/s3/block_ref_table.rs b/src/model/s3/block_ref_table.rs index c7017409..7b023d87 100644 --- a/src/model/s3/block_ref_table.rs +++ b/src/model/s3/block_ref_table.rs @@ -1,4 +1,3 @@ -use serde::{Deserialize, Serialize}; use std::sync::Arc; use garage_db as db; @@ -10,19 +9,29 @@ use garage_table::*; use garage_block::manager::*; -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct BlockRef { - /// Hash (blake2 sum) of the block, used as partition key - pub block: Hash, +mod v08 { + use garage_util::crdt; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; - /// Id of the Version for the object containing this block, used as sorting key - pub version: Uuid, + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct BlockRef { + /// Hash (blake2 sum) of the block, used as partition key + pub block: Hash, - // Keep track of deleted status - 
/// Is the Version that contains this block deleted
- pub deleted: crdt::Bool,
+        /// Id of the Version for the object containing this block, used as sorting key
+        pub version: Uuid,
+
+        // Keep track of deleted status
+        /// Is the Version that contains this block deleted
+        pub deleted: crdt::Bool,
+    }
+
+    impl garage_util::migrate::InitialFormat for BlockRef {}
 }

+pub use v08::*;
+
 impl Entry<Hash, Uuid> for BlockRef {
     fn partition_key(&self) -> &Hash {
         &self.block
diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs
index 26ff57f6..518acc95 100644
--- a/src/model/s3/object_table.rs
+++ b/src/model/s3/object_table.rs
@@ -1,10 +1,8 @@
 use serde::{Deserialize, Serialize};
-use std::collections::BTreeMap;
 use std::sync::Arc;

 use garage_db as db;

-use garage_util::background::BackgroundRunner;
 use garage_util::data::*;

 use garage_table::crdt::*;
@@ -14,25 +12,126 @@ use garage_table::*;
 use crate::index_counter::*;
 use crate::s3::version_table::*;

-use crate::prev::v051::object_table as old;
-
 pub const OBJECTS: &str = "objects";
 pub const UNFINISHED_UPLOADS: &str = "unfinished_uploads";
 pub const BYTES: &str = "bytes";

-/// An object
-#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
-pub struct Object {
-    /// The bucket in which the object is stored, used as partition key
-    pub bucket_id: Uuid,
+mod v05 {
+    use garage_util::data::{Hash, Uuid};
+    use serde::{Deserialize, Serialize};
+    use std::collections::BTreeMap;
+
+    /// An object
+    #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+    pub struct Object {
+        /// The bucket in which the object is stored, used as partition key
+        pub bucket: String,
+
+        /// The key at which the object is stored in its bucket, used as sorting key
+        pub key: String,
+
+        /// The list of currently stored versions of the object
+        pub(super) versions: Vec<ObjectVersion>,
+    }
+
+    /// Information about a version of an object
+    #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+    pub struct ObjectVersion {
+        /// Id of the version
+        pub uuid: Uuid,
+        /// Timestamp of when the object was created
+        pub timestamp: u64,
+        /// State of the version
+        pub state: ObjectVersionState,
+    }
+
+    /// State of an object version
+    #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+    pub enum ObjectVersionState {
+        /// The version is being received
+        Uploading(ObjectVersionHeaders),
+        /// The version is fully received
+        Complete(ObjectVersionData),
+        /// The uploaded version contained errors or the upload was explicitly aborted
+        Aborted,
+    }
+
+    /// Data stored in object version
+    #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+    pub enum ObjectVersionData {
+        /// The object was deleted; this Version is a tombstone to mark it as such
+        DeleteMarker,
+        /// The object is short, it's stored inline
+        Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec<u8>),
+        /// The object is not short, the hash of its first block is stored here; the next
+        /// segments' hashes are stored in the version table
+        FirstBlock(ObjectVersionMeta, Hash),
+    }
+
+    /// Metadata about the object version
+    #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+    pub struct ObjectVersionMeta {
+        /// Headers to send to the client
+        pub headers: ObjectVersionHeaders,
+        /// Size of the object
+        pub size: u64,
+        /// etag of the object
+        pub etag: String,
+    }
-
-    /// The key at which the object is stored in its bucket, used as sorting key
-    pub key: String,
+    /// Additional headers for an object
+    
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+    pub struct ObjectVersionHeaders {
+        /// Content type of the object
+        pub content_type: String,
+        /// Any other http headers to send
+        pub other: BTreeMap<String, String>,
+    }

-    /// The list of currenty stored versions of the object
-    versions: Vec<ObjectVersion>,
+    impl garage_util::migrate::InitialFormat for Object {}
 }

+mod v08 {
+    use garage_util::data::Uuid;
+    use serde::{Deserialize, Serialize};
+
+    use super::v05;
+
+    pub use v05::{
+        ObjectVersion, ObjectVersionData, ObjectVersionHeaders, ObjectVersionMeta,
+        ObjectVersionState,
+    };
+
+    /// An object
+    #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+    pub struct Object {
+        /// The bucket in which the object is stored, used as partition key
+        pub bucket_id: Uuid,
+
+        /// The key at which the object is stored in its bucket, used as sorting key
+        pub key: String,
+
+        /// The list of currently stored versions of the object
+        pub(super) versions: Vec<ObjectVersion>,
+    }
+
+    impl garage_util::migrate::Migrate for Object {
+        type Previous = v05::Object;
+
+        fn migrate(old: v05::Object) -> Object {
+            use garage_util::data::blake2sum;
+
+            Object {
+                bucket_id: blake2sum(old.bucket.as_bytes()),
+                key: old.key,
+                versions: old.versions,
+            }
+        }
+    }
+}
+
+pub use v08::*;
+
 impl Object {
     /// Initialize an Object struct from parts
     pub fn new(bucket_id: Uuid, key: String, versions: Vec<ObjectVersion>) -> Self {
@@ -69,28 +168,6 @@ impl Object {
     }
 }

-/// Informations about a version of an object
-#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
-pub struct ObjectVersion {
-    /// Id of the version
-    pub uuid: Uuid,
-    /// Timestamp of when the object was created
-    pub timestamp: u64,
-    /// State of the version
-    pub state: ObjectVersionState,
-}
-
-/// State of an object version
-#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
-pub enum ObjectVersionState {
-    /// The version is being received
-    Uploading(ObjectVersionHeaders),
-    /// The version is fully received
-    Complete(ObjectVersionData),
-    /// The version uploaded containded errors or the upload was explicitly aborted
-    Aborted,
-}
-
 impl Crdt for ObjectVersionState {
     fn merge(&mut self, other: &Self) {
         use ObjectVersionState::*;
@@ -112,42 +189,10 @@ impl Crdt for ObjectVersionState {
     }
 }

-/// Data stored in object version
-#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
-pub enum ObjectVersionData {
-    /// The object was deleted, this Version is a tombstone to mark it as such
-    DeleteMarker,
-    /// The object is short, it's stored inlined
-    Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec<u8>),
-    /// The object is not short, Hash of first block is stored here, next segments hashes are
-    /// stored in the version table
-    FirstBlock(ObjectVersionMeta, Hash),
-}
-
 impl AutoCrdt for ObjectVersionData {
     const WARN_IF_DIFFERENT: bool = true;
 }

-/// Metadata about the object version
-#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
-pub struct ObjectVersionMeta {
-    /// Headers to send to the client
-    pub headers: ObjectVersionHeaders,
-    /// Size of the object
-    pub size: u64,
-    /// etag of the object
-    pub etag: String,
-}
-
-/// Additional headers for an object
-#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
-pub struct ObjectVersionHeaders {
-    /// Content type of the object
-    pub content_type: String,
-    /// Any other http headers to send
-    pub other: BTreeMap<String,
String>,
-}
-
 impl ObjectVersion {
     fn cmp_key(&self) -> (u64, Uuid) {
         (self.timestamp, self.uuid)
@@ -221,7 +266,6 @@ impl Crdt for Object {
 }

 pub struct ObjectTable {
-    pub background: Arc<BackgroundRunner>,
     pub version_table: Arc<Table<VersionTable, TableShardedReplication>>,
     pub object_counter_table: Arc<IndexCounter<Object>>,
 }
@@ -255,34 +299,34 @@ impl TableSchema for ObjectTable {
             );
         }

-        // 2. Spawn threads that propagates deletions to version table
-        let version_table = self.version_table.clone();
-        let old = old.cloned();
-        let new = new.cloned();
-
-        self.background.spawn(async move {
-            if let (Some(old_v), Some(new_v)) = (old, new) {
-                // Propagate deletion of old versions
-                for v in old_v.versions.iter() {
-                    let newly_deleted = match new_v
-                        .versions
-                        .binary_search_by(|nv| nv.cmp_key().cmp(&v.cmp_key()))
-                    {
-                        Err(_) => true,
-                        Ok(i) => {
-                            new_v.versions[i].state == ObjectVersionState::Aborted
-                                && v.state != ObjectVersionState::Aborted
-                        }
-                    };
-                    if newly_deleted {
-                        let deleted_version =
-                            Version::new(v.uuid, old_v.bucket_id, old_v.key.clone(), true);
-                        version_table.insert(&deleted_version).await?;
+        // 2. Enqueue propagation of deletions to version table
+        if let (Some(old_v), Some(new_v)) = (old, new) {
+            // Propagate deletion of old versions
+            for v in old_v.versions.iter() {
+                let newly_deleted = match new_v
+                    .versions
+                    .binary_search_by(|nv| nv.cmp_key().cmp(&v.cmp_key()))
+                {
+                    Err(_) => true,
+                    Ok(i) => {
+                        new_v.versions[i].state == ObjectVersionState::Aborted
+                            && v.state != ObjectVersionState::Aborted
+                    }
+                };
+                if newly_deleted {
+                    let deleted_version =
+                        Version::new(v.uuid, old_v.bucket_id, old_v.key.clone(), true);
+                    let res = self.version_table.queue_insert(tx, &deleted_version);
+                    if let Err(e) = db::unabort(res)? {
+                        error!(
+                            "Unable to enqueue version deletion propagation: {}. 
A repair will be needed.", + e + ); } } } - Ok(()) - }); + } + Ok(()) } @@ -292,11 +336,6 @@ impl TableSchema for ObjectTable { ObjectFilter::IsUploading => entry.versions.iter().any(|v| v.is_uploading()), } } - - fn try_migrate(bytes: &[u8]) -> Option<Self::E> { - let old_obj = rmp_serde::decode::from_read_ref::<_, old::Object>(bytes).ok()?; - Some(migrate_object(old_obj)) - } } impl CountedItem for Object { @@ -341,64 +380,3 @@ impl CountedItem for Object { ] } } - -// vvvvvvvv migration code, stupid stuff vvvvvvvvvvvv -// (we just want to change bucket into bucket_id by hashing it) - -fn migrate_object(o: old::Object) -> Object { - let versions = o - .versions() - .iter() - .cloned() - .map(migrate_object_version) - .collect(); - Object { - bucket_id: blake2sum(o.bucket.as_bytes()), - key: o.key, - versions, - } -} - -fn migrate_object_version(v: old::ObjectVersion) -> ObjectVersion { - ObjectVersion { - uuid: Uuid::try_from(v.uuid.as_slice()).unwrap(), - timestamp: v.timestamp, - state: match v.state { - old::ObjectVersionState::Uploading(h) => { - ObjectVersionState::Uploading(migrate_object_version_headers(h)) - } - old::ObjectVersionState::Complete(d) => { - ObjectVersionState::Complete(migrate_object_version_data(d)) - } - old::ObjectVersionState::Aborted => ObjectVersionState::Aborted, - }, - } -} - -fn migrate_object_version_headers(h: old::ObjectVersionHeaders) -> ObjectVersionHeaders { - ObjectVersionHeaders { - content_type: h.content_type, - other: h.other, - } -} - -fn migrate_object_version_data(d: old::ObjectVersionData) -> ObjectVersionData { - match d { - old::ObjectVersionData::DeleteMarker => ObjectVersionData::DeleteMarker, - old::ObjectVersionData::Inline(m, b) => { - ObjectVersionData::Inline(migrate_object_version_meta(m), b) - } - old::ObjectVersionData::FirstBlock(m, h) => ObjectVersionData::FirstBlock( - migrate_object_version_meta(m), - Hash::try_from(h.as_slice()).unwrap(), - ), - } -} - -fn migrate_object_version_meta(m: old::ObjectVersionMeta) -> ObjectVersionMeta { - ObjectVersionMeta { - headers: migrate_object_version_headers(m.headers), - size: m.size, - etag: m.etag, - } -} diff --git a/src/model/s3/version_table.rs b/src/model/s3/version_table.rs index 6bc2ecd1..6edc83f4 100644 --- a/src/model/s3/version_table.rs +++ b/src/model/s3/version_table.rs @@ -1,9 +1,7 @@ -use serde::{Deserialize, Serialize}; use std::sync::Arc; use garage_db as db; -use garage_util::background::BackgroundRunner; use garage_util::data::*; use garage_table::crdt::*; @@ -12,32 +10,108 @@ use garage_table::*; use crate::s3::block_ref_table::*; -use crate::prev::v051::version_table as old; - -/// A version of an object -#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] -pub struct Version { - /// UUID of the version, used as partition key - pub uuid: Uuid, - - // Actual data: the blocks for this version - // In the case of a multipart upload, also store the etags - // of individual parts and check them when doing CompleteMultipartUpload - /// Is this version deleted - pub deleted: crdt::Bool, - /// list of blocks of data composing the version - pub blocks: crdt::Map<VersionBlockKey, VersionBlock>, - /// Etag of each part in case of a multipart upload, empty otherwise - pub parts_etags: crdt::Map<u64, String>, - - // Back link to bucket+key so that we can figure if - // this was deleted later on - /// Bucket in which the related object is stored - pub bucket_id: Uuid, - /// Key in which the related object is stored - pub key: String, +mod v05 { + use garage_util::crdt; 
+    use garage_util::data::{Hash, Uuid};
+    use serde::{Deserialize, Serialize};
+
+    /// A version of an object
+    #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+    pub struct Version {
+        /// UUID of the version, used as partition key
+        pub uuid: Uuid,
+
+        // Actual data: the blocks for this version
+        // In the case of a multipart upload, also store the etags
+        // of individual parts and check them when doing CompleteMultipartUpload
+        /// Is this version deleted
+        pub deleted: crdt::Bool,
+        /// list of blocks of data composing the version
+        pub blocks: crdt::Map<VersionBlockKey, VersionBlock>,
+        /// Etag of each part in case of a multipart upload, empty otherwise
+        pub parts_etags: crdt::Map<u64, String>,
+
+        // Back link to bucket+key so that we can figure out if
+        // this was deleted later on
+        /// Bucket in which the related object is stored
+        pub bucket: String,
+        /// Key in which the related object is stored
+        pub key: String,
+    }
+
+    #[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
+    pub struct VersionBlockKey {
+        /// Number of the part
+        pub part_number: u64,
+        /// Offset of this sub-segment in its part
+        pub offset: u64,
+    }
+
+    /// Information about a single block
+    #[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize, Deserialize)]
+    pub struct VersionBlock {
+        /// Blake2 sum of the block
+        pub hash: Hash,
+        /// Size of the block
+        pub size: u64,
+    }
+
+    impl garage_util::migrate::InitialFormat for Version {}
 }

+mod v08 {
+    use garage_util::crdt;
+    use garage_util::data::Uuid;
+    use serde::{Deserialize, Serialize};
+
+    use super::v05;
+
+    /// A version of an object
+    #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+    pub struct Version {
+        /// UUID of the version, used as partition key
+        pub uuid: Uuid,
+
+        // Actual data: the blocks for this version
+        // In the case of a multipart upload, also store the etags
+        // of individual parts and check them when doing CompleteMultipartUpload
+        /// Is this version deleted
+        pub deleted: crdt::Bool,
+        /// list of blocks of data composing the version
+        pub blocks: crdt::Map<VersionBlockKey, VersionBlock>,
+        /// Etag of each part in case of a multipart upload, empty otherwise
+        pub parts_etags: crdt::Map<u64, String>,
+
+        // Back link to bucket+key so that we can figure out if
+        // this was deleted later on
+        /// Bucket in which the related object is stored
+        pub bucket_id: Uuid,
+        /// Key in which the related object is stored
+        pub key: String,
+    }
+
+    pub use v05::{VersionBlock, VersionBlockKey};
+
+    impl garage_util::migrate::Migrate for Version {
+        type Previous = v05::Version;
+
+        fn migrate(old: v05::Version) -> Version {
+            use garage_util::data::blake2sum;
+
+            Version {
+                uuid: old.uuid,
+                deleted: old.deleted,
+                blocks: old.blocks,
+                parts_etags: old.parts_etags,
+                bucket_id: blake2sum(old.bucket.as_bytes()),
+                key: old.key,
+            }
+        }
+    }
+}
+
+pub use v08::*;
+
 impl Version {
     pub fn new(uuid: Uuid, bucket_id: Uuid, key: String, deleted: bool) -> Self {
         Self {
@@ -65,14 +139,6 @@ impl Version {
     }
 }

-#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
-pub struct VersionBlockKey {
-    /// Number of the part
-    pub part_number: u64,
-    /// Offset of this sub-segment in its part
-    pub offset: u64,
-}
-
 impl Ord for VersionBlockKey {
     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
         self.part_number
@@ -87,15 +153,6 @@ impl PartialOrd for VersionBlockKey {
     }
 }

-/// Informations about a single block
-#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize,
Deserialize)] -pub struct VersionBlock { - /// Blake2 sum of the block - pub hash: Hash, - /// Size of the block - pub size: u64, -} - impl AutoCrdt for VersionBlock { const WARN_IF_DIFFERENT: bool = true; } @@ -127,7 +184,6 @@ impl Crdt for Version { } pub struct VersionTable { - pub background: Arc<BackgroundRunner>, pub block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>, } @@ -141,33 +197,26 @@ impl TableSchema for VersionTable { fn updated( &self, - _tx: &mut db::Transaction, + tx: &mut db::Transaction, old: Option<&Self::E>, new: Option<&Self::E>, ) -> db::TxOpResult<()> { - let block_ref_table = self.block_ref_table.clone(); - let old = old.cloned(); - let new = new.cloned(); - - self.background.spawn(async move { - if let (Some(old_v), Some(new_v)) = (old, new) { - // Propagate deletion of version blocks - if new_v.deleted.get() && !old_v.deleted.get() { - let deleted_block_refs = old_v - .blocks - .items() - .iter() - .map(|(_k, vb)| BlockRef { - block: vb.hash, - version: old_v.uuid, - deleted: true.into(), - }) - .collect::<Vec<_>>(); - block_ref_table.insert_many(&deleted_block_refs[..]).await?; + if let (Some(old_v), Some(new_v)) = (old, new) { + // Propagate deletion of version blocks + if new_v.deleted.get() && !old_v.deleted.get() { + let deleted_block_refs = old_v.blocks.items().iter().map(|(_k, vb)| BlockRef { + block: vb.hash, + version: old_v.uuid, + deleted: true.into(), + }); + for block_ref in deleted_block_refs { + let res = self.block_ref_table.queue_insert(tx, &block_ref); + if let Err(e) = db::unabort(res)? { + error!("Unable to enqueue block ref deletion propagation: {}. A repair will be needed.", e); + } } } - Ok(()) - }); + } Ok(()) } @@ -175,42 +224,4 @@ impl TableSchema for VersionTable { fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { filter.apply(entry.deleted.get()) } - - fn try_migrate(bytes: &[u8]) -> Option<Self::E> { - let old = rmp_serde::decode::from_read_ref::<_, old::Version>(bytes).ok()?; - - let blocks = old - .blocks - .items() - .iter() - .map(|(k, v)| { - ( - VersionBlockKey { - part_number: k.part_number, - offset: k.offset, - }, - VersionBlock { - hash: Hash::try_from(v.hash.as_slice()).unwrap(), - size: v.size, - }, - ) - }) - .collect::<crdt::Map<_, _>>(); - - let parts_etags = old - .parts_etags - .items() - .iter() - .map(|(k, v)| (*k, v.clone())) - .collect::<crdt::Map<_, _>>(); - - Some(Version { - uuid: Hash::try_from(old.uuid.as_slice()).unwrap(), - deleted: crdt::Bool::new(old.deleted.get()), - blocks, - parts_etags, - bucket_id: blake2sum(old.bucket.as_bytes()), - key: old.key, - }) - } } |
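
A note on the recurring pattern in this commit: each table's on-disk schema now lives in per-version modules (v05, v08, ...). The oldest format implements garage_util::migrate::InitialFormat; every later format implements Migrate with a Previous associated type naming the schema it supersedes, and the current version is re-exported with pub use v08::*;. This is what replaces the per-table try_migrate hooks deleted above. The sketch below shows the shape of such a chain; the trait definitions and the Entry type are simplified stand-ins assumed for illustration, not the actual garage_util::migrate API:

use serde::{Deserialize, Serialize};

// Simplified stand-ins (assumed shape) for the garage_util::migrate traits.
pub trait InitialFormat: for<'de> Deserialize<'de> + Serialize {}
pub trait Migrate: Sized {
    type Previous: for<'de> Deserialize<'de>;
    fn migrate(prev: Self::Previous) -> Self;
}

mod v05 {
    use serde::{Deserialize, Serialize};

    // Hypothetical first schema version of some table entry.
    #[derive(Serialize, Deserialize)]
    pub struct Entry {
        pub bucket: String,
        pub key: String,
    }

    // The oldest format is the root of the migration chain.
    impl super::InitialFormat for Entry {}
}

mod v08 {
    use serde::{Deserialize, Serialize};

    // Hypothetical second version: the bucket name is replaced by an id.
    #[derive(Serialize, Deserialize)]
    pub struct Entry {
        pub bucket_id: [u8; 32],
        pub key: String,
    }

    impl super::Migrate for Entry {
        type Previous = super::v05::Entry;

        fn migrate(old: super::v05::Entry) -> Self {
            // Stand-in for blake2sum(old.bucket.as_bytes()) in the real code.
            let mut bucket_id = [0u8; 32];
            for (i, b) in old.bucket.bytes().enumerate().take(32) {
                bucket_id[i] = b;
            }
            Entry {
                bucket_id,
                key: old.key,
            }
        }
    }
}

// The current schema is re-exported, as the diff does with `pub use v08::*;`.
pub use v08::Entry;

fn main() {
    // An entry decoded at the old version is walked forward to the current one.
    let old = v05::Entry {
        bucket: "my-bucket".into(),
        key: "a/b".into(),
    };
    let new = <Entry as Migrate>::migrate(old);
    assert_eq!(new.key, "a/b");
}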
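Likewise, ObjectTable::updated and VersionTable::updated no longer clone the old and new entries and spawn a future on the now-removed BackgroundRunner to perform follow-up writes; they stage the dependent insert inside the ongoing metadata transaction via queue_insert, so the propagation is recorded atomically with the triggering update. A rough sketch of that control-flow shift, using simplified stand-in types rather than the real garage_db/garage_table API:

// Simplified stand-ins for a transaction and a dependent table handle.
struct Transaction {
    queued: Vec<Vec<u8>>, // writes staged for commit
}

struct DependentTable;

impl DependentTable {
    // Stage a serialized entry in the caller's transaction; it only takes
    // effect if that transaction commits, so a crash between the triggering
    // update and the propagation can no longer lose the propagation.
    fn queue_insert(&self, tx: &mut Transaction, entry: &[u8]) -> Result<(), String> {
        tx.queued.push(entry.to_vec());
        Ok(())
    }
}

// Shape of the new hook: propagate a deletion exactly when the deleted flag
// flips, mirroring the old_v/new_v comparison in the hunks above.
fn updated(
    dep: &DependentTable,
    tx: &mut Transaction,
    old_deleted: bool,
    new_deleted: bool,
) -> Result<(), String> {
    if new_deleted && !old_deleted {
        dep.queue_insert(tx, b"tombstone entry")?;
    }
    Ok(())
}

fn main() {
    let mut tx = Transaction { queued: Vec::new() };
    let dep = DependentTable;
    updated(&dep, &mut tx, false, true).unwrap();
    assert_eq!(tx.queued.len(), 1); // the propagation was staged transactionally
}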