author    Alex <alex@adnab.me>  2022-05-10 13:16:57 +0200
committer Alex <alex@adnab.me>  2022-05-10 13:16:57 +0200
commit    5768bf362262f78376af14517c4921941986192e (patch)
tree      b4baf3051eade0f63649443278bb3a3f4c38ec25 /src/model/s3
parent    def78c5e6f5da37a0d17b5652c525fbeccbc2e86 (diff)
First implementation of K2V (#293)
**Specification:**
View spec at [this URL](https://git.deuxfleurs.fr/Deuxfleurs/garage/src/branch/k2v/doc/drafts/k2v-spec.md)
- [x] Specify the structure of K2V triples (see the sketch after this list)
- [x] Specify the DVVS format used for causality detection
- [x] Specify the K2V index (just a counter of number of values per partition key)
- [x] Specify single-item endpoints: ReadItem, InsertItem, DeleteItem
- [x] Specify index endpoint: ReadIndex
- [x] Specify multi-item endpoints: InsertBatch, ReadBatch, DeleteBatch
- [x] Move to JSON objects instead of tuples
- [x] Specify endpoints for polling for updates on single values (PollItem)
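
To make the specified data model concrete, here is a rough sketch of what a K2V triple and its index entry amount to. This is illustration only, not code from this patch: the type and field names are hypothetical, and the authoritative definitions live in the linked k2v-spec.md.

```rust
// Hypothetical illustration of the K2V data model specified above;
// the real definitions are in doc/drafts/k2v-spec.md.
pub struct K2vTriple {
    pub partition_key: String,   // groups triples on the same set of storage nodes
    pub sort_key: String,        // identifies the triple within its partition
    pub causality_token: String, // opaque DVVS context, returned on reads and
                                 // sent back on writes for causality detection
    pub values: Vec<Vec<u8>>,    // concurrent values, kept until causality
                                 // information allows them to be resolved
}

// The K2V index is just a counter of the number of values per partition key.
pub struct K2vIndexEntry {
    pub partition_key: String,
    pub value_count: u64,
}
```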
**Implementation:**
- [x] Table for K2V items, causal contexts
- [x] Indexing mechanism and table for K2V index
- [x] Make API handlers a bit more generic
- [x] K2V API endpoint
- [x] K2V API router
- [x] ReadItem
- [x] InsertItem
- [x] DeleteItem
- [x] PollItem
- [x] ReadIndex
- [x] InsertBatch
- [x] ReadBatch
- [x] DeleteBatch
**Testing:**
- [x] A simple Python script that sends a few requests so the results can be checked visually (it does not parse responses or make assertions on returned values)
- [x] Actual tests:
  - [x] Adapt testing framework
  - [x] Simple test with InsertItem + ReadItem
  - [x] Test with several Insert/Read/DeleteItem + ReadIndex
  - [x] Test all combinations of return formats for ReadItem
  - [x] Test with ReadBatch, InsertBatch, DeleteBatch
  - [x] Test with PollItem
  - [x] Test error codes
- [ ] Fix most broken stuff
  - [x] PollItem test failing randomly
  - [x] When invalid causality tokens are given, errors should be 4xx, not 5xx
**Improvements:**
- [x] Descending range queries
  - [x] Specify
  - [x] Implement
  - [x] Add test
- [x] Batch updates to index counter
- [x] Put K2V behind `k2v` feature flag (see the sketch after this list)
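
As a sketch of that last item, this is roughly what gating code behind a Cargo feature looks like. It assumes a `k2v = []` entry under `[features]` in Cargo.toml; the module and function names are hypothetical, not the actual flag wiring in the garage crates.

```rust
// Minimal sketch of a Cargo feature gate, assuming `k2v = []` is declared
// under [features] in Cargo.toml; names below are hypothetical.
#[cfg(feature = "k2v")]
mod k2v {
    // K2V-specific code lives here and is compiled only when the crate
    // is built with `--features k2v`.
    pub fn enabled() -> bool {
        true
    }
}

#[cfg(not(feature = "k2v"))]
mod k2v {
    pub fn enabled() -> bool {
        false
    }
}
```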
Co-authored-by: Alex Auvolat <alex@adnab.me>
Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/293
Co-authored-by: Alex <alex@adnab.me>
Co-committed-by: Alex <alex@adnab.me>
Diffstat (limited to 'src/model/s3')
-rw-r--r--  src/model/s3/block_ref_table.rs   74
-rw-r--r--  src/model/s3/mod.rs                3
-rw-r--r--  src/model/s3/object_table.rs     337
-rw-r--r--  src/model/s3/version_table.rs    207
4 files changed, 621 insertions(+), 0 deletions(-)
diff --git a/src/model/s3/block_ref_table.rs b/src/model/s3/block_ref_table.rs
new file mode 100644
index 00000000..9b3991bf
--- /dev/null
+++ b/src/model/s3/block_ref_table.rs
@@ -0,0 +1,74 @@
+use serde::{Deserialize, Serialize};
+use std::sync::Arc;
+
+use garage_util::data::*;
+
+use garage_table::crdt::Crdt;
+use garage_table::*;
+
+use garage_block::manager::*;
+
+#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+pub struct BlockRef {
+    /// Hash (blake2 sum) of the block, used as partition key
+    pub block: Hash,
+
+    /// Id of the Version for the object containing this block, used as sorting key
+    pub version: Uuid,
+
+    // Keep track of deleted status
+    /// Is the Version that contains this block deleted
+    pub deleted: crdt::Bool,
+}
+
+impl Entry<Hash, Uuid> for BlockRef {
+    fn partition_key(&self) -> &Hash {
+        &self.block
+    }
+    fn sort_key(&self) -> &Uuid {
+        &self.version
+    }
+    fn is_tombstone(&self) -> bool {
+        self.deleted.get()
+    }
+}
+
+impl Crdt for BlockRef {
+    fn merge(&mut self, other: &Self) {
+        self.deleted.merge(&other.deleted);
+    }
+}
+
+pub struct BlockRefTable {
+    pub block_manager: Arc<BlockManager>,
+}
+
+impl TableSchema for BlockRefTable {
+    const TABLE_NAME: &'static str = "block_ref";
+
+    type P = Hash;
+    type S = Uuid;
+    type E = BlockRef;
+    type Filter = DeletedFilter;
+
+    fn updated(&self, old: Option<&Self::E>, new: Option<&Self::E>) {
+        #[allow(clippy::or_fun_call)]
+        let block = &old.or(new).unwrap().block;
+        let was_before = old.map(|x| !x.deleted.get()).unwrap_or(false);
+        let is_after = new.map(|x| !x.deleted.get()).unwrap_or(false);
+        if is_after && !was_before {
+            if let Err(e) = self.block_manager.block_incref(block) {
+                warn!("block_incref failed for block {:?}: {}", block, e);
+            }
+        }
+        if was_before && !is_after {
+            if let Err(e) = self.block_manager.block_decref(block) {
+                warn!("block_decref failed for block {:?}: {}", block, e);
+            }
+        }
+    }
+
+    fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
+        filter.apply(entry.deleted.get())
+    }
+}
diff --git a/src/model/s3/mod.rs b/src/model/s3/mod.rs
new file mode 100644
index 00000000..4e94337d
--- /dev/null
+++ b/src/model/s3/mod.rs
@@ -0,0 +1,3 @@
+pub mod block_ref_table;
+pub mod object_table;
+pub mod version_table;
diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs
new file mode 100644
index 00000000..3d9a89f7
--- /dev/null
+++ b/src/model/s3/object_table.rs
@@ -0,0 +1,337 @@
+use serde::{Deserialize, Serialize};
+use std::collections::BTreeMap;
+use std::sync::Arc;
+
+use garage_util::background::BackgroundRunner;
+use garage_util::data::*;
+
+use garage_table::crdt::*;
+use garage_table::replication::TableShardedReplication;
+use garage_table::*;
+
+use crate::s3::version_table::*;
+
+use garage_model_050::object_table as old;
+
+/// An object
+#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+pub struct Object {
+    /// The bucket in which the object is stored, used as partition key
+    pub bucket_id: Uuid,
+
+    /// The key at which the object is stored in its bucket, used as sorting key
+    pub key: String,
+
+    /// The list of currently stored versions of the object
+    versions: Vec<ObjectVersion>,
+}
+
+impl Object {
+    /// Initialize an Object struct from parts
+    pub fn new(bucket_id: Uuid, key: String, versions: Vec<ObjectVersion>) -> Self {
+        let mut ret = Self {
+            bucket_id,
+            key,
+            versions: vec![],
+        };
+        for v in versions {
+            ret.add_version(v)
+                .expect("Twice the same ObjectVersion in Object constructor");
+        }
+        ret
+    }
+
+    /// Adds a version if it wasn't already present
+    #[allow(clippy::result_unit_err)]
+    pub fn add_version(&mut self, new: ObjectVersion) -> Result<(), ()> {
+        match self
+            .versions
+            .binary_search_by(|v| v.cmp_key().cmp(&new.cmp_key()))
+        {
+            Err(i) => {
+                self.versions.insert(i, new);
+                Ok(())
+            }
+            Ok(_) => Err(()),
+        }
+    }
+
+    /// Get a list of currently stored versions of `Object`
+    pub fn versions(&self) -> &[ObjectVersion] {
+        &self.versions[..]
+    }
+}
+
+/// Information about a version of an object
+#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+pub struct ObjectVersion {
+    /// Id of the version
+    pub uuid: Uuid,
+    /// Timestamp of when the object was created
+    pub timestamp: u64,
+    /// State of the version
+    pub state: ObjectVersionState,
+}
+
+/// State of an object version
+#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+pub enum ObjectVersionState {
+    /// The version is being received
+    Uploading(ObjectVersionHeaders),
+    /// The version is fully received
+    Complete(ObjectVersionData),
+    /// The uploaded version contained errors or the upload was explicitly aborted
+    Aborted,
+}
+
+impl Crdt for ObjectVersionState {
+    fn merge(&mut self, other: &Self) {
+        use ObjectVersionState::*;
+        match other {
+            Aborted => {
+                *self = Aborted;
+            }
+            Complete(b) => match self {
+                Aborted => {}
+                Complete(a) => {
+                    a.merge(b);
+                }
+                Uploading(_) => {
+                    *self = Complete(b.clone());
+                }
+            },
+            Uploading(_) => {}
+        }
+    }
+}
+
+/// Data stored in object version
+#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+pub enum ObjectVersionData {
+    /// The object was deleted, this Version is a tombstone to mark it as such
+    DeleteMarker,
+    /// The object is short, it's stored inlined
+    Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec<u8>),
+    /// The object is not short, the Hash of the first block is stored here, the
+    /// next segments' hashes are stored in the version table
+    FirstBlock(ObjectVersionMeta, Hash),
+}
+
+impl AutoCrdt for ObjectVersionData {
+    const WARN_IF_DIFFERENT: bool = true;
+}
+
+/// Metadata about the object version
+#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+pub struct ObjectVersionMeta {
+    /// Headers to send to the client
+    pub headers: ObjectVersionHeaders,
+    /// Size of the object
+    pub size: u64,
+    /// etag of the object
+    pub etag: String,
+}
+
+/// Additional headers for an object
+#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+pub struct ObjectVersionHeaders {
+    /// Content type of the object
+    pub content_type: String,
+    /// Any other http headers to send
+    pub other: BTreeMap<String, String>,
+}
+
+impl ObjectVersion {
+    fn cmp_key(&self) -> (u64, Uuid) {
+        (self.timestamp, self.uuid)
+    }
+
+    /// Is the object version currently being uploaded
+    pub fn is_uploading(&self) -> bool {
+        matches!(self.state, ObjectVersionState::Uploading(_))
+    }
+
+    /// Is the object version completely received
+    pub fn is_complete(&self) -> bool {
+        matches!(self.state, ObjectVersionState::Complete(_))
+    }
+
+    /// Is the object version available (received and not a tombstone)
+    pub fn is_data(&self) -> bool {
+        match self.state {
+            ObjectVersionState::Complete(ObjectVersionData::DeleteMarker) => false,
+            ObjectVersionState::Complete(_) => true,
+            _ => false,
+        }
+    }
+}
+
+impl Entry<Uuid, String> for Object {
+    fn partition_key(&self) -> &Uuid {
+        &self.bucket_id
+    }
+    fn sort_key(&self) -> &String {
+        &self.key
+    }
+    fn is_tombstone(&self) -> bool {
+        self.versions.len() == 1
+            && self.versions[0].state
+                == ObjectVersionState::Complete(ObjectVersionData::DeleteMarker)
+    }
+}
+
+impl Crdt for Object {
+    fn merge(&mut self, other: &Self) {
+        // Merge versions from other into here
+        for other_v in other.versions.iter() {
+            match self
+                .versions
+                .binary_search_by(|v| v.cmp_key().cmp(&other_v.cmp_key()))
+            {
+                Ok(i) => {
+                    self.versions[i].state.merge(&other_v.state);
+                }
+                Err(i) => {
+                    self.versions.insert(i, other_v.clone());
+                }
+            }
+        }
+
+        // Remove versions which are obsolete, i.e. those that come
+        // before the last version which .is_complete().
+        let last_complete = self
+            .versions
+            .iter()
+            .enumerate()
+            .rev()
+            .find(|(_, v)| v.is_complete())
+            .map(|(vi, _)| vi);
+
+        if let Some(last_vi) = last_complete {
+            self.versions = self.versions.drain(last_vi..).collect::<Vec<_>>();
+        }
+    }
+}
+
+pub struct ObjectTable {
+    pub background: Arc<BackgroundRunner>,
+    pub version_table: Arc<Table<VersionTable, TableShardedReplication>>,
+}
+
+#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
+pub enum ObjectFilter {
+    IsData,
+    IsUploading,
+}
+
+impl TableSchema for ObjectTable {
+    const TABLE_NAME: &'static str = "object";
+
+    type P = Uuid;
+    type S = String;
+    type E = Object;
+    type Filter = ObjectFilter;
+
+    fn updated(&self, old: Option<&Self::E>, new: Option<&Self::E>) {
+        let version_table = self.version_table.clone();
+        let old = old.cloned();
+        let new = new.cloned();
+
+        self.background.spawn(async move {
+            if let (Some(old_v), Some(new_v)) = (old, new) {
+                // Propagate deletion of old versions
+                for v in old_v.versions.iter() {
+                    let newly_deleted = match new_v
+                        .versions
+                        .binary_search_by(|nv| nv.cmp_key().cmp(&v.cmp_key()))
+                    {
+                        Err(_) => true,
+                        Ok(i) => {
+                            new_v.versions[i].state == ObjectVersionState::Aborted
+                                && v.state != ObjectVersionState::Aborted
+                        }
+                    };
+                    if newly_deleted {
+                        let deleted_version =
+                            Version::new(v.uuid, old_v.bucket_id, old_v.key.clone(), true);
+                        version_table.insert(&deleted_version).await?;
+                    }
+                }
+            }
+            Ok(())
+        })
+    }
+
+    fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
+        match filter {
+            ObjectFilter::IsData => entry.versions.iter().any(|v| v.is_data()),
+            ObjectFilter::IsUploading => entry.versions.iter().any(|v| v.is_uploading()),
+        }
+    }
+
+    fn try_migrate(bytes: &[u8]) -> Option<Self::E> {
+        let old_obj = rmp_serde::decode::from_read_ref::<_, old::Object>(bytes).ok()?;
+        Some(migrate_object(old_obj))
+    }
+}
+
+// vvvvvvvv migration code, stupid stuff vvvvvvvvvvvv
+// (we just want to change bucket into bucket_id by hashing it)
+
+fn migrate_object(o: old::Object) -> Object {
+    let versions = o
+        .versions()
+        .iter()
+        .cloned()
+        .map(migrate_object_version)
+        .collect();
+    Object {
+        bucket_id: blake2sum(o.bucket.as_bytes()),
+        key: o.key,
+        versions,
+    }
+}
+
+fn migrate_object_version(v: old::ObjectVersion) -> ObjectVersion {
+    ObjectVersion {
+        uuid: Uuid::try_from(v.uuid.as_slice()).unwrap(),
+        timestamp: v.timestamp,
+        state: match v.state {
+            old::ObjectVersionState::Uploading(h) => {
+                ObjectVersionState::Uploading(migrate_object_version_headers(h))
+            }
+            old::ObjectVersionState::Complete(d) => {
+                ObjectVersionState::Complete(migrate_object_version_data(d))
+            }
+            old::ObjectVersionState::Aborted => ObjectVersionState::Aborted,
+        },
+    }
+}
+
+fn migrate_object_version_headers(h: old::ObjectVersionHeaders) -> ObjectVersionHeaders {
+    ObjectVersionHeaders {
+        content_type: h.content_type,
+        other: h.other,
+    }
+}
+
+fn migrate_object_version_data(d: old::ObjectVersionData) -> ObjectVersionData {
+    match d {
+        old::ObjectVersionData::DeleteMarker => ObjectVersionData::DeleteMarker,
+        old::ObjectVersionData::Inline(m, b) => {
+            ObjectVersionData::Inline(migrate_object_version_meta(m), b)
+        }
+        old::ObjectVersionData::FirstBlock(m, h) => ObjectVersionData::FirstBlock(
+            migrate_object_version_meta(m),
+            Hash::try_from(h.as_slice()).unwrap(),
+        ),
+    }
+}
+
+fn migrate_object_version_meta(m: old::ObjectVersionMeta) -> ObjectVersionMeta {
+    ObjectVersionMeta {
+        headers: migrate_object_version_headers(m.headers),
+        size: m.size,
+        etag: m.etag,
+    }
+}
diff --git a/src/model/s3/version_table.rs b/src/model/s3/version_table.rs
new file mode 100644
index 00000000..ad096772
--- /dev/null
+++ b/src/model/s3/version_table.rs
@@ -0,0 +1,207 @@
+use serde::{Deserialize, Serialize};
+use std::sync::Arc;
+
+use garage_util::background::BackgroundRunner;
+use garage_util::data::*;
+
+use garage_table::crdt::*;
+use garage_table::replication::TableShardedReplication;
+use garage_table::*;
+
+use crate::s3::block_ref_table::*;
+
+use garage_model_050::version_table as old;
+
+/// A version of an object
+#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+pub struct Version {
+    /// UUID of the version, used as partition key
+    pub uuid: Uuid,
+
+    // Actual data: the blocks for this version
+    // In the case of a multipart upload, also store the etags
+    // of individual parts and check them when doing CompleteMultipartUpload
+    /// Is this version deleted
+    pub deleted: crdt::Bool,
+    /// list of blocks of data composing the version
+    pub blocks: crdt::Map<VersionBlockKey, VersionBlock>,
+    /// Etag of each part in case of a multipart upload, empty otherwise
+    pub parts_etags: crdt::Map<u64, String>,
+
+    // Back link to bucket+key so that we can figure if
+    // this was deleted later on
+    /// Bucket in which the related object is stored
+    pub bucket_id: Uuid,
+    /// Key in which the related object is stored
+    pub key: String,
+}
+
+impl Version {
+    pub fn new(uuid: Uuid, bucket_id: Uuid, key: String, deleted: bool) -> Self {
+        Self {
+            uuid,
+            deleted: deleted.into(),
+            blocks: crdt::Map::new(),
+            parts_etags: crdt::Map::new(),
+            bucket_id,
+            key,
+        }
+    }
+
+    pub fn has_part_number(&self, part_number: u64) -> bool {
+        let case1 = self
+            .parts_etags
+            .items()
+            .binary_search_by(|(k, _)| k.cmp(&part_number))
+            .is_ok();
+        let case2 = self
+            .blocks
+            .items()
+            .binary_search_by(|(k, _)| k.part_number.cmp(&part_number))
+            .is_ok();
+        case1 || case2
+    }
+}
+
+#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
+pub struct VersionBlockKey {
+    /// Number of the part
+    pub part_number: u64,
+    /// Offset of this sub-segment in its part
+    pub offset: u64,
+}
+
+impl Ord for VersionBlockKey {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.part_number
+            .cmp(&other.part_number)
+            .then(self.offset.cmp(&other.offset))
+    }
+}
+
+impl PartialOrd for VersionBlockKey {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+/// Information about a single block
+#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize, Deserialize)]
+pub struct VersionBlock {
+    /// Blake2 sum of the block
+    pub hash: Hash,
+    /// Size of the block
+    pub size: u64,
+}
+
+impl AutoCrdt for VersionBlock {
+    const WARN_IF_DIFFERENT: bool = true;
+}
+
+impl Entry<Uuid, EmptyKey> for Version {
+    fn partition_key(&self) -> &Uuid {
+        &self.uuid
+    }
+    fn sort_key(&self) -> &EmptyKey {
+        &EmptyKey
+    }
+    fn is_tombstone(&self) -> bool {
+        self.deleted.get()
+    }
+}
+
+impl Crdt for Version {
+    fn merge(&mut self, other: &Self) {
+        self.deleted.merge(&other.deleted);
+
+        if self.deleted.get() {
+            self.blocks.clear();
+            self.parts_etags.clear();
+        } else {
+            self.blocks.merge(&other.blocks);
+            self.parts_etags.merge(&other.parts_etags);
+        }
+    }
+}
+
+pub struct VersionTable {
+    pub background: Arc<BackgroundRunner>,
+    pub block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>,
+}
+
+impl TableSchema for VersionTable {
+    const TABLE_NAME: &'static str = "version";
+
+    type P = Uuid;
+    type S = EmptyKey;
+    type E = Version;
+    type Filter = DeletedFilter;
+
+    fn updated(&self, old: Option<&Self::E>, new: Option<&Self::E>) {
+        let block_ref_table = self.block_ref_table.clone();
+        let old = old.cloned();
+        let new = new.cloned();
+
+        self.background.spawn(async move {
+            if let (Some(old_v), Some(new_v)) = (old, new) {
+                // Propagate deletion of version blocks
+                if new_v.deleted.get() && !old_v.deleted.get() {
+                    let deleted_block_refs = old_v
+                        .blocks
+                        .items()
+                        .iter()
+                        .map(|(_k, vb)| BlockRef {
+                            block: vb.hash,
+                            version: old_v.uuid,
+                            deleted: true.into(),
+                        })
+                        .collect::<Vec<_>>();
+                    block_ref_table.insert_many(&deleted_block_refs[..]).await?;
+                }
+            }
+            Ok(())
+        })
+    }
+
+    fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
+        filter.apply(entry.deleted.get())
+    }
+
+    fn try_migrate(bytes: &[u8]) -> Option<Self::E> {
+        let old = rmp_serde::decode::from_read_ref::<_, old::Version>(bytes).ok()?;
+
+        let blocks = old
+            .blocks
+            .items()
+            .iter()
+            .map(|(k, v)| {
+                (
+                    VersionBlockKey {
+                        part_number: k.part_number,
+                        offset: k.offset,
+                    },
+                    VersionBlock {
+                        hash: Hash::try_from(v.hash.as_slice()).unwrap(),
+                        size: v.size,
+                    },
+                )
+            })
+            .collect::<crdt::Map<_, _>>();
+
+        let parts_etags = old
+            .parts_etags
+            .items()
+            .iter()
+            .map(|(k, v)| (*k, v.clone()))
+            .collect::<crdt::Map<_, _>>();
+
+        Some(Version {
+            uuid: Hash::try_from(old.uuid.as_slice()).unwrap(),
+            deleted: crdt::Bool::new(old.deleted.get()),
+            blocks,
+            parts_etags,
+            bucket_id: blake2sum(old.bucket.as_bytes()),
+            key: old.key,
+        })
+    }
+}
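
To make the merge rule in `Object::merge` above concrete, here is a small self-contained sketch (not part of the patch) of its version-pruning step: after the two version lists are merged, every version ordered before the last complete one is dropped. The types are simplified stand-ins for the real `ObjectVersion`.

```rust
// Simplified stand-ins for ObjectVersion / ObjectVersionState, just to
// demonstrate the pruning rule in Object::merge.
#[derive(Clone, Debug, PartialEq)]
enum State {
    Uploading,
    Complete,
    Aborted,
}

// Keep only the last version that is Complete, plus everything after it;
// this mirrors `self.versions.drain(last_vi..)` in the patch.
fn prune(versions: &mut Vec<(u64, State)>) {
    let last_complete = versions
        .iter()
        .enumerate()
        .rev()
        .find(|(_, (_, s))| *s == State::Complete)
        .map(|(i, _)| i);
    if let Some(i) = last_complete {
        versions.drain(..i);
    }
}

fn main() {
    let mut vs = vec![
        (1, State::Aborted),   // obsolete: precedes a complete version
        (2, State::Complete),  // obsolete: an earlier complete version
        (3, State::Complete),  // last complete version: kept
        (4, State::Uploading), // newer in-progress version: kept
    ];
    prune(&mut vs);
    assert_eq!(vs, vec![(3, State::Complete), (4, State::Uploading)]);
}
```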