aboutsummaryrefslogtreecommitdiff
path: root/src/model/s3
diff options
context:
space:
mode:
authorMendes <mendes.oulamara@pm.me>2022-10-04 18:14:49 +0200
committerMendes <mendes.oulamara@pm.me>2022-10-04 18:14:49 +0200
commit829f815a897b04986559910bbcbf53625adcdf20 (patch)
tree6db3c27cff2aded754a641d1f2b05c83be701267 /src/model/s3
parent99f96b9564c9c841dc6c56f1255a6e70ff884d46 (diff)
parenta096ced35562bd0a8877a1ee2f755be1edafe343 (diff)
downloadgarage-829f815a897b04986559910bbcbf53625adcdf20.tar.gz
garage-829f815a897b04986559910bbcbf53625adcdf20.zip
Merge remote-tracking branch 'origin/main' into optimal-layout
Diffstat (limited to 'src/model/s3')
-rw-r--r--src/model/s3/block_ref_table.rs77
-rw-r--r--src/model/s3/mod.rs3
-rw-r--r--src/model/s3/object_table.rs404
-rw-r--r--src/model/s3/version_table.rs216
4 files changed, 700 insertions, 0 deletions
diff --git a/src/model/s3/block_ref_table.rs b/src/model/s3/block_ref_table.rs
new file mode 100644
index 00000000..c7017409
--- /dev/null
+++ b/src/model/s3/block_ref_table.rs
@@ -0,0 +1,77 @@
+use serde::{Deserialize, Serialize};
+use std::sync::Arc;
+
+use garage_db as db;
+
+use garage_util::data::*;
+
+use garage_table::crdt::Crdt;
+use garage_table::*;
+
+use garage_block::manager::*;
+
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct BlockRef {
+ /// Hash (blake2 sum) of the block, used as partition key
+ pub block: Hash,
+
+ /// Id of the Version for the object containing this block, used as sorting key
+ pub version: Uuid,
+
+ // Keep track of deleted status
+ /// Is the Version that contains this block deleted
+ pub deleted: crdt::Bool,
+}
+
+impl Entry<Hash, Uuid> for BlockRef {
+ fn partition_key(&self) -> &Hash {
+ &self.block
+ }
+ fn sort_key(&self) -> &Uuid {
+ &self.version
+ }
+ fn is_tombstone(&self) -> bool {
+ self.deleted.get()
+ }
+}
+
+impl Crdt for BlockRef {
+ fn merge(&mut self, other: &Self) {
+ self.deleted.merge(&other.deleted);
+ }
+}
+
+pub struct BlockRefTable {
+ pub block_manager: Arc<BlockManager>,
+}
+
+impl TableSchema for BlockRefTable {
+ const TABLE_NAME: &'static str = "block_ref";
+
+ type P = Hash;
+ type S = Uuid;
+ type E = BlockRef;
+ type Filter = DeletedFilter;
+
+ fn updated(
+ &self,
+ tx: &mut db::Transaction,
+ old: Option<&Self::E>,
+ new: Option<&Self::E>,
+ ) -> db::TxOpResult<()> {
+ let block = old.or(new).unwrap().block;
+ let was_before = old.map(|x| !x.deleted.get()).unwrap_or(false);
+ let is_after = new.map(|x| !x.deleted.get()).unwrap_or(false);
+ if is_after && !was_before {
+ self.block_manager.block_incref(tx, block)?;
+ }
+ if was_before && !is_after {
+ self.block_manager.block_decref(tx, block)?;
+ }
+ Ok(())
+ }
+
+ fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
+ filter.apply(entry.deleted.get())
+ }
+}
diff --git a/src/model/s3/mod.rs b/src/model/s3/mod.rs
new file mode 100644
index 00000000..4e94337d
--- /dev/null
+++ b/src/model/s3/mod.rs
@@ -0,0 +1,3 @@
+pub mod block_ref_table;
+pub mod object_table;
+pub mod version_table;
diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs
new file mode 100644
index 00000000..26ff57f6
--- /dev/null
+++ b/src/model/s3/object_table.rs
@@ -0,0 +1,404 @@
+use serde::{Deserialize, Serialize};
+use std::collections::BTreeMap;
+use std::sync::Arc;
+
+use garage_db as db;
+
+use garage_util::background::BackgroundRunner;
+use garage_util::data::*;
+
+use garage_table::crdt::*;
+use garage_table::replication::TableShardedReplication;
+use garage_table::*;
+
+use crate::index_counter::*;
+use crate::s3::version_table::*;
+
+use crate::prev::v051::object_table as old;
+
+pub const OBJECTS: &str = "objects";
+pub const UNFINISHED_UPLOADS: &str = "unfinished_uploads";
+pub const BYTES: &str = "bytes";
+
+/// An object
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct Object {
+ /// The bucket in which the object is stored, used as partition key
+ pub bucket_id: Uuid,
+
+ /// The key at which the object is stored in its bucket, used as sorting key
+ pub key: String,
+
+ /// The list of currenty stored versions of the object
+ versions: Vec<ObjectVersion>,
+}
+
+impl Object {
+ /// Initialize an Object struct from parts
+ pub fn new(bucket_id: Uuid, key: String, versions: Vec<ObjectVersion>) -> Self {
+ let mut ret = Self {
+ bucket_id,
+ key,
+ versions: vec![],
+ };
+ for v in versions {
+ ret.add_version(v)
+ .expect("Twice the same ObjectVersion in Object constructor");
+ }
+ ret
+ }
+
+ /// Adds a version if it wasn't already present
+ #[allow(clippy::result_unit_err)]
+ pub fn add_version(&mut self, new: ObjectVersion) -> Result<(), ()> {
+ match self
+ .versions
+ .binary_search_by(|v| v.cmp_key().cmp(&new.cmp_key()))
+ {
+ Err(i) => {
+ self.versions.insert(i, new);
+ Ok(())
+ }
+ Ok(_) => Err(()),
+ }
+ }
+
+ /// Get a list of currently stored versions of `Object`
+ pub fn versions(&self) -> &[ObjectVersion] {
+ &self.versions[..]
+ }
+}
+
+/// Informations about a version of an object
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct ObjectVersion {
+ /// Id of the version
+ pub uuid: Uuid,
+ /// Timestamp of when the object was created
+ pub timestamp: u64,
+ /// State of the version
+ pub state: ObjectVersionState,
+}
+
+/// State of an object version
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub enum ObjectVersionState {
+ /// The version is being received
+ Uploading(ObjectVersionHeaders),
+ /// The version is fully received
+ Complete(ObjectVersionData),
+ /// The version uploaded containded errors or the upload was explicitly aborted
+ Aborted,
+}
+
+impl Crdt for ObjectVersionState {
+ fn merge(&mut self, other: &Self) {
+ use ObjectVersionState::*;
+ match other {
+ Aborted => {
+ *self = Aborted;
+ }
+ Complete(b) => match self {
+ Aborted => {}
+ Complete(a) => {
+ a.merge(b);
+ }
+ Uploading(_) => {
+ *self = Complete(b.clone());
+ }
+ },
+ Uploading(_) => {}
+ }
+ }
+}
+
+/// Data stored in object version
+#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+pub enum ObjectVersionData {
+ /// The object was deleted, this Version is a tombstone to mark it as such
+ DeleteMarker,
+ /// The object is short, it's stored inlined
+ Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec<u8>),
+ /// The object is not short, Hash of first block is stored here, next segments hashes are
+ /// stored in the version table
+ FirstBlock(ObjectVersionMeta, Hash),
+}
+
+impl AutoCrdt for ObjectVersionData {
+ const WARN_IF_DIFFERENT: bool = true;
+}
+
+/// Metadata about the object version
+#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+pub struct ObjectVersionMeta {
+ /// Headers to send to the client
+ pub headers: ObjectVersionHeaders,
+ /// Size of the object
+ pub size: u64,
+ /// etag of the object
+ pub etag: String,
+}
+
+/// Additional headers for an object
+#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+pub struct ObjectVersionHeaders {
+ /// Content type of the object
+ pub content_type: String,
+ /// Any other http headers to send
+ pub other: BTreeMap<String, String>,
+}
+
+impl ObjectVersion {
+ fn cmp_key(&self) -> (u64, Uuid) {
+ (self.timestamp, self.uuid)
+ }
+
+ /// Is the object version currently being uploaded
+ pub fn is_uploading(&self) -> bool {
+ matches!(self.state, ObjectVersionState::Uploading(_))
+ }
+
+ /// Is the object version completely received
+ pub fn is_complete(&self) -> bool {
+ matches!(self.state, ObjectVersionState::Complete(_))
+ }
+
+ /// Is the object version available (received and not a tombstone)
+ pub fn is_data(&self) -> bool {
+ match self.state {
+ ObjectVersionState::Complete(ObjectVersionData::DeleteMarker) => false,
+ ObjectVersionState::Complete(_) => true,
+ _ => false,
+ }
+ }
+}
+
+impl Entry<Uuid, String> for Object {
+ fn partition_key(&self) -> &Uuid {
+ &self.bucket_id
+ }
+ fn sort_key(&self) -> &String {
+ &self.key
+ }
+ fn is_tombstone(&self) -> bool {
+ self.versions.len() == 1
+ && self.versions[0].state
+ == ObjectVersionState::Complete(ObjectVersionData::DeleteMarker)
+ }
+}
+
+impl Crdt for Object {
+ fn merge(&mut self, other: &Self) {
+ // Merge versions from other into here
+ for other_v in other.versions.iter() {
+ match self
+ .versions
+ .binary_search_by(|v| v.cmp_key().cmp(&other_v.cmp_key()))
+ {
+ Ok(i) => {
+ self.versions[i].state.merge(&other_v.state);
+ }
+ Err(i) => {
+ self.versions.insert(i, other_v.clone());
+ }
+ }
+ }
+
+ // Remove versions which are obsolete, i.e. those that come
+ // before the last version which .is_complete().
+ let last_complete = self
+ .versions
+ .iter()
+ .enumerate()
+ .rev()
+ .find(|(_, v)| v.is_complete())
+ .map(|(vi, _)| vi);
+
+ if let Some(last_vi) = last_complete {
+ self.versions = self.versions.drain(last_vi..).collect::<Vec<_>>();
+ }
+ }
+}
+
+pub struct ObjectTable {
+ pub background: Arc<BackgroundRunner>,
+ pub version_table: Arc<Table<VersionTable, TableShardedReplication>>,
+ pub object_counter_table: Arc<IndexCounter<Object>>,
+}
+
+#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
+pub enum ObjectFilter {
+ IsData,
+ IsUploading,
+}
+
+impl TableSchema for ObjectTable {
+ const TABLE_NAME: &'static str = "object";
+
+ type P = Uuid;
+ type S = String;
+ type E = Object;
+ type Filter = ObjectFilter;
+
+ fn updated(
+ &self,
+ tx: &mut db::Transaction,
+ old: Option<&Self::E>,
+ new: Option<&Self::E>,
+ ) -> db::TxOpResult<()> {
+ // 1. Count
+ let counter_res = self.object_counter_table.count(tx, old, new);
+ if let Err(e) = db::unabort(counter_res)? {
+ error!(
+ "Unable to update object counter: {}. Index values will be wrong!",
+ e
+ );
+ }
+
+ // 2. Spawn threads that propagates deletions to version table
+ let version_table = self.version_table.clone();
+ let old = old.cloned();
+ let new = new.cloned();
+
+ self.background.spawn(async move {
+ if let (Some(old_v), Some(new_v)) = (old, new) {
+ // Propagate deletion of old versions
+ for v in old_v.versions.iter() {
+ let newly_deleted = match new_v
+ .versions
+ .binary_search_by(|nv| nv.cmp_key().cmp(&v.cmp_key()))
+ {
+ Err(_) => true,
+ Ok(i) => {
+ new_v.versions[i].state == ObjectVersionState::Aborted
+ && v.state != ObjectVersionState::Aborted
+ }
+ };
+ if newly_deleted {
+ let deleted_version =
+ Version::new(v.uuid, old_v.bucket_id, old_v.key.clone(), true);
+ version_table.insert(&deleted_version).await?;
+ }
+ }
+ }
+ Ok(())
+ });
+ Ok(())
+ }
+
+ fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
+ match filter {
+ ObjectFilter::IsData => entry.versions.iter().any(|v| v.is_data()),
+ ObjectFilter::IsUploading => entry.versions.iter().any(|v| v.is_uploading()),
+ }
+ }
+
+ fn try_migrate(bytes: &[u8]) -> Option<Self::E> {
+ let old_obj = rmp_serde::decode::from_read_ref::<_, old::Object>(bytes).ok()?;
+ Some(migrate_object(old_obj))
+ }
+}
+
+impl CountedItem for Object {
+ const COUNTER_TABLE_NAME: &'static str = "bucket_object_counter";
+
+ // Partition key = bucket id
+ type CP = Uuid;
+ // Sort key = nothing
+ type CS = EmptyKey;
+
+ fn counter_partition_key(&self) -> &Uuid {
+ &self.bucket_id
+ }
+ fn counter_sort_key(&self) -> &EmptyKey {
+ &EmptyKey
+ }
+
+ fn counts(&self) -> Vec<(&'static str, i64)> {
+ let versions = self.versions();
+ let n_objects = if versions.iter().any(|v| v.is_data()) {
+ 1
+ } else {
+ 0
+ };
+ let n_unfinished_uploads = versions
+ .iter()
+ .filter(|v| matches!(v.state, ObjectVersionState::Uploading(_)))
+ .count();
+ let n_bytes = versions
+ .iter()
+ .map(|v| match &v.state {
+ ObjectVersionState::Complete(ObjectVersionData::Inline(meta, _))
+ | ObjectVersionState::Complete(ObjectVersionData::FirstBlock(meta, _)) => meta.size,
+ _ => 0,
+ })
+ .sum::<u64>();
+
+ vec![
+ (OBJECTS, n_objects),
+ (UNFINISHED_UPLOADS, n_unfinished_uploads as i64),
+ (BYTES, n_bytes as i64),
+ ]
+ }
+}
+
+// vvvvvvvv migration code, stupid stuff vvvvvvvvvvvv
+// (we just want to change bucket into bucket_id by hashing it)
+
+fn migrate_object(o: old::Object) -> Object {
+ let versions = o
+ .versions()
+ .iter()
+ .cloned()
+ .map(migrate_object_version)
+ .collect();
+ Object {
+ bucket_id: blake2sum(o.bucket.as_bytes()),
+ key: o.key,
+ versions,
+ }
+}
+
+fn migrate_object_version(v: old::ObjectVersion) -> ObjectVersion {
+ ObjectVersion {
+ uuid: Uuid::try_from(v.uuid.as_slice()).unwrap(),
+ timestamp: v.timestamp,
+ state: match v.state {
+ old::ObjectVersionState::Uploading(h) => {
+ ObjectVersionState::Uploading(migrate_object_version_headers(h))
+ }
+ old::ObjectVersionState::Complete(d) => {
+ ObjectVersionState::Complete(migrate_object_version_data(d))
+ }
+ old::ObjectVersionState::Aborted => ObjectVersionState::Aborted,
+ },
+ }
+}
+
+fn migrate_object_version_headers(h: old::ObjectVersionHeaders) -> ObjectVersionHeaders {
+ ObjectVersionHeaders {
+ content_type: h.content_type,
+ other: h.other,
+ }
+}
+
+fn migrate_object_version_data(d: old::ObjectVersionData) -> ObjectVersionData {
+ match d {
+ old::ObjectVersionData::DeleteMarker => ObjectVersionData::DeleteMarker,
+ old::ObjectVersionData::Inline(m, b) => {
+ ObjectVersionData::Inline(migrate_object_version_meta(m), b)
+ }
+ old::ObjectVersionData::FirstBlock(m, h) => ObjectVersionData::FirstBlock(
+ migrate_object_version_meta(m),
+ Hash::try_from(h.as_slice()).unwrap(),
+ ),
+ }
+}
+
+fn migrate_object_version_meta(m: old::ObjectVersionMeta) -> ObjectVersionMeta {
+ ObjectVersionMeta {
+ headers: migrate_object_version_headers(m.headers),
+ size: m.size,
+ etag: m.etag,
+ }
+}
diff --git a/src/model/s3/version_table.rs b/src/model/s3/version_table.rs
new file mode 100644
index 00000000..6bc2ecd1
--- /dev/null
+++ b/src/model/s3/version_table.rs
@@ -0,0 +1,216 @@
+use serde::{Deserialize, Serialize};
+use std::sync::Arc;
+
+use garage_db as db;
+
+use garage_util::background::BackgroundRunner;
+use garage_util::data::*;
+
+use garage_table::crdt::*;
+use garage_table::replication::TableShardedReplication;
+use garage_table::*;
+
+use crate::s3::block_ref_table::*;
+
+use crate::prev::v051::version_table as old;
+
+/// A version of an object
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct Version {
+ /// UUID of the version, used as partition key
+ pub uuid: Uuid,
+
+ // Actual data: the blocks for this version
+ // In the case of a multipart upload, also store the etags
+ // of individual parts and check them when doing CompleteMultipartUpload
+ /// Is this version deleted
+ pub deleted: crdt::Bool,
+ /// list of blocks of data composing the version
+ pub blocks: crdt::Map<VersionBlockKey, VersionBlock>,
+ /// Etag of each part in case of a multipart upload, empty otherwise
+ pub parts_etags: crdt::Map<u64, String>,
+
+ // Back link to bucket+key so that we can figure if
+ // this was deleted later on
+ /// Bucket in which the related object is stored
+ pub bucket_id: Uuid,
+ /// Key in which the related object is stored
+ pub key: String,
+}
+
+impl Version {
+ pub fn new(uuid: Uuid, bucket_id: Uuid, key: String, deleted: bool) -> Self {
+ Self {
+ uuid,
+ deleted: deleted.into(),
+ blocks: crdt::Map::new(),
+ parts_etags: crdt::Map::new(),
+ bucket_id,
+ key,
+ }
+ }
+
+ pub fn has_part_number(&self, part_number: u64) -> bool {
+ let case1 = self
+ .parts_etags
+ .items()
+ .binary_search_by(|(k, _)| k.cmp(&part_number))
+ .is_ok();
+ let case2 = self
+ .blocks
+ .items()
+ .binary_search_by(|(k, _)| k.part_number.cmp(&part_number))
+ .is_ok();
+ case1 || case2
+ }
+}
+
+#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
+pub struct VersionBlockKey {
+ /// Number of the part
+ pub part_number: u64,
+ /// Offset of this sub-segment in its part
+ pub offset: u64,
+}
+
+impl Ord for VersionBlockKey {
+ fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+ self.part_number
+ .cmp(&other.part_number)
+ .then(self.offset.cmp(&other.offset))
+ }
+}
+
+impl PartialOrd for VersionBlockKey {
+ fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+ Some(self.cmp(other))
+ }
+}
+
+/// Informations about a single block
+#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize, Deserialize)]
+pub struct VersionBlock {
+ /// Blake2 sum of the block
+ pub hash: Hash,
+ /// Size of the block
+ pub size: u64,
+}
+
+impl AutoCrdt for VersionBlock {
+ const WARN_IF_DIFFERENT: bool = true;
+}
+
+impl Entry<Uuid, EmptyKey> for Version {
+ fn partition_key(&self) -> &Uuid {
+ &self.uuid
+ }
+ fn sort_key(&self) -> &EmptyKey {
+ &EmptyKey
+ }
+ fn is_tombstone(&self) -> bool {
+ self.deleted.get()
+ }
+}
+
+impl Crdt for Version {
+ fn merge(&mut self, other: &Self) {
+ self.deleted.merge(&other.deleted);
+
+ if self.deleted.get() {
+ self.blocks.clear();
+ self.parts_etags.clear();
+ } else {
+ self.blocks.merge(&other.blocks);
+ self.parts_etags.merge(&other.parts_etags);
+ }
+ }
+}
+
+pub struct VersionTable {
+ pub background: Arc<BackgroundRunner>,
+ pub block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>,
+}
+
+impl TableSchema for VersionTable {
+ const TABLE_NAME: &'static str = "version";
+
+ type P = Uuid;
+ type S = EmptyKey;
+ type E = Version;
+ type Filter = DeletedFilter;
+
+ fn updated(
+ &self,
+ _tx: &mut db::Transaction,
+ old: Option<&Self::E>,
+ new: Option<&Self::E>,
+ ) -> db::TxOpResult<()> {
+ let block_ref_table = self.block_ref_table.clone();
+ let old = old.cloned();
+ let new = new.cloned();
+
+ self.background.spawn(async move {
+ if let (Some(old_v), Some(new_v)) = (old, new) {
+ // Propagate deletion of version blocks
+ if new_v.deleted.get() && !old_v.deleted.get() {
+ let deleted_block_refs = old_v
+ .blocks
+ .items()
+ .iter()
+ .map(|(_k, vb)| BlockRef {
+ block: vb.hash,
+ version: old_v.uuid,
+ deleted: true.into(),
+ })
+ .collect::<Vec<_>>();
+ block_ref_table.insert_many(&deleted_block_refs[..]).await?;
+ }
+ }
+ Ok(())
+ });
+
+ Ok(())
+ }
+
+ fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
+ filter.apply(entry.deleted.get())
+ }
+
+ fn try_migrate(bytes: &[u8]) -> Option<Self::E> {
+ let old = rmp_serde::decode::from_read_ref::<_, old::Version>(bytes).ok()?;
+
+ let blocks = old
+ .blocks
+ .items()
+ .iter()
+ .map(|(k, v)| {
+ (
+ VersionBlockKey {
+ part_number: k.part_number,
+ offset: k.offset,
+ },
+ VersionBlock {
+ hash: Hash::try_from(v.hash.as_slice()).unwrap(),
+ size: v.size,
+ },
+ )
+ })
+ .collect::<crdt::Map<_, _>>();
+
+ let parts_etags = old
+ .parts_etags
+ .items()
+ .iter()
+ .map(|(k, v)| (*k, v.clone()))
+ .collect::<crdt::Map<_, _>>();
+
+ Some(Version {
+ uuid: Hash::try_from(old.uuid.as_slice()).unwrap(),
+ deleted: crdt::Bool::new(old.deleted.get()),
+ blocks,
+ parts_etags,
+ bucket_id: blake2sum(old.bucket.as_bytes()),
+ key: old.key,
+ })
+ }
+}