use std::sync::Arc;

use garage_db as db;

use garage_util::crdt::Crdt;
use garage_util::data::*;
use garage_util::time::*;

use garage_table::replication::TableShardedReplication;
use garage_table::*;

use crate::index_counter::*;
use crate::s3::version_table::*;

pub const UPLOADS: &str = "uploads";
pub const PARTS: &str = "parts";
pub const BYTES: &str = "bytes";

mod v09 {
    use garage_util::crdt;
    use garage_util::data::Uuid;
    use serde::{Deserialize, Serialize};

    /// A part of a multipart upload
    #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
    pub struct MultipartUpload {
        /// Partition key = Upload id = UUID of the object version
        pub upload_id: Uuid,

        /// The timestamp at which the multipart upload was created
        pub timestamp: u64,

        /// Is this multipart upload deleted
        /// The MultipartUpload is marked as deleted as soon as the
        /// multipart upload is either completed or aborted
        pub deleted: crdt::Bool,
        /// List of uploaded parts, key = (part number, timestamp)
        /// In case of retries, all versions for each part are kept
        /// Everything is cleaned up only once the MultipartUpload is marked deleted
        pub parts: crdt::Map<MpuPartKey, MpuPart>,

        // Back link to bucket+key so that we can find the object this mpu
        // belongs to and check whether it is still valid
        /// Bucket in which the related object is stored
        pub bucket_id: Uuid,
        /// Key in which the related object is stored
        pub key: String,
    }

    #[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
    pub struct MpuPartKey {
        /// Number of the part
        pub part_number: u64,
        /// Timestamp of part upload
        pub timestamp: u64,
    }

    /// The version of an uploaded part
    #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
    pub struct MpuPart {
        /// Links to a Version in VersionTable
        pub version: Uuid,
        /// ETag of the content of this part (known only once done uploading)
        pub etag: Option<String>,
        /// Size of this part (known only once done uploading)
        pub size: Option<u64>,
    }

    impl garage_util::migrate::InitialFormat for MultipartUpload {
        const VERSION_MARKER: &'static [u8] = b"G09s3mpu";
    }
}

pub use v09::*;

impl Ord for MpuPartKey {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.part_number
            .cmp(&other.part_number)
            .then(self.timestamp.cmp(&other.timestamp))
    }
}

impl PartialOrd for MpuPartKey {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl MultipartUpload {
    pub fn new(
        upload_id: Uuid,
        timestamp: u64,
        bucket_id: Uuid,
        key: String,
        deleted: bool,
    ) -> Self {
        Self {
            upload_id,
            timestamp,
            deleted: crdt::Bool::new(deleted),
            parts: crdt::Map::new(),
            bucket_id,
            key,
        }
    }

    /// Compute the timestamp to use for a new version of the given part:
    /// at least the current time, and strictly greater than any timestamp
    /// already recorded for that part number, so that retried part uploads
    /// never collide with previously stored versions
    pub fn next_timestamp(&self, part_number: u64) -> u64 {
        std::cmp::max(
            now_msec(),
            1 + self
                .parts
                .items()
                .iter()
                .filter(|(x, _)| x.part_number == part_number)
                .map(|(x, _)| x.timestamp)
                .max()
                .unwrap_or(0),
        )
    }
}

impl Entry<Uuid, EmptyKey> for MultipartUpload {
    fn partition_key(&self) -> &Uuid {
        &self.upload_id
    }
    fn sort_key(&self) -> &EmptyKey {
        &EmptyKey
    }
    fn is_tombstone(&self) -> bool {
        self.deleted.get()
    }
}

impl Crdt for MultipartUpload {
    fn merge(&mut self, other: &Self) {
        self.deleted.merge(&other.deleted);

        if self.deleted.get() {
            self.parts.clear();
        } else {
            self.parts.merge(&other.parts);
        }
    }
}

impl Crdt for MpuPart {
    // Deterministic merge rule: a known value wins over None, and the larger
    // of two known values wins, so that all nodes converge to the same state
    fn merge(&mut self, other: &Self) {
        self.etag = match (self.etag.take(), &other.etag) {
            (None, Some(_)) => other.etag.clone(),
            (Some(x), Some(y)) if x < *y => other.etag.clone(),
            (x, _) => x,
        };
        self.size = match (self.size, other.size) {
            (None, Some(_)) => other.size,
            (Some(x), Some(y)) if x < y => other.size,
            (x, _) => x,
        };
    }
}

pub struct MultipartUploadTable {
    pub version_table: Arc<Table<VersionTable, TableShardedReplication>>,
    pub mpu_counter_table: Arc<IndexCounter<MultipartUpload>>,
}

impl TableSchema for MultipartUploadTable {
    const TABLE_NAME: &'static str = "multipart_upload";

    type P = Uuid;
    type S = EmptyKey;
    type E = MultipartUpload;
    type Filter = DeletedFilter;

    fn updated(
        &self,
        tx: &mut db::Transaction,
        old: Option<&Self::E>,
        new: Option<&Self::E>,
    ) -> db::TxOpResult<()> {
        // 1. Count
        let counter_res = self.mpu_counter_table.count(tx, old, new);
        if let Err(e) = db::unabort(counter_res)? {
            error!(
                "Unable to update multipart object part counter: {}. Index values will be wrong!",
                e
            );
        }

        // 2. Propagate deletions to version table
        if let (Some(old_mpu), Some(new_mpu)) = (old, new) {
            if new_mpu.deleted.get() && !old_mpu.deleted.get() {
                let deleted_versions = old_mpu.parts.items().iter().map(|(_k, p)| {
                    Version::new(
                        p.version,
                        VersionBacklink::MultipartUpload {
                            upload_id: old_mpu.upload_id,
                        },
                        true,
                    )
                });
                for version in deleted_versions {
                    let res = self.version_table.queue_insert(tx, &version);
                    if let Err(e) = db::unabort(res)? {
                        error!("Unable to enqueue version deletion propagation: {}. A repair will be needed.", e);
                    }
                }
            }
        }

        Ok(())
    }

    fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
        filter.apply(entry.is_tombstone())
    }
}

impl CountedItem for MultipartUpload {
    const COUNTER_TABLE_NAME: &'static str = "bucket_mpu_counter";

    // Partition key = bucket id
    type CP = Uuid;
    // Sort key = nothing
    type CS = EmptyKey;

    fn counter_partition_key(&self) -> &Uuid {
        &self.bucket_id
    }
    fn counter_sort_key(&self) -> &EmptyKey {
        &EmptyKey
    }

    fn counts(&self) -> Vec<(&'static str, i64)> {
        let uploads = if self.deleted.get() { 0 } else { 1 };
        // items() is sorted by key, and keys sort by part number first, so
        // duplicate versions of a part are adjacent and dedup() keeps exactly
        // one entry per part number
        let mut parts = self
            .parts
            .items()
            .iter()
            .map(|(k, _)| k.part_number)
            .collect::<Vec<_>>();
        parts.dedup();
        let bytes = self
            .parts
            .items()
            .iter()
            .map(|(_, p)| p.size.unwrap_or(0))
            .sum::<u64>();
        vec![
            (UPLOADS, uploads),
            (PARTS, parts.len() as i64),
            (BYTES, bytes as i64),
        ]
    }
}
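
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of upstream Garage): a minimal test module
// exercising the two invariants that the rest of this file relies on, namely
// MpuPartKey ordering and next_timestamp() monotonicity. It assumes that
// `gen_uuid()` (garage_util::data) and `crdt::Map::put` are available as they
// are used elsewhere in the codebase; treat it as a usage example rather than
// a definitive test suite.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn mpu_part_key_orders_by_part_number_then_timestamp() {
        let a = MpuPartKey { part_number: 1, timestamp: 200 };
        let b = MpuPartKey { part_number: 2, timestamp: 100 };
        // The part number dominates the ordering, even when timestamps disagree
        assert!(a < b);
        // For the same part number, the timestamp breaks the tie
        let c = MpuPartKey { part_number: 1, timestamp: 300 };
        assert!(a < c);
    }

    #[test]
    fn next_timestamp_is_strictly_greater_per_part() {
        let mut mpu =
            MultipartUpload::new(gen_uuid(), now_msec(), gen_uuid(), "object".to_string(), false);

        // Record a part version whose timestamp is far ahead of the local clock
        let far_future = now_msec() + 1_000_000;
        mpu.parts.put(
            MpuPartKey { part_number: 1, timestamp: far_future },
            MpuPart { version: gen_uuid(), etag: None, size: None },
        );

        // Even against a skewed timestamp, next_timestamp() must return
        // something strictly newer for the same part number, so retries never
        // collide with previously stored versions
        assert!(mpu.next_timestamp(1) > far_future);

        // A part number with no recorded versions falls back to the wall clock
        let before = now_msec();
        assert!(mpu.next_timestamp(2) >= before);
    }
}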