diff options
Diffstat (limited to 'src/api/s3/put.rs')
-rw-r--r-- | src/api/s3/put.rs | 474 |
1 files changed, 69 insertions, 405 deletions
diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index 350ab884..606facc4 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -1,8 +1,9 @@ -use std::collections::{BTreeMap, BTreeSet, HashMap}; +use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; use base64::prelude::*; use futures::prelude::*; +use futures::try_join; use hyper::body::{Body, Bytes}; use hyper::header::{HeaderMap, HeaderValue}; use hyper::{Request, Response}; @@ -30,14 +31,12 @@ use garage_model::s3::object_table::*; use garage_model::s3::version_table::*; use crate::s3::error::*; -use crate::s3::xml as s3_xml; -use crate::signature::verify_signed_content; pub async fn handle_put( garage: Arc<Garage>, req: Request<Body>, bucket: &Bucket, - key: &str, + key: &String, content_sha256: Option<Hash>, ) -> Result<Response<Body>, Error> { // Retrieve interesting headers from request @@ -70,16 +69,24 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( headers: ObjectVersionHeaders, body: S, bucket: &Bucket, - key: &str, + key: &String, content_md5: Option<String>, content_sha256: Option<FixedBytes32>, ) -> Result<(Uuid, String), Error> { + let mut chunker = StreamChunker::new(body, garage.config.block_size); + let (first_block_opt, existing_object) = try_join!( + chunker.next(), + garage + .object_table + .get(&bucket.id, key) + .map_err(Error::from), + )?; + + let first_block = first_block_opt.unwrap_or_default(); + // Generate identity of new version let version_uuid = gen_uuid(); - let version_timestamp = now_msec(); - - let mut chunker = StreamChunker::new(body, garage.config.block_size); - let first_block = chunker.next().await?.unwrap_or_default(); + let version_timestamp = next_timestamp(existing_object.as_ref()); // If body is small enough, store it directly in the object table // as "inline data". We can then return immediately. @@ -99,7 +106,7 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( content_sha256, )?; - check_quotas(&garage, bucket, key, size).await?; + check_quotas(&garage, bucket, size, existing_object.as_ref()).await?; let object_version = ObjectVersion { uuid: version_uuid, @@ -123,20 +130,23 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( // The following consists in many steps that can each fail. // Keep track that some cleanup will be needed if things fail // before everything is finished (cleanup is done using the Drop trait). - let mut interrupted_cleanup = InterruptedCleanup(Some(( - garage.clone(), - bucket.id, - key.into(), + let mut interrupted_cleanup = InterruptedCleanup(Some(InterruptedCleanupInner { + garage: garage.clone(), + bucket_id: bucket.id, + key: key.into(), version_uuid, version_timestamp, - ))); + })); // Write version identifier in object table so that we have a trace // that we are uploading something let mut object_version = ObjectVersion { uuid: version_uuid, timestamp: version_timestamp, - state: ObjectVersionState::Uploading(headers.clone()), + state: ObjectVersionState::Uploading { + headers: headers.clone(), + multipart: false, + }, }; let object = Object::new(bucket.id, key.into(), vec![object_version.clone()]); garage.object_table.insert(&object).await?; @@ -145,7 +155,14 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( // Write this entry now, even with empty block list, // to prevent block_ref entries from being deleted (they can be deleted // if the reference a version that isn't found in the version table) - let version = Version::new(version_uuid, bucket.id, key.into(), false); + let version = Version::new( + version_uuid, + VersionBacklink::Object { + bucket_id: bucket.id, + key: key.into(), + }, + false, + ); garage.version_table.insert(&version).await?; // Transfer data and verify checksum @@ -168,7 +185,7 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( content_sha256, )?; - check_quotas(&garage, bucket, key, total_size).await?; + check_quotas(&garage, bucket, total_size, existing_object.as_ref()).await?; // Save final object state, marked as Complete let md5sum_hex = hex::encode(data_md5sum); @@ -192,7 +209,7 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( /// Validate MD5 sum against content-md5 header /// and sha256sum against signed content-sha256 -fn ensure_checksum_matches( +pub(crate) fn ensure_checksum_matches( data_md5sum: &[u8], data_sha256sum: garage_util::data::FixedBytes32, content_md5: Option<&str>, @@ -218,22 +235,22 @@ fn ensure_checksum_matches( } /// Check that inserting this object with this size doesn't exceed bucket quotas -async fn check_quotas( +pub(crate) async fn check_quotas( garage: &Arc<Garage>, bucket: &Bucket, - key: &str, size: u64, + prev_object: Option<&Object>, ) -> Result<(), Error> { let quotas = bucket.state.as_option().unwrap().quotas.get(); if quotas.max_objects.is_none() && quotas.max_size.is_none() { return Ok(()); }; - let key = key.to_string(); - let (prev_object, counters) = futures::try_join!( - garage.object_table.get(&bucket.id, &key), - garage.object_counter_table.table.get(&bucket.id, &EmptyKey), - )?; + let counters = garage + .object_counter_table + .table + .get(&bucket.id, &EmptyKey) + .await?; let counters = counters .map(|x| x.filtered_values(&garage.system.ring.borrow())) @@ -267,7 +284,7 @@ async fn check_quotas( if cnt_size_diff > 0 && current_size + cnt_size_diff > ms as i64 { return Err(Error::forbidden(format!( "Bucket size quota is reached, maximum total size of objects for this bucket: {}. The bucket is already {} bytes, and this object would add {} bytes.", - ms, current_size, size + ms, current_size, cnt_size_diff ))); } } @@ -275,7 +292,7 @@ async fn check_quotas( Ok(()) } -async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + Unpin>( +pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + Unpin>( garage: &Garage, version: &Version, part_number: u64, @@ -381,7 +398,7 @@ async fn put_block_meta( Ok(()) } -struct StreamChunker<S: Stream<Item = Result<Bytes, Error>>> { +pub(crate) struct StreamChunker<S: Stream<Item = Result<Bytes, Error>>> { stream: S, read_all: bool, block_size: usize, @@ -389,7 +406,7 @@ struct StreamChunker<S: Stream<Item = Result<Bytes, Error>>> { } impl<S: Stream<Item = Result<Bytes, Error>> + Unpin> StreamChunker<S> { - fn new(stream: S, block_size: usize) -> Self { + pub(crate) fn new(stream: S, block_size: usize) -> Self { Self { stream, read_all: false, @@ -398,7 +415,7 @@ impl<S: Stream<Item = Result<Bytes, Error>> + Unpin> StreamChunker<S> { } } - async fn next(&mut self) -> Result<Option<Bytes>, Error> { + pub(crate) async fn next(&mut self) -> Result<Option<Bytes>, Error> { while !self.read_all && self.buf.len() < self.block_size { if let Some(block) = self.stream.next().await { let bytes = block?; @@ -425,7 +442,14 @@ pub fn put_response(version_uuid: Uuid, md5sum_hex: String) -> Response<Body> { .unwrap() } -struct InterruptedCleanup(Option<(Arc<Garage>, Uuid, String, Uuid, u64)>); +struct InterruptedCleanup(Option<InterruptedCleanupInner>); +struct InterruptedCleanupInner { + garage: Arc<Garage>, + bucket_id: Uuid, + key: String, + version_uuid: Uuid, + version_timestamp: u64, +} impl InterruptedCleanup { fn cancel(&mut self) { @@ -434,15 +458,15 @@ impl InterruptedCleanup { } impl Drop for InterruptedCleanup { fn drop(&mut self) { - if let Some((garage, bucket_id, key, version_uuid, version_ts)) = self.0.take() { + if let Some(info) = self.0.take() { tokio::spawn(async move { let object_version = ObjectVersion { - uuid: version_uuid, - timestamp: version_ts, + uuid: info.version_uuid, + timestamp: info.version_timestamp, state: ObjectVersionState::Aborted, }; - let object = Object::new(bucket_id, key, vec![object_version]); - if let Err(e) = garage.object_table.insert(&object).await { + let object = Object::new(info.bucket_id, info.key, vec![object_version]); + if let Err(e) = info.garage.object_table.insert(&object).await { warn!("Cannot cleanup after aborted PutObject: {}", e); } }); @@ -450,326 +474,9 @@ impl Drop for InterruptedCleanup { } } -// ---- - -pub async fn handle_create_multipart_upload( - garage: Arc<Garage>, - req: &Request<Body>, - bucket_name: &str, - bucket_id: Uuid, - key: &str, -) -> Result<Response<Body>, Error> { - let version_uuid = gen_uuid(); - let headers = get_headers(req.headers())?; - - // Create object in object table - let object_version = ObjectVersion { - uuid: version_uuid, - timestamp: now_msec(), - state: ObjectVersionState::Uploading(headers), - }; - let object = Object::new(bucket_id, key.to_string(), vec![object_version]); - garage.object_table.insert(&object).await?; - - // Insert empty version so that block_ref entries refer to something - // (they are inserted concurrently with blocks in the version table, so - // there is the possibility that they are inserted before the version table - // is created, in which case it is allowed to delete them, e.g. in repair_*) - let version = Version::new(version_uuid, bucket_id, key.into(), false); - garage.version_table.insert(&version).await?; - - // Send success response - let result = s3_xml::InitiateMultipartUploadResult { - xmlns: (), - bucket: s3_xml::Value(bucket_name.to_string()), - key: s3_xml::Value(key.to_string()), - upload_id: s3_xml::Value(hex::encode(version_uuid)), - }; - let xml = s3_xml::to_xml_with_header(&result)?; - - Ok(Response::new(Body::from(xml.into_bytes()))) -} - -pub async fn handle_put_part( - garage: Arc<Garage>, - req: Request<Body>, - bucket_id: Uuid, - key: &str, - part_number: u64, - upload_id: &str, - content_sha256: Option<Hash>, -) -> Result<Response<Body>, Error> { - let version_uuid = decode_upload_id(upload_id)?; - - let content_md5 = match req.headers().get("content-md5") { - Some(x) => Some(x.to_str()?.to_string()), - None => None, - }; - - // Read first chuck, and at the same time try to get object to see if it exists - let key = key.to_string(); - - let body = req.into_body().map_err(Error::from); - let mut chunker = StreamChunker::new(body, garage.config.block_size); - - let (object, version, first_block) = futures::try_join!( - garage - .object_table - .get(&bucket_id, &key) - .map_err(Error::from), - garage - .version_table - .get(&version_uuid, &EmptyKey) - .map_err(Error::from), - chunker.next(), - )?; - - // Check object is valid and multipart block can be accepted - let first_block = first_block.ok_or_bad_request("Empty body")?; - let object = object.ok_or_bad_request("Object not found")?; - - if !object - .versions() - .iter() - .any(|v| v.uuid == version_uuid && v.is_uploading()) - { - return Err(Error::NoSuchUpload); - } - - // Check part hasn't already been uploaded - if let Some(v) = version { - if v.has_part_number(part_number) { - return Err(Error::bad_request(format!( - "Part number {} has already been uploaded", - part_number - ))); - } - } - - // Copy block to store - let version = Version::new(version_uuid, bucket_id, key, false); - - let first_block_hash = async_blake2sum(first_block.clone()).await; - - let (_, data_md5sum, data_sha256sum) = read_and_put_blocks( - &garage, - &version, - part_number, - first_block, - first_block_hash, - &mut chunker, - ) - .await?; - - // Verify that checksums map - ensure_checksum_matches( - data_md5sum.as_slice(), - data_sha256sum, - content_md5.as_deref(), - content_sha256, - )?; - - // Store part etag in version - let data_md5sum_hex = hex::encode(data_md5sum); - let mut version = version; - version - .parts_etags - .put(part_number, data_md5sum_hex.clone()); - garage.version_table.insert(&version).await?; - - let response = Response::builder() - .header("ETag", format!("\"{}\"", data_md5sum_hex)) - .body(Body::empty()) - .unwrap(); - Ok(response) -} - -pub async fn handle_complete_multipart_upload( - garage: Arc<Garage>, - req: Request<Body>, - bucket_name: &str, - bucket: &Bucket, - key: &str, - upload_id: &str, - content_sha256: Option<Hash>, -) -> Result<Response<Body>, Error> { - let body = hyper::body::to_bytes(req.into_body()).await?; - - if let Some(content_sha256) = content_sha256 { - verify_signed_content(content_sha256, &body[..])?; - } - - let body_xml = roxmltree::Document::parse(std::str::from_utf8(&body)?)?; - let body_list_of_parts = parse_complete_multipart_upload_body(&body_xml) - .ok_or_bad_request("Invalid CompleteMultipartUpload XML")?; - debug!( - "CompleteMultipartUpload list of parts: {:?}", - body_list_of_parts - ); - - let version_uuid = decode_upload_id(upload_id)?; - - // Get object and version - let key = key.to_string(); - let (object, version) = futures::try_join!( - garage.object_table.get(&bucket.id, &key), - garage.version_table.get(&version_uuid, &EmptyKey), - )?; - - let object = object.ok_or(Error::NoSuchKey)?; - let mut object_version = object - .versions() - .iter() - .find(|v| v.uuid == version_uuid && v.is_uploading()) - .cloned() - .ok_or(Error::NoSuchUpload)?; - - let version = version.ok_or(Error::NoSuchKey)?; - if version.blocks.is_empty() { - return Err(Error::bad_request("No data was uploaded")); - } - - let headers = match object_version.state { - ObjectVersionState::Uploading(headers) => headers, - _ => unreachable!(), - }; - - // Check that part numbers are an increasing sequence. - // (it doesn't need to start at 1 nor to be a continuous sequence, - // see discussion in #192) - if body_list_of_parts.is_empty() { - return Err(Error::EntityTooSmall); - } - if !body_list_of_parts - .iter() - .zip(body_list_of_parts.iter().skip(1)) - .all(|(p1, p2)| p1.part_number < p2.part_number) - { - return Err(Error::InvalidPartOrder); - } - - // Garage-specific restriction, see #204: part numbers must be - // consecutive starting at 1 - if body_list_of_parts[0].part_number != 1 - || !body_list_of_parts - .iter() - .zip(body_list_of_parts.iter().skip(1)) - .all(|(p1, p2)| p1.part_number + 1 == p2.part_number) - { - return Err(Error::NotImplemented("Garage does not support completing a Multipart upload with non-consecutive part numbers. This is a restriction of Garage's data model, which might be fixed in a future release. See issue #204 for more information on this topic.".into())); - } - - // Check that the list of parts they gave us corresponds to the parts we have here - debug!("Expected parts from request: {:?}", body_list_of_parts); - debug!("Parts stored in version: {:?}", version.parts_etags.items()); - let parts = version - .parts_etags - .items() - .iter() - .map(|pair| (&pair.0, &pair.1)); - let same_parts = body_list_of_parts - .iter() - .map(|x| (&x.part_number, &x.etag)) - .eq(parts); - if !same_parts { - return Err(Error::InvalidPart); - } - - // Check that all blocks belong to one of the parts - let block_parts = version - .blocks - .items() - .iter() - .map(|(bk, _)| bk.part_number) - .collect::<BTreeSet<_>>(); - let same_parts = body_list_of_parts - .iter() - .map(|x| x.part_number) - .eq(block_parts.into_iter()); - if !same_parts { - return Err(Error::bad_request( - "Part numbers in block list and part list do not match. This can happen if a part was partially uploaded. Please abort the multipart upload and try again." - )); - } - - // Calculate etag of final object - // To understand how etags are calculated, read more here: - // https://teppen.io/2018/06/23/aws_s3_etags/ - let num_parts = body_list_of_parts.len(); - let mut etag_md5_hasher = Md5::new(); - for (_, etag) in version.parts_etags.items().iter() { - etag_md5_hasher.update(etag.as_bytes()); - } - let etag = format!("{}-{}", hex::encode(etag_md5_hasher.finalize()), num_parts); - - // Calculate total size of final object - let total_size = version.blocks.items().iter().map(|x| x.1.size).sum(); - - if let Err(e) = check_quotas(&garage, bucket, &key, total_size).await { - object_version.state = ObjectVersionState::Aborted; - let final_object = Object::new(bucket.id, key.clone(), vec![object_version]); - garage.object_table.insert(&final_object).await?; - - return Err(e); - } - - // Write final object version - object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( - ObjectVersionMeta { - headers, - size: total_size, - etag: etag.clone(), - }, - version.blocks.items()[0].1.hash, - )); - - let final_object = Object::new(bucket.id, key.clone(), vec![object_version]); - garage.object_table.insert(&final_object).await?; - - // Send response saying ok we're done - let result = s3_xml::CompleteMultipartUploadResult { - xmlns: (), - location: None, - bucket: s3_xml::Value(bucket_name.to_string()), - key: s3_xml::Value(key), - etag: s3_xml::Value(format!("\"{}\"", etag)), - }; - let xml = s3_xml::to_xml_with_header(&result)?; - - Ok(Response::new(Body::from(xml.into_bytes()))) -} - -pub async fn handle_abort_multipart_upload( - garage: Arc<Garage>, - bucket_id: Uuid, - key: &str, - upload_id: &str, -) -> Result<Response<Body>, Error> { - let version_uuid = decode_upload_id(upload_id)?; - - let object = garage - .object_table - .get(&bucket_id, &key.to_string()) - .await?; - let object = object.ok_or(Error::NoSuchKey)?; - - let object_version = object - .versions() - .iter() - .find(|v| v.uuid == version_uuid && v.is_uploading()); - let mut object_version = match object_version { - None => return Err(Error::NoSuchUpload), - Some(x) => x.clone(), - }; +// ============ helpers ============ - object_version.state = ObjectVersionState::Aborted; - let final_object = Object::new(bucket_id, key.to_string(), vec![object_version]); - garage.object_table.insert(&final_object).await?; - - Ok(Response::new(Body::from(vec![]))) -} - -fn get_mime_type(headers: &HeaderMap<HeaderValue>) -> Result<String, Error> { +pub(crate) fn get_mime_type(headers: &HeaderMap<HeaderValue>) -> Result<String, Error> { Ok(headers .get(hyper::header::CONTENT_TYPE) .map(|x| x.to_str()) @@ -822,53 +529,10 @@ pub(crate) fn get_headers(headers: &HeaderMap<HeaderValue>) -> Result<ObjectVers }) } -pub fn decode_upload_id(id: &str) -> Result<Uuid, Error> { - let id_bin = hex::decode(id).map_err(|_| Error::NoSuchUpload)?; - if id_bin.len() != 32 { - return Err(Error::NoSuchUpload); - } - let mut uuid = [0u8; 32]; - uuid.copy_from_slice(&id_bin[..]); - Ok(Uuid::from(uuid)) -} - -#[derive(Debug)] -struct CompleteMultipartUploadPart { - etag: String, - part_number: u64, -} - -fn parse_complete_multipart_upload_body( - xml: &roxmltree::Document, -) -> Option<Vec<CompleteMultipartUploadPart>> { - let mut parts = vec![]; - - let root = xml.root(); - let cmu = root.first_child()?; - if !cmu.has_tag_name("CompleteMultipartUpload") { - return None; - } - - for item in cmu.children() { - // Only parse <Part> nodes - if !item.is_element() { - continue; - } - - if item.has_tag_name("Part") { - let etag = item.children().find(|e| e.has_tag_name("ETag"))?.text()?; - let part_number = item - .children() - .find(|e| e.has_tag_name("PartNumber"))? - .text()?; - parts.push(CompleteMultipartUploadPart { - etag: etag.trim_matches('"').to_string(), - part_number: part_number.parse().ok()?, - }); - } else { - return None; - } - } - - Some(parts) +pub(crate) fn next_timestamp(existing_object: Option<&Object>) -> u64 { + existing_object + .as_ref() + .and_then(|obj| obj.versions().iter().map(|v| v.timestamp).max()) + .map(|t| std::cmp::max(t + 1, now_msec())) + .unwrap_or_else(now_msec) } |