diff options
Diffstat (limited to 'src')
47 files changed, 3472 insertions, 1404 deletions
diff --git a/src/api/admin/bucket.rs b/src/api/admin/bucket.rs index f0a4a9e7..17f46c30 100644 --- a/src/api/admin/bucket.rs +++ b/src/api/admin/bucket.rs @@ -14,6 +14,7 @@ use garage_model::bucket_alias_table::*; use garage_model::bucket_table::*; use garage_model::garage::Garage; use garage_model::permission::*; +use garage_model::s3::mpu_table; use garage_model::s3::object_table::*; use crate::admin::error::*; @@ -124,6 +125,14 @@ async fn bucket_info_results( .map(|x| x.filtered_values(&garage.system.ring.borrow())) .unwrap_or_default(); + let mpu_counters = garage + .mpu_counter_table + .table + .get(&bucket_id, &EmptyKey) + .await? + .map(|x| x.filtered_values(&garage.system.ring.borrow())) + .unwrap_or_default(); + let mut relevant_keys = HashMap::new(); for (k, _) in bucket .state @@ -208,12 +217,12 @@ async fn bucket_info_results( } }) .collect::<Vec<_>>(), - objects: counters.get(OBJECTS).cloned().unwrap_or_default(), - bytes: counters.get(BYTES).cloned().unwrap_or_default(), - unfinished_uploads: counters - .get(UNFINISHED_UPLOADS) - .cloned() - .unwrap_or_default(), + objects: *counters.get(OBJECTS).unwrap_or(&0), + bytes: *counters.get(BYTES).unwrap_or(&0), + unfinished_uploads: *counters.get(UNFINISHED_UPLOADS).unwrap_or(&0), + unfinished_multipart_uploads: *mpu_counters.get(mpu_table::UPLOADS).unwrap_or(&0), + unfinished_multipart_upload_parts: *mpu_counters.get(mpu_table::PARTS).unwrap_or(&0), + unfinished_multipart_upload_bytes: *mpu_counters.get(mpu_table::BYTES).unwrap_or(&0), quotas: ApiBucketQuotas { max_size: quotas.max_size, max_objects: quotas.max_objects, @@ -235,6 +244,9 @@ struct GetBucketInfoResult { objects: i64, bytes: i64, unfinished_uploads: i64, + unfinished_multipart_uploads: i64, + unfinished_multipart_upload_parts: i64, + unfinished_multipart_upload_bytes: i64, quotas: ApiBucketQuotas, } diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 98bf2b5a..a2c97ee5 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -92,7 +92,7 @@ fn get_cluster_layout(garage: &Arc<Garage>) -> GetClusterLayoutResponse { .map(|(k, _, v)| (hex::encode(k), v.0.clone())) .collect(), staged_role_changes: layout - .staging + .staging_roles .items() .iter() .filter(|(k, _, v)| layout.roles.get(k) != Some(v)) @@ -114,6 +114,7 @@ struct GetClusterStatusResponse { } #[derive(Serialize)] +#[serde(rename_all = "camelCase")] struct ConnectClusterNodesResponse { success: bool, error: Option<String>, @@ -128,6 +129,7 @@ struct GetClusterLayoutResponse { } #[derive(Serialize)] +#[serde(rename_all = "camelCase")] struct KnownNodeResp { addr: SocketAddr, is_up: bool, @@ -144,14 +146,14 @@ pub async fn handle_update_cluster_layout( let mut layout = garage.system.get_cluster_layout(); let mut roles = layout.roles.clone(); - roles.merge(&layout.staging); + roles.merge(&layout.staging_roles); for (node, role) in updates { let node = hex::decode(node).ok_or_bad_request("Invalid node identifier")?; let node = Uuid::try_from(&node).ok_or_bad_request("Invalid node identifier")?; layout - .staging + .staging_roles .merge(&roles.update_mutator(node, NodeRoleV(role))); } @@ -169,12 +171,14 @@ pub async fn handle_apply_cluster_layout( let param = parse_json_body::<ApplyRevertLayoutRequest>(req).await?; let layout = garage.system.get_cluster_layout(); - let layout = layout.apply_staged_changes(Some(param.version))?; + let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; + garage.system.update_cluster_layout(&layout).await?; Ok(Response::builder() - .status(StatusCode::NO_CONTENT) - .body(Body::empty())?) + .status(StatusCode::OK) + .header(http::header::CONTENT_TYPE, "text/plain") + .body(Body::from(msg.join("\n")))?) } pub async fn handle_revert_cluster_layout( @@ -195,6 +199,7 @@ pub async fn handle_revert_cluster_layout( type UpdateClusterLayoutRequest = HashMap<String, Option<NodeRole>>; #[derive(Deserialize)] +#[serde(rename_all = "camelCase")] struct ApplyRevertLayoutRequest { version: u64, } diff --git a/src/api/admin/key.rs b/src/api/admin/key.rs index d74ca361..25ba76f8 100644 --- a/src/api/admin/key.rs +++ b/src/api/admin/key.rs @@ -34,6 +34,7 @@ pub async fn handle_list_keys(garage: &Arc<Garage>) -> Result<Response<Body>, Er } #[derive(Serialize)] +#[serde(rename_all = "camelCase")] struct ListKeyResultItem { id: String, name: String, @@ -71,6 +72,7 @@ pub async fn handle_create_key( } #[derive(Deserialize)] +#[serde(rename_all = "camelCase")] struct CreateKeyRequest { name: String, } @@ -131,6 +133,7 @@ pub async fn handle_update_key( } #[derive(Deserialize)] +#[serde(rename_all = "camelCase")] struct UpdateKeyRequest { name: Option<String>, allow: Option<KeyPerm>, @@ -246,6 +249,7 @@ struct KeyInfoBucketResult { } #[derive(Serialize, Deserialize, Default)] +#[serde(rename_all = "camelCase")] pub(crate) struct ApiBucketKeyPerm { #[serde(default)] pub(crate) read: bool, diff --git a/src/api/s3/api_server.rs b/src/api/s3/api_server.rs index 27837297..5e793082 100644 --- a/src/api/s3/api_server.rs +++ b/src/api/s3/api_server.rs @@ -27,6 +27,7 @@ use crate::s3::cors::*; use crate::s3::delete::*; use crate::s3::get::*; use crate::s3::list::*; +use crate::s3::multipart::*; use crate::s3::post_object::handle_post_object; use crate::s3::put::*; use crate::s3::router::Endpoint; @@ -256,7 +257,7 @@ impl ApiHandler for S3ApiServer { bucket_name, bucket_id, delimiter: delimiter.map(|d| d.to_string()), - page_size: max_keys.map(|p| p.clamp(1, 1000)).unwrap_or(1000), + page_size: max_keys.unwrap_or(1000).clamp(1, 1000), prefix: prefix.unwrap_or_default(), urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false), }, @@ -286,7 +287,7 @@ impl ApiHandler for S3ApiServer { bucket_name, bucket_id, delimiter: delimiter.map(|d| d.to_string()), - page_size: max_keys.map(|p| p.clamp(1, 1000)).unwrap_or(1000), + page_size: max_keys.unwrap_or(1000).clamp(1, 1000), urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false), prefix: prefix.unwrap_or_default(), }, @@ -319,7 +320,7 @@ impl ApiHandler for S3ApiServer { bucket_name, bucket_id, delimiter: delimiter.map(|d| d.to_string()), - page_size: max_uploads.map(|p| p.clamp(1, 1000)).unwrap_or(1000), + page_size: max_uploads.unwrap_or(1000).clamp(1, 1000), prefix: prefix.unwrap_or_default(), urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false), }, @@ -343,7 +344,7 @@ impl ApiHandler for S3ApiServer { key, upload_id, part_number_marker: part_number_marker.map(|p| p.clamp(1, 10000)), - max_parts: max_parts.map(|p| p.clamp(1, 1000)).unwrap_or(1000), + max_parts: max_parts.unwrap_or(1000).clamp(1, 1000), }, ) .await diff --git a/src/api/s3/copy.rs b/src/api/s3/copy.rs index 7eb6459d..68b4f0c9 100644 --- a/src/api/s3/copy.rs +++ b/src/api/s3/copy.rs @@ -2,7 +2,7 @@ use std::pin::Pin; use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use futures::{stream, stream::Stream, StreamExt, TryFutureExt}; +use futures::{stream, stream::Stream, StreamExt}; use md5::{Digest as Md5Digest, Md5}; use bytes::Bytes; @@ -18,12 +18,14 @@ use garage_util::time::*; use garage_model::garage::Garage; use garage_model::key_table::Key; use garage_model::s3::block_ref_table::*; +use garage_model::s3::mpu_table::*; use garage_model::s3::object_table::*; use garage_model::s3::version_table::*; use crate::helpers::parse_bucket_key; use crate::s3::error::*; -use crate::s3::put::{decode_upload_id, get_headers}; +use crate::s3::multipart; +use crate::s3::put::get_headers; use crate::s3::xml::{self as s3_xml, xmlns_tag}; pub async fn handle_copy( @@ -92,7 +94,10 @@ pub async fn handle_copy( let tmp_dest_object_version = ObjectVersion { uuid: new_uuid, timestamp: new_timestamp, - state: ObjectVersionState::Uploading(new_meta.headers.clone()), + state: ObjectVersionState::Uploading { + headers: new_meta.headers.clone(), + multipart: false, + }, }; let tmp_dest_object = Object::new( dest_bucket_id, @@ -105,8 +110,14 @@ pub async fn handle_copy( // this means that the BlockRef entries linked to this version cannot be // marked as deleted (they are marked as deleted only if the Version // doesn't exist or is marked as deleted). - let mut dest_version = - Version::new(new_uuid, dest_bucket_id, dest_key.to_string(), false); + let mut dest_version = Version::new( + new_uuid, + VersionBacklink::Object { + bucket_id: dest_bucket_id, + key: dest_key.to_string(), + }, + false, + ); garage.version_table.insert(&dest_version).await?; // Fill in block list for version and insert block refs @@ -179,17 +190,13 @@ pub async fn handle_upload_part_copy( ) -> Result<Response<Body>, Error> { let copy_precondition = CopyPreconditionHeaders::parse(req)?; - let dest_version_uuid = decode_upload_id(upload_id)?; + let dest_upload_id = multipart::decode_upload_id(upload_id)?; let dest_key = dest_key.to_string(); - let (source_object, dest_object) = futures::try_join!( + let (source_object, (_, _, mut dest_mpu)) = futures::try_join!( get_copy_source(&garage, api_key, req), - garage - .object_table - .get(&dest_bucket_id, &dest_key) - .map_err(Error::from), + multipart::get_upload(&garage, &dest_bucket_id, &dest_key, &dest_upload_id) )?; - let dest_object = dest_object.ok_or(Error::NoSuchKey)?; let (source_object_version, source_version_data, source_version_meta) = extract_source_info(&source_object)?; @@ -217,15 +224,6 @@ pub async fn handle_upload_part_copy( }, }; - // Check destination version is indeed in uploading state - if !dest_object - .versions() - .iter() - .any(|v| v.uuid == dest_version_uuid && v.is_uploading()) - { - return Err(Error::NoSuchUpload); - } - // Check source version is not inlined match source_version_data { ObjectVersionData::DeleteMarker => unreachable!(), @@ -242,23 +240,11 @@ pub async fn handle_upload_part_copy( // Fetch source versin with its block list, // and destination version to check part hasn't yet been uploaded - let (source_version, dest_version) = futures::try_join!( - garage - .version_table - .get(&source_object_version.uuid, &EmptyKey), - garage.version_table.get(&dest_version_uuid, &EmptyKey), - )?; - let source_version = source_version.ok_or(Error::NoSuchKey)?; - - // Check this part number hasn't yet been uploaded - if let Some(dv) = dest_version { - if dv.has_part_number(part_number) { - return Err(Error::bad_request(format!( - "Part number {} has already been uploaded", - part_number - ))); - } - } + let source_version = garage + .version_table + .get(&source_object_version.uuid, &EmptyKey) + .await? + .ok_or(Error::NoSuchKey)?; // We want to reuse blocks from the source version as much as possible. // However, we still need to get the data from these blocks @@ -299,6 +285,33 @@ pub async fn handle_upload_part_copy( current_offset = block_end; } + // Calculate the identity of destination part: timestamp, version id + let dest_version_id = gen_uuid(); + let dest_mpu_part_key = MpuPartKey { + part_number, + timestamp: dest_mpu.next_timestamp(part_number), + }; + + // Create the uploaded part + dest_mpu.parts.clear(); + dest_mpu.parts.put( + dest_mpu_part_key, + MpuPart { + version: dest_version_id, + etag: None, + size: None, + }, + ); + garage.mpu_table.insert(&dest_mpu).await?; + + let mut dest_version = Version::new( + dest_version_id, + VersionBacklink::MultipartUpload { + upload_id: dest_upload_id, + }, + false, + ); + // Now, actually copy the blocks let mut md5hasher = Md5::new(); @@ -348,8 +361,8 @@ pub async fn handle_upload_part_copy( let must_upload = existing_block_hash.is_none(); let final_hash = existing_block_hash.unwrap_or_else(|| blake2sum(&data[..])); - let mut version = Version::new(dest_version_uuid, dest_bucket_id, dest_key.clone(), false); - version.blocks.put( + dest_version.blocks.clear(); + dest_version.blocks.put( VersionBlockKey { part_number, offset: current_offset, @@ -363,7 +376,7 @@ pub async fn handle_upload_part_copy( let block_ref = BlockRef { block: final_hash, - version: dest_version_uuid, + version: dest_version_id, deleted: false.into(), }; @@ -378,23 +391,33 @@ pub async fn handle_upload_part_copy( Ok(()) } }, - // Thing 2: we need to insert the block in the version - garage.version_table.insert(&version), - // Thing 3: we need to add a block reference - garage.block_ref_table.insert(&block_ref), + async { + // Thing 2: we need to insert the block in the version + garage.version_table.insert(&dest_version).await?; + // Thing 3: we need to add a block reference + garage.block_ref_table.insert(&block_ref).await + }, // Thing 4: we need to prefetch the next block defragmenter.next(), )?; - next_block = res.3; + next_block = res.2; } + assert_eq!(current_offset, source_range.length); + let data_md5sum = md5hasher.finalize(); let etag = hex::encode(data_md5sum); // Put the part's ETag in the Versiontable - let mut version = Version::new(dest_version_uuid, dest_bucket_id, dest_key.clone(), false); - version.parts_etags.put(part_number, etag.clone()); - garage.version_table.insert(&version).await?; + dest_mpu.parts.put( + dest_mpu_part_key, + MpuPart { + version: dest_version_id, + etag: Some(etag.clone()), + size: Some(current_offset), + }, + ); + garage.mpu_table.insert(&dest_mpu).await?; // LGTM let resp_xml = s3_xml::to_xml_with_header(&CopyPartResult { diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index cde7b461..5e682726 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -149,7 +149,6 @@ pub async fn handle_head( let (part_offset, part_end) = calculate_part_bounds(&version, pn).ok_or(Error::InvalidPart)?; - let n_parts = version.parts_etags.items().len(); Ok(object_headers(object_version, version_meta) .header(CONTENT_LENGTH, format!("{}", part_end - part_offset)) @@ -162,7 +161,7 @@ pub async fn handle_head( version_meta.size ), ) - .header(X_AMZ_MP_PARTS_COUNT, format!("{}", n_parts)) + .header(X_AMZ_MP_PARTS_COUNT, format!("{}", version.n_parts()?)) .status(StatusCode::PARTIAL_CONTENT) .body(Body::empty())?) } @@ -376,7 +375,6 @@ async fn handle_get_part( let (begin, end) = calculate_part_bounds(&version, part_number).ok_or(Error::InvalidPart)?; - let n_parts = version.parts_etags.items().len(); let body = body_from_blocks_range(garage, version.blocks.items(), begin, end); @@ -386,7 +384,7 @@ async fn handle_get_part( CONTENT_RANGE, format!("bytes {}-{}/{}", begin, end - 1, version_meta.size), ) - .header(X_AMZ_MP_PARTS_COUNT, format!("{}", n_parts)) + .header(X_AMZ_MP_PARTS_COUNT, format!("{}", version.n_parts()?)) .body(body)?) } _ => unreachable!(), diff --git a/src/api/s3/list.rs b/src/api/s3/list.rs index 5cb0d65a..33d62518 100644 --- a/src/api/s3/list.rs +++ b/src/api/s3/list.rs @@ -1,4 +1,3 @@ -use std::cmp::Ordering; use std::collections::{BTreeMap, BTreeSet}; use std::iter::{Iterator, Peekable}; use std::sync::Arc; @@ -11,15 +10,15 @@ use garage_util::error::Error as GarageError; use garage_util::time::*; use garage_model::garage::Garage; +use garage_model::s3::mpu_table::*; use garage_model::s3::object_table::*; -use garage_model::s3::version_table::Version; -use garage_table::{EmptyKey, EnumerationOrder}; +use garage_table::EnumerationOrder; use crate::encoding::*; use crate::helpers::key_after_prefix; use crate::s3::error::*; -use crate::s3::put as s3_put; +use crate::s3::multipart as s3_multipart; use crate::s3::xml as s3_xml; const DUMMY_NAME: &str = "Dummy Key"; @@ -176,7 +175,9 @@ pub async fn handle_list_multipart_upload( t.get_range( &bucket, key, - Some(ObjectFilter::IsUploading), + Some(ObjectFilter::IsUploading { + check_multipart: Some(true), + }), count, EnumerationOrder::Forward, ) @@ -272,24 +273,26 @@ pub async fn handle_list_parts( ) -> Result<Response<Body>, Error> { debug!("ListParts {:?}", query); - let upload_id = s3_put::decode_upload_id(&query.upload_id)?; + let upload_id = s3_multipart::decode_upload_id(&query.upload_id)?; - let (object, version) = futures::try_join!( - garage.object_table.get(&query.bucket_id, &query.key), - garage.version_table.get(&upload_id, &EmptyKey), - )?; + let (_, _, mpu) = + s3_multipart::get_upload(&garage, &query.bucket_id, &query.key, &upload_id).await?; - let (info, next) = fetch_part_info(query, object, version, upload_id)?; + let (info, next) = fetch_part_info(query, &mpu)?; let result = s3_xml::ListPartsResult { xmlns: (), + + // Query parameters bucket: s3_xml::Value(query.bucket_name.to_string()), key: s3_xml::Value(query.key.to_string()), upload_id: s3_xml::Value(query.upload_id.to_string()), part_number_marker: query.part_number_marker.map(|e| s3_xml::IntValue(e as i64)), - next_part_number_marker: next.map(|e| s3_xml::IntValue(e as i64)), max_parts: s3_xml::IntValue(query.max_parts as i64), - is_truncated: s3_xml::Value(next.map(|_| "true").unwrap_or("false").to_string()), + + // Result values + next_part_number_marker: next.map(|e| s3_xml::IntValue(e as i64)), + is_truncated: s3_xml::Value(format!("{}", next.is_some())), parts: info .iter() .map(|part| s3_xml::PartItem { @@ -299,6 +302,8 @@ pub async fn handle_list_parts( size: s3_xml::IntValue(part.size as i64), }) .collect(), + + // Dummy result values (unsupported features) initiator: s3_xml::Initiator { display_name: s3_xml::Value(DUMMY_NAME.to_string()), id: s3_xml::Value(DUMMY_KEY.to_string()), @@ -335,8 +340,8 @@ struct UploadInfo { } #[derive(Debug, PartialEq)] -struct PartInfo { - etag: String, +struct PartInfo<'a> { + etag: &'a str, timestamp: u64, part_number: u64, size: u64, @@ -456,107 +461,51 @@ where } } -fn fetch_part_info( +fn fetch_part_info<'a>( query: &ListPartsQuery, - object: Option<Object>, - version: Option<Version>, - upload_id: Uuid, -) -> Result<(Vec<PartInfo>, Option<u64>), Error> { - // Check results - let object = object.ok_or(Error::NoSuchKey)?; - - let obj_version = object - .versions() - .iter() - .find(|v| v.uuid == upload_id && v.is_uploading()) - .ok_or(Error::NoSuchUpload)?; - - let version = version.ok_or(Error::NoSuchKey)?; - - // Cut the beginning of our 2 vectors if required - let (etags, blocks) = match &query.part_number_marker { - Some(marker) => { - let next = marker + 1; - - let part_idx = into_ok_or_err( - version - .parts_etags - .items() - .binary_search_by(|(part_num, _)| part_num.cmp(&next)), - ); - let parts = &version.parts_etags.items()[part_idx..]; - - let block_idx = into_ok_or_err( - version - .blocks - .items() - .binary_search_by(|(vkey, _)| vkey.part_number.cmp(&next)), - ); - let blocks = &version.blocks.items()[block_idx..]; - - (parts, blocks) - } - None => (version.parts_etags.items(), version.blocks.items()), - }; - - // Use the block vector to compute a (part_number, size) vector - let mut size = Vec::<(u64, u64)>::new(); - blocks.iter().for_each(|(key, val)| { - let mut new_size = val.size; - match size.pop() { - Some((part_number, size)) if part_number == key.part_number => new_size += size, - Some(v) => size.push(v), - None => (), - } - size.push((key.part_number, new_size)) - }); - - // Merge the etag vector and size vector to build a PartInfo vector - let max_parts = query.max_parts as usize; - let (mut etag_iter, mut size_iter) = (etags.iter().peekable(), size.iter().peekable()); - - let mut info = Vec::<PartInfo>::with_capacity(max_parts); - - while info.len() < max_parts { - match (etag_iter.peek(), size_iter.peek()) { - (Some((ep, etag)), Some((sp, size))) => match ep.cmp(sp) { - Ordering::Less => { - debug!("ETag information ignored due to missing corresponding block information. Query: {:?}", query); - etag_iter.next(); - } - Ordering::Equal => { - info.push(PartInfo { - etag: etag.to_string(), - timestamp: obj_version.timestamp, - part_number: *ep, - size: *size, - }); - etag_iter.next(); - size_iter.next(); + mpu: &'a MultipartUpload, +) -> Result<(Vec<PartInfo<'a>>, Option<u64>), Error> { + assert!((1..=1000).contains(&query.max_parts)); // see s3/api_server.rs + + // Parse multipart upload part list, removing parts not yet finished + // and failed part uploads that were overwritten + let mut parts: Vec<PartInfo<'a>> = Vec::with_capacity(mpu.parts.items().len()); + for (pk, p) in mpu.parts.items().iter() { + if let (Some(etag), Some(size)) = (&p.etag, p.size) { + let part_info = PartInfo { + part_number: pk.part_number, + timestamp: pk.timestamp, + etag, + size, + }; + match parts.last_mut() { + Some(lastpart) if lastpart.part_number == pk.part_number => { + *lastpart = part_info; } - Ordering::Greater => { - debug!("Block information ignored due to missing corresponding ETag information. Query: {:?}", query); - size_iter.next(); + _ => { + parts.push(part_info); } - }, - (None, None) => return Ok((info, None)), - _ => { - debug!( - "Additional block or ETag information ignored. Query: {:?}", - query - ); - return Ok((info, None)); } } } - match info.last() { - Some(part_info) => { - let pagination = Some(part_info.part_number); - Ok((info, pagination)) - } - None => Ok((info, None)), + // Cut the beginning if we have a marker + if let Some(marker) = &query.part_number_marker { + let next = marker + 1; + let part_idx = parts + .binary_search_by(|part| part.part_number.cmp(&next)) + .unwrap_or_else(|x| x); + parts = parts.split_off(part_idx); + } + + // Cut the end if we have too many parts + if parts.len() > query.max_parts as usize { + parts.truncate(query.max_parts as usize); + let pagination = Some(parts.last().unwrap().part_number); + return Ok((parts, pagination)); } + + Ok((parts, None)) } /* @@ -651,7 +600,7 @@ impl ListMultipartUploadsQuery { }), uuid => Ok(RangeBegin::AfterUpload { key: key_marker.to_string(), - upload: s3_put::decode_upload_id(uuid)?, + upload: s3_multipart::decode_upload_id(uuid)?, }), }, @@ -843,7 +792,7 @@ impl ExtractAccumulator for UploadAccumulator { let mut uploads_for_key = object .versions() .iter() - .filter(|x| x.is_uploading()) + .filter(|x| x.is_uploading(Some(true))) .collect::<Vec<&ObjectVersion>>(); // S3 logic requires lexicographically sorted upload ids. @@ -918,14 +867,6 @@ impl ExtractAccumulator for UploadAccumulator { * Utility functions */ -/// This is a stub for Result::into_ok_or_err that is not yet in Rust stable -fn into_ok_or_err<T>(r: Result<T, T>) -> T { - match r { - Ok(r) => r, - Err(r) => r, - } -} - /// Returns the common prefix of the object given the query prefix and delimiter fn common_prefix<'a>(object: &'a Object, query: &ListQueryCommon) -> Option<&'a str> { match &query.delimiter { @@ -951,7 +892,6 @@ fn uriencode_maybe(s: &str, yes: bool) -> s3_xml::Value { #[cfg(test)] mod tests { use super::*; - use garage_model::s3::version_table::*; use garage_util::*; use std::iter::FromIterator; @@ -991,10 +931,13 @@ mod tests { ObjectVersion { uuid: Uuid::from(uuid), timestamp: TS, - state: ObjectVersionState::Uploading(ObjectVersionHeaders { - content_type: "text/plain".to_string(), - other: BTreeMap::<String, String>::new(), - }), + state: ObjectVersionState::Uploading { + multipart: true, + headers: ObjectVersionHeaders { + content_type: "text/plain".to_string(), + other: BTreeMap::<String, String>::new(), + }, + }, } } @@ -1169,83 +1112,77 @@ mod tests { Ok(()) } - fn version() -> Version { + fn mpu() -> MultipartUpload { let uuid = Uuid::from([0x08; 32]); - let blocks = vec![ + let parts = vec![ ( - VersionBlockKey { + MpuPartKey { part_number: 1, - offset: 1, + timestamp: TS, }, - VersionBlock { - hash: uuid, - size: 3, + MpuPart { + version: uuid, + size: Some(3), + etag: Some("etag1".into()), }, ), ( - VersionBlockKey { - part_number: 1, - offset: 2, + MpuPartKey { + part_number: 2, + timestamp: TS, }, - VersionBlock { - hash: uuid, - size: 2, + MpuPart { + version: uuid, + size: None, + etag: None, }, ), ( - VersionBlockKey { - part_number: 2, - offset: 1, + MpuPartKey { + part_number: 3, + timestamp: TS, }, - VersionBlock { - hash: uuid, - size: 8, + MpuPart { + version: uuid, + size: Some(10), + etag: Some("etag2".into()), }, ), ( - VersionBlockKey { + MpuPartKey { part_number: 5, - offset: 1, + timestamp: TS, }, - VersionBlock { - hash: uuid, - size: 7, + MpuPart { + version: uuid, + size: Some(7), + etag: Some("etag3".into()), }, ), ( - VersionBlockKey { + MpuPartKey { part_number: 8, - offset: 1, + timestamp: TS, }, - VersionBlock { - hash: uuid, - size: 5, + MpuPart { + version: uuid, + size: Some(5), + etag: Some("etag4".into()), }, ), ]; - let etags = vec![ - (1, "etag1".to_string()), - (3, "etag2".to_string()), - (5, "etag3".to_string()), - (8, "etag4".to_string()), - (9, "etag5".to_string()), - ]; - Version { - bucket_id: uuid, - key: "a".to_string(), - uuid, + MultipartUpload { + upload_id: uuid, + timestamp: TS, deleted: false.into(), - blocks: crdt::Map::<VersionBlockKey, VersionBlock>::from_iter(blocks), - parts_etags: crdt::Map::<u64, String>::from_iter(etags), + parts: crdt::Map::<MpuPartKey, MpuPart>::from_iter(parts), + bucket_id: uuid, + key: "a".into(), } } - fn obj() -> Object { - Object::new(bucket(), "d".to_string(), vec![objup_version([0x08; 32])]) - } - #[test] fn test_fetch_part_info() -> Result<(), Error> { let uuid = Uuid::from([0x08; 32]); @@ -1258,82 +1195,85 @@ mod tests { max_parts: 2, }; - assert!( - fetch_part_info(&query, None, None, uuid).is_err(), - "No object and version should fail" - ); - assert!( - fetch_part_info(&query, Some(obj()), None, uuid).is_err(), - "No version should faild" - ); - assert!( - fetch_part_info(&query, None, Some(version()), uuid).is_err(), - "No object should fail" - ); + let mpu = mpu(); // Start from the beginning but with limited size to trigger pagination - let (info, pagination) = fetch_part_info(&query, Some(obj()), Some(version()), uuid)?; - assert_eq!(pagination.unwrap(), 5); + let (info, pagination) = fetch_part_info(&query, &mpu)?; + assert_eq!(pagination.unwrap(), 3); assert_eq!( info, vec![ PartInfo { - etag: "etag1".to_string(), + etag: "etag1", timestamp: TS, part_number: 1, - size: 5 + size: 3 }, PartInfo { - etag: "etag3".to_string(), + etag: "etag2", timestamp: TS, - part_number: 5, - size: 7 + part_number: 3, + size: 10 }, ] ); // Use previous pagination to make a new request query.part_number_marker = Some(pagination.unwrap()); - let (info, pagination) = fetch_part_info(&query, Some(obj()), Some(version()), uuid)?; + let (info, pagination) = fetch_part_info(&query, &mpu)?; assert!(pagination.is_none()); assert_eq!( info, - vec![PartInfo { - etag: "etag4".to_string(), - timestamp: TS, - part_number: 8, - size: 5 - },] + vec![ + PartInfo { + etag: "etag3", + timestamp: TS, + part_number: 5, + size: 7 + }, + PartInfo { + etag: "etag4", + timestamp: TS, + part_number: 8, + size: 5 + }, + ] ); // Trying to access a part that is way larger than registered ones query.part_number_marker = Some(9999); - let (info, pagination) = fetch_part_info(&query, Some(obj()), Some(version()), uuid)?; + let (info, pagination) = fetch_part_info(&query, &mpu)?; assert!(pagination.is_none()); assert_eq!(info, vec![]); // Try without any limitation query.max_parts = 1000; query.part_number_marker = None; - let (info, pagination) = fetch_part_info(&query, Some(obj()), Some(version()), uuid)?; + let (info, pagination) = fetch_part_info(&query, &mpu)?; assert!(pagination.is_none()); assert_eq!( info, vec![ PartInfo { - etag: "etag1".to_string(), + etag: "etag1", timestamp: TS, part_number: 1, - size: 5 + size: 3 + }, + PartInfo { + etag: "etag2", + timestamp: TS, + part_number: 3, + size: 10 }, PartInfo { - etag: "etag3".to_string(), + etag: "etag3", timestamp: TS, part_number: 5, size: 7 }, PartInfo { - etag: "etag4".to_string(), + etag: "etag4", timestamp: TS, part_number: 8, size: 5 diff --git a/src/api/s3/mod.rs b/src/api/s3/mod.rs index 7b56d4d8..b5237bf7 100644 --- a/src/api/s3/mod.rs +++ b/src/api/s3/mod.rs @@ -7,6 +7,7 @@ pub mod cors; mod delete; pub mod get; mod list; +mod multipart; mod post_object; mod put; mod website; diff --git a/src/api/s3/multipart.rs b/src/api/s3/multipart.rs new file mode 100644 index 00000000..52ea8e78 --- /dev/null +++ b/src/api/s3/multipart.rs @@ -0,0 +1,465 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use futures::prelude::*; +use hyper::body::Body; +use hyper::{Request, Response}; +use md5::{Digest as Md5Digest, Md5}; + +use garage_table::*; +use garage_util::async_hash::*; +use garage_util::data::*; +use garage_util::time::*; + +use garage_model::bucket_table::Bucket; +use garage_model::garage::Garage; +use garage_model::s3::block_ref_table::*; +use garage_model::s3::mpu_table::*; +use garage_model::s3::object_table::*; +use garage_model::s3::version_table::*; + +use crate::s3::error::*; +use crate::s3::put::*; +use crate::s3::xml as s3_xml; +use crate::signature::verify_signed_content; + +// ---- + +pub async fn handle_create_multipart_upload( + garage: Arc<Garage>, + req: &Request<Body>, + bucket_name: &str, + bucket_id: Uuid, + key: &str, +) -> Result<Response<Body>, Error> { + let upload_id = gen_uuid(); + let timestamp = now_msec(); + let headers = get_headers(req.headers())?; + + // Create object in object table + let object_version = ObjectVersion { + uuid: upload_id, + timestamp, + state: ObjectVersionState::Uploading { + multipart: true, + headers, + }, + }; + let object = Object::new(bucket_id, key.to_string(), vec![object_version]); + garage.object_table.insert(&object).await?; + + // Create multipart upload in mpu table + // This multipart upload will hold references to uploaded parts + // (which are entries in the Version table) + let mpu = MultipartUpload::new(upload_id, timestamp, bucket_id, key.into(), false); + garage.mpu_table.insert(&mpu).await?; + + // Send success response + let result = s3_xml::InitiateMultipartUploadResult { + xmlns: (), + bucket: s3_xml::Value(bucket_name.to_string()), + key: s3_xml::Value(key.to_string()), + upload_id: s3_xml::Value(hex::encode(upload_id)), + }; + let xml = s3_xml::to_xml_with_header(&result)?; + + Ok(Response::new(Body::from(xml.into_bytes()))) +} + +pub async fn handle_put_part( + garage: Arc<Garage>, + req: Request<Body>, + bucket_id: Uuid, + key: &str, + part_number: u64, + upload_id: &str, + content_sha256: Option<Hash>, +) -> Result<Response<Body>, Error> { + let upload_id = decode_upload_id(upload_id)?; + + let content_md5 = match req.headers().get("content-md5") { + Some(x) => Some(x.to_str()?.to_string()), + None => None, + }; + + // Read first chuck, and at the same time try to get object to see if it exists + let key = key.to_string(); + + let body = req.into_body().map_err(Error::from); + let mut chunker = StreamChunker::new(body, garage.config.block_size); + + let ((_, _, mut mpu), first_block) = futures::try_join!( + get_upload(&garage, &bucket_id, &key, &upload_id), + chunker.next(), + )?; + + // Check object is valid and part can be accepted + let first_block = first_block.ok_or_bad_request("Empty body")?; + + // Calculate part identity: timestamp, version id + let version_uuid = gen_uuid(); + let mpu_part_key = MpuPartKey { + part_number, + timestamp: mpu.next_timestamp(part_number), + }; + + // The following consists in many steps that can each fail. + // Keep track that some cleanup will be needed if things fail + // before everything is finished (cleanup is done using the Drop trait). + let mut interrupted_cleanup = InterruptedCleanup(Some(InterruptedCleanupInner { + garage: garage.clone(), + upload_id, + version_uuid, + })); + + // Create version and link version from MPU + mpu.parts.clear(); + mpu.parts.put( + mpu_part_key, + MpuPart { + version: version_uuid, + etag: None, + size: None, + }, + ); + garage.mpu_table.insert(&mpu).await?; + + let version = Version::new( + version_uuid, + VersionBacklink::MultipartUpload { upload_id }, + false, + ); + garage.version_table.insert(&version).await?; + + // Copy data to version + let first_block_hash = async_blake2sum(first_block.clone()).await; + + let (total_size, data_md5sum, data_sha256sum) = read_and_put_blocks( + &garage, + &version, + part_number, + first_block, + first_block_hash, + &mut chunker, + ) + .await?; + + // Verify that checksums map + ensure_checksum_matches( + data_md5sum.as_slice(), + data_sha256sum, + content_md5.as_deref(), + content_sha256, + )?; + + // Store part etag in version + let data_md5sum_hex = hex::encode(data_md5sum); + mpu.parts.put( + mpu_part_key, + MpuPart { + version: version_uuid, + etag: Some(data_md5sum_hex.clone()), + size: Some(total_size), + }, + ); + garage.mpu_table.insert(&mpu).await?; + + // We were not interrupted, everything went fine. + // We won't have to clean up on drop. + interrupted_cleanup.cancel(); + + let response = Response::builder() + .header("ETag", format!("\"{}\"", data_md5sum_hex)) + .body(Body::empty()) + .unwrap(); + Ok(response) +} + +struct InterruptedCleanup(Option<InterruptedCleanupInner>); +struct InterruptedCleanupInner { + garage: Arc<Garage>, + upload_id: Uuid, + version_uuid: Uuid, +} + +impl InterruptedCleanup { + fn cancel(&mut self) { + drop(self.0.take()); + } +} +impl Drop for InterruptedCleanup { + fn drop(&mut self) { + if let Some(info) = self.0.take() { + tokio::spawn(async move { + let version = Version::new( + info.version_uuid, + VersionBacklink::MultipartUpload { + upload_id: info.upload_id, + }, + true, + ); + if let Err(e) = info.garage.version_table.insert(&version).await { + warn!("Cannot cleanup after aborted UploadPart: {}", e); + } + }); + } + } +} + +pub async fn handle_complete_multipart_upload( + garage: Arc<Garage>, + req: Request<Body>, + bucket_name: &str, + bucket: &Bucket, + key: &str, + upload_id: &str, + content_sha256: Option<Hash>, +) -> Result<Response<Body>, Error> { + let body = hyper::body::to_bytes(req.into_body()).await?; + + if let Some(content_sha256) = content_sha256 { + verify_signed_content(content_sha256, &body[..])?; + } + + let body_xml = roxmltree::Document::parse(std::str::from_utf8(&body)?)?; + let body_list_of_parts = parse_complete_multipart_upload_body(&body_xml) + .ok_or_bad_request("Invalid CompleteMultipartUpload XML")?; + debug!( + "CompleteMultipartUpload list of parts: {:?}", + body_list_of_parts + ); + + let upload_id = decode_upload_id(upload_id)?; + + // Get object and multipart upload + let key = key.to_string(); + let (_, mut object_version, mpu) = get_upload(&garage, &bucket.id, &key, &upload_id).await?; + + if mpu.parts.is_empty() { + return Err(Error::bad_request("No data was uploaded")); + } + + let headers = match object_version.state { + ObjectVersionState::Uploading { headers, .. } => headers, + _ => unreachable!(), + }; + + // Check that part numbers are an increasing sequence. + // (it doesn't need to start at 1 nor to be a continuous sequence, + // see discussion in #192) + if body_list_of_parts.is_empty() { + return Err(Error::EntityTooSmall); + } + if !body_list_of_parts + .iter() + .zip(body_list_of_parts.iter().skip(1)) + .all(|(p1, p2)| p1.part_number < p2.part_number) + { + return Err(Error::InvalidPartOrder); + } + + // Check that the list of parts they gave us corresponds to parts we have here + debug!("Parts stored in multipart upload: {:?}", mpu.parts.items()); + let mut have_parts = HashMap::new(); + for (pk, pv) in mpu.parts.items().iter() { + have_parts.insert(pk.part_number, pv); + } + let mut parts = vec![]; + for req_part in body_list_of_parts.iter() { + match have_parts.get(&req_part.part_number) { + Some(part) if part.etag.as_ref() == Some(&req_part.etag) && part.size.is_some() => { + parts.push(*part) + } + _ => return Err(Error::InvalidPart), + } + } + + let grg = &garage; + let parts_versions = futures::future::try_join_all(parts.iter().map(|p| async move { + grg.version_table + .get(&p.version, &EmptyKey) + .await? + .ok_or_internal_error("Part version missing from version table") + })) + .await?; + + // Create final version and block refs + let mut final_version = Version::new( + upload_id, + VersionBacklink::Object { + bucket_id: bucket.id, + key: key.to_string(), + }, + false, + ); + for (part_number, part_version) in parts_versions.iter().enumerate() { + if part_version.deleted.get() { + return Err(Error::InvalidPart); + } + for (vbk, vb) in part_version.blocks.items().iter() { + final_version.blocks.put( + VersionBlockKey { + part_number: (part_number + 1) as u64, + offset: vbk.offset, + }, + *vb, + ); + } + } + garage.version_table.insert(&final_version).await?; + + let block_refs = final_version.blocks.items().iter().map(|(_, b)| BlockRef { + block: b.hash, + version: upload_id, + deleted: false.into(), + }); + garage.block_ref_table.insert_many(block_refs).await?; + + // Calculate etag of final object + // To understand how etags are calculated, read more here: + // https://teppen.io/2018/06/23/aws_s3_etags/ + let mut etag_md5_hasher = Md5::new(); + for part in parts.iter() { + etag_md5_hasher.update(part.etag.as_ref().unwrap().as_bytes()); + } + let etag = format!( + "{}-{}", + hex::encode(etag_md5_hasher.finalize()), + parts.len() + ); + + // Calculate total size of final object + let total_size = parts.iter().map(|x| x.size.unwrap()).sum(); + + if let Err(e) = check_quotas(&garage, bucket, &key, total_size).await { + object_version.state = ObjectVersionState::Aborted; + let final_object = Object::new(bucket.id, key.clone(), vec![object_version]); + garage.object_table.insert(&final_object).await?; + + return Err(e); + } + + // Write final object version + object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( + ObjectVersionMeta { + headers, + size: total_size, + etag: etag.clone(), + }, + final_version.blocks.items()[0].1.hash, + )); + + let final_object = Object::new(bucket.id, key.clone(), vec![object_version]); + garage.object_table.insert(&final_object).await?; + + // Send response saying ok we're done + let result = s3_xml::CompleteMultipartUploadResult { + xmlns: (), + location: None, + bucket: s3_xml::Value(bucket_name.to_string()), + key: s3_xml::Value(key), + etag: s3_xml::Value(format!("\"{}\"", etag)), + }; + let xml = s3_xml::to_xml_with_header(&result)?; + + Ok(Response::new(Body::from(xml.into_bytes()))) +} + +pub async fn handle_abort_multipart_upload( + garage: Arc<Garage>, + bucket_id: Uuid, + key: &str, + upload_id: &str, +) -> Result<Response<Body>, Error> { + let upload_id = decode_upload_id(upload_id)?; + + let (_, mut object_version, _) = + get_upload(&garage, &bucket_id, &key.to_string(), &upload_id).await?; + + object_version.state = ObjectVersionState::Aborted; + let final_object = Object::new(bucket_id, key.to_string(), vec![object_version]); + garage.object_table.insert(&final_object).await?; + + Ok(Response::new(Body::from(vec![]))) +} + +// ======== helpers ============ + +#[allow(clippy::ptr_arg)] +pub(crate) async fn get_upload( + garage: &Garage, + bucket_id: &Uuid, + key: &String, + upload_id: &Uuid, +) -> Result<(Object, ObjectVersion, MultipartUpload), Error> { + let (object, mpu) = futures::try_join!( + garage.object_table.get(bucket_id, key).map_err(Error::from), + garage + .mpu_table + .get(upload_id, &EmptyKey) + .map_err(Error::from), + )?; + + let object = object.ok_or(Error::NoSuchUpload)?; + let mpu = mpu.ok_or(Error::NoSuchUpload)?; + + let object_version = object + .versions() + .iter() + .find(|v| v.uuid == *upload_id && v.is_uploading(Some(true))) + .ok_or(Error::NoSuchUpload)? + .clone(); + + Ok((object, object_version, mpu)) +} + +pub fn decode_upload_id(id: &str) -> Result<Uuid, Error> { + let id_bin = hex::decode(id).map_err(|_| Error::NoSuchUpload)?; + if id_bin.len() != 32 { + return Err(Error::NoSuchUpload); + } + let mut uuid = [0u8; 32]; + uuid.copy_from_slice(&id_bin[..]); + Ok(Uuid::from(uuid)) +} + +#[derive(Debug)] +struct CompleteMultipartUploadPart { + etag: String, + part_number: u64, +} + +fn parse_complete_multipart_upload_body( + xml: &roxmltree::Document, +) -> Option<Vec<CompleteMultipartUploadPart>> { + let mut parts = vec![]; + + let root = xml.root(); + let cmu = root.first_child()?; + if !cmu.has_tag_name("CompleteMultipartUpload") { + return None; + } + + for item in cmu.children() { + // Only parse <Part> nodes + if !item.is_element() { + continue; + } + + if item.has_tag_name("Part") { + let etag = item.children().find(|e| e.has_tag_name("ETag"))?.text()?; + let part_number = item + .children() + .find(|e| e.has_tag_name("PartNumber"))? + .text()?; + parts.push(CompleteMultipartUploadPart { + etag: etag.trim_matches('"').to_string(), + part_number: part_number.parse().ok()?, + }); + } else { + return None; + } + } + + Some(parts) +} diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index 350ab884..c7ac5030 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, BTreeSet, HashMap}; +use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; use base64::prelude::*; @@ -30,8 +30,6 @@ use garage_model::s3::object_table::*; use garage_model::s3::version_table::*; use crate::s3::error::*; -use crate::s3::xml as s3_xml; -use crate::signature::verify_signed_content; pub async fn handle_put( garage: Arc<Garage>, @@ -123,20 +121,23 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( // The following consists in many steps that can each fail. // Keep track that some cleanup will be needed if things fail // before everything is finished (cleanup is done using the Drop trait). - let mut interrupted_cleanup = InterruptedCleanup(Some(( - garage.clone(), - bucket.id, - key.into(), + let mut interrupted_cleanup = InterruptedCleanup(Some(InterruptedCleanupInner { + garage: garage.clone(), + bucket_id: bucket.id, + key: key.into(), version_uuid, version_timestamp, - ))); + })); // Write version identifier in object table so that we have a trace // that we are uploading something let mut object_version = ObjectVersion { uuid: version_uuid, timestamp: version_timestamp, - state: ObjectVersionState::Uploading(headers.clone()), + state: ObjectVersionState::Uploading { + headers: headers.clone(), + multipart: false, + }, }; let object = Object::new(bucket.id, key.into(), vec![object_version.clone()]); garage.object_table.insert(&object).await?; @@ -145,7 +146,14 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( // Write this entry now, even with empty block list, // to prevent block_ref entries from being deleted (they can be deleted // if the reference a version that isn't found in the version table) - let version = Version::new(version_uuid, bucket.id, key.into(), false); + let version = Version::new( + version_uuid, + VersionBacklink::Object { + bucket_id: bucket.id, + key: key.into(), + }, + false, + ); garage.version_table.insert(&version).await?; // Transfer data and verify checksum @@ -192,7 +200,7 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( /// Validate MD5 sum against content-md5 header /// and sha256sum against signed content-sha256 -fn ensure_checksum_matches( +pub(crate) fn ensure_checksum_matches( data_md5sum: &[u8], data_sha256sum: garage_util::data::FixedBytes32, content_md5: Option<&str>, @@ -218,7 +226,7 @@ fn ensure_checksum_matches( } /// Check that inserting this object with this size doesn't exceed bucket quotas -async fn check_quotas( +pub(crate) async fn check_quotas( garage: &Arc<Garage>, bucket: &Bucket, key: &str, @@ -275,7 +283,7 @@ async fn check_quotas( Ok(()) } -async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + Unpin>( +pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + Unpin>( garage: &Garage, version: &Version, part_number: u64, @@ -381,7 +389,7 @@ async fn put_block_meta( Ok(()) } -struct StreamChunker<S: Stream<Item = Result<Bytes, Error>>> { +pub(crate) struct StreamChunker<S: Stream<Item = Result<Bytes, Error>>> { stream: S, read_all: bool, block_size: usize, @@ -389,7 +397,7 @@ struct StreamChunker<S: Stream<Item = Result<Bytes, Error>>> { } impl<S: Stream<Item = Result<Bytes, Error>> + Unpin> StreamChunker<S> { - fn new(stream: S, block_size: usize) -> Self { + pub(crate) fn new(stream: S, block_size: usize) -> Self { Self { stream, read_all: false, @@ -398,7 +406,7 @@ impl<S: Stream<Item = Result<Bytes, Error>> + Unpin> StreamChunker<S> { } } - async fn next(&mut self) -> Result<Option<Bytes>, Error> { + pub(crate) async fn next(&mut self) -> Result<Option<Bytes>, Error> { while !self.read_all && self.buf.len() < self.block_size { if let Some(block) = self.stream.next().await { let bytes = block?; @@ -425,7 +433,14 @@ pub fn put_response(version_uuid: Uuid, md5sum_hex: String) -> Response<Body> { .unwrap() } -struct InterruptedCleanup(Option<(Arc<Garage>, Uuid, String, Uuid, u64)>); +struct InterruptedCleanup(Option<InterruptedCleanupInner>); +struct InterruptedCleanupInner { + garage: Arc<Garage>, + bucket_id: Uuid, + key: String, + version_uuid: Uuid, + version_timestamp: u64, +} impl InterruptedCleanup { fn cancel(&mut self) { @@ -434,15 +449,15 @@ impl InterruptedCleanup { } impl Drop for InterruptedCleanup { fn drop(&mut self) { - if let Some((garage, bucket_id, key, version_uuid, version_ts)) = self.0.take() { + if let Some(info) = self.0.take() { tokio::spawn(async move { let object_version = ObjectVersion { - uuid: version_uuid, - timestamp: version_ts, + uuid: info.version_uuid, + timestamp: info.version_timestamp, state: ObjectVersionState::Aborted, }; - let object = Object::new(bucket_id, key, vec![object_version]); - if let Err(e) = garage.object_table.insert(&object).await { + let object = Object::new(info.bucket_id, info.key, vec![object_version]); + if let Err(e) = info.garage.object_table.insert(&object).await { warn!("Cannot cleanup after aborted PutObject: {}", e); } }); @@ -450,326 +465,9 @@ impl Drop for InterruptedCleanup { } } -// ---- - -pub async fn handle_create_multipart_upload( - garage: Arc<Garage>, - req: &Request<Body>, - bucket_name: &str, - bucket_id: Uuid, - key: &str, -) -> Result<Response<Body>, Error> { - let version_uuid = gen_uuid(); - let headers = get_headers(req.headers())?; - - // Create object in object table - let object_version = ObjectVersion { - uuid: version_uuid, - timestamp: now_msec(), - state: ObjectVersionState::Uploading(headers), - }; - let object = Object::new(bucket_id, key.to_string(), vec![object_version]); - garage.object_table.insert(&object).await?; - - // Insert empty version so that block_ref entries refer to something - // (they are inserted concurrently with blocks in the version table, so - // there is the possibility that they are inserted before the version table - // is created, in which case it is allowed to delete them, e.g. in repair_*) - let version = Version::new(version_uuid, bucket_id, key.into(), false); - garage.version_table.insert(&version).await?; - - // Send success response - let result = s3_xml::InitiateMultipartUploadResult { - xmlns: (), - bucket: s3_xml::Value(bucket_name.to_string()), - key: s3_xml::Value(key.to_string()), - upload_id: s3_xml::Value(hex::encode(version_uuid)), - }; - let xml = s3_xml::to_xml_with_header(&result)?; - - Ok(Response::new(Body::from(xml.into_bytes()))) -} - -pub async fn handle_put_part( - garage: Arc<Garage>, - req: Request<Body>, - bucket_id: Uuid, - key: &str, - part_number: u64, - upload_id: &str, - content_sha256: Option<Hash>, -) -> Result<Response<Body>, Error> { - let version_uuid = decode_upload_id(upload_id)?; - - let content_md5 = match req.headers().get("content-md5") { - Some(x) => Some(x.to_str()?.to_string()), - None => None, - }; - - // Read first chuck, and at the same time try to get object to see if it exists - let key = key.to_string(); - - let body = req.into_body().map_err(Error::from); - let mut chunker = StreamChunker::new(body, garage.config.block_size); - - let (object, version, first_block) = futures::try_join!( - garage - .object_table - .get(&bucket_id, &key) - .map_err(Error::from), - garage - .version_table - .get(&version_uuid, &EmptyKey) - .map_err(Error::from), - chunker.next(), - )?; - - // Check object is valid and multipart block can be accepted - let first_block = first_block.ok_or_bad_request("Empty body")?; - let object = object.ok_or_bad_request("Object not found")?; +// ============ helpers ============ - if !object - .versions() - .iter() - .any(|v| v.uuid == version_uuid && v.is_uploading()) - { - return Err(Error::NoSuchUpload); - } - - // Check part hasn't already been uploaded - if let Some(v) = version { - if v.has_part_number(part_number) { - return Err(Error::bad_request(format!( - "Part number {} has already been uploaded", - part_number - ))); - } - } - - // Copy block to store - let version = Version::new(version_uuid, bucket_id, key, false); - - let first_block_hash = async_blake2sum(first_block.clone()).await; - - let (_, data_md5sum, data_sha256sum) = read_and_put_blocks( - &garage, - &version, - part_number, - first_block, - first_block_hash, - &mut chunker, - ) - .await?; - - // Verify that checksums map - ensure_checksum_matches( - data_md5sum.as_slice(), - data_sha256sum, - content_md5.as_deref(), - content_sha256, - )?; - - // Store part etag in version - let data_md5sum_hex = hex::encode(data_md5sum); - let mut version = version; - version - .parts_etags - .put(part_number, data_md5sum_hex.clone()); - garage.version_table.insert(&version).await?; - - let response = Response::builder() - .header("ETag", format!("\"{}\"", data_md5sum_hex)) - .body(Body::empty()) - .unwrap(); - Ok(response) -} - -pub async fn handle_complete_multipart_upload( - garage: Arc<Garage>, - req: Request<Body>, - bucket_name: &str, - bucket: &Bucket, - key: &str, - upload_id: &str, - content_sha256: Option<Hash>, -) -> Result<Response<Body>, Error> { - let body = hyper::body::to_bytes(req.into_body()).await?; - - if let Some(content_sha256) = content_sha256 { - verify_signed_content(content_sha256, &body[..])?; - } - - let body_xml = roxmltree::Document::parse(std::str::from_utf8(&body)?)?; - let body_list_of_parts = parse_complete_multipart_upload_body(&body_xml) - .ok_or_bad_request("Invalid CompleteMultipartUpload XML")?; - debug!( - "CompleteMultipartUpload list of parts: {:?}", - body_list_of_parts - ); - - let version_uuid = decode_upload_id(upload_id)?; - - // Get object and version - let key = key.to_string(); - let (object, version) = futures::try_join!( - garage.object_table.get(&bucket.id, &key), - garage.version_table.get(&version_uuid, &EmptyKey), - )?; - - let object = object.ok_or(Error::NoSuchKey)?; - let mut object_version = object - .versions() - .iter() - .find(|v| v.uuid == version_uuid && v.is_uploading()) - .cloned() - .ok_or(Error::NoSuchUpload)?; - - let version = version.ok_or(Error::NoSuchKey)?; - if version.blocks.is_empty() { - return Err(Error::bad_request("No data was uploaded")); - } - - let headers = match object_version.state { - ObjectVersionState::Uploading(headers) => headers, - _ => unreachable!(), - }; - - // Check that part numbers are an increasing sequence. - // (it doesn't need to start at 1 nor to be a continuous sequence, - // see discussion in #192) - if body_list_of_parts.is_empty() { - return Err(Error::EntityTooSmall); - } - if !body_list_of_parts - .iter() - .zip(body_list_of_parts.iter().skip(1)) - .all(|(p1, p2)| p1.part_number < p2.part_number) - { - return Err(Error::InvalidPartOrder); - } - - // Garage-specific restriction, see #204: part numbers must be - // consecutive starting at 1 - if body_list_of_parts[0].part_number != 1 - || !body_list_of_parts - .iter() - .zip(body_list_of_parts.iter().skip(1)) - .all(|(p1, p2)| p1.part_number + 1 == p2.part_number) - { - return Err(Error::NotImplemented("Garage does not support completing a Multipart upload with non-consecutive part numbers. This is a restriction of Garage's data model, which might be fixed in a future release. See issue #204 for more information on this topic.".into())); - } - - // Check that the list of parts they gave us corresponds to the parts we have here - debug!("Expected parts from request: {:?}", body_list_of_parts); - debug!("Parts stored in version: {:?}", version.parts_etags.items()); - let parts = version - .parts_etags - .items() - .iter() - .map(|pair| (&pair.0, &pair.1)); - let same_parts = body_list_of_parts - .iter() - .map(|x| (&x.part_number, &x.etag)) - .eq(parts); - if !same_parts { - return Err(Error::InvalidPart); - } - - // Check that all blocks belong to one of the parts - let block_parts = version - .blocks - .items() - .iter() - .map(|(bk, _)| bk.part_number) - .collect::<BTreeSet<_>>(); - let same_parts = body_list_of_parts - .iter() - .map(|x| x.part_number) - .eq(block_parts.into_iter()); - if !same_parts { - return Err(Error::bad_request( - "Part numbers in block list and part list do not match. This can happen if a part was partially uploaded. Please abort the multipart upload and try again." - )); - } - - // Calculate etag of final object - // To understand how etags are calculated, read more here: - // https://teppen.io/2018/06/23/aws_s3_etags/ - let num_parts = body_list_of_parts.len(); - let mut etag_md5_hasher = Md5::new(); - for (_, etag) in version.parts_etags.items().iter() { - etag_md5_hasher.update(etag.as_bytes()); - } - let etag = format!("{}-{}", hex::encode(etag_md5_hasher.finalize()), num_parts); - - // Calculate total size of final object - let total_size = version.blocks.items().iter().map(|x| x.1.size).sum(); - - if let Err(e) = check_quotas(&garage, bucket, &key, total_size).await { - object_version.state = ObjectVersionState::Aborted; - let final_object = Object::new(bucket.id, key.clone(), vec![object_version]); - garage.object_table.insert(&final_object).await?; - - return Err(e); - } - - // Write final object version - object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( - ObjectVersionMeta { - headers, - size: total_size, - etag: etag.clone(), - }, - version.blocks.items()[0].1.hash, - )); - - let final_object = Object::new(bucket.id, key.clone(), vec![object_version]); - garage.object_table.insert(&final_object).await?; - - // Send response saying ok we're done - let result = s3_xml::CompleteMultipartUploadResult { - xmlns: (), - location: None, - bucket: s3_xml::Value(bucket_name.to_string()), - key: s3_xml::Value(key), - etag: s3_xml::Value(format!("\"{}\"", etag)), - }; - let xml = s3_xml::to_xml_with_header(&result)?; - - Ok(Response::new(Body::from(xml.into_bytes()))) -} - -pub async fn handle_abort_multipart_upload( - garage: Arc<Garage>, - bucket_id: Uuid, - key: &str, - upload_id: &str, -) -> Result<Response<Body>, Error> { - let version_uuid = decode_upload_id(upload_id)?; - - let object = garage - .object_table - .get(&bucket_id, &key.to_string()) - .await?; - let object = object.ok_or(Error::NoSuchKey)?; - - let object_version = object - .versions() - .iter() - .find(|v| v.uuid == version_uuid && v.is_uploading()); - let mut object_version = match object_version { - None => return Err(Error::NoSuchUpload), - Some(x) => x.clone(), - }; - - object_version.state = ObjectVersionState::Aborted; - let final_object = Object::new(bucket_id, key.to_string(), vec![object_version]); - garage.object_table.insert(&final_object).await?; - - Ok(Response::new(Body::from(vec![]))) -} - -fn get_mime_type(headers: &HeaderMap<HeaderValue>) -> Result<String, Error> { +pub(crate) fn get_mime_type(headers: &HeaderMap<HeaderValue>) -> Result<String, Error> { Ok(headers .get(hyper::header::CONTENT_TYPE) .map(|x| x.to_str()) @@ -821,54 +519,3 @@ pub(crate) fn get_headers(headers: &HeaderMap<HeaderValue>) -> Result<ObjectVers other, }) } - -pub fn decode_upload_id(id: &str) -> Result<Uuid, Error> { - let id_bin = hex::decode(id).map_err(|_| Error::NoSuchUpload)?; - if id_bin.len() != 32 { - return Err(Error::NoSuchUpload); - } - let mut uuid = [0u8; 32]; - uuid.copy_from_slice(&id_bin[..]); - Ok(Uuid::from(uuid)) -} - -#[derive(Debug)] -struct CompleteMultipartUploadPart { - etag: String, - part_number: u64, -} - -fn parse_complete_multipart_upload_body( - xml: &roxmltree::Document, -) -> Option<Vec<CompleteMultipartUploadPart>> { - let mut parts = vec![]; - - let root = xml.root(); - let cmu = root.first_child()?; - if !cmu.has_tag_name("CompleteMultipartUpload") { - return None; - } - - for item in cmu.children() { - // Only parse <Part> nodes - if !item.is_element() { - continue; - } - - if item.has_tag_name("Part") { - let etag = item.children().find(|e| e.has_tag_name("ETag"))?.text()?; - let part_number = item - .children() - .find(|e| e.has_tag_name("PartNumber"))? - .text()?; - parts.push(CompleteMultipartUploadPart { - etag: etag.trim_matches('"').to_string(), - part_number: part_number.parse().ok()?, - }); - } else { - return None; - } - } - - Some(parts) -} diff --git a/src/block/manager.rs b/src/block/manager.rs index 3ece9a8a..c7e4cd03 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -80,6 +80,7 @@ pub struct BlockManager { /// Directory in which block are stored pub data_dir: PathBuf, + data_fsync: bool, compression_level: Option<i32>, mutation_lock: [Mutex<BlockManagerLocked>; 256], @@ -114,6 +115,7 @@ impl BlockManager { pub fn new( db: &db::Db, data_dir: PathBuf, + data_fsync: bool, compression_level: Option<i32>, replication: TableShardedReplication, system: Arc<System>, @@ -141,6 +143,7 @@ impl BlockManager { let block_manager = Arc::new(Self { replication, data_dir, + data_fsync, compression_level, mutation_lock: [(); 256].map(|_| Mutex::new(BlockManagerLocked())), rc, @@ -713,7 +716,11 @@ impl BlockManagerLocked { let mut f = fs::File::create(&path_tmp).await?; f.write_all(data).await?; - f.sync_all().await?; + + if mgr.data_fsync { + f.sync_all().await?; + } + drop(f); fs::rename(path_tmp, path).await?; @@ -724,18 +731,20 @@ impl BlockManagerLocked { fs::remove_file(to_delete).await?; } - // We want to ensure that when this function returns, data is properly persisted - // to disk. The first step is the sync_all above that does an fsync on the data file. - // Now, we do an fsync on the containing directory, to ensure that the rename - // is persisted properly. See: - // http://thedjbway.b0llix.net/qmail/syncdir.html - let dir = fs::OpenOptions::new() - .read(true) - .mode(0) - .open(directory) - .await?; - dir.sync_all().await?; - drop(dir); + if mgr.data_fsync { + // We want to ensure that when this function returns, data is properly persisted + // to disk. The first step is the sync_all above that does an fsync on the data file. + // Now, we do an fsync on the containing directory, to ensure that the rename + // is persisted properly. See: + // http://thedjbway.b0llix.net/qmail/syncdir.html + let dir = fs::OpenOptions::new() + .read(true) + .mode(0) + .open(directory) + .await?; + dir.sync_all().await?; + drop(dir); + } Ok(()) } diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml index e3a65857..7401c9cd 100644 --- a/src/db/Cargo.toml +++ b/src/db/Cargo.toml @@ -33,7 +33,7 @@ pretty_env_logger = { version = "0.4", optional = true } mktemp = "0.5" [features] -default = [ "sled" ] +default = [ "sled", "lmdb", "sqlite" ] bundled-libs = [ "rusqlite/bundled" ] cli = ["clap", "pretty_env_logger"] lmdb = [ "heed" ] diff --git a/src/db/bin/convert.rs b/src/db/bin/convert.rs index bbde2048..957deedf 100644 --- a/src/db/bin/convert.rs +++ b/src/db/bin/convert.rs @@ -48,6 +48,8 @@ fn open_db(path: PathBuf, engine: String) -> Result<Db> { } "sqlite" | "sqlite3" | "rusqlite" => { let db = sqlite_adapter::rusqlite::Connection::open(&path)?; + db.pragma_update(None, "journal_mode", &"WAL")?; + db.pragma_update(None, "synchronous", &"NORMAL")?; Ok(sqlite_adapter::SqliteDb::init(db)) } "lmdb" | "heed" => { @@ -57,11 +59,13 @@ fn open_db(path: PathBuf, engine: String) -> Result<Db> { let map_size = lmdb_adapter::recommended_map_size(); - let db = lmdb_adapter::heed::EnvOpenOptions::new() - .max_dbs(100) - .map_size(map_size) - .open(&path) - .unwrap(); + let mut env_builder = lmdb_adapter::heed::EnvOpenOptions::new(); + env_builder.max_dbs(100); + env_builder.map_size(map_size); + unsafe { + env_builder.flag(heed::flags::Flags::MdbNoMetaSync); + } + let db = env_builder.open(&path)?; Ok(lmdb_adapter::LmdbDb::init(db)) } e => Err(Error(format!("Invalid DB engine: {}", e).into())), diff --git a/src/db/lib.rs b/src/db/lib.rs index 11cae4e3..22bd9364 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -2,9 +2,6 @@ #[cfg(feature = "sqlite")] extern crate tracing; -#[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))] -compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite."); - #[cfg(feature = "lmdb")] pub mod lmdb_adapter; #[cfg(feature = "sled")] diff --git a/src/db/sled_adapter.rs b/src/db/sled_adapter.rs index cf61867d..52393a95 100644 --- a/src/db/sled_adapter.rs +++ b/src/db/sled_adapter.rs @@ -38,7 +38,15 @@ pub struct SledDb { } impl SledDb { + #[deprecated( + since = "0.9.0", + note = "The Sled database is now deprecated and will be removed in Garage v1.0. Please migrate to LMDB or Sqlite as soon as possible." + )] pub fn init(db: sled::Db) -> Db { + tracing::warn!("-------------------- IMPORTANT WARNING !!! ----------------------"); + tracing::warn!("The Sled database is now deprecated and will be removed in Garage v1.0."); + tracing::warn!("Please migrate to LMDB or Sqlite as soon as possible."); + tracing::warn!("-----------------------------------------------------------------------"); let s = Self { db, trees: RwLock::new((Vec::new(), HashMap::new())), diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 52d0ea79..f9fc206b 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -77,7 +77,7 @@ k2v-client.workspace = true [features] -default = [ "bundled-libs", "metrics", "sled", "k2v" ] +default = [ "bundled-libs", "metrics", "sled", "lmdb", "sqlite", "k2v" ] k2v = [ "garage_util/k2v", "garage_api/k2v" ] diff --git a/src/garage/admin/block.rs b/src/garage/admin/block.rs index e9e3ff96..c4a45738 100644 --- a/src/garage/admin/block.rs +++ b/src/garage/admin/block.rs @@ -34,6 +34,7 @@ impl AdminRpcHandler { .get_range(&hash, None, None, 10000, Default::default()) .await?; let mut versions = vec![]; + let mut uploads = vec![]; for br in block_refs { if let Some(v) = self .garage @@ -41,6 +42,11 @@ impl AdminRpcHandler { .get(&br.version, &EmptyKey) .await? { + if let VersionBacklink::MultipartUpload { upload_id } = &v.backlink { + if let Some(u) = self.garage.mpu_table.get(upload_id, &EmptyKey).await? { + uploads.push(u); + } + } versions.push(Ok(v)); } else { versions.push(Err(br.version)); @@ -50,6 +56,7 @@ impl AdminRpcHandler { hash, refcount, versions, + uploads, }) } @@ -93,6 +100,7 @@ impl AdminRpcHandler { } let mut obj_dels = 0; + let mut mpu_dels = 0; let mut ver_dels = 0; for hash in blocks { @@ -105,56 +113,80 @@ impl AdminRpcHandler { .await?; for br in block_refs { - let version = match self + if let Some(version) = self .garage .version_table .get(&br.version, &EmptyKey) .await? { - Some(v) => v, - None => continue, - }; + self.handle_block_purge_version_backlink( + &version, + &mut obj_dels, + &mut mpu_dels, + ) + .await?; - if let Some(object) = self - .garage - .object_table - .get(&version.bucket_id, &version.key) - .await? - { - let ov = object.versions().iter().rev().find(|v| v.is_complete()); - if let Some(ov) = ov { - if ov.uuid == br.version { - let del_uuid = gen_uuid(); - let deleted_object = Object::new( - version.bucket_id, - version.key.clone(), - vec![ObjectVersion { - uuid: del_uuid, - timestamp: ov.timestamp + 1, - state: ObjectVersionState::Complete( - ObjectVersionData::DeleteMarker, - ), - }], - ); - self.garage.object_table.insert(&deleted_object).await?; - obj_dels += 1; - } + if !version.deleted.get() { + let deleted_version = Version::new(version.uuid, version.backlink, true); + self.garage.version_table.insert(&deleted_version).await?; + ver_dels += 1; } } - - if !version.deleted.get() { - let deleted_version = - Version::new(version.uuid, version.bucket_id, version.key.clone(), true); - self.garage.version_table.insert(&deleted_version).await?; - ver_dels += 1; - } } } + Ok(AdminRpc::Ok(format!( - "{} blocks were purged: {} object deletion markers added, {} versions marked deleted", + "Purged {} blocks, {} versions, {} objects, {} multipart uploads", blocks.len(), + ver_dels, obj_dels, - ver_dels + mpu_dels, ))) } + + async fn handle_block_purge_version_backlink( + &self, + version: &Version, + obj_dels: &mut usize, + mpu_dels: &mut usize, + ) -> Result<(), Error> { + let (bucket_id, key, ov_id) = match &version.backlink { + VersionBacklink::Object { bucket_id, key } => (*bucket_id, key.clone(), version.uuid), + VersionBacklink::MultipartUpload { upload_id } => { + if let Some(mut mpu) = self.garage.mpu_table.get(upload_id, &EmptyKey).await? { + if !mpu.deleted.get() { + mpu.parts.clear(); + mpu.deleted.set(); + self.garage.mpu_table.insert(&mpu).await?; + *mpu_dels += 1; + } + (mpu.bucket_id, mpu.key.clone(), *upload_id) + } else { + return Ok(()); + } + } + }; + + if let Some(object) = self.garage.object_table.get(&bucket_id, &key).await? { + let ov = object.versions().iter().rev().find(|v| v.is_complete()); + if let Some(ov) = ov { + if ov.uuid == ov_id { + let del_uuid = gen_uuid(); + let deleted_object = Object::new( + bucket_id, + key, + vec![ObjectVersion { + uuid: del_uuid, + timestamp: ov.timestamp + 1, + state: ObjectVersionState::Complete(ObjectVersionData::DeleteMarker), + }], + ); + self.garage.object_table.insert(&deleted_object).await?; + *obj_dels += 1; + } + } + } + + Ok(()) + } } diff --git a/src/garage/admin/bucket.rs b/src/garage/admin/bucket.rs index 11bb8730..0781cb8b 100644 --- a/src/garage/admin/bucket.rs +++ b/src/garage/admin/bucket.rs @@ -73,6 +73,15 @@ impl AdminRpcHandler { .map(|x| x.filtered_values(&self.garage.system.ring.borrow())) .unwrap_or_default(); + let mpu_counters = self + .garage + .mpu_counter_table + .table + .get(&bucket_id, &EmptyKey) + .await? + .map(|x| x.filtered_values(&self.garage.system.ring.borrow())) + .unwrap_or_default(); + let mut relevant_keys = HashMap::new(); for (k, _) in bucket .state @@ -112,6 +121,7 @@ impl AdminRpcHandler { bucket, relevant_keys, counters, + mpu_counters, }) } diff --git a/src/garage/admin/key.rs b/src/garage/admin/key.rs index cab13bcf..8a1c02af 100644 --- a/src/garage/admin/key.rs +++ b/src/garage/admin/key.rs @@ -14,7 +14,7 @@ impl AdminRpcHandler { match cmd { KeyOperation::List => self.handle_list_keys().await, KeyOperation::Info(query) => self.handle_key_info(query).await, - KeyOperation::New(query) => self.handle_create_key(query).await, + KeyOperation::Create(query) => self.handle_create_key(query).await, KeyOperation::Rename(query) => self.handle_rename_key(query).await, KeyOperation::Delete(query) => self.handle_delete_key(query).await, KeyOperation::Allow(query) => self.handle_allow_key(query).await, @@ -118,10 +118,25 @@ impl AdminRpcHandler { } async fn handle_import_key(&self, query: &KeyImportOpt) -> Result<AdminRpc, Error> { + if !query.yes { + return Err(Error::BadRequest("This command is intended to re-import keys that were previously generated by Garage. If you want to create a new key, use `garage key new` instead. Add the --yes flag if you really want to re-import a key.".to_string())); + } + + if query.key_id.len() != 26 + || &query.key_id[..2] != "GK" + || hex::decode(&query.key_id[2..]).is_err() + { + return Err(Error::BadRequest(format!("The specified key ID is not a valid Garage key ID (starts with `GK`, followed by 12 hex-encoded bytes)"))); + } + if query.secret_key.len() != 64 || hex::decode(&query.secret_key).is_err() { + return Err(Error::BadRequest(format!("The specified secret key is not a valid Garage secret key (composed of 32 hex-encoded bytes)"))); + } + let prev_key = self.garage.key_table.get(&EmptyKey, &query.key_id).await?; if prev_key.is_some() { return Err(Error::BadRequest(format!("Key {} already exists in data store. Even if it is deleted, we can't let you create a new key with the same ID. Sorry.", query.key_id))); } + let imported_key = Key::import(&query.key_id, &query.secret_key, &query.name); self.garage.key_table.insert(&imported_key).await?; diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index 2709f08a..b6f9c426 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -28,6 +28,7 @@ use garage_model::garage::Garage; use garage_model::helper::error::{Error, OkOrBadRequest}; use garage_model::key_table::*; use garage_model::migrate::Migrate; +use garage_model::s3::mpu_table::MultipartUpload; use garage_model::s3::version_table::Version; use crate::cli::*; @@ -53,6 +54,7 @@ pub enum AdminRpc { bucket: Bucket, relevant_keys: HashMap<String, Key>, counters: HashMap<String, i64>, + mpu_counters: HashMap<String, i64>, }, KeyList(Vec<(String, String)>), KeyInfo(Key, HashMap<Uuid, Bucket>), @@ -67,6 +69,7 @@ pub enum AdminRpc { hash: Hash, refcount: u64, versions: Vec<Result<Version, Uuid>>, + uploads: Vec<MultipartUpload>, }, } @@ -274,7 +277,7 @@ impl AdminRpcHandler { // Gather storage node and free space statistics let layout = &self.garage.system.ring.borrow().layout; let mut node_partition_count = HashMap::<Uuid, u64>::new(); - for short_id in layout.ring_assignation_data.iter() { + for short_id in layout.ring_assignment_data.iter() { let id = layout.node_id_vec[*short_id as usize]; *node_partition_count.entry(id).or_default() += 1; } diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index cb7a898c..48359614 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -85,7 +85,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) -> )); } _ => { - let new_role = match layout.staging.get(&adv.id) { + let new_role = match layout.staging_roles.get(&adv.id) { Some(NodeRoleV(Some(_))) => "(pending)", _ => "NO ROLE ASSIGNED", }; @@ -190,8 +190,9 @@ pub async fn cmd_admin( bucket, relevant_keys, counters, + mpu_counters, } => { - print_bucket_info(&bucket, &relevant_keys, &counters); + print_bucket_info(&bucket, &relevant_keys, &counters, &mpu_counters); } AdminRpc::KeyList(kl) => { print_key_list(kl); @@ -215,8 +216,9 @@ pub async fn cmd_admin( hash, refcount, versions, + uploads, } => { - print_block_info(hash, refcount, versions); + print_block_info(hash, refcount, versions, uploads); } r => { error!("Unexpected response: {:?}", r); diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index dc5315a1..3932f115 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -1,3 +1,5 @@ +use bytesize::ByteSize; + use format_table::format_table; use garage_util::crdt::Crdt; use garage_util::error::*; @@ -14,8 +16,8 @@ pub async fn cli_layout_command_dispatch( rpc_host: NodeID, ) -> Result<(), Error> { match cmd { - LayoutOperation::Assign(configure_opt) => { - cmd_assign_role(system_rpc_endpoint, rpc_host, configure_opt).await + LayoutOperation::Assign(assign_opt) => { + cmd_assign_role(system_rpc_endpoint, rpc_host, assign_opt).await } LayoutOperation::Remove(remove_opt) => { cmd_remove_role(system_rpc_endpoint, rpc_host, remove_opt).await @@ -27,6 +29,9 @@ pub async fn cli_layout_command_dispatch( LayoutOperation::Revert(revert_opt) => { cmd_revert_layout(system_rpc_endpoint, rpc_host, revert_opt).await } + LayoutOperation::Config(config_opt) => { + cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await + } } } @@ -60,14 +65,14 @@ pub async fn cmd_assign_role( .collect::<Result<Vec<_>, _>>()?; let mut roles = layout.roles.clone(); - roles.merge(&layout.staging); + roles.merge(&layout.staging_roles); for replaced in args.replace.iter() { let replaced_node = find_matching_node(layout.node_ids().iter().cloned(), replaced)?; match roles.get(&replaced_node) { Some(NodeRoleV(Some(_))) => { layout - .staging + .staging_roles .merge(&roles.update_mutator(replaced_node, NodeRoleV(None))); } _ => { @@ -83,7 +88,7 @@ pub async fn cmd_assign_role( return Err(Error::Message( "-c and -g are mutually exclusive, please configure node either with c>0 to act as a storage node or with -g to act as a gateway node".into())); } - if args.capacity == Some(0) { + if args.capacity == Some(ByteSize::b(0)) { return Err(Error::Message("Invalid capacity value: 0".into())); } @@ -91,7 +96,7 @@ pub async fn cmd_assign_role( let new_entry = match roles.get(&added_node) { Some(NodeRoleV(Some(old))) => { let capacity = match args.capacity { - Some(c) => Some(c), + Some(c) => Some(c.as_u64()), None if args.gateway => None, None => old.capacity, }; @@ -108,7 +113,7 @@ pub async fn cmd_assign_role( } _ => { let capacity = match args.capacity { - Some(c) => Some(c), + Some(c) => Some(c.as_u64()), None if args.gateway => None, None => return Err(Error::Message( "Please specify a capacity with the -c flag, or set node explicitly as gateway with -g".into())), @@ -125,7 +130,7 @@ pub async fn cmd_assign_role( }; layout - .staging + .staging_roles .merge(&roles.update_mutator(added_node, NodeRoleV(Some(new_entry)))); } @@ -145,13 +150,13 @@ pub async fn cmd_remove_role( let mut layout = fetch_layout(rpc_cli, rpc_host).await?; let mut roles = layout.roles.clone(); - roles.merge(&layout.staging); + roles.merge(&layout.staging_roles); let deleted_node = find_matching_node(roles.items().iter().map(|(id, _, _)| *id), &args.node_id)?; layout - .staging + .staging_roles .merge(&roles.update_mutator(deleted_node, NodeRoleV(None))); send_layout(rpc_cli, rpc_host, layout).await?; @@ -166,7 +171,7 @@ pub async fn cmd_show_layout( rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID, ) -> Result<(), Error> { - let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + let layout = fetch_layout(rpc_cli, rpc_host).await?; println!("==== CURRENT CLUSTER LAYOUT ===="); if !print_cluster_layout(&layout) { @@ -176,30 +181,41 @@ pub async fn cmd_show_layout( println!(); println!("Current cluster layout version: {}", layout.version); - if print_staging_role_changes(&layout) { - layout.roles.merge(&layout.staging); - - println!(); - println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ===="); - if !print_cluster_layout(&layout) { - println!("No nodes have a role in the new layout."); - } - println!(); + let has_role_changes = print_staging_role_changes(&layout); + let has_param_changes = print_staging_parameters_changes(&layout); + if has_role_changes || has_param_changes { + let v = layout.version; + let res_apply = layout.apply_staged_changes(Some(v + 1)); // this will print the stats of what partitions // will move around when we apply - if layout.calculate_partition_assignation() { - println!("To enact the staged role changes, type:"); - println!(); - println!(" garage layout apply --version {}", layout.version + 1); - println!(); - println!( - "You can also revert all proposed changes with: garage layout revert --version {}", - layout.version + 1 - ); - } else { - println!("Not enough nodes have an assigned role to maintain enough copies of data."); - println!("This new layout cannot yet be applied."); + match res_apply { + Ok((layout, msg)) => { + println!(); + println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ===="); + if !print_cluster_layout(&layout) { + println!("No nodes have a role in the new layout."); + } + println!(); + + for line in msg.iter() { + println!("{}", line); + } + println!("To enact the staged role changes, type:"); + println!(); + println!(" garage layout apply --version {}", v + 1); + println!(); + println!( + "You can also revert all proposed changes with: garage layout revert --version {}", + v + 1) + } + Err(e) => { + println!("Error while trying to compute the assignment: {}", e); + println!("This new layout cannot yet be applied."); + println!( + "You can also revert all proposed changes with: garage layout revert --version {}", + v + 1) + } } } @@ -213,11 +229,14 @@ pub async fn cmd_apply_layout( ) -> Result<(), Error> { let layout = fetch_layout(rpc_cli, rpc_host).await?; - let layout = layout.apply_staged_changes(apply_opt.version)?; + let (layout, msg) = layout.apply_staged_changes(apply_opt.version)?; + for line in msg.iter() { + println!("{}", line); + } send_layout(rpc_cli, rpc_host, layout).await?; - println!("New cluster layout with updated role assignation has been applied in cluster."); + println!("New cluster layout with updated role assignment has been applied in cluster."); println!("Data will now be moved around between nodes accordingly."); Ok(()) @@ -238,6 +257,45 @@ pub async fn cmd_revert_layout( Ok(()) } +pub async fn cmd_config_layout( + rpc_cli: &Endpoint<SystemRpc, ()>, + rpc_host: NodeID, + config_opt: ConfigLayoutOpt, +) -> Result<(), Error> { + let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + + let mut did_something = false; + match config_opt.redundancy { + None => (), + Some(r) => { + if r > layout.replication_factor { + println!( + "The zone redundancy must be smaller or equal to the \ + replication factor ({}).", + layout.replication_factor + ); + } else if r < 1 { + println!("The zone redundancy must be at least 1."); + } else { + layout + .staging_parameters + .update(LayoutParameters { zone_redundancy: r }); + println!("The new zone redundancy has been saved ({}).", r); + } + did_something = true; + } + } + + if !did_something { + return Err(Error::Message( + "Please specify an action for `garage layout config` to do".into(), + )); + } + + send_layout(rpc_cli, rpc_host, layout).await?; + Ok(()) +} + // --- utility --- pub async fn fetch_layout( @@ -269,21 +327,39 @@ pub async fn send_layout( } pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { - let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; + let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()]; for (id, _, role) in layout.roles.items().iter() { let role = match &role.0 { Some(r) => r, _ => continue, }; let tags = role.tags.join(","); - table.push(format!( - "{:?}\t{}\t{}\t{}", - id, - tags, - role.zone, - role.capacity_string() - )); + let usage = layout.get_node_usage(id).unwrap_or(0); + let capacity = layout.get_node_capacity(id).unwrap_or(0); + if capacity > 0 { + table.push(format!( + "{:?}\t{}\t{}\t{}\t{} ({:.1}%)", + id, + tags, + role.zone, + role.capacity_string(), + ByteSize::b(usage as u64 * layout.partition_size).to_string_as(false), + (100.0 * usage as f32 * layout.partition_size as f32) / (capacity as f32) + )); + } else { + table.push(format!( + "{:?}\t{}\t{}\t{}", + id, + tags, + role.zone, + role.capacity_string() + )); + }; } + println!(); + println!("Parameters of the layout computation:"); + println!("Zone redundancy: {}", layout.parameters.zone_redundancy); + println!(); if table.len() == 1 { false } else { @@ -292,9 +368,23 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool { } } +pub fn print_staging_parameters_changes(layout: &ClusterLayout) -> bool { + let has_changes = *layout.staging_parameters.get() != layout.parameters; + if has_changes { + println!(); + println!("==== NEW LAYOUT PARAMETERS ===="); + println!( + "Zone redundancy: {}", + layout.staging_parameters.get().zone_redundancy + ); + println!(); + } + has_changes +} + pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { let has_changes = layout - .staging + .staging_roles .items() .iter() .any(|(k, _, v)| layout.roles.get(k) != Some(v)); @@ -303,7 +393,7 @@ pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { println!(); println!("==== STAGED ROLE CHANGES ===="); let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; - for (id, _, role) in layout.staging.items().iter() { + for (id, _, role) in layout.staging_roles.items().iter() { if layout.roles.get(id) == Some(role) { continue; } diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 01ae92da..2547fb8d 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -17,7 +17,7 @@ pub enum Command { #[structopt(name = "node", version = garage_version())] Node(NodeOperation), - /// Operations on the assignation of node roles in the cluster layout + /// Operations on the assignment of node roles in the cluster layout #[structopt(name = "layout", version = garage_version())] Layout(LayoutOperation), @@ -91,6 +91,10 @@ pub enum LayoutOperation { #[structopt(name = "remove", version = garage_version())] Remove(RemoveRoleOpt), + /// Configure parameters value for the layout computation + #[structopt(name = "config", version = garage_version())] + Config(ConfigLayoutOpt), + /// Show roles currently assigned to nodes and changes staged for commit #[structopt(name = "show", version = garage_version())] Show, @@ -114,9 +118,9 @@ pub struct AssignRoleOpt { #[structopt(short = "z", long = "zone")] pub(crate) zone: Option<String>, - /// Capacity (in relative terms, use 1 to represent your smallest server) + /// Storage capacity, in bytes (supported suffixes: B, KB, MB, GB, TB, PB) #[structopt(short = "c", long = "capacity")] - pub(crate) capacity: Option<u32>, + pub(crate) capacity: Option<bytesize::ByteSize>, /// Gateway-only node #[structopt(short = "g", long = "gateway")] @@ -138,6 +142,13 @@ pub struct RemoveRoleOpt { } #[derive(StructOpt, Debug)] +pub struct ConfigLayoutOpt { + /// Zone redundancy parameter + #[structopt(short = "r", long = "redundancy")] + pub(crate) redundancy: Option<usize>, +} + +#[derive(StructOpt, Debug)] pub struct ApplyLayoutOpt { /// Version number of new configuration: this command will fail if /// it is not exactly 1 + the previous configuration's version @@ -320,8 +331,8 @@ pub enum KeyOperation { Info(KeyOpt), /// Create new key - #[structopt(name = "new", version = garage_version())] - New(KeyNewOpt), + #[structopt(name = "create", version = garage_version())] + Create(KeyNewOpt), /// Rename key #[structopt(name = "rename", version = garage_version())] @@ -353,7 +364,7 @@ pub struct KeyOpt { #[derive(Serialize, Deserialize, StructOpt, Debug)] pub struct KeyNewOpt { /// Name of the key - #[structopt(long = "name", default_value = "Unnamed key")] + #[structopt(default_value = "Unnamed key")] pub name: String, } @@ -397,6 +408,10 @@ pub struct KeyImportOpt { /// Key name #[structopt(short = "n", default_value = "Imported key")] pub name: String, + + /// Confirm key import + #[structopt(long = "yes")] + pub yes: bool, } #[derive(Serialize, Deserialize, StructOpt, Debug, Clone)] @@ -432,19 +447,22 @@ pub struct RepairOpt { #[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] pub enum RepairWhat { - /// Only do a full sync of metadata tables + /// Do a full sync of metadata tables #[structopt(name = "tables", version = garage_version())] Tables, - /// Only repair (resync/rebalance) the set of stored blocks + /// Repair (resync/rebalance) the set of stored blocks #[structopt(name = "blocks", version = garage_version())] Blocks, - /// Only redo the propagation of object deletions to the version table (slow) + /// Repropagate object deletions to the version table #[structopt(name = "versions", version = garage_version())] Versions, - /// Only redo the propagation of version deletions to the block ref table (extremely slow) + /// Repropagate object deletions to the multipart upload table + #[structopt(name = "mpu", version = garage_version())] + MultipartUploads, + /// Repropagate version deletions to the block ref table #[structopt(name = "block_refs", version = garage_version())] BlockRefs, - /// Verify integrity of all blocks on disc (extremely slow, i/o intensive) + /// Verify integrity of all blocks on disc #[structopt(name = "scrub", version = garage_version())] Scrub { #[structopt(subcommand)] diff --git a/src/garage/cli/util.rs b/src/garage/cli/util.rs index 1140cf22..2232d395 100644 --- a/src/garage/cli/util.rs +++ b/src/garage/cli/util.rs @@ -12,8 +12,9 @@ use garage_block::manager::BlockResyncErrorInfo; use garage_model::bucket_table::*; use garage_model::key_table::*; -use garage_model::s3::object_table::{BYTES, OBJECTS, UNFINISHED_UPLOADS}; -use garage_model::s3::version_table::Version; +use garage_model::s3::mpu_table::{self, MultipartUpload}; +use garage_model::s3::object_table; +use garage_model::s3::version_table::*; use crate::cli::structs::WorkerListOpt; @@ -135,6 +136,7 @@ pub fn print_bucket_info( bucket: &Bucket, relevant_keys: &HashMap<String, Key>, counters: &HashMap<String, i64>, + mpu_counters: &HashMap<String, i64>, ) { let key_name = |k| { relevant_keys @@ -148,7 +150,7 @@ pub fn print_bucket_info( Deletable::Deleted => println!("Bucket is deleted."), Deletable::Present(p) => { let size = - bytesize::ByteSize::b(counters.get(BYTES).cloned().unwrap_or_default() as u64); + bytesize::ByteSize::b(*counters.get(object_table::BYTES).unwrap_or(&0) as u64); println!( "\nSize: {} ({})", size.to_string_as(true), @@ -156,14 +158,22 @@ pub fn print_bucket_info( ); println!( "Objects: {}", - counters.get(OBJECTS).cloned().unwrap_or_default() + *counters.get(object_table::OBJECTS).unwrap_or(&0) + ); + println!( + "Unfinished uploads (multipart and non-multipart): {}", + *counters.get(object_table::UNFINISHED_UPLOADS).unwrap_or(&0) ); println!( "Unfinished multipart uploads: {}", - counters - .get(UNFINISHED_UPLOADS) - .cloned() - .unwrap_or_default() + *mpu_counters.get(mpu_table::UPLOADS).unwrap_or(&0) + ); + let mpu_size = + bytesize::ByteSize::b(*mpu_counters.get(mpu_table::BYTES).unwrap_or(&0) as u64); + println!( + "Size of unfinished multipart uploads: {} ({})", + mpu_size.to_string_as(true), + mpu_size.to_string_as(false), ); println!("\nWebsite access: {}", p.website_config.get().is_some()); @@ -390,29 +400,49 @@ pub fn print_block_error_list(el: Vec<BlockResyncErrorInfo>) { format_table(table); } -pub fn print_block_info(hash: Hash, refcount: u64, versions: Vec<Result<Version, Uuid>>) { +pub fn print_block_info( + hash: Hash, + refcount: u64, + versions: Vec<Result<Version, Uuid>>, + uploads: Vec<MultipartUpload>, +) { println!("Block hash: {}", hex::encode(hash.as_slice())); println!("Refcount: {}", refcount); println!(); - let mut table = vec!["Version\tBucket\tKey\tDeleted".into()]; + let mut table = vec!["Version\tBucket\tKey\tMPU\tDeleted".into()]; let mut nondeleted_count = 0; for v in versions.iter() { match v { Ok(ver) => { - table.push(format!( - "{:?}\t{:?}\t{}\t{:?}", - ver.uuid, - ver.bucket_id, - ver.key, - ver.deleted.get() - )); + match &ver.backlink { + VersionBacklink::Object { bucket_id, key } => { + table.push(format!( + "{:?}\t{:?}\t{}\t\t{:?}", + ver.uuid, + bucket_id, + key, + ver.deleted.get() + )); + } + VersionBacklink::MultipartUpload { upload_id } => { + let upload = uploads.iter().find(|x| x.upload_id == *upload_id); + table.push(format!( + "{:?}\t{:?}\t{}\t{:?}\t{:?}", + ver.uuid, + upload.map(|u| u.bucket_id).unwrap_or_default(), + upload.map(|u| u.key.as_str()).unwrap_or_default(), + upload_id, + ver.deleted.get() + )); + } + } if !ver.deleted.get() { nondeleted_count += 1; } } Err(vh) => { - table.push(format!("{:?}\t\t\tyes", vh)); + table.push(format!("{:?}\t\t\t\tyes", vh)); } } } diff --git a/src/garage/main.rs b/src/garage/main.rs index e8aee892..3d07208c 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -17,6 +17,9 @@ compile_error!("Either bundled-libs or system-libs Cargo feature must be enabled #[cfg(all(feature = "bundled-libs", feature = "system-libs"))] compile_error!("Only one of bundled-libs and system-libs Cargo features must be enabled"); +#[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))] +compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite."); + use std::net::SocketAddr; use std::path::PathBuf; diff --git a/src/garage/repair/online.rs b/src/garage/repair/online.rs index 0e14ed51..abfaf9f9 100644 --- a/src/garage/repair/online.rs +++ b/src/garage/repair/online.rs @@ -5,11 +5,16 @@ use async_trait::async_trait; use tokio::sync::watch; use garage_block::repair::ScrubWorkerCommand; + use garage_model::garage::Garage; use garage_model::s3::block_ref_table::*; +use garage_model::s3::mpu_table::*; use garage_model::s3::object_table::*; use garage_model::s3::version_table::*; + +use garage_table::replication::*; use garage_table::*; + use garage_util::background::*; use garage_util::error::Error; use garage_util::migrate::Migrate; @@ -32,11 +37,15 @@ pub async fn launch_online_repair( } RepairWhat::Versions => { info!("Repairing the versions table"); - bg.spawn_worker(RepairVersionsWorker::new(garage.clone())); + bg.spawn_worker(TableRepairWorker::new(garage.clone(), RepairVersions)); + } + RepairWhat::MultipartUploads => { + info!("Repairing the multipart uploads table"); + bg.spawn_worker(TableRepairWorker::new(garage.clone(), RepairMpu)); } RepairWhat::BlockRefs => { info!("Repairing the block refs table"); - bg.spawn_worker(RepairBlockrefsWorker::new(garage.clone())); + bg.spawn_worker(TableRepairWorker::new(garage.clone(), RepairBlockRefs)); } RepairWhat::Blocks => { info!("Repairing the stored blocks"); @@ -67,70 +76,70 @@ pub async fn launch_online_repair( // ---- -struct RepairVersionsWorker { +#[async_trait] +trait TableRepair: Send + Sync + 'static { + type T: TableSchema; + + fn table(garage: &Garage) -> &Table<Self::T, TableShardedReplication>; + + async fn process( + &mut self, + garage: &Garage, + entry: <<Self as TableRepair>::T as TableSchema>::E, + ) -> Result<bool, Error>; +} + +struct TableRepairWorker<T: TableRepair> { garage: Arc<Garage>, pos: Vec<u8>, counter: usize, + repairs: usize, + inner: T, } -impl RepairVersionsWorker { - fn new(garage: Arc<Garage>) -> Self { +impl<R: TableRepair> TableRepairWorker<R> { + fn new(garage: Arc<Garage>, inner: R) -> Self { Self { garage, + inner, pos: vec![], counter: 0, + repairs: 0, } } } #[async_trait] -impl Worker for RepairVersionsWorker { +impl<R: TableRepair> Worker for TableRepairWorker<R> { fn name(&self) -> String { - "Version repair worker".into() + format!("{} repair worker", R::T::TABLE_NAME) } fn status(&self) -> WorkerStatus { WorkerStatus { - progress: Some(self.counter.to_string()), + progress: Some(format!("{} ({})", self.counter, self.repairs)), ..Default::default() } } async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> { - let (item_bytes, next_pos) = match self.garage.version_table.data.store.get_gt(&self.pos)? { + let (item_bytes, next_pos) = match R::table(&self.garage).data.store.get_gt(&self.pos)? { Some((k, v)) => (v, k), None => { - info!("repair_versions: finished, done {}", self.counter); + info!( + "{}: finished, done {}, fixed {}", + self.name(), + self.counter, + self.repairs + ); return Ok(WorkerState::Done); } }; - let version = Version::decode(&item_bytes).ok_or_message("Cannot decode Version")?; - if !version.deleted.get() { - let object = self - .garage - .object_table - .get(&version.bucket_id, &version.key) - .await?; - let version_exists = match object { - Some(o) => o - .versions() - .iter() - .any(|x| x.uuid == version.uuid && x.state != ObjectVersionState::Aborted), - None => false, - }; - if !version_exists { - info!("Repair versions: marking version as deleted: {:?}", version); - self.garage - .version_table - .insert(&Version::new( - version.uuid, - version.bucket_id, - version.key, - true, - )) - .await?; - } + let entry = <R::T as TableSchema>::E::decode(&item_bytes) + .ok_or_message("Cannot decode table entry")?; + if self.inner.process(&self.garage, entry).await? { + self.repairs += 1; } self.counter += 1; @@ -146,77 +155,124 @@ impl Worker for RepairVersionsWorker { // ---- -struct RepairBlockrefsWorker { - garage: Arc<Garage>, - pos: Vec<u8>, - counter: usize, -} +struct RepairVersions; -impl RepairBlockrefsWorker { - fn new(garage: Arc<Garage>) -> Self { - Self { - garage, - pos: vec![], - counter: 0, +#[async_trait] +impl TableRepair for RepairVersions { + type T = VersionTable; + + fn table(garage: &Garage) -> &Table<Self::T, TableShardedReplication> { + &garage.version_table + } + + async fn process(&mut self, garage: &Garage, version: Version) -> Result<bool, Error> { + if !version.deleted.get() { + let ref_exists = match &version.backlink { + VersionBacklink::Object { bucket_id, key } => garage + .object_table + .get(bucket_id, key) + .await? + .map(|o| { + o.versions().iter().any(|x| { + x.uuid == version.uuid && x.state != ObjectVersionState::Aborted + }) + }) + .unwrap_or(false), + VersionBacklink::MultipartUpload { upload_id } => garage + .mpu_table + .get(upload_id, &EmptyKey) + .await? + .map(|u| !u.deleted.get()) + .unwrap_or(false), + }; + + if !ref_exists { + info!("Repair versions: marking version as deleted: {:?}", version); + garage + .version_table + .insert(&Version::new(version.uuid, version.backlink, true)) + .await?; + return Ok(true); + } } + + Ok(false) } } +// ---- + +struct RepairBlockRefs; + #[async_trait] -impl Worker for RepairBlockrefsWorker { - fn name(&self) -> String { - "Block refs repair worker".into() - } +impl TableRepair for RepairBlockRefs { + type T = BlockRefTable; - fn status(&self) -> WorkerStatus { - WorkerStatus { - progress: Some(self.counter.to_string()), - ..Default::default() - } + fn table(garage: &Garage) -> &Table<Self::T, TableShardedReplication> { + &garage.block_ref_table } - async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> { - let (item_bytes, next_pos) = - match self.garage.block_ref_table.data.store.get_gt(&self.pos)? { - Some((k, v)) => (v, k), - None => { - info!("repair_block_ref: finished, done {}", self.counter); - return Ok(WorkerState::Done); - } - }; - - let block_ref = BlockRef::decode(&item_bytes).ok_or_message("Cannot decode BlockRef")?; + async fn process(&mut self, garage: &Garage, mut block_ref: BlockRef) -> Result<bool, Error> { if !block_ref.deleted.get() { - let version = self - .garage + let ref_exists = garage .version_table .get(&block_ref.version, &EmptyKey) - .await?; - // The version might not exist if it has been GC'ed - let ref_exists = version.map(|v| !v.deleted.get()).unwrap_or(false); + .await? + .map(|v| !v.deleted.get()) + .unwrap_or(false); + if !ref_exists { info!( "Repair block ref: marking block_ref as deleted: {:?}", block_ref ); - self.garage - .block_ref_table - .insert(&BlockRef { - block: block_ref.block, - version: block_ref.version, - deleted: true.into(), - }) - .await?; + block_ref.deleted.set(); + garage.block_ref_table.insert(&block_ref).await?; + return Ok(true); } } - self.counter += 1; - self.pos = next_pos; + Ok(false) + } +} - Ok(WorkerState::Busy) +// ---- + +struct RepairMpu; + +#[async_trait] +impl TableRepair for RepairMpu { + type T = MultipartUploadTable; + + fn table(garage: &Garage) -> &Table<Self::T, TableShardedReplication> { + &garage.mpu_table } - async fn wait_for_work(&mut self) -> WorkerState { - unreachable!() + async fn process(&mut self, garage: &Garage, mut mpu: MultipartUpload) -> Result<bool, Error> { + if !mpu.deleted.get() { + let ref_exists = garage + .object_table + .get(&mpu.bucket_id, &mpu.key) + .await? + .map(|o| { + o.versions() + .iter() + .any(|x| x.uuid == mpu.upload_id && x.is_uploading(Some(true))) + }) + .unwrap_or(false); + + if !ref_exists { + info!( + "Repair multipart uploads: marking mpu as deleted: {:?}", + mpu + ); + mpu.parts.clear(); + mpu.deleted.set(); + garage.mpu_table.insert(&mpu).await?; + return Ok(true); + } + } + + Ok(false) } } diff --git a/src/garage/tests/common/garage.rs b/src/garage/tests/common/garage.rs index 8aaf6f5b..cb1b7ea5 100644 --- a/src/garage/tests/common/garage.rs +++ b/src/garage/tests/common/garage.rs @@ -52,6 +52,7 @@ impl Instance { r#" metadata_dir = "{path}/meta" data_dir = "{path}/data" +db_engine = "sled" replication_mode = "1" @@ -141,7 +142,7 @@ api_bind_addr = "127.0.0.1:{admin_port}" self.command() .args(["layout", "assign"]) .arg(node_short_id) - .args(["-c", "1", "-z", "unzonned"]) + .args(["-c", "1G", "-z", "unzonned"]) .quiet() .expect_success_status("Could not assign garage node layout"); self.command() @@ -186,9 +187,9 @@ api_bind_addr = "127.0.0.1:{admin_port}" let mut key = Key::default(); let mut cmd = self.command(); - let base = cmd.args(["key", "new"]); + let base = cmd.args(["key", "create"]); let with_name = match maybe_name { - Some(name) => base.args(["--name", name]), + Some(name) => base.args([name]), None => base, }; diff --git a/src/garage/tests/s3/multipart.rs b/src/garage/tests/s3/multipart.rs index 895a2993..8ae6b66e 100644 --- a/src/garage/tests/s3/multipart.rs +++ b/src/garage/tests/s3/multipart.rs @@ -6,6 +6,190 @@ const SZ_5MB: usize = 5 * 1024 * 1024; const SZ_10MB: usize = 10 * 1024 * 1024; #[tokio::test] +async fn test_multipart_upload() { + let ctx = common::context(); + let bucket = ctx.create_bucket("testmpu"); + + let u1 = vec![0x11; SZ_5MB]; + let u2 = vec![0x22; SZ_5MB]; + let u3 = vec![0x33; SZ_5MB]; + let u4 = vec![0x44; SZ_5MB]; + let u5 = vec![0x55; SZ_5MB]; + + let up = ctx + .client + .create_multipart_upload() + .bucket(&bucket) + .key("a") + .send() + .await + .unwrap(); + assert!(up.upload_id.is_some()); + + let uid = up.upload_id.as_ref().unwrap(); + + let p3 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number(3) + .body(ByteStream::from(u3.clone())) + .send() + .await + .unwrap(); + + let _p1 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number(1) + .body(ByteStream::from(u1)) + .send() + .await + .unwrap(); + + let _p4 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number(4) + .body(ByteStream::from(u4)) + .send() + .await + .unwrap(); + + let p1bis = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number(1) + .body(ByteStream::from(u2.clone())) + .send() + .await + .unwrap(); + + let p6 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number(6) + .body(ByteStream::from(u5.clone())) + .send() + .await + .unwrap(); + + { + let r = ctx + .client + .list_parts() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .send() + .await + .unwrap(); + assert_eq!(r.parts.unwrap().len(), 4); + } + + let cmp = CompletedMultipartUpload::builder() + .parts( + CompletedPart::builder() + .part_number(1) + .e_tag(p1bis.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(3) + .e_tag(p3.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(6) + .e_tag(p6.e_tag.unwrap()) + .build(), + ) + .build(); + + ctx.client + .complete_multipart_upload() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .multipart_upload(cmp) + .send() + .await + .unwrap(); + + // The multipart upload must not appear anymore + assert!(ctx + .client + .list_parts() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .send() + .await + .is_err()); + + { + // The object must appear as a regular object + let r = ctx + .client + .head_object() + .bucket(&bucket) + .key("a") + .send() + .await + .unwrap(); + + assert_eq!(r.content_length, (SZ_5MB * 3) as i64); + } + + { + let o = ctx + .client + .get_object() + .bucket(&bucket) + .key("a") + .send() + .await + .unwrap(); + + assert_bytes_eq!(o.body, &[&u2[..], &u3[..], &u5[..]].concat()); + } + + { + for (part_number, data) in [(1, &u2), (2, &u3), (3, &u5)] { + let o = ctx + .client + .get_object() + .bucket(&bucket) + .key("a") + .part_number(part_number) + .send() + .await + .unwrap(); + + eprintln!("get_object with part_number = {}", part_number); + assert_eq!(o.content_length, SZ_5MB as i64); + assert_bytes_eq!(o.body, data); + } + } +} + +#[tokio::test] async fn test_uploadlistpart() { let ctx = common::context(); let bucket = ctx.create_bucket("uploadpart"); @@ -65,7 +249,8 @@ async fn test_uploadlistpart() { let ps = r.parts.unwrap(); assert_eq!(ps.len(), 1); - let fp = ps.iter().find(|x| x.part_number == 2).unwrap(); + assert_eq!(ps[0].part_number, 2); + let fp = &ps[0]; assert!(fp.last_modified.is_some()); assert_eq!( fp.e_tag.as_ref().unwrap(), @@ -100,13 +285,24 @@ async fn test_uploadlistpart() { let ps = r.parts.unwrap(); assert_eq!(ps.len(), 2); - let fp = ps.iter().find(|x| x.part_number == 1).unwrap(); + + assert_eq!(ps[0].part_number, 1); + let fp = &ps[0]; assert!(fp.last_modified.is_some()); assert_eq!( fp.e_tag.as_ref().unwrap(), "\"3c484266f9315485694556e6c693bfa2\"" ); assert_eq!(fp.size, SZ_5MB as i64); + + assert_eq!(ps[1].part_number, 2); + let sp = &ps[1]; + assert!(sp.last_modified.is_some()); + assert_eq!( + sp.e_tag.as_ref().unwrap(), + "\"3366bb9dcf710d6801b5926467d02e19\"" + ); + assert_eq!(sp.size, SZ_5MB as i64); } { @@ -123,12 +319,19 @@ async fn test_uploadlistpart() { .unwrap(); assert!(r.part_number_marker.is_none()); - assert!(r.next_part_number_marker.is_some()); + assert_eq!(r.next_part_number_marker.as_deref(), Some("1")); assert_eq!(r.max_parts, 1_i32); assert!(r.is_truncated); assert_eq!(r.key.unwrap(), "a"); assert_eq!(r.upload_id.unwrap().as_str(), uid.as_str()); - assert_eq!(r.parts.unwrap().len(), 1); + let parts = r.parts.unwrap(); + assert_eq!(parts.len(), 1); + let fp = &parts[0]; + assert_eq!(fp.part_number, 1); + assert_eq!( + fp.e_tag.as_ref().unwrap(), + "\"3c484266f9315485694556e6c693bfa2\"" + ); let r2 = ctx .client @@ -147,10 +350,18 @@ async fn test_uploadlistpart() { r.next_part_number_marker.as_ref().unwrap() ); assert_eq!(r2.max_parts, 1_i32); - assert!(r2.is_truncated); assert_eq!(r2.key.unwrap(), "a"); assert_eq!(r2.upload_id.unwrap().as_str(), uid.as_str()); - assert_eq!(r2.parts.unwrap().len(), 1); + let parts = r2.parts.unwrap(); + assert_eq!(parts.len(), 1); + let fp = &parts[0]; + assert_eq!(fp.part_number, 2); + assert_eq!( + fp.e_tag.as_ref().unwrap(), + "\"3366bb9dcf710d6801b5926467d02e19\"" + ); + //assert!(r2.is_truncated); // WHY? (this was the test before) + assert!(!r2.is_truncated); } let cmp = CompletedMultipartUpload::builder() diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 6dc954d4..c4291c32 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -41,7 +41,7 @@ opentelemetry = "0.17" netapp = "0.5" [features] -default = [ "sled" ] +default = [ "sled", "lmdb", "sqlite" ] k2v = [ "garage_util/k2v" ] lmdb = [ "garage_db/lmdb" ] sled = [ "garage_db/sled" ] diff --git a/src/model/garage.rs b/src/model/garage.rs index 3daa1b33..db2475ed 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -17,6 +17,7 @@ use garage_table::replication::TableShardedReplication; use garage_table::*; use crate::s3::block_ref_table::*; +use crate::s3::mpu_table::*; use crate::s3::object_table::*; use crate::s3::version_table::*; @@ -57,6 +58,10 @@ pub struct Garage { pub object_table: Arc<Table<ObjectTable, TableShardedReplication>>, /// Counting table containing object counters pub object_counter_table: Arc<IndexCounter<Object>>, + /// Table containing S3 multipart uploads + pub mpu_table: Arc<Table<MultipartUploadTable, TableShardedReplication>>, + /// Counting table containing multipart object counters + pub mpu_counter_table: Arc<IndexCounter<MultipartUpload>>, /// Table containing S3 object versions pub version_table: Arc<Table<VersionTable, TableShardedReplication>>, /// Table containing S3 block references (not blocks themselves) @@ -91,6 +96,11 @@ impl Garage { // ---- Sled DB ---- #[cfg(feature = "sled")] "sled" => { + if config.metadata_fsync { + return Err(Error::Message(format!( + "`metadata_fsync = true` is not supported with the Sled database engine" + ))); + } db_path.push("db"); info!("Opening Sled database at: {}", db_path.display()); let db = db::sled_adapter::sled::Config::default() @@ -109,6 +119,15 @@ impl Garage { db_path.push("db.sqlite"); info!("Opening Sqlite database at: {}", db_path.display()); let db = db::sqlite_adapter::rusqlite::Connection::open(db_path) + .and_then(|db| { + db.pragma_update(None, "journal_mode", &"WAL")?; + if config.metadata_fsync { + db.pragma_update(None, "synchronous", &"NORMAL")?; + } else { + db.pragma_update(None, "synchronous", &"OFF")?; + } + Ok(db) + }) .ok_or_message("Unable to open sqlite DB")?; db::sqlite_adapter::SqliteDb::init(db) } @@ -133,8 +152,10 @@ impl Garage { env_builder.max_readers(500); env_builder.map_size(map_size); unsafe { - env_builder.flag(heed::flags::Flags::MdbNoSync); env_builder.flag(heed::flags::Flags::MdbNoMetaSync); + if !config.metadata_fsync { + env_builder.flag(heed::flags::Flags::MdbNoSync); + } } let db = match env_builder.open(&db_path) { Err(heed::Error::Io(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => { @@ -204,6 +225,7 @@ impl Garage { let block_manager = BlockManager::new( &db, config.data_dir.clone(), + config.data_fsync, config.compression_level, data_rep_param, system.clone(), @@ -244,6 +266,20 @@ impl Garage { &db, ); + info!("Initialize multipart upload counter table..."); + let mpu_counter_table = IndexCounter::new(system.clone(), meta_rep_param.clone(), &db); + + info!("Initialize multipart upload table..."); + let mpu_table = Table::new( + MultipartUploadTable { + version_table: version_table.clone(), + mpu_counter_table: mpu_counter_table.clone(), + }, + meta_rep_param.clone(), + system.clone(), + &db, + ); + info!("Initialize object counter table..."); let object_counter_table = IndexCounter::new(system.clone(), meta_rep_param.clone(), &db); @@ -252,6 +288,7 @@ impl Garage { let object_table = Table::new( ObjectTable { version_table: version_table.clone(), + mpu_table: mpu_table.clone(), object_counter_table: object_counter_table.clone(), }, meta_rep_param.clone(), @@ -280,6 +317,8 @@ impl Garage { key_table, object_table, object_counter_table, + mpu_table, + mpu_counter_table, version_table, block_ref_table, #[cfg(feature = "k2v")] @@ -296,6 +335,8 @@ impl Garage { self.object_table.spawn_workers(bg); self.object_counter_table.spawn_workers(bg); + self.mpu_table.spawn_workers(bg); + self.mpu_counter_table.spawn_workers(bg); self.version_table.spawn_workers(bg); self.block_ref_table.spawn_workers(bg); diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index 4a488d7f..576d03f3 100644 --- a/src/model/helper/bucket.rs +++ b/src/model/helper/bucket.rs @@ -478,7 +478,9 @@ impl<'a> BucketHelper<'a> { // ---- /// Deletes all incomplete multipart uploads that are older than a certain time. - /// Returns the number of uploads aborted + /// Returns the number of uploads aborted. + /// This will also include non-multipart uploads, which may be lingering + /// after a node crash pub async fn cleanup_incomplete_uploads( &self, bucket_id: &Uuid, @@ -496,7 +498,9 @@ impl<'a> BucketHelper<'a> { .get_range( bucket_id, start, - Some(ObjectFilter::IsUploading), + Some(ObjectFilter::IsUploading { + check_multipart: None, + }), 1000, EnumerationOrder::Forward, ) @@ -508,7 +512,7 @@ impl<'a> BucketHelper<'a> { let aborted_versions = object .versions() .iter() - .filter(|v| v.is_uploading() && v.timestamp < older_than) + .filter(|v| v.is_uploading(None) && v.timestamp < older_than) .map(|v| ObjectVersion { state: ObjectVersionState::Aborted, uuid: v.uuid, diff --git a/src/model/s3/mod.rs b/src/model/s3/mod.rs index 4e94337d..36d67093 100644 --- a/src/model/s3/mod.rs +++ b/src/model/s3/mod.rs @@ -1,3 +1,4 @@ pub mod block_ref_table; +pub mod mpu_table; pub mod object_table; pub mod version_table; diff --git a/src/model/s3/mpu_table.rs b/src/model/s3/mpu_table.rs new file mode 100644 index 00000000..238cbf11 --- /dev/null +++ b/src/model/s3/mpu_table.rs @@ -0,0 +1,254 @@ +use std::sync::Arc; + +use garage_db as db; + +use garage_util::crdt::Crdt; +use garage_util::data::*; +use garage_util::time::*; + +use garage_table::replication::TableShardedReplication; +use garage_table::*; + +use crate::index_counter::*; +use crate::s3::version_table::*; + +pub const UPLOADS: &str = "uploads"; +pub const PARTS: &str = "parts"; +pub const BYTES: &str = "bytes"; + +mod v09 { + use garage_util::crdt; + use garage_util::data::Uuid; + use serde::{Deserialize, Serialize}; + + /// A part of a multipart upload + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct MultipartUpload { + /// Partition key = Upload id = UUID of the object version + pub upload_id: Uuid, + + /// The timestamp at which the multipart upload was created + pub timestamp: u64, + /// Is this multipart upload deleted + /// The MultipartUpload is marked as deleted as soon as the + /// multipart upload is either completed or aborted + pub deleted: crdt::Bool, + /// List of uploaded parts, key = (part number, timestamp) + /// In case of retries, all versions for each part are kept + /// Everything is cleaned up only once the MultipartUpload is marked deleted + pub parts: crdt::Map<MpuPartKey, MpuPart>, + + // Back link to bucket+key so that we can find the object this mpu + // belongs to and check whether it is still valid + /// Bucket in which the related object is stored + pub bucket_id: Uuid, + /// Key in which the related object is stored + pub key: String, + } + + #[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)] + pub struct MpuPartKey { + /// Number of the part + pub part_number: u64, + /// Timestamp of part upload + pub timestamp: u64, + } + + /// The version of an uploaded part + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct MpuPart { + /// Links to a Version in VersionTable + pub version: Uuid, + /// ETag of the content of this part (known only once done uploading) + pub etag: Option<String>, + /// Size of this part (known only once done uploading) + pub size: Option<u64>, + } + + impl garage_util::migrate::InitialFormat for MultipartUpload { + const VERSION_MARKER: &'static [u8] = b"G09s3mpu"; + } +} + +pub use v09::*; + +impl Ord for MpuPartKey { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.part_number + .cmp(&other.part_number) + .then(self.timestamp.cmp(&other.timestamp)) + } +} + +impl PartialOrd for MpuPartKey { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + Some(self.cmp(other)) + } +} + +impl MultipartUpload { + pub fn new( + upload_id: Uuid, + timestamp: u64, + bucket_id: Uuid, + key: String, + deleted: bool, + ) -> Self { + Self { + upload_id, + timestamp, + deleted: crdt::Bool::new(deleted), + parts: crdt::Map::new(), + bucket_id, + key, + } + } + + pub fn next_timestamp(&self, part_number: u64) -> u64 { + std::cmp::max( + now_msec(), + 1 + self + .parts + .items() + .iter() + .filter(|(x, _)| x.part_number == part_number) + .map(|(x, _)| x.timestamp) + .max() + .unwrap_or(0), + ) + } +} + +impl Entry<Uuid, EmptyKey> for MultipartUpload { + fn partition_key(&self) -> &Uuid { + &self.upload_id + } + fn sort_key(&self) -> &EmptyKey { + &EmptyKey + } + fn is_tombstone(&self) -> bool { + self.deleted.get() + } +} + +impl Crdt for MultipartUpload { + fn merge(&mut self, other: &Self) { + self.deleted.merge(&other.deleted); + + if self.deleted.get() { + self.parts.clear(); + } else { + self.parts.merge(&other.parts); + } + } +} + +impl Crdt for MpuPart { + fn merge(&mut self, other: &Self) { + self.etag = match (self.etag.take(), &other.etag) { + (None, Some(_)) => other.etag.clone(), + (Some(x), Some(y)) if x < *y => other.etag.clone(), + (x, _) => x, + }; + self.size = match (self.size, other.size) { + (None, Some(_)) => other.size, + (Some(x), Some(y)) if x < y => other.size, + (x, _) => x, + }; + } +} + +pub struct MultipartUploadTable { + pub version_table: Arc<Table<VersionTable, TableShardedReplication>>, + pub mpu_counter_table: Arc<IndexCounter<MultipartUpload>>, +} + +impl TableSchema for MultipartUploadTable { + const TABLE_NAME: &'static str = "multipart_upload"; + + type P = Uuid; + type S = EmptyKey; + type E = MultipartUpload; + type Filter = DeletedFilter; + + fn updated( + &self, + tx: &mut db::Transaction, + old: Option<&Self::E>, + new: Option<&Self::E>, + ) -> db::TxOpResult<()> { + // 1. Count + let counter_res = self.mpu_counter_table.count(tx, old, new); + if let Err(e) = db::unabort(counter_res)? { + error!( + "Unable to update multipart object part counter: {}. Index values will be wrong!", + e + ); + } + + // 2. Propagate deletions to version table + if let (Some(old_mpu), Some(new_mpu)) = (old, new) { + if new_mpu.deleted.get() && !old_mpu.deleted.get() { + let deleted_versions = old_mpu.parts.items().iter().map(|(_k, p)| { + Version::new( + p.version, + VersionBacklink::MultipartUpload { + upload_id: old_mpu.upload_id, + }, + true, + ) + }); + for version in deleted_versions { + let res = self.version_table.queue_insert(tx, &version); + if let Err(e) = db::unabort(res)? { + error!("Unable to enqueue version deletion propagation: {}. A repair will be needed.", e); + } + } + } + } + + Ok(()) + } + + fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { + filter.apply(entry.is_tombstone()) + } +} + +impl CountedItem for MultipartUpload { + const COUNTER_TABLE_NAME: &'static str = "bucket_mpu_counter"; + + // Partition key = bucket id + type CP = Uuid; + // Sort key = nothing + type CS = EmptyKey; + + fn counter_partition_key(&self) -> &Uuid { + &self.bucket_id + } + fn counter_sort_key(&self) -> &EmptyKey { + &EmptyKey + } + + fn counts(&self) -> Vec<(&'static str, i64)> { + let uploads = if self.deleted.get() { 0 } else { 1 }; + let mut parts = self + .parts + .items() + .iter() + .map(|(k, _)| k.part_number) + .collect::<Vec<_>>(); + parts.dedup(); + let bytes = self + .parts + .items() + .iter() + .map(|(_, p)| p.size.unwrap_or(0)) + .sum::<u64>(); + vec![ + (UPLOADS, uploads), + (PARTS, parts.len() as i64), + (BYTES, bytes as i64), + ] + } +} diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index 518acc95..ebea04bd 100644 --- a/src/model/s3/object_table.rs +++ b/src/model/s3/object_table.rs @@ -10,6 +10,7 @@ use garage_table::replication::TableShardedReplication; use garage_table::*; use crate::index_counter::*; +use crate::s3::mpu_table::*; use crate::s3::version_table::*; pub const OBJECTS: &str = "objects"; @@ -130,7 +131,86 @@ mod v08 { } } -pub use v08::*; +mod v09 { + use garage_util::data::Uuid; + use serde::{Deserialize, Serialize}; + + use super::v08; + + pub use v08::{ObjectVersionData, ObjectVersionHeaders, ObjectVersionMeta}; + + /// An object + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct Object { + /// The bucket in which the object is stored, used as partition key + pub bucket_id: Uuid, + + /// The key at which the object is stored in its bucket, used as sorting key + pub key: String, + + /// The list of currenty stored versions of the object + pub(super) versions: Vec<ObjectVersion>, + } + + /// Informations about a version of an object + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct ObjectVersion { + /// Id of the version + pub uuid: Uuid, + /// Timestamp of when the object was created + pub timestamp: u64, + /// State of the version + pub state: ObjectVersionState, + } + + /// State of an object version + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub enum ObjectVersionState { + /// The version is being received + Uploading { + /// Indicates whether this is a multipart upload + multipart: bool, + /// Headers to be included in the final object + headers: ObjectVersionHeaders, + }, + /// The version is fully received + Complete(ObjectVersionData), + /// The version uploaded containded errors or the upload was explicitly aborted + Aborted, + } + + impl garage_util::migrate::Migrate for Object { + const VERSION_MARKER: &'static [u8] = b"G09s3o"; + + type Previous = v08::Object; + + fn migrate(old: v08::Object) -> Object { + let versions = old + .versions + .into_iter() + .map(|x| ObjectVersion { + uuid: x.uuid, + timestamp: x.timestamp, + state: match x.state { + v08::ObjectVersionState::Uploading(h) => ObjectVersionState::Uploading { + multipart: false, + headers: h, + }, + v08::ObjectVersionState::Complete(d) => ObjectVersionState::Complete(d), + v08::ObjectVersionState::Aborted => ObjectVersionState::Aborted, + }, + }) + .collect(); + Object { + bucket_id: old.bucket_id, + key: old.key, + versions, + } + } + } +} + +pub use v09::*; impl Object { /// Initialize an Object struct from parts @@ -180,11 +260,11 @@ impl Crdt for ObjectVersionState { Complete(a) => { a.merge(b); } - Uploading(_) => { + Uploading { .. } => { *self = Complete(b.clone()); } }, - Uploading(_) => {} + Uploading { .. } => {} } } } @@ -199,8 +279,17 @@ impl ObjectVersion { } /// Is the object version currently being uploaded - pub fn is_uploading(&self) -> bool { - matches!(self.state, ObjectVersionState::Uploading(_)) + /// + /// matches only multipart uploads if check_multipart is Some(true) + /// matches only non-multipart uploads if check_multipart is Some(false) + /// matches both if check_multipart is None + pub fn is_uploading(&self, check_multipart: Option<bool>) -> bool { + match &self.state { + ObjectVersionState::Uploading { multipart, .. } => { + check_multipart.map(|x| x == *multipart).unwrap_or(true) + } + _ => false, + } } /// Is the object version completely received @@ -267,13 +356,20 @@ impl Crdt for Object { pub struct ObjectTable { pub version_table: Arc<Table<VersionTable, TableShardedReplication>>, + pub mpu_table: Arc<Table<MultipartUploadTable, TableShardedReplication>>, pub object_counter_table: Arc<IndexCounter<Object>>, } #[derive(Clone, Copy, Debug, Serialize, Deserialize)] pub enum ObjectFilter { + /// Is the object version available (received and not a tombstone) IsData, - IsUploading, + /// Is the object version currently being uploaded + /// + /// matches only multipart uploads if check_multipart is Some(true) + /// matches only non-multipart uploads if check_multipart is Some(false) + /// matches both if check_multipart is None + IsUploading { check_multipart: Option<bool> }, } impl TableSchema for ObjectTable { @@ -301,21 +397,28 @@ impl TableSchema for ObjectTable { // 2. Enqueue propagation deletions to version table if let (Some(old_v), Some(new_v)) = (old, new) { - // Propagate deletion of old versions for v in old_v.versions.iter() { - let newly_deleted = match new_v + let new_v_id = new_v .versions - .binary_search_by(|nv| nv.cmp_key().cmp(&v.cmp_key())) - { + .binary_search_by(|nv| nv.cmp_key().cmp(&v.cmp_key())); + + // Propagate deletion of old versions to the Version table + let delete_version = match new_v_id { Err(_) => true, Ok(i) => { new_v.versions[i].state == ObjectVersionState::Aborted && v.state != ObjectVersionState::Aborted } }; - if newly_deleted { - let deleted_version = - Version::new(v.uuid, old_v.bucket_id, old_v.key.clone(), true); + if delete_version { + let deleted_version = Version::new( + v.uuid, + VersionBacklink::Object { + bucket_id: old_v.bucket_id, + key: old_v.key.clone(), + }, + true, + ); let res = self.version_table.queue_insert(tx, &deleted_version); if let Err(e) = db::unabort(res)? { error!( @@ -324,6 +427,39 @@ impl TableSchema for ObjectTable { ); } } + + // After abortion or completion of multipart uploads, delete MPU table entry + if matches!( + v.state, + ObjectVersionState::Uploading { + multipart: true, + .. + } + ) { + let delete_mpu = match new_v_id { + Err(_) => true, + Ok(i) => !matches!( + new_v.versions[i].state, + ObjectVersionState::Uploading { .. } + ), + }; + if delete_mpu { + let deleted_mpu = MultipartUpload::new( + v.uuid, + v.timestamp, + old_v.bucket_id, + old_v.key.clone(), + true, + ); + let res = self.mpu_table.queue_insert(tx, &deleted_mpu); + if let Err(e) = db::unabort(res)? { + error!( + "Unable to enqueue multipart upload deletion propagation: {}. A repair will be needed.", + e + ); + } + } + } } } @@ -333,7 +469,10 @@ impl TableSchema for ObjectTable { fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { match filter { ObjectFilter::IsData => entry.versions.iter().any(|v| v.is_data()), - ObjectFilter::IsUploading => entry.versions.iter().any(|v| v.is_uploading()), + ObjectFilter::IsUploading { check_multipart } => entry + .versions + .iter() + .any(|v| v.is_uploading(*check_multipart)), } } } @@ -360,10 +499,7 @@ impl CountedItem for Object { } else { 0 }; - let n_unfinished_uploads = versions - .iter() - .filter(|v| matches!(v.state, ObjectVersionState::Uploading(_))) - .count(); + let n_unfinished_uploads = versions.iter().filter(|v| v.is_uploading(None)).count(); let n_bytes = versions .iter() .map(|v| match &v.state { diff --git a/src/model/s3/version_table.rs b/src/model/s3/version_table.rs index 6edc83f4..5c032f9f 100644 --- a/src/model/s3/version_table.rs +++ b/src/model/s3/version_table.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use garage_db as db; use garage_util::data::*; +use garage_util::error::*; use garage_table::crdt::*; use garage_table::replication::TableShardedReplication; @@ -66,6 +67,8 @@ mod v08 { use super::v05; + pub use v05::{VersionBlock, VersionBlockKey}; + /// A version of an object #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct Version { @@ -90,8 +93,6 @@ mod v08 { pub key: String, } - pub use v05::{VersionBlock, VersionBlockKey}; - impl garage_util::migrate::Migrate for Version { type Previous = v05::Version; @@ -110,32 +111,94 @@ mod v08 { } } -pub use v08::*; +pub(crate) mod v09 { + use garage_util::crdt; + use garage_util::data::Uuid; + use serde::{Deserialize, Serialize}; + + use super::v08; + + pub use v08::{VersionBlock, VersionBlockKey}; + + /// A version of an object + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct Version { + /// UUID of the version, used as partition key + pub uuid: Uuid, + + // Actual data: the blocks for this version + // In the case of a multipart upload, also store the etags + // of individual parts and check them when doing CompleteMultipartUpload + /// Is this version deleted + pub deleted: crdt::Bool, + /// list of blocks of data composing the version + pub blocks: crdt::Map<VersionBlockKey, VersionBlock>, + + // Back link to owner of this version (either an object or a multipart + // upload), used to find whether it has been deleted and this version + // should in turn be deleted (see versions repair procedure) + pub backlink: VersionBacklink, + } + + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub enum VersionBacklink { + Object { + /// Bucket in which the related object is stored + bucket_id: Uuid, + /// Key in which the related object is stored + key: String, + }, + MultipartUpload { + upload_id: Uuid, + }, + } + + impl garage_util::migrate::Migrate for Version { + const VERSION_MARKER: &'static [u8] = b"G09s3v"; + + type Previous = v08::Version; + + fn migrate(old: v08::Version) -> Version { + Version { + uuid: old.uuid, + deleted: old.deleted, + blocks: old.blocks, + backlink: VersionBacklink::Object { + bucket_id: old.bucket_id, + key: old.key, + }, + } + } + } +} + +pub use v09::*; impl Version { - pub fn new(uuid: Uuid, bucket_id: Uuid, key: String, deleted: bool) -> Self { + pub fn new(uuid: Uuid, backlink: VersionBacklink, deleted: bool) -> Self { Self { uuid, deleted: deleted.into(), blocks: crdt::Map::new(), - parts_etags: crdt::Map::new(), - bucket_id, - key, + backlink, } } pub fn has_part_number(&self, part_number: u64) -> bool { - let case1 = self - .parts_etags + self.blocks .items() - .binary_search_by(|(k, _)| k.cmp(&part_number)) - .is_ok(); - let case2 = self + .binary_search_by(|(k, _)| k.part_number.cmp(&part_number)) + .is_ok() + } + + pub fn n_parts(&self) -> Result<u64, Error> { + Ok(self .blocks .items() - .binary_search_by(|(k, _)| k.part_number.cmp(&part_number)) - .is_ok(); - case1 || case2 + .last() + .ok_or_message("version has no parts")? + .0 + .part_number) } } @@ -175,10 +238,8 @@ impl Crdt for Version { if self.deleted.get() { self.blocks.clear(); - self.parts_etags.clear(); } else { self.blocks.merge(&other.blocks); - self.parts_etags.merge(&other.parts_etags); } } } diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index f0fde7a7..69e81f38 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -18,10 +18,12 @@ garage_util.workspace = true arc-swap = "1.0" bytes = "1.0" +bytesize = "1.1" gethostname = "0.2" hex = "0.4" tracing = "0.1" rand = "0.8" +itertools="0.10" sodiumoxide = { version = "0.2.5-0", package = "kuska-sodiumoxide" } systemstat = "0.2.3" @@ -45,7 +47,7 @@ tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi tokio-stream = { version = "0.1", features = ["net"] } opentelemetry = "0.17" -netapp = { version = "0.5.2", features = ["telemetry"] } +netapp = { version = "0.5.3", features = ["telemetry"] } [features] kubernetes-discovery = [ "kube", "k8s-openapi", "schemars" ] diff --git a/src/rpc/graph_algo.rs b/src/rpc/graph_algo.rs new file mode 100644 index 00000000..65450d64 --- /dev/null +++ b/src/rpc/graph_algo.rs @@ -0,0 +1,411 @@ +//! This module deals with graph algorithms. +//! It is used in layout.rs to build the partition to node assignment. + +use rand::prelude::SliceRandom; +use std::cmp::{max, min}; +use std::collections::HashMap; +use std::collections::VecDeque; + +/// Vertex data structures used in all the graphs used in layout.rs. +/// usize parameters correspond to node/zone/partitions ids. +/// To understand the vertex roles below, please refer to the formal description +/// of the layout computation algorithm. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum Vertex { + Source, + Pup(usize), // The vertex p+ of partition p + Pdown(usize), // The vertex p- of partition p + PZ(usize, usize), // The vertex corresponding to x_(partition p, zone z) + N(usize), // The vertex corresponding to node n + Sink, +} + +/// Edge data structure for the flow algorithm. +#[derive(Clone, Copy, Debug)] +pub struct FlowEdge { + cap: u64, // flow maximal capacity of the edge + flow: i64, // flow value on the edge + dest: usize, // destination vertex id + rev: usize, // index of the reversed edge (v, self) in the edge list of vertex v +} + +/// Edge data structure for the detection of negative cycles. +#[derive(Clone, Copy, Debug)] +pub struct WeightedEdge { + w: i64, // weight of the edge + dest: usize, +} + +pub trait Edge: Clone + Copy {} +impl Edge for FlowEdge {} +impl Edge for WeightedEdge {} + +/// Struct for the graph structure. We do encapsulation here to be able to both +/// provide user friendly Vertex enum to address vertices, and to use internally usize +/// indices and Vec instead of HashMap in the graph algorithm to optimize execution speed. +pub struct Graph<E: Edge> { + vertex_to_id: HashMap<Vertex, usize>, + id_to_vertex: Vec<Vertex>, + + // The graph is stored as an adjacency list + graph: Vec<Vec<E>>, +} + +pub type CostFunction = HashMap<(Vertex, Vertex), i64>; + +impl<E: Edge> Graph<E> { + pub fn new(vertices: &[Vertex]) -> Self { + let mut map = HashMap::<Vertex, usize>::new(); + for (i, vert) in vertices.iter().enumerate() { + map.insert(*vert, i); + } + Graph::<E> { + vertex_to_id: map, + id_to_vertex: vertices.to_vec(), + graph: vec![Vec::<E>::new(); vertices.len()], + } + } + + fn get_vertex_id(&self, v: &Vertex) -> Result<usize, String> { + self.vertex_to_id + .get(v) + .cloned() + .ok_or_else(|| format!("The graph does not contain vertex {:?}", v)) + } +} + +impl Graph<FlowEdge> { + /// This function adds a directed edge to the graph with capacity c, and the + /// corresponding reversed edge with capacity 0. + pub fn add_edge(&mut self, u: Vertex, v: Vertex, c: u64) -> Result<(), String> { + let idu = self.get_vertex_id(&u)?; + let idv = self.get_vertex_id(&v)?; + if idu == idv { + return Err("Cannot add edge from vertex to itself in flow graph".into()); + } + + let rev_u = self.graph[idu].len(); + let rev_v = self.graph[idv].len(); + self.graph[idu].push(FlowEdge { + cap: c, + dest: idv, + flow: 0, + rev: rev_v, + }); + self.graph[idv].push(FlowEdge { + cap: 0, + dest: idu, + flow: 0, + rev: rev_u, + }); + Ok(()) + } + + /// This function returns the list of vertices that receive a positive flow from + /// vertex v. + pub fn get_positive_flow_from(&self, v: Vertex) -> Result<Vec<Vertex>, String> { + let idv = self.get_vertex_id(&v)?; + let mut result = Vec::<Vertex>::new(); + for edge in self.graph[idv].iter() { + if edge.flow > 0 { + result.push(self.id_to_vertex[edge.dest]); + } + } + Ok(result) + } + + /// This function returns the value of the flow incoming to v. + pub fn get_inflow(&self, v: Vertex) -> Result<i64, String> { + let idv = self.get_vertex_id(&v)?; + let mut result = 0; + for edge in self.graph[idv].iter() { + result += max(0, self.graph[edge.dest][edge.rev].flow); + } + Ok(result) + } + + /// This function returns the value of the flow outgoing from v. + pub fn get_outflow(&self, v: Vertex) -> Result<i64, String> { + let idv = self.get_vertex_id(&v)?; + let mut result = 0; + for edge in self.graph[idv].iter() { + result += max(0, edge.flow); + } + Ok(result) + } + + /// This function computes the flow total value by computing the outgoing flow + /// from the source. + pub fn get_flow_value(&mut self) -> Result<i64, String> { + self.get_outflow(Vertex::Source) + } + + /// This function shuffles the order of the edge lists. It keeps the ids of the + /// reversed edges consistent. + fn shuffle_edges(&mut self) { + let mut rng = rand::thread_rng(); + for i in 0..self.graph.len() { + self.graph[i].shuffle(&mut rng); + // We need to update the ids of the reverse edges. + for j in 0..self.graph[i].len() { + let target_v = self.graph[i][j].dest; + let target_rev = self.graph[i][j].rev; + self.graph[target_v][target_rev].rev = j; + } + } + } + + /// Computes an upper bound of the flow on the graph + pub fn flow_upper_bound(&self) -> Result<u64, String> { + let idsource = self.get_vertex_id(&Vertex::Source)?; + let mut flow_upper_bound = 0; + for edge in self.graph[idsource].iter() { + flow_upper_bound += edge.cap; + } + Ok(flow_upper_bound) + } + + /// This function computes the maximal flow using Dinic's algorithm. It starts with + /// the flow values already present in the graph. So it is possible to add some edge to + /// the graph, compute a flow, add other edges, update the flow. + pub fn compute_maximal_flow(&mut self) -> Result<(), String> { + let idsource = self.get_vertex_id(&Vertex::Source)?; + let idsink = self.get_vertex_id(&Vertex::Sink)?; + + let nb_vertices = self.graph.len(); + + let flow_upper_bound = self.flow_upper_bound()?; + + // To ensure the dispersion of the associations generated by the + // assignment, we shuffle the neighbours of the nodes. Hence, + // the vertices do not consider their neighbours in the same order. + self.shuffle_edges(); + + // We run Dinic's max flow algorithm + loop { + // We build the level array from Dinic's algorithm. + let mut level = vec![None; nb_vertices]; + + let mut fifo = VecDeque::new(); + fifo.push_back((idsource, 0)); + while let Some((id, lvl)) = fifo.pop_front() { + if level[id] == None { + // it means id has not yet been reached + level[id] = Some(lvl); + for edge in self.graph[id].iter() { + if edge.cap as i64 - edge.flow > 0 { + fifo.push_back((edge.dest, lvl + 1)); + } + } + } + } + if level[idsink] == None { + // There is no residual flow + break; + } + // Now we run DFS respecting the level array + let mut next_nbd = vec![0; nb_vertices]; + let mut lifo = Vec::new(); + + lifo.push((idsource, flow_upper_bound)); + + while let Some((id, f)) = lifo.last().cloned() { + if id == idsink { + // The DFS reached the sink, we can add a + // residual flow. + lifo.pop(); + while let Some((id, _)) = lifo.pop() { + let nbd = next_nbd[id]; + self.graph[id][nbd].flow += f as i64; + let id_rev = self.graph[id][nbd].dest; + let nbd_rev = self.graph[id][nbd].rev; + self.graph[id_rev][nbd_rev].flow -= f as i64; + } + lifo.push((idsource, flow_upper_bound)); + continue; + } + // else we did not reach the sink + let nbd = next_nbd[id]; + if nbd >= self.graph[id].len() { + // There is nothing to explore from id anymore + lifo.pop(); + if let Some((parent, _)) = lifo.last() { + next_nbd[*parent] += 1; + } + continue; + } + // else we can try to send flow from id to its nbd + let new_flow = min( + f as i64, + self.graph[id][nbd].cap as i64 - self.graph[id][nbd].flow, + ) as u64; + if new_flow == 0 { + next_nbd[id] += 1; + continue; + } + if let (Some(lvldest), Some(lvlid)) = (level[self.graph[id][nbd].dest], level[id]) { + if lvldest <= lvlid { + // We cannot send flow to nbd. + next_nbd[id] += 1; + continue; + } + } + // otherwise, we send flow to nbd. + lifo.push((self.graph[id][nbd].dest, new_flow)); + } + } + Ok(()) + } + + /// This function takes a flow, and a cost function on the edges, and tries to find an + /// equivalent flow with a better cost, by finding improving overflow cycles. It uses + /// as subroutine the Bellman Ford algorithm run up to path_length. + /// We assume that the cost of edge (u,v) is the opposite of the cost of (v,u), and + /// only one needs to be present in the cost function. + pub fn optimize_flow_with_cost( + &mut self, + cost: &CostFunction, + path_length: usize, + ) -> Result<(), String> { + // We build the weighted graph g where we will look for negative cycle + let mut gf = self.build_cost_graph(cost)?; + let mut cycles = gf.list_negative_cycles(path_length); + while !cycles.is_empty() { + // we enumerate negative cycles + for c in cycles.iter() { + for i in 0..c.len() { + // We add one flow unit to the edge (u,v) of cycle c + let idu = self.vertex_to_id[&c[i]]; + let idv = self.vertex_to_id[&c[(i + 1) % c.len()]]; + for j in 0..self.graph[idu].len() { + // since idu appears at most once in the cycles, we enumerate every + // edge at most once. + let edge = self.graph[idu][j]; + if edge.dest == idv { + self.graph[idu][j].flow += 1; + self.graph[idv][edge.rev].flow -= 1; + break; + } + } + } + } + + gf = self.build_cost_graph(cost)?; + cycles = gf.list_negative_cycles(path_length); + } + Ok(()) + } + + /// Construct the weighted graph G_f from the flow and the cost function + fn build_cost_graph(&self, cost: &CostFunction) -> Result<Graph<WeightedEdge>, String> { + let mut g = Graph::<WeightedEdge>::new(&self.id_to_vertex); + let nb_vertices = self.id_to_vertex.len(); + for i in 0..nb_vertices { + for edge in self.graph[i].iter() { + if edge.cap as i64 - edge.flow > 0 { + // It is possible to send overflow through this edge + let u = self.id_to_vertex[i]; + let v = self.id_to_vertex[edge.dest]; + if cost.contains_key(&(u, v)) { + g.add_edge(u, v, cost[&(u, v)])?; + } else if cost.contains_key(&(v, u)) { + g.add_edge(u, v, -cost[&(v, u)])?; + } else { + g.add_edge(u, v, 0)?; + } + } + } + } + Ok(g) + } +} + +impl Graph<WeightedEdge> { + /// This function adds a single directed weighted edge to the graph. + pub fn add_edge(&mut self, u: Vertex, v: Vertex, w: i64) -> Result<(), String> { + let idu = self.get_vertex_id(&u)?; + let idv = self.get_vertex_id(&v)?; + self.graph[idu].push(WeightedEdge { w, dest: idv }); + Ok(()) + } + + /// This function lists the negative cycles it manages to find after path_length + /// iterations of the main loop of the Bellman-Ford algorithm. For the classical + /// algorithm, path_length needs to be equal to the number of vertices. However, + /// for particular graph structures like in our case, the algorithm is still correct + /// when path_length is the length of the longest possible simple path. + /// See the formal description of the algorithm for more details. + fn list_negative_cycles(&self, path_length: usize) -> Vec<Vec<Vertex>> { + let nb_vertices = self.graph.len(); + + // We start with every vertex at distance 0 of some imaginary extra -1 vertex. + let mut distance = vec![0; nb_vertices]; + // The prev vector collects for every vertex from where does the shortest path come + let mut prev = vec![None; nb_vertices]; + + for _ in 0..path_length + 1 { + for id in 0..nb_vertices { + for e in self.graph[id].iter() { + if distance[id] + e.w < distance[e.dest] { + distance[e.dest] = distance[id] + e.w; + prev[e.dest] = Some(id); + } + } + } + } + + // If self.graph contains a negative cycle, then at this point the graph described + // by prev (which is a directed 1-forest/functional graph) + // must contain a cycle. We list the cycles of prev. + let cycles_prev = cycles_of_1_forest(&prev); + + // Remark that the cycle in prev is in the reverse order compared to the cycle + // in the graph. Thus the .rev(). + return cycles_prev + .iter() + .map(|cycle| { + cycle + .iter() + .rev() + .map(|id| self.id_to_vertex[*id]) + .collect() + }) + .collect(); + } +} + +/// This function returns the list of cycles of a directed 1 forest. It does not +/// check for the consistency of the input. +fn cycles_of_1_forest(forest: &[Option<usize>]) -> Vec<Vec<usize>> { + let mut cycles = Vec::<Vec<usize>>::new(); + let mut time_of_discovery = vec![None; forest.len()]; + + for t in 0..forest.len() { + let mut id = t; + // while we are on a valid undiscovered node + while time_of_discovery[id] == None { + time_of_discovery[id] = Some(t); + if let Some(i) = forest[id] { + id = i; + } else { + break; + } + } + if forest[id] != None && time_of_discovery[id] == Some(t) { + // We discovered an id that we explored at this iteration t. + // It means we are on a cycle + let mut cy = vec![id; 1]; + let mut id2 = id; + while let Some(id_next) = forest[id2] { + id2 = id_next; + if id2 != id { + cy.push(id2); + } else { + break; + } + } + cycles.push(cy); + } + } + cycles +} diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index 1030e3a6..c2655e59 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -1,87 +1,260 @@ use std::cmp::Ordering; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; +use std::collections::HashSet; -use serde::{Deserialize, Serialize}; +use bytesize::ByteSize; +use itertools::Itertools; -use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; +use garage_util::crdt::{AutoCrdt, Crdt, Lww, LwwMap}; use garage_util::data::*; use garage_util::encode::nonversioned_encode; use garage_util::error::*; +use crate::graph_algo::*; + use crate::ring::*; -/// The layout of the cluster, i.e. the list of roles -/// which are assigned to each cluster node -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct ClusterLayout { - pub version: u64, - - pub replication_factor: usize, - pub roles: LwwMap<Uuid, NodeRoleV>, - - /// node_id_vec: a vector of node IDs with a role assigned - /// in the system (this includes gateway nodes). - /// The order here is different than the vec stored by `roles`, because: - /// 1. non-gateway nodes are first so that they have lower numbers - /// 2. nodes that don't have a role are excluded (but they need to - /// stay in the CRDT as tombstones) - pub node_id_vec: Vec<Uuid>, - /// the assignation of data partitions to node, the values - /// are indices in node_id_vec - #[serde(with = "serde_bytes")] - pub ring_assignation_data: Vec<CompactNodeType>, - - /// Role changes which are staged for the next version of the layout - pub staging: LwwMap<Uuid, NodeRoleV>, - pub staging_hash: Hash, +use std::convert::TryInto; + +const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; + +// The Message type will be used to collect information on the algorithm. +type Message = Vec<String>; + +mod v08 { + use crate::ring::CompactNodeType; + use garage_util::crdt::LwwMap; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + + /// The layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct ClusterLayout { + pub version: u64, + + pub replication_factor: usize, + pub roles: LwwMap<Uuid, NodeRoleV>, + + /// node_id_vec: a vector of node IDs with a role assigned + /// in the system (this includes gateway nodes). + /// The order here is different than the vec stored by `roles`, because: + /// 1. non-gateway nodes are first so that they have lower numbers + /// 2. nodes that don't have a role are excluded (but they need to + /// stay in the CRDT as tombstones) + pub node_id_vec: Vec<Uuid>, + /// the assignation of data partitions to node, the values + /// are indices in node_id_vec + #[serde(with = "serde_bytes")] + pub ring_assignation_data: Vec<CompactNodeType>, + + /// Role changes which are staged for the next version of the layout + pub staging: LwwMap<Uuid, NodeRoleV>, + pub staging_hash: Hash, + } + + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct NodeRoleV(pub Option<NodeRole>); + + /// The user-assigned roles of cluster nodes + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct NodeRole { + /// Datacenter at which this entry belong. This information is used to + /// perform a better geodistribution + pub zone: String, + /// The capacity of the node + /// If this is set to None, the node does not participate in storing data for the system + /// and is only active as an API gateway to other nodes + pub capacity: Option<u64>, + /// A set of tags to recognize the node + pub tags: Vec<String>, + } + + impl garage_util::migrate::InitialFormat for ClusterLayout {} } -impl garage_util::migrate::InitialFormat for ClusterLayout {} +mod v09 { + use super::v08; + use crate::ring::CompactNodeType; + use garage_util::crdt::{Lww, LwwMap}; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + pub use v08::{NodeRole, NodeRoleV}; + + /// The layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct ClusterLayout { + pub version: u64, + + pub replication_factor: usize, + + /// This attribute is only used to retain the previously computed partition size, + /// to know to what extent does it change with the layout update. + pub partition_size: u64, + /// Parameters used to compute the assignment currently given by + /// ring_assignment_data + pub parameters: LayoutParameters, + + pub roles: LwwMap<Uuid, NodeRoleV>, + + /// see comment in v08::ClusterLayout + pub node_id_vec: Vec<Uuid>, + /// see comment in v08::ClusterLayout + #[serde(with = "serde_bytes")] + pub ring_assignment_data: Vec<CompactNodeType>, + + /// Parameters to be used in the next partition assignment computation. + pub staging_parameters: Lww<LayoutParameters>, + /// Role changes which are staged for the next version of the layout + pub staging_roles: LwwMap<Uuid, NodeRoleV>, + pub staging_hash: Hash, + } -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] -pub struct NodeRoleV(pub Option<NodeRole>); + /// This struct is used to set the parameters to be used in the assignment computation + /// algorithm. It is stored as a Crdt. + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub struct LayoutParameters { + pub zone_redundancy: usize, + } -impl AutoCrdt for NodeRoleV { + impl garage_util::migrate::Migrate for ClusterLayout { + const VERSION_MARKER: &'static [u8] = b"G09layout"; + + type Previous = v08::ClusterLayout; + + fn migrate(previous: Self::Previous) -> Self { + use itertools::Itertools; + use std::collections::HashSet; + + // In the old layout, capacities are in an arbitrary unit, + // but in the new layout they are in bytes. + // Here we arbitrarily multiply everything by 1G, + // such that 1 old capacity unit = 1GB in the new units. + // This is totally arbitrary and won't work for most users. + let cap_mul = 1024 * 1024 * 1024; + let roles = multiply_all_capacities(previous.roles, cap_mul); + let staging_roles = multiply_all_capacities(previous.staging, cap_mul); + let node_id_vec = previous.node_id_vec; + + // Determine partition size + let mut tmp = previous.ring_assignation_data.clone(); + tmp.sort(); + let partition_size = tmp + .into_iter() + .dedup_with_count() + .map(|(npart, node)| { + roles + .get(&node_id_vec[node as usize]) + .and_then(|p| p.0.as_ref().and_then(|r| r.capacity)) + .unwrap_or(0) / npart as u64 + }) + .min() + .unwrap_or(0); + + // Determine zone redundancy parameter + let zone_redundancy = std::cmp::min( + previous.replication_factor, + roles + .items() + .iter() + .filter_map(|(_, _, r)| r.0.as_ref().map(|p| p.zone.as_str())) + .collect::<HashSet<&str>>() + .len(), + ); + let parameters = LayoutParameters { zone_redundancy }; + + let mut res = Self { + version: previous.version, + replication_factor: previous.replication_factor, + partition_size, + parameters, + roles, + node_id_vec, + ring_assignment_data: previous.ring_assignation_data, + staging_parameters: Lww::new(parameters), + staging_roles, + staging_hash: [0u8; 32].into(), + }; + res.staging_hash = res.calculate_staging_hash(); + res + } + } + + fn multiply_all_capacities( + old_roles: LwwMap<Uuid, NodeRoleV>, + mul: u64, + ) -> LwwMap<Uuid, NodeRoleV> { + let mut new_roles = LwwMap::new(); + for (node, ts, role) in old_roles.items() { + let mut role = role.clone(); + if let NodeRoleV(Some(NodeRole { + capacity: Some(ref mut cap), + .. + })) = role + { + *cap = *cap * mul; + } + new_roles.merge_raw(node, *ts, &role); + } + new_roles + } +} + +pub use v09::*; + +impl AutoCrdt for LayoutParameters { const WARN_IF_DIFFERENT: bool = true; } -/// The user-assigned roles of cluster nodes -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] -pub struct NodeRole { - /// Datacenter at which this entry belong. This information might be used to perform a better - /// geodistribution - pub zone: String, - /// The (relative) capacity of the node - /// If this is set to None, the node does not participate in storing data for the system - /// and is only active as an API gateway to other nodes - pub capacity: Option<u32>, - /// A set of tags to recognize the node - pub tags: Vec<String>, +impl AutoCrdt for NodeRoleV { + const WARN_IF_DIFFERENT: bool = true; } impl NodeRole { pub fn capacity_string(&self) -> String { match self.capacity { - Some(c) => format!("{}", c), + Some(c) => ByteSize::b(c).to_string_as(false), None => "gateway".to_string(), } } + + pub fn tags_string(&self) -> String { + self.tags.join(",") + } } +// Implementation of the ClusterLayout methods unrelated to the assignment algorithm. impl ClusterLayout { pub fn new(replication_factor: usize) -> Self { + // We set the default zone redundancy to be equal to the replication factor, + // i.e. as strict as possible. + let parameters = LayoutParameters { + zone_redundancy: replication_factor, + }; + let staging_parameters = Lww::<LayoutParameters>::new(parameters.clone()); + let empty_lwwmap = LwwMap::new(); - let empty_lwwmap_hash = blake2sum(&nonversioned_encode(&empty_lwwmap).unwrap()[..]); - ClusterLayout { + let mut ret = ClusterLayout { version: 0, replication_factor, + partition_size: 0, roles: LwwMap::new(), node_id_vec: Vec::new(), - ring_assignation_data: Vec::new(), - staging: empty_lwwmap, - staging_hash: empty_lwwmap_hash, - } + ring_assignment_data: Vec::new(), + parameters, + staging_parameters, + staging_roles: empty_lwwmap, + staging_hash: [0u8; 32].into(), + }; + ret.staging_hash = ret.calculate_staging_hash(); + ret + } + + fn calculate_staging_hash(&self) -> Hash { + let hashed_tuple = (&self.staging_roles, &self.staging_parameters); + blake2sum(&nonversioned_encode(&hashed_tuple).unwrap()[..]) } pub fn merge(&mut self, other: &ClusterLayout) -> bool { @@ -91,9 +264,10 @@ impl ClusterLayout { true } Ordering::Equal => { - self.staging.merge(&other.staging); + self.staging_parameters.merge(&other.staging_parameters); + self.staging_roles.merge(&other.staging_roles); - let new_staging_hash = blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]); + let new_staging_hash = self.calculate_staging_hash(); let changed = new_staging_hash != self.staging_hash; self.staging_hash = new_staging_hash; @@ -104,7 +278,7 @@ impl ClusterLayout { } } - pub fn apply_staged_changes(mut self, version: Option<u64>) -> Result<Self, Error> { + pub fn apply_staged_changes(mut self, version: Option<u64>) -> Result<(Self, Message), Error> { match version { None => { let error = r#" @@ -120,19 +294,18 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - self.roles.merge(&self.staging); + self.roles.merge(&self.staging_roles); self.roles.retain(|(_, _, v)| v.0.is_some()); + self.parameters = self.staging_parameters.get().clone(); - if !self.calculate_partition_assignation() { - return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into())); - } + self.staging_roles.clear(); + self.staging_hash = self.calculate_staging_hash(); - self.staging.clear(); - self.staging_hash = blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]); + let msg = self.calculate_partition_assignment()?; self.version += 1; - Ok(self) + Ok((self, msg)) } pub fn revert_staged_changes(mut self, version: Option<u64>) -> Result<Self, Error> { @@ -151,8 +324,9 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - self.staging.clear(); - self.staging_hash = blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]); + self.staging_roles.clear(); + self.staging_parameters.update(self.parameters.clone()); + self.staging_hash = self.calculate_staging_hash(); self.version += 1; @@ -177,13 +351,81 @@ To know the correct value of the new layout version, invoke `garage layout show` } } + /// Returns the uuids of the non_gateway nodes in self.node_id_vec. + fn nongateway_nodes(&self) -> Vec<Uuid> { + let mut result = Vec::<Uuid>::new(); + for uuid in self.node_id_vec.iter() { + match self.node_role(uuid) { + Some(role) if role.capacity != None => result.push(*uuid), + _ => (), + } + } + result + } + + /// Given a node uuids, this function returns the label of its zone + fn get_node_zone(&self, uuid: &Uuid) -> Result<String, Error> { + match self.node_role(uuid) { + Some(role) => Ok(role.zone.clone()), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the cluster.".into(), + )), + } + } + + /// Given a node uuids, this function returns its capacity or fails if it does not have any + pub fn get_node_capacity(&self, uuid: &Uuid) -> Result<u64, Error> { + match self.node_role(uuid) { + Some(NodeRole { + capacity: Some(cap), + zone: _, + tags: _, + }) => Ok(*cap), + _ => Err(Error::Message( + "The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity." + .into(), + )), + } + } + + /// Returns the number of partitions associated to this node in the ring + pub fn get_node_usage(&self, uuid: &Uuid) -> Result<usize, Error> { + for (i, id) in self.node_id_vec.iter().enumerate() { + if id == uuid { + let mut count = 0; + for nod in self.ring_assignment_data.iter() { + if i as u8 == *nod { + count += 1 + } + } + return Ok(count); + } + } + Err(Error::Message( + "The Uuid does not correspond to a node present in the \ + cluster or this node does not have a positive capacity." + .into(), + )) + } + + /// Returns the sum of capacities of non gateway nodes in the cluster + fn get_total_capacity(&self) -> Result<u64, Error> { + let mut total_capacity = 0; + for uuid in self.nongateway_nodes().iter() { + total_capacity += self.get_node_capacity(uuid)?; + } + Ok(total_capacity) + } + /// Check a cluster layout for internal consistency + /// (assignment, roles, parameters, partition size) /// returns true if consistent, false if error - pub fn check(&self) -> bool { + pub fn check(&self) -> Result<(), String> { // Check that the hash of the staging data is correct - let staging_hash = blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]); + let staging_hash = self.calculate_staging_hash(); if staging_hash != self.staging_hash { - return false; + return Err("staging_hash is incorrect".into()); } // Check that node_id_vec contains the correct list of nodes @@ -198,472 +440,794 @@ To know the correct value of the new layout version, invoke `garage layout show` let mut node_id_vec = self.node_id_vec.clone(); node_id_vec.sort(); if expected_nodes != node_id_vec { - return false; + return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes)); } - // Check that the assignation data has the correct length - if self.ring_assignation_data.len() != (1 << PARTITION_BITS) * self.replication_factor { - return false; + // Check that the assignment data has the correct length + let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor; + if self.ring_assignment_data.len() != expected_assignment_data_len { + return Err(format!( + "ring_assignment_data has incorrect length {} instead of {}", + self.ring_assignment_data.len(), + expected_assignment_data_len + )); } // Check that the assigned nodes are correct identifiers // of nodes that are assigned a role // and that role is not the role of a gateway nodes - for x in self.ring_assignation_data.iter() { + for x in self.ring_assignment_data.iter() { if *x as usize >= self.node_id_vec.len() { - return false; + return Err(format!( + "ring_assignment_data contains invalid node id {}", + *x + )); } let node = self.node_id_vec[*x as usize]; match self.roles.get(&node) { Some(NodeRoleV(Some(x))) if x.capacity.is_some() => (), - _ => return false, + _ => return Err("ring_assignment_data contains id of a gateway node".into()), } } - true - } + // Check that every partition is associated to distinct nodes + let rf = self.replication_factor; + for p in 0..(1 << PARTITION_BITS) { + let nodes_of_p = self.ring_assignment_data[rf * p..rf * (p + 1)].to_vec(); + if nodes_of_p.iter().unique().count() != rf { + return Err(format!("partition does not contain {} unique node ids", rf)); + } + // Check that every partition is spread over at least zone_redundancy zones. + let zones_of_p = nodes_of_p + .iter() + .map(|n| { + self.get_node_zone(&self.node_id_vec[*n as usize]) + .expect("Zone not found.") + }) + .collect::<Vec<_>>(); + let redundancy = self.parameters.zone_redundancy; + if zones_of_p.iter().unique().count() < redundancy { + return Err(format!( + "nodes of partition are in less than {} distinct zones", + redundancy + )); + } + } - /// Calculate an assignation of partitions to nodes - pub fn calculate_partition_assignation(&mut self) -> bool { - let (configured_nodes, zones) = self.configured_nodes_and_zones(); - let n_zones = zones.len(); + // Check that the nodes capacities is consistent with the stored partitions + let mut node_usage = vec![0; MAX_NODE_NUMBER]; + for n in self.ring_assignment_data.iter() { + node_usage[*n as usize] += 1; + } + for (n, usage) in node_usage.iter().enumerate() { + if *usage > 0 { + let uuid = self.node_id_vec[n]; + let partusage = usage * self.partition_size; + let nodecap = self.get_node_capacity(&uuid).unwrap(); + if partusage > nodecap { + return Err(format!( + "node usage ({}) is bigger than node capacity ({})", + usage * self.partition_size, + nodecap + )); + } + } + } - println!("Calculating updated partition assignation, this may take some time..."); - println!(); + // Check that the partition size stored is the one computed by the asignation + // algorithm. + let cl2 = self.clone(); + let (_, zone_to_id) = cl2.generate_nongateway_zone_ids().unwrap(); + match cl2.compute_optimal_partition_size(&zone_to_id) { + Ok(s) if s != self.partition_size => { + return Err(format!( + "partition_size ({}) is different than optimal value ({})", + self.partition_size, s + )) + } + Err(e) => return Err(format!("could not calculate optimal partition size: {}", e)), + _ => (), + } - // Get old partition assignation - let old_partitions = self.parse_assignation_data(); + Ok(()) + } +} - // Start new partition assignation with nodes from old assignation where it is relevant - let mut partitions = old_partitions - .iter() - .map(|old_part| { - let mut new_part = PartitionAss::new(); - for node in old_part.nodes.iter() { - if let Some(role) = node.1 { - if role.capacity.is_some() { - new_part.add(None, n_zones, node.0, role); - } - } - } - new_part - }) - .collect::<Vec<_>>(); +// Implementation of the ClusterLayout methods related to the assignment algorithm. +impl ClusterLayout { + /// This function calculates a new partition-to-node assignment. + /// The computed assignment respects the node replication factor + /// and the zone redundancy parameter It maximizes the capacity of a + /// partition (assuming all partitions have the same size). + /// Among such optimal assignment, it minimizes the distance to + /// the former assignment (if any) to minimize the amount of + /// data to be moved. + /// Staged role changes must be merged with nodes roles before calling this function, + /// hence it must only be called from apply_staged_changes() and hence is not public. + fn calculate_partition_assignment(&mut self) -> Result<Message, Error> { + // We update the node ids, since the node role list might have changed with the + // changes in the layout. We retrieve the old_assignment reframed with new ids + let old_assignment_opt = self.update_node_id_vec()?; + + let mut msg = Message::new(); + msg.push("==== COMPUTATION OF A NEW PARTITION ASSIGNATION ====".into()); + msg.push("".into()); + msg.push(format!( + "Partitions are \ + replicated {} times on at least {} distinct zones.", + self.replication_factor, self.parameters.zone_redundancy + )); + + // We generate for once numerical ids for the zones of non gateway nodes, + // to use them as indices in the flow graphs. + let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; + + let nb_nongateway_nodes = self.nongateway_nodes().len(); + if nb_nongateway_nodes < self.replication_factor { + return Err(Error::Message(format!( + "The number of nodes with positive \ + capacity ({}) is smaller than the replication factor ({}).", + nb_nongateway_nodes, self.replication_factor + ))); + } + if id_to_zone.len() < self.parameters.zone_redundancy { + return Err(Error::Message(format!( + "The number of zones with non-gateway \ + nodes ({}) is smaller than the redundancy parameter ({})", + id_to_zone.len(), + self.parameters.zone_redundancy + ))); + } - // In various cases, not enough nodes will have been added for all partitions - // in the step above (e.g. due to node removals, or new zones being added). - // Here we add more nodes to make a complete (but sub-optimal) assignation, - // using an initial partition assignation that is calculated using the multi-dc maglev trick - match self.initial_partition_assignation() { - Some(initial_partitions) => { - for (part, ipart) in partitions.iter_mut().zip(initial_partitions.iter()) { - for _ in 0..2 { - for (id, info) in ipart.nodes.iter() { - if part.nodes.len() < self.replication_factor { - part.add(None, n_zones, id, info.unwrap()); - } - } - } - assert!(part.nodes.len() == self.replication_factor); - } - } - None => { - // Not enough nodes in cluster to build a correct assignation. - // Signal it by returning an error. - return false; - } + // We compute the optimal partition size + // Capacities should be given in a unit so that partition size is at least 100. + // In this case, integer rounding plays a marginal role in the percentages of + // optimality. + let partition_size = self.compute_optimal_partition_size(&zone_to_id)?; + + if old_assignment_opt != None { + msg.push(format!( + "Optimal size of a partition: {} (was {} in the previous layout).", + ByteSize::b(partition_size).to_string_as(false), + ByteSize::b(self.partition_size).to_string_as(false) + )); + } else { + msg.push(format!( + "Given the replication and redundancy constraints, the \ + optimal size of a partition is {}.", + ByteSize::b(partition_size).to_string_as(false) + )); + } + // We write the partition size. + self.partition_size = partition_size; + + if partition_size < 100 { + msg.push( + "WARNING: The partition size is low (< 100), make sure the capacities of your nodes are correct and are of at least a few MB" + .into(), + ); } - // Calculate how many partitions each node should ideally store, - // and how many partitions they are storing with the current assignation - // This defines our target for which we will optimize in the following loop. - let total_capacity = configured_nodes - .iter() - .map(|(_, info)| info.capacity.unwrap_or(0)) - .sum::<u32>() as usize; - let total_partitions = self.replication_factor * (1 << PARTITION_BITS); - let target_partitions_per_node = configured_nodes - .iter() - .map(|(id, info)| { - ( - *id, - info.capacity.unwrap_or(0) as usize * total_partitions / total_capacity, - ) - }) - .collect::<HashMap<&Uuid, usize>>(); - - let mut partitions_per_node = self.partitions_per_node(&partitions[..]); - - println!("Target number of partitions per node:"); - for (node, npart) in target_partitions_per_node.iter() { - println!("{:?}\t{}", node, npart); - } - println!(); - - // Shuffle partitions between nodes so that nodes will reach (or better approach) - // their target number of stored partitions - loop { - let mut option = None; - for (i, part) in partitions.iter_mut().enumerate() { - for (irm, (idrm, _)) in part.nodes.iter().enumerate() { - let errratio = |node, parts| { - let tgt = *target_partitions_per_node.get(node).unwrap() as f32; - (parts - tgt) / tgt - }; - let square = |x| x * x; - - let partsrm = partitions_per_node.get(*idrm).cloned().unwrap_or(0) as f32; - - for (idadd, infoadd) in configured_nodes.iter() { - // skip replacing a node by itself - // and skip replacing by gateway nodes - if idadd == idrm || infoadd.capacity.is_none() { - continue; - } + // We compute a first flow/assignment that is heuristically close to the previous + // assignment + let mut gflow = self.compute_candidate_assignment(&zone_to_id, &old_assignment_opt)?; + if let Some(assoc) = &old_assignment_opt { + // We minimize the distance to the previous assignment. + self.minimize_rebalance_load(&mut gflow, &zone_to_id, assoc)?; + } - // We want to try replacing node idrm by node idadd - // if that brings us close to our goal. - let partsadd = partitions_per_node.get(*idadd).cloned().unwrap_or(0) as f32; - let oldcost = square(errratio(*idrm, partsrm) - errratio(*idadd, partsadd)); - let newcost = - square(errratio(*idrm, partsrm - 1.) - errratio(*idadd, partsadd + 1.)); - if newcost >= oldcost { - // not closer to our goal - continue; - } - let gain = oldcost - newcost; + // We display statistics of the computation + msg.extend(self.output_stat(&gflow, &old_assignment_opt, &zone_to_id, &id_to_zone)?); + msg.push("".to_string()); - let mut newpart = part.clone(); + // We update the layout structure + self.update_ring_from_flow(id_to_zone.len(), &gflow)?; - newpart.nodes.remove(irm); - if !newpart.add(None, n_zones, idadd, infoadd) { - continue; - } - assert!(newpart.nodes.len() == self.replication_factor); + if let Err(e) = self.check() { + return Err(Error::Message( + format!("Layout check returned an error: {}\nOriginal result of computation: <<<<\n{}\n>>>>", e, msg.join("\n")) + )); + } - if !old_partitions[i] - .is_valid_transition_to(&newpart, self.replication_factor) - { - continue; - } + Ok(msg) + } - if option - .as_ref() - .map(|(old_gain, _, _, _, _)| gain > *old_gain) - .unwrap_or(true) - { - option = Some((gain, i, idadd, idrm, newpart)); - } - } - } - } - if let Some((_gain, i, idadd, idrm, newpart)) = option { - *partitions_per_node.entry(idadd).or_insert(0) += 1; - *partitions_per_node.get_mut(idrm).unwrap() -= 1; - partitions[i] = newpart; - } else { - break; - } + /// The LwwMap of node roles might have changed. This function updates the node_id_vec + /// and returns the assignment given by ring, with the new indices of the nodes, and + /// None if the node is not present anymore. + /// We work with the assumption that only this function and calculate_new_assignment + /// do modify assignment_ring and node_id_vec. + fn update_node_id_vec(&mut self) -> Result<Option<Vec<Vec<usize>>>, Error> { + // (1) We compute the new node list + // Non gateway nodes should be coded on 8bits, hence they must be first in the list + // We build the new node ids + let new_non_gateway_nodes: Vec<Uuid> = self + .roles + .items() + .iter() + .filter(|(_, _, v)| matches!(&v.0, Some(r) if r.capacity != None)) + .map(|(k, _, _)| *k) + .collect(); + + if new_non_gateway_nodes.len() > MAX_NODE_NUMBER { + return Err(Error::Message(format!( + "There are more than {} non-gateway nodes in the new \ + layout. This is not allowed.", + MAX_NODE_NUMBER + ))); } - // Check we completed the assignation correctly - // (this is a set of checks for the algorithm's consistency) - assert!(partitions.len() == (1 << PARTITION_BITS)); - assert!(partitions + let new_gateway_nodes: Vec<Uuid> = self + .roles + .items() .iter() - .all(|p| p.nodes.len() == self.replication_factor)); - - let new_partitions_per_node = self.partitions_per_node(&partitions[..]); - assert!(new_partitions_per_node == partitions_per_node); - - // Show statistics - println!("New number of partitions per node:"); - for (node, npart) in partitions_per_node.iter() { - let tgt = *target_partitions_per_node.get(node).unwrap(); - let pct = 100f32 * (*npart as f32) / (tgt as f32); - println!("{:?}\t{}\t({}% of {})", node, npart, pct as i32, tgt); - } - println!(); - - let mut diffcount = HashMap::new(); - for (oldpart, newpart) in old_partitions.iter().zip(partitions.iter()) { - let nminus = oldpart.txtplus(newpart); - let nplus = newpart.txtplus(oldpart); - if nminus != "[...]" || nplus != "[...]" { - let tup = (nminus, nplus); - *diffcount.entry(tup).or_insert(0) += 1; - } + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity == None)) + .map(|(k, _, _)| *k) + .collect(); + + let mut new_node_id_vec = Vec::<Uuid>::new(); + new_node_id_vec.extend(new_non_gateway_nodes); + new_node_id_vec.extend(new_gateway_nodes); + + let old_node_id_vec = self.node_id_vec.clone(); + self.node_id_vec = new_node_id_vec.clone(); + + // (2) We retrieve the old association + // We rewrite the old association with the new indices. We only consider partition + // to node assignments where the node is still in use. + if self.ring_assignment_data.is_empty() { + // This is a new association + return Ok(None); } - if diffcount.is_empty() { - println!("No data will be moved between nodes."); - } else { - let mut diffcount = diffcount.into_iter().collect::<Vec<_>>(); - diffcount.sort(); - println!("Number of partitions that move:"); - for ((nminus, nplus), npart) in diffcount { - println!("\t{}\t{} -> {}", npart, nminus, nplus); - } + + if self.ring_assignment_data.len() != NB_PARTITIONS * self.replication_factor { + return Err(Error::Message( + "The old assignment does not have a size corresponding to \ + the old replication factor or the number of partitions." + .into(), + )); + } + + // We build a translation table between the uuid and new ids + let mut uuid_to_new_id = HashMap::<Uuid, usize>::new(); + + // We add the indices of only the new non-gateway nodes that can be used in the + // association ring + for (i, uuid) in new_node_id_vec.iter().enumerate() { + uuid_to_new_id.insert(*uuid, i); } - println!(); - // Calculate and save new assignation data - let (nodes, assignation_data) = - self.compute_assignation_data(&configured_nodes[..], &partitions[..]); + let mut old_assignment = vec![Vec::<usize>::new(); NB_PARTITIONS]; + let rf = self.replication_factor; - self.node_id_vec = nodes; - self.ring_assignation_data = assignation_data; + for (p, old_assign_p) in old_assignment.iter_mut().enumerate() { + for old_id in &self.ring_assignment_data[p * rf..(p + 1) * rf] { + let uuid = old_node_id_vec[*old_id as usize]; + if uuid_to_new_id.contains_key(&uuid) { + old_assign_p.push(uuid_to_new_id[&uuid]); + } + } + } - true + // We write the ring + self.ring_assignment_data = Vec::<CompactNodeType>::new(); + + Ok(Some(old_assignment)) } - fn initial_partition_assignation(&self) -> Option<Vec<PartitionAss<'_>>> { - let (configured_nodes, zones) = self.configured_nodes_and_zones(); - let n_zones = zones.len(); + /// This function generates ids for the zone of the nodes appearing in + /// self.node_id_vec. + fn generate_nongateway_zone_ids(&self) -> Result<(Vec<String>, HashMap<String, usize>), Error> { + let mut id_to_zone = Vec::<String>::new(); + let mut zone_to_id = HashMap::<String, usize>::new(); + + for uuid in self.nongateway_nodes().iter() { + let r = self.node_role(uuid).unwrap(); + if !zone_to_id.contains_key(&r.zone) && r.capacity != None { + zone_to_id.insert(r.zone.clone(), id_to_zone.len()); + id_to_zone.push(r.zone.clone()); + } + } + Ok((id_to_zone, zone_to_id)) + } - // Create a vector of partition indices (0 to 2**PARTITION_BITS-1) - let partitions_idx = (0usize..(1usize << PARTITION_BITS)).collect::<Vec<_>>(); + /// This function computes by dichotomy the largest realizable partition size, given + /// the layout roles and parameters. + fn compute_optimal_partition_size( + &self, + zone_to_id: &HashMap<String, usize>, + ) -> Result<u64, Error> { + let empty_set = HashSet::<(usize, usize)>::new(); + let mut g = self.generate_flow_graph(1, zone_to_id, &empty_set)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { + return Err(Error::Message( + "The storage capacity of he cluster is to small. It is \ + impossible to store partitions of size 1." + .into(), + )); + } - // Prepare ring - let mut partitions: Vec<PartitionAss> = partitions_idx - .iter() - .map(|_i| PartitionAss::new()) - .collect::<Vec<_>>(); + let mut s_down = 1; + let mut s_up = self.get_total_capacity()?; + while s_down + 1 < s_up { + g = self.generate_flow_graph((s_down + s_up) / 2, zone_to_id, &empty_set)?; + g.compute_maximal_flow()?; + if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { + s_up = (s_down + s_up) / 2; + } else { + s_down = (s_down + s_up) / 2; + } + } - // Create MagLev priority queues for each node - let mut queues = configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_some()) - .map(|(node_id, node_info)| { - let mut parts = partitions_idx - .iter() - .map(|i| { - let part_data = - [&u16::to_be_bytes(*i as u16)[..], node_id.as_slice()].concat(); - (*i, fasthash(&part_data[..])) - }) - .collect::<Vec<_>>(); - parts.sort_by_key(|(_i, h)| *h); - let parts_i = parts.iter().map(|(i, _h)| *i).collect::<Vec<_>>(); - (node_id, node_info, parts_i, 0) - }) - .collect::<Vec<_>>(); + Ok(s_down) + } - let max_capacity = configured_nodes - .iter() - .filter_map(|(_, node_info)| node_info.capacity) - .fold(0, std::cmp::max); - - // Fill up ring - for rep in 0..self.replication_factor { - queues.sort_by_key(|(ni, _np, _q, _p)| { - let queue_data = [&u16::to_be_bytes(rep as u16)[..], ni.as_slice()].concat(); - fasthash(&queue_data[..]) - }); - - for (_, _, _, pos) in queues.iter_mut() { - *pos = 0; + fn generate_graph_vertices(nb_zones: usize, nb_nodes: usize) -> Vec<Vertex> { + let mut vertices = vec![Vertex::Source, Vertex::Sink]; + for p in 0..NB_PARTITIONS { + vertices.push(Vertex::Pup(p)); + vertices.push(Vertex::Pdown(p)); + for z in 0..nb_zones { + vertices.push(Vertex::PZ(p, z)); } + } + for n in 0..nb_nodes { + vertices.push(Vertex::N(n)); + } + vertices + } - let mut remaining = partitions_idx.len(); - while remaining > 0 { - let remaining0 = remaining; - for i_round in 0..max_capacity { - for (node_id, node_info, q, pos) in queues.iter_mut() { - if i_round >= node_info.capacity.unwrap() { - continue; - } - for (pos2, &qv) in q.iter().enumerate().skip(*pos) { - if partitions[qv].add(Some(rep + 1), n_zones, node_id, node_info) { - remaining -= 1; - *pos = pos2 + 1; - break; - } - } - } - } - if remaining == remaining0 { - // No progress made, exit - return None; + /// Generates the graph to compute the maximal flow corresponding to the optimal + /// partition assignment. + /// exclude_assoc is the set of (partition, node) association that we are forbidden + /// to use (hence we do not add the corresponding edge to the graph). This parameter + /// is used to compute a first flow that uses only edges appearing in the previous + /// assignment. This produces a solution that heuristically should be close to the + /// previous one. + fn generate_flow_graph( + &self, + partition_size: u64, + zone_to_id: &HashMap<String, usize>, + exclude_assoc: &HashSet<(usize, usize)>, + ) -> Result<Graph<FlowEdge>, Error> { + let vertices = + ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); + let mut g = Graph::<FlowEdge>::new(&vertices); + let nb_zones = zone_to_id.len(); + let redundancy = self.parameters.zone_redundancy; + for p in 0..NB_PARTITIONS { + g.add_edge(Vertex::Source, Vertex::Pup(p), redundancy as u64)?; + g.add_edge( + Vertex::Source, + Vertex::Pdown(p), + (self.replication_factor - redundancy) as u64, + )?; + for z in 0..nb_zones { + g.add_edge(Vertex::Pup(p), Vertex::PZ(p, z), 1)?; + g.add_edge( + Vertex::Pdown(p), + Vertex::PZ(p, z), + self.replication_factor as u64, + )?; + } + } + for n in 0..self.nongateway_nodes().len() { + let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; + g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; + for p in 0..NB_PARTITIONS { + if !exclude_assoc.contains(&(p, n)) { + g.add_edge(Vertex::PZ(p, node_zone), Vertex::N(n), 1)?; } } } - - Some(partitions) + Ok(g) } - fn configured_nodes_and_zones(&self) -> (Vec<(&Uuid, &NodeRole)>, HashSet<&str>) { - let configured_nodes = self - .roles - .items() - .iter() - .filter(|(_id, _, info)| info.0.is_some()) - .map(|(id, _, info)| (id, info.0.as_ref().unwrap())) - .collect::<Vec<(&Uuid, &NodeRole)>>(); + /// This function computes a first optimal assignment (in the form of a flow graph). + fn compute_candidate_assignment( + &self, + zone_to_id: &HashMap<String, usize>, + prev_assign_opt: &Option<Vec<Vec<usize>>>, + ) -> Result<Graph<FlowEdge>, Error> { + // We list the (partition,node) associations that are not used in the + // previous assignment + let mut exclude_edge = HashSet::<(usize, usize)>::new(); + if let Some(prev_assign) = prev_assign_opt { + let nb_nodes = self.nongateway_nodes().len(); + for (p, prev_assign_p) in prev_assign.iter().enumerate() { + for n in 0..nb_nodes { + exclude_edge.insert((p, n)); + } + for n in prev_assign_p.iter() { + exclude_edge.remove(&(p, *n)); + } + } + } - let zones = configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_some()) - .map(|(_id, info)| info.zone.as_str()) - .collect::<HashSet<&str>>(); + // We compute the best flow using only the edges used in the previous assignment + let mut g = self.generate_flow_graph(self.partition_size, zone_to_id, &exclude_edge)?; + g.compute_maximal_flow()?; - (configured_nodes, zones) + // We add the excluded edges and compute the maximal flow with the full graph. + // The algorithm is such that it will start with the flow that we just computed + // and find ameliorating paths from that. + for (p, n) in exclude_edge.iter() { + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; + } + g.compute_maximal_flow()?; + Ok(g) } - fn compute_assignation_data<'a>( + /// This function updates the flow graph gflow to minimize the distance between + /// its corresponding assignment and the previous one + fn minimize_rebalance_load( &self, - configured_nodes: &[(&'a Uuid, &'a NodeRole)], - partitions: &[PartitionAss<'a>], - ) -> (Vec<Uuid>, Vec<CompactNodeType>) { - assert!(partitions.len() == (1 << PARTITION_BITS)); - - // Make a canonical order for nodes - let mut nodes = configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_some()) - .map(|(id, _)| **id) - .collect::<Vec<_>>(); - let nodes_rev = nodes - .iter() - .enumerate() - .map(|(i, id)| (*id, i as CompactNodeType)) - .collect::<HashMap<Uuid, CompactNodeType>>(); - - let mut assignation_data = vec![]; - for partition in partitions.iter() { - assert!(partition.nodes.len() == self.replication_factor); - for (id, _) in partition.nodes.iter() { - assignation_data.push(*nodes_rev.get(id).unwrap()); + gflow: &mut Graph<FlowEdge>, + zone_to_id: &HashMap<String, usize>, + prev_assign: &[Vec<usize>], + ) -> Result<(), Error> { + // We define a cost function on the edges (pairs of vertices) corresponding + // to the distance between the two assignments. + let mut cost = CostFunction::new(); + for (p, assoc_p) in prev_assign.iter().enumerate() { + for n in assoc_p.iter() { + let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); } } - nodes.extend( - configured_nodes - .iter() - .filter(|(_id, info)| info.capacity.is_none()) - .map(|(id, _)| **id), - ); + // We compute the maximal length of a simple path in gflow. It is used in the + // Bellman-Ford algorithm in optimize_flow_with_cost to set the number + // of iterations. + let nb_nodes = self.nongateway_nodes().len(); + let path_length = 4 * nb_nodes; + gflow.optimize_flow_with_cost(&cost, path_length)?; - (nodes, assignation_data) + Ok(()) } - fn parse_assignation_data(&self) -> Vec<PartitionAss<'_>> { - if self.ring_assignation_data.len() == self.replication_factor * (1 << PARTITION_BITS) { - // If the previous assignation data is correct, use that - let mut partitions = vec![]; - for i in 0..(1 << PARTITION_BITS) { - let mut part = PartitionAss::new(); - for node_i in self.ring_assignation_data - [i * self.replication_factor..(i + 1) * self.replication_factor] - .iter() - { - let node_id = &self.node_id_vec[*node_i as usize]; - - if let Some(NodeRoleV(Some(info))) = self.roles.get(node_id) { - part.nodes.push((node_id, Some(info))); - } else { - part.nodes.push((node_id, None)); + /// This function updates the assignment ring from the flow graph. + fn update_ring_from_flow( + &mut self, + nb_zones: usize, + gflow: &Graph<FlowEdge>, + ) -> Result<(), Error> { + self.ring_assignment_data = Vec::<CompactNodeType>::new(); + for p in 0..NB_PARTITIONS { + for z in 0..nb_zones { + let assoc_vertex = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; + for vertex in assoc_vertex.iter() { + if let Vertex::N(n) = vertex { + self.ring_assignment_data.push((*n).try_into().unwrap()); } } - partitions.push(part); } - partitions - } else { - // Otherwise start fresh - (0..(1 << PARTITION_BITS)) - .map(|_| PartitionAss::new()) - .collect() } + + if self.ring_assignment_data.len() != NB_PARTITIONS * self.replication_factor { + return Err(Error::Message( + "Critical Error : the association ring we produced does not \ + have the right size." + .into(), + )); + } + Ok(()) } - fn partitions_per_node<'a>(&self, partitions: &[PartitionAss<'a>]) -> HashMap<&'a Uuid, usize> { - let mut partitions_per_node = HashMap::<&Uuid, usize>::new(); - for p in partitions.iter() { - for (id, _) in p.nodes.iter() { - *partitions_per_node.entry(*id).or_insert(0) += 1; + /// This function returns a message summing up the partition repartition of the new + /// layout, and other statistics of the partition assignment computation. + fn output_stat( + &self, + gflow: &Graph<FlowEdge>, + prev_assign_opt: &Option<Vec<Vec<usize>>>, + zone_to_id: &HashMap<String, usize>, + id_to_zone: &[String], + ) -> Result<Message, Error> { + let mut msg = Message::new(); + + let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; + let total_cap = self.get_total_capacity()?; + let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); + msg.push("".into()); + msg.push(format!( + "Usable capacity / Total cluster capacity: {} / {} ({:.1} %)", + ByteSize::b(used_cap).to_string_as(false), + ByteSize::b(total_cap).to_string_as(false), + percent_cap + )); + msg.push("".into()); + msg.push( + "If the percentage is too low, it might be that the \ + replication/redundancy constraints force the use of nodes/zones with small \ + storage capacities. \ + You might want to rebalance the storage capacities or relax the constraints. \ + See the detailed statistics below and look for saturated nodes/zones." + .into(), + ); + msg.push(format!( + "Recall that because of the replication factor, the actual available \ + storage capacity is {} / {} = {}.", + ByteSize::b(used_cap).to_string_as(false), + self.replication_factor, + ByteSize::b(used_cap / self.replication_factor as u64).to_string_as(false) + )); + + // We define and fill in the following tables + let storing_nodes = self.nongateway_nodes(); + let mut new_partitions = vec![0; storing_nodes.len()]; + let mut stored_partitions = vec![0; storing_nodes.len()]; + + let mut new_partitions_zone = vec![0; id_to_zone.len()]; + let mut stored_partitions_zone = vec![0; id_to_zone.len()]; + + for p in 0..NB_PARTITIONS { + for z in 0..id_to_zone.len() { + let pz_nodes = gflow.get_positive_flow_from(Vertex::PZ(p, z))?; + if !pz_nodes.is_empty() { + stored_partitions_zone[z] += 1; + if let Some(prev_assign) = prev_assign_opt { + let mut old_zones_of_p = Vec::<usize>::new(); + for n in prev_assign[p].iter() { + old_zones_of_p + .push(zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); + } + if !old_zones_of_p.contains(&z) { + new_partitions_zone[z] += 1; + } + } + } + for vert in pz_nodes.iter() { + if let Vertex::N(n) = *vert { + stored_partitions[n] += 1; + if let Some(prev_assign) = prev_assign_opt { + if !prev_assign[p].contains(&n) { + new_partitions[n] += 1; + } + } + } + } + } + } + + if *prev_assign_opt == None { + new_partitions = stored_partitions.clone(); + new_partitions_zone = stored_partitions_zone.clone(); + } + + // We display the statistics + + msg.push("".into()); + if *prev_assign_opt != None { + let total_new_partitions: usize = new_partitions.iter().sum(); + msg.push(format!( + "A total of {} new copies of partitions need to be \ + transferred.", + total_new_partitions + )); + } + msg.push("".into()); + msg.push("==== DETAILED STATISTICS BY ZONES AND NODES ====".into()); + + for z in 0..id_to_zone.len() { + let mut nodes_of_z = Vec::<usize>::new(); + for n in 0..storing_nodes.len() { + if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { + nodes_of_z.push(n); + } + } + let replicated_partitions: usize = + nodes_of_z.iter().map(|n| stored_partitions[*n]).sum(); + msg.push("".into()); + + msg.push(format!( + "Zone {}: {} distinct partitions stored ({} new, \ + {} partition copies) ", + id_to_zone[z], + stored_partitions_zone[z], + new_partitions_zone[z], + replicated_partitions + )); + + let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; + let mut total_cap_z = 0; + for n in nodes_of_z.iter() { + total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; + } + let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); + msg.push(format!( + " Usable capacity / Total capacity: {} / {} ({:.1}%).", + ByteSize::b(available_cap_z).to_string_as(false), + ByteSize::b(total_cap_z).to_string_as(false), + percent_cap_z + )); + + for n in nodes_of_z.iter() { + let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; + let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; + let tags_n = (self + .node_role(&self.node_id_vec[*n]) + .ok_or("Node not found."))? + .tags_string(); + msg.push(format!( + " Node {:?}: {} partitions ({} new) ; \ + usable/total capacity: {} / {} ({:.1}%) ; tags:{}", + self.node_id_vec[*n], + stored_partitions[*n], + new_partitions[*n], + ByteSize::b(available_cap_n).to_string_as(false), + ByteSize::b(total_cap_n).to_string_as(false), + (available_cap_n as f32) / (total_cap_n as f32) * 100.0, + tags_n + )); } } - partitions_per_node + + Ok(msg) } } -// ---- Internal structs for partition assignation in layout ---- +// ==================================================================================== + +#[cfg(test)] +mod tests { + use super::{Error, *}; + use std::cmp::min; + + // This function checks that the partition size S computed is at least better than the + // one given by a very naive algorithm. To do so, we try to run the naive algorithm + // assuming a partion size of S+1. If we succed, it means that the optimal assignment + // was not optimal. The naive algorithm is the following : + // - we compute the max number of partitions associated to every node, capped at the + // partition number. It gives the number of tokens of every node. + // - every zone has a number of tokens equal to the sum of the tokens of its nodes. + // - we cycle over the partitions and associate zone tokens while respecting the + // zone redundancy constraint. + // NOTE: the naive algorithm is not optimal. Counter example: + // take nb_partition = 3 ; replication_factor = 5; redundancy = 4; + // number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) + // With these parameters, the naive algo fails, whereas there is a solution: + // (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) + fn check_against_naive(cl: &ClusterLayout) -> Result<bool, Error> { + let over_size = cl.partition_size + 1; + let mut zone_token = HashMap::<String, usize>::new(); + + let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; + + if zones.is_empty() { + return Ok(false); + } -#[derive(Clone)] -struct PartitionAss<'a> { - nodes: Vec<(&'a Uuid, Option<&'a NodeRole>)>, -} + for z in zones.iter() { + zone_token.insert(z.clone(), 0); + } + for uuid in cl.nongateway_nodes().iter() { + let z = cl.get_node_zone(uuid)?; + let c = cl.get_node_capacity(uuid)?; + zone_token.insert( + z.clone(), + zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), + ); + } -impl<'a> PartitionAss<'a> { - fn new() -> Self { - Self { nodes: Vec::new() } - } + // For every partition, we count the number of zone already associated and + // the name of the last zone associated - fn nplus(&self, other: &PartitionAss<'a>) -> usize { - self.nodes - .iter() - .filter(|x| !other.nodes.contains(x)) - .count() - } + let mut id_zone_token = vec![0; zones.len()]; + for (z, t) in zone_token.iter() { + id_zone_token[zone_to_id[z]] = *t; + } - fn txtplus(&self, other: &PartitionAss<'a>) -> String { - let mut nodes = self - .nodes - .iter() - .filter(|x| !other.nodes.contains(x)) - .map(|x| format!("{:?}", x.0)) - .collect::<Vec<_>>(); - nodes.sort(); - if self.nodes.iter().any(|x| other.nodes.contains(x)) { - nodes.push("...".into()); + let mut nb_token = vec![0; NB_PARTITIONS]; + let mut last_zone = vec![zones.len(); NB_PARTITIONS]; + + let mut curr_zone = 0; + + let redundancy = cl.parameters.zone_redundancy; + + for replic in 0..cl.replication_factor { + for p in 0..NB_PARTITIONS { + while id_zone_token[curr_zone] == 0 + || (last_zone[p] == curr_zone + && redundancy - nb_token[p] <= cl.replication_factor - replic) + { + curr_zone += 1; + if curr_zone >= zones.len() { + return Ok(true); + } + } + id_zone_token[curr_zone] -= 1; + if last_zone[p] != curr_zone { + nb_token[p] += 1; + last_zone[p] = curr_zone; + } + } } - format!("[{}]", nodes.join(" ")) - } - fn is_valid_transition_to(&self, other: &PartitionAss<'a>, replication_factor: usize) -> bool { - let min_keep_nodes_per_part = (replication_factor + 1) / 2; - let n_removed = self.nplus(other); + return Ok(false); + } - if self.nodes.len() <= min_keep_nodes_per_part { - n_removed == 0 - } else { - n_removed <= self.nodes.len() - min_keep_nodes_per_part + fn show_msg(msg: &Message) { + for s in msg.iter() { + println!("{}", s); } } - // add is a key function in creating a PartitionAss, i.e. the list of nodes - // to which a partition is assigned. It tries to add a certain node id to the - // assignation, but checks that doing so is compatible with the NECESSARY - // condition that the partition assignation must be dispersed over different - // zones (datacenters) if enough zones exist. This is why it takes a n_zones - // parameter, which is the total number of zones that have existing nodes: - // if nodes in the assignation already cover all n_zones zones, then any node - // that is not yet in the assignation can be added. Otherwise, only nodes - // that are in a new zone can be added. - fn add( - &mut self, - target_len: Option<usize>, - n_zones: usize, - node: &'a Uuid, - role: &'a NodeRole, - ) -> bool { - if let Some(tl) = target_len { - if self.nodes.len() != tl - 1 { - return false; + fn update_layout( + cl: &mut ClusterLayout, + node_id_vec: &Vec<u8>, + node_capacity_vec: &Vec<u64>, + node_zone_vec: &Vec<String>, + zone_redundancy: usize, + ) { + for i in 0..node_id_vec.len() { + if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { + cl.node_id_vec.push(x); } - } - let p_zns = self - .nodes - .iter() - .map(|(_id, info)| info.unwrap().zone.as_str()) - .collect::<HashSet<&str>>(); - if (p_zns.len() < n_zones && !p_zns.contains(&role.zone.as_str())) - || (p_zns.len() == n_zones && !self.nodes.iter().any(|(id, _)| *id == node)) - { - self.nodes.push((node, Some(role))); - true - } else { - false + let update = cl.staging_roles.update_mutator( + cl.node_id_vec[i], + NodeRoleV(Some(NodeRole { + zone: (node_zone_vec[i].to_string()), + capacity: (Some(node_capacity_vec[i])), + tags: (vec![]), + })), + ); + cl.staging_roles.merge(&update); } + cl.staging_parameters + .update(LayoutParameters { zone_redundancy }); + cl.staging_hash = cl.calculate_staging_hash(); + } + + #[test] + fn test_assignment() { + let mut node_id_vec = vec![1, 2, 3]; + let mut node_capacity_vec = vec![4000, 1000, 2000]; + let mut node_zone_vec = vec!["A", "B", "C"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + + let mut cl = ClusterLayout::new(3); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); + + node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; + node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; + node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"] + .into_iter() + .map(|x| x.to_string()) + .collect(); + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); + + node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); + let v = cl.version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); + + node_capacity_vec = vec![ + 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, + ]; + update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); + let v = cl.version; + let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(matches!(check_against_naive(&cl), Ok(true))); } } diff --git a/src/rpc/lib.rs b/src/rpc/lib.rs index 5aec92c0..a5f8fc6e 100644 --- a/src/rpc/lib.rs +++ b/src/rpc/lib.rs @@ -11,6 +11,7 @@ mod consul; #[cfg(feature = "kubernetes-discovery")] mod kubernetes; +pub mod graph_algo; pub mod layout; pub mod replication_mode; pub mod ring; diff --git a/src/rpc/ring.rs b/src/rpc/ring.rs index 73a126a2..6a2e5c72 100644 --- a/src/rpc/ring.rs +++ b/src/rpc/ring.rs @@ -40,6 +40,7 @@ pub struct Ring { // Type to store compactly the id of a node in the system // Change this to u16 the day we want to have more than 256 nodes in a cluster pub type CompactNodeType = u8; +pub const MAX_NODE_NUMBER: usize = 256; // The maximum number of times an object might get replicated // This must be at least 3 because Garage supports 3-way replication @@ -62,12 +63,12 @@ struct RingEntry { impl Ring { pub(crate) fn new(layout: ClusterLayout, replication_factor: usize) -> Self { if replication_factor != layout.replication_factor { - warn!("Could not build ring: replication factor does not match between local configuration and network role assignation."); + warn!("Could not build ring: replication factor does not match between local configuration and network role assignment."); return Self::empty(layout, replication_factor); } - if layout.ring_assignation_data.len() != replication_factor * (1 << PARTITION_BITS) { - warn!("Could not build ring: network role assignation data has invalid length"); + if layout.ring_assignment_data.len() != replication_factor * (1 << PARTITION_BITS) { + warn!("Could not build ring: network role assignment data has invalid length"); return Self::empty(layout, replication_factor); } @@ -77,7 +78,7 @@ impl Ring { let top = (i as u16) << (16 - PARTITION_BITS); let mut nodes_buf = [0u8; MAX_REPLICATION]; nodes_buf[..replication_factor].copy_from_slice( - &layout.ring_assignation_data + &layout.ring_assignment_data [replication_factor * i..replication_factor * (i + 1)], ); RingEntry { diff --git a/src/rpc/system.rs b/src/rpc/system.rs index b42e49fc..c549d8fc 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -666,9 +666,9 @@ impl System { let update_ring = self.update_ring.lock().await; let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); - let prev_layout_check = layout.check(); + let prev_layout_check = layout.check().is_ok(); if layout.merge(adv) { - if prev_layout_check && !layout.check() { + if prev_layout_check && !layout.check().is_ok() { error!("New cluster layout is invalid, discarding."); return Err(Error::Message( "New cluster layout is invalid, discarding.".into(), @@ -724,7 +724,7 @@ impl System { async fn discovery_loop(self: &Arc<Self>, mut stop_signal: watch::Receiver<bool>) { while !*stop_signal.borrow() { - let not_configured = !self.ring.borrow().layout.check(); + let not_configured = !self.ring.borrow().layout.check().is_ok(); let no_peers = self.fullmesh.get_peer_list().len() < self.replication_factor; let expected_n_nodes = self.ring.borrow().layout.num_nodes(); let bad_peers = self diff --git a/src/table/data.rs b/src/table/data.rs index e76836ca..26101da4 100644 --- a/src/table/data.rs +++ b/src/table/data.rs @@ -347,9 +347,7 @@ impl<F: TableSchema, R: TableReplication> TableData<F, R> { // ---- Utility functions ---- pub fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> { - let mut ret = p.hash().to_vec(); - ret.extend(s.sort_key()); - ret + [p.hash().as_slice(), s.sort_key()].concat() } pub fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> { diff --git a/src/table/schema.rs b/src/table/schema.rs index 5cbf6c95..fc1a465e 100644 --- a/src/table/schema.rs +++ b/src/table/schema.rs @@ -6,6 +6,8 @@ use garage_util::migrate::Migrate; use crate::crdt::Crdt; +// =================================== PARTITION KEYS + /// Trait for field used to partition data pub trait PartitionKey: Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync + 'static @@ -29,6 +31,8 @@ impl PartitionKey for FixedBytes32 { } } +// =================================== SORT KEYS + /// Trait for field used to sort data pub trait SortKey: Clone + Serialize + for<'de> Deserialize<'de> + Send + Sync + 'static { /// Get the key used to sort @@ -47,6 +51,8 @@ impl SortKey for FixedBytes32 { } } +// =================================== SCHEMA + /// Trait for an entry in a table. It must be sortable and partitionnable. pub trait Entry<P: PartitionKey, S: SortKey>: Crdt + PartialEq + Clone + Migrate + Send + Sync + 'static diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index f72051b9..27ef4550 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -31,7 +31,7 @@ rand = "0.8" sha2 = "0.10" chrono = "0.4" -rmp-serde = "0.15" +rmp-serde = "1.1" serde = { version = "1.0", default-features = false, features = ["derive", "rc"] } serde_json = "1.0" toml = "0.6" diff --git a/src/util/config.rs b/src/util/config.rs index 1da95b2f..eeb17e0e 100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -15,6 +15,13 @@ pub struct Config { /// Path where to store data. Can be slower, but need higher volume pub data_dir: PathBuf, + /// Whether to fsync after all metadata transactions (disabled by default) + #[serde(default)] + pub metadata_fsync: bool, + /// Whether to fsync after all data block writes (disabled by default) + #[serde(default)] + pub data_fsync: bool, + /// Size of data blocks to save to disk #[serde(default = "default_block_size")] pub block_size: usize, @@ -183,7 +190,7 @@ pub struct KubernetesDiscoveryConfig { } fn default_db_engine() -> String { - "sled".into() + "lmdb".into() } fn default_sled_cache_capacity() -> u64 { diff --git a/src/util/encode.rs b/src/util/encode.rs index 1cd3198f..a9ab9a35 100644 --- a/src/util/encode.rs +++ b/src/util/encode.rs @@ -8,9 +8,7 @@ where T: Serialize + ?Sized, { let mut wr = Vec::with_capacity(128); - let mut se = rmp_serde::Serializer::new(&mut wr) - .with_struct_map() - .with_string_variants(); + let mut se = rmp_serde::Serializer::new(&mut wr).with_struct_map(); val.serialize(&mut se)?; Ok(wr) } @@ -22,7 +20,7 @@ pub fn nonversioned_decode<T>(bytes: &[u8]) -> Result<T, rmp_serde::decode::Erro where T: for<'de> Deserialize<'de> + ?Sized, { - rmp_serde::decode::from_read_ref::<_, T>(bytes) + rmp_serde::decode::from_slice::<_>(bytes) } /// Serialize to JSON, truncating long result diff --git a/src/util/migrate.rs b/src/util/migrate.rs index 1229fd9c..5b708cc8 100644 --- a/src/util/migrate.rs +++ b/src/util/migrate.rs @@ -19,7 +19,7 @@ pub trait Migrate: Serialize + for<'de> Deserialize<'de> + 'static { fn decode(bytes: &[u8]) -> Option<Self> { let marker_len = Self::VERSION_MARKER.len(); if bytes.get(..marker_len) == Some(Self::VERSION_MARKER) { - if let Ok(value) = rmp_serde::decode::from_read_ref::<_, Self>(&bytes[marker_len..]) { + if let Ok(value) = rmp_serde::decode::from_slice::<_>(&bytes[marker_len..]) { return Some(value); } } @@ -31,9 +31,7 @@ pub trait Migrate: Serialize + for<'de> Deserialize<'de> + 'static { fn encode(&self) -> Result<Vec<u8>, rmp_serde::encode::Error> { let mut wr = Vec::with_capacity(128); wr.extend_from_slice(Self::VERSION_MARKER); - let mut se = rmp_serde::Serializer::new(&mut wr) - .with_struct_map() - .with_string_variants(); + let mut se = rmp_serde::Serializer::new(&mut wr).with_struct_map(); self.serialize(&mut se)?; Ok(wr) } |