diff options
Diffstat (limited to 'src')
68 files changed, 5169 insertions, 2005 deletions
diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index 317031a7..bcf6a537 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_api" -version = "0.9.3" +version = "0.10.0" authors = ["Alex Auvolat <alex@adnab.me>"] edition = "2018" license = "AGPL-3.0" @@ -21,7 +21,9 @@ garage_net.workspace = true garage_util.workspace = true garage_rpc.workspace = true +aes-gcm.workspace = true argon2.workspace = true +async-compression.workspace = true async-trait.workspace = true base64.workspace = true bytes.workspace = true @@ -41,6 +43,7 @@ futures.workspace = true futures-util.workspace = true tokio.workspace = true tokio-stream.workspace = true +tokio-util.workspace = true form_urlencoded.workspace = true http.workspace = true diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs index 265639c4..0e4565bb 100644 --- a/src/api/admin/api_server.rs +++ b/src/api/admin/api_server.rs @@ -276,7 +276,7 @@ impl ApiHandler for AdminApiServer { Endpoint::GetClusterLayout => handle_get_cluster_layout(&self.garage).await, Endpoint::UpdateClusterLayout => handle_update_cluster_layout(&self.garage, req).await, Endpoint::ApplyClusterLayout => handle_apply_cluster_layout(&self.garage, req).await, - Endpoint::RevertClusterLayout => handle_revert_cluster_layout(&self.garage, req).await, + Endpoint::RevertClusterLayout => handle_revert_cluster_layout(&self.garage).await, // Keys Endpoint::ListKeys => handle_list_keys(&self.garage).await, Endpoint::GetKeyInfo { diff --git a/src/api/admin/bucket.rs b/src/api/admin/bucket.rs index cfe8a6c4..ac3cba00 100644 --- a/src/api/admin/bucket.rs +++ b/src/api/admin/bucket.rs @@ -123,7 +123,7 @@ async fn bucket_info_results( .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&garage.system.ring.borrow())) + .map(|x| x.filtered_values(&garage.system.cluster_layout())) .unwrap_or_default(); let mpu_counters = garage @@ -131,7 +131,7 @@ async fn bucket_info_results( .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&garage.system.ring.borrow())) + .map(|x| x.filtered_values(&garage.system.cluster_layout())) .unwrap_or_default(); let mut relevant_keys = HashMap::new(); diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 3876c608..8ce6c5ed 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::net::SocketAddr; use std::sync::Arc; @@ -16,25 +17,95 @@ use crate::admin::error::*; use crate::helpers::{json_ok_response, parse_json_body}; pub async fn handle_get_cluster_status(garage: &Arc<Garage>) -> Result<Response<ResBody>, Error> { + let layout = garage.system.cluster_layout(); + let mut nodes = garage + .system + .get_known_nodes() + .into_iter() + .map(|i| { + ( + i.id, + NodeResp { + id: hex::encode(i.id), + addr: Some(i.addr), + hostname: i.status.hostname, + is_up: i.is_up, + last_seen_secs_ago: i.last_seen_secs_ago, + data_partition: i + .status + .data_disk_avail + .map(|(avail, total)| FreeSpaceResp { + available: avail, + total, + }), + metadata_partition: i.status.meta_disk_avail.map(|(avail, total)| { + FreeSpaceResp { + available: avail, + total, + } + }), + ..Default::default() + }, + ) + }) + .collect::<HashMap<_, _>>(); + + for (id, _, role) in layout.current().roles.items().iter() { + if let layout::NodeRoleV(Some(r)) = role { + let role = NodeRoleResp { + id: hex::encode(id), + zone: r.zone.to_string(), + capacity: r.capacity, + tags: r.tags.clone(), + }; + match nodes.get_mut(id) { + None => { + nodes.insert( + *id, + NodeResp { + id: hex::encode(id), + role: Some(role), + ..Default::default() + }, + ); + } + Some(n) => { + if n.role.is_none() { + n.role = Some(role); + } + } + } + } + } + + for ver in layout.versions.iter().rev().skip(1) { + for (id, _, role) in ver.roles.items().iter() { + if let layout::NodeRoleV(Some(r)) = role { + if !nodes.contains_key(id) && r.capacity.is_some() { + nodes.insert( + *id, + NodeResp { + id: hex::encode(id), + draining: true, + ..Default::default() + }, + ); + } + } + } + } + + let mut nodes = nodes.into_values().collect::<Vec<_>>(); + nodes.sort_by(|x, y| x.id.cmp(&y.id)); + let res = GetClusterStatusResponse { node: hex::encode(garage.system.id), garage_version: garage_util::version::garage_version(), garage_features: garage_util::version::garage_features(), rust_version: garage_util::version::rust_version(), db_engine: garage.db.engine(), - known_nodes: garage - .system - .get_known_nodes() - .into_iter() - .map(|i| KnownNodeResp { - id: hex::encode(i.id), - addr: i.addr, - is_up: i.is_up, - last_seen_secs_ago: i.last_seen_secs_ago, - hostname: i.status.hostname, - }) - .collect(), - layout: format_cluster_layout(&garage.system.get_cluster_layout()), + layout_version: layout.current().version, + nodes, }; Ok(json_ok_response(&res)?) @@ -85,13 +156,14 @@ pub async fn handle_connect_cluster_nodes( } pub async fn handle_get_cluster_layout(garage: &Arc<Garage>) -> Result<Response<ResBody>, Error> { - let res = format_cluster_layout(&garage.system.get_cluster_layout()); + let res = format_cluster_layout(&garage.system.cluster_layout()); Ok(json_ok_response(&res)?) } -fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResponse { +fn format_cluster_layout(layout: &layout::LayoutHistory) -> GetClusterLayoutResponse { let roles = layout + .current() .roles .items() .iter() @@ -105,10 +177,12 @@ fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResp .collect::<Vec<_>>(); let staged_role_changes = layout - .staging_roles + .staging + .get() + .roles .items() .iter() - .filter(|(k, _, v)| layout.roles.get(k) != Some(v)) + .filter(|(k, _, v)| layout.current().roles.get(k) != Some(v)) .map(|(k, _, v)| match &v.0 { None => NodeRoleChange { id: hex::encode(k), @@ -126,7 +200,7 @@ fn format_cluster_layout(layout: &layout::ClusterLayout) -> GetClusterLayoutResp .collect::<Vec<_>>(); GetClusterLayoutResponse { - version: layout.version, + version: layout.current().version, roles, staged_role_changes, } @@ -155,8 +229,8 @@ struct GetClusterStatusResponse { garage_features: Option<&'static [&'static str]>, rust_version: &'static str, db_engine: String, - known_nodes: Vec<KnownNodeResp>, - layout: GetClusterLayoutResponse, + layout_version: u64, + nodes: Vec<NodeResp>, } #[derive(Serialize)] @@ -190,14 +264,27 @@ struct NodeRoleResp { tags: Vec<String>, } -#[derive(Serialize)] +#[derive(Serialize, Default)] +#[serde(rename_all = "camelCase")] +struct FreeSpaceResp { + available: u64, + total: u64, +} + +#[derive(Serialize, Default)] #[serde(rename_all = "camelCase")] -struct KnownNodeResp { +struct NodeResp { id: String, - addr: SocketAddr, + role: Option<NodeRoleResp>, + addr: Option<SocketAddr>, + hostname: Option<String>, is_up: bool, last_seen_secs_ago: Option<u64>, - hostname: String, + draining: bool, + #[serde(skip_serializing_if = "Option::is_none")] + data_partition: Option<FreeSpaceResp>, + #[serde(skip_serializing_if = "Option::is_none")] + metadata_partition: Option<FreeSpaceResp>, } // ---- update functions ---- @@ -208,10 +295,10 @@ pub async fn handle_update_cluster_layout( ) -> Result<Response<ResBody>, Error> { let updates = parse_json_body::<UpdateClusterLayoutRequest, _, Error>(req).await?; - let mut layout = garage.system.get_cluster_layout(); + let mut layout = garage.system.cluster_layout().clone(); - let mut roles = layout.roles.clone(); - roles.merge(&layout.staging_roles); + let mut roles = layout.current().roles.clone(); + roles.merge(&layout.staging.get().roles); for change in updates { let node = hex::decode(&change.id).ok_or_bad_request("Invalid node identifier")?; @@ -232,11 +319,17 @@ pub async fn handle_update_cluster_layout( }; layout - .staging_roles + .staging + .get_mut() + .roles .merge(&roles.update_mutator(node, layout::NodeRoleV(new_role))); } - garage.system.update_cluster_layout(&layout).await?; + garage + .system + .layout_manager + .update_cluster_layout(&layout) + .await?; let res = format_cluster_layout(&layout); Ok(json_ok_response(&res)?) @@ -246,12 +339,16 @@ pub async fn handle_apply_cluster_layout( garage: &Arc<Garage>, req: Request<IncomingBody>, ) -> Result<Response<ResBody>, Error> { - let param = parse_json_body::<ApplyRevertLayoutRequest, _, Error>(req).await?; + let param = parse_json_body::<ApplyLayoutRequest, _, Error>(req).await?; - let layout = garage.system.get_cluster_layout(); + let layout = garage.system.cluster_layout().clone(); let (layout, msg) = layout.apply_staged_changes(Some(param.version))?; - garage.system.update_cluster_layout(&layout).await?; + garage + .system + .layout_manager + .update_cluster_layout(&layout) + .await?; let res = ApplyClusterLayoutResponse { message: msg, @@ -262,13 +359,14 @@ pub async fn handle_apply_cluster_layout( pub async fn handle_revert_cluster_layout( garage: &Arc<Garage>, - req: Request<IncomingBody>, ) -> Result<Response<ResBody>, Error> { - let param = parse_json_body::<ApplyRevertLayoutRequest, _, Error>(req).await?; - - let layout = garage.system.get_cluster_layout(); - let layout = layout.revert_staged_changes(Some(param.version))?; - garage.system.update_cluster_layout(&layout).await?; + let layout = garage.system.cluster_layout().clone(); + let layout = layout.revert_staged_changes()?; + garage + .system + .layout_manager + .update_cluster_layout(&layout) + .await?; let res = format_cluster_layout(&layout); Ok(json_ok_response(&res)?) @@ -280,7 +378,7 @@ type UpdateClusterLayoutRequest = Vec<NodeRoleChange>; #[derive(Deserialize)] #[serde(rename_all = "camelCase")] -struct ApplyRevertLayoutRequest { +struct ApplyLayoutRequest { version: u64, } diff --git a/src/api/common_error.rs b/src/api/common_error.rs index 4381f227..c47555d4 100644 --- a/src/api/common_error.rs +++ b/src/api/common_error.rs @@ -59,9 +59,7 @@ impl CommonError { pub fn http_status_code(&self) -> StatusCode { match self { CommonError::InternalError( - GarageError::Timeout - | GarageError::RemoteError(_) - | GarageError::Quorum(_, _, _, _), + GarageError::Timeout | GarageError::RemoteError(_) | GarageError::Quorum(..), ) => StatusCode::SERVICE_UNAVAILABLE, CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => { StatusCode::INTERNAL_SERVER_ERROR @@ -80,9 +78,7 @@ impl CommonError { match self { CommonError::Forbidden(_) => "AccessDenied", CommonError::InternalError( - GarageError::Timeout - | GarageError::RemoteError(_) - | GarageError::Quorum(_, _, _, _), + GarageError::Timeout | GarageError::RemoteError(_) | GarageError::Quorum(..), ) => "ServiceUnavailable", CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => { "InternalError" diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs index 822bec44..e3397238 100644 --- a/src/api/k2v/index.rs +++ b/src/api/k2v/index.rs @@ -1,9 +1,6 @@ -use std::sync::Arc; - use hyper::Response; use serde::Serialize; -use garage_rpc::ring::Ring; use garage_table::util::*; use garage_model::k2v::item_table::{BYTES, CONFLICTS, ENTRIES, VALUES}; @@ -27,7 +24,11 @@ pub async fn handle_read_index( let reverse = reverse.unwrap_or(false); - let ring: Arc<Ring> = garage.system.ring.borrow().clone(); + let node_id_vec = garage + .system + .cluster_layout() + .all_nongateway_nodes() + .to_vec(); let (partition_keys, more, next_start) = read_range( &garage.k2v.counter_table.table, @@ -36,7 +37,7 @@ pub async fn handle_read_index( &start, &end, limit, - Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())), + Some((DeletedFilter::NotDeleted, node_id_vec)), EnumerationOrder::from_reverse(reverse), ) .await?; @@ -55,7 +56,7 @@ pub async fn handle_read_index( partition_keys: partition_keys .into_iter() .map(|part| { - let vals = part.filtered_values(&ring); + let vals = part.filtered_values(&garage.system.cluster_layout()); ReadIndexResponseEntry { pk: part.sk, entries: *vals.get(&s_entries).unwrap_or(&0), diff --git a/src/api/s3/copy.rs b/src/api/s3/copy.rs index 3c2bd483..2b29ec6d 100644 --- a/src/api/s3/copy.rs +++ b/src/api/s3/copy.rs @@ -1,7 +1,7 @@ use std::pin::Pin; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use futures::{stream, stream::Stream, StreamExt}; +use futures::{stream, stream::Stream, StreamExt, TryStreamExt}; use md5::{Digest as Md5Digest, Md5}; use bytes::Bytes; @@ -9,9 +9,11 @@ use hyper::{Request, Response}; use serde::Serialize; use garage_net::bytes_buf::BytesBuf; +use garage_net::stream::read_stream_to_end; use garage_rpc::rpc_helper::OrderTag; use garage_table::*; use garage_util::data::*; +use garage_util::error::Error as GarageError; use garage_util::time::*; use garage_model::s3::block_ref_table::*; @@ -21,11 +23,15 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; +use crate::s3::get::full_object_byte_stream; use crate::s3::multipart; -use crate::s3::put::get_headers; +use crate::s3::put::{get_headers, save_stream, SaveStreamResult}; use crate::s3::xml::{self as s3_xml, xmlns_tag}; +// -------- CopyObject --------- + pub async fn handle_copy( ctx: ReqCtx, req: &Request<ReqBody>, @@ -35,38 +41,114 @@ pub async fn handle_copy( let source_object = get_copy_source(&ctx, req).await?; - let ReqCtx { - garage, - bucket_id: dest_bucket_id, - .. - } = ctx; - let (source_version, source_version_data, source_version_meta) = extract_source_info(&source_object)?; // Check precondition, e.g. x-amz-copy-source-if-match copy_precondition.check(source_version, &source_version_meta.etag)?; + // Determine encryption parameters + let (source_encryption, source_object_headers) = + EncryptionParams::check_decrypt_for_copy_source( + &ctx.garage, + req.headers(), + &source_version_meta.encryption, + )?; + let dest_encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?; + + // Determine headers of destination object + let dest_object_headers = match req.headers().get("x-amz-metadata-directive") { + Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => { + get_headers(req.headers())? + } + _ => source_object_headers.into_owned(), + }; + + // Do actual object copying + let res = if EncryptionParams::is_same(&source_encryption, &dest_encryption) { + // If source and dest are both unencrypted, or if the encryption keys + // are the same, we can just copy the metadata and link blocks of the + // old object from the new object. + handle_copy_metaonly( + ctx, + dest_key, + dest_object_headers, + dest_encryption, + source_version, + source_version_data, + source_version_meta, + ) + .await? + } else { + // If source and dest encryption use different keys, + // we must decrypt content and re-encrypt, so rewrite all data blocks. + handle_copy_reencrypt( + ctx, + dest_key, + dest_object_headers, + dest_encryption, + source_version, + source_version_data, + source_encryption, + ) + .await? + }; + + let last_modified = msec_to_rfc3339(res.version_timestamp); + let result = CopyObjectResult { + last_modified: s3_xml::Value(last_modified), + etag: s3_xml::Value(format!("\"{}\"", res.etag)), + }; + let xml = s3_xml::to_xml_with_header(&result)?; + + let mut resp = Response::builder() + .header("Content-Type", "application/xml") + .header("x-amz-version-id", hex::encode(res.version_uuid)) + .header( + "x-amz-copy-source-version-id", + hex::encode(source_version.uuid), + ); + dest_encryption.add_response_headers(&mut resp); + Ok(resp.body(string_body(xml))?) +} + +async fn handle_copy_metaonly( + ctx: ReqCtx, + dest_key: &str, + dest_object_headers: ObjectVersionHeaders, + dest_encryption: EncryptionParams, + source_version: &ObjectVersion, + source_version_data: &ObjectVersionData, + source_version_meta: &ObjectVersionMeta, +) -> Result<SaveStreamResult, Error> { + let ReqCtx { + garage, + bucket_id: dest_bucket_id, + .. + } = ctx; + // Generate parameters for copied object let new_uuid = gen_uuid(); let new_timestamp = now_msec(); - // Implement x-amz-metadata-directive: REPLACE - let new_meta = match req.headers().get("x-amz-metadata-directive") { - Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => ObjectVersionMeta { - headers: get_headers(req.headers())?, - size: source_version_meta.size, - etag: source_version_meta.etag.clone(), - }, - _ => source_version_meta.clone(), + let new_meta = ObjectVersionMeta { + encryption: dest_encryption.encrypt_headers(dest_object_headers)?, + size: source_version_meta.size, + etag: source_version_meta.etag.clone(), }; - let etag = new_meta.etag.to_string(); + let res = SaveStreamResult { + version_uuid: new_uuid, + version_timestamp: new_timestamp, + etag: new_meta.etag.clone(), + }; // Save object copy match source_version_data { ObjectVersionData::DeleteMarker => unreachable!(), ObjectVersionData::Inline(_meta, bytes) => { + // bytes is either plaintext before&after or encrypted with the + // same keys, so it's ok to just copy it as is let dest_object_version = ObjectVersion { uuid: new_uuid, timestamp: new_timestamp, @@ -97,7 +179,7 @@ pub async fn handle_copy( uuid: new_uuid, timestamp: new_timestamp, state: ObjectVersionState::Uploading { - headers: new_meta.headers.clone(), + encryption: new_meta.encryption.clone(), multipart: false, }, }; @@ -164,23 +246,42 @@ pub async fn handle_copy( } } - let last_modified = msec_to_rfc3339(new_timestamp); - let result = CopyObjectResult { - last_modified: s3_xml::Value(last_modified), - etag: s3_xml::Value(format!("\"{}\"", etag)), - }; - let xml = s3_xml::to_xml_with_header(&result)?; + Ok(res) +} - Ok(Response::builder() - .header("Content-Type", "application/xml") - .header("x-amz-version-id", hex::encode(new_uuid)) - .header( - "x-amz-copy-source-version-id", - hex::encode(source_version.uuid), - ) - .body(string_body(xml))?) +async fn handle_copy_reencrypt( + ctx: ReqCtx, + dest_key: &str, + dest_object_headers: ObjectVersionHeaders, + dest_encryption: EncryptionParams, + source_version: &ObjectVersion, + source_version_data: &ObjectVersionData, + source_encryption: EncryptionParams, +) -> Result<SaveStreamResult, Error> { + // basically we will read the source data (decrypt if necessary) + // and save that in a new object (encrypt if necessary), + // by combining the code used in getobject and putobject + let source_stream = full_object_byte_stream( + ctx.garage.clone(), + source_version, + source_version_data, + source_encryption, + ); + + save_stream( + &ctx, + dest_object_headers, + dest_encryption, + source_stream.map_err(|e| Error::from(GarageError::from(e))), + &dest_key.to_string(), + None, + None, + ) + .await } +// -------- UploadPartCopy --------- + pub async fn handle_upload_part_copy( ctx: ReqCtx, req: &Request<ReqBody>, @@ -193,7 +294,7 @@ pub async fn handle_upload_part_copy( let dest_upload_id = multipart::decode_upload_id(upload_id)?; let dest_key = dest_key.to_string(); - let (source_object, (_, _, mut dest_mpu)) = futures::try_join!( + let (source_object, (_, dest_version, mut dest_mpu)) = futures::try_join!( get_copy_source(&ctx, req), multipart::get_upload(&ctx, &dest_key, &dest_upload_id) )?; @@ -206,6 +307,20 @@ pub async fn handle_upload_part_copy( // Check precondition on source, e.g. x-amz-copy-source-if-match copy_precondition.check(source_object_version, &source_version_meta.etag)?; + // Determine encryption parameters + let (source_encryption, _) = EncryptionParams::check_decrypt_for_copy_source( + &garage, + req.headers(), + &source_version_meta.encryption, + )?; + let dest_object_encryption = match dest_version.state { + ObjectVersionState::Uploading { encryption, .. } => encryption, + _ => unreachable!(), + }; + let (dest_encryption, _) = + EncryptionParams::check_decrypt(&garage, req.headers(), &dest_object_encryption)?; + let same_encryption = EncryptionParams::is_same(&source_encryption, &dest_encryption); + // Check source range is valid let source_range = match req.headers().get("x-amz-copy-source-range") { Some(range) => { @@ -227,21 +342,16 @@ pub async fn handle_upload_part_copy( }; // Check source version is not inlined - match source_version_data { - ObjectVersionData::DeleteMarker => unreachable!(), - ObjectVersionData::Inline(_meta, _bytes) => { - // This is only for small files, we don't bother handling this. - // (in AWS UploadPartCopy works for parts at least 5MB which - // is never the case of an inline object) - return Err(Error::bad_request( - "Source object is too small (minimum part size is 5Mb)", - )); - } - ObjectVersionData::FirstBlock(_meta, _first_block_hash) => (), - }; + if matches!(source_version_data, ObjectVersionData::Inline(_, _)) { + // This is only for small files, we don't bother handling this. + // (in AWS UploadPartCopy works for parts at least 5MB which + // is never the case of an inline object) + return Err(Error::bad_request( + "Source object is too small (minimum part size is 5Mb)", + )); + } - // Fetch source versin with its block list, - // and destination version to check part hasn't yet been uploaded + // Fetch source version with its block list let source_version = garage .version_table .get(&source_object_version.uuid, &EmptyKey) @@ -251,7 +361,9 @@ pub async fn handle_upload_part_copy( // We want to reuse blocks from the source version as much as possible. // However, we still need to get the data from these blocks // because we need to know it to calculate the MD5sum of the part - // which is used as its ETag. + // which is used as its ETag. For encrypted sources or destinations, + // we must always read(+decrypt) and then write(+encrypt), so we + // can never reuse data blocks as is. // First, calculate what blocks we want to keep, // and the subrange of the block to take, if the bounds of the @@ -313,6 +425,8 @@ pub async fn handle_upload_part_copy( }, false, ); + // write an empty version now to be the parent of the block_ref entries + garage.version_table.insert(&dest_version).await?; // Now, actually copy the blocks let mut md5hasher = Md5::new(); @@ -321,24 +435,44 @@ pub async fn handle_upload_part_copy( // and extract the subrange if necessary. // The second returned value is an Option<Hash>, that is Some // if and only if the block returned is a block that already existed - // in the Garage data store (thus we don't need to save it again). + // in the Garage data store and can be reused as-is instead of having + // to save it again. This excludes encrypted source blocks that we had + // to decrypt. let garage2 = garage.clone(); let order_stream = OrderTag::stream(); let source_blocks = stream::iter(blocks_to_copy) .enumerate() - .flat_map(|(i, (block_hash, range_to_copy))| { + .map(|(i, (block_hash, range_to_copy))| { let garage3 = garage2.clone(); - stream::once(async move { - let data = garage3 - .block_manager - .rpc_get_block(&block_hash, Some(order_stream.order(i as u64))) + async move { + let stream = source_encryption + .get_block(&garage3, &block_hash, Some(order_stream.order(i as u64))) .await?; + let data = read_stream_to_end(stream).await?.into_bytes(); + // For each item, we return a tuple of: + // 1. the full data block (decrypted) + // 2. an Option<Hash> that indicates the hash of the block in the block store, + // only if it can be re-used as-is in the copied object match range_to_copy { - Some(r) => Ok((data.slice(r), None)), - None => Ok((data, Some(block_hash))), + Some(r) => { + // If we are taking a subslice of the data, we cannot reuse the block as-is + Ok((data.slice(r), None)) + } + None if same_encryption => { + // If the data is unencrypted before & after, or if we are using + // the same encryption key, we can reuse the stored block, no need + // to re-send it to storage nodes. + Ok((data, Some(block_hash))) + } + None => { + // If we are decrypting / (re)encrypting with different keys, + // we cannot reuse the block as-is + Ok((data, None)) + } } - }) + } }) + .buffered(2) .peekable(); // The defragmenter is a custom stream (defined below) that concatenates @@ -346,22 +480,33 @@ pub async fn handle_upload_part_copy( // It returns a series of (Vec<u8>, Option<Hash>). // When it is done, it returns an empty vec. // Same as the previous iterator, the Option is Some(_) if and only if - // it's an existing block of the Garage data store. + // it's an existing block of the Garage data store that can be reused. let mut defragmenter = Defragmenter::new(garage.config.block_size, Box::pin(source_blocks)); let mut current_offset = 0; let mut next_block = defragmenter.next().await?; + // TODO this could be optimized similarly to read_and_put_blocks + // low priority because uploadpartcopy is rarely used loop { let (data, existing_block_hash) = next_block; if data.is_empty() { break; } + let data_len = data.len() as u64; md5hasher.update(&data[..]); - let must_upload = existing_block_hash.is_none(); - let final_hash = existing_block_hash.unwrap_or_else(|| blake2sum(&data[..])); + let (final_data, must_upload, final_hash) = match existing_block_hash { + Some(hash) if same_encryption => (data, false, hash), + _ => tokio::task::spawn_blocking(move || { + let data_enc = dest_encryption.encrypt_block(data)?; + let hash = blake2sum(&data_enc); + Ok::<_, Error>((data_enc, true, hash)) + }) + .await + .unwrap()?, + }; dest_version.blocks.clear(); dest_version.blocks.put( @@ -371,10 +516,10 @@ pub async fn handle_upload_part_copy( }, VersionBlock { hash: final_hash, - size: data.len() as u64, + size: data_len, }, ); - current_offset += data.len() as u64; + current_offset += data_len; let block_ref = BlockRef { block: final_hash, @@ -382,36 +527,33 @@ pub async fn handle_upload_part_copy( deleted: false.into(), }; - let garage2 = garage.clone(); - let res = futures::try_join!( + let (_, _, _, next) = futures::try_join!( // Thing 1: if the block is not exactly a block that existed before, // we need to insert that data as a new block. - async move { + async { if must_upload { - garage2 + garage .block_manager - .rpc_put_block(final_hash, data, None) + .rpc_put_block(final_hash, final_data, dest_encryption.is_encrypted(), None) .await } else { Ok(()) } }, - async { - // Thing 2: we need to insert the block in the version - garage.version_table.insert(&dest_version).await?; - // Thing 3: we need to add a block reference - garage.block_ref_table.insert(&block_ref).await - }, - // Thing 4: we need to prefetch the next block + // Thing 2: we need to insert the block in the version + garage.version_table.insert(&dest_version), + // Thing 3: we need to add a block reference + garage.block_ref_table.insert(&block_ref), + // Thing 4: we need to read the next block defragmenter.next(), )?; - next_block = res.2; + next_block = next; } assert_eq!(current_offset, source_range.length); let data_md5sum = md5hasher.finalize(); - let etag = hex::encode(data_md5sum); + let etag = dest_encryption.etag_from_md5(&data_md5sum); // Put the part's ETag in the Versiontable dest_mpu.parts.put( @@ -431,13 +573,14 @@ pub async fn handle_upload_part_copy( last_modified: s3_xml::Value(msec_to_rfc3339(source_object_version.timestamp)), })?; - Ok(Response::builder() + let mut resp = Response::builder() .header("Content-Type", "application/xml") .header( "x-amz-copy-source-version-id", hex::encode(source_object_version.uuid), - ) - .body(string_body(resp_xml))?) + ); + dest_encryption.add_response_headers(&mut resp); + Ok(resp.body(string_body(resp_xml))?) } async fn get_copy_source(ctx: &ReqCtx, req: &Request<ReqBody>) -> Result<Object, Error> { diff --git a/src/api/s3/encryption.rs b/src/api/s3/encryption.rs new file mode 100644 index 00000000..2b105e90 --- /dev/null +++ b/src/api/s3/encryption.rs @@ -0,0 +1,595 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::pin::Pin; + +use aes_gcm::{ + aead::stream::{DecryptorLE31, EncryptorLE31, StreamLE31}, + aead::{Aead, AeadCore, KeyInit, OsRng}, + aes::cipher::crypto_common::rand_core::RngCore, + aes::cipher::typenum::Unsigned, + Aes256Gcm, Key, Nonce, +}; +use base64::prelude::*; +use bytes::Bytes; + +use futures::stream::Stream; +use futures::task; +use tokio::io::BufReader; + +use http::header::{HeaderMap, HeaderName, HeaderValue}; + +use garage_net::bytes_buf::BytesBuf; +use garage_net::stream::{stream_asyncread, ByteStream}; +use garage_rpc::rpc_helper::OrderTag; +use garage_util::data::Hash; +use garage_util::error::Error as GarageError; +use garage_util::migrate::Migrate; + +use garage_model::garage::Garage; +use garage_model::s3::object_table::{ObjectVersionEncryption, ObjectVersionHeaders}; + +use crate::common_error::*; +use crate::s3::error::Error; + +const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM: HeaderName = + HeaderName::from_static("x-amz-server-side-encryption-customer-algorithm"); +const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY: HeaderName = + HeaderName::from_static("x-amz-server-side-encryption-customer-key"); +const X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5: HeaderName = + HeaderName::from_static("x-amz-server-side-encryption-customer-key-md5"); + +const X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM: HeaderName = + HeaderName::from_static("x-amz-copy-source-server-side-encryption-customer-algorithm"); +const X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY: HeaderName = + HeaderName::from_static("x-amz-copy-source-server-side-encryption-customer-key"); +const X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5: HeaderName = + HeaderName::from_static("x-amz-copy-source-server-side-encryption-customer-key-md5"); + +const CUSTOMER_ALGORITHM_AES256: &[u8] = b"AES256"; + +type Md5Output = md5::digest::Output<md5::Md5Core>; + +type StreamNonceSize = aes_gcm::aead::stream::NonceSize<Aes256Gcm, StreamLE31<Aes256Gcm>>; + +// Data blocks are encrypted by smaller chunks of size 4096 bytes, +// so that data can be streamed when reading. +// This size has to be known and has to be constant, or data won't be +// readable anymore. DO NOT CHANGE THIS VALUE. +const STREAM_ENC_PLAIN_CHUNK_SIZE: usize = 0x1000; // 4096 bytes +const STREAM_ENC_CYPER_CHUNK_SIZE: usize = STREAM_ENC_PLAIN_CHUNK_SIZE + 16; + +#[derive(Clone, Copy)] +pub enum EncryptionParams { + Plaintext, + SseC { + client_key: Key<Aes256Gcm>, + client_key_md5: Md5Output, + compression_level: Option<i32>, + }, +} + +impl EncryptionParams { + pub fn is_encrypted(&self) -> bool { + !matches!(self, Self::Plaintext) + } + + pub fn is_same(a: &Self, b: &Self) -> bool { + let relevant_info = |x: &Self| match x { + Self::Plaintext => None, + Self::SseC { + client_key, + compression_level, + .. + } => Some((*client_key, compression_level.is_some())), + }; + relevant_info(a) == relevant_info(b) + } + + pub fn new_from_headers( + garage: &Garage, + headers: &HeaderMap, + ) -> Result<EncryptionParams, Error> { + let key = parse_request_headers( + headers, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, + )?; + match key { + Some((client_key, client_key_md5)) => Ok(EncryptionParams::SseC { + client_key, + client_key_md5, + compression_level: garage.config.compression_level, + }), + None => Ok(EncryptionParams::Plaintext), + } + } + + pub fn add_response_headers(&self, resp: &mut http::response::Builder) { + if let Self::SseC { client_key_md5, .. } = self { + let md5 = BASE64_STANDARD.encode(&client_key_md5); + + resp.headers_mut().unwrap().insert( + X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, + HeaderValue::from_bytes(CUSTOMER_ALGORITHM_AES256).unwrap(), + ); + resp.headers_mut().unwrap().insert( + X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, + HeaderValue::from_bytes(md5.as_bytes()).unwrap(), + ); + } + } + + pub fn check_decrypt<'a>( + garage: &Garage, + headers: &HeaderMap, + obj_enc: &'a ObjectVersionEncryption, + ) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> { + let key = parse_request_headers( + headers, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, + &X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, + )?; + Self::check_decrypt_common(garage, key, obj_enc) + } + + pub fn check_decrypt_for_copy_source<'a>( + garage: &Garage, + headers: &HeaderMap, + obj_enc: &'a ObjectVersionEncryption, + ) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> { + let key = parse_request_headers( + headers, + &X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, + &X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, + &X_AMZ_COPY_SOURCE_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, + )?; + Self::check_decrypt_common(garage, key, obj_enc) + } + + fn check_decrypt_common<'a>( + garage: &Garage, + key: Option<(Key<Aes256Gcm>, Md5Output)>, + obj_enc: &'a ObjectVersionEncryption, + ) -> Result<(Self, Cow<'a, ObjectVersionHeaders>), Error> { + match (key, &obj_enc) { + ( + Some((client_key, client_key_md5)), + ObjectVersionEncryption::SseC { + headers, + compressed, + }, + ) => { + let enc = Self::SseC { + client_key, + client_key_md5, + compression_level: if *compressed { + Some(garage.config.compression_level.unwrap_or(1)) + } else { + None + }, + }; + let plaintext = enc.decrypt_blob(&headers)?; + let headers = ObjectVersionHeaders::decode(&plaintext) + .ok_or_internal_error("Could not decode encrypted headers")?; + Ok((enc, Cow::Owned(headers))) + } + (None, ObjectVersionEncryption::Plaintext { headers }) => { + Ok((Self::Plaintext, Cow::Borrowed(headers))) + } + (_, ObjectVersionEncryption::SseC { .. }) => { + Err(Error::bad_request("Object is encrypted")) + } + (Some(_), _) => { + // TODO: should this be an OK scenario? + Err(Error::bad_request("Trying to decrypt a plaintext object")) + } + } + } + + pub fn encrypt_headers( + &self, + h: ObjectVersionHeaders, + ) -> Result<ObjectVersionEncryption, Error> { + match self { + Self::SseC { + compression_level, .. + } => { + let plaintext = h.encode().map_err(GarageError::from)?; + let ciphertext = self.encrypt_blob(&plaintext)?; + Ok(ObjectVersionEncryption::SseC { + headers: ciphertext.into_owned(), + compressed: compression_level.is_some(), + }) + } + Self::Plaintext => Ok(ObjectVersionEncryption::Plaintext { headers: h }), + } + } + + // ---- generating object Etag values ---- + pub fn etag_from_md5(&self, md5sum: &[u8]) -> String { + match self { + Self::Plaintext => hex::encode(md5sum), + Self::SseC { .. } => { + // AWS specifies that for encrypted objects, the Etag is not + // the md5sum of the data, but doesn't say what it is. + // So we just put some random bytes. + let mut random = [0u8; 16]; + OsRng.fill_bytes(&mut random); + hex::encode(&random) + } + } + } + + // ---- generic function for encrypting / decrypting blobs ---- + // Prepends a randomly-generated nonce to the encrypted value. + // This is used for encrypting object headers and inlined data for small objects. + // This does not compress anything. + + pub fn encrypt_blob<'a>(&self, blob: &'a [u8]) -> Result<Cow<'a, [u8]>, Error> { + match self { + Self::SseC { client_key, .. } => { + let cipher = Aes256Gcm::new(&client_key); + let nonce = Aes256Gcm::generate_nonce(&mut OsRng); + let ciphertext = cipher + .encrypt(&nonce, blob) + .ok_or_internal_error("Encryption failed")?; + Ok(Cow::Owned([nonce.to_vec(), ciphertext].concat())) + } + Self::Plaintext => Ok(Cow::Borrowed(blob)), + } + } + + pub fn decrypt_blob<'a>(&self, blob: &'a [u8]) -> Result<Cow<'a, [u8]>, Error> { + match self { + Self::SseC { client_key, .. } => { + let cipher = Aes256Gcm::new(&client_key); + let nonce_size = <Aes256Gcm as AeadCore>::NonceSize::to_usize(); + let nonce = Nonce::from_slice( + blob.get(..nonce_size) + .ok_or_internal_error("invalid encrypted data")?, + ); + let plaintext = cipher + .decrypt(nonce, &blob[nonce_size..]) + .ok_or_bad_request( + "Invalid encryption key, could not decrypt object metadata.", + )?; + Ok(Cow::Owned(plaintext)) + } + Self::Plaintext => Ok(Cow::Borrowed(blob)), + } + } + + // ---- function for encrypting / decrypting byte streams ---- + + /// Get a data block from the storage node, and decrypt+decompress it + /// if necessary. If object is plaintext, just get it without any processing. + pub async fn get_block( + &self, + garage: &Garage, + hash: &Hash, + order: Option<OrderTag>, + ) -> Result<ByteStream, GarageError> { + let raw_block = garage + .block_manager + .rpc_get_block_streaming(hash, order) + .await?; + Ok(self.decrypt_block_stream(raw_block)) + } + + pub fn decrypt_block_stream(&self, stream: ByteStream) -> ByteStream { + match self { + Self::Plaintext => stream, + Self::SseC { + client_key, + compression_level, + .. + } => { + let plaintext = DecryptStream::new(stream, *client_key); + if compression_level.is_some() { + let reader = stream_asyncread(Box::pin(plaintext)); + let reader = BufReader::new(reader); + let reader = async_compression::tokio::bufread::ZstdDecoder::new(reader); + Box::pin(tokio_util::io::ReaderStream::new(reader)) + } else { + Box::pin(plaintext) + } + } + } + } + + /// Encrypt a data block if encryption is set, for use before + /// putting the data blocks into storage + pub fn encrypt_block(&self, block: Bytes) -> Result<Bytes, Error> { + match self { + Self::Plaintext => Ok(block), + Self::SseC { + client_key, + compression_level, + .. + } => { + let block = if let Some(level) = compression_level { + Cow::Owned( + garage_block::zstd_encode(block.as_ref(), *level) + .ok_or_internal_error("failed to compress data block")?, + ) + } else { + Cow::Borrowed(block.as_ref()) + }; + + let mut ret = Vec::with_capacity(block.len() + 32 + block.len() / 64); + + let mut nonce: Nonce<StreamNonceSize> = Default::default(); + OsRng.fill_bytes(&mut nonce); + ret.extend_from_slice(nonce.as_slice()); + + let mut cipher = EncryptorLE31::<Aes256Gcm>::new(&client_key, &nonce); + let mut iter = block.chunks(STREAM_ENC_PLAIN_CHUNK_SIZE).peekable(); + + if iter.peek().is_none() { + // Empty stream: we encrypt an empty last chunk + let chunk_enc = cipher + .encrypt_last(&[][..]) + .ok_or_internal_error("failed to encrypt chunk")?; + ret.extend_from_slice(&chunk_enc); + } else { + loop { + let chunk = iter.next().unwrap(); + if iter.peek().is_some() { + let chunk_enc = cipher + .encrypt_next(chunk) + .ok_or_internal_error("failed to encrypt chunk")?; + assert_eq!(chunk.len(), STREAM_ENC_PLAIN_CHUNK_SIZE); + assert_eq!(chunk_enc.len(), STREAM_ENC_CYPER_CHUNK_SIZE); + ret.extend_from_slice(&chunk_enc); + } else { + // use encrypt_last for the last chunk + let chunk_enc = cipher + .encrypt_last(chunk) + .ok_or_internal_error("failed to encrypt chunk")?; + ret.extend_from_slice(&chunk_enc); + break; + } + } + } + + Ok(ret.into()) + } + } + } +} + +fn parse_request_headers( + headers: &HeaderMap, + alg_header: &HeaderName, + key_header: &HeaderName, + md5_header: &HeaderName, +) -> Result<Option<(Key<Aes256Gcm>, Md5Output)>, Error> { + let alg = headers.get(alg_header).map(HeaderValue::as_bytes); + let key = headers.get(key_header).map(HeaderValue::as_bytes); + let md5 = headers.get(md5_header).map(HeaderValue::as_bytes); + + match alg { + Some(CUSTOMER_ALGORITHM_AES256) => { + use md5::{Digest, Md5}; + + let key_b64 = + key.ok_or_bad_request("Missing server-side-encryption-customer-key header")?; + let key_bytes: [u8; 32] = BASE64_STANDARD + .decode(&key_b64) + .ok_or_bad_request( + "Invalid server-side-encryption-customer-key header: invalid base64", + )? + .try_into() + .ok() + .ok_or_bad_request( + "Invalid server-side-encryption-customer-key header: invalid length", + )?; + + let md5_b64 = + md5.ok_or_bad_request("Missing server-side-encryption-customer-key-md5 header")?; + let md5_bytes = BASE64_STANDARD.decode(&md5_b64).ok_or_bad_request( + "Invalid server-side-encryption-customer-key-md5 header: invalid bass64", + )?; + + let mut hasher = Md5::new(); + hasher.update(&key_bytes[..]); + let our_md5 = hasher.finalize(); + if our_md5.as_slice() != md5_bytes.as_slice() { + return Err(Error::bad_request( + "Server-side encryption client key MD5 checksum does not match", + )); + } + + Ok(Some((key_bytes.into(), our_md5))) + } + Some(alg) => Err(Error::InvalidEncryptionAlgorithm( + String::from_utf8_lossy(alg).into_owned(), + )), + None => { + if key.is_some() || md5.is_some() { + Err(Error::bad_request( + "Unexpected server-side-encryption-customer-key{,-md5} header(s)", + )) + } else { + Ok(None) + } + } + } +} + +// ---- encrypt & decrypt streams ---- + +#[pin_project::pin_project] +struct DecryptStream { + #[pin] + stream: ByteStream, + done_reading: bool, + buf: BytesBuf, + key: Key<Aes256Gcm>, + state: DecryptStreamState, +} + +enum DecryptStreamState { + Starting, + Running(DecryptorLE31<Aes256Gcm>), + Done, +} + +impl DecryptStream { + fn new(stream: ByteStream, key: Key<Aes256Gcm>) -> Self { + Self { + stream, + done_reading: false, + buf: BytesBuf::new(), + key, + state: DecryptStreamState::Starting, + } + } +} + +impl Stream for DecryptStream { + type Item = Result<Bytes, std::io::Error>; + + fn poll_next( + self: Pin<&mut Self>, + cx: &mut task::Context<'_>, + ) -> task::Poll<Option<Self::Item>> { + use std::task::Poll; + + let mut this = self.project(); + + // The first bytes of the stream should contain the starting nonce. + // If we don't have a Running state, it means that we haven't + // yet read the nonce. + while matches!(this.state, DecryptStreamState::Starting) { + let nonce_size = StreamNonceSize::to_usize(); + if let Some(nonce) = this.buf.take_exact(nonce_size) { + let nonce = Nonce::from_slice(nonce.as_ref()); + *this.state = DecryptStreamState::Running(DecryptorLE31::new(&this.key, nonce)); + break; + } + + match futures::ready!(this.stream.as_mut().poll_next(cx)) { + Some(Ok(bytes)) => { + this.buf.extend(bytes); + } + Some(Err(e)) => { + return Poll::Ready(Some(Err(e))); + } + None => { + return Poll::Ready(Some(Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "Decrypt: unexpected EOF, could not read nonce", + )))); + } + } + } + + // Read at least one byte more than the encrypted chunk size + // (if possible), so that we know if we are decrypting the + // last chunk or not. + while !*this.done_reading && this.buf.len() <= STREAM_ENC_CYPER_CHUNK_SIZE { + match futures::ready!(this.stream.as_mut().poll_next(cx)) { + Some(Ok(bytes)) => { + this.buf.extend(bytes); + } + Some(Err(e)) => { + return Poll::Ready(Some(Err(e))); + } + None => { + *this.done_reading = true; + break; + } + } + } + + if matches!(this.state, DecryptStreamState::Done) { + if !this.buf.is_empty() { + return Poll::Ready(Some(Err(std::io::Error::new( + std::io::ErrorKind::Other, + "Decrypt: unexpected bytes after last encrypted chunk", + )))); + } + return Poll::Ready(None); + } + + let res = if this.buf.len() > STREAM_ENC_CYPER_CHUNK_SIZE { + // we have strictly more bytes than the encrypted chunk size, + // so we know this is not the last + let DecryptStreamState::Running(ref mut cipher) = this.state else { + unreachable!() + }; + let chunk = this.buf.take_exact(STREAM_ENC_CYPER_CHUNK_SIZE).unwrap(); + let chunk_dec = cipher.decrypt_next(chunk.as_ref()); + if let Ok(c) = &chunk_dec { + assert_eq!(c.len(), STREAM_ENC_PLAIN_CHUNK_SIZE); + } + chunk_dec + } else { + // We have one encrypted chunk size or less, even though we tried + // to read more, so this is the last chunk. Decrypt using the + // appropriate decrypt_last() function that then destroys the cipher. + let state = std::mem::replace(this.state, DecryptStreamState::Done); + let DecryptStreamState::Running(cipher) = state else { + unreachable!() + }; + let chunk = this.buf.take_all(); + cipher.decrypt_last(chunk.as_ref()) + }; + + match res { + Ok(bytes) if bytes.is_empty() => Poll::Ready(None), + Ok(bytes) => Poll::Ready(Some(Ok(bytes.into()))), + Err(_) => Poll::Ready(Some(Err(std::io::Error::new( + std::io::ErrorKind::Other, + "Decryption failed", + )))), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use futures::stream::StreamExt; + use garage_net::stream::read_stream_to_end; + + fn stream() -> ByteStream { + Box::pin( + futures::stream::iter(16usize..1024) + .map(|i| Ok(Bytes::from(vec![(i % 256) as u8; (i * 37) % 1024]))), + ) + } + + async fn test_block_enc(compression_level: Option<i32>) { + let enc = EncryptionParams::SseC { + client_key: Aes256Gcm::generate_key(&mut OsRng), + client_key_md5: Default::default(), // not needed + compression_level, + }; + + let block_plain = read_stream_to_end(stream()).await.unwrap().into_bytes(); + + let block_enc = enc.encrypt_block(block_plain.clone()).unwrap(); + + let block_dec = + enc.decrypt_block_stream(Box::pin(futures::stream::once(async { Ok(block_enc) }))); + let block_dec = read_stream_to_end(block_dec).await.unwrap().into_bytes(); + + assert_eq!(block_plain, block_dec); + assert!(block_dec.len() > 128000); + } + + #[tokio::test] + async fn test_encrypt_block() { + test_block_enc(None).await + } + + #[tokio::test] + async fn test_encrypt_block_compressed() { + test_block_enc(Some(1)).await + } +} diff --git a/src/api/s3/error.rs b/src/api/s3/error.rs index f86c19a6..5cb5d04e 100644 --- a/src/api/s3/error.rs +++ b/src/api/s3/error.rs @@ -65,6 +65,10 @@ pub enum Error { #[error(display = "Invalid HTTP range: {:?}", _0)] InvalidRange(#[error(from)] (http_range::HttpRangeParseError, u64)), + /// The client sent a range header with invalid value + #[error(display = "Invalid encryption algorithm: {:?}, should be AES256", _0)] + InvalidEncryptionAlgorithm(String), + /// The client sent a request for an action not supported by garage #[error(display = "Unimplemented action: {}", _0)] NotImplemented(String), @@ -126,6 +130,7 @@ impl Error { Error::InvalidXml(_) => "MalformedXML", Error::InvalidRange(_) => "InvalidRange", Error::InvalidUtf8Str(_) | Error::InvalidUtf8String(_) => "InvalidRequest", + Error::InvalidEncryptionAlgorithm(_) => "InvalidEncryptionAlgorithmError", } } } @@ -143,6 +148,7 @@ impl ApiError for Error { | Error::InvalidPart | Error::InvalidPartOrder | Error::EntityTooSmall + | Error::InvalidEncryptionAlgorithm(_) | Error::InvalidXml(_) | Error::InvalidUtf8Str(_) | Error::InvalidUtf8String(_) => StatusCode::BAD_REQUEST, diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index ed996fb1..ec300ab7 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -1,10 +1,12 @@ //! Function related to GET and HEAD requests +use std::collections::BTreeMap; use std::convert::TryInto; use std::sync::Arc; use std::time::{Duration, UNIX_EPOCH}; +use bytes::Bytes; use futures::future; -use futures::stream::{self, StreamExt}; +use futures::stream::{self, Stream, StreamExt}; use http::header::{ ACCEPT_RANGES, CACHE_CONTROL, CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_LENGTH, CONTENT_RANGE, CONTENT_TYPE, ETAG, EXPIRES, IF_MODIFIED_SINCE, IF_NONE_MATCH, @@ -25,6 +27,7 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::ResBody; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; const X_AMZ_MP_PARTS_COUNT: &str = "x-amz-mp-parts-count"; @@ -42,6 +45,8 @@ pub struct GetObjectOverrides { fn object_headers( version: &ObjectVersion, version_meta: &ObjectVersionMeta, + headers: &ObjectVersionHeaders, + encryption: EncryptionParams, ) -> http::response::Builder { debug!("Version meta: {:?}", version_meta); @@ -49,7 +54,6 @@ fn object_headers( let date_str = httpdate::fmt_http_date(date); let mut resp = Response::builder() - .header(CONTENT_TYPE, version_meta.headers.content_type.to_string()) .header(LAST_MODIFIED, date_str) .header(ACCEPT_RANGES, "bytes".to_string()); @@ -57,10 +61,27 @@ fn object_headers( resp = resp.header(ETAG, format!("\"{}\"", version_meta.etag)); } - for (k, v) in version_meta.headers.other.iter() { - resp = resp.header(k, v.to_string()); + // When metadata is retrieved through the REST API, Amazon S3 combines headers that + // have the same name (ignoring case) into a comma-delimited list. + // See: https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingMetadata.html + let mut headers_by_name = BTreeMap::new(); + for (name, value) in headers.0.iter() { + match headers_by_name.get_mut(name) { + None => { + headers_by_name.insert(name, vec![value.as_str()]); + } + Some(headers) => { + headers.push(value.as_str()); + } + } + } + + for (name, values) in headers_by_name { + resp = resp.header(name, values.join(",")); } + encryption.add_response_headers(&mut resp); + resp } @@ -175,21 +196,27 @@ pub async fn handle_head_without_ctx( return Ok(cached); } + let (encryption, headers) = + EncryptionParams::check_decrypt(&garage, req.headers(), &version_meta.encryption)?; + if let Some(pn) = part_number { match version_data { - ObjectVersionData::Inline(_, bytes) => { + ObjectVersionData::Inline(_, _) => { if pn != 1 { return Err(Error::InvalidPart); } - Ok(object_headers(object_version, version_meta) - .header(CONTENT_LENGTH, format!("{}", bytes.len())) - .header( - CONTENT_RANGE, - format!("bytes 0-{}/{}", bytes.len() - 1, bytes.len()), - ) - .header(X_AMZ_MP_PARTS_COUNT, "1") - .status(StatusCode::PARTIAL_CONTENT) - .body(empty_body())?) + let bytes_len = version_meta.size; + Ok( + object_headers(object_version, version_meta, &headers, encryption) + .header(CONTENT_LENGTH, format!("{}", bytes_len)) + .header( + CONTENT_RANGE, + format!("bytes 0-{}/{}", bytes_len - 1, bytes_len), + ) + .header(X_AMZ_MP_PARTS_COUNT, "1") + .status(StatusCode::PARTIAL_CONTENT) + .body(empty_body())?, + ) } ObjectVersionData::FirstBlock(_, _) => { let version = garage @@ -201,28 +228,32 @@ pub async fn handle_head_without_ctx( let (part_offset, part_end) = calculate_part_bounds(&version, pn).ok_or(Error::InvalidPart)?; - Ok(object_headers(object_version, version_meta) - .header(CONTENT_LENGTH, format!("{}", part_end - part_offset)) - .header( - CONTENT_RANGE, - format!( - "bytes {}-{}/{}", - part_offset, - part_end - 1, - version_meta.size - ), - ) - .header(X_AMZ_MP_PARTS_COUNT, format!("{}", version.n_parts()?)) - .status(StatusCode::PARTIAL_CONTENT) - .body(empty_body())?) + Ok( + object_headers(object_version, version_meta, &headers, encryption) + .header(CONTENT_LENGTH, format!("{}", part_end - part_offset)) + .header( + CONTENT_RANGE, + format!( + "bytes {}-{}/{}", + part_offset, + part_end - 1, + version_meta.size + ), + ) + .header(X_AMZ_MP_PARTS_COUNT, format!("{}", version.n_parts()?)) + .status(StatusCode::PARTIAL_CONTENT) + .body(empty_body())?, + ) } _ => unreachable!(), } } else { - Ok(object_headers(object_version, version_meta) - .header(CONTENT_LENGTH, format!("{}", version_meta.size)) - .status(StatusCode::OK) - .body(empty_body())?) + Ok( + object_headers(object_version, version_meta, &headers, encryption) + .header(CONTENT_LENGTH, format!("{}", version_meta.size)) + .status(StatusCode::OK) + .body(empty_body())?, + ) } } @@ -273,23 +304,41 @@ pub async fn handle_get_without_ctx( return Ok(cached); } + let (enc, headers) = + EncryptionParams::check_decrypt(&garage, req.headers(), &last_v_meta.encryption)?; + match (part_number, parse_range_header(req, last_v_meta.size)?) { (Some(_), Some(_)) => Err(Error::bad_request( "Cannot specify both partNumber and Range header", )), - (Some(pn), None) => handle_get_part(garage, last_v, last_v_data, last_v_meta, pn).await, + (Some(pn), None) => { + handle_get_part(garage, last_v, last_v_data, last_v_meta, enc, &headers, pn).await + } (None, Some(range)) => { handle_get_range( garage, last_v, last_v_data, last_v_meta, + enc, + &headers, range.start, range.start + range.length, ) .await } - (None, None) => handle_get_full(garage, last_v, last_v_data, last_v_meta, overrides).await, + (None, None) => { + handle_get_full( + garage, + last_v, + last_v_data, + last_v_meta, + enc, + &headers, + overrides, + ) + .await + } } } @@ -298,17 +347,36 @@ async fn handle_get_full( version: &ObjectVersion, version_data: &ObjectVersionData, version_meta: &ObjectVersionMeta, + encryption: EncryptionParams, + headers: &ObjectVersionHeaders, overrides: GetObjectOverrides, ) -> Result<Response<ResBody>, Error> { - let mut resp_builder = object_headers(version, version_meta) + let mut resp_builder = object_headers(version, version_meta, &headers, encryption) .header(CONTENT_LENGTH, format!("{}", version_meta.size)) .status(StatusCode::OK); getobject_override_headers(overrides, &mut resp_builder)?; + let stream = full_object_byte_stream(garage, version, version_data, encryption); + + Ok(resp_builder.body(response_body_from_stream(stream))?) +} + +pub fn full_object_byte_stream( + garage: Arc<Garage>, + version: &ObjectVersion, + version_data: &ObjectVersionData, + encryption: EncryptionParams, +) -> ByteStream { match &version_data { ObjectVersionData::DeleteMarker => unreachable!(), ObjectVersionData::Inline(_, bytes) => { - Ok(resp_builder.body(bytes_body(bytes.to_vec().into()))?) + let bytes = bytes.to_vec(); + Box::pin(futures::stream::once(async move { + encryption + .decrypt_blob(&bytes) + .map(|x| Bytes::from(x.to_vec())) + .map_err(std_error_from_read_error) + })) } ObjectVersionData::FirstBlock(_, first_block_hash) => { let (tx, rx) = mpsc::channel::<ByteStream>(2); @@ -324,19 +392,18 @@ async fn handle_get_full( garage2.version_table.get(&version_uuid, &EmptyKey).await }); - let stream_block_0 = garage - .block_manager - .rpc_get_block_streaming(&first_block_hash, Some(order_stream.order(0))) + let stream_block_0 = encryption + .get_block(&garage, &first_block_hash, Some(order_stream.order(0))) .await?; + tx.send(stream_block_0) .await .ok_or_message("channel closed")?; let version = version_fut.await.unwrap()?.ok_or(Error::NoSuchKey)?; for (i, (_, vb)) in version.blocks.items().iter().enumerate().skip(1) { - let stream_block_i = garage - .block_manager - .rpc_get_block_streaming(&vb.hash, Some(order_stream.order(i as u64))) + let stream_block_i = encryption + .get_block(&garage, &vb.hash, Some(order_stream.order(i as u64))) .await?; tx.send(stream_block_i) .await @@ -354,8 +421,7 @@ async fn handle_get_full( } }); - let body = response_body_from_block_stream(rx); - Ok(resp_builder.body(body)?) + Box::pin(tokio_stream::wrappers::ReceiverStream::new(rx).flatten()) } } } @@ -365,13 +431,15 @@ async fn handle_get_range( version: &ObjectVersion, version_data: &ObjectVersionData, version_meta: &ObjectVersionMeta, + encryption: EncryptionParams, + headers: &ObjectVersionHeaders, begin: u64, end: u64, ) -> Result<Response<ResBody>, Error> { // Here we do not use getobject_override_headers because we don't // want to add any overridden headers (those should not be added // when returning PARTIAL_CONTENT) - let resp_builder = object_headers(version, version_meta) + let resp_builder = object_headers(version, version_meta, headers, encryption) .header(CONTENT_LENGTH, format!("{}", end - begin)) .header( CONTENT_RANGE, @@ -382,6 +450,7 @@ async fn handle_get_range( match &version_data { ObjectVersionData::DeleteMarker => unreachable!(), ObjectVersionData::Inline(_meta, bytes) => { + let bytes = encryption.decrypt_blob(&bytes)?; if end as usize <= bytes.len() { let body = bytes_body(bytes[begin as usize..end as usize].to_vec().into()); Ok(resp_builder.body(body)?) @@ -398,7 +467,8 @@ async fn handle_get_range( .await? .ok_or(Error::NoSuchKey)?; - let body = body_from_blocks_range(garage, version.blocks.items(), begin, end); + let body = + body_from_blocks_range(garage, encryption, version.blocks.items(), begin, end); Ok(resp_builder.body(body)?) } } @@ -409,17 +479,21 @@ async fn handle_get_part( object_version: &ObjectVersion, version_data: &ObjectVersionData, version_meta: &ObjectVersionMeta, + encryption: EncryptionParams, + headers: &ObjectVersionHeaders, part_number: u64, ) -> Result<Response<ResBody>, Error> { // Same as for get_range, no getobject_override_headers - let resp_builder = - object_headers(object_version, version_meta).status(StatusCode::PARTIAL_CONTENT); + let resp_builder = object_headers(object_version, version_meta, headers, encryption) + .status(StatusCode::PARTIAL_CONTENT); match version_data { ObjectVersionData::Inline(_, bytes) => { if part_number != 1 { return Err(Error::InvalidPart); } + let bytes = encryption.decrypt_blob(&bytes)?; + assert_eq!(bytes.len() as u64, version_meta.size); Ok(resp_builder .header(CONTENT_LENGTH, format!("{}", bytes.len())) .header( @@ -427,7 +501,7 @@ async fn handle_get_part( format!("bytes {}-{}/{}", 0, bytes.len() - 1, bytes.len()), ) .header(X_AMZ_MP_PARTS_COUNT, "1") - .body(bytes_body(bytes.to_vec().into()))?) + .body(bytes_body(bytes.into_owned().into()))?) } ObjectVersionData::FirstBlock(_, _) => { let version = garage @@ -439,7 +513,8 @@ async fn handle_get_part( let (begin, end) = calculate_part_bounds(&version, part_number).ok_or(Error::InvalidPart)?; - let body = body_from_blocks_range(garage, version.blocks.items(), begin, end); + let body = + body_from_blocks_range(garage, encryption, version.blocks.items(), begin, end); Ok(resp_builder .header(CONTENT_LENGTH, format!("{}", end - begin)) @@ -494,6 +569,7 @@ fn calculate_part_bounds(v: &Version, part_number: u64) -> Option<(u64, u64)> { fn body_from_blocks_range( garage: Arc<Garage>, + encryption: EncryptionParams, all_blocks: &[(VersionBlockKey, VersionBlock)], begin: u64, end: u64, @@ -523,12 +599,11 @@ fn body_from_blocks_range( tokio::spawn(async move { match async { - let garage = garage.clone(); for (i, (block, block_offset)) in blocks.iter().enumerate() { - let block_stream = garage - .block_manager - .rpc_get_block_streaming(&block.hash, Some(order_stream.order(i as u64))) - .await? + let block_stream = encryption + .get_block(&garage, &block.hash, Some(order_stream.order(i as u64))) + .await?; + let block_stream = block_stream .scan(*block_offset, move |chunk_offset, chunk| { let r = match chunk { Ok(chunk_bytes) => { @@ -588,19 +663,30 @@ fn body_from_blocks_range( } fn response_body_from_block_stream(rx: mpsc::Receiver<ByteStream>) -> ResBody { - let body_stream = tokio_stream::wrappers::ReceiverStream::new(rx) - .flatten() - .map(|x| { - x.map(hyper::body::Frame::data) - .map_err(|e| Error::from(garage_util::error::Error::from(e))) - }); + let body_stream = tokio_stream::wrappers::ReceiverStream::new(rx).flatten(); + response_body_from_stream(body_stream) +} + +fn response_body_from_stream<S>(stream: S) -> ResBody +where + S: Stream<Item = Result<Bytes, std::io::Error>> + Send + Sync + 'static, +{ + let body_stream = stream.map(|x| { + x.map(hyper::body::Frame::data) + .map_err(|e| Error::from(garage_util::error::Error::from(e))) + }); ResBody::new(http_body_util::StreamBody::new(body_stream)) } fn error_stream_item<E: std::fmt::Display>(e: E) -> ByteStream { - let err = std::io::Error::new( + Box::pin(stream::once(future::ready(Err(std_error_from_read_error( + e, + ))))) +} + +fn std_error_from_read_error<E: std::fmt::Display>(e: E) -> std::io::Error { + std::io::Error::new( std::io::ErrorKind::Other, - format!("Error while getting object data: {}", e), - ); - Box::pin(stream::once(future::ready(Err(err)))) + format!("Error while reading object data: {}", e), + ) } diff --git a/src/api/s3/list.rs b/src/api/s3/list.rs index 302c03f4..1678f1fa 100644 --- a/src/api/s3/list.rs +++ b/src/api/s3/list.rs @@ -944,9 +944,8 @@ mod tests { timestamp: TS, state: ObjectVersionState::Uploading { multipart: true, - headers: ObjectVersionHeaders { - content_type: "text/plain".to_string(), - other: BTreeMap::<String, String>::new(), + encryption: ObjectVersionEncryption::Plaintext { + headers: ObjectVersionHeaders(vec![]), }, }, } diff --git a/src/api/s3/mod.rs b/src/api/s3/mod.rs index cbdb94ab..1eb95d40 100644 --- a/src/api/s3/mod.rs +++ b/src/api/s3/mod.rs @@ -13,5 +13,6 @@ mod post_object; mod put; mod website; +mod encryption; mod router; pub mod xml; diff --git a/src/api/s3/multipart.rs b/src/api/s3/multipart.rs index 1d5aeb26..fcc5769f 100644 --- a/src/api/s3/multipart.rs +++ b/src/api/s3/multipart.rs @@ -16,6 +16,7 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; use crate::s3::put::*; use crate::s3::xml as s3_xml; @@ -41,13 +42,17 @@ pub async fn handle_create_multipart_upload( let headers = get_headers(req.headers())?; + // Determine whether object should be encrypted, and if so the key + let encryption = EncryptionParams::new_from_headers(&garage, req.headers())?; + let object_encryption = encryption.encrypt_headers(headers)?; + // Create object in object table let object_version = ObjectVersion { uuid: upload_id, timestamp, state: ObjectVersionState::Uploading { multipart: true, - headers, + encryption: object_encryption, }, }; let object = Object::new(*bucket_id, key.to_string(), vec![object_version]); @@ -68,7 +73,9 @@ pub async fn handle_create_multipart_upload( }; let xml = s3_xml::to_xml_with_header(&result)?; - Ok(Response::new(string_body(xml))) + let mut resp = Response::builder(); + encryption.add_response_headers(&mut resp); + Ok(resp.body(string_body(xml))?) } pub async fn handle_put_part( @@ -91,12 +98,21 @@ pub async fn handle_put_part( // Read first chuck, and at the same time try to get object to see if it exists let key = key.to_string(); - let stream = body_stream(req.into_body()); + let (req_head, req_body) = req.into_parts(); + let stream = body_stream(req_body); let mut chunker = StreamChunker::new(stream, garage.config.block_size); - let ((_, _, mut mpu), first_block) = + let ((_, object_version, mut mpu), first_block) = futures::try_join!(get_upload(&ctx, &key, &upload_id), chunker.next(),)?; + // Check encryption params + let object_encryption = match object_version.state { + ObjectVersionState::Uploading { encryption, .. } => encryption, + _ => unreachable!(), + }; + let (encryption, _) = + EncryptionParams::check_decrypt(&garage, &req_head.headers, &object_encryption)?; + // Check object is valid and part can be accepted let first_block = first_block.ok_or_bad_request("Empty body")?; @@ -136,24 +152,32 @@ pub async fn handle_put_part( garage.version_table.insert(&version).await?; // Copy data to version - let (total_size, data_md5sum, data_sha256sum, _) = - read_and_put_blocks(&ctx, &version, part_number, first_block, &mut chunker).await?; + let (total_size, data_md5sum, data_sha256sum, _) = read_and_put_blocks( + &ctx, + &version, + encryption, + part_number, + first_block, + &mut chunker, + ) + .await?; // Verify that checksums map ensure_checksum_matches( - data_md5sum.as_slice(), + &data_md5sum, data_sha256sum, content_md5.as_deref(), content_sha256, )?; // Store part etag in version - let data_md5sum_hex = hex::encode(data_md5sum); + let etag = encryption.etag_from_md5(&data_md5sum); + mpu.parts.put( mpu_part_key, MpuPart { version: version_uuid, - etag: Some(data_md5sum_hex.clone()), + etag: Some(etag.clone()), size: Some(total_size), }, ); @@ -163,11 +187,9 @@ pub async fn handle_put_part( // We won't have to clean up on drop. interrupted_cleanup.cancel(); - let response = Response::builder() - .header("ETag", format!("\"{}\"", data_md5sum_hex)) - .body(empty_body()) - .unwrap(); - Ok(response) + let mut resp = Response::builder().header("ETag", format!("\"{}\"", etag)); + encryption.add_response_headers(&mut resp); + Ok(resp.body(empty_body())?) } struct InterruptedCleanup(Option<InterruptedCleanupInner>); @@ -241,8 +263,8 @@ pub async fn handle_complete_multipart_upload( return Err(Error::bad_request("No data was uploaded")); } - let headers = match object_version.state { - ObjectVersionState::Uploading { headers, .. } => headers, + let object_encryption = match object_version.state { + ObjectVersionState::Uploading { encryption, .. } => encryption, _ => unreachable!(), }; @@ -344,7 +366,7 @@ pub async fn handle_complete_multipart_upload( // Write final object version object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( ObjectVersionMeta { - headers, + encryption: object_encryption, size: total_size, etag: etag.clone(), }, diff --git a/src/api/s3/post_object.rs b/src/api/s3/post_object.rs index 66f8174c..7c4219a7 100644 --- a/src/api/s3/post_object.rs +++ b/src/api/s3/post_object.rs @@ -18,6 +18,7 @@ use garage_model::garage::Garage; use crate::helpers::*; use crate::s3::api_server::ResBody; use crate::s3::cors::*; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; use crate::s3::put::{get_headers, save_stream}; use crate::s3::xml as s3_xml; @@ -48,13 +49,17 @@ pub async fn handle_post_object( let mut multipart = Multipart::with_constraints(stream, boundary, constraints); let mut params = HeaderMap::new(); - let field = loop { + let file_field = loop { let field = if let Some(field) = multipart.next_field().await? { field } else { return Err(Error::bad_request("Request did not contain a file")); }; - let name: HeaderName = if let Some(Ok(name)) = field.name().map(TryInto::try_into) { + let name: HeaderName = if let Some(Ok(name)) = field + .name() + .map(str::to_ascii_lowercase) + .map(TryInto::try_into) + { name } else { continue; @@ -93,10 +98,14 @@ pub async fn handle_post_object( .ok_or_bad_request("No policy was provided")? .to_str()?; let authorization = Authorization::parse_form(¶ms)?; + let content_md5 = params + .get("content-md5") + .map(HeaderValue::to_str) + .transpose()?; let key = if key.contains("${filename}") { // if no filename is provided, don't replace. This matches the behavior of AWS. - if let Some(filename) = field.file_name() { + if let Some(filename) = file_field.file_name() { key.replace("${filename}", filename) } else { key.to_owned() @@ -143,9 +152,8 @@ pub async fn handle_post_object( let mut conditions = decoded_policy.into_conditions()?; for (param_key, value) in params.iter() { - let mut param_key = param_key.to_string(); - param_key.make_ascii_lowercase(); - match param_key.as_str() { + let param_key = param_key.as_str(); + match param_key { "policy" | "x-amz-signature" => (), // this is always accepted, as it's required to validate other fields "content-type" => { let conds = conditions.params.remove("content-type").ok_or_else(|| { @@ -190,7 +198,7 @@ pub async fn handle_post_object( // how aws seems to behave. continue; } - let conds = conditions.params.remove(¶m_key).ok_or_else(|| { + let conds = conditions.params.remove(param_key).ok_or_else(|| { Error::bad_request(format!("Key '{}' is not allowed in policy", param_key)) })?; for cond in conds { @@ -218,8 +226,9 @@ pub async fn handle_post_object( let headers = get_headers(¶ms)?; - let stream = field.map(|r| r.map_err(Into::into)); + let encryption = EncryptionParams::new_from_headers(&garage, ¶ms)?; + let stream = file_field.map(|r| r.map_err(Into::into)); let ctx = ReqCtx { garage, bucket_id, @@ -228,17 +237,18 @@ pub async fn handle_post_object( api_key, }; - let (_, md5) = save_stream( + let res = save_stream( &ctx, headers, + encryption, StreamLimiter::new(stream, conditions.content_length), &key, - None, + content_md5.map(str::to_string), None, ) .await?; - let etag = format!("\"{}\"", md5); + let etag = format!("\"{}\"", res.etag); let mut resp = if let Some(mut target) = params .get("success_action_redirect") @@ -252,11 +262,12 @@ pub async fn handle_post_object( .append_pair("key", &key) .append_pair("etag", &etag); let target = target.to_string(); - Response::builder() + let mut resp = Response::builder() .status(StatusCode::SEE_OTHER) .header(header::LOCATION, target.clone()) - .header(header::ETAG, etag) - .body(string_body(target))? + .header(header::ETAG, etag); + encryption.add_response_headers(&mut resp); + resp.body(string_body(target))? } else { let path = head .uri @@ -283,9 +294,10 @@ pub async fn handle_post_object( .get("success_action_status") .and_then(|h| h.to_str().ok()) .unwrap_or("204"); - let builder = Response::builder() + let mut builder = Response::builder() .header(header::LOCATION, location.clone()) .header(header::ETAG, etag.clone()); + encryption.add_response_headers(&mut builder); match action { "200" => builder.status(StatusCode::OK).body(empty_body())?, "201" => { diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index 685cca80..941e4122 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; use std::sync::Arc; use base64::prelude::*; @@ -36,10 +36,18 @@ use garage_model::s3::version_table::*; use crate::helpers::*; use crate::s3::api_server::{ReqBody, ResBody}; +use crate::s3::encryption::EncryptionParams; use crate::s3::error::*; const PUT_BLOCKS_MAX_PARALLEL: usize = 3; +pub struct SaveStreamResult { + pub version_uuid: Uuid, + pub version_timestamp: u64, + /// Etag WITHOUT THE QUOTES (just the hex value) + pub etag: String, +} + pub async fn handle_put( ctx: ReqCtx, req: Request<ReqBody>, @@ -50,6 +58,9 @@ pub async fn handle_put( let headers = get_headers(req.headers())?; debug!("Object headers: {:?}", headers); + // Determine whether object should be encrypted, and if so the key + let encryption = EncryptionParams::new_from_headers(&ctx.garage, req.headers())?; + let content_md5 = match req.headers().get("content-md5") { Some(x) => Some(x.to_str()?.to_string()), None => None, @@ -57,19 +68,33 @@ pub async fn handle_put( let stream = body_stream(req.into_body()); - save_stream(&ctx, headers, stream, key, content_md5, content_sha256) - .await - .map(|(uuid, md5)| put_response(uuid, md5)) + let res = save_stream( + &ctx, + headers, + encryption, + stream, + key, + content_md5, + content_sha256, + ) + .await?; + + let mut resp = Response::builder() + .header("x-amz-version-id", hex::encode(res.version_uuid)) + .header("ETag", format!("\"{}\"", res.etag)); + encryption.add_response_headers(&mut resp); + Ok(resp.body(empty_body())?) } pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( ctx: &ReqCtx, headers: ObjectVersionHeaders, + encryption: EncryptionParams, body: S, key: &String, content_md5: Option<String>, content_sha256: Option<FixedBytes32>, -) -> Result<(Uuid, String), Error> { +) -> Result<SaveStreamResult, Error> { let ReqCtx { garage, bucket_id, .. } = ctx; @@ -82,6 +107,8 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( let first_block = first_block_opt.unwrap_or_default(); + let object_encryption = encryption.encrypt_headers(headers)?; + // Generate identity of new version let version_uuid = gen_uuid(); let version_timestamp = next_timestamp(existing_object.as_ref()); @@ -92,37 +119,43 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( let mut md5sum = Md5::new(); md5sum.update(&first_block[..]); let data_md5sum = md5sum.finalize(); - let data_md5sum_hex = hex::encode(data_md5sum); let data_sha256sum = sha256sum(&first_block[..]); - let size = first_block.len() as u64; ensure_checksum_matches( - data_md5sum.as_slice(), + &data_md5sum, data_sha256sum, content_md5.as_deref(), content_sha256, )?; + let size = first_block.len() as u64; check_quotas(ctx, size, existing_object.as_ref()).await?; + let etag = encryption.etag_from_md5(&data_md5sum); + let inline_data = encryption.encrypt_blob(&first_block)?.to_vec(); + let object_version = ObjectVersion { uuid: version_uuid, timestamp: version_timestamp, state: ObjectVersionState::Complete(ObjectVersionData::Inline( ObjectVersionMeta { - headers, + encryption: object_encryption, size, - etag: data_md5sum_hex.clone(), + etag: etag.clone(), }, - first_block.to_vec(), + inline_data, )), }; let object = Object::new(*bucket_id, key.into(), vec![object_version]); garage.object_table.insert(&object).await?; - return Ok((version_uuid, data_md5sum_hex)); + return Ok(SaveStreamResult { + version_uuid, + version_timestamp, + etag, + }); } // The following consists in many steps that can each fail. @@ -142,7 +175,7 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( uuid: version_uuid, timestamp: version_timestamp, state: ObjectVersionState::Uploading { - headers: headers.clone(), + encryption: object_encryption.clone(), multipart: false, }, }; @@ -165,10 +198,10 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( // Transfer data and verify checksum let (total_size, data_md5sum, data_sha256sum, first_block_hash) = - read_and_put_blocks(ctx, &version, 1, first_block, &mut chunker).await?; + read_and_put_blocks(ctx, &version, encryption, 1, first_block, &mut chunker).await?; ensure_checksum_matches( - data_md5sum.as_slice(), + &data_md5sum, data_sha256sum, content_md5.as_deref(), content_sha256, @@ -177,12 +210,13 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( check_quotas(ctx, total_size, existing_object.as_ref()).await?; // Save final object state, marked as Complete - let md5sum_hex = hex::encode(data_md5sum); + let etag = encryption.etag_from_md5(&data_md5sum); + object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( ObjectVersionMeta { - headers, + encryption: object_encryption, size: total_size, - etag: md5sum_hex.clone(), + etag: etag.clone(), }, first_block_hash, )); @@ -193,7 +227,11 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>( // We won't have to clean up on drop. interrupted_cleanup.cancel(); - Ok((version_uuid, md5sum_hex)) + Ok(SaveStreamResult { + version_uuid, + version_timestamp, + etag, + }) } /// Validate MD5 sum against content-md5 header @@ -248,7 +286,7 @@ pub(crate) async fn check_quotas( .await?; let counters = counters - .map(|x| x.filtered_values(&garage.system.ring.borrow())) + .map(|x| x.filtered_values(&garage.system.cluster_layout())) .unwrap_or_default(); let (prev_cnt_obj, prev_cnt_size) = match prev_object { @@ -290,6 +328,7 @@ pub(crate) async fn check_quotas( pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + Unpin>( ctx: &ReqCtx, version: &Version, + encryption: EncryptionParams, part_number: u64, first_block: Bytes, chunker: &mut StreamChunker<S>, @@ -349,12 +388,31 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + )) }; - let (block_tx3, mut block_rx3) = mpsc::channel::<Result<(Bytes, Hash), Error>>(1); - let hash_blocks = async { + let (block_tx3, mut block_rx3) = mpsc::channel::<Result<(Bytes, u64, Hash), Error>>(1); + let encrypt_hash_blocks = async { let mut first_block_hash = None; while let Some(next) = block_rx2.recv().await { match next { Ok(block) => { + let unencrypted_len = block.len() as u64; + let block = if encryption.is_encrypted() { + let res = + tokio::task::spawn_blocking(move || encryption.encrypt_block(block)) + .with_context(Context::current_with_span( + tracer.start("Encrypt block"), + )) + .await + .unwrap(); + match res { + Ok(b) => b, + Err(e) => { + block_tx3.send(Err(e)).await?; + break; + } + } + } else { + block + }; let hash = async_blake2sum(block.clone()) .with_context(Context::current_with_span( tracer.start("Hash block (blake2)"), @@ -363,7 +421,7 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + if first_block_hash.is_none() { first_block_hash = Some(hash); } - block_tx3.send(Ok((block, hash))).await?; + block_tx3.send(Ok((block, unencrypted_len, hash))).await?; } Err(e) => { block_tx3.send(Err(e)).await?; @@ -398,7 +456,7 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + block_rx3.recv().await } }; - let (block, hash) = tokio::select! { + let (block, unencrypted_len, hash) = tokio::select! { result = write_futs_next => { result?; continue; @@ -410,17 +468,18 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + }; // For next block to be written: count its size and spawn future to write it - let offset = written_bytes; - written_bytes += block.len() as u64; write_futs.push_back(put_block_and_meta( ctx, version, part_number, - offset, + written_bytes, hash, block, + unencrypted_len, + encryption.is_encrypted(), order_stream.order(written_bytes), )); + written_bytes += unencrypted_len; } while let Some(res) = write_futs.next().await { res?; @@ -429,7 +488,7 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + }; let (_, stream_hash_result, block_hash_result, final_result) = - futures::join!(read_blocks, hash_stream, hash_blocks, put_blocks); + futures::join!(read_blocks, hash_stream, encrypt_hash_blocks, put_blocks); let total_size = final_result?; // unwrap here is ok, because if hasher failed, it is because something failed @@ -449,6 +508,8 @@ async fn put_block_and_meta( offset: u64, hash: Hash, block: Bytes, + size: u64, + is_encrypted: bool, order_tag: OrderTag, ) -> Result<(), GarageError> { let ReqCtx { garage, .. } = ctx; @@ -459,10 +520,7 @@ async fn put_block_and_meta( part_number, offset, }, - VersionBlock { - hash, - size: block.len() as u64, - }, + VersionBlock { hash, size }, ); let block_ref = BlockRef { @@ -474,7 +532,7 @@ async fn put_block_and_meta( futures::try_join!( garage .block_manager - .rpc_put_block(hash, block, Some(order_tag)), + .rpc_put_block(hash, block, is_encrypted, Some(order_tag)), garage.version_table.insert(&version), garage.block_ref_table.insert(&block_ref), )?; @@ -517,14 +575,6 @@ impl<S: Stream<Item = Result<Bytes, Error>> + Unpin> StreamChunker<S> { } } -pub fn put_response(version_uuid: Uuid, md5sum_hex: String) -> Response<ResBody> { - Response::builder() - .header("x-amz-version-id", hex::encode(version_uuid)) - .header("ETag", format!("\"{}\"", md5sum_hex)) - .body(empty_body()) - .unwrap() -} - struct InterruptedCleanup(Option<InterruptedCleanupInner>); struct InterruptedCleanupInner { garage: Arc<Garage>, @@ -559,57 +609,35 @@ impl Drop for InterruptedCleanup { // ============ helpers ============ -pub(crate) fn get_mime_type(headers: &HeaderMap<HeaderValue>) -> Result<String, Error> { - Ok(headers - .get(hyper::header::CONTENT_TYPE) - .map(|x| x.to_str()) - .unwrap_or(Ok("blob"))? - .to_string()) -} - pub(crate) fn get_headers(headers: &HeaderMap<HeaderValue>) -> Result<ObjectVersionHeaders, Error> { - let content_type = get_mime_type(headers)?; - let mut other = BTreeMap::new(); + let mut ret = Vec::new(); // Preserve standard headers let standard_header = vec![ + hyper::header::CONTENT_TYPE, hyper::header::CACHE_CONTROL, hyper::header::CONTENT_DISPOSITION, hyper::header::CONTENT_ENCODING, hyper::header::CONTENT_LANGUAGE, hyper::header::EXPIRES, ]; - for h in standard_header.iter() { - if let Some(v) = headers.get(h) { - match v.to_str() { - Ok(v_str) => { - other.insert(h.to_string(), v_str.to_string()); - } - Err(e) => { - warn!("Discarding header {}, error in .to_str(): {}", h, e); - } - } + for name in standard_header.iter() { + if let Some(value) = headers.get(name) { + ret.push((name.to_string(), value.to_str()?.to_string())); } } // Preserve x-amz-meta- headers - for (k, v) in headers.iter() { - if k.as_str().starts_with("x-amz-meta-") { - match std::str::from_utf8(v.as_bytes()) { - Ok(v_str) => { - other.insert(k.to_string(), v_str.to_string()); - } - Err(e) => { - warn!("Discarding header {}, error in .to_str(): {}", k, e); - } - } + for (name, value) in headers.iter() { + if name.as_str().starts_with("x-amz-meta-") { + ret.push(( + name.to_string(), + std::str::from_utf8(value.as_bytes())?.to_string(), + )); } } - Ok(ObjectVersionHeaders { - content_type, - other, - }) + Ok(ObjectVersionHeaders(ret)) } pub(crate) fn next_timestamp(existing_object: Option<&Object>) -> u64 { diff --git a/src/block/Cargo.toml b/src/block/Cargo.toml index 0f6fb5bb..b5763120 100644 --- a/src/block/Cargo.toml +++ b/src/block/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_block" -version = "0.9.3" +version = "0.10.0" authors = ["Alex Auvolat <alex@adnab.me>"] edition = "2018" license = "AGPL-3.0" diff --git a/src/block/block.rs b/src/block/block.rs index 504d11f8..bd95680e 100644 --- a/src/block/block.rs +++ b/src/block/block.rs @@ -96,7 +96,7 @@ impl DataBlock { } } -fn zstd_encode<R: std::io::Read>(mut source: R, level: i32) -> std::io::Result<Vec<u8>> { +pub fn zstd_encode<R: std::io::Read>(mut source: R, level: i32) -> std::io::Result<Vec<u8>> { let mut result = Vec::<u8>::new(); let mut encoder = Encoder::new(&mut result, level)?; encoder.include_checksum(true)?; diff --git a/src/block/lib.rs b/src/block/lib.rs index c9ff2845..6c4711ef 100644 --- a/src/block/lib.rs +++ b/src/block/lib.rs @@ -9,3 +9,5 @@ mod block; mod layout; mod metrics; mod rc; + +pub use block::zstd_encode; diff --git a/src/block/manager.rs b/src/block/manager.rs index 890ea8b7..c7e4df17 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -261,8 +261,10 @@ impl BlockManager { F: Fn(DataBlockStream) -> Fut, Fut: futures::Future<Output = Result<T, Error>>, { - let who = self.replication.read_nodes(hash); - let who = self.system.rpc.request_order(&who); + let who = self + .system + .rpc_helper() + .block_read_nodes_of(hash, self.system.rpc_helper()); for node in who.iter() { let node_id = NodeID::from(*node); @@ -302,7 +304,7 @@ impl BlockManager { // if the first one doesn't succeed rapidly // TODO: keep first request running when initiating a new one and take the // one that finishes earlier - _ = tokio::time::sleep(self.system.rpc.rpc_timeout()) => { + _ = tokio::time::sleep(self.system.rpc_helper().rpc_timeout()) => { debug!("Get block {:?}: node {:?} didn't return block in time, trying next.", hash, node); } }; @@ -335,26 +337,18 @@ impl BlockManager { } } - /// Ask nodes that might have a block for it, return it as one big Bytes - pub async fn rpc_get_block( - &self, - hash: &Hash, - order_tag: Option<OrderTag>, - ) -> Result<Bytes, Error> { - let stream = self.rpc_get_block_streaming(hash, order_tag).await?; - Ok(read_stream_to_end(stream).await?.into_bytes()) - } - /// Send block to nodes that should have it pub async fn rpc_put_block( &self, hash: Hash, data: Bytes, + prevent_compression: bool, order_tag: Option<OrderTag>, ) -> Result<(), Error> { - let who = self.replication.write_nodes(&hash); + let who = self.replication.write_sets(&hash); - let (header, bytes) = DataBlock::from_buffer(data, self.compression_level) + let compression_level = self.compression_level.filter(|_| !prevent_compression); + let (header, bytes) = DataBlock::from_buffer(data, compression_level) .await .into_parts(); let put_block_rpc = @@ -366,10 +360,10 @@ impl BlockManager { }; self.system - .rpc - .try_call_many( + .rpc_helper() + .try_write_many_sets( &self.endpoint, - &who[..], + who.as_ref(), put_block_rpc, RequestStrategy::with_priority(PRIO_NORMAL | PRIO_SECONDARY) .with_quorum(self.replication.write_quorum()), @@ -442,7 +436,7 @@ impl BlockManager { tokio::spawn(async move { if let Err(e) = this .resync - .put_to_resync(&hash, 2 * this.system.rpc.rpc_timeout()) + .put_to_resync(&hash, 2 * this.system.rpc_helper().rpc_timeout()) { error!("Block {:?} could not be put in resync queue: {}.", hash, e); } @@ -536,7 +530,7 @@ impl BlockManager { None => { // Not found but maybe we should have had it ?? self.resync - .put_to_resync(hash, 2 * self.system.rpc.rpc_timeout())?; + .put_to_resync(hash, 2 * self.system.rpc_helper().rpc_timeout())?; return Err(Error::Message(format!( "block {:?} not found on node", hash diff --git a/src/block/resync.rs b/src/block/resync.rs index 9c1da4a7..15f210e4 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -377,7 +377,7 @@ impl BlockResyncManager { info!("Resync block {:?}: offloading and deleting", hash); let existing_path = existing_path.unwrap(); - let mut who = manager.replication.write_nodes(hash); + let mut who = manager.replication.storage_nodes(hash); if who.len() < manager.replication.write_quorum() { return Err(Error::Message("Not trying to offload block because we don't have a quorum of nodes to write to".to_string())); } @@ -385,7 +385,7 @@ impl BlockResyncManager { let who_needs_resps = manager .system - .rpc + .rpc_helper() .call_many( &manager.endpoint, &who, @@ -431,10 +431,10 @@ impl BlockResyncManager { .with_stream_from_buffer(bytes); manager .system - .rpc + .rpc_helper() .try_call_many( &manager.endpoint, - &need_nodes[..], + &need_nodes, put_block_message, RequestStrategy::with_priority(PRIO_BACKGROUND) .with_quorum(need_nodes.len()), diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml index d7c89620..fddc5cca 100644 --- a/src/db/Cargo.toml +++ b/src/db/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_db" -version = "0.9.3" +version = "0.10.0" authors = ["Alex Auvolat <alex@adnab.me>"] edition = "2018" license = "AGPL-3.0" diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index b022049c..00ecb35e 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage" -version = "0.9.3" +version = "0.10.0" authors = ["Alex Auvolat <alex@adnab.me>"] edition = "2018" license = "AGPL-3.0" diff --git a/src/garage/admin/bucket.rs b/src/garage/admin/bucket.rs index 803b55bd..6b1190f8 100644 --- a/src/garage/admin/bucket.rs +++ b/src/garage/admin/bucket.rs @@ -70,7 +70,7 @@ impl AdminRpcHandler { .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&self.garage.system.ring.borrow())) + .map(|x| x.filtered_values(&self.garage.system.cluster_layout())) .unwrap_or_default(); let mpu_counters = self @@ -79,7 +79,7 @@ impl AdminRpcHandler { .table .get(&bucket_id, &EmptyKey) .await? - .map(|x| x.filtered_values(&self.garage.system.ring.borrow())) + .map(|x| x.filtered_values(&self.garage.system.cluster_layout())) .unwrap_or_default(); let mut relevant_keys = HashMap::new(); diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs index b6f9c426..de7851e1 100644 --- a/src/garage/admin/mod.rs +++ b/src/garage/admin/mod.rs @@ -18,7 +18,7 @@ use garage_util::error::Error as GarageError; use garage_table::replication::*; use garage_table::*; -use garage_rpc::ring::PARTITION_BITS; +use garage_rpc::layout::PARTITION_BITS; use garage_rpc::*; use garage_block::manager::BlockResyncErrorInfo; @@ -126,8 +126,8 @@ impl AdminRpcHandler { opt_to_send.all_nodes = false; let mut failures = vec![]; - let ring = self.garage.system.ring.borrow().clone(); - for node in ring.layout.node_ids().iter() { + let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec(); + for node in all_nodes.iter() { let node = (*node).into(); let resp = self .endpoint @@ -163,9 +163,9 @@ impl AdminRpcHandler { async fn handle_stats(&self, opt: StatsOpt) -> Result<AdminRpc, Error> { if opt.all_nodes { let mut ret = String::new(); - let ring = self.garage.system.ring.borrow().clone(); + let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec(); - for node in ring.layout.node_ids().iter() { + for node in all_nodes.iter() { let mut opt = opt.clone(); opt.all_nodes = false; opt.skip_global = true; @@ -274,11 +274,11 @@ impl AdminRpcHandler { fn gather_cluster_stats(&self) -> String { let mut ret = String::new(); - // Gather storage node and free space statistics - let layout = &self.garage.system.ring.borrow().layout; + // Gather storage node and free space statistics for current nodes + let layout = &self.garage.system.cluster_layout(); let mut node_partition_count = HashMap::<Uuid, u64>::new(); - for short_id in layout.ring_assignment_data.iter() { - let id = layout.node_id_vec[*short_id as usize]; + for short_id in layout.current().ring_assignment_data.iter() { + let id = layout.current().node_id_vec[*short_id as usize]; *node_partition_count.entry(id).or_default() += 1; } let node_info = self @@ -293,8 +293,8 @@ impl AdminRpcHandler { for (id, parts) in node_partition_count.iter() { let info = node_info.get(id); let status = info.map(|x| &x.status); - let role = layout.roles.get(id).and_then(|x| x.0.as_ref()); - let hostname = status.map(|x| x.hostname.as_str()).unwrap_or("?"); + let role = layout.current().roles.get(id).and_then(|x| x.0.as_ref()); + let hostname = status.and_then(|x| x.hostname.as_deref()).unwrap_or("?"); let zone = role.map(|x| x.zone.as_str()).unwrap_or("?"); let capacity = role .map(|x| x.capacity_string()) @@ -440,8 +440,8 @@ impl AdminRpcHandler { ) -> Result<AdminRpc, Error> { if all_nodes { let mut ret = vec![]; - let ring = self.garage.system.ring.borrow().clone(); - for node in ring.layout.node_ids().iter() { + let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec(); + for node in all_nodes.iter() { let node = (*node).into(); match self .endpoint @@ -488,8 +488,8 @@ impl AdminRpcHandler { ) -> Result<AdminRpc, Error> { if all_nodes { let mut ret = vec![]; - let ring = self.garage.system.ring.borrow().clone(); - for node in ring.layout.node_ids().iter() { + let all_nodes = self.garage.system.cluster_layout().all_nodes().to_vec(); + for node in all_nodes.iter() { let node = (*node).into(); match self .endpoint diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 48359614..fb6dface 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::time::Duration; use format_table::format_table; @@ -49,50 +49,61 @@ pub async fn cli_command_dispatch( } pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) -> Result<(), Error> { - let status = match rpc_cli - .call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL) - .await?? - { - SystemRpc::ReturnKnownNodes(nodes) => nodes, - resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), - }; + let status = fetch_status(rpc_cli, rpc_host).await?; let layout = fetch_layout(rpc_cli, rpc_host).await?; println!("==== HEALTHY NODES ===="); let mut healthy_nodes = vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tDataAvail".to_string()]; for adv in status.iter().filter(|adv| adv.is_up) { - match layout.roles.get(&adv.id) { - Some(NodeRoleV(Some(cfg))) => { - let data_avail = match &adv.status.data_disk_avail { - _ if cfg.capacity.is_none() => "N/A".into(), - Some((avail, total)) => { - let pct = (*avail as f64) / (*total as f64) * 100.; - let avail = bytesize::ByteSize::b(*avail); - format!("{} ({:.1}%)", avail, pct) - } - None => "?".into(), - }; + let host = adv.status.hostname.as_deref().unwrap_or("?"); + if let Some(NodeRoleV(Some(cfg))) = layout.current().roles.get(&adv.id) { + let data_avail = match &adv.status.data_disk_avail { + _ if cfg.capacity.is_none() => "N/A".into(), + Some((avail, total)) => { + let pct = (*avail as f64) / (*total as f64) * 100.; + let avail = bytesize::ByteSize::b(*avail); + format!("{} ({:.1}%)", avail, pct) + } + None => "?".into(), + }; + healthy_nodes.push(format!( + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", + id = adv.id, + host = host, + addr = adv.addr, + tags = cfg.tags.join(","), + zone = cfg.zone, + capacity = cfg.capacity_string(), + data_avail = data_avail, + )); + } else { + let prev_role = layout + .versions + .iter() + .rev() + .find_map(|x| match x.roles.get(&adv.id) { + Some(NodeRoleV(Some(cfg))) => Some(cfg), + _ => None, + }); + if let Some(cfg) = prev_role { healthy_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{data_avail}", + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\tdraining metadata...", id = adv.id, - host = adv.status.hostname, + host = host, addr = adv.addr, tags = cfg.tags.join(","), zone = cfg.zone, - capacity = cfg.capacity_string(), - data_avail = data_avail, )); - } - _ => { - let new_role = match layout.staging_roles.get(&adv.id) { - Some(NodeRoleV(Some(_))) => "(pending)", + } else { + let new_role = match layout.staging.get().roles.get(&adv.id) { + Some(NodeRoleV(Some(_))) => "pending...", _ => "NO ROLE ASSIGNED", }; healthy_nodes.push(format!( - "{id:?}\t{h}\t{addr}\t{new_role}", + "{id:?}\t{h}\t{addr}\t\t\t{new_role}", id = adv.id, - h = adv.status.hostname, + h = host, addr = adv.addr, new_role = new_role, )); @@ -101,51 +112,76 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) -> } format_table(healthy_nodes); - let status_keys = status.iter().map(|adv| adv.id).collect::<HashSet<_>>(); - let failure_case_1 = status - .iter() - .any(|adv| !adv.is_up && matches!(layout.roles.get(&adv.id), Some(NodeRoleV(Some(_))))); - let failure_case_2 = layout - .roles - .items() + // Determine which nodes are unhealthy and print that to stdout + let status_map = status .iter() - .any(|(id, _, v)| !status_keys.contains(id) && v.0.is_some()); - if failure_case_1 || failure_case_2 { - println!("\n==== FAILED NODES ===="); - let mut failed_nodes = - vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; - for adv in status.iter().filter(|adv| !adv.is_up) { - if let Some(NodeRoleV(Some(cfg))) = layout.roles.get(&adv.id) { - let tf = timeago::Formatter::new(); - failed_nodes.push(format!( - "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", - id = adv.id, - host = adv.status.hostname, - addr = adv.addr, - tags = cfg.tags.join(","), - zone = cfg.zone, - capacity = cfg.capacity_string(), - last_seen = adv - .last_seen_secs_ago - .map(|s| tf.convert(Duration::from_secs(s))) - .unwrap_or_else(|| "never seen".into()), - )); + .map(|adv| (adv.id, adv)) + .collect::<HashMap<_, _>>(); + + let tf = timeago::Formatter::new(); + let mut drain_msg = false; + let mut failed_nodes = + vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; + let mut listed = HashSet::new(); + for ver in layout.versions.iter().rev() { + for (node, _, role) in ver.roles.items().iter() { + let cfg = match role { + NodeRoleV(Some(role)) if role.capacity.is_some() => role, + _ => continue, + }; + + if listed.contains(node) { + continue; } - } - for (id, _, role_v) in layout.roles.items().iter() { - if let NodeRoleV(Some(cfg)) = role_v { - if !status_keys.contains(id) { - failed_nodes.push(format!( - "{id:?}\t??\t??\t[{tags}]\t{zone}\t{capacity}\tnever seen", - id = id, - tags = cfg.tags.join(","), - zone = cfg.zone, - capacity = cfg.capacity_string(), - )); - } + listed.insert(*node); + + let adv = status_map.get(node); + if adv.map(|x| x.is_up).unwrap_or(false) { + continue; } + + // Node is in a layout version, is not a gateway node, and is not up: + // it is in a failed state, add proper line to the output + let (host, addr, last_seen) = match adv { + Some(adv) => ( + adv.status.hostname.as_deref().unwrap_or("?"), + adv.addr.to_string(), + adv.last_seen_secs_ago + .map(|s| tf.convert(Duration::from_secs(s))) + .unwrap_or_else(|| "never seen".into()), + ), + None => ("??", "??".into(), "never seen".into()), + }; + let capacity = if ver.version == layout.current().version { + cfg.capacity_string() + } else { + drain_msg = true; + "draining metadata...".to_string() + }; + failed_nodes.push(format!( + "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", + id = node, + host = host, + addr = addr, + tags = cfg.tags.join(","), + zone = cfg.zone, + capacity = capacity, + last_seen = last_seen, + )); } + } + + if failed_nodes.len() > 1 { + println!("\n==== FAILED NODES ===="); format_table(failed_nodes); + if drain_msg { + println!(); + println!("Your cluster is expecting to drain data from nodes that are currently unavailable."); + println!("If these nodes are definitely dead, please review the layout history with"); + println!( + "`garage layout history` and use `garage layout skip-dead-nodes` to force progress." + ); + } } if print_staging_role_changes(&layout) { @@ -226,3 +262,18 @@ pub async fn cmd_admin( } Ok(()) } + +// ---- utility ---- + +pub async fn fetch_status( + rpc_cli: &Endpoint<SystemRpc, ()>, + rpc_host: NodeID, +) -> Result<Vec<KnownNodeInfo>, Error> { + match rpc_cli + .call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL) + .await?? + { + SystemRpc::ReturnKnownNodes(nodes) => Ok(nodes), + resp => Err(Error::unexpected_rpc_message(resp)), + } +} diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index ce2b11e0..f76e33c5 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -32,6 +32,10 @@ pub async fn cli_layout_command_dispatch( LayoutOperation::Config(config_opt) => { cmd_config_layout(system_rpc_endpoint, rpc_host, config_opt).await } + LayoutOperation::History => cmd_layout_history(system_rpc_endpoint, rpc_host).await, + LayoutOperation::SkipDeadNodes(assume_sync_opt) => { + cmd_layout_skip_dead_nodes(system_rpc_endpoint, rpc_host, assume_sync_opt).await + } } } @@ -49,6 +53,7 @@ pub async fn cmd_assign_role( }; let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + let all_nodes = layout.get_all_nodes(); let added_nodes = args .node_ids @@ -58,21 +63,23 @@ pub async fn cmd_assign_role( status .iter() .map(|adv| adv.id) - .chain(layout.node_ids().iter().cloned()), + .chain(all_nodes.iter().cloned()), node_id, ) }) .collect::<Result<Vec<_>, _>>()?; - let mut roles = layout.roles.clone(); - roles.merge(&layout.staging_roles); + let mut roles = layout.current().roles.clone(); + roles.merge(&layout.staging.get().roles); for replaced in args.replace.iter() { - let replaced_node = find_matching_node(layout.node_ids().iter().cloned(), replaced)?; + let replaced_node = find_matching_node(all_nodes.iter().cloned(), replaced)?; match roles.get(&replaced_node) { Some(NodeRoleV(Some(_))) => { layout - .staging_roles + .staging + .get_mut() + .roles .merge(&roles.update_mutator(replaced_node, NodeRoleV(None))); } _ => { @@ -130,7 +137,9 @@ pub async fn cmd_assign_role( }; layout - .staging_roles + .staging + .get_mut() + .roles .merge(&roles.update_mutator(added_node, NodeRoleV(Some(new_entry)))); } @@ -149,14 +158,16 @@ pub async fn cmd_remove_role( ) -> Result<(), Error> { let mut layout = fetch_layout(rpc_cli, rpc_host).await?; - let mut roles = layout.roles.clone(); - roles.merge(&layout.staging_roles); + let mut roles = layout.current().roles.clone(); + roles.merge(&layout.staging.get().roles); let deleted_node = find_matching_node(roles.items().iter().map(|(id, _, _)| *id), &args.node_id)?; layout - .staging_roles + .staging + .get_mut() + .roles .merge(&roles.update_mutator(deleted_node, NodeRoleV(None))); send_layout(rpc_cli, rpc_host, layout).await?; @@ -174,13 +185,16 @@ pub async fn cmd_show_layout( let layout = fetch_layout(rpc_cli, rpc_host).await?; println!("==== CURRENT CLUSTER LAYOUT ===="); - print_cluster_layout(&layout, "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes."); + print_cluster_layout(layout.current(), "No nodes currently have a role in the cluster.\nSee `garage status` to view available nodes."); println!(); - println!("Current cluster layout version: {}", layout.version); + println!( + "Current cluster layout version: {}", + layout.current().version + ); let has_role_changes = print_staging_role_changes(&layout); if has_role_changes { - let v = layout.version; + let v = layout.current().version; let res_apply = layout.apply_staged_changes(Some(v + 1)); // this will print the stats of what partitions @@ -189,7 +203,7 @@ pub async fn cmd_show_layout( Ok((layout, msg)) => { println!(); println!("==== NEW CLUSTER LAYOUT AFTER APPLYING CHANGES ===="); - print_cluster_layout(&layout, "No nodes have a role in the new layout."); + print_cluster_layout(layout.current(), "No nodes have a role in the new layout."); println!(); for line in msg.iter() { @@ -199,16 +213,12 @@ pub async fn cmd_show_layout( println!(); println!(" garage layout apply --version {}", v + 1); println!(); - println!( - "You can also revert all proposed changes with: garage layout revert --version {}", - v + 1) + println!("You can also revert all proposed changes with: garage layout revert"); } Err(e) => { println!("Error while trying to compute the assignment: {}", e); println!("This new layout cannot yet be applied."); - println!( - "You can also revert all proposed changes with: garage layout revert --version {}", - v + 1) + println!("You can also revert all proposed changes with: garage layout revert"); } } } @@ -241,9 +251,15 @@ pub async fn cmd_revert_layout( rpc_host: NodeID, revert_opt: RevertLayoutOpt, ) -> Result<(), Error> { + if !revert_opt.yes { + return Err(Error::Message( + "Please add the --yes flag to run the layout revert operation".into(), + )); + } + let layout = fetch_layout(rpc_cli, rpc_host).await?; - let layout = layout.revert_staged_changes(revert_opt.version)?; + let layout = layout.revert_staged_changes()?; send_layout(rpc_cli, rpc_host, layout).await?; @@ -266,11 +282,11 @@ pub async fn cmd_config_layout( .parse::<ZoneRedundancy>() .ok_or_message("invalid zone redundancy value")?; if let ZoneRedundancy::AtLeast(r_int) = r { - if r_int > layout.replication_factor { + if r_int > layout.current().replication_factor { return Err(Error::Message(format!( "The zone redundancy must be smaller or equal to the \ replication factor ({}).", - layout.replication_factor + layout.current().replication_factor ))); } else if r_int < 1 { return Err(Error::Message( @@ -280,7 +296,9 @@ pub async fn cmd_config_layout( } layout - .staging_parameters + .staging + .get_mut() + .parameters .update(LayoutParameters { zone_redundancy: r }); println!("The zone redundancy parameter has been set to '{}'.", r); did_something = true; @@ -297,25 +315,166 @@ pub async fn cmd_config_layout( Ok(()) } +pub async fn cmd_layout_history( + rpc_cli: &Endpoint<SystemRpc, ()>, + rpc_host: NodeID, +) -> Result<(), Error> { + let layout = fetch_layout(rpc_cli, rpc_host).await?; + let min_stored = layout.min_stored(); + + println!("==== LAYOUT HISTORY ===="); + let mut table = vec!["Version\tStatus\tStorage nodes\tGateway nodes".to_string()]; + for ver in layout + .versions + .iter() + .rev() + .chain(layout.old_versions.iter().rev()) + { + let status = if ver.version == layout.current().version { + "current" + } else if ver.version >= min_stored { + "draining" + } else { + "historical" + }; + table.push(format!( + "#{}\t{}\t{}\t{}", + ver.version, + status, + ver.roles + .items() + .iter() + .filter(|(_, _, x)| matches!(x, NodeRoleV(Some(c)) if c.capacity.is_some())) + .count(), + ver.roles + .items() + .iter() + .filter(|(_, _, x)| matches!(x, NodeRoleV(Some(c)) if c.capacity.is_none())) + .count(), + )); + } + format_table(table); + println!(); + + if layout.versions.len() > 1 { + println!("==== UPDATE TRACKERS ===="); + println!("Several layout versions are currently live in the version, and data is being migrated."); + println!( + "This is the internal data that Garage stores to know which nodes have what data." + ); + println!(); + let mut table = vec!["Node\tAck\tSync\tSync_ack".to_string()]; + let all_nodes = layout.get_all_nodes(); + for node in all_nodes.iter() { + table.push(format!( + "{:?}\t#{}\t#{}\t#{}", + node, + layout.update_trackers.ack_map.get(node, min_stored), + layout.update_trackers.sync_map.get(node, min_stored), + layout.update_trackers.sync_ack_map.get(node, min_stored), + )); + } + table[1..].sort(); + format_table(table); + + println!(); + println!( + "If some nodes are not catching up to the latest layout version in the update trackers," + ); + println!("it might be because they are offline or unable to complete a sync successfully."); + println!( + "You may force progress using `garage layout skip-dead-nodes --version {}`", + layout.current().version + ); + } else { + println!("Your cluster is currently in a stable state with a single live layout version."); + println!("No metadata migration is in progress. Note that the migration of data blocks is not tracked,"); + println!( + "so you might want to keep old nodes online until their data directories become empty." + ); + } + + Ok(()) +} + +pub async fn cmd_layout_skip_dead_nodes( + rpc_cli: &Endpoint<SystemRpc, ()>, + rpc_host: NodeID, + opt: SkipDeadNodesOpt, +) -> Result<(), Error> { + let status = fetch_status(rpc_cli, rpc_host).await?; + let mut layout = fetch_layout(rpc_cli, rpc_host).await?; + + if layout.versions.len() == 1 { + return Err(Error::Message( + "This command cannot be called when there is only one live cluster layout version" + .into(), + )); + } + + let min_v = layout.min_stored(); + if opt.version <= min_v || opt.version > layout.current().version { + return Err(Error::Message(format!( + "Invalid version, you may use the following version numbers: {}", + (min_v + 1..=layout.current().version) + .map(|x| x.to_string()) + .collect::<Vec<_>>() + .join(" ") + ))); + } + + let all_nodes = layout.get_all_nodes(); + let mut did_something = false; + for node in all_nodes.iter() { + if status.iter().any(|x| x.id == *node && x.is_up) { + continue; + } + + if layout.update_trackers.ack_map.set_max(*node, opt.version) { + println!("Increased the ACK tracker for node {:?}", node); + did_something = true; + } + + if opt.allow_missing_data { + if layout.update_trackers.sync_map.set_max(*node, opt.version) { + println!("Increased the SYNC tracker for node {:?}", node); + did_something = true; + } + } + } + + if did_something { + send_layout(rpc_cli, rpc_host, layout).await?; + println!("Success."); + Ok(()) + } else if !opt.allow_missing_data { + Err(Error::Message("Nothing was done, try passing the `--allow-missing-data` flag to force progress even when not enough nodes can complete a metadata sync.".into())) + } else { + Err(Error::Message( + "Sorry, there is nothing I can do for you. Please wait patiently. If you ask for help, please send the output of the `garage layout history` command.".into(), + )) + } +} + // --- utility --- pub async fn fetch_layout( rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID, -) -> Result<ClusterLayout, Error> { +) -> Result<LayoutHistory, Error> { match rpc_cli .call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL) .await?? { SystemRpc::AdvertiseClusterLayout(t) => Ok(t), - resp => Err(Error::Message(format!("Invalid RPC response: {:?}", resp))), + resp => Err(Error::unexpected_rpc_message(resp)), } } pub async fn send_layout( rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID, - layout: ClusterLayout, + layout: LayoutHistory, ) -> Result<(), Error> { rpc_cli .call( @@ -327,7 +486,7 @@ pub async fn send_layout( Ok(()) } -pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) { +pub fn print_cluster_layout(layout: &LayoutVersion, empty_msg: &str) { let mut table = vec!["ID\tTags\tZone\tCapacity\tUsable capacity".to_string()]; for (id, _, role) in layout.roles.items().iter() { let role = match &role.0 { @@ -366,21 +525,22 @@ pub fn print_cluster_layout(layout: &ClusterLayout, empty_msg: &str) { } } -pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { - let has_role_changes = layout - .staging_roles +pub fn print_staging_role_changes(layout: &LayoutHistory) -> bool { + let staging = layout.staging.get(); + let has_role_changes = staging + .roles .items() .iter() - .any(|(k, _, v)| layout.roles.get(k) != Some(v)); - let has_layout_changes = *layout.staging_parameters.get() != layout.parameters; + .any(|(k, _, v)| layout.current().roles.get(k) != Some(v)); + let has_layout_changes = *staging.parameters.get() != layout.current().parameters; if has_role_changes || has_layout_changes { println!(); println!("==== STAGED ROLE CHANGES ===="); if has_role_changes { let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()]; - for (id, _, role) in layout.staging_roles.items().iter() { - if layout.roles.get(id) == Some(role) { + for (id, _, role) in staging.roles.items().iter() { + if layout.current().roles.get(id) == Some(role) { continue; } if let Some(role) = &role.0 { @@ -402,7 +562,7 @@ pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool { if has_layout_changes { println!( "Zone redundancy: {}", - layout.staging_parameters.get().zone_redundancy + staging.parameters.get().zone_redundancy ); } true diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index be4d5bd6..40e47ee1 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -114,6 +114,14 @@ pub enum LayoutOperation { /// Revert staged changes to cluster layout #[structopt(name = "revert", version = garage_version())] Revert(RevertLayoutOpt), + + /// View the history of layouts in the cluster + #[structopt(name = "history", version = garage_version())] + History, + + /// Skip dead nodes when awaiting for a new layout version to be synchronized + #[structopt(name = "skip-dead-nodes", version = garage_version())] + SkipDeadNodes(SkipDeadNodesOpt), } #[derive(StructOpt, Debug)] @@ -166,9 +174,21 @@ pub struct ApplyLayoutOpt { #[derive(StructOpt, Debug)] pub struct RevertLayoutOpt { - /// Version number of old configuration to which to revert + /// The revert operation will not be ran unless this flag is added + #[structopt(long = "yes")] + pub(crate) yes: bool, +} + +#[derive(StructOpt, Debug)] +pub struct SkipDeadNodesOpt { + /// Version number of the layout to assume is currently up-to-date. + /// This will generally be the current layout version. #[structopt(long = "version")] - pub(crate) version: Option<u64>, + pub(crate) version: u64, + /// Allow the skip even if a quorum of ndoes could not be found for + /// the data among the remaining nodes + #[structopt(long = "allow-missing-data")] + pub(crate) allow_missing_data: bool, } #[derive(Serialize, Deserialize, StructOpt, Debug)] diff --git a/src/garage/cli/util.rs b/src/garage/cli/util.rs index 2232d395..0511e2b1 100644 --- a/src/garage/cli/util.rs +++ b/src/garage/cli/util.rs @@ -450,6 +450,8 @@ pub fn print_block_info( if refcount != nondeleted_count { println!(); - println!("Warning: refcount does not match number of non-deleted versions"); + println!( + "Warning: refcount does not match number of non-deleted versions (see issue #644)." + ); } } diff --git a/src/garage/secrets.rs b/src/garage/secrets.rs index c3d704aa..8d2ff475 100644 --- a/src/garage/secrets.rs +++ b/src/garage/secrets.rs @@ -163,7 +163,7 @@ mod tests { r#" metadata_dir = "/tmp/garage/meta" data_dir = "/tmp/garage/data" - replication_mode = "3" + replication_factor = 3 rpc_bind_addr = "[::]:3901" rpc_secret_file = "{}" @@ -185,7 +185,7 @@ mod tests { r#" metadata_dir = "/tmp/garage/meta" data_dir = "/tmp/garage/data" - replication_mode = "3" + replication_factor = 3 rpc_bind_addr = "[::]:3901" rpc_secret_file = "{}" allow_world_readable_secrets = true @@ -296,7 +296,7 @@ mod tests { r#" metadata_dir = "/tmp/garage/meta" data_dir = "/tmp/garage/data" - replication_mode = "3" + replication_factor = 3 rpc_bind_addr = "[::]:3901" rpc_secret= "dummy" rpc_secret_file = "dummy" diff --git a/src/garage/tests/common/ext/process.rs b/src/garage/tests/common/ext/process.rs index ba533b6c..8e20bf7c 100644 --- a/src/garage/tests/common/ext/process.rs +++ b/src/garage/tests/common/ext/process.rs @@ -14,42 +14,20 @@ impl CommandExt for process::Command { } fn expect_success_status(&mut self, msg: &str) -> process::ExitStatus { - let status = self.status().expect(msg); - status.expect_success(msg); - status + self.expect_success_output(msg).status } fn expect_success_output(&mut self, msg: &str) -> process::Output { let output = self.output().expect(msg); - output.expect_success(msg); - output - } -} - -pub trait OutputExt { - fn expect_success(&self, msg: &str); -} - -impl OutputExt for process::Output { - fn expect_success(&self, msg: &str) { - self.status.expect_success(msg) - } -} - -pub trait ExitStatusExt { - fn expect_success(&self, msg: &str); -} - -impl ExitStatusExt for process::ExitStatus { - fn expect_success(&self, msg: &str) { - if !self.success() { - match self.code() { - Some(code) => panic!( - "Command exited with code {code}: {msg}", - code = code, - msg = msg - ), - None => panic!("Command exited with signal: {msg}", msg = msg), - } + if !output.status.success() { + panic!( + "{}: command {:?} exited with error {:?}\nSTDOUT: {}\nSTDERR: {}", + msg, + self, + output.status.code(), + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); } + output } } diff --git a/src/garage/tests/common/garage.rs b/src/garage/tests/common/garage.rs index d1f0867a..f1c1efc8 100644 --- a/src/garage/tests/common/garage.rs +++ b/src/garage/tests/common/garage.rs @@ -54,7 +54,7 @@ metadata_dir = "{path}/meta" data_dir = "{path}/data" db_engine = "lmdb" -replication_mode = "1" +replication_factor = 1 rpc_bind_addr = "127.0.0.1:{rpc_port}" rpc_public_addr = "127.0.0.1:{rpc_port}" @@ -96,7 +96,7 @@ api_bind_addr = "127.0.0.1:{admin_port}" .arg("server") .stdout(stdout) .stderr(stderr) - .env("RUST_LOG", "garage=info,garage_api=trace") + .env("RUST_LOG", "garage=debug,garage_api=trace") .spawn() .expect("Could not start garage"); diff --git a/src/garage/tests/s3/mod.rs b/src/garage/tests/s3/mod.rs index 4ebc4914..e75b1397 100644 --- a/src/garage/tests/s3/mod.rs +++ b/src/garage/tests/s3/mod.rs @@ -3,5 +3,6 @@ mod multipart; mod objects; mod presigned; mod simple; +mod ssec; mod streaming_signature; mod website; diff --git a/src/garage/tests/s3/ssec.rs b/src/garage/tests/s3/ssec.rs new file mode 100644 index 00000000..d8f11950 --- /dev/null +++ b/src/garage/tests/s3/ssec.rs @@ -0,0 +1,455 @@ +use crate::common::{self, Context}; +use aws_sdk_s3::primitives::ByteStream; +use aws_sdk_s3::types::{CompletedMultipartUpload, CompletedPart}; + +const SSEC_KEY: &str = "u8zCfnEyt5Imo/krN+sxA1DQXxLWtPJavU6T6gOVj1Y="; +const SSEC_KEY_MD5: &str = "jMGbs3GyZkYjJUP6q5jA7g=="; +const SSEC_KEY2: &str = "XkYVk4Z3vVDO2yJaUqCAEZX6lL10voMxtV06d8my/eU="; +const SSEC_KEY2_MD5: &str = "kedo2ab8J1MCjHwJuLTJHw=="; + +const SZ_2MB: usize = 2 * 1024 * 1024; + +#[tokio::test] +async fn test_ssec_object() { + let ctx = common::context(); + let bucket = ctx.create_bucket("sse-c"); + + let bytes1 = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz".to_vec(); + let bytes2 = (0..400000) + .map(|x| ((x * 3792) % 256) as u8) + .collect::<Vec<u8>>(); + + for data in vec![bytes1, bytes2] { + let stream = ByteStream::new(data.clone().into()); + + // Write encrypted object + let r = ctx + .client + .put_object() + .bucket(&bucket) + .key("testobj") + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY) + .sse_customer_key_md5(SSEC_KEY_MD5) + .body(stream) + .send() + .await + .unwrap(); + assert_eq!(r.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY_MD5.into())); + + test_read_encrypted( + &ctx, + &bucket, + "testobj", + &data, + SSEC_KEY, + SSEC_KEY_MD5, + SSEC_KEY2, + SSEC_KEY2_MD5, + ) + .await; + + // Test copy from encrypted to non-encrypted + let r = ctx + .client + .copy_object() + .bucket(&bucket) + .key("test-copy-enc-dec") + .copy_source(format!("{}/{}", bucket, "testobj")) + .copy_source_sse_customer_algorithm("AES256") + .copy_source_sse_customer_key(SSEC_KEY) + .copy_source_sse_customer_key_md5(SSEC_KEY_MD5) + .send() + .await + .unwrap(); + assert_eq!(r.sse_customer_algorithm, None); + assert_eq!(r.sse_customer_key_md5, None); + + // Test read decrypted file + let r = ctx + .client + .get_object() + .bucket(&bucket) + .key("test-copy-enc-dec") + .send() + .await + .unwrap(); + assert_bytes_eq!(r.body, &data); + assert_eq!(r.sse_customer_algorithm, None); + assert_eq!(r.sse_customer_key_md5, None); + + // Test copy from non-encrypted to encrypted + let r = ctx + .client + .copy_object() + .bucket(&bucket) + .key("test-copy-enc-dec-enc") + .copy_source(format!("{}/test-copy-enc-dec", bucket)) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + assert_eq!(r.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY2_MD5.into())); + + test_read_encrypted( + &ctx, + &bucket, + "test-copy-enc-dec-enc", + &data, + SSEC_KEY2, + SSEC_KEY2_MD5, + SSEC_KEY, + SSEC_KEY_MD5, + ) + .await; + + // Test copy from encrypted to encrypted with different keys + let r = ctx + .client + .copy_object() + .bucket(&bucket) + .key("test-copy-enc-enc") + .copy_source(format!("{}/{}", bucket, "testobj")) + .copy_source_sse_customer_algorithm("AES256") + .copy_source_sse_customer_key(SSEC_KEY) + .copy_source_sse_customer_key_md5(SSEC_KEY_MD5) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + assert_eq!(r.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY2_MD5.into())); + test_read_encrypted( + &ctx, + &bucket, + "test-copy-enc-enc", + &data, + SSEC_KEY2, + SSEC_KEY2_MD5, + SSEC_KEY, + SSEC_KEY_MD5, + ) + .await; + + // Test copy from encrypted to encrypted with the same key + let r = ctx + .client + .copy_object() + .bucket(&bucket) + .key("test-copy-enc-enc-same") + .copy_source(format!("{}/{}", bucket, "testobj")) + .copy_source_sse_customer_algorithm("AES256") + .copy_source_sse_customer_key(SSEC_KEY) + .copy_source_sse_customer_key_md5(SSEC_KEY_MD5) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY) + .sse_customer_key_md5(SSEC_KEY_MD5) + .send() + .await + .unwrap(); + assert_eq!(r.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(r.sse_customer_key_md5, Some(SSEC_KEY_MD5.into())); + test_read_encrypted( + &ctx, + &bucket, + "test-copy-enc-enc-same", + &data, + SSEC_KEY, + SSEC_KEY_MD5, + SSEC_KEY2, + SSEC_KEY2_MD5, + ) + .await; + } +} + +#[tokio::test] +async fn test_multipart_upload() { + let ctx = common::context(); + let bucket = ctx.create_bucket("test-ssec-mpu"); + + let u1 = vec![0x11; SZ_2MB]; + let u2 = vec![0x22; SZ_2MB]; + let u3 = vec![0x33; SZ_2MB]; + let all = [&u1[..], &u2[..], &u3[..]].concat(); + + // Test simple encrypted mpu + { + let up = ctx + .client + .create_multipart_upload() + .bucket(&bucket) + .key("a") + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY) + .sse_customer_key_md5(SSEC_KEY_MD5) + .send() + .await + .unwrap(); + assert!(up.upload_id.is_some()); + assert_eq!(up.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(up.sse_customer_key_md5, Some(SSEC_KEY_MD5.into())); + + let uid = up.upload_id.as_ref().unwrap(); + + let mut etags = vec![]; + for (i, part) in vec![&u1, &u2, &u3].into_iter().enumerate() { + let pu = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number((i + 1) as i32) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY) + .sse_customer_key_md5(SSEC_KEY_MD5) + .body(ByteStream::from(part.to_vec())) + .send() + .await + .unwrap(); + etags.push(pu.e_tag.unwrap()); + } + + let mut cmp = CompletedMultipartUpload::builder(); + for (i, etag) in etags.into_iter().enumerate() { + cmp = cmp.parts( + CompletedPart::builder() + .part_number((i + 1) as i32) + .e_tag(etag) + .build(), + ); + } + + ctx.client + .complete_multipart_upload() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .multipart_upload(cmp.build()) + .send() + .await + .unwrap(); + + test_read_encrypted( + &ctx, + &bucket, + "a", + &all, + SSEC_KEY, + SSEC_KEY_MD5, + SSEC_KEY2, + SSEC_KEY2_MD5, + ) + .await; + } + + // Test upload part copy from first object + { + // (setup) Upload a single part object + ctx.client + .put_object() + .bucket(&bucket) + .key("b") + .body(ByteStream::from(u1.clone())) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + + let up = ctx + .client + .create_multipart_upload() + .bucket(&bucket) + .key("target") + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + let uid = up.upload_id.as_ref().unwrap(); + + let p1 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(1) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .body(ByteStream::from(u3.clone())) + .send() + .await + .unwrap(); + + let p2 = ctx + .client + .upload_part_copy() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(2) + .copy_source(format!("{}/a", bucket)) + .copy_source_range("bytes=500-550000") + .copy_source_sse_customer_algorithm("AES256") + .copy_source_sse_customer_key(SSEC_KEY) + .copy_source_sse_customer_key_md5(SSEC_KEY_MD5) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + + let p3 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(3) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .body(ByteStream::from(u2.clone())) + .send() + .await + .unwrap(); + + let p4 = ctx + .client + .upload_part_copy() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(4) + .copy_source(format!("{}/b", bucket)) + .copy_source_range("bytes=1500-20500") + .copy_source_sse_customer_algorithm("AES256") + .copy_source_sse_customer_key(SSEC_KEY2) + .copy_source_sse_customer_key_md5(SSEC_KEY2_MD5) + .sse_customer_algorithm("AES256") + .sse_customer_key(SSEC_KEY2) + .sse_customer_key_md5(SSEC_KEY2_MD5) + .send() + .await + .unwrap(); + + let cmp = CompletedMultipartUpload::builder() + .parts( + CompletedPart::builder() + .part_number(1) + .e_tag(p1.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(2) + .e_tag(p2.copy_part_result.unwrap().e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(3) + .e_tag(p3.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(4) + .e_tag(p4.copy_part_result.unwrap().e_tag.unwrap()) + .build(), + ) + .build(); + + ctx.client + .complete_multipart_upload() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .multipart_upload(cmp) + .send() + .await + .unwrap(); + + // (check) Get object + let expected = [&u3[..], &all[500..550001], &u2[..], &u1[1500..20501]].concat(); + test_read_encrypted( + &ctx, + &bucket, + "target", + &expected, + SSEC_KEY2, + SSEC_KEY2_MD5, + SSEC_KEY, + SSEC_KEY_MD5, + ) + .await; + } +} + +async fn test_read_encrypted( + ctx: &Context, + bucket: &str, + obj_key: &str, + expected_data: &[u8], + enc_key: &str, + enc_key_md5: &str, + wrong_enc_key: &str, + wrong_enc_key_md5: &str, +) { + // Test read encrypted without key + let o = ctx + .client + .get_object() + .bucket(bucket) + .key(obj_key) + .send() + .await; + assert!( + o.is_err(), + "encrypted file could be read without encryption key" + ); + + // Test read encrypted with wrong key + let o = ctx + .client + .get_object() + .bucket(bucket) + .key(obj_key) + .sse_customer_key(wrong_enc_key) + .sse_customer_key_md5(wrong_enc_key_md5) + .send() + .await; + assert!( + o.is_err(), + "encrypted file could be read with incorrect encryption key" + ); + + // Test read encrypted with correct key + let o = ctx + .client + .get_object() + .bucket(bucket) + .key(obj_key) + .sse_customer_algorithm("AES256") + .sse_customer_key(enc_key) + .sse_customer_key_md5(enc_key_md5) + .send() + .await + .unwrap(); + assert_bytes_eq!(o.body, expected_data); + assert_eq!(o.sse_customer_algorithm, Some("AES256".into())); + assert_eq!(o.sse_customer_key_md5, Some(enc_key_md5.to_string())); +} diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 2e5b047d..776671d0 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_model" -version = "0.9.3" +version = "0.10.0" authors = ["Alex Auvolat <alex@adnab.me>"] edition = "2018" license = "AGPL-3.0" @@ -27,6 +27,7 @@ blake2.workspace = true chrono.workspace = true err-derive.workspace = true hex.workspace = true +http.workspace = true base64.workspace = true tracing.workspace = true rand.workspace = true diff --git a/src/model/garage.rs b/src/model/garage.rs index 18421ca3..7ec8b22e 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -10,7 +10,7 @@ use garage_util::config::*; use garage_util::error::*; use garage_util::persister::PersisterShared; -use garage_rpc::replication_mode::ReplicationMode; +use garage_rpc::replication_mode::*; use garage_rpc::system::System; use garage_block::manager::*; @@ -40,8 +40,8 @@ pub struct Garage { /// The set of background variables that can be viewed/modified at runtime pub bg_vars: vars::BgVars, - /// The replication mode of this cluster - pub replication_mode: ReplicationMode, + /// The replication factor of this cluster + pub replication_factor: ReplicationFactor, /// The local database pub db: db::Db, @@ -148,32 +148,30 @@ impl Garage { .and_then(|x| NetworkKey::from_slice(&x)) .ok_or_message("Invalid RPC secret key")?; - let replication_mode = ReplicationMode::parse(&config.replication_mode) - .ok_or_message("Invalid replication_mode in config file.")?; + let (replication_factor, consistency_mode) = parse_replication_mode(&config)?; info!("Initialize background variable system..."); let mut bg_vars = vars::BgVars::new(); info!("Initialize membership management system..."); - let system = System::new(network_key, replication_mode, &config)?; + let system = System::new(network_key, replication_factor, consistency_mode, &config)?; let data_rep_param = TableShardedReplication { system: system.clone(), - replication_factor: replication_mode.replication_factor(), - write_quorum: replication_mode.write_quorum(), + replication_factor: replication_factor.into(), + write_quorum: replication_factor.write_quorum(consistency_mode), read_quorum: 1, }; let meta_rep_param = TableShardedReplication { system: system.clone(), - replication_factor: replication_mode.replication_factor(), - write_quorum: replication_mode.write_quorum(), - read_quorum: replication_mode.read_quorum(), + replication_factor: replication_factor.into(), + write_quorum: replication_factor.write_quorum(consistency_mode), + read_quorum: replication_factor.read_quorum(consistency_mode), }; let control_rep_param = TableFullReplication { system: system.clone(), - max_faults: replication_mode.control_write_max_faults(), }; info!("Initialize block manager..."); @@ -265,7 +263,7 @@ impl Garage { Ok(Arc::new(Self { config, bg_vars, - replication_mode, + replication_factor, db, system, block_manager, diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index f4e669c3..a712d683 100644 --- a/src/model/helper/bucket.rs +++ b/src/model/helper/bucket.rs @@ -112,10 +112,12 @@ impl<'a> BucketHelper<'a> { #[cfg(feature = "k2v")] { - use garage_rpc::ring::Ring; - use std::sync::Arc; - - let ring: Arc<Ring> = self.0.system.ring.borrow().clone(); + let node_id_vec = self + .0 + .system + .cluster_layout() + .all_nongateway_nodes() + .to_vec(); let k2vindexes = self .0 .k2v @@ -124,7 +126,7 @@ impl<'a> BucketHelper<'a> { .get_range( &bucket_id, None, - Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())), + Some((DeletedFilter::NotDeleted, node_id_vec)), 10, EnumerationOrder::Forward, ) diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs index c0bf38d8..aa13ee7b 100644 --- a/src/model/index_counter.rs +++ b/src/model/index_counter.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use garage_db as db; -use garage_rpc::ring::Ring; +use garage_rpc::layout::LayoutHelper; use garage_rpc::system::System; use garage_util::background::BackgroundRunner; use garage_util::data::*; @@ -83,9 +83,9 @@ impl<T: CountedItem> Entry<T::CP, T::CS> for CounterEntry<T> { } impl<T: CountedItem> CounterEntry<T> { - pub fn filtered_values(&self, ring: &Ring) -> HashMap<String, i64> { - let nodes = &ring.layout.node_id_vec[..]; - self.filtered_values_with_nodes(nodes) + pub fn filtered_values(&self, layout: &LayoutHelper) -> HashMap<String, i64> { + let nodes = layout.all_nongateway_nodes(); + self.filtered_values_with_nodes(&nodes) } pub fn filtered_values_with_nodes(&self, nodes: &[Uuid]) -> HashMap<String, i64> { diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs index 4ab44c22..e15f2df8 100644 --- a/src/model/k2v/rpc.rs +++ b/src/model/k2v/rpc.rs @@ -127,23 +127,21 @@ impl K2VRpcHandler { .item_table .data .replication - .write_nodes(&partition.hash()); + .storage_nodes(&partition.hash()); who.sort(); self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, - &who[..], + &who, K2VRpc::InsertItem(InsertedItem { partition, sort_key, causal_context, value, }), - RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(1) - .interrupt_after_quorum(true), + RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(1), ) .await?; @@ -168,7 +166,7 @@ impl K2VRpcHandler { .item_table .data .replication - .write_nodes(&partition.hash()); + .storage_nodes(&partition.hash()); who.sort(); call_list.entry(who).or_default().push(InsertedItem { @@ -187,14 +185,12 @@ impl K2VRpcHandler { let call_futures = call_list.into_iter().map(|(nodes, items)| async move { let resp = self .system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, &nodes[..], K2VRpc::InsertManyItems(items), - RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(1) - .interrupt_after_quorum(true), + RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(1), ) .await?; Ok::<_, Error>((nodes, resp)) @@ -223,15 +219,16 @@ impl K2VRpcHandler { }, sort_key, }; + // TODO figure this out with write sets, is it still appropriate??? let nodes = self .item_table .data .replication - .write_nodes(&poll_key.partition.hash()); + .read_nodes(&poll_key.partition.hash()); - let rpc = self.system.rpc.try_call_many( + let rpc = self.system.rpc_helper().try_call_many( &self.endpoint, - &nodes[..], + &nodes, K2VRpc::PollItem { key: poll_key, causal_context, @@ -239,9 +236,11 @@ impl K2VRpcHandler { }, RequestStrategy::with_priority(PRIO_NORMAL) .with_quorum(self.item_table.data.replication.read_quorum()) + .send_all_at_once(true) .without_timeout(), ); - let timeout_duration = Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout(); + let timeout_duration = + Duration::from_millis(timeout_msec) + self.system.rpc_helper().rpc_timeout(); let resps = select! { r = rpc => r?, _ = tokio::time::sleep(timeout_duration) => return Ok(None), @@ -283,11 +282,12 @@ impl K2VRpcHandler { seen.restrict(&range); // Prepare PollRange RPC to send to the storage nodes responsible for the parititon + // TODO figure this out with write sets, does it still work???? let nodes = self .item_table .data .replication - .write_nodes(&range.partition.hash()); + .read_nodes(&range.partition.hash()); let quorum = self.item_table.data.replication.read_quorum(); let msg = K2VRpc::PollRange { range, @@ -300,7 +300,11 @@ impl K2VRpcHandler { let rs = RequestStrategy::with_priority(PRIO_NORMAL).without_timeout(); let mut requests = nodes .iter() - .map(|node| self.system.rpc.call(&self.endpoint, *node, msg.clone(), rs)) + .map(|node| { + self.system + .rpc_helper() + .call(&self.endpoint, *node, msg.clone(), rs) + }) .collect::<FuturesUnordered<_>>(); // Fetch responses. This procedure stops fetching responses when any of the following @@ -316,8 +320,9 @@ impl K2VRpcHandler { // kind: all items produced by that node until time ts have been returned, so we can // bump the entry in the global vector clock and possibly remove some item-specific // vector clocks) - let mut deadline = - Instant::now() + Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout(); + let mut deadline = Instant::now() + + Duration::from_millis(timeout_msec) + + self.system.rpc_helper().rpc_timeout(); let mut resps = vec![]; let mut errors = vec![]; loop { @@ -339,7 +344,7 @@ impl K2VRpcHandler { } if errors.len() > nodes.len() - quorum { let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>(); - return Err(Error::Quorum(quorum, resps.len(), nodes.len(), errors).into()); + return Err(Error::Quorum(quorum, None, resps.len(), nodes.len(), errors).into()); } // Take all returned items into account to produce the response. diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index ebea04bd..f2d21493 100644 --- a/src/model/s3/object_table.rs +++ b/src/model/s3/object_table.rs @@ -210,7 +210,179 @@ mod v09 { } } -pub use v09::*; +mod v010 { + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + + use super::v09; + + /// An object + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct Object { + /// The bucket in which the object is stored, used as partition key + pub bucket_id: Uuid, + + /// The key at which the object is stored in its bucket, used as sorting key + pub key: String, + + /// The list of currenty stored versions of the object + pub(super) versions: Vec<ObjectVersion>, + } + + /// Informations about a version of an object + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub struct ObjectVersion { + /// Id of the version + pub uuid: Uuid, + /// Timestamp of when the object was created + pub timestamp: u64, + /// State of the version + pub state: ObjectVersionState, + } + + /// State of an object version + #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] + pub enum ObjectVersionState { + /// The version is being received + Uploading { + /// Indicates whether this is a multipart upload + multipart: bool, + /// Encryption params + headers to be included in the final object + encryption: ObjectVersionEncryption, + }, + /// The version is fully received + Complete(ObjectVersionData), + /// The version uploaded containded errors or the upload was explicitly aborted + Aborted, + } + + /// Data stored in object version + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub enum ObjectVersionData { + /// The object was deleted, this Version is a tombstone to mark it as such + DeleteMarker, + /// The object is short, it's stored inlined. + /// It is never compressed. For encrypted objects, it is encrypted using + /// AES256-GCM, like the encrypted headers. + Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec<u8>), + /// The object is not short, Hash of first block is stored here, next segments hashes are + /// stored in the version table + FirstBlock(ObjectVersionMeta, Hash), + } + + /// Metadata about the object version + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct ObjectVersionMeta { + /// Size of the object. If object is encrypted/compressed, + /// this is always the size of the unencrypted/uncompressed data + pub size: u64, + /// etag of the object + pub etag: String, + /// Encryption params + headers (encrypted or plaintext) + pub encryption: ObjectVersionEncryption, + } + + /// Encryption information + metadata + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub enum ObjectVersionEncryption { + SseC { + /// Encrypted serialized ObjectVersionHeaders struct. + /// This is never compressed, just encrypted using AES256-GCM. + #[serde(with = "serde_bytes")] + headers: Vec<u8>, + /// Whether data blocks are compressed in addition to being encrypted + /// (compression happens before encryption, whereas for non-encrypted + /// objects, compression is handled at the level of the block manager) + compressed: bool, + }, + Plaintext { + /// Plain-text headers + headers: ObjectVersionHeaders, + }, + } + + /// Vector of headers, as tuples of the format (header name, header value) + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct ObjectVersionHeaders(pub Vec<(String, String)>); + + impl garage_util::migrate::Migrate for Object { + const VERSION_MARKER: &'static [u8] = b"G010s3ob"; + + type Previous = v09::Object; + + fn migrate(old: v09::Object) -> Object { + Object { + bucket_id: old.bucket_id, + key: old.key, + versions: old.versions.into_iter().map(migrate_version).collect(), + } + } + } + + fn migrate_version(old: v09::ObjectVersion) -> ObjectVersion { + ObjectVersion { + uuid: old.uuid, + timestamp: old.timestamp, + state: match old.state { + v09::ObjectVersionState::Uploading { multipart, headers } => { + ObjectVersionState::Uploading { + multipart, + encryption: migrate_headers(headers), + } + } + v09::ObjectVersionState::Complete(d) => { + ObjectVersionState::Complete(migrate_data(d)) + } + v09::ObjectVersionState::Aborted => ObjectVersionState::Aborted, + }, + } + } + + fn migrate_data(old: v09::ObjectVersionData) -> ObjectVersionData { + match old { + v09::ObjectVersionData::DeleteMarker => ObjectVersionData::DeleteMarker, + v09::ObjectVersionData::Inline(meta, data) => { + ObjectVersionData::Inline(migrate_meta(meta), data) + } + v09::ObjectVersionData::FirstBlock(meta, fb) => { + ObjectVersionData::FirstBlock(migrate_meta(meta), fb) + } + } + } + + fn migrate_meta(old: v09::ObjectVersionMeta) -> ObjectVersionMeta { + ObjectVersionMeta { + size: old.size, + etag: old.etag, + encryption: migrate_headers(old.headers), + } + } + + fn migrate_headers(old: v09::ObjectVersionHeaders) -> ObjectVersionEncryption { + use http::header::CONTENT_TYPE; + + let mut new_headers = Vec::with_capacity(old.other.len() + 1); + if old.content_type != "blob" { + new_headers.push((CONTENT_TYPE.as_str().to_string(), old.content_type)); + } + for (name, value) in old.other.into_iter() { + new_headers.push((name, value)); + } + + ObjectVersionEncryption::Plaintext { + headers: ObjectVersionHeaders(new_headers), + } + } + + // Since ObjectVersionHeaders can now be serialized independently, for the + // purpose of being encrypted, we need it to support migrations on its own + // as well. + impl garage_util::migrate::InitialFormat for ObjectVersionHeaders { + const VERSION_MARKER: &'static [u8] = b"G010s3oh"; + } +} + +pub use v010::*; impl Object { /// Initialize an Object struct from parts diff --git a/src/model/s3/version_table.rs b/src/model/s3/version_table.rs index 5c032f9f..b4662a55 100644 --- a/src/model/s3/version_table.rs +++ b/src/model/s3/version_table.rs @@ -44,7 +44,8 @@ mod v05 { pub struct VersionBlockKey { /// Number of the part pub part_number: u64, - /// Offset of this sub-segment in its part + /// Offset of this sub-segment in its part as sent by the client + /// (before any kind of compression or encryption) pub offset: u64, } @@ -53,7 +54,7 @@ mod v05 { pub struct VersionBlock { /// Blake2 sum of the block pub hash: Hash, - /// Size of the block + /// Size of the block, before any kind of compression or encryption pub size: u64, } diff --git a/src/net/Cargo.toml b/src/net/Cargo.toml index 5168c939..4bd0d2e5 100644 --- a/src/net/Cargo.toml +++ b/src/net/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_net" -version = "0.9.3" +version = "0.10.0" authors = ["Alex Auvolat <alex@adnab.me>"] edition = "2018" license = "AGPL-3.0" diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index 823fab99..3e7ac635 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_rpc" -version = "0.9.3" +version = "0.10.0" authors = ["Alex Auvolat <alex@adnab.me>"] edition = "2018" license = "AGPL-3.0" diff --git a/src/rpc/graph_algo.rs b/src/rpc/layout/graph_algo.rs index d8c6c9b9..bd33e97f 100644 --- a/src/rpc/graph_algo.rs +++ b/src/rpc/layout/graph_algo.rs @@ -114,16 +114,6 @@ impl Graph<FlowEdge> { Ok(result) } - /// This function returns the value of the flow incoming to v. - pub fn get_inflow(&self, v: Vertex) -> Result<i64, String> { - let idv = self.get_vertex_id(&v)?; - let mut result = 0; - for edge in self.graph[idv].iter() { - result += max(0, self.graph[edge.dest][edge.rev].flow); - } - Ok(result) - } - /// This function returns the value of the flow outgoing from v. pub fn get_outflow(&self, v: Vertex) -> Result<i64, String> { let idv = self.get_vertex_id(&v)?; diff --git a/src/rpc/layout/helper.rs b/src/rpc/layout/helper.rs new file mode 100644 index 00000000..2835347a --- /dev/null +++ b/src/rpc/layout/helper.rs @@ -0,0 +1,298 @@ +use std::collections::HashMap; +use std::ops::Deref; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use serde::{Deserialize, Serialize}; + +use garage_util::data::*; + +use super::*; +use crate::replication_mode::*; + +#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] +pub struct RpcLayoutDigest { + /// Cluster layout version + pub current_version: u64, + /// Number of active layout versions + pub active_versions: usize, + /// Hash of cluster layout update trackers + pub trackers_hash: Hash, + /// Hash of cluster layout staging data + pub staging_hash: Hash, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub struct SyncLayoutDigest { + current: u64, + ack_map_min: u64, + min_stored: u64, +} + +pub struct LayoutHelper { + replication_factor: ReplicationFactor, + consistency_mode: ConsistencyMode, + layout: Option<LayoutHistory>, + + // cached values + ack_map_min: u64, + sync_map_min: u64, + + all_nodes: Vec<Uuid>, + all_nongateway_nodes: Vec<Uuid>, + + trackers_hash: Hash, + staging_hash: Hash, + + // ack lock: counts in-progress write operations for each + // layout version ; we don't increase the ack update tracker + // while this lock is nonzero + pub(crate) ack_lock: HashMap<u64, AtomicUsize>, +} + +impl Deref for LayoutHelper { + type Target = LayoutHistory; + fn deref(&self) -> &LayoutHistory { + self.layout() + } +} + +impl LayoutHelper { + pub fn new( + replication_factor: ReplicationFactor, + consistency_mode: ConsistencyMode, + mut layout: LayoutHistory, + mut ack_lock: HashMap<u64, AtomicUsize>, + ) -> Self { + // In the new() function of the helper, we do a bunch of cleanup + // and calculations on the layout history to make sure things are + // correct and we have rapid access to important values such as + // the layout versions to use when reading to ensure consistency. + + if consistency_mode != ConsistencyMode::Consistent { + // Fast path for when no consistency is required. + // In this case we only need to keep the last version of the layout, + // we don't care about coordinating stuff in the cluster. + layout.keep_current_version_only(); + } + + layout.cleanup_old_versions(); + + let all_nodes = layout.get_all_nodes(); + let all_nongateway_nodes = layout.get_all_nongateway_nodes(); + + layout.clamp_update_trackers(&all_nodes); + + let min_version = layout.min_stored(); + + // ack_map_min is the minimum value of ack_map among all nodes + // in the cluster (gateway, non-gateway, current and previous layouts). + // It is the highest layout version which all of these nodes have + // acknowledged, indicating that they are aware of it and are no + // longer processing write operations that did not take it into account. + let ack_map_min = layout + .update_trackers + .ack_map + .min_among(&all_nodes, min_version); + + // sync_map_min is the minimum value of sync_map among storage nodes + // in the cluster (non-gateway nodes only, current and previous layouts). + // It is the highest layout version for which we know that all relevant + // storage nodes have fullfilled a sync, and therefore it is safe to + // use a read quorum within that layout to ensure consistency. + // Gateway nodes are excluded here because they hold no relevant data + // (they store the bucket and access key tables, but we don't have + // consistency on those). + // This value is calculated using quorums to allow progress even + // if not all nodes have successfully completed a sync. + let sync_map_min = + layout.calculate_sync_map_min_with_quorum(replication_factor, &all_nongateway_nodes); + + let trackers_hash = layout.calculate_trackers_hash(); + let staging_hash = layout.calculate_staging_hash(); + + ack_lock.retain(|_, cnt| *cnt.get_mut() > 0); + ack_lock + .entry(layout.current().version) + .or_insert(AtomicUsize::new(0)); + + LayoutHelper { + replication_factor, + consistency_mode, + layout: Some(layout), + ack_map_min, + sync_map_min, + all_nodes, + all_nongateway_nodes, + trackers_hash, + staging_hash, + ack_lock, + } + } + + // ------------------ single updating function -------------- + + fn layout(&self) -> &LayoutHistory { + self.layout.as_ref().unwrap() + } + + pub(crate) fn update<F>(&mut self, f: F) -> bool + where + F: FnOnce(&mut LayoutHistory) -> bool, + { + let changed = f(self.layout.as_mut().unwrap()); + if changed { + *self = Self::new( + self.replication_factor, + self.consistency_mode, + self.layout.take().unwrap(), + std::mem::take(&mut self.ack_lock), + ); + } + changed + } + + // ------------------ read helpers --------------- + + pub fn all_nodes(&self) -> &[Uuid] { + &self.all_nodes + } + + pub fn all_nongateway_nodes(&self) -> &[Uuid] { + &self.all_nongateway_nodes + } + + pub fn ack_map_min(&self) -> u64 { + self.ack_map_min + } + + pub fn sync_map_min(&self) -> u64 { + self.sync_map_min + } + + pub fn sync_digest(&self) -> SyncLayoutDigest { + SyncLayoutDigest { + current: self.layout().current().version, + ack_map_min: self.ack_map_min(), + min_stored: self.layout().min_stored(), + } + } + + pub fn read_nodes_of(&self, position: &Hash) -> Vec<Uuid> { + let sync_min = self.sync_map_min; + let version = self + .layout() + .versions + .iter() + .find(|x| x.version == sync_min) + .or(self.layout().versions.last()) + .unwrap(); + version + .nodes_of(position, version.replication_factor) + .collect() + } + + pub fn storage_sets_of(&self, position: &Hash) -> Vec<Vec<Uuid>> { + self.layout() + .versions + .iter() + .map(|x| x.nodes_of(position, x.replication_factor).collect()) + .collect() + } + + pub fn storage_nodes_of(&self, position: &Hash) -> Vec<Uuid> { + let mut ret = vec![]; + for version in self.layout().versions.iter() { + ret.extend(version.nodes_of(position, version.replication_factor)); + } + ret.sort(); + ret.dedup(); + ret + } + + pub fn trackers_hash(&self) -> Hash { + self.trackers_hash + } + + pub fn staging_hash(&self) -> Hash { + self.staging_hash + } + + pub fn digest(&self) -> RpcLayoutDigest { + RpcLayoutDigest { + current_version: self.current().version, + active_versions: self.versions.len(), + trackers_hash: self.trackers_hash, + staging_hash: self.staging_hash, + } + } + + // ------------------ helpers for update tracking --------------- + + pub(crate) fn update_trackers(&mut self, local_node_id: Uuid) { + // Ensure trackers for this node's values are up-to-date + + // 1. Acknowledge the last layout version which is not currently + // locked by an in-progress write operation + self.ack_max_free(local_node_id); + + // 2. Assume the data on this node is sync'ed up at least to + // the first layout version in the history + self.sync_first(local_node_id); + + // 3. Acknowledge everyone has synced up to min(self.sync_map) + self.sync_ack(local_node_id); + + debug!("ack_map: {:?}", self.update_trackers.ack_map); + debug!("sync_map: {:?}", self.update_trackers.sync_map); + debug!("sync_ack_map: {:?}", self.update_trackers.sync_ack_map); + } + + fn sync_first(&mut self, local_node_id: Uuid) { + let first_version = self.min_stored(); + self.update(|layout| { + layout + .update_trackers + .sync_map + .set_max(local_node_id, first_version) + }); + } + + fn sync_ack(&mut self, local_node_id: Uuid) { + let sync_map_min = self.sync_map_min; + self.update(|layout| { + layout + .update_trackers + .sync_ack_map + .set_max(local_node_id, sync_map_min) + }); + } + + pub(crate) fn ack_max_free(&mut self, local_node_id: Uuid) -> bool { + let max_ack = self.max_free_ack(); + let changed = self.update(|layout| { + layout + .update_trackers + .ack_map + .set_max(local_node_id, max_ack) + }); + if changed { + info!("ack_until updated to {}", max_ack); + } + changed + } + + pub(crate) fn max_free_ack(&self) -> u64 { + self.layout() + .versions + .iter() + .map(|x| x.version) + .skip_while(|v| { + self.ack_lock + .get(v) + .map(|x| x.load(Ordering::Relaxed) == 0) + .unwrap_or(true) + }) + .next() + .unwrap_or(self.current().version) + } +} diff --git a/src/rpc/layout/history.rs b/src/rpc/layout/history.rs new file mode 100644 index 00000000..290f058d --- /dev/null +++ b/src/rpc/layout/history.rs @@ -0,0 +1,306 @@ +use std::collections::HashSet; + +use garage_util::crdt::{Crdt, Lww, LwwMap}; +use garage_util::data::*; +use garage_util::encode::nonversioned_encode; +use garage_util::error::*; + +use super::*; +use crate::replication_mode::*; + +impl LayoutHistory { + pub fn new(replication_factor: ReplicationFactor) -> Self { + let version = LayoutVersion::new(replication_factor.into()); + + let staging = LayoutStaging { + parameters: Lww::<LayoutParameters>::new(version.parameters), + roles: LwwMap::new(), + }; + + LayoutHistory { + versions: vec![version], + old_versions: vec![], + update_trackers: Default::default(), + staging: Lww::raw(0, staging), + } + } + + // ------------------ who stores what now? --------------- + + pub fn current(&self) -> &LayoutVersion { + self.versions.last().as_ref().unwrap() + } + + pub fn min_stored(&self) -> u64 { + self.versions.first().as_ref().unwrap().version + } + + pub fn get_all_nodes(&self) -> Vec<Uuid> { + if self.versions.len() == 1 { + self.versions[0].all_nodes().to_vec() + } else { + let set = self + .versions + .iter() + .flat_map(|x| x.all_nodes()) + .collect::<HashSet<_>>(); + set.into_iter().copied().collect::<Vec<_>>() + } + } + + pub(crate) fn get_all_nongateway_nodes(&self) -> Vec<Uuid> { + if self.versions.len() == 1 { + self.versions[0].nongateway_nodes().to_vec() + } else { + let set = self + .versions + .iter() + .flat_map(|x| x.nongateway_nodes()) + .collect::<HashSet<_>>(); + set.into_iter().copied().collect::<Vec<_>>() + } + } + + // ---- housekeeping (all invoked by LayoutHelper) ---- + + pub(crate) fn keep_current_version_only(&mut self) { + while self.versions.len() > 1 { + let removed = self.versions.remove(0); + self.old_versions.push(removed); + } + } + + pub(crate) fn cleanup_old_versions(&mut self) { + // If there are invalid versions before valid versions, remove them + if self.versions.len() > 1 && self.current().check().is_ok() { + while self.versions.len() > 1 && self.versions.first().unwrap().check().is_err() { + let removed = self.versions.remove(0); + info!( + "Layout history: pruning old invalid version {}", + removed.version + ); + } + } + + // If there are old versions that no one is reading from anymore, + // remove them (keep them in self.old_versions). + // ASSUMPTION: we only care about where nodes in the current layout version + // are reading from, as we assume older nodes are being discarded. + let current_nodes = &self.current().node_id_vec; + let min_version = self.min_stored(); + let sync_ack_map_min = self + .update_trackers + .sync_ack_map + .min_among(current_nodes, min_version); + while self.min_stored() < sync_ack_map_min { + assert!(self.versions.len() > 1); + let removed = self.versions.remove(0); + info!( + "Layout history: moving version {} to old_versions", + removed.version + ); + self.old_versions.push(removed); + } + + while self.old_versions.len() > OLD_VERSION_COUNT { + let removed = self.old_versions.remove(0); + info!("Layout history: removing old_version {}", removed.version); + } + } + + pub(crate) fn clamp_update_trackers(&mut self, nodes: &[Uuid]) { + let min_v = self.min_stored(); + for node in nodes { + self.update_trackers.ack_map.set_max(*node, min_v); + self.update_trackers.sync_map.set_max(*node, min_v); + self.update_trackers.sync_ack_map.set_max(*node, min_v); + } + } + + pub(crate) fn calculate_sync_map_min_with_quorum( + &self, + replication_factor: ReplicationFactor, + all_nongateway_nodes: &[Uuid], + ) -> u64 { + // This function calculates the minimum layout version from which + // it is safe to read if we want to maintain read-after-write consistency. + // In the general case the computation can be a bit expensive so + // we try to optimize it in several ways. + + // If there is only one layout version, we know that's the one + // we need to read from. + if self.versions.len() == 1 { + return self.current().version; + } + + let quorum = replication_factor.write_quorum(ConsistencyMode::Consistent); + + let min_version = self.min_stored(); + let global_min = self + .update_trackers + .sync_map + .min_among(all_nongateway_nodes, min_version); + + // If the write quorums are equal to the total number of nodes, + // i.e. no writes can succeed while they are not written to all nodes, + // then we must in all case wait for all nodes to complete a sync. + // This is represented by reading from the layout with version + // number global_min, the smallest layout version for which all nodes + // have completed a sync. + if quorum == self.current().replication_factor { + return global_min; + } + + // In the general case, we need to look at all write sets for all partitions, + // and find a safe layout version to read for that partition. We then + // take the minimum value among all partition as the safe layout version + // to read in all cases (the layout version to which all reads are directed). + let mut current_min = self.current().version; + let mut sets_done = HashSet::<Vec<Uuid>>::new(); + + for (_, p_hash) in self.current().partitions() { + for v in self.versions.iter() { + if v.version == self.current().version { + // We don't care about whether nodes in the latest layout version + // have completed a sync or not, as the sync is push-only + // and by definition nodes in the latest layout version do not + // hold data that must be pushed to nodes in the latest layout + // version, since that's the same version (any data that's + // already in the latest version is assumed to have been written + // by an operation that ensured a quorum of writes within + // that version). + continue; + } + + // Determine set of nodes for partition p in layout version v. + // Sort the node set to avoid duplicate computations. + let mut set = v + .nodes_of(&p_hash, v.replication_factor) + .collect::<Vec<Uuid>>(); + set.sort(); + + // If this set was already processed, skip it. + if sets_done.contains(&set) { + continue; + } + + // Find the value of the sync update trackers that is the + // highest possible minimum within a quorum of nodes. + let mut sync_values = set + .iter() + .map(|x| self.update_trackers.sync_map.get(x, min_version)) + .collect::<Vec<_>>(); + sync_values.sort(); + let set_min = sync_values[sync_values.len() - quorum]; + if set_min < current_min { + current_min = set_min; + } + // defavorable case, we know we are at the smallest possible version, + // so we can stop early + assert!(current_min >= global_min); + if current_min == global_min { + return current_min; + } + + // Add set to already processed sets + sets_done.insert(set); + } + } + + current_min + } + + pub(crate) fn calculate_trackers_hash(&self) -> Hash { + blake2sum(&nonversioned_encode(&self.update_trackers).unwrap()[..]) + } + + pub(crate) fn calculate_staging_hash(&self) -> Hash { + blake2sum(&nonversioned_encode(&self.staging).unwrap()[..]) + } + + // ================== updates to layout, public interface =================== + + pub fn merge(&mut self, other: &LayoutHistory) -> bool { + let mut changed = false; + + // Add any new versions to history + for v2 in other.versions.iter() { + if let Some(v1) = self.versions.iter().find(|v| v.version == v2.version) { + // Version is already present, check consistency + if v1 != v2 { + error!("Inconsistent layout histories: different layout compositions for version {}. Your cluster will be broken as long as this layout version is not replaced.", v2.version); + } + } else if self.versions.iter().all(|v| v.version != v2.version - 1) { + error!( + "Cannot receive new layout version {}, version {} is missing", + v2.version, + v2.version - 1 + ); + } else { + self.versions.push(v2.clone()); + changed = true; + } + } + + // Merge trackers + let c = self.update_trackers.merge(&other.update_trackers); + changed = changed || c; + + // Merge staged layout changes + if self.staging != other.staging { + let prev_staging = self.staging.clone(); + self.staging.merge(&other.staging); + changed = changed || self.staging != prev_staging; + } + + changed + } + + pub fn apply_staged_changes(mut self, version: Option<u64>) -> Result<(Self, Message), Error> { + match version { + None => { + let error = r#" +Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. +To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. + "#; + return Err(Error::Message(error.into())); + } + Some(v) => { + if v != self.current().version + 1 { + return Err(Error::Message("Invalid new layout version".into())); + } + } + } + + // Compute new version and add it to history + let (new_version, msg) = self + .current() + .clone() + .calculate_next_version(self.staging.get())?; + + self.versions.push(new_version); + self.cleanup_old_versions(); + + // Reset the staged layout changes + self.staging.update(LayoutStaging { + parameters: self.staging.get().parameters.clone(), + roles: LwwMap::new(), + }); + + Ok((self, msg)) + } + + pub fn revert_staged_changes(mut self) -> Result<Self, Error> { + self.staging.update(LayoutStaging { + parameters: Lww::new(self.current().parameters), + roles: LwwMap::new(), + }); + + Ok(self) + } + + pub fn check(&self) -> Result<(), String> { + // TODO: anything more ? + self.current().check() + } +} diff --git a/src/rpc/layout/manager.rs b/src/rpc/layout/manager.rs new file mode 100644 index 00000000..8a6eb1c3 --- /dev/null +++ b/src/rpc/layout/manager.rs @@ -0,0 +1,381 @@ +use std::collections::HashMap; +use std::sync::{atomic::Ordering, Arc, Mutex, RwLock, RwLockReadGuard}; +use std::time::Duration; + +use tokio::sync::Notify; + +use garage_net::endpoint::Endpoint; +use garage_net::peering::PeeringManager; +use garage_net::NodeID; + +use garage_util::config::Config; +use garage_util::data::*; +use garage_util::error::*; +use garage_util::persister::Persister; + +use super::*; +use crate::replication_mode::*; +use crate::rpc_helper::*; +use crate::system::*; + +pub struct LayoutManager { + node_id: Uuid, + replication_factor: ReplicationFactor, + persist_cluster_layout: Persister<LayoutHistory>, + + layout: Arc<RwLock<LayoutHelper>>, + pub(crate) change_notify: Arc<Notify>, + + table_sync_version: Mutex<HashMap<String, u64>>, + + pub(crate) rpc_helper: RpcHelper, + system_endpoint: Arc<Endpoint<SystemRpc, System>>, +} + +impl LayoutManager { + pub fn new( + config: &Config, + node_id: NodeID, + system_endpoint: Arc<Endpoint<SystemRpc, System>>, + peering: Arc<PeeringManager>, + replication_factor: ReplicationFactor, + consistency_mode: ConsistencyMode, + ) -> Result<Arc<Self>, Error> { + let persist_cluster_layout: Persister<LayoutHistory> = + Persister::new(&config.metadata_dir, "cluster_layout"); + + let cluster_layout = match persist_cluster_layout.load() { + Ok(x) => { + if x.current().replication_factor != replication_factor.replication_factor() { + return Err(Error::Message(format!( + "Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", + x.current().replication_factor, + replication_factor.replication_factor() + ))); + } + x + } + Err(e) => { + info!( + "No valid previous cluster layout stored ({}), starting fresh.", + e + ); + LayoutHistory::new(replication_factor) + } + }; + + let mut cluster_layout = LayoutHelper::new( + replication_factor, + consistency_mode, + cluster_layout, + Default::default(), + ); + cluster_layout.update_trackers(node_id.into()); + + let layout = Arc::new(RwLock::new(cluster_layout)); + let change_notify = Arc::new(Notify::new()); + + let rpc_helper = RpcHelper::new( + node_id.into(), + peering, + layout.clone(), + config.rpc_timeout_msec.map(Duration::from_millis), + ); + + Ok(Arc::new(Self { + node_id: node_id.into(), + replication_factor, + persist_cluster_layout, + layout, + change_notify, + table_sync_version: Mutex::new(HashMap::new()), + system_endpoint, + rpc_helper, + })) + } + + // ---- PUBLIC INTERFACE ---- + + pub fn layout(&self) -> RwLockReadGuard<'_, LayoutHelper> { + self.layout.read().unwrap() + } + + pub async fn update_cluster_layout( + self: &Arc<Self>, + layout: &LayoutHistory, + ) -> Result<(), Error> { + self.handle_advertise_cluster_layout(layout).await?; + Ok(()) + } + + pub fn add_table(&self, table_name: &'static str) { + let first_version = self.layout().versions.first().unwrap().version; + + self.table_sync_version + .lock() + .unwrap() + .insert(table_name.to_string(), first_version); + } + + pub fn sync_table_until(self: &Arc<Self>, table_name: &'static str, version: u64) { + let mut table_sync_version = self.table_sync_version.lock().unwrap(); + *table_sync_version.get_mut(table_name).unwrap() = version; + let sync_until = table_sync_version.iter().map(|(_, v)| *v).min().unwrap(); + drop(table_sync_version); + + let mut layout = self.layout.write().unwrap(); + if layout.update(|l| l.update_trackers.sync_map.set_max(self.node_id, sync_until)) { + info!("sync_until updated to {}", sync_until); + self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( + layout.update_trackers.clone(), + )); + } + } + + fn ack_new_version(self: &Arc<Self>) { + let mut layout = self.layout.write().unwrap(); + if layout.ack_max_free(self.node_id) { + self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( + layout.update_trackers.clone(), + )); + } + } + + // ---- ACK LOCKING ---- + + pub fn write_sets_of(self: &Arc<Self>, position: &Hash) -> WriteLock<Vec<Vec<Uuid>>> { + let layout = self.layout(); + let version = layout.current().version; + let nodes = layout.storage_sets_of(position); + layout + .ack_lock + .get(&version) + .unwrap() + .fetch_add(1, Ordering::Relaxed); + WriteLock::new(version, self, nodes) + } + + // ---- INTERNALS --- + + fn merge_layout(&self, adv: &LayoutHistory) -> Option<LayoutHistory> { + let mut layout = self.layout.write().unwrap(); + let prev_digest = layout.digest(); + let prev_layout_check = layout.check().is_ok(); + + if !prev_layout_check || adv.check().is_ok() { + if layout.update(|l| l.merge(adv)) { + layout.update_trackers(self.node_id); + if prev_layout_check && layout.check().is_err() { + panic!("Merged two correct layouts and got an incorrect layout."); + } + assert!(layout.digest() != prev_digest); + return Some(layout.clone()); + } + } + + None + } + + fn merge_layout_trackers(&self, adv: &UpdateTrackers) -> Option<UpdateTrackers> { + let mut layout = self.layout.write().unwrap(); + let prev_digest = layout.digest(); + + if layout.update_trackers != *adv { + if layout.update(|l| l.update_trackers.merge(adv)) { + layout.update_trackers(self.node_id); + assert!(layout.digest() != prev_digest); + return Some(layout.update_trackers.clone()); + } + } + + None + } + + async fn pull_cluster_layout(self: &Arc<Self>, peer: Uuid) { + let resp = self + .rpc_helper + .call( + &self.system_endpoint, + peer, + SystemRpc::PullClusterLayout, + RequestStrategy::with_priority(PRIO_HIGH), + ) + .await; + if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp { + if let Err(e) = self.handle_advertise_cluster_layout(&layout).await { + warn!("In pull_cluster_layout: {}", e); + } + } + } + + async fn pull_cluster_layout_trackers(self: &Arc<Self>, peer: Uuid) { + let resp = self + .rpc_helper + .call( + &self.system_endpoint, + peer, + SystemRpc::PullClusterLayoutTrackers, + RequestStrategy::with_priority(PRIO_HIGH), + ) + .await; + if let Ok(SystemRpc::AdvertiseClusterLayoutTrackers(trackers)) = resp { + if let Err(e) = self + .handle_advertise_cluster_layout_trackers(&trackers) + .await + { + warn!("In pull_cluster_layout_trackers: {}", e); + } + } + } + + /// Save cluster layout data to disk + async fn save_cluster_layout(&self) -> Result<(), Error> { + let layout = self.layout.read().unwrap().clone(); + self.persist_cluster_layout + .save_async(&layout) + .await + .expect("Cannot save current cluster layout"); + Ok(()) + } + + fn broadcast_update(self: &Arc<Self>, rpc: SystemRpc) { + tokio::spawn({ + let this = self.clone(); + async move { + if let Err(e) = this + .rpc_helper + .broadcast( + &this.system_endpoint, + rpc, + RequestStrategy::with_priority(PRIO_HIGH), + ) + .await + { + warn!("Error while broadcasting new cluster layout: {}", e); + } + } + }); + } + + // ---- RPC HANDLERS ---- + + pub(crate) fn handle_advertise_status(self: &Arc<Self>, from: Uuid, remote: &RpcLayoutDigest) { + let local = self.layout().digest(); + if remote.current_version > local.current_version + || remote.active_versions != local.active_versions + || remote.staging_hash != local.staging_hash + { + tokio::spawn({ + let this = self.clone(); + async move { this.pull_cluster_layout(from).await } + }); + } else if remote.trackers_hash != local.trackers_hash { + tokio::spawn({ + let this = self.clone(); + async move { this.pull_cluster_layout_trackers(from).await } + }); + } + } + + pub(crate) fn handle_pull_cluster_layout(&self) -> SystemRpc { + let layout = self.layout.read().unwrap().clone(); + SystemRpc::AdvertiseClusterLayout(layout) + } + + pub(crate) fn handle_pull_cluster_layout_trackers(&self) -> SystemRpc { + let layout = self.layout.read().unwrap(); + SystemRpc::AdvertiseClusterLayoutTrackers(layout.update_trackers.clone()) + } + + pub(crate) async fn handle_advertise_cluster_layout( + self: &Arc<Self>, + adv: &LayoutHistory, + ) -> Result<SystemRpc, Error> { + debug!( + "handle_advertise_cluster_layout: {} versions, last={}, trackers={:?}", + adv.versions.len(), + adv.current().version, + adv.update_trackers + ); + + if adv.current().replication_factor != self.replication_factor.replication_factor() { + let msg = format!( + "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.", + adv.current().replication_factor, + self.replication_factor.replication_factor() + ); + error!("{}", msg); + return Err(Error::Message(msg)); + } + + if let Some(new_layout) = self.merge_layout(adv) { + debug!("handle_advertise_cluster_layout: some changes were added to the current stuff"); + + self.change_notify.notify_waiters(); + self.broadcast_update(SystemRpc::AdvertiseClusterLayout(new_layout)); + self.save_cluster_layout().await?; + } + + Ok(SystemRpc::Ok) + } + + pub(crate) async fn handle_advertise_cluster_layout_trackers( + self: &Arc<Self>, + trackers: &UpdateTrackers, + ) -> Result<SystemRpc, Error> { + debug!("handle_advertise_cluster_layout_trackers: {:?}", trackers); + + if let Some(new_trackers) = self.merge_layout_trackers(trackers) { + self.change_notify.notify_waiters(); + self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(new_trackers)); + self.save_cluster_layout().await?; + } + + Ok(SystemRpc::Ok) + } +} + +// ---- ack lock ---- + +pub struct WriteLock<T> { + layout_version: u64, + layout_manager: Arc<LayoutManager>, + value: T, +} + +impl<T> WriteLock<T> { + fn new(version: u64, layout_manager: &Arc<LayoutManager>, value: T) -> Self { + Self { + layout_version: version, + layout_manager: layout_manager.clone(), + value, + } + } +} + +impl<T> AsRef<T> for WriteLock<T> { + fn as_ref(&self) -> &T { + &self.value + } +} + +impl<T> AsMut<T> for WriteLock<T> { + fn as_mut(&mut self) -> &mut T { + &mut self.value + } +} + +impl<T> Drop for WriteLock<T> { + fn drop(&mut self) { + let layout = self.layout_manager.layout(); // acquire read lock + if let Some(counter) = layout.ack_lock.get(&self.layout_version) { + let prev_lock = counter.fetch_sub(1, Ordering::Relaxed); + if prev_lock == 1 && layout.current().version > self.layout_version { + drop(layout); // release read lock, write lock will be acquired + self.layout_manager.ack_new_version(); + } + } else { + error!("Could not find ack lock counter for layout version {}. This probably indicates a bug in Garage.", self.layout_version); + } + } +} diff --git a/src/rpc/layout/mod.rs b/src/rpc/layout/mod.rs new file mode 100644 index 00000000..33676c37 --- /dev/null +++ b/src/rpc/layout/mod.rs @@ -0,0 +1,478 @@ +use std::fmt; + +use bytesize::ByteSize; + +use garage_util::crdt::{AutoCrdt, Crdt}; +use garage_util::data::Uuid; + +mod graph_algo; +mod helper; +mod history; +mod version; + +#[cfg(test)] +mod test; + +pub mod manager; + +// ---- re-exports ---- + +pub use helper::{LayoutHelper, RpcLayoutDigest, SyncLayoutDigest}; +pub use manager::WriteLock; +pub use version::*; + +// ---- defines: partitions ---- + +/// A partition id, which is stored on 16 bits +/// i.e. we have up to 2**16 partitions. +/// (in practice we have exactly 2**PARTITION_BITS partitions) +pub type Partition = u16; + +// TODO: make this constant parametrizable in the config file +// For deployments with many nodes it might make sense to bump +// it up to 10. +// Maximum value : 16 +/// How many bits from the hash are used to make partitions. Higher numbers means more fairness in +/// presence of numerous nodes, but exponentially bigger ring. Max 16 +pub const PARTITION_BITS: usize = 8; + +const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; + +// ---- defines: nodes ---- + +// Type to store compactly the id of a node in the system +// Change this to u16 the day we want to have more than 256 nodes in a cluster +pub type CompactNodeType = u8; +pub const MAX_NODE_NUMBER: usize = 256; + +// ======== actual data structures for the layout data ======== +// ======== that is persisted to disk ======== +// some small utility impls are at the end of this file, +// but most of the code that actually computes stuff is in +// version.rs, history.rs and helper.rs + +mod v08 { + use crate::layout::CompactNodeType; + use garage_util::crdt::LwwMap; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + + /// The layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct ClusterLayout { + pub version: u64, + + pub replication_factor: usize, + pub roles: LwwMap<Uuid, NodeRoleV>, + + // see comments in v010::ClusterLayout + pub node_id_vec: Vec<Uuid>, + #[serde(with = "serde_bytes")] + pub ring_assignation_data: Vec<CompactNodeType>, + + /// Role changes which are staged for the next version of the layout + pub staging: LwwMap<Uuid, NodeRoleV>, + pub staging_hash: Hash, + } + + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct NodeRoleV(pub Option<NodeRole>); + + /// The user-assigned roles of cluster nodes + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] + pub struct NodeRole { + /// Datacenter at which this entry belong. This information is used to + /// perform a better geodistribution + pub zone: String, + /// The capacity of the node + /// If this is set to None, the node does not participate in storing data for the system + /// and is only active as an API gateway to other nodes + pub capacity: Option<u64>, + /// A set of tags to recognize the node + pub tags: Vec<String>, + } + + impl garage_util::migrate::InitialFormat for ClusterLayout {} +} + +mod v09 { + use super::v08; + use crate::layout::CompactNodeType; + use garage_util::crdt::{Lww, LwwMap}; + use garage_util::data::{Hash, Uuid}; + use serde::{Deserialize, Serialize}; + pub use v08::{NodeRole, NodeRoleV}; + + /// The layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize)] + pub struct ClusterLayout { + pub version: u64, + + pub replication_factor: usize, + + /// This attribute is only used to retain the previously computed partition size, + /// to know to what extent does it change with the layout update. + pub partition_size: u64, + /// Parameters used to compute the assignment currently given by + /// ring_assignment_data + pub parameters: LayoutParameters, + + pub roles: LwwMap<Uuid, NodeRoleV>, + + // see comments in v010::ClusterLayout + pub node_id_vec: Vec<Uuid>, + #[serde(with = "serde_bytes")] + pub ring_assignment_data: Vec<CompactNodeType>, + + /// Parameters to be used in the next partition assignment computation. + pub staging_parameters: Lww<LayoutParameters>, + /// Role changes which are staged for the next version of the layout + pub staging_roles: LwwMap<Uuid, NodeRoleV>, + pub staging_hash: Hash, + } + + /// This struct is used to set the parameters to be used in the assignment computation + /// algorithm. It is stored as a Crdt. + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub struct LayoutParameters { + pub zone_redundancy: ZoneRedundancy, + } + + /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies + /// of each partition on at least that number of different zones. + /// Otherwise, copies will be stored on the maximum possible number of zones. + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] + pub enum ZoneRedundancy { + AtLeast(usize), + Maximum, + } + + impl garage_util::migrate::Migrate for ClusterLayout { + const VERSION_MARKER: &'static [u8] = b"G09layout"; + + type Previous = v08::ClusterLayout; + + fn migrate(previous: Self::Previous) -> Self { + use itertools::Itertools; + + // In the old layout, capacities are in an arbitrary unit, + // but in the new layout they are in bytes. + // Here we arbitrarily multiply everything by 1G, + // such that 1 old capacity unit = 1GB in the new units. + // This is totally arbitrary and won't work for most users. + let cap_mul = 1024 * 1024 * 1024; + let roles = multiply_all_capacities(previous.roles, cap_mul); + let staging_roles = multiply_all_capacities(previous.staging, cap_mul); + let node_id_vec = previous.node_id_vec; + + // Determine partition size + let mut tmp = previous.ring_assignation_data.clone(); + tmp.sort(); + let partition_size = tmp + .into_iter() + .dedup_with_count() + .map(|(npart, node)| { + roles + .get(&node_id_vec[node as usize]) + .and_then(|p| p.0.as_ref().and_then(|r| r.capacity)) + .unwrap_or(0) / npart as u64 + }) + .min() + .unwrap_or(0); + + // By default, zone_redundancy is maximum possible value + let parameters = LayoutParameters { + zone_redundancy: ZoneRedundancy::Maximum, + }; + + Self { + version: previous.version, + replication_factor: previous.replication_factor, + partition_size, + parameters, + roles, + node_id_vec, + ring_assignment_data: previous.ring_assignation_data, + staging_parameters: Lww::new(parameters), + staging_roles, + staging_hash: [0u8; 32].into(), // will be set in the next migration + } + } + } + + fn multiply_all_capacities( + old_roles: LwwMap<Uuid, NodeRoleV>, + mul: u64, + ) -> LwwMap<Uuid, NodeRoleV> { + let mut new_roles = LwwMap::new(); + for (node, ts, role) in old_roles.items() { + let mut role = role.clone(); + if let NodeRoleV(Some(NodeRole { + capacity: Some(ref mut cap), + .. + })) = role + { + *cap *= mul; + } + new_roles.merge_raw(node, *ts, &role); + } + new_roles + } +} + +mod v010 { + use super::v09; + use crate::layout::CompactNodeType; + use garage_util::crdt::{Lww, LwwMap}; + use garage_util::data::Uuid; + use serde::{Deserialize, Serialize}; + use std::collections::BTreeMap; + pub use v09::{LayoutParameters, NodeRole, NodeRoleV, ZoneRedundancy}; + + /// Number of old (non-live) versions to keep, see LayoutHistory::old_versions + pub const OLD_VERSION_COUNT: usize = 5; + + /// The history of cluster layouts, with trackers to keep a record + /// of which nodes are up-to-date to current cluster data + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] + pub struct LayoutHistory { + /// The versions currently in use in the cluster + pub versions: Vec<LayoutVersion>, + /// At most 5 of the previous versions, not used by the garage_table + /// module, but usefull for the garage_block module to find data blocks + /// that have not yet been moved + pub old_versions: Vec<LayoutVersion>, + + /// Update trackers + pub update_trackers: UpdateTrackers, + + /// Staged changes for the next version + pub staging: Lww<LayoutStaging>, + } + + /// A version of the layout of the cluster, i.e. the list of roles + /// which are assigned to each cluster node + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] + pub struct LayoutVersion { + /// The number of this version + pub version: u64, + + /// Roles assigned to nodes in this version + pub roles: LwwMap<Uuid, NodeRoleV>, + /// Parameters used to compute the assignment currently given by + /// ring_assignment_data + pub parameters: LayoutParameters, + + /// The number of replicas for each data partition + pub replication_factor: usize, + /// This attribute is only used to retain the previously computed partition size, + /// to know to what extent does it change with the layout update. + pub partition_size: u64, + + /// node_id_vec: a vector of node IDs with a role assigned + /// in the system (this includes gateway nodes). + /// The order here is different than the vec stored by `roles`, because: + /// 1. non-gateway nodes are first so that they have lower numbers + /// 2. nodes that don't have a role are excluded (but they need to + /// stay in the CRDT as tombstones) + pub node_id_vec: Vec<Uuid>, + /// number of non-gateway nodes, which are the first ids in node_id_vec + pub nongateway_node_count: usize, + /// The assignation of data partitions to nodes, the values + /// are indices in node_id_vec + #[serde(with = "serde_bytes")] + pub ring_assignment_data: Vec<CompactNodeType>, + } + + /// The staged changes for the next layout version + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] + pub struct LayoutStaging { + /// Parameters to be used in the next partition assignment computation. + pub parameters: Lww<LayoutParameters>, + /// Role changes which are staged for the next version of the layout + pub roles: LwwMap<Uuid, NodeRoleV>, + } + + /// The tracker of acknowlegments and data syncs around the cluster + #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] + pub struct UpdateTrackers { + /// The highest layout version number each node has ack'ed + pub ack_map: UpdateTracker, + /// The highest layout version number each node has synced data for + pub sync_map: UpdateTracker, + /// The highest layout version number each node has + /// ack'ed that all other nodes have synced data for + pub sync_ack_map: UpdateTracker, + } + + /// Generic update tracker struct + #[derive(Clone, Debug, Serialize, Deserialize, Default, PartialEq)] + pub struct UpdateTracker(pub BTreeMap<Uuid, u64>); + + impl garage_util::migrate::Migrate for LayoutHistory { + const VERSION_MARKER: &'static [u8] = b"G010lh"; + + type Previous = v09::ClusterLayout; + + fn migrate(previous: Self::Previous) -> Self { + let nongateway_node_count = previous + .node_id_vec + .iter() + .enumerate() + .filter(|(_, uuid)| { + let role = previous.roles.get(uuid); + matches!(role, Some(NodeRoleV(Some(role))) if role.capacity.is_some()) + }) + .map(|(i, _)| i + 1) + .max() + .unwrap_or(0); + + let version = LayoutVersion { + version: previous.version, + replication_factor: previous.replication_factor, + partition_size: previous.partition_size, + parameters: previous.parameters, + roles: previous.roles, + node_id_vec: previous.node_id_vec, + nongateway_node_count, + ring_assignment_data: previous.ring_assignment_data, + }; + let update_tracker = UpdateTracker( + version + .nongateway_nodes() + .iter() + .copied() + .map(|x| (x, version.version)) + .collect::<BTreeMap<Uuid, u64>>(), + ); + let staging = LayoutStaging { + parameters: previous.staging_parameters, + roles: previous.staging_roles, + }; + Self { + versions: vec![version], + old_versions: vec![], + update_trackers: UpdateTrackers { + ack_map: update_tracker.clone(), + sync_map: update_tracker.clone(), + sync_ack_map: update_tracker, + }, + staging: Lww::raw(previous.version, staging), + } + } + } +} + +pub use v010::*; + +// ---- utility functions ---- + +impl AutoCrdt for LayoutParameters { + const WARN_IF_DIFFERENT: bool = true; +} + +impl AutoCrdt for NodeRoleV { + const WARN_IF_DIFFERENT: bool = true; +} + +impl Crdt for LayoutStaging { + fn merge(&mut self, other: &LayoutStaging) { + self.parameters.merge(&other.parameters); + self.roles.merge(&other.roles); + } +} + +impl NodeRole { + pub fn capacity_string(&self) -> String { + match self.capacity { + Some(c) => ByteSize::b(c).to_string_as(false), + None => "gateway".to_string(), + } + } + + pub fn tags_string(&self) -> String { + self.tags.join(",") + } +} + +impl fmt::Display for ZoneRedundancy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ZoneRedundancy::Maximum => write!(f, "maximum"), + ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), + } + } +} + +impl core::str::FromStr for ZoneRedundancy { + type Err = &'static str; + fn from_str(s: &str) -> Result<Self, Self::Err> { + match s { + "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), + x => { + let v = x + .parse::<usize>() + .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; + Ok(ZoneRedundancy::AtLeast(v)) + } + } + } +} + +impl UpdateTracker { + fn merge(&mut self, other: &UpdateTracker) -> bool { + let mut changed = false; + for (k, v) in other.0.iter() { + if let Some(v_mut) = self.0.get_mut(k) { + if *v > *v_mut { + *v_mut = *v; + changed = true; + } + } else { + self.0.insert(*k, *v); + changed = true; + } + } + changed + } + + /// This bumps the update tracker for a given node up to the specified value. + /// This has potential impacts on the correctness of Garage and should only + /// be used in very specific circumstances. + pub fn set_max(&mut self, peer: Uuid, value: u64) -> bool { + match self.0.get_mut(&peer) { + Some(e) if *e < value => { + *e = value; + true + } + None => { + self.0.insert(peer, value); + true + } + _ => false, + } + } + + pub(crate) fn min_among(&self, storage_nodes: &[Uuid], min_version: u64) -> u64 { + storage_nodes + .iter() + .map(|x| self.get(x, min_version)) + .min() + .unwrap_or(min_version) + } + + pub fn get(&self, node: &Uuid, min_version: u64) -> u64 { + self.0.get(node).copied().unwrap_or(min_version) + } +} + +impl UpdateTrackers { + pub(crate) fn merge(&mut self, other: &UpdateTrackers) -> bool { + let c1 = self.ack_map.merge(&other.ack_map); + let c2 = self.sync_map.merge(&other.sync_map); + let c3 = self.sync_ack_map.merge(&other.sync_ack_map); + c1 || c2 || c3 + } +} diff --git a/src/rpc/layout/test.rs b/src/rpc/layout/test.rs new file mode 100644 index 00000000..fcbb9dfc --- /dev/null +++ b/src/rpc/layout/test.rs @@ -0,0 +1,158 @@ +use std::cmp::min; +use std::collections::HashMap; + +use garage_util::crdt::Crdt; +use garage_util::error::*; + +use crate::layout::*; +use crate::replication_mode::ReplicationFactor; + +// This function checks that the partition size S computed is at least better than the +// one given by a very naive algorithm. To do so, we try to run the naive algorithm +// assuming a partion size of S+1. If we succed, it means that the optimal assignment +// was not optimal. The naive algorithm is the following : +// - we compute the max number of partitions associated to every node, capped at the +// partition number. It gives the number of tokens of every node. +// - every zone has a number of tokens equal to the sum of the tokens of its nodes. +// - we cycle over the partitions and associate zone tokens while respecting the +// zone redundancy constraint. +// NOTE: the naive algorithm is not optimal. Counter example: +// take nb_partition = 3 ; replication_factor = 5; redundancy = 4; +// number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) +// With these parameters, the naive algo fails, whereas there is a solution: +// (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) +fn check_against_naive(cl: &LayoutVersion) -> Result<bool, Error> { + let over_size = cl.partition_size + 1; + let mut zone_token = HashMap::<String, usize>::new(); + + let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; + + if zones.is_empty() { + return Ok(false); + } + + for z in zones.iter() { + zone_token.insert(z.clone(), 0); + } + for uuid in cl.nongateway_nodes() { + let z = cl.expect_get_node_zone(&uuid); + let c = cl.expect_get_node_capacity(&uuid); + zone_token.insert( + z.to_string(), + zone_token[z] + min(NB_PARTITIONS, (c / over_size) as usize), + ); + } + + // For every partition, we count the number of zone already associated and + // the name of the last zone associated + + let mut id_zone_token = vec![0; zones.len()]; + for (z, t) in zone_token.iter() { + id_zone_token[zone_to_id[z]] = *t; + } + + let mut nb_token = vec![0; NB_PARTITIONS]; + let mut last_zone = vec![zones.len(); NB_PARTITIONS]; + + let mut curr_zone = 0; + + let redundancy = cl.effective_zone_redundancy(); + + for replic in 0..cl.replication_factor { + for p in 0..NB_PARTITIONS { + while id_zone_token[curr_zone] == 0 + || (last_zone[p] == curr_zone + && redundancy - nb_token[p] <= cl.replication_factor - replic) + { + curr_zone += 1; + if curr_zone >= zones.len() { + return Ok(true); + } + } + id_zone_token[curr_zone] -= 1; + if last_zone[p] != curr_zone { + nb_token[p] += 1; + last_zone[p] = curr_zone; + } + } + } + + return Ok(false); +} + +fn show_msg(msg: &Message) { + for s in msg.iter() { + println!("{}", s); + } +} + +fn update_layout( + cl: &mut LayoutHistory, + node_capacity_vec: &[u64], + node_zone_vec: &[&'static str], + zone_redundancy: usize, +) { + let staging = cl.staging.get_mut(); + + for (i, (capacity, zone)) in node_capacity_vec + .iter() + .zip(node_zone_vec.iter()) + .enumerate() + { + let node_id = [i as u8; 32].into(); + + let update = staging.roles.update_mutator( + node_id, + NodeRoleV(Some(NodeRole { + zone: zone.to_string(), + capacity: Some(*capacity), + tags: (vec![]), + })), + ); + staging.roles.merge(&update); + } + staging.parameters.update(LayoutParameters { + zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), + }); +} + +#[test] +fn test_assignment() { + let mut node_capacity_vec = vec![4000, 1000, 2000]; + let mut node_zone_vec = vec!["A", "B", "C"]; + + let mut cl = LayoutHistory::new(ReplicationFactor::new(3).unwrap()); + update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3); + let v = cl.current().version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(check_against_naive(cl.current()).unwrap()); + + node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; + node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"]; + update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 2); + let v = cl.current().version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(check_against_naive(cl.current()).unwrap()); + + node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; + update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 3); + let v = cl.current().version; + let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(check_against_naive(cl.current()).unwrap()); + + node_capacity_vec = vec![ + 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, + ]; + update_layout(&mut cl, &node_capacity_vec, &node_zone_vec, 1); + let v = cl.current().version; + let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); + show_msg(&msg); + assert_eq!(cl.check(), Ok(())); + assert!(check_against_naive(cl.current()).unwrap()); +} diff --git a/src/rpc/layout.rs b/src/rpc/layout/version.rs index e02a180b..ee4b2821 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout/version.rs @@ -1,375 +1,55 @@ -use std::cmp::Ordering; use std::collections::HashMap; use std::collections::HashSet; -use std::fmt; +use std::convert::TryInto; use bytesize::ByteSize; use itertools::Itertools; -use garage_util::crdt::{AutoCrdt, Crdt, Lww, LwwMap}; +use garage_util::crdt::{Crdt, LwwMap}; use garage_util::data::*; -use garage_util::encode::nonversioned_encode; use garage_util::error::*; -use crate::graph_algo::*; - -use crate::ring::*; - -use std::convert::TryInto; - -const NB_PARTITIONS: usize = 1usize << PARTITION_BITS; +use super::graph_algo::*; +use super::*; // The Message type will be used to collect information on the algorithm. -type Message = Vec<String>; - -mod v08 { - use crate::ring::CompactNodeType; - use garage_util::crdt::LwwMap; - use garage_util::data::{Hash, Uuid}; - use serde::{Deserialize, Serialize}; - - /// The layout of the cluster, i.e. the list of roles - /// which are assigned to each cluster node - #[derive(Clone, Debug, Serialize, Deserialize)] - pub struct ClusterLayout { - pub version: u64, - - pub replication_factor: usize, - pub roles: LwwMap<Uuid, NodeRoleV>, - - /// node_id_vec: a vector of node IDs with a role assigned - /// in the system (this includes gateway nodes). - /// The order here is different than the vec stored by `roles`, because: - /// 1. non-gateway nodes are first so that they have lower numbers - /// 2. nodes that don't have a role are excluded (but they need to - /// stay in the CRDT as tombstones) - pub node_id_vec: Vec<Uuid>, - /// the assignation of data partitions to node, the values - /// are indices in node_id_vec - #[serde(with = "serde_bytes")] - pub ring_assignation_data: Vec<CompactNodeType>, - - /// Role changes which are staged for the next version of the layout - pub staging: LwwMap<Uuid, NodeRoleV>, - pub staging_hash: Hash, - } - - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] - pub struct NodeRoleV(pub Option<NodeRole>); - - /// The user-assigned roles of cluster nodes - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] - pub struct NodeRole { - /// Datacenter at which this entry belong. This information is used to - /// perform a better geodistribution - pub zone: String, - /// The capacity of the node - /// If this is set to None, the node does not participate in storing data for the system - /// and is only active as an API gateway to other nodes - pub capacity: Option<u64>, - /// A set of tags to recognize the node - pub tags: Vec<String>, - } - - impl garage_util::migrate::InitialFormat for ClusterLayout {} -} - -mod v09 { - use super::v08; - use crate::ring::CompactNodeType; - use garage_util::crdt::{Lww, LwwMap}; - use garage_util::data::{Hash, Uuid}; - use serde::{Deserialize, Serialize}; - pub use v08::{NodeRole, NodeRoleV}; - - /// The layout of the cluster, i.e. the list of roles - /// which are assigned to each cluster node - #[derive(Clone, Debug, Serialize, Deserialize)] - pub struct ClusterLayout { - pub version: u64, - - pub replication_factor: usize, - - /// This attribute is only used to retain the previously computed partition size, - /// to know to what extent does it change with the layout update. - pub partition_size: u64, - /// Parameters used to compute the assignment currently given by - /// ring_assignment_data - pub parameters: LayoutParameters, - - pub roles: LwwMap<Uuid, NodeRoleV>, - - /// see comment in v08::ClusterLayout - pub node_id_vec: Vec<Uuid>, - /// see comment in v08::ClusterLayout - #[serde(with = "serde_bytes")] - pub ring_assignment_data: Vec<CompactNodeType>, - - /// Parameters to be used in the next partition assignment computation. - pub staging_parameters: Lww<LayoutParameters>, - /// Role changes which are staged for the next version of the layout - pub staging_roles: LwwMap<Uuid, NodeRoleV>, - pub staging_hash: Hash, - } +pub type Message = Vec<String>; - /// This struct is used to set the parameters to be used in the assignment computation - /// algorithm. It is stored as a Crdt. - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] - pub struct LayoutParameters { - pub zone_redundancy: ZoneRedundancy, - } - - /// Zone redundancy: if set to AtLeast(x), the layout calculation will aim to store copies - /// of each partition on at least that number of different zones. - /// Otherwise, copies will be stored on the maximum possible number of zones. - #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug, Serialize, Deserialize)] - pub enum ZoneRedundancy { - AtLeast(usize), - Maximum, - } - - impl garage_util::migrate::Migrate for ClusterLayout { - const VERSION_MARKER: &'static [u8] = b"G09layout"; - - type Previous = v08::ClusterLayout; - - fn migrate(previous: Self::Previous) -> Self { - use itertools::Itertools; - - // In the old layout, capacities are in an arbitrary unit, - // but in the new layout they are in bytes. - // Here we arbitrarily multiply everything by 1G, - // such that 1 old capacity unit = 1GB in the new units. - // This is totally arbitrary and won't work for most users. - let cap_mul = 1024 * 1024 * 1024; - let roles = multiply_all_capacities(previous.roles, cap_mul); - let staging_roles = multiply_all_capacities(previous.staging, cap_mul); - let node_id_vec = previous.node_id_vec; - - // Determine partition size - let mut tmp = previous.ring_assignation_data.clone(); - tmp.sort(); - let partition_size = tmp - .into_iter() - .dedup_with_count() - .map(|(npart, node)| { - roles - .get(&node_id_vec[node as usize]) - .and_then(|p| p.0.as_ref().and_then(|r| r.capacity)) - .unwrap_or(0) / npart as u64 - }) - .min() - .unwrap_or(0); - - // By default, zone_redundancy is maximum possible value - let parameters = LayoutParameters { - zone_redundancy: ZoneRedundancy::Maximum, - }; - - let mut res = Self { - version: previous.version, - replication_factor: previous.replication_factor, - partition_size, - parameters, - roles, - node_id_vec, - ring_assignment_data: previous.ring_assignation_data, - staging_parameters: Lww::new(parameters), - staging_roles, - staging_hash: [0u8; 32].into(), - }; - res.staging_hash = res.calculate_staging_hash(); - res - } - } - - fn multiply_all_capacities( - old_roles: LwwMap<Uuid, NodeRoleV>, - mul: u64, - ) -> LwwMap<Uuid, NodeRoleV> { - let mut new_roles = LwwMap::new(); - for (node, ts, role) in old_roles.items() { - let mut role = role.clone(); - if let NodeRoleV(Some(NodeRole { - capacity: Some(ref mut cap), - .. - })) = role - { - *cap *= mul; - } - new_roles.merge_raw(node, *ts, &role); - } - new_roles - } -} - -pub use v09::*; - -impl AutoCrdt for LayoutParameters { - const WARN_IF_DIFFERENT: bool = true; -} - -impl AutoCrdt for NodeRoleV { - const WARN_IF_DIFFERENT: bool = true; -} - -impl NodeRole { - pub fn capacity_string(&self) -> String { - match self.capacity { - Some(c) => ByteSize::b(c).to_string_as(false), - None => "gateway".to_string(), - } - } - - pub fn tags_string(&self) -> String { - self.tags.join(",") - } -} - -impl fmt::Display for ZoneRedundancy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - ZoneRedundancy::Maximum => write!(f, "maximum"), - ZoneRedundancy::AtLeast(x) => write!(f, "{}", x), - } - } -} - -impl core::str::FromStr for ZoneRedundancy { - type Err = &'static str; - fn from_str(s: &str) -> Result<Self, Self::Err> { - match s { - "none" | "max" | "maximum" => Ok(ZoneRedundancy::Maximum), - x => { - let v = x - .parse::<usize>() - .map_err(|_| "zone redundancy must be 'none'/'max' or an integer")?; - Ok(ZoneRedundancy::AtLeast(v)) - } - } - } -} - -// Implementation of the ClusterLayout methods unrelated to the assignment algorithm. -impl ClusterLayout { +impl LayoutVersion { pub fn new(replication_factor: usize) -> Self { // We set the default zone redundancy to be Maximum, meaning that the maximum // possible value will be used depending on the cluster topology let parameters = LayoutParameters { zone_redundancy: ZoneRedundancy::Maximum, }; - let staging_parameters = Lww::<LayoutParameters>::new(parameters); - - let empty_lwwmap = LwwMap::new(); - let mut ret = ClusterLayout { + LayoutVersion { version: 0, replication_factor, partition_size: 0, roles: LwwMap::new(), node_id_vec: Vec::new(), + nongateway_node_count: 0, ring_assignment_data: Vec::new(), parameters, - staging_parameters, - staging_roles: empty_lwwmap, - staging_hash: [0u8; 32].into(), - }; - ret.staging_hash = ret.calculate_staging_hash(); - ret - } - - fn calculate_staging_hash(&self) -> Hash { - let hashed_tuple = (&self.staging_roles, &self.staging_parameters); - blake2sum(&nonversioned_encode(&hashed_tuple).unwrap()[..]) - } - - pub fn merge(&mut self, other: &ClusterLayout) -> bool { - match other.version.cmp(&self.version) { - Ordering::Greater => { - *self = other.clone(); - true - } - Ordering::Equal => { - self.staging_parameters.merge(&other.staging_parameters); - self.staging_roles.merge(&other.staging_roles); - - let new_staging_hash = self.calculate_staging_hash(); - let changed = new_staging_hash != self.staging_hash; - - self.staging_hash = new_staging_hash; - - changed - } - Ordering::Less => false, } } - pub fn apply_staged_changes(mut self, version: Option<u64>) -> Result<(Self, Message), Error> { - match version { - None => { - let error = r#" -Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. -To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. - "#; - return Err(Error::Message(error.into())); - } - Some(v) => { - if v != self.version + 1 { - return Err(Error::Message("Invalid new layout version".into())); - } - } - } - - self.roles.merge(&self.staging_roles); - self.roles.retain(|(_, _, v)| v.0.is_some()); - self.parameters = *self.staging_parameters.get(); - - self.staging_roles.clear(); - self.staging_hash = self.calculate_staging_hash(); + // ===================== accessors ====================== - let msg = self.calculate_partition_assignment()?; - - self.version += 1; - - Ok((self, msg)) - } - - pub fn revert_staged_changes(mut self, version: Option<u64>) -> Result<Self, Error> { - match version { - None => { - let error = r#" -Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. -To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. - "#; - return Err(Error::Message(error.into())); - } - Some(v) => { - if v != self.version + 1 { - return Err(Error::Message("Invalid new layout version".into())); - } - } - } - - self.staging_roles.clear(); - self.staging_parameters.update(self.parameters); - self.staging_hash = self.calculate_staging_hash(); - - self.version += 1; - - Ok(self) - } - - /// Returns a list of IDs of nodes that currently have - /// a role in the cluster - pub fn node_ids(&self) -> &[Uuid] { + /// Returns a list of IDs of nodes that have a role in this + /// version of the cluster layout, including gateway nodes + pub fn all_nodes(&self) -> &[Uuid] { &self.node_id_vec[..] } - pub fn num_nodes(&self) -> usize { - self.node_id_vec.len() + /// Returns a list of IDs of nodes that have a storage capacity + /// assigned in this version of the cluster layout + pub fn nongateway_nodes(&self) -> &[Uuid] { + &self.node_id_vec[..self.nongateway_node_count] } - /// Returns the role of a node in the layout + /// Returns the role of a node in the layout, if it has one pub fn node_role(&self, node: &Uuid) -> Option<&NodeRole> { match self.roles.get(node) { Some(NodeRoleV(Some(v))) => Some(v), @@ -377,41 +57,23 @@ To know the correct value of the new layout version, invoke `garage layout show` } } - /// Returns the uuids of the non_gateway nodes in self.node_id_vec. - fn nongateway_nodes(&self) -> Vec<Uuid> { - let mut result = Vec::<Uuid>::new(); - for uuid in self.node_id_vec.iter() { - match self.node_role(uuid) { - Some(role) if role.capacity.is_some() => result.push(*uuid), - _ => (), - } - } - result - } - - /// Given a node uuids, this function returns the label of its zone - fn get_node_zone(&self, uuid: &Uuid) -> Result<String, Error> { - match self.node_role(uuid) { - Some(role) => Ok(role.zone.clone()), - _ => Err(Error::Message( - "The Uuid does not correspond to a node present in the cluster.".into(), - )), - } - } - - /// Given a node uuids, this function returns its capacity or fails if it does not have any - pub fn get_node_capacity(&self, uuid: &Uuid) -> Result<u64, Error> { + /// Returns the capacity of a node in the layout, if it has one + pub fn get_node_capacity(&self, uuid: &Uuid) -> Option<u64> { match self.node_role(uuid) { Some(NodeRole { capacity: Some(cap), zone: _, tags: _, - }) => Ok(*cap), - _ => Err(Error::Message( - "The Uuid does not correspond to a node present in the \ - cluster or this node does not have a positive capacity." - .into(), - )), + }) => Some(*cap), + _ => None, + } + } + + /// Given a node uuids, this function returns the label of its zone if it has one + pub fn get_node_zone(&self, uuid: &Uuid) -> Option<&str> { + match self.node_role(uuid) { + Some(role) => Some(&role.zone), + _ => None, } } @@ -435,17 +97,65 @@ To know the correct value of the new layout version, invoke `garage layout show` )) } + /// Get the partition in which data would fall on + pub fn partition_of(&self, position: &Hash) -> Partition { + let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap()); + top >> (16 - PARTITION_BITS) + } + + /// Get the list of partitions and the first hash of a partition key that would fall in it + pub fn partitions(&self) -> impl Iterator<Item = (Partition, Hash)> + '_ { + (0..(1 << PARTITION_BITS)).map(|i| { + let top = (i as u16) << (16 - PARTITION_BITS); + let mut location = [0u8; 32]; + location[..2].copy_from_slice(&u16::to_be_bytes(top)[..]); + (i as u16, Hash::from(location)) + }) + } + + /// Return the n servers in which data for this hash should be replicated + pub fn nodes_of(&self, position: &Hash, n: usize) -> impl Iterator<Item = Uuid> + '_ { + assert_eq!(n, self.replication_factor); + + let data = &self.ring_assignment_data; + + let partition_nodes = if data.len() == self.replication_factor * (1 << PARTITION_BITS) { + let partition_idx = self.partition_of(position) as usize; + let partition_start = partition_idx * self.replication_factor; + let partition_end = (partition_idx + 1) * self.replication_factor; + &data[partition_start..partition_end] + } else { + warn!("Ring not yet ready, read/writes will be lost!"); + &[] + }; + + partition_nodes + .iter() + .map(move |i| self.node_id_vec[*i as usize]) + } + + // ===================== internal information extractors ====================== + + pub(crate) fn expect_get_node_capacity(&self, uuid: &Uuid) -> u64 { + self.get_node_capacity(uuid) + .expect("non-gateway node with zero capacity") + } + + pub(crate) fn expect_get_node_zone(&self, uuid: &Uuid) -> &str { + self.get_node_zone(uuid).expect("node without a zone") + } + /// Returns the sum of capacities of non gateway nodes in the cluster - fn get_total_capacity(&self) -> Result<u64, Error> { + fn get_total_capacity(&self) -> u64 { let mut total_capacity = 0; - for uuid in self.nongateway_nodes().iter() { - total_capacity += self.get_node_capacity(uuid)?; + for uuid in self.nongateway_nodes() { + total_capacity += self.expect_get_node_capacity(uuid); } - Ok(total_capacity) + total_capacity } /// Returns the effective value of the zone_redundancy parameter - fn effective_zone_redundancy(&self) -> usize { + pub(crate) fn effective_zone_redundancy(&self) -> usize { match self.parameters.zone_redundancy { ZoneRedundancy::AtLeast(v) => v, ZoneRedundancy::Maximum => { @@ -465,10 +175,14 @@ To know the correct value of the new layout version, invoke `garage layout show` /// (assignment, roles, parameters, partition size) /// returns true if consistent, false if error pub fn check(&self) -> Result<(), String> { - // Check that the hash of the staging data is correct - let staging_hash = self.calculate_staging_hash(); - if staging_hash != self.staging_hash { - return Err("staging_hash is incorrect".into()); + // Check that the assignment data has the correct length + let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor; + if self.ring_assignment_data.len() != expected_assignment_data_len { + return Err(format!( + "ring_assignment_data has incorrect length {} instead of {}", + self.ring_assignment_data.len(), + expected_assignment_data_len + )); } // Check that node_id_vec contains the correct list of nodes @@ -486,16 +200,6 @@ To know the correct value of the new layout version, invoke `garage layout show` return Err(format!("node_id_vec does not contain the correct set of nodes\nnode_id_vec: {:?}\nexpected: {:?}", node_id_vec, expected_nodes)); } - // Check that the assignment data has the correct length - let expected_assignment_data_len = (1 << PARTITION_BITS) * self.replication_factor; - if self.ring_assignment_data.len() != expected_assignment_data_len { - return Err(format!( - "ring_assignment_data has incorrect length {} instead of {}", - self.ring_assignment_data.len(), - expected_assignment_data_len - )); - } - // Check that the assigned nodes are correct identifiers // of nodes that are assigned a role // and that role is not the role of a gateway nodes @@ -524,10 +228,7 @@ To know the correct value of the new layout version, invoke `garage layout show` // Check that every partition is spread over at least zone_redundancy zones. let zones_of_p = nodes_of_p .iter() - .map(|n| { - self.get_node_zone(&self.node_id_vec[*n as usize]) - .expect("Zone not found.") - }) + .map(|n| self.expect_get_node_zone(&self.node_id_vec[*n as usize])) .collect::<Vec<_>>(); if zones_of_p.iter().unique().count() < zone_redundancy { return Err(format!( @@ -546,7 +247,7 @@ To know the correct value of the new layout version, invoke `garage layout show` if *usage > 0 { let uuid = self.node_id_vec[n]; let partusage = usage * self.partition_size; - let nodecap = self.get_node_capacity(&uuid).unwrap(); + let nodecap = self.expect_get_node_capacity(&uuid); if partusage > nodecap { return Err(format!( "node usage ({}) is bigger than node capacity ({})", @@ -574,12 +275,24 @@ To know the correct value of the new layout version, invoke `garage layout show` Ok(()) } -} -// ==================================================================================== + // ================== updates to layout, internals =================== + + pub(crate) fn calculate_next_version( + mut self, + staging: &LayoutStaging, + ) -> Result<(Self, Message), Error> { + self.version += 1; + + self.roles.merge(&staging.roles); + self.roles.retain(|(_, _, v)| v.0.is_some()); + self.parameters = *staging.parameters.get(); + + let msg = self.calculate_partition_assignment()?; + + Ok((self, msg)) + } -// Implementation of the ClusterLayout methods related to the assignment algorithm. -impl ClusterLayout { /// This function calculates a new partition-to-node assignment. /// The computed assignment respects the node replication factor /// and the zone redundancy parameter It maximizes the capacity of a @@ -609,12 +322,12 @@ impl ClusterLayout { // to use them as indices in the flow graphs. let (id_to_zone, zone_to_id) = self.generate_nongateway_zone_ids()?; - let nb_nongateway_nodes = self.nongateway_nodes().len(); - if nb_nongateway_nodes < self.replication_factor { + if self.nongateway_nodes().len() < self.replication_factor { return Err(Error::Message(format!( "The number of nodes with positive \ capacity ({}) is smaller than the replication factor ({}).", - nb_nongateway_nodes, self.replication_factor + self.nongateway_nodes().len(), + self.replication_factor ))); } if id_to_zone.len() < zone_redundancy { @@ -712,12 +425,14 @@ impl ClusterLayout { .map(|(k, _, _)| *k) .collect(); - let mut new_node_id_vec = Vec::<Uuid>::new(); - new_node_id_vec.extend(new_non_gateway_nodes); - new_node_id_vec.extend(new_gateway_nodes); + let old_node_id_vec = std::mem::take(&mut self.node_id_vec); - let old_node_id_vec = self.node_id_vec.clone(); - self.node_id_vec = new_node_id_vec.clone(); + self.nongateway_node_count = new_non_gateway_nodes.len(); + self.node_id_vec.clear(); + self.node_id_vec.extend(new_non_gateway_nodes); + self.node_id_vec.extend(new_gateway_nodes); + + let new_node_id_vec = &self.node_id_vec; // (2) We retrieve the old association // We rewrite the old association with the new indices. We only consider partition @@ -756,7 +471,7 @@ impl ClusterLayout { } } - // We write the ring + // We clear the ring assignemnt data self.ring_assignment_data = Vec::<CompactNodeType>::new(); Ok(Some(old_assignment)) @@ -764,7 +479,9 @@ impl ClusterLayout { /// This function generates ids for the zone of the nodes appearing in /// self.node_id_vec. - fn generate_nongateway_zone_ids(&self) -> Result<(Vec<String>, HashMap<String, usize>), Error> { + pub(crate) fn generate_nongateway_zone_ids( + &self, + ) -> Result<(Vec<String>, HashMap<String, usize>), Error> { let mut id_to_zone = Vec::<String>::new(); let mut zone_to_id = HashMap::<String, usize>::new(); @@ -797,7 +514,7 @@ impl ClusterLayout { } let mut s_down = 1; - let mut s_up = self.get_total_capacity()?; + let mut s_up = self.get_total_capacity(); while s_down + 1 < s_up { g = self.generate_flow_graph( (s_down + s_up) / 2, @@ -846,7 +563,7 @@ impl ClusterLayout { zone_redundancy: usize, ) -> Result<Graph<FlowEdge>, Error> { let vertices = - ClusterLayout::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); + LayoutVersion::generate_graph_vertices(zone_to_id.len(), self.nongateway_nodes().len()); let mut g = Graph::<FlowEdge>::new(&vertices); let nb_zones = zone_to_id.len(); for p in 0..NB_PARTITIONS { @@ -866,8 +583,8 @@ impl ClusterLayout { } } for n in 0..self.nongateway_nodes().len() { - let node_capacity = self.get_node_capacity(&self.node_id_vec[n])?; - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[n])?]; + let node_capacity = self.expect_get_node_capacity(&self.node_id_vec[n]); + let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[n])]; g.add_edge(Vertex::N(n), Vertex::Sink, node_capacity / partition_size)?; for p in 0..NB_PARTITIONS { if !exclude_assoc.contains(&(p, n)) { @@ -913,7 +630,7 @@ impl ClusterLayout { // The algorithm is such that it will start with the flow that we just computed // and find ameliorating paths from that. for (p, n) in exclude_edge.iter() { - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])]; g.add_edge(Vertex::PZ(*p, node_zone), Vertex::N(*n), 1)?; } g.compute_maximal_flow()?; @@ -933,7 +650,7 @@ impl ClusterLayout { let mut cost = CostFunction::new(); for (p, assoc_p) in prev_assign.iter().enumerate() { for n in assoc_p.iter() { - let node_zone = zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]; + let node_zone = zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])]; cost.insert((Vertex::PZ(p, node_zone), Vertex::N(*n)), -1); } } @@ -988,7 +705,7 @@ impl ClusterLayout { let mut msg = Message::new(); let used_cap = self.partition_size * NB_PARTITIONS as u64 * self.replication_factor as u64; - let total_cap = self.get_total_capacity()?; + let total_cap = self.get_total_capacity(); let percent_cap = 100.0 * (used_cap as f32) / (total_cap as f32); msg.push(format!( "Usable capacity / total cluster capacity: {} / {} ({:.1} %)", @@ -1035,7 +752,7 @@ impl ClusterLayout { let mut old_zones_of_p = Vec::<usize>::new(); for n in prev_assign[p].iter() { old_zones_of_p - .push(zone_to_id[&self.get_node_zone(&self.node_id_vec[*n])?]); + .push(zone_to_id[self.expect_get_node_zone(&self.node_id_vec[*n])]); } if !old_zones_of_p.contains(&z) { new_partitions_zone[z] += 1; @@ -1077,7 +794,7 @@ impl ClusterLayout { for z in 0..id_to_zone.len() { let mut nodes_of_z = Vec::<usize>::new(); for n in 0..storing_nodes.len() { - if self.get_node_zone(&self.node_id_vec[n])? == id_to_zone[z] { + if self.expect_get_node_zone(&self.node_id_vec[n]) == id_to_zone[z] { nodes_of_z.push(n); } } @@ -1091,13 +808,13 @@ impl ClusterLayout { let available_cap_z: u64 = self.partition_size * replicated_partitions as u64; let mut total_cap_z = 0; for n in nodes_of_z.iter() { - total_cap_z += self.get_node_capacity(&self.node_id_vec[*n])?; + total_cap_z += self.expect_get_node_capacity(&self.node_id_vec[*n]); } let percent_cap_z = 100.0 * (available_cap_z as f32) / (total_cap_z as f32); for n in nodes_of_z.iter() { let available_cap_n = stored_partitions[*n] as u64 * self.partition_size; - let total_cap_n = self.get_node_capacity(&self.node_id_vec[*n])?; + let total_cap_n = self.expect_get_node_capacity(&self.node_id_vec[*n]); let tags_n = (self.node_role(&self.node_id_vec[*n]).ok_or("<??>"))?.tags_string(); table.push(format!( " {:?}\t{}\t{} ({} new)\t{}\t{} ({:.1}%)", @@ -1127,167 +844,3 @@ impl ClusterLayout { Ok(msg) } } - -// ==================================================================================== - -#[cfg(test)] -mod tests { - use super::{Error, *}; - use std::cmp::min; - - // This function checks that the partition size S computed is at least better than the - // one given by a very naive algorithm. To do so, we try to run the naive algorithm - // assuming a partion size of S+1. If we succed, it means that the optimal assignment - // was not optimal. The naive algorithm is the following : - // - we compute the max number of partitions associated to every node, capped at the - // partition number. It gives the number of tokens of every node. - // - every zone has a number of tokens equal to the sum of the tokens of its nodes. - // - we cycle over the partitions and associate zone tokens while respecting the - // zone redundancy constraint. - // NOTE: the naive algorithm is not optimal. Counter example: - // take nb_partition = 3 ; replication_factor = 5; redundancy = 4; - // number of tokens by zone : (A, 4), (B,1), (C,4), (D, 4), (E, 2) - // With these parameters, the naive algo fails, whereas there is a solution: - // (A,A,C,D,E) , (A,B,C,D,D) (A,C,C,D,E) - fn check_against_naive(cl: &ClusterLayout) -> Result<bool, Error> { - let over_size = cl.partition_size + 1; - let mut zone_token = HashMap::<String, usize>::new(); - - let (zones, zone_to_id) = cl.generate_nongateway_zone_ids()?; - - if zones.is_empty() { - return Ok(false); - } - - for z in zones.iter() { - zone_token.insert(z.clone(), 0); - } - for uuid in cl.nongateway_nodes().iter() { - let z = cl.get_node_zone(uuid)?; - let c = cl.get_node_capacity(uuid)?; - zone_token.insert( - z.clone(), - zone_token[&z] + min(NB_PARTITIONS, (c / over_size) as usize), - ); - } - - // For every partition, we count the number of zone already associated and - // the name of the last zone associated - - let mut id_zone_token = vec![0; zones.len()]; - for (z, t) in zone_token.iter() { - id_zone_token[zone_to_id[z]] = *t; - } - - let mut nb_token = vec![0; NB_PARTITIONS]; - let mut last_zone = vec![zones.len(); NB_PARTITIONS]; - - let mut curr_zone = 0; - - let redundancy = cl.effective_zone_redundancy(); - - for replic in 0..cl.replication_factor { - for p in 0..NB_PARTITIONS { - while id_zone_token[curr_zone] == 0 - || (last_zone[p] == curr_zone - && redundancy - nb_token[p] <= cl.replication_factor - replic) - { - curr_zone += 1; - if curr_zone >= zones.len() { - return Ok(true); - } - } - id_zone_token[curr_zone] -= 1; - if last_zone[p] != curr_zone { - nb_token[p] += 1; - last_zone[p] = curr_zone; - } - } - } - - return Ok(false); - } - - fn show_msg(msg: &Message) { - for s in msg.iter() { - println!("{}", s); - } - } - - fn update_layout( - cl: &mut ClusterLayout, - node_id_vec: &Vec<u8>, - node_capacity_vec: &Vec<u64>, - node_zone_vec: &Vec<String>, - zone_redundancy: usize, - ) { - for i in 0..node_id_vec.len() { - if let Some(x) = FixedBytes32::try_from(&[i as u8; 32]) { - cl.node_id_vec.push(x); - } - - let update = cl.staging_roles.update_mutator( - cl.node_id_vec[i], - NodeRoleV(Some(NodeRole { - zone: (node_zone_vec[i].to_string()), - capacity: (Some(node_capacity_vec[i])), - tags: (vec![]), - })), - ); - cl.staging_roles.merge(&update); - } - cl.staging_parameters.update(LayoutParameters { - zone_redundancy: ZoneRedundancy::AtLeast(zone_redundancy), - }); - cl.staging_hash = cl.calculate_staging_hash(); - } - - #[test] - fn test_assignment() { - let mut node_id_vec = vec![1, 2, 3]; - let mut node_capacity_vec = vec![4000, 1000, 2000]; - let mut node_zone_vec = vec!["A", "B", "C"] - .into_iter() - .map(|x| x.to_string()) - .collect(); - - let mut cl = ClusterLayout::new(3); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_id_vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9]; - node_capacity_vec = vec![4000, 1000, 1000, 3000, 1000, 1000, 2000, 10000, 2000]; - node_zone_vec = vec!["A", "B", "C", "C", "C", "B", "G", "H", "I"] - .into_iter() - .map(|x| x.to_string()) - .collect(); - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 2); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_capacity_vec = vec![4000, 1000, 2000, 7000, 1000, 1000, 2000, 10000, 2000]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 3); - let v = cl.version; - let (mut cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - - node_capacity_vec = vec![ - 4000000, 4000000, 2000000, 7000000, 1000000, 9000000, 2000000, 10000, 2000000, - ]; - update_layout(&mut cl, &node_id_vec, &node_capacity_vec, &node_zone_vec, 1); - let v = cl.version; - let (cl, msg) = cl.apply_staged_changes(Some(v + 1)).unwrap(); - show_msg(&msg); - assert_eq!(cl.check(), Ok(())); - assert!(matches!(check_against_naive(&cl), Ok(true))); - } -} diff --git a/src/rpc/lib.rs b/src/rpc/lib.rs index a5f8fc6e..b5b31c05 100644 --- a/src/rpc/lib.rs +++ b/src/rpc/lib.rs @@ -11,10 +11,8 @@ mod consul; #[cfg(feature = "kubernetes-discovery")] mod kubernetes; -pub mod graph_algo; pub mod layout; pub mod replication_mode; -pub mod ring; pub mod system; pub mod rpc_helper; diff --git a/src/rpc/replication_mode.rs b/src/rpc/replication_mode.rs index e244e063..a3a94085 100644 --- a/src/rpc/replication_mode.rs +++ b/src/rpc/replication_mode.rs @@ -1,57 +1,94 @@ -#[derive(Clone, Copy)] -pub enum ReplicationMode { - None, - TwoWay, - TwoWayDangerous, - ThreeWay, - ThreeWayDegraded, - ThreeWayDangerous, +use garage_util::config::Config; +use garage_util::crdt::AutoCrdt; +use garage_util::error::*; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[serde(transparent)] +pub struct ReplicationFactor(usize); + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Default, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ConsistencyMode { + /// Read- and Write-quorum are 1 + Dangerous, + /// Read-quorum is 1 + Degraded, + /// Read- and Write-quorum are determined for read-after-write-consistency + #[default] + Consistent, } -impl ReplicationMode { - pub fn parse(v: &str) -> Option<Self> { - match v { - "none" | "1" => Some(Self::None), - "2" => Some(Self::TwoWay), - "2-dangerous" => Some(Self::TwoWayDangerous), - "3" => Some(Self::ThreeWay), - "3-degraded" => Some(Self::ThreeWayDegraded), - "3-dangerous" => Some(Self::ThreeWayDangerous), - _ => None, - } +impl ConsistencyMode { + pub fn parse(s: &str) -> Option<Self> { + serde_json::from_value(serde_json::Value::String(s.to_string())).ok() } +} + +impl AutoCrdt for ConsistencyMode { + const WARN_IF_DIFFERENT: bool = true; +} - pub fn control_write_max_faults(&self) -> usize { - match self { - Self::None => 0, - _ => 1, +impl ReplicationFactor { + pub fn new(replication_factor: usize) -> Option<Self> { + if replication_factor < 1 { + None + } else { + Some(Self(replication_factor)) } } pub fn replication_factor(&self) -> usize { - match self { - Self::None => 1, - Self::TwoWay | Self::TwoWayDangerous => 2, - Self::ThreeWay | Self::ThreeWayDegraded | Self::ThreeWayDangerous => 3, - } + self.0 } - pub fn read_quorum(&self) -> usize { - match self { - Self::None => 1, - Self::TwoWay | Self::TwoWayDangerous => 1, - Self::ThreeWay => 2, - Self::ThreeWayDegraded | Self::ThreeWayDangerous => 1, + pub fn read_quorum(&self, consistency_mode: ConsistencyMode) -> usize { + match consistency_mode { + ConsistencyMode::Dangerous | ConsistencyMode::Degraded => 1, + ConsistencyMode::Consistent => self.replication_factor().div_ceil(2), } } - pub fn write_quorum(&self) -> usize { - match self { - Self::None => 1, - Self::TwoWay => 2, - Self::TwoWayDangerous => 1, - Self::ThreeWay | Self::ThreeWayDegraded => 2, - Self::ThreeWayDangerous => 1, + pub fn write_quorum(&self, consistency_mode: ConsistencyMode) -> usize { + match consistency_mode { + ConsistencyMode::Dangerous => 1, + ConsistencyMode::Degraded | ConsistencyMode::Consistent => { + (self.replication_factor() + 1) - self.read_quorum(ConsistencyMode::Consistent) + } } } } + +impl std::convert::From<ReplicationFactor> for usize { + fn from(replication_factor: ReplicationFactor) -> usize { + replication_factor.0 + } +} + +pub fn parse_replication_mode( + config: &Config, +) -> Result<(ReplicationFactor, ConsistencyMode), Error> { + match (&config.replication_mode, config.replication_factor, config.consistency_mode.as_str()) { + (Some(replication_mode), None, "consistent") => { + tracing::warn!("Legacy config option replication_mode in use. Please migrate to replication_factor and consistency_mode"); + let parsed_replication_mode = match replication_mode.as_str() { + "1" | "none" => Some((ReplicationFactor(1), ConsistencyMode::Consistent)), + "2" => Some((ReplicationFactor(2), ConsistencyMode::Consistent)), + "2-dangerous" => Some((ReplicationFactor(2), ConsistencyMode::Dangerous)), + "3" => Some((ReplicationFactor(3), ConsistencyMode::Consistent)), + "3-degraded" => Some((ReplicationFactor(3), ConsistencyMode::Degraded)), + "3-dangerous" => Some((ReplicationFactor(3), ConsistencyMode::Dangerous)), + _ => None, + }; + Some(parsed_replication_mode.ok_or_message("Invalid replication_mode in config file.")?) + }, + (None, Some(replication_factor), consistency_mode) => { + let replication_factor = ReplicationFactor::new(replication_factor) + .ok_or_message("Invalid replication_factor in config file.")?; + let consistency_mode = ConsistencyMode::parse(consistency_mode) + .ok_or_message("Invalid consistency_mode in config file.")?; + Some((replication_factor, consistency_mode)) + } + _ => None, + }.ok_or_message("Either the legacy replication_mode or replication_level and consistency_mode can be set, not both.") +} diff --git a/src/rpc/ring.rs b/src/rpc/ring.rs deleted file mode 100644 index 6a2e5c72..00000000 --- a/src/rpc/ring.rs +++ /dev/null @@ -1,164 +0,0 @@ -//! Module containing types related to computing nodes which should receive a copy of data blocks -//! and metadata -use std::convert::TryInto; - -use garage_util::data::*; - -use crate::layout::ClusterLayout; - -/// A partition id, which is stored on 16 bits -/// i.e. we have up to 2**16 partitions. -/// (in practice we have exactly 2**PARTITION_BITS partitions) -pub type Partition = u16; - -// TODO: make this constant parametrizable in the config file -// For deployments with many nodes it might make sense to bump -// it up to 10. -// Maximum value : 16 -/// How many bits from the hash are used to make partitions. Higher numbers means more fairness in -/// presence of numerous nodes, but exponentially bigger ring. Max 16 -pub const PARTITION_BITS: usize = 8; - -const PARTITION_MASK_U16: u16 = ((1 << PARTITION_BITS) - 1) << (16 - PARTITION_BITS); - -/// A ring distributing fairly objects to nodes -#[derive(Clone)] -pub struct Ring { - /// The replication factor for this ring - pub replication_factor: usize, - - /// The network configuration used to generate this ring - pub layout: ClusterLayout, - - // Internal order of nodes used to make a more compact representation of the ring - nodes: Vec<Uuid>, - - // The list of entries in the ring - ring: Vec<RingEntry>, -} - -// Type to store compactly the id of a node in the system -// Change this to u16 the day we want to have more than 256 nodes in a cluster -pub type CompactNodeType = u8; -pub const MAX_NODE_NUMBER: usize = 256; - -// The maximum number of times an object might get replicated -// This must be at least 3 because Garage supports 3-way replication -// Here we use 6 so that the size of a ring entry is 8 bytes -// (2 bytes partition id, 6 bytes node numbers as u8s) -const MAX_REPLICATION: usize = 6; - -/// An entry in the ring -#[derive(Clone, Debug)] -struct RingEntry { - // The two first bytes of the first hash that goes in this partition - // (the next bytes are zeroes) - hash_prefix: u16, - // The nodes that store this partition, stored as a list of positions in the `nodes` - // field of the Ring structure - // Only items 0 up to ring.replication_factor - 1 are used, others are zeros - nodes_buf: [CompactNodeType; MAX_REPLICATION], -} - -impl Ring { - pub(crate) fn new(layout: ClusterLayout, replication_factor: usize) -> Self { - if replication_factor != layout.replication_factor { - warn!("Could not build ring: replication factor does not match between local configuration and network role assignment."); - return Self::empty(layout, replication_factor); - } - - if layout.ring_assignment_data.len() != replication_factor * (1 << PARTITION_BITS) { - warn!("Could not build ring: network role assignment data has invalid length"); - return Self::empty(layout, replication_factor); - } - - let nodes = layout.node_id_vec.clone(); - let ring = (0..(1 << PARTITION_BITS)) - .map(|i| { - let top = (i as u16) << (16 - PARTITION_BITS); - let mut nodes_buf = [0u8; MAX_REPLICATION]; - nodes_buf[..replication_factor].copy_from_slice( - &layout.ring_assignment_data - [replication_factor * i..replication_factor * (i + 1)], - ); - RingEntry { - hash_prefix: top, - nodes_buf, - } - }) - .collect::<Vec<_>>(); - - Self { - replication_factor, - layout, - nodes, - ring, - } - } - - fn empty(layout: ClusterLayout, replication_factor: usize) -> Self { - Self { - replication_factor, - layout, - nodes: vec![], - ring: vec![], - } - } - - /// Get the partition in which data would fall on - pub fn partition_of(&self, position: &Hash) -> Partition { - let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap()); - top >> (16 - PARTITION_BITS) - } - - /// Get the list of partitions and the first hash of a partition key that would fall in it - pub fn partitions(&self) -> Vec<(Partition, Hash)> { - let mut ret = vec![]; - - for (i, entry) in self.ring.iter().enumerate() { - let mut location = [0u8; 32]; - location[..2].copy_from_slice(&u16::to_be_bytes(entry.hash_prefix)[..]); - ret.push((i as u16, location.into())); - } - if !ret.is_empty() { - assert_eq!(ret[0].1, [0u8; 32].into()); - } - - ret - } - - /// Walk the ring to find the n servers in which data should be replicated - pub fn get_nodes(&self, position: &Hash, n: usize) -> Vec<Uuid> { - if self.ring.len() != 1 << PARTITION_BITS { - warn!("Ring not yet ready, read/writes will be lost!"); - return vec![]; - } - - let partition_idx = self.partition_of(position) as usize; - let partition = &self.ring[partition_idx]; - - let top = u16::from_be_bytes(position.as_slice()[0..2].try_into().unwrap()); - // Check that we haven't messed up our partition table, i.e. that this partition - // table entrey indeed corresponds to the item we are storing - assert_eq!( - partition.hash_prefix & PARTITION_MASK_U16, - top & PARTITION_MASK_U16 - ); - - assert!(n <= self.replication_factor); - partition.nodes_buf[..n] - .iter() - .map(|i| self.nodes[*i as usize]) - .collect::<Vec<_>>() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_ring_entry_size() { - assert_eq!(std::mem::size_of::<RingEntry>(), 8); - } -} diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index c46e577f..977c6ed8 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -1,12 +1,12 @@ //! Contain structs related to making RPCs -use std::sync::Arc; +use std::collections::HashMap; +use std::sync::{Arc, RwLock}; use std::time::Duration; use futures::future::join_all; use futures::stream::futures_unordered::FuturesUnordered; use futures::stream::StreamExt; use tokio::select; -use tokio::sync::watch; use opentelemetry::KeyValue; use opentelemetry::{ @@ -26,8 +26,8 @@ use garage_util::data::*; use garage_util::error::Error; use garage_util::metrics::RecordDuration; +use crate::layout::{LayoutHelper, LayoutHistory}; use crate::metrics::RpcMetrics; -use crate::ring::Ring; // Default RPC timeout = 5 minutes const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300); @@ -36,11 +36,11 @@ const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300); #[derive(Copy, Clone)] pub struct RequestStrategy { /// Min number of response to consider the request successful - pub rs_quorum: Option<usize>, - /// Should requests be dropped after enough response are received - pub rs_interrupt_after_quorum: bool, + rs_quorum: Option<usize>, + /// Send all requests at once + rs_send_all_at_once: Option<bool>, /// Request priority - pub rs_priority: RequestPriority, + rs_priority: RequestPriority, /// Custom timeout for this request rs_timeout: Timeout, } @@ -57,7 +57,7 @@ impl RequestStrategy { pub fn with_priority(prio: RequestPriority) -> Self { RequestStrategy { rs_quorum: None, - rs_interrupt_after_quorum: false, + rs_send_all_at_once: None, rs_priority: prio, rs_timeout: Timeout::Default, } @@ -67,10 +67,9 @@ impl RequestStrategy { self.rs_quorum = Some(quorum); self } - /// Set if requests can be dropped after quorum has been reached - /// In general true for read requests, and false for write - pub fn interrupt_after_quorum(mut self, interrupt: bool) -> Self { - self.rs_interrupt_after_quorum = interrupt; + /// Set quorum to be reached for request + pub fn send_all_at_once(mut self, value: bool) -> Self { + self.rs_send_all_at_once = Some(value); self } /// Deactivate timeout for this request @@ -91,7 +90,7 @@ pub struct RpcHelper(Arc<RpcHelperInner>); struct RpcHelperInner { our_node_id: Uuid, peering: Arc<PeeringManager>, - ring: watch::Receiver<Arc<Ring>>, + layout: Arc<RwLock<LayoutHelper>>, metrics: RpcMetrics, rpc_timeout: Duration, } @@ -100,7 +99,7 @@ impl RpcHelper { pub(crate) fn new( our_node_id: Uuid, peering: Arc<PeeringManager>, - ring: watch::Receiver<Arc<Ring>>, + layout: Arc<RwLock<LayoutHelper>>, rpc_timeout: Option<Duration>, ) -> Self { let metrics = RpcMetrics::new(); @@ -108,7 +107,7 @@ impl RpcHelper { Self(Arc::new(RpcHelperInner { our_node_id, peering, - ring, + layout, metrics, rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT), })) @@ -130,6 +129,12 @@ impl RpcHelper { N: IntoReq<M> + Send, H: StreamingEndpointHandler<M>, { + let tracer = opentelemetry::global::tracer("garage"); + let span_name = format!("RPC [{}] to {:?}", endpoint.path(), to); + let mut span = tracer.start(span_name); + span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id))); + span.set_attribute(KeyValue::new("to", format!("{:?}", to))); + let metric_tags = [ KeyValue::new("rpc_endpoint", endpoint.path().to_string()), KeyValue::new("from", format!("{:?}", self.0.our_node_id)), @@ -141,6 +146,7 @@ impl RpcHelper { let node_id = to.into(); let rpc_call = endpoint .call_streaming(&node_id, msg, strat.rs_priority) + .with_context(Context::current_with_span(span)) .record_duration(&self.0.metrics.rpc_duration, &metric_tags); let timeout = async { @@ -183,12 +189,17 @@ impl RpcHelper { N: IntoReq<M>, H: StreamingEndpointHandler<M>, { + let tracer = opentelemetry::global::tracer("garage"); + let span_name = format!("RPC [{}] call_many {} nodes", endpoint.path(), to.len()); + let span = tracer.start(span_name); + let msg = msg.into_req().map_err(garage_net::error::Error::from)?; let resps = join_all( to.iter() .map(|to| self.call(endpoint, *to, msg.clone(), strat)), ) + .with_context(Context::current_with_span(span)) .await; Ok(to .iter() @@ -220,6 +231,22 @@ impl RpcHelper { /// Make a RPC call to multiple servers, returning either a Vec of responses, /// or an error if quorum could not be reached due to too many errors + /// + /// If RequestStrategy has send_all_at_once set, then all requests will be + /// sent at once, and `try_call_many` will return as soon as a quorum of + /// responses is achieved, dropping and cancelling the remaining requests. + /// + /// Otherwise, `quorum` requests will be sent at the same time, and if an + /// error response is received, a new request will be sent to replace it. + /// The ordering of nodes to which requests are sent is determined by + /// the `RpcHelper::request_order` function, which takes into account + /// parameters such as node zones and measured ping values. + /// + /// In both cases, the basic contract of this function is that even in the + /// absence of failures, the RPC call might not be driven to completion + /// on all of the specified nodes. It is therefore unfit for broadcast + /// write operations where we expect all nodes to successfully store + /// the written date. pub async fn try_call_many<M, N, H, S>( &self, endpoint: &Arc<Endpoint<M, H>>, @@ -236,31 +263,24 @@ impl RpcHelper { let quorum = strategy.rs_quorum.unwrap_or(to.len()); let tracer = opentelemetry::global::tracer("garage"); - let span_name = if strategy.rs_interrupt_after_quorum { - format!("RPC {} to {} of {}", endpoint.path(), quorum, to.len()) - } else { - format!( - "RPC {} to {} (quorum {})", - endpoint.path(), - to.len(), - quorum - ) - }; + let span_name = format!( + "RPC [{}] try_call_many (quorum {}/{})", + endpoint.path(), + quorum, + to.len() + ); + let mut span = tracer.start(span_name); span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id))); span.set_attribute(KeyValue::new("to", format!("{:?}", to))); span.set_attribute(KeyValue::new("quorum", quorum as i64)); - span.set_attribute(KeyValue::new( - "interrupt_after_quorum", - strategy.rs_interrupt_after_quorum.to_string(), - )); - self.try_call_many_internal(endpoint, to, msg, strategy, quorum) + self.try_call_many_inner(endpoint, to, msg, strategy, quorum) .with_context(Context::current_with_span(span)) .await } - async fn try_call_many_internal<M, N, H, S>( + async fn try_call_many_inner<M, N, H, S>( &self, endpoint: &Arc<Endpoint<M, H>>, to: &[Uuid], @@ -274,129 +294,238 @@ impl RpcHelper { H: StreamingEndpointHandler<M> + 'static, S: Send + 'static, { - let msg = msg.into_req().map_err(garage_net::error::Error::from)?; + // Once quorum is reached, other requests don't matter. + // What we do here is only send the required number of requests + // to reach a quorum, priorizing nodes with the lowest latency. + // When there are errors, we start new requests to compensate. + + // TODO: this could be made more aggressive, e.g. if after 2x the + // average ping of a given request, the response is not yet received, + // preemptively send an additional request to any remaining nodes. + + // Reorder requests to priorize closeness / low latency + let request_order = self.request_order(&self.0.layout.read().unwrap(), to.iter().copied()); + let send_all_at_once = strategy.rs_send_all_at_once.unwrap_or(false); // Build future for each request // They are not started now: they are added below in a FuturesUnordered // object that will take care of polling them (see below) - let requests = to.iter().cloned().map(|to| { + let msg = msg.into_req().map_err(garage_net::error::Error::from)?; + let mut requests = request_order.into_iter().map(|to| { let self2 = self.clone(); let msg = msg.clone(); let endpoint2 = endpoint.clone(); - (to, async move { - self2.call(&endpoint2, to, msg, strategy).await - }) + async move { self2.call(&endpoint2, to, msg, strategy).await } }); // Vectors in which success results and errors will be collected let mut successes = vec![]; let mut errors = vec![]; - if strategy.rs_interrupt_after_quorum { - // Case 1: once quorum is reached, other requests don't matter. - // What we do here is only send the required number of requests - // to reach a quorum, priorizing nodes with the lowest latency. - // When there are errors, we start new requests to compensate. - - // Reorder requests to priorize closeness / low latency - let request_order = self.request_order(to); - let mut ord_requests = vec![(); request_order.len()] - .into_iter() - .map(|_| None) - .collect::<Vec<_>>(); - for (to, fut) in requests { - let i = request_order.iter().position(|x| *x == to).unwrap(); - ord_requests[i] = Some((to, fut)); + // resp_stream will contain all of the requests that are currently in flight. + // (for the moment none, they will be added in the loop below) + let mut resp_stream = FuturesUnordered::new(); + + // Do some requests and collect results + while successes.len() < quorum { + // If the current set of requests that are running is not enough to possibly + // reach quorum, start some new requests. + while send_all_at_once || successes.len() + resp_stream.len() < quorum { + if let Some(fut) = requests.next() { + resp_stream.push(fut) + } else { + break; + } + } + + if successes.len() + resp_stream.len() < quorum { + // We know we won't ever reach quorum + break; } - // Make an iterator to take requests in their sorted order - let mut requests = ord_requests.into_iter().map(Option::unwrap); - - // resp_stream will contain all of the requests that are currently in flight. - // (for the moment none, they will be added in the loop below) - let mut resp_stream = FuturesUnordered::new(); - - // Do some requests and collect results - 'request_loop: while successes.len() < quorum { - // If the current set of requests that are running is not enough to possibly - // reach quorum, start some new requests. - while successes.len() + resp_stream.len() < quorum { - if let Some((req_to, fut)) = requests.next() { - let tracer = opentelemetry::global::tracer("garage"); - let span = tracer.start(format!("RPC to {:?}", req_to)); - resp_stream.push(tokio::spawn( - fut.with_context(Context::current_with_span(span)), - )); - } else { - // If we have no request to add, we know that we won't ever - // reach quorum: bail out now. - break 'request_loop; - } + // Wait for one request to terminate + match resp_stream.next().await.unwrap() { + Ok(msg) => { + successes.push(msg); } - assert!(!resp_stream.is_empty()); // because of loop invariants - - // Wait for one request to terminate - match resp_stream.next().await.unwrap().unwrap() { - Ok(msg) => { - successes.push(msg); - } - Err(e) => { - errors.push(e); - } + Err(e) => { + errors.push(e); } } + } + + if successes.len() >= quorum { + Ok(successes) } else { - // Case 2: all of the requests need to be sent in all cases, - // and need to terminate. (this is the case for writes that - // must be spread to n nodes) - // Just start all the requests in parallel and return as soon - // as the quorum is reached. - let mut resp_stream = requests - .map(|(_, fut)| fut) - .collect::<FuturesUnordered<_>>(); - - while let Some(resp) = resp_stream.next().await { - match resp { - Ok(msg) => { - successes.push(msg); - if successes.len() >= quorum { - break; - } - } - Err(e) => { - errors.push(e); - } - } - } + let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>(); + Err(Error::Quorum( + quorum, + None, + successes.len(), + to.len(), + errors, + )) + } + } + + /// Make a RPC call to multiple servers, returning either a Vec of responses, + /// or an error if quorum could not be reached due to too many errors + /// + /// Contrary to try_call_many, this fuction is especially made for broadcast + /// write operations. In particular: + /// + /// - The request are sent to all specified nodes as soon as `try_write_many_sets` + /// is invoked. + /// + /// - When `try_write_many_sets` returns, all remaining requests that haven't + /// completed move to a background task so that they have a chance to + /// complete successfully if there are no failures. + /// + /// In addition, the nodes to which requests should be sent are divided in + /// "quorum sets", and `try_write_many_sets` only returns once a quorum + /// has been validated in each set. This is used in the case of cluster layout + /// changes, where data has to be written both in the old layout and in the + /// new one as long as all nodes have not successfully tranisitionned and + /// moved all data to the new layout. + pub async fn try_write_many_sets<M, N, H, S>( + &self, + endpoint: &Arc<Endpoint<M, H>>, + to_sets: &[Vec<Uuid>], + msg: N, + strategy: RequestStrategy, + ) -> Result<Vec<S>, Error> + where + M: Rpc<Response = Result<S, Error>> + 'static, + N: IntoReq<M>, + H: StreamingEndpointHandler<M> + 'static, + S: Send + 'static, + { + let quorum = strategy + .rs_quorum + .expect("internal error: missing quorum value in try_write_many_sets"); + + let tracer = opentelemetry::global::tracer("garage"); + let span_name = format!( + "RPC [{}] try_write_many_sets (quorum {} in {} sets)", + endpoint.path(), + quorum, + to_sets.len() + ); + + let mut span = tracer.start(span_name); + span.set_attribute(KeyValue::new("from", format!("{:?}", self.0.our_node_id))); + span.set_attribute(KeyValue::new("to", format!("{:?}", to_sets))); + span.set_attribute(KeyValue::new("quorum", quorum as i64)); + + self.try_write_many_sets_inner(endpoint, to_sets, msg, strategy, quorum) + .with_context(Context::current_with_span(span)) + .await + } + + async fn try_write_many_sets_inner<M, N, H, S>( + &self, + endpoint: &Arc<Endpoint<M, H>>, + to_sets: &[Vec<Uuid>], + msg: N, + strategy: RequestStrategy, + quorum: usize, + ) -> Result<Vec<S>, Error> + where + M: Rpc<Response = Result<S, Error>> + 'static, + N: IntoReq<M>, + H: StreamingEndpointHandler<M> + 'static, + S: Send + 'static, + { + // Peers may appear in many quorum sets. Here, build a list of peers, + // mapping to the index of the quorum sets in which they appear. + let mut result_tracker = QuorumSetResultTracker::new(to_sets, quorum); + + // Send one request to each peer of the quorum sets + let msg = msg.into_req().map_err(garage_net::error::Error::from)?; + let requests = result_tracker.nodes.keys().map(|peer| { + let self2 = self.clone(); + let msg = msg.clone(); + let endpoint2 = endpoint.clone(); + let to = *peer; + async move { (to, self2.call(&endpoint2, to, msg, strategy).await) } + }); + let mut resp_stream = requests.collect::<FuturesUnordered<_>>(); + + // Drive requests to completion + while let Some((node, resp)) = resp_stream.next().await { + // Store the response in the correct vector and increment the + // appropriate counters + result_tracker.register_result(node, resp); - if !resp_stream.is_empty() { - // Continue remaining requests in background. - // Note: these requests can get interrupted on process shutdown, - // we must not count on them being executed for certain. - // For all background things that have to happen with certainty, - // they have to be put in a proper queue that is persisted to disk. + // If we have a quorum of ok in all quorum sets, then it's a success! + if result_tracker.all_quorums_ok() { + // Continue all other requets in background tokio::spawn(async move { - resp_stream.collect::<Vec<Result<_, _>>>().await; + resp_stream.collect::<Vec<(Uuid, Result<_, _>)>>().await; }); + + return Ok(result_tracker.success_values()); + } + + // If there is a quorum set for which too many errors were received, + // we know it's impossible to get a quorum, so return immediately. + if result_tracker.too_many_failures() { + break; } } - if successes.len() >= quorum { - Ok(successes) - } else { - let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>(); - Err(Error::Quorum(quorum, successes.len(), to.len(), errors)) + // At this point, there is no quorum and we know that a quorum + // will never be achieved. Currently, we drop all remaining requests. + // Should we still move them to background so that they can continue + // for non-failed nodes? Not doing so has no impact on correctness, + // but it means that more cancellation messages will be sent. Idk. + // (When an in-progress request future is dropped, Netapp automatically + // sends a cancellation message to the remote node to inform it that + // the result is no longer needed. In turn, if the remote node receives + // the cancellation message in time, it interrupts the task of the + // running request handler.) + + // Failure, could not get quorum + Err(result_tracker.quorum_error()) + } + + // ---- functions not related to MAKING RPCs, but just determining to what nodes + // they should be made and in which order ---- + + pub fn block_read_nodes_of(&self, position: &Hash, rpc_helper: &RpcHelper) -> Vec<Uuid> { + let layout = self.0.layout.read().unwrap(); + + let mut ret = Vec::with_capacity(12); + let ver_iter = layout + .versions + .iter() + .rev() + .chain(layout.old_versions.iter().rev()); + for ver in ver_iter { + if ver.version > layout.sync_map_min() { + continue; + } + let nodes = ver.nodes_of(position, ver.replication_factor); + for node in rpc_helper.request_order(&layout, nodes) { + if !ret.contains(&node) { + ret.push(node); + } + } } + ret } - pub fn request_order(&self, nodes: &[Uuid]) -> Vec<Uuid> { + fn request_order( + &self, + layout: &LayoutHistory, + nodes: impl Iterator<Item = Uuid>, + ) -> Vec<Uuid> { // Retrieve some status variables that we will use to sort requests let peer_list = self.0.peering.get_peer_list(); - let ring: Arc<Ring> = self.0.ring.borrow().clone(); - let our_zone = match ring.layout.node_role(&self.0.our_node_id) { - Some(pc) => &pc.zone, - None => "", - }; + let our_zone = layout + .current() + .get_node_zone(&self.0.our_node_id) + .unwrap_or(""); // Augment requests with some information used to sort them. // The tuples are as follows: @@ -405,22 +534,18 @@ impl RpcHelper { // By sorting this vec, we priorize ourself, then nodes in the same zone, // and within a same zone we priorize nodes with the lowest latency. let mut nodes = nodes - .iter() .map(|to| { - let peer_zone = match ring.layout.node_role(to) { - Some(pc) => &pc.zone, - None => "", - }; + let peer_zone = layout.current().get_node_zone(&to).unwrap_or(""); let peer_avg_ping = peer_list .iter() .find(|x| x.id.as_ref() == to.as_slice()) .and_then(|pi| pi.avg_ping) .unwrap_or_else(|| Duration::from_secs(10)); ( - *to != self.0.our_node_id, + to != self.0.our_node_id, peer_zone != our_zone, peer_avg_ping, - *to, + to, ) }) .collect::<Vec<_>>(); @@ -434,3 +559,108 @@ impl RpcHelper { .collect::<Vec<_>>() } } + +// ------- utility for tracking successes/errors among write sets -------- + +pub struct QuorumSetResultTracker<S, E> { + /// The set of nodes and the index of the quorum sets they belong to + pub nodes: HashMap<Uuid, Vec<usize>>, + /// The quorum value, i.e. number of success responses to await in each set + pub quorum: usize, + + /// The success responses received + pub successes: Vec<(Uuid, S)>, + /// The error responses received + pub failures: Vec<(Uuid, E)>, + + /// The counters for successes in each set + pub success_counters: Box<[usize]>, + /// The counters for failures in each set + pub failure_counters: Box<[usize]>, + /// The total number of nodes in each set + pub set_lens: Box<[usize]>, +} + +impl<S, E> QuorumSetResultTracker<S, E> +where + E: std::fmt::Display, +{ + pub fn new<A>(sets: &[A], quorum: usize) -> Self + where + A: AsRef<[Uuid]>, + { + let mut nodes = HashMap::<Uuid, Vec<usize>>::new(); + for (i, set) in sets.iter().enumerate() { + for node in set.as_ref().iter() { + nodes.entry(*node).or_default().push(i); + } + } + + let num_nodes = nodes.len(); + Self { + nodes, + quorum, + successes: Vec::with_capacity(num_nodes), + failures: vec![], + success_counters: vec![0; sets.len()].into_boxed_slice(), + failure_counters: vec![0; sets.len()].into_boxed_slice(), + set_lens: sets + .iter() + .map(|x| x.as_ref().len()) + .collect::<Vec<_>>() + .into_boxed_slice(), + } + } + + pub fn register_result(&mut self, node: Uuid, result: Result<S, E>) { + match result { + Ok(s) => { + self.successes.push((node, s)); + for set in self.nodes.get(&node).unwrap().iter() { + self.success_counters[*set] += 1; + } + } + Err(e) => { + self.failures.push((node, e)); + for set in self.nodes.get(&node).unwrap().iter() { + self.failure_counters[*set] += 1; + } + } + } + } + + pub fn all_quorums_ok(&self) -> bool { + self.success_counters + .iter() + .all(|ok_cnt| *ok_cnt >= self.quorum) + } + + pub fn too_many_failures(&self) -> bool { + self.failure_counters + .iter() + .zip(self.set_lens.iter()) + .any(|(err_cnt, set_len)| *err_cnt + self.quorum > *set_len) + } + + pub fn success_values(self) -> Vec<S> { + self.successes + .into_iter() + .map(|(_, x)| x) + .collect::<Vec<_>>() + } + + pub fn quorum_error(self) -> Error { + let errors = self + .failures + .iter() + .map(|(n, e)| format!("{:?}: {}", n, e)) + .collect::<Vec<_>>(); + Error::Quorum( + self.quorum, + Some(self.set_lens.len()), + self.successes.len(), + self.nodes.len(), + errors, + ) + } +} diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 9e475717..54d589d2 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -1,9 +1,9 @@ //! Module containing structs related to membership management -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::io::{Read, Write}; use std::net::{IpAddr, SocketAddr}; use std::path::{Path, PathBuf}; -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, RwLock, RwLockReadGuard}; use std::time::{Duration, Instant}; use arc_swap::ArcSwapOption; @@ -12,8 +12,7 @@ use futures::join; use serde::{Deserialize, Serialize}; use sodiumoxide::crypto::sign::ed25519; use tokio::select; -use tokio::sync::watch; -use tokio::sync::Mutex; +use tokio::sync::{watch, Notify}; use garage_net::endpoint::{Endpoint, EndpointHandler}; use garage_net::message::*; @@ -33,9 +32,10 @@ use garage_util::time::*; use crate::consul::ConsulDiscovery; #[cfg(feature = "kubernetes-discovery")] use crate::kubernetes::*; -use crate::layout::*; +use crate::layout::{ + self, manager::LayoutManager, LayoutHelper, LayoutHistory, NodeRoleV, RpcLayoutDigest, +}; use crate::replication_mode::*; -use crate::ring::*; use crate::rpc_helper::*; use crate::system_metrics::*; @@ -46,10 +46,10 @@ const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10); /// Version tag used for version check upon Netapp connection. /// Cluster nodes with different version tags are deemed /// incompatible and will refuse to connect. -pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650008; // garage 0x0008 +pub const GARAGE_VERSION_TAG: u64 = 0x676172616765000A; // garage 0x000A /// RPC endpoint used for calls related to membership -pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc"; +pub const SYSTEM_RPC_PATH: &str = "garage_rpc/system.rs/SystemRpc"; /// RPC messages related to membership #[derive(Debug, Serialize, Deserialize, Clone)] @@ -58,17 +58,22 @@ pub enum SystemRpc { Ok, /// Request to connect to a specific node (in <pubkey>@<host>:<port> format, pubkey = full-length node ID) Connect(String), - /// Ask other node its cluster layout. Answered with AdvertiseClusterLayout - PullClusterLayout, /// Advertise Garage status. Answered with another AdvertiseStatus. /// Exchanged with every node on a regular basis. AdvertiseStatus(NodeStatus), - /// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout - AdvertiseClusterLayout(ClusterLayout), /// Get known nodes states GetKnownNodes, /// Return known nodes ReturnKnownNodes(Vec<KnownNodeInfo>), + + /// Ask other node its cluster layout. Answered with AdvertiseClusterLayout + PullClusterLayout, + /// Advertisement of cluster layout. Sent spontanously or in response to PullClusterLayout + AdvertiseClusterLayout(LayoutHistory), + /// Ask other node its cluster layout update trackers. + PullClusterLayoutTrackers, + /// Advertisement of cluster layout update trackers. + AdvertiseClusterLayoutTrackers(layout::UpdateTrackers), } impl Rpc for SystemRpc { @@ -84,7 +89,6 @@ pub struct System { /// The id of this node pub id: Uuid, - persist_cluster_layout: Persister<ClusterLayout>, persist_peer_list: Persister<PeerList>, pub(crate) local_status: RwLock<NodeStatus>, @@ -92,9 +96,8 @@ pub struct System { pub netapp: Arc<NetApp>, peering: Arc<PeeringManager>, - pub rpc: RpcHelper, - system_endpoint: Arc<Endpoint<SystemRpc, System>>, + pub(crate) system_endpoint: Arc<Endpoint<SystemRpc, System>>, rpc_listen_addr: SocketAddr, rpc_public_addr: Option<SocketAddr>, @@ -105,14 +108,11 @@ pub struct System { #[cfg(feature = "kubernetes-discovery")] kubernetes_discovery: Option<KubernetesDiscoveryConfig>, - metrics: ArcSwapOption<SystemMetrics>, + pub layout_manager: Arc<LayoutManager>, - replication_mode: ReplicationMode, - pub(crate) replication_factor: usize, + metrics: ArcSwapOption<SystemMetrics>, - /// The ring - pub ring: watch::Receiver<Arc<Ring>>, - update_ring: Mutex<watch::Sender<Arc<Ring>>>, + pub(crate) replication_factor: ReplicationFactor, /// Path to metadata directory pub metadata_dir: PathBuf, @@ -123,14 +123,13 @@ pub struct System { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct NodeStatus { /// Hostname of the node - pub hostname: String, + pub hostname: Option<String>, /// Replication factor configured on the node pub replication_factor: usize, - /// Cluster layout version - pub cluster_layout_version: u64, - /// Hash of cluster layout staging data - pub cluster_layout_staging_hash: Hash, + + /// Cluster layout digest + pub layout_digest: RpcLayoutDigest, /// Disk usage on partition containing metadata directory (tuple: `(avail, total)`) #[serde(default)] @@ -243,11 +242,11 @@ impl System { /// Create this node's membership manager pub fn new( network_key: NetworkKey, - replication_mode: ReplicationMode, + replication_factor: ReplicationFactor, + consistency_mode: ConsistencyMode, config: &Config, ) -> Result<Arc<Self>, Error> { - let replication_factor = replication_mode.replication_factor(); - + // ---- setup netapp RPC protocol ---- let node_key = gen_node_key(&config.metadata_dir).expect("Unable to read or generate node ID"); info!( @@ -255,83 +254,39 @@ impl System { hex::encode(&node_key.public_key()[..8]) ); - let persist_cluster_layout: Persister<ClusterLayout> = - Persister::new(&config.metadata_dir, "cluster_layout"); - let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list"); - - let cluster_layout = match persist_cluster_layout.load() { - Ok(x) => { - if x.replication_factor != replication_factor { - return Err(Error::Message(format!( - "Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", - x.replication_factor, - replication_factor - ))); - } - x - } - Err(e) => { - info!( - "No valid previous cluster layout stored ({}), starting fresh.", - e - ); - ClusterLayout::new(replication_factor) - } - }; - - let mut local_status = NodeStatus::initial(replication_factor, &cluster_layout); - local_status.update_disk_usage(&config.metadata_dir, &config.data_dir); + let bind_outgoing_to = Some(config) + .filter(|x| x.rpc_bind_outgoing) + .map(|x| x.rpc_bind_addr.ip()); + let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key, bind_outgoing_to); + let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into()); - let ring = Ring::new(cluster_layout, replication_factor); - let (update_ring, ring) = watch::channel(Arc::new(ring)); - - let rpc_public_addr = match &config.rpc_public_addr { - Some(a_str) => { - use std::net::ToSocketAddrs; - match a_str.to_socket_addrs() { - Err(e) => { - error!( - "Cannot resolve rpc_public_addr {} from config file: {}.", - a_str, e - ); - None - } - Ok(a) => { - let a = a.collect::<Vec<_>>(); - if a.is_empty() { - error!("rpc_public_addr {} resolve to no known IP address", a_str); - } - if a.len() > 1 { - warn!("Multiple possible resolutions for rpc_public_addr: {:?}. Taking the first one.", a); - } - a.into_iter().next() - } - } - } - None => { - let addr = - get_default_ip().map(|ip| SocketAddr::new(ip, config.rpc_bind_addr.port())); - if let Some(a) = addr { - warn!("Using autodetected rpc_public_addr: {}. Consider specifying it explicitly in configuration file if possible.", a); - } - addr - } - }; + // ---- setup netapp public listener and full mesh peering strategy ---- + let rpc_public_addr = get_rpc_public_addr(config); if rpc_public_addr.is_none() { warn!("This Garage node does not know its publicly reachable RPC address, this might hamper intra-cluster communication."); } - let bind_outgoing_to = Some(config) - .filter(|x| x.rpc_bind_outgoing) - .map(|x| x.rpc_bind_addr.ip()); - let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key, bind_outgoing_to); let peering = PeeringManager::new(netapp.clone(), vec![], rpc_public_addr); if let Some(ping_timeout) = config.rpc_ping_timeout_msec { peering.set_ping_timeout_millis(ping_timeout); } - let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into()); + let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list"); + + // ---- setup cluster layout and layout manager ---- + let layout_manager = LayoutManager::new( + config, + netapp.id, + system_endpoint.clone(), + peering.clone(), + replication_factor, + consistency_mode, + )?; + let mut local_status = NodeStatus::initial(replication_factor, &layout_manager); + local_status.update_disk_usage(&config.metadata_dir, &config.data_dir); + + // ---- if enabled, set up additionnal peer discovery methods ---- #[cfg(feature = "consul-discovery")] let consul_discovery = match &config.consul_discovery { Some(cfg) => Some( @@ -350,22 +305,15 @@ impl System { warn!("Kubernetes discovery is not enabled in this build."); } + // ---- almost done ---- let sys = Arc::new(System { id: netapp.id.into(), - persist_cluster_layout, persist_peer_list, local_status: RwLock::new(local_status), node_status: RwLock::new(HashMap::new()), netapp: netapp.clone(), peering: peering.clone(), - rpc: RpcHelper::new( - netapp.id.into(), - peering, - ring.clone(), - config.rpc_timeout_msec.map(Duration::from_millis), - ), system_endpoint, - replication_mode, replication_factor, rpc_listen_addr: config.rpc_bind_addr, rpc_public_addr, @@ -374,10 +322,9 @@ impl System { consul_discovery, #[cfg(feature = "kubernetes-discovery")] kubernetes_discovery: config.kubernetes_discovery.clone(), + layout_manager, metrics: ArcSwapOption::new(None), - ring, - update_ring: Mutex::new(update_ring), metadata_dir: config.metadata_dir.clone(), data_dir: config.data_dir.clone(), }); @@ -409,6 +356,20 @@ impl System { self.metrics.store(None); } + // ---- Public utilities / accessors ---- + + pub fn cluster_layout(&self) -> RwLockReadGuard<'_, LayoutHelper> { + self.layout_manager.layout() + } + + pub fn layout_notify(&self) -> Arc<Notify> { + self.layout_manager.change_notify.clone() + } + + pub fn rpc_helper(&self) -> &RpcHelper { + &self.layout_manager.rpc_helper + } + // ---- Administrative operations (directly available and // also available through RPC) ---- @@ -435,18 +396,6 @@ impl System { known_nodes } - pub fn get_cluster_layout(&self) -> ClusterLayout { - self.ring.borrow().layout.clone() - } - - pub async fn update_cluster_layout( - self: &Arc<Self>, - layout: &ClusterLayout, - ) -> Result<(), Error> { - self.handle_advertise_cluster_layout(layout).await?; - Ok(()) - } - pub async fn connect(&self, node: &str) -> Result<(), Error> { let (pubkey, addrs) = parse_and_resolve_peer_addr_async(node) .await @@ -476,47 +425,65 @@ impl System { } pub fn health(&self) -> ClusterHealth { - let ring: Arc<_> = self.ring.borrow().clone(); - let quorum = self.replication_mode.write_quorum(); - let replication_factor = self.replication_factor; - + let quorum = self + .replication_factor + .write_quorum(ConsistencyMode::Consistent); + + // Gather information about running nodes. + // Technically, `nodes` contains currently running nodes, as well + // as nodes that this Garage process has been connected to at least + // once since it started. let nodes = self .get_known_nodes() .into_iter() .map(|n| (n.id, n)) .collect::<HashMap<Uuid, _>>(); let connected_nodes = nodes.iter().filter(|(_, n)| n.is_up).count(); + let node_up = |x: &Uuid| nodes.get(x).map(|n| n.is_up).unwrap_or(false); + + // Acquire a rwlock read-lock to the current cluster layout + let layout = self.cluster_layout(); + + // Obtain information about nodes that have a role as storage nodes + // in one of the active layout versions + let mut storage_nodes = HashSet::<Uuid>::with_capacity(16); + for ver in layout.versions.iter() { + storage_nodes.extend( + ver.roles + .items() + .iter() + .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_some())) + .map(|(n, _, _)| *n), + ) + } + let storage_nodes_ok = storage_nodes.iter().filter(|x| node_up(x)).count(); + + // Determine the number of partitions that have: + // - a quorum of up nodes for all write sets (i.e. are available) + // - for which all nodes in all write sets are up (i.e. are fully healthy) + let partitions = layout.current().partitions().collect::<Vec<_>>(); + let mut partitions_quorum = 0; + let mut partitions_all_ok = 0; + for (_, hash) in partitions.iter() { + let mut write_sets = layout + .versions + .iter() + .map(|x| x.nodes_of(hash, x.replication_factor)); + let has_quorum = write_sets + .clone() + .all(|set| set.filter(|x| node_up(x)).count() >= quorum); + let all_ok = write_sets.all(|mut set| set.all(|x| node_up(&x))); + if has_quorum { + partitions_quorum += 1; + } + if all_ok { + partitions_all_ok += 1; + } + } - let storage_nodes = ring - .layout - .roles - .items() - .iter() - .filter(|(_, _, v)| matches!(v, NodeRoleV(Some(r)) if r.capacity.is_some())) - .collect::<Vec<_>>(); - let storage_nodes_ok = storage_nodes - .iter() - .filter(|(x, _, _)| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) - .count(); - - let partitions = ring.partitions(); - let partitions_n_up = partitions - .iter() - .map(|(_, h)| { - let pn = ring.get_nodes(h, ring.replication_factor); - pn.iter() - .filter(|x| nodes.get(x).map(|n| n.is_up).unwrap_or(false)) - .count() - }) - .collect::<Vec<usize>>(); - let partitions_all_ok = partitions_n_up - .iter() - .filter(|c| **c == replication_factor) - .count(); - let partitions_quorum = partitions_n_up.iter().filter(|c| **c >= quorum).count(); - + // Determine overall cluster status let status = - if partitions_quorum == partitions.len() && storage_nodes_ok == storage_nodes.len() { + if partitions_all_ok == partitions.len() && storage_nodes_ok == storage_nodes.len() { ClusterHealthStatus::Healthy } else if partitions_quorum == partitions.len() { ClusterHealthStatus::Degraded @@ -553,7 +520,7 @@ impl System { } }; - let hostname = self.local_status.read().unwrap().hostname.clone(); + let hostname = self.local_status.read().unwrap().hostname.clone().unwrap(); if let Err(e) = c .publish_consul_service(self.netapp.id, &hostname, rpc_public_addr) .await @@ -577,30 +544,16 @@ impl System { } }; - let hostname = self.local_status.read().unwrap().hostname.clone(); + let hostname = self.local_status.read().unwrap().hostname.clone().unwrap(); if let Err(e) = publish_kubernetes_node(k, self.netapp.id, &hostname, rpc_public_addr).await { error!("Error while publishing node to Kubernetes: {}", e); } } - /// Save network configuration to disc - async fn save_cluster_layout(&self) -> Result<(), Error> { - let ring: Arc<Ring> = self.ring.borrow().clone(); - self.persist_cluster_layout - .save_async(&ring.layout) - .await - .expect("Cannot save current cluster layout"); - Ok(()) - } - fn update_local_status(&self) { let mut local_status = self.local_status.write().unwrap(); - - let ring = self.ring.borrow(); - local_status.cluster_layout_version = ring.layout.version; - local_status.cluster_layout_staging_hash = ring.layout.staging_hash; - + local_status.layout_digest = self.layout_manager.layout().digest(); local_status.update_disk_usage(&self.metadata_dir, &self.data_dir); } @@ -611,11 +564,6 @@ impl System { Ok(SystemRpc::Ok) } - fn handle_pull_cluster_layout(&self) -> SystemRpc { - let ring = self.ring.borrow().clone(); - SystemRpc::AdvertiseClusterLayout(ring.layout.clone()) - } - fn handle_get_known_nodes(&self) -> SystemRpc { let known_nodes = self.get_known_nodes(); SystemRpc::ReturnKnownNodes(known_nodes) @@ -635,11 +583,8 @@ impl System { std::process::exit(1); } - if info.cluster_layout_version > local_info.cluster_layout_version - || info.cluster_layout_staging_hash != local_info.cluster_layout_staging_hash - { - tokio::spawn(self.clone().pull_cluster_layout(from)); - } + self.layout_manager + .handle_advertise_status(from, &info.layout_digest); drop(local_info); @@ -651,57 +596,6 @@ impl System { Ok(SystemRpc::Ok) } - async fn handle_advertise_cluster_layout( - self: &Arc<Self>, - adv: &ClusterLayout, - ) -> Result<SystemRpc, Error> { - if adv.replication_factor != self.replication_factor { - let msg = format!( - "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.", - adv.replication_factor, - self.replication_factor - ); - error!("{}", msg); - return Err(Error::Message(msg)); - } - - let update_ring = self.update_ring.lock().await; - let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); - - let prev_layout_check = layout.check().is_ok(); - if layout.merge(adv) { - if prev_layout_check && layout.check().is_err() { - error!("New cluster layout is invalid, discarding."); - return Err(Error::Message( - "New cluster layout is invalid, discarding.".into(), - )); - } - - let ring = Ring::new(layout.clone(), self.replication_factor); - update_ring.send(Arc::new(ring))?; - drop(update_ring); - - let self2 = self.clone(); - tokio::spawn(async move { - if let Err(e) = self2 - .rpc - .broadcast( - &self2.system_endpoint, - SystemRpc::AdvertiseClusterLayout(layout), - RequestStrategy::with_priority(PRIO_HIGH), - ) - .await - { - warn!("Error while broadcasting new cluster layout: {}", e); - } - }); - - self.save_cluster_layout().await?; - } - - Ok(SystemRpc::Ok) - } - async fn status_exchange_loop(&self, mut stop_signal: watch::Receiver<bool>) { while !*stop_signal.borrow() { let restart_at = Instant::now() + STATUS_EXCHANGE_INTERVAL; @@ -711,7 +605,7 @@ impl System { let local_status: NodeStatus = self.local_status.read().unwrap().clone(); let _ = self - .rpc + .rpc_helper() .broadcast( &self.system_endpoint, SystemRpc::AdvertiseStatus(local_status), @@ -736,10 +630,9 @@ impl System { .filter(|p| p.is_up()) .count(); - let not_configured = self.ring.borrow().layout.check().is_err(); - let no_peers = n_connected < self.replication_factor; - - let expected_n_nodes = self.ring.borrow().layout.num_nodes(); + let not_configured = self.cluster_layout().check().is_err(); + let no_peers = n_connected < self.replication_factor.into(); + let expected_n_nodes = self.cluster_layout().all_nodes().len(); let bad_peers = n_connected != expected_n_nodes; if not_configured || no_peers || bad_peers { @@ -791,8 +684,8 @@ impl System { // If the layout is configured, and we already have some connections // to other nodes in the cluster, we can skip trying to connect to // nodes that are not in the cluster layout. - let ring = self.ring.borrow(); - ping_list.retain(|(id, _)| ring.layout.node_ids().contains(&(*id).into())); + let layout = self.cluster_layout(); + ping_list.retain(|(id, _)| layout.all_nodes().contains(&(*id).into())); } for (node_id, node_addr) in ping_list { @@ -847,48 +740,49 @@ impl System { .save_async(&PeerList(peer_list)) .await } - - async fn pull_cluster_layout(self: Arc<Self>, peer: Uuid) { - let resp = self - .rpc - .call( - &self.system_endpoint, - peer, - SystemRpc::PullClusterLayout, - RequestStrategy::with_priority(PRIO_HIGH), - ) - .await; - if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp { - let _: Result<_, _> = self.handle_advertise_cluster_layout(&layout).await; - } - } } #[async_trait] impl EndpointHandler<SystemRpc> for System { async fn handle(self: &Arc<Self>, msg: &SystemRpc, from: NodeID) -> Result<SystemRpc, Error> { match msg { + // ---- system functions -> System ---- SystemRpc::Connect(node) => self.handle_connect(node).await, - SystemRpc::PullClusterLayout => Ok(self.handle_pull_cluster_layout()), SystemRpc::AdvertiseStatus(adv) => self.handle_advertise_status(from.into(), adv).await, + SystemRpc::GetKnownNodes => Ok(self.handle_get_known_nodes()), + + // ---- layout functions -> LayoutManager ---- + SystemRpc::PullClusterLayout => Ok(self.layout_manager.handle_pull_cluster_layout()), SystemRpc::AdvertiseClusterLayout(adv) => { - self.clone().handle_advertise_cluster_layout(adv).await + self.layout_manager + .handle_advertise_cluster_layout(adv) + .await } - SystemRpc::GetKnownNodes => Ok(self.handle_get_known_nodes()), + SystemRpc::PullClusterLayoutTrackers => { + Ok(self.layout_manager.handle_pull_cluster_layout_trackers()) + } + SystemRpc::AdvertiseClusterLayoutTrackers(adv) => { + self.layout_manager + .handle_advertise_cluster_layout_trackers(adv) + .await + } + + // ---- other -> Error ---- m => Err(Error::unexpected_rpc_message(m)), } } } impl NodeStatus { - fn initial(replication_factor: usize, layout: &ClusterLayout) -> Self { + fn initial(replication_factor: ReplicationFactor, layout_manager: &LayoutManager) -> Self { NodeStatus { - hostname: gethostname::gethostname() - .into_string() - .unwrap_or_else(|_| "<invalid utf-8>".to_string()), - replication_factor, - cluster_layout_version: layout.version, - cluster_layout_staging_hash: layout.staging_hash, + hostname: Some( + gethostname::gethostname() + .into_string() + .unwrap_or_else(|_| "<invalid utf-8>".to_string()), + ), + replication_factor: replication_factor.into(), + layout_digest: layout_manager.layout().digest(), meta_disk_avail: None, data_disk_avail: None, } @@ -896,10 +790,9 @@ impl NodeStatus { fn unknown() -> Self { NodeStatus { - hostname: "?".to_string(), + hostname: None, replication_factor: 0, - cluster_layout_version: 0, - cluster_layout_staging_hash: Hash::from([0u8; 32]), + layout_digest: Default::default(), meta_disk_avail: None, data_disk_avail: None, } @@ -952,6 +845,40 @@ fn get_default_ip() -> Option<IpAddr> { .map(|a| a.ip()) } +fn get_rpc_public_addr(config: &Config) -> Option<SocketAddr> { + match &config.rpc_public_addr { + Some(a_str) => { + use std::net::ToSocketAddrs; + match a_str.to_socket_addrs() { + Err(e) => { + error!( + "Cannot resolve rpc_public_addr {} from config file: {}.", + a_str, e + ); + None + } + Ok(a) => { + let a = a.collect::<Vec<_>>(); + if a.is_empty() { + error!("rpc_public_addr {} resolve to no known IP address", a_str); + } + if a.len() > 1 { + warn!("Multiple possible resolutions for rpc_public_addr: {:?}. Taking the first one.", a); + } + a.into_iter().next() + } + } + } + None => { + let addr = get_default_ip().map(|ip| SocketAddr::new(ip, config.rpc_bind_addr.port())); + if let Some(a) = addr { + warn!("Using autodetected rpc_public_addr: {}. Consider specifying it explicitly in configuration file if possible.", a); + } + addr + } + } +} + async fn resolve_peers(peers: &[String]) -> Vec<(NodeID, SocketAddr)> { let mut ret = vec![]; diff --git a/src/rpc/system_metrics.rs b/src/rpc/system_metrics.rs index ffbef6df..a64daec8 100644 --- a/src/rpc/system_metrics.rs +++ b/src/rpc/system_metrics.rs @@ -3,7 +3,6 @@ use std::time::{Duration, Instant}; use opentelemetry::{global, metrics::*, KeyValue}; -use crate::ring::Ring; use crate::system::{ClusterHealthStatus, System}; /// TableMetrics reference all counter used for metrics @@ -69,7 +68,7 @@ impl SystemMetrics { let replication_factor = system.replication_factor; meter .u64_value_observer("garage_replication_factor", move |observer| { - observer.observe(replication_factor as u64, &[]) + observer.observe(replication_factor.replication_factor() as u64, &[]) }) .with_description("Garage replication factor setting") .init() @@ -215,14 +214,14 @@ impl SystemMetrics { let system = system.clone(); meter .u64_value_observer("cluster_layout_node_connected", move |observer| { - let ring: Arc<Ring> = system.ring.borrow().clone(); + let layout = system.cluster_layout(); let nodes = system.get_known_nodes(); - for (id, _, config) in ring.layout.roles.items().iter() { - if let Some(role) = &config.0 { - let mut kv = vec![ - KeyValue::new("id", format!("{:?}", id)), - KeyValue::new("role_zone", role.zone.clone()), - ]; + for id in layout.all_nodes().iter() { + let mut kv = vec![KeyValue::new("id", format!("{:?}", id))]; + if let Some(role) = + layout.current().roles.get(id).and_then(|r| r.0.as_ref()) + { + kv.push(KeyValue::new("role_zone", role.zone.clone())); match role.capacity { Some(cap) => { kv.push(KeyValue::new("role_capacity", cap as i64)); @@ -232,24 +231,24 @@ impl SystemMetrics { kv.push(KeyValue::new("role_gateway", 1)); } } + } - let value; - if let Some(node) = nodes.iter().find(|n| n.id == *id) { - value = if node.is_up { 1 } else { 0 }; + let value; + if let Some(node) = nodes.iter().find(|n| n.id == *id) { // TODO: if we add address and hostname, and those change, we // get duplicate metrics, due to bad otel aggregation :( // Can probably be fixed when we upgrade opentelemetry // kv.push(KeyValue::new("address", node.addr.to_string())); // kv.push(KeyValue::new( - // "hostname", - // node.status.hostname.clone(), + // "hostname", + // node.status.hostname.clone(), // )); - } else { - value = 0; - } - - observer.observe(value, &kv); + value = if node.is_up { 1 } else { 0 }; + } else { + value = 0; } + + observer.observe(value, &kv); } }) .with_description("Connection status for nodes in the cluster layout") @@ -259,14 +258,14 @@ impl SystemMetrics { let system = system.clone(); meter .u64_value_observer("cluster_layout_node_disconnected_time", move |observer| { - let ring: Arc<Ring> = system.ring.borrow().clone(); + let layout = system.cluster_layout(); let nodes = system.get_known_nodes(); - for (id, _, config) in ring.layout.roles.items().iter() { - if let Some(role) = &config.0 { - let mut kv = vec![ - KeyValue::new("id", format!("{:?}", id)), - KeyValue::new("role_zone", role.zone.clone()), - ]; + for id in layout.all_nodes().iter() { + let mut kv = vec![KeyValue::new("id", format!("{:?}", id))]; + if let Some(role) = + layout.current().roles.get(id).and_then(|r| r.0.as_ref()) + { + kv.push(KeyValue::new("role_zone", role.zone.clone())); match role.capacity { Some(cap) => { kv.push(KeyValue::new("role_capacity", cap as i64)); @@ -276,19 +275,19 @@ impl SystemMetrics { kv.push(KeyValue::new("role_gateway", 1)); } } + } - if let Some(node) = nodes.iter().find(|n| n.id == *id) { - // TODO: see comment above - // kv.push(KeyValue::new("address", node.addr.to_string())); - // kv.push(KeyValue::new( - // "hostname", - // node.status.hostname.clone(), - // )); - if node.is_up { - observer.observe(0, &kv); - } else if let Some(secs) = node.last_seen_secs_ago { - observer.observe(secs, &kv); - } + if let Some(node) = nodes.iter().find(|n| n.id == *id) { + // TODO: see comment above + // kv.push(KeyValue::new("address", node.addr.to_string())); + // kv.push(KeyValue::new( + // "hostname", + // node.status.hostname.clone(), + // )); + if node.is_up { + observer.observe(0, &kv); + } else if let Some(secs) = node.last_seen_secs_ago { + observer.observe(secs, &kv); } } } diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml index 61b8a82e..cac17da6 100644 --- a/src/table/Cargo.toml +++ b/src/table/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_table" -version = "0.9.3" +version = "0.10.0" authors = ["Alex Auvolat <alex@adnab.me>"] edition = "2018" license = "AGPL-3.0" diff --git a/src/table/data.rs b/src/table/data.rs index bbfdf58b..7f6b7847 100644 --- a/src/table/data.rs +++ b/src/table/data.rs @@ -254,7 +254,8 @@ impl<F: TableSchema, R: TableReplication> TableData<F, R> { // of the GC algorithm, as in all cases GC is suspended if // any node of the partition is unavailable. let pk_hash = Hash::try_from(&tree_key[..32]).unwrap(); - let nodes = self.replication.write_nodes(&pk_hash); + // TODO: this probably breaks when the layout changes + let nodes = self.replication.storage_nodes(&pk_hash); if nodes.first() == Some(&self.system.id) { GcTodoEntry::new(tree_key, new_bytes_hash).save(&self.gc_todo)?; } diff --git a/src/table/gc.rs b/src/table/gc.rs index 5b9124a7..ef788749 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -152,7 +152,7 @@ impl<F: TableSchema, R: TableReplication> TableGc<F, R> { let mut partitions = HashMap::new(); for entry in entries { let pkh = Hash::try_from(&entry.key[..32]).unwrap(); - let mut nodes = self.data.replication.write_nodes(&pkh); + let mut nodes = self.data.replication.storage_nodes(&pkh); nodes.retain(|x| *x != self.system.id); nodes.sort(); @@ -227,10 +227,10 @@ impl<F: TableSchema, R: TableReplication> TableGc<F, R> { // GC'ing is not a critical function of the system, so it's not a big // deal if we can't do it right now. self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, - &nodes[..], + &nodes, GcRpc::Update(updates), RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()), ) @@ -248,10 +248,10 @@ impl<F: TableSchema, R: TableReplication> TableGc<F, R> { // it means that the garbage collection wasn't completed and has // to be retried later. self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, - &nodes[..], + &nodes, GcRpc::DeleteIfEqualHash(deletes), RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()), ) diff --git a/src/table/merkle.rs b/src/table/merkle.rs index 4577f872..01271c58 100644 --- a/src/table/merkle.rs +++ b/src/table/merkle.rs @@ -13,7 +13,7 @@ use garage_util::data::*; use garage_util::encode::{nonversioned_decode, nonversioned_encode}; use garage_util::error::Error; -use garage_rpc::ring::*; +use garage_rpc::layout::*; use crate::data::*; use crate::replication::*; diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index 18682ace..1e52bb47 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -1,24 +1,36 @@ use std::sync::Arc; -use garage_rpc::ring::*; +use garage_rpc::layout::*; use garage_rpc::system::System; use garage_util::data::*; use crate::replication::*; +// TODO: find a way to track layout changes for this as well +// The hard thing is that this data is stored also on gateway nodes, +// whereas sharded data is stored only on non-Gateway nodes (storage nodes) +// Also we want to be more tolerant to failures of gateways so we don't +// want to do too much holding back of data when progress of gateway +// nodes is not reported in the layout history's ack/sync/sync_ack maps. + /// Full replication schema: all nodes store everything -/// Writes are disseminated in an epidemic manner in the network /// Advantage: do all reads locally, extremely fast /// Inconvenient: only suitable to reasonably small tables +/// Inconvenient: if some writes fail, nodes will read outdated data #[derive(Clone)] pub struct TableFullReplication { /// The membership manager of this node pub system: Arc<System>, - /// Max number of faults allowed while replicating a record - pub max_faults: usize, } impl TableReplication for TableFullReplication { + type WriteSets = Vec<Vec<Uuid>>; + + fn storage_nodes(&self, _hash: &Hash) -> Vec<Uuid> { + let layout = self.system.cluster_layout(); + layout.current().all_nodes().to_vec() + } + fn read_nodes(&self, _hash: &Hash) -> Vec<Uuid> { vec![self.system.id] } @@ -26,26 +38,36 @@ impl TableReplication for TableFullReplication { 1 } - fn write_nodes(&self, _hash: &Hash) -> Vec<Uuid> { - let ring = self.system.ring.borrow(); - ring.layout.node_ids().to_vec() + fn write_sets(&self, hash: &Hash) -> Self::WriteSets { + vec![self.storage_nodes(hash)] } fn write_quorum(&self) -> usize { - let nmembers = self.system.ring.borrow().layout.node_ids().len(); - if nmembers > self.max_faults { - nmembers - self.max_faults + let nmembers = self.system.cluster_layout().current().all_nodes().len(); + + let max_faults = if nmembers > 1 { 1 } else { 0 }; + + if nmembers > max_faults { + nmembers - max_faults } else { 1 } } - fn max_write_errors(&self) -> usize { - self.max_faults - } fn partition_of(&self, _hash: &Hash) -> Partition { 0u16 } - fn partitions(&self) -> Vec<(Partition, Hash)> { - vec![(0u16, [0u8; 32].into())] + + fn sync_partitions(&self) -> SyncPartitions { + let layout = self.system.cluster_layout(); + let layout_version = layout.current().version; + SyncPartitions { + layout_version, + partitions: vec![SyncPartition { + partition: 0u16, + first_hash: [0u8; 32].into(), + last_hash: [0xff; 32].into(), + storage_sets: vec![layout.current().all_nodes().to_vec()], + }], + } } } diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index f00815a2..682c1ea6 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -1,25 +1,43 @@ -use garage_rpc::ring::*; +use garage_rpc::layout::*; use garage_util::data::*; /// Trait to describe how a table shall be replicated pub trait TableReplication: Send + Sync + 'static { + type WriteSets: AsRef<Vec<Vec<Uuid>>> + AsMut<Vec<Vec<Uuid>>> + Send + Sync + 'static; + // See examples in table_sharded.rs and table_fullcopy.rs // To understand various replication methods + /// The entire list of all nodes that store a partition + fn storage_nodes(&self, hash: &Hash) -> Vec<Uuid>; + /// Which nodes to send read requests to fn read_nodes(&self, hash: &Hash) -> Vec<Uuid>; /// Responses needed to consider a read succesfull fn read_quorum(&self) -> usize; /// Which nodes to send writes to - fn write_nodes(&self, hash: &Hash) -> Vec<Uuid>; - /// Responses needed to consider a write succesfull + fn write_sets(&self, hash: &Hash) -> Self::WriteSets; + /// Responses needed to consider a write succesfull in each set fn write_quorum(&self) -> usize; - fn max_write_errors(&self) -> usize; // Accessing partitions, for Merkle tree & sync /// Get partition for data with given hash fn partition_of(&self, hash: &Hash) -> Partition; - /// List of existing partitions - fn partitions(&self) -> Vec<(Partition, Hash)>; + /// List of partitions and nodes to sync with in current layout + fn sync_partitions(&self) -> SyncPartitions; +} + +#[derive(Debug)] +pub struct SyncPartitions { + pub layout_version: u64, + pub partitions: Vec<SyncPartition>, +} + +#[derive(Debug)] +pub struct SyncPartition { + pub partition: Partition, + pub first_hash: Hash, + pub last_hash: Hash, + pub storage_sets: Vec<Vec<Uuid>>, } diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 1cf964af..e0245949 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use garage_rpc::ring::*; +use garage_rpc::layout::*; use garage_rpc::system::System; use garage_util::data::*; @@ -25,29 +25,59 @@ pub struct TableShardedReplication { } impl TableReplication for TableShardedReplication { + type WriteSets = WriteLock<Vec<Vec<Uuid>>>; + + fn storage_nodes(&self, hash: &Hash) -> Vec<Uuid> { + self.system.cluster_layout().storage_nodes_of(hash) + } + fn read_nodes(&self, hash: &Hash) -> Vec<Uuid> { - let ring = self.system.ring.borrow(); - ring.get_nodes(hash, self.replication_factor) + self.system.cluster_layout().read_nodes_of(hash) } fn read_quorum(&self) -> usize { self.read_quorum } - fn write_nodes(&self, hash: &Hash) -> Vec<Uuid> { - let ring = self.system.ring.borrow(); - ring.get_nodes(hash, self.replication_factor) + fn write_sets(&self, hash: &Hash) -> Self::WriteSets { + self.system.layout_manager.write_sets_of(hash) } fn write_quorum(&self) -> usize { self.write_quorum } - fn max_write_errors(&self) -> usize { - self.replication_factor - self.write_quorum - } fn partition_of(&self, hash: &Hash) -> Partition { - self.system.ring.borrow().partition_of(hash) + self.system.cluster_layout().current().partition_of(hash) } - fn partitions(&self) -> Vec<(Partition, Hash)> { - self.system.ring.borrow().partitions() + + fn sync_partitions(&self) -> SyncPartitions { + let layout = self.system.cluster_layout(); + let layout_version = layout.ack_map_min(); + + let mut partitions = layout + .current() + .partitions() + .map(|(partition, first_hash)| { + let storage_sets = layout.storage_sets_of(&first_hash); + SyncPartition { + partition, + first_hash, + last_hash: [0u8; 32].into(), // filled in just after + storage_sets, + } + }) + .collect::<Vec<_>>(); + + for i in 0..partitions.len() { + partitions[i].last_hash = if i + 1 < partitions.len() { + partitions[i + 1].first_hash + } else { + [0xFFu8; 32].into() + }; + } + + SyncPartitions { + layout_version, + partitions, + } } } diff --git a/src/table/sync.rs b/src/table/sync.rs index 92a353c6..cd080df0 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -6,18 +6,19 @@ use arc_swap::ArcSwapOption; use async_trait::async_trait; use futures_util::stream::*; use opentelemetry::KeyValue; -use rand::Rng; +use rand::prelude::*; use serde::{Deserialize, Serialize}; use serde_bytes::ByteBuf; use tokio::select; -use tokio::sync::{mpsc, watch}; +use tokio::sync::{mpsc, watch, Notify}; use garage_util::background::*; use garage_util::data::*; use garage_util::encode::{debug_serialize, nonversioned_encode}; use garage_util::error::{Error, OkOrMessage}; -use garage_rpc::ring::*; +use garage_rpc::layout::*; +use garage_rpc::rpc_helper::QuorumSetResultTracker; use garage_rpc::system::System; use garage_rpc::*; @@ -52,16 +53,6 @@ impl Rpc for SyncRpc { type Response = Result<SyncRpc, Error>; } -#[derive(Debug, Clone)] -struct TodoPartition { - partition: Partition, - begin: Hash, - end: Hash, - - // Are we a node that stores this partition or not? - retain: bool, -} - impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> { pub(crate) fn new( system: Arc<System>, @@ -91,10 +82,10 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> { bg.spawn_worker(SyncWorker { syncer: self.clone(), - ring_recv: self.system.ring.clone(), - ring: self.system.ring.borrow().clone(), + layout_notify: self.system.layout_notify(), + layout_digest: self.system.cluster_layout().sync_digest(), add_full_sync_rx, - todo: vec![], + todo: None, next_full_sync: Instant::now() + Duration::from_secs(20), }); } @@ -112,53 +103,56 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> { async fn sync_partition( self: &Arc<Self>, - partition: &TodoPartition, + partition: &SyncPartition, must_exit: &mut watch::Receiver<bool>, ) -> Result<(), Error> { - if partition.retain { - let my_id = self.system.id; - - let nodes = self - .data - .replication - .write_nodes(&partition.begin) - .into_iter() - .filter(|node| *node != my_id) - .collect::<Vec<_>>(); + let my_id = self.system.id; + let retain = partition.storage_sets.iter().any(|x| x.contains(&my_id)); + if retain { debug!( "({}) Syncing {:?} with {:?}...", F::TABLE_NAME, partition, - nodes + partition.storage_sets ); - let mut sync_futures = nodes - .iter() + let mut result_tracker = QuorumSetResultTracker::new( + &partition.storage_sets, + self.data.replication.write_quorum(), + ); + + let mut sync_futures = result_tracker + .nodes + .keys() + .copied() .map(|node| { - self.clone() - .do_sync_with(partition.clone(), *node, must_exit.clone()) + let must_exit = must_exit.clone(); + async move { + if node == my_id { + (node, Ok(())) + } else { + (node, self.do_sync_with(partition, node, must_exit).await) + } + } }) .collect::<FuturesUnordered<_>>(); - let mut n_errors = 0; - while let Some(r) = sync_futures.next().await { - if let Err(e) = r { - n_errors += 1; - warn!("({}) Sync error: {}", F::TABLE_NAME, e); + while let Some((node, res)) = sync_futures.next().await { + if let Err(e) = &res { + warn!("({}) Sync error with {:?}: {}", F::TABLE_NAME, node, e); } + result_tracker.register_result(node, res); } - if n_errors > self.data.replication.max_write_errors() { - return Err(Error::Message(format!( - "Sync failed with too many nodes (should have been: {:?}).", - nodes - ))); + + if result_tracker.too_many_failures() { + Err(result_tracker.quorum_error()) + } else { + Ok(()) } } else { - self.offload_partition(&partition.begin, &partition.end, must_exit) - .await?; + self.offload_partition(&partition.first_hash, &partition.last_hash, must_exit) + .await } - - Ok(()) } // Offload partition: this partition is not something we are storing, @@ -188,12 +182,7 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> { } if !items.is_empty() { - let nodes = self - .data - .replication - .write_nodes(begin) - .into_iter() - .collect::<Vec<_>>(); + let nodes = self.data.replication.storage_nodes(begin); if nodes.contains(&self.system.id) { warn!( "({}) Interrupting offload as partitions seem to have changed", @@ -217,7 +206,7 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> { end, counter ); - self.offload_items(&items, &nodes[..]).await?; + self.offload_items(&items, &nodes).await?; } else { break; } @@ -244,7 +233,7 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> { } self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, nodes, @@ -284,8 +273,8 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> { } async fn do_sync_with( - self: Arc<Self>, - partition: TodoPartition, + self: &Arc<Self>, + partition: &SyncPartition, who: Uuid, must_exit: watch::Receiver<bool>, ) -> Result<(), Error> { @@ -305,7 +294,7 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> { // If so, do nothing. let root_resp = self .system - .rpc + .rpc_helper() .call( &self.endpoint, who, @@ -361,7 +350,7 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> { // and compare it with local node let remote_node = match self .system - .rpc + .rpc_helper() .call( &self.endpoint, who, @@ -437,7 +426,7 @@ impl<F: TableSchema, R: TableReplication> TableSyncer<F, R> { let rpc_resp = self .system - .rpc + .rpc_helper() .call( &self.endpoint, who, @@ -492,75 +481,41 @@ impl<F: TableSchema, R: TableReplication> EndpointHandler<SyncRpc> for TableSync struct SyncWorker<F: TableSchema, R: TableReplication> { syncer: Arc<TableSyncer<F, R>>, - ring_recv: watch::Receiver<Arc<Ring>>, - ring: Arc<Ring>, + + layout_notify: Arc<Notify>, + layout_digest: SyncLayoutDigest, + add_full_sync_rx: mpsc::UnboundedReceiver<()>, - todo: Vec<TodoPartition>, next_full_sync: Instant, + + todo: Option<SyncPartitions>, } impl<F: TableSchema, R: TableReplication> SyncWorker<F, R> { - fn add_full_sync(&mut self) { - let system = &self.syncer.system; - let data = &self.syncer.data; - - let my_id = system.id; - - self.todo.clear(); - - let partitions = data.replication.partitions(); - - for i in 0..partitions.len() { - let begin = partitions[i].1; - - let end = if i + 1 < partitions.len() { - partitions[i + 1].1 - } else { - [0xFFu8; 32].into() - }; - - let nodes = data.replication.write_nodes(&begin); - - let retain = nodes.contains(&my_id); - if !retain { - // Check if we have some data to send, otherwise skip - match data.store.range(begin..end) { - Ok(mut iter) => { - if iter.next().is_none() { - continue; - } - } - Err(e) => { - warn!("DB error in add_full_sync: {}", e); - continue; - } - } - } - - self.todo.push(TodoPartition { - partition: partitions[i].0, - begin, - end, - retain, - }); + fn check_add_full_sync(&mut self) { + let layout_digest = self.syncer.system.cluster_layout().sync_digest(); + if layout_digest != self.layout_digest { + self.layout_digest = layout_digest; + info!( + "({}) Layout versions changed ({:?}), adding full sync to syncer todo list", + F::TABLE_NAME, + layout_digest, + ); + self.add_full_sync(); } - - self.next_full_sync = Instant::now() + ANTI_ENTROPY_INTERVAL; } - fn pop_task(&mut self) -> Option<TodoPartition> { - if self.todo.is_empty() { - return None; - } + fn add_full_sync(&mut self) { + let mut partitions = self.syncer.data.replication.sync_partitions(); + info!( + "{}: Adding full sync for ack layout version {}", + F::TABLE_NAME, + partitions.layout_version + ); - let i = rand::thread_rng().gen_range(0..self.todo.len()); - if i == self.todo.len() - 1 { - self.todo.pop() - } else { - let replacement = self.todo.pop().unwrap(); - let ret = std::mem::replace(&mut self.todo[i], replacement); - Some(ret) - } + partitions.partitions.shuffle(&mut thread_rng()); + self.todo = Some(partitions); + self.next_full_sync = Instant::now() + ANTI_ENTROPY_INTERVAL; } } @@ -572,14 +527,48 @@ impl<F: TableSchema, R: TableReplication> Worker for SyncWorker<F, R> { fn status(&self) -> WorkerStatus { WorkerStatus { - queue_length: Some(self.todo.len() as u64), + queue_length: Some(self.todo.as_ref().map(|x| x.partitions.len()).unwrap_or(0) as u64), ..Default::default() } } async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> { - if let Some(partition) = self.pop_task() { - self.syncer.sync_partition(&partition, must_exit).await?; + self.check_add_full_sync(); + + if let Some(todo) = &mut self.todo { + let partition = todo.partitions.pop().unwrap(); + + // process partition + if let Err(e) = self.syncer.sync_partition(&partition, must_exit).await { + error!( + "{}: Failed to sync partition {:?}: {}", + F::TABLE_NAME, + partition, + e + ); + // if error, put partition back at the other side of the queue, + // so that other partitions will be tried in the meantime + todo.partitions.insert(0, partition); + // TODO: returning an error here will cause the background job worker + // to delay this task for some time, but maybe we don't want to + // delay it if there are lots of failures from nodes that are gone + // (we also don't want zero delays as that will cause lots of useless retries) + return Err(e); + } + + if todo.partitions.is_empty() { + info!( + "{}: Completed full sync for ack layout version {}", + F::TABLE_NAME, + todo.layout_version + ); + self.syncer + .system + .layout_manager + .sync_table_until(F::TABLE_NAME, todo.layout_version); + self.todo = None; + } + Ok(WorkerState::Busy) } else { Ok(WorkerState::Idle) @@ -593,22 +582,16 @@ impl<F: TableSchema, R: TableReplication> Worker for SyncWorker<F, R> { self.add_full_sync(); } }, - _ = self.ring_recv.changed() => { - let new_ring = self.ring_recv.borrow(); - if !Arc::ptr_eq(&new_ring, &self.ring) { - self.ring = new_ring.clone(); - drop(new_ring); - debug!("({}) Ring changed, adding full sync to syncer todo list", F::TABLE_NAME); - self.add_full_sync(); - } + _ = self.layout_notify.notified() => { + self.check_add_full_sync(); }, _ = tokio::time::sleep_until(self.next_full_sync.into()) => { self.add_full_sync(); } } - match self.todo.is_empty() { - false => WorkerState::Busy, - true => WorkerState::Idle, + match self.todo.is_some() { + true => WorkerState::Busy, + false => WorkerState::Idle, } } } diff --git a/src/table/table.rs b/src/table/table.rs index 7ad79677..a5be2910 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -20,6 +20,7 @@ use garage_util::error::Error; use garage_util::metrics::RecordDuration; use garage_util::migrate::Migrate; +use garage_rpc::rpc_helper::QuorumSetResultTracker; use garage_rpc::system::System; use garage_rpc::*; @@ -80,6 +81,8 @@ impl<F: TableSchema, R: TableReplication> Table<F, R> { let syncer = TableSyncer::new(system.clone(), data.clone(), merkle_updater.clone()); let gc = TableGc::new(system.clone(), data.clone()); + system.layout_manager.add_table(F::TABLE_NAME); + let table = Arc::new(Self { system, data, @@ -117,16 +120,16 @@ impl<F: TableSchema, R: TableReplication> Table<F, R> { async fn insert_internal(&self, e: &F::E) -> Result<(), Error> { let hash = e.partition_key().hash(); - let who = self.data.replication.write_nodes(&hash); + let who = self.data.replication.write_sets(&hash); let e_enc = Arc::new(ByteBuf::from(e.encode()?)); let rpc = TableRpc::<F>::Update(vec![e_enc]); self.system - .rpc - .try_call_many( + .rpc_helper() + .try_write_many_sets( &self.endpoint, - &who[..], + who.as_ref(), rpc, RequestStrategy::with_priority(PRIO_NORMAL) .with_quorum(self.data.replication.write_quorum()), @@ -141,7 +144,7 @@ impl<F: TableSchema, R: TableReplication> Table<F, R> { self.data.queue_insert(tx, e) } - pub async fn insert_many<I, IE>(&self, entries: I) -> Result<(), Error> + pub async fn insert_many<I, IE>(self: &Arc<Self>, entries: I) -> Result<(), Error> where I: IntoIterator<Item = IE> + Send + Sync, IE: Borrow<F::E> + Send + Sync, @@ -159,51 +162,123 @@ impl<F: TableSchema, R: TableReplication> Table<F, R> { Ok(()) } - async fn insert_many_internal<I, IE>(&self, entries: I) -> Result<(), Error> + async fn insert_many_internal<I, IE>(self: &Arc<Self>, entries: I) -> Result<(), Error> where I: IntoIterator<Item = IE> + Send + Sync, IE: Borrow<F::E> + Send + Sync, { - let mut call_list: HashMap<_, Vec<_>> = HashMap::new(); - + // The different items will have to be stored on possibly different nodes. + // We will here batch all items into a single request for each concerned + // node, with all of the entries it must store within that request. + // Each entry has to be saved to a specific list of "write sets", i.e. a set + // of node within wich a quorum must be achieved. In normal operation, there + // is a single write set which corresponds to the quorum in the current + // cluster layout, but when the layout is updated, multiple write sets might + // have to be handled at once. Here, since we are sending many entries, we + // will have to handle many write sets in all cases. The algorihtm is thus + // to send one request to each node with all the items it must save, + // and keep track of the OK responses within each write set: if for all sets + // a quorum of nodes has answered OK, then the insert has succeeded and + // consistency properties (read-after-write) are preserved. + + let quorum = self.data.replication.write_quorum(); + + // Serialize all entries and compute the write sets for each of them. + // In the case of sharded table replication, this also takes an "ack lock" + // to the layout manager to avoid ack'ing newer versions which are not + // taken into account by writes in progress (the ack can happen later, once + // all writes that didn't take the new layout into account are finished). + // These locks are released when entries_vec is dropped, i.e. when this + // function returns. + let mut entries_vec = Vec::new(); for entry in entries.into_iter() { let entry = entry.borrow(); let hash = entry.partition_key().hash(); - let who = self.data.replication.write_nodes(&hash); + let mut write_sets = self.data.replication.write_sets(&hash); + for set in write_sets.as_mut().iter_mut() { + // Sort nodes in each write sets to merge write sets with same + // nodes but in possibly different orders + set.sort(); + } let e_enc = Arc::new(ByteBuf::from(entry.encode()?)); - for node in who { - call_list.entry(node).or_default().push(e_enc.clone()); + entries_vec.push((write_sets, e_enc)); + } + + // Compute a deduplicated list of all of the write sets, + // and compute an index from each node to the position of the sets in which + // it takes part, to optimize the detection of a quorum. + let mut write_sets = entries_vec + .iter() + .flat_map(|(wss, _)| wss.as_ref().iter().map(|ws| ws.as_slice())) + .collect::<Vec<&[Uuid]>>(); + write_sets.sort(); + write_sets.dedup(); + + let mut result_tracker = QuorumSetResultTracker::new(&write_sets, quorum); + + // Build a map of all nodes to the entries that must be sent to that node. + let mut call_list: HashMap<Uuid, Vec<_>> = HashMap::new(); + for (write_sets, entry_enc) in entries_vec.iter() { + for write_set in write_sets.as_ref().iter() { + for node in write_set.iter() { + let node_entries = call_list.entry(*node).or_default(); + match node_entries.last() { + Some(x) if Arc::ptr_eq(x, entry_enc) => { + // skip if entry already in list to send to this node + // (could happen if node is in several write sets for this entry) + } + _ => { + node_entries.push(entry_enc.clone()); + } + } + } } } - let call_futures = call_list.drain().map(|(node, entries)| async move { - let rpc = TableRpc::<F>::Update(entries); - - let resp = self - .system - .rpc - .call( - &self.endpoint, - node, - rpc, - RequestStrategy::with_priority(PRIO_NORMAL), - ) - .await?; - Ok::<_, Error>((node, resp)) + // Build futures to actually perform each of the corresponding RPC calls + let call_futures = call_list.into_iter().map(|(node, entries)| { + let this = self.clone(); + async move { + let rpc = TableRpc::<F>::Update(entries); + let resp = this + .system + .rpc_helper() + .call( + &this.endpoint, + node, + rpc, + RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(quorum), + ) + .await; + (node, resp) + } }); + + // Run all requests in parallel thanks to FuturesUnordered, and collect results. let mut resps = call_futures.collect::<FuturesUnordered<_>>(); - let mut errors = vec![]; - while let Some(resp) = resps.next().await { - if let Err(e) = resp { - errors.push(e); + while let Some((node, resp)) = resps.next().await { + result_tracker.register_result(node, resp.map(|_| ())); + + if result_tracker.all_quorums_ok() { + // Success + + // Continue all other requests in background + tokio::spawn(async move { + resps.collect::<Vec<(Uuid, Result<_, _>)>>().await; + }); + + return Ok(()); + } + + if result_tracker.too_many_failures() { + // Too many errors in this set, we know we won't get a quorum + break; } } - if errors.len() > self.data.replication.max_write_errors() { - Err(Error::Message("Too many errors".into())) - } else { - Ok(()) - } + + // Failure, could not get quorum within at least one set + Err(result_tracker.quorum_error()) } pub async fn get( @@ -236,14 +311,13 @@ impl<F: TableSchema, R: TableReplication> Table<F, R> { let rpc = TableRpc::<F>::ReadEntry(partition_key.clone(), sort_key.clone()); let resps = self .system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, - &who[..], + &who, rpc, RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(self.data.replication.read_quorum()) - .interrupt_after_quorum(true), + .with_quorum(self.data.replication.read_quorum()), ) .await?; @@ -332,14 +406,13 @@ impl<F: TableSchema, R: TableReplication> Table<F, R> { let resps = self .system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, - &who[..], + &who, rpc, RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(self.data.replication.read_quorum()) - .interrupt_after_quorum(true), + .with_quorum(self.data.replication.read_quorum()), ) .await?; @@ -411,7 +484,7 @@ impl<F: TableSchema, R: TableReplication> Table<F, R> { async fn repair_on_read(&self, who: &[Uuid], what: F::E) -> Result<(), Error> { let what_enc = Arc::new(ByteBuf::from(what.encode()?)); self.system - .rpc + .rpc_helper() .try_call_many( &self.endpoint, who, diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index 72581c16..e4c31460 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_util" -version = "0.9.3" +version = "0.10.0" authors = ["Alex Auvolat <alex@adnab.me>"] edition = "2018" license = "AGPL-3.0" diff --git a/src/util/config.rs b/src/util/config.rs index 056c625d..b7f27676 100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -30,12 +30,20 @@ pub struct Config { )] pub block_size: usize, - /// Replication mode. Supported values: - /// - none, 1 -> no replication - /// - 2 -> 2-way replication - /// - 3 -> 3-way replication - // (we can add more aliases for this later) - pub replication_mode: String, + /// Number of replicas. Can be any positive integer, but uneven numbers are more favorable. + /// - 1 for single-node clusters, or to disable replication + /// - 3 is the recommended and supported setting. + #[serde(default)] + pub replication_factor: Option<usize>, + + /// Consistency mode for all for requests through this node + /// - Degraded -> Disable read quorum + /// - Dangerous -> Disable read and write quorum + #[serde(default = "default_consistency_mode")] + pub consistency_mode: String, + + /// Legacy option + pub replication_mode: Option<String>, /// Zstd compression level used on data blocks #[serde( @@ -244,10 +252,15 @@ fn default_sled_cache_capacity() -> usize { fn default_sled_flush_every_ms() -> u64 { 2000 } + fn default_block_size() -> usize { 1048576 } +fn default_consistency_mode() -> String { + "consistent".into() +} + fn default_compression() -> Option<i32> { Some(1) } @@ -359,7 +372,7 @@ mod tests { r#" metadata_dir = "/tmp/garage/meta" data_dir = "/tmp/garage/data" - replication_mode = "3" + replication_factor = 3 rpc_bind_addr = "[::]:3901" rpc_secret = "foo" diff --git a/src/util/error.rs b/src/util/error.rs index e73d88ba..da9eda10 100644 --- a/src/util/error.rs +++ b/src/util/error.rs @@ -55,13 +55,14 @@ pub enum Error { Timeout, #[error( - display = "Could not reach quorum of {}. {} of {} request succeeded, others returned errors: {:?}", + display = "Could not reach quorum of {} (sets={:?}). {} of {} request succeeded, others returned errors: {:?}", _0, _1, _2, - _3 + _3, + _4 )] - Quorum(usize, usize, usize, Vec<String>), + Quorum(usize, Option<usize>, usize, usize, Vec<String>), #[error(display = "Unexpected RPC message: {}", _0)] UnexpectedRpcMessage(String), diff --git a/src/web/Cargo.toml b/src/web/Cargo.toml index 7bbaca56..49549c9b 100644 --- a/src/web/Cargo.toml +++ b/src/web/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_web" -version = "0.9.3" +version = "0.10.0" authors = ["Alex Auvolat <alex@adnab.me>", "Quentin Dufour <quentin@dufour.io>"] edition = "2018" license = "AGPL-3.0" |