From 277a20ec449011daab961e17b5c6bd7f48e3c291 Mon Sep 17 00:00:00 2001
From: Alex
Date: Mon, 9 May 2022 11:14:55 +0200
Subject: Fix `layout show` to not show changes when there are no changes (#297)

fixes #295, partially

Co-authored-by: Alex Auvolat
Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/297
Co-authored-by: Alex
Co-committed-by: Alex
---
 src/garage/cli/layout.rs | 25 +++++++++++++++++++++----
 src/garage/cli/util.rs   |  2 +-
 2 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'src')

diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs
index e76f7737..88941d78 100644
--- a/src/garage/cli/layout.rs
+++ b/src/garage/cli/layout.rs
@@ -43,14 +43,22 @@ pub async fn cmd_assign_role(
 		resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
 	};
 
+	let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
+
 	let added_nodes = args
 		.node_ids
 		.iter()
-		.map(|node_id| find_matching_node(status.iter().map(|adv| adv.id), node_id))
+		.map(|node_id| {
+			find_matching_node(
+				status
+					.iter()
+					.map(|adv| adv.id)
+					.chain(layout.node_ids().iter().cloned()),
+				node_id,
+			)
+		})
 		.collect::<Result<Vec<_>, _>>()?;
 
-	let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
-
 	let mut roles = layout.roles.clone();
 	roles.merge(&layout.staging);
 
@@ -323,11 +331,20 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool {
 }
 
 pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool {
-	if !layout.staging.items().is_empty() {
+	let has_changes = layout
+		.staging
+		.items()
+		.iter()
+		.any(|(k, _, v)| layout.roles.get(k) != Some(v));
+
+	if has_changes {
 		println!();
 		println!("==== STAGED ROLE CHANGES ====");
 		let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()];
 		for (id, _, role) in layout.staging.items().iter() {
+			if layout.roles.get(id) == Some(role) {
+				continue;
+			}
 			if let Some(role) = &role.0 {
 				let tags = role.tags.join(",");
 				table.push(format!(
diff --git a/src/garage/cli/util.rs b/src/garage/cli/util.rs
index 7d496507..fe11ad44 100644
--- a/src/garage/cli/util.rs
+++ b/src/garage/cli/util.rs
@@ -208,7 +208,7 @@ pub fn find_matching_node(
 ) -> Result<Uuid, Error> {
 	let mut candidates = vec![];
 	for c in cand {
-		if hex::encode(&c).starts_with(&pattern) {
+		if hex::encode(&c).starts_with(&pattern) && !candidates.contains(&c) {
 			candidates.push(c);
 		}
 	}
--
cgit v1.2.3


From def78c5e6f5da37a0d17b5652c525fbeccbc2e86 Mon Sep 17 00:00:00 2001
From: Alex Auvolat
Date: Mon, 9 May 2022 12:08:47 +0200
Subject: Update netapp to 0.4.4, fix #300

---
 src/rpc/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml
index efaacf2e..46d0dc1e 100644
--- a/src/rpc/Cargo.toml
+++ b/src/rpc/Cargo.toml
@@ -48,7 +48,7 @@ opentelemetry = "0.17"
 
 #netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" }
 #netapp = { version = "0.4", path = "../../../netapp", features = ["telemetry"] }
-netapp = { version = "0.4.2", features = ["telemetry"] }
+netapp = { version = "0.4.4", features = ["telemetry"] }
 
 hyper = { version = "0.14", features = ["client", "http1", "runtime", "tcp"] }
--
cgit v1.2.3


From 5768bf362262f78376af14517c4921941986192e Mon Sep 17 00:00:00 2001
From: Alex
Date: Tue, 10 May 2022 13:16:57 +0200
Subject: First implementation of K2V (#293)

**Specification:**

View spec at [this URL](https://git.deuxfleurs.fr/Deuxfleurs/garage/src/branch/k2v/doc/drafts/k2v-spec.md)

- [x] Specify the structure of K2V triples
- [x] Specify the DVVS format used for causality detection
- [x] Specify
the K2V index (just a counter of number of values per partition key) - [x] Specify single-item endpoints: ReadItem, InsertItem, DeleteItem - [x] Specify index endpoint: ReadIndex - [x] Specify multi-item endpoints: InsertBatch, ReadBatch, DeleteBatch - [x] Move to JSON objects instead of tuples - [x] Specify endpoints for polling for updates on single values (PollItem) **Implementation:** - [x] Table for K2V items, causal contexts - [x] Indexing mechanism and table for K2V index - [x] Make API handlers a bit more generic - [x] K2V API endpoint - [x] K2V API router - [x] ReadItem - [x] InsertItem - [x] DeleteItem - [x] PollItem - [x] ReadIndex - [x] InsertBatch - [x] ReadBatch - [x] DeleteBatch **Testing:** - [x] Just a simple Python script that does some requests to check visually that things are going right (does not contain parsing of results or assertions on returned values) - [x] Actual tests: - [x] Adapt testing framework - [x] Simple test with InsertItem + ReadItem - [x] Test with several Insert/Read/DeleteItem + ReadIndex - [x] Test all combinations of return formats for ReadItem - [x] Test with ReadBatch, InsertBatch, DeleteBatch - [x] Test with PollItem - [x] Test error codes - [ ] Fix most broken stuff - [x] test PollItem broken randomly - [x] when invalid causality tokens are given, errors should be 4xx not 5xx **Improvements:** - [x] Descending range queries - [x] Specify - [x] Implement - [x] Add test - [x] Batch updates to index counter - [x] Put K2V behind `k2v` feature flag Co-authored-by: Alex Auvolat Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/293 Co-authored-by: Alex Co-committed-by: Alex --- src/api/Cargo.toml | 5 + src/api/api_server.rs | 645 ------------- src/api/error.rs | 7 +- src/api/generic_server.rs | 202 ++++ src/api/helpers.rs | 188 +++- src/api/k2v/api_server.rs | 195 ++++ src/api/k2v/batch.rs | 368 +++++++ src/api/k2v/index.rs | 100 ++ src/api/k2v/item.rs | 230 +++++ src/api/k2v/mod.rs | 8 + src/api/k2v/range.rs | 96 ++ src/api/k2v/router.rs | 252 +++++ src/api/lib.rs | 22 +- src/api/router_macros.rs | 190 ++++ src/api/s3/api_server.rs | 401 ++++++++ src/api/s3/bucket.rs | 358 +++++++ src/api/s3/copy.rs | 660 +++++++++++++ src/api/s3/cors.rs | 442 +++++++++ src/api/s3/delete.rs | 170 ++++ src/api/s3/get.rs | 461 +++++++++ src/api/s3/list.rs | 1337 ++++++++++++++++++++++++++ src/api/s3/mod.rs | 14 + src/api/s3/post_object.rs | 507 ++++++++++ src/api/s3/put.rs | 753 +++++++++++++++ src/api/s3/router.rs | 1080 +++++++++++++++++++++ src/api/s3/website.rs | 369 +++++++ src/api/s3/xml.rs | 844 ++++++++++++++++ src/api/s3_bucket.rs | 352 ------- src/api/s3_copy.rs | 660 ------------- src/api/s3_cors.rs | 442 --------- src/api/s3_delete.rs | 170 ---- src/api/s3_get.rs | 461 --------- src/api/s3_list.rs | 1383 --------------------------- src/api/s3_post_object.rs | 499 ---------- src/api/s3_put.rs | 753 --------------- src/api/s3_router.rs | 1278 ------------------------- src/api/s3_website.rs | 369 ------- src/api/s3_xml.rs | 844 ---------------- src/api/signature/mod.rs | 9 +- src/api/signature/payload.rs | 15 +- src/api/signature/streaming.rs | 61 +- src/block/manager.rs | 2 +- src/garage/Cargo.toml | 8 + src/garage/admin.rs | 19 +- src/garage/cli/cmd.rs | 7 +- src/garage/repair.rs | 6 +- src/garage/server.rs | 26 +- src/garage/tests/common/client.rs | 2 +- src/garage/tests/common/custom_requester.rs | 55 +- src/garage/tests/common/garage.rs | 34 +- src/garage/tests/common/mod.rs | 11 +- src/garage/tests/k2v/batch.rs | 525 ++++++++++ 
src/garage/tests/k2v/errorcodes.rs | 141 +++ src/garage/tests/k2v/item.rs | 719 ++++++++++++++ src/garage/tests/k2v/mod.rs | 18 + src/garage/tests/k2v/poll.rs | 98 ++ src/garage/tests/k2v/simple.rs | 40 + src/garage/tests/lib.rs | 8 +- src/garage/tests/list.rs | 615 ------------ src/garage/tests/multipart.rs | 415 -------- src/garage/tests/objects.rs | 266 ------ src/garage/tests/s3/list.rs | 615 ++++++++++++ src/garage/tests/s3/mod.rs | 6 + src/garage/tests/s3/multipart.rs | 415 ++++++++ src/garage/tests/s3/objects.rs | 266 ++++++ src/garage/tests/s3/simple.rs | 31 + src/garage/tests/s3/streaming_signature.rs | 185 ++++ src/garage/tests/s3/website.rs | 324 +++++++ src/garage/tests/simple.rs | 31 - src/garage/tests/streaming_signature.rs | 185 ---- src/garage/tests/website.rs | 342 ------- src/model/Cargo.toml | 5 + src/model/block_ref_table.rs | 74 -- src/model/garage.rs | 97 +- src/model/helper/bucket.rs | 3 +- src/model/index_counter.rs | 305 ++++++ src/model/k2v/causality.rs | 96 ++ src/model/k2v/counter_table.rs | 20 + src/model/k2v/item_table.rs | 291 ++++++ src/model/k2v/mod.rs | 7 + src/model/k2v/poll.rs | 50 + src/model/k2v/rpc.rs | 343 +++++++ src/model/lib.rs | 9 +- src/model/object_table.rs | 334 ------- src/model/s3/block_ref_table.rs | 74 ++ src/model/s3/mod.rs | 3 + src/model/s3/object_table.rs | 337 +++++++ src/model/s3/version_table.rs | 207 ++++ src/model/version_table.rs | 204 ---- src/rpc/Cargo.toml | 1 + src/table/data.rs | 98 +- src/table/schema.rs | 2 +- src/table/table.rs | 126 ++- src/table/util.rs | 18 +- src/util/Cargo.toml | 3 + src/util/config.rs | 16 +- src/util/error.rs | 3 + src/web/web_server.rs | 4 +- 98 files changed, 14862 insertions(+), 10483 deletions(-) delete mode 100644 src/api/api_server.rs create mode 100644 src/api/generic_server.rs create mode 100644 src/api/k2v/api_server.rs create mode 100644 src/api/k2v/batch.rs create mode 100644 src/api/k2v/index.rs create mode 100644 src/api/k2v/item.rs create mode 100644 src/api/k2v/mod.rs create mode 100644 src/api/k2v/range.rs create mode 100644 src/api/k2v/router.rs create mode 100644 src/api/router_macros.rs create mode 100644 src/api/s3/api_server.rs create mode 100644 src/api/s3/bucket.rs create mode 100644 src/api/s3/copy.rs create mode 100644 src/api/s3/cors.rs create mode 100644 src/api/s3/delete.rs create mode 100644 src/api/s3/get.rs create mode 100644 src/api/s3/list.rs create mode 100644 src/api/s3/mod.rs create mode 100644 src/api/s3/post_object.rs create mode 100644 src/api/s3/put.rs create mode 100644 src/api/s3/router.rs create mode 100644 src/api/s3/website.rs create mode 100644 src/api/s3/xml.rs delete mode 100644 src/api/s3_bucket.rs delete mode 100644 src/api/s3_copy.rs delete mode 100644 src/api/s3_cors.rs delete mode 100644 src/api/s3_delete.rs delete mode 100644 src/api/s3_get.rs delete mode 100644 src/api/s3_list.rs delete mode 100644 src/api/s3_post_object.rs delete mode 100644 src/api/s3_put.rs delete mode 100644 src/api/s3_router.rs delete mode 100644 src/api/s3_website.rs delete mode 100644 src/api/s3_xml.rs create mode 100644 src/garage/tests/k2v/batch.rs create mode 100644 src/garage/tests/k2v/errorcodes.rs create mode 100644 src/garage/tests/k2v/item.rs create mode 100644 src/garage/tests/k2v/mod.rs create mode 100644 src/garage/tests/k2v/poll.rs create mode 100644 src/garage/tests/k2v/simple.rs delete mode 100644 src/garage/tests/list.rs delete mode 100644 src/garage/tests/multipart.rs delete mode 100644 src/garage/tests/objects.rs create mode 100644 
src/garage/tests/s3/list.rs create mode 100644 src/garage/tests/s3/mod.rs create mode 100644 src/garage/tests/s3/multipart.rs create mode 100644 src/garage/tests/s3/objects.rs create mode 100644 src/garage/tests/s3/simple.rs create mode 100644 src/garage/tests/s3/streaming_signature.rs create mode 100644 src/garage/tests/s3/website.rs delete mode 100644 src/garage/tests/simple.rs delete mode 100644 src/garage/tests/streaming_signature.rs delete mode 100644 src/garage/tests/website.rs delete mode 100644 src/model/block_ref_table.rs create mode 100644 src/model/index_counter.rs create mode 100644 src/model/k2v/causality.rs create mode 100644 src/model/k2v/counter_table.rs create mode 100644 src/model/k2v/item_table.rs create mode 100644 src/model/k2v/mod.rs create mode 100644 src/model/k2v/poll.rs create mode 100644 src/model/k2v/rpc.rs delete mode 100644 src/model/object_table.rs create mode 100644 src/model/s3/block_ref_table.rs create mode 100644 src/model/s3/mod.rs create mode 100644 src/model/s3/object_table.rs create mode 100644 src/model/s3/version_table.rs delete mode 100644 src/model/version_table.rs (limited to 'src') diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index 5e96b081..29b26e5e 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -18,7 +18,9 @@ garage_model = { version = "0.7.0", path = "../model" } garage_table = { version = "0.7.0", path = "../table" } garage_block = { version = "0.7.0", path = "../block" } garage_util = { version = "0.7.0", path = "../util" } +garage_rpc = { version = "0.7.0", path = "../rpc" } +async-trait = "0.1.7" base64 = "0.13" bytes = "1.0" chrono = "0.4" @@ -52,3 +54,6 @@ quick-xml = { version = "0.21", features = [ "serialize" ] } url = "2.1" opentelemetry = "0.17" + +[features] +k2v = [ "garage_util/k2v", "garage_model/k2v" ] diff --git a/src/api/api_server.rs b/src/api/api_server.rs deleted file mode 100644 index e7b86d9e..00000000 --- a/src/api/api_server.rs +++ /dev/null @@ -1,645 +0,0 @@ -use std::net::SocketAddr; -use std::sync::Arc; - -use chrono::{DateTime, NaiveDateTime, Utc}; -use futures::future::Future; -use futures::prelude::*; -use hyper::header; -use hyper::server::conn::AddrStream; -use hyper::service::{make_service_fn, service_fn}; -use hyper::{Body, Method, Request, Response, Server}; - -use opentelemetry::{ - global, - metrics::{Counter, ValueRecorder}, - trace::{FutureExt, TraceContextExt, Tracer}, - Context, KeyValue, -}; - -use garage_util::data::*; -use garage_util::error::Error as GarageError; -use garage_util::metrics::{gen_trace_id, RecordDuration}; - -use garage_model::garage::Garage; -use garage_model::key_table::Key; - -use garage_table::util::*; - -use crate::error::*; -use crate::signature::compute_scope; -use crate::signature::payload::check_payload_signature; -use crate::signature::streaming::SignedPayloadStream; -use crate::signature::LONG_DATETIME; - -use crate::helpers::*; -use crate::s3_bucket::*; -use crate::s3_copy::*; -use crate::s3_cors::*; -use crate::s3_delete::*; -use crate::s3_get::*; -use crate::s3_list::*; -use crate::s3_post_object::handle_post_object; -use crate::s3_put::*; -use crate::s3_router::{Authorization, Endpoint}; -use crate::s3_website::*; - -struct ApiMetrics { - request_counter: Counter, - error_counter: Counter, - request_duration: ValueRecorder, -} - -impl ApiMetrics { - fn new() -> Self { - let meter = global::meter("garage/api"); - Self { - request_counter: meter - .u64_counter("api.request_counter") - .with_description("Number of API calls to the various S3 API 
endpoints") - .init(), - error_counter: meter - .u64_counter("api.error_counter") - .with_description( - "Number of API calls to the various S3 API endpoints that resulted in errors", - ) - .init(), - request_duration: meter - .f64_value_recorder("api.request_duration") - .with_description("Duration of API calls to the various S3 API endpoints") - .init(), - } - } -} - -/// Run the S3 API server -pub async fn run_api_server( - garage: Arc, - shutdown_signal: impl Future, -) -> Result<(), GarageError> { - let addr = &garage.config.s3_api.api_bind_addr; - - let metrics = Arc::new(ApiMetrics::new()); - - let service = make_service_fn(|conn: &AddrStream| { - let garage = garage.clone(); - let metrics = metrics.clone(); - - let client_addr = conn.remote_addr(); - async move { - Ok::<_, GarageError>(service_fn(move |req: Request| { - let garage = garage.clone(); - let metrics = metrics.clone(); - - handler(garage, metrics, req, client_addr) - })) - } - }); - - let server = Server::bind(addr).serve(service); - - let graceful = server.with_graceful_shutdown(shutdown_signal); - info!("API server listening on http://{}", addr); - - graceful.await?; - Ok(()) -} - -async fn handler( - garage: Arc, - metrics: Arc, - req: Request, - addr: SocketAddr, -) -> Result, GarageError> { - let uri = req.uri().clone(); - info!("{} {} {}", addr, req.method(), uri); - debug!("{:?}", req); - - let tracer = opentelemetry::global::tracer("garage"); - let span = tracer - .span_builder("S3 API call (unknown)") - .with_trace_id(gen_trace_id()) - .with_attributes(vec![ - KeyValue::new("method", format!("{}", req.method())), - KeyValue::new("uri", req.uri().to_string()), - ]) - .start(&tracer); - - let res = handler_stage2(garage.clone(), metrics, req) - .with_context(Context::current_with_span(span)) - .await; - - match res { - Ok(x) => { - debug!("{} {:?}", x.status(), x.headers()); - Ok(x) - } - Err(e) => { - let body: Body = Body::from(e.aws_xml(&garage.config.s3_api.s3_region, uri.path())); - let mut http_error_builder = Response::builder() - .status(e.http_status_code()) - .header("Content-Type", "application/xml"); - - if let Some(header_map) = http_error_builder.headers_mut() { - e.add_headers(header_map) - } - - let http_error = http_error_builder.body(body)?; - - if e.http_status_code().is_server_error() { - warn!("Response: error {}, {}", e.http_status_code(), e); - } else { - info!("Response: error {}, {}", e.http_status_code(), e); - } - Ok(http_error) - } - } -} - -async fn handler_stage2( - garage: Arc, - metrics: Arc, - req: Request, -) -> Result, Error> { - let authority = req - .headers() - .get(header::HOST) - .ok_or_bad_request("Host header required")? 
- .to_str()?; - - let host = authority_to_host(authority)?; - - let bucket_name = garage - .config - .s3_api - .root_domain - .as_ref() - .and_then(|root_domain| host_to_bucket(&host, root_domain)); - - let (endpoint, bucket_name) = Endpoint::from_request(&req, bucket_name.map(ToOwned::to_owned))?; - debug!("Endpoint: {:?}", endpoint); - - let current_context = Context::current(); - let current_span = current_context.span(); - current_span.update_name::(format!("S3 API {}", endpoint.name())); - current_span.set_attribute(KeyValue::new("endpoint", endpoint.name())); - current_span.set_attribute(KeyValue::new( - "bucket", - bucket_name.clone().unwrap_or_default(), - )); - - let metrics_tags = &[KeyValue::new("api_endpoint", endpoint.name())]; - - let res = handler_stage3(garage, req, endpoint, bucket_name) - .record_duration(&metrics.request_duration, &metrics_tags[..]) - .await; - - metrics.request_counter.add(1, &metrics_tags[..]); - - let status_code = match &res { - Ok(r) => r.status(), - Err(e) => e.http_status_code(), - }; - if status_code.is_client_error() || status_code.is_server_error() { - metrics.error_counter.add( - 1, - &[ - metrics_tags[0].clone(), - KeyValue::new("status_code", status_code.as_str().to_string()), - ], - ); - } - - res -} - -async fn handler_stage3( - garage: Arc, - req: Request, - endpoint: Endpoint, - bucket_name: Option, -) -> Result, Error> { - // Some endpoints are processed early, before we even check for an API key - if let Endpoint::PostObject = endpoint { - return handle_post_object(garage, req, bucket_name.unwrap()).await; - } - if let Endpoint::Options = endpoint { - return handle_options_s3api(garage, &req, bucket_name).await; - } - - let (api_key, mut content_sha256) = check_payload_signature(&garage, &req).await?; - let api_key = api_key.ok_or_else(|| { - Error::Forbidden("Garage does not support anonymous access yet".to_string()) - })?; - - let req = match req.headers().get("x-amz-content-sha256") { - Some(header) if header == "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" => { - let signature = content_sha256 - .take() - .ok_or_bad_request("No signature provided")?; - - let secret_key = &api_key - .state - .as_option() - .ok_or_internal_error("Deleted key state")? - .secret_key; - - let date = req - .headers() - .get("x-amz-date") - .ok_or_bad_request("Missing X-Amz-Date field")? - .to_str()?; - let date: NaiveDateTime = NaiveDateTime::parse_from_str(date, LONG_DATETIME) - .ok_or_bad_request("Invalid date")?; - let date: DateTime = DateTime::from_utc(date, Utc); - - let scope = compute_scope(&date, &garage.config.s3_api.s3_region); - let signing_hmac = crate::signature::signing_hmac( - &date, - secret_key, - &garage.config.s3_api.s3_region, - "s3", - ) - .ok_or_internal_error("Unable to build signing HMAC")?; - - req.map(move |body| { - Body::wrap_stream( - SignedPayloadStream::new( - body.map_err(Error::from), - signing_hmac, - date, - &scope, - signature, - ) - .map_err(Error::from), - ) - }) - } - _ => req, - }; - - let bucket_name = match bucket_name { - None => return handle_request_without_bucket(garage, req, api_key, endpoint).await, - Some(bucket) => bucket.to_string(), - }; - - // Special code path for CreateBucket API endpoint - if let Endpoint::CreateBucket {} = endpoint { - return handle_create_bucket(&garage, req, content_sha256, api_key, bucket_name).await; - } - - let bucket_id = resolve_bucket(&garage, &bucket_name, &api_key).await?; - let bucket = garage - .bucket_table - .get(&EmptyKey, &bucket_id) - .await? 
- .filter(|b| !b.state.is_deleted()) - .ok_or(Error::NoSuchBucket)?; - - let allowed = match endpoint.authorization_type() { - Authorization::Read => api_key.allow_read(&bucket_id), - Authorization::Write => api_key.allow_write(&bucket_id), - Authorization::Owner => api_key.allow_owner(&bucket_id), - _ => unreachable!(), - }; - - if !allowed { - return Err(Error::Forbidden( - "Operation is not allowed for this key.".to_string(), - )); - } - - // Look up what CORS rule might apply to response. - // Requests for methods different than GET, HEAD or POST - // are always preflighted, i.e. the browser should make - // an OPTIONS call before to check it is allowed - let matching_cors_rule = match *req.method() { - Method::GET | Method::HEAD | Method::POST => find_matching_cors_rule(&bucket, &req)?, - _ => None, - }; - - let resp = match endpoint { - Endpoint::HeadObject { - key, part_number, .. - } => handle_head(garage, &req, bucket_id, &key, part_number).await, - Endpoint::GetObject { - key, part_number, .. - } => handle_get(garage, &req, bucket_id, &key, part_number).await, - Endpoint::UploadPart { - key, - part_number, - upload_id, - } => { - handle_put_part( - garage, - req, - bucket_id, - &key, - part_number, - &upload_id, - content_sha256, - ) - .await - } - Endpoint::CopyObject { key } => handle_copy(garage, &api_key, &req, bucket_id, &key).await, - Endpoint::UploadPartCopy { - key, - part_number, - upload_id, - } => { - handle_upload_part_copy( - garage, - &api_key, - &req, - bucket_id, - &key, - part_number, - &upload_id, - ) - .await - } - Endpoint::PutObject { key } => { - handle_put(garage, req, bucket_id, &key, content_sha256).await - } - Endpoint::AbortMultipartUpload { key, upload_id } => { - handle_abort_multipart_upload(garage, bucket_id, &key, &upload_id).await - } - Endpoint::DeleteObject { key, .. } => handle_delete(garage, bucket_id, &key).await, - Endpoint::CreateMultipartUpload { key } => { - handle_create_multipart_upload(garage, &req, &bucket_name, bucket_id, &key).await - } - Endpoint::CompleteMultipartUpload { key, upload_id } => { - handle_complete_multipart_upload( - garage, - req, - &bucket_name, - bucket_id, - &key, - &upload_id, - content_sha256, - ) - .await - } - Endpoint::CreateBucket {} => unreachable!(), - Endpoint::HeadBucket {} => { - let empty_body: Body = Body::from(vec![]); - let response = Response::builder().body(empty_body).unwrap(); - Ok(response) - } - Endpoint::DeleteBucket {} => { - handle_delete_bucket(&garage, bucket_id, bucket_name, api_key).await - } - Endpoint::GetBucketLocation {} => handle_get_bucket_location(garage), - Endpoint::GetBucketVersioning {} => handle_get_bucket_versioning(), - Endpoint::ListObjects { - delimiter, - encoding_type, - marker, - max_keys, - prefix, - } => { - handle_list( - garage, - &ListObjectsQuery { - common: ListQueryCommon { - bucket_name, - bucket_id, - delimiter: delimiter.map(|d| d.to_string()), - page_size: max_keys.map(|p| p.clamp(1, 1000)).unwrap_or(1000), - prefix: prefix.unwrap_or_default(), - urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false), - }, - is_v2: false, - marker, - continuation_token: None, - start_after: None, - }, - ) - .await - } - Endpoint::ListObjectsV2 { - delimiter, - encoding_type, - max_keys, - prefix, - continuation_token, - start_after, - list_type, - .. 
- } => { - if list_type == "2" { - handle_list( - garage, - &ListObjectsQuery { - common: ListQueryCommon { - bucket_name, - bucket_id, - delimiter: delimiter.map(|d| d.to_string()), - page_size: max_keys.map(|p| p.clamp(1, 1000)).unwrap_or(1000), - urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false), - prefix: prefix.unwrap_or_default(), - }, - is_v2: true, - marker: None, - continuation_token, - start_after, - }, - ) - .await - } else { - Err(Error::BadRequest(format!( - "Invalid endpoint: list-type={}", - list_type - ))) - } - } - Endpoint::ListMultipartUploads { - delimiter, - encoding_type, - key_marker, - max_uploads, - prefix, - upload_id_marker, - } => { - handle_list_multipart_upload( - garage, - &ListMultipartUploadsQuery { - common: ListQueryCommon { - bucket_name, - bucket_id, - delimiter: delimiter.map(|d| d.to_string()), - page_size: max_uploads.map(|p| p.clamp(1, 1000)).unwrap_or(1000), - prefix: prefix.unwrap_or_default(), - urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false), - }, - key_marker, - upload_id_marker, - }, - ) - .await - } - Endpoint::ListParts { - key, - max_parts, - part_number_marker, - upload_id, - } => { - handle_list_parts( - garage, - &ListPartsQuery { - bucket_name, - bucket_id, - key, - upload_id, - part_number_marker: part_number_marker.map(|p| p.clamp(1, 10000)), - max_parts: max_parts.map(|p| p.clamp(1, 1000)).unwrap_or(1000), - }, - ) - .await - } - Endpoint::DeleteObjects {} => { - handle_delete_objects(garage, bucket_id, req, content_sha256).await - } - Endpoint::GetBucketWebsite {} => handle_get_website(&bucket).await, - Endpoint::PutBucketWebsite {} => { - handle_put_website(garage, bucket_id, req, content_sha256).await - } - Endpoint::DeleteBucketWebsite {} => handle_delete_website(garage, bucket_id).await, - Endpoint::GetBucketCors {} => handle_get_cors(&bucket).await, - Endpoint::PutBucketCors {} => handle_put_cors(garage, bucket_id, req, content_sha256).await, - Endpoint::DeleteBucketCors {} => handle_delete_cors(garage, bucket_id).await, - endpoint => Err(Error::NotImplemented(endpoint.name().to_owned())), - }; - - // If request was a success and we have a CORS rule that applies to it, - // add the corresponding CORS headers to the response - let mut resp_ok = resp?; - if let Some(rule) = matching_cors_rule { - add_cors_headers(&mut resp_ok, rule) - .ok_or_internal_error("Invalid bucket CORS configuration")?; - } - - Ok(resp_ok) -} - -async fn handle_request_without_bucket( - garage: Arc, - _req: Request, - api_key: Key, - endpoint: Endpoint, -) -> Result, Error> { - match endpoint { - Endpoint::ListBuckets => handle_list_buckets(&garage, &api_key).await, - endpoint => Err(Error::NotImplemented(endpoint.name().to_owned())), - } -} - -#[allow(clippy::ptr_arg)] -pub async fn resolve_bucket( - garage: &Garage, - bucket_name: &String, - api_key: &Key, -) -> Result { - let api_key_params = api_key - .state - .as_option() - .ok_or_internal_error("Key should not be deleted at this point")?; - - if let Some(Some(bucket_id)) = api_key_params.local_aliases.get(bucket_name) { - Ok(*bucket_id) - } else { - Ok(garage - .bucket_helper() - .resolve_global_bucket_name(bucket_name) - .await? - .ok_or(Error::NoSuchBucket)?) - } -} - -/// Extract the bucket name and the key name from an HTTP path and possibly a bucket provided in -/// the host header of the request -/// -/// S3 internally manages only buckets and keys. This function splits -/// an HTTP path to get the corresponding bucket name and key. 
-pub fn parse_bucket_key<'a>( - path: &'a str, - host_bucket: Option<&'a str>, -) -> Result<(&'a str, Option<&'a str>), Error> { - let path = path.trim_start_matches('/'); - - if let Some(bucket) = host_bucket { - if !path.is_empty() { - return Ok((bucket, Some(path))); - } else { - return Ok((bucket, None)); - } - } - - let (bucket, key) = match path.find('/') { - Some(i) => { - let key = &path[i + 1..]; - if !key.is_empty() { - (&path[..i], Some(key)) - } else { - (&path[..i], None) - } - } - None => (path, None), - }; - if bucket.is_empty() { - return Err(Error::BadRequest("No bucket specified".to_string())); - } - Ok((bucket, key)) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn parse_bucket_containing_a_key() -> Result<(), Error> { - let (bucket, key) = parse_bucket_key("/my_bucket/a/super/file.jpg", None)?; - assert_eq!(bucket, "my_bucket"); - assert_eq!(key.expect("key must be set"), "a/super/file.jpg"); - Ok(()) - } - - #[test] - fn parse_bucket_containing_no_key() -> Result<(), Error> { - let (bucket, key) = parse_bucket_key("/my_bucket/", None)?; - assert_eq!(bucket, "my_bucket"); - assert!(key.is_none()); - let (bucket, key) = parse_bucket_key("/my_bucket", None)?; - assert_eq!(bucket, "my_bucket"); - assert!(key.is_none()); - Ok(()) - } - - #[test] - fn parse_bucket_containing_no_bucket() { - let parsed = parse_bucket_key("", None); - assert!(parsed.is_err()); - let parsed = parse_bucket_key("/", None); - assert!(parsed.is_err()); - let parsed = parse_bucket_key("////", None); - assert!(parsed.is_err()); - } - - #[test] - fn parse_bucket_with_vhost_and_key() -> Result<(), Error> { - let (bucket, key) = parse_bucket_key("/a/super/file.jpg", Some("my-bucket"))?; - assert_eq!(bucket, "my-bucket"); - assert_eq!(key.expect("key must be set"), "a/super/file.jpg"); - Ok(()) - } - - #[test] - fn parse_bucket_with_vhost_no_key() -> Result<(), Error> { - let (bucket, key) = parse_bucket_key("", Some("my-bucket"))?; - assert_eq!(bucket, "my-bucket"); - assert!(key.is_none()); - let (bucket, key) = parse_bucket_key("/", Some("my-bucket"))?; - assert_eq!(bucket, "my-bucket"); - assert!(key.is_none()); - Ok(()) - } -} diff --git a/src/api/error.rs b/src/api/error.rs index f53ed1fd..4b7254d2 100644 --- a/src/api/error.rs +++ b/src/api/error.rs @@ -7,7 +7,7 @@ use hyper::{HeaderMap, StatusCode}; use garage_model::helper::error::Error as HelperError; use garage_util::error::Error as GarageError; -use crate::s3_xml; +use crate::s3::xml as s3_xml; /// Errors of this crate #[derive(Debug, Error)] @@ -100,6 +100,10 @@ pub enum Error { #[error(display = "Bad request: {}", _0)] BadRequest(String), + /// The client asked for an invalid return format (invalid Accept header) + #[error(display = "Not acceptable: {}", _0)] + NotAcceptable(String), + /// The client sent a request for an action not supported by garage #[error(display = "Unimplemented action: {}", _0)] NotImplemented(String), @@ -140,6 +144,7 @@ impl Error { Error::BucketNotEmpty | Error::BucketAlreadyExists => StatusCode::CONFLICT, Error::PreconditionFailed => StatusCode::PRECONDITION_FAILED, Error::Forbidden(_) => StatusCode::FORBIDDEN, + Error::NotAcceptable(_) => StatusCode::NOT_ACCEPTABLE, Error::InternalError( GarageError::Timeout | GarageError::RemoteError(_) diff --git a/src/api/generic_server.rs b/src/api/generic_server.rs new file mode 100644 index 00000000..9281e596 --- /dev/null +++ b/src/api/generic_server.rs @@ -0,0 +1,202 @@ +use std::net::SocketAddr; +use std::sync::Arc; + +use async_trait::async_trait; + 
+use futures::future::Future; + +use hyper::server::conn::AddrStream; +use hyper::service::{make_service_fn, service_fn}; +use hyper::{Body, Request, Response, Server}; + +use opentelemetry::{ + global, + metrics::{Counter, ValueRecorder}, + trace::{FutureExt, SpanRef, TraceContextExt, Tracer}, + Context, KeyValue, +}; + +use garage_util::error::Error as GarageError; +use garage_util::metrics::{gen_trace_id, RecordDuration}; + +use crate::error::*; + +pub(crate) trait ApiEndpoint: Send + Sync + 'static { + fn name(&self) -> &'static str; + fn add_span_attributes(&self, span: SpanRef<'_>); +} + +#[async_trait] +pub(crate) trait ApiHandler: Send + Sync + 'static { + const API_NAME: &'static str; + const API_NAME_DISPLAY: &'static str; + + type Endpoint: ApiEndpoint; + + fn parse_endpoint(&self, r: &Request) -> Result; + async fn handle( + &self, + req: Request, + endpoint: Self::Endpoint, + ) -> Result, Error>; +} + +pub(crate) struct ApiServer { + region: String, + api_handler: A, + + // Metrics + request_counter: Counter, + error_counter: Counter, + request_duration: ValueRecorder, +} + +impl ApiServer { + pub fn new(region: String, api_handler: A) -> Arc { + let meter = global::meter("garage/api"); + Arc::new(Self { + region, + api_handler, + request_counter: meter + .u64_counter(format!("api.{}.request_counter", A::API_NAME)) + .with_description(format!( + "Number of API calls to the various {} API endpoints", + A::API_NAME_DISPLAY + )) + .init(), + error_counter: meter + .u64_counter(format!("api.{}.error_counter", A::API_NAME)) + .with_description(format!( + "Number of API calls to the various {} API endpoints that resulted in errors", + A::API_NAME_DISPLAY + )) + .init(), + request_duration: meter + .f64_value_recorder(format!("api.{}.request_duration", A::API_NAME)) + .with_description(format!( + "Duration of API calls to the various {} API endpoints", + A::API_NAME_DISPLAY + )) + .init(), + }) + } + + pub async fn run_server( + self: Arc, + bind_addr: SocketAddr, + shutdown_signal: impl Future, + ) -> Result<(), GarageError> { + let service = make_service_fn(|conn: &AddrStream| { + let this = self.clone(); + + let client_addr = conn.remote_addr(); + async move { + Ok::<_, GarageError>(service_fn(move |req: Request| { + let this = this.clone(); + + this.handler(req, client_addr) + })) + } + }); + + let server = Server::bind(&bind_addr).serve(service); + + let graceful = server.with_graceful_shutdown(shutdown_signal); + info!( + "{} API server listening on http://{}", + A::API_NAME_DISPLAY, + bind_addr + ); + + graceful.await?; + Ok(()) + } + + async fn handler( + self: Arc, + req: Request, + addr: SocketAddr, + ) -> Result, GarageError> { + let uri = req.uri().clone(); + info!("{} {} {}", addr, req.method(), uri); + debug!("{:?}", req); + + let tracer = opentelemetry::global::tracer("garage"); + let span = tracer + .span_builder(format!("{} API call (unknown)", A::API_NAME_DISPLAY)) + .with_trace_id(gen_trace_id()) + .with_attributes(vec![ + KeyValue::new("method", format!("{}", req.method())), + KeyValue::new("uri", req.uri().to_string()), + ]) + .start(&tracer); + + let res = self + .handler_stage2(req) + .with_context(Context::current_with_span(span)) + .await; + + match res { + Ok(x) => { + debug!("{} {:?}", x.status(), x.headers()); + Ok(x) + } + Err(e) => { + let body: Body = Body::from(e.aws_xml(&self.region, uri.path())); + let mut http_error_builder = Response::builder() + .status(e.http_status_code()) + .header("Content-Type", "application/xml"); + + if let Some(header_map) 
= http_error_builder.headers_mut() { + e.add_headers(header_map) + } + + let http_error = http_error_builder.body(body)?; + + if e.http_status_code().is_server_error() { + warn!("Response: error {}, {}", e.http_status_code(), e); + } else { + info!("Response: error {}, {}", e.http_status_code(), e); + } + Ok(http_error) + } + } + } + + async fn handler_stage2(&self, req: Request) -> Result, Error> { + let endpoint = self.api_handler.parse_endpoint(&req)?; + debug!("Endpoint: {}", endpoint.name()); + + let current_context = Context::current(); + let current_span = current_context.span(); + current_span.update_name::(format!("S3 API {}", endpoint.name())); + current_span.set_attribute(KeyValue::new("endpoint", endpoint.name())); + endpoint.add_span_attributes(current_span); + + let metrics_tags = &[KeyValue::new("api_endpoint", endpoint.name())]; + + let res = self + .api_handler + .handle(req, endpoint) + .record_duration(&self.request_duration, &metrics_tags[..]) + .await; + + self.request_counter.add(1, &metrics_tags[..]); + + let status_code = match &res { + Ok(r) => r.status(), + Err(e) => e.http_status_code(), + }; + if status_code.is_client_error() || status_code.is_server_error() { + self.error_counter.add( + 1, + &[ + metrics_tags[0].clone(), + KeyValue::new("status_code", status_code.as_str().to_string()), + ], + ); + } + + res + } +} diff --git a/src/api/helpers.rs b/src/api/helpers.rs index c2709bb3..a994b82f 100644 --- a/src/api/helpers.rs +++ b/src/api/helpers.rs @@ -1,6 +1,25 @@ -use crate::Error; use idna::domain_to_unicode; +use garage_util::data::*; + +use garage_model::garage::Garage; +use garage_model::key_table::Key; + +use crate::error::*; + +/// What kind of authorization is required to perform a given action +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Authorization { + /// No authorization is required + None, + /// Having Read permission on bucket + Read, + /// Having Write permission on bucket + Write, + /// Having Owner permission on bucket + Owner, +} + /// Host to bucket /// /// Convert a host, like "bucket.garage-site.tld" to the corresponding bucket "bucket", @@ -60,10 +79,142 @@ pub fn authority_to_host(authority: &str) -> Result { authority.map(|h| domain_to_unicode(h).0) } +#[allow(clippy::ptr_arg)] +pub async fn resolve_bucket( + garage: &Garage, + bucket_name: &String, + api_key: &Key, +) -> Result { + let api_key_params = api_key + .state + .as_option() + .ok_or_internal_error("Key should not be deleted at this point")?; + + if let Some(Some(bucket_id)) = api_key_params.local_aliases.get(bucket_name) { + Ok(*bucket_id) + } else { + Ok(garage + .bucket_helper() + .resolve_global_bucket_name(bucket_name) + .await? + .ok_or(Error::NoSuchBucket)?) + } +} + +/// Extract the bucket name and the key name from an HTTP path and possibly a bucket provided in +/// the host header of the request +/// +/// S3 internally manages only buckets and keys. This function splits +/// an HTTP path to get the corresponding bucket name and key. 
+pub fn parse_bucket_key<'a>( + path: &'a str, + host_bucket: Option<&'a str>, +) -> Result<(&'a str, Option<&'a str>), Error> { + let path = path.trim_start_matches('/'); + + if let Some(bucket) = host_bucket { + if !path.is_empty() { + return Ok((bucket, Some(path))); + } else { + return Ok((bucket, None)); + } + } + + let (bucket, key) = match path.find('/') { + Some(i) => { + let key = &path[i + 1..]; + if !key.is_empty() { + (&path[..i], Some(key)) + } else { + (&path[..i], None) + } + } + None => (path, None), + }; + if bucket.is_empty() { + return Err(Error::BadRequest("No bucket specified".to_string())); + } + Ok((bucket, key)) +} + +const UTF8_BEFORE_LAST_CHAR: char = '\u{10FFFE}'; + +/// Compute the key after the prefix +pub fn key_after_prefix(pfx: &str) -> Option { + let mut next = pfx.to_string(); + while !next.is_empty() { + let tail = next.pop().unwrap(); + if tail >= char::MAX { + continue; + } + + // Circumvent a limitation of RangeFrom that overflow earlier than needed + // See: https://doc.rust-lang.org/core/ops/struct.RangeFrom.html + let new_tail = if tail == UTF8_BEFORE_LAST_CHAR { + char::MAX + } else { + (tail..).nth(1).unwrap() + }; + + next.push(new_tail); + return Some(next); + } + + None +} + #[cfg(test)] mod tests { use super::*; + #[test] + fn parse_bucket_containing_a_key() -> Result<(), Error> { + let (bucket, key) = parse_bucket_key("/my_bucket/a/super/file.jpg", None)?; + assert_eq!(bucket, "my_bucket"); + assert_eq!(key.expect("key must be set"), "a/super/file.jpg"); + Ok(()) + } + + #[test] + fn parse_bucket_containing_no_key() -> Result<(), Error> { + let (bucket, key) = parse_bucket_key("/my_bucket/", None)?; + assert_eq!(bucket, "my_bucket"); + assert!(key.is_none()); + let (bucket, key) = parse_bucket_key("/my_bucket", None)?; + assert_eq!(bucket, "my_bucket"); + assert!(key.is_none()); + Ok(()) + } + + #[test] + fn parse_bucket_containing_no_bucket() { + let parsed = parse_bucket_key("", None); + assert!(parsed.is_err()); + let parsed = parse_bucket_key("/", None); + assert!(parsed.is_err()); + let parsed = parse_bucket_key("////", None); + assert!(parsed.is_err()); + } + + #[test] + fn parse_bucket_with_vhost_and_key() -> Result<(), Error> { + let (bucket, key) = parse_bucket_key("/a/super/file.jpg", Some("my-bucket"))?; + assert_eq!(bucket, "my-bucket"); + assert_eq!(key.expect("key must be set"), "a/super/file.jpg"); + Ok(()) + } + + #[test] + fn parse_bucket_with_vhost_no_key() -> Result<(), Error> { + let (bucket, key) = parse_bucket_key("", Some("my-bucket"))?; + assert_eq!(bucket, "my-bucket"); + assert!(key.is_none()); + let (bucket, key) = parse_bucket_key("/", Some("my-bucket"))?; + assert_eq!(bucket, "my-bucket"); + assert!(key.is_none()); + Ok(()) + } + #[test] fn authority_to_host_with_port() -> Result<(), Error> { let domain = authority_to_host("[::1]:3902")?; @@ -111,4 +262,39 @@ mod tests { assert_eq!(host_to_bucket("not-garage.tld", "garage.tld"), None); assert_eq!(host_to_bucket("not-garage.tld", ".garage.tld"), None); } + + #[test] + fn test_key_after_prefix() { + use std::iter::FromIterator; + + assert_eq!(UTF8_BEFORE_LAST_CHAR as u32, (char::MAX as u32) - 1); + assert_eq!(key_after_prefix("a/b/").unwrap().as_str(), "a/b0"); + assert_eq!(key_after_prefix("€").unwrap().as_str(), "₭"); + assert_eq!( + key_after_prefix("􏿽").unwrap().as_str(), + String::from(char::from_u32(0x10FFFE).unwrap()) + ); + + // When the last character is the biggest UTF8 char + let a = String::from_iter(['a', char::MAX].iter()); + 
assert_eq!(key_after_prefix(a.as_str()).unwrap().as_str(), "b"); + + // When all characters are the biggest UTF8 char + let b = String::from_iter([char::MAX; 3].iter()); + assert!(key_after_prefix(b.as_str()).is_none()); + + // Check utf8 surrogates + let c = String::from('\u{D7FF}'); + assert_eq!( + key_after_prefix(c.as_str()).unwrap().as_str(), + String::from('\u{E000}') + ); + + // Check the character before the biggest one + let d = String::from('\u{10FFFE}'); + assert_eq!( + key_after_prefix(d.as_str()).unwrap().as_str(), + String::from(char::MAX) + ); + } } diff --git a/src/api/k2v/api_server.rs b/src/api/k2v/api_server.rs new file mode 100644 index 00000000..5f5e9030 --- /dev/null +++ b/src/api/k2v/api_server.rs @@ -0,0 +1,195 @@ +use std::sync::Arc; + +use async_trait::async_trait; + +use futures::future::Future; +use hyper::{Body, Method, Request, Response}; + +use opentelemetry::{trace::SpanRef, KeyValue}; + +use garage_table::util::*; +use garage_util::error::Error as GarageError; + +use garage_model::garage::Garage; + +use crate::error::*; +use crate::generic_server::*; + +use crate::signature::payload::check_payload_signature; +use crate::signature::streaming::*; + +use crate::helpers::*; +use crate::k2v::batch::*; +use crate::k2v::index::*; +use crate::k2v::item::*; +use crate::k2v::router::Endpoint; +use crate::s3::cors::*; + +pub struct K2VApiServer { + garage: Arc, +} + +pub(crate) struct K2VApiEndpoint { + bucket_name: String, + endpoint: Endpoint, +} + +impl K2VApiServer { + pub async fn run( + garage: Arc, + shutdown_signal: impl Future, + ) -> Result<(), GarageError> { + if let Some(cfg) = &garage.config.k2v_api { + let bind_addr = cfg.api_bind_addr; + + ApiServer::new( + garage.config.s3_api.s3_region.clone(), + K2VApiServer { garage }, + ) + .run_server(bind_addr, shutdown_signal) + .await + } else { + Ok(()) + } + } +} + +#[async_trait] +impl ApiHandler for K2VApiServer { + const API_NAME: &'static str = "k2v"; + const API_NAME_DISPLAY: &'static str = "K2V"; + + type Endpoint = K2VApiEndpoint; + + fn parse_endpoint(&self, req: &Request) -> Result { + let (endpoint, bucket_name) = Endpoint::from_request(req)?; + + Ok(K2VApiEndpoint { + bucket_name, + endpoint, + }) + } + + async fn handle( + &self, + req: Request, + endpoint: K2VApiEndpoint, + ) -> Result, Error> { + let K2VApiEndpoint { + bucket_name, + endpoint, + } = endpoint; + let garage = self.garage.clone(); + + // The OPTIONS method is procesed early, before we even check for an API key + if let Endpoint::Options = endpoint { + return handle_options_s3api(garage, &req, Some(bucket_name)).await; + } + + let (api_key, mut content_sha256) = check_payload_signature(&garage, "k2v", &req).await?; + let api_key = api_key.ok_or_else(|| { + Error::Forbidden("Garage does not support anonymous access yet".to_string()) + })?; + + let req = parse_streaming_body( + &api_key, + req, + &mut content_sha256, + &garage.config.s3_api.s3_region, + "k2v", + )?; + + let bucket_id = resolve_bucket(&garage, &bucket_name, &api_key).await?; + let bucket = garage + .bucket_table + .get(&EmptyKey, &bucket_id) + .await? 
+ .filter(|b| !b.state.is_deleted()) + .ok_or(Error::NoSuchBucket)?; + + let allowed = match endpoint.authorization_type() { + Authorization::Read => api_key.allow_read(&bucket_id), + Authorization::Write => api_key.allow_write(&bucket_id), + Authorization::Owner => api_key.allow_owner(&bucket_id), + _ => unreachable!(), + }; + + if !allowed { + return Err(Error::Forbidden( + "Operation is not allowed for this key.".to_string(), + )); + } + + // Look up what CORS rule might apply to response. + // Requests for methods different than GET, HEAD or POST + // are always preflighted, i.e. the browser should make + // an OPTIONS call before to check it is allowed + let matching_cors_rule = match *req.method() { + Method::GET | Method::HEAD | Method::POST => find_matching_cors_rule(&bucket, &req)?, + _ => None, + }; + + let resp = match endpoint { + Endpoint::DeleteItem { + partition_key, + sort_key, + } => handle_delete_item(garage, req, bucket_id, &partition_key, &sort_key).await, + Endpoint::InsertItem { + partition_key, + sort_key, + } => handle_insert_item(garage, req, bucket_id, &partition_key, &sort_key).await, + Endpoint::ReadItem { + partition_key, + sort_key, + } => handle_read_item(garage, &req, bucket_id, &partition_key, &sort_key).await, + Endpoint::PollItem { + partition_key, + sort_key, + causality_token, + timeout, + } => { + handle_poll_item( + garage, + &req, + bucket_id, + partition_key, + sort_key, + causality_token, + timeout, + ) + .await + } + Endpoint::ReadIndex { + prefix, + start, + end, + limit, + reverse, + } => handle_read_index(garage, bucket_id, prefix, start, end, limit, reverse).await, + Endpoint::InsertBatch {} => handle_insert_batch(garage, bucket_id, req).await, + Endpoint::ReadBatch {} => handle_read_batch(garage, bucket_id, req).await, + Endpoint::DeleteBatch {} => handle_delete_batch(garage, bucket_id, req).await, + Endpoint::Options => unreachable!(), + }; + + // If request was a success and we have a CORS rule that applies to it, + // add the corresponding CORS headers to the response + let mut resp_ok = resp?; + if let Some(rule) = matching_cors_rule { + add_cors_headers(&mut resp_ok, rule) + .ok_or_internal_error("Invalid bucket CORS configuration")?; + } + + Ok(resp_ok) + } +} + +impl ApiEndpoint for K2VApiEndpoint { + fn name(&self) -> &'static str { + self.endpoint.name() + } + + fn add_span_attributes(&self, span: SpanRef<'_>) { + span.set_attribute(KeyValue::new("bucket", self.bucket_name.clone())); + } +} diff --git a/src/api/k2v/batch.rs b/src/api/k2v/batch.rs new file mode 100644 index 00000000..4ecddeb9 --- /dev/null +++ b/src/api/k2v/batch.rs @@ -0,0 +1,368 @@ +use std::sync::Arc; + +use hyper::{Body, Request, Response, StatusCode}; +use serde::{Deserialize, Serialize}; + +use garage_util::data::*; +use garage_util::error::Error as GarageError; + +use garage_table::{EnumerationOrder, TableSchema}; + +use garage_model::garage::Garage; +use garage_model::k2v::causality::*; +use garage_model::k2v::item_table::*; + +use crate::error::*; +use crate::k2v::range::read_range; + +pub async fn handle_insert_batch( + garage: Arc, + bucket_id: Uuid, + req: Request, +) -> Result, Error> { + let body = hyper::body::to_bytes(req.into_body()).await?; + let items: Vec = + serde_json::from_slice(&body).ok_or_bad_request("Invalid JSON")?; + + let mut items2 = vec![]; + for it in items { + let ct = it + .ct + .map(|s| CausalContext::parse(&s)) + .transpose() + .ok_or_bad_request("Invalid causality token")?; + let v = match it.v { + Some(vs) => { + 
DvvsValue::Value(base64::decode(vs).ok_or_bad_request("Invalid base64 value")?) + } + None => DvvsValue::Deleted, + }; + items2.push((it.pk, it.sk, ct, v)); + } + + garage.k2v.rpc.insert_batch(bucket_id, items2).await?; + + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::empty())?) +} + +pub async fn handle_read_batch( + garage: Arc, + bucket_id: Uuid, + req: Request, +) -> Result, Error> { + let body = hyper::body::to_bytes(req.into_body()).await?; + let queries: Vec = + serde_json::from_slice(&body).ok_or_bad_request("Invalid JSON")?; + + let resp_results = futures::future::join_all( + queries + .into_iter() + .map(|q| handle_read_batch_query(&garage, bucket_id, q)), + ) + .await; + + let mut resps: Vec = vec![]; + for resp in resp_results { + resps.push(resp?); + } + + let resp_json = serde_json::to_string_pretty(&resps).map_err(GarageError::from)?; + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::from(resp_json))?) +} + +async fn handle_read_batch_query( + garage: &Arc, + bucket_id: Uuid, + query: ReadBatchQuery, +) -> Result { + let partition = K2VItemPartition { + bucket_id, + partition_key: query.partition_key.clone(), + }; + + let filter = ItemFilter { + exclude_only_tombstones: !query.tombstones, + conflicts_only: query.conflicts_only, + }; + + let (items, more, next_start) = if query.single_item { + if query.prefix.is_some() || query.end.is_some() || query.limit.is_some() || query.reverse { + return Err(Error::BadRequest("Batch query parameters 'prefix', 'end', 'limit' and 'reverse' must not be set when singleItem is true.".into())); + } + let sk = query + .start + .as_ref() + .ok_or_bad_request("start should be specified if single_item is set")?; + let item = garage + .k2v + .item_table + .get(&partition, sk) + .await? + .filter(|e| K2VItemTable::matches_filter(e, &filter)); + match item { + Some(i) => (vec![ReadBatchResponseItem::from(i)], false, None), + None => (vec![], false, None), + } + } else { + let (items, more, next_start) = read_range( + &garage.k2v.item_table, + &partition, + &query.prefix, + &query.start, + &query.end, + query.limit, + Some(filter), + EnumerationOrder::from_reverse(query.reverse), + ) + .await?; + + let items = items + .into_iter() + .map(ReadBatchResponseItem::from) + .collect::>(); + + (items, more, next_start) + }; + + Ok(ReadBatchResponse { + partition_key: query.partition_key, + prefix: query.prefix, + start: query.start, + end: query.end, + limit: query.limit, + reverse: query.reverse, + single_item: query.single_item, + conflicts_only: query.conflicts_only, + tombstones: query.tombstones, + items, + more, + next_start, + }) +} + +pub async fn handle_delete_batch( + garage: Arc, + bucket_id: Uuid, + req: Request, +) -> Result, Error> { + let body = hyper::body::to_bytes(req.into_body()).await?; + let queries: Vec = + serde_json::from_slice(&body).ok_or_bad_request("Invalid JSON")?; + + let resp_results = futures::future::join_all( + queries + .into_iter() + .map(|q| handle_delete_batch_query(&garage, bucket_id, q)), + ) + .await; + + let mut resps: Vec = vec![]; + for resp in resp_results { + resps.push(resp?); + } + + let resp_json = serde_json::to_string_pretty(&resps).map_err(GarageError::from)?; + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::from(resp_json))?) 
+} + +async fn handle_delete_batch_query( + garage: &Arc, + bucket_id: Uuid, + query: DeleteBatchQuery, +) -> Result { + let partition = K2VItemPartition { + bucket_id, + partition_key: query.partition_key.clone(), + }; + + let filter = ItemFilter { + exclude_only_tombstones: true, + conflicts_only: false, + }; + + let deleted_items = if query.single_item { + if query.prefix.is_some() || query.end.is_some() { + return Err(Error::BadRequest("Batch query parameters 'prefix' and 'end' must not be set when singleItem is true.".into())); + } + let sk = query + .start + .as_ref() + .ok_or_bad_request("start should be specified if single_item is set")?; + let item = garage + .k2v + .item_table + .get(&partition, sk) + .await? + .filter(|e| K2VItemTable::matches_filter(e, &filter)); + match item { + Some(i) => { + let cc = i.causal_context(); + garage + .k2v + .rpc + .insert( + bucket_id, + i.partition.partition_key, + i.sort_key, + Some(cc), + DvvsValue::Deleted, + ) + .await?; + 1 + } + None => 0, + } + } else { + let (items, more, _next_start) = read_range( + &garage.k2v.item_table, + &partition, + &query.prefix, + &query.start, + &query.end, + None, + Some(filter), + EnumerationOrder::Forward, + ) + .await?; + assert!(!more); + + // TODO delete items + let items = items + .into_iter() + .map(|i| { + let cc = i.causal_context(); + ( + i.partition.partition_key, + i.sort_key, + Some(cc), + DvvsValue::Deleted, + ) + }) + .collect::>(); + let n = items.len(); + + garage.k2v.rpc.insert_batch(bucket_id, items).await?; + + n + }; + + Ok(DeleteBatchResponse { + partition_key: query.partition_key, + prefix: query.prefix, + start: query.start, + end: query.end, + single_item: query.single_item, + deleted_items, + }) +} + +#[derive(Deserialize)] +struct InsertBatchItem { + pk: String, + sk: String, + ct: Option, + v: Option, +} + +#[derive(Deserialize)] +struct ReadBatchQuery { + #[serde(rename = "partitionKey")] + partition_key: String, + #[serde(default)] + prefix: Option, + #[serde(default)] + start: Option, + #[serde(default)] + end: Option, + #[serde(default)] + limit: Option, + #[serde(default)] + reverse: bool, + #[serde(default, rename = "singleItem")] + single_item: bool, + #[serde(default, rename = "conflictsOnly")] + conflicts_only: bool, + #[serde(default)] + tombstones: bool, +} + +#[derive(Serialize)] +struct ReadBatchResponse { + #[serde(rename = "partitionKey")] + partition_key: String, + prefix: Option, + start: Option, + end: Option, + limit: Option, + reverse: bool, + #[serde(rename = "singleItem")] + single_item: bool, + #[serde(rename = "conflictsOnly")] + conflicts_only: bool, + tombstones: bool, + + items: Vec, + more: bool, + #[serde(rename = "nextStart")] + next_start: Option, +} + +#[derive(Serialize)] +struct ReadBatchResponseItem { + sk: String, + ct: String, + v: Vec>, +} + +impl ReadBatchResponseItem { + fn from(i: K2VItem) -> Self { + let ct = i.causal_context().serialize(); + let v = i + .values() + .iter() + .map(|v| match v { + DvvsValue::Value(x) => Some(base64::encode(x)), + DvvsValue::Deleted => None, + }) + .collect::>(); + Self { + sk: i.sort_key, + ct, + v, + } + } +} + +#[derive(Deserialize)] +struct DeleteBatchQuery { + #[serde(rename = "partitionKey")] + partition_key: String, + #[serde(default)] + prefix: Option, + #[serde(default)] + start: Option, + #[serde(default)] + end: Option, + #[serde(default, rename = "singleItem")] + single_item: bool, +} + +#[derive(Serialize)] +struct DeleteBatchResponse { + #[serde(rename = "partitionKey")] + partition_key: 
String, + prefix: Option, + start: Option, + end: Option, + #[serde(rename = "singleItem")] + single_item: bool, + + #[serde(rename = "deletedItems")] + deleted_items: usize, +} diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs new file mode 100644 index 00000000..896dbcf0 --- /dev/null +++ b/src/api/k2v/index.rs @@ -0,0 +1,100 @@ +use std::sync::Arc; + +use hyper::{Body, Response, StatusCode}; +use serde::Serialize; + +use garage_util::data::*; +use garage_util::error::Error as GarageError; + +use garage_rpc::ring::Ring; +use garage_table::util::*; + +use garage_model::garage::Garage; +use garage_model::k2v::counter_table::{BYTES, CONFLICTS, ENTRIES, VALUES}; + +use crate::error::*; +use crate::k2v::range::read_range; + +pub async fn handle_read_index( + garage: Arc, + bucket_id: Uuid, + prefix: Option, + start: Option, + end: Option, + limit: Option, + reverse: Option, +) -> Result, Error> { + let reverse = reverse.unwrap_or(false); + + let ring: Arc = garage.system.ring.borrow().clone(); + + let (partition_keys, more, next_start) = read_range( + &garage.k2v.counter_table.table, + &bucket_id, + &prefix, + &start, + &end, + limit, + Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())), + EnumerationOrder::from_reverse(reverse), + ) + .await?; + + let s_entries = ENTRIES.to_string(); + let s_conflicts = CONFLICTS.to_string(); + let s_values = VALUES.to_string(); + let s_bytes = BYTES.to_string(); + + let resp = ReadIndexResponse { + prefix, + start, + end, + limit, + reverse, + partition_keys: partition_keys + .into_iter() + .map(|part| { + let vals = part.filtered_values(&ring); + ReadIndexResponseEntry { + pk: part.sk, + entries: *vals.get(&s_entries).unwrap_or(&0), + conflicts: *vals.get(&s_conflicts).unwrap_or(&0), + values: *vals.get(&s_values).unwrap_or(&0), + bytes: *vals.get(&s_bytes).unwrap_or(&0), + } + }) + .collect::>(), + more, + next_start, + }; + + let resp_json = serde_json::to_string_pretty(&resp).map_err(GarageError::from)?; + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::from(resp_json))?) 
+} + +#[derive(Serialize)] +struct ReadIndexResponse { + prefix: Option, + start: Option, + end: Option, + limit: Option, + reverse: bool, + + #[serde(rename = "partitionKeys")] + partition_keys: Vec, + + more: bool, + #[serde(rename = "nextStart")] + next_start: Option, +} + +#[derive(Serialize)] +struct ReadIndexResponseEntry { + pk: String, + entries: i64, + conflicts: i64, + values: i64, + bytes: i64, +} diff --git a/src/api/k2v/item.rs b/src/api/k2v/item.rs new file mode 100644 index 00000000..1860863e --- /dev/null +++ b/src/api/k2v/item.rs @@ -0,0 +1,230 @@ +use std::sync::Arc; + +use http::header; + +use hyper::{Body, Request, Response, StatusCode}; + +use garage_util::data::*; + +use garage_model::garage::Garage; +use garage_model::k2v::causality::*; +use garage_model::k2v::item_table::*; + +use crate::error::*; + +pub const X_GARAGE_CAUSALITY_TOKEN: &str = "X-Garage-Causality-Token"; + +pub enum ReturnFormat { + Json, + Binary, + Either, +} + +impl ReturnFormat { + pub fn from(req: &Request) -> Result { + let accept = match req.headers().get(header::ACCEPT) { + Some(a) => a.to_str()?, + None => return Ok(Self::Json), + }; + + let accept = accept.split(',').map(|s| s.trim()).collect::>(); + let accept_json = accept.contains(&"application/json") || accept.contains(&"*/*"); + let accept_binary = accept.contains(&"application/octet-stream") || accept.contains(&"*/*"); + + match (accept_json, accept_binary) { + (true, true) => Ok(Self::Either), + (true, false) => Ok(Self::Json), + (false, true) => Ok(Self::Binary), + (false, false) => Err(Error::NotAcceptable("Invalid Accept: header value, must contain either application/json or application/octet-stream (or both)".into())), + } + } + + pub fn make_response(&self, item: &K2VItem) -> Result, Error> { + let vals = item.values(); + + if vals.is_empty() { + return Err(Error::NoSuchKey); + } + + let ct = item.causal_context().serialize(); + match self { + Self::Binary if vals.len() > 1 => Ok(Response::builder() + .header(X_GARAGE_CAUSALITY_TOKEN, ct) + .status(StatusCode::CONFLICT) + .body(Body::empty())?), + Self::Binary => { + assert!(vals.len() == 1); + Self::make_binary_response(ct, vals[0]) + } + Self::Either if vals.len() == 1 => Self::make_binary_response(ct, vals[0]), + _ => Self::make_json_response(ct, &vals[..]), + } + } + + fn make_binary_response(ct: String, v: &DvvsValue) -> Result, Error> { + match v { + DvvsValue::Deleted => Ok(Response::builder() + .header(X_GARAGE_CAUSALITY_TOKEN, ct) + .header(header::CONTENT_TYPE, "application/octet-stream") + .status(StatusCode::NO_CONTENT) + .body(Body::empty())?), + DvvsValue::Value(v) => Ok(Response::builder() + .header(X_GARAGE_CAUSALITY_TOKEN, ct) + .header(header::CONTENT_TYPE, "application/octet-stream") + .status(StatusCode::OK) + .body(Body::from(v.to_vec()))?), + } + } + + fn make_json_response(ct: String, v: &[&DvvsValue]) -> Result, Error> { + let items = v + .iter() + .map(|v| match v { + DvvsValue::Deleted => serde_json::Value::Null, + DvvsValue::Value(v) => serde_json::Value::String(base64::encode(v)), + }) + .collect::>(); + let json_body = + serde_json::to_string_pretty(&items).ok_or_internal_error("JSON encoding error")?; + Ok(Response::builder() + .header(X_GARAGE_CAUSALITY_TOKEN, ct) + .header(header::CONTENT_TYPE, "application/json") + .status(StatusCode::OK) + .body(Body::from(json_body))?) 
+ } +} + +/// Handle ReadItem request +#[allow(clippy::ptr_arg)] +pub async fn handle_read_item( + garage: Arc, + req: &Request, + bucket_id: Uuid, + partition_key: &str, + sort_key: &String, +) -> Result, Error> { + let format = ReturnFormat::from(req)?; + + let item = garage + .k2v + .item_table + .get( + &K2VItemPartition { + bucket_id, + partition_key: partition_key.to_string(), + }, + sort_key, + ) + .await? + .ok_or(Error::NoSuchKey)?; + + format.make_response(&item) +} + +pub async fn handle_insert_item( + garage: Arc, + req: Request, + bucket_id: Uuid, + partition_key: &str, + sort_key: &str, +) -> Result, Error> { + let causal_context = req + .headers() + .get(X_GARAGE_CAUSALITY_TOKEN) + .map(|s| s.to_str()) + .transpose()? + .map(CausalContext::parse) + .transpose() + .ok_or_bad_request("Invalid causality token")?; + + let body = hyper::body::to_bytes(req.into_body()).await?; + let value = DvvsValue::Value(body.to_vec()); + + garage + .k2v + .rpc + .insert( + bucket_id, + partition_key.to_string(), + sort_key.to_string(), + causal_context, + value, + ) + .await?; + + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::empty())?) +} + +pub async fn handle_delete_item( + garage: Arc, + req: Request, + bucket_id: Uuid, + partition_key: &str, + sort_key: &str, +) -> Result, Error> { + let causal_context = req + .headers() + .get(X_GARAGE_CAUSALITY_TOKEN) + .map(|s| s.to_str()) + .transpose()? + .map(CausalContext::parse) + .transpose() + .ok_or_bad_request("Invalid causality token")?; + + let value = DvvsValue::Deleted; + + garage + .k2v + .rpc + .insert( + bucket_id, + partition_key.to_string(), + sort_key.to_string(), + causal_context, + value, + ) + .await?; + + Ok(Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::empty())?) +} + +/// Handle ReadItem request +#[allow(clippy::ptr_arg)] +pub async fn handle_poll_item( + garage: Arc, + req: &Request, + bucket_id: Uuid, + partition_key: String, + sort_key: String, + causality_token: String, + timeout_secs: Option, +) -> Result, Error> { + let format = ReturnFormat::from(req)?; + + let causal_context = + CausalContext::parse(&causality_token).ok_or_bad_request("Invalid causality token")?; + + let item = garage + .k2v + .rpc + .poll( + bucket_id, + partition_key, + sort_key, + causal_context, + timeout_secs.unwrap_or(300) * 1000, + ) + .await?; + + if let Some(item) = item { + format.make_response(&item) + } else { + Ok(Response::builder() + .status(StatusCode::NOT_MODIFIED) + .body(Body::empty())?) + } +} diff --git a/src/api/k2v/mod.rs b/src/api/k2v/mod.rs new file mode 100644 index 00000000..ee210ad5 --- /dev/null +++ b/src/api/k2v/mod.rs @@ -0,0 +1,8 @@ +pub mod api_server; +mod router; + +mod batch; +mod index; +mod item; + +mod range; diff --git a/src/api/k2v/range.rs b/src/api/k2v/range.rs new file mode 100644 index 00000000..cd019723 --- /dev/null +++ b/src/api/k2v/range.rs @@ -0,0 +1,96 @@ +//! Utility module for retrieving ranges of items in Garage tables +//! Implements parameters (prefix, start, end, limit) as specified +//! for endpoints ReadIndex, ReadBatch and DeleteBatch + +use std::sync::Arc; + +use garage_table::replication::TableShardedReplication; +use garage_table::*; + +use crate::error::*; +use crate::helpers::key_after_prefix; + +/// Read range in a Garage table. 
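+/// Entries are fetched from the table in batches of at most 1000 until the limit
+/// is reached, a key equal to `end` is encountered, or the prefix stops matching.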
+/// Returns (entries, more?, nextStart) +#[allow(clippy::too_many_arguments)] +pub(crate) async fn read_range( + table: &Arc>, + partition_key: &F::P, + prefix: &Option, + start: &Option, + end: &Option, + limit: Option, + filter: Option, + enumeration_order: EnumerationOrder, +) -> Result<(Vec, bool, Option), Error> +where + F: TableSchema + 'static, +{ + let (mut start, mut start_ignore) = match (prefix, start) { + (None, None) => (None, false), + (None, Some(s)) => (Some(s.clone()), false), + (Some(p), Some(s)) => { + if !s.starts_with(p) { + return Err(Error::BadRequest(format!( + "Start key '{}' does not start with prefix '{}'", + s, p + ))); + } + (Some(s.clone()), false) + } + (Some(p), None) if enumeration_order == EnumerationOrder::Reverse => { + let start = key_after_prefix(p) + .ok_or_internal_error("Sorry, can't list this prefix in reverse order")?; + (Some(start), true) + } + (Some(p), None) => (Some(p.clone()), false), + }; + + let mut entries = vec![]; + loop { + let n_get = std::cmp::min( + 1000, + limit.map(|x| x as usize).unwrap_or(usize::MAX - 10) - entries.len() + 2, + ); + let get_ret = table + .get_range( + partition_key, + start.clone(), + filter.clone(), + n_get, + enumeration_order, + ) + .await?; + + let get_ret_len = get_ret.len(); + + for entry in get_ret { + if start_ignore && Some(entry.sort_key()) == start.as_ref() { + continue; + } + if let Some(p) = prefix { + if !entry.sort_key().starts_with(p) { + return Ok((entries, false, None)); + } + } + if let Some(e) = end { + if entry.sort_key() == e { + return Ok((entries, false, None)); + } + } + if let Some(l) = limit { + if entries.len() >= l as usize { + return Ok((entries, true, Some(entry.sort_key().clone()))); + } + } + entries.push(entry); + } + + if get_ret_len < n_get { + return Ok((entries, false, None)); + } + + start = Some(entries.last().unwrap().sort_key().clone()); + start_ignore = true; + } +} diff --git a/src/api/k2v/router.rs b/src/api/k2v/router.rs new file mode 100644 index 00000000..f948ffce --- /dev/null +++ b/src/api/k2v/router.rs @@ -0,0 +1,252 @@ +use crate::error::*; + +use std::borrow::Cow; + +use hyper::{Method, Request}; + +use crate::helpers::Authorization; +use crate::router_macros::{generateQueryParameters, router_match}; + +router_match! {@func + + +/// List of all K2V API endpoints. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Endpoint { + DeleteBatch { + }, + DeleteItem { + partition_key: String, + sort_key: String, + }, + InsertBatch { + }, + InsertItem { + partition_key: String, + sort_key: String, + }, + Options, + PollItem { + partition_key: String, + sort_key: String, + causality_token: String, + timeout: Option, + }, + ReadBatch { + }, + ReadIndex { + prefix: Option, + start: Option, + end: Option, + limit: Option, + reverse: Option, + }, + ReadItem { + partition_key: String, + sort_key: String, + }, +}} + +impl Endpoint { + /// Determine which S3 endpoint a request is for using the request, and a bucket which was + /// possibly extracted from the Host header. 
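+ /// In the K2V API the first path segment is the bucket and the rest of the path is
+ /// the partition key, so e.g. `GET /somebucket/somepartition?sort_key=a`
+ /// (illustrative names) is parsed as a ReadItem request.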
+ /// Returns Self plus bucket name, if endpoint is not Endpoint::ListBuckets + pub fn from_request(req: &Request) -> Result<(Self, String), Error> { + let uri = req.uri(); + let path = uri.path().trim_start_matches('/'); + let query = uri.query(); + + let (bucket, partition_key) = path + .split_once('/') + .map(|(b, p)| (b.to_owned(), p.trim_start_matches('/'))) + .unwrap_or((path.to_owned(), "")); + + if bucket.is_empty() { + return Err(Error::BadRequest("Missing bucket name".to_owned())); + } + + if *req.method() == Method::OPTIONS { + return Ok((Self::Options, bucket)); + } + + let partition_key = percent_encoding::percent_decode_str(partition_key) + .decode_utf8()? + .into_owned(); + + let mut query = QueryParameters::from_query(query.unwrap_or_default())?; + + let method_search = Method::from_bytes(b"SEARCH").unwrap(); + let res = match *req.method() { + Method::GET => Self::from_get(partition_key, &mut query)?, + //&Method::HEAD => Self::from_head(partition_key, &mut query)?, + Method::POST => Self::from_post(partition_key, &mut query)?, + Method::PUT => Self::from_put(partition_key, &mut query)?, + Method::DELETE => Self::from_delete(partition_key, &mut query)?, + _ if req.method() == method_search => Self::from_search(partition_key, &mut query)?, + _ => return Err(Error::BadRequest("Unknown method".to_owned())), + }; + + if let Some(message) = query.nonempty_message() { + debug!("Unused query parameter: {}", message) + } + Ok((res, bucket)) + } + + /// Determine which endpoint a request is for, knowing it is a GET. + fn from_get(partition_key: String, query: &mut QueryParameters<'_>) -> Result { + router_match! { + @gen_parser + (query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None), + key: [ + EMPTY if causality_token => PollItem (query::sort_key, query::causality_token, opt_parse::timeout), + EMPTY => ReadItem (query::sort_key), + ], + no_key: [ + EMPTY => ReadIndex (query_opt::prefix, query_opt::start, query_opt::end, opt_parse::limit, opt_parse::reverse), + ] + } + } + + /// Determine which endpoint a request is for, knowing it is a SEARCH. + fn from_search(partition_key: String, query: &mut QueryParameters<'_>) -> Result { + router_match! { + @gen_parser + (query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None), + key: [ + ], + no_key: [ + EMPTY => ReadBatch, + ] + } + } + + /* + /// Determine which endpoint a request is for, knowing it is a HEAD. + fn from_head(partition_key: String, query: &mut QueryParameters<'_>) -> Result { + router_match! { + @gen_parser + (query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None), + key: [ + EMPTY => HeadObject(opt_parse::part_number, query_opt::version_id), + ], + no_key: [ + EMPTY => HeadBucket, + ] + } + } + */ + + /// Determine which endpoint a request is for, knowing it is a POST. + fn from_post(partition_key: String, query: &mut QueryParameters<'_>) -> Result { + router_match! { + @gen_parser + (query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None), + key: [ + ], + no_key: [ + EMPTY => InsertBatch, + DELETE => DeleteBatch, + SEARCH => ReadBatch, + ] + } + } + + /// Determine which endpoint a request is for, knowing it is a PUT. + fn from_put(partition_key: String, query: &mut QueryParameters<'_>) -> Result { + router_match! 
{ + @gen_parser + (query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None), + key: [ + EMPTY => InsertItem (query::sort_key), + + ], + no_key: [ + ] + } + } + + /// Determine which endpoint a request is for, knowing it is a DELETE. + fn from_delete(partition_key: String, query: &mut QueryParameters<'_>) -> Result { + router_match! { + @gen_parser + (query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None), + key: [ + EMPTY => DeleteItem (query::sort_key), + ], + no_key: [ + ] + } + } + + /// Get the partition key the request target. Returns None for requests which don't use a partition key. + #[allow(dead_code)] + pub fn get_partition_key(&self) -> Option<&str> { + router_match! { + @extract + self, + partition_key, + [ + DeleteItem, + InsertItem, + PollItem, + ReadItem, + ] + } + } + + /// Get the sort key the request target. Returns None for requests which don't use a sort key. + #[allow(dead_code)] + pub fn get_sort_key(&self) -> Option<&str> { + router_match! { + @extract + self, + sort_key, + [ + DeleteItem, + InsertItem, + PollItem, + ReadItem, + ] + } + } + + /// Get the kind of authorization which is required to perform the operation. + pub fn authorization_type(&self) -> Authorization { + let readonly = router_match! { + @match + self, + [ + PollItem, + ReadBatch, + ReadIndex, + ReadItem, + ] + }; + if readonly { + Authorization::Read + } else { + Authorization::Write + } + } +} + +// parameter name => struct field +generateQueryParameters! { + "prefix" => prefix, + "start" => start, + "causality_token" => causality_token, + "end" => end, + "limit" => limit, + "reverse" => reverse, + "sort_key" => sort_key, + "timeout" => timeout +} + +mod keywords { + //! This module contain all query parameters with no associated value + //! used to differentiate endpoints. + pub const EMPTY: &str = ""; + + pub const DELETE: &str = "delete"; + pub const SEARCH: &str = "search"; +} diff --git a/src/api/lib.rs b/src/api/lib.rs index de60ec53..0078f7b5 100644 --- a/src/api/lib.rs +++ b/src/api/lib.rs @@ -6,22 +6,12 @@ pub mod error; pub use error::Error; mod encoding; - -mod api_server; -pub use api_server::run_api_server; - +mod generic_server; +pub mod helpers; +mod router_macros; /// This mode is public only to help testing. Don't expect stability here pub mod signature; -pub mod helpers; -mod s3_bucket; -mod s3_copy; -pub mod s3_cors; -mod s3_delete; -pub mod s3_get; -mod s3_list; -mod s3_post_object; -mod s3_put; -mod s3_router; -mod s3_website; -mod s3_xml; +#[cfg(feature = "k2v")] +pub mod k2v; +pub mod s3; diff --git a/src/api/router_macros.rs b/src/api/router_macros.rs new file mode 100644 index 00000000..8471407c --- /dev/null +++ b/src/api/router_macros.rs @@ -0,0 +1,190 @@ +/// This macro is used to generate very repetitive match {} blocks in this module +/// It is _not_ made to be used anywhere else +macro_rules! router_match { + (@match $enum:expr , [ $($endpoint:ident,)* ]) => {{ + // usage: router_match {@match my_enum, [ VariantWithField1, VariantWithField2 ..] } + // returns true if the variant was one of the listed variants, false otherwise. + use Endpoint::*; + match $enum { + $( + $endpoint { .. } => true, + )* + _ => false + } + }}; + (@extract $enum:expr , $param:ident, [ $($endpoint:ident,)* ]) => {{ + // usage: router_match {@extract my_enum, field_name, [ VariantWithField1, VariantWithField2 ..] } + // returns Some(field_value), or None if the variant was not one of the listed variants. 
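+ // e.g. the K2V router's Endpoint::get_partition_key() uses this form to pull the
+ // `partition_key` field out of whichever variant is currently active.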
+ use Endpoint::*; + match $enum { + $( + $endpoint {$param, ..} => Some($param), + )* + _ => None + } + }}; + (@gen_parser ($keyword:expr, $key:ident, $query:expr, $header:expr), + key: [$($kw_k:ident $(if $required_k:ident)? $(header $header_k:expr)? => $api_k:ident $(($($conv_k:ident :: $param_k:ident),*))?,)*], + no_key: [$($kw_nk:ident $(if $required_nk:ident)? $(if_header $header_nk:expr)? => $api_nk:ident $(($($conv_nk:ident :: $param_nk:ident),*))?,)*]) => {{ + // usage: router_match {@gen_parser (keyword, key, query, header), + // key: [ + // SOME_KEYWORD => VariantWithKey, + // ... + // ], + // no_key: [ + // SOME_KEYWORD => VariantWithoutKey, + // ... + // ] + // } + // See in from_{method} for more detailed usage. + use Endpoint::*; + use keywords::*; + match ($keyword, !$key.is_empty()){ + $( + ($kw_k, true) if true $(&& $query.$required_k.is_some())? $(&& $header.contains_key($header_k))? => Ok($api_k { + $key, + $($( + $param_k: router_match!(@@parse_param $query, $conv_k, $param_k), + )*)? + }), + )* + $( + ($kw_nk, false) $(if $query.$required_nk.is_some())? $(if $header.contains($header_nk))? => Ok($api_nk { + $($( + $param_nk: router_match!(@@parse_param $query, $conv_nk, $param_nk), + )*)? + }), + )* + (kw, _) => Err(Error::BadRequest(format!("Invalid endpoint: {}", kw))) + } + }}; + + (@@parse_param $query:expr, query_opt, $param:ident) => {{ + // extract optional query parameter + $query.$param.take().map(|param| param.into_owned()) + }}; + (@@parse_param $query:expr, query, $param:ident) => {{ + // extract mendatory query parameter + $query.$param.take().ok_or_bad_request("Missing argument for endpoint")?.into_owned() + }}; + (@@parse_param $query:expr, opt_parse, $param:ident) => {{ + // extract and parse optional query parameter + // missing parameter is file, however parse error is reported as an error + $query.$param + .take() + .map(|param| param.parse()) + .transpose() + .map_err(|_| Error::BadRequest("Failed to parse query parameter".to_owned()))? + }}; + (@@parse_param $query:expr, parse, $param:ident) => {{ + // extract and parse mandatory query parameter + // both missing and un-parseable parameters are reported as errors + $query.$param.take().ok_or_bad_request("Missing argument for endpoint")? + .parse() + .map_err(|_| Error::BadRequest("Failed to parse query parameter".to_owned()))? + }}; + (@func + $(#[$doc:meta])* + pub enum Endpoint { + $( + $(#[$outer:meta])* + $variant:ident $({ + $($name:ident: $ty:ty,)* + })?, + )* + }) => { + $(#[$doc])* + pub enum Endpoint { + $( + $(#[$outer])* + $variant $({ + $($name: $ty, )* + })?, + )* + } + impl Endpoint { + pub fn name(&self) -> &'static str { + match self { + $(Endpoint::$variant $({ $($name: _,)* .. })? => stringify!($variant),)* + } + } + } + }; + (@if ($($cond:tt)+) then ($($then:tt)*) else ($($else:tt)*)) => { + $($then)* + }; + (@if () then ($($then:tt)*) else ($($else:tt)*)) => { + $($else)* + }; +} + +/// This macro is used to generate part of the code in this module. It must be called only one, and +/// is useless outside of this module. +macro_rules! generateQueryParameters { + ( $($rest:expr => $name:ident),* ) => { + /// Struct containing all query parameters used in endpoints. Think of it as an HashMap, + /// but with keys statically known. + #[derive(Debug, Default)] + struct QueryParameters<'a> { + keyword: Option>, + $( + $name: Option>, + )* + } + + impl<'a> QueryParameters<'a> { + /// Build this struct from the query part of an URI. 
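+ /// For example, a query string of `prefix=a&limit=10&search` (illustrative values)
+ /// fills `prefix` and `limit` and records the value-less `search` parameter as the
+ /// keyword used to select the endpoint.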
+ fn from_query(query: &'a str) -> Result { + let mut res: Self = Default::default(); + for (k, v) in url::form_urlencoded::parse(query.as_bytes()) { + let repeated = match k.as_ref() { + $( + $rest => if !v.is_empty() { + res.$name.replace(v).is_some() + } else { + false + }, + )* + _ => { + if k.starts_with("response-") || k.starts_with("X-Amz-") { + false + } else if v.as_ref().is_empty() { + if res.keyword.replace(k).is_some() { + return Err(Error::BadRequest("Multiple keywords".to_owned())); + } + continue; + } else { + debug!("Received an unknown query parameter: '{}'", k); + false + } + } + }; + if repeated { + return Err(Error::BadRequest(format!( + "Query parameter repeated: '{}'", + k + ))); + } + } + Ok(res) + } + + /// Get an error message in case not all parameters where used when extracting them to + /// build an Enpoint variant + fn nonempty_message(&self) -> Option<&str> { + if self.keyword.is_some() { + Some("Keyword not used") + } $( + else if self.$name.is_some() { + Some(concat!("'", $rest, "'")) + } + )* else { + None + } + } + } + } +} + +pub(crate) use generateQueryParameters; +pub(crate) use router_match; diff --git a/src/api/s3/api_server.rs b/src/api/s3/api_server.rs new file mode 100644 index 00000000..78a69d53 --- /dev/null +++ b/src/api/s3/api_server.rs @@ -0,0 +1,401 @@ +use std::sync::Arc; + +use async_trait::async_trait; + +use futures::future::Future; +use hyper::header; +use hyper::{Body, Method, Request, Response}; + +use opentelemetry::{trace::SpanRef, KeyValue}; + +use garage_table::util::*; +use garage_util::error::Error as GarageError; + +use garage_model::garage::Garage; +use garage_model::key_table::Key; + +use crate::error::*; +use crate::generic_server::*; + +use crate::signature::payload::check_payload_signature; +use crate::signature::streaming::*; + +use crate::helpers::*; +use crate::s3::bucket::*; +use crate::s3::copy::*; +use crate::s3::cors::*; +use crate::s3::delete::*; +use crate::s3::get::*; +use crate::s3::list::*; +use crate::s3::post_object::handle_post_object; +use crate::s3::put::*; +use crate::s3::router::Endpoint; +use crate::s3::website::*; + +pub struct S3ApiServer { + garage: Arc, +} + +pub(crate) struct S3ApiEndpoint { + bucket_name: Option, + endpoint: Endpoint, +} + +impl S3ApiServer { + pub async fn run( + garage: Arc, + shutdown_signal: impl Future, + ) -> Result<(), GarageError> { + let addr = garage.config.s3_api.api_bind_addr; + + ApiServer::new( + garage.config.s3_api.s3_region.clone(), + S3ApiServer { garage }, + ) + .run_server(addr, shutdown_signal) + .await + } + + async fn handle_request_without_bucket( + &self, + _req: Request, + api_key: Key, + endpoint: Endpoint, + ) -> Result, Error> { + match endpoint { + Endpoint::ListBuckets => handle_list_buckets(&self.garage, &api_key).await, + endpoint => Err(Error::NotImplemented(endpoint.name().to_owned())), + } + } +} + +#[async_trait] +impl ApiHandler for S3ApiServer { + const API_NAME: &'static str = "s3"; + const API_NAME_DISPLAY: &'static str = "S3"; + + type Endpoint = S3ApiEndpoint; + + fn parse_endpoint(&self, req: &Request) -> Result { + let authority = req + .headers() + .get(header::HOST) + .ok_or_bad_request("Host header required")? 
+ .to_str()?; + + let host = authority_to_host(authority)?; + + let bucket_name = self + .garage + .config + .s3_api + .root_domain + .as_ref() + .and_then(|root_domain| host_to_bucket(&host, root_domain)); + + let (endpoint, bucket_name) = + Endpoint::from_request(req, bucket_name.map(ToOwned::to_owned))?; + + Ok(S3ApiEndpoint { + bucket_name, + endpoint, + }) + } + + async fn handle( + &self, + req: Request, + endpoint: S3ApiEndpoint, + ) -> Result, Error> { + let S3ApiEndpoint { + bucket_name, + endpoint, + } = endpoint; + let garage = self.garage.clone(); + + // Some endpoints are processed early, before we even check for an API key + if let Endpoint::PostObject = endpoint { + return handle_post_object(garage, req, bucket_name.unwrap()).await; + } + if let Endpoint::Options = endpoint { + return handle_options_s3api(garage, &req, bucket_name).await; + } + + let (api_key, mut content_sha256) = check_payload_signature(&garage, "s3", &req).await?; + let api_key = api_key.ok_or_else(|| { + Error::Forbidden("Garage does not support anonymous access yet".to_string()) + })?; + + let req = parse_streaming_body( + &api_key, + req, + &mut content_sha256, + &garage.config.s3_api.s3_region, + "s3", + )?; + + let bucket_name = match bucket_name { + None => { + return self + .handle_request_without_bucket(req, api_key, endpoint) + .await + } + Some(bucket) => bucket.to_string(), + }; + + // Special code path for CreateBucket API endpoint + if let Endpoint::CreateBucket {} = endpoint { + return handle_create_bucket(&garage, req, content_sha256, api_key, bucket_name).await; + } + + let bucket_id = resolve_bucket(&garage, &bucket_name, &api_key).await?; + let bucket = garage + .bucket_table + .get(&EmptyKey, &bucket_id) + .await? + .filter(|b| !b.state.is_deleted()) + .ok_or(Error::NoSuchBucket)?; + + let allowed = match endpoint.authorization_type() { + Authorization::Read => api_key.allow_read(&bucket_id), + Authorization::Write => api_key.allow_write(&bucket_id), + Authorization::Owner => api_key.allow_owner(&bucket_id), + _ => unreachable!(), + }; + + if !allowed { + return Err(Error::Forbidden( + "Operation is not allowed for this key.".to_string(), + )); + } + + // Look up what CORS rule might apply to response. + // Requests for methods different than GET, HEAD or POST + // are always preflighted, i.e. the browser should make + // an OPTIONS call before to check it is allowed + let matching_cors_rule = match *req.method() { + Method::GET | Method::HEAD | Method::POST => find_matching_cors_rule(&bucket, &req)?, + _ => None, + }; + + let resp = match endpoint { + Endpoint::HeadObject { + key, part_number, .. + } => handle_head(garage, &req, bucket_id, &key, part_number).await, + Endpoint::GetObject { + key, part_number, .. 
+ } => handle_get(garage, &req, bucket_id, &key, part_number).await, + Endpoint::UploadPart { + key, + part_number, + upload_id, + } => { + handle_put_part( + garage, + req, + bucket_id, + &key, + part_number, + &upload_id, + content_sha256, + ) + .await + } + Endpoint::CopyObject { key } => { + handle_copy(garage, &api_key, &req, bucket_id, &key).await + } + Endpoint::UploadPartCopy { + key, + part_number, + upload_id, + } => { + handle_upload_part_copy( + garage, + &api_key, + &req, + bucket_id, + &key, + part_number, + &upload_id, + ) + .await + } + Endpoint::PutObject { key } => { + handle_put(garage, req, bucket_id, &key, content_sha256).await + } + Endpoint::AbortMultipartUpload { key, upload_id } => { + handle_abort_multipart_upload(garage, bucket_id, &key, &upload_id).await + } + Endpoint::DeleteObject { key, .. } => handle_delete(garage, bucket_id, &key).await, + Endpoint::CreateMultipartUpload { key } => { + handle_create_multipart_upload(garage, &req, &bucket_name, bucket_id, &key).await + } + Endpoint::CompleteMultipartUpload { key, upload_id } => { + handle_complete_multipart_upload( + garage, + req, + &bucket_name, + bucket_id, + &key, + &upload_id, + content_sha256, + ) + .await + } + Endpoint::CreateBucket {} => unreachable!(), + Endpoint::HeadBucket {} => { + let empty_body: Body = Body::from(vec![]); + let response = Response::builder().body(empty_body).unwrap(); + Ok(response) + } + Endpoint::DeleteBucket {} => { + handle_delete_bucket(&garage, bucket_id, bucket_name, api_key).await + } + Endpoint::GetBucketLocation {} => handle_get_bucket_location(garage), + Endpoint::GetBucketVersioning {} => handle_get_bucket_versioning(), + Endpoint::ListObjects { + delimiter, + encoding_type, + marker, + max_keys, + prefix, + } => { + handle_list( + garage, + &ListObjectsQuery { + common: ListQueryCommon { + bucket_name, + bucket_id, + delimiter: delimiter.map(|d| d.to_string()), + page_size: max_keys.map(|p| p.clamp(1, 1000)).unwrap_or(1000), + prefix: prefix.unwrap_or_default(), + urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false), + }, + is_v2: false, + marker, + continuation_token: None, + start_after: None, + }, + ) + .await + } + Endpoint::ListObjectsV2 { + delimiter, + encoding_type, + max_keys, + prefix, + continuation_token, + start_after, + list_type, + .. 
+ } => { + if list_type == "2" { + handle_list( + garage, + &ListObjectsQuery { + common: ListQueryCommon { + bucket_name, + bucket_id, + delimiter: delimiter.map(|d| d.to_string()), + page_size: max_keys.map(|p| p.clamp(1, 1000)).unwrap_or(1000), + urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false), + prefix: prefix.unwrap_or_default(), + }, + is_v2: true, + marker: None, + continuation_token, + start_after, + }, + ) + .await + } else { + Err(Error::BadRequest(format!( + "Invalid endpoint: list-type={}", + list_type + ))) + } + } + Endpoint::ListMultipartUploads { + delimiter, + encoding_type, + key_marker, + max_uploads, + prefix, + upload_id_marker, + } => { + handle_list_multipart_upload( + garage, + &ListMultipartUploadsQuery { + common: ListQueryCommon { + bucket_name, + bucket_id, + delimiter: delimiter.map(|d| d.to_string()), + page_size: max_uploads.map(|p| p.clamp(1, 1000)).unwrap_or(1000), + prefix: prefix.unwrap_or_default(), + urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false), + }, + key_marker, + upload_id_marker, + }, + ) + .await + } + Endpoint::ListParts { + key, + max_parts, + part_number_marker, + upload_id, + } => { + handle_list_parts( + garage, + &ListPartsQuery { + bucket_name, + bucket_id, + key, + upload_id, + part_number_marker: part_number_marker.map(|p| p.clamp(1, 10000)), + max_parts: max_parts.map(|p| p.clamp(1, 1000)).unwrap_or(1000), + }, + ) + .await + } + Endpoint::DeleteObjects {} => { + handle_delete_objects(garage, bucket_id, req, content_sha256).await + } + Endpoint::GetBucketWebsite {} => handle_get_website(&bucket).await, + Endpoint::PutBucketWebsite {} => { + handle_put_website(garage, bucket_id, req, content_sha256).await + } + Endpoint::DeleteBucketWebsite {} => handle_delete_website(garage, bucket_id).await, + Endpoint::GetBucketCors {} => handle_get_cors(&bucket).await, + Endpoint::PutBucketCors {} => { + handle_put_cors(garage, bucket_id, req, content_sha256).await + } + Endpoint::DeleteBucketCors {} => handle_delete_cors(garage, bucket_id).await, + endpoint => Err(Error::NotImplemented(endpoint.name().to_owned())), + }; + + // If request was a success and we have a CORS rule that applies to it, + // add the corresponding CORS headers to the response + let mut resp_ok = resp?; + if let Some(rule) = matching_cors_rule { + add_cors_headers(&mut resp_ok, rule) + .ok_or_internal_error("Invalid bucket CORS configuration")?; + } + + Ok(resp_ok) + } +} + +impl ApiEndpoint for S3ApiEndpoint { + fn name(&self) -> &'static str { + self.endpoint.name() + } + + fn add_span_attributes(&self, span: SpanRef<'_>) { + span.set_attribute(KeyValue::new( + "bucket", + self.bucket_name.clone().unwrap_or_default(), + )); + } +} diff --git a/src/api/s3/bucket.rs b/src/api/s3/bucket.rs new file mode 100644 index 00000000..93048a8c --- /dev/null +++ b/src/api/s3/bucket.rs @@ -0,0 +1,358 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use hyper::{Body, Request, Response, StatusCode}; + +use garage_model::bucket_alias_table::*; +use garage_model::bucket_table::Bucket; +use garage_model::garage::Garage; +use garage_model::key_table::Key; +use garage_model::permission::BucketKeyPerm; +use garage_model::s3::object_table::ObjectFilter; +use garage_table::util::*; +use garage_util::crdt::*; +use garage_util::data::*; +use garage_util::time::*; + +use crate::error::*; +use crate::s3::xml as s3_xml; +use crate::signature::verify_signed_content; + +pub fn handle_get_bucket_location(garage: Arc) -> Result, Error> { + let loc = 
s3_xml::LocationConstraint { + xmlns: (), + region: garage.config.s3_api.s3_region.to_string(), + }; + let xml = s3_xml::to_xml_with_header(&loc)?; + + Ok(Response::builder() + .header("Content-Type", "application/xml") + .body(Body::from(xml.into_bytes()))?) +} + +pub fn handle_get_bucket_versioning() -> Result, Error> { + let versioning = s3_xml::VersioningConfiguration { + xmlns: (), + status: None, + }; + + let xml = s3_xml::to_xml_with_header(&versioning)?; + + Ok(Response::builder() + .header("Content-Type", "application/xml") + .body(Body::from(xml.into_bytes()))?) +} + +pub async fn handle_list_buckets(garage: &Garage, api_key: &Key) -> Result, Error> { + let key_p = api_key.params().ok_or_internal_error( + "Key should not be in deleted state at this point (in handle_list_buckets)", + )?; + + // Collect buckets user has access to + let ids = api_key + .state + .as_option() + .unwrap() + .authorized_buckets + .items() + .iter() + .filter(|(_, perms)| perms.is_any()) + .map(|(id, _)| *id) + .collect::>(); + + let mut buckets_by_id = HashMap::new(); + let mut aliases = HashMap::new(); + + for bucket_id in ids.iter() { + let bucket = garage.bucket_table.get(&EmptyKey, bucket_id).await?; + if let Some(bucket) = bucket { + for (alias, _, _active) in bucket.aliases().iter().filter(|(_, _, active)| *active) { + let alias_opt = garage.bucket_alias_table.get(&EmptyKey, alias).await?; + if let Some(alias_ent) = alias_opt { + if *alias_ent.state.get() == Some(*bucket_id) { + aliases.insert(alias_ent.name().to_string(), *bucket_id); + } + } + } + if let Deletable::Present(param) = bucket.state { + buckets_by_id.insert(bucket_id, param); + } + } + } + + for (alias, _, id_opt) in key_p.local_aliases.items() { + if let Some(id) = id_opt { + aliases.insert(alias.clone(), *id); + } + } + + // Generate response + let list_buckets = s3_xml::ListAllMyBucketsResult { + owner: s3_xml::Owner { + display_name: s3_xml::Value(key_p.name.get().to_string()), + id: s3_xml::Value(api_key.key_id.to_string()), + }, + buckets: s3_xml::BucketList { + entries: aliases + .iter() + .filter_map(|(name, id)| buckets_by_id.get(id).map(|p| (name, id, p))) + .map(|(name, _id, param)| s3_xml::Bucket { + creation_date: s3_xml::Value(msec_to_rfc3339(param.creation_date)), + name: s3_xml::Value(name.to_string()), + }) + .collect(), + }, + }; + + let xml = s3_xml::to_xml_with_header(&list_buckets)?; + trace!("xml: {}", xml); + + Ok(Response::builder() + .header("Content-Type", "application/xml") + .body(Body::from(xml))?) 
+} + +pub async fn handle_create_bucket( + garage: &Garage, + req: Request, + content_sha256: Option, + api_key: Key, + bucket_name: String, +) -> Result, Error> { + let body = hyper::body::to_bytes(req.into_body()).await?; + + if let Some(content_sha256) = content_sha256 { + verify_signed_content(content_sha256, &body[..])?; + } + + let cmd = + parse_create_bucket_xml(&body[..]).ok_or_bad_request("Invalid create bucket XML query")?; + + if let Some(location_constraint) = cmd { + if location_constraint != garage.config.s3_api.s3_region { + return Err(Error::BadRequest(format!( + "Cannot satisfy location constraint `{}`: buckets can only be created in region `{}`", + location_constraint, + garage.config.s3_api.s3_region + ))); + } + } + + let key_params = api_key + .params() + .ok_or_internal_error("Key should not be deleted at this point")?; + + let existing_bucket = if let Some(Some(bucket_id)) = key_params.local_aliases.get(&bucket_name) + { + Some(*bucket_id) + } else { + garage + .bucket_helper() + .resolve_global_bucket_name(&bucket_name) + .await? + }; + + if let Some(bucket_id) = existing_bucket { + // Check we have write or owner permission on the bucket, + // in that case it's fine, return 200 OK, bucket exists; + // otherwise return a forbidden error. + let kp = api_key.bucket_permissions(&bucket_id); + if !(kp.allow_write || kp.allow_owner) { + return Err(Error::BucketAlreadyExists); + } + } else { + // Create the bucket! + if !is_valid_bucket_name(&bucket_name) { + return Err(Error::BadRequest(format!( + "{}: {}", + bucket_name, INVALID_BUCKET_NAME_MESSAGE + ))); + } + + let bucket = Bucket::new(); + garage.bucket_table.insert(&bucket).await?; + + garage + .bucket_helper() + .set_bucket_key_permissions(bucket.id, &api_key.key_id, BucketKeyPerm::ALL_PERMISSIONS) + .await?; + + garage + .bucket_helper() + .set_local_bucket_alias(bucket.id, &api_key.key_id, &bucket_name) + .await?; + } + + Ok(Response::builder() + .header("Location", format!("/{}", bucket_name)) + .body(Body::empty()) + .unwrap()) +} + +pub async fn handle_delete_bucket( + garage: &Garage, + bucket_id: Uuid, + bucket_name: String, + api_key: Key, +) -> Result, Error> { + let key_params = api_key + .params() + .ok_or_internal_error("Key should not be deleted at this point")?; + + let is_local_alias = matches!(key_params.local_aliases.get(&bucket_name), Some(Some(_))); + + let mut bucket = garage + .bucket_helper() + .get_existing_bucket(bucket_id) + .await?; + let bucket_state = bucket.state.as_option().unwrap(); + + // If the bucket has no other aliases, this is a true deletion. + // Otherwise, it is just an alias removal. + + let has_other_global_aliases = bucket_state + .aliases + .items() + .iter() + .filter(|(_, _, active)| *active) + .any(|(n, _, _)| is_local_alias || (*n != bucket_name)); + + let has_other_local_aliases = bucket_state + .local_aliases + .items() + .iter() + .filter(|(_, _, active)| *active) + .any(|((k, n), _, _)| !is_local_alias || *n != bucket_name || *k != api_key.key_id); + + if !has_other_global_aliases && !has_other_local_aliases { + // Delete bucket + + // Check bucket is empty + let objects = garage + .object_table + .get_range( + &bucket_id, + None, + Some(ObjectFilter::IsData), + 10, + EnumerationOrder::Forward, + ) + .await?; + if !objects.is_empty() { + return Err(Error::BucketNotEmpty); + } + + // --- done checking, now commit --- + // 1. 
delete bucket alias + if is_local_alias { + garage + .bucket_helper() + .unset_local_bucket_alias(bucket_id, &api_key.key_id, &bucket_name) + .await?; + } else { + garage + .bucket_helper() + .unset_global_bucket_alias(bucket_id, &bucket_name) + .await?; + } + + // 2. delete authorization from keys that had access + for (key_id, _) in bucket.authorized_keys() { + garage + .bucket_helper() + .set_bucket_key_permissions(bucket.id, key_id, BucketKeyPerm::NO_PERMISSIONS) + .await?; + } + + // 3. delete bucket + bucket.state = Deletable::delete(); + garage.bucket_table.insert(&bucket).await?; + } else if is_local_alias { + // Just unalias + garage + .bucket_helper() + .unset_local_bucket_alias(bucket_id, &api_key.key_id, &bucket_name) + .await?; + } else { + // Just unalias (but from global namespace) + garage + .bucket_helper() + .unset_global_bucket_alias(bucket_id, &bucket_name) + .await?; + } + + Ok(Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::empty())?) +} + +fn parse_create_bucket_xml(xml_bytes: &[u8]) -> Option> { + // Returns None if invalid data + // Returns Some(None) if no location constraint is given + // Returns Some(Some("xxxx")) where xxxx is the given location constraint + + let xml_str = std::str::from_utf8(xml_bytes).ok()?; + if xml_str.trim_matches(char::is_whitespace).is_empty() { + return Some(None); + } + + let xml = roxmltree::Document::parse(xml_str).ok()?; + + let cbc = xml.root().first_child()?; + if !cbc.has_tag_name("CreateBucketConfiguration") { + return None; + } + + let mut ret = None; + for item in cbc.children() { + println!("{:?}", item); + if item.has_tag_name("LocationConstraint") { + if ret != None { + return None; + } + ret = Some(item.text()?.to_string()); + } else if !item.is_text() { + return None; + } + } + + Some(ret) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn create_bucket() { + assert_eq!(parse_create_bucket_xml(br#""#), Some(None)); + assert_eq!( + parse_create_bucket_xml( + br#" + + + "# + ), + Some(None) + ); + assert_eq!( + parse_create_bucket_xml( + br#" + + Europe + + "# + ), + Some(Some("Europe".into())) + ); + assert_eq!( + parse_create_bucket_xml( + br#" + + + "# + ), + None + ); + } +} diff --git a/src/api/s3/copy.rs b/src/api/s3/copy.rs new file mode 100644 index 00000000..4e94d887 --- /dev/null +++ b/src/api/s3/copy.rs @@ -0,0 +1,660 @@ +use std::pin::Pin; +use std::sync::Arc; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +use futures::{stream, stream::Stream, StreamExt, TryFutureExt}; +use md5::{Digest as Md5Digest, Md5}; + +use hyper::{Body, Request, Response}; +use serde::Serialize; + +use garage_table::*; +use garage_util::data::*; +use garage_util::time::*; + +use garage_model::garage::Garage; +use garage_model::key_table::Key; +use garage_model::s3::block_ref_table::*; +use garage_model::s3::object_table::*; +use garage_model::s3::version_table::*; + +use crate::error::*; +use crate::helpers::{parse_bucket_key, resolve_bucket}; +use crate::s3::put::{decode_upload_id, get_headers}; +use crate::s3::xml::{self as s3_xml, xmlns_tag}; + +pub async fn handle_copy( + garage: Arc, + api_key: &Key, + req: &Request, + dest_bucket_id: Uuid, + dest_key: &str, +) -> Result, Error> { + let copy_precondition = CopyPreconditionHeaders::parse(req)?; + + let source_object = get_copy_source(&garage, api_key, req).await?; + + let (source_version, source_version_data, source_version_meta) = + extract_source_info(&source_object)?; + + // Check precondition, e.g. 
x-amz-copy-source-if-match + copy_precondition.check(source_version, &source_version_meta.etag)?; + + // Generate parameters for copied object + let new_uuid = gen_uuid(); + let new_timestamp = now_msec(); + + // Implement x-amz-metadata-directive: REPLACE + let new_meta = match req.headers().get("x-amz-metadata-directive") { + Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => ObjectVersionMeta { + headers: get_headers(req.headers())?, + size: source_version_meta.size, + etag: source_version_meta.etag.clone(), + }, + _ => source_version_meta.clone(), + }; + + let etag = new_meta.etag.to_string(); + + // Save object copy + match source_version_data { + ObjectVersionData::DeleteMarker => unreachable!(), + ObjectVersionData::Inline(_meta, bytes) => { + let dest_object_version = ObjectVersion { + uuid: new_uuid, + timestamp: new_timestamp, + state: ObjectVersionState::Complete(ObjectVersionData::Inline( + new_meta, + bytes.clone(), + )), + }; + let dest_object = Object::new( + dest_bucket_id, + dest_key.to_string(), + vec![dest_object_version], + ); + garage.object_table.insert(&dest_object).await?; + } + ObjectVersionData::FirstBlock(_meta, first_block_hash) => { + // Get block list from source version + let source_version = garage + .version_table + .get(&source_version.uuid, &EmptyKey) + .await?; + let source_version = source_version.ok_or(Error::NoSuchKey)?; + + // Write an "uploading" marker in Object table + // This holds a reference to the object in the Version table + // so that it won't be deleted, e.g. by repair_versions. + let tmp_dest_object_version = ObjectVersion { + uuid: new_uuid, + timestamp: new_timestamp, + state: ObjectVersionState::Uploading(new_meta.headers.clone()), + }; + let tmp_dest_object = Object::new( + dest_bucket_id, + dest_key.to_string(), + vec![tmp_dest_object_version], + ); + garage.object_table.insert(&tmp_dest_object).await?; + + // Write version in the version table. Even with empty block list, + // this means that the BlockRef entries linked to this version cannot be + // marked as deleted (they are marked as deleted only if the Version + // doesn't exist or is marked as deleted). + let mut dest_version = + Version::new(new_uuid, dest_bucket_id, dest_key.to_string(), false); + garage.version_table.insert(&dest_version).await?; + + // Fill in block list for version and insert block refs + for (bk, bv) in source_version.blocks.items().iter() { + dest_version.blocks.put(*bk, *bv); + } + let dest_block_refs = dest_version + .blocks + .items() + .iter() + .map(|b| BlockRef { + block: b.1.hash, + version: new_uuid, + deleted: false.into(), + }) + .collect::>(); + futures::try_join!( + garage.version_table.insert(&dest_version), + garage.block_ref_table.insert_many(&dest_block_refs[..]), + )?; + + // Insert final object + // We do this last because otherwise there is a race condition in the case where + // the copy call has the same source and destination (this happens, rclone does + // it to update the modification timestamp for instance). If we did this concurrently + // with the stuff before, the block's reference counts could be decremented before + // they are incremented again for the new version, leading to data being deleted. 
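+ // The completed version below reuses `new_uuid`, so once it is inserted it
+ // replaces the temporary "uploading" marker written earlier.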
+ let dest_object_version = ObjectVersion { + uuid: new_uuid, + timestamp: new_timestamp, + state: ObjectVersionState::Complete(ObjectVersionData::FirstBlock( + new_meta, + *first_block_hash, + )), + }; + let dest_object = Object::new( + dest_bucket_id, + dest_key.to_string(), + vec![dest_object_version], + ); + garage.object_table.insert(&dest_object).await?; + } + } + + let last_modified = msec_to_rfc3339(new_timestamp); + let result = CopyObjectResult { + last_modified: s3_xml::Value(last_modified), + etag: s3_xml::Value(format!("\"{}\"", etag)), + }; + let xml = s3_xml::to_xml_with_header(&result)?; + + Ok(Response::builder() + .header("Content-Type", "application/xml") + .header("x-amz-version-id", hex::encode(new_uuid)) + .header( + "x-amz-copy-source-version-id", + hex::encode(source_version.uuid), + ) + .body(Body::from(xml))?) +} + +pub async fn handle_upload_part_copy( + garage: Arc, + api_key: &Key, + req: &Request, + dest_bucket_id: Uuid, + dest_key: &str, + part_number: u64, + upload_id: &str, +) -> Result, Error> { + let copy_precondition = CopyPreconditionHeaders::parse(req)?; + + let dest_version_uuid = decode_upload_id(upload_id)?; + + let dest_key = dest_key.to_string(); + let (source_object, dest_object) = futures::try_join!( + get_copy_source(&garage, api_key, req), + garage + .object_table + .get(&dest_bucket_id, &dest_key) + .map_err(Error::from), + )?; + let dest_object = dest_object.ok_or(Error::NoSuchKey)?; + + let (source_object_version, source_version_data, source_version_meta) = + extract_source_info(&source_object)?; + + // Check precondition on source, e.g. x-amz-copy-source-if-match + copy_precondition.check(source_object_version, &source_version_meta.etag)?; + + // Check source range is valid + let source_range = match req.headers().get("x-amz-copy-source-range") { + Some(range) => { + let range_str = range.to_str()?; + let mut ranges = http_range::HttpRange::parse(range_str, source_version_meta.size) + .map_err(|e| (e, source_version_meta.size))?; + if ranges.len() != 1 { + return Err(Error::BadRequest( + "Invalid x-amz-copy-source-range header: exactly 1 range must be given".into(), + )); + } else { + ranges.pop().unwrap() + } + } + None => http_range::HttpRange { + start: 0, + length: source_version_meta.size, + }, + }; + + // Check destination version is indeed in uploading state + if !dest_object + .versions() + .iter() + .any(|v| v.uuid == dest_version_uuid && v.is_uploading()) + { + return Err(Error::NoSuchUpload); + } + + // Check source version is not inlined + match source_version_data { + ObjectVersionData::DeleteMarker => unreachable!(), + ObjectVersionData::Inline(_meta, _bytes) => { + // This is only for small files, we don't bother handling this. 
+ // (in AWS UploadPartCopy works for parts at least 5MB which + // is never the case of an inline object) + return Err(Error::BadRequest( + "Source object is too small (minimum part size is 5Mb)".into(), + )); + } + ObjectVersionData::FirstBlock(_meta, _first_block_hash) => (), + }; + + // Fetch source versin with its block list, + // and destination version to check part hasn't yet been uploaded + let (source_version, dest_version) = futures::try_join!( + garage + .version_table + .get(&source_object_version.uuid, &EmptyKey), + garage.version_table.get(&dest_version_uuid, &EmptyKey), + )?; + let source_version = source_version.ok_or(Error::NoSuchKey)?; + + // Check this part number hasn't yet been uploaded + if let Some(dv) = dest_version { + if dv.has_part_number(part_number) { + return Err(Error::BadRequest(format!( + "Part number {} has already been uploaded", + part_number + ))); + } + } + + // We want to reuse blocks from the source version as much as possible. + // However, we still need to get the data from these blocks + // because we need to know it to calculate the MD5sum of the part + // which is used as its ETag. + + // First, calculate what blocks we want to keep, + // and the subrange of the block to take, if the bounds of the + // requested range are in the middle. + let (range_begin, range_end) = (source_range.start, source_range.start + source_range.length); + + let mut blocks_to_copy = vec![]; + let mut current_offset = 0; + for (_bk, block) in source_version.blocks.items().iter() { + let (block_begin, block_end) = (current_offset, current_offset + block.size); + + if block_begin < range_end && block_end > range_begin { + let subrange_begin = if block_begin < range_begin { + Some(range_begin - block_begin) + } else { + None + }; + let subrange_end = if block_end > range_end { + Some(range_end - block_begin) + } else { + None + }; + let range_to_copy = match (subrange_begin, subrange_end) { + (Some(b), Some(e)) => Some(b as usize..e as usize), + (None, Some(e)) => Some(0..e as usize), + (Some(b), None) => Some(b as usize..block.size as usize), + (None, None) => None, + }; + + blocks_to_copy.push((block.hash, range_to_copy)); + } + + current_offset = block_end; + } + + // Now, actually copy the blocks + let mut md5hasher = Md5::new(); + + // First, create a stream that is able to read the source blocks + // and extract the subrange if necessary. + // The second returned value is an Option, that is Some + // if and only if the block returned is a block that already existed + // in the Garage data store (thus we don't need to save it again). + let garage2 = garage.clone(); + let source_blocks = stream::iter(blocks_to_copy) + .flat_map(|(block_hash, range_to_copy)| { + let garage3 = garage2.clone(); + stream::once(async move { + let data = garage3.block_manager.rpc_get_block(&block_hash).await?; + match range_to_copy { + Some(r) => Ok((data[r].to_vec(), None)), + None => Ok((data, Some(block_hash))), + } + }) + }) + .peekable(); + + // The defragmenter is a custom stream (defined below) that concatenates + // consecutive block parts when they are too small. + // It returns a series of (Vec, Option). + // When it is done, it returns an empty vec. + // Same as the previous iterator, the Option is Some(_) if and only if + // it's an existing block of the Garage data store. 
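+ // For example, if the requested range only covers a few bytes of several source
+ // blocks, those small pieces carry no hash and are concatenated into one buffer,
+ // which is then hashed and stored as a brand new block further down.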
+ let mut defragmenter = Defragmenter::new(garage.config.block_size, Box::pin(source_blocks)); + + let mut current_offset = 0; + let mut next_block = defragmenter.next().await?; + + loop { + let (data, existing_block_hash) = next_block; + if data.is_empty() { + break; + } + + md5hasher.update(&data[..]); + + let must_upload = existing_block_hash.is_none(); + let final_hash = existing_block_hash.unwrap_or_else(|| blake2sum(&data[..])); + + let mut version = Version::new(dest_version_uuid, dest_bucket_id, dest_key.clone(), false); + version.blocks.put( + VersionBlockKey { + part_number, + offset: current_offset, + }, + VersionBlock { + hash: final_hash, + size: data.len() as u64, + }, + ); + current_offset += data.len() as u64; + + let block_ref = BlockRef { + block: final_hash, + version: dest_version_uuid, + deleted: false.into(), + }; + + let garage2 = garage.clone(); + let res = futures::try_join!( + // Thing 1: if the block is not exactly a block that existed before, + // we need to insert that data as a new block. + async move { + if must_upload { + garage2.block_manager.rpc_put_block(final_hash, data).await + } else { + Ok(()) + } + }, + // Thing 2: we need to insert the block in the version + garage.version_table.insert(&version), + // Thing 3: we need to add a block reference + garage.block_ref_table.insert(&block_ref), + // Thing 4: we need to prefetch the next block + defragmenter.next(), + )?; + next_block = res.3; + } + + let data_md5sum = md5hasher.finalize(); + let etag = hex::encode(data_md5sum); + + // Put the part's ETag in the Versiontable + let mut version = Version::new(dest_version_uuid, dest_bucket_id, dest_key.clone(), false); + version.parts_etags.put(part_number, etag.clone()); + garage.version_table.insert(&version).await?; + + // LGTM + let resp_xml = s3_xml::to_xml_with_header(&CopyPartResult { + xmlns: (), + etag: s3_xml::Value(format!("\"{}\"", etag)), + last_modified: s3_xml::Value(msec_to_rfc3339(source_object_version.timestamp)), + })?; + + Ok(Response::builder() + .header("Content-Type", "application/xml") + .header( + "x-amz-copy-source-version-id", + hex::encode(source_object_version.uuid), + ) + .body(Body::from(resp_xml))?) +} + +async fn get_copy_source( + garage: &Garage, + api_key: &Key, + req: &Request, +) -> Result { + let copy_source = req.headers().get("x-amz-copy-source").unwrap().to_str()?; + let copy_source = percent_encoding::percent_decode_str(copy_source).decode_utf8()?; + + let (source_bucket, source_key) = parse_bucket_key(©_source, None)?; + let source_bucket_id = resolve_bucket(garage, &source_bucket.to_string(), api_key).await?; + + if !api_key.allow_read(&source_bucket_id) { + return Err(Error::Forbidden(format!( + "Reading from bucket {} not allowed for this key", + source_bucket + ))); + } + + let source_key = source_key.ok_or_bad_request("No source key specified")?; + + let source_object = garage + .object_table + .get(&source_bucket_id, &source_key.to_string()) + .await? 
+ .ok_or(Error::NoSuchKey)?; + + Ok(source_object) +} + +fn extract_source_info( + source_object: &Object, +) -> Result<(&ObjectVersion, &ObjectVersionData, &ObjectVersionMeta), Error> { + let source_version = source_object + .versions() + .iter() + .rev() + .find(|v| v.is_complete()) + .ok_or(Error::NoSuchKey)?; + + let source_version_data = match &source_version.state { + ObjectVersionState::Complete(x) => x, + _ => unreachable!(), + }; + + let source_version_meta = match source_version_data { + ObjectVersionData::DeleteMarker => { + return Err(Error::NoSuchKey); + } + ObjectVersionData::Inline(meta, _bytes) => meta, + ObjectVersionData::FirstBlock(meta, _fbh) => meta, + }; + + Ok((source_version, source_version_data, source_version_meta)) +} + +struct CopyPreconditionHeaders { + copy_source_if_match: Option>, + copy_source_if_modified_since: Option, + copy_source_if_none_match: Option>, + copy_source_if_unmodified_since: Option, +} + +impl CopyPreconditionHeaders { + fn parse(req: &Request) -> Result { + Ok(Self { + copy_source_if_match: req + .headers() + .get("x-amz-copy-source-if-match") + .map(|x| x.to_str()) + .transpose()? + .map(|x| { + x.split(',') + .map(|m| m.trim().trim_matches('"').to_string()) + .collect::>() + }), + copy_source_if_modified_since: req + .headers() + .get("x-amz-copy-source-if-modified-since") + .map(|x| x.to_str()) + .transpose()? + .map(httpdate::parse_http_date) + .transpose() + .ok_or_bad_request("Invalid date in x-amz-copy-source-if-modified-since")?, + copy_source_if_none_match: req + .headers() + .get("x-amz-copy-source-if-none-match") + .map(|x| x.to_str()) + .transpose()? + .map(|x| { + x.split(',') + .map(|m| m.trim().trim_matches('"').to_string()) + .collect::>() + }), + copy_source_if_unmodified_since: req + .headers() + .get("x-amz-copy-source-if-unmodified-since") + .map(|x| x.to_str()) + .transpose()? + .map(httpdate::parse_http_date) + .transpose() + .ok_or_bad_request("Invalid date in x-amz-copy-source-if-unmodified-since")?, + }) + } + + fn check(&self, v: &ObjectVersion, etag: &str) -> Result<(), Error> { + let v_date = UNIX_EPOCH + Duration::from_millis(v.timestamp); + + let ok = match ( + &self.copy_source_if_match, + &self.copy_source_if_unmodified_since, + &self.copy_source_if_none_match, + &self.copy_source_if_modified_since, + ) { + // TODO I'm not sure all of the conditions are evaluated correctly here + + // If we have both if-match and if-unmodified-since, + // basically we don't care about if-unmodified-since, + // because in the spec it says that if if-match evaluates to + // true but if-unmodified-since evaluates to false, + // the copy is still done. 
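+ // A literal "*" in if-match / if-none-match matches any ETag; combinations of
+ // the four headers not listed below are rejected as a bad request.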
+ (Some(im), _, None, None) => im.iter().any(|x| x == etag || x == "*"), + (None, Some(ius), None, None) => v_date <= *ius, + + // If we have both if-none-match and if-modified-since, + // then both of the two conditions must evaluate to true + (None, None, Some(inm), Some(ims)) => { + !inm.iter().any(|x| x == etag || x == "*") && v_date > *ims + } + (None, None, Some(inm), None) => !inm.iter().any(|x| x == etag || x == "*"), + (None, None, None, Some(ims)) => v_date > *ims, + (None, None, None, None) => true, + _ => { + return Err(Error::BadRequest( + "Invalid combination of x-amz-copy-source-if-xxxxx headers".into(), + )) + } + }; + + if ok { + Ok(()) + } else { + Err(Error::PreconditionFailed) + } + } +} + +type BlockStreamItemOk = (Vec, Option); +type BlockStreamItem = Result; + +struct Defragmenter> { + block_size: usize, + block_stream: Pin>>, + buffer: Vec, + hash: Option, +} + +impl> Defragmenter { + fn new(block_size: usize, block_stream: Pin>>) -> Self { + Self { + block_size, + block_stream, + buffer: vec![], + hash: None, + } + } + + async fn next(&mut self) -> BlockStreamItem { + // Fill buffer while we can + while let Some(res) = self.block_stream.as_mut().peek().await { + let (peeked_next_block, _) = match res { + Ok(t) => t, + Err(_) => { + self.block_stream.next().await.unwrap()?; + unreachable!() + } + }; + + if self.buffer.is_empty() { + let (next_block, next_block_hash) = self.block_stream.next().await.unwrap()?; + self.buffer = next_block; + self.hash = next_block_hash; + } else if self.buffer.len() + peeked_next_block.len() > self.block_size { + break; + } else { + let (next_block, _) = self.block_stream.next().await.unwrap()?; + self.buffer.extend(next_block); + self.hash = None; + } + } + + Ok((std::mem::take(&mut self.buffer), self.hash.take())) + } +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct CopyObjectResult { + #[serde(rename = "LastModified")] + pub last_modified: s3_xml::Value, + #[serde(rename = "ETag")] + pub etag: s3_xml::Value, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct CopyPartResult { + #[serde(serialize_with = "xmlns_tag")] + pub xmlns: (), + #[serde(rename = "LastModified")] + pub last_modified: s3_xml::Value, + #[serde(rename = "ETag")] + pub etag: s3_xml::Value, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::s3::xml::to_xml_with_header; + + #[test] + fn copy_object_result() -> Result<(), Error> { + let copy_result = CopyObjectResult { + last_modified: s3_xml::Value(msec_to_rfc3339(0)), + etag: s3_xml::Value("\"9b2cf535f27731c974343645a3985328\"".to_string()), + }; + assert_eq!( + to_xml_with_header(©_result)?, + "\ +\ + 1970-01-01T00:00:00.000Z\ + "9b2cf535f27731c974343645a3985328"\ +\ + " + ); + Ok(()) + } + + #[test] + fn serialize_copy_part_result() -> Result<(), Error> { + let expected_retval = "\ +\ + 2011-04-11T20:34:56.000Z\ + "9b2cf535f27731c974343645a3985328"\ +"; + let v = CopyPartResult { + xmlns: (), + last_modified: s3_xml::Value("2011-04-11T20:34:56.000Z".into()), + etag: s3_xml::Value("\"9b2cf535f27731c974343645a3985328\"".into()), + }; + println!("{}", to_xml_with_header(&v)?); + + assert_eq!(to_xml_with_header(&v)?, expected_retval); + + Ok(()) + } +} diff --git a/src/api/s3/cors.rs b/src/api/s3/cors.rs new file mode 100644 index 00000000..37ea2e43 --- /dev/null +++ b/src/api/s3/cors.rs @@ -0,0 +1,442 @@ +use quick_xml::de::from_reader; +use std::sync::Arc; + +use http::header::{ + ACCESS_CONTROL_ALLOW_HEADERS, ACCESS_CONTROL_ALLOW_METHODS, ACCESS_CONTROL_ALLOW_ORIGIN, + 
ACCESS_CONTROL_EXPOSE_HEADERS, ACCESS_CONTROL_REQUEST_HEADERS, ACCESS_CONTROL_REQUEST_METHOD, +}; +use hyper::{header::HeaderName, Body, Method, Request, Response, StatusCode}; + +use serde::{Deserialize, Serialize}; + +use crate::error::*; +use crate::s3::xml::{to_xml_with_header, xmlns_tag, IntValue, Value}; +use crate::signature::verify_signed_content; + +use garage_model::bucket_table::{Bucket, CorsRule as GarageCorsRule}; +use garage_model::garage::Garage; +use garage_table::*; +use garage_util::data::*; + +pub async fn handle_get_cors(bucket: &Bucket) -> Result, Error> { + let param = bucket + .params() + .ok_or_internal_error("Bucket should not be deleted at this point")?; + + if let Some(cors) = param.cors_config.get() { + let wc = CorsConfiguration { + xmlns: (), + cors_rules: cors + .iter() + .map(CorsRule::from_garage_cors_rule) + .collect::>(), + }; + let xml = to_xml_with_header(&wc)?; + Ok(Response::builder() + .status(StatusCode::OK) + .header(http::header::CONTENT_TYPE, "application/xml") + .body(Body::from(xml))?) + } else { + Ok(Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::empty())?) + } +} + +pub async fn handle_delete_cors( + garage: Arc, + bucket_id: Uuid, +) -> Result, Error> { + let mut bucket = garage + .bucket_table + .get(&EmptyKey, &bucket_id) + .await? + .ok_or(Error::NoSuchBucket)?; + + let param = bucket + .params_mut() + .ok_or_internal_error("Bucket should not be deleted at this point")?; + + param.cors_config.update(None); + garage.bucket_table.insert(&bucket).await?; + + Ok(Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::empty())?) +} + +pub async fn handle_put_cors( + garage: Arc, + bucket_id: Uuid, + req: Request, + content_sha256: Option, +) -> Result, Error> { + let body = hyper::body::to_bytes(req.into_body()).await?; + + if let Some(content_sha256) = content_sha256 { + verify_signed_content(content_sha256, &body[..])?; + } + + let mut bucket = garage + .bucket_table + .get(&EmptyKey, &bucket_id) + .await? + .ok_or(Error::NoSuchBucket)?; + + let param = bucket + .params_mut() + .ok_or_internal_error("Bucket should not be deleted at this point")?; + + let conf: CorsConfiguration = from_reader(&body as &[u8])?; + conf.validate()?; + + param + .cors_config + .update(Some(conf.into_garage_cors_config()?)); + garage.bucket_table.insert(&bucket).await?; + + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::empty())?) +} + +pub async fn handle_options_s3api( + garage: Arc, + req: &Request, + bucket_name: Option, +) -> Result, Error> { + // FIXME: CORS rules of buckets with local aliases are + // not taken into account. + + // If the bucket name is a global bucket name, + // we try to apply the CORS rules of that bucket. + // If a user has a local bucket name that has + // the same name, its CORS rules won't be applied + // and will be shadowed by the rules of the globally + // existing bucket (but this is inevitable because + // OPTIONS calls are not auhtenticated). + if let Some(bn) = bucket_name { + let helper = garage.bucket_helper(); + let bucket_id = helper.resolve_global_bucket_name(&bn).await?; + if let Some(id) = bucket_id { + let bucket = garage + .bucket_table + .get(&EmptyKey, &id) + .await? + .filter(|b| !b.state.is_deleted()) + .ok_or(Error::NoSuchBucket)?; + handle_options_for_bucket(req, &bucket) + } else { + // If there is a bucket name in the request, but that name + // does not correspond to a global alias for a bucket, + // then it's either a non-existing bucket or a local bucket. 
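+ // (a "local bucket" here means one that is only aliased under a specific access key).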
+ // We have no way of knowing, because the request is not + // authenticated and thus we can't resolve local aliases. + // We take the permissive approach of allowing everything, + // because we don't want to prevent web apps that use + // local bucket names from making API calls. + Ok(Response::builder() + .header(ACCESS_CONTROL_ALLOW_ORIGIN, "*") + .header(ACCESS_CONTROL_ALLOW_METHODS, "*") + .status(StatusCode::OK) + .body(Body::empty())?) + } + } else { + // If there is no bucket name in the request, + // we are doing a ListBuckets call, which we want to allow + // for all origins. + Ok(Response::builder() + .header(ACCESS_CONTROL_ALLOW_ORIGIN, "*") + .header(ACCESS_CONTROL_ALLOW_METHODS, "GET") + .status(StatusCode::OK) + .body(Body::empty())?) + } +} + +pub fn handle_options_for_bucket( + req: &Request, + bucket: &Bucket, +) -> Result, Error> { + let origin = req + .headers() + .get("Origin") + .ok_or_bad_request("Missing Origin header")? + .to_str()?; + let request_method = req + .headers() + .get(ACCESS_CONTROL_REQUEST_METHOD) + .ok_or_bad_request("Missing Access-Control-Request-Method header")? + .to_str()?; + let request_headers = match req.headers().get(ACCESS_CONTROL_REQUEST_HEADERS) { + Some(h) => h.to_str()?.split(',').map(|h| h.trim()).collect::>(), + None => vec![], + }; + + if let Some(cors_config) = bucket.params().unwrap().cors_config.get() { + let matching_rule = cors_config + .iter() + .find(|rule| cors_rule_matches(rule, origin, request_method, request_headers.iter())); + if let Some(rule) = matching_rule { + let mut resp = Response::builder() + .status(StatusCode::OK) + .body(Body::empty())?; + add_cors_headers(&mut resp, rule).ok_or_internal_error("Invalid CORS configuration")?; + return Ok(resp); + } + } + + Err(Error::Forbidden("This CORS request is not allowed.".into())) +} + +pub fn find_matching_cors_rule<'a>( + bucket: &'a Bucket, + req: &Request, +) -> Result, Error> { + if let Some(cors_config) = bucket.params().unwrap().cors_config.get() { + if let Some(origin) = req.headers().get("Origin") { + let origin = origin.to_str()?; + let request_headers = match req.headers().get(ACCESS_CONTROL_REQUEST_HEADERS) { + Some(h) => h.to_str()?.split(',').map(|h| h.trim()).collect::>(), + None => vec![], + }; + return Ok(cors_config.iter().find(|rule| { + cors_rule_matches(rule, origin, req.method().as_ref(), request_headers.iter()) + })); + } + } + Ok(None) +} + +fn cors_rule_matches<'a, HI, S>( + rule: &GarageCorsRule, + origin: &'a str, + method: &'a str, + mut request_headers: HI, +) -> bool +where + HI: Iterator, + S: AsRef, +{ + rule.allow_origins.iter().any(|x| x == "*" || x == origin) + && rule.allow_methods.iter().any(|x| x == "*" || x == method) + && request_headers.all(|h| { + rule.allow_headers + .iter() + .any(|x| x == "*" || x == h.as_ref()) + }) +} + +pub fn add_cors_headers( + resp: &mut Response, + rule: &GarageCorsRule, +) -> Result<(), http::header::InvalidHeaderValue> { + let h = resp.headers_mut(); + h.insert( + ACCESS_CONTROL_ALLOW_ORIGIN, + rule.allow_origins.join(", ").parse()?, + ); + h.insert( + ACCESS_CONTROL_ALLOW_METHODS, + rule.allow_methods.join(", ").parse()?, + ); + h.insert( + ACCESS_CONTROL_ALLOW_HEADERS, + rule.allow_headers.join(", ").parse()?, + ); + h.insert( + ACCESS_CONTROL_EXPOSE_HEADERS, + rule.expose_headers.join(", ").parse()?, + ); + Ok(()) +} + +// ---- SERIALIZATION AND DESERIALIZATION TO/FROM S3 XML ---- + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] +#[serde(rename = "CORSConfiguration")] +pub 
struct CorsConfiguration { + #[serde(serialize_with = "xmlns_tag", skip_deserializing)] + pub xmlns: (), + #[serde(rename = "CORSRule")] + pub cors_rules: Vec, +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] +pub struct CorsRule { + #[serde(rename = "ID")] + pub id: Option, + #[serde(rename = "MaxAgeSeconds")] + pub max_age_seconds: Option, + #[serde(rename = "AllowedOrigin")] + pub allowed_origins: Vec, + #[serde(rename = "AllowedMethod")] + pub allowed_methods: Vec, + #[serde(rename = "AllowedHeader", default)] + pub allowed_headers: Vec, + #[serde(rename = "ExposeHeader", default)] + pub expose_headers: Vec, +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] +pub struct AllowedMethod { + #[serde(rename = "AllowedMethod")] + pub allowed_method: Value, +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] +pub struct AllowedHeader { + #[serde(rename = "AllowedHeader")] + pub allowed_header: Value, +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] +pub struct ExposeHeader { + #[serde(rename = "ExposeHeader")] + pub expose_header: Value, +} + +impl CorsConfiguration { + pub fn validate(&self) -> Result<(), Error> { + for r in self.cors_rules.iter() { + r.validate()?; + } + Ok(()) + } + + pub fn into_garage_cors_config(self) -> Result, Error> { + Ok(self + .cors_rules + .iter() + .map(CorsRule::to_garage_cors_rule) + .collect()) + } +} + +impl CorsRule { + pub fn validate(&self) -> Result<(), Error> { + for method in self.allowed_methods.iter() { + method + .0 + .parse::() + .ok_or_bad_request("Invalid CORSRule method")?; + } + for header in self + .allowed_headers + .iter() + .chain(self.expose_headers.iter()) + { + header + .0 + .parse::() + .ok_or_bad_request("Invalid HTTP header name")?; + } + Ok(()) + } + + pub fn to_garage_cors_rule(&self) -> GarageCorsRule { + let convert_vec = + |vval: &[Value]| vval.iter().map(|x| x.0.to_owned()).collect::>(); + GarageCorsRule { + id: self.id.as_ref().map(|x| x.0.to_owned()), + max_age_seconds: self.max_age_seconds.as_ref().map(|x| x.0 as u64), + allow_origins: convert_vec(&self.allowed_origins), + allow_methods: convert_vec(&self.allowed_methods), + allow_headers: convert_vec(&self.allowed_headers), + expose_headers: convert_vec(&self.expose_headers), + } + } + + pub fn from_garage_cors_rule(rule: &GarageCorsRule) -> Self { + let convert_vec = |vval: &[String]| { + vval.iter() + .map(|x| Value(x.clone())) + .collect::>() + }; + Self { + id: rule.id.as_ref().map(|x| Value(x.clone())), + max_age_seconds: rule.max_age_seconds.map(|x| IntValue(x as i64)), + allowed_origins: convert_vec(&rule.allow_origins), + allowed_methods: convert_vec(&rule.allow_methods), + allowed_headers: convert_vec(&rule.allow_headers), + expose_headers: convert_vec(&rule.expose_headers), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use quick_xml::de::from_str; + + #[test] + fn test_deserialize() -> Result<(), Error> { + let message = r#" + + + http://www.example.com + + PUT + POST + DELETE + + * + + + * + GET + + + qsdfjklm + 12345 + https://perdu.com + + GET + DELETE + * + * + +"#; + let conf: CorsConfiguration = from_str(message).unwrap(); + let ref_value = CorsConfiguration { + xmlns: (), + cors_rules: vec![ + CorsRule { + id: None, + max_age_seconds: None, + allowed_origins: vec!["http://www.example.com".into()], + allowed_methods: vec!["PUT".into(), "POST".into(), "DELETE".into()], + allowed_headers: vec!["*".into()], + expose_headers: vec![], + 
}, + CorsRule { + id: None, + max_age_seconds: None, + allowed_origins: vec!["*".into()], + allowed_methods: vec!["GET".into()], + allowed_headers: vec![], + expose_headers: vec![], + }, + CorsRule { + id: Some("qsdfjklm".into()), + max_age_seconds: Some(IntValue(12345)), + allowed_origins: vec!["https://perdu.com".into()], + allowed_methods: vec!["GET".into(), "DELETE".into()], + allowed_headers: vec!["*".into()], + expose_headers: vec!["*".into()], + }, + ], + }; + assert_eq! { + ref_value, + conf + }; + + let message2 = to_xml_with_header(&ref_value)?; + + let cleanup = |c: &str| c.replace(char::is_whitespace, ""); + assert_eq!(cleanup(message), cleanup(&message2)); + + Ok(()) + } +} diff --git a/src/api/s3/delete.rs b/src/api/s3/delete.rs new file mode 100644 index 00000000..1e3f1249 --- /dev/null +++ b/src/api/s3/delete.rs @@ -0,0 +1,170 @@ +use std::sync::Arc; + +use hyper::{Body, Request, Response, StatusCode}; + +use garage_util::data::*; +use garage_util::time::*; + +use garage_model::garage::Garage; +use garage_model::s3::object_table::*; + +use crate::error::*; +use crate::s3::xml as s3_xml; +use crate::signature::verify_signed_content; + +async fn handle_delete_internal( + garage: &Garage, + bucket_id: Uuid, + key: &str, +) -> Result<(Uuid, Uuid), Error> { + let object = garage + .object_table + .get(&bucket_id, &key.to_string()) + .await? + .ok_or(Error::NoSuchKey)?; // No need to delete + + let interesting_versions = object.versions().iter().filter(|v| { + !matches!( + v.state, + ObjectVersionState::Aborted + | ObjectVersionState::Complete(ObjectVersionData::DeleteMarker) + ) + }); + + let mut version_to_delete = None; + let mut timestamp = now_msec(); + for v in interesting_versions { + if v.timestamp + 1 > timestamp || version_to_delete.is_none() { + version_to_delete = Some(v.uuid); + } + timestamp = std::cmp::max(timestamp, v.timestamp + 1); + } + + let deleted_version = version_to_delete.ok_or(Error::NoSuchKey)?; + + let version_uuid = gen_uuid(); + + let object = Object::new( + bucket_id, + key.into(), + vec![ObjectVersion { + uuid: version_uuid, + timestamp, + state: ObjectVersionState::Complete(ObjectVersionData::DeleteMarker), + }], + ); + + garage.object_table.insert(&object).await?; + + Ok((deleted_version, version_uuid)) +} + +pub async fn handle_delete( + garage: Arc, + bucket_id: Uuid, + key: &str, +) -> Result, Error> { + let (_deleted_version, delete_marker_version) = + handle_delete_internal(&garage, bucket_id, key).await?; + + Ok(Response::builder() + .header("x-amz-version-id", hex::encode(delete_marker_version)) + .status(StatusCode::NO_CONTENT) + .body(Body::from(vec![])) + .unwrap()) +} + +pub async fn handle_delete_objects( + garage: Arc, + bucket_id: Uuid, + req: Request, + content_sha256: Option, +) -> Result, Error> { + let body = hyper::body::to_bytes(req.into_body()).await?; + + if let Some(content_sha256) = content_sha256 { + verify_signed_content(content_sha256, &body[..])?; + } + + let cmd_xml = roxmltree::Document::parse(std::str::from_utf8(&body)?)?; + let cmd = parse_delete_objects_xml(&cmd_xml).ok_or_bad_request("Invalid delete XML query")?; + + let mut ret_deleted = Vec::new(); + let mut ret_errors = Vec::new(); + + for obj in cmd.objects.iter() { + match handle_delete_internal(&garage, bucket_id, &obj.key).await { + Ok((deleted_version, delete_marker_version)) => { + if cmd.quiet { + continue; + } + ret_deleted.push(s3_xml::Deleted { + key: s3_xml::Value(obj.key.clone()), + version_id: s3_xml::Value(hex::encode(deleted_version)), + 
delete_marker_version_id: s3_xml::Value(hex::encode(delete_marker_version)), + }); + } + Err(e) => { + ret_errors.push(s3_xml::DeleteError { + code: s3_xml::Value(e.aws_code().to_string()), + key: Some(s3_xml::Value(obj.key.clone())), + message: s3_xml::Value(format!("{}", e)), + version_id: None, + }); + } + } + } + + let xml = s3_xml::to_xml_with_header(&s3_xml::DeleteResult { + xmlns: (), + deleted: ret_deleted, + errors: ret_errors, + })?; + + Ok(Response::builder() + .header("Content-Type", "application/xml") + .body(Body::from(xml))?) +} + +struct DeleteRequest { + quiet: bool, + objects: Vec, +} + +struct DeleteObject { + key: String, +} + +fn parse_delete_objects_xml(xml: &roxmltree::Document) -> Option { + let mut ret = DeleteRequest { + quiet: false, + objects: vec![], + }; + + let root = xml.root(); + let delete = root.first_child()?; + + if !delete.has_tag_name("Delete") { + return None; + } + + for item in delete.children() { + if item.has_tag_name("Object") { + let key = item.children().find(|e| e.has_tag_name("Key"))?; + let key_str = key.text()?; + ret.objects.push(DeleteObject { + key: key_str.to_string(), + }); + } else if item.has_tag_name("Quiet") { + if item.text()? == "true" { + ret.quiet = true; + } else { + ret.quiet = false; + } + } else { + return None; + } + } + + Some(ret) +} diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs new file mode 100644 index 00000000..3edf22a6 --- /dev/null +++ b/src/api/s3/get.rs @@ -0,0 +1,461 @@ +//! Function related to GET and HEAD requests +use std::sync::Arc; +use std::time::{Duration, UNIX_EPOCH}; + +use futures::stream::*; +use http::header::{ + ACCEPT_RANGES, CONTENT_LENGTH, CONTENT_RANGE, CONTENT_TYPE, ETAG, IF_MODIFIED_SINCE, + IF_NONE_MATCH, LAST_MODIFIED, RANGE, +}; +use hyper::body::Bytes; +use hyper::{Body, Request, Response, StatusCode}; + +use garage_table::EmptyKey; +use garage_util::data::*; + +use garage_model::garage::Garage; +use garage_model::s3::object_table::*; +use garage_model::s3::version_table::*; + +use crate::error::*; + +const X_AMZ_MP_PARTS_COUNT: &str = "x-amz-mp-parts-count"; + +fn object_headers( + version: &ObjectVersion, + version_meta: &ObjectVersionMeta, +) -> http::response::Builder { + debug!("Version meta: {:?}", version_meta); + + let date = UNIX_EPOCH + Duration::from_millis(version.timestamp); + let date_str = httpdate::fmt_http_date(date); + + let mut resp = Response::builder() + .header(CONTENT_TYPE, version_meta.headers.content_type.to_string()) + .header(LAST_MODIFIED, date_str) + .header(ACCEPT_RANGES, "bytes".to_string()); + + if !version_meta.etag.is_empty() { + resp = resp.header(ETAG, format!("\"{}\"", version_meta.etag)); + } + + for (k, v) in version_meta.headers.other.iter() { + resp = resp.header(k, v.to_string()); + } + + resp +} + +fn try_answer_cached( + version: &ObjectVersion, + version_meta: &ObjectVersionMeta, + req: &Request, +) -> Option> { + // It is possible, and is even usually the case, [that both If-None-Match and + // If-Modified-Since] are present in a request. In this situation If-None-Match takes + // precedence and If-Modified-Since is ignored (as per 6.Precedence from rfc7232). 
The rational + // being that etag based matching is more accurate, it has no issue with sub-second precision + // for instance (in case of very fast updates) + let cached = if let Some(none_match) = req.headers().get(IF_NONE_MATCH) { + let none_match = none_match.to_str().ok()?; + let expected = format!("\"{}\"", version_meta.etag); + let found = none_match + .split(',') + .map(str::trim) + .any(|etag| etag == expected || etag == "\"*\""); + found + } else if let Some(modified_since) = req.headers().get(IF_MODIFIED_SINCE) { + let modified_since = modified_since.to_str().ok()?; + let client_date = httpdate::parse_http_date(modified_since).ok()?; + let server_date = UNIX_EPOCH + Duration::from_millis(version.timestamp); + client_date >= server_date + } else { + false + }; + + if cached { + Some( + Response::builder() + .status(StatusCode::NOT_MODIFIED) + .body(Body::empty()) + .unwrap(), + ) + } else { + None + } +} + +/// Handle HEAD request +pub async fn handle_head( + garage: Arc, + req: &Request, + bucket_id: Uuid, + key: &str, + part_number: Option, +) -> Result, Error> { + let object = garage + .object_table + .get(&bucket_id, &key.to_string()) + .await? + .ok_or(Error::NoSuchKey)?; + + let object_version = object + .versions() + .iter() + .rev() + .find(|v| v.is_data()) + .ok_or(Error::NoSuchKey)?; + + let version_data = match &object_version.state { + ObjectVersionState::Complete(c) => c, + _ => unreachable!(), + }; + + let version_meta = match version_data { + ObjectVersionData::Inline(meta, _) => meta, + ObjectVersionData::FirstBlock(meta, _) => meta, + _ => unreachable!(), + }; + + if let Some(cached) = try_answer_cached(object_version, version_meta, req) { + return Ok(cached); + } + + if let Some(pn) = part_number { + match version_data { + ObjectVersionData::Inline(_, bytes) => { + if pn != 1 { + return Err(Error::InvalidPart); + } + Ok(object_headers(object_version, version_meta) + .header(CONTENT_LENGTH, format!("{}", bytes.len())) + .header( + CONTENT_RANGE, + format!("bytes 0-{}/{}", bytes.len() - 1, bytes.len()), + ) + .header(X_AMZ_MP_PARTS_COUNT, "1") + .status(StatusCode::PARTIAL_CONTENT) + .body(Body::empty())?) + } + ObjectVersionData::FirstBlock(_, _) => { + let version = garage + .version_table + .get(&object_version.uuid, &EmptyKey) + .await? + .ok_or(Error::NoSuchKey)?; + + let (part_offset, part_end) = + calculate_part_bounds(&version, pn).ok_or(Error::InvalidPart)?; + let n_parts = version.parts_etags.items().len(); + + Ok(object_headers(object_version, version_meta) + .header(CONTENT_LENGTH, format!("{}", part_end - part_offset)) + .header( + CONTENT_RANGE, + format!( + "bytes {}-{}/{}", + part_offset, + part_end - 1, + version_meta.size + ), + ) + .header(X_AMZ_MP_PARTS_COUNT, format!("{}", n_parts)) + .status(StatusCode::PARTIAL_CONTENT) + .body(Body::empty())?) + } + _ => unreachable!(), + } + } else { + Ok(object_headers(object_version, version_meta) + .header(CONTENT_LENGTH, format!("{}", version_meta.size)) + .status(StatusCode::OK) + .body(Body::empty())?) + } +} + +/// Handle GET request +pub async fn handle_get( + garage: Arc, + req: &Request, + bucket_id: Uuid, + key: &str, + part_number: Option, +) -> Result, Error> { + let object = garage + .object_table + .get(&bucket_id, &key.to_string()) + .await? 
+ .ok_or(Error::NoSuchKey)?; + + let last_v = object + .versions() + .iter() + .rev() + .find(|v| v.is_complete()) + .ok_or(Error::NoSuchKey)?; + + let last_v_data = match &last_v.state { + ObjectVersionState::Complete(x) => x, + _ => unreachable!(), + }; + let last_v_meta = match last_v_data { + ObjectVersionData::DeleteMarker => return Err(Error::NoSuchKey), + ObjectVersionData::Inline(meta, _) => meta, + ObjectVersionData::FirstBlock(meta, _) => meta, + }; + + if let Some(cached) = try_answer_cached(last_v, last_v_meta, req) { + return Ok(cached); + } + + match (part_number, parse_range_header(req, last_v_meta.size)?) { + (Some(_), Some(_)) => { + return Err(Error::BadRequest( + "Cannot specify both partNumber and Range header".into(), + )); + } + (Some(pn), None) => { + return handle_get_part(garage, last_v, last_v_data, last_v_meta, pn).await; + } + (None, Some(range)) => { + return handle_get_range( + garage, + last_v, + last_v_data, + last_v_meta, + range.start, + range.start + range.length, + ) + .await; + } + (None, None) => (), + } + + let resp_builder = object_headers(last_v, last_v_meta) + .header(CONTENT_LENGTH, format!("{}", last_v_meta.size)) + .status(StatusCode::OK); + + match &last_v_data { + ObjectVersionData::DeleteMarker => unreachable!(), + ObjectVersionData::Inline(_, bytes) => { + let body: Body = Body::from(bytes.to_vec()); + Ok(resp_builder.body(body)?) + } + ObjectVersionData::FirstBlock(_, first_block_hash) => { + let read_first_block = garage.block_manager.rpc_get_block(first_block_hash); + let get_next_blocks = garage.version_table.get(&last_v.uuid, &EmptyKey); + + let (first_block, version) = futures::try_join!(read_first_block, get_next_blocks)?; + let version = version.ok_or(Error::NoSuchKey)?; + + let mut blocks = version + .blocks + .items() + .iter() + .map(|(_, vb)| (vb.hash, None)) + .collect::>(); + blocks[0].1 = Some(first_block); + + let body_stream = futures::stream::iter(blocks) + .map(move |(hash, data_opt)| { + let garage = garage.clone(); + async move { + if let Some(data) = data_opt { + Ok(Bytes::from(data)) + } else { + garage + .block_manager + .rpc_get_block(&hash) + .await + .map(Bytes::from) + } + } + }) + .buffered(2); + + let body = hyper::body::Body::wrap_stream(body_stream); + Ok(resp_builder.body(body)?) + } + } +} + +async fn handle_get_range( + garage: Arc, + version: &ObjectVersion, + version_data: &ObjectVersionData, + version_meta: &ObjectVersionMeta, + begin: u64, + end: u64, +) -> Result, Error> { + let resp_builder = object_headers(version, version_meta) + .header(CONTENT_LENGTH, format!("{}", end - begin)) + .header( + CONTENT_RANGE, + format!("bytes {}-{}/{}", begin, end - 1, version_meta.size), + ) + .status(StatusCode::PARTIAL_CONTENT); + + match &version_data { + ObjectVersionData::DeleteMarker => unreachable!(), + ObjectVersionData::Inline(_meta, bytes) => { + if end as usize <= bytes.len() { + let body: Body = Body::from(bytes[begin as usize..end as usize].to_vec()); + Ok(resp_builder.body(body)?) + } else { + None.ok_or_internal_error( + "Requested range not present in inline bytes when it should have been", + ) + } + } + ObjectVersionData::FirstBlock(_meta, _first_block_hash) => { + let version = garage + .version_table + .get(&version.uuid, &EmptyKey) + .await? + .ok_or(Error::NoSuchKey)?; + + let body = body_from_blocks_range(garage, version.blocks.items(), begin, end); + Ok(resp_builder.body(body)?) 
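
The streaming pattern used for FirstBlock objects generalizes well: build one future per block, then let buffered(2) keep a bounded number of fetches in flight while still yielding chunks in order, overlapping network latency without unbounded prefetch. A standalone sketch with the futures crate, where fetch_block is a stand-in for the real block-manager RPC:

    use futures::executor::block_on;
    use futures::stream::{self, StreamExt};

    // Stand-in for garage.block_manager.rpc_get_block(&hash).
    async fn fetch_block(id: u32) -> std::io::Result<Vec<u8>> {
        Ok(vec![id as u8; 4]) // placeholder payload
    }

    fn main() -> std::io::Result<()> {
        block_on(async {
            let block_ids = vec![1u32, 2, 3, 4];
            let mut body = stream::iter(block_ids)
                .map(|id| fetch_block(id)) // one future per block, not yet polled
                .buffered(2);              // at most two fetches in flight, order preserved

            while let Some(chunk) = body.next().await {
                println!("got {} bytes", chunk?.len());
            }
            Ok(())
        })
    }
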
+ } + } +} + +async fn handle_get_part( + garage: Arc, + object_version: &ObjectVersion, + version_data: &ObjectVersionData, + version_meta: &ObjectVersionMeta, + part_number: u64, +) -> Result, Error> { + let resp_builder = + object_headers(object_version, version_meta).status(StatusCode::PARTIAL_CONTENT); + + match version_data { + ObjectVersionData::Inline(_, bytes) => { + if part_number != 1 { + return Err(Error::InvalidPart); + } + Ok(resp_builder + .header(CONTENT_LENGTH, format!("{}", bytes.len())) + .header( + CONTENT_RANGE, + format!("bytes {}-{}/{}", 0, bytes.len() - 1, bytes.len()), + ) + .header(X_AMZ_MP_PARTS_COUNT, "1") + .body(Body::from(bytes.to_vec()))?) + } + ObjectVersionData::FirstBlock(_, _) => { + let version = garage + .version_table + .get(&object_version.uuid, &EmptyKey) + .await? + .ok_or(Error::NoSuchKey)?; + + let (begin, end) = + calculate_part_bounds(&version, part_number).ok_or(Error::InvalidPart)?; + let n_parts = version.parts_etags.items().len(); + + let body = body_from_blocks_range(garage, version.blocks.items(), begin, end); + + Ok(resp_builder + .header(CONTENT_LENGTH, format!("{}", end - begin)) + .header( + CONTENT_RANGE, + format!("bytes {}-{}/{}", begin, end - 1, version_meta.size), + ) + .header(X_AMZ_MP_PARTS_COUNT, format!("{}", n_parts)) + .body(body)?) + } + _ => unreachable!(), + } +} + +fn parse_range_header( + req: &Request, + total_size: u64, +) -> Result, Error> { + let range = match req.headers().get(RANGE) { + Some(range) => { + let range_str = range.to_str()?; + let mut ranges = + http_range::HttpRange::parse(range_str, total_size).map_err(|e| (e, total_size))?; + if ranges.len() > 1 { + // garage does not support multi-range requests yet, so we respond with the entire + // object when multiple ranges are requested + None + } else { + ranges.pop() + } + } + None => None, + }; + Ok(range) +} + +fn calculate_part_bounds(v: &Version, part_number: u64) -> Option<(u64, u64)> { + let mut offset = 0; + for (i, (bk, bv)) in v.blocks.items().iter().enumerate() { + if bk.part_number == part_number { + let size: u64 = v.blocks.items()[i..] 
+ .iter() + .take_while(|(k, _)| k.part_number == part_number) + .map(|(_, v)| v.size) + .sum(); + return Some((offset, offset + size)); + } + offset += bv.size; + } + None +} + +fn body_from_blocks_range( + garage: Arc, + all_blocks: &[(VersionBlockKey, VersionBlock)], + begin: u64, + end: u64, +) -> Body { + // We will store here the list of blocks that have an intersection with the requested + // range, as well as their "true offset", which is their actual offset in the complete + // file (whereas block.offset designates the offset of the block WITHIN THE PART + // block.part_number, which is not the same in the case of a multipart upload) + let mut blocks: Vec<(VersionBlock, u64)> = Vec::with_capacity(std::cmp::min( + all_blocks.len(), + 4 + ((end - begin) / std::cmp::max(all_blocks[0].1.size as u64, 1024)) as usize, + )); + let mut true_offset = 0; + for (_, b) in all_blocks.iter() { + if true_offset >= end { + break; + } + // Keep only blocks that have an intersection with the requested range + if true_offset < end && true_offset + b.size > begin { + blocks.push((*b, true_offset)); + } + true_offset += b.size; + } + + let body_stream = futures::stream::iter(blocks) + .map(move |(block, true_offset)| { + let garage = garage.clone(); + async move { + let data = garage.block_manager.rpc_get_block(&block.hash).await?; + let data = Bytes::from(data); + let start_in_block = if true_offset > begin { + 0 + } else { + begin - true_offset + }; + let end_in_block = if true_offset + block.size < end { + block.size + } else { + end - true_offset + }; + Result::::Ok( + data.slice(start_in_block as usize..end_in_block as usize), + ) + } + }) + .buffered(2); + + hyper::body::Body::wrap_stream(body_stream) +} diff --git a/src/api/s3/list.rs b/src/api/s3/list.rs new file mode 100644 index 00000000..e2848c57 --- /dev/null +++ b/src/api/s3/list.rs @@ -0,0 +1,1337 @@ +use std::cmp::Ordering; +use std::collections::{BTreeMap, BTreeSet}; +use std::iter::{Iterator, Peekable}; +use std::sync::Arc; + +use hyper::{Body, Response}; + +use garage_util::data::*; +use garage_util::error::Error as GarageError; +use garage_util::time::*; + +use garage_model::garage::Garage; +use garage_model::s3::object_table::*; +use garage_model::s3::version_table::Version; + +use garage_table::{EmptyKey, EnumerationOrder}; + +use crate::encoding::*; +use crate::error::*; +use crate::helpers::key_after_prefix; +use crate::s3::put as s3_put; +use crate::s3::xml as s3_xml; + +const DUMMY_NAME: &str = "Dummy Key"; +const DUMMY_KEY: &str = "GKDummyKey"; + +#[derive(Debug)] +pub struct ListQueryCommon { + pub bucket_name: String, + pub bucket_id: Uuid, + pub delimiter: Option, + pub page_size: usize, + pub prefix: String, + pub urlencode_resp: bool, +} + +#[derive(Debug)] +pub struct ListObjectsQuery { + pub is_v2: bool, + pub marker: Option, + pub continuation_token: Option, + pub start_after: Option, + pub common: ListQueryCommon, +} + +#[derive(Debug)] +pub struct ListMultipartUploadsQuery { + pub key_marker: Option, + pub upload_id_marker: Option, + pub common: ListQueryCommon, +} + +#[derive(Debug)] +pub struct ListPartsQuery { + pub bucket_name: String, + pub bucket_id: Uuid, + pub key: String, + pub upload_id: String, + pub part_number_marker: Option, + pub max_parts: u64, +} + +pub async fn handle_list( + garage: Arc, + query: &ListObjectsQuery, +) -> Result, Error> { + let io = |bucket, key, count| { + let t = &garage.object_table; + async move { + t.get_range( + &bucket, + key, + Some(ObjectFilter::IsData), + count, + 
EnumerationOrder::Forward, + ) + .await + } + }; + + debug!("ListObjects {:?}", query); + let mut acc = query.build_accumulator(); + let pagination = fetch_list_entries(&query.common, query.begin()?, &mut acc, &io).await?; + + let result = s3_xml::ListBucketResult { + xmlns: (), + // Sending back request information + name: s3_xml::Value(query.common.bucket_name.to_string()), + prefix: uriencode_maybe(&query.common.prefix, query.common.urlencode_resp), + max_keys: s3_xml::IntValue(query.common.page_size as i64), + delimiter: query + .common + .delimiter + .as_ref() + .map(|x| uriencode_maybe(x, query.common.urlencode_resp)), + encoding_type: match query.common.urlencode_resp { + true => Some(s3_xml::Value("url".to_string())), + false => None, + }, + marker: match (!query.is_v2, &query.marker) { + (true, Some(k)) => Some(uriencode_maybe(k, query.common.urlencode_resp)), + _ => None, + }, + start_after: match (query.is_v2, &query.start_after) { + (true, Some(sa)) => Some(uriencode_maybe(sa, query.common.urlencode_resp)), + _ => None, + }, + continuation_token: match (query.is_v2, &query.continuation_token) { + (true, Some(ct)) => Some(s3_xml::Value(ct.to_string())), + _ => None, + }, + + // Pagination + is_truncated: s3_xml::Value(format!("{}", pagination.is_some())), + key_count: Some(s3_xml::IntValue( + acc.keys.len() as i64 + acc.common_prefixes.len() as i64, + )), + next_marker: match (!query.is_v2, &pagination) { + (true, Some(RangeBegin::AfterKey { key: k })) + | ( + true, + Some(RangeBegin::IncludingKey { + fallback_key: Some(k), + .. + }), + ) => Some(uriencode_maybe(k, query.common.urlencode_resp)), + _ => None, + }, + next_continuation_token: match (query.is_v2, &pagination) { + (true, Some(RangeBegin::AfterKey { key })) => Some(s3_xml::Value(format!( + "]{}", + base64::encode(key.as_bytes()) + ))), + (true, Some(RangeBegin::IncludingKey { key, .. })) => Some(s3_xml::Value(format!( + "[{}", + base64::encode(key.as_bytes()) + ))), + _ => None, + }, + + // Body + contents: acc + .keys + .iter() + .map(|(key, info)| s3_xml::ListBucketItem { + key: uriencode_maybe(key, query.common.urlencode_resp), + last_modified: s3_xml::Value(msec_to_rfc3339(info.last_modified)), + size: s3_xml::IntValue(info.size as i64), + etag: s3_xml::Value(format!("\"{}\"", info.etag)), + storage_class: s3_xml::Value("STANDARD".to_string()), + }) + .collect(), + common_prefixes: acc + .common_prefixes + .iter() + .map(|pfx| s3_xml::CommonPrefix { + prefix: uriencode_maybe(pfx, query.common.urlencode_resp), + }) + .collect(), + }; + + let xml = s3_xml::to_xml_with_header(&result)?; + Ok(Response::builder() + .header("Content-Type", "application/xml") + .body(Body::from(xml.into_bytes()))?) 
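
The next_continuation_token emitted above is deliberately opaque to clients: a one-character marker ('[' meaning resume including the key, ']' meaning resume after it) followed by the base64-encoded key, which the begin() parser further down decodes again. A sketch of that round trip, using the base64 crate's encode/decode free functions as the surrounding code does; encode_token and decode_token are names invented for the sketch:

    // Build an opaque continuation token from a resume position.
    fn encode_token(include: bool, key: &str) -> String {
        format!(
            "{}{}",
            if include { "[" } else { "]" },
            base64::encode(key.as_bytes())
        )
    }

    // Recover (include?, key) from a token; None means the token is malformed.
    fn decode_token(token: &str) -> Option<(bool, String)> {
        let include = match token.chars().next()? {
            '[' => true,
            ']' => false,
            _ => return None,
        };
        let key = String::from_utf8(base64::decode(token[1..].as_bytes()).ok()?).ok()?;
        Some((include, key))
    }

    fn main() {
        let t = encode_token(false, "photos/2021/img.jpg");
        assert_eq!(decode_token(&t), Some((false, "photos/2021/img.jpg".to_string())));
    }
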
+} + +pub async fn handle_list_multipart_upload( + garage: Arc, + query: &ListMultipartUploadsQuery, +) -> Result, Error> { + let io = |bucket, key, count| { + let t = &garage.object_table; + async move { + t.get_range( + &bucket, + key, + Some(ObjectFilter::IsUploading), + count, + EnumerationOrder::Forward, + ) + .await + } + }; + + debug!("ListMultipartUploads {:?}", query); + let mut acc = query.build_accumulator(); + let pagination = fetch_list_entries(&query.common, query.begin()?, &mut acc, &io).await?; + + let result = s3_xml::ListMultipartUploadsResult { + xmlns: (), + + // Sending back some information about the request + bucket: s3_xml::Value(query.common.bucket_name.to_string()), + prefix: uriencode_maybe(&query.common.prefix, query.common.urlencode_resp), + delimiter: query + .common + .delimiter + .as_ref() + .map(|d| uriencode_maybe(d, query.common.urlencode_resp)), + max_uploads: s3_xml::IntValue(query.common.page_size as i64), + key_marker: query + .key_marker + .as_ref() + .map(|m| uriencode_maybe(m, query.common.urlencode_resp)), + upload_id_marker: query + .upload_id_marker + .as_ref() + .map(|m| s3_xml::Value(m.to_string())), + encoding_type: match query.common.urlencode_resp { + true => Some(s3_xml::Value("url".to_string())), + false => None, + }, + + // Handling pagination + is_truncated: s3_xml::Value(format!("{}", pagination.is_some())), + next_key_marker: match &pagination { + None => None, + Some(RangeBegin::AfterKey { key }) + | Some(RangeBegin::AfterUpload { key, .. }) + | Some(RangeBegin::IncludingKey { key, .. }) => { + Some(uriencode_maybe(key, query.common.urlencode_resp)) + } + }, + next_upload_id_marker: match pagination { + Some(RangeBegin::AfterUpload { upload, .. }) => { + Some(s3_xml::Value(hex::encode(upload))) + } + Some(RangeBegin::IncludingKey { .. }) => Some(s3_xml::Value("include".to_string())), + _ => None, + }, + + // Result body + upload: acc + .keys + .iter() + .map(|(uuid, info)| s3_xml::ListMultipartItem { + initiated: s3_xml::Value(msec_to_rfc3339(info.timestamp)), + key: uriencode_maybe(&info.key, query.common.urlencode_resp), + upload_id: s3_xml::Value(hex::encode(uuid)), + storage_class: s3_xml::Value("STANDARD".to_string()), + initiator: s3_xml::Initiator { + display_name: s3_xml::Value(DUMMY_NAME.to_string()), + id: s3_xml::Value(DUMMY_KEY.to_string()), + }, + owner: s3_xml::Owner { + display_name: s3_xml::Value(DUMMY_NAME.to_string()), + id: s3_xml::Value(DUMMY_KEY.to_string()), + }, + }) + .collect(), + common_prefixes: acc + .common_prefixes + .iter() + .map(|c| s3_xml::CommonPrefix { + prefix: s3_xml::Value(c.to_string()), + }) + .collect(), + }; + + let xml = s3_xml::to_xml_with_header(&result)?; + + Ok(Response::builder() + .header("Content-Type", "application/xml") + .body(Body::from(xml.into_bytes()))?) 
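
From a client's point of view, the markers returned above are meant to be fed back verbatim into the next request until IsTruncated becomes false. A sketch of that driving loop, where Page and list_page are hypothetical stand-ins for one ListMultipartUploads round trip:

    // Minimal shape of one listing response, as seen by a client.
    struct Page {
        uploads: Vec<String>,
        is_truncated: bool,
        next_key_marker: Option<String>,
        next_upload_id_marker: Option<String>,
    }

    // In a real client this would perform the HTTP request with the given markers.
    fn list_page(_key_marker: Option<&str>, _upload_id_marker: Option<&str>) -> Page {
        Page {
            uploads: vec![],
            is_truncated: false,
            next_key_marker: None,
            next_upload_id_marker: None,
        }
    }

    fn main() {
        let (mut key_marker, mut upload_id_marker) = (None::<String>, None::<String>);
        loop {
            let page = list_page(key_marker.as_deref(), upload_id_marker.as_deref());
            for u in &page.uploads {
                println!("in-progress upload: {}", u);
            }
            if !page.is_truncated {
                break; // no more pages
            }
            // Feed the returned markers into the next request.
            key_marker = page.next_key_marker;
            upload_id_marker = page.next_upload_id_marker;
        }
    }
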
+} + +pub async fn handle_list_parts( + garage: Arc, + query: &ListPartsQuery, +) -> Result, Error> { + debug!("ListParts {:?}", query); + + let upload_id = s3_put::decode_upload_id(&query.upload_id)?; + + let (object, version) = futures::try_join!( + garage.object_table.get(&query.bucket_id, &query.key), + garage.version_table.get(&upload_id, &EmptyKey), + )?; + + let (info, next) = fetch_part_info(query, object, version, upload_id)?; + + let result = s3_xml::ListPartsResult { + xmlns: (), + bucket: s3_xml::Value(query.bucket_name.to_string()), + key: s3_xml::Value(query.key.to_string()), + upload_id: s3_xml::Value(query.upload_id.to_string()), + part_number_marker: query.part_number_marker.map(|e| s3_xml::IntValue(e as i64)), + next_part_number_marker: next.map(|e| s3_xml::IntValue(e as i64)), + max_parts: s3_xml::IntValue(query.max_parts as i64), + is_truncated: s3_xml::Value(next.map(|_| "true").unwrap_or("false").to_string()), + parts: info + .iter() + .map(|part| s3_xml::PartItem { + etag: s3_xml::Value(format!("\"{}\"", part.etag)), + last_modified: s3_xml::Value(msec_to_rfc3339(part.timestamp)), + part_number: s3_xml::IntValue(part.part_number as i64), + size: s3_xml::IntValue(part.size as i64), + }) + .collect(), + initiator: s3_xml::Initiator { + display_name: s3_xml::Value(DUMMY_NAME.to_string()), + id: s3_xml::Value(DUMMY_KEY.to_string()), + }, + owner: s3_xml::Owner { + display_name: s3_xml::Value(DUMMY_NAME.to_string()), + id: s3_xml::Value(DUMMY_KEY.to_string()), + }, + storage_class: s3_xml::Value("STANDARD".to_string()), + }; + + let xml = s3_xml::to_xml_with_header(&result)?; + + Ok(Response::builder() + .header("Content-Type", "application/xml") + .body(Body::from(xml.into_bytes()))?) +} + +/* + * Private enums and structs + */ + +#[derive(Debug)] +struct ObjectInfo { + last_modified: u64, + size: u64, + etag: String, +} + +#[derive(Debug, PartialEq)] +struct UploadInfo { + key: String, + timestamp: u64, +} + +#[derive(Debug, PartialEq)] +struct PartInfo { + etag: String, + timestamp: u64, + part_number: u64, + size: u64, +} + +enum ExtractionResult { + NoMore, + Filled, + FilledAtUpload { + key: String, + upload: Uuid, + }, + Extracted { + key: String, + }, + // Fallback key is used for legacy APIs that only support + // exlusive pagination (and not inclusive one). + SkipTo { + key: String, + fallback_key: Option, + }, +} + +#[derive(PartialEq, Clone, Debug)] +enum RangeBegin { + // Fallback key is used for legacy APIs that only support + // exlusive pagination (and not inclusive one). + IncludingKey { + key: String, + fallback_key: Option, + }, + AfterKey { + key: String, + }, + AfterUpload { + key: String, + upload: Uuid, + }, +} +type Pagination = Option; + +/* + * Fetch list entries + */ + +async fn fetch_list_entries( + query: &ListQueryCommon, + begin: RangeBegin, + acc: &mut impl ExtractAccumulator, + mut io: F, +) -> Result +where + R: futures::Future, GarageError>>, + F: FnMut(Uuid, Option, usize) -> R, +{ + let mut cursor = begin; + // +1 is needed as we may need to skip the 1st key + // (range is inclusive while most S3 requests are exclusive) + let count = query.page_size + 1; + + loop { + let start_key = match cursor { + RangeBegin::AfterKey { ref key } + | RangeBegin::AfterUpload { ref key, .. } + | RangeBegin::IncludingKey { ref key, .. 
} => Some(key.clone()), + }; + + // Fetch objects + let objects = io(query.bucket_id, start_key.clone(), count).await?; + + debug!( + "List: get range {:?} (max {}), results: {}", + start_key, + count, + objects.len() + ); + let server_more = objects.len() >= count; + + let prev_req_cursor = cursor.clone(); + let mut iter = objects.iter().peekable(); + + // Drop the first key if needed + // Only AfterKey requires it according to the S3 spec and our implem. + match (&cursor, iter.peek()) { + (RangeBegin::AfterKey { key }, Some(object)) if &object.key == key => iter.next(), + (_, _) => None, + }; + + while let Some(object) = iter.peek() { + if !object.key.starts_with(&query.prefix) { + // If the key is not in the requested prefix, we're done + return Ok(None); + } + + cursor = match acc.extract(query, &cursor, &mut iter) { + ExtractionResult::Extracted { key } => RangeBegin::AfterKey { key }, + ExtractionResult::SkipTo { key, fallback_key } => { + RangeBegin::IncludingKey { key, fallback_key } + } + ExtractionResult::FilledAtUpload { key, upload } => { + return Ok(Some(RangeBegin::AfterUpload { key, upload })) + } + ExtractionResult::Filled => return Ok(Some(cursor)), + ExtractionResult::NoMore => return Ok(None), + }; + } + + if !server_more { + // We did not fully fill the accumulator despite exhausting all the data we have, + // we're done + return Ok(None); + } + + if prev_req_cursor == cursor { + unreachable!("No progress has been done in the loop. This is a bug, please report it."); + } + } +} + +fn fetch_part_info( + query: &ListPartsQuery, + object: Option, + version: Option, + upload_id: Uuid, +) -> Result<(Vec, Option), Error> { + // Check results + let object = object.ok_or(Error::NoSuchKey)?; + + let obj_version = object + .versions() + .iter() + .find(|v| v.uuid == upload_id && v.is_uploading()) + .ok_or(Error::NoSuchUpload)?; + + let version = version.ok_or(Error::NoSuchKey)?; + + // Cut the beginning of our 2 vectors if required + let (etags, blocks) = match &query.part_number_marker { + Some(marker) => { + let next = marker + 1; + + let part_idx = into_ok_or_err( + version + .parts_etags + .items() + .binary_search_by(|(part_num, _)| part_num.cmp(&next)), + ); + let parts = &version.parts_etags.items()[part_idx..]; + + let block_idx = into_ok_or_err( + version + .blocks + .items() + .binary_search_by(|(vkey, _)| vkey.part_number.cmp(&next)), + ); + let blocks = &version.blocks.items()[block_idx..]; + + (parts, blocks) + } + None => (version.parts_etags.items(), version.blocks.items()), + }; + + // Use the block vector to compute a (part_number, size) vector + let mut size = Vec::<(u64, u64)>::new(); + blocks.iter().for_each(|(key, val)| { + let mut new_size = val.size; + match size.pop() { + Some((part_number, size)) if part_number == key.part_number => new_size += size, + Some(v) => size.push(v), + None => (), + } + size.push((key.part_number, new_size)) + }); + + // Merge the etag vector and size vector to build a PartInfo vector + let max_parts = query.max_parts as usize; + let (mut etag_iter, mut size_iter) = (etags.iter().peekable(), size.iter().peekable()); + + let mut info = Vec::::with_capacity(max_parts); + + while info.len() < max_parts { + match (etag_iter.peek(), size_iter.peek()) { + (Some((ep, etag)), Some((sp, size))) => match ep.cmp(sp) { + Ordering::Less => { + debug!("ETag information ignored due to missing corresponding block information. 
Query: {:?}", query); + etag_iter.next(); + } + Ordering::Equal => { + info.push(PartInfo { + etag: etag.to_string(), + timestamp: obj_version.timestamp, + part_number: *ep, + size: *size, + }); + etag_iter.next(); + size_iter.next(); + } + Ordering::Greater => { + debug!("Block information ignored due to missing corresponding ETag information. Query: {:?}", query); + size_iter.next(); + } + }, + (None, None) => return Ok((info, None)), + _ => { + debug!( + "Additional block or ETag information ignored. Query: {:?}", + query + ); + return Ok((info, None)); + } + } + } + + match info.last() { + Some(part_info) => { + let pagination = Some(part_info.part_number); + Ok((info, pagination)) + } + None => Ok((info, None)), + } +} + +/* + * ListQuery logic + */ + +/// Determine the key from where we want to start fetch objects from the database +/// +/// We choose whether the object at this key must +/// be included or excluded from the response. +/// This key can be the prefix in the base case, or intermediate +/// points in the dataset if we are continuing a previous listing. +impl ListObjectsQuery { + fn build_accumulator(&self) -> Accumulator { + Accumulator::::new(self.common.page_size) + } + + fn begin(&self) -> Result { + if self.is_v2 { + match (&self.continuation_token, &self.start_after) { + // In V2 mode, the continuation token is defined as an opaque + // string in the spec, so we can do whatever we want with it. + // In our case, it is defined as either [ or ] (for include + // representing the key to start with. + (Some(token), _) => match &token[..1] { + "[" => Ok(RangeBegin::IncludingKey { + key: String::from_utf8(base64::decode(token[1..].as_bytes())?)?, + fallback_key: None, + }), + "]" => Ok(RangeBegin::AfterKey { + key: String::from_utf8(base64::decode(token[1..].as_bytes())?)?, + }), + _ => Err(Error::BadRequest("Invalid continuation token".to_string())), + }, + + // StartAfter has defined semantics in the spec: + // start listing at the first key immediately after. + (_, Some(key)) => Ok(RangeBegin::AfterKey { + key: key.to_string(), + }), + + // In the case where neither is specified, we start + // listing at the specified prefix. If an object has this + // exact same key, we include it. (@TODO is this correct?) + _ => Ok(RangeBegin::IncludingKey { + key: self.common.prefix.to_string(), + fallback_key: None, + }), + } + } else { + match &self.marker { + // In V1 mode, the spec defines the Marker value to mean + // the same thing as the StartAfter value in V2 mode. + Some(key) => Ok(RangeBegin::AfterKey { + key: key.to_string(), + }), + _ => Ok(RangeBegin::IncludingKey { + key: self.common.prefix.to_string(), + fallback_key: None, + }), + } + } + } +} + +impl ListMultipartUploadsQuery { + fn build_accumulator(&self) -> Accumulator { + Accumulator::::new(self.common.page_size) + } + + fn begin(&self) -> Result { + match (&self.upload_id_marker, &self.key_marker) { + // If both the upload id marker and the key marker are sets, + // the spec specifies that we must start listing uploads INCLUDING the given key, + // AFTER the specified upload id (sorted in a lexicographic order). + // To enable some optimisations, we emulate "IncludingKey" by extending the upload id + // semantic. We base our reasoning on the hypothesis that S3's upload ids are opaques + // while Garage's ones are 32 bytes hex encoded which enables us to extend this query + // with a specific "include" upload id. + (Some(up_marker), Some(key_marker)) => match &up_marker[..] 
{ + "include" => Ok(RangeBegin::IncludingKey { + key: key_marker.to_string(), + fallback_key: None, + }), + uuid => Ok(RangeBegin::AfterUpload { + key: key_marker.to_string(), + upload: s3_put::decode_upload_id(uuid)?, + }), + }, + + // If only the key marker is specified, the spec says that we must start listing + // uploads AFTER the specified key. + (None, Some(key_marker)) => Ok(RangeBegin::AfterKey { + key: key_marker.to_string(), + }), + _ => Ok(RangeBegin::IncludingKey { + key: self.common.prefix.to_string(), + fallback_key: None, + }), + } + } +} + +/* + * Accumulator logic + */ + +trait ExtractAccumulator { + fn extract<'a>( + &mut self, + query: &ListQueryCommon, + cursor: &RangeBegin, + iter: &mut Peekable>, + ) -> ExtractionResult; +} + +struct Accumulator { + common_prefixes: BTreeSet, + keys: BTreeMap, + max_capacity: usize, +} + +type ObjectAccumulator = Accumulator; +type UploadAccumulator = Accumulator; + +impl Accumulator { + fn new(page_size: usize) -> Accumulator { + Accumulator { + common_prefixes: BTreeSet::::new(), + keys: BTreeMap::::new(), + max_capacity: page_size, + } + } + + /// Observe the Object iterator and try to extract a single common prefix + /// + /// This function can consume an arbitrary number of items as long as they share the same + /// common prefix. + fn extract_common_prefix<'a>( + &mut self, + objects: &mut Peekable>, + query: &ListQueryCommon, + ) -> Option { + // Get the next object from the iterator + let object = objects.peek().expect("This iterator can not be empty as it is checked earlier in the code. This is a logic bug, please report it."); + + // Check if this is a common prefix (requires a passed delimiter and its value in the key) + let pfx = match common_prefix(object, query) { + Some(p) => p, + None => return None, + }; + + // Try to register this prefix + // If not possible, we can return early + if !self.try_insert_common_prefix(pfx.to_string()) { + return Some(ExtractionResult::Filled); + } + + // We consume the whole common prefix from the iterator + let mut last_pfx_key = &object.key; + loop { + last_pfx_key = match objects.peek() { + Some(o) if o.key.starts_with(pfx) => &o.key, + Some(_) => { + return Some(ExtractionResult::Extracted { + key: last_pfx_key.to_owned(), + }) + } + None => { + return match key_after_prefix(pfx) { + Some(next) => Some(ExtractionResult::SkipTo { + key: next, + fallback_key: Some(last_pfx_key.to_owned()), + }), + None => Some(ExtractionResult::NoMore), + } + } + }; + + objects.next(); + } + } + + fn is_full(&mut self) -> bool { + self.keys.len() + self.common_prefixes.len() >= self.max_capacity + } + + fn try_insert_common_prefix(&mut self, key: String) -> bool { + // If we already have an entry, we can continue + if self.common_prefixes.contains(&key) { + return true; + } + + // Otherwise, we need to check if we can add it + match self.is_full() { + true => false, + false => { + self.common_prefixes.insert(key); + true + } + } + } + + fn try_insert_entry(&mut self, key: K, value: V) -> bool { + // It is impossible to add twice a key, this is an error + assert!(!self.keys.contains_key(&key)); + + match self.is_full() { + true => false, + false => { + self.keys.insert(key, value); + true + } + } + } +} + +impl ExtractAccumulator for ObjectAccumulator { + fn extract<'a>( + &mut self, + query: &ListQueryCommon, + _cursor: &RangeBegin, + objects: &mut Peekable>, + ) -> ExtractionResult { + if let Some(e) = self.extract_common_prefix(objects, query) { + return e; + } + + let object = 
objects.next().expect("This iterator can not be empty as it is checked earlier in the code. This is a logic bug, please report it."); + + let version = match object.versions().iter().find(|x| x.is_data()) { + Some(v) => v, + None => unreachable!( + "Expect to have objects having data due to earlier filtering. This is a logic bug." + ), + }; + + let meta = match &version.state { + ObjectVersionState::Complete(ObjectVersionData::Inline(meta, _)) => meta, + ObjectVersionState::Complete(ObjectVersionData::FirstBlock(meta, _)) => meta, + _ => unreachable!(), + }; + let info = ObjectInfo { + last_modified: version.timestamp, + size: meta.size, + etag: meta.etag.to_string(), + }; + + match self.try_insert_entry(object.key.clone(), info) { + true => ExtractionResult::Extracted { + key: object.key.clone(), + }, + false => ExtractionResult::Filled, + } + } +} + +impl ExtractAccumulator for UploadAccumulator { + /// Observe the iterator, process a single key, and try to extract one or more upload entries + /// + /// This function processes a single object from the iterator that can contain an arbitrary + /// number of versions, and thus "uploads". + fn extract<'a>( + &mut self, + query: &ListQueryCommon, + cursor: &RangeBegin, + objects: &mut Peekable>, + ) -> ExtractionResult { + if let Some(e) = self.extract_common_prefix(objects, query) { + return e; + } + + // Get the next object from the iterator + let object = objects.next().expect("This iterator can not be empty as it is checked earlier in the code. This is a logic bug, please report it."); + + let mut uploads_for_key = object + .versions() + .iter() + .filter(|x| x.is_uploading()) + .collect::>(); + + // S3 logic requires lexicographically sorted upload ids. + uploads_for_key.sort_unstable_by_key(|e| e.uuid); + + // Skip results if an upload marker is provided + if let RangeBegin::AfterUpload { upload, .. } = cursor { + // Because our data are sorted, we can use a binary search to find the UUID + // or to find where it should have been added. Once this position is found, + // we use it to discard the first part of the array. + let idx = match uploads_for_key.binary_search_by(|e| e.uuid.cmp(upload)) { + // we start after the found uuid so we need to discard the pointed value. + // In the worst case, the UUID is the last element, which lead us to an empty array + // but we are never out of bound. + Ok(i) => i + 1, + // if the UUID is not found, the upload may have been discarded between the 2 request, + // this function returns where it could have been inserted, + // the pointed value is thus greater than our marker and we need to keep it. + Err(i) => i, + }; + uploads_for_key = uploads_for_key[idx..].to_vec(); + } + + let mut iter = uploads_for_key.iter(); + + // The first entry is a specific case + // as it changes our result enum type + let first_upload = match iter.next() { + Some(u) => u, + None => { + return ExtractionResult::Extracted { + key: object.key.clone(), + } + } + }; + let first_up_info = UploadInfo { + key: object.key.to_string(), + timestamp: first_upload.timestamp, + }; + if !self.try_insert_entry(first_upload.uuid, first_up_info) { + return ExtractionResult::Filled; + } + + // We can then collect the remaining uploads in a loop + let mut prev_uuid = first_upload.uuid; + for upload in iter { + let up_info = UploadInfo { + key: object.key.to_string(), + timestamp: upload.timestamp, + }; + + // Insert data in our accumulator + // If it is full, return information to paginate. 
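
The marker handling above leans on the two-sided contract of binary_search: Ok(i) gives the marker's position, Err(i) gives the position where it would have been inserted (for instance if the upload finished between two requests), and in both cases everything up to and including the marker can be dropped. A small standalone sketch of the same idea on plain integers:

    // Return the suffix of a sorted slice that comes strictly after `marker`.
    fn skip_past_marker(sorted_ids: &[u32], marker: u32) -> &[u32] {
        let idx = match sorted_ids.binary_search(&marker) {
            Ok(i) => i + 1, // marker found: resume strictly after it
            Err(i) => i,    // marker gone: resume at its insertion point
        };
        &sorted_ids[idx..]
    }

    fn main() {
        let ids = [1u32, 4, 7, 9];
        assert_eq!(skip_past_marker(&ids, 4), &[7, 9]); // marker present
        assert_eq!(skip_past_marker(&ids, 5), &[7, 9]); // marker absent
        assert!(skip_past_marker(&ids, 9).is_empty());  // marker is last: nothing left
    }
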
+ if !self.try_insert_entry(upload.uuid, up_info) { + return ExtractionResult::FilledAtUpload { + key: object.key.clone(), + upload: prev_uuid, + }; + } + // Update our last added UUID + prev_uuid = upload.uuid; + } + + // We successfully collected all the uploads + ExtractionResult::Extracted { + key: object.key.clone(), + } + } +} + +/* + * Utility functions + */ + +/// This is a stub for Result::into_ok_or_err that is not yet in Rust stable +fn into_ok_or_err(r: Result) -> T { + match r { + Ok(r) => r, + Err(r) => r, + } +} + +/// Returns the common prefix of the object given the query prefix and delimiter +fn common_prefix<'a>(object: &'a Object, query: &ListQueryCommon) -> Option<&'a str> { + match &query.delimiter { + Some(delimiter) => object.key[query.prefix.len()..] + .find(delimiter) + .map(|i| &object.key[..query.prefix.len() + i + delimiter.len()]), + None => None, + } +} + +/// URIencode a value if needed +fn uriencode_maybe(s: &str, yes: bool) -> s3_xml::Value { + if yes { + s3_xml::Value(uri_encode(s, true)) + } else { + s3_xml::Value(s.to_string()) + } +} + +/* + * Unit tests of this module + */ +#[cfg(test)] +mod tests { + use super::*; + use garage_model::s3::version_table::*; + use garage_util::*; + use std::iter::FromIterator; + + const TS: u64 = 1641394898314; + + fn bucket() -> Uuid { + Uuid::from([0x42; 32]) + } + + fn query() -> ListMultipartUploadsQuery { + ListMultipartUploadsQuery { + common: ListQueryCommon { + prefix: "".to_string(), + delimiter: Some("/".to_string()), + page_size: 1000, + urlencode_resp: false, + bucket_name: "a".to_string(), + bucket_id: Uuid::from([0x00; 32]), + }, + key_marker: None, + upload_id_marker: None, + } + } + + fn objs() -> Vec { + vec![ + Object::new( + bucket(), + "a/b/c".to_string(), + vec![objup_version([0x01; 32])], + ), + Object::new(bucket(), "d".to_string(), vec![objup_version([0x01; 32])]), + ] + } + + fn objup_version(uuid: [u8; 32]) -> ObjectVersion { + ObjectVersion { + uuid: Uuid::from(uuid), + timestamp: TS, + state: ObjectVersionState::Uploading(ObjectVersionHeaders { + content_type: "text/plain".to_string(), + other: BTreeMap::::new(), + }), + } + } + + #[test] + fn test_common_prefixes() { + let mut query = query(); + let objs = objs(); + + query.common.prefix = "a/".to_string(); + assert_eq!( + common_prefix(objs.get(0).unwrap(), &query.common), + Some("a/b/") + ); + + query.common.prefix = "a/b/".to_string(); + assert_eq!(common_prefix(objs.get(0).unwrap(), &query.common), None); + } + + #[test] + fn test_extract_common_prefix() { + let mut query = query(); + query.common.prefix = "a/".to_string(); + let objs = objs(); + let mut acc = UploadAccumulator::new(query.common.page_size); + + let mut iter = objs.iter().peekable(); + match acc.extract_common_prefix(&mut iter, &query.common) { + Some(ExtractionResult::Extracted { key }) => assert_eq!(key, "a/b/c".to_string()), + _ => panic!("wrong result"), + } + assert_eq!(acc.common_prefixes.len(), 1); + assert_eq!(acc.common_prefixes.iter().next().unwrap(), "a/b/"); + } + + #[test] + fn test_extract_upload() { + let objs = vec![ + Object::new( + bucket(), + "b".to_string(), + vec![ + objup_version([0x01; 32]), + objup_version([0x80; 32]), + objup_version([0x8f; 32]), + objup_version([0xdd; 32]), + ], + ), + Object::new(bucket(), "c".to_string(), vec![]), + ]; + + let mut acc = UploadAccumulator::new(2); + let mut start = RangeBegin::AfterUpload { + key: "b".to_string(), + upload: Uuid::from([0x01; 32]), + }; + + let mut iter = objs.iter().peekable(); + + // Check the 
case where we skip some uploads + match acc.extract(&(query().common), &start, &mut iter) { + ExtractionResult::FilledAtUpload { key, upload } => { + assert_eq!(key, "b"); + assert_eq!(upload, Uuid::from([0x8f; 32])); + } + _ => panic!("wrong result"), + }; + + assert_eq!(acc.keys.len(), 2); + assert_eq!( + acc.keys.get(&Uuid::from([0x80; 32])).unwrap(), + &UploadInfo { + timestamp: TS, + key: "b".to_string() + } + ); + assert_eq!( + acc.keys.get(&Uuid::from([0x8f; 32])).unwrap(), + &UploadInfo { + timestamp: TS, + key: "b".to_string() + } + ); + + acc = UploadAccumulator::new(2); + start = RangeBegin::AfterUpload { + key: "b".to_string(), + upload: Uuid::from([0xff; 32]), + }; + iter = objs.iter().peekable(); + + // Check the case where we skip all the uploads + match acc.extract(&(query().common), &start, &mut iter) { + ExtractionResult::Extracted { key } if key.as_str() == "b" => (), + _ => panic!("wrong result"), + }; + } + + #[tokio::test] + async fn test_fetch_uploads_no_result() -> Result<(), Error> { + let query = query(); + let mut acc = query.build_accumulator(); + let page = fetch_list_entries( + &query.common, + query.begin()?, + &mut acc, + |_, _, _| async move { Ok(vec![]) }, + ) + .await?; + assert_eq!(page, None); + assert_eq!(acc.common_prefixes.len(), 0); + assert_eq!(acc.keys.len(), 0); + + Ok(()) + } + + #[tokio::test] + async fn test_fetch_uploads_basic() -> Result<(), Error> { + let query = query(); + let mut acc = query.build_accumulator(); + let mut fake_io = |_, _, _| async move { Ok(objs()) }; + let page = + fetch_list_entries(&query.common, query.begin()?, &mut acc, &mut fake_io).await?; + assert_eq!(page, None); + assert_eq!(acc.common_prefixes.len(), 1); + assert_eq!(acc.keys.len(), 1); + assert!(acc.common_prefixes.contains("a/")); + + Ok(()) + } + + #[tokio::test] + async fn test_fetch_uploads_advanced() -> Result<(), Error> { + let mut query = query(); + query.common.page_size = 2; + + let mut fake_io = |_, k: Option, _| async move { + Ok(match k.as_deref() { + Some("") => vec![ + Object::new(bucket(), "b/a".to_string(), vec![objup_version([0x01; 32])]), + Object::new(bucket(), "b/b".to_string(), vec![objup_version([0x01; 32])]), + Object::new(bucket(), "b/c".to_string(), vec![objup_version([0x01; 32])]), + ], + Some("b0") => vec![ + Object::new(bucket(), "c/a".to_string(), vec![objup_version([0x01; 32])]), + Object::new(bucket(), "c/b".to_string(), vec![objup_version([0x01; 32])]), + Object::new(bucket(), "c/c".to_string(), vec![objup_version([0x02; 32])]), + ], + Some("c0") => vec![Object::new( + bucket(), + "d".to_string(), + vec![objup_version([0x01; 32])], + )], + _ => panic!("wrong value {:?}", k), + }) + }; + + let mut acc = query.build_accumulator(); + let page = + fetch_list_entries(&query.common, query.begin()?, &mut acc, &mut fake_io).await?; + assert_eq!( + page, + Some(RangeBegin::IncludingKey { + key: "c0".to_string(), + fallback_key: Some("c/c".to_string()) + }) + ); + assert_eq!(acc.common_prefixes.len(), 2); + assert_eq!(acc.keys.len(), 0); + assert!(acc.common_prefixes.contains("b/")); + assert!(acc.common_prefixes.contains("c/")); + + Ok(()) + } + + fn version() -> Version { + let uuid = Uuid::from([0x08; 32]); + + let blocks = vec![ + ( + VersionBlockKey { + part_number: 1, + offset: 1, + }, + VersionBlock { + hash: uuid, + size: 3, + }, + ), + ( + VersionBlockKey { + part_number: 1, + offset: 2, + }, + VersionBlock { + hash: uuid, + size: 2, + }, + ), + ( + VersionBlockKey { + part_number: 2, + offset: 1, + }, + VersionBlock { + hash: 
uuid, + size: 8, + }, + ), + ( + VersionBlockKey { + part_number: 5, + offset: 1, + }, + VersionBlock { + hash: uuid, + size: 7, + }, + ), + ( + VersionBlockKey { + part_number: 8, + offset: 1, + }, + VersionBlock { + hash: uuid, + size: 5, + }, + ), + ]; + let etags = vec![ + (1, "etag1".to_string()), + (3, "etag2".to_string()), + (5, "etag3".to_string()), + (8, "etag4".to_string()), + (9, "etag5".to_string()), + ]; + + Version { + bucket_id: uuid, + key: "a".to_string(), + uuid, + deleted: false.into(), + blocks: crdt::Map::::from_iter(blocks), + parts_etags: crdt::Map::::from_iter(etags), + } + } + + fn obj() -> Object { + Object::new(bucket(), "d".to_string(), vec![objup_version([0x08; 32])]) + } + + #[test] + fn test_fetch_part_info() -> Result<(), Error> { + let uuid = Uuid::from([0x08; 32]); + let mut query = ListPartsQuery { + bucket_name: "a".to_string(), + bucket_id: uuid, + key: "a".to_string(), + upload_id: "xx".to_string(), + part_number_marker: None, + max_parts: 2, + }; + + assert!( + fetch_part_info(&query, None, None, uuid).is_err(), + "No object and version should fail" + ); + assert!( + fetch_part_info(&query, Some(obj()), None, uuid).is_err(), + "No version should faild" + ); + assert!( + fetch_part_info(&query, None, Some(version()), uuid).is_err(), + "No object should fail" + ); + + // Start from the beginning but with limited size to trigger pagination + let (info, pagination) = fetch_part_info(&query, Some(obj()), Some(version()), uuid)?; + assert_eq!(pagination.unwrap(), 5); + assert_eq!( + info, + vec![ + PartInfo { + etag: "etag1".to_string(), + timestamp: TS, + part_number: 1, + size: 5 + }, + PartInfo { + etag: "etag3".to_string(), + timestamp: TS, + part_number: 5, + size: 7 + }, + ] + ); + + // Use previous pagination to make a new request + query.part_number_marker = Some(pagination.unwrap()); + let (info, pagination) = fetch_part_info(&query, Some(obj()), Some(version()), uuid)?; + assert!(pagination.is_none()); + assert_eq!( + info, + vec![PartInfo { + etag: "etag4".to_string(), + timestamp: TS, + part_number: 8, + size: 5 + },] + ); + + // Trying to access a part that is way larger than registered ones + query.part_number_marker = Some(9999); + let (info, pagination) = fetch_part_info(&query, Some(obj()), Some(version()), uuid)?; + assert!(pagination.is_none()); + assert_eq!(info, vec![]); + + // Try without any limitation + query.max_parts = 1000; + query.part_number_marker = None; + let (info, pagination) = fetch_part_info(&query, Some(obj()), Some(version()), uuid)?; + assert!(pagination.is_none()); + assert_eq!( + info, + vec![ + PartInfo { + etag: "etag1".to_string(), + timestamp: TS, + part_number: 1, + size: 5 + }, + PartInfo { + etag: "etag3".to_string(), + timestamp: TS, + part_number: 5, + size: 7 + }, + PartInfo { + etag: "etag4".to_string(), + timestamp: TS, + part_number: 8, + size: 5 + }, + ] + ); + + Ok(()) + } +} diff --git a/src/api/s3/mod.rs b/src/api/s3/mod.rs new file mode 100644 index 00000000..3f5c1915 --- /dev/null +++ b/src/api/s3/mod.rs @@ -0,0 +1,14 @@ +pub mod api_server; + +mod bucket; +mod copy; +pub mod cors; +mod delete; +pub mod get; +mod list; +mod post_object; +mod put; +mod website; + +mod router; +pub mod xml; diff --git a/src/api/s3/post_object.rs b/src/api/s3/post_object.rs new file mode 100644 index 00000000..86fa7880 --- /dev/null +++ b/src/api/s3/post_object.rs @@ -0,0 +1,507 @@ +use std::collections::HashMap; +use std::convert::TryInto; +use std::ops::RangeInclusive; +use std::sync::Arc; +use std::task::{Context, 
Poll};
+
+use bytes::Bytes;
+use chrono::{DateTime, Duration, Utc};
+use futures::{Stream, StreamExt};
+use hyper::header::{self, HeaderMap, HeaderName, HeaderValue};
+use hyper::{Body, Request, Response, StatusCode};
+use multer::{Constraints, Multipart, SizeLimit};
+use serde::Deserialize;
+
+use garage_model::garage::Garage;
+
+use crate::error::*;
+use crate::helpers::resolve_bucket;
+use crate::s3::put::{get_headers, save_stream};
+use crate::s3::xml as s3_xml;
+use crate::signature::payload::{parse_date, verify_v4};
+
+pub async fn handle_post_object(
+	garage: Arc<Garage>,
+	req: Request<Body>,
+	bucket: String,
+) -> Result<Response<Body>, Error> {
+	let boundary = req
+		.headers()
+		.get(header::CONTENT_TYPE)
+		.and_then(|ct| ct.to_str().ok())
+		.and_then(|ct| multer::parse_boundary(ct).ok())
+		.ok_or_bad_request("Could not get multipart boundary")?;
+
+	// 16k seems plenty for a header. 5G is the max size of a single part, so it seems reasonable
+	// for a PostObject
+	let constraints = Constraints::new().size_limit(
+		SizeLimit::new()
+			.per_field(16 * 1024)
+			.for_field("file", 5 * 1024 * 1024 * 1024),
+	);
+
+	let (head, body) = req.into_parts();
+	let mut multipart = Multipart::with_constraints(body, boundary, constraints);
+
+	let mut params = HeaderMap::new();
+	let field = loop {
+		let field = if let Some(field) = multipart.next_field().await? {
+			field
+		} else {
+			return Err(Error::BadRequest(
+				"Request did not contain a file".to_owned(),
+			));
+		};
+		let name: HeaderName = if let Some(Ok(name)) = field.name().map(TryInto::try_into) {
+			name
+		} else {
+			continue;
+		};
+		if name == "file" {
+			break field;
+		}
+
+		if let Ok(content) = HeaderValue::from_str(&field.text().await?) {
+			match name.as_str() {
+				"tag" => (/* tags need to be re-encoded, but we don't support them yet anyway */),
+				"acl" => {
+					if params.insert("x-amz-acl", content).is_some() {
+						return Err(Error::BadRequest(
+							"Field 'acl' provided more than one time".to_string(),
+						));
+					}
+				}
+				_ => {
+					if params.insert(&name, content).is_some() {
+						return Err(Error::BadRequest(format!(
+							"Field '{}' provided more than one time",
+							name
+						)));
+					}
+				}
+			}
+		}
+	};
+
+	// Current part is the file. Do some checks before handing it over to the PutObject code
+	let key = params
+		.get("key")
+		.ok_or_bad_request("No key was provided")?
+		.to_str()?;
+	let credential = params
+		.get("x-amz-credential")
+		.ok_or_else(|| {
+			Error::Forbidden("Garage does not support anonymous access yet".to_string())
+		})?
+		.to_str()?;
+	let policy = params
+		.get("policy")
+		.ok_or_bad_request("No policy was provided")?
+		.to_str()?;
+	let signature = params
+		.get("x-amz-signature")
+		.ok_or_bad_request("No signature was provided")?
+		.to_str()?;
+	let date = params
+		.get("x-amz-date")
+		.ok_or_bad_request("No date was provided")?
+		.to_str()?;
+
+	let key = if key.contains("${filename}") {
+		// if no filename is provided, don't replace. This matches the behavior of AWS.
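+		// e.g. a "key" field of "uploads/${filename}" combined with an uploaded file named
+		// "cat.jpg" yields the object key "uploads/cat.jpg" (illustrative example)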
+		if let Some(filename) = field.file_name() {
+			key.replace("${filename}", filename)
+		} else {
+			key.to_owned()
+		}
+	} else {
+		key.to_owned()
+	};
+
+	let date = parse_date(date)?;
+	let api_key = verify_v4(
+		&garage,
+		"s3",
+		credential,
+		&date,
+		signature,
+		policy.as_bytes(),
+	)
+	.await?;
+
+	let bucket_id = resolve_bucket(&garage, &bucket, &api_key).await?;
+
+	if !api_key.allow_write(&bucket_id) {
+		return Err(Error::Forbidden(
+			"Operation is not allowed for this key.".to_string(),
+		));
+	}
+
+	let decoded_policy = base64::decode(&policy)?;
+	let decoded_policy: Policy =
+		serde_json::from_slice(&decoded_policy).ok_or_bad_request("Invalid policy")?;
+
+	let expiration: DateTime<Utc> = DateTime::parse_from_rfc3339(&decoded_policy.expiration)
+		.ok_or_bad_request("Invalid expiration date")?
+		.into();
+	if Utc::now() - expiration > Duration::zero() {
+		return Err(Error::BadRequest(
+			"Expiration date is in the past".to_string(),
+		));
+	}
+
+	let mut conditions = decoded_policy.into_conditions()?;
+
+	for (param_key, value) in params.iter() {
+		let mut param_key = param_key.to_string();
+		param_key.make_ascii_lowercase();
+		match param_key.as_str() {
+			"policy" | "x-amz-signature" => (), // this is always accepted, as it's required to validate other fields
+			"content-type" => {
+				let conds = conditions.params.remove("content-type").ok_or_else(|| {
+					Error::BadRequest(format!("Key '{}' is not allowed in policy", param_key))
+				})?;
+				for cond in conds {
+					let ok = match cond {
+						Operation::Equal(s) => s.as_str() == value,
+						Operation::StartsWith(s) => {
+							value.to_str()?.split(',').all(|v| v.starts_with(&s))
+						}
+					};
+					if !ok {
+						return Err(Error::BadRequest(format!(
+							"Key '{}' has value not allowed in policy",
+							param_key
+						)));
+					}
+				}
+			}
+			"key" => {
+				let conds = conditions.params.remove("key").ok_or_else(|| {
+					Error::BadRequest(format!("Key '{}' is not allowed in policy", param_key))
+				})?;
+				for cond in conds {
+					let ok = match cond {
+						Operation::Equal(s) => s == key,
+						Operation::StartsWith(s) => key.starts_with(&s),
+					};
+					if !ok {
+						return Err(Error::BadRequest(format!(
+							"Key '{}' has value not allowed in policy",
+							param_key
+						)));
+					}
+				}
+			}
+			_ => {
+				if param_key.starts_with("x-ignore-") {
+					// if an x-ignore- field is provided in the policy, it's not removed here, so it will be
+					// rejected as provided in the policy but not in the request. As odd as it is, this is
+					// how AWS seems to behave.
+ continue; + } + let conds = conditions.params.remove(¶m_key).ok_or_else(|| { + Error::BadRequest(format!("Key '{}' is not allowed in policy", param_key)) + })?; + for cond in conds { + let ok = match cond { + Operation::Equal(s) => s.as_str() == value, + Operation::StartsWith(s) => value.to_str()?.starts_with(s.as_str()), + }; + if !ok { + return Err(Error::BadRequest(format!( + "Key '{}' has value not allowed in policy", + param_key + ))); + } + } + } + } + } + + if let Some((param_key, _)) = conditions.params.iter().next() { + return Err(Error::BadRequest(format!( + "Key '{}' is required in policy, but no value was provided", + param_key + ))); + } + + let headers = get_headers(¶ms)?; + + let stream = field.map(|r| r.map_err(Into::into)); + let (_, md5) = save_stream( + garage, + headers, + StreamLimiter::new(stream, conditions.content_length), + bucket_id, + &key, + None, + None, + ) + .await?; + + let etag = format!("\"{}\"", md5); + + let resp = if let Some(mut target) = params + .get("success_action_redirect") + .and_then(|h| h.to_str().ok()) + .and_then(|u| url::Url::parse(u).ok()) + .filter(|u| u.scheme() == "https" || u.scheme() == "http") + { + target + .query_pairs_mut() + .append_pair("bucket", &bucket) + .append_pair("key", &key) + .append_pair("etag", &etag); + let target = target.to_string(); + Response::builder() + .status(StatusCode::SEE_OTHER) + .header(header::LOCATION, target.clone()) + .header(header::ETAG, etag) + .body(target.into())? + } else { + let path = head + .uri + .into_parts() + .path_and_query + .map(|paq| paq.path().to_string()) + .unwrap_or_else(|| "/".to_string()); + let authority = head + .headers + .get(header::HOST) + .and_then(|h| h.to_str().ok()) + .unwrap_or_default(); + let proto = if !authority.is_empty() { + "https://" + } else { + "" + }; + + let url_key: String = form_urlencoded::byte_serialize(key.as_bytes()) + .flat_map(str::chars) + .collect(); + let location = format!("{}{}{}{}", proto, authority, path, url_key); + + let action = params + .get("success_action_status") + .and_then(|h| h.to_str().ok()) + .unwrap_or("204"); + let builder = Response::builder() + .header(header::LOCATION, location.clone()) + .header(header::ETAG, etag.clone()); + match action { + "200" => builder.status(StatusCode::OK).body(Body::empty())?, + "201" => { + let xml = s3_xml::PostObject { + xmlns: (), + location: s3_xml::Value(location), + bucket: s3_xml::Value(bucket), + key: s3_xml::Value(key), + etag: s3_xml::Value(etag), + }; + let body = s3_xml::to_xml_with_header(&xml)?; + builder + .status(StatusCode::CREATED) + .body(Body::from(body.into_bytes()))? 
+ } + _ => builder.status(StatusCode::NO_CONTENT).body(Body::empty())?, + } + }; + + Ok(resp) +} + +#[derive(Deserialize)] +struct Policy { + expiration: String, + conditions: Vec, +} + +impl Policy { + fn into_conditions(self) -> Result { + let mut params = HashMap::<_, Vec<_>>::new(); + + let mut length = (0, u64::MAX); + for condition in self.conditions { + match condition { + PolicyCondition::Equal(map) => { + if map.len() != 1 { + return Err(Error::BadRequest("Invalid policy item".to_owned())); + } + let (mut k, v) = map.into_iter().next().expect("size was verified"); + k.make_ascii_lowercase(); + params.entry(k).or_default().push(Operation::Equal(v)); + } + PolicyCondition::OtherOp([cond, mut key, value]) => { + if key.remove(0) != '$' { + return Err(Error::BadRequest("Invalid policy item".to_owned())); + } + key.make_ascii_lowercase(); + match cond.as_str() { + "eq" => { + params.entry(key).or_default().push(Operation::Equal(value)); + } + "starts-with" => { + params + .entry(key) + .or_default() + .push(Operation::StartsWith(value)); + } + _ => return Err(Error::BadRequest("Invalid policy item".to_owned())), + } + } + PolicyCondition::SizeRange(key, min, max) => { + if key == "content-length-range" { + length.0 = length.0.max(min); + length.1 = length.1.min(max); + } else { + return Err(Error::BadRequest("Invalid policy item".to_owned())); + } + } + } + } + Ok(Conditions { + params, + content_length: RangeInclusive::new(length.0, length.1), + }) + } +} + +/// A single condition from a policy +#[derive(Debug, Deserialize)] +#[serde(untagged)] +enum PolicyCondition { + // will contain a single key-value pair + Equal(HashMap), + OtherOp([String; 3]), + SizeRange(String, u64, u64), +} + +#[derive(Debug)] +struct Conditions { + params: HashMap>, + content_length: RangeInclusive, +} + +#[derive(Debug, PartialEq, Eq)] +enum Operation { + Equal(String), + StartsWith(String), +} + +struct StreamLimiter { + inner: T, + length: RangeInclusive, + read: u64, +} + +impl StreamLimiter { + fn new(stream: T, length: RangeInclusive) -> Self { + StreamLimiter { + inner: stream, + length, + read: 0, + } + } +} + +impl Stream for StreamLimiter +where + T: Stream> + Unpin, +{ + type Item = Result; + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + ctx: &mut Context<'_>, + ) -> Poll> { + let res = std::pin::Pin::new(&mut self.inner).poll_next(ctx); + match &res { + Poll::Ready(Some(Ok(bytes))) => { + self.read += bytes.len() as u64; + // optimization to fail early when we know before the end it's too long + if self.length.end() < &self.read { + return Poll::Ready(Some(Err(Error::BadRequest( + "File size does not match policy".to_owned(), + )))); + } + } + Poll::Ready(None) => { + if !self.length.contains(&self.read) { + return Poll::Ready(Some(Err(Error::BadRequest( + "File size does not match policy".to_owned(), + )))); + } + } + _ => {} + } + res + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_policy_1() { + let policy_json = br#" +{ "expiration": "2007-12-01T12:00:00.000Z", + "conditions": [ + {"acl": "public-read" }, + {"bucket": "johnsmith" }, + ["starts-with", "$key", "user/eric/"] + ] +} + "#; + let policy_2: Policy = serde_json::from_slice(&policy_json[..]).unwrap(); + let mut conditions = policy_2.into_conditions().unwrap(); + + assert_eq!( + conditions.params.remove(&"acl".to_string()), + Some(vec![Operation::Equal("public-read".into())]) + ); + assert_eq!( + conditions.params.remove(&"bucket".to_string()), + Some(vec![Operation::Equal("johnsmith".into())]) + ); + 
assert_eq!( + conditions.params.remove(&"key".to_string()), + Some(vec![Operation::StartsWith("user/eric/".into())]) + ); + assert!(conditions.params.is_empty()); + assert_eq!(conditions.content_length, 0..=u64::MAX); + } + + #[test] + fn test_policy_2() { + let policy_json = br#" +{ "expiration": "2007-12-01T12:00:00.000Z", + "conditions": [ + [ "eq", "$acl", "public-read" ], + ["starts-with", "$Content-Type", "image/"], + ["starts-with", "$success_action_redirect", ""], + ["content-length-range", 1048576, 10485760] + ] +} + "#; + let policy_2: Policy = serde_json::from_slice(&policy_json[..]).unwrap(); + let mut conditions = policy_2.into_conditions().unwrap(); + + assert_eq!( + conditions.params.remove(&"acl".to_string()), + Some(vec![Operation::Equal("public-read".into())]) + ); + assert_eq!( + conditions.params.remove("content-type").unwrap(), + vec![Operation::StartsWith("image/".into())] + ); + assert_eq!( + conditions + .params + .remove(&"success_action_redirect".to_string()), + Some(vec![Operation::StartsWith("".into())]) + ); + assert!(conditions.params.is_empty()); + assert_eq!(conditions.content_length, 1048576..=10485760); + } +} diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs new file mode 100644 index 00000000..89aa8d84 --- /dev/null +++ b/src/api/s3/put.rs @@ -0,0 +1,753 @@ +use std::collections::{BTreeMap, BTreeSet, VecDeque}; +use std::sync::Arc; + +use futures::prelude::*; +use hyper::body::{Body, Bytes}; +use hyper::header::{HeaderMap, HeaderValue}; +use hyper::{Request, Response}; +use md5::{digest::generic_array::*, Digest as Md5Digest, Md5}; +use sha2::Sha256; + +use garage_table::*; +use garage_util::data::*; +use garage_util::error::Error as GarageError; +use garage_util::time::*; + +use garage_block::manager::INLINE_THRESHOLD; +use garage_model::garage::Garage; +use garage_model::s3::block_ref_table::*; +use garage_model::s3::object_table::*; +use garage_model::s3::version_table::*; + +use crate::error::*; +use crate::s3::xml as s3_xml; +use crate::signature::verify_signed_content; + +pub async fn handle_put( + garage: Arc, + req: Request, + bucket_id: Uuid, + key: &str, + content_sha256: Option, +) -> Result, Error> { + // Retrieve interesting headers from request + let headers = get_headers(req.headers())?; + debug!("Object headers: {:?}", headers); + + let content_md5 = match req.headers().get("content-md5") { + Some(x) => Some(x.to_str()?.to_string()), + None => None, + }; + + let (_head, body) = req.into_parts(); + let body = body.map_err(Error::from); + + save_stream( + garage, + headers, + body, + bucket_id, + key, + content_md5, + content_sha256, + ) + .await + .map(|(uuid, md5)| put_response(uuid, md5)) +} + +pub(crate) async fn save_stream> + Unpin>( + garage: Arc, + headers: ObjectVersionHeaders, + body: S, + bucket_id: Uuid, + key: &str, + content_md5: Option, + content_sha256: Option, +) -> Result<(Uuid, String), Error> { + // Generate identity of new version + let version_uuid = gen_uuid(); + let version_timestamp = now_msec(); + + let mut chunker = StreamChunker::new(body, garage.config.block_size); + let first_block = chunker.next().await?.unwrap_or_default(); + + // If body is small enough, store it directly in the object table + // as "inline data". We can then return immediately. 
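+	// (Bodies of at least INLINE_THRESHOLD bytes instead go through the chunking and
+	// block-storage path further below.)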
+	if first_block.len() < INLINE_THRESHOLD {
+		let mut md5sum = Md5::new();
+		md5sum.update(&first_block[..]);
+		let data_md5sum = md5sum.finalize();
+		let data_md5sum_hex = hex::encode(data_md5sum);
+
+		let data_sha256sum = sha256sum(&first_block[..]);
+
+		ensure_checksum_matches(
+			data_md5sum.as_slice(),
+			data_sha256sum,
+			content_md5.as_deref(),
+			content_sha256,
+		)?;
+
+		let object_version = ObjectVersion {
+			uuid: version_uuid,
+			timestamp: version_timestamp,
+			state: ObjectVersionState::Complete(ObjectVersionData::Inline(
+				ObjectVersionMeta {
+					headers,
+					size: first_block.len() as u64,
+					etag: data_md5sum_hex.clone(),
+				},
+				first_block,
+			)),
+		};
+
+		let object = Object::new(bucket_id, key.into(), vec![object_version]);
+		garage.object_table.insert(&object).await?;
+
+		return Ok((version_uuid, data_md5sum_hex));
+	}
+
+	// Write version identifier in object table so that we have a trace
+	// that we are uploading something
+	let mut object_version = ObjectVersion {
+		uuid: version_uuid,
+		timestamp: version_timestamp,
+		state: ObjectVersionState::Uploading(headers.clone()),
+	};
+	let object = Object::new(bucket_id, key.into(), vec![object_version.clone()]);
+	garage.object_table.insert(&object).await?;
+
+	// Initialize corresponding entry in version table
+	// Write this entry now, even with empty block list,
+	// to prevent block_ref entries from being deleted (they can be deleted
+	// if they reference a version that isn't found in the version table)
+	let version = Version::new(version_uuid, bucket_id, key.into(), false);
+	garage.version_table.insert(&version).await?;
+
+	// Transfer data and verify checksum
+	let first_block_hash = blake2sum(&first_block[..]);
+	let tx_result = read_and_put_blocks(
+		&garage,
+		&version,
+		1,
+		first_block,
+		first_block_hash,
+		&mut chunker,
+	)
+	.await
+	.and_then(|(total_size, data_md5sum, data_sha256sum)| {
+		ensure_checksum_matches(
+			data_md5sum.as_slice(),
+			data_sha256sum,
+			content_md5.as_deref(),
+			content_sha256,
+		)
+		.map(|()| (total_size, data_md5sum))
+	});
+
+	// If something went wrong, clean up
+	let (total_size, md5sum_arr) = match tx_result {
+		Ok(rv) => rv,
+		Err(e) => {
+			// Mark object as aborted, this will free the blocks further down
+			object_version.state = ObjectVersionState::Aborted;
+			let object = Object::new(bucket_id, key.into(), vec![object_version.clone()]);
+			garage.object_table.insert(&object).await?;
+			return Err(e);
+		}
+	};
+
+	// Save final object state, marked as Complete
+	let md5sum_hex = hex::encode(md5sum_arr);
+	object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock(
+		ObjectVersionMeta {
+			headers,
+			size: total_size,
+			etag: md5sum_hex.clone(),
+		},
+		first_block_hash,
+	));
+	let object = Object::new(bucket_id, key.into(), vec![object_version]);
+	garage.object_table.insert(&object).await?;
+
+	Ok((version_uuid, md5sum_hex))
+}
+
+/// Validate MD5 sum against content-md5 header
+/// and sha256sum against signed content-sha256
+fn ensure_checksum_matches(
+	data_md5sum: &[u8],
+	data_sha256sum: garage_util::data::FixedBytes32,
+	content_md5: Option<&str>,
+	content_sha256: Option<Hash>,
+) -> Result<(), Error> {
+	if let Some(expected_sha256) = content_sha256 {
+		if expected_sha256 != data_sha256sum {
+			return Err(Error::BadRequest(
+				"Unable to validate x-amz-content-sha256".to_string(),
+			));
+		} else {
+			trace!("Successfully validated x-amz-content-sha256");
+		}
+	}
+	if let Some(expected_md5) = content_md5 {
+		if expected_md5.trim_matches('"') != 
base64::encode(data_md5sum) { + return Err(Error::BadRequest( + "Unable to validate content-md5".to_string(), + )); + } else { + trace!("Successfully validated content-md5"); + } + } + Ok(()) +} + +async fn read_and_put_blocks> + Unpin>( + garage: &Garage, + version: &Version, + part_number: u64, + first_block: Vec, + first_block_hash: Hash, + chunker: &mut StreamChunker, +) -> Result<(u64, GenericArray, Hash), Error> { + let mut md5hasher = Md5::new(); + let mut sha256hasher = Sha256::new(); + md5hasher.update(&first_block[..]); + sha256hasher.update(&first_block[..]); + + let mut next_offset = first_block.len(); + let mut put_curr_version_block = put_block_meta( + garage, + version, + part_number, + 0, + first_block_hash, + first_block.len() as u64, + ); + let mut put_curr_block = garage + .block_manager + .rpc_put_block(first_block_hash, first_block); + + loop { + let (_, _, next_block) = futures::try_join!( + put_curr_block.map_err(Error::from), + put_curr_version_block.map_err(Error::from), + chunker.next(), + )?; + if let Some(block) = next_block { + md5hasher.update(&block[..]); + sha256hasher.update(&block[..]); + let block_hash = blake2sum(&block[..]); + let block_len = block.len(); + put_curr_version_block = put_block_meta( + garage, + version, + part_number, + next_offset as u64, + block_hash, + block_len as u64, + ); + put_curr_block = garage.block_manager.rpc_put_block(block_hash, block); + next_offset += block_len; + } else { + break; + } + } + + let total_size = next_offset as u64; + let data_md5sum = md5hasher.finalize(); + + let data_sha256sum = sha256hasher.finalize(); + let data_sha256sum = Hash::try_from(&data_sha256sum[..]).unwrap(); + + Ok((total_size, data_md5sum, data_sha256sum)) +} + +async fn put_block_meta( + garage: &Garage, + version: &Version, + part_number: u64, + offset: u64, + hash: Hash, + size: u64, +) -> Result<(), GarageError> { + let mut version = version.clone(); + version.blocks.put( + VersionBlockKey { + part_number, + offset, + }, + VersionBlock { hash, size }, + ); + + let block_ref = BlockRef { + block: hash, + version: version.uuid, + deleted: false.into(), + }; + + futures::try_join!( + garage.version_table.insert(&version), + garage.block_ref_table.insert(&block_ref), + )?; + Ok(()) +} + +struct StreamChunker>> { + stream: S, + read_all: bool, + block_size: usize, + buf: VecDeque, +} + +impl> + Unpin> StreamChunker { + fn new(stream: S, block_size: usize) -> Self { + Self { + stream, + read_all: false, + block_size, + buf: VecDeque::with_capacity(2 * block_size), + } + } + + async fn next(&mut self) -> Result>, Error> { + while !self.read_all && self.buf.len() < self.block_size { + if let Some(block) = self.stream.next().await { + let bytes = block?; + trace!("Body next: {} bytes", bytes.len()); + self.buf.extend(bytes); + } else { + self.read_all = true; + } + } + + if self.buf.is_empty() { + Ok(None) + } else if self.buf.len() <= self.block_size { + let block = self.buf.drain(..).collect::>(); + Ok(Some(block)) + } else { + let block = self.buf.drain(..self.block_size).collect::>(); + Ok(Some(block)) + } + } +} + +pub fn put_response(version_uuid: Uuid, md5sum_hex: String) -> Response { + Response::builder() + .header("x-amz-version-id", hex::encode(version_uuid)) + .header("ETag", format!("\"{}\"", md5sum_hex)) + .body(Body::from(vec![])) + .unwrap() +} + +pub async fn handle_create_multipart_upload( + garage: Arc, + req: &Request, + bucket_name: &str, + bucket_id: Uuid, + key: &str, +) -> Result, Error> { + let version_uuid = gen_uuid(); + 
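+	// Note: the hex encoding of this version UUID is what is returned to the client
+	// as the multipart upload ID (see decode_upload_id, which reverses it).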
let headers = get_headers(req.headers())?;
+
+	// Create object in object table
+	let object_version = ObjectVersion {
+		uuid: version_uuid,
+		timestamp: now_msec(),
+		state: ObjectVersionState::Uploading(headers),
+	};
+	let object = Object::new(bucket_id, key.to_string(), vec![object_version]);
+	garage.object_table.insert(&object).await?;
+
+	// Insert empty version so that block_ref entries refer to something
+	// (they are inserted concurrently with blocks in the version table, so
+	// there is the possibility that they are inserted before the version table
+	// is created, in which case it is allowed to delete them, e.g. in repair_*)
+	let version = Version::new(version_uuid, bucket_id, key.into(), false);
+	garage.version_table.insert(&version).await?;
+
+	// Send success response
+	let result = s3_xml::InitiateMultipartUploadResult {
+		xmlns: (),
+		bucket: s3_xml::Value(bucket_name.to_string()),
+		key: s3_xml::Value(key.to_string()),
+		upload_id: s3_xml::Value(hex::encode(version_uuid)),
+	};
+	let xml = s3_xml::to_xml_with_header(&result)?;
+
+	Ok(Response::new(Body::from(xml.into_bytes())))
+}
+
+pub async fn handle_put_part(
+	garage: Arc<Garage>,
+	req: Request<Body>,
+	bucket_id: Uuid,
+	key: &str,
+	part_number: u64,
+	upload_id: &str,
+	content_sha256: Option<Hash>,
+) -> Result<Response<Body>, Error> {
+	let version_uuid = decode_upload_id(upload_id)?;
+
+	let content_md5 = match req.headers().get("content-md5") {
+		Some(x) => Some(x.to_str()?.to_string()),
+		None => None,
+	};
+
+	// Read the first chunk, and at the same time try to get the object to see if it exists
+	let key = key.to_string();
+
+	let body = req.into_body().map_err(Error::from);
+	let mut chunker = StreamChunker::new(body, garage.config.block_size);
+
+	let (object, version, first_block) = futures::try_join!(
+		garage
+			.object_table
+			.get(&bucket_id, &key)
+			.map_err(Error::from),
+		garage
+			.version_table
+			.get(&version_uuid, &EmptyKey)
+			.map_err(Error::from),
+		chunker.next(),
+	)?;
+
+	// Check object is valid and multipart block can be accepted
+	let first_block = first_block.ok_or_bad_request("Empty body")?;
+	let object = object.ok_or_bad_request("Object not found")?;
+
+	if !object
+		.versions()
+		.iter()
+		.any(|v| v.uuid == version_uuid && v.is_uploading())
+	{
+		return Err(Error::NoSuchUpload);
+	}
+
+	// Check part hasn't already been uploaded
+	if let Some(v) = version {
+		if v.has_part_number(part_number) {
+			return Err(Error::BadRequest(format!(
+				"Part number {} has already been uploaded",
+				part_number
+			)));
+		}
+	}
+
+	// Copy block to store
+	let version = Version::new(version_uuid, bucket_id, key, false);
+	let first_block_hash = blake2sum(&first_block[..]);
+	let (_, data_md5sum, data_sha256sum) = read_and_put_blocks(
+		&garage,
+		&version,
+		part_number,
+		first_block,
+		first_block_hash,
+		&mut chunker,
+	)
+	.await?;
+
+	// Verify that checksums match
+	ensure_checksum_matches(
+		data_md5sum.as_slice(),
+		data_sha256sum,
+		content_md5.as_deref(),
+		content_sha256,
+	)?;
+
+	// Store part etag in version
+	let data_md5sum_hex = hex::encode(data_md5sum);
+	let mut version = version;
+	version
+		.parts_etags
+		.put(part_number, data_md5sum_hex.clone());
+	garage.version_table.insert(&version).await?;
+
+	let response = Response::builder()
+		.header("ETag", format!("\"{}\"", data_md5sum_hex))
+		.body(Body::empty())
+		.unwrap();
+	Ok(response)
+}
+
+pub async fn handle_complete_multipart_upload(
+	garage: Arc<Garage>,
+	req: Request<Body>,
+	bucket_name: &str,
+	bucket_id: Uuid,
+	key: &str,
+	upload_id: &str,
+	content_sha256: Option<Hash>,
+)
-> Result, Error> { + let body = hyper::body::to_bytes(req.into_body()).await?; + + if let Some(content_sha256) = content_sha256 { + verify_signed_content(content_sha256, &body[..])?; + } + + let body_xml = roxmltree::Document::parse(std::str::from_utf8(&body)?)?; + let body_list_of_parts = parse_complete_multipart_upload_body(&body_xml) + .ok_or_bad_request("Invalid CompleteMultipartUpload XML")?; + debug!( + "CompleteMultipartUpload list of parts: {:?}", + body_list_of_parts + ); + + let version_uuid = decode_upload_id(upload_id)?; + + // Get object and version + let key = key.to_string(); + let (object, version) = futures::try_join!( + garage.object_table.get(&bucket_id, &key), + garage.version_table.get(&version_uuid, &EmptyKey), + )?; + + let object = object.ok_or(Error::NoSuchKey)?; + let mut object_version = object + .versions() + .iter() + .find(|v| v.uuid == version_uuid && v.is_uploading()) + .cloned() + .ok_or(Error::NoSuchUpload)?; + + let version = version.ok_or(Error::NoSuchKey)?; + if version.blocks.is_empty() { + return Err(Error::BadRequest("No data was uploaded".to_string())); + } + + let headers = match object_version.state { + ObjectVersionState::Uploading(headers) => headers, + _ => unreachable!(), + }; + + // Check that part numbers are an increasing sequence. + // (it doesn't need to start at 1 nor to be a continuous sequence, + // see discussion in #192) + if body_list_of_parts.is_empty() { + return Err(Error::EntityTooSmall); + } + if !body_list_of_parts + .iter() + .zip(body_list_of_parts.iter().skip(1)) + .all(|(p1, p2)| p1.part_number < p2.part_number) + { + return Err(Error::InvalidPartOrder); + } + + // Garage-specific restriction, see #204: part numbers must be + // consecutive starting at 1 + if body_list_of_parts[0].part_number != 1 + || !body_list_of_parts + .iter() + .zip(body_list_of_parts.iter().skip(1)) + .all(|(p1, p2)| p1.part_number + 1 == p2.part_number) + { + return Err(Error::NotImplemented("Garage does not support completing a Multipart upload with non-consecutive part numbers. This is a restriction of Garage's data model, which might be fixed in a future release. See issue #204 for more information on this topic.".into())); + } + + // Check that the list of parts they gave us corresponds to the parts we have here + debug!("Expected parts from request: {:?}", body_list_of_parts); + debug!("Parts stored in version: {:?}", version.parts_etags.items()); + let parts = version + .parts_etags + .items() + .iter() + .map(|pair| (&pair.0, &pair.1)); + let same_parts = body_list_of_parts + .iter() + .map(|x| (&x.part_number, &x.etag)) + .eq(parts); + if !same_parts { + return Err(Error::InvalidPart); + } + + // Check that all blocks belong to one of the parts + let block_parts = version + .blocks + .items() + .iter() + .map(|(bk, _)| bk.part_number) + .collect::>(); + let same_parts = body_list_of_parts + .iter() + .map(|x| x.part_number) + .eq(block_parts.into_iter()); + if !same_parts { + return Err(Error::BadRequest( + "Part numbers in block list and part list do not match. This can happen if a part was partially uploaded. 
Please abort the multipart upload and try again.".into(), + )); + } + + // Calculate etag of final object + // To understand how etags are calculated, read more here: + // https://teppen.io/2018/06/23/aws_s3_etags/ + let num_parts = body_list_of_parts.len(); + let mut etag_md5_hasher = Md5::new(); + for (_, etag) in version.parts_etags.items().iter() { + etag_md5_hasher.update(etag.as_bytes()); + } + let etag = format!("{}-{}", hex::encode(etag_md5_hasher.finalize()), num_parts); + + // Calculate total size of final object + let total_size = version.blocks.items().iter().map(|x| x.1.size).sum(); + + // Write final object version + object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( + ObjectVersionMeta { + headers, + size: total_size, + etag: etag.clone(), + }, + version.blocks.items()[0].1.hash, + )); + + let final_object = Object::new(bucket_id, key.clone(), vec![object_version]); + garage.object_table.insert(&final_object).await?; + + // Send response saying ok we're done + let result = s3_xml::CompleteMultipartUploadResult { + xmlns: (), + location: None, + bucket: s3_xml::Value(bucket_name.to_string()), + key: s3_xml::Value(key), + etag: s3_xml::Value(format!("\"{}\"", etag)), + }; + let xml = s3_xml::to_xml_with_header(&result)?; + + Ok(Response::new(Body::from(xml.into_bytes()))) +} + +pub async fn handle_abort_multipart_upload( + garage: Arc, + bucket_id: Uuid, + key: &str, + upload_id: &str, +) -> Result, Error> { + let version_uuid = decode_upload_id(upload_id)?; + + let object = garage + .object_table + .get(&bucket_id, &key.to_string()) + .await?; + let object = object.ok_or(Error::NoSuchKey)?; + + let object_version = object + .versions() + .iter() + .find(|v| v.uuid == version_uuid && v.is_uploading()); + let mut object_version = match object_version { + None => return Err(Error::NoSuchUpload), + Some(x) => x.clone(), + }; + + object_version.state = ObjectVersionState::Aborted; + let final_object = Object::new(bucket_id, key.to_string(), vec![object_version]); + garage.object_table.insert(&final_object).await?; + + Ok(Response::new(Body::from(vec![]))) +} + +fn get_mime_type(headers: &HeaderMap) -> Result { + Ok(headers + .get(hyper::header::CONTENT_TYPE) + .map(|x| x.to_str()) + .unwrap_or(Ok("blob"))? 
+ .to_string()) +} + +pub(crate) fn get_headers(headers: &HeaderMap) -> Result { + let content_type = get_mime_type(headers)?; + let mut other = BTreeMap::new(); + + // Preserve standard headers + let standard_header = vec![ + hyper::header::CACHE_CONTROL, + hyper::header::CONTENT_DISPOSITION, + hyper::header::CONTENT_ENCODING, + hyper::header::CONTENT_LANGUAGE, + hyper::header::EXPIRES, + ]; + for h in standard_header.iter() { + if let Some(v) = headers.get(h) { + match v.to_str() { + Ok(v_str) => { + other.insert(h.to_string(), v_str.to_string()); + } + Err(e) => { + warn!("Discarding header {}, error in .to_str(): {}", h, e); + } + } + } + } + + // Preserve x-amz-meta- headers + for (k, v) in headers.iter() { + if k.as_str().starts_with("x-amz-meta-") { + match v.to_str() { + Ok(v_str) => { + other.insert(k.to_string(), v_str.to_string()); + } + Err(e) => { + warn!("Discarding header {}, error in .to_str(): {}", k, e); + } + } + } + } + + Ok(ObjectVersionHeaders { + content_type, + other, + }) +} + +pub fn decode_upload_id(id: &str) -> Result { + let id_bin = hex::decode(id).map_err(|_| Error::NoSuchUpload)?; + if id_bin.len() != 32 { + return Err(Error::NoSuchUpload); + } + let mut uuid = [0u8; 32]; + uuid.copy_from_slice(&id_bin[..]); + Ok(Uuid::from(uuid)) +} + +#[derive(Debug)] +struct CompleteMultipartUploadPart { + etag: String, + part_number: u64, +} + +fn parse_complete_multipart_upload_body( + xml: &roxmltree::Document, +) -> Option> { + let mut parts = vec![]; + + let root = xml.root(); + let cmu = root.first_child()?; + if !cmu.has_tag_name("CompleteMultipartUpload") { + return None; + } + + for item in cmu.children() { + // Only parse nodes + if !item.is_element() { + continue; + } + + if item.has_tag_name("Part") { + let etag = item.children().find(|e| e.has_tag_name("ETag"))?.text()?; + let part_number = item + .children() + .find(|e| e.has_tag_name("PartNumber"))? + .text()?; + parts.push(CompleteMultipartUploadPart { + etag: etag.trim_matches('"').to_string(), + part_number: part_number.parse().ok()?, + }); + } else { + return None; + } + } + + Some(parts) +} diff --git a/src/api/s3/router.rs b/src/api/s3/router.rs new file mode 100644 index 00000000..0525c649 --- /dev/null +++ b/src/api/s3/router.rs @@ -0,0 +1,1080 @@ +use crate::error::{Error, OkOrBadRequest}; + +use std::borrow::Cow; + +use hyper::header::HeaderValue; +use hyper::{HeaderMap, Method, Request}; + +use crate::helpers::Authorization; +use crate::router_macros::{generateQueryParameters, router_match}; + +router_match! {@func + +/// List of all S3 API endpoints. +/// +/// For each endpoint, it contains the parameters this endpoint receive by url (bucket, key and +/// query parameters). Parameters it may receive by header are left out, however headers are +/// considered when required to determine between one endpoint or another (for CopyObject and +/// UploadObject, for instance). 
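+/// For example, `GET /?uploads` on a bucket parses to `Endpoint::ListMultipartUploads`,
+/// while `PUT /{key}?partNumber=2&uploadId=...` parses to `Endpoint::UploadPart`.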
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Endpoint { + AbortMultipartUpload { + key: String, + upload_id: String, + }, + CompleteMultipartUpload { + key: String, + upload_id: String, + }, + CopyObject { + key: String, + }, + CreateBucket { + }, + CreateMultipartUpload { + key: String, + }, + DeleteBucket { + }, + DeleteBucketAnalyticsConfiguration { + id: String, + }, + DeleteBucketCors { + }, + DeleteBucketEncryption { + }, + DeleteBucketIntelligentTieringConfiguration { + id: String, + }, + DeleteBucketInventoryConfiguration { + id: String, + }, + DeleteBucketLifecycle { + }, + DeleteBucketMetricsConfiguration { + id: String, + }, + DeleteBucketOwnershipControls { + }, + DeleteBucketPolicy { + }, + DeleteBucketReplication { + }, + DeleteBucketTagging { + }, + DeleteBucketWebsite { + }, + DeleteObject { + key: String, + version_id: Option, + }, + DeleteObjects { + }, + DeleteObjectTagging { + key: String, + version_id: Option, + }, + DeletePublicAccessBlock { + }, + GetBucketAccelerateConfiguration { + }, + GetBucketAcl { + }, + GetBucketAnalyticsConfiguration { + id: String, + }, + GetBucketCors { + }, + GetBucketEncryption { + }, + GetBucketIntelligentTieringConfiguration { + id: String, + }, + GetBucketInventoryConfiguration { + id: String, + }, + GetBucketLifecycleConfiguration { + }, + GetBucketLocation { + }, + GetBucketLogging { + }, + GetBucketMetricsConfiguration { + id: String, + }, + GetBucketNotificationConfiguration { + }, + GetBucketOwnershipControls { + }, + GetBucketPolicy { + }, + GetBucketPolicyStatus { + }, + GetBucketReplication { + }, + GetBucketRequestPayment { + }, + GetBucketTagging { + }, + GetBucketVersioning { + }, + GetBucketWebsite { + }, + /// There are actually many more query parameters, used to add headers to the answer. They were + /// not added here as they are best handled in a dedicated route. + GetObject { + key: String, + part_number: Option, + version_id: Option, + }, + GetObjectAcl { + key: String, + version_id: Option, + }, + GetObjectLegalHold { + key: String, + version_id: Option, + }, + GetObjectLockConfiguration { + }, + GetObjectRetention { + key: String, + version_id: Option, + }, + GetObjectTagging { + key: String, + version_id: Option, + }, + GetObjectTorrent { + key: String, + }, + GetPublicAccessBlock { + }, + HeadBucket { + }, + HeadObject { + key: String, + part_number: Option, + version_id: Option, + }, + ListBucketAnalyticsConfigurations { + continuation_token: Option, + }, + ListBucketIntelligentTieringConfigurations { + continuation_token: Option, + }, + ListBucketInventoryConfigurations { + continuation_token: Option, + }, + ListBucketMetricsConfigurations { + continuation_token: Option, + }, + ListBuckets, + ListMultipartUploads { + delimiter: Option, + encoding_type: Option, + key_marker: Option, + max_uploads: Option, + prefix: Option, + upload_id_marker: Option, + }, + ListObjects { + delimiter: Option, + encoding_type: Option, + marker: Option, + max_keys: Option, + prefix: Option, + }, + ListObjectsV2 { + // This value should always be 2. 
It is not checked when constructing the struct + list_type: String, + continuation_token: Option, + delimiter: Option, + encoding_type: Option, + fetch_owner: Option, + max_keys: Option, + prefix: Option, + start_after: Option, + }, + ListObjectVersions { + delimiter: Option, + encoding_type: Option, + key_marker: Option, + max_keys: Option, + prefix: Option, + version_id_marker: Option, + }, + ListParts { + key: String, + max_parts: Option, + part_number_marker: Option, + upload_id: String, + }, + Options, + PutBucketAccelerateConfiguration { + }, + PutBucketAcl { + }, + PutBucketAnalyticsConfiguration { + id: String, + }, + PutBucketCors { + }, + PutBucketEncryption { + }, + PutBucketIntelligentTieringConfiguration { + id: String, + }, + PutBucketInventoryConfiguration { + id: String, + }, + PutBucketLifecycleConfiguration { + }, + PutBucketLogging { + }, + PutBucketMetricsConfiguration { + id: String, + }, + PutBucketNotificationConfiguration { + }, + PutBucketOwnershipControls { + }, + PutBucketPolicy { + }, + PutBucketReplication { + }, + PutBucketRequestPayment { + }, + PutBucketTagging { + }, + PutBucketVersioning { + }, + PutBucketWebsite { + }, + PutObject { + key: String, + }, + PutObjectAcl { + key: String, + version_id: Option, + }, + PutObjectLegalHold { + key: String, + version_id: Option, + }, + PutObjectLockConfiguration { + }, + PutObjectRetention { + key: String, + version_id: Option, + }, + PutObjectTagging { + key: String, + version_id: Option, + }, + PutPublicAccessBlock { + }, + RestoreObject { + key: String, + version_id: Option, + }, + SelectObjectContent { + key: String, + // This value should always be 2. It is not checked when constructing the struct + select_type: String, + }, + UploadPart { + key: String, + part_number: u64, + upload_id: String, + }, + UploadPartCopy { + key: String, + part_number: u64, + upload_id: String, + }, + // This endpoint is not documented with others because it has special use case : + // It's intended to be used with HTML forms, using a multipart/form-data body. + // It works a lot like presigned requests, but everything is in the form instead + // of being query parameters of the URL, so authenticating it is a bit different. + PostObject, +}} + +impl Endpoint { + /// Determine which S3 endpoint a request is for using the request, and a bucket which was + /// possibly extracted from the Host header. + /// Returns Self plus bucket name, if endpoint is not Endpoint::ListBuckets + pub fn from_request( + req: &Request, + bucket: Option, + ) -> Result<(Self, Option), Error> { + let uri = req.uri(); + let path = uri.path().trim_start_matches('/'); + let query = uri.query(); + if bucket.is_none() && path.is_empty() { + if *req.method() == Method::OPTIONS { + return Ok((Self::Options, None)); + } else { + return Ok((Self::ListBuckets, None)); + } + } + + let (bucket, key) = if let Some(bucket) = bucket { + (bucket, path) + } else { + path.split_once('/') + .map(|(b, p)| (b.to_owned(), p.trim_start_matches('/'))) + .unwrap_or((path.to_owned(), "")) + }; + + if *req.method() == Method::OPTIONS { + return Ok((Self::Options, Some(bucket))); + } + + let key = percent_encoding::percent_decode_str(key) + .decode_utf8()? 
+ .into_owned(); + + let mut query = QueryParameters::from_query(query.unwrap_or_default())?; + + let res = match *req.method() { + Method::GET => Self::from_get(key, &mut query)?, + Method::HEAD => Self::from_head(key, &mut query)?, + Method::POST => Self::from_post(key, &mut query)?, + Method::PUT => Self::from_put(key, &mut query, req.headers())?, + Method::DELETE => Self::from_delete(key, &mut query)?, + _ => return Err(Error::BadRequest("Unknown method".to_owned())), + }; + + if let Some(message) = query.nonempty_message() { + debug!("Unused query parameter: {}", message) + } + Ok((res, Some(bucket))) + } + + /// Determine which endpoint a request is for, knowing it is a GET. + fn from_get(key: String, query: &mut QueryParameters<'_>) -> Result { + router_match! { + @gen_parser + (query.keyword.take().unwrap_or_default().as_ref(), key, query, None), + key: [ + EMPTY if upload_id => ListParts (query::upload_id, opt_parse::max_parts, opt_parse::part_number_marker), + EMPTY => GetObject (query_opt::version_id, opt_parse::part_number), + ACL => GetObjectAcl (query_opt::version_id), + LEGAL_HOLD => GetObjectLegalHold (query_opt::version_id), + RETENTION => GetObjectRetention (query_opt::version_id), + TAGGING => GetObjectTagging (query_opt::version_id), + TORRENT => GetObjectTorrent, + ], + no_key: [ + EMPTY if list_type => ListObjectsV2 (query::list_type, query_opt::continuation_token, + opt_parse::delimiter, query_opt::encoding_type, + opt_parse::fetch_owner, opt_parse::max_keys, + query_opt::prefix, query_opt::start_after), + EMPTY => ListObjects (opt_parse::delimiter, query_opt::encoding_type, query_opt::marker, + opt_parse::max_keys, opt_parse::prefix), + ACCELERATE => GetBucketAccelerateConfiguration, + ACL => GetBucketAcl, + ANALYTICS if id => GetBucketAnalyticsConfiguration (query::id), + ANALYTICS => ListBucketAnalyticsConfigurations (query_opt::continuation_token), + CORS => GetBucketCors, + ENCRYPTION => GetBucketEncryption, + INTELLIGENT_TIERING if id => GetBucketIntelligentTieringConfiguration (query::id), + INTELLIGENT_TIERING => ListBucketIntelligentTieringConfigurations (query_opt::continuation_token), + INVENTORY if id => GetBucketInventoryConfiguration (query::id), + INVENTORY => ListBucketInventoryConfigurations (query_opt::continuation_token), + LIFECYCLE => GetBucketLifecycleConfiguration, + LOCATION => GetBucketLocation, + LOGGING => GetBucketLogging, + METRICS if id => GetBucketMetricsConfiguration (query::id), + METRICS => ListBucketMetricsConfigurations (query_opt::continuation_token), + NOTIFICATION => GetBucketNotificationConfiguration, + OBJECT_LOCK => GetObjectLockConfiguration, + OWNERSHIP_CONTROLS => GetBucketOwnershipControls, + POLICY => GetBucketPolicy, + POLICY_STATUS => GetBucketPolicyStatus, + PUBLIC_ACCESS_BLOCK => GetPublicAccessBlock, + REPLICATION => GetBucketReplication, + REQUEST_PAYMENT => GetBucketRequestPayment, + TAGGING => GetBucketTagging, + UPLOADS => ListMultipartUploads (opt_parse::delimiter, query_opt::encoding_type, + query_opt::key_marker, opt_parse::max_uploads, + query_opt::prefix, query_opt::upload_id_marker), + VERSIONING => GetBucketVersioning, + VERSIONS => ListObjectVersions (opt_parse::delimiter, query_opt::encoding_type, + query_opt::key_marker, opt_parse::max_keys, + query_opt::prefix, query_opt::version_id_marker), + WEBSITE => GetBucketWebsite, + ] + } + } + + /// Determine which endpoint a request is for, knowing it is a HEAD. + fn from_head(key: String, query: &mut QueryParameters<'_>) -> Result { + router_match! 
{ + @gen_parser + (query.keyword.take().unwrap_or_default().as_ref(), key, query, None), + key: [ + EMPTY => HeadObject(opt_parse::part_number, query_opt::version_id), + ], + no_key: [ + EMPTY => HeadBucket, + ] + } + } + + /// Determine which endpoint a request is for, knowing it is a POST. + fn from_post(key: String, query: &mut QueryParameters<'_>) -> Result { + router_match! { + @gen_parser + (query.keyword.take().unwrap_or_default().as_ref(), key, query, None), + key: [ + EMPTY if upload_id => CompleteMultipartUpload (query::upload_id), + RESTORE => RestoreObject (query_opt::version_id), + SELECT => SelectObjectContent (query::select_type), + UPLOADS => CreateMultipartUpload, + ], + no_key: [ + EMPTY => PostObject, + DELETE => DeleteObjects, + ] + } + } + + /// Determine which endpoint a request is for, knowing it is a PUT. + fn from_put( + key: String, + query: &mut QueryParameters<'_>, + headers: &HeaderMap, + ) -> Result { + router_match! { + @gen_parser + (query.keyword.take().unwrap_or_default().as_ref(), key, query, headers), + key: [ + EMPTY if part_number header "x-amz-copy-source" => UploadPartCopy (parse::part_number, query::upload_id), + EMPTY header "x-amz-copy-source" => CopyObject, + EMPTY if part_number => UploadPart (parse::part_number, query::upload_id), + EMPTY => PutObject, + ACL => PutObjectAcl (query_opt::version_id), + LEGAL_HOLD => PutObjectLegalHold (query_opt::version_id), + RETENTION => PutObjectRetention (query_opt::version_id), + TAGGING => PutObjectTagging (query_opt::version_id), + + ], + no_key: [ + EMPTY => CreateBucket, + ACCELERATE => PutBucketAccelerateConfiguration, + ACL => PutBucketAcl, + ANALYTICS => PutBucketAnalyticsConfiguration (query::id), + CORS => PutBucketCors, + ENCRYPTION => PutBucketEncryption, + INTELLIGENT_TIERING => PutBucketIntelligentTieringConfiguration(query::id), + INVENTORY => PutBucketInventoryConfiguration(query::id), + LIFECYCLE => PutBucketLifecycleConfiguration, + LOGGING => PutBucketLogging, + METRICS => PutBucketMetricsConfiguration(query::id), + NOTIFICATION => PutBucketNotificationConfiguration, + OBJECT_LOCK => PutObjectLockConfiguration, + OWNERSHIP_CONTROLS => PutBucketOwnershipControls, + POLICY => PutBucketPolicy, + PUBLIC_ACCESS_BLOCK => PutPublicAccessBlock, + REPLICATION => PutBucketReplication, + REQUEST_PAYMENT => PutBucketRequestPayment, + TAGGING => PutBucketTagging, + VERSIONING => PutBucketVersioning, + WEBSITE => PutBucketWebsite, + ] + } + } + + /// Determine which endpoint a request is for, knowing it is a DELETE. + fn from_delete(key: String, query: &mut QueryParameters<'_>) -> Result { + router_match! 
{ + @gen_parser + (query.keyword.take().unwrap_or_default().as_ref(), key, query, None), + key: [ + EMPTY if upload_id => AbortMultipartUpload (query::upload_id), + EMPTY => DeleteObject (query_opt::version_id), + TAGGING => DeleteObjectTagging (query_opt::version_id), + ], + no_key: [ + EMPTY => DeleteBucket, + ANALYTICS => DeleteBucketAnalyticsConfiguration (query::id), + CORS => DeleteBucketCors, + ENCRYPTION => DeleteBucketEncryption, + INTELLIGENT_TIERING => DeleteBucketIntelligentTieringConfiguration (query::id), + INVENTORY => DeleteBucketInventoryConfiguration (query::id), + LIFECYCLE => DeleteBucketLifecycle, + METRICS => DeleteBucketMetricsConfiguration (query::id), + OWNERSHIP_CONTROLS => DeleteBucketOwnershipControls, + POLICY => DeleteBucketPolicy, + PUBLIC_ACCESS_BLOCK => DeletePublicAccessBlock, + REPLICATION => DeleteBucketReplication, + TAGGING => DeleteBucketTagging, + WEBSITE => DeleteBucketWebsite, + ] + } + } + + /// Get the key the request target. Returns None for requests which don't use a key. + #[allow(dead_code)] + pub fn get_key(&self) -> Option<&str> { + router_match! { + @extract + self, + key, + [ + AbortMultipartUpload, + CompleteMultipartUpload, + CopyObject, + CreateMultipartUpload, + DeleteObject, + DeleteObjectTagging, + GetObject, + GetObjectAcl, + GetObjectLegalHold, + GetObjectRetention, + GetObjectTagging, + GetObjectTorrent, + HeadObject, + ListParts, + PutObject, + PutObjectAcl, + PutObjectLegalHold, + PutObjectRetention, + PutObjectTagging, + RestoreObject, + SelectObjectContent, + UploadPart, + UploadPartCopy, + ] + } + } + + /// Get the kind of authorization which is required to perform the operation. + pub fn authorization_type(&self) -> Authorization { + if let Endpoint::ListBuckets = self { + return Authorization::None; + }; + let readonly = router_match! { + @match + self, + [ + GetBucketAccelerateConfiguration, + GetBucketAcl, + GetBucketAnalyticsConfiguration, + GetBucketEncryption, + GetBucketIntelligentTieringConfiguration, + GetBucketInventoryConfiguration, + GetBucketLifecycleConfiguration, + GetBucketLocation, + GetBucketLogging, + GetBucketMetricsConfiguration, + GetBucketNotificationConfiguration, + GetBucketOwnershipControls, + GetBucketPolicy, + GetBucketPolicyStatus, + GetBucketReplication, + GetBucketRequestPayment, + GetBucketTagging, + GetBucketVersioning, + GetObject, + GetObjectAcl, + GetObjectLegalHold, + GetObjectLockConfiguration, + GetObjectRetention, + GetObjectTagging, + GetObjectTorrent, + GetPublicAccessBlock, + HeadBucket, + HeadObject, + ListBucketAnalyticsConfigurations, + ListBucketIntelligentTieringConfigurations, + ListBucketInventoryConfigurations, + ListBucketMetricsConfigurations, + ListMultipartUploads, + ListObjects, + ListObjectsV2, + ListObjectVersions, + ListParts, + SelectObjectContent, + ] + }; + let owner = router_match! { + @match + self, + [ + DeleteBucket, + GetBucketWebsite, + PutBucketWebsite, + DeleteBucketWebsite, + GetBucketCors, + PutBucketCors, + DeleteBucketCors, + ] + }; + if readonly { + Authorization::Read + } else if owner { + Authorization::Owner + } else { + Authorization::Write + } + } +} + +// parameter name => struct field +generateQueryParameters! 
{ + "continuation-token" => continuation_token, + "delimiter" => delimiter, + "encoding-type" => encoding_type, + "fetch-owner" => fetch_owner, + "id" => id, + "key-marker" => key_marker, + "list-type" => list_type, + "marker" => marker, + "max-keys" => max_keys, + "max-parts" => max_parts, + "max-uploads" => max_uploads, + "partNumber" => part_number, + "part-number-marker" => part_number_marker, + "prefix" => prefix, + "select-type" => select_type, + "start-after" => start_after, + "uploadId" => upload_id, + "upload-id-marker" => upload_id_marker, + "versionId" => version_id, + "version-id-marker" => version_id_marker +} + +mod keywords { + //! This module contain all query parameters with no associated value S3 uses to differentiate + //! endpoints. + pub const EMPTY: &str = ""; + + pub const ACCELERATE: &str = "accelerate"; + pub const ACL: &str = "acl"; + pub const ANALYTICS: &str = "analytics"; + pub const CORS: &str = "cors"; + pub const DELETE: &str = "delete"; + pub const ENCRYPTION: &str = "encryption"; + pub const INTELLIGENT_TIERING: &str = "intelligent-tiering"; + pub const INVENTORY: &str = "inventory"; + pub const LEGAL_HOLD: &str = "legal-hold"; + pub const LIFECYCLE: &str = "lifecycle"; + pub const LOCATION: &str = "location"; + pub const LOGGING: &str = "logging"; + pub const METRICS: &str = "metrics"; + pub const NOTIFICATION: &str = "notification"; + pub const OBJECT_LOCK: &str = "object-lock"; + pub const OWNERSHIP_CONTROLS: &str = "ownershipControls"; + pub const POLICY: &str = "policy"; + pub const POLICY_STATUS: &str = "policyStatus"; + pub const PUBLIC_ACCESS_BLOCK: &str = "publicAccessBlock"; + pub const REPLICATION: &str = "replication"; + pub const REQUEST_PAYMENT: &str = "requestPayment"; + pub const RESTORE: &str = "restore"; + pub const RETENTION: &str = "retention"; + pub const SELECT: &str = "select"; + pub const TAGGING: &str = "tagging"; + pub const TORRENT: &str = "torrent"; + pub const UPLOADS: &str = "uploads"; + pub const VERSIONING: &str = "versioning"; + pub const VERSIONS: &str = "versions"; + pub const WEBSITE: &str = "website"; +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse( + method: &str, + uri: &str, + bucket: Option, + header: Option<(&str, &str)>, + ) -> (Endpoint, Option) { + let mut req = Request::builder().method(method).uri(uri); + if let Some((k, v)) = header { + req = req.header(k, v) + } + let req = req.body(()).unwrap(); + + Endpoint::from_request(&req, bucket).unwrap() + } + + macro_rules! test_cases { + ($($method:ident $uri:expr => $variant:ident )*) => {{ + $( + assert!( + matches!( + parse(test_cases!{@actual_method $method}, $uri, Some("my_bucket".to_owned()), None).0, + Endpoint::$variant { .. } + ) + ); + assert!( + matches!( + parse(test_cases!{@actual_method $method}, concat!("/my_bucket", $uri), None, None).0, + Endpoint::$variant { .. 
} + ) + ); + + test_cases!{@auth $method $uri} + )* + }}; + + (@actual_method HEAD) => {{ "HEAD" }}; + (@actual_method GET) => {{ "GET" }}; + (@actual_method OWNER_GET) => {{ "GET" }}; + (@actual_method PUT) => {{ "PUT" }}; + (@actual_method OWNER_PUT) => {{ "PUT" }}; + (@actual_method POST) => {{ "POST" }}; + (@actual_method DELETE) => {{ "DELETE" }}; + (@actual_method OWNER_DELETE) => {{ "DELETE" }}; + + (@auth HEAD $uri:expr) => {{ + assert_eq!(parse("HEAD", concat!("/my_bucket", $uri), None, None).0.authorization_type(), + Authorization::Read) + }}; + (@auth GET $uri:expr) => {{ + assert_eq!(parse("GET", concat!("/my_bucket", $uri), None, None).0.authorization_type(), + Authorization::Read) + }}; + (@auth OWNER_GET $uri:expr) => {{ + assert_eq!(parse("GET", concat!("/my_bucket", $uri), None, None).0.authorization_type(), + Authorization::Owner) + }}; + (@auth PUT $uri:expr) => {{ + assert_eq!(parse("PUT", concat!("/my_bucket", $uri), None, None).0.authorization_type(), + Authorization::Write) + }}; + (@auth OWNER_PUT $uri:expr) => {{ + assert_eq!(parse("PUT", concat!("/my_bucket", $uri), None, None).0.authorization_type(), + Authorization::Owner) + }}; + (@auth POST $uri:expr) => {{ + assert_eq!(parse("POST", concat!("/my_bucket", $uri), None, None).0.authorization_type(), + Authorization::Write) + }}; + (@auth DELETE $uri:expr) => {{ + assert_eq!(parse("DELETE", concat!("/my_bucket", $uri), None, None).0.authorization_type(), + Authorization::Write) + }}; + (@auth OWNER_DELETE $uri:expr) => {{ + assert_eq!(parse("DELETE", concat!("/my_bucket", $uri), None, None).0.authorization_type(), + Authorization::Owner) + }}; + } + + #[test] + fn test_bucket_extraction() { + assert_eq!( + parse("GET", "/my/key", Some("my_bucket".to_owned()), None).1, + parse("GET", "/my_bucket/my/key", None, None).1 + ); + assert_eq!( + parse("GET", "/my_bucket/my/key", None, None).1.unwrap(), + "my_bucket" + ); + assert!(parse("GET", "/", None, None).1.is_none()); + } + + #[test] + fn test_key() { + assert_eq!( + parse("GET", "/my/key", Some("my_bucket".to_owned()), None) + .0 + .get_key(), + parse("GET", "/my_bucket/my/key", None, None).0.get_key() + ); + assert_eq!( + parse("GET", "/my_bucket/my/key", None, None) + .0 + .get_key() + .unwrap(), + "my/key" + ); + assert_eq!( + parse("GET", "/my_bucket/my/key?acl", None, None) + .0 + .get_key() + .unwrap(), + "my/key" + ); + assert!(parse("GET", "/my_bucket/?list-type=2", None, None) + .0 + .get_key() + .is_none()); + + assert_eq!( + parse("GET", "/my_bucket/%26%2B%3F%25%C3%A9/something", None, None) + .0 + .get_key() + .unwrap(), + "&+?%é/something" + ); + + /* + * this case is failing. 
We should verify how clients encode space in url + assert_eq!( + parse("GET", "/my_bucket/+", None, None).get_key().unwrap(), + " "); + */ + } + + #[test] + fn invalid_endpoint() { + let req = Request::builder() + .method("GET") + .uri("/bucket/key?website") + .body(()) + .unwrap(); + + assert!(Endpoint::from_request(&req, None).is_err()) + } + + #[test] + fn test_aws_doc_examples() { + test_cases!( + DELETE "/example-object?uploadId=VXBsb2FkIElEIGZvciBlbHZpbmcncyBteS1tb3ZpZS5tMnRzIHVwbG9hZ" => AbortMultipartUpload + DELETE "/Key+?uploadId=UploadId" => AbortMultipartUpload + POST "/example-object?uploadId=AAAsb2FkIElEIGZvciBlbHZpbmcncyWeeS1tb3ZpZS5tMnRzIRRwbG9hZA" => CompleteMultipartUpload + POST "/Key+?uploadId=UploadId" => CompleteMultipartUpload + PUT "/" => CreateBucket + POST "/example-object?uploads" => CreateMultipartUpload + POST "/{Key+}?uploads" => CreateMultipartUpload + OWNER_DELETE "/" => DeleteBucket + DELETE "/?analytics&id=list1" => DeleteBucketAnalyticsConfiguration + DELETE "/?analytics&id=Id" => DeleteBucketAnalyticsConfiguration + OWNER_DELETE "/?cors" => DeleteBucketCors + DELETE "/?encryption" => DeleteBucketEncryption + DELETE "/?intelligent-tiering&id=Id" => DeleteBucketIntelligentTieringConfiguration + DELETE "/?inventory&id=list1" => DeleteBucketInventoryConfiguration + DELETE "/?inventory&id=Id" => DeleteBucketInventoryConfiguration + DELETE "/?lifecycle" => DeleteBucketLifecycle + DELETE "/?metrics&id=ExampleMetrics" => DeleteBucketMetricsConfiguration + DELETE "/?metrics&id=Id" => DeleteBucketMetricsConfiguration + DELETE "/?ownershipControls" => DeleteBucketOwnershipControls + DELETE "/?policy" => DeleteBucketPolicy + DELETE "/?replication" => DeleteBucketReplication + DELETE "/?tagging" => DeleteBucketTagging + OWNER_DELETE "/?website" => DeleteBucketWebsite + DELETE "/my-second-image.jpg" => DeleteObject + DELETE "/my-third-image.jpg?versionId=UIORUnfndfiufdisojhr398493jfdkjFJjkndnqUifhnw89493jJFJ" => DeleteObject + DELETE "/Key+?versionId=VersionId" => DeleteObject + POST "/?delete" => DeleteObjects + DELETE "/exampleobject?tagging" => DeleteObjectTagging + DELETE "/{Key+}?tagging&versionId=VersionId" => DeleteObjectTagging + DELETE "/?publicAccessBlock" => DeletePublicAccessBlock + GET "/?accelerate" => GetBucketAccelerateConfiguration + GET "/?acl" => GetBucketAcl + GET "/?analytics&id=Id" => GetBucketAnalyticsConfiguration + OWNER_GET "/?cors" => GetBucketCors + GET "/?encryption" => GetBucketEncryption + GET "/?intelligent-tiering&id=Id" => GetBucketIntelligentTieringConfiguration + GET "/?inventory&id=list1" => GetBucketInventoryConfiguration + GET "/?inventory&id=Id" => GetBucketInventoryConfiguration + GET "/?lifecycle" => GetBucketLifecycleConfiguration + GET "/?location" => GetBucketLocation + GET "/?logging" => GetBucketLogging + GET "/?metrics&id=Documents" => GetBucketMetricsConfiguration + GET "/?metrics&id=Id" => GetBucketMetricsConfiguration + GET "/?notification" => GetBucketNotificationConfiguration + GET "/?ownershipControls" => GetBucketOwnershipControls + GET "/?policy" => GetBucketPolicy + GET "/?policyStatus" => GetBucketPolicyStatus + GET "/?replication" => GetBucketReplication + GET "/?requestPayment" => GetBucketRequestPayment + GET "/?tagging" => GetBucketTagging + GET "/?versioning" => GetBucketVersioning + OWNER_GET "/?website" => GetBucketWebsite + GET "/my-image.jpg" => GetObject + GET "/myObject?versionId=3/L4kqtJlcpXroDTDmpUMLUo" => GetObject + GET 
"/Junk3.txt?response-cache-control=No-cache&response-content-disposition=attachment%3B%20filename%3Dtesting.txt&response-content-encoding=x-gzip&response-content-language=mi%2C%20en&response-expires=Thu%2C%2001%20Dec%201994%2016:00:00%20GMT" => GetObject + GET "/Key+?partNumber=1&response-cache-control=ResponseCacheControl&response-content-disposition=ResponseContentDisposition&response-content-encoding=ResponseContentEncoding&response-content-language=ResponseContentLanguage&response-content-type=ResponseContentType&response-expires=ResponseExpires&versionId=VersionId" => GetObject + GET "/my-image.jpg?acl" => GetObjectAcl + GET "/my-image.jpg?versionId=3/L4kqtJlcpXroDVBH40Nr8X8gdRQBpUMLUo&acl" => GetObjectAcl + GET "/{Key+}?acl&versionId=VersionId" => GetObjectAcl + GET "/{Key+}?legal-hold&versionId=VersionId" => GetObjectLegalHold + GET "/?object-lock" => GetObjectLockConfiguration + GET "/{Key+}?retention&versionId=VersionId" => GetObjectRetention + GET "/example-object?tagging" => GetObjectTagging + GET "/{Key+}?tagging&versionId=VersionId" => GetObjectTagging + GET "/quotes/Nelson?torrent" => GetObjectTorrent + GET "/{Key+}?torrent" => GetObjectTorrent + GET "/?publicAccessBlock" => GetPublicAccessBlock + HEAD "/" => HeadBucket + HEAD "/my-image.jpg" => HeadObject + HEAD "/my-image.jpg?versionId=3HL4kqCxf3vjVBH40Nrjfkd" => HeadObject + HEAD "/Key+?partNumber=3&versionId=VersionId" => HeadObject + GET "/?analytics" => ListBucketAnalyticsConfigurations + GET "/?analytics&continuation-token=ContinuationToken" => ListBucketAnalyticsConfigurations + GET "/?intelligent-tiering" => ListBucketIntelligentTieringConfigurations + GET "/?intelligent-tiering&continuation-token=ContinuationToken" => ListBucketIntelligentTieringConfigurations + GET "/?inventory" => ListBucketInventoryConfigurations + GET "/?inventory&continuation-token=ContinuationToken" => ListBucketInventoryConfigurations + GET "/?metrics" => ListBucketMetricsConfigurations + GET "/?metrics&continuation-token=ContinuationToken" => ListBucketMetricsConfigurations + GET "/?uploads&max-uploads=3" => ListMultipartUploads + GET "/?uploads&delimiter=/" => ListMultipartUploads + GET "/?uploads&delimiter=/&prefix=photos/2006/" => ListMultipartUploads + GET "/?uploads&delimiter=D&encoding-type=EncodingType&key-marker=KeyMarker&max-uploads=1&prefix=Prefix&upload-id-marker=UploadIdMarker" => ListMultipartUploads + GET "/" => ListObjects + GET "/?prefix=N&marker=Ned&max-keys=40" => ListObjects + GET "/?delimiter=/" => ListObjects + GET "/?prefix=photos/2006/&delimiter=/" => ListObjects + + GET "/?delimiter=D&encoding-type=EncodingType&marker=Marker&max-keys=1&prefix=Prefix" => ListObjects + GET "/?list-type=2" => ListObjectsV2 + GET "/?list-type=2&max-keys=3&prefix=E&start-after=ExampleGuide.pdf" => ListObjectsV2 + GET "/?list-type=2&delimiter=/" => ListObjectsV2 + GET "/?list-type=2&prefix=photos/2006/&delimiter=/" => ListObjectsV2 + GET "/?list-type=2" => ListObjectsV2 + GET "/?list-type=2&continuation-token=1ueGcxLPRx1Tr/XYExHnhbYLgveDs2J/wm36Hy4vbOwM=" => ListObjectsV2 + GET "/?list-type=2&continuation-token=ContinuationToken&delimiter=D&encoding-type=EncodingType&fetch-owner=true&max-keys=1&prefix=Prefix&start-after=StartAfter" => ListObjectsV2 + GET "/?versions" => ListObjectVersions + GET "/?versions&key-marker=key2" => ListObjectVersions + GET "/?versions&key-marker=key3&version-id-marker=t46ZenlYTZBnj" => ListObjectVersions + GET "/?versions&key-marker=key3&version-id-marker=t46Z0menlYTZBnj&max-keys=3" => ListObjectVersions + GET 
"/?versions&delimiter=/" => ListObjectVersions + GET "/?versions&prefix=photos/2006/&delimiter=/" => ListObjectVersions + GET "/?versions&delimiter=D&encoding-type=EncodingType&key-marker=KeyMarker&max-keys=2&prefix=Prefix&version-id-marker=VersionIdMarker" => ListObjectVersions + GET "/example-object?uploadId=XXBsb2FkIElEIGZvciBlbHZpbmcncyVcdS1tb3ZpZS5tMnRzEEEwbG9hZA&max-parts=2&part-number-marker=1" => ListParts + GET "/Key+?max-parts=2&part-number-marker=2&uploadId=UploadId" => ListParts + PUT "/?accelerate" => PutBucketAccelerateConfiguration + PUT "/?acl" => PutBucketAcl + PUT "/?analytics&id=report1" => PutBucketAnalyticsConfiguration + PUT "/?analytics&id=Id" => PutBucketAnalyticsConfiguration + OWNER_PUT "/?cors" => PutBucketCors + PUT "/?encryption" => PutBucketEncryption + PUT "/?intelligent-tiering&id=Id" => PutBucketIntelligentTieringConfiguration + PUT "/?inventory&id=report1" => PutBucketInventoryConfiguration + PUT "/?inventory&id=Id" => PutBucketInventoryConfiguration + PUT "/?lifecycle" => PutBucketLifecycleConfiguration + PUT "/?logging" => PutBucketLogging + PUT "/?metrics&id=EntireBucket" => PutBucketMetricsConfiguration + PUT "/?metrics&id=Id" => PutBucketMetricsConfiguration + PUT "/?notification" => PutBucketNotificationConfiguration + PUT "/?ownershipControls" => PutBucketOwnershipControls + PUT "/?policy" => PutBucketPolicy + PUT "/?replication" => PutBucketReplication + PUT "/?requestPayment" => PutBucketRequestPayment + PUT "/?tagging" => PutBucketTagging + PUT "/?versioning" => PutBucketVersioning + OWNER_PUT "/?website" => PutBucketWebsite + PUT "/my-image.jpg" => PutObject + PUT "/Key+" => PutObject + PUT "/my-image.jpg?acl" => PutObjectAcl + PUT "/my-image.jpg?acl&versionId=3HL4kqtJlcpXroDTDmJ+rmSpXd3dIbrHY+MTRCxf3vjVBH40Nrjfkd" => PutObjectAcl + PUT "/{Key+}?acl&versionId=VersionId" => PutObjectAcl + PUT "/{Key+}?legal-hold&versionId=VersionId" => PutObjectLegalHold + PUT "/?object-lock" => PutObjectLockConfiguration + PUT "/{Key+}?retention&versionId=VersionId" => PutObjectRetention + PUT "/object-key?tagging" => PutObjectTagging + PUT "/{Key+}?tagging&versionId=VersionId" => PutObjectTagging + PUT "/?publicAccessBlock" => PutPublicAccessBlock + POST "/object-one.csv?restore" => RestoreObject + POST "/{Key+}?restore&versionId=VersionId" => RestoreObject + PUT "/my-movie.m2ts?partNumber=1&uploadId=VCVsb2FkIElEIGZvciBlbZZpbmcncyBteS1tb3ZpZS5tMnRzIHVwbG9hZR" => UploadPart + PUT "/Key+?partNumber=2&uploadId=UploadId" => UploadPart + POST "/" => PostObject + ); + // no bucket, won't work with the rest of the test suite + assert!(matches!( + parse("GET", "/", None, None).0, + Endpoint::ListBuckets { .. } + )); + assert!(matches!( + parse("GET", "/", None, None).0.authorization_type(), + Authorization::None + )); + + // require a header + assert!(matches!( + parse( + "PUT", + "/Key+", + Some("my_bucket".to_owned()), + Some(("x-amz-copy-source", "some/key")) + ) + .0, + Endpoint::CopyObject { .. } + )); + assert!(matches!( + parse( + "PUT", + "/my_bucket/Key+", + None, + Some(("x-amz-copy-source", "some/key")) + ) + .0, + Endpoint::CopyObject { .. } + )); + assert!(matches!( + parse( + "PUT", + "/my_bucket/Key+", + None, + Some(("x-amz-copy-source", "some/key")) + ) + .0 + .authorization_type(), + Authorization::Write + )); + + // require a header + assert!(matches!( + parse( + "PUT", + "/Key+?partNumber=2&uploadId=UploadId", + Some("my_bucket".to_owned()), + Some(("x-amz-copy-source", "some/key")) + ) + .0, + Endpoint::UploadPartCopy { .. 
}
+		));
+		assert!(matches!(
+			parse(
+				"PUT",
+				"/my_bucket/Key+?partNumber=2&uploadId=UploadId",
+				None,
+				Some(("x-amz-copy-source", "some/key"))
+			)
+			.0,
+			Endpoint::UploadPartCopy { .. }
+		));
+		assert!(matches!(
+			parse(
+				"PUT",
+				"/my_bucket/Key+?partNumber=2&uploadId=UploadId",
+				None,
+				Some(("x-amz-copy-source", "some/key"))
+			)
+			.0
+			.authorization_type(),
+			Authorization::Write
+		));
+
+		// POST request, but with GET semantics for permission purposes
+		assert!(matches!(
+			parse(
+				"POST",
+				"/{Key+}?select&select-type=2",
+				Some("my_bucket".to_owned()),
+				None
+			)
+			.0,
+			Endpoint::SelectObjectContent { .. }
+		));
+		assert!(matches!(
+			parse("POST", "/my_bucket/{Key+}?select&select-type=2", None, None).0,
+			Endpoint::SelectObjectContent { .. }
+		));
+		assert!(matches!(
+			parse("POST", "/my_bucket/{Key+}?select&select-type=2", None, None)
+				.0
+				.authorization_type(),
+			Authorization::Read
+		));
+	}
+}
diff --git a/src/api/s3/website.rs b/src/api/s3/website.rs
new file mode 100644
index 00000000..561130dc
--- /dev/null
+++ b/src/api/s3/website.rs
@@ -0,0 +1,369 @@
+use quick_xml::de::from_reader;
+use std::sync::Arc;
+
+use hyper::{Body, Request, Response, StatusCode};
+use serde::{Deserialize, Serialize};
+
+use crate::error::*;
+use crate::s3::xml::{to_xml_with_header, xmlns_tag, IntValue, Value};
+use crate::signature::verify_signed_content;
+
+use garage_model::bucket_table::*;
+use garage_model::garage::Garage;
+use garage_table::*;
+use garage_util::data::*;
+
+pub async fn handle_get_website(bucket: &Bucket) -> Result<Response<Body>, Error> {
+	let param = bucket
+		.params()
+		.ok_or_internal_error("Bucket should not be deleted at this point")?;
+
+	if let Some(website) = param.website_config.get() {
+		let wc = WebsiteConfiguration {
+			xmlns: (),
+			error_document: website.error_document.as_ref().map(|v| Key {
+				key: Value(v.to_string()),
+			}),
+			index_document: Some(Suffix {
+				suffix: Value(website.index_document.to_string()),
+			}),
+			redirect_all_requests_to: None,
+			routing_rules: None,
+		};
+		let xml = to_xml_with_header(&wc)?;
+		Ok(Response::builder()
+			.status(StatusCode::OK)
+			.header(http::header::CONTENT_TYPE, "application/xml")
+			.body(Body::from(xml))?)
+	} else {
+		Ok(Response::builder()
+			.status(StatusCode::NO_CONTENT)
+			.body(Body::empty())?)
+	}
+}
+
+pub async fn handle_delete_website(
+	garage: Arc<Garage>,
+	bucket_id: Uuid,
+) -> Result<Response<Body>, Error> {
+	let mut bucket = garage
+		.bucket_table
+		.get(&EmptyKey, &bucket_id)
+		.await?
+		.ok_or(Error::NoSuchBucket)?;
+
+	let param = bucket
+		.params_mut()
+		.ok_or_internal_error("Bucket should not be deleted at this point")?;
+
+	param.website_config.update(None);
+	garage.bucket_table.insert(&bucket).await?;
+
+	Ok(Response::builder()
+		.status(StatusCode::NO_CONTENT)
+		.body(Body::empty())?)
+}
+
+pub async fn handle_put_website(
+	garage: Arc<Garage>,
+	bucket_id: Uuid,
+	req: Request<Body>,
+	content_sha256: Option<Hash>,
+) -> Result<Response<Body>, Error> {
+	let body = hyper::body::to_bytes(req.into_body()).await?;
+
+	if let Some(content_sha256) = content_sha256 {
+		verify_signed_content(content_sha256, &body[..])?;
+	}
+
+	let mut bucket = garage
+		.bucket_table
+		.get(&EmptyKey, &bucket_id)
+		.await?
+		.ok_or(Error::NoSuchBucket)?;
+
+	let param = bucket
+		.params_mut()
+		.ok_or_internal_error("Bucket should not be deleted at this point")?;
+
+	let conf: WebsiteConfiguration = from_reader(&body as &[u8])?;
+	conf.validate()?;
+
+	param
+		.website_config
+		.update(Some(conf.into_garage_website_config()?));
+	garage.bucket_table.insert(&bucket).await?;
+
+	Ok(Response::builder()
+		.status(StatusCode::OK)
+		.body(Body::empty())?)
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+pub struct WebsiteConfiguration {
+	#[serde(serialize_with = "xmlns_tag", skip_deserializing)]
+	pub xmlns: (),
+	#[serde(rename = "ErrorDocument")]
+	pub error_document: Option<Key>,
+	#[serde(rename = "IndexDocument")]
+	pub index_document: Option<Suffix>,
+	#[serde(rename = "RedirectAllRequestsTo")]
+	pub redirect_all_requests_to: Option<Target>,
+	#[serde(rename = "RoutingRules")]
+	pub routing_rules: Option<Vec<RoutingRule>>,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+pub struct RoutingRule {
+	#[serde(rename = "RoutingRule")]
+	pub inner: RoutingRuleInner,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+pub struct RoutingRuleInner {
+	#[serde(rename = "Condition")]
+	pub condition: Option<Condition>,
+	#[serde(rename = "Redirect")]
+	pub redirect: Redirect,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+pub struct Key {
+	#[serde(rename = "Key")]
+	pub key: Value,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+pub struct Suffix {
+	#[serde(rename = "Suffix")]
+	pub suffix: Value,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+pub struct Target {
+	#[serde(rename = "HostName")]
+	pub hostname: Value,
+	#[serde(rename = "Protocol")]
+	pub protocol: Option<Value>,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+pub struct Condition {
+	#[serde(rename = "HttpErrorCodeReturnedEquals")]
+	pub http_error_code: Option<IntValue>,
+	#[serde(rename = "KeyPrefixEquals")]
+	pub prefix: Option<Value>,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+pub struct Redirect {
+	#[serde(rename = "HostName")]
+	pub hostname: Option<Value>,
+	#[serde(rename = "Protocol")]
+	pub protocol: Option<Value>,
+	#[serde(rename = "HttpRedirectCode")]
+	pub http_redirect_code: Option<IntValue>,
+	#[serde(rename = "ReplaceKeyPrefixWith")]
+	pub replace_prefix: Option<Value>,
+	#[serde(rename = "ReplaceKeyWith")]
+	pub replace_full: Option<Value>,
+}
+
+impl WebsiteConfiguration {
+	pub fn validate(&self) -> Result<(), Error> {
+		if self.redirect_all_requests_to.is_some()
+			&& (self.error_document.is_some()
+				|| self.index_document.is_some()
+				|| self.routing_rules.is_some())
+		{
+			return Err(Error::BadRequest(
+				"Bad XML: can't have RedirectAllRequestsTo and other fields".to_owned(),
+			));
+		}
+		if let Some(ref ed) = self.error_document {
+			ed.validate()?;
+		}
+		if let Some(ref id) = self.index_document {
+			id.validate()?;
+		}
+		if let Some(ref rart) = self.redirect_all_requests_to {
+			rart.validate()?;
+		}
+		if let Some(ref rrs) = self.routing_rules {
+			for rr in rrs {
+				rr.inner.validate()?;
+			}
+		}
+
+		Ok(())
+	}
+
+	pub fn into_garage_website_config(self) -> Result<WebsiteConfig, Error> {
+		if self.redirect_all_requests_to.is_some() {
+			Err(Error::NotImplemented(
+				"S3 website redirects are not currently implemented in Garage.".into(),
+			))
+		} else if self.routing_rules.map(|x| !x.is_empty()).unwrap_or(false) {
+			Err(Error::NotImplemented(
+				"S3 routing rules are not currently implemented in Garage.".into(),
+			))
+		} else {
+			Ok(WebsiteConfig {
+				index_document: self
+					.index_document
+					.map(|x| x.suffix.0)
+					.unwrap_or_else(|| "index.html".to_string()),
+				error_document: self.error_document.map(|x| x.key.0),
+			})
+		}
+	}
+}
+
+impl Key {
+	pub fn validate(&self) -> Result<(), Error> {
+		if self.key.0.is_empty() {
+			Err(Error::BadRequest(
+				"Bad XML: error document specified but empty".to_owned(),
+			))
+		} else {
+			Ok(())
+		}
+	}
+}
+
+impl Suffix {
+	pub fn validate(&self) -> Result<(), Error> {
+		if self.suffix.0.is_empty() | self.suffix.0.contains('/') {
+			Err(Error::BadRequest(
+				"Bad XML: index document is empty or contains /".to_owned(),
+			))
+		} else {
+			Ok(())
+		}
+	}
+}
+
+impl Target {
+	pub fn validate(&self) -> Result<(), Error> {
+		if let Some(ref protocol) = self.protocol {
+			if protocol.0 != "http" && protocol.0 != "https" {
+				return Err(Error::BadRequest("Bad XML: invalid protocol".to_owned()));
+			}
+		}
+		Ok(())
+	}
+}
+
+impl RoutingRuleInner {
+	pub fn validate(&self) -> Result<(), Error> {
+		let has_prefix = self
+			.condition
+			.as_ref()
+			.and_then(|c| c.prefix.as_ref())
+			.is_some();
+		self.redirect.validate(has_prefix)
+	}
+}
+
+impl Redirect {
+	pub fn validate(&self, has_prefix: bool) -> Result<(), Error> {
+		if self.replace_prefix.is_some() {
+			if self.replace_full.is_some() {
+				return Err(Error::BadRequest(
+					"Bad XML: both ReplaceKeyPrefixWith and ReplaceKeyWith are set".to_owned(),
+				));
+			}
+			if !has_prefix {
+				return Err(Error::BadRequest(
+					"Bad XML: ReplaceKeyPrefixWith is set, but KeyPrefixEquals isn't".to_owned(),
+				));
+			}
+		}
+		if let Some(ref protocol) = self.protocol {
+			if protocol.0 != "http" && protocol.0 != "https" {
+				return Err(Error::BadRequest("Bad XML: invalid protocol".to_owned()));
+			}
+		}
+		// TODO there are probably more invalid cases, but which ones?
+		Ok(())
+	}
+}
+
+#[cfg(test)]
+mod tests {
+	use super::*;
+
+	use quick_xml::de::from_str;
+
+	#[test]
+	fn test_deserialize() -> Result<(), Error> {
+		let message = r#"<?xml version="1.0" encoding="UTF-8"?>
+<WebsiteConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
+   <ErrorDocument>
+      <Key>my-error-doc</Key>
+   </ErrorDocument>
+   <IndexDocument>
+      <Suffix>my-index</Suffix>
+   </IndexDocument>
+   <RedirectAllRequestsTo>
+      <HostName>garage.tld</HostName>
+      <Protocol>https</Protocol>
+   </RedirectAllRequestsTo>
+   <RoutingRules>
+      <RoutingRule>
+         <Condition>
+            <HttpErrorCodeReturnedEquals>404</HttpErrorCodeReturnedEquals>
+            <KeyPrefixEquals>prefix1</KeyPrefixEquals>
+         </Condition>
+         <Redirect>
+            <HostName>gara.ge</HostName>
+            <Protocol>http</Protocol>
+            <HttpRedirectCode>303</HttpRedirectCode>
+            <ReplaceKeyPrefixWith>prefix2</ReplaceKeyPrefixWith>
+            <ReplaceKeyWith>fullkey</ReplaceKeyWith>
+         </Redirect>
+      </RoutingRule>
+   </RoutingRules>
+</WebsiteConfiguration>"#;
+		let conf: WebsiteConfiguration = from_str(message).unwrap();
+		let ref_value = WebsiteConfiguration {
+			xmlns: (),
+			error_document: Some(Key {
+				key: Value("my-error-doc".to_owned()),
+			}),
+			index_document: Some(Suffix {
+				suffix: Value("my-index".to_owned()),
+			}),
+			redirect_all_requests_to: Some(Target {
+				hostname: Value("garage.tld".to_owned()),
+				protocol: Some(Value("https".to_owned())),
+			}),
+			routing_rules: Some(vec![RoutingRule {
+				inner: RoutingRuleInner {
+					condition: Some(Condition {
+						http_error_code: Some(IntValue(404)),
+						prefix: Some(Value("prefix1".to_owned())),
+					}),
+					redirect: Redirect {
+						hostname: Some(Value("gara.ge".to_owned())),
+						protocol: Some(Value("http".to_owned())),
+						http_redirect_code: Some(IntValue(303)),
+						replace_prefix: Some(Value("prefix2".to_owned())),
+						replace_full: Some(Value("fullkey".to_owned())),
+					},
+				},
+			}]),
+		};
+		assert_eq!
{ + ref_value, + conf + } + + let message2 = to_xml_with_header(&ref_value)?; + + let cleanup = |c: &str| c.replace(char::is_whitespace, ""); + assert_eq!(cleanup(message), cleanup(&message2)); + + Ok(()) + } +} diff --git a/src/api/s3/xml.rs b/src/api/s3/xml.rs new file mode 100644 index 00000000..75ec4559 --- /dev/null +++ b/src/api/s3/xml.rs @@ -0,0 +1,844 @@ +use quick_xml::se::to_string; +use serde::{Deserialize, Serialize, Serializer}; + +use crate::Error as ApiError; + +pub fn to_xml_with_header(x: &T) -> Result { + let mut xml = r#""#.to_string(); + xml.push_str(&to_string(x)?); + Ok(xml) +} + +pub fn xmlns_tag(_v: &(), s: S) -> Result { + s.serialize_str("http://s3.amazonaws.com/doc/2006-03-01/") +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] +pub struct Value(#[serde(rename = "$value")] pub String); + +impl From<&str> for Value { + fn from(s: &str) -> Value { + Value(s.to_string()) + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] +pub struct IntValue(#[serde(rename = "$value")] pub i64); + +#[derive(Debug, Serialize, PartialEq)] +pub struct Bucket { + #[serde(rename = "CreationDate")] + pub creation_date: Value, + #[serde(rename = "Name")] + pub name: Value, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct Owner { + #[serde(rename = "DisplayName")] + pub display_name: Value, + #[serde(rename = "ID")] + pub id: Value, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct BucketList { + #[serde(rename = "Bucket")] + pub entries: Vec, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct ListAllMyBucketsResult { + #[serde(rename = "Buckets")] + pub buckets: BucketList, + #[serde(rename = "Owner")] + pub owner: Owner, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct LocationConstraint { + #[serde(serialize_with = "xmlns_tag")] + pub xmlns: (), + #[serde(rename = "$value")] + pub region: String, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct Deleted { + #[serde(rename = "Key")] + pub key: Value, + #[serde(rename = "VersionId")] + pub version_id: Value, + #[serde(rename = "DeleteMarkerVersionId")] + pub delete_marker_version_id: Value, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct Error { + #[serde(rename = "Code")] + pub code: Value, + #[serde(rename = "Message")] + pub message: Value, + #[serde(rename = "Resource")] + pub resource: Option, + #[serde(rename = "Region")] + pub region: Option, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct DeleteError { + #[serde(rename = "Code")] + pub code: Value, + #[serde(rename = "Key")] + pub key: Option, + #[serde(rename = "Message")] + pub message: Value, + #[serde(rename = "VersionId")] + pub version_id: Option, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct DeleteResult { + #[serde(serialize_with = "xmlns_tag")] + pub xmlns: (), + #[serde(rename = "Deleted")] + pub deleted: Vec, + #[serde(rename = "Error")] + pub errors: Vec, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct InitiateMultipartUploadResult { + #[serde(serialize_with = "xmlns_tag")] + pub xmlns: (), + #[serde(rename = "Bucket")] + pub bucket: Value, + #[serde(rename = "Key")] + pub key: Value, + #[serde(rename = "UploadId")] + pub upload_id: Value, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct CompleteMultipartUploadResult { + #[serde(serialize_with = "xmlns_tag")] + pub xmlns: (), + #[serde(rename = "Location")] + pub location: Option, + #[serde(rename = "Bucket")] + pub bucket: Value, + #[serde(rename = "Key")] + pub key: Value, + 
#[serde(rename = "ETag")] + pub etag: Value, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct Initiator { + #[serde(rename = "DisplayName")] + pub display_name: Value, + #[serde(rename = "ID")] + pub id: Value, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct ListMultipartItem { + #[serde(rename = "Initiated")] + pub initiated: Value, + #[serde(rename = "Initiator")] + pub initiator: Initiator, + #[serde(rename = "Key")] + pub key: Value, + #[serde(rename = "UploadId")] + pub upload_id: Value, + #[serde(rename = "Owner")] + pub owner: Owner, + #[serde(rename = "StorageClass")] + pub storage_class: Value, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct ListMultipartUploadsResult { + #[serde(serialize_with = "xmlns_tag")] + pub xmlns: (), + #[serde(rename = "Bucket")] + pub bucket: Value, + #[serde(rename = "KeyMarker")] + pub key_marker: Option, + #[serde(rename = "UploadIdMarker")] + pub upload_id_marker: Option, + #[serde(rename = "NextKeyMarker")] + pub next_key_marker: Option, + #[serde(rename = "NextUploadIdMarker")] + pub next_upload_id_marker: Option, + #[serde(rename = "Prefix")] + pub prefix: Value, + #[serde(rename = "Delimiter")] + pub delimiter: Option, + #[serde(rename = "MaxUploads")] + pub max_uploads: IntValue, + #[serde(rename = "IsTruncated")] + pub is_truncated: Value, + #[serde(rename = "Upload")] + pub upload: Vec, + #[serde(rename = "CommonPrefixes")] + pub common_prefixes: Vec, + #[serde(rename = "EncodingType")] + pub encoding_type: Option, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct PartItem { + #[serde(rename = "ETag")] + pub etag: Value, + #[serde(rename = "LastModified")] + pub last_modified: Value, + #[serde(rename = "PartNumber")] + pub part_number: IntValue, + #[serde(rename = "Size")] + pub size: IntValue, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct ListPartsResult { + #[serde(serialize_with = "xmlns_tag")] + pub xmlns: (), + #[serde(rename = "Bucket")] + pub bucket: Value, + #[serde(rename = "Key")] + pub key: Value, + #[serde(rename = "UploadId")] + pub upload_id: Value, + #[serde(rename = "PartNumberMarker")] + pub part_number_marker: Option, + #[serde(rename = "NextPartNumberMarker")] + pub next_part_number_marker: Option, + #[serde(rename = "MaxParts")] + pub max_parts: IntValue, + #[serde(rename = "IsTruncated")] + pub is_truncated: Value, + #[serde(rename = "Part", default)] + pub parts: Vec, + #[serde(rename = "Initiator")] + pub initiator: Initiator, + #[serde(rename = "Owner")] + pub owner: Owner, + #[serde(rename = "StorageClass")] + pub storage_class: Value, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct ListBucketItem { + #[serde(rename = "Key")] + pub key: Value, + #[serde(rename = "LastModified")] + pub last_modified: Value, + #[serde(rename = "ETag")] + pub etag: Value, + #[serde(rename = "Size")] + pub size: IntValue, + #[serde(rename = "StorageClass")] + pub storage_class: Value, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct CommonPrefix { + #[serde(rename = "Prefix")] + pub prefix: Value, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct ListBucketResult { + #[serde(serialize_with = "xmlns_tag")] + pub xmlns: (), + #[serde(rename = "Name")] + pub name: Value, + #[serde(rename = "Prefix")] + pub prefix: Value, + #[serde(rename = "Marker")] + pub marker: Option, + #[serde(rename = "NextMarker")] + pub next_marker: Option, + #[serde(rename = "StartAfter")] + pub start_after: Option, + #[serde(rename = "ContinuationToken")] + pub continuation_token: Option, + 
#[serde(rename = "NextContinuationToken")] + pub next_continuation_token: Option, + #[serde(rename = "KeyCount")] + pub key_count: Option, + #[serde(rename = "MaxKeys")] + pub max_keys: IntValue, + #[serde(rename = "Delimiter")] + pub delimiter: Option, + #[serde(rename = "EncodingType")] + pub encoding_type: Option, + #[serde(rename = "IsTruncated")] + pub is_truncated: Value, + #[serde(rename = "Contents")] + pub contents: Vec, + #[serde(rename = "CommonPrefixes")] + pub common_prefixes: Vec, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct VersioningConfiguration { + #[serde(serialize_with = "xmlns_tag")] + pub xmlns: (), + #[serde(rename = "Status")] + pub status: Option, +} + +#[derive(Debug, Serialize, PartialEq)] +pub struct PostObject { + #[serde(serialize_with = "xmlns_tag")] + pub xmlns: (), + #[serde(rename = "Location")] + pub location: Value, + #[serde(rename = "Bucket")] + pub bucket: Value, + #[serde(rename = "Key")] + pub key: Value, + #[serde(rename = "ETag")] + pub etag: Value, +} + +#[cfg(test)] +mod tests { + use super::*; + + use garage_util::time::*; + + #[test] + fn error_message() -> Result<(), ApiError> { + let error = Error { + code: Value("TestError".to_string()), + message: Value("A dummy error message".to_string()), + resource: Some(Value("/bucket/a/plop".to_string())), + region: Some(Value("garage".to_string())), + }; + assert_eq!( + to_xml_with_header(&error)?, + "\ +\ + TestError\ + A dummy error message\ + /bucket/a/plop\ + garage\ +" + ); + Ok(()) + } + + #[test] + fn list_all_my_buckets_result() -> Result<(), ApiError> { + let list_buckets = ListAllMyBucketsResult { + owner: Owner { + display_name: Value("owner_name".to_string()), + id: Value("qsdfjklm".to_string()), + }, + buckets: BucketList { + entries: vec![ + Bucket { + creation_date: Value(msec_to_rfc3339(0)), + name: Value("bucket_A".to_string()), + }, + Bucket { + creation_date: Value(msec_to_rfc3339(3600 * 24 * 1000)), + name: Value("bucket_B".to_string()), + }, + ], + }, + }; + assert_eq!( + to_xml_with_header(&list_buckets)?, + "\ +\ + \ + \ + 1970-01-01T00:00:00.000Z\ + bucket_A\ + \ + \ + 1970-01-02T00:00:00.000Z\ + bucket_B\ + \ + \ + \ + owner_name\ + qsdfjklm\ + \ +" + ); + Ok(()) + } + + #[test] + fn get_bucket_location_result() -> Result<(), ApiError> { + let get_bucket_location = LocationConstraint { + xmlns: (), + region: "garage".to_string(), + }; + assert_eq!( + to_xml_with_header(&get_bucket_location)?, + "\ +garage" + ); + Ok(()) + } + + #[test] + fn get_bucket_versioning_result() -> Result<(), ApiError> { + let get_bucket_versioning = VersioningConfiguration { + xmlns: (), + status: None, + }; + assert_eq!( + to_xml_with_header(&get_bucket_versioning)?, + "\ +" + ); + let get_bucket_versioning2 = VersioningConfiguration { + xmlns: (), + status: Some(Value("Suspended".to_string())), + }; + assert_eq!( + to_xml_with_header(&get_bucket_versioning2)?, + "\ +Suspended" + ); + + Ok(()) + } + + #[test] + fn delete_result() -> Result<(), ApiError> { + let delete_result = DeleteResult { + xmlns: (), + deleted: vec![ + Deleted { + key: Value("a/plop".to_string()), + version_id: Value("qsdfjklm".to_string()), + delete_marker_version_id: Value("wxcvbn".to_string()), + }, + Deleted { + key: Value("b/plip".to_string()), + version_id: Value("1234".to_string()), + delete_marker_version_id: Value("4321".to_string()), + }, + ], + errors: vec![ + DeleteError { + code: Value("NotFound".to_string()), + key: Some(Value("c/plap".to_string())), + message: Value("Object c/plap not 
found".to_string()), + version_id: None, + }, + DeleteError { + code: Value("Forbidden".to_string()), + key: Some(Value("d/plep".to_string())), + message: Value("Not authorized".to_string()), + version_id: Some(Value("789".to_string())), + }, + ], + }; + assert_eq!( + to_xml_with_header(&delete_result)?, + "\ +\ + \ + a/plop\ + qsdfjklm\ + wxcvbn\ + \ + \ + b/plip\ + 1234\ + 4321\ + \ + \ + NotFound\ + c/plap\ + Object c/plap not found\ + \ + \ + Forbidden\ + d/plep\ + Not authorized\ + 789\ + \ +" + ); + Ok(()) + } + + #[test] + fn initiate_multipart_upload_result() -> Result<(), ApiError> { + let result = InitiateMultipartUploadResult { + xmlns: (), + bucket: Value("mybucket".to_string()), + key: Value("a/plop".to_string()), + upload_id: Value("azerty".to_string()), + }; + assert_eq!( + to_xml_with_header(&result)?, + "\ +\ + mybucket\ + a/plop\ + azerty\ +" + ); + Ok(()) + } + + #[test] + fn complete_multipart_upload_result() -> Result<(), ApiError> { + let result = CompleteMultipartUploadResult { + xmlns: (), + location: Some(Value("https://garage.tld/mybucket/a/plop".to_string())), + bucket: Value("mybucket".to_string()), + key: Value("a/plop".to_string()), + etag: Value("\"3858f62230ac3c915f300c664312c11f-9\"".to_string()), + }; + assert_eq!( + to_xml_with_header(&result)?, + "\ +\ + https://garage.tld/mybucket/a/plop\ + mybucket\ + a/plop\ + "3858f62230ac3c915f300c664312c11f-9"\ +" + ); + Ok(()) + } + + #[test] + fn list_multipart_uploads_result() -> Result<(), ApiError> { + let result = ListMultipartUploadsResult { + xmlns: (), + bucket: Value("example-bucket".to_string()), + key_marker: None, + next_key_marker: None, + upload_id_marker: None, + encoding_type: None, + next_upload_id_marker: None, + upload: vec![], + delimiter: Some(Value("/".to_string())), + prefix: Value("photos/2006/".to_string()), + max_uploads: IntValue(1000), + is_truncated: Value("false".to_string()), + common_prefixes: vec![ + CommonPrefix { + prefix: Value("photos/2006/February/".to_string()), + }, + CommonPrefix { + prefix: Value("photos/2006/January/".to_string()), + }, + CommonPrefix { + prefix: Value("photos/2006/March/".to_string()), + }, + ], + }; + + assert_eq!( + to_xml_with_header(&result)?, + "\ +\ + example-bucket\ + photos/2006/\ + /\ + 1000\ + false\ + \ + photos/2006/February/\ + \ + \ + photos/2006/January/\ + \ + \ + photos/2006/March/\ + \ +" + ); + + Ok(()) + } + + #[test] + fn list_objects_v1_1() -> Result<(), ApiError> { + let result = ListBucketResult { + xmlns: (), + name: Value("example-bucket".to_string()), + prefix: Value("".to_string()), + marker: Some(Value("".to_string())), + next_marker: None, + start_after: None, + continuation_token: None, + next_continuation_token: None, + key_count: None, + max_keys: IntValue(1000), + encoding_type: None, + delimiter: Some(Value("/".to_string())), + is_truncated: Value("false".to_string()), + contents: vec![ListBucketItem { + key: Value("sample.jpg".to_string()), + last_modified: Value(msec_to_rfc3339(0)), + etag: Value("\"bf1d737a4d46a19f3bced6905cc8b902\"".to_string()), + size: IntValue(142863), + storage_class: Value("STANDARD".to_string()), + }], + common_prefixes: vec![CommonPrefix { + prefix: Value("photos/".to_string()), + }], + }; + assert_eq!( + to_xml_with_header(&result)?, + "\ +\ + example-bucket\ + \ + \ + 1000\ + /\ + false\ + \ + sample.jpg\ + 1970-01-01T00:00:00.000Z\ + "bf1d737a4d46a19f3bced6905cc8b902"\ + 142863\ + STANDARD\ + \ + \ + photos/\ + \ +" + ); + Ok(()) + } + + #[test] + fn list_objects_v1_2() -> Result<(), 
ApiError> { + let result = ListBucketResult { + xmlns: (), + name: Value("example-bucket".to_string()), + prefix: Value("photos/2006/".to_string()), + marker: Some(Value("".to_string())), + next_marker: None, + start_after: None, + continuation_token: None, + next_continuation_token: None, + key_count: None, + max_keys: IntValue(1000), + delimiter: Some(Value("/".to_string())), + encoding_type: None, + is_truncated: Value("false".to_string()), + contents: vec![], + common_prefixes: vec![ + CommonPrefix { + prefix: Value("photos/2006/February/".to_string()), + }, + CommonPrefix { + prefix: Value("photos/2006/January/".to_string()), + }, + ], + }; + assert_eq!( + to_xml_with_header(&result)?, + "\ +\ + example-bucket\ + photos/2006/\ + \ + 1000\ + /\ + false\ + \ + photos/2006/February/\ + \ + \ + photos/2006/January/\ + \ +" + ); + Ok(()) + } + + #[test] + fn list_objects_v2_1() -> Result<(), ApiError> { + let result = ListBucketResult { + xmlns: (), + name: Value("quotes".to_string()), + prefix: Value("E".to_string()), + marker: None, + next_marker: None, + start_after: Some(Value("ExampleGuide.pdf".to_string())), + continuation_token: None, + next_continuation_token: None, + key_count: None, + max_keys: IntValue(3), + delimiter: None, + encoding_type: None, + is_truncated: Value("false".to_string()), + contents: vec![ListBucketItem { + key: Value("ExampleObject.txt".to_string()), + last_modified: Value(msec_to_rfc3339(0)), + etag: Value("\"599bab3ed2c697f1d26842727561fd94\"".to_string()), + size: IntValue(857), + storage_class: Value("REDUCED_REDUNDANCY".to_string()), + }], + common_prefixes: vec![], + }; + assert_eq!( + to_xml_with_header(&result)?, + "\ +\ + quotes\ + E\ + ExampleGuide.pdf\ + 3\ + false\ + \ + ExampleObject.txt\ + 1970-01-01T00:00:00.000Z\ + "599bab3ed2c697f1d26842727561fd94"\ + 857\ + REDUCED_REDUNDANCY\ + \ +" + ); + Ok(()) + } + + #[test] + fn list_objects_v2_2() -> Result<(), ApiError> { + let result = ListBucketResult { + xmlns: (), + name: Value("bucket".to_string()), + prefix: Value("".to_string()), + marker: None, + next_marker: None, + start_after: None, + continuation_token: Some(Value( + "1ueGcxLPRx1Tr/XYExHnhbYLgveDs2J/wm36Hy4vbOwM=".to_string(), + )), + next_continuation_token: Some(Value("qsdfjklm".to_string())), + key_count: Some(IntValue(112)), + max_keys: IntValue(1000), + delimiter: None, + encoding_type: None, + is_truncated: Value("false".to_string()), + contents: vec![ListBucketItem { + key: Value("happyfacex.jpg".to_string()), + last_modified: Value(msec_to_rfc3339(0)), + etag: Value("\"70ee1738b6b21e2c8a43f3a5ab0eee71\"".to_string()), + size: IntValue(1111), + storage_class: Value("STANDARD".to_string()), + }], + common_prefixes: vec![], + }; + assert_eq!( + to_xml_with_header(&result)?, + "\ +\ + bucket\ + \ + 1ueGcxLPRx1Tr/XYExHnhbYLgveDs2J/wm36Hy4vbOwM=\ + qsdfjklm\ + 112\ + 1000\ + false\ + \ + happyfacex.jpg\ + 1970-01-01T00:00:00.000Z\ + "70ee1738b6b21e2c8a43f3a5ab0eee71"\ + 1111\ + STANDARD\ + \ +" + ); + Ok(()) + } + + #[test] + fn list_parts() -> Result<(), ApiError> { + let result = ListPartsResult { + xmlns: (), + bucket: Value("example-bucket".to_string()), + key: Value("example-object".to_string()), + upload_id: Value( + "XXBsb2FkIElEIGZvciBlbHZpbmcncyVcdS1tb3ZpZS5tMnRzEEEwbG9hZA".to_string(), + ), + part_number_marker: Some(IntValue(1)), + next_part_number_marker: Some(IntValue(3)), + max_parts: IntValue(2), + is_truncated: Value("true".to_string()), + parts: vec![ + PartItem { + etag: 
Value("\"7778aef83f66abc1fa1e8477f296d394\"".to_string()), + last_modified: Value("2010-11-10T20:48:34.000Z".to_string()), + part_number: IntValue(2), + size: IntValue(10485760), + }, + PartItem { + etag: Value("\"aaaa18db4cc2f85cedef654fccc4a4x8\"".to_string()), + last_modified: Value("2010-11-10T20:48:33.000Z".to_string()), + part_number: IntValue(3), + size: IntValue(10485760), + }, + ], + initiator: Initiator { + display_name: Value("umat-user-11116a31-17b5-4fb7-9df5-b288870f11xx".to_string()), + id: Value( + "arn:aws:iam::111122223333:user/some-user-11116a31-17b5-4fb7-9df5-b288870f11xx" + .to_string(), + ), + }, + owner: Owner { + display_name: Value("someName".to_string()), + id: Value( + "75aa57f09aa0c8caeab4f8c24e99d10f8e7faeebf76c078efc7c6caea54ba06a".to_string(), + ), + }, + storage_class: Value("STANDARD".to_string()), + }; + + assert_eq!( + to_xml_with_header(&result)?, + "\ +\ + example-bucket\ + example-object\ + XXBsb2FkIElEIGZvciBlbHZpbmcncyVcdS1tb3ZpZS5tMnRzEEEwbG9hZA\ + 1\ + 3\ + 2\ + true\ + \ + "7778aef83f66abc1fa1e8477f296d394"\ + 2010-11-10T20:48:34.000Z\ + 2\ + 10485760\ + \ + \ + "aaaa18db4cc2f85cedef654fccc4a4x8"\ + 2010-11-10T20:48:33.000Z\ + 3\ + 10485760\ + \ + \ + umat-user-11116a31-17b5-4fb7-9df5-b288870f11xx\ + arn:aws:iam::111122223333:user/some-user-11116a31-17b5-4fb7-9df5-b288870f11xx\ + \ + \ + someName\ + 75aa57f09aa0c8caeab4f8c24e99d10f8e7faeebf76c078efc7c6caea54ba06a\ + \ + STANDARD\ +" + ); + + Ok(()) + } +} diff --git a/src/api/s3_bucket.rs b/src/api/s3_bucket.rs deleted file mode 100644 index 8a5407d3..00000000 --- a/src/api/s3_bucket.rs +++ /dev/null @@ -1,352 +0,0 @@ -use std::collections::HashMap; -use std::sync::Arc; - -use hyper::{Body, Request, Response, StatusCode}; - -use garage_model::bucket_alias_table::*; -use garage_model::bucket_table::Bucket; -use garage_model::garage::Garage; -use garage_model::key_table::Key; -use garage_model::object_table::ObjectFilter; -use garage_model::permission::BucketKeyPerm; -use garage_table::util::*; -use garage_util::crdt::*; -use garage_util::data::*; -use garage_util::time::*; - -use crate::error::*; -use crate::s3_xml; -use crate::signature::verify_signed_content; - -pub fn handle_get_bucket_location(garage: Arc) -> Result, Error> { - let loc = s3_xml::LocationConstraint { - xmlns: (), - region: garage.config.s3_api.s3_region.to_string(), - }; - let xml = s3_xml::to_xml_with_header(&loc)?; - - Ok(Response::builder() - .header("Content-Type", "application/xml") - .body(Body::from(xml.into_bytes()))?) -} - -pub fn handle_get_bucket_versioning() -> Result, Error> { - let versioning = s3_xml::VersioningConfiguration { - xmlns: (), - status: None, - }; - - let xml = s3_xml::to_xml_with_header(&versioning)?; - - Ok(Response::builder() - .header("Content-Type", "application/xml") - .body(Body::from(xml.into_bytes()))?) 
-} - -pub async fn handle_list_buckets(garage: &Garage, api_key: &Key) -> Result, Error> { - let key_p = api_key.params().ok_or_internal_error( - "Key should not be in deleted state at this point (in handle_list_buckets)", - )?; - - // Collect buckets user has access to - let ids = api_key - .state - .as_option() - .unwrap() - .authorized_buckets - .items() - .iter() - .filter(|(_, perms)| perms.is_any()) - .map(|(id, _)| *id) - .collect::>(); - - let mut buckets_by_id = HashMap::new(); - let mut aliases = HashMap::new(); - - for bucket_id in ids.iter() { - let bucket = garage.bucket_table.get(&EmptyKey, bucket_id).await?; - if let Some(bucket) = bucket { - for (alias, _, _active) in bucket.aliases().iter().filter(|(_, _, active)| *active) { - let alias_opt = garage.bucket_alias_table.get(&EmptyKey, alias).await?; - if let Some(alias_ent) = alias_opt { - if *alias_ent.state.get() == Some(*bucket_id) { - aliases.insert(alias_ent.name().to_string(), *bucket_id); - } - } - } - if let Deletable::Present(param) = bucket.state { - buckets_by_id.insert(bucket_id, param); - } - } - } - - for (alias, _, id_opt) in key_p.local_aliases.items() { - if let Some(id) = id_opt { - aliases.insert(alias.clone(), *id); - } - } - - // Generate response - let list_buckets = s3_xml::ListAllMyBucketsResult { - owner: s3_xml::Owner { - display_name: s3_xml::Value(key_p.name.get().to_string()), - id: s3_xml::Value(api_key.key_id.to_string()), - }, - buckets: s3_xml::BucketList { - entries: aliases - .iter() - .filter_map(|(name, id)| buckets_by_id.get(id).map(|p| (name, id, p))) - .map(|(name, _id, param)| s3_xml::Bucket { - creation_date: s3_xml::Value(msec_to_rfc3339(param.creation_date)), - name: s3_xml::Value(name.to_string()), - }) - .collect(), - }, - }; - - let xml = s3_xml::to_xml_with_header(&list_buckets)?; - trace!("xml: {}", xml); - - Ok(Response::builder() - .header("Content-Type", "application/xml") - .body(Body::from(xml))?) -} - -pub async fn handle_create_bucket( - garage: &Garage, - req: Request, - content_sha256: Option, - api_key: Key, - bucket_name: String, -) -> Result, Error> { - let body = hyper::body::to_bytes(req.into_body()).await?; - - if let Some(content_sha256) = content_sha256 { - verify_signed_content(content_sha256, &body[..])?; - } - - let cmd = - parse_create_bucket_xml(&body[..]).ok_or_bad_request("Invalid create bucket XML query")?; - - if let Some(location_constraint) = cmd { - if location_constraint != garage.config.s3_api.s3_region { - return Err(Error::BadRequest(format!( - "Cannot satisfy location constraint `{}`: buckets can only be created in region `{}`", - location_constraint, - garage.config.s3_api.s3_region - ))); - } - } - - let key_params = api_key - .params() - .ok_or_internal_error("Key should not be deleted at this point")?; - - let existing_bucket = if let Some(Some(bucket_id)) = key_params.local_aliases.get(&bucket_name) - { - Some(*bucket_id) - } else { - garage - .bucket_helper() - .resolve_global_bucket_name(&bucket_name) - .await? - }; - - if let Some(bucket_id) = existing_bucket { - // Check we have write or owner permission on the bucket, - // in that case it's fine, return 200 OK, bucket exists; - // otherwise return a forbidden error. - let kp = api_key.bucket_permissions(&bucket_id); - if !(kp.allow_write || kp.allow_owner) { - return Err(Error::BucketAlreadyExists); - } - } else { - // Create the bucket! 
- if !is_valid_bucket_name(&bucket_name) { - return Err(Error::BadRequest(format!( - "{}: {}", - bucket_name, INVALID_BUCKET_NAME_MESSAGE - ))); - } - - let bucket = Bucket::new(); - garage.bucket_table.insert(&bucket).await?; - - garage - .bucket_helper() - .set_bucket_key_permissions(bucket.id, &api_key.key_id, BucketKeyPerm::ALL_PERMISSIONS) - .await?; - - garage - .bucket_helper() - .set_local_bucket_alias(bucket.id, &api_key.key_id, &bucket_name) - .await?; - } - - Ok(Response::builder() - .header("Location", format!("/{}", bucket_name)) - .body(Body::empty()) - .unwrap()) -} - -pub async fn handle_delete_bucket( - garage: &Garage, - bucket_id: Uuid, - bucket_name: String, - api_key: Key, -) -> Result, Error> { - let key_params = api_key - .params() - .ok_or_internal_error("Key should not be deleted at this point")?; - - let is_local_alias = matches!(key_params.local_aliases.get(&bucket_name), Some(Some(_))); - - let mut bucket = garage - .bucket_helper() - .get_existing_bucket(bucket_id) - .await?; - let bucket_state = bucket.state.as_option().unwrap(); - - // If the bucket has no other aliases, this is a true deletion. - // Otherwise, it is just an alias removal. - - let has_other_global_aliases = bucket_state - .aliases - .items() - .iter() - .filter(|(_, _, active)| *active) - .any(|(n, _, _)| is_local_alias || (*n != bucket_name)); - - let has_other_local_aliases = bucket_state - .local_aliases - .items() - .iter() - .filter(|(_, _, active)| *active) - .any(|((k, n), _, _)| !is_local_alias || *n != bucket_name || *k != api_key.key_id); - - if !has_other_global_aliases && !has_other_local_aliases { - // Delete bucket - - // Check bucket is empty - let objects = garage - .object_table - .get_range(&bucket_id, None, Some(ObjectFilter::IsData), 10) - .await?; - if !objects.is_empty() { - return Err(Error::BucketNotEmpty); - } - - // --- done checking, now commit --- - // 1. delete bucket alias - if is_local_alias { - garage - .bucket_helper() - .unset_local_bucket_alias(bucket_id, &api_key.key_id, &bucket_name) - .await?; - } else { - garage - .bucket_helper() - .unset_global_bucket_alias(bucket_id, &bucket_name) - .await?; - } - - // 2. delete authorization from keys that had access - for (key_id, _) in bucket.authorized_keys() { - garage - .bucket_helper() - .set_bucket_key_permissions(bucket.id, key_id, BucketKeyPerm::NO_PERMISSIONS) - .await?; - } - - // 3. delete bucket - bucket.state = Deletable::delete(); - garage.bucket_table.insert(&bucket).await?; - } else if is_local_alias { - // Just unalias - garage - .bucket_helper() - .unset_local_bucket_alias(bucket_id, &api_key.key_id, &bucket_name) - .await?; - } else { - // Just unalias (but from global namespace) - garage - .bucket_helper() - .unset_global_bucket_alias(bucket_id, &bucket_name) - .await?; - } - - Ok(Response::builder() - .status(StatusCode::NO_CONTENT) - .body(Body::empty())?) 
-} - -fn parse_create_bucket_xml(xml_bytes: &[u8]) -> Option> { - // Returns None if invalid data - // Returns Some(None) if no location constraint is given - // Returns Some(Some("xxxx")) where xxxx is the given location constraint - - let xml_str = std::str::from_utf8(xml_bytes).ok()?; - if xml_str.trim_matches(char::is_whitespace).is_empty() { - return Some(None); - } - - let xml = roxmltree::Document::parse(xml_str).ok()?; - - let cbc = xml.root().first_child()?; - if !cbc.has_tag_name("CreateBucketConfiguration") { - return None; - } - - let mut ret = None; - for item in cbc.children() { - println!("{:?}", item); - if item.has_tag_name("LocationConstraint") { - if ret != None { - return None; - } - ret = Some(item.text()?.to_string()); - } else if !item.is_text() { - return None; - } - } - - Some(ret) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn create_bucket() { - assert_eq!(parse_create_bucket_xml(br#""#), Some(None)); - assert_eq!( - parse_create_bucket_xml( - br#" - - - "# - ), - Some(None) - ); - assert_eq!( - parse_create_bucket_xml( - br#" - - Europe - - "# - ), - Some(Some("Europe".into())) - ); - assert_eq!( - parse_create_bucket_xml( - br#" - - - "# - ), - None - ); - } -} diff --git a/src/api/s3_copy.rs b/src/api/s3_copy.rs deleted file mode 100644 index fc4707e2..00000000 --- a/src/api/s3_copy.rs +++ /dev/null @@ -1,660 +0,0 @@ -use std::pin::Pin; -use std::sync::Arc; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; - -use futures::{stream, stream::Stream, StreamExt, TryFutureExt}; -use md5::{Digest as Md5Digest, Md5}; - -use hyper::{Body, Request, Response}; -use serde::Serialize; - -use garage_table::*; -use garage_util::data::*; -use garage_util::time::*; - -use garage_model::block_ref_table::*; -use garage_model::garage::Garage; -use garage_model::key_table::Key; -use garage_model::object_table::*; -use garage_model::version_table::*; - -use crate::api_server::{parse_bucket_key, resolve_bucket}; -use crate::error::*; -use crate::s3_put::{decode_upload_id, get_headers}; -use crate::s3_xml::{self, xmlns_tag}; - -pub async fn handle_copy( - garage: Arc, - api_key: &Key, - req: &Request, - dest_bucket_id: Uuid, - dest_key: &str, -) -> Result, Error> { - let copy_precondition = CopyPreconditionHeaders::parse(req)?; - - let source_object = get_copy_source(&garage, api_key, req).await?; - - let (source_version, source_version_data, source_version_meta) = - extract_source_info(&source_object)?; - - // Check precondition, e.g. 
x-amz-copy-source-if-match - copy_precondition.check(source_version, &source_version_meta.etag)?; - - // Generate parameters for copied object - let new_uuid = gen_uuid(); - let new_timestamp = now_msec(); - - // Implement x-amz-metadata-directive: REPLACE - let new_meta = match req.headers().get("x-amz-metadata-directive") { - Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => ObjectVersionMeta { - headers: get_headers(req.headers())?, - size: source_version_meta.size, - etag: source_version_meta.etag.clone(), - }, - _ => source_version_meta.clone(), - }; - - let etag = new_meta.etag.to_string(); - - // Save object copy - match source_version_data { - ObjectVersionData::DeleteMarker => unreachable!(), - ObjectVersionData::Inline(_meta, bytes) => { - let dest_object_version = ObjectVersion { - uuid: new_uuid, - timestamp: new_timestamp, - state: ObjectVersionState::Complete(ObjectVersionData::Inline( - new_meta, - bytes.clone(), - )), - }; - let dest_object = Object::new( - dest_bucket_id, - dest_key.to_string(), - vec![dest_object_version], - ); - garage.object_table.insert(&dest_object).await?; - } - ObjectVersionData::FirstBlock(_meta, first_block_hash) => { - // Get block list from source version - let source_version = garage - .version_table - .get(&source_version.uuid, &EmptyKey) - .await?; - let source_version = source_version.ok_or(Error::NoSuchKey)?; - - // Write an "uploading" marker in Object table - // This holds a reference to the object in the Version table - // so that it won't be deleted, e.g. by repair_versions. - let tmp_dest_object_version = ObjectVersion { - uuid: new_uuid, - timestamp: new_timestamp, - state: ObjectVersionState::Uploading(new_meta.headers.clone()), - }; - let tmp_dest_object = Object::new( - dest_bucket_id, - dest_key.to_string(), - vec![tmp_dest_object_version], - ); - garage.object_table.insert(&tmp_dest_object).await?; - - // Write version in the version table. Even with empty block list, - // this means that the BlockRef entries linked to this version cannot be - // marked as deleted (they are marked as deleted only if the Version - // doesn't exist or is marked as deleted). - let mut dest_version = - Version::new(new_uuid, dest_bucket_id, dest_key.to_string(), false); - garage.version_table.insert(&dest_version).await?; - - // Fill in block list for version and insert block refs - for (bk, bv) in source_version.blocks.items().iter() { - dest_version.blocks.put(*bk, *bv); - } - let dest_block_refs = dest_version - .blocks - .items() - .iter() - .map(|b| BlockRef { - block: b.1.hash, - version: new_uuid, - deleted: false.into(), - }) - .collect::>(); - futures::try_join!( - garage.version_table.insert(&dest_version), - garage.block_ref_table.insert_many(&dest_block_refs[..]), - )?; - - // Insert final object - // We do this last because otherwise there is a race condition in the case where - // the copy call has the same source and destination (this happens, rclone does - // it to update the modification timestamp for instance). If we did this concurrently - // with the stuff before, the block's reference counts could be decremented before - // they are incremented again for the new version, leading to data being deleted. 
- let dest_object_version = ObjectVersion { - uuid: new_uuid, - timestamp: new_timestamp, - state: ObjectVersionState::Complete(ObjectVersionData::FirstBlock( - new_meta, - *first_block_hash, - )), - }; - let dest_object = Object::new( - dest_bucket_id, - dest_key.to_string(), - vec![dest_object_version], - ); - garage.object_table.insert(&dest_object).await?; - } - } - - let last_modified = msec_to_rfc3339(new_timestamp); - let result = CopyObjectResult { - last_modified: s3_xml::Value(last_modified), - etag: s3_xml::Value(format!("\"{}\"", etag)), - }; - let xml = s3_xml::to_xml_with_header(&result)?; - - Ok(Response::builder() - .header("Content-Type", "application/xml") - .header("x-amz-version-id", hex::encode(new_uuid)) - .header( - "x-amz-copy-source-version-id", - hex::encode(source_version.uuid), - ) - .body(Body::from(xml))?) -} - -pub async fn handle_upload_part_copy( - garage: Arc, - api_key: &Key, - req: &Request, - dest_bucket_id: Uuid, - dest_key: &str, - part_number: u64, - upload_id: &str, -) -> Result, Error> { - let copy_precondition = CopyPreconditionHeaders::parse(req)?; - - let dest_version_uuid = decode_upload_id(upload_id)?; - - let dest_key = dest_key.to_string(); - let (source_object, dest_object) = futures::try_join!( - get_copy_source(&garage, api_key, req), - garage - .object_table - .get(&dest_bucket_id, &dest_key) - .map_err(Error::from), - )?; - let dest_object = dest_object.ok_or(Error::NoSuchKey)?; - - let (source_object_version, source_version_data, source_version_meta) = - extract_source_info(&source_object)?; - - // Check precondition on source, e.g. x-amz-copy-source-if-match - copy_precondition.check(source_object_version, &source_version_meta.etag)?; - - // Check source range is valid - let source_range = match req.headers().get("x-amz-copy-source-range") { - Some(range) => { - let range_str = range.to_str()?; - let mut ranges = http_range::HttpRange::parse(range_str, source_version_meta.size) - .map_err(|e| (e, source_version_meta.size))?; - if ranges.len() != 1 { - return Err(Error::BadRequest( - "Invalid x-amz-copy-source-range header: exactly 1 range must be given".into(), - )); - } else { - ranges.pop().unwrap() - } - } - None => http_range::HttpRange { - start: 0, - length: source_version_meta.size, - }, - }; - - // Check destination version is indeed in uploading state - if !dest_object - .versions() - .iter() - .any(|v| v.uuid == dest_version_uuid && v.is_uploading()) - { - return Err(Error::NoSuchUpload); - } - - // Check source version is not inlined - match source_version_data { - ObjectVersionData::DeleteMarker => unreachable!(), - ObjectVersionData::Inline(_meta, _bytes) => { - // This is only for small files, we don't bother handling this. 
- // (in AWS UploadPartCopy works for parts at least 5MB which - // is never the case of an inline object) - return Err(Error::BadRequest( - "Source object is too small (minimum part size is 5Mb)".into(), - )); - } - ObjectVersionData::FirstBlock(_meta, _first_block_hash) => (), - }; - - // Fetch source versin with its block list, - // and destination version to check part hasn't yet been uploaded - let (source_version, dest_version) = futures::try_join!( - garage - .version_table - .get(&source_object_version.uuid, &EmptyKey), - garage.version_table.get(&dest_version_uuid, &EmptyKey), - )?; - let source_version = source_version.ok_or(Error::NoSuchKey)?; - - // Check this part number hasn't yet been uploaded - if let Some(dv) = dest_version { - if dv.has_part_number(part_number) { - return Err(Error::BadRequest(format!( - "Part number {} has already been uploaded", - part_number - ))); - } - } - - // We want to reuse blocks from the source version as much as possible. - // However, we still need to get the data from these blocks - // because we need to know it to calculate the MD5sum of the part - // which is used as its ETag. - - // First, calculate what blocks we want to keep, - // and the subrange of the block to take, if the bounds of the - // requested range are in the middle. - let (range_begin, range_end) = (source_range.start, source_range.start + source_range.length); - - let mut blocks_to_copy = vec![]; - let mut current_offset = 0; - for (_bk, block) in source_version.blocks.items().iter() { - let (block_begin, block_end) = (current_offset, current_offset + block.size); - - if block_begin < range_end && block_end > range_begin { - let subrange_begin = if block_begin < range_begin { - Some(range_begin - block_begin) - } else { - None - }; - let subrange_end = if block_end > range_end { - Some(range_end - block_begin) - } else { - None - }; - let range_to_copy = match (subrange_begin, subrange_end) { - (Some(b), Some(e)) => Some(b as usize..e as usize), - (None, Some(e)) => Some(0..e as usize), - (Some(b), None) => Some(b as usize..block.size as usize), - (None, None) => None, - }; - - blocks_to_copy.push((block.hash, range_to_copy)); - } - - current_offset = block_end; - } - - // Now, actually copy the blocks - let mut md5hasher = Md5::new(); - - // First, create a stream that is able to read the source blocks - // and extract the subrange if necessary. - // The second returned value is an Option, that is Some - // if and only if the block returned is a block that already existed - // in the Garage data store (thus we don't need to save it again). - let garage2 = garage.clone(); - let source_blocks = stream::iter(blocks_to_copy) - .flat_map(|(block_hash, range_to_copy)| { - let garage3 = garage2.clone(); - stream::once(async move { - let data = garage3.block_manager.rpc_get_block(&block_hash).await?; - match range_to_copy { - Some(r) => Ok((data[r].to_vec(), None)), - None => Ok((data, Some(block_hash))), - } - }) - }) - .peekable(); - - // The defragmenter is a custom stream (defined below) that concatenates - // consecutive block parts when they are too small. - // It returns a series of (Vec, Option). - // When it is done, it returns an empty vec. - // Same as the previous iterator, the Option is Some(_) if and only if - // it's an existing block of the Garage data store. 
- let mut defragmenter = Defragmenter::new(garage.config.block_size, Box::pin(source_blocks)); - - let mut current_offset = 0; - let mut next_block = defragmenter.next().await?; - - loop { - let (data, existing_block_hash) = next_block; - if data.is_empty() { - break; - } - - md5hasher.update(&data[..]); - - let must_upload = existing_block_hash.is_none(); - let final_hash = existing_block_hash.unwrap_or_else(|| blake2sum(&data[..])); - - let mut version = Version::new(dest_version_uuid, dest_bucket_id, dest_key.clone(), false); - version.blocks.put( - VersionBlockKey { - part_number, - offset: current_offset, - }, - VersionBlock { - hash: final_hash, - size: data.len() as u64, - }, - ); - current_offset += data.len() as u64; - - let block_ref = BlockRef { - block: final_hash, - version: dest_version_uuid, - deleted: false.into(), - }; - - let garage2 = garage.clone(); - let res = futures::try_join!( - // Thing 1: if the block is not exactly a block that existed before, - // we need to insert that data as a new block. - async move { - if must_upload { - garage2.block_manager.rpc_put_block(final_hash, data).await - } else { - Ok(()) - } - }, - // Thing 2: we need to insert the block in the version - garage.version_table.insert(&version), - // Thing 3: we need to add a block reference - garage.block_ref_table.insert(&block_ref), - // Thing 4: we need to prefetch the next block - defragmenter.next(), - )?; - next_block = res.3; - } - - let data_md5sum = md5hasher.finalize(); - let etag = hex::encode(data_md5sum); - - // Put the part's ETag in the Versiontable - let mut version = Version::new(dest_version_uuid, dest_bucket_id, dest_key.clone(), false); - version.parts_etags.put(part_number, etag.clone()); - garage.version_table.insert(&version).await?; - - // LGTM - let resp_xml = s3_xml::to_xml_with_header(&CopyPartResult { - xmlns: (), - etag: s3_xml::Value(format!("\"{}\"", etag)), - last_modified: s3_xml::Value(msec_to_rfc3339(source_object_version.timestamp)), - })?; - - Ok(Response::builder() - .header("Content-Type", "application/xml") - .header( - "x-amz-copy-source-version-id", - hex::encode(source_object_version.uuid), - ) - .body(Body::from(resp_xml))?) -} - -async fn get_copy_source( - garage: &Garage, - api_key: &Key, - req: &Request, -) -> Result { - let copy_source = req.headers().get("x-amz-copy-source").unwrap().to_str()?; - let copy_source = percent_encoding::percent_decode_str(copy_source).decode_utf8()?; - - let (source_bucket, source_key) = parse_bucket_key(©_source, None)?; - let source_bucket_id = resolve_bucket(garage, &source_bucket.to_string(), api_key).await?; - - if !api_key.allow_read(&source_bucket_id) { - return Err(Error::Forbidden(format!( - "Reading from bucket {} not allowed for this key", - source_bucket - ))); - } - - let source_key = source_key.ok_or_bad_request("No source key specified")?; - - let source_object = garage - .object_table - .get(&source_bucket_id, &source_key.to_string()) - .await? 
- .ok_or(Error::NoSuchKey)?; - - Ok(source_object) -} - -fn extract_source_info( - source_object: &Object, -) -> Result<(&ObjectVersion, &ObjectVersionData, &ObjectVersionMeta), Error> { - let source_version = source_object - .versions() - .iter() - .rev() - .find(|v| v.is_complete()) - .ok_or(Error::NoSuchKey)?; - - let source_version_data = match &source_version.state { - ObjectVersionState::Complete(x) => x, - _ => unreachable!(), - }; - - let source_version_meta = match source_version_data { - ObjectVersionData::DeleteMarker => { - return Err(Error::NoSuchKey); - } - ObjectVersionData::Inline(meta, _bytes) => meta, - ObjectVersionData::FirstBlock(meta, _fbh) => meta, - }; - - Ok((source_version, source_version_data, source_version_meta)) -} - -struct CopyPreconditionHeaders { - copy_source_if_match: Option>, - copy_source_if_modified_since: Option, - copy_source_if_none_match: Option>, - copy_source_if_unmodified_since: Option, -} - -impl CopyPreconditionHeaders { - fn parse(req: &Request) -> Result { - Ok(Self { - copy_source_if_match: req - .headers() - .get("x-amz-copy-source-if-match") - .map(|x| x.to_str()) - .transpose()? - .map(|x| { - x.split(',') - .map(|m| m.trim().trim_matches('"').to_string()) - .collect::>() - }), - copy_source_if_modified_since: req - .headers() - .get("x-amz-copy-source-if-modified-since") - .map(|x| x.to_str()) - .transpose()? - .map(httpdate::parse_http_date) - .transpose() - .ok_or_bad_request("Invalid date in x-amz-copy-source-if-modified-since")?, - copy_source_if_none_match: req - .headers() - .get("x-amz-copy-source-if-none-match") - .map(|x| x.to_str()) - .transpose()? - .map(|x| { - x.split(',') - .map(|m| m.trim().trim_matches('"').to_string()) - .collect::>() - }), - copy_source_if_unmodified_since: req - .headers() - .get("x-amz-copy-source-if-unmodified-since") - .map(|x| x.to_str()) - .transpose()? - .map(httpdate::parse_http_date) - .transpose() - .ok_or_bad_request("Invalid date in x-amz-copy-source-if-unmodified-since")?, - }) - } - - fn check(&self, v: &ObjectVersion, etag: &str) -> Result<(), Error> { - let v_date = UNIX_EPOCH + Duration::from_millis(v.timestamp); - - let ok = match ( - &self.copy_source_if_match, - &self.copy_source_if_unmodified_since, - &self.copy_source_if_none_match, - &self.copy_source_if_modified_since, - ) { - // TODO I'm not sure all of the conditions are evaluated correctly here - - // If we have both if-match and if-unmodified-since, - // basically we don't care about if-unmodified-since, - // because in the spec it says that if if-match evaluates to - // true but if-unmodified-since evaluates to false, - // the copy is still done. 
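A standalone sketch of the header parsing done just above for the x-amz-copy-source-if-(none-)match values: split the comma-separated list, trimming whitespace and surrounding double quotes.

fn parse_etag_list(header: &str) -> Vec<String> {
    header
        .split(',')
        .map(|etag| etag.trim().trim_matches('"').to_string())
        .collect()
}

fn main() {
    assert_eq!(
        parse_etag_list(r#""abc", "def" , *"#),
        vec!["abc", "def", "*"]
    );
}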
- (Some(im), _, None, None) => im.iter().any(|x| x == etag || x == "*"), - (None, Some(ius), None, None) => v_date <= *ius, - - // If we have both if-none-match and if-modified-since, - // then both of the two conditions must evaluate to true - (None, None, Some(inm), Some(ims)) => { - !inm.iter().any(|x| x == etag || x == "*") && v_date > *ims - } - (None, None, Some(inm), None) => !inm.iter().any(|x| x == etag || x == "*"), - (None, None, None, Some(ims)) => v_date > *ims, - (None, None, None, None) => true, - _ => { - return Err(Error::BadRequest( - "Invalid combination of x-amz-copy-source-if-xxxxx headers".into(), - )) - } - }; - - if ok { - Ok(()) - } else { - Err(Error::PreconditionFailed) - } - } -} - -type BlockStreamItemOk = (Vec, Option); -type BlockStreamItem = Result; - -struct Defragmenter> { - block_size: usize, - block_stream: Pin>>, - buffer: Vec, - hash: Option, -} - -impl> Defragmenter { - fn new(block_size: usize, block_stream: Pin>>) -> Self { - Self { - block_size, - block_stream, - buffer: vec![], - hash: None, - } - } - - async fn next(&mut self) -> BlockStreamItem { - // Fill buffer while we can - while let Some(res) = self.block_stream.as_mut().peek().await { - let (peeked_next_block, _) = match res { - Ok(t) => t, - Err(_) => { - self.block_stream.next().await.unwrap()?; - unreachable!() - } - }; - - if self.buffer.is_empty() { - let (next_block, next_block_hash) = self.block_stream.next().await.unwrap()?; - self.buffer = next_block; - self.hash = next_block_hash; - } else if self.buffer.len() + peeked_next_block.len() > self.block_size { - break; - } else { - let (next_block, _) = self.block_stream.next().await.unwrap()?; - self.buffer.extend(next_block); - self.hash = None; - } - } - - Ok((std::mem::take(&mut self.buffer), self.hash.take())) - } -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct CopyObjectResult { - #[serde(rename = "LastModified")] - pub last_modified: s3_xml::Value, - #[serde(rename = "ETag")] - pub etag: s3_xml::Value, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct CopyPartResult { - #[serde(serialize_with = "xmlns_tag")] - pub xmlns: (), - #[serde(rename = "LastModified")] - pub last_modified: s3_xml::Value, - #[serde(rename = "ETag")] - pub etag: s3_xml::Value, -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::s3_xml::to_xml_with_header; - - #[test] - fn copy_object_result() -> Result<(), Error> { - let copy_result = CopyObjectResult { - last_modified: s3_xml::Value(msec_to_rfc3339(0)), - etag: s3_xml::Value("\"9b2cf535f27731c974343645a3985328\"".to_string()), - }; - assert_eq!( - to_xml_with_header(©_result)?, - "\ -\ - 1970-01-01T00:00:00.000Z\ - "9b2cf535f27731c974343645a3985328"\ -\ - " - ); - Ok(()) - } - - #[test] - fn serialize_copy_part_result() -> Result<(), Error> { - let expected_retval = "\ -\ - 2011-04-11T20:34:56.000Z\ - "9b2cf535f27731c974343645a3985328"\ -"; - let v = CopyPartResult { - xmlns: (), - last_modified: s3_xml::Value("2011-04-11T20:34:56.000Z".into()), - etag: s3_xml::Value("\"9b2cf535f27731c974343645a3985328\"".into()), - }; - println!("{}", to_xml_with_header(&v)?); - - assert_eq!(to_xml_with_header(&v)?, expected_retval); - - Ok(()) - } -} diff --git a/src/api/s3_cors.rs b/src/api/s3_cors.rs deleted file mode 100644 index ab77e23a..00000000 --- a/src/api/s3_cors.rs +++ /dev/null @@ -1,442 +0,0 @@ -use quick_xml::de::from_reader; -use std::sync::Arc; - -use http::header::{ - ACCESS_CONTROL_ALLOW_HEADERS, ACCESS_CONTROL_ALLOW_METHODS, ACCESS_CONTROL_ALLOW_ORIGIN, - 
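A simplified, synchronous sketch of the Defragmenter logic above: concatenate consecutive small chunks until adding the next one would exceed the target block size. It deliberately ignores the existing-block bookkeeping (the Option around the hash) that the real code uses to decide whether data must be re-uploaded.

fn defragment(chunks: Vec<Vec<u8>>, block_size: usize) -> Vec<Vec<u8>> {
    let mut out: Vec<Vec<u8>> = vec![];
    let mut buffer: Vec<u8> = vec![];
    for chunk in chunks {
        if !buffer.is_empty() && buffer.len() + chunk.len() > block_size {
            // Flush the buffer before it would grow past the target block size.
            out.push(std::mem::take(&mut buffer));
        }
        buffer.extend(chunk);
    }
    if !buffer.is_empty() {
        out.push(buffer);
    }
    out
}

fn main() {
    let parts = defragment(vec![vec![0; 100], vec![0; 100], vec![0; 400]], 512);
    // 100 + 100 fits in one 512-byte block, adding 400 more would not.
    assert_eq!(parts.iter().map(|p| p.len()).collect::<Vec<_>>(), vec![200, 400]);
}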
ACCESS_CONTROL_EXPOSE_HEADERS, ACCESS_CONTROL_REQUEST_HEADERS, ACCESS_CONTROL_REQUEST_METHOD, -}; -use hyper::{header::HeaderName, Body, Method, Request, Response, StatusCode}; - -use serde::{Deserialize, Serialize}; - -use crate::error::*; -use crate::s3_xml::{to_xml_with_header, xmlns_tag, IntValue, Value}; -use crate::signature::verify_signed_content; - -use garage_model::bucket_table::{Bucket, CorsRule as GarageCorsRule}; -use garage_model::garage::Garage; -use garage_table::*; -use garage_util::data::*; - -pub async fn handle_get_cors(bucket: &Bucket) -> Result, Error> { - let param = bucket - .params() - .ok_or_internal_error("Bucket should not be deleted at this point")?; - - if let Some(cors) = param.cors_config.get() { - let wc = CorsConfiguration { - xmlns: (), - cors_rules: cors - .iter() - .map(CorsRule::from_garage_cors_rule) - .collect::>(), - }; - let xml = to_xml_with_header(&wc)?; - Ok(Response::builder() - .status(StatusCode::OK) - .header(http::header::CONTENT_TYPE, "application/xml") - .body(Body::from(xml))?) - } else { - Ok(Response::builder() - .status(StatusCode::NO_CONTENT) - .body(Body::empty())?) - } -} - -pub async fn handle_delete_cors( - garage: Arc, - bucket_id: Uuid, -) -> Result, Error> { - let mut bucket = garage - .bucket_table - .get(&EmptyKey, &bucket_id) - .await? - .ok_or(Error::NoSuchBucket)?; - - let param = bucket - .params_mut() - .ok_or_internal_error("Bucket should not be deleted at this point")?; - - param.cors_config.update(None); - garage.bucket_table.insert(&bucket).await?; - - Ok(Response::builder() - .status(StatusCode::NO_CONTENT) - .body(Body::empty())?) -} - -pub async fn handle_put_cors( - garage: Arc, - bucket_id: Uuid, - req: Request, - content_sha256: Option, -) -> Result, Error> { - let body = hyper::body::to_bytes(req.into_body()).await?; - - if let Some(content_sha256) = content_sha256 { - verify_signed_content(content_sha256, &body[..])?; - } - - let mut bucket = garage - .bucket_table - .get(&EmptyKey, &bucket_id) - .await? - .ok_or(Error::NoSuchBucket)?; - - let param = bucket - .params_mut() - .ok_or_internal_error("Bucket should not be deleted at this point")?; - - let conf: CorsConfiguration = from_reader(&body as &[u8])?; - conf.validate()?; - - param - .cors_config - .update(Some(conf.into_garage_cors_config()?)); - garage.bucket_table.insert(&bucket).await?; - - Ok(Response::builder() - .status(StatusCode::OK) - .body(Body::empty())?) -} - -pub async fn handle_options_s3api( - garage: Arc, - req: &Request, - bucket_name: Option, -) -> Result, Error> { - // FIXME: CORS rules of buckets with local aliases are - // not taken into account. - - // If the bucket name is a global bucket name, - // we try to apply the CORS rules of that bucket. - // If a user has a local bucket name that has - // the same name, its CORS rules won't be applied - // and will be shadowed by the rules of the globally - // existing bucket (but this is inevitable because - // OPTIONS calls are not auhtenticated). - if let Some(bn) = bucket_name { - let helper = garage.bucket_helper(); - let bucket_id = helper.resolve_global_bucket_name(&bn).await?; - if let Some(id) = bucket_id { - let bucket = garage - .bucket_table - .get(&EmptyKey, &id) - .await? - .filter(|b| !b.state.is_deleted()) - .ok_or(Error::NoSuchBucket)?; - handle_options_for_bucket(req, &bucket) - } else { - // If there is a bucket name in the request, but that name - // does not correspond to a global alias for a bucket, - // then it's either a non-existing bucket or a local bucket. 
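As a side note on the verify_signed_content call above: a minimal sketch of what such a check amounts to, assuming it simply compares the declared x-amz-content-sha256 value against the SHA-256 of the received body (sha2 and hex crates assumed here; the real implementation lives in the signature module):

use sha2::{Digest, Sha256};

fn content_matches(declared_sha256_hex: &str, body: &[u8]) -> bool {
    hex::encode(Sha256::digest(body)) == declared_sha256_hex
}

fn main() {
    let body = b"<CORSConfiguration></CORSConfiguration>";
    let declared = hex::encode(Sha256::digest(body));
    assert!(content_matches(&declared, body));
    assert!(!content_matches(&declared, b"tampered body"));
}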
- // We have no way of knowing, because the request is not - // authenticated and thus we can't resolve local aliases. - // We take the permissive approach of allowing everything, - // because we don't want to prevent web apps that use - // local bucket names from making API calls. - Ok(Response::builder() - .header(ACCESS_CONTROL_ALLOW_ORIGIN, "*") - .header(ACCESS_CONTROL_ALLOW_METHODS, "*") - .status(StatusCode::OK) - .body(Body::empty())?) - } - } else { - // If there is no bucket name in the request, - // we are doing a ListBuckets call, which we want to allow - // for all origins. - Ok(Response::builder() - .header(ACCESS_CONTROL_ALLOW_ORIGIN, "*") - .header(ACCESS_CONTROL_ALLOW_METHODS, "GET") - .status(StatusCode::OK) - .body(Body::empty())?) - } -} - -pub fn handle_options_for_bucket( - req: &Request, - bucket: &Bucket, -) -> Result, Error> { - let origin = req - .headers() - .get("Origin") - .ok_or_bad_request("Missing Origin header")? - .to_str()?; - let request_method = req - .headers() - .get(ACCESS_CONTROL_REQUEST_METHOD) - .ok_or_bad_request("Missing Access-Control-Request-Method header")? - .to_str()?; - let request_headers = match req.headers().get(ACCESS_CONTROL_REQUEST_HEADERS) { - Some(h) => h.to_str()?.split(',').map(|h| h.trim()).collect::>(), - None => vec![], - }; - - if let Some(cors_config) = bucket.params().unwrap().cors_config.get() { - let matching_rule = cors_config - .iter() - .find(|rule| cors_rule_matches(rule, origin, request_method, request_headers.iter())); - if let Some(rule) = matching_rule { - let mut resp = Response::builder() - .status(StatusCode::OK) - .body(Body::empty())?; - add_cors_headers(&mut resp, rule).ok_or_internal_error("Invalid CORS configuration")?; - return Ok(resp); - } - } - - Err(Error::Forbidden("This CORS request is not allowed.".into())) -} - -pub fn find_matching_cors_rule<'a>( - bucket: &'a Bucket, - req: &Request, -) -> Result, Error> { - if let Some(cors_config) = bucket.params().unwrap().cors_config.get() { - if let Some(origin) = req.headers().get("Origin") { - let origin = origin.to_str()?; - let request_headers = match req.headers().get(ACCESS_CONTROL_REQUEST_HEADERS) { - Some(h) => h.to_str()?.split(',').map(|h| h.trim()).collect::>(), - None => vec![], - }; - return Ok(cors_config.iter().find(|rule| { - cors_rule_matches(rule, origin, req.method().as_ref(), request_headers.iter()) - })); - } - } - Ok(None) -} - -fn cors_rule_matches<'a, HI, S>( - rule: &GarageCorsRule, - origin: &'a str, - method: &'a str, - mut request_headers: HI, -) -> bool -where - HI: Iterator, - S: AsRef, -{ - rule.allow_origins.iter().any(|x| x == "*" || x == origin) - && rule.allow_methods.iter().any(|x| x == "*" || x == method) - && request_headers.all(|h| { - rule.allow_headers - .iter() - .any(|x| x == "*" || x == h.as_ref()) - }) -} - -pub fn add_cors_headers( - resp: &mut Response, - rule: &GarageCorsRule, -) -> Result<(), http::header::InvalidHeaderValue> { - let h = resp.headers_mut(); - h.insert( - ACCESS_CONTROL_ALLOW_ORIGIN, - rule.allow_origins.join(", ").parse()?, - ); - h.insert( - ACCESS_CONTROL_ALLOW_METHODS, - rule.allow_methods.join(", ").parse()?, - ); - h.insert( - ACCESS_CONTROL_ALLOW_HEADERS, - rule.allow_headers.join(", ").parse()?, - ); - h.insert( - ACCESS_CONTROL_EXPOSE_HEADERS, - rule.expose_headers.join(", ").parse()?, - ); - Ok(()) -} - -// ---- SERIALIZATION AND DESERIALIZATION TO/FROM S3 XML ---- - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -#[serde(rename = "CORSConfiguration")] -pub 
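A standalone sketch of the cors_rule_matches predicate above, with a local struct standing in for garage_model's CorsRule (only the three fields used by the predicate are kept):

struct Rule {
    allow_origins: Vec<String>,
    allow_methods: Vec<String>,
    allow_headers: Vec<String>,
}

fn rule_matches(rule: &Rule, origin: &str, method: &str, request_headers: &[&str]) -> bool {
    // "*" acts as a wildcard; everything else is compared verbatim.
    rule.allow_origins.iter().any(|x| x == "*" || x == origin)
        && rule.allow_methods.iter().any(|x| x == "*" || x == method)
        && request_headers
            .iter()
            .all(|h| rule.allow_headers.iter().any(|x| x == "*" || x == h))
}

fn main() {
    let rule = Rule {
        allow_origins: vec!["https://app.example.com".into()],
        allow_methods: vec!["GET".into(), "PUT".into()],
        allow_headers: vec!["*".into()],
    };
    assert!(rule_matches(&rule, "https://app.example.com", "PUT", &["content-type"]));
    assert!(!rule_matches(&rule, "https://evil.example", "GET", &[]));
}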
struct CorsConfiguration { - #[serde(serialize_with = "xmlns_tag", skip_deserializing)] - pub xmlns: (), - #[serde(rename = "CORSRule")] - pub cors_rules: Vec, -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct CorsRule { - #[serde(rename = "ID")] - pub id: Option, - #[serde(rename = "MaxAgeSeconds")] - pub max_age_seconds: Option, - #[serde(rename = "AllowedOrigin")] - pub allowed_origins: Vec, - #[serde(rename = "AllowedMethod")] - pub allowed_methods: Vec, - #[serde(rename = "AllowedHeader", default)] - pub allowed_headers: Vec, - #[serde(rename = "ExposeHeader", default)] - pub expose_headers: Vec, -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct AllowedMethod { - #[serde(rename = "AllowedMethod")] - pub allowed_method: Value, -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct AllowedHeader { - #[serde(rename = "AllowedHeader")] - pub allowed_header: Value, -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct ExposeHeader { - #[serde(rename = "ExposeHeader")] - pub expose_header: Value, -} - -impl CorsConfiguration { - pub fn validate(&self) -> Result<(), Error> { - for r in self.cors_rules.iter() { - r.validate()?; - } - Ok(()) - } - - pub fn into_garage_cors_config(self) -> Result, Error> { - Ok(self - .cors_rules - .iter() - .map(CorsRule::to_garage_cors_rule) - .collect()) - } -} - -impl CorsRule { - pub fn validate(&self) -> Result<(), Error> { - for method in self.allowed_methods.iter() { - method - .0 - .parse::() - .ok_or_bad_request("Invalid CORSRule method")?; - } - for header in self - .allowed_headers - .iter() - .chain(self.expose_headers.iter()) - { - header - .0 - .parse::() - .ok_or_bad_request("Invalid HTTP header name")?; - } - Ok(()) - } - - pub fn to_garage_cors_rule(&self) -> GarageCorsRule { - let convert_vec = - |vval: &[Value]| vval.iter().map(|x| x.0.to_owned()).collect::>(); - GarageCorsRule { - id: self.id.as_ref().map(|x| x.0.to_owned()), - max_age_seconds: self.max_age_seconds.as_ref().map(|x| x.0 as u64), - allow_origins: convert_vec(&self.allowed_origins), - allow_methods: convert_vec(&self.allowed_methods), - allow_headers: convert_vec(&self.allowed_headers), - expose_headers: convert_vec(&self.expose_headers), - } - } - - pub fn from_garage_cors_rule(rule: &GarageCorsRule) -> Self { - let convert_vec = |vval: &[String]| { - vval.iter() - .map(|x| Value(x.clone())) - .collect::>() - }; - Self { - id: rule.id.as_ref().map(|x| Value(x.clone())), - max_age_seconds: rule.max_age_seconds.map(|x| IntValue(x as i64)), - allowed_origins: convert_vec(&rule.allow_origins), - allowed_methods: convert_vec(&rule.allow_methods), - allowed_headers: convert_vec(&rule.allow_headers), - expose_headers: convert_vec(&rule.expose_headers), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use quick_xml::de::from_str; - - #[test] - fn test_deserialize() -> Result<(), Error> { - let message = r#" - - - http://www.example.com - - PUT - POST - DELETE - - * - - - * - GET - - - qsdfjklm - 12345 - https://perdu.com - - GET - DELETE - * - * - -"#; - let conf: CorsConfiguration = from_str(message).unwrap(); - let ref_value = CorsConfiguration { - xmlns: (), - cors_rules: vec![ - CorsRule { - id: None, - max_age_seconds: None, - allowed_origins: vec!["http://www.example.com".into()], - allowed_methods: vec!["PUT".into(), "POST".into(), "DELETE".into()], - allowed_headers: vec!["*".into()], - expose_headers: vec![], - 
}, - CorsRule { - id: None, - max_age_seconds: None, - allowed_origins: vec!["*".into()], - allowed_methods: vec!["GET".into()], - allowed_headers: vec![], - expose_headers: vec![], - }, - CorsRule { - id: Some("qsdfjklm".into()), - max_age_seconds: Some(IntValue(12345)), - allowed_origins: vec!["https://perdu.com".into()], - allowed_methods: vec!["GET".into(), "DELETE".into()], - allowed_headers: vec!["*".into()], - expose_headers: vec!["*".into()], - }, - ], - }; - assert_eq! { - ref_value, - conf - }; - - let message2 = to_xml_with_header(&ref_value)?; - - let cleanup = |c: &str| c.replace(char::is_whitespace, ""); - assert_eq!(cleanup(message), cleanup(&message2)); - - Ok(()) - } -} diff --git a/src/api/s3_delete.rs b/src/api/s3_delete.rs deleted file mode 100644 index b243d982..00000000 --- a/src/api/s3_delete.rs +++ /dev/null @@ -1,170 +0,0 @@ -use std::sync::Arc; - -use hyper::{Body, Request, Response, StatusCode}; - -use garage_util::data::*; -use garage_util::time::*; - -use garage_model::garage::Garage; -use garage_model::object_table::*; - -use crate::error::*; -use crate::s3_xml; -use crate::signature::verify_signed_content; - -async fn handle_delete_internal( - garage: &Garage, - bucket_id: Uuid, - key: &str, -) -> Result<(Uuid, Uuid), Error> { - let object = garage - .object_table - .get(&bucket_id, &key.to_string()) - .await? - .ok_or(Error::NoSuchKey)?; // No need to delete - - let interesting_versions = object.versions().iter().filter(|v| { - !matches!( - v.state, - ObjectVersionState::Aborted - | ObjectVersionState::Complete(ObjectVersionData::DeleteMarker) - ) - }); - - let mut version_to_delete = None; - let mut timestamp = now_msec(); - for v in interesting_versions { - if v.timestamp + 1 > timestamp || version_to_delete.is_none() { - version_to_delete = Some(v.uuid); - } - timestamp = std::cmp::max(timestamp, v.timestamp + 1); - } - - let deleted_version = version_to_delete.ok_or(Error::NoSuchKey)?; - - let version_uuid = gen_uuid(); - - let object = Object::new( - bucket_id, - key.into(), - vec![ObjectVersion { - uuid: version_uuid, - timestamp, - state: ObjectVersionState::Complete(ObjectVersionData::DeleteMarker), - }], - ); - - garage.object_table.insert(&object).await?; - - Ok((deleted_version, version_uuid)) -} - -pub async fn handle_delete( - garage: Arc, - bucket_id: Uuid, - key: &str, -) -> Result, Error> { - let (_deleted_version, delete_marker_version) = - handle_delete_internal(&garage, bucket_id, key).await?; - - Ok(Response::builder() - .header("x-amz-version-id", hex::encode(delete_marker_version)) - .status(StatusCode::NO_CONTENT) - .body(Body::from(vec![])) - .unwrap()) -} - -pub async fn handle_delete_objects( - garage: Arc, - bucket_id: Uuid, - req: Request, - content_sha256: Option, -) -> Result, Error> { - let body = hyper::body::to_bytes(req.into_body()).await?; - - if let Some(content_sha256) = content_sha256 { - verify_signed_content(content_sha256, &body[..])?; - } - - let cmd_xml = roxmltree::Document::parse(std::str::from_utf8(&body)?)?; - let cmd = parse_delete_objects_xml(&cmd_xml).ok_or_bad_request("Invalid delete XML query")?; - - let mut ret_deleted = Vec::new(); - let mut ret_errors = Vec::new(); - - for obj in cmd.objects.iter() { - match handle_delete_internal(&garage, bucket_id, &obj.key).await { - Ok((deleted_version, delete_marker_version)) => { - if cmd.quiet { - continue; - } - ret_deleted.push(s3_xml::Deleted { - key: s3_xml::Value(obj.key.clone()), - version_id: s3_xml::Value(hex::encode(deleted_version)), - 
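The timestamp given to the delete marker in handle_delete_internal above follows a simple rule: strictly greater than every existing version timestamp and never earlier than the current time, so the marker always sorts last. A minimal sketch of that computation:

fn delete_marker_timestamp(existing_versions_msec: &[u64], now_msec: u64) -> u64 {
    existing_versions_msec
        .iter()
        .map(|ts| ts + 1)
        .fold(now_msec, u64::max)
}

fn main() {
    // All versions are older than now: the marker is stamped with the current time.
    assert_eq!(delete_marker_timestamp(&[10, 20], 100), 100);
    // A version claims a timestamp in the future: the marker still sorts after it.
    assert_eq!(delete_marker_timestamp(&[10, 200], 100), 201);
}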
delete_marker_version_id: s3_xml::Value(hex::encode(delete_marker_version)), - }); - } - Err(e) => { - ret_errors.push(s3_xml::DeleteError { - code: s3_xml::Value(e.aws_code().to_string()), - key: Some(s3_xml::Value(obj.key.clone())), - message: s3_xml::Value(format!("{}", e)), - version_id: None, - }); - } - } - } - - let xml = s3_xml::to_xml_with_header(&s3_xml::DeleteResult { - xmlns: (), - deleted: ret_deleted, - errors: ret_errors, - })?; - - Ok(Response::builder() - .header("Content-Type", "application/xml") - .body(Body::from(xml))?) -} - -struct DeleteRequest { - quiet: bool, - objects: Vec, -} - -struct DeleteObject { - key: String, -} - -fn parse_delete_objects_xml(xml: &roxmltree::Document) -> Option { - let mut ret = DeleteRequest { - quiet: false, - objects: vec![], - }; - - let root = xml.root(); - let delete = root.first_child()?; - - if !delete.has_tag_name("Delete") { - return None; - } - - for item in delete.children() { - if item.has_tag_name("Object") { - let key = item.children().find(|e| e.has_tag_name("Key"))?; - let key_str = key.text()?; - ret.objects.push(DeleteObject { - key: key_str.to_string(), - }); - } else if item.has_tag_name("Quiet") { - if item.text()? == "true" { - ret.quiet = true; - } else { - ret.quiet = false; - } - } else { - return None; - } - } - - Some(ret) -} diff --git a/src/api/s3_get.rs b/src/api/s3_get.rs deleted file mode 100644 index 7f647e15..00000000 --- a/src/api/s3_get.rs +++ /dev/null @@ -1,461 +0,0 @@ -//! Function related to GET and HEAD requests -use std::sync::Arc; -use std::time::{Duration, UNIX_EPOCH}; - -use futures::stream::*; -use http::header::{ - ACCEPT_RANGES, CONTENT_LENGTH, CONTENT_RANGE, CONTENT_TYPE, ETAG, IF_MODIFIED_SINCE, - IF_NONE_MATCH, LAST_MODIFIED, RANGE, -}; -use hyper::body::Bytes; -use hyper::{Body, Request, Response, StatusCode}; - -use garage_table::EmptyKey; -use garage_util::data::*; - -use garage_model::garage::Garage; -use garage_model::object_table::*; -use garage_model::version_table::*; - -use crate::error::*; - -const X_AMZ_MP_PARTS_COUNT: &str = "x-amz-mp-parts-count"; - -fn object_headers( - version: &ObjectVersion, - version_meta: &ObjectVersionMeta, -) -> http::response::Builder { - debug!("Version meta: {:?}", version_meta); - - let date = UNIX_EPOCH + Duration::from_millis(version.timestamp); - let date_str = httpdate::fmt_http_date(date); - - let mut resp = Response::builder() - .header(CONTENT_TYPE, version_meta.headers.content_type.to_string()) - .header(LAST_MODIFIED, date_str) - .header(ACCEPT_RANGES, "bytes".to_string()); - - if !version_meta.etag.is_empty() { - resp = resp.header(ETAG, format!("\"{}\"", version_meta.etag)); - } - - for (k, v) in version_meta.headers.other.iter() { - resp = resp.header(k, v.to_string()); - } - - resp -} - -fn try_answer_cached( - version: &ObjectVersion, - version_meta: &ObjectVersionMeta, - req: &Request, -) -> Option> { - // It is possible, and is even usually the case, [that both If-None-Match and - // If-Modified-Since] are present in a request. In this situation If-None-Match takes - // precedence and If-Modified-Since is ignored (as per 6.Precedence from rfc7232). 
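An illustrative usage of the DeleteObjects XML parsing above, assuming the roxmltree crate already used by the removed module (the payload is an invented example, not taken from the AWS documentation):

fn main() {
    let body = r#"<Delete>
    <Quiet>true</Quiet>
    <Object><Key>photos/cat.jpg</Key></Object>
    <Object><Key>photos/dog.jpg</Key></Object>
</Delete>"#;

    let doc = roxmltree::Document::parse(body).unwrap();
    let delete = doc.root().first_child().unwrap();
    assert!(delete.has_tag_name("Delete"));

    // Collect the <Key> text of every <Object> child, skipping whitespace nodes.
    let keys: Vec<&str> = delete
        .children()
        .filter(|node| node.has_tag_name("Object"))
        .filter_map(|node| node.children().find(|child| child.has_tag_name("Key")))
        .filter_map(|key| key.text())
        .collect();
    assert_eq!(keys, vec!["photos/cat.jpg", "photos/dog.jpg"]);

    // <Quiet>true</Quiet> suppresses per-key success entries in the response.
    let quiet = delete
        .children()
        .find(|node| node.has_tag_name("Quiet"))
        .and_then(|node| node.text())
        == Some("true");
    assert!(quiet);
}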
The rational - // being that etag based matching is more accurate, it has no issue with sub-second precision - // for instance (in case of very fast updates) - let cached = if let Some(none_match) = req.headers().get(IF_NONE_MATCH) { - let none_match = none_match.to_str().ok()?; - let expected = format!("\"{}\"", version_meta.etag); - let found = none_match - .split(',') - .map(str::trim) - .any(|etag| etag == expected || etag == "\"*\""); - found - } else if let Some(modified_since) = req.headers().get(IF_MODIFIED_SINCE) { - let modified_since = modified_since.to_str().ok()?; - let client_date = httpdate::parse_http_date(modified_since).ok()?; - let server_date = UNIX_EPOCH + Duration::from_millis(version.timestamp); - client_date >= server_date - } else { - false - }; - - if cached { - Some( - Response::builder() - .status(StatusCode::NOT_MODIFIED) - .body(Body::empty()) - .unwrap(), - ) - } else { - None - } -} - -/// Handle HEAD request -pub async fn handle_head( - garage: Arc, - req: &Request, - bucket_id: Uuid, - key: &str, - part_number: Option, -) -> Result, Error> { - let object = garage - .object_table - .get(&bucket_id, &key.to_string()) - .await? - .ok_or(Error::NoSuchKey)?; - - let object_version = object - .versions() - .iter() - .rev() - .find(|v| v.is_data()) - .ok_or(Error::NoSuchKey)?; - - let version_data = match &object_version.state { - ObjectVersionState::Complete(c) => c, - _ => unreachable!(), - }; - - let version_meta = match version_data { - ObjectVersionData::Inline(meta, _) => meta, - ObjectVersionData::FirstBlock(meta, _) => meta, - _ => unreachable!(), - }; - - if let Some(cached) = try_answer_cached(object_version, version_meta, req) { - return Ok(cached); - } - - if let Some(pn) = part_number { - match version_data { - ObjectVersionData::Inline(_, bytes) => { - if pn != 1 { - return Err(Error::InvalidPart); - } - Ok(object_headers(object_version, version_meta) - .header(CONTENT_LENGTH, format!("{}", bytes.len())) - .header( - CONTENT_RANGE, - format!("bytes 0-{}/{}", bytes.len() - 1, bytes.len()), - ) - .header(X_AMZ_MP_PARTS_COUNT, "1") - .status(StatusCode::PARTIAL_CONTENT) - .body(Body::empty())?) - } - ObjectVersionData::FirstBlock(_, _) => { - let version = garage - .version_table - .get(&object_version.uuid, &EmptyKey) - .await? - .ok_or(Error::NoSuchKey)?; - - let (part_offset, part_end) = - calculate_part_bounds(&version, pn).ok_or(Error::InvalidPart)?; - let n_parts = version.parts_etags.items().len(); - - Ok(object_headers(object_version, version_meta) - .header(CONTENT_LENGTH, format!("{}", part_end - part_offset)) - .header( - CONTENT_RANGE, - format!( - "bytes {}-{}/{}", - part_offset, - part_end - 1, - version_meta.size - ), - ) - .header(X_AMZ_MP_PARTS_COUNT, format!("{}", n_parts)) - .status(StatusCode::PARTIAL_CONTENT) - .body(Body::empty())?) - } - _ => unreachable!(), - } - } else { - Ok(object_headers(object_version, version_meta) - .header(CONTENT_LENGTH, format!("{}", version_meta.size)) - .status(StatusCode::OK) - .body(Body::empty())?) - } -} - -/// Handle GET request -pub async fn handle_get( - garage: Arc, - req: &Request, - bucket_id: Uuid, - key: &str, - part_number: Option, -) -> Result, Error> { - let object = garage - .object_table - .get(&bucket_id, &key.to_string()) - .await? 
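A standalone sketch of the If-None-Match evaluation above, which takes precedence over If-Modified-Since: the stored ETag is compared against each entry of the client's comma-separated list.

fn not_modified(if_none_match: Option<&str>, etag: &str) -> bool {
    match if_none_match {
        Some(list) => {
            let expected = format!("\"{}\"", etag);
            list.split(',')
                .map(str::trim)
                .any(|candidate| candidate == expected || candidate == "\"*\"")
        }
        None => false,
    }
}

fn main() {
    assert!(not_modified(Some(r#""abc", "def""#), "def"));
    assert!(not_modified(Some(r#""*""#), "anything"));
    assert!(!not_modified(Some(r#""abc""#), "def"));
}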
- .ok_or(Error::NoSuchKey)?; - - let last_v = object - .versions() - .iter() - .rev() - .find(|v| v.is_complete()) - .ok_or(Error::NoSuchKey)?; - - let last_v_data = match &last_v.state { - ObjectVersionState::Complete(x) => x, - _ => unreachable!(), - }; - let last_v_meta = match last_v_data { - ObjectVersionData::DeleteMarker => return Err(Error::NoSuchKey), - ObjectVersionData::Inline(meta, _) => meta, - ObjectVersionData::FirstBlock(meta, _) => meta, - }; - - if let Some(cached) = try_answer_cached(last_v, last_v_meta, req) { - return Ok(cached); - } - - match (part_number, parse_range_header(req, last_v_meta.size)?) { - (Some(_), Some(_)) => { - return Err(Error::BadRequest( - "Cannot specify both partNumber and Range header".into(), - )); - } - (Some(pn), None) => { - return handle_get_part(garage, last_v, last_v_data, last_v_meta, pn).await; - } - (None, Some(range)) => { - return handle_get_range( - garage, - last_v, - last_v_data, - last_v_meta, - range.start, - range.start + range.length, - ) - .await; - } - (None, None) => (), - } - - let resp_builder = object_headers(last_v, last_v_meta) - .header(CONTENT_LENGTH, format!("{}", last_v_meta.size)) - .status(StatusCode::OK); - - match &last_v_data { - ObjectVersionData::DeleteMarker => unreachable!(), - ObjectVersionData::Inline(_, bytes) => { - let body: Body = Body::from(bytes.to_vec()); - Ok(resp_builder.body(body)?) - } - ObjectVersionData::FirstBlock(_, first_block_hash) => { - let read_first_block = garage.block_manager.rpc_get_block(first_block_hash); - let get_next_blocks = garage.version_table.get(&last_v.uuid, &EmptyKey); - - let (first_block, version) = futures::try_join!(read_first_block, get_next_blocks)?; - let version = version.ok_or(Error::NoSuchKey)?; - - let mut blocks = version - .blocks - .items() - .iter() - .map(|(_, vb)| (vb.hash, None)) - .collect::>(); - blocks[0].1 = Some(first_block); - - let body_stream = futures::stream::iter(blocks) - .map(move |(hash, data_opt)| { - let garage = garage.clone(); - async move { - if let Some(data) = data_opt { - Ok(Bytes::from(data)) - } else { - garage - .block_manager - .rpc_get_block(&hash) - .await - .map(Bytes::from) - } - } - }) - .buffered(2); - - let body = hyper::body::Body::wrap_stream(body_stream); - Ok(resp_builder.body(body)?) - } - } -} - -async fn handle_get_range( - garage: Arc, - version: &ObjectVersion, - version_data: &ObjectVersionData, - version_meta: &ObjectVersionMeta, - begin: u64, - end: u64, -) -> Result, Error> { - let resp_builder = object_headers(version, version_meta) - .header(CONTENT_LENGTH, format!("{}", end - begin)) - .header( - CONTENT_RANGE, - format!("bytes {}-{}/{}", begin, end - 1, version_meta.size), - ) - .status(StatusCode::PARTIAL_CONTENT); - - match &version_data { - ObjectVersionData::DeleteMarker => unreachable!(), - ObjectVersionData::Inline(_meta, bytes) => { - if end as usize <= bytes.len() { - let body: Body = Body::from(bytes[begin as usize..end as usize].to_vec()); - Ok(resp_builder.body(body)?) - } else { - None.ok_or_internal_error( - "Requested range not present in inline bytes when it should have been", - ) - } - } - ObjectVersionData::FirstBlock(_meta, _first_block_hash) => { - let version = garage - .version_table - .get(&version.uuid, &EmptyKey) - .await? - .ok_or(Error::NoSuchKey)?; - - let body = body_from_blocks_range(garage, version.blocks.items(), begin, end); - Ok(resp_builder.body(body)?) 
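The partial responses built above all use the same header convention; a minimal sketch of the arithmetic, going from a half-open byte range [begin, end) to the inclusive Content-Range form:

fn range_headers(begin: u64, end: u64, total_size: u64) -> (String, String) {
    // Returns (Content-Length, Content-Range) for the half-open range [begin, end).
    (
        format!("{}", end - begin),
        format!("bytes {}-{}/{}", begin, end - 1, total_size),
    )
}

fn main() {
    let (content_length, content_range) = range_headers(0, 1024, 4096);
    assert_eq!(content_length, "1024");
    assert_eq!(content_range, "bytes 0-1023/4096");
}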
- } - } -} - -async fn handle_get_part( - garage: Arc, - object_version: &ObjectVersion, - version_data: &ObjectVersionData, - version_meta: &ObjectVersionMeta, - part_number: u64, -) -> Result, Error> { - let resp_builder = - object_headers(object_version, version_meta).status(StatusCode::PARTIAL_CONTENT); - - match version_data { - ObjectVersionData::Inline(_, bytes) => { - if part_number != 1 { - return Err(Error::InvalidPart); - } - Ok(resp_builder - .header(CONTENT_LENGTH, format!("{}", bytes.len())) - .header( - CONTENT_RANGE, - format!("bytes {}-{}/{}", 0, bytes.len() - 1, bytes.len()), - ) - .header(X_AMZ_MP_PARTS_COUNT, "1") - .body(Body::from(bytes.to_vec()))?) - } - ObjectVersionData::FirstBlock(_, _) => { - let version = garage - .version_table - .get(&object_version.uuid, &EmptyKey) - .await? - .ok_or(Error::NoSuchKey)?; - - let (begin, end) = - calculate_part_bounds(&version, part_number).ok_or(Error::InvalidPart)?; - let n_parts = version.parts_etags.items().len(); - - let body = body_from_blocks_range(garage, version.blocks.items(), begin, end); - - Ok(resp_builder - .header(CONTENT_LENGTH, format!("{}", end - begin)) - .header( - CONTENT_RANGE, - format!("bytes {}-{}/{}", begin, end - 1, version_meta.size), - ) - .header(X_AMZ_MP_PARTS_COUNT, format!("{}", n_parts)) - .body(body)?) - } - _ => unreachable!(), - } -} - -fn parse_range_header( - req: &Request, - total_size: u64, -) -> Result, Error> { - let range = match req.headers().get(RANGE) { - Some(range) => { - let range_str = range.to_str()?; - let mut ranges = - http_range::HttpRange::parse(range_str, total_size).map_err(|e| (e, total_size))?; - if ranges.len() > 1 { - // garage does not support multi-range requests yet, so we respond with the entire - // object when multiple ranges are requested - None - } else { - ranges.pop() - } - } - None => None, - }; - Ok(range) -} - -fn calculate_part_bounds(v: &Version, part_number: u64) -> Option<(u64, u64)> { - let mut offset = 0; - for (i, (bk, bv)) in v.blocks.items().iter().enumerate() { - if bk.part_number == part_number { - let size: u64 = v.blocks.items()[i..] 
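A sketch of the Range header handling in parse_range_header above, assuming the http-range crate's HttpRange::parse API with start/length fields as used by the removed module; multi-range requests fall back to serving the whole object:

fn parse_single_range(header: &str, total_size: u64) -> Option<http_range::HttpRange> {
    let mut ranges = http_range::HttpRange::parse(header, total_size).ok()?;
    if ranges.len() > 1 {
        // Multi-range requests are not supported: serve the whole object instead.
        None
    } else {
        ranges.pop()
    }
}

fn main() {
    let r = parse_single_range("bytes=0-499", 1000).unwrap();
    assert_eq!((r.start, r.length), (0, 500));
    assert!(parse_single_range("bytes=0-1,5-9", 1000).is_none());
}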
- .iter() - .take_while(|(k, _)| k.part_number == part_number) - .map(|(_, v)| v.size) - .sum(); - return Some((offset, offset + size)); - } - offset += bv.size; - } - None -} - -fn body_from_blocks_range( - garage: Arc, - all_blocks: &[(VersionBlockKey, VersionBlock)], - begin: u64, - end: u64, -) -> Body { - // We will store here the list of blocks that have an intersection with the requested - // range, as well as their "true offset", which is their actual offset in the complete - // file (whereas block.offset designates the offset of the block WITHIN THE PART - // block.part_number, which is not the same in the case of a multipart upload) - let mut blocks: Vec<(VersionBlock, u64)> = Vec::with_capacity(std::cmp::min( - all_blocks.len(), - 4 + ((end - begin) / std::cmp::max(all_blocks[0].1.size as u64, 1024)) as usize, - )); - let mut true_offset = 0; - for (_, b) in all_blocks.iter() { - if true_offset >= end { - break; - } - // Keep only blocks that have an intersection with the requested range - if true_offset < end && true_offset + b.size > begin { - blocks.push((*b, true_offset)); - } - true_offset += b.size; - } - - let body_stream = futures::stream::iter(blocks) - .map(move |(block, true_offset)| { - let garage = garage.clone(); - async move { - let data = garage.block_manager.rpc_get_block(&block.hash).await?; - let data = Bytes::from(data); - let start_in_block = if true_offset > begin { - 0 - } else { - begin - true_offset - }; - let end_in_block = if true_offset + block.size < end { - block.size - } else { - end - true_offset - }; - Result::::Ok( - data.slice(start_in_block as usize..end_in_block as usize), - ) - } - }) - .buffered(2); - - hyper::body::Body::wrap_stream(body_stream) -} diff --git a/src/api/s3_list.rs b/src/api/s3_list.rs deleted file mode 100644 index 5852fc1b..00000000 --- a/src/api/s3_list.rs +++ /dev/null @@ -1,1383 +0,0 @@ -use std::cmp::Ordering; -use std::collections::{BTreeMap, BTreeSet}; -use std::iter::{Iterator, Peekable}; -use std::sync::Arc; - -use hyper::{Body, Response}; - -use garage_util::data::*; -use garage_util::error::Error as GarageError; -use garage_util::time::*; - -use garage_model::garage::Garage; -use garage_model::object_table::*; -use garage_model::version_table::Version; - -use garage_table::EmptyKey; - -use crate::encoding::*; -use crate::error::*; -use crate::s3_put; -use crate::s3_xml; - -const DUMMY_NAME: &str = "Dummy Key"; -const DUMMY_KEY: &str = "GKDummyKey"; - -#[derive(Debug)] -pub struct ListQueryCommon { - pub bucket_name: String, - pub bucket_id: Uuid, - pub delimiter: Option, - pub page_size: usize, - pub prefix: String, - pub urlencode_resp: bool, -} - -#[derive(Debug)] -pub struct ListObjectsQuery { - pub is_v2: bool, - pub marker: Option, - pub continuation_token: Option, - pub start_after: Option, - pub common: ListQueryCommon, -} - -#[derive(Debug)] -pub struct ListMultipartUploadsQuery { - pub key_marker: Option, - pub upload_id_marker: Option, - pub common: ListQueryCommon, -} - -#[derive(Debug)] -pub struct ListPartsQuery { - pub bucket_name: String, - pub bucket_id: Uuid, - pub key: String, - pub upload_id: String, - pub part_number_marker: Option, - pub max_parts: u64, -} - -pub async fn handle_list( - garage: Arc, - query: &ListObjectsQuery, -) -> Result, Error> { - let io = |bucket, key, count| { - let t = &garage.object_table; - async move { - t.get_range(&bucket, key, Some(ObjectFilter::IsData), count) - .await - } - }; - - debug!("ListObjects {:?}", query); - let mut acc = query.build_accumulator(); - 
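A standalone sketch of calculate_part_bounds above, with plain (part_number, size) pairs standing in for Garage's VersionBlockKey and VersionBlock types:

fn part_bounds(blocks: &[(u64, u64)], part_number: u64) -> Option<(u64, u64)> {
    // `blocks` is a list of (part_number, size) pairs, sorted by part number.
    let mut offset = 0;
    for (i, &(part, size)) in blocks.iter().enumerate() {
        if part == part_number {
            // Sum the sizes of all consecutive blocks belonging to this part.
            let part_size: u64 = blocks[i..]
                .iter()
                .take_while(|(p, _)| *p == part_number)
                .map(|(_, s)| s)
                .sum();
            return Some((offset, offset + part_size));
        }
        offset += size;
    }
    None
}

fn main() {
    // Part 1 was uploaded as two 10-byte blocks, part 2 as a single 5-byte block.
    let blocks = [(1, 10), (1, 10), (2, 5)];
    assert_eq!(part_bounds(&blocks, 1), Some((0, 20)));
    assert_eq!(part_bounds(&blocks, 2), Some((20, 25)));
    assert_eq!(part_bounds(&blocks, 3), None);
}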
let pagination = fetch_list_entries(&query.common, query.begin()?, &mut acc, &io).await?; - - let result = s3_xml::ListBucketResult { - xmlns: (), - // Sending back request information - name: s3_xml::Value(query.common.bucket_name.to_string()), - prefix: uriencode_maybe(&query.common.prefix, query.common.urlencode_resp), - max_keys: s3_xml::IntValue(query.common.page_size as i64), - delimiter: query - .common - .delimiter - .as_ref() - .map(|x| uriencode_maybe(x, query.common.urlencode_resp)), - encoding_type: match query.common.urlencode_resp { - true => Some(s3_xml::Value("url".to_string())), - false => None, - }, - marker: match (!query.is_v2, &query.marker) { - (true, Some(k)) => Some(uriencode_maybe(k, query.common.urlencode_resp)), - _ => None, - }, - start_after: match (query.is_v2, &query.start_after) { - (true, Some(sa)) => Some(uriencode_maybe(sa, query.common.urlencode_resp)), - _ => None, - }, - continuation_token: match (query.is_v2, &query.continuation_token) { - (true, Some(ct)) => Some(s3_xml::Value(ct.to_string())), - _ => None, - }, - - // Pagination - is_truncated: s3_xml::Value(format!("{}", pagination.is_some())), - key_count: Some(s3_xml::IntValue( - acc.keys.len() as i64 + acc.common_prefixes.len() as i64, - )), - next_marker: match (!query.is_v2, &pagination) { - (true, Some(RangeBegin::AfterKey { key: k })) - | ( - true, - Some(RangeBegin::IncludingKey { - fallback_key: Some(k), - .. - }), - ) => Some(uriencode_maybe(k, query.common.urlencode_resp)), - _ => None, - }, - next_continuation_token: match (query.is_v2, &pagination) { - (true, Some(RangeBegin::AfterKey { key })) => Some(s3_xml::Value(format!( - "]{}", - base64::encode(key.as_bytes()) - ))), - (true, Some(RangeBegin::IncludingKey { key, .. })) => Some(s3_xml::Value(format!( - "[{}", - base64::encode(key.as_bytes()) - ))), - _ => None, - }, - - // Body - contents: acc - .keys - .iter() - .map(|(key, info)| s3_xml::ListBucketItem { - key: uriencode_maybe(key, query.common.urlencode_resp), - last_modified: s3_xml::Value(msec_to_rfc3339(info.last_modified)), - size: s3_xml::IntValue(info.size as i64), - etag: s3_xml::Value(format!("\"{}\"", info.etag)), - storage_class: s3_xml::Value("STANDARD".to_string()), - }) - .collect(), - common_prefixes: acc - .common_prefixes - .iter() - .map(|pfx| s3_xml::CommonPrefix { - prefix: uriencode_maybe(pfx, query.common.urlencode_resp), - }) - .collect(), - }; - - let xml = s3_xml::to_xml_with_header(&result)?; - Ok(Response::builder() - .header("Content-Type", "application/xml") - .body(Body::from(xml.into_bytes()))?) 
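The V2 continuation tokens built above are opaque to clients; a sketch of the scheme, assuming the base64 crate's top-level encode/decode functions as called in the code above ('[' means resume including the key, ']' means resume after it):

fn encode_token(key: &str, include_key: bool) -> String {
    let marker = if include_key { "[" } else { "]" };
    format!("{}{}", marker, base64::encode(key.as_bytes()))
}

fn decode_token(token: &str) -> Option<(String, bool)> {
    let include_key = match token.get(..1)? {
        "[" => true,
        "]" => false,
        _ => return None,
    };
    let key = String::from_utf8(base64::decode(&token[1..]).ok()?).ok()?;
    Some((key, include_key))
}

fn main() {
    let token = encode_token("photos/cat.jpg", false);
    assert_eq!(decode_token(&token), Some(("photos/cat.jpg".to_string(), false)));
}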
-} - -pub async fn handle_list_multipart_upload( - garage: Arc, - query: &ListMultipartUploadsQuery, -) -> Result, Error> { - let io = |bucket, key, count| { - let t = &garage.object_table; - async move { - t.get_range(&bucket, key, Some(ObjectFilter::IsUploading), count) - .await - } - }; - - debug!("ListMultipartUploads {:?}", query); - let mut acc = query.build_accumulator(); - let pagination = fetch_list_entries(&query.common, query.begin()?, &mut acc, &io).await?; - - let result = s3_xml::ListMultipartUploadsResult { - xmlns: (), - - // Sending back some information about the request - bucket: s3_xml::Value(query.common.bucket_name.to_string()), - prefix: uriencode_maybe(&query.common.prefix, query.common.urlencode_resp), - delimiter: query - .common - .delimiter - .as_ref() - .map(|d| uriencode_maybe(d, query.common.urlencode_resp)), - max_uploads: s3_xml::IntValue(query.common.page_size as i64), - key_marker: query - .key_marker - .as_ref() - .map(|m| uriencode_maybe(m, query.common.urlencode_resp)), - upload_id_marker: query - .upload_id_marker - .as_ref() - .map(|m| s3_xml::Value(m.to_string())), - encoding_type: match query.common.urlencode_resp { - true => Some(s3_xml::Value("url".to_string())), - false => None, - }, - - // Handling pagination - is_truncated: s3_xml::Value(format!("{}", pagination.is_some())), - next_key_marker: match &pagination { - None => None, - Some(RangeBegin::AfterKey { key }) - | Some(RangeBegin::AfterUpload { key, .. }) - | Some(RangeBegin::IncludingKey { key, .. }) => { - Some(uriencode_maybe(key, query.common.urlencode_resp)) - } - }, - next_upload_id_marker: match pagination { - Some(RangeBegin::AfterUpload { upload, .. }) => { - Some(s3_xml::Value(hex::encode(upload))) - } - Some(RangeBegin::IncludingKey { .. }) => Some(s3_xml::Value("include".to_string())), - _ => None, - }, - - // Result body - upload: acc - .keys - .iter() - .map(|(uuid, info)| s3_xml::ListMultipartItem { - initiated: s3_xml::Value(msec_to_rfc3339(info.timestamp)), - key: uriencode_maybe(&info.key, query.common.urlencode_resp), - upload_id: s3_xml::Value(hex::encode(uuid)), - storage_class: s3_xml::Value("STANDARD".to_string()), - initiator: s3_xml::Initiator { - display_name: s3_xml::Value(DUMMY_NAME.to_string()), - id: s3_xml::Value(DUMMY_KEY.to_string()), - }, - owner: s3_xml::Owner { - display_name: s3_xml::Value(DUMMY_NAME.to_string()), - id: s3_xml::Value(DUMMY_KEY.to_string()), - }, - }) - .collect(), - common_prefixes: acc - .common_prefixes - .iter() - .map(|c| s3_xml::CommonPrefix { - prefix: s3_xml::Value(c.to_string()), - }) - .collect(), - }; - - let xml = s3_xml::to_xml_with_header(&result)?; - - Ok(Response::builder() - .header("Content-Type", "application/xml") - .body(Body::from(xml.into_bytes()))?) 
-} - -pub async fn handle_list_parts( - garage: Arc, - query: &ListPartsQuery, -) -> Result, Error> { - debug!("ListParts {:?}", query); - - let upload_id = s3_put::decode_upload_id(&query.upload_id)?; - - let (object, version) = futures::try_join!( - garage.object_table.get(&query.bucket_id, &query.key), - garage.version_table.get(&upload_id, &EmptyKey), - )?; - - let (info, next) = fetch_part_info(query, object, version, upload_id)?; - - let result = s3_xml::ListPartsResult { - xmlns: (), - bucket: s3_xml::Value(query.bucket_name.to_string()), - key: s3_xml::Value(query.key.to_string()), - upload_id: s3_xml::Value(query.upload_id.to_string()), - part_number_marker: query.part_number_marker.map(|e| s3_xml::IntValue(e as i64)), - next_part_number_marker: next.map(|e| s3_xml::IntValue(e as i64)), - max_parts: s3_xml::IntValue(query.max_parts as i64), - is_truncated: s3_xml::Value(next.map(|_| "true").unwrap_or("false").to_string()), - parts: info - .iter() - .map(|part| s3_xml::PartItem { - etag: s3_xml::Value(format!("\"{}\"", part.etag)), - last_modified: s3_xml::Value(msec_to_rfc3339(part.timestamp)), - part_number: s3_xml::IntValue(part.part_number as i64), - size: s3_xml::IntValue(part.size as i64), - }) - .collect(), - initiator: s3_xml::Initiator { - display_name: s3_xml::Value(DUMMY_NAME.to_string()), - id: s3_xml::Value(DUMMY_KEY.to_string()), - }, - owner: s3_xml::Owner { - display_name: s3_xml::Value(DUMMY_NAME.to_string()), - id: s3_xml::Value(DUMMY_KEY.to_string()), - }, - storage_class: s3_xml::Value("STANDARD".to_string()), - }; - - let xml = s3_xml::to_xml_with_header(&result)?; - - Ok(Response::builder() - .header("Content-Type", "application/xml") - .body(Body::from(xml.into_bytes()))?) -} - -/* - * Private enums and structs - */ - -#[derive(Debug)] -struct ObjectInfo { - last_modified: u64, - size: u64, - etag: String, -} - -#[derive(Debug, PartialEq)] -struct UploadInfo { - key: String, - timestamp: u64, -} - -#[derive(Debug, PartialEq)] -struct PartInfo { - etag: String, - timestamp: u64, - part_number: u64, - size: u64, -} - -enum ExtractionResult { - NoMore, - Filled, - FilledAtUpload { - key: String, - upload: Uuid, - }, - Extracted { - key: String, - }, - // Fallback key is used for legacy APIs that only support - // exlusive pagination (and not inclusive one). - SkipTo { - key: String, - fallback_key: Option, - }, -} - -#[derive(PartialEq, Clone, Debug)] -enum RangeBegin { - // Fallback key is used for legacy APIs that only support - // exlusive pagination (and not inclusive one). - IncludingKey { - key: String, - fallback_key: Option, - }, - AfterKey { - key: String, - }, - AfterUpload { - key: String, - upload: Uuid, - }, -} -type Pagination = Option; - -/* - * Fetch list entries - */ - -async fn fetch_list_entries( - query: &ListQueryCommon, - begin: RangeBegin, - acc: &mut impl ExtractAccumulator, - mut io: F, -) -> Result -where - R: futures::Future, GarageError>>, - F: FnMut(Uuid, Option, usize) -> R, -{ - let mut cursor = begin; - // +1 is needed as we may need to skip the 1st key - // (range is inclusive while most S3 requests are exclusive) - let count = query.page_size + 1; - - loop { - let start_key = match cursor { - RangeBegin::AfterKey { ref key } - | RangeBegin::AfterUpload { ref key, .. } - | RangeBegin::IncludingKey { ref key, .. 
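A simplified, synchronous sketch of the cursor semantics defined by RangeBegin and used by fetch_list_entries: the table only supports inclusive start keys, so one extra entry is fetched and the start key itself is dropped when resuming strictly after it.

#[derive(Clone, Debug, PartialEq)]
enum Cursor {
    IncludingKey(String),
    AfterKey(String),
}

fn page<'a>(sorted_keys: &[&'a str], cursor: &Cursor, page_size: usize) -> Vec<&'a str> {
    let start = match cursor {
        Cursor::IncludingKey(k) | Cursor::AfterKey(k) => k.as_str(),
    };
    // Fetch page_size + 1 entries starting at the (inclusive) start key.
    let mut out: Vec<&str> = sorted_keys
        .iter()
        .copied()
        .filter(|k| *k >= start)
        .take(page_size + 1)
        .collect();
    // Only AfterKey requires dropping the start key when it comes back first.
    if let (Cursor::AfterKey(k), Some(first)) = (cursor, out.first()) {
        if *first == k.as_str() {
            out.remove(0);
        }
    }
    out.truncate(page_size);
    out
}

fn main() {
    let keys = ["a", "b", "c", "d"];
    assert_eq!(page(&keys, &Cursor::AfterKey("b".into()), 2), vec!["c", "d"]);
    assert_eq!(page(&keys, &Cursor::IncludingKey("b".into()), 2), vec!["b", "c"]);
}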
} => Some(key.clone()), - }; - - // Fetch objects - let objects = io(query.bucket_id, start_key.clone(), count).await?; - - debug!( - "List: get range {:?} (max {}), results: {}", - start_key, - count, - objects.len() - ); - let server_more = objects.len() >= count; - - let prev_req_cursor = cursor.clone(); - let mut iter = objects.iter().peekable(); - - // Drop the first key if needed - // Only AfterKey requires it according to the S3 spec and our implem. - match (&cursor, iter.peek()) { - (RangeBegin::AfterKey { key }, Some(object)) if &object.key == key => iter.next(), - (_, _) => None, - }; - - while let Some(object) = iter.peek() { - if !object.key.starts_with(&query.prefix) { - // If the key is not in the requested prefix, we're done - return Ok(None); - } - - cursor = match acc.extract(query, &cursor, &mut iter) { - ExtractionResult::Extracted { key } => RangeBegin::AfterKey { key }, - ExtractionResult::SkipTo { key, fallback_key } => { - RangeBegin::IncludingKey { key, fallback_key } - } - ExtractionResult::FilledAtUpload { key, upload } => { - return Ok(Some(RangeBegin::AfterUpload { key, upload })) - } - ExtractionResult::Filled => return Ok(Some(cursor)), - ExtractionResult::NoMore => return Ok(None), - }; - } - - if !server_more { - // We did not fully fill the accumulator despite exhausting all the data we have, - // we're done - return Ok(None); - } - - if prev_req_cursor == cursor { - unreachable!("No progress has been done in the loop. This is a bug, please report it."); - } - } -} - -fn fetch_part_info( - query: &ListPartsQuery, - object: Option, - version: Option, - upload_id: Uuid, -) -> Result<(Vec, Option), Error> { - // Check results - let object = object.ok_or(Error::NoSuchKey)?; - - let obj_version = object - .versions() - .iter() - .find(|v| v.uuid == upload_id && v.is_uploading()) - .ok_or(Error::NoSuchUpload)?; - - let version = version.ok_or(Error::NoSuchKey)?; - - // Cut the beginning of our 2 vectors if required - let (etags, blocks) = match &query.part_number_marker { - Some(marker) => { - let next = marker + 1; - - let part_idx = into_ok_or_err( - version - .parts_etags - .items() - .binary_search_by(|(part_num, _)| part_num.cmp(&next)), - ); - let parts = &version.parts_etags.items()[part_idx..]; - - let block_idx = into_ok_or_err( - version - .blocks - .items() - .binary_search_by(|(vkey, _)| vkey.part_number.cmp(&next)), - ); - let blocks = &version.blocks.items()[block_idx..]; - - (parts, blocks) - } - None => (version.parts_etags.items(), version.blocks.items()), - }; - - // Use the block vector to compute a (part_number, size) vector - let mut size = Vec::<(u64, u64)>::new(); - blocks.iter().for_each(|(key, val)| { - let mut new_size = val.size; - match size.pop() { - Some((part_number, size)) if part_number == key.part_number => new_size += size, - Some(v) => size.push(v), - None => (), - } - size.push((key.part_number, new_size)) - }); - - // Merge the etag vector and size vector to build a PartInfo vector - let max_parts = query.max_parts as usize; - let (mut etag_iter, mut size_iter) = (etags.iter().peekable(), size.iter().peekable()); - - let mut info = Vec::::with_capacity(max_parts); - - while info.len() < max_parts { - match (etag_iter.peek(), size_iter.peek()) { - (Some((ep, etag)), Some((sp, size))) => match ep.cmp(sp) { - Ordering::Less => { - debug!("ETag information ignored due to missing corresponding block information. 
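A standalone sketch of the part_number_marker handling in fetch_part_info above: binary-search the sorted part list for marker + 1 and keep the tail, whether or not that exact part number still exists.

fn parts_after<'a>(sorted_parts: &[(u64, &'a str)], marker: Option<u64>) -> Vec<(u64, &'a str)> {
    let start = match marker {
        Some(m) => {
            let next = m + 1;
            // Ok: the part exists, start right at it; Err: it does not, start at the
            // position where it would have been inserted. Either way, a valid index.
            match sorted_parts.binary_search_by(|(p, _)| p.cmp(&next)) {
                Ok(i) | Err(i) => i,
            }
        }
        None => 0,
    };
    sorted_parts[start..].to_vec()
}

fn main() {
    let parts = [(1, "etag-1"), (2, "etag-2"), (4, "etag-4")];
    // Marker 2 means "start at part 3"; part 3 does not exist, so part 4 is next.
    assert_eq!(parts_after(&parts, Some(2)), vec![(4, "etag-4")]);
    assert_eq!(parts_after(&parts, None).len(), 3);
}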
Query: {:?}", query); - etag_iter.next(); - } - Ordering::Equal => { - info.push(PartInfo { - etag: etag.to_string(), - timestamp: obj_version.timestamp, - part_number: *ep, - size: *size, - }); - etag_iter.next(); - size_iter.next(); - } - Ordering::Greater => { - debug!("Block information ignored due to missing corresponding ETag information. Query: {:?}", query); - size_iter.next(); - } - }, - (None, None) => return Ok((info, None)), - _ => { - debug!( - "Additional block or ETag information ignored. Query: {:?}", - query - ); - return Ok((info, None)); - } - } - } - - match info.last() { - Some(part_info) => { - let pagination = Some(part_info.part_number); - Ok((info, pagination)) - } - None => Ok((info, None)), - } -} - -/* - * ListQuery logic - */ - -/// Determine the key from where we want to start fetch objects from the database -/// -/// We choose whether the object at this key must -/// be included or excluded from the response. -/// This key can be the prefix in the base case, or intermediate -/// points in the dataset if we are continuing a previous listing. -impl ListObjectsQuery { - fn build_accumulator(&self) -> Accumulator { - Accumulator::::new(self.common.page_size) - } - - fn begin(&self) -> Result { - if self.is_v2 { - match (&self.continuation_token, &self.start_after) { - // In V2 mode, the continuation token is defined as an opaque - // string in the spec, so we can do whatever we want with it. - // In our case, it is defined as either [ or ] (for include - // representing the key to start with. - (Some(token), _) => match &token[..1] { - "[" => Ok(RangeBegin::IncludingKey { - key: String::from_utf8(base64::decode(token[1..].as_bytes())?)?, - fallback_key: None, - }), - "]" => Ok(RangeBegin::AfterKey { - key: String::from_utf8(base64::decode(token[1..].as_bytes())?)?, - }), - _ => Err(Error::BadRequest("Invalid continuation token".to_string())), - }, - - // StartAfter has defined semantics in the spec: - // start listing at the first key immediately after. - (_, Some(key)) => Ok(RangeBegin::AfterKey { - key: key.to_string(), - }), - - // In the case where neither is specified, we start - // listing at the specified prefix. If an object has this - // exact same key, we include it. (@TODO is this correct?) - _ => Ok(RangeBegin::IncludingKey { - key: self.common.prefix.to_string(), - fallback_key: None, - }), - } - } else { - match &self.marker { - // In V1 mode, the spec defines the Marker value to mean - // the same thing as the StartAfter value in V2 mode. - Some(key) => Ok(RangeBegin::AfterKey { - key: key.to_string(), - }), - _ => Ok(RangeBegin::IncludingKey { - key: self.common.prefix.to_string(), - fallback_key: None, - }), - } - } - } -} - -impl ListMultipartUploadsQuery { - fn build_accumulator(&self) -> Accumulator { - Accumulator::::new(self.common.page_size) - } - - fn begin(&self) -> Result { - match (&self.upload_id_marker, &self.key_marker) { - // If both the upload id marker and the key marker are sets, - // the spec specifies that we must start listing uploads INCLUDING the given key, - // AFTER the specified upload id (sorted in a lexicographic order). - // To enable some optimisations, we emulate "IncludingKey" by extending the upload id - // semantic. We base our reasoning on the hypothesis that S3's upload ids are opaques - // while Garage's ones are 32 bytes hex encoded which enables us to extend this query - // with a specific "include" upload id. - (Some(up_marker), Some(key_marker)) => match &up_marker[..] 
{ - "include" => Ok(RangeBegin::IncludingKey { - key: key_marker.to_string(), - fallback_key: None, - }), - uuid => Ok(RangeBegin::AfterUpload { - key: key_marker.to_string(), - upload: s3_put::decode_upload_id(uuid)?, - }), - }, - - // If only the key marker is specified, the spec says that we must start listing - // uploads AFTER the specified key. - (None, Some(key_marker)) => Ok(RangeBegin::AfterKey { - key: key_marker.to_string(), - }), - _ => Ok(RangeBegin::IncludingKey { - key: self.common.prefix.to_string(), - fallback_key: None, - }), - } - } -} - -/* - * Accumulator logic - */ - -trait ExtractAccumulator { - fn extract<'a>( - &mut self, - query: &ListQueryCommon, - cursor: &RangeBegin, - iter: &mut Peekable>, - ) -> ExtractionResult; -} - -struct Accumulator { - common_prefixes: BTreeSet, - keys: BTreeMap, - max_capacity: usize, -} - -type ObjectAccumulator = Accumulator; -type UploadAccumulator = Accumulator; - -impl Accumulator { - fn new(page_size: usize) -> Accumulator { - Accumulator { - common_prefixes: BTreeSet::::new(), - keys: BTreeMap::::new(), - max_capacity: page_size, - } - } - - /// Observe the Object iterator and try to extract a single common prefix - /// - /// This function can consume an arbitrary number of items as long as they share the same - /// common prefix. - fn extract_common_prefix<'a>( - &mut self, - objects: &mut Peekable>, - query: &ListQueryCommon, - ) -> Option { - // Get the next object from the iterator - let object = objects.peek().expect("This iterator can not be empty as it is checked earlier in the code. This is a logic bug, please report it."); - - // Check if this is a common prefix (requires a passed delimiter and its value in the key) - let pfx = match common_prefix(object, query) { - Some(p) => p, - None => return None, - }; - - // Try to register this prefix - // If not possible, we can return early - if !self.try_insert_common_prefix(pfx.to_string()) { - return Some(ExtractionResult::Filled); - } - - // We consume the whole common prefix from the iterator - let mut last_pfx_key = &object.key; - loop { - last_pfx_key = match objects.peek() { - Some(o) if o.key.starts_with(pfx) => &o.key, - Some(_) => { - return Some(ExtractionResult::Extracted { - key: last_pfx_key.to_owned(), - }) - } - None => { - return match key_after_prefix(pfx) { - Some(next) => Some(ExtractionResult::SkipTo { - key: next, - fallback_key: Some(last_pfx_key.to_owned()), - }), - None => Some(ExtractionResult::NoMore), - } - } - }; - - objects.next(); - } - } - - fn is_full(&mut self) -> bool { - self.keys.len() + self.common_prefixes.len() >= self.max_capacity - } - - fn try_insert_common_prefix(&mut self, key: String) -> bool { - // If we already have an entry, we can continue - if self.common_prefixes.contains(&key) { - return true; - } - - // Otherwise, we need to check if we can add it - match self.is_full() { - true => false, - false => { - self.common_prefixes.insert(key); - true - } - } - } - - fn try_insert_entry(&mut self, key: K, value: V) -> bool { - // It is impossible to add twice a key, this is an error - assert!(!self.keys.contains_key(&key)); - - match self.is_full() { - true => false, - false => { - self.keys.insert(key, value); - true - } - } - } -} - -impl ExtractAccumulator for ObjectAccumulator { - fn extract<'a>( - &mut self, - query: &ListQueryCommon, - _cursor: &RangeBegin, - objects: &mut Peekable>, - ) -> ExtractionResult { - if let Some(e) = self.extract_common_prefix(objects, query) { - return e; - } - - let object = 
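A standalone sketch of what the accumulator's common-prefix extraction above ultimately computes, ignoring pagination and capacity limits: keys containing the delimiter after the prefix collapse into a single CommonPrefix entry, the others are listed individually.

use std::collections::BTreeSet;

fn group_keys(keys: &[&str], prefix: &str, delimiter: &str) -> (BTreeSet<String>, Vec<String>) {
    let mut common_prefixes = BTreeSet::new();
    let mut contents = vec![];
    for key in keys.iter().copied().filter(|k| k.starts_with(prefix)) {
        match key[prefix.len()..].find(delimiter) {
            // The delimiter appears after the prefix: collapse into a common prefix.
            Some(i) => {
                common_prefixes.insert(key[..prefix.len() + i + delimiter.len()].to_string());
            }
            // No delimiter: the key is listed as a regular entry.
            None => contents.push(key.to_string()),
        }
    }
    (common_prefixes, contents)
}

fn main() {
    let keys = ["a/b/c", "a/b/d", "a/e", "f"];
    let (prefixes, contents) = group_keys(&keys, "a/", "/");
    assert_eq!(prefixes.into_iter().collect::<Vec<_>>(), vec!["a/b/"]);
    assert_eq!(contents, vec!["a/e"]);
}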
objects.next().expect("This iterator can not be empty as it is checked earlier in the code. This is a logic bug, please report it."); - - let version = match object.versions().iter().find(|x| x.is_data()) { - Some(v) => v, - None => unreachable!( - "Expect to have objects having data due to earlier filtering. This is a logic bug." - ), - }; - - let meta = match &version.state { - ObjectVersionState::Complete(ObjectVersionData::Inline(meta, _)) => meta, - ObjectVersionState::Complete(ObjectVersionData::FirstBlock(meta, _)) => meta, - _ => unreachable!(), - }; - let info = ObjectInfo { - last_modified: version.timestamp, - size: meta.size, - etag: meta.etag.to_string(), - }; - - match self.try_insert_entry(object.key.clone(), info) { - true => ExtractionResult::Extracted { - key: object.key.clone(), - }, - false => ExtractionResult::Filled, - } - } -} - -impl ExtractAccumulator for UploadAccumulator { - /// Observe the iterator, process a single key, and try to extract one or more upload entries - /// - /// This function processes a single object from the iterator that can contain an arbitrary - /// number of versions, and thus "uploads". - fn extract<'a>( - &mut self, - query: &ListQueryCommon, - cursor: &RangeBegin, - objects: &mut Peekable>, - ) -> ExtractionResult { - if let Some(e) = self.extract_common_prefix(objects, query) { - return e; - } - - // Get the next object from the iterator - let object = objects.next().expect("This iterator can not be empty as it is checked earlier in the code. This is a logic bug, please report it."); - - let mut uploads_for_key = object - .versions() - .iter() - .filter(|x| x.is_uploading()) - .collect::>(); - - // S3 logic requires lexicographically sorted upload ids. - uploads_for_key.sort_unstable_by_key(|e| e.uuid); - - // Skip results if an upload marker is provided - if let RangeBegin::AfterUpload { upload, .. } = cursor { - // Because our data are sorted, we can use a binary search to find the UUID - // or to find where it should have been added. Once this position is found, - // we use it to discard the first part of the array. - let idx = match uploads_for_key.binary_search_by(|e| e.uuid.cmp(upload)) { - // we start after the found uuid so we need to discard the pointed value. - // In the worst case, the UUID is the last element, which lead us to an empty array - // but we are never out of bound. - Ok(i) => i + 1, - // if the UUID is not found, the upload may have been discarded between the 2 request, - // this function returns where it could have been inserted, - // the pointed value is thus greater than our marker and we need to keep it. - Err(i) => i, - }; - uploads_for_key = uploads_for_key[idx..].to_vec(); - } - - let mut iter = uploads_for_key.iter(); - - // The first entry is a specific case - // as it changes our result enum type - let first_upload = match iter.next() { - Some(u) => u, - None => { - return ExtractionResult::Extracted { - key: object.key.clone(), - } - } - }; - let first_up_info = UploadInfo { - key: object.key.to_string(), - timestamp: first_upload.timestamp, - }; - if !self.try_insert_entry(first_upload.uuid, first_up_info) { - return ExtractionResult::Filled; - } - - // We can then collect the remaining uploads in a loop - let mut prev_uuid = first_upload.uuid; - for upload in iter { - let up_info = UploadInfo { - key: object.key.to_string(), - timestamp: upload.timestamp, - }; - - // Insert data in our accumulator - // If it is full, return information to paginate. 
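A standalone sketch of the upload-id marker skip above, with small integers standing in for the sorted upload UUIDs:

fn uploads_after(mut uploads: Vec<u32>, marker: u32) -> Vec<u32> {
    // S3 requires upload ids to be listed in lexicographic order.
    uploads.sort_unstable();
    let idx = match uploads.binary_search(&marker) {
        // The marker still exists: resume just after it.
        Ok(i) => i + 1,
        // The marker vanished between two requests: resume at its insertion point.
        Err(i) => i,
    };
    uploads[idx..].to_vec()
}

fn main() {
    assert_eq!(uploads_after(vec![3, 1, 7], 1), vec![3, 7]);
    assert_eq!(uploads_after(vec![3, 1, 7], 2), vec![3, 7]); // marker no longer exists
    assert_eq!(uploads_after(vec![3, 1, 7], 7), Vec::<u32>::new());
}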
- if !self.try_insert_entry(upload.uuid, up_info) { - return ExtractionResult::FilledAtUpload { - key: object.key.clone(), - upload: prev_uuid, - }; - } - // Update our last added UUID - prev_uuid = upload.uuid; - } - - // We successfully collected all the uploads - ExtractionResult::Extracted { - key: object.key.clone(), - } - } -} - -/* - * Utility functions - */ - -/// This is a stub for Result::into_ok_or_err that is not yet in Rust stable -fn into_ok_or_err(r: Result) -> T { - match r { - Ok(r) => r, - Err(r) => r, - } -} - -/// Returns the common prefix of the object given the query prefix and delimiter -fn common_prefix<'a>(object: &'a Object, query: &ListQueryCommon) -> Option<&'a str> { - match &query.delimiter { - Some(delimiter) => object.key[query.prefix.len()..] - .find(delimiter) - .map(|i| &object.key[..query.prefix.len() + i + delimiter.len()]), - None => None, - } -} - -/// URIencode a value if needed -fn uriencode_maybe(s: &str, yes: bool) -> s3_xml::Value { - if yes { - s3_xml::Value(uri_encode(s, true)) - } else { - s3_xml::Value(s.to_string()) - } -} - -const UTF8_BEFORE_LAST_CHAR: char = '\u{10FFFE}'; - -/// Compute the key after the prefix -fn key_after_prefix(pfx: &str) -> Option { - let mut next = pfx.to_string(); - while !next.is_empty() { - let tail = next.pop().unwrap(); - if tail >= char::MAX { - continue; - } - - // Circumvent a limitation of RangeFrom that overflow earlier than needed - // See: https://doc.rust-lang.org/core/ops/struct.RangeFrom.html - let new_tail = if tail == UTF8_BEFORE_LAST_CHAR { - char::MAX - } else { - (tail..).nth(1).unwrap() - }; - - next.push(new_tail); - return Some(next); - } - - None -} - -/* - * Unit tests of this module - */ -#[cfg(test)] -mod tests { - use super::*; - use garage_model::version_table::*; - use garage_util::*; - use std::iter::FromIterator; - - const TS: u64 = 1641394898314; - - fn bucket() -> Uuid { - Uuid::from([0x42; 32]) - } - - fn query() -> ListMultipartUploadsQuery { - ListMultipartUploadsQuery { - common: ListQueryCommon { - prefix: "".to_string(), - delimiter: Some("/".to_string()), - page_size: 1000, - urlencode_resp: false, - bucket_name: "a".to_string(), - bucket_id: Uuid::from([0x00; 32]), - }, - key_marker: None, - upload_id_marker: None, - } - } - - fn objs() -> Vec { - vec![ - Object::new( - bucket(), - "a/b/c".to_string(), - vec![objup_version([0x01; 32])], - ), - Object::new(bucket(), "d".to_string(), vec![objup_version([0x01; 32])]), - ] - } - - fn objup_version(uuid: [u8; 32]) -> ObjectVersion { - ObjectVersion { - uuid: Uuid::from(uuid), - timestamp: TS, - state: ObjectVersionState::Uploading(ObjectVersionHeaders { - content_type: "text/plain".to_string(), - other: BTreeMap::::new(), - }), - } - } - - #[test] - fn test_key_after_prefix() { - assert_eq!(UTF8_BEFORE_LAST_CHAR as u32, (char::MAX as u32) - 1); - assert_eq!(key_after_prefix("a/b/").unwrap().as_str(), "a/b0"); - assert_eq!(key_after_prefix("€").unwrap().as_str(), "₭"); - assert_eq!( - key_after_prefix("􏿽").unwrap().as_str(), - String::from(char::from_u32(0x10FFFE).unwrap()) - ); - - // When the last character is the biggest UTF8 char - let a = String::from_iter(['a', char::MAX].iter()); - assert_eq!(key_after_prefix(a.as_str()).unwrap().as_str(), "b"); - - // When all characters are the biggest UTF8 char - let b = String::from_iter([char::MAX; 3].iter()); - assert!(key_after_prefix(b.as_str()).is_none()); - - // Check utf8 surrogates - let c = String::from('\u{D7FF}'); - assert_eq!( - 
key_after_prefix(c.as_str()).unwrap().as_str(), - String::from('\u{E000}') - ); - - // Check the character before the biggest one - let d = String::from('\u{10FFFE}'); - assert_eq!( - key_after_prefix(d.as_str()).unwrap().as_str(), - String::from(char::MAX) - ); - } - - #[test] - fn test_common_prefixes() { - let mut query = query(); - let objs = objs(); - - query.common.prefix = "a/".to_string(); - assert_eq!( - common_prefix(objs.get(0).unwrap(), &query.common), - Some("a/b/") - ); - - query.common.prefix = "a/b/".to_string(); - assert_eq!(common_prefix(objs.get(0).unwrap(), &query.common), None); - } - - #[test] - fn test_extract_common_prefix() { - let mut query = query(); - query.common.prefix = "a/".to_string(); - let objs = objs(); - let mut acc = UploadAccumulator::new(query.common.page_size); - - let mut iter = objs.iter().peekable(); - match acc.extract_common_prefix(&mut iter, &query.common) { - Some(ExtractionResult::Extracted { key }) => assert_eq!(key, "a/b/c".to_string()), - _ => panic!("wrong result"), - } - assert_eq!(acc.common_prefixes.len(), 1); - assert_eq!(acc.common_prefixes.iter().next().unwrap(), "a/b/"); - } - - #[test] - fn test_extract_upload() { - let objs = vec![ - Object::new( - bucket(), - "b".to_string(), - vec![ - objup_version([0x01; 32]), - objup_version([0x80; 32]), - objup_version([0x8f; 32]), - objup_version([0xdd; 32]), - ], - ), - Object::new(bucket(), "c".to_string(), vec![]), - ]; - - let mut acc = UploadAccumulator::new(2); - let mut start = RangeBegin::AfterUpload { - key: "b".to_string(), - upload: Uuid::from([0x01; 32]), - }; - - let mut iter = objs.iter().peekable(); - - // Check the case where we skip some uploads - match acc.extract(&(query().common), &start, &mut iter) { - ExtractionResult::FilledAtUpload { key, upload } => { - assert_eq!(key, "b"); - assert_eq!(upload, Uuid::from([0x8f; 32])); - } - _ => panic!("wrong result"), - }; - - assert_eq!(acc.keys.len(), 2); - assert_eq!( - acc.keys.get(&Uuid::from([0x80; 32])).unwrap(), - &UploadInfo { - timestamp: TS, - key: "b".to_string() - } - ); - assert_eq!( - acc.keys.get(&Uuid::from([0x8f; 32])).unwrap(), - &UploadInfo { - timestamp: TS, - key: "b".to_string() - } - ); - - acc = UploadAccumulator::new(2); - start = RangeBegin::AfterUpload { - key: "b".to_string(), - upload: Uuid::from([0xff; 32]), - }; - iter = objs.iter().peekable(); - - // Check the case where we skip all the uploads - match acc.extract(&(query().common), &start, &mut iter) { - ExtractionResult::Extracted { key } if key.as_str() == "b" => (), - _ => panic!("wrong result"), - }; - } - - #[tokio::test] - async fn test_fetch_uploads_no_result() -> Result<(), Error> { - let query = query(); - let mut acc = query.build_accumulator(); - let page = fetch_list_entries( - &query.common, - query.begin()?, - &mut acc, - |_, _, _| async move { Ok(vec![]) }, - ) - .await?; - assert_eq!(page, None); - assert_eq!(acc.common_prefixes.len(), 0); - assert_eq!(acc.keys.len(), 0); - - Ok(()) - } - - #[tokio::test] - async fn test_fetch_uploads_basic() -> Result<(), Error> { - let query = query(); - let mut acc = query.build_accumulator(); - let mut fake_io = |_, _, _| async move { Ok(objs()) }; - let page = - fetch_list_entries(&query.common, query.begin()?, &mut acc, &mut fake_io).await?; - assert_eq!(page, None); - assert_eq!(acc.common_prefixes.len(), 1); - assert_eq!(acc.keys.len(), 1); - assert!(acc.common_prefixes.contains("a/")); - - Ok(()) - } - - #[tokio::test] - async fn test_fetch_uploads_advanced() -> Result<(), Error> { - let 
mut query = query(); - query.common.page_size = 2; - - let mut fake_io = |_, k: Option, _| async move { - Ok(match k.as_deref() { - Some("") => vec![ - Object::new(bucket(), "b/a".to_string(), vec![objup_version([0x01; 32])]), - Object::new(bucket(), "b/b".to_string(), vec![objup_version([0x01; 32])]), - Object::new(bucket(), "b/c".to_string(), vec![objup_version([0x01; 32])]), - ], - Some("b0") => vec![ - Object::new(bucket(), "c/a".to_string(), vec![objup_version([0x01; 32])]), - Object::new(bucket(), "c/b".to_string(), vec![objup_version([0x01; 32])]), - Object::new(bucket(), "c/c".to_string(), vec![objup_version([0x02; 32])]), - ], - Some("c0") => vec![Object::new( - bucket(), - "d".to_string(), - vec![objup_version([0x01; 32])], - )], - _ => panic!("wrong value {:?}", k), - }) - }; - - let mut acc = query.build_accumulator(); - let page = - fetch_list_entries(&query.common, query.begin()?, &mut acc, &mut fake_io).await?; - assert_eq!( - page, - Some(RangeBegin::IncludingKey { - key: "c0".to_string(), - fallback_key: Some("c/c".to_string()) - }) - ); - assert_eq!(acc.common_prefixes.len(), 2); - assert_eq!(acc.keys.len(), 0); - assert!(acc.common_prefixes.contains("b/")); - assert!(acc.common_prefixes.contains("c/")); - - Ok(()) - } - - fn version() -> Version { - let uuid = Uuid::from([0x08; 32]); - - let blocks = vec![ - ( - VersionBlockKey { - part_number: 1, - offset: 1, - }, - VersionBlock { - hash: uuid, - size: 3, - }, - ), - ( - VersionBlockKey { - part_number: 1, - offset: 2, - }, - VersionBlock { - hash: uuid, - size: 2, - }, - ), - ( - VersionBlockKey { - part_number: 2, - offset: 1, - }, - VersionBlock { - hash: uuid, - size: 8, - }, - ), - ( - VersionBlockKey { - part_number: 5, - offset: 1, - }, - VersionBlock { - hash: uuid, - size: 7, - }, - ), - ( - VersionBlockKey { - part_number: 8, - offset: 1, - }, - VersionBlock { - hash: uuid, - size: 5, - }, - ), - ]; - let etags = vec![ - (1, "etag1".to_string()), - (3, "etag2".to_string()), - (5, "etag3".to_string()), - (8, "etag4".to_string()), - (9, "etag5".to_string()), - ]; - - Version { - bucket_id: uuid, - key: "a".to_string(), - uuid, - deleted: false.into(), - blocks: crdt::Map::::from_iter(blocks), - parts_etags: crdt::Map::::from_iter(etags), - } - } - - fn obj() -> Object { - Object::new(bucket(), "d".to_string(), vec![objup_version([0x08; 32])]) - } - - #[test] - fn test_fetch_part_info() -> Result<(), Error> { - let uuid = Uuid::from([0x08; 32]); - let mut query = ListPartsQuery { - bucket_name: "a".to_string(), - bucket_id: uuid, - key: "a".to_string(), - upload_id: "xx".to_string(), - part_number_marker: None, - max_parts: 2, - }; - - assert!( - fetch_part_info(&query, None, None, uuid).is_err(), - "No object and version should fail" - ); - assert!( - fetch_part_info(&query, Some(obj()), None, uuid).is_err(), - "No version should faild" - ); - assert!( - fetch_part_info(&query, None, Some(version()), uuid).is_err(), - "No object should fail" - ); - - // Start from the beginning but with limited size to trigger pagination - let (info, pagination) = fetch_part_info(&query, Some(obj()), Some(version()), uuid)?; - assert_eq!(pagination.unwrap(), 5); - assert_eq!( - info, - vec![ - PartInfo { - etag: "etag1".to_string(), - timestamp: TS, - part_number: 1, - size: 5 - }, - PartInfo { - etag: "etag3".to_string(), - timestamp: TS, - part_number: 5, - size: 7 - }, - ] - ); - - // Use previous pagination to make a new request - query.part_number_marker = Some(pagination.unwrap()); - let (info, pagination) = 
fetch_part_info(&query, Some(obj()), Some(version()), uuid)?; - assert!(pagination.is_none()); - assert_eq!( - info, - vec![PartInfo { - etag: "etag4".to_string(), - timestamp: TS, - part_number: 8, - size: 5 - },] - ); - - // Trying to access a part that is way larger than registered ones - query.part_number_marker = Some(9999); - let (info, pagination) = fetch_part_info(&query, Some(obj()), Some(version()), uuid)?; - assert!(pagination.is_none()); - assert_eq!(info, vec![]); - - // Try without any limitation - query.max_parts = 1000; - query.part_number_marker = None; - let (info, pagination) = fetch_part_info(&query, Some(obj()), Some(version()), uuid)?; - assert!(pagination.is_none()); - assert_eq!( - info, - vec![ - PartInfo { - etag: "etag1".to_string(), - timestamp: TS, - part_number: 1, - size: 5 - }, - PartInfo { - etag: "etag3".to_string(), - timestamp: TS, - part_number: 5, - size: 7 - }, - PartInfo { - etag: "etag4".to_string(), - timestamp: TS, - part_number: 8, - size: 5 - }, - ] - ); - - Ok(()) - } -} diff --git a/src/api/s3_post_object.rs b/src/api/s3_post_object.rs deleted file mode 100644 index 585e0304..00000000 --- a/src/api/s3_post_object.rs +++ /dev/null @@ -1,499 +0,0 @@ -use std::collections::HashMap; -use std::convert::TryInto; -use std::ops::RangeInclusive; -use std::sync::Arc; -use std::task::{Context, Poll}; - -use bytes::Bytes; -use chrono::{DateTime, Duration, Utc}; -use futures::{Stream, StreamExt}; -use hyper::header::{self, HeaderMap, HeaderName, HeaderValue}; -use hyper::{Body, Request, Response, StatusCode}; -use multer::{Constraints, Multipart, SizeLimit}; -use serde::Deserialize; - -use garage_model::garage::Garage; - -use crate::api_server::resolve_bucket; -use crate::error::*; -use crate::s3_put::{get_headers, save_stream}; -use crate::s3_xml; -use crate::signature::payload::{parse_date, verify_v4}; - -pub async fn handle_post_object( - garage: Arc, - req: Request, - bucket: String, -) -> Result, Error> { - let boundary = req - .headers() - .get(header::CONTENT_TYPE) - .and_then(|ct| ct.to_str().ok()) - .and_then(|ct| multer::parse_boundary(ct).ok()) - .ok_or_bad_request("Counld not get multipart boundary")?; - - // 16k seems plenty for a header. 5G is the max size of a single part, so it seems reasonable - // for a PostObject - let constraints = Constraints::new().size_limit( - SizeLimit::new() - .per_field(16 * 1024) - .for_field("file", 5 * 1024 * 1024 * 1024), - ); - - let (head, body) = req.into_parts(); - let mut multipart = Multipart::with_constraints(body, boundary, constraints); - - let mut params = HeaderMap::new(); - let field = loop { - let field = if let Some(field) = multipart.next_field().await? { - field - } else { - return Err(Error::BadRequest( - "Request did not contain a file".to_owned(), - )); - }; - let name: HeaderName = if let Some(Ok(name)) = field.name().map(TryInto::try_into) { - name - } else { - continue; - }; - if name == "file" { - break field; - } - - if let Ok(content) = HeaderValue::from_str(&field.text().await?) { - match name.as_str() { - "tag" => (/* tag need to be reencoded, but we don't support them yet anyway */), - "acl" => { - if params.insert("x-amz-acl", content).is_some() { - return Err(Error::BadRequest( - "Field 'acl' provided more than one time".to_string(), - )); - } - } - _ => { - if params.insert(&name, content).is_some() { - return Err(Error::BadRequest(format!( - "Field '{}' provided more than one time", - name - ))); - } - } - } - } - }; - - // Current part is file. 
Do some checks before handling to PutObject code - let key = params - .get("key") - .ok_or_bad_request("No key was provided")? - .to_str()?; - let credential = params - .get("x-amz-credential") - .ok_or_else(|| { - Error::Forbidden("Garage does not support anonymous access yet".to_string()) - })? - .to_str()?; - let policy = params - .get("policy") - .ok_or_bad_request("No policy was provided")? - .to_str()?; - let signature = params - .get("x-amz-signature") - .ok_or_bad_request("No signature was provided")? - .to_str()?; - let date = params - .get("x-amz-date") - .ok_or_bad_request("No date was provided")? - .to_str()?; - - let key = if key.contains("${filename}") { - // if no filename is provided, don't replace. This matches the behavior of AWS. - if let Some(filename) = field.file_name() { - key.replace("${filename}", filename) - } else { - key.to_owned() - } - } else { - key.to_owned() - }; - - let date = parse_date(date)?; - let api_key = verify_v4(&garage, credential, &date, signature, policy.as_bytes()).await?; - - let bucket_id = resolve_bucket(&garage, &bucket, &api_key).await?; - - if !api_key.allow_write(&bucket_id) { - return Err(Error::Forbidden( - "Operation is not allowed for this key.".to_string(), - )); - } - - let decoded_policy = base64::decode(&policy)?; - let decoded_policy: Policy = - serde_json::from_slice(&decoded_policy).ok_or_bad_request("Invalid policy")?; - - let expiration: DateTime = DateTime::parse_from_rfc3339(&decoded_policy.expiration) - .ok_or_bad_request("Invalid expiration date")? - .into(); - if Utc::now() - expiration > Duration::zero() { - return Err(Error::BadRequest( - "Expiration date is in the paste".to_string(), - )); - } - - let mut conditions = decoded_policy.into_conditions()?; - - for (param_key, value) in params.iter() { - let mut param_key = param_key.to_string(); - param_key.make_ascii_lowercase(); - match param_key.as_str() { - "policy" | "x-amz-signature" => (), // this is always accepted, as it's required to validate other fields - "content-type" => { - let conds = conditions.params.remove("content-type").ok_or_else(|| { - Error::BadRequest(format!("Key '{}' is not allowed in policy", param_key)) - })?; - for cond in conds { - let ok = match cond { - Operation::Equal(s) => s.as_str() == value, - Operation::StartsWith(s) => { - value.to_str()?.split(',').all(|v| v.starts_with(&s)) - } - }; - if !ok { - return Err(Error::BadRequest(format!( - "Key '{}' has value not allowed in policy", - param_key - ))); - } - } - } - "key" => { - let conds = conditions.params.remove("key").ok_or_else(|| { - Error::BadRequest(format!("Key '{}' is not allowed in policy", param_key)) - })?; - for cond in conds { - let ok = match cond { - Operation::Equal(s) => s == key, - Operation::StartsWith(s) => key.starts_with(&s), - }; - if !ok { - return Err(Error::BadRequest(format!( - "Key '{}' has value not allowed in policy", - param_key - ))); - } - } - } - _ => { - if param_key.starts_with("x-ignore-") { - // if a x-ignore is provided in policy, it's not removed here, so it will be - // rejected as provided in policy but not in the request. As odd as it is, it's - // how aws seems to behave. 
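Each remaining form field is checked below against the conditions the policy declared for it: every condition attached to the field must hold, otherwise the request is rejected. A minimal self-contained sketch of that check (illustrative types, not the ones used in this file):

enum Cond {
    Equal(String),
    StartsWith(String),
}

fn value_allowed(value: &str, conds: &[Cond]) -> bool {
    // All conditions declared for this field must hold.
    conds.iter().all(|c| match c {
        Cond::Equal(s) => s.as_str() == value,
        Cond::StartsWith(s) => value.starts_with(s.as_str()),
    })
}

fn main() {
    let key_conds = vec![Cond::StartsWith("user/eric/".to_string())];
    assert!(value_allowed("user/eric/pic.jpg", &key_conds));
    assert!(!value_allowed("user/bob/pic.jpg", &key_conds));

    let acl_conds = vec![Cond::Equal("public-read".to_string())];
    assert!(value_allowed("public-read", &acl_conds));
}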
- continue; - } - let conds = conditions.params.remove(¶m_key).ok_or_else(|| { - Error::BadRequest(format!("Key '{}' is not allowed in policy", param_key)) - })?; - for cond in conds { - let ok = match cond { - Operation::Equal(s) => s.as_str() == value, - Operation::StartsWith(s) => value.to_str()?.starts_with(s.as_str()), - }; - if !ok { - return Err(Error::BadRequest(format!( - "Key '{}' has value not allowed in policy", - param_key - ))); - } - } - } - } - } - - if let Some((param_key, _)) = conditions.params.iter().next() { - return Err(Error::BadRequest(format!( - "Key '{}' is required in policy, but no value was provided", - param_key - ))); - } - - let headers = get_headers(¶ms)?; - - let stream = field.map(|r| r.map_err(Into::into)); - let (_, md5) = save_stream( - garage, - headers, - StreamLimiter::new(stream, conditions.content_length), - bucket_id, - &key, - None, - None, - ) - .await?; - - let etag = format!("\"{}\"", md5); - - let resp = if let Some(mut target) = params - .get("success_action_redirect") - .and_then(|h| h.to_str().ok()) - .and_then(|u| url::Url::parse(u).ok()) - .filter(|u| u.scheme() == "https" || u.scheme() == "http") - { - target - .query_pairs_mut() - .append_pair("bucket", &bucket) - .append_pair("key", &key) - .append_pair("etag", &etag); - let target = target.to_string(); - Response::builder() - .status(StatusCode::SEE_OTHER) - .header(header::LOCATION, target.clone()) - .header(header::ETAG, etag) - .body(target.into())? - } else { - let path = head - .uri - .into_parts() - .path_and_query - .map(|paq| paq.path().to_string()) - .unwrap_or_else(|| "/".to_string()); - let authority = head - .headers - .get(header::HOST) - .and_then(|h| h.to_str().ok()) - .unwrap_or_default(); - let proto = if !authority.is_empty() { - "https://" - } else { - "" - }; - - let url_key: String = form_urlencoded::byte_serialize(key.as_bytes()) - .flat_map(str::chars) - .collect(); - let location = format!("{}{}{}{}", proto, authority, path, url_key); - - let action = params - .get("success_action_status") - .and_then(|h| h.to_str().ok()) - .unwrap_or("204"); - let builder = Response::builder() - .header(header::LOCATION, location.clone()) - .header(header::ETAG, etag.clone()); - match action { - "200" => builder.status(StatusCode::OK).body(Body::empty())?, - "201" => { - let xml = s3_xml::PostObject { - xmlns: (), - location: s3_xml::Value(location), - bucket: s3_xml::Value(bucket), - key: s3_xml::Value(key), - etag: s3_xml::Value(etag), - }; - let body = s3_xml::to_xml_with_header(&xml)?; - builder - .status(StatusCode::CREATED) - .body(Body::from(body.into_bytes()))? 
- } - _ => builder.status(StatusCode::NO_CONTENT).body(Body::empty())?, - } - }; - - Ok(resp) -} - -#[derive(Deserialize)] -struct Policy { - expiration: String, - conditions: Vec, -} - -impl Policy { - fn into_conditions(self) -> Result { - let mut params = HashMap::<_, Vec<_>>::new(); - - let mut length = (0, u64::MAX); - for condition in self.conditions { - match condition { - PolicyCondition::Equal(map) => { - if map.len() != 1 { - return Err(Error::BadRequest("Invalid policy item".to_owned())); - } - let (mut k, v) = map.into_iter().next().expect("size was verified"); - k.make_ascii_lowercase(); - params.entry(k).or_default().push(Operation::Equal(v)); - } - PolicyCondition::OtherOp([cond, mut key, value]) => { - if key.remove(0) != '$' { - return Err(Error::BadRequest("Invalid policy item".to_owned())); - } - key.make_ascii_lowercase(); - match cond.as_str() { - "eq" => { - params.entry(key).or_default().push(Operation::Equal(value)); - } - "starts-with" => { - params - .entry(key) - .or_default() - .push(Operation::StartsWith(value)); - } - _ => return Err(Error::BadRequest("Invalid policy item".to_owned())), - } - } - PolicyCondition::SizeRange(key, min, max) => { - if key == "content-length-range" { - length.0 = length.0.max(min); - length.1 = length.1.min(max); - } else { - return Err(Error::BadRequest("Invalid policy item".to_owned())); - } - } - } - } - Ok(Conditions { - params, - content_length: RangeInclusive::new(length.0, length.1), - }) - } -} - -/// A single condition from a policy -#[derive(Debug, Deserialize)] -#[serde(untagged)] -enum PolicyCondition { - // will contain a single key-value pair - Equal(HashMap), - OtherOp([String; 3]), - SizeRange(String, u64, u64), -} - -#[derive(Debug)] -struct Conditions { - params: HashMap>, - content_length: RangeInclusive, -} - -#[derive(Debug, PartialEq, Eq)] -enum Operation { - Equal(String), - StartsWith(String), -} - -struct StreamLimiter { - inner: T, - length: RangeInclusive, - read: u64, -} - -impl StreamLimiter { - fn new(stream: T, length: RangeInclusive) -> Self { - StreamLimiter { - inner: stream, - length, - read: 0, - } - } -} - -impl Stream for StreamLimiter -where - T: Stream> + Unpin, -{ - type Item = Result; - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - ctx: &mut Context<'_>, - ) -> Poll> { - let res = std::pin::Pin::new(&mut self.inner).poll_next(ctx); - match &res { - Poll::Ready(Some(Ok(bytes))) => { - self.read += bytes.len() as u64; - // optimization to fail early when we know before the end it's too long - if self.length.end() < &self.read { - return Poll::Ready(Some(Err(Error::BadRequest( - "File size does not match policy".to_owned(), - )))); - } - } - Poll::Ready(None) => { - if !self.length.contains(&self.read) { - return Poll::Ready(Some(Err(Error::BadRequest( - "File size does not match policy".to_owned(), - )))); - } - } - _ => {} - } - res - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_policy_1() { - let policy_json = br#" -{ "expiration": "2007-12-01T12:00:00.000Z", - "conditions": [ - {"acl": "public-read" }, - {"bucket": "johnsmith" }, - ["starts-with", "$key", "user/eric/"] - ] -} - "#; - let policy_2: Policy = serde_json::from_slice(&policy_json[..]).unwrap(); - let mut conditions = policy_2.into_conditions().unwrap(); - - assert_eq!( - conditions.params.remove(&"acl".to_string()), - Some(vec![Operation::Equal("public-read".into())]) - ); - assert_eq!( - conditions.params.remove(&"bucket".to_string()), - Some(vec![Operation::Equal("johnsmith".into())]) - ); - 
assert_eq!( - conditions.params.remove(&"key".to_string()), - Some(vec![Operation::StartsWith("user/eric/".into())]) - ); - assert!(conditions.params.is_empty()); - assert_eq!(conditions.content_length, 0..=u64::MAX); - } - - #[test] - fn test_policy_2() { - let policy_json = br#" -{ "expiration": "2007-12-01T12:00:00.000Z", - "conditions": [ - [ "eq", "$acl", "public-read" ], - ["starts-with", "$Content-Type", "image/"], - ["starts-with", "$success_action_redirect", ""], - ["content-length-range", 1048576, 10485760] - ] -} - "#; - let policy_2: Policy = serde_json::from_slice(&policy_json[..]).unwrap(); - let mut conditions = policy_2.into_conditions().unwrap(); - - assert_eq!( - conditions.params.remove(&"acl".to_string()), - Some(vec![Operation::Equal("public-read".into())]) - ); - assert_eq!( - conditions.params.remove("content-type").unwrap(), - vec![Operation::StartsWith("image/".into())] - ); - assert_eq!( - conditions - .params - .remove(&"success_action_redirect".to_string()), - Some(vec![Operation::StartsWith("".into())]) - ); - assert!(conditions.params.is_empty()); - assert_eq!(conditions.content_length, 1048576..=10485760); - } -} diff --git a/src/api/s3_put.rs b/src/api/s3_put.rs deleted file mode 100644 index ed0bf00b..00000000 --- a/src/api/s3_put.rs +++ /dev/null @@ -1,753 +0,0 @@ -use std::collections::{BTreeMap, BTreeSet, VecDeque}; -use std::sync::Arc; - -use futures::prelude::*; -use hyper::body::{Body, Bytes}; -use hyper::header::{HeaderMap, HeaderValue}; -use hyper::{Request, Response}; -use md5::{digest::generic_array::*, Digest as Md5Digest, Md5}; -use sha2::Sha256; - -use garage_table::*; -use garage_util::data::*; -use garage_util::error::Error as GarageError; -use garage_util::time::*; - -use garage_block::manager::INLINE_THRESHOLD; -use garage_model::block_ref_table::*; -use garage_model::garage::Garage; -use garage_model::object_table::*; -use garage_model::version_table::*; - -use crate::error::*; -use crate::s3_xml; -use crate::signature::verify_signed_content; - -pub async fn handle_put( - garage: Arc, - req: Request, - bucket_id: Uuid, - key: &str, - content_sha256: Option, -) -> Result, Error> { - // Retrieve interesting headers from request - let headers = get_headers(req.headers())?; - debug!("Object headers: {:?}", headers); - - let content_md5 = match req.headers().get("content-md5") { - Some(x) => Some(x.to_str()?.to_string()), - None => None, - }; - - let (_head, body) = req.into_parts(); - let body = body.map_err(Error::from); - - save_stream( - garage, - headers, - body, - bucket_id, - key, - content_md5, - content_sha256, - ) - .await - .map(|(uuid, md5)| put_response(uuid, md5)) -} - -pub(crate) async fn save_stream> + Unpin>( - garage: Arc, - headers: ObjectVersionHeaders, - body: S, - bucket_id: Uuid, - key: &str, - content_md5: Option, - content_sha256: Option, -) -> Result<(Uuid, String), Error> { - // Generate identity of new version - let version_uuid = gen_uuid(); - let version_timestamp = now_msec(); - - let mut chunker = StreamChunker::new(body, garage.config.block_size); - let first_block = chunker.next().await?.unwrap_or_default(); - - // If body is small enough, store it directly in the object table - // as "inline data". We can then return immediately. 
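For a simple PUT handled by this function, both the inline path and the block-based path below end up using the hex-encoded MD5 of the whole body as the object's ETag. A standalone sketch of that computation (assuming the same md5 and hex crates this module already imports):

use md5::{Digest, Md5};

fn simple_put_etag(body: &[u8]) -> String {
    let mut hasher = Md5::new();
    hasher.update(body);
    // S3 returns the ETag value wrapped in double quotes.
    format!("\"{}\"", hex::encode(hasher.finalize()))
}

fn main() {
    // MD5 of the empty body, a well-known test vector.
    assert_eq!(simple_put_etag(b""), "\"d41d8cd98f00b204e9800998ecf8427e\"");
}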
- if first_block.len() < INLINE_THRESHOLD { - let mut md5sum = Md5::new(); - md5sum.update(&first_block[..]); - let data_md5sum = md5sum.finalize(); - let data_md5sum_hex = hex::encode(data_md5sum); - - let data_sha256sum = sha256sum(&first_block[..]); - - ensure_checksum_matches( - data_md5sum.as_slice(), - data_sha256sum, - content_md5.as_deref(), - content_sha256, - )?; - - let object_version = ObjectVersion { - uuid: version_uuid, - timestamp: version_timestamp, - state: ObjectVersionState::Complete(ObjectVersionData::Inline( - ObjectVersionMeta { - headers, - size: first_block.len() as u64, - etag: data_md5sum_hex.clone(), - }, - first_block, - )), - }; - - let object = Object::new(bucket_id, key.into(), vec![object_version]); - garage.object_table.insert(&object).await?; - - return Ok((version_uuid, data_md5sum_hex)); - } - - // Write version identifier in object table so that we have a trace - // that we are uploading something - let mut object_version = ObjectVersion { - uuid: version_uuid, - timestamp: version_timestamp, - state: ObjectVersionState::Uploading(headers.clone()), - }; - let object = Object::new(bucket_id, key.into(), vec![object_version.clone()]); - garage.object_table.insert(&object).await?; - - // Initialize corresponding entry in version table - // Write this entry now, even with empty block list, - // to prevent block_ref entries from being deleted (they can be deleted - // if the reference a version that isn't found in the version table) - let version = Version::new(version_uuid, bucket_id, key.into(), false); - garage.version_table.insert(&version).await?; - - // Transfer data and verify checksum - let first_block_hash = blake2sum(&first_block[..]); - let tx_result = read_and_put_blocks( - &garage, - &version, - 1, - first_block, - first_block_hash, - &mut chunker, - ) - .await - .and_then(|(total_size, data_md5sum, data_sha256sum)| { - ensure_checksum_matches( - data_md5sum.as_slice(), - data_sha256sum, - content_md5.as_deref(), - content_sha256, - ) - .map(|()| (total_size, data_md5sum)) - }); - - // If something went wrong, clean up - let (total_size, md5sum_arr) = match tx_result { - Ok(rv) => rv, - Err(e) => { - // Mark object as aborted, this will free the blocks further down - object_version.state = ObjectVersionState::Aborted; - let object = Object::new(bucket_id, key.into(), vec![object_version.clone()]); - garage.object_table.insert(&object).await?; - return Err(e); - } - }; - - // Save final object state, marked as Complete - let md5sum_hex = hex::encode(md5sum_arr); - object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( - ObjectVersionMeta { - headers, - size: total_size, - etag: md5sum_hex.clone(), - }, - first_block_hash, - )); - let object = Object::new(bucket_id, key.into(), vec![object_version]); - garage.object_table.insert(&object).await?; - - Ok((version_uuid, md5sum_hex)) -} - -/// Validate MD5 sum against content-md5 header -/// and sha256sum against signed content-sha256 -fn ensure_checksum_matches( - data_md5sum: &[u8], - data_sha256sum: garage_util::data::FixedBytes32, - content_md5: Option<&str>, - content_sha256: Option, -) -> Result<(), Error> { - if let Some(expected_sha256) = content_sha256 { - if expected_sha256 != data_sha256sum { - return Err(Error::BadRequest( - "Unable to validate x-amz-content-sha256".to_string(), - )); - } else { - trace!("Successfully validated x-amz-content-sha256"); - } - } - if let Some(expected_md5) = content_md5 { - if expected_md5.trim_matches('"') != 
base64::encode(data_md5sum) { - return Err(Error::BadRequest( - "Unable to validate content-md5".to_string(), - )); - } else { - trace!("Successfully validated content-md5"); - } - } - Ok(()) -} - -async fn read_and_put_blocks> + Unpin>( - garage: &Garage, - version: &Version, - part_number: u64, - first_block: Vec, - first_block_hash: Hash, - chunker: &mut StreamChunker, -) -> Result<(u64, GenericArray, Hash), Error> { - let mut md5hasher = Md5::new(); - let mut sha256hasher = Sha256::new(); - md5hasher.update(&first_block[..]); - sha256hasher.update(&first_block[..]); - - let mut next_offset = first_block.len(); - let mut put_curr_version_block = put_block_meta( - garage, - version, - part_number, - 0, - first_block_hash, - first_block.len() as u64, - ); - let mut put_curr_block = garage - .block_manager - .rpc_put_block(first_block_hash, first_block); - - loop { - let (_, _, next_block) = futures::try_join!( - put_curr_block.map_err(Error::from), - put_curr_version_block.map_err(Error::from), - chunker.next(), - )?; - if let Some(block) = next_block { - md5hasher.update(&block[..]); - sha256hasher.update(&block[..]); - let block_hash = blake2sum(&block[..]); - let block_len = block.len(); - put_curr_version_block = put_block_meta( - garage, - version, - part_number, - next_offset as u64, - block_hash, - block_len as u64, - ); - put_curr_block = garage.block_manager.rpc_put_block(block_hash, block); - next_offset += block_len; - } else { - break; - } - } - - let total_size = next_offset as u64; - let data_md5sum = md5hasher.finalize(); - - let data_sha256sum = sha256hasher.finalize(); - let data_sha256sum = Hash::try_from(&data_sha256sum[..]).unwrap(); - - Ok((total_size, data_md5sum, data_sha256sum)) -} - -async fn put_block_meta( - garage: &Garage, - version: &Version, - part_number: u64, - offset: u64, - hash: Hash, - size: u64, -) -> Result<(), GarageError> { - let mut version = version.clone(); - version.blocks.put( - VersionBlockKey { - part_number, - offset, - }, - VersionBlock { hash, size }, - ); - - let block_ref = BlockRef { - block: hash, - version: version.uuid, - deleted: false.into(), - }; - - futures::try_join!( - garage.version_table.insert(&version), - garage.block_ref_table.insert(&block_ref), - )?; - Ok(()) -} - -struct StreamChunker>> { - stream: S, - read_all: bool, - block_size: usize, - buf: VecDeque, -} - -impl> + Unpin> StreamChunker { - fn new(stream: S, block_size: usize) -> Self { - Self { - stream, - read_all: false, - block_size, - buf: VecDeque::with_capacity(2 * block_size), - } - } - - async fn next(&mut self) -> Result>, Error> { - while !self.read_all && self.buf.len() < self.block_size { - if let Some(block) = self.stream.next().await { - let bytes = block?; - trace!("Body next: {} bytes", bytes.len()); - self.buf.extend(bytes); - } else { - self.read_all = true; - } - } - - if self.buf.is_empty() { - Ok(None) - } else if self.buf.len() <= self.block_size { - let block = self.buf.drain(..).collect::>(); - Ok(Some(block)) - } else { - let block = self.buf.drain(..self.block_size).collect::>(); - Ok(Some(block)) - } - } -} - -pub fn put_response(version_uuid: Uuid, md5sum_hex: String) -> Response { - Response::builder() - .header("x-amz-version-id", hex::encode(version_uuid)) - .header("ETag", format!("\"{}\"", md5sum_hex)) - .body(Body::from(vec![])) - .unwrap() -} - -pub async fn handle_create_multipart_upload( - garage: Arc, - req: &Request, - bucket_name: &str, - bucket_id: Uuid, - key: &str, -) -> Result, Error> { - let version_uuid = gen_uuid(); - 
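The upload id handed back to the client a few lines below is simply the hex encoding of this freshly generated 32-byte version UUID, which is what decode_upload_id later undoes (hex-decode plus a length check). A standalone round-trip sketch, assuming only the hex crate:

fn decode_upload_id(id: &str) -> Option<[u8; 32]> {
    let bytes = hex::decode(id).ok()?;
    <[u8; 32]>::try_from(bytes.as_slice()).ok()
}

fn main() {
    let version_uuid = [0x42u8; 32];
    let upload_id = hex::encode(version_uuid);      // what the client receives
    assert_eq!(upload_id.len(), 64);                // 32 bytes -> 64 hex characters
    assert_eq!(decode_upload_id(&upload_id), Some(version_uuid));
    assert_eq!(decode_upload_id("not-hex"), None);  // malformed ids are rejected
}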
let headers = get_headers(req.headers())?; - - // Create object in object table - let object_version = ObjectVersion { - uuid: version_uuid, - timestamp: now_msec(), - state: ObjectVersionState::Uploading(headers), - }; - let object = Object::new(bucket_id, key.to_string(), vec![object_version]); - garage.object_table.insert(&object).await?; - - // Insert empty version so that block_ref entries refer to something - // (they are inserted concurrently with blocks in the version table, so - // there is the possibility that they are inserted before the version table - // is created, in which case it is allowed to delete them, e.g. in repair_*) - let version = Version::new(version_uuid, bucket_id, key.into(), false); - garage.version_table.insert(&version).await?; - - // Send success response - let result = s3_xml::InitiateMultipartUploadResult { - xmlns: (), - bucket: s3_xml::Value(bucket_name.to_string()), - key: s3_xml::Value(key.to_string()), - upload_id: s3_xml::Value(hex::encode(version_uuid)), - }; - let xml = s3_xml::to_xml_with_header(&result)?; - - Ok(Response::new(Body::from(xml.into_bytes()))) -} - -pub async fn handle_put_part( - garage: Arc, - req: Request, - bucket_id: Uuid, - key: &str, - part_number: u64, - upload_id: &str, - content_sha256: Option, -) -> Result, Error> { - let version_uuid = decode_upload_id(upload_id)?; - - let content_md5 = match req.headers().get("content-md5") { - Some(x) => Some(x.to_str()?.to_string()), - None => None, - }; - - // Read first chuck, and at the same time try to get object to see if it exists - let key = key.to_string(); - - let body = req.into_body().map_err(Error::from); - let mut chunker = StreamChunker::new(body, garage.config.block_size); - - let (object, version, first_block) = futures::try_join!( - garage - .object_table - .get(&bucket_id, &key) - .map_err(Error::from), - garage - .version_table - .get(&version_uuid, &EmptyKey) - .map_err(Error::from), - chunker.next(), - )?; - - // Check object is valid and multipart block can be accepted - let first_block = first_block.ok_or_bad_request("Empty body")?; - let object = object.ok_or_bad_request("Object not found")?; - - if !object - .versions() - .iter() - .any(|v| v.uuid == version_uuid && v.is_uploading()) - { - return Err(Error::NoSuchUpload); - } - - // Check part hasn't already been uploaded - if let Some(v) = version { - if v.has_part_number(part_number) { - return Err(Error::BadRequest(format!( - "Part number {} has already been uploaded", - part_number - ))); - } - } - - // Copy block to store - let version = Version::new(version_uuid, bucket_id, key, false); - let first_block_hash = blake2sum(&first_block[..]); - let (_, data_md5sum, data_sha256sum) = read_and_put_blocks( - &garage, - &version, - part_number, - first_block, - first_block_hash, - &mut chunker, - ) - .await?; - - // Verify that checksums map - ensure_checksum_matches( - data_md5sum.as_slice(), - data_sha256sum, - content_md5.as_deref(), - content_sha256, - )?; - - // Store part etag in version - let data_md5sum_hex = hex::encode(data_md5sum); - let mut version = version; - version - .parts_etags - .put(part_number, data_md5sum_hex.clone()); - garage.version_table.insert(&version).await?; - - let response = Response::builder() - .header("ETag", format!("\"{}\"", data_md5sum_hex)) - .body(Body::empty()) - .unwrap(); - Ok(response) -} - -pub async fn handle_complete_multipart_upload( - garage: Arc, - req: Request, - bucket_name: &str, - bucket_id: Uuid, - key: &str, - upload_id: &str, - content_sha256: Option, -) 
-> Result, Error> { - let body = hyper::body::to_bytes(req.into_body()).await?; - - if let Some(content_sha256) = content_sha256 { - verify_signed_content(content_sha256, &body[..])?; - } - - let body_xml = roxmltree::Document::parse(std::str::from_utf8(&body)?)?; - let body_list_of_parts = parse_complete_multipart_upload_body(&body_xml) - .ok_or_bad_request("Invalid CompleteMultipartUpload XML")?; - debug!( - "CompleteMultipartUpload list of parts: {:?}", - body_list_of_parts - ); - - let version_uuid = decode_upload_id(upload_id)?; - - // Get object and version - let key = key.to_string(); - let (object, version) = futures::try_join!( - garage.object_table.get(&bucket_id, &key), - garage.version_table.get(&version_uuid, &EmptyKey), - )?; - - let object = object.ok_or(Error::NoSuchKey)?; - let mut object_version = object - .versions() - .iter() - .find(|v| v.uuid == version_uuid && v.is_uploading()) - .cloned() - .ok_or(Error::NoSuchUpload)?; - - let version = version.ok_or(Error::NoSuchKey)?; - if version.blocks.is_empty() { - return Err(Error::BadRequest("No data was uploaded".to_string())); - } - - let headers = match object_version.state { - ObjectVersionState::Uploading(headers) => headers, - _ => unreachable!(), - }; - - // Check that part numbers are an increasing sequence. - // (it doesn't need to start at 1 nor to be a continuous sequence, - // see discussion in #192) - if body_list_of_parts.is_empty() { - return Err(Error::EntityTooSmall); - } - if !body_list_of_parts - .iter() - .zip(body_list_of_parts.iter().skip(1)) - .all(|(p1, p2)| p1.part_number < p2.part_number) - { - return Err(Error::InvalidPartOrder); - } - - // Garage-specific restriction, see #204: part numbers must be - // consecutive starting at 1 - if body_list_of_parts[0].part_number != 1 - || !body_list_of_parts - .iter() - .zip(body_list_of_parts.iter().skip(1)) - .all(|(p1, p2)| p1.part_number + 1 == p2.part_number) - { - return Err(Error::NotImplemented("Garage does not support completing a Multipart upload with non-consecutive part numbers. This is a restriction of Garage's data model, which might be fixed in a future release. See issue #204 for more information on this topic.".into())); - } - - // Check that the list of parts they gave us corresponds to the parts we have here - debug!("Expected parts from request: {:?}", body_list_of_parts); - debug!("Parts stored in version: {:?}", version.parts_etags.items()); - let parts = version - .parts_etags - .items() - .iter() - .map(|pair| (&pair.0, &pair.1)); - let same_parts = body_list_of_parts - .iter() - .map(|x| (&x.part_number, &x.etag)) - .eq(parts); - if !same_parts { - return Err(Error::InvalidPart); - } - - // Check that all blocks belong to one of the parts - let block_parts = version - .blocks - .items() - .iter() - .map(|(bk, _)| bk.part_number) - .collect::>(); - let same_parts = body_list_of_parts - .iter() - .map(|x| x.part_number) - .eq(block_parts.into_iter()); - if !same_parts { - return Err(Error::BadRequest( - "Part numbers in block list and part list do not match. This can happen if a part was partially uploaded. 
Please abort the multipart upload and try again.".into(), - )); - } - - // Calculate etag of final object - // To understand how etags are calculated, read more here: - // https://teppen.io/2018/06/23/aws_s3_etags/ - let num_parts = body_list_of_parts.len(); - let mut etag_md5_hasher = Md5::new(); - for (_, etag) in version.parts_etags.items().iter() { - etag_md5_hasher.update(etag.as_bytes()); - } - let etag = format!("{}-{}", hex::encode(etag_md5_hasher.finalize()), num_parts); - - // Calculate total size of final object - let total_size = version.blocks.items().iter().map(|x| x.1.size).sum(); - - // Write final object version - object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( - ObjectVersionMeta { - headers, - size: total_size, - etag: etag.clone(), - }, - version.blocks.items()[0].1.hash, - )); - - let final_object = Object::new(bucket_id, key.clone(), vec![object_version]); - garage.object_table.insert(&final_object).await?; - - // Send response saying ok we're done - let result = s3_xml::CompleteMultipartUploadResult { - xmlns: (), - location: None, - bucket: s3_xml::Value(bucket_name.to_string()), - key: s3_xml::Value(key), - etag: s3_xml::Value(format!("\"{}\"", etag)), - }; - let xml = s3_xml::to_xml_with_header(&result)?; - - Ok(Response::new(Body::from(xml.into_bytes()))) -} - -pub async fn handle_abort_multipart_upload( - garage: Arc, - bucket_id: Uuid, - key: &str, - upload_id: &str, -) -> Result, Error> { - let version_uuid = decode_upload_id(upload_id)?; - - let object = garage - .object_table - .get(&bucket_id, &key.to_string()) - .await?; - let object = object.ok_or(Error::NoSuchKey)?; - - let object_version = object - .versions() - .iter() - .find(|v| v.uuid == version_uuid && v.is_uploading()); - let mut object_version = match object_version { - None => return Err(Error::NoSuchUpload), - Some(x) => x.clone(), - }; - - object_version.state = ObjectVersionState::Aborted; - let final_object = Object::new(bucket_id, key.to_string(), vec![object_version]); - garage.object_table.insert(&final_object).await?; - - Ok(Response::new(Body::from(vec![]))) -} - -fn get_mime_type(headers: &HeaderMap) -> Result { - Ok(headers - .get(hyper::header::CONTENT_TYPE) - .map(|x| x.to_str()) - .unwrap_or(Ok("blob"))? 
- .to_string()) -} - -pub(crate) fn get_headers(headers: &HeaderMap) -> Result { - let content_type = get_mime_type(headers)?; - let mut other = BTreeMap::new(); - - // Preserve standard headers - let standard_header = vec![ - hyper::header::CACHE_CONTROL, - hyper::header::CONTENT_DISPOSITION, - hyper::header::CONTENT_ENCODING, - hyper::header::CONTENT_LANGUAGE, - hyper::header::EXPIRES, - ]; - for h in standard_header.iter() { - if let Some(v) = headers.get(h) { - match v.to_str() { - Ok(v_str) => { - other.insert(h.to_string(), v_str.to_string()); - } - Err(e) => { - warn!("Discarding header {}, error in .to_str(): {}", h, e); - } - } - } - } - - // Preserve x-amz-meta- headers - for (k, v) in headers.iter() { - if k.as_str().starts_with("x-amz-meta-") { - match v.to_str() { - Ok(v_str) => { - other.insert(k.to_string(), v_str.to_string()); - } - Err(e) => { - warn!("Discarding header {}, error in .to_str(): {}", k, e); - } - } - } - } - - Ok(ObjectVersionHeaders { - content_type, - other, - }) -} - -pub fn decode_upload_id(id: &str) -> Result { - let id_bin = hex::decode(id).map_err(|_| Error::NoSuchUpload)?; - if id_bin.len() != 32 { - return Err(Error::NoSuchUpload); - } - let mut uuid = [0u8; 32]; - uuid.copy_from_slice(&id_bin[..]); - Ok(Uuid::from(uuid)) -} - -#[derive(Debug)] -struct CompleteMultipartUploadPart { - etag: String, - part_number: u64, -} - -fn parse_complete_multipart_upload_body( - xml: &roxmltree::Document, -) -> Option> { - let mut parts = vec![]; - - let root = xml.root(); - let cmu = root.first_child()?; - if !cmu.has_tag_name("CompleteMultipartUpload") { - return None; - } - - for item in cmu.children() { - // Only parse nodes - if !item.is_element() { - continue; - } - - if item.has_tag_name("Part") { - let etag = item.children().find(|e| e.has_tag_name("ETag"))?.text()?; - let part_number = item - .children() - .find(|e| e.has_tag_name("PartNumber"))? - .text()?; - parts.push(CompleteMultipartUploadPart { - etag: etag.trim_matches('"').to_string(), - part_number: part_number.parse().ok()?, - }); - } else { - return None; - } - } - - Some(parts) -} diff --git a/src/api/s3_router.rs b/src/api/s3_router.rs deleted file mode 100644 index 95a7eceb..00000000 --- a/src/api/s3_router.rs +++ /dev/null @@ -1,1278 +0,0 @@ -use crate::error::{Error, OkOrBadRequest}; - -use std::borrow::Cow; - -use hyper::header::HeaderValue; -use hyper::{HeaderMap, Method, Request}; - -/// This macro is used to generate very repetitive match {} blocks in this module -/// It is _not_ made to be used anywhere else -macro_rules! s3_match { - (@match $enum:expr , [ $($endpoint:ident,)* ]) => {{ - // usage: s3_match {@match my_enum, [ VariantWithField1, VariantWithField2 ..] } - // returns true if the variant was one of the listed variants, false otherwise. - use Endpoint::*; - match $enum { - $( - $endpoint { .. } => true, - )* - _ => false - } - }}; - (@extract $enum:expr , $param:ident, [ $($endpoint:ident,)* ]) => {{ - // usage: s3_match {@extract my_enum, field_name, [ VariantWithField1, VariantWithField2 ..] } - // returns Some(field_value), or None if the variant was not one of the listed variants. - use Endpoint::*; - match $enum { - $( - $endpoint {$param, ..} => Some($param), - )* - _ => None - } - }}; - (@gen_parser ($keyword:expr, $key:expr, $query:expr, $header:expr), - key: [$($kw_k:ident $(if $required_k:ident)? $(header $header_k:expr)? => $api_k:ident $(($($conv_k:ident :: $param_k:ident),*))?,)*], - no_key: [$($kw_nk:ident $(if $required_nk:ident)? 
$(if_header $header_nk:expr)? => $api_nk:ident $(($($conv_nk:ident :: $param_nk:ident),*))?,)*]) => {{ - // usage: s3_match {@gen_parser (keyword, key, query, header), - // key: [ - // SOME_KEYWORD => VariantWithKey, - // ... - // ], - // no_key: [ - // SOME_KEYWORD => VariantWithoutKey, - // ... - // ] - // } - // See in from_{method} for more detailed usage. - use Endpoint::*; - use keywords::*; - match ($keyword, !$key.is_empty()){ - $( - ($kw_k, true) if true $(&& $query.$required_k.is_some())? $(&& $header.contains_key($header_k))? => Ok($api_k { - key: $key, - $($( - $param_k: s3_match!(@@parse_param $query, $conv_k, $param_k), - )*)? - }), - )* - $( - ($kw_nk, false) $(if $query.$required_nk.is_some())? $(if $header.contains($header_nk))? => Ok($api_nk { - $($( - $param_nk: s3_match!(@@parse_param $query, $conv_nk, $param_nk), - )*)? - }), - )* - (kw, _) => Err(Error::BadRequest(format!("Invalid endpoint: {}", kw))) - } - }}; - - (@@parse_param $query:expr, query_opt, $param:ident) => {{ - // extract optional query parameter - $query.$param.take().map(|param| param.into_owned()) - }}; - (@@parse_param $query:expr, query, $param:ident) => {{ - // extract mendatory query parameter - $query.$param.take().ok_or_bad_request("Missing argument for endpoint")?.into_owned() - }}; - (@@parse_param $query:expr, opt_parse, $param:ident) => {{ - // extract and parse optional query parameter - // missing parameter is file, however parse error is reported as an error - $query.$param - .take() - .map(|param| param.parse()) - .transpose() - .map_err(|_| Error::BadRequest("Failed to parse query parameter".to_owned()))? - }}; - (@@parse_param $query:expr, parse, $param:ident) => {{ - // extract and parse mandatory query parameter - // both missing and un-parseable parameters are reported as errors - $query.$param.take().ok_or_bad_request("Missing argument for endpoint")? - .parse() - .map_err(|_| Error::BadRequest("Failed to parse query parameter".to_owned()))? - }}; - (@func - $(#[$doc:meta])* - pub enum Endpoint { - $( - $(#[$outer:meta])* - $variant:ident $({ - $($name:ident: $ty:ty,)* - })?, - )* - }) => { - $(#[$doc])* - pub enum Endpoint { - $( - $(#[$outer])* - $variant $({ - $($name: $ty, )* - })?, - )* - } - impl Endpoint { - pub fn name(&self) -> &'static str { - match self { - $(Endpoint::$variant $({ $($name: _,)* .. })? => stringify!($variant),)* - } - } - } - }; - (@if ($($cond:tt)+) then ($($then:tt)*) else ($($else:tt)*)) => { - $($then)* - }; - (@if () then ($($then:tt)*) else ($($else:tt)*)) => { - $($else)* - }; -} - -s3_match! {@func - -/// List of all S3 API endpoints. -/// -/// For each endpoint, it contains the parameters this endpoint receive by url (bucket, key and -/// query parameters). Parameters it may receive by header are left out, however headers are -/// considered when required to determine between one endpoint or another (for CopyObject and -/// UploadObject, for instance). 
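A miniature illustration of that header-based disambiguation (hand-written, not the macro's actual expansion): a PUT on a key is routed to CopyObject when the x-amz-copy-source header is present, and to a plain PutObject otherwise, the same way partNumber and uploadId select the UploadPart variants.

#[derive(Debug, PartialEq)]
enum PutEndpoint {
    CopyObject,
    PutObject,
}

fn classify_put(headers: &[(&str, &str)]) -> PutEndpoint {
    let has_copy_source = headers
        .iter()
        .any(|(name, _)| name.eq_ignore_ascii_case("x-amz-copy-source"));
    if has_copy_source {
        PutEndpoint::CopyObject
    } else {
        PutEndpoint::PutObject
    }
}

fn main() {
    assert_eq!(classify_put(&[("x-amz-copy-source", "/src/key")]), PutEndpoint::CopyObject);
    assert_eq!(classify_put(&[("content-type", "image/png")]), PutEndpoint::PutObject);
}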
-#[derive(Debug, Clone, PartialEq, Eq)] -pub enum Endpoint { - AbortMultipartUpload { - key: String, - upload_id: String, - }, - CompleteMultipartUpload { - key: String, - upload_id: String, - }, - CopyObject { - key: String, - }, - CreateBucket { - }, - CreateMultipartUpload { - key: String, - }, - DeleteBucket { - }, - DeleteBucketAnalyticsConfiguration { - id: String, - }, - DeleteBucketCors { - }, - DeleteBucketEncryption { - }, - DeleteBucketIntelligentTieringConfiguration { - id: String, - }, - DeleteBucketInventoryConfiguration { - id: String, - }, - DeleteBucketLifecycle { - }, - DeleteBucketMetricsConfiguration { - id: String, - }, - DeleteBucketOwnershipControls { - }, - DeleteBucketPolicy { - }, - DeleteBucketReplication { - }, - DeleteBucketTagging { - }, - DeleteBucketWebsite { - }, - DeleteObject { - key: String, - version_id: Option, - }, - DeleteObjects { - }, - DeleteObjectTagging { - key: String, - version_id: Option, - }, - DeletePublicAccessBlock { - }, - GetBucketAccelerateConfiguration { - }, - GetBucketAcl { - }, - GetBucketAnalyticsConfiguration { - id: String, - }, - GetBucketCors { - }, - GetBucketEncryption { - }, - GetBucketIntelligentTieringConfiguration { - id: String, - }, - GetBucketInventoryConfiguration { - id: String, - }, - GetBucketLifecycleConfiguration { - }, - GetBucketLocation { - }, - GetBucketLogging { - }, - GetBucketMetricsConfiguration { - id: String, - }, - GetBucketNotificationConfiguration { - }, - GetBucketOwnershipControls { - }, - GetBucketPolicy { - }, - GetBucketPolicyStatus { - }, - GetBucketReplication { - }, - GetBucketRequestPayment { - }, - GetBucketTagging { - }, - GetBucketVersioning { - }, - GetBucketWebsite { - }, - /// There are actually many more query parameters, used to add headers to the answer. They were - /// not added here as they are best handled in a dedicated route. - GetObject { - key: String, - part_number: Option, - version_id: Option, - }, - GetObjectAcl { - key: String, - version_id: Option, - }, - GetObjectLegalHold { - key: String, - version_id: Option, - }, - GetObjectLockConfiguration { - }, - GetObjectRetention { - key: String, - version_id: Option, - }, - GetObjectTagging { - key: String, - version_id: Option, - }, - GetObjectTorrent { - key: String, - }, - GetPublicAccessBlock { - }, - HeadBucket { - }, - HeadObject { - key: String, - part_number: Option, - version_id: Option, - }, - ListBucketAnalyticsConfigurations { - continuation_token: Option, - }, - ListBucketIntelligentTieringConfigurations { - continuation_token: Option, - }, - ListBucketInventoryConfigurations { - continuation_token: Option, - }, - ListBucketMetricsConfigurations { - continuation_token: Option, - }, - ListBuckets, - ListMultipartUploads { - delimiter: Option, - encoding_type: Option, - key_marker: Option, - max_uploads: Option, - prefix: Option, - upload_id_marker: Option, - }, - ListObjects { - delimiter: Option, - encoding_type: Option, - marker: Option, - max_keys: Option, - prefix: Option, - }, - ListObjectsV2 { - // This value should always be 2. 
It is not checked when constructing the struct - list_type: String, - continuation_token: Option, - delimiter: Option, - encoding_type: Option, - fetch_owner: Option, - max_keys: Option, - prefix: Option, - start_after: Option, - }, - ListObjectVersions { - delimiter: Option, - encoding_type: Option, - key_marker: Option, - max_keys: Option, - prefix: Option, - version_id_marker: Option, - }, - ListParts { - key: String, - max_parts: Option, - part_number_marker: Option, - upload_id: String, - }, - Options, - PutBucketAccelerateConfiguration { - }, - PutBucketAcl { - }, - PutBucketAnalyticsConfiguration { - id: String, - }, - PutBucketCors { - }, - PutBucketEncryption { - }, - PutBucketIntelligentTieringConfiguration { - id: String, - }, - PutBucketInventoryConfiguration { - id: String, - }, - PutBucketLifecycleConfiguration { - }, - PutBucketLogging { - }, - PutBucketMetricsConfiguration { - id: String, - }, - PutBucketNotificationConfiguration { - }, - PutBucketOwnershipControls { - }, - PutBucketPolicy { - }, - PutBucketReplication { - }, - PutBucketRequestPayment { - }, - PutBucketTagging { - }, - PutBucketVersioning { - }, - PutBucketWebsite { - }, - PutObject { - key: String, - }, - PutObjectAcl { - key: String, - version_id: Option, - }, - PutObjectLegalHold { - key: String, - version_id: Option, - }, - PutObjectLockConfiguration { - }, - PutObjectRetention { - key: String, - version_id: Option, - }, - PutObjectTagging { - key: String, - version_id: Option, - }, - PutPublicAccessBlock { - }, - RestoreObject { - key: String, - version_id: Option, - }, - SelectObjectContent { - key: String, - // This value should always be 2. It is not checked when constructing the struct - select_type: String, - }, - UploadPart { - key: String, - part_number: u64, - upload_id: String, - }, - UploadPartCopy { - key: String, - part_number: u64, - upload_id: String, - }, - // This endpoint is not documented with others because it has special use case : - // It's intended to be used with HTML forms, using a multipart/form-data body. - // It works a lot like presigned requests, but everything is in the form instead - // of being query parameters of the URL, so authenticating it is a bit different. - PostObject, -}} - -impl Endpoint { - /// Determine which S3 endpoint a request is for using the request, and a bucket which was - /// possibly extracted from the Host header. - /// Returns Self plus bucket name, if endpoint is not Endpoint::ListBuckets - pub fn from_request( - req: &Request, - bucket: Option, - ) -> Result<(Self, Option), Error> { - let uri = req.uri(); - let path = uri.path().trim_start_matches('/'); - let query = uri.query(); - if bucket.is_none() && path.is_empty() { - if *req.method() == Method::OPTIONS { - return Ok((Self::Options, None)); - } else { - return Ok((Self::ListBuckets, None)); - } - } - - let (bucket, key) = if let Some(bucket) = bucket { - (bucket, path) - } else { - path.split_once('/') - .map(|(b, p)| (b.to_owned(), p.trim_start_matches('/'))) - .unwrap_or((path.to_owned(), "")) - }; - - if *req.method() == Method::OPTIONS { - return Ok((Self::Options, Some(bucket))); - } - - let key = percent_encoding::percent_decode_str(key) - .decode_utf8()? 
- .into_owned(); - - let mut query = QueryParameters::from_query(query.unwrap_or_default())?; - - let res = match *req.method() { - Method::GET => Self::from_get(key, &mut query)?, - Method::HEAD => Self::from_head(key, &mut query)?, - Method::POST => Self::from_post(key, &mut query)?, - Method::PUT => Self::from_put(key, &mut query, req.headers())?, - Method::DELETE => Self::from_delete(key, &mut query)?, - _ => return Err(Error::BadRequest("Unknown method".to_owned())), - }; - - if let Some(message) = query.nonempty_message() { - debug!("Unused query parameter: {}", message) - } - Ok((res, Some(bucket))) - } - - /// Determine which endpoint a request is for, knowing it is a GET. - fn from_get(key: String, query: &mut QueryParameters<'_>) -> Result { - s3_match! { - @gen_parser - (query.keyword.take().unwrap_or_default().as_ref(), key, query, None), - key: [ - EMPTY if upload_id => ListParts (query::upload_id, opt_parse::max_parts, opt_parse::part_number_marker), - EMPTY => GetObject (query_opt::version_id, opt_parse::part_number), - ACL => GetObjectAcl (query_opt::version_id), - LEGAL_HOLD => GetObjectLegalHold (query_opt::version_id), - RETENTION => GetObjectRetention (query_opt::version_id), - TAGGING => GetObjectTagging (query_opt::version_id), - TORRENT => GetObjectTorrent, - ], - no_key: [ - EMPTY if list_type => ListObjectsV2 (query::list_type, query_opt::continuation_token, - opt_parse::delimiter, query_opt::encoding_type, - opt_parse::fetch_owner, opt_parse::max_keys, - query_opt::prefix, query_opt::start_after), - EMPTY => ListObjects (opt_parse::delimiter, query_opt::encoding_type, query_opt::marker, - opt_parse::max_keys, opt_parse::prefix), - ACCELERATE => GetBucketAccelerateConfiguration, - ACL => GetBucketAcl, - ANALYTICS if id => GetBucketAnalyticsConfiguration (query::id), - ANALYTICS => ListBucketAnalyticsConfigurations (query_opt::continuation_token), - CORS => GetBucketCors, - ENCRYPTION => GetBucketEncryption, - INTELLIGENT_TIERING if id => GetBucketIntelligentTieringConfiguration (query::id), - INTELLIGENT_TIERING => ListBucketIntelligentTieringConfigurations (query_opt::continuation_token), - INVENTORY if id => GetBucketInventoryConfiguration (query::id), - INVENTORY => ListBucketInventoryConfigurations (query_opt::continuation_token), - LIFECYCLE => GetBucketLifecycleConfiguration, - LOCATION => GetBucketLocation, - LOGGING => GetBucketLogging, - METRICS if id => GetBucketMetricsConfiguration (query::id), - METRICS => ListBucketMetricsConfigurations (query_opt::continuation_token), - NOTIFICATION => GetBucketNotificationConfiguration, - OBJECT_LOCK => GetObjectLockConfiguration, - OWNERSHIP_CONTROLS => GetBucketOwnershipControls, - POLICY => GetBucketPolicy, - POLICY_STATUS => GetBucketPolicyStatus, - PUBLIC_ACCESS_BLOCK => GetPublicAccessBlock, - REPLICATION => GetBucketReplication, - REQUEST_PAYMENT => GetBucketRequestPayment, - TAGGING => GetBucketTagging, - UPLOADS => ListMultipartUploads (opt_parse::delimiter, query_opt::encoding_type, - query_opt::key_marker, opt_parse::max_uploads, - query_opt::prefix, query_opt::upload_id_marker), - VERSIONING => GetBucketVersioning, - VERSIONS => ListObjectVersions (opt_parse::delimiter, query_opt::encoding_type, - query_opt::key_marker, opt_parse::max_keys, - query_opt::prefix, query_opt::version_id_marker), - WEBSITE => GetBucketWebsite, - ] - } - } - - /// Determine which endpoint a request is for, knowing it is a HEAD. - fn from_head(key: String, query: &mut QueryParameters<'_>) -> Result { - s3_match! 
{ - @gen_parser - (query.keyword.take().unwrap_or_default().as_ref(), key, query, None), - key: [ - EMPTY => HeadObject(opt_parse::part_number, query_opt::version_id), - ], - no_key: [ - EMPTY => HeadBucket, - ] - } - } - - /// Determine which endpoint a request is for, knowing it is a POST. - fn from_post(key: String, query: &mut QueryParameters<'_>) -> Result { - s3_match! { - @gen_parser - (query.keyword.take().unwrap_or_default().as_ref(), key, query, None), - key: [ - EMPTY if upload_id => CompleteMultipartUpload (query::upload_id), - RESTORE => RestoreObject (query_opt::version_id), - SELECT => SelectObjectContent (query::select_type), - UPLOADS => CreateMultipartUpload, - ], - no_key: [ - EMPTY => PostObject, - DELETE => DeleteObjects, - ] - } - } - - /// Determine which endpoint a request is for, knowing it is a PUT. - fn from_put( - key: String, - query: &mut QueryParameters<'_>, - headers: &HeaderMap, - ) -> Result { - s3_match! { - @gen_parser - (query.keyword.take().unwrap_or_default().as_ref(), key, query, headers), - key: [ - EMPTY if part_number header "x-amz-copy-source" => UploadPartCopy (parse::part_number, query::upload_id), - EMPTY header "x-amz-copy-source" => CopyObject, - EMPTY if part_number => UploadPart (parse::part_number, query::upload_id), - EMPTY => PutObject, - ACL => PutObjectAcl (query_opt::version_id), - LEGAL_HOLD => PutObjectLegalHold (query_opt::version_id), - RETENTION => PutObjectRetention (query_opt::version_id), - TAGGING => PutObjectTagging (query_opt::version_id), - - ], - no_key: [ - EMPTY => CreateBucket, - ACCELERATE => PutBucketAccelerateConfiguration, - ACL => PutBucketAcl, - ANALYTICS => PutBucketAnalyticsConfiguration (query::id), - CORS => PutBucketCors, - ENCRYPTION => PutBucketEncryption, - INTELLIGENT_TIERING => PutBucketIntelligentTieringConfiguration(query::id), - INVENTORY => PutBucketInventoryConfiguration(query::id), - LIFECYCLE => PutBucketLifecycleConfiguration, - LOGGING => PutBucketLogging, - METRICS => PutBucketMetricsConfiguration(query::id), - NOTIFICATION => PutBucketNotificationConfiguration, - OBJECT_LOCK => PutObjectLockConfiguration, - OWNERSHIP_CONTROLS => PutBucketOwnershipControls, - POLICY => PutBucketPolicy, - PUBLIC_ACCESS_BLOCK => PutPublicAccessBlock, - REPLICATION => PutBucketReplication, - REQUEST_PAYMENT => PutBucketRequestPayment, - TAGGING => PutBucketTagging, - VERSIONING => PutBucketVersioning, - WEBSITE => PutBucketWebsite, - ] - } - } - - /// Determine which endpoint a request is for, knowing it is a DELETE. - fn from_delete(key: String, query: &mut QueryParameters<'_>) -> Result { - s3_match! 
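(For readers unfamiliar with the s3_match! helper, the @gen_parser invocations above essentially expand to a match on the extracted query keyword, with separate arms depending on whether an object key is present. A hand-written sketch of the same idea follows; the enum and function names are illustrative, not the real Endpoint type.)

#[derive(Debug)]
enum Parsed {
    HeadBucket,
    HeadObject { key: String },
    Unsupported,
}

// `keyword` is the query parameter that carries no value (e.g. "acl", "uploads"),
// or "" when the query string has none.
fn dispatch_head(key: String, keyword: &str) -> Parsed {
    match (keyword, key.is_empty()) {
        ("", true) => Parsed::HeadBucket,
        ("", false) => Parsed::HeadObject { key },
        _ => Parsed::Unsupported,
    }
}

fn main() {
    println!("{:?}", dispatch_head(String::new(), ""));          // HeadBucket
    println!("{:?}", dispatch_head("my/key".to_owned(), ""));    // HeadObject
    println!("{:?}", dispatch_head("my/key".to_owned(), "acl")); // Unsupported
}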
{ - @gen_parser - (query.keyword.take().unwrap_or_default().as_ref(), key, query, None), - key: [ - EMPTY if upload_id => AbortMultipartUpload (query::upload_id), - EMPTY => DeleteObject (query_opt::version_id), - TAGGING => DeleteObjectTagging (query_opt::version_id), - ], - no_key: [ - EMPTY => DeleteBucket, - ANALYTICS => DeleteBucketAnalyticsConfiguration (query::id), - CORS => DeleteBucketCors, - ENCRYPTION => DeleteBucketEncryption, - INTELLIGENT_TIERING => DeleteBucketIntelligentTieringConfiguration (query::id), - INVENTORY => DeleteBucketInventoryConfiguration (query::id), - LIFECYCLE => DeleteBucketLifecycle, - METRICS => DeleteBucketMetricsConfiguration (query::id), - OWNERSHIP_CONTROLS => DeleteBucketOwnershipControls, - POLICY => DeleteBucketPolicy, - PUBLIC_ACCESS_BLOCK => DeletePublicAccessBlock, - REPLICATION => DeleteBucketReplication, - TAGGING => DeleteBucketTagging, - WEBSITE => DeleteBucketWebsite, - ] - } - } - - /// Get the key the request target. Returns None for requests which don't use a key. - #[allow(dead_code)] - pub fn get_key(&self) -> Option<&str> { - s3_match! { - @extract - self, - key, - [ - AbortMultipartUpload, - CompleteMultipartUpload, - CopyObject, - CreateMultipartUpload, - DeleteObject, - DeleteObjectTagging, - GetObject, - GetObjectAcl, - GetObjectLegalHold, - GetObjectRetention, - GetObjectTagging, - GetObjectTorrent, - HeadObject, - ListParts, - PutObject, - PutObjectAcl, - PutObjectLegalHold, - PutObjectRetention, - PutObjectTagging, - RestoreObject, - SelectObjectContent, - UploadPart, - UploadPartCopy, - ] - } - } - - /// Get the kind of authorization which is required to perform the operation. - pub fn authorization_type(&self) -> Authorization { - if let Endpoint::ListBuckets = self { - return Authorization::None; - }; - let readonly = s3_match! { - @match - self, - [ - GetBucketAccelerateConfiguration, - GetBucketAcl, - GetBucketAnalyticsConfiguration, - GetBucketEncryption, - GetBucketIntelligentTieringConfiguration, - GetBucketInventoryConfiguration, - GetBucketLifecycleConfiguration, - GetBucketLocation, - GetBucketLogging, - GetBucketMetricsConfiguration, - GetBucketNotificationConfiguration, - GetBucketOwnershipControls, - GetBucketPolicy, - GetBucketPolicyStatus, - GetBucketReplication, - GetBucketRequestPayment, - GetBucketTagging, - GetBucketVersioning, - GetObject, - GetObjectAcl, - GetObjectLegalHold, - GetObjectLockConfiguration, - GetObjectRetention, - GetObjectTagging, - GetObjectTorrent, - GetPublicAccessBlock, - HeadBucket, - HeadObject, - ListBucketAnalyticsConfigurations, - ListBucketIntelligentTieringConfigurations, - ListBucketInventoryConfigurations, - ListBucketMetricsConfigurations, - ListMultipartUploads, - ListObjects, - ListObjectsV2, - ListObjectVersions, - ListParts, - SelectObjectContent, - ] - }; - let owner = s3_match! { - @match - self, - [ - DeleteBucket, - GetBucketWebsite, - PutBucketWebsite, - DeleteBucketWebsite, - GetBucketCors, - PutBucketCors, - DeleteBucketCors, - ] - }; - if readonly { - Authorization::Read - } else if owner { - Authorization::Owner - } else { - Authorization::Write - } - } -} - -/// What kind of authorization is required to perform a given action -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum Authorization { - /// No authorization is required - None, - /// Having Read permission on bucket - Read, - /// Having Write permission on bucket - Write, - /// Having Owner permission on bucket - Owner, -} - -/// This macro is used to generate part of the code in this module. 
It must be called only once, and -/// is useless outside of this module. -macro_rules! generateQueryParameters { - ( $($rest:expr => $name:ident),* ) => { - /// Struct containing all query parameters used in endpoints. Think of it as a HashMap, - /// but with keys statically known. - #[derive(Debug, Default)] - struct QueryParameters<'a> { - keyword: Option<Cow<'a, str>>, - $( - $name: Option<Cow<'a, str>>, - )* - } - - impl<'a> QueryParameters<'a> { - /// Build this struct from the query part of a URI. - fn from_query(query: &'a str) -> Result<Self, Error> { - let mut res: Self = Default::default(); - for (k, v) in url::form_urlencoded::parse(query.as_bytes()) { - let repeated = match k.as_ref() { - $( - $rest => if !v.is_empty() { - res.$name.replace(v).is_some() - } else { - false - }, - )* - _ => { - if k.starts_with("response-") || k.starts_with("X-Amz-") { - false - } else if v.as_ref().is_empty() { - if res.keyword.replace(k).is_some() { - return Err(Error::BadRequest("Multiple keywords".to_owned())); - } - continue; - } else { - debug!("Received an unknown query parameter: '{}'", k); - false - } - } - }; - if repeated { - return Err(Error::BadRequest(format!( - "Query parameter repeated: '{}'", - k - ))); - } - } - Ok(res) - } - - /// Get an error message in case not all parameters were used when extracting them to - /// build an Endpoint variant - fn nonempty_message(&self) -> Option<&str> { - if self.keyword.is_some() { - Some("Keyword not used") - } $( - else if self.$name.is_some() { - Some(concat!("'", $rest, "'")) - } - )* else { - None - } - } - } - } -} - -// parameter name => struct field -generateQueryParameters! { - "continuation-token" => continuation_token, - "delimiter" => delimiter, - "encoding-type" => encoding_type, - "fetch-owner" => fetch_owner, - "id" => id, - "key-marker" => key_marker, - "list-type" => list_type, - "marker" => marker, - "max-keys" => max_keys, - "max-parts" => max_parts, - "max-uploads" => max_uploads, - "partNumber" => part_number, - "part-number-marker" => part_number_marker, - "prefix" => prefix, - "select-type" => select_type, - "start-after" => start_after, - "uploadId" => upload_id, - "upload-id-marker" => upload_id_marker, - "versionId" => version_id, - "version-id-marker" => version_id_marker -} - -mod keywords { - //! This module contains all query parameters with no associated value that S3 uses to differentiate - //! endpoints.
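(A minimal standalone illustration of the query-string handling implemented by from_query above; it uses the url crate as the original does, but the return type here is a simplified stand-in rather than the generated QueryParameters struct, and it keeps only the first valueless keyword instead of rejecting duplicates.)

use std::collections::HashMap;

// Valueless parameters ("?acl", "?uploads", ...) become the keyword that selects the
// endpoint; "key=value" pairs are collected separately.
fn parse_query(query: &str) -> (Option<String>, HashMap<String, String>) {
    let mut keyword = None;
    let mut params = HashMap::new();
    for (k, v) in url::form_urlencoded::parse(query.as_bytes()) {
        if v.is_empty() {
            keyword.get_or_insert(k.into_owned());
        } else {
            params.insert(k.into_owned(), v.into_owned());
        }
    }
    (keyword, params)
}

fn main() {
    let (kw, params) = parse_query("versions&prefix=photos/2006/&delimiter=/");
    assert_eq!(kw.as_deref(), Some("versions"));
    assert_eq!(params.get("delimiter").map(String::as_str), Some("/"));
}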
- pub const EMPTY: &str = ""; - - pub const ACCELERATE: &str = "accelerate"; - pub const ACL: &str = "acl"; - pub const ANALYTICS: &str = "analytics"; - pub const CORS: &str = "cors"; - pub const DELETE: &str = "delete"; - pub const ENCRYPTION: &str = "encryption"; - pub const INTELLIGENT_TIERING: &str = "intelligent-tiering"; - pub const INVENTORY: &str = "inventory"; - pub const LEGAL_HOLD: &str = "legal-hold"; - pub const LIFECYCLE: &str = "lifecycle"; - pub const LOCATION: &str = "location"; - pub const LOGGING: &str = "logging"; - pub const METRICS: &str = "metrics"; - pub const NOTIFICATION: &str = "notification"; - pub const OBJECT_LOCK: &str = "object-lock"; - pub const OWNERSHIP_CONTROLS: &str = "ownershipControls"; - pub const POLICY: &str = "policy"; - pub const POLICY_STATUS: &str = "policyStatus"; - pub const PUBLIC_ACCESS_BLOCK: &str = "publicAccessBlock"; - pub const REPLICATION: &str = "replication"; - pub const REQUEST_PAYMENT: &str = "requestPayment"; - pub const RESTORE: &str = "restore"; - pub const RETENTION: &str = "retention"; - pub const SELECT: &str = "select"; - pub const TAGGING: &str = "tagging"; - pub const TORRENT: &str = "torrent"; - pub const UPLOADS: &str = "uploads"; - pub const VERSIONING: &str = "versioning"; - pub const VERSIONS: &str = "versions"; - pub const WEBSITE: &str = "website"; -} - -#[cfg(test)] -mod tests { - use super::*; - - fn parse( - method: &str, - uri: &str, - bucket: Option, - header: Option<(&str, &str)>, - ) -> (Endpoint, Option) { - let mut req = Request::builder().method(method).uri(uri); - if let Some((k, v)) = header { - req = req.header(k, v) - } - let req = req.body(()).unwrap(); - - Endpoint::from_request(&req, bucket).unwrap() - } - - macro_rules! test_cases { - ($($method:ident $uri:expr => $variant:ident )*) => {{ - $( - assert!( - matches!( - parse(test_cases!{@actual_method $method}, $uri, Some("my_bucket".to_owned()), None).0, - Endpoint::$variant { .. } - ) - ); - assert!( - matches!( - parse(test_cases!{@actual_method $method}, concat!("/my_bucket", $uri), None, None).0, - Endpoint::$variant { .. 
} - ) - ); - - test_cases!{@auth $method $uri} - )* - }}; - - (@actual_method HEAD) => {{ "HEAD" }}; - (@actual_method GET) => {{ "GET" }}; - (@actual_method OWNER_GET) => {{ "GET" }}; - (@actual_method PUT) => {{ "PUT" }}; - (@actual_method OWNER_PUT) => {{ "PUT" }}; - (@actual_method POST) => {{ "POST" }}; - (@actual_method DELETE) => {{ "DELETE" }}; - (@actual_method OWNER_DELETE) => {{ "DELETE" }}; - - (@auth HEAD $uri:expr) => {{ - assert_eq!(parse("HEAD", concat!("/my_bucket", $uri), None, None).0.authorization_type(), - Authorization::Read) - }}; - (@auth GET $uri:expr) => {{ - assert_eq!(parse("GET", concat!("/my_bucket", $uri), None, None).0.authorization_type(), - Authorization::Read) - }}; - (@auth OWNER_GET $uri:expr) => {{ - assert_eq!(parse("GET", concat!("/my_bucket", $uri), None, None).0.authorization_type(), - Authorization::Owner) - }}; - (@auth PUT $uri:expr) => {{ - assert_eq!(parse("PUT", concat!("/my_bucket", $uri), None, None).0.authorization_type(), - Authorization::Write) - }}; - (@auth OWNER_PUT $uri:expr) => {{ - assert_eq!(parse("PUT", concat!("/my_bucket", $uri), None, None).0.authorization_type(), - Authorization::Owner) - }}; - (@auth POST $uri:expr) => {{ - assert_eq!(parse("POST", concat!("/my_bucket", $uri), None, None).0.authorization_type(), - Authorization::Write) - }}; - (@auth DELETE $uri:expr) => {{ - assert_eq!(parse("DELETE", concat!("/my_bucket", $uri), None, None).0.authorization_type(), - Authorization::Write) - }}; - (@auth OWNER_DELETE $uri:expr) => {{ - assert_eq!(parse("DELETE", concat!("/my_bucket", $uri), None, None).0.authorization_type(), - Authorization::Owner) - }}; - } - - #[test] - fn test_bucket_extraction() { - assert_eq!( - parse("GET", "/my/key", Some("my_bucket".to_owned()), None).1, - parse("GET", "/my_bucket/my/key", None, None).1 - ); - assert_eq!( - parse("GET", "/my_bucket/my/key", None, None).1.unwrap(), - "my_bucket" - ); - assert!(parse("GET", "/", None, None).1.is_none()); - } - - #[test] - fn test_key() { - assert_eq!( - parse("GET", "/my/key", Some("my_bucket".to_owned()), None) - .0 - .get_key(), - parse("GET", "/my_bucket/my/key", None, None).0.get_key() - ); - assert_eq!( - parse("GET", "/my_bucket/my/key", None, None) - .0 - .get_key() - .unwrap(), - "my/key" - ); - assert_eq!( - parse("GET", "/my_bucket/my/key?acl", None, None) - .0 - .get_key() - .unwrap(), - "my/key" - ); - assert!(parse("GET", "/my_bucket/?list-type=2", None, None) - .0 - .get_key() - .is_none()); - - assert_eq!( - parse("GET", "/my_bucket/%26%2B%3F%25%C3%A9/something", None, None) - .0 - .get_key() - .unwrap(), - "&+?%é/something" - ); - - /* - * this case is failing. 
We should verify how clients encode space in url - assert_eq!( - parse("GET", "/my_bucket/+", None, None).get_key().unwrap(), - " "); - */ - } - - #[test] - fn invalid_endpoint() { - let req = Request::builder() - .method("GET") - .uri("/bucket/key?website") - .body(()) - .unwrap(); - - assert!(Endpoint::from_request(&req, None).is_err()) - } - - #[test] - fn test_aws_doc_examples() { - test_cases!( - DELETE "/example-object?uploadId=VXBsb2FkIElEIGZvciBlbHZpbmcncyBteS1tb3ZpZS5tMnRzIHVwbG9hZ" => AbortMultipartUpload - DELETE "/Key+?uploadId=UploadId" => AbortMultipartUpload - POST "/example-object?uploadId=AAAsb2FkIElEIGZvciBlbHZpbmcncyWeeS1tb3ZpZS5tMnRzIRRwbG9hZA" => CompleteMultipartUpload - POST "/Key+?uploadId=UploadId" => CompleteMultipartUpload - PUT "/" => CreateBucket - POST "/example-object?uploads" => CreateMultipartUpload - POST "/{Key+}?uploads" => CreateMultipartUpload - OWNER_DELETE "/" => DeleteBucket - DELETE "/?analytics&id=list1" => DeleteBucketAnalyticsConfiguration - DELETE "/?analytics&id=Id" => DeleteBucketAnalyticsConfiguration - OWNER_DELETE "/?cors" => DeleteBucketCors - DELETE "/?encryption" => DeleteBucketEncryption - DELETE "/?intelligent-tiering&id=Id" => DeleteBucketIntelligentTieringConfiguration - DELETE "/?inventory&id=list1" => DeleteBucketInventoryConfiguration - DELETE "/?inventory&id=Id" => DeleteBucketInventoryConfiguration - DELETE "/?lifecycle" => DeleteBucketLifecycle - DELETE "/?metrics&id=ExampleMetrics" => DeleteBucketMetricsConfiguration - DELETE "/?metrics&id=Id" => DeleteBucketMetricsConfiguration - DELETE "/?ownershipControls" => DeleteBucketOwnershipControls - DELETE "/?policy" => DeleteBucketPolicy - DELETE "/?replication" => DeleteBucketReplication - DELETE "/?tagging" => DeleteBucketTagging - OWNER_DELETE "/?website" => DeleteBucketWebsite - DELETE "/my-second-image.jpg" => DeleteObject - DELETE "/my-third-image.jpg?versionId=UIORUnfndfiufdisojhr398493jfdkjFJjkndnqUifhnw89493jJFJ" => DeleteObject - DELETE "/Key+?versionId=VersionId" => DeleteObject - POST "/?delete" => DeleteObjects - DELETE "/exampleobject?tagging" => DeleteObjectTagging - DELETE "/{Key+}?tagging&versionId=VersionId" => DeleteObjectTagging - DELETE "/?publicAccessBlock" => DeletePublicAccessBlock - GET "/?accelerate" => GetBucketAccelerateConfiguration - GET "/?acl" => GetBucketAcl - GET "/?analytics&id=Id" => GetBucketAnalyticsConfiguration - OWNER_GET "/?cors" => GetBucketCors - GET "/?encryption" => GetBucketEncryption - GET "/?intelligent-tiering&id=Id" => GetBucketIntelligentTieringConfiguration - GET "/?inventory&id=list1" => GetBucketInventoryConfiguration - GET "/?inventory&id=Id" => GetBucketInventoryConfiguration - GET "/?lifecycle" => GetBucketLifecycleConfiguration - GET "/?location" => GetBucketLocation - GET "/?logging" => GetBucketLogging - GET "/?metrics&id=Documents" => GetBucketMetricsConfiguration - GET "/?metrics&id=Id" => GetBucketMetricsConfiguration - GET "/?notification" => GetBucketNotificationConfiguration - GET "/?ownershipControls" => GetBucketOwnershipControls - GET "/?policy" => GetBucketPolicy - GET "/?policyStatus" => GetBucketPolicyStatus - GET "/?replication" => GetBucketReplication - GET "/?requestPayment" => GetBucketRequestPayment - GET "/?tagging" => GetBucketTagging - GET "/?versioning" => GetBucketVersioning - OWNER_GET "/?website" => GetBucketWebsite - GET "/my-image.jpg" => GetObject - GET "/myObject?versionId=3/L4kqtJlcpXroDTDmpUMLUo" => GetObject - GET 
"/Junk3.txt?response-cache-control=No-cache&response-content-disposition=attachment%3B%20filename%3Dtesting.txt&response-content-encoding=x-gzip&response-content-language=mi%2C%20en&response-expires=Thu%2C%2001%20Dec%201994%2016:00:00%20GMT" => GetObject - GET "/Key+?partNumber=1&response-cache-control=ResponseCacheControl&response-content-disposition=ResponseContentDisposition&response-content-encoding=ResponseContentEncoding&response-content-language=ResponseContentLanguage&response-content-type=ResponseContentType&response-expires=ResponseExpires&versionId=VersionId" => GetObject - GET "/my-image.jpg?acl" => GetObjectAcl - GET "/my-image.jpg?versionId=3/L4kqtJlcpXroDVBH40Nr8X8gdRQBpUMLUo&acl" => GetObjectAcl - GET "/{Key+}?acl&versionId=VersionId" => GetObjectAcl - GET "/{Key+}?legal-hold&versionId=VersionId" => GetObjectLegalHold - GET "/?object-lock" => GetObjectLockConfiguration - GET "/{Key+}?retention&versionId=VersionId" => GetObjectRetention - GET "/example-object?tagging" => GetObjectTagging - GET "/{Key+}?tagging&versionId=VersionId" => GetObjectTagging - GET "/quotes/Nelson?torrent" => GetObjectTorrent - GET "/{Key+}?torrent" => GetObjectTorrent - GET "/?publicAccessBlock" => GetPublicAccessBlock - HEAD "/" => HeadBucket - HEAD "/my-image.jpg" => HeadObject - HEAD "/my-image.jpg?versionId=3HL4kqCxf3vjVBH40Nrjfkd" => HeadObject - HEAD "/Key+?partNumber=3&versionId=VersionId" => HeadObject - GET "/?analytics" => ListBucketAnalyticsConfigurations - GET "/?analytics&continuation-token=ContinuationToken" => ListBucketAnalyticsConfigurations - GET "/?intelligent-tiering" => ListBucketIntelligentTieringConfigurations - GET "/?intelligent-tiering&continuation-token=ContinuationToken" => ListBucketIntelligentTieringConfigurations - GET "/?inventory" => ListBucketInventoryConfigurations - GET "/?inventory&continuation-token=ContinuationToken" => ListBucketInventoryConfigurations - GET "/?metrics" => ListBucketMetricsConfigurations - GET "/?metrics&continuation-token=ContinuationToken" => ListBucketMetricsConfigurations - GET "/?uploads&max-uploads=3" => ListMultipartUploads - GET "/?uploads&delimiter=/" => ListMultipartUploads - GET "/?uploads&delimiter=/&prefix=photos/2006/" => ListMultipartUploads - GET "/?uploads&delimiter=D&encoding-type=EncodingType&key-marker=KeyMarker&max-uploads=1&prefix=Prefix&upload-id-marker=UploadIdMarker" => ListMultipartUploads - GET "/" => ListObjects - GET "/?prefix=N&marker=Ned&max-keys=40" => ListObjects - GET "/?delimiter=/" => ListObjects - GET "/?prefix=photos/2006/&delimiter=/" => ListObjects - - GET "/?delimiter=D&encoding-type=EncodingType&marker=Marker&max-keys=1&prefix=Prefix" => ListObjects - GET "/?list-type=2" => ListObjectsV2 - GET "/?list-type=2&max-keys=3&prefix=E&start-after=ExampleGuide.pdf" => ListObjectsV2 - GET "/?list-type=2&delimiter=/" => ListObjectsV2 - GET "/?list-type=2&prefix=photos/2006/&delimiter=/" => ListObjectsV2 - GET "/?list-type=2" => ListObjectsV2 - GET "/?list-type=2&continuation-token=1ueGcxLPRx1Tr/XYExHnhbYLgveDs2J/wm36Hy4vbOwM=" => ListObjectsV2 - GET "/?list-type=2&continuation-token=ContinuationToken&delimiter=D&encoding-type=EncodingType&fetch-owner=true&max-keys=1&prefix=Prefix&start-after=StartAfter" => ListObjectsV2 - GET "/?versions" => ListObjectVersions - GET "/?versions&key-marker=key2" => ListObjectVersions - GET "/?versions&key-marker=key3&version-id-marker=t46ZenlYTZBnj" => ListObjectVersions - GET "/?versions&key-marker=key3&version-id-marker=t46Z0menlYTZBnj&max-keys=3" => ListObjectVersions - GET 
"/?versions&delimiter=/" => ListObjectVersions - GET "/?versions&prefix=photos/2006/&delimiter=/" => ListObjectVersions - GET "/?versions&delimiter=D&encoding-type=EncodingType&key-marker=KeyMarker&max-keys=2&prefix=Prefix&version-id-marker=VersionIdMarker" => ListObjectVersions - GET "/example-object?uploadId=XXBsb2FkIElEIGZvciBlbHZpbmcncyVcdS1tb3ZpZS5tMnRzEEEwbG9hZA&max-parts=2&part-number-marker=1" => ListParts - GET "/Key+?max-parts=2&part-number-marker=2&uploadId=UploadId" => ListParts - PUT "/?accelerate" => PutBucketAccelerateConfiguration - PUT "/?acl" => PutBucketAcl - PUT "/?analytics&id=report1" => PutBucketAnalyticsConfiguration - PUT "/?analytics&id=Id" => PutBucketAnalyticsConfiguration - OWNER_PUT "/?cors" => PutBucketCors - PUT "/?encryption" => PutBucketEncryption - PUT "/?intelligent-tiering&id=Id" => PutBucketIntelligentTieringConfiguration - PUT "/?inventory&id=report1" => PutBucketInventoryConfiguration - PUT "/?inventory&id=Id" => PutBucketInventoryConfiguration - PUT "/?lifecycle" => PutBucketLifecycleConfiguration - PUT "/?logging" => PutBucketLogging - PUT "/?metrics&id=EntireBucket" => PutBucketMetricsConfiguration - PUT "/?metrics&id=Id" => PutBucketMetricsConfiguration - PUT "/?notification" => PutBucketNotificationConfiguration - PUT "/?ownershipControls" => PutBucketOwnershipControls - PUT "/?policy" => PutBucketPolicy - PUT "/?replication" => PutBucketReplication - PUT "/?requestPayment" => PutBucketRequestPayment - PUT "/?tagging" => PutBucketTagging - PUT "/?versioning" => PutBucketVersioning - OWNER_PUT "/?website" => PutBucketWebsite - PUT "/my-image.jpg" => PutObject - PUT "/Key+" => PutObject - PUT "/my-image.jpg?acl" => PutObjectAcl - PUT "/my-image.jpg?acl&versionId=3HL4kqtJlcpXroDTDmJ+rmSpXd3dIbrHY+MTRCxf3vjVBH40Nrjfkd" => PutObjectAcl - PUT "/{Key+}?acl&versionId=VersionId" => PutObjectAcl - PUT "/{Key+}?legal-hold&versionId=VersionId" => PutObjectLegalHold - PUT "/?object-lock" => PutObjectLockConfiguration - PUT "/{Key+}?retention&versionId=VersionId" => PutObjectRetention - PUT "/object-key?tagging" => PutObjectTagging - PUT "/{Key+}?tagging&versionId=VersionId" => PutObjectTagging - PUT "/?publicAccessBlock" => PutPublicAccessBlock - POST "/object-one.csv?restore" => RestoreObject - POST "/{Key+}?restore&versionId=VersionId" => RestoreObject - PUT "/my-movie.m2ts?partNumber=1&uploadId=VCVsb2FkIElEIGZvciBlbZZpbmcncyBteS1tb3ZpZS5tMnRzIHVwbG9hZR" => UploadPart - PUT "/Key+?partNumber=2&uploadId=UploadId" => UploadPart - POST "/" => PostObject - ); - // no bucket, won't work with the rest of the test suite - assert!(matches!( - parse("GET", "/", None, None).0, - Endpoint::ListBuckets { .. } - )); - assert!(matches!( - parse("GET", "/", None, None).0.authorization_type(), - Authorization::None - )); - - // require a header - assert!(matches!( - parse( - "PUT", - "/Key+", - Some("my_bucket".to_owned()), - Some(("x-amz-copy-source", "some/key")) - ) - .0, - Endpoint::CopyObject { .. } - )); - assert!(matches!( - parse( - "PUT", - "/my_bucket/Key+", - None, - Some(("x-amz-copy-source", "some/key")) - ) - .0, - Endpoint::CopyObject { .. } - )); - assert!(matches!( - parse( - "PUT", - "/my_bucket/Key+", - None, - Some(("x-amz-copy-source", "some/key")) - ) - .0 - .authorization_type(), - Authorization::Write - )); - - // require a header - assert!(matches!( - parse( - "PUT", - "/Key+?partNumber=2&uploadId=UploadId", - Some("my_bucket".to_owned()), - Some(("x-amz-copy-source", "some/key")) - ) - .0, - Endpoint::UploadPartCopy { .. 
} - )); - assert!(matches!( - parse( - "PUT", - "/my_bucket/Key+?partNumber=2&uploadId=UploadId", - None, - Some(("x-amz-copy-source", "some/key")) - ) - .0, - Endpoint::UploadPartCopy { .. } - )); - assert!(matches!( - parse( - "PUT", - "/my_bucket/Key+?partNumber=2&uploadId=UploadId", - None, - Some(("x-amz-copy-source", "some/key")) - ) - .0 - .authorization_type(), - Authorization::Write - )); - - // POST request, but with GET semantic for permissions purpose - assert!(matches!( - parse( - "POST", - "/{Key+}?select&select-type=2", - Some("my_bucket".to_owned()), - None - ) - .0, - Endpoint::SelectObjectContent { .. } - )); - assert!(matches!( - parse("POST", "/my_bucket/{Key+}?select&select-type=2", None, None).0, - Endpoint::SelectObjectContent { .. } - )); - assert!(matches!( - parse("POST", "/my_bucket/{Key+}?select&select-type=2", None, None) - .0 - .authorization_type(), - Authorization::Read - )); - } -} diff --git a/src/api/s3_website.rs b/src/api/s3_website.rs deleted file mode 100644 index b464dd45..00000000 --- a/src/api/s3_website.rs +++ /dev/null @@ -1,369 +0,0 @@ -use quick_xml::de::from_reader; -use std::sync::Arc; - -use hyper::{Body, Request, Response, StatusCode}; -use serde::{Deserialize, Serialize}; - -use crate::error::*; -use crate::s3_xml::{to_xml_with_header, xmlns_tag, IntValue, Value}; -use crate::signature::verify_signed_content; - -use garage_model::bucket_table::*; -use garage_model::garage::Garage; -use garage_table::*; -use garage_util::data::*; - -pub async fn handle_get_website(bucket: &Bucket) -> Result, Error> { - let param = bucket - .params() - .ok_or_internal_error("Bucket should not be deleted at this point")?; - - if let Some(website) = param.website_config.get() { - let wc = WebsiteConfiguration { - xmlns: (), - error_document: website.error_document.as_ref().map(|v| Key { - key: Value(v.to_string()), - }), - index_document: Some(Suffix { - suffix: Value(website.index_document.to_string()), - }), - redirect_all_requests_to: None, - routing_rules: None, - }; - let xml = to_xml_with_header(&wc)?; - Ok(Response::builder() - .status(StatusCode::OK) - .header(http::header::CONTENT_TYPE, "application/xml") - .body(Body::from(xml))?) - } else { - Ok(Response::builder() - .status(StatusCode::NO_CONTENT) - .body(Body::empty())?) - } -} - -pub async fn handle_delete_website( - garage: Arc, - bucket_id: Uuid, -) -> Result, Error> { - let mut bucket = garage - .bucket_table - .get(&EmptyKey, &bucket_id) - .await? - .ok_or(Error::NoSuchBucket)?; - - let param = bucket - .params_mut() - .ok_or_internal_error("Bucket should not be deleted at this point")?; - - param.website_config.update(None); - garage.bucket_table.insert(&bucket).await?; - - Ok(Response::builder() - .status(StatusCode::NO_CONTENT) - .body(Body::empty())?) -} - -pub async fn handle_put_website( - garage: Arc, - bucket_id: Uuid, - req: Request, - content_sha256: Option, -) -> Result, Error> { - let body = hyper::body::to_bytes(req.into_body()).await?; - - if let Some(content_sha256) = content_sha256 { - verify_signed_content(content_sha256, &body[..])?; - } - - let mut bucket = garage - .bucket_table - .get(&EmptyKey, &bucket_id) - .await? 
- .ok_or(Error::NoSuchBucket)?; - - let param = bucket - .params_mut() - .ok_or_internal_error("Bucket should not be deleted at this point")?; - - let conf: WebsiteConfiguration = from_reader(&body as &[u8])?; - conf.validate()?; - - param - .website_config - .update(Some(conf.into_garage_website_config()?)); - garage.bucket_table.insert(&bucket).await?; - - Ok(Response::builder() - .status(StatusCode::OK) - .body(Body::empty())?) -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct WebsiteConfiguration { - #[serde(serialize_with = "xmlns_tag", skip_deserializing)] - pub xmlns: (), - #[serde(rename = "ErrorDocument")] - pub error_document: Option, - #[serde(rename = "IndexDocument")] - pub index_document: Option, - #[serde(rename = "RedirectAllRequestsTo")] - pub redirect_all_requests_to: Option, - #[serde(rename = "RoutingRules")] - pub routing_rules: Option>, -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct RoutingRule { - #[serde(rename = "RoutingRule")] - pub inner: RoutingRuleInner, -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct RoutingRuleInner { - #[serde(rename = "Condition")] - pub condition: Option, - #[serde(rename = "Redirect")] - pub redirect: Redirect, -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct Key { - #[serde(rename = "Key")] - pub key: Value, -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct Suffix { - #[serde(rename = "Suffix")] - pub suffix: Value, -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct Target { - #[serde(rename = "HostName")] - pub hostname: Value, - #[serde(rename = "Protocol")] - pub protocol: Option, -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct Condition { - #[serde(rename = "HttpErrorCodeReturnedEquals")] - pub http_error_code: Option, - #[serde(rename = "KeyPrefixEquals")] - pub prefix: Option, -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct Redirect { - #[serde(rename = "HostName")] - pub hostname: Option, - #[serde(rename = "Protocol")] - pub protocol: Option, - #[serde(rename = "HttpRedirectCode")] - pub http_redirect_code: Option, - #[serde(rename = "ReplaceKeyPrefixWith")] - pub replace_prefix: Option, - #[serde(rename = "ReplaceKeyWith")] - pub replace_full: Option, -} - -impl WebsiteConfiguration { - pub fn validate(&self) -> Result<(), Error> { - if self.redirect_all_requests_to.is_some() - && (self.error_document.is_some() - || self.index_document.is_some() - || self.routing_rules.is_some()) - { - return Err(Error::BadRequest( - "Bad XML: can't have RedirectAllRequestsTo and other fields".to_owned(), - )); - } - if let Some(ref ed) = self.error_document { - ed.validate()?; - } - if let Some(ref id) = self.index_document { - id.validate()?; - } - if let Some(ref rart) = self.redirect_all_requests_to { - rart.validate()?; - } - if let Some(ref rrs) = self.routing_rules { - for rr in rrs { - rr.inner.validate()?; - } - } - - Ok(()) - } - - pub fn into_garage_website_config(self) -> Result { - if self.redirect_all_requests_to.is_some() { - Err(Error::NotImplemented( - "S3 website redirects are not currently implemented in Garage.".into(), - )) - } else if self.routing_rules.map(|x| !x.is_empty()).unwrap_or(false) { - Err(Error::NotImplemented( - "S3 routing rules are not currently implemented in Garage.".into(), - 
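(As context for handle_put_website above, here is a minimal sketch of deserializing a website configuration with quick-xml's serde support; it assumes quick-xml with the "serialize" feature, and the structs are cut-down stand-ins for WebsiteConfiguration, not the real types.)

use quick_xml::de::from_str;
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct SuffixDoc {
    #[serde(rename = "Suffix")]
    suffix: String,
}

#[derive(Debug, Deserialize)]
struct ErrorDoc {
    #[serde(rename = "Key")]
    key: String,
}

#[derive(Debug, Deserialize)]
struct WebsiteConf {
    #[serde(rename = "IndexDocument")]
    index_document: Option<SuffixDoc>,
    #[serde(rename = "ErrorDocument")]
    error_document: Option<ErrorDoc>,
}

fn main() {
    // Child elements map to struct fields via the serde rename attributes.
    let xml = "<WebsiteConfiguration>\
                 <IndexDocument><Suffix>index.html</Suffix></IndexDocument>\
               </WebsiteConfiguration>";
    let conf: WebsiteConf = from_str(xml).unwrap();
    println!("{:?}", conf);
}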
)) - } else { - Ok(WebsiteConfig { - index_document: self - .index_document - .map(|x| x.suffix.0) - .unwrap_or_else(|| "index.html".to_string()), - error_document: self.error_document.map(|x| x.key.0), - }) - } - } -} - -impl Key { - pub fn validate(&self) -> Result<(), Error> { - if self.key.0.is_empty() { - Err(Error::BadRequest( - "Bad XML: error document specified but empty".to_owned(), - )) - } else { - Ok(()) - } - } -} - -impl Suffix { - pub fn validate(&self) -> Result<(), Error> { - if self.suffix.0.is_empty() | self.suffix.0.contains('/') { - Err(Error::BadRequest( - "Bad XML: index document is empty or contains /".to_owned(), - )) - } else { - Ok(()) - } - } -} - -impl Target { - pub fn validate(&self) -> Result<(), Error> { - if let Some(ref protocol) = self.protocol { - if protocol.0 != "http" && protocol.0 != "https" { - return Err(Error::BadRequest("Bad XML: invalid protocol".to_owned())); - } - } - Ok(()) - } -} - -impl RoutingRuleInner { - pub fn validate(&self) -> Result<(), Error> { - let has_prefix = self - .condition - .as_ref() - .and_then(|c| c.prefix.as_ref()) - .is_some(); - self.redirect.validate(has_prefix) - } -} - -impl Redirect { - pub fn validate(&self, has_prefix: bool) -> Result<(), Error> { - if self.replace_prefix.is_some() { - if self.replace_full.is_some() { - return Err(Error::BadRequest( - "Bad XML: both ReplaceKeyPrefixWith and ReplaceKeyWith are set".to_owned(), - )); - } - if !has_prefix { - return Err(Error::BadRequest( - "Bad XML: ReplaceKeyPrefixWith is set, but KeyPrefixEquals isn't".to_owned(), - )); - } - } - if let Some(ref protocol) = self.protocol { - if protocol.0 != "http" && protocol.0 != "https" { - return Err(Error::BadRequest("Bad XML: invalid protocol".to_owned())); - } - } - // TODO there are probably more invalide cases, but which ones? - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use quick_xml::de::from_str; - - #[test] - fn test_deserialize() -> Result<(), Error> { - let message = r#" - - - my-error-doc - - - my-index - - - garage.tld - https - - - - - 404 - prefix1 - - - gara.ge - http - 303 - prefix2 - fullkey - - - -"#; - let conf: WebsiteConfiguration = from_str(message).unwrap(); - let ref_value = WebsiteConfiguration { - xmlns: (), - error_document: Some(Key { - key: Value("my-error-doc".to_owned()), - }), - index_document: Some(Suffix { - suffix: Value("my-index".to_owned()), - }), - redirect_all_requests_to: Some(Target { - hostname: Value("garage.tld".to_owned()), - protocol: Some(Value("https".to_owned())), - }), - routing_rules: Some(vec![RoutingRule { - inner: RoutingRuleInner { - condition: Some(Condition { - http_error_code: Some(IntValue(404)), - prefix: Some(Value("prefix1".to_owned())), - }), - redirect: Redirect { - hostname: Some(Value("gara.ge".to_owned())), - protocol: Some(Value("http".to_owned())), - http_redirect_code: Some(IntValue(303)), - replace_prefix: Some(Value("prefix2".to_owned())), - replace_full: Some(Value("fullkey".to_owned())), - }, - }, - }]), - }; - assert_eq! 
{ - ref_value, - conf - } - - let message2 = to_xml_with_header(&ref_value)?; - - let cleanup = |c: &str| c.replace(char::is_whitespace, ""); - assert_eq!(cleanup(message), cleanup(&message2)); - - Ok(()) - } -} diff --git a/src/api/s3_xml.rs b/src/api/s3_xml.rs deleted file mode 100644 index 75ec4559..00000000 --- a/src/api/s3_xml.rs +++ /dev/null @@ -1,844 +0,0 @@ -use quick_xml::se::to_string; -use serde::{Deserialize, Serialize, Serializer}; - -use crate::Error as ApiError; - -pub fn to_xml_with_header(x: &T) -> Result { - let mut xml = r#""#.to_string(); - xml.push_str(&to_string(x)?); - Ok(xml) -} - -pub fn xmlns_tag(_v: &(), s: S) -> Result { - s.serialize_str("http://s3.amazonaws.com/doc/2006-03-01/") -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct Value(#[serde(rename = "$value")] pub String); - -impl From<&str> for Value { - fn from(s: &str) -> Value { - Value(s.to_string()) - } -} - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct IntValue(#[serde(rename = "$value")] pub i64); - -#[derive(Debug, Serialize, PartialEq)] -pub struct Bucket { - #[serde(rename = "CreationDate")] - pub creation_date: Value, - #[serde(rename = "Name")] - pub name: Value, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct Owner { - #[serde(rename = "DisplayName")] - pub display_name: Value, - #[serde(rename = "ID")] - pub id: Value, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct BucketList { - #[serde(rename = "Bucket")] - pub entries: Vec, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct ListAllMyBucketsResult { - #[serde(rename = "Buckets")] - pub buckets: BucketList, - #[serde(rename = "Owner")] - pub owner: Owner, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct LocationConstraint { - #[serde(serialize_with = "xmlns_tag")] - pub xmlns: (), - #[serde(rename = "$value")] - pub region: String, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct Deleted { - #[serde(rename = "Key")] - pub key: Value, - #[serde(rename = "VersionId")] - pub version_id: Value, - #[serde(rename = "DeleteMarkerVersionId")] - pub delete_marker_version_id: Value, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct Error { - #[serde(rename = "Code")] - pub code: Value, - #[serde(rename = "Message")] - pub message: Value, - #[serde(rename = "Resource")] - pub resource: Option, - #[serde(rename = "Region")] - pub region: Option, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct DeleteError { - #[serde(rename = "Code")] - pub code: Value, - #[serde(rename = "Key")] - pub key: Option, - #[serde(rename = "Message")] - pub message: Value, - #[serde(rename = "VersionId")] - pub version_id: Option, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct DeleteResult { - #[serde(serialize_with = "xmlns_tag")] - pub xmlns: (), - #[serde(rename = "Deleted")] - pub deleted: Vec, - #[serde(rename = "Error")] - pub errors: Vec, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct InitiateMultipartUploadResult { - #[serde(serialize_with = "xmlns_tag")] - pub xmlns: (), - #[serde(rename = "Bucket")] - pub bucket: Value, - #[serde(rename = "Key")] - pub key: Value, - #[serde(rename = "UploadId")] - pub upload_id: Value, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct CompleteMultipartUploadResult { - #[serde(serialize_with = "xmlns_tag")] - pub xmlns: (), - #[serde(rename = "Location")] - pub location: Option, - #[serde(rename = "Bucket")] - pub bucket: Value, - #[serde(rename = "Key")] - pub key: Value, 
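(To show how the s3_xml helpers above fit together, a small self-contained sketch of serializing one of these structures with quick-xml's serde support, following the same pattern as to_xml_with_header and xmlns_tag; it assumes the "serialize" feature and a quick-xml version contemporary with this code.)

use quick_xml::se::to_string;
use serde::{Serialize, Serializer};

// Same trick as xmlns_tag above: a unit field serialized as the S3 namespace string,
// which ends up as the xmlns attribute of the root element.
fn xmlns_tag<S: Serializer>(_v: &(), s: S) -> Result<S::Ok, S::Error> {
    s.serialize_str("http://s3.amazonaws.com/doc/2006-03-01/")
}

#[derive(Serialize)]
struct LocationConstraint {
    #[serde(serialize_with = "xmlns_tag")]
    xmlns: (),
    #[serde(rename = "$value")]
    region: String,
}

fn main() {
    let lc = LocationConstraint { xmlns: (), region: "garage".to_string() };
    // Expected output (as in the get_bucket_location_result test further down):
    // <LocationConstraint xmlns="http://s3.amazonaws.com/doc/2006-03-01/">garage</LocationConstraint>
    println!("{}", to_string(&lc).unwrap());
}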
- #[serde(rename = "ETag")] - pub etag: Value, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct Initiator { - #[serde(rename = "DisplayName")] - pub display_name: Value, - #[serde(rename = "ID")] - pub id: Value, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct ListMultipartItem { - #[serde(rename = "Initiated")] - pub initiated: Value, - #[serde(rename = "Initiator")] - pub initiator: Initiator, - #[serde(rename = "Key")] - pub key: Value, - #[serde(rename = "UploadId")] - pub upload_id: Value, - #[serde(rename = "Owner")] - pub owner: Owner, - #[serde(rename = "StorageClass")] - pub storage_class: Value, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct ListMultipartUploadsResult { - #[serde(serialize_with = "xmlns_tag")] - pub xmlns: (), - #[serde(rename = "Bucket")] - pub bucket: Value, - #[serde(rename = "KeyMarker")] - pub key_marker: Option, - #[serde(rename = "UploadIdMarker")] - pub upload_id_marker: Option, - #[serde(rename = "NextKeyMarker")] - pub next_key_marker: Option, - #[serde(rename = "NextUploadIdMarker")] - pub next_upload_id_marker: Option, - #[serde(rename = "Prefix")] - pub prefix: Value, - #[serde(rename = "Delimiter")] - pub delimiter: Option, - #[serde(rename = "MaxUploads")] - pub max_uploads: IntValue, - #[serde(rename = "IsTruncated")] - pub is_truncated: Value, - #[serde(rename = "Upload")] - pub upload: Vec, - #[serde(rename = "CommonPrefixes")] - pub common_prefixes: Vec, - #[serde(rename = "EncodingType")] - pub encoding_type: Option, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct PartItem { - #[serde(rename = "ETag")] - pub etag: Value, - #[serde(rename = "LastModified")] - pub last_modified: Value, - #[serde(rename = "PartNumber")] - pub part_number: IntValue, - #[serde(rename = "Size")] - pub size: IntValue, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct ListPartsResult { - #[serde(serialize_with = "xmlns_tag")] - pub xmlns: (), - #[serde(rename = "Bucket")] - pub bucket: Value, - #[serde(rename = "Key")] - pub key: Value, - #[serde(rename = "UploadId")] - pub upload_id: Value, - #[serde(rename = "PartNumberMarker")] - pub part_number_marker: Option, - #[serde(rename = "NextPartNumberMarker")] - pub next_part_number_marker: Option, - #[serde(rename = "MaxParts")] - pub max_parts: IntValue, - #[serde(rename = "IsTruncated")] - pub is_truncated: Value, - #[serde(rename = "Part", default)] - pub parts: Vec, - #[serde(rename = "Initiator")] - pub initiator: Initiator, - #[serde(rename = "Owner")] - pub owner: Owner, - #[serde(rename = "StorageClass")] - pub storage_class: Value, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct ListBucketItem { - #[serde(rename = "Key")] - pub key: Value, - #[serde(rename = "LastModified")] - pub last_modified: Value, - #[serde(rename = "ETag")] - pub etag: Value, - #[serde(rename = "Size")] - pub size: IntValue, - #[serde(rename = "StorageClass")] - pub storage_class: Value, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct CommonPrefix { - #[serde(rename = "Prefix")] - pub prefix: Value, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct ListBucketResult { - #[serde(serialize_with = "xmlns_tag")] - pub xmlns: (), - #[serde(rename = "Name")] - pub name: Value, - #[serde(rename = "Prefix")] - pub prefix: Value, - #[serde(rename = "Marker")] - pub marker: Option, - #[serde(rename = "NextMarker")] - pub next_marker: Option, - #[serde(rename = "StartAfter")] - pub start_after: Option, - #[serde(rename = "ContinuationToken")] - pub continuation_token: Option, - 
#[serde(rename = "NextContinuationToken")] - pub next_continuation_token: Option, - #[serde(rename = "KeyCount")] - pub key_count: Option, - #[serde(rename = "MaxKeys")] - pub max_keys: IntValue, - #[serde(rename = "Delimiter")] - pub delimiter: Option, - #[serde(rename = "EncodingType")] - pub encoding_type: Option, - #[serde(rename = "IsTruncated")] - pub is_truncated: Value, - #[serde(rename = "Contents")] - pub contents: Vec, - #[serde(rename = "CommonPrefixes")] - pub common_prefixes: Vec, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct VersioningConfiguration { - #[serde(serialize_with = "xmlns_tag")] - pub xmlns: (), - #[serde(rename = "Status")] - pub status: Option, -} - -#[derive(Debug, Serialize, PartialEq)] -pub struct PostObject { - #[serde(serialize_with = "xmlns_tag")] - pub xmlns: (), - #[serde(rename = "Location")] - pub location: Value, - #[serde(rename = "Bucket")] - pub bucket: Value, - #[serde(rename = "Key")] - pub key: Value, - #[serde(rename = "ETag")] - pub etag: Value, -} - -#[cfg(test)] -mod tests { - use super::*; - - use garage_util::time::*; - - #[test] - fn error_message() -> Result<(), ApiError> { - let error = Error { - code: Value("TestError".to_string()), - message: Value("A dummy error message".to_string()), - resource: Some(Value("/bucket/a/plop".to_string())), - region: Some(Value("garage".to_string())), - }; - assert_eq!( - to_xml_with_header(&error)?, - "\ -\ - TestError\ - A dummy error message\ - /bucket/a/plop\ - garage\ -" - ); - Ok(()) - } - - #[test] - fn list_all_my_buckets_result() -> Result<(), ApiError> { - let list_buckets = ListAllMyBucketsResult { - owner: Owner { - display_name: Value("owner_name".to_string()), - id: Value("qsdfjklm".to_string()), - }, - buckets: BucketList { - entries: vec![ - Bucket { - creation_date: Value(msec_to_rfc3339(0)), - name: Value("bucket_A".to_string()), - }, - Bucket { - creation_date: Value(msec_to_rfc3339(3600 * 24 * 1000)), - name: Value("bucket_B".to_string()), - }, - ], - }, - }; - assert_eq!( - to_xml_with_header(&list_buckets)?, - "\ -\ - \ - \ - 1970-01-01T00:00:00.000Z\ - bucket_A\ - \ - \ - 1970-01-02T00:00:00.000Z\ - bucket_B\ - \ - \ - \ - owner_name\ - qsdfjklm\ - \ -" - ); - Ok(()) - } - - #[test] - fn get_bucket_location_result() -> Result<(), ApiError> { - let get_bucket_location = LocationConstraint { - xmlns: (), - region: "garage".to_string(), - }; - assert_eq!( - to_xml_with_header(&get_bucket_location)?, - "\ -garage" - ); - Ok(()) - } - - #[test] - fn get_bucket_versioning_result() -> Result<(), ApiError> { - let get_bucket_versioning = VersioningConfiguration { - xmlns: (), - status: None, - }; - assert_eq!( - to_xml_with_header(&get_bucket_versioning)?, - "\ -" - ); - let get_bucket_versioning2 = VersioningConfiguration { - xmlns: (), - status: Some(Value("Suspended".to_string())), - }; - assert_eq!( - to_xml_with_header(&get_bucket_versioning2)?, - "\ -Suspended" - ); - - Ok(()) - } - - #[test] - fn delete_result() -> Result<(), ApiError> { - let delete_result = DeleteResult { - xmlns: (), - deleted: vec![ - Deleted { - key: Value("a/plop".to_string()), - version_id: Value("qsdfjklm".to_string()), - delete_marker_version_id: Value("wxcvbn".to_string()), - }, - Deleted { - key: Value("b/plip".to_string()), - version_id: Value("1234".to_string()), - delete_marker_version_id: Value("4321".to_string()), - }, - ], - errors: vec![ - DeleteError { - code: Value("NotFound".to_string()), - key: Some(Value("c/plap".to_string())), - message: Value("Object c/plap not 
found".to_string()), - version_id: None, - }, - DeleteError { - code: Value("Forbidden".to_string()), - key: Some(Value("d/plep".to_string())), - message: Value("Not authorized".to_string()), - version_id: Some(Value("789".to_string())), - }, - ], - }; - assert_eq!( - to_xml_with_header(&delete_result)?, - "\ -\ - \ - a/plop\ - qsdfjklm\ - wxcvbn\ - \ - \ - b/plip\ - 1234\ - 4321\ - \ - \ - NotFound\ - c/plap\ - Object c/plap not found\ - \ - \ - Forbidden\ - d/plep\ - Not authorized\ - 789\ - \ -" - ); - Ok(()) - } - - #[test] - fn initiate_multipart_upload_result() -> Result<(), ApiError> { - let result = InitiateMultipartUploadResult { - xmlns: (), - bucket: Value("mybucket".to_string()), - key: Value("a/plop".to_string()), - upload_id: Value("azerty".to_string()), - }; - assert_eq!( - to_xml_with_header(&result)?, - "\ -\ - mybucket\ - a/plop\ - azerty\ -" - ); - Ok(()) - } - - #[test] - fn complete_multipart_upload_result() -> Result<(), ApiError> { - let result = CompleteMultipartUploadResult { - xmlns: (), - location: Some(Value("https://garage.tld/mybucket/a/plop".to_string())), - bucket: Value("mybucket".to_string()), - key: Value("a/plop".to_string()), - etag: Value("\"3858f62230ac3c915f300c664312c11f-9\"".to_string()), - }; - assert_eq!( - to_xml_with_header(&result)?, - "\ -\ - https://garage.tld/mybucket/a/plop\ - mybucket\ - a/plop\ - "3858f62230ac3c915f300c664312c11f-9"\ -" - ); - Ok(()) - } - - #[test] - fn list_multipart_uploads_result() -> Result<(), ApiError> { - let result = ListMultipartUploadsResult { - xmlns: (), - bucket: Value("example-bucket".to_string()), - key_marker: None, - next_key_marker: None, - upload_id_marker: None, - encoding_type: None, - next_upload_id_marker: None, - upload: vec![], - delimiter: Some(Value("/".to_string())), - prefix: Value("photos/2006/".to_string()), - max_uploads: IntValue(1000), - is_truncated: Value("false".to_string()), - common_prefixes: vec![ - CommonPrefix { - prefix: Value("photos/2006/February/".to_string()), - }, - CommonPrefix { - prefix: Value("photos/2006/January/".to_string()), - }, - CommonPrefix { - prefix: Value("photos/2006/March/".to_string()), - }, - ], - }; - - assert_eq!( - to_xml_with_header(&result)?, - "\ -\ - example-bucket\ - photos/2006/\ - /\ - 1000\ - false\ - \ - photos/2006/February/\ - \ - \ - photos/2006/January/\ - \ - \ - photos/2006/March/\ - \ -" - ); - - Ok(()) - } - - #[test] - fn list_objects_v1_1() -> Result<(), ApiError> { - let result = ListBucketResult { - xmlns: (), - name: Value("example-bucket".to_string()), - prefix: Value("".to_string()), - marker: Some(Value("".to_string())), - next_marker: None, - start_after: None, - continuation_token: None, - next_continuation_token: None, - key_count: None, - max_keys: IntValue(1000), - encoding_type: None, - delimiter: Some(Value("/".to_string())), - is_truncated: Value("false".to_string()), - contents: vec![ListBucketItem { - key: Value("sample.jpg".to_string()), - last_modified: Value(msec_to_rfc3339(0)), - etag: Value("\"bf1d737a4d46a19f3bced6905cc8b902\"".to_string()), - size: IntValue(142863), - storage_class: Value("STANDARD".to_string()), - }], - common_prefixes: vec![CommonPrefix { - prefix: Value("photos/".to_string()), - }], - }; - assert_eq!( - to_xml_with_header(&result)?, - "\ -\ - example-bucket\ - \ - \ - 1000\ - /\ - false\ - \ - sample.jpg\ - 1970-01-01T00:00:00.000Z\ - "bf1d737a4d46a19f3bced6905cc8b902"\ - 142863\ - STANDARD\ - \ - \ - photos/\ - \ -" - ); - Ok(()) - } - - #[test] - fn list_objects_v1_2() -> Result<(), 
ApiError> { - let result = ListBucketResult { - xmlns: (), - name: Value("example-bucket".to_string()), - prefix: Value("photos/2006/".to_string()), - marker: Some(Value("".to_string())), - next_marker: None, - start_after: None, - continuation_token: None, - next_continuation_token: None, - key_count: None, - max_keys: IntValue(1000), - delimiter: Some(Value("/".to_string())), - encoding_type: None, - is_truncated: Value("false".to_string()), - contents: vec![], - common_prefixes: vec![ - CommonPrefix { - prefix: Value("photos/2006/February/".to_string()), - }, - CommonPrefix { - prefix: Value("photos/2006/January/".to_string()), - }, - ], - }; - assert_eq!( - to_xml_with_header(&result)?, - "\ -\ - example-bucket\ - photos/2006/\ - \ - 1000\ - /\ - false\ - \ - photos/2006/February/\ - \ - \ - photos/2006/January/\ - \ -" - ); - Ok(()) - } - - #[test] - fn list_objects_v2_1() -> Result<(), ApiError> { - let result = ListBucketResult { - xmlns: (), - name: Value("quotes".to_string()), - prefix: Value("E".to_string()), - marker: None, - next_marker: None, - start_after: Some(Value("ExampleGuide.pdf".to_string())), - continuation_token: None, - next_continuation_token: None, - key_count: None, - max_keys: IntValue(3), - delimiter: None, - encoding_type: None, - is_truncated: Value("false".to_string()), - contents: vec![ListBucketItem { - key: Value("ExampleObject.txt".to_string()), - last_modified: Value(msec_to_rfc3339(0)), - etag: Value("\"599bab3ed2c697f1d26842727561fd94\"".to_string()), - size: IntValue(857), - storage_class: Value("REDUCED_REDUNDANCY".to_string()), - }], - common_prefixes: vec![], - }; - assert_eq!( - to_xml_with_header(&result)?, - "\ -\ - quotes\ - E\ - ExampleGuide.pdf\ - 3\ - false\ - \ - ExampleObject.txt\ - 1970-01-01T00:00:00.000Z\ - "599bab3ed2c697f1d26842727561fd94"\ - 857\ - REDUCED_REDUNDANCY\ - \ -" - ); - Ok(()) - } - - #[test] - fn list_objects_v2_2() -> Result<(), ApiError> { - let result = ListBucketResult { - xmlns: (), - name: Value("bucket".to_string()), - prefix: Value("".to_string()), - marker: None, - next_marker: None, - start_after: None, - continuation_token: Some(Value( - "1ueGcxLPRx1Tr/XYExHnhbYLgveDs2J/wm36Hy4vbOwM=".to_string(), - )), - next_continuation_token: Some(Value("qsdfjklm".to_string())), - key_count: Some(IntValue(112)), - max_keys: IntValue(1000), - delimiter: None, - encoding_type: None, - is_truncated: Value("false".to_string()), - contents: vec![ListBucketItem { - key: Value("happyfacex.jpg".to_string()), - last_modified: Value(msec_to_rfc3339(0)), - etag: Value("\"70ee1738b6b21e2c8a43f3a5ab0eee71\"".to_string()), - size: IntValue(1111), - storage_class: Value("STANDARD".to_string()), - }], - common_prefixes: vec![], - }; - assert_eq!( - to_xml_with_header(&result)?, - "\ -\ - bucket\ - \ - 1ueGcxLPRx1Tr/XYExHnhbYLgveDs2J/wm36Hy4vbOwM=\ - qsdfjklm\ - 112\ - 1000\ - false\ - \ - happyfacex.jpg\ - 1970-01-01T00:00:00.000Z\ - "70ee1738b6b21e2c8a43f3a5ab0eee71"\ - 1111\ - STANDARD\ - \ -" - ); - Ok(()) - } - - #[test] - fn list_parts() -> Result<(), ApiError> { - let result = ListPartsResult { - xmlns: (), - bucket: Value("example-bucket".to_string()), - key: Value("example-object".to_string()), - upload_id: Value( - "XXBsb2FkIElEIGZvciBlbHZpbmcncyVcdS1tb3ZpZS5tMnRzEEEwbG9hZA".to_string(), - ), - part_number_marker: Some(IntValue(1)), - next_part_number_marker: Some(IntValue(3)), - max_parts: IntValue(2), - is_truncated: Value("true".to_string()), - parts: vec![ - PartItem { - etag: 
Value("\"7778aef83f66abc1fa1e8477f296d394\"".to_string()), - last_modified: Value("2010-11-10T20:48:34.000Z".to_string()), - part_number: IntValue(2), - size: IntValue(10485760), - }, - PartItem { - etag: Value("\"aaaa18db4cc2f85cedef654fccc4a4x8\"".to_string()), - last_modified: Value("2010-11-10T20:48:33.000Z".to_string()), - part_number: IntValue(3), - size: IntValue(10485760), - }, - ], - initiator: Initiator { - display_name: Value("umat-user-11116a31-17b5-4fb7-9df5-b288870f11xx".to_string()), - id: Value( - "arn:aws:iam::111122223333:user/some-user-11116a31-17b5-4fb7-9df5-b288870f11xx" - .to_string(), - ), - }, - owner: Owner { - display_name: Value("someName".to_string()), - id: Value( - "75aa57f09aa0c8caeab4f8c24e99d10f8e7faeebf76c078efc7c6caea54ba06a".to_string(), - ), - }, - storage_class: Value("STANDARD".to_string()), - }; - - assert_eq!( - to_xml_with_header(&result)?, - "\ -\ - example-bucket\ - example-object\ - XXBsb2FkIElEIGZvciBlbHZpbmcncyVcdS1tb3ZpZS5tMnRzEEEwbG9hZA\ - 1\ - 3\ - 2\ - true\ - \ - "7778aef83f66abc1fa1e8477f296d394"\ - 2010-11-10T20:48:34.000Z\ - 2\ - 10485760\ - \ - \ - "aaaa18db4cc2f85cedef654fccc4a4x8"\ - 2010-11-10T20:48:33.000Z\ - 3\ - 10485760\ - \ - \ - umat-user-11116a31-17b5-4fb7-9df5-b288870f11xx\ - arn:aws:iam::111122223333:user/some-user-11116a31-17b5-4fb7-9df5-b288870f11xx\ - \ - \ - someName\ - 75aa57f09aa0c8caeab4f8c24e99d10f8e7faeebf76c078efc7c6caea54ba06a\ - \ - STANDARD\ -" - ); - - Ok(()) - } -} diff --git a/src/api/signature/mod.rs b/src/api/signature/mod.rs index ebdee6da..5646f4fa 100644 --- a/src/api/signature/mod.rs +++ b/src/api/signature/mod.rs @@ -42,6 +42,11 @@ pub fn signing_hmac( Ok(hmac) } -pub fn compute_scope(datetime: &DateTime, region: &str) -> String { - format!("{}/{}/s3/aws4_request", datetime.format(SHORT_DATE), region,) +pub fn compute_scope(datetime: &DateTime, region: &str, service: &str) -> String { + format!( + "{}/{}/{}/aws4_request", + datetime.format(SHORT_DATE), + region, + service + ) } diff --git a/src/api/signature/payload.rs b/src/api/signature/payload.rs index 2a41b307..9137dd2d 100644 --- a/src/api/signature/payload.rs +++ b/src/api/signature/payload.rs @@ -11,14 +11,15 @@ use garage_util::data::Hash; use garage_model::garage::Garage; use garage_model::key_table::*; -use super::signing_hmac; -use super::{LONG_DATETIME, SHORT_DATE}; +use super::LONG_DATETIME; +use super::{compute_scope, signing_hmac}; use crate::encoding::uri_encode; use crate::error::*; pub async fn check_payload_signature( garage: &Garage, + service: &str, request: &Request, ) -> Result<(Option, Option), Error> { let mut headers = HashMap::new(); @@ -64,6 +65,7 @@ pub async fn check_payload_signature( let key = verify_v4( garage, + service, &authorization.credential, &authorization.date, &authorization.signature, @@ -281,6 +283,7 @@ pub fn parse_date(date: &str) -> Result, Error> { pub async fn verify_v4( garage: &Garage, + service: &str, credential: &str, date: &DateTime, signature: &str, @@ -288,11 +291,7 @@ pub async fn verify_v4( ) -> Result { let (key_id, scope) = parse_credential(credential)?; - let scope_expected = format!( - "{}/{}/s3/aws4_request", - date.format(SHORT_DATE), - garage.config.s3_api.s3_region - ); + let scope_expected = compute_scope(date, &garage.config.s3_api.s3_region, service); if scope != scope_expected { return Err(Error::AuthorizationHeaderMalformed(scope.to_string())); } @@ -309,7 +308,7 @@ pub async fn verify_v4( date, &key_p.secret_key, &garage.config.s3_api.s3_region, - "s3", + service, ) 
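(The compute_scope change above makes the SigV4 credential scope service-dependent, so the same key can sign both S3 and K2V requests without ambiguity. A small worked example follows; SHORT_DATE is assumed to be "%Y%m%d" as in the signature module, and the chrono API is used as in the patch.)

use chrono::{DateTime, TimeZone, Utc};

const SHORT_DATE: &str = "%Y%m%d"; // assumed value of signature::SHORT_DATE

fn compute_scope(datetime: &DateTime<Utc>, region: &str, service: &str) -> String {
    format!(
        "{}/{}/{}/aws4_request",
        datetime.format(SHORT_DATE),
        region,
        service
    )
}

fn main() {
    let date = Utc.ymd(2022, 5, 10).and_hms(13, 16, 57);
    // The same date and region now yield different scopes per API:
    assert_eq!(compute_scope(&date, "garage", "s3"), "20220510/garage/s3/aws4_request");
    assert_eq!(compute_scope(&date, "garage", "k2v"), "20220510/garage/k2v/aws4_request");
}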
.ok_or_internal_error("Unable to build signing HMAC")?; hmac.update(payload); diff --git a/src/api/signature/streaming.rs b/src/api/signature/streaming.rs index 969a45d6..ded9d993 100644 --- a/src/api/signature/streaming.rs +++ b/src/api/signature/streaming.rs @@ -1,19 +1,68 @@ use std::pin::Pin; -use chrono::{DateTime, Utc}; +use chrono::{DateTime, NaiveDateTime, Utc}; use futures::prelude::*; use futures::task; +use garage_model::key_table::Key; +use hmac::Mac; use hyper::body::Bytes; +use hyper::{Body, Request}; use garage_util::data::Hash; -use hmac::Mac; -use super::sha256sum; -use super::HmacSha256; -use super::LONG_DATETIME; +use super::{compute_scope, sha256sum, HmacSha256, LONG_DATETIME}; use crate::error::*; +pub fn parse_streaming_body( + api_key: &Key, + req: Request, + content_sha256: &mut Option, + region: &str, + service: &str, +) -> Result, Error> { + match req.headers().get("x-amz-content-sha256") { + Some(header) if header == "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" => { + let signature = content_sha256 + .take() + .ok_or_bad_request("No signature provided")?; + + let secret_key = &api_key + .state + .as_option() + .ok_or_internal_error("Deleted key state")? + .secret_key; + + let date = req + .headers() + .get("x-amz-date") + .ok_or_bad_request("Missing X-Amz-Date field")? + .to_str()?; + let date: NaiveDateTime = NaiveDateTime::parse_from_str(date, LONG_DATETIME) + .ok_or_bad_request("Invalid date")?; + let date: DateTime = DateTime::from_utc(date, Utc); + + let scope = compute_scope(&date, region, service); + let signing_hmac = crate::signature::signing_hmac(&date, secret_key, region, service) + .ok_or_internal_error("Unable to build signing HMAC")?; + + Ok(req.map(move |body| { + Body::wrap_stream( + SignedPayloadStream::new( + body.map_err(Error::from), + signing_hmac, + date, + &scope, + signature, + ) + .map_err(Error::from), + ) + })) + } + _ => Ok(req), + } +} + /// Result of `sha256("")` const EMPTY_STRING_HEX_DIGEST: &str = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"; @@ -295,7 +344,7 @@ mod tests { .with_timezone(&Utc); let secret_key = "test"; let region = "test"; - let scope = crate::signature::compute_scope(&datetime, region); + let scope = crate::signature::compute_scope(&datetime, region, "s3"); let signing_hmac = crate::signature::signing_hmac(&datetime, secret_key, region, "s3").unwrap(); diff --git a/src/block/manager.rs b/src/block/manager.rs index 1c04a335..9b2d9cad 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -132,7 +132,7 @@ impl BlockManager { let endpoint = system .netapp - .endpoint("garage_model/block.rs/Rpc".to_string()); + .endpoint("garage_block/manager.rs/Rpc".to_string()); let manager_locked = BlockManagerLocked(); diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 59f402ff..3b69d7bc 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -63,3 +63,11 @@ hyper = { version = "0.14", features = ["client", "http1", "runtime"] } sha2 = "0.9" static_init = "1.0" +assert-json-diff = "2.0" +serde_json = "1.0" +base64 = "0.13" + + +[features] +kubernetes-discovery = [ "garage_rpc/kubernetes-discovery" ] +k2v = [ "garage_util/k2v", "garage_api/k2v" ] diff --git a/src/garage/admin.rs b/src/garage/admin.rs index 0b20bb20..af0c3f22 100644 --- a/src/garage/admin.rs +++ b/src/garage/admin.rs @@ -21,8 +21,8 @@ use garage_model::garage::Garage; use garage_model::helper::error::{Error, OkOrBadRequest}; use garage_model::key_table::*; use garage_model::migrate::Migrate; -use 
garage_model::object_table::ObjectFilter; use garage_model::permission::*; +use garage_model::s3::object_table::ObjectFilter; use crate::cli::*; use crate::repair::Repair; @@ -80,7 +80,13 @@ impl AdminRpcHandler { let buckets = self .garage .bucket_table - .get_range(&EmptyKey, None, Some(DeletedFilter::NotDeleted), 10000) + .get_range( + &EmptyKey, + None, + Some(DeletedFilter::NotDeleted), + 10000, + EnumerationOrder::Forward, + ) .await?; Ok(AdminRpc::BucketList(buckets)) } @@ -210,7 +216,13 @@ impl AdminRpcHandler { let objects = self .garage .object_table - .get_range(&bucket_id, None, Some(ObjectFilter::IsData), 10) + .get_range( + &bucket_id, + None, + Some(ObjectFilter::IsData), + 10, + EnumerationOrder::Forward, + ) .await?; if !objects.is_empty() { return Err(Error::BadRequest(format!( @@ -445,6 +457,7 @@ impl AdminRpcHandler { None, Some(KeyFilter::Deleted(DeletedFilter::NotDeleted)), 10000, + EnumerationOrder::Forward, ) .await? .iter() diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index a90277a0..2a799868 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -85,13 +85,14 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> format_table(healthy_nodes); let status_keys = status.iter().map(|adv| adv.id).collect::>(); - let failure_case_1 = status.iter().any(|adv| !adv.is_up); + let failure_case_1 = status + .iter() + .any(|adv| !adv.is_up && matches!(layout.roles.get(&adv.id), Some(NodeRoleV(Some(_))))); let failure_case_2 = layout .roles .items() .iter() - .filter(|(_, _, v)| v.0.is_some()) - .any(|(id, _, _)| !status_keys.contains(id)); + .any(|(id, _, v)| !status_keys.contains(id) && v.0.is_some()); if failure_case_1 || failure_case_2 { println!("\n==== FAILED NODES ===="); let mut failed_nodes = diff --git a/src/garage/repair.rs b/src/garage/repair.rs index 3666ca8f..830eac71 100644 --- a/src/garage/repair.rs +++ b/src/garage/repair.rs @@ -2,10 +2,10 @@ use std::sync::Arc; use tokio::sync::watch; -use garage_model::block_ref_table::*; use garage_model::garage::Garage; -use garage_model::object_table::*; -use garage_model::version_table::*; +use garage_model::s3::block_ref_table::*; +use garage_model::s3::object_table::*; +use garage_model::s3::version_table::*; use garage_table::*; use garage_util::error::Error; diff --git a/src/garage/server.rs b/src/garage/server.rs index 58c9e782..24bb25b3 100644 --- a/src/garage/server.rs +++ b/src/garage/server.rs @@ -8,10 +8,13 @@ use garage_util::error::Error; use garage_admin::metrics::*; use garage_admin::tracing_setup::*; -use garage_api::run_api_server; +use garage_api::s3::api_server::S3ApiServer; use garage_model::garage::Garage; use garage_web::run_web_server; +#[cfg(feature = "k2v")] +use garage_api::k2v::api_server::K2VApiServer; + use crate::admin::*; async fn wait_from(mut chan: watch::Receiver) { @@ -56,12 +59,21 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { info!("Create admin RPC handler..."); AdminRpcHandler::new(garage.clone()); - info!("Initializing API server..."); - let api_server = tokio::spawn(run_api_server( + info!("Initializing S3 API server..."); + let s3_api_server = tokio::spawn(S3ApiServer::run( garage.clone(), wait_from(watch_cancel.clone()), )); + #[cfg(feature = "k2v")] + let k2v_api_server = { + info!("Initializing K2V API server..."); + tokio::spawn(K2VApiServer::run( + garage.clone(), + wait_from(watch_cancel.clone()), + )) + }; + info!("Initializing web server..."); let web_server = tokio::spawn(run_web_server( garage.clone(), @@ -80,8 
+92,12 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { // Stuff runs // When a cancel signal is sent, stuff stops - if let Err(e) = api_server.await? { - warn!("API server exited with error: {}", e); + if let Err(e) = s3_api_server.await? { + warn!("S3 API server exited with error: {}", e); + } + #[cfg(feature = "k2v")] + if let Err(e) = k2v_api_server.await? { + warn!("K2V API server exited with error: {}", e); } if let Err(e) = web_server.await? { warn!("Web server exited with error: {}", e); diff --git a/src/garage/tests/common/client.rs b/src/garage/tests/common/client.rs index c5ddc6e5..212588b5 100644 --- a/src/garage/tests/common/client.rs +++ b/src/garage/tests/common/client.rs @@ -10,7 +10,7 @@ pub fn build_client(instance: &Instance) -> Client { None, "garage-integ-test", ); - let endpoint = Endpoint::immutable(instance.uri()); + let endpoint = Endpoint::immutable(instance.s3_uri()); let config = Config::builder() .region(super::REGION) diff --git a/src/garage/tests/common/custom_requester.rs b/src/garage/tests/common/custom_requester.rs index 580691a1..1700cc90 100644 --- a/src/garage/tests/common/custom_requester.rs +++ b/src/garage/tests/common/custom_requester.rs @@ -17,14 +17,25 @@ use garage_api::signature; pub struct CustomRequester { key: Key, uri: Uri, + service: &'static str, client: Client, } impl CustomRequester { - pub fn new(instance: &Instance) -> Self { + pub fn new_s3(instance: &Instance) -> Self { CustomRequester { key: instance.key.clone(), - uri: instance.uri(), + uri: instance.s3_uri(), + service: "s3", + client: Client::new(), + } + } + + pub fn new_k2v(instance: &Instance) -> Self { + CustomRequester { + key: instance.key.clone(), + uri: instance.k2v_uri(), + service: "k2v", client: Client::new(), } } @@ -32,6 +43,7 @@ impl CustomRequester { pub fn builder(&self, bucket: String) -> RequestBuilder<'_> { RequestBuilder { requester: self, + service: self.service, bucket, method: Method::GET, path: String::new(), @@ -47,6 +59,7 @@ impl CustomRequester { pub struct RequestBuilder<'a> { requester: &'a CustomRequester, + service: &'static str, bucket: String, method: Method, path: String, @@ -59,13 +72,17 @@ pub struct RequestBuilder<'a> { } impl<'a> RequestBuilder<'a> { + pub fn service(&mut self, service: &'static str) -> &mut Self { + self.service = service; + self + } pub fn method(&mut self, method: Method) -> &mut Self { self.method = method; self } - pub fn path(&mut self, path: String) -> &mut Self { - self.path = path; + pub fn path(&mut self, path: impl ToString) -> &mut Self { + self.path = path.to_string(); self } @@ -74,16 +91,38 @@ impl<'a> RequestBuilder<'a> { self } + pub fn query_param(&mut self, param: T, value: Option) -> &mut Self + where + T: ToString, + U: ToString, + { + self.query_params + .insert(param.to_string(), value.as_ref().map(ToString::to_string)); + self + } + pub fn signed_headers(&mut self, signed_headers: HashMap) -> &mut Self { self.signed_headers = signed_headers; self } + pub fn signed_header(&mut self, name: impl ToString, value: impl ToString) -> &mut Self { + self.signed_headers + .insert(name.to_string(), value.to_string()); + self + } + pub fn unsigned_headers(&mut self, unsigned_headers: HashMap) -> &mut Self { self.unsigned_headers = unsigned_headers; self } + pub fn unsigned_header(&mut self, name: impl ToString, value: impl ToString) -> &mut Self { + self.unsigned_headers + .insert(name.to_string(), value.to_string()); + self + } + pub fn body(&mut self, body: Vec) -> &mut Self { self.body = 
body; self @@ -106,24 +145,24 @@ impl<'a> RequestBuilder<'a> { let query = query_param_to_string(&self.query_params); let (host, path) = if self.vhost_style { ( - format!("{}.s3.garage", self.bucket), + format!("{}.{}.garage", self.bucket, self.service), format!("{}{}", self.path, query), ) } else { ( - "s3.garage".to_owned(), + format!("{}.garage", self.service), format!("{}/{}{}", self.bucket, self.path, query), ) }; let uri = format!("{}{}", self.requester.uri, path); let now = Utc::now(); - let scope = signature::compute_scope(&now, super::REGION.as_ref()); + let scope = signature::compute_scope(&now, super::REGION.as_ref(), self.service); let mut signer = signature::signing_hmac( &now, &self.requester.key.secret, super::REGION.as_ref(), - "s3", + self.service, ) .unwrap(); let streaming_signer = signer.clone(); diff --git a/src/garage/tests/common/garage.rs b/src/garage/tests/common/garage.rs index 88c51501..44d727f9 100644 --- a/src/garage/tests/common/garage.rs +++ b/src/garage/tests/common/garage.rs @@ -22,7 +22,9 @@ pub struct Instance { process: process::Child, pub path: PathBuf, pub key: Key, - pub api_port: u16, + pub s3_port: u16, + pub k2v_port: u16, + pub web_port: u16, } impl Instance { @@ -58,9 +60,12 @@ rpc_secret = "{secret}" [s3_api] s3_region = "{region}" -api_bind_addr = "127.0.0.1:{api_port}" +api_bind_addr = "127.0.0.1:{s3_port}" root_domain = ".s3.garage" +[k2v_api] +api_bind_addr = "127.0.0.1:{k2v_port}" + [s3_web] bind_addr = "127.0.0.1:{web_port}" root_domain = ".web.garage" @@ -72,10 +77,11 @@ api_bind_addr = "127.0.0.1:{admin_port}" path = path.display(), secret = GARAGE_TEST_SECRET, region = super::REGION, - api_port = port, - rpc_port = port + 1, - web_port = port + 2, - admin_port = port + 3, + s3_port = port, + k2v_port = port + 1, + rpc_port = port + 2, + web_port = port + 3, + admin_port = port + 4, ); fs::write(path.join("config.toml"), config).expect("Could not write garage config file"); @@ -88,7 +94,7 @@ api_bind_addr = "127.0.0.1:{admin_port}" .arg("server") .stdout(stdout) .stderr(stderr) - .env("RUST_LOG", "garage=info,garage_api=debug") + .env("RUST_LOG", "garage=info,garage_api=trace") .spawn() .expect("Could not start garage"); @@ -96,7 +102,9 @@ api_bind_addr = "127.0.0.1:{admin_port}" process: child, path, key: Key::default(), - api_port: port, + s3_port: port, + k2v_port: port + 1, + web_port: port + 3, } } @@ -147,8 +155,14 @@ api_bind_addr = "127.0.0.1:{admin_port}" String::from_utf8(output.stdout).unwrap() } - pub fn uri(&self) -> http::Uri { - format!("http://127.0.0.1:{api_port}", api_port = self.api_port) + pub fn s3_uri(&self) -> http::Uri { + format!("http://127.0.0.1:{s3_port}", s3_port = self.s3_port) + .parse() + .expect("Could not build garage endpoint URI") + } + + pub fn k2v_uri(&self) -> http::Uri { + format!("http://127.0.0.1:{k2v_port}", k2v_port = self.k2v_port) .parse() .expect("Could not build garage endpoint URI") } diff --git a/src/garage/tests/common/mod.rs b/src/garage/tests/common/mod.rs index 8f88c731..28874b02 100644 --- a/src/garage/tests/common/mod.rs +++ b/src/garage/tests/common/mod.rs @@ -17,18 +17,27 @@ pub struct Context { pub garage: &'static garage::Instance, pub client: Client, pub custom_request: CustomRequester, + pub k2v: K2VContext, +} + +pub struct K2VContext { + pub request: CustomRequester, } impl Context { fn new() -> Self { let garage = garage::instance(); let client = client::build_client(garage); - let custom_request = CustomRequester::new(garage); + let custom_request = 
CustomRequester::new_s3(garage); + let k2v_request = CustomRequester::new_k2v(garage); Context { garage, client, custom_request, + k2v: K2VContext { + request: k2v_request, + }, } } diff --git a/src/garage/tests/k2v/batch.rs b/src/garage/tests/k2v/batch.rs new file mode 100644 index 00000000..1182a298 --- /dev/null +++ b/src/garage/tests/k2v/batch.rs @@ -0,0 +1,525 @@ +use std::collections::HashMap; + +use crate::common; + +use assert_json_diff::assert_json_eq; +use serde_json::json; + +use super::json_body; +use hyper::Method; + +#[tokio::test] +async fn test_batch() { + let ctx = common::context(); + let bucket = ctx.create_bucket("test-k2v-batch"); + + let mut values = HashMap::new(); + values.insert("a", "initial test 1"); + values.insert("b", "initial test 2"); + values.insert("c", "initial test 3"); + values.insert("d.1", "initial test 4"); + values.insert("d.2", "initial test 5"); + values.insert("e", "initial test 6"); + let mut ct = HashMap::new(); + + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .body( + format!( + r#"[ + {{"pk": "root", "sk": "a", "ct": null, "v": "{}"}}, + {{"pk": "root", "sk": "b", "ct": null, "v": "{}"}}, + {{"pk": "root", "sk": "c", "ct": null, "v": "{}"}}, + {{"pk": "root", "sk": "d.1", "ct": null, "v": "{}"}}, + {{"pk": "root", "sk": "d.2", "ct": null, "v": "{}"}}, + {{"pk": "root", "sk": "e", "ct": null, "v": "{}"}} + ]"#, + base64::encode(values.get(&"a").unwrap()), + base64::encode(values.get(&"b").unwrap()), + base64::encode(values.get(&"c").unwrap()), + base64::encode(values.get(&"d.1").unwrap()), + base64::encode(values.get(&"d.2").unwrap()), + base64::encode(values.get(&"e").unwrap()), + ) + .into_bytes(), + ) + .method(Method::POST) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + + for sk in ["a", "b", "c", "d.1", "d.2", "e"] { + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some(sk)) + .signed_header("accept", "*/*") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/octet-stream" + ); + ct.insert( + sk, + res.headers() + .get("x-garage-causality-token") + .unwrap() + .to_str() + .unwrap() + .to_string(), + ); + let res_body = hyper::body::to_bytes(res.into_body()) + .await + .unwrap() + .to_vec(); + assert_eq!(res_body, values.get(sk).unwrap().as_bytes()); + } + + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .query_param("search", Option::<&str>::None) + .body( + br#"[ + {"partitionKey": "root"}, + {"partitionKey": "root", "start": "c"}, + {"partitionKey": "root", "start": "c", "reverse": true, "end": "a"}, + {"partitionKey": "root", "limit": 1}, + {"partitionKey": "root", "prefix": "d"} + ]"# + .to_vec(), + ) + .method(Method::POST) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + let json_res = json_body(res).await; + assert_json_eq!( + json_res, + json!([ + { + "partitionKey": "root", + "prefix": null, + "start": null, + "end": null, + "limit": null, + "reverse": false, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "a", "ct": ct.get("a").unwrap(), "v": [base64::encode(values.get("a").unwrap())]}, + {"sk": "b", "ct": ct.get("b").unwrap(), "v": [base64::encode(values.get("b").unwrap())]}, + {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap())]}, + {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": 
[base64::encode(values.get("d.1").unwrap())]}, + {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap())]}, + {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]} + ], + "more": false, + "nextStart": null, + }, + { + "partitionKey": "root", + "prefix": null, + "start": "c", + "end": null, + "limit": null, + "reverse": false, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap())]}, + {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1").unwrap())]}, + {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap())]}, + {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]} + ], + "more": false, + "nextStart": null, + }, + { + "partitionKey": "root", + "prefix": null, + "start": "c", + "end": "a", + "limit": null, + "reverse": true, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap())]}, + {"sk": "b", "ct": ct.get("b").unwrap(), "v": [base64::encode(values.get("b").unwrap())]}, + ], + "more": false, + "nextStart": null, + }, + { + "partitionKey": "root", + "prefix": null, + "start": null, + "end": null, + "limit": 1, + "reverse": false, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "a", "ct": ct.get("a").unwrap(), "v": [base64::encode(values.get("a").unwrap())]} + ], + "more": true, + "nextStart": "b", + }, + { + "partitionKey": "root", + "prefix": "d", + "start": null, + "end": null, + "limit": null, + "reverse": false, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1").unwrap())]}, + {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap())]} + ], + "more": false, + "nextStart": null, + }, + ]) + ); + + // Insert some new values + values.insert("c'", "new test 3"); + values.insert("d.1'", "new test 4"); + values.insert("d.2'", "new test 5"); + + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .body( + format!( + r#"[ + {{"pk": "root", "sk": "b", "ct": "{}", "v": null}}, + {{"pk": "root", "sk": "c", "ct": null, "v": "{}"}}, + {{"pk": "root", "sk": "d.1", "ct": "{}", "v": "{}"}}, + {{"pk": "root", "sk": "d.2", "ct": null, "v": "{}"}} + ]"#, + ct.get(&"b").unwrap(), + base64::encode(values.get(&"c'").unwrap()), + ct.get(&"d.1").unwrap(), + base64::encode(values.get(&"d.1'").unwrap()), + base64::encode(values.get(&"d.2'").unwrap()), + ) + .into_bytes(), + ) + .method(Method::POST) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + + for sk in ["b", "c", "d.1", "d.2"] { + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some(sk)) + .signed_header("accept", "*/*") + .send() + .await + .unwrap(); + if sk == "b" { + assert_eq!(res.status(), 204); + } else { + assert_eq!(res.status(), 200); + } + ct.insert( + sk, + res.headers() + .get("x-garage-causality-token") + .unwrap() + .to_str() + .unwrap() + .to_string(), + ); + } + + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .query_param("search", Option::<&str>::None) + .body( + br#"[ + {"partitionKey": "root"}, + {"partitionKey": "root", "prefix": "d"}, + 
{"partitionKey": "root", "prefix": "d.", "end": "d.2"}, + {"partitionKey": "root", "prefix": "d.", "limit": 1}, + {"partitionKey": "root", "prefix": "d.", "start": "d.2", "limit": 1}, + {"partitionKey": "root", "prefix": "d.", "reverse": true}, + {"partitionKey": "root", "prefix": "d.", "start": "d.2", "reverse": true}, + {"partitionKey": "root", "prefix": "d.", "limit": 2} + ]"# + .to_vec(), + ) + .method(Method::POST) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + let json_res = json_body(res).await; + assert_json_eq!( + json_res, + json!([ + { + "partitionKey": "root", + "prefix": null, + "start": null, + "end": null, + "limit": null, + "reverse": false, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "a", "ct": ct.get("a").unwrap(), "v": [base64::encode(values.get("a").unwrap())]}, + {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap()), base64::encode(values.get("c'").unwrap())]}, + {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]}, + {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]}, + {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]} + ], + "more": false, + "nextStart": null, + }, + { + "partitionKey": "root", + "prefix": "d", + "start": null, + "end": null, + "limit": null, + "reverse": false, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]}, + {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]}, + ], + "more": false, + "nextStart": null, + }, + { + "partitionKey": "root", + "prefix": "d.", + "start": null, + "end": "d.2", + "limit": null, + "reverse": false, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]}, + ], + "more": false, + "nextStart": null, + }, + { + "partitionKey": "root", + "prefix": "d.", + "start": null, + "end": null, + "limit": 1, + "reverse": false, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]}, + ], + "more": true, + "nextStart": "d.2", + }, + { + "partitionKey": "root", + "prefix": "d.", + "start": "d.2", + "end": null, + "limit": 1, + "reverse": false, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]}, + ], + "more": false, + "nextStart": null, + }, + { + "partitionKey": "root", + "prefix": "d.", + "start": null, + "end": null, + "limit": null, + "reverse": true, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]}, + {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]}, + ], + "more": false, + "nextStart": null, + }, + { + "partitionKey": "root", + "prefix": "d.", + "start": "d.2", + "end": null, + "limit": null, + 
"reverse": true, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]}, + {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]}, + ], + "more": false, + "nextStart": null, + }, + { + "partitionKey": "root", + "prefix": "d.", + "start": null, + "end": null, + "limit": 2, + "reverse": false, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]}, + {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]}, + ], + "more": false, + "nextStart": null, + }, + ]) + ); + + // Test DeleteBatch + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .query_param("delete", Option::<&str>::None) + .body( + br#"[ + {"partitionKey": "root", "start": "a", "end": "c"}, + {"partitionKey": "root", "prefix": "d"} + ]"# + .to_vec(), + ) + .method(Method::POST) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + let json_res = json_body(res).await; + assert_json_eq!( + json_res, + json!([ + { + "partitionKey": "root", + "prefix": null, + "start": "a", + "end": "c", + "singleItem": false, + "deletedItems": 1, + }, + { + "partitionKey": "root", + "prefix": "d", + "start": null, + "end": null, + "singleItem": false, + "deletedItems": 2, + }, + ]) + ); + + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .query_param("search", Option::<&str>::None) + .body( + br#"[ + {"partitionKey": "root"}, + {"partitionKey": "root", "reverse": true} + ]"# + .to_vec(), + ) + .method(Method::POST) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + let json_res = json_body(res).await; + assert_json_eq!( + json_res, + json!([ + { + "partitionKey": "root", + "prefix": null, + "start": null, + "end": null, + "limit": null, + "reverse": false, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap()), base64::encode(values.get("c'").unwrap())]}, + {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]} + ], + "more": false, + "nextStart": null, + }, + { + "partitionKey": "root", + "prefix": null, + "start": null, + "end": null, + "limit": null, + "reverse": true, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]}, + {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap()), base64::encode(values.get("c'").unwrap())]}, + ], + "more": false, + "nextStart": null, + }, + ]) + ); +} diff --git a/src/garage/tests/k2v/errorcodes.rs b/src/garage/tests/k2v/errorcodes.rs new file mode 100644 index 00000000..2fcc45bc --- /dev/null +++ b/src/garage/tests/k2v/errorcodes.rs @@ -0,0 +1,141 @@ +use crate::common; + +use hyper::Method; + +#[tokio::test] +async fn test_error_codes() { + let ctx = common::context(); + let bucket = ctx.create_bucket("test-k2v-error-codes"); + + // Regular insert should work (code 200) + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .method(Method::PUT) + .path("root") + .query_param("sort_key", Some("test1")) + .body(b"Hello, world!".to_vec()) 
+ .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + + // Insert with trash causality token: invalid request + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .method(Method::PUT) + .path("root") + .query_param("sort_key", Some("test1")) + .signed_header("x-garage-causality-token", "tra$sh") + .body(b"Hello, world!".to_vec()) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 400); + + // Search without partition key: invalid request + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .query_param("search", Option::<&str>::None) + .body( + br#"[ + {}, + ]"# + .to_vec(), + ) + .method(Method::POST) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 400); + + // Search with start that is not in prefix: invalid request + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .query_param("search", Option::<&str>::None) + .body( + br#"[ + {"partition_key": "root", "prefix": "a", "start": "bx"}, + ]"# + .to_vec(), + ) + .method(Method::POST) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 400); + + // Search with invalid json: 400 + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .query_param("search", Option::<&str>::None) + .body( + br#"[ + {"partition_key": "root" + ]"# + .to_vec(), + ) + .method(Method::POST) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 400); + + // Batch insert with invalid causality token: 400 + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .body( + br#"[ + {"pk": "root", "sk": "a", "ct": "tra$h", "v": "aGVsbG8sIHdvcmxkCg=="} + ]"# + .to_vec(), + ) + .method(Method::POST) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 400); + + // Batch insert with invalid data: 400 + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .body( + br#"[ + {"pk": "root", "sk": "a", "ct": null, "v": "aGVsbG8sIHdvcmx$Cg=="} + ]"# + .to_vec(), + ) + .method(Method::POST) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 400); + + // Poll with invalid causality token: 400 + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("test1")) + .query_param("causality_token", Some("tra$h")) + .query_param("timeout", Some("10")) + .signed_header("accept", "application/octet-stream") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 400); +} diff --git a/src/garage/tests/k2v/item.rs b/src/garage/tests/k2v/item.rs new file mode 100644 index 00000000..bf2b01f8 --- /dev/null +++ b/src/garage/tests/k2v/item.rs @@ -0,0 +1,719 @@ +use crate::common; + +use assert_json_diff::assert_json_eq; +use serde_json::json; + +use super::json_body; +use hyper::Method; + +#[tokio::test] +async fn test_items_and_indices() { + let ctx = common::context(); + let bucket = ctx.create_bucket("test-k2v-item-and-index"); + + // ReadIndex -- there should be nothing + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .send() + .await + .unwrap(); + let res_body = json_body(res).await; + assert_json_eq!( + res_body, + json!({ + "prefix": null, + "start": null, + "end": null, + "limit": null, + "reverse": false, + "partitionKeys": [], + "more": false, + "nextStart": null + }) + ); + + let content2_len = "_: hello universe".len(); + let content3_len = "_: concurrent value".len(); + + for (i, sk) in ["a", "b", "c", "d"].iter().enumerate() { + let content = format!("{}: hello world", sk).into_bytes(); + let content2 = format!("{}: hello universe", sk).into_bytes(); + let content3 = format!("{}: concurrent value", 
sk).into_bytes(); + + // Put initially, no causality token + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some(sk)) + .body(content.clone()) + .method(Method::PUT) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + + // Get value back + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some(sk)) + .signed_header("accept", "*/*") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/octet-stream" + ); + let ct = res + .headers() + .get("x-garage-causality-token") + .unwrap() + .to_str() + .unwrap() + .to_string(); + let res_body = hyper::body::to_bytes(res.into_body()) + .await + .unwrap() + .to_vec(); + assert_eq!(res_body, content); + + // ReadIndex -- now there should be some stuff + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .send() + .await + .unwrap(); + let res_body = json_body(res).await; + assert_json_eq!( + res_body, + json!({ + "prefix": null, + "start": null, + "end": null, + "limit": null, + "reverse": false, + "partitionKeys": [ + { + "pk": "root", + "entries": i+1, + "conflicts": i, + "values": i+i+1, + "bytes": i*(content2.len() + content3.len()) + content.len(), + } + ], + "more": false, + "nextStart": null + }) + ); + + // Put again, this time with causality token + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some(sk)) + .signed_header("x-garage-causality-token", ct.clone()) + .body(content2.clone()) + .method(Method::PUT) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + + // Get value back + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some(sk)) + .signed_header("accept", "*/*") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/octet-stream" + ); + let res_body = hyper::body::to_bytes(res.into_body()) + .await + .unwrap() + .to_vec(); + assert_eq!(res_body, content2); + + // ReadIndex -- now there should be some stuff + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .send() + .await + .unwrap(); + let res_body = json_body(res).await; + assert_json_eq!( + res_body, + json!({ + "prefix": null, + "start": null, + "end": null, + "limit": null, + "reverse": false, + "partitionKeys": [ + { + "pk": "root", + "entries": i+1, + "conflicts": i, + "values": i+i+1, + "bytes": i*content3.len() + (i+1)*content2.len(), + } + ], + "more": false, + "nextStart": null + }) + ); + + // Put again with same CT, now we have concurrent values + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some(sk)) + .signed_header("x-garage-causality-token", ct.clone()) + .body(content3.clone()) + .method(Method::PUT) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + + // Get value back + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some(sk)) + .signed_header("accept", "*/*") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/json" + ); + let res_json = json_body(res).await; + assert_json_eq!( + res_json, + [base64::encode(&content2), base64::encode(&content3)] + ); + + // ReadIndex -- 
now there should be some stuff + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .send() + .await + .unwrap(); + let res_body = json_body(res).await; + assert_json_eq!( + res_body, + json!({ + "prefix": null, + "start": null, + "end": null, + "limit": null, + "reverse": false, + "partitionKeys": [ + { + "pk": "root", + "entries": i+1, + "conflicts": i+1, + "values": 2*(i+1), + "bytes": (i+1)*(content2.len() + content3.len()), + } + ], + "more": false, + "nextStart": null + }) + ); + } + + // Now delete things + for (i, sk) in ["a", "b", "c", "d"].iter().enumerate() { + // Get value back (we just need the CT) + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some(sk)) + .signed_header("accept", "*/*") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + let ct = res + .headers() + .get("x-garage-causality-token") + .unwrap() + .to_str() + .unwrap() + .to_string(); + + // Delete it + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .method(Method::DELETE) + .path("root") + .query_param("sort_key", Some(sk)) + .signed_header("x-garage-causality-token", ct) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 204); + + // ReadIndex -- now there should be some stuff + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .send() + .await + .unwrap(); + let res_body = json_body(res).await; + if i < 3 { + assert_json_eq!( + res_body, + json!({ + "prefix": null, + "start": null, + "end": null, + "limit": null, + "reverse": false, + "partitionKeys": [ + { + "pk": "root", + "entries": 3-i, + "conflicts": 3-i, + "values": 2*(3-i), + "bytes": (3-i)*(content2_len + content3_len), + } + ], + "more": false, + "nextStart": null + }) + ); + } else { + assert_json_eq!( + res_body, + json!({ + "prefix": null, + "start": null, + "end": null, + "limit": null, + "reverse": false, + "partitionKeys": [], + "more": false, + "nextStart": null + }) + ); + } + } +} + +#[tokio::test] +async fn test_item_return_format() { + let ctx = common::context(); + let bucket = ctx.create_bucket("test-k2v-item-return-format"); + + let single_value = b"A single value".to_vec(); + let concurrent_value = b"A concurrent value".to_vec(); + + // -- Test with a single value -- + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .body(single_value.clone()) + .method(Method::PUT) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + + // f0: either + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .signed_header("accept", "*/*") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/octet-stream" + ); + let ct = res + .headers() + .get("x-garage-causality-token") + .unwrap() + .to_str() + .unwrap() + .to_string(); + let res_body = hyper::body::to_bytes(res.into_body()) + .await + .unwrap() + .to_vec(); + assert_eq!(res_body, single_value); + + // f1: not specified + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/json" + ); + let res_body = json_body(res).await; + assert_json_eq!(res_body, json!([base64::encode(&single_value)])); + + // f2: binary + let res = ctx + .k2v 
+ .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .signed_header("accept", "application/octet-stream") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/octet-stream" + ); + let res_body = hyper::body::to_bytes(res.into_body()) + .await + .unwrap() + .to_vec(); + assert_eq!(res_body, single_value); + + // f3: json + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .signed_header("accept", "application/json") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/json" + ); + let res_body = json_body(res).await; + assert_json_eq!(res_body, json!([base64::encode(&single_value)])); + + // -- Test with a second, concurrent value -- + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .body(concurrent_value.clone()) + .method(Method::PUT) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + + // f0: either + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .signed_header("accept", "*/*") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/json" + ); + let res_body = json_body(res).await; + assert_json_eq!( + res_body, + json!([ + base64::encode(&single_value), + base64::encode(&concurrent_value) + ]) + ); + + // f1: not specified + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/json" + ); + let res_body = json_body(res).await; + assert_json_eq!( + res_body, + json!([ + base64::encode(&single_value), + base64::encode(&concurrent_value) + ]) + ); + + // f2: binary + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .signed_header("accept", "application/octet-stream") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 409); // CONFLICT + + // f3: json + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .signed_header("accept", "application/json") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/json" + ); + let res_body = json_body(res).await; + assert_json_eq!( + res_body, + json!([ + base64::encode(&single_value), + base64::encode(&concurrent_value) + ]) + ); + + // -- Delete first value, concurrently with second insert -- + // -- (we now have a concurrent value and a deletion) -- + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .method(Method::DELETE) + .signed_header("x-garage-causality-token", ct) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 204); + + // f0: either + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .signed_header("accept", "*/*") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + 
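Aside: the item tests above and below all follow the same read-modify-write discipline: read the item to obtain its x-garage-causality-token, then send that token back with the next PUT or DELETE so the write supersedes what was read instead of creating a concurrent sibling. A sketch of that flow, written against the CustomRequester test helper shown in the earlier hunks; it assumes the same ctx/bucket fixtures as the surrounding tests and a key that currently holds a single value (a binary read of conflicting values returns 409, as tested above).

use hyper::Method;

// Illustrative helper, not part of the test suite: supersede the current value
// of (pk = "root", sk) with `new_value` using its causality token.
async fn supersede_item(ctx: &common::Context, bucket: String, sk: &str, new_value: Vec<u8>) {
    // 1. Read the current value; the response carries its causality token.
    let res = ctx
        .k2v
        .request
        .builder(bucket.clone())
        .path("root")
        .query_param("sort_key", Some(sk))
        .signed_header("accept", "application/octet-stream")
        .send()
        .await
        .unwrap();
    assert_eq!(res.status(), 200);
    let ct = res
        .headers()
        .get("x-garage-causality-token")
        .unwrap()
        .to_str()
        .unwrap()
        .to_string();

    // 2. Write back with the token: this replaces the value that was read,
    //    whereas omitting the token would register a concurrent value.
    let res = ctx
        .k2v
        .request
        .builder(bucket)
        .method(Method::PUT)
        .path("root")
        .query_param("sort_key", Some(sk))
        .signed_header("x-garage-causality-token", ct)
        .body(new_value)
        .send()
        .await
        .unwrap();
    assert_eq!(res.status(), 200);
}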
assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/json" + ); + let res_body = json_body(res).await; + assert_json_eq!(res_body, json!([base64::encode(&concurrent_value), null])); + + // f1: not specified + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/json" + ); + let ct = res + .headers() + .get("x-garage-causality-token") + .unwrap() + .to_str() + .unwrap() + .to_string(); + let res_body = json_body(res).await; + assert_json_eq!(res_body, json!([base64::encode(&concurrent_value), null])); + + // f2: binary + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .signed_header("accept", "application/octet-stream") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 409); // CONFLICT + + // f3: json + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .signed_header("accept", "application/json") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/json" + ); + let res_body = json_body(res).await; + assert_json_eq!(res_body, json!([base64::encode(&concurrent_value), null])); + + // -- Delete everything -- + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .method(Method::DELETE) + .signed_header("x-garage-causality-token", ct) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 204); + + // f0: either + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .signed_header("accept", "*/*") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 204); // NO CONTENT + + // f1: not specified + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/json" + ); + let res_body = json_body(res).await; + assert_json_eq!(res_body, json!([null])); + + // f2: binary + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .signed_header("accept", "application/octet-stream") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 204); // NO CONTENT + + // f3: json + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("v1")) + .signed_header("accept", "application/json") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/json" + ); + let res_body = json_body(res).await; + assert_json_eq!(res_body, json!([null])); +} diff --git a/src/garage/tests/k2v/mod.rs b/src/garage/tests/k2v/mod.rs new file mode 100644 index 00000000..a009460e --- /dev/null +++ b/src/garage/tests/k2v/mod.rs @@ -0,0 +1,18 @@ +pub mod batch; +pub mod errorcodes; +pub mod item; +pub mod poll; +pub mod simple; + +use hyper::{Body, Response}; + +pub async fn json_body(res: Response) -> serde_json::Value { + let res_body: serde_json::Value = serde_json::from_slice( + 
&hyper::body::to_bytes(res.into_body()) + .await + .unwrap() + .to_vec()[..], + ) + .unwrap(); + res_body +} diff --git a/src/garage/tests/k2v/poll.rs b/src/garage/tests/k2v/poll.rs new file mode 100644 index 00000000..70dc0410 --- /dev/null +++ b/src/garage/tests/k2v/poll.rs @@ -0,0 +1,98 @@ +use hyper::Method; +use std::time::Duration; + +use crate::common; + +#[tokio::test] +async fn test_poll() { + let ctx = common::context(); + let bucket = ctx.create_bucket("test-k2v-poll"); + + // Write initial value + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .method(Method::PUT) + .path("root") + .query_param("sort_key", Some("test1")) + .body(b"Initial value".to_vec()) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + + // Retrieve initial value to get its causality token + let res2 = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("test1")) + .signed_header("accept", "application/octet-stream") + .send() + .await + .unwrap(); + assert_eq!(res2.status(), 200); + let ct = res2 + .headers() + .get("x-garage-causality-token") + .unwrap() + .to_str() + .unwrap() + .to_string(); + + let res2_body = hyper::body::to_bytes(res2.into_body()) + .await + .unwrap() + .to_vec(); + assert_eq!(res2_body, b"Initial value"); + + // Start poll operation + let poll = { + let bucket = bucket.clone(); + let ct = ct.clone(); + tokio::spawn(async move { + let ctx = common::context(); + ctx.k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("test1")) + .query_param("causality_token", Some(ct)) + .query_param("timeout", Some("10")) + .signed_header("accept", "application/octet-stream") + .send() + .await + }) + }; + + // Write new value that supersedes initial one + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .method(Method::PUT) + .path("root") + .query_param("sort_key", Some("test1")) + .signed_header("x-garage-causality-token", ct) + .body(b"New value".to_vec()) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + + // Check poll finishes with correct value + let poll_res = tokio::select! 
{ + _ = tokio::time::sleep(Duration::from_secs(10)) => panic!("poll did not terminate in time"), + res = poll => res.unwrap().unwrap(), + }; + + assert_eq!(poll_res.status(), 200); + + let poll_res_body = hyper::body::to_bytes(poll_res.into_body()) + .await + .unwrap() + .to_vec(); + assert_eq!(poll_res_body, b"New value"); +} diff --git a/src/garage/tests/k2v/simple.rs b/src/garage/tests/k2v/simple.rs new file mode 100644 index 00000000..ae9a8674 --- /dev/null +++ b/src/garage/tests/k2v/simple.rs @@ -0,0 +1,40 @@ +use crate::common; + +use hyper::Method; + +#[tokio::test] +async fn test_simple() { + let ctx = common::context(); + let bucket = ctx.create_bucket("test-k2v-simple"); + + let res = ctx + .k2v + .request + .builder(bucket.clone()) + .method(Method::PUT) + .path("root") + .query_param("sort_key", Some("test1")) + .body(b"Hello, world!".to_vec()) + .send() + .await + .unwrap(); + assert_eq!(res.status(), 200); + + let res2 = ctx + .k2v + .request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some("test1")) + .signed_header("accept", "application/octet-stream") + .send() + .await + .unwrap(); + assert_eq!(res2.status(), 200); + + let res2_body = hyper::body::to_bytes(res2.into_body()) + .await + .unwrap() + .to_vec(); + assert_eq!(res2_body, b"Hello, world!"); +} diff --git a/src/garage/tests/lib.rs b/src/garage/tests/lib.rs index 8799c395..0106ad10 100644 --- a/src/garage/tests/lib.rs +++ b/src/garage/tests/lib.rs @@ -3,9 +3,5 @@ mod common; mod admin; mod bucket; -mod list; -mod multipart; -mod objects; -mod simple; -mod streaming_signature; -mod website; +mod k2v; +mod s3; diff --git a/src/garage/tests/list.rs b/src/garage/tests/list.rs deleted file mode 100644 index bb03f250..00000000 --- a/src/garage/tests/list.rs +++ /dev/null @@ -1,615 +0,0 @@ -use crate::common; - -const KEYS: [&str; 8] = ["a", "a/a", "a/b", "a/c", "a/d/a", "a/é", "b", "c"]; -const KEYS_MULTIPART: [&str; 5] = ["a", "a", "c", "c/a", "c/b"]; - -#[tokio::test] -async fn test_listobjectsv2() { - let ctx = common::context(); - let bucket = ctx.create_bucket("listobjectsv2"); - - for k in KEYS { - ctx.client - .put_object() - .bucket(&bucket) - .key(k) - .send() - .await - .unwrap(); - } - - { - // Scoping the variable to avoid reusing it - // in a following assert due to copy paste - let r = ctx - .client - .list_objects_v2() - .bucket(&bucket) - .send() - .await - .unwrap(); - - assert_eq!(r.contents.unwrap().len(), 8); - assert!(r.common_prefixes.is_none()); - } - - //@FIXME aws-sdk-s3 automatically checks max-key values. - // If we set it to zero, it drops it, and it is probably - // the same behavior on values bigger than 1000. - // Boto and awscli do not perform these tests, we should write - // our own minimal library to bypass AWS SDK's tests and be - // sure that we behave correctly. 
- - { - // With 2 elements - let r = ctx - .client - .list_objects_v2() - .bucket(&bucket) - .max_keys(2) - .send() - .await - .unwrap(); - - assert_eq!(r.contents.unwrap().len(), 2); - assert!(r.common_prefixes.is_none()); - assert!(r.next_continuation_token.is_some()); - } - - { - // With pagination - let mut cnt = 0; - let mut next = None; - let last_idx = KEYS.len() - 1; - - for i in 0..KEYS.len() { - let r = ctx - .client - .list_objects_v2() - .bucket(&bucket) - .set_continuation_token(next) - .max_keys(1) - .send() - .await - .unwrap(); - - cnt += 1; - next = r.next_continuation_token; - - assert_eq!(r.contents.unwrap().len(), 1); - assert!(r.common_prefixes.is_none()); - if i != last_idx { - assert!(next.is_some()); - } - } - assert_eq!(cnt, KEYS.len()); - } - - { - // With a delimiter - let r = ctx - .client - .list_objects_v2() - .bucket(&bucket) - .delimiter("/") - .send() - .await - .unwrap(); - - assert_eq!(r.contents.unwrap().len(), 3); - assert_eq!(r.common_prefixes.unwrap().len(), 1); - } - - { - // With a delimiter and pagination - let mut cnt_pfx = 0; - let mut cnt_key = 0; - let mut next = None; - - for _i in 0..KEYS.len() { - let r = ctx - .client - .list_objects_v2() - .bucket(&bucket) - .set_continuation_token(next) - .delimiter("/") - .max_keys(1) - .send() - .await - .unwrap(); - - next = r.next_continuation_token; - match (r.contents, r.common_prefixes) { - (Some(k), None) if k.len() == 1 => cnt_key += 1, - (None, Some(pfx)) if pfx.len() == 1 => cnt_pfx += 1, - _ => unreachable!("logic error"), - }; - if next.is_none() { - break; - } - } - assert_eq!(cnt_key, 3); - assert_eq!(cnt_pfx, 1); - } - - { - // With a prefix - let r = ctx - .client - .list_objects_v2() - .bucket(&bucket) - .prefix("a/") - .send() - .await - .unwrap(); - - assert_eq!(r.contents.unwrap().len(), 5); - assert!(r.common_prefixes.is_none()); - } - - { - // With a prefix and a delimiter - let r = ctx - .client - .list_objects_v2() - .bucket(&bucket) - .prefix("a/") - .delimiter("/") - .send() - .await - .unwrap(); - - assert_eq!(r.contents.unwrap().len(), 4); - assert_eq!(r.common_prefixes.unwrap().len(), 1); - } - - { - // With a prefix, a delimiter and max_key - let r = ctx - .client - .list_objects_v2() - .bucket(&bucket) - .prefix("a/") - .delimiter("/") - .max_keys(1) - .send() - .await - .unwrap(); - - assert_eq!(r.contents.as_ref().unwrap().len(), 1); - assert_eq!( - r.contents - .unwrap() - .first() - .unwrap() - .key - .as_ref() - .unwrap() - .as_str(), - "a/a" - ); - assert!(r.common_prefixes.is_none()); - } - { - // With start_after before all keys - let r = ctx - .client - .list_objects_v2() - .bucket(&bucket) - .start_after("Z") - .send() - .await - .unwrap(); - - assert_eq!(r.contents.unwrap().len(), 8); - assert!(r.common_prefixes.is_none()); - } - { - // With start_after after all keys - let r = ctx - .client - .list_objects_v2() - .bucket(&bucket) - .start_after("c") - .send() - .await - .unwrap(); - - assert!(r.contents.is_none()); - assert!(r.common_prefixes.is_none()); - } -} - -#[tokio::test] -async fn test_listobjectsv1() { - let ctx = common::context(); - let bucket = ctx.create_bucket("listobjects"); - - for k in KEYS { - ctx.client - .put_object() - .bucket(&bucket) - .key(k) - .send() - .await - .unwrap(); - } - - { - let r = ctx - .client - .list_objects() - .bucket(&bucket) - .send() - .await - .unwrap(); - - assert_eq!(r.contents.unwrap().len(), 8); - assert!(r.common_prefixes.is_none()); - } - - { - // With 2 elements - let r = ctx - .client - .list_objects() - 
.bucket(&bucket) - .max_keys(2) - .send() - .await - .unwrap(); - - assert_eq!(r.contents.unwrap().len(), 2); - assert!(r.common_prefixes.is_none()); - assert!(r.next_marker.is_some()); - } - - { - // With pagination - let mut cnt = 0; - let mut next = None; - let last_idx = KEYS.len() - 1; - - for i in 0..KEYS.len() { - let r = ctx - .client - .list_objects() - .bucket(&bucket) - .set_marker(next) - .max_keys(1) - .send() - .await - .unwrap(); - - cnt += 1; - next = r.next_marker; - - assert_eq!(r.contents.unwrap().len(), 1); - assert!(r.common_prefixes.is_none()); - if i != last_idx { - assert!(next.is_some()); - } - } - assert_eq!(cnt, KEYS.len()); - } - - { - // With a delimiter - let r = ctx - .client - .list_objects() - .bucket(&bucket) - .delimiter("/") - .send() - .await - .unwrap(); - - assert_eq!(r.contents.unwrap().len(), 3); - assert_eq!(r.common_prefixes.unwrap().len(), 1); - } - - { - // With a delimiter and pagination - let mut cnt_pfx = 0; - let mut cnt_key = 0; - let mut next = None; - - for _i in 0..KEYS.len() { - let r = ctx - .client - .list_objects() - .bucket(&bucket) - .delimiter("/") - .set_marker(next) - .max_keys(1) - .send() - .await - .unwrap(); - - next = r.next_marker; - match (r.contents, r.common_prefixes) { - (Some(k), None) if k.len() == 1 => cnt_key += 1, - (None, Some(pfx)) if pfx.len() == 1 => cnt_pfx += 1, - _ => unreachable!("logic error"), - }; - if next.is_none() { - break; - } - } - assert_eq!(cnt_key, 3); - // We have no optimization to skip the whole prefix - // on listobjectsv1 so we return the same one 5 times, - // for each element. It is up to the client to merge its result. - // This is compliant with AWS spec. - assert_eq!(cnt_pfx, 5); - } - - { - // With a prefix - let r = ctx - .client - .list_objects() - .bucket(&bucket) - .prefix("a/") - .send() - .await - .unwrap(); - - assert_eq!(r.contents.unwrap().len(), 5); - assert!(r.common_prefixes.is_none()); - } - - { - // With a prefix and a delimiter - let r = ctx - .client - .list_objects() - .bucket(&bucket) - .prefix("a/") - .delimiter("/") - .send() - .await - .unwrap(); - - assert_eq!(r.contents.unwrap().len(), 4); - assert_eq!(r.common_prefixes.unwrap().len(), 1); - } - - { - // With a prefix, a delimiter and max_key - let r = ctx - .client - .list_objects() - .bucket(&bucket) - .prefix("a/") - .delimiter("/") - .max_keys(1) - .send() - .await - .unwrap(); - - assert_eq!(r.contents.as_ref().unwrap().len(), 1); - assert_eq!( - r.contents - .unwrap() - .first() - .unwrap() - .key - .as_ref() - .unwrap() - .as_str(), - "a/a" - ); - assert!(r.common_prefixes.is_none()); - } - { - // With marker before all keys - let r = ctx - .client - .list_objects() - .bucket(&bucket) - .marker("Z") - .send() - .await - .unwrap(); - - assert_eq!(r.contents.unwrap().len(), 8); - assert!(r.common_prefixes.is_none()); - } - { - // With start_after after all keys - let r = ctx - .client - .list_objects() - .bucket(&bucket) - .marker("c") - .send() - .await - .unwrap(); - - assert!(r.contents.is_none()); - assert!(r.common_prefixes.is_none()); - } -} - -#[tokio::test] -async fn test_listmultipart() { - let ctx = common::context(); - let bucket = ctx.create_bucket("listmultipartuploads"); - - for k in KEYS_MULTIPART { - ctx.client - .create_multipart_upload() - .bucket(&bucket) - .key(k) - .send() - .await - .unwrap(); - } - - { - // Default - let r = ctx - .client - .list_multipart_uploads() - .bucket(&bucket) - .send() - .await - .unwrap(); - - assert_eq!(r.uploads.unwrap().len(), 5); - 
assert!(r.common_prefixes.is_none()); - } - { - // With pagination - let mut next = None; - let mut upnext = None; - let last_idx = KEYS_MULTIPART.len() - 1; - - for i in 0..KEYS_MULTIPART.len() { - let r = ctx - .client - .list_multipart_uploads() - .bucket(&bucket) - .set_key_marker(next) - .set_upload_id_marker(upnext) - .max_uploads(1) - .send() - .await - .unwrap(); - - next = r.next_key_marker; - upnext = r.next_upload_id_marker; - - assert_eq!(r.uploads.unwrap().len(), 1); - assert!(r.common_prefixes.is_none()); - if i != last_idx { - assert!(next.is_some()); - } - } - } - { - // With delimiter - let r = ctx - .client - .list_multipart_uploads() - .bucket(&bucket) - .delimiter("/") - .send() - .await - .unwrap(); - - assert_eq!(r.uploads.unwrap().len(), 3); - assert_eq!(r.common_prefixes.unwrap().len(), 1); - } - { - // With delimiter and pagination - let mut next = None; - let mut upnext = None; - let mut upcnt = 0; - let mut pfxcnt = 0; - let mut loopcnt = 0; - - while loopcnt < KEYS_MULTIPART.len() { - let r = ctx - .client - .list_multipart_uploads() - .bucket(&bucket) - .delimiter("/") - .max_uploads(1) - .set_key_marker(next) - .set_upload_id_marker(upnext) - .send() - .await - .unwrap(); - - next = r.next_key_marker; - upnext = r.next_upload_id_marker; - - loopcnt += 1; - upcnt += r.uploads.unwrap_or_default().len(); - pfxcnt += r.common_prefixes.unwrap_or_default().len(); - - if next.is_none() { - break; - } - } - - assert_eq!(upcnt + pfxcnt, loopcnt); - assert_eq!(upcnt, 3); - assert_eq!(pfxcnt, 1); - } - { - // With prefix - let r = ctx - .client - .list_multipart_uploads() - .bucket(&bucket) - .prefix("c") - .send() - .await - .unwrap(); - - assert_eq!(r.uploads.unwrap().len(), 3); - assert!(r.common_prefixes.is_none()); - } - { - // With prefix and delimiter - let r = ctx - .client - .list_multipart_uploads() - .bucket(&bucket) - .prefix("c") - .delimiter("/") - .send() - .await - .unwrap(); - - assert_eq!(r.uploads.unwrap().len(), 1); - assert_eq!(r.common_prefixes.unwrap().len(), 1); - } - { - // With prefix, delimiter and max keys - let r = ctx - .client - .list_multipart_uploads() - .bucket(&bucket) - .prefix("c") - .delimiter("/") - .max_uploads(1) - .send() - .await - .unwrap(); - - assert_eq!(r.uploads.unwrap().len(), 1); - assert!(r.common_prefixes.is_none()); - } - { - // With starting token before the first element - let r = ctx - .client - .list_multipart_uploads() - .bucket(&bucket) - .key_marker("ZZZZZ") - .send() - .await - .unwrap(); - - assert_eq!(r.uploads.unwrap().len(), 5); - assert!(r.common_prefixes.is_none()); - } - { - // With starting token after the last element - let r = ctx - .client - .list_multipart_uploads() - .bucket(&bucket) - .key_marker("d") - .send() - .await - .unwrap(); - - assert!(r.uploads.is_none()); - assert!(r.common_prefixes.is_none()); - } -} diff --git a/src/garage/tests/multipart.rs b/src/garage/tests/multipart.rs deleted file mode 100644 index 895a2993..00000000 --- a/src/garage/tests/multipart.rs +++ /dev/null @@ -1,415 +0,0 @@ -use crate::common; -use aws_sdk_s3::model::{CompletedMultipartUpload, CompletedPart}; -use aws_sdk_s3::types::ByteStream; - -const SZ_5MB: usize = 5 * 1024 * 1024; -const SZ_10MB: usize = 10 * 1024 * 1024; - -#[tokio::test] -async fn test_uploadlistpart() { - let ctx = common::context(); - let bucket = ctx.create_bucket("uploadpart"); - - let u1 = vec![0xee; SZ_5MB]; - let u2 = vec![0x11; SZ_5MB]; - - let up = ctx - .client - .create_multipart_upload() - .bucket(&bucket) - .key("a") - .send() - 
.await - .unwrap(); - let uid = up.upload_id.as_ref().unwrap(); - - assert!(up.upload_id.is_some()); - - { - let r = ctx - .client - .list_parts() - .bucket(&bucket) - .key("a") - .upload_id(uid) - .send() - .await - .unwrap(); - - assert!(r.parts.is_none()); - } - - let p1 = ctx - .client - .upload_part() - .bucket(&bucket) - .key("a") - .upload_id(uid) - .part_number(2) - .body(ByteStream::from(u1)) - .send() - .await - .unwrap(); - - { - // ListPart on 1st element - let r = ctx - .client - .list_parts() - .bucket(&bucket) - .key("a") - .upload_id(uid) - .send() - .await - .unwrap(); - - let ps = r.parts.unwrap(); - assert_eq!(ps.len(), 1); - let fp = ps.iter().find(|x| x.part_number == 2).unwrap(); - assert!(fp.last_modified.is_some()); - assert_eq!( - fp.e_tag.as_ref().unwrap(), - "\"3366bb9dcf710d6801b5926467d02e19\"" - ); - assert_eq!(fp.size, SZ_5MB as i64); - } - - let p2 = ctx - .client - .upload_part() - .bucket(&bucket) - .key("a") - .upload_id(uid) - .part_number(1) - .body(ByteStream::from(u2)) - .send() - .await - .unwrap(); - - { - // ListPart on the 2 elements - let r = ctx - .client - .list_parts() - .bucket(&bucket) - .key("a") - .upload_id(uid) - .send() - .await - .unwrap(); - - let ps = r.parts.unwrap(); - assert_eq!(ps.len(), 2); - let fp = ps.iter().find(|x| x.part_number == 1).unwrap(); - assert!(fp.last_modified.is_some()); - assert_eq!( - fp.e_tag.as_ref().unwrap(), - "\"3c484266f9315485694556e6c693bfa2\"" - ); - assert_eq!(fp.size, SZ_5MB as i64); - } - - { - // Call pagination - let r = ctx - .client - .list_parts() - .bucket(&bucket) - .key("a") - .upload_id(uid) - .max_parts(1) - .send() - .await - .unwrap(); - - assert!(r.part_number_marker.is_none()); - assert!(r.next_part_number_marker.is_some()); - assert_eq!(r.max_parts, 1_i32); - assert!(r.is_truncated); - assert_eq!(r.key.unwrap(), "a"); - assert_eq!(r.upload_id.unwrap().as_str(), uid.as_str()); - assert_eq!(r.parts.unwrap().len(), 1); - - let r2 = ctx - .client - .list_parts() - .bucket(&bucket) - .key("a") - .upload_id(uid) - .max_parts(1) - .part_number_marker(r.next_part_number_marker.as_ref().unwrap()) - .send() - .await - .unwrap(); - - assert_eq!( - r2.part_number_marker.as_ref().unwrap(), - r.next_part_number_marker.as_ref().unwrap() - ); - assert_eq!(r2.max_parts, 1_i32); - assert!(r2.is_truncated); - assert_eq!(r2.key.unwrap(), "a"); - assert_eq!(r2.upload_id.unwrap().as_str(), uid.as_str()); - assert_eq!(r2.parts.unwrap().len(), 1); - } - - let cmp = CompletedMultipartUpload::builder() - .parts( - CompletedPart::builder() - .part_number(1) - .e_tag(p2.e_tag.unwrap()) - .build(), - ) - .parts( - CompletedPart::builder() - .part_number(2) - .e_tag(p1.e_tag.unwrap()) - .build(), - ) - .build(); - - ctx.client - .complete_multipart_upload() - .bucket(&bucket) - .key("a") - .upload_id(uid) - .multipart_upload(cmp) - .send() - .await - .unwrap(); - - // The multipart upload must not appear anymore - assert!(ctx - .client - .list_parts() - .bucket(&bucket) - .key("a") - .upload_id(uid) - .send() - .await - .is_err()); - - { - // The object must appear as a regular object - let r = ctx - .client - .head_object() - .bucket(&bucket) - .key("a") - .send() - .await - .unwrap(); - - assert_eq!(r.content_length, (SZ_5MB * 2) as i64); - } -} - -#[tokio::test] -async fn test_uploadpartcopy() { - let ctx = common::context(); - let bucket = ctx.create_bucket("uploadpartcopy"); - - let u1 = vec![0x11; SZ_10MB]; - let u2 = vec![0x22; SZ_5MB]; - let u3 = vec![0x33; SZ_5MB]; - let u4 = vec![0x44; SZ_5MB]; - let u5 
= vec![0x55; SZ_5MB]; - - let overflow = 5500000 - SZ_5MB; - let mut exp_obj = u3.clone(); - exp_obj.extend(&u4[500..]); - exp_obj.extend(&u5[..overflow + 1]); - exp_obj.extend(&u2); - exp_obj.extend(&u1[500..5500000 + 1]); - - // (setup) Upload a single part object - ctx.client - .put_object() - .bucket(&bucket) - .key("source1") - .body(ByteStream::from(u1)) - .send() - .await - .unwrap(); - - // (setup) Upload a multipart object with 2 parts - { - let up = ctx - .client - .create_multipart_upload() - .bucket(&bucket) - .key("source2") - .send() - .await - .unwrap(); - let uid = up.upload_id.as_ref().unwrap(); - - let p1 = ctx - .client - .upload_part() - .bucket(&bucket) - .key("source2") - .upload_id(uid) - .part_number(1) - .body(ByteStream::from(u4)) - .send() - .await - .unwrap(); - - let p2 = ctx - .client - .upload_part() - .bucket(&bucket) - .key("source2") - .upload_id(uid) - .part_number(2) - .body(ByteStream::from(u5)) - .send() - .await - .unwrap(); - - let cmp = CompletedMultipartUpload::builder() - .parts( - CompletedPart::builder() - .part_number(1) - .e_tag(p1.e_tag.unwrap()) - .build(), - ) - .parts( - CompletedPart::builder() - .part_number(2) - .e_tag(p2.e_tag.unwrap()) - .build(), - ) - .build(); - - ctx.client - .complete_multipart_upload() - .bucket(&bucket) - .key("source2") - .upload_id(uid) - .multipart_upload(cmp) - .send() - .await - .unwrap(); - } - - // Our multipart object that does copy - let up = ctx - .client - .create_multipart_upload() - .bucket(&bucket) - .key("target") - .send() - .await - .unwrap(); - let uid = up.upload_id.as_ref().unwrap(); - - let p3 = ctx - .client - .upload_part() - .bucket(&bucket) - .key("target") - .upload_id(uid) - .part_number(3) - .body(ByteStream::from(u2)) - .send() - .await - .unwrap(); - - let p1 = ctx - .client - .upload_part() - .bucket(&bucket) - .key("target") - .upload_id(uid) - .part_number(1) - .body(ByteStream::from(u3)) - .send() - .await - .unwrap(); - - let p2 = ctx - .client - .upload_part_copy() - .bucket(&bucket) - .key("target") - .upload_id(uid) - .part_number(2) - .copy_source("uploadpartcopy/source2") - .copy_source_range("bytes=500-5500000") - .send() - .await - .unwrap(); - - let p4 = ctx - .client - .upload_part_copy() - .bucket(&bucket) - .key("target") - .upload_id(uid) - .part_number(4) - .copy_source("uploadpartcopy/source1") - .copy_source_range("bytes=500-5500000") - .send() - .await - .unwrap(); - - let cmp = CompletedMultipartUpload::builder() - .parts( - CompletedPart::builder() - .part_number(1) - .e_tag(p1.e_tag.unwrap()) - .build(), - ) - .parts( - CompletedPart::builder() - .part_number(2) - .e_tag(p2.copy_part_result.unwrap().e_tag.unwrap()) - .build(), - ) - .parts( - CompletedPart::builder() - .part_number(3) - .e_tag(p3.e_tag.unwrap()) - .build(), - ) - .parts( - CompletedPart::builder() - .part_number(4) - .e_tag(p4.copy_part_result.unwrap().e_tag.unwrap()) - .build(), - ) - .build(); - - ctx.client - .complete_multipart_upload() - .bucket(&bucket) - .key("target") - .upload_id(uid) - .multipart_upload(cmp) - .send() - .await - .unwrap(); - - // (check) Get object - - let obj = ctx - .client - .get_object() - .bucket(&bucket) - .key("target") - .send() - .await - .unwrap(); - - let real_obj = obj - .body - .collect() - .await - .expect("Error reading data") - .into_bytes(); - - assert_eq!(real_obj.len(), exp_obj.len()); - assert_eq!(real_obj, exp_obj); -} diff --git a/src/garage/tests/objects.rs b/src/garage/tests/objects.rs deleted file mode 100644 index e1175b81..00000000 --- 
a/src/garage/tests/objects.rs +++ /dev/null @@ -1,266 +0,0 @@ -use crate::common; -use aws_sdk_s3::model::{Delete, ObjectIdentifier}; -use aws_sdk_s3::types::ByteStream; - -const STD_KEY: &str = "hello world"; -const CTRL_KEY: &str = "\x00\x01\x02\x00"; -const UTF8_KEY: &str = "\u{211D}\u{1F923}\u{1F44B}"; -const BODY: &[u8; 62] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - -#[tokio::test] -async fn test_putobject() { - let ctx = common::context(); - let bucket = ctx.create_bucket("putobject"); - - { - // Send an empty object (can serve as a directory marker) - // with a content type - let etag = "\"d41d8cd98f00b204e9800998ecf8427e\""; - let content_type = "text/csv"; - let r = ctx - .client - .put_object() - .bucket(&bucket) - .key(STD_KEY) - .content_type(content_type) - .send() - .await - .unwrap(); - - assert_eq!(r.e_tag.unwrap().as_str(), etag); - // We return a version ID here - // We should check if Amazon is returning one when versioning is not enabled - assert!(r.version_id.is_some()); - - let _version = r.version_id.unwrap(); - - let o = ctx - .client - .get_object() - .bucket(&bucket) - .key(STD_KEY) - .send() - .await - .unwrap(); - - assert_bytes_eq!(o.body, b""); - assert_eq!(o.e_tag.unwrap(), etag); - // We do not return version ID - // We should check if Amazon is returning one when versioning is not enabled - // assert_eq!(o.version_id.unwrap(), _version); - assert_eq!(o.content_type.unwrap(), content_type); - assert!(o.last_modified.is_some()); - assert_eq!(o.content_length, 0); - assert_eq!(o.parts_count, 0); - assert_eq!(o.tag_count, 0); - } - - { - // Key with control characters, - // no content type and some data - let etag = "\"49f68a5c8493ec2c0bf489821c21fc3b\""; - let data = ByteStream::from_static(b"hi"); - - let r = ctx - .client - .put_object() - .bucket(&bucket) - .key(CTRL_KEY) - .body(data) - .send() - .await - .unwrap(); - - assert_eq!(r.e_tag.unwrap().as_str(), etag); - assert!(r.version_id.is_some()); - - let o = ctx - .client - .get_object() - .bucket(&bucket) - .key(CTRL_KEY) - .send() - .await - .unwrap(); - - assert_bytes_eq!(o.body, b"hi"); - assert_eq!(o.e_tag.unwrap(), etag); - assert!(o.last_modified.is_some()); - assert_eq!(o.content_length, 2); - assert_eq!(o.parts_count, 0); - assert_eq!(o.tag_count, 0); - } - - { - // Key with UTF8 codepoints including emoji - let etag = "\"d41d8cd98f00b204e9800998ecf8427e\""; - - let r = ctx - .client - .put_object() - .bucket(&bucket) - .key(UTF8_KEY) - .send() - .await - .unwrap(); - - assert_eq!(r.e_tag.unwrap().as_str(), etag); - assert!(r.version_id.is_some()); - - let o = ctx - .client - .get_object() - .bucket(&bucket) - .key(UTF8_KEY) - .send() - .await - .unwrap(); - - assert_bytes_eq!(o.body, b""); - assert_eq!(o.e_tag.unwrap(), etag); - assert!(o.last_modified.is_some()); - assert_eq!(o.content_length, 0); - assert_eq!(o.parts_count, 0); - assert_eq!(o.tag_count, 0); - } -} - -#[tokio::test] -async fn test_getobject() { - let ctx = common::context(); - let bucket = ctx.create_bucket("getobject"); - - let etag = "\"46cf18a9b447991b450cad3facf5937e\""; - let data = ByteStream::from_static(BODY); - - let r = ctx - .client - .put_object() - .bucket(&bucket) - .key(STD_KEY) - .body(data) - .send() - .await - .unwrap(); - - assert_eq!(r.e_tag.unwrap().as_str(), etag); - - { - let o = ctx - .client - .get_object() - .bucket(&bucket) - .key(STD_KEY) - .range("bytes=1-9") - .send() - .await - .unwrap(); - - assert_eq!(o.content_range.unwrap().as_str(), "bytes 1-9/62"); - 
assert_bytes_eq!(o.body, &BODY[1..10]); - } - { - let o = ctx - .client - .get_object() - .bucket(&bucket) - .key(STD_KEY) - .range("bytes=9-") - .send() - .await - .unwrap(); - assert_eq!(o.content_range.unwrap().as_str(), "bytes 9-61/62"); - assert_bytes_eq!(o.body, &BODY[9..]); - } - { - let o = ctx - .client - .get_object() - .bucket(&bucket) - .key(STD_KEY) - .range("bytes=-5") - .send() - .await - .unwrap(); - assert_eq!(o.content_range.unwrap().as_str(), "bytes 57-61/62"); - assert_bytes_eq!(o.body, &BODY[57..]); - } -} - -#[tokio::test] -async fn test_deleteobject() { - let ctx = common::context(); - let bucket = ctx.create_bucket("deleteobject"); - - let mut to_del = Delete::builder(); - - // add content without data - for i in 0..5 { - let k = format!("k-{}", i); - ctx.client - .put_object() - .bucket(&bucket) - .key(k.to_string()) - .send() - .await - .unwrap(); - if i > 0 { - to_del = to_del.objects(ObjectIdentifier::builder().key(k).build()); - } - } - - // add content with data - for i in 0..5 { - let k = format!("l-{}", i); - let data = ByteStream::from_static(BODY); - ctx.client - .put_object() - .bucket(&bucket) - .key(k.to_string()) - .body(data) - .send() - .await - .unwrap(); - - if i > 0 { - to_del = to_del.objects(ObjectIdentifier::builder().key(k).build()); - } - } - - ctx.client - .delete_object() - .bucket(&bucket) - .key("k-0") - .send() - .await - .unwrap(); - - ctx.client - .delete_object() - .bucket(&bucket) - .key("l-0") - .send() - .await - .unwrap(); - - let r = ctx - .client - .delete_objects() - .bucket(&bucket) - .delete(to_del.build()) - .send() - .await - .unwrap(); - - assert_eq!(r.deleted.unwrap().len(), 8); - - let l = ctx - .client - .list_objects_v2() - .bucket(&bucket) - .send() - .await - .unwrap(); - - assert!(l.contents.is_none()); -} diff --git a/src/garage/tests/s3/list.rs b/src/garage/tests/s3/list.rs new file mode 100644 index 00000000..bb03f250 --- /dev/null +++ b/src/garage/tests/s3/list.rs @@ -0,0 +1,615 @@ +use crate::common; + +const KEYS: [&str; 8] = ["a", "a/a", "a/b", "a/c", "a/d/a", "a/é", "b", "c"]; +const KEYS_MULTIPART: [&str; 5] = ["a", "a", "c", "c/a", "c/b"]; + +#[tokio::test] +async fn test_listobjectsv2() { + let ctx = common::context(); + let bucket = ctx.create_bucket("listobjectsv2"); + + for k in KEYS { + ctx.client + .put_object() + .bucket(&bucket) + .key(k) + .send() + .await + .unwrap(); + } + + { + // Scoping the variable to avoid reusing it + // in a following assert due to copy paste + let r = ctx + .client + .list_objects_v2() + .bucket(&bucket) + .send() + .await + .unwrap(); + + assert_eq!(r.contents.unwrap().len(), 8); + assert!(r.common_prefixes.is_none()); + } + + //@FIXME aws-sdk-s3 automatically checks max-key values. + // If we set it to zero, it drops it, and it is probably + // the same behavior on values bigger than 1000. + // Boto and awscli do not perform these tests, we should write + // our own minimal library to bypass AWS SDK's tests and be + // sure that we behave correctly. 
+ + { + // With 2 elements + let r = ctx + .client + .list_objects_v2() + .bucket(&bucket) + .max_keys(2) + .send() + .await + .unwrap(); + + assert_eq!(r.contents.unwrap().len(), 2); + assert!(r.common_prefixes.is_none()); + assert!(r.next_continuation_token.is_some()); + } + + { + // With pagination + let mut cnt = 0; + let mut next = None; + let last_idx = KEYS.len() - 1; + + for i in 0..KEYS.len() { + let r = ctx + .client + .list_objects_v2() + .bucket(&bucket) + .set_continuation_token(next) + .max_keys(1) + .send() + .await + .unwrap(); + + cnt += 1; + next = r.next_continuation_token; + + assert_eq!(r.contents.unwrap().len(), 1); + assert!(r.common_prefixes.is_none()); + if i != last_idx { + assert!(next.is_some()); + } + } + assert_eq!(cnt, KEYS.len()); + } + + { + // With a delimiter + let r = ctx + .client + .list_objects_v2() + .bucket(&bucket) + .delimiter("/") + .send() + .await + .unwrap(); + + assert_eq!(r.contents.unwrap().len(), 3); + assert_eq!(r.common_prefixes.unwrap().len(), 1); + } + + { + // With a delimiter and pagination + let mut cnt_pfx = 0; + let mut cnt_key = 0; + let mut next = None; + + for _i in 0..KEYS.len() { + let r = ctx + .client + .list_objects_v2() + .bucket(&bucket) + .set_continuation_token(next) + .delimiter("/") + .max_keys(1) + .send() + .await + .unwrap(); + + next = r.next_continuation_token; + match (r.contents, r.common_prefixes) { + (Some(k), None) if k.len() == 1 => cnt_key += 1, + (None, Some(pfx)) if pfx.len() == 1 => cnt_pfx += 1, + _ => unreachable!("logic error"), + }; + if next.is_none() { + break; + } + } + assert_eq!(cnt_key, 3); + assert_eq!(cnt_pfx, 1); + } + + { + // With a prefix + let r = ctx + .client + .list_objects_v2() + .bucket(&bucket) + .prefix("a/") + .send() + .await + .unwrap(); + + assert_eq!(r.contents.unwrap().len(), 5); + assert!(r.common_prefixes.is_none()); + } + + { + // With a prefix and a delimiter + let r = ctx + .client + .list_objects_v2() + .bucket(&bucket) + .prefix("a/") + .delimiter("/") + .send() + .await + .unwrap(); + + assert_eq!(r.contents.unwrap().len(), 4); + assert_eq!(r.common_prefixes.unwrap().len(), 1); + } + + { + // With a prefix, a delimiter and max_key + let r = ctx + .client + .list_objects_v2() + .bucket(&bucket) + .prefix("a/") + .delimiter("/") + .max_keys(1) + .send() + .await + .unwrap(); + + assert_eq!(r.contents.as_ref().unwrap().len(), 1); + assert_eq!( + r.contents + .unwrap() + .first() + .unwrap() + .key + .as_ref() + .unwrap() + .as_str(), + "a/a" + ); + assert!(r.common_prefixes.is_none()); + } + { + // With start_after before all keys + let r = ctx + .client + .list_objects_v2() + .bucket(&bucket) + .start_after("Z") + .send() + .await + .unwrap(); + + assert_eq!(r.contents.unwrap().len(), 8); + assert!(r.common_prefixes.is_none()); + } + { + // With start_after after all keys + let r = ctx + .client + .list_objects_v2() + .bucket(&bucket) + .start_after("c") + .send() + .await + .unwrap(); + + assert!(r.contents.is_none()); + assert!(r.common_prefixes.is_none()); + } +} + +#[tokio::test] +async fn test_listobjectsv1() { + let ctx = common::context(); + let bucket = ctx.create_bucket("listobjects"); + + for k in KEYS { + ctx.client + .put_object() + .bucket(&bucket) + .key(k) + .send() + .await + .unwrap(); + } + + { + let r = ctx + .client + .list_objects() + .bucket(&bucket) + .send() + .await + .unwrap(); + + assert_eq!(r.contents.unwrap().len(), 8); + assert!(r.common_prefixes.is_none()); + } + + { + // With 2 elements + let r = ctx + .client + .list_objects() + 
.bucket(&bucket) + .max_keys(2) + .send() + .await + .unwrap(); + + assert_eq!(r.contents.unwrap().len(), 2); + assert!(r.common_prefixes.is_none()); + assert!(r.next_marker.is_some()); + } + + { + // With pagination + let mut cnt = 0; + let mut next = None; + let last_idx = KEYS.len() - 1; + + for i in 0..KEYS.len() { + let r = ctx + .client + .list_objects() + .bucket(&bucket) + .set_marker(next) + .max_keys(1) + .send() + .await + .unwrap(); + + cnt += 1; + next = r.next_marker; + + assert_eq!(r.contents.unwrap().len(), 1); + assert!(r.common_prefixes.is_none()); + if i != last_idx { + assert!(next.is_some()); + } + } + assert_eq!(cnt, KEYS.len()); + } + + { + // With a delimiter + let r = ctx + .client + .list_objects() + .bucket(&bucket) + .delimiter("/") + .send() + .await + .unwrap(); + + assert_eq!(r.contents.unwrap().len(), 3); + assert_eq!(r.common_prefixes.unwrap().len(), 1); + } + + { + // With a delimiter and pagination + let mut cnt_pfx = 0; + let mut cnt_key = 0; + let mut next = None; + + for _i in 0..KEYS.len() { + let r = ctx + .client + .list_objects() + .bucket(&bucket) + .delimiter("/") + .set_marker(next) + .max_keys(1) + .send() + .await + .unwrap(); + + next = r.next_marker; + match (r.contents, r.common_prefixes) { + (Some(k), None) if k.len() == 1 => cnt_key += 1, + (None, Some(pfx)) if pfx.len() == 1 => cnt_pfx += 1, + _ => unreachable!("logic error"), + }; + if next.is_none() { + break; + } + } + assert_eq!(cnt_key, 3); + // We have no optimization to skip the whole prefix + // on listobjectsv1 so we return the same one 5 times, + // for each element. It is up to the client to merge its result. + // This is compliant with AWS spec. + assert_eq!(cnt_pfx, 5); + } + + { + // With a prefix + let r = ctx + .client + .list_objects() + .bucket(&bucket) + .prefix("a/") + .send() + .await + .unwrap(); + + assert_eq!(r.contents.unwrap().len(), 5); + assert!(r.common_prefixes.is_none()); + } + + { + // With a prefix and a delimiter + let r = ctx + .client + .list_objects() + .bucket(&bucket) + .prefix("a/") + .delimiter("/") + .send() + .await + .unwrap(); + + assert_eq!(r.contents.unwrap().len(), 4); + assert_eq!(r.common_prefixes.unwrap().len(), 1); + } + + { + // With a prefix, a delimiter and max_key + let r = ctx + .client + .list_objects() + .bucket(&bucket) + .prefix("a/") + .delimiter("/") + .max_keys(1) + .send() + .await + .unwrap(); + + assert_eq!(r.contents.as_ref().unwrap().len(), 1); + assert_eq!( + r.contents + .unwrap() + .first() + .unwrap() + .key + .as_ref() + .unwrap() + .as_str(), + "a/a" + ); + assert!(r.common_prefixes.is_none()); + } + { + // With marker before all keys + let r = ctx + .client + .list_objects() + .bucket(&bucket) + .marker("Z") + .send() + .await + .unwrap(); + + assert_eq!(r.contents.unwrap().len(), 8); + assert!(r.common_prefixes.is_none()); + } + { + // With start_after after all keys + let r = ctx + .client + .list_objects() + .bucket(&bucket) + .marker("c") + .send() + .await + .unwrap(); + + assert!(r.contents.is_none()); + assert!(r.common_prefixes.is_none()); + } +} + +#[tokio::test] +async fn test_listmultipart() { + let ctx = common::context(); + let bucket = ctx.create_bucket("listmultipartuploads"); + + for k in KEYS_MULTIPART { + ctx.client + .create_multipart_upload() + .bucket(&bucket) + .key(k) + .send() + .await + .unwrap(); + } + + { + // Default + let r = ctx + .client + .list_multipart_uploads() + .bucket(&bucket) + .send() + .await + .unwrap(); + + assert_eq!(r.uploads.unwrap().len(), 5); + 
assert!(r.common_prefixes.is_none()); + } + { + // With pagination + let mut next = None; + let mut upnext = None; + let last_idx = KEYS_MULTIPART.len() - 1; + + for i in 0..KEYS_MULTIPART.len() { + let r = ctx + .client + .list_multipart_uploads() + .bucket(&bucket) + .set_key_marker(next) + .set_upload_id_marker(upnext) + .max_uploads(1) + .send() + .await + .unwrap(); + + next = r.next_key_marker; + upnext = r.next_upload_id_marker; + + assert_eq!(r.uploads.unwrap().len(), 1); + assert!(r.common_prefixes.is_none()); + if i != last_idx { + assert!(next.is_some()); + } + } + } + { + // With delimiter + let r = ctx + .client + .list_multipart_uploads() + .bucket(&bucket) + .delimiter("/") + .send() + .await + .unwrap(); + + assert_eq!(r.uploads.unwrap().len(), 3); + assert_eq!(r.common_prefixes.unwrap().len(), 1); + } + { + // With delimiter and pagination + let mut next = None; + let mut upnext = None; + let mut upcnt = 0; + let mut pfxcnt = 0; + let mut loopcnt = 0; + + while loopcnt < KEYS_MULTIPART.len() { + let r = ctx + .client + .list_multipart_uploads() + .bucket(&bucket) + .delimiter("/") + .max_uploads(1) + .set_key_marker(next) + .set_upload_id_marker(upnext) + .send() + .await + .unwrap(); + + next = r.next_key_marker; + upnext = r.next_upload_id_marker; + + loopcnt += 1; + upcnt += r.uploads.unwrap_or_default().len(); + pfxcnt += r.common_prefixes.unwrap_or_default().len(); + + if next.is_none() { + break; + } + } + + assert_eq!(upcnt + pfxcnt, loopcnt); + assert_eq!(upcnt, 3); + assert_eq!(pfxcnt, 1); + } + { + // With prefix + let r = ctx + .client + .list_multipart_uploads() + .bucket(&bucket) + .prefix("c") + .send() + .await + .unwrap(); + + assert_eq!(r.uploads.unwrap().len(), 3); + assert!(r.common_prefixes.is_none()); + } + { + // With prefix and delimiter + let r = ctx + .client + .list_multipart_uploads() + .bucket(&bucket) + .prefix("c") + .delimiter("/") + .send() + .await + .unwrap(); + + assert_eq!(r.uploads.unwrap().len(), 1); + assert_eq!(r.common_prefixes.unwrap().len(), 1); + } + { + // With prefix, delimiter and max keys + let r = ctx + .client + .list_multipart_uploads() + .bucket(&bucket) + .prefix("c") + .delimiter("/") + .max_uploads(1) + .send() + .await + .unwrap(); + + assert_eq!(r.uploads.unwrap().len(), 1); + assert!(r.common_prefixes.is_none()); + } + { + // With starting token before the first element + let r = ctx + .client + .list_multipart_uploads() + .bucket(&bucket) + .key_marker("ZZZZZ") + .send() + .await + .unwrap(); + + assert_eq!(r.uploads.unwrap().len(), 5); + assert!(r.common_prefixes.is_none()); + } + { + // With starting token after the last element + let r = ctx + .client + .list_multipart_uploads() + .bucket(&bucket) + .key_marker("d") + .send() + .await + .unwrap(); + + assert!(r.uploads.is_none()); + assert!(r.common_prefixes.is_none()); + } +} diff --git a/src/garage/tests/s3/mod.rs b/src/garage/tests/s3/mod.rs new file mode 100644 index 00000000..623eb665 --- /dev/null +++ b/src/garage/tests/s3/mod.rs @@ -0,0 +1,6 @@ +mod list; +mod multipart; +mod objects; +mod simple; +mod streaming_signature; +mod website; diff --git a/src/garage/tests/s3/multipart.rs b/src/garage/tests/s3/multipart.rs new file mode 100644 index 00000000..895a2993 --- /dev/null +++ b/src/garage/tests/s3/multipart.rs @@ -0,0 +1,415 @@ +use crate::common; +use aws_sdk_s3::model::{CompletedMultipartUpload, CompletedPart}; +use aws_sdk_s3::types::ByteStream; + +const SZ_5MB: usize = 5 * 1024 * 1024; +const SZ_10MB: usize = 10 * 1024 * 1024; + +#[tokio::test] 
+async fn test_uploadlistpart() { + let ctx = common::context(); + let bucket = ctx.create_bucket("uploadpart"); + + let u1 = vec![0xee; SZ_5MB]; + let u2 = vec![0x11; SZ_5MB]; + + let up = ctx + .client + .create_multipart_upload() + .bucket(&bucket) + .key("a") + .send() + .await + .unwrap(); + let uid = up.upload_id.as_ref().unwrap(); + + assert!(up.upload_id.is_some()); + + { + let r = ctx + .client + .list_parts() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .send() + .await + .unwrap(); + + assert!(r.parts.is_none()); + } + + let p1 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number(2) + .body(ByteStream::from(u1)) + .send() + .await + .unwrap(); + + { + // ListPart on 1st element + let r = ctx + .client + .list_parts() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .send() + .await + .unwrap(); + + let ps = r.parts.unwrap(); + assert_eq!(ps.len(), 1); + let fp = ps.iter().find(|x| x.part_number == 2).unwrap(); + assert!(fp.last_modified.is_some()); + assert_eq!( + fp.e_tag.as_ref().unwrap(), + "\"3366bb9dcf710d6801b5926467d02e19\"" + ); + assert_eq!(fp.size, SZ_5MB as i64); + } + + let p2 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .part_number(1) + .body(ByteStream::from(u2)) + .send() + .await + .unwrap(); + + { + // ListPart on the 2 elements + let r = ctx + .client + .list_parts() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .send() + .await + .unwrap(); + + let ps = r.parts.unwrap(); + assert_eq!(ps.len(), 2); + let fp = ps.iter().find(|x| x.part_number == 1).unwrap(); + assert!(fp.last_modified.is_some()); + assert_eq!( + fp.e_tag.as_ref().unwrap(), + "\"3c484266f9315485694556e6c693bfa2\"" + ); + assert_eq!(fp.size, SZ_5MB as i64); + } + + { + // Call pagination + let r = ctx + .client + .list_parts() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .max_parts(1) + .send() + .await + .unwrap(); + + assert!(r.part_number_marker.is_none()); + assert!(r.next_part_number_marker.is_some()); + assert_eq!(r.max_parts, 1_i32); + assert!(r.is_truncated); + assert_eq!(r.key.unwrap(), "a"); + assert_eq!(r.upload_id.unwrap().as_str(), uid.as_str()); + assert_eq!(r.parts.unwrap().len(), 1); + + let r2 = ctx + .client + .list_parts() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .max_parts(1) + .part_number_marker(r.next_part_number_marker.as_ref().unwrap()) + .send() + .await + .unwrap(); + + assert_eq!( + r2.part_number_marker.as_ref().unwrap(), + r.next_part_number_marker.as_ref().unwrap() + ); + assert_eq!(r2.max_parts, 1_i32); + assert!(r2.is_truncated); + assert_eq!(r2.key.unwrap(), "a"); + assert_eq!(r2.upload_id.unwrap().as_str(), uid.as_str()); + assert_eq!(r2.parts.unwrap().len(), 1); + } + + let cmp = CompletedMultipartUpload::builder() + .parts( + CompletedPart::builder() + .part_number(1) + .e_tag(p2.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(2) + .e_tag(p1.e_tag.unwrap()) + .build(), + ) + .build(); + + ctx.client + .complete_multipart_upload() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .multipart_upload(cmp) + .send() + .await + .unwrap(); + + // The multipart upload must not appear anymore + assert!(ctx + .client + .list_parts() + .bucket(&bucket) + .key("a") + .upload_id(uid) + .send() + .await + .is_err()); + + { + // The object must appear as a regular object + let r = ctx + .client + .head_object() + .bucket(&bucket) + .key("a") + .send() + .await + .unwrap(); + + assert_eq!(r.content_length, (SZ_5MB * 2) as i64); + 
} +} + +#[tokio::test] +async fn test_uploadpartcopy() { + let ctx = common::context(); + let bucket = ctx.create_bucket("uploadpartcopy"); + + let u1 = vec![0x11; SZ_10MB]; + let u2 = vec![0x22; SZ_5MB]; + let u3 = vec![0x33; SZ_5MB]; + let u4 = vec![0x44; SZ_5MB]; + let u5 = vec![0x55; SZ_5MB]; + + let overflow = 5500000 - SZ_5MB; + let mut exp_obj = u3.clone(); + exp_obj.extend(&u4[500..]); + exp_obj.extend(&u5[..overflow + 1]); + exp_obj.extend(&u2); + exp_obj.extend(&u1[500..5500000 + 1]); + + // (setup) Upload a single part object + ctx.client + .put_object() + .bucket(&bucket) + .key("source1") + .body(ByteStream::from(u1)) + .send() + .await + .unwrap(); + + // (setup) Upload a multipart object with 2 parts + { + let up = ctx + .client + .create_multipart_upload() + .bucket(&bucket) + .key("source2") + .send() + .await + .unwrap(); + let uid = up.upload_id.as_ref().unwrap(); + + let p1 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("source2") + .upload_id(uid) + .part_number(1) + .body(ByteStream::from(u4)) + .send() + .await + .unwrap(); + + let p2 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("source2") + .upload_id(uid) + .part_number(2) + .body(ByteStream::from(u5)) + .send() + .await + .unwrap(); + + let cmp = CompletedMultipartUpload::builder() + .parts( + CompletedPart::builder() + .part_number(1) + .e_tag(p1.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(2) + .e_tag(p2.e_tag.unwrap()) + .build(), + ) + .build(); + + ctx.client + .complete_multipart_upload() + .bucket(&bucket) + .key("source2") + .upload_id(uid) + .multipart_upload(cmp) + .send() + .await + .unwrap(); + } + + // Our multipart object that does copy + let up = ctx + .client + .create_multipart_upload() + .bucket(&bucket) + .key("target") + .send() + .await + .unwrap(); + let uid = up.upload_id.as_ref().unwrap(); + + let p3 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(3) + .body(ByteStream::from(u2)) + .send() + .await + .unwrap(); + + let p1 = ctx + .client + .upload_part() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(1) + .body(ByteStream::from(u3)) + .send() + .await + .unwrap(); + + let p2 = ctx + .client + .upload_part_copy() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(2) + .copy_source("uploadpartcopy/source2") + .copy_source_range("bytes=500-5500000") + .send() + .await + .unwrap(); + + let p4 = ctx + .client + .upload_part_copy() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .part_number(4) + .copy_source("uploadpartcopy/source1") + .copy_source_range("bytes=500-5500000") + .send() + .await + .unwrap(); + + let cmp = CompletedMultipartUpload::builder() + .parts( + CompletedPart::builder() + .part_number(1) + .e_tag(p1.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(2) + .e_tag(p2.copy_part_result.unwrap().e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(3) + .e_tag(p3.e_tag.unwrap()) + .build(), + ) + .parts( + CompletedPart::builder() + .part_number(4) + .e_tag(p4.copy_part_result.unwrap().e_tag.unwrap()) + .build(), + ) + .build(); + + ctx.client + .complete_multipart_upload() + .bucket(&bucket) + .key("target") + .upload_id(uid) + .multipart_upload(cmp) + .send() + .await + .unwrap(); + + // (check) Get object + + let obj = ctx + .client + .get_object() + .bucket(&bucket) + .key("target") + .send() + .await + .unwrap(); + + let real_obj = obj + .body + 
.collect() + .await + .expect("Error reading data") + .into_bytes(); + + assert_eq!(real_obj.len(), exp_obj.len()); + assert_eq!(real_obj, exp_obj); +} diff --git a/src/garage/tests/s3/objects.rs b/src/garage/tests/s3/objects.rs new file mode 100644 index 00000000..e1175b81 --- /dev/null +++ b/src/garage/tests/s3/objects.rs @@ -0,0 +1,266 @@ +use crate::common; +use aws_sdk_s3::model::{Delete, ObjectIdentifier}; +use aws_sdk_s3::types::ByteStream; + +const STD_KEY: &str = "hello world"; +const CTRL_KEY: &str = "\x00\x01\x02\x00"; +const UTF8_KEY: &str = "\u{211D}\u{1F923}\u{1F44B}"; +const BODY: &[u8; 62] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + +#[tokio::test] +async fn test_putobject() { + let ctx = common::context(); + let bucket = ctx.create_bucket("putobject"); + + { + // Send an empty object (can serve as a directory marker) + // with a content type + let etag = "\"d41d8cd98f00b204e9800998ecf8427e\""; + let content_type = "text/csv"; + let r = ctx + .client + .put_object() + .bucket(&bucket) + .key(STD_KEY) + .content_type(content_type) + .send() + .await + .unwrap(); + + assert_eq!(r.e_tag.unwrap().as_str(), etag); + // We return a version ID here + // We should check if Amazon is returning one when versioning is not enabled + assert!(r.version_id.is_some()); + + let _version = r.version_id.unwrap(); + + let o = ctx + .client + .get_object() + .bucket(&bucket) + .key(STD_KEY) + .send() + .await + .unwrap(); + + assert_bytes_eq!(o.body, b""); + assert_eq!(o.e_tag.unwrap(), etag); + // We do not return version ID + // We should check if Amazon is returning one when versioning is not enabled + // assert_eq!(o.version_id.unwrap(), _version); + assert_eq!(o.content_type.unwrap(), content_type); + assert!(o.last_modified.is_some()); + assert_eq!(o.content_length, 0); + assert_eq!(o.parts_count, 0); + assert_eq!(o.tag_count, 0); + } + + { + // Key with control characters, + // no content type and some data + let etag = "\"49f68a5c8493ec2c0bf489821c21fc3b\""; + let data = ByteStream::from_static(b"hi"); + + let r = ctx + .client + .put_object() + .bucket(&bucket) + .key(CTRL_KEY) + .body(data) + .send() + .await + .unwrap(); + + assert_eq!(r.e_tag.unwrap().as_str(), etag); + assert!(r.version_id.is_some()); + + let o = ctx + .client + .get_object() + .bucket(&bucket) + .key(CTRL_KEY) + .send() + .await + .unwrap(); + + assert_bytes_eq!(o.body, b"hi"); + assert_eq!(o.e_tag.unwrap(), etag); + assert!(o.last_modified.is_some()); + assert_eq!(o.content_length, 2); + assert_eq!(o.parts_count, 0); + assert_eq!(o.tag_count, 0); + } + + { + // Key with UTF8 codepoints including emoji + let etag = "\"d41d8cd98f00b204e9800998ecf8427e\""; + + let r = ctx + .client + .put_object() + .bucket(&bucket) + .key(UTF8_KEY) + .send() + .await + .unwrap(); + + assert_eq!(r.e_tag.unwrap().as_str(), etag); + assert!(r.version_id.is_some()); + + let o = ctx + .client + .get_object() + .bucket(&bucket) + .key(UTF8_KEY) + .send() + .await + .unwrap(); + + assert_bytes_eq!(o.body, b""); + assert_eq!(o.e_tag.unwrap(), etag); + assert!(o.last_modified.is_some()); + assert_eq!(o.content_length, 0); + assert_eq!(o.parts_count, 0); + assert_eq!(o.tag_count, 0); + } +} + +#[tokio::test] +async fn test_getobject() { + let ctx = common::context(); + let bucket = ctx.create_bucket("getobject"); + + let etag = "\"46cf18a9b447991b450cad3facf5937e\""; + let data = ByteStream::from_static(BODY); + + let r = ctx + .client + .put_object() + .bucket(&bucket) + .key(STD_KEY) + .body(data) + .send() + 
.await + .unwrap(); + + assert_eq!(r.e_tag.unwrap().as_str(), etag); + + { + let o = ctx + .client + .get_object() + .bucket(&bucket) + .key(STD_KEY) + .range("bytes=1-9") + .send() + .await + .unwrap(); + + assert_eq!(o.content_range.unwrap().as_str(), "bytes 1-9/62"); + assert_bytes_eq!(o.body, &BODY[1..10]); + } + { + let o = ctx + .client + .get_object() + .bucket(&bucket) + .key(STD_KEY) + .range("bytes=9-") + .send() + .await + .unwrap(); + assert_eq!(o.content_range.unwrap().as_str(), "bytes 9-61/62"); + assert_bytes_eq!(o.body, &BODY[9..]); + } + { + let o = ctx + .client + .get_object() + .bucket(&bucket) + .key(STD_KEY) + .range("bytes=-5") + .send() + .await + .unwrap(); + assert_eq!(o.content_range.unwrap().as_str(), "bytes 57-61/62"); + assert_bytes_eq!(o.body, &BODY[57..]); + } +} + +#[tokio::test] +async fn test_deleteobject() { + let ctx = common::context(); + let bucket = ctx.create_bucket("deleteobject"); + + let mut to_del = Delete::builder(); + + // add content without data + for i in 0..5 { + let k = format!("k-{}", i); + ctx.client + .put_object() + .bucket(&bucket) + .key(k.to_string()) + .send() + .await + .unwrap(); + if i > 0 { + to_del = to_del.objects(ObjectIdentifier::builder().key(k).build()); + } + } + + // add content with data + for i in 0..5 { + let k = format!("l-{}", i); + let data = ByteStream::from_static(BODY); + ctx.client + .put_object() + .bucket(&bucket) + .key(k.to_string()) + .body(data) + .send() + .await + .unwrap(); + + if i > 0 { + to_del = to_del.objects(ObjectIdentifier::builder().key(k).build()); + } + } + + ctx.client + .delete_object() + .bucket(&bucket) + .key("k-0") + .send() + .await + .unwrap(); + + ctx.client + .delete_object() + .bucket(&bucket) + .key("l-0") + .send() + .await + .unwrap(); + + let r = ctx + .client + .delete_objects() + .bucket(&bucket) + .delete(to_del.build()) + .send() + .await + .unwrap(); + + assert_eq!(r.deleted.unwrap().len(), 8); + + let l = ctx + .client + .list_objects_v2() + .bucket(&bucket) + .send() + .await + .unwrap(); + + assert!(l.contents.is_none()); +} diff --git a/src/garage/tests/s3/simple.rs b/src/garage/tests/s3/simple.rs new file mode 100644 index 00000000..f54ae9ac --- /dev/null +++ b/src/garage/tests/s3/simple.rs @@ -0,0 +1,31 @@ +use crate::common; + +#[tokio::test] +async fn test_simple() { + use aws_sdk_s3::types::ByteStream; + + let ctx = common::context(); + let bucket = ctx.create_bucket("test-simple"); + + let data = ByteStream::from_static(b"Hello world!"); + + ctx.client + .put_object() + .bucket(&bucket) + .key("test") + .body(data) + .send() + .await + .unwrap(); + + let res = ctx + .client + .get_object() + .bucket(&bucket) + .key("test") + .send() + .await + .unwrap(); + + assert_bytes_eq!(res.body, b"Hello world!"); +} diff --git a/src/garage/tests/s3/streaming_signature.rs b/src/garage/tests/s3/streaming_signature.rs new file mode 100644 index 00000000..c68f7dfc --- /dev/null +++ b/src/garage/tests/s3/streaming_signature.rs @@ -0,0 +1,185 @@ +use std::collections::HashMap; + +use crate::common; +use common::custom_requester::BodySignature; +use hyper::Method; + +const STD_KEY: &str = "hello-world"; +//const CTRL_KEY: &str = "\x00\x01\x02\x00"; +const BODY: &[u8; 62] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + +#[tokio::test] +async fn test_putobject_streaming() { + let ctx = common::context(); + let bucket = ctx.create_bucket("putobject-streaming"); + + { + // Send an empty object (can serve as a directory marker) + // with a content type + let 
etag = "\"d41d8cd98f00b204e9800998ecf8427e\""; + let content_type = "text/csv"; + let mut headers = HashMap::new(); + headers.insert("content-type".to_owned(), content_type.to_owned()); + let _ = ctx + .custom_request + .builder(bucket.clone()) + .method(Method::PUT) + .path(STD_KEY.to_owned()) + .unsigned_headers(headers) + .vhost_style(true) + .body(vec![]) + .body_signature(BodySignature::Streaming(10)) + .send() + .await + .unwrap(); + + // assert_eq!(r.e_tag.unwrap().as_str(), etag); + // We return a version ID here + // We should check if Amazon is returning one when versioning is not enabled + // assert!(r.version_id.is_some()); + + //let _version = r.version_id.unwrap(); + + let o = ctx + .client + .get_object() + .bucket(&bucket) + .key(STD_KEY) + .send() + .await + .unwrap(); + + assert_bytes_eq!(o.body, b""); + assert_eq!(o.e_tag.unwrap(), etag); + // We do not return version ID + // We should check if Amazon is returning one when versioning is not enabled + // assert_eq!(o.version_id.unwrap(), _version); + assert_eq!(o.content_type.unwrap(), content_type); + assert!(o.last_modified.is_some()); + assert_eq!(o.content_length, 0); + assert_eq!(o.parts_count, 0); + assert_eq!(o.tag_count, 0); + } + + { + let etag = "\"46cf18a9b447991b450cad3facf5937e\""; + + let _ = ctx + .custom_request + .builder(bucket.clone()) + .method(Method::PUT) + //.path(CTRL_KEY.to_owned()) at the moment custom_request does not encode url so this + //fail + .path("abc".to_owned()) + .vhost_style(true) + .body(BODY.to_vec()) + .body_signature(BodySignature::Streaming(16)) + .send() + .await + .unwrap(); + + // assert_eq!(r.e_tag.unwrap().as_str(), etag); + // assert!(r.version_id.is_some()); + + let o = ctx + .client + .get_object() + .bucket(&bucket) + //.key(CTRL_KEY) + .key("abc") + .send() + .await + .unwrap(); + + assert_bytes_eq!(o.body, BODY); + assert_eq!(o.e_tag.unwrap(), etag); + assert!(o.last_modified.is_some()); + assert_eq!(o.content_length, 62); + assert_eq!(o.parts_count, 0); + assert_eq!(o.tag_count, 0); + } +} + +#[tokio::test] +async fn test_create_bucket_streaming() { + let ctx = common::context(); + let bucket = "createbucket-streaming"; + + { + // create bucket + let _ = ctx + .custom_request + .builder(bucket.to_owned()) + .method(Method::PUT) + .body_signature(BodySignature::Streaming(10)) + .send() + .await + .unwrap(); + + // test if the bucket exists and works properly + let etag = "\"d41d8cd98f00b204e9800998ecf8427e\""; + let content_type = "text/csv"; + let _ = ctx + .client + .put_object() + .bucket(bucket) + .key(STD_KEY) + .content_type(content_type) + .send() + .await + .unwrap(); + + let o = ctx + .client + .get_object() + .bucket(bucket) + .key(STD_KEY) + .send() + .await + .unwrap(); + + assert_eq!(o.e_tag.unwrap(), etag); + } +} + +#[tokio::test] +async fn test_put_website_streaming() { + let ctx = common::context(); + let bucket = ctx.create_bucket("putwebsite-streaming"); + + { + let website_config = r#" + + + err/error.html + + + home.html + +"#; + + let mut query = HashMap::new(); + query.insert("website".to_owned(), None); + let _ = ctx + .custom_request + .builder(bucket.clone()) + .method(Method::PUT) + .query_params(query) + .body(website_config.as_bytes().to_vec()) + .body_signature(BodySignature::Streaming(10)) + .send() + .await + .unwrap(); + + let o = ctx + .client + .get_bucket_website() + .bucket(&bucket) + .send() + .await + .unwrap(); + + assert_eq!(o.index_document.unwrap().suffix.unwrap(), "home.html"); + 
assert_eq!(o.error_document.unwrap().key.unwrap(), "err/error.html"); + } +} diff --git a/src/garage/tests/s3/website.rs b/src/garage/tests/s3/website.rs new file mode 100644 index 00000000..0570ac6a --- /dev/null +++ b/src/garage/tests/s3/website.rs @@ -0,0 +1,324 @@ +use crate::common; +use crate::common::ext::*; +use aws_sdk_s3::{ + model::{CorsConfiguration, CorsRule, ErrorDocument, IndexDocument, WebsiteConfiguration}, + types::ByteStream, +}; +use http::Request; +use hyper::{ + body::{to_bytes, Body}, + Client, +}; + +const BODY: &[u8; 16] = b"
<h1>bonjour</h1>
"; +const BODY_ERR: &[u8; 6] = b"erreur"; + +#[tokio::test] +async fn test_website() { + const BCKT_NAME: &str = "my-website"; + let ctx = common::context(); + let bucket = ctx.create_bucket(BCKT_NAME); + + let data = ByteStream::from_static(BODY); + + ctx.client + .put_object() + .bucket(&bucket) + .key("index.html") + .body(data) + .send() + .await + .unwrap(); + + let client = Client::new(); + + let req = || { + Request::builder() + .method("GET") + .uri(format!("http://127.0.0.1:{}/", ctx.garage.web_port)) + .header("Host", format!("{}.web.garage", BCKT_NAME)) + .body(Body::empty()) + .unwrap() + }; + + let mut resp = client.request(req()).await.unwrap(); + + assert_eq!(resp.status(), 404); + assert_ne!( + to_bytes(resp.body_mut()).await.unwrap().as_ref(), + BODY.as_ref() + ); /* check that we do not leak body */ + + ctx.garage + .command() + .args(["bucket", "website", "--allow", BCKT_NAME]) + .quiet() + .expect_success_status("Could not allow website on bucket"); + + resp = client.request(req()).await.unwrap(); + assert_eq!(resp.status(), 200); + assert_eq!( + to_bytes(resp.body_mut()).await.unwrap().as_ref(), + BODY.as_ref() + ); + + ctx.garage + .command() + .args(["bucket", "website", "--deny", BCKT_NAME]) + .quiet() + .expect_success_status("Could not deny website on bucket"); + + resp = client.request(req()).await.unwrap(); + assert_eq!(resp.status(), 404); + assert_ne!( + to_bytes(resp.body_mut()).await.unwrap().as_ref(), + BODY.as_ref() + ); /* check that we do not leak body */ +} + +#[tokio::test] +async fn test_website_s3_api() { + const BCKT_NAME: &str = "my-cors"; + let ctx = common::context(); + let bucket = ctx.create_bucket(BCKT_NAME); + + let data = ByteStream::from_static(BODY); + + ctx.client + .put_object() + .bucket(&bucket) + .key("site/home.html") + .body(data) + .send() + .await + .unwrap(); + + ctx.client + .put_object() + .bucket(&bucket) + .key("err/error.html") + .body(ByteStream::from_static(BODY_ERR)) + .send() + .await + .unwrap(); + + let conf = WebsiteConfiguration::builder() + .index_document(IndexDocument::builder().suffix("home.html").build()) + .error_document(ErrorDocument::builder().key("err/error.html").build()) + .build(); + + ctx.client + .put_bucket_website() + .bucket(&bucket) + .website_configuration(conf) + .send() + .await + .unwrap(); + + let cors = CorsConfiguration::builder() + .cors_rules( + CorsRule::builder() + .id("main-rule") + .allowed_headers("*") + .allowed_methods("GET") + .allowed_methods("PUT") + .allowed_origins("*") + .build(), + ) + .build(); + + ctx.client + .put_bucket_cors() + .bucket(&bucket) + .cors_configuration(cors) + .send() + .await + .unwrap(); + + { + let cors_res = ctx + .client + .get_bucket_cors() + .bucket(&bucket) + .send() + .await + .unwrap(); + + let main_rule = cors_res.cors_rules().unwrap().iter().next().unwrap(); + + assert_eq!(main_rule.id.as_ref().unwrap(), "main-rule"); + assert_eq!( + main_rule.allowed_headers.as_ref().unwrap(), + &vec!["*".to_string()] + ); + assert_eq!( + main_rule.allowed_origins.as_ref().unwrap(), + &vec!["*".to_string()] + ); + assert_eq!( + main_rule.allowed_methods.as_ref().unwrap(), + &vec!["GET".to_string(), "PUT".to_string()] + ); + } + + let client = Client::new(); + + // Test direct requests with CORS + { + let req = Request::builder() + .method("GET") + .uri(format!("http://127.0.0.1:{}/site/", ctx.garage.web_port)) + .header("Host", format!("{}.web.garage", BCKT_NAME)) + .header("Origin", "https://example.com") + .body(Body::empty()) + .unwrap(); + + let mut resp = 
client.request(req).await.unwrap(); + + assert_eq!(resp.status(), 200); + assert_eq!( + resp.headers().get("access-control-allow-origin").unwrap(), + "*" + ); + assert_eq!( + to_bytes(resp.body_mut()).await.unwrap().as_ref(), + BODY.as_ref() + ); + } + + // Test ErrorDocument on 404 + { + let req = Request::builder() + .method("GET") + .uri(format!( + "http://127.0.0.1:{}/wrong.html", + ctx.garage.web_port + )) + .header("Host", format!("{}.web.garage", BCKT_NAME)) + .body(Body::empty()) + .unwrap(); + + let mut resp = client.request(req).await.unwrap(); + + assert_eq!(resp.status(), 404); + assert_eq!( + to_bytes(resp.body_mut()).await.unwrap().as_ref(), + BODY_ERR.as_ref() + ); + } + + // Test CORS with an allowed preflight request + { + let req = Request::builder() + .method("OPTIONS") + .uri(format!("http://127.0.0.1:{}/site/", ctx.garage.web_port)) + .header("Host", format!("{}.web.garage", BCKT_NAME)) + .header("Origin", "https://example.com") + .header("Access-Control-Request-Method", "PUT") + .body(Body::empty()) + .unwrap(); + + let mut resp = client.request(req).await.unwrap(); + + assert_eq!(resp.status(), 200); + assert_eq!( + resp.headers().get("access-control-allow-origin").unwrap(), + "*" + ); + assert_ne!( + to_bytes(resp.body_mut()).await.unwrap().as_ref(), + BODY.as_ref() + ); + } + + // Test CORS with a forbidden preflight request + { + let req = Request::builder() + .method("OPTIONS") + .uri(format!("http://127.0.0.1:{}/site/", ctx.garage.web_port)) + .header("Host", format!("{}.web.garage", BCKT_NAME)) + .header("Origin", "https://example.com") + .header("Access-Control-Request-Method", "DELETE") + .body(Body::empty()) + .unwrap(); + + let mut resp = client.request(req).await.unwrap(); + + assert_eq!(resp.status(), 403); + assert_ne!( + to_bytes(resp.body_mut()).await.unwrap().as_ref(), + BODY.as_ref() + ); + } + + //@TODO test CORS on the S3 endpoint. We need to handle auth manually to check it. + + // Delete cors + ctx.client + .delete_bucket_cors() + .bucket(&bucket) + .send() + .await + .unwrap(); + + // Check CORS are deleted from the API + // @FIXME check what is the expected behavior when GetBucketCors is called on a bucket without + // any CORS. 
+ assert!(ctx + .client + .get_bucket_cors() + .bucket(&bucket) + .send() + .await + .is_err()); + + // Test CORS are not sent anymore on a previously allowed request + { + let req = Request::builder() + .method("OPTIONS") + .uri(format!("http://127.0.0.1:{}/site/", ctx.garage.web_port)) + .header("Host", format!("{}.web.garage", BCKT_NAME)) + .header("Origin", "https://example.com") + .header("Access-Control-Request-Method", "PUT") + .body(Body::empty()) + .unwrap(); + + let mut resp = client.request(req).await.unwrap(); + + assert_eq!(resp.status(), 403); + assert_ne!( + to_bytes(resp.body_mut()).await.unwrap().as_ref(), + BODY.as_ref() + ); + } + + // Disallow website from the API + ctx.client + .delete_bucket_website() + .bucket(&bucket) + .send() + .await + .unwrap(); + + // Check that the website is not served anymore + { + let req = Request::builder() + .method("GET") + .uri(format!("http://127.0.0.1:{}/site/", ctx.garage.web_port)) + .header("Host", format!("{}.web.garage", BCKT_NAME)) + .body(Body::empty()) + .unwrap(); + + let mut resp = client.request(req).await.unwrap(); + + assert_eq!(resp.status(), 404); + assert_ne!( + to_bytes(resp.body_mut()).await.unwrap().as_ref(), + BODY_ERR.as_ref() + ); + assert_ne!( + to_bytes(resp.body_mut()).await.unwrap().as_ref(), + BODY.as_ref() + ); + } +} diff --git a/src/garage/tests/simple.rs b/src/garage/tests/simple.rs deleted file mode 100644 index f54ae9ac..00000000 --- a/src/garage/tests/simple.rs +++ /dev/null @@ -1,31 +0,0 @@ -use crate::common; - -#[tokio::test] -async fn test_simple() { - use aws_sdk_s3::types::ByteStream; - - let ctx = common::context(); - let bucket = ctx.create_bucket("test-simple"); - - let data = ByteStream::from_static(b"Hello world!"); - - ctx.client - .put_object() - .bucket(&bucket) - .key("test") - .body(data) - .send() - .await - .unwrap(); - - let res = ctx - .client - .get_object() - .bucket(&bucket) - .key("test") - .send() - .await - .unwrap(); - - assert_bytes_eq!(res.body, b"Hello world!"); -} diff --git a/src/garage/tests/streaming_signature.rs b/src/garage/tests/streaming_signature.rs deleted file mode 100644 index c68f7dfc..00000000 --- a/src/garage/tests/streaming_signature.rs +++ /dev/null @@ -1,185 +0,0 @@ -use std::collections::HashMap; - -use crate::common; -use common::custom_requester::BodySignature; -use hyper::Method; - -const STD_KEY: &str = "hello-world"; -//const CTRL_KEY: &str = "\x00\x01\x02\x00"; -const BODY: &[u8; 62] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - -#[tokio::test] -async fn test_putobject_streaming() { - let ctx = common::context(); - let bucket = ctx.create_bucket("putobject-streaming"); - - { - // Send an empty object (can serve as a directory marker) - // with a content type - let etag = "\"d41d8cd98f00b204e9800998ecf8427e\""; - let content_type = "text/csv"; - let mut headers = HashMap::new(); - headers.insert("content-type".to_owned(), content_type.to_owned()); - let _ = ctx - .custom_request - .builder(bucket.clone()) - .method(Method::PUT) - .path(STD_KEY.to_owned()) - .unsigned_headers(headers) - .vhost_style(true) - .body(vec![]) - .body_signature(BodySignature::Streaming(10)) - .send() - .await - .unwrap(); - - // assert_eq!(r.e_tag.unwrap().as_str(), etag); - // We return a version ID here - // We should check if Amazon is returning one when versioning is not enabled - // assert!(r.version_id.is_some()); - - //let _version = r.version_id.unwrap(); - - let o = ctx - .client - .get_object() - .bucket(&bucket) - .key(STD_KEY) - 
.send() - .await - .unwrap(); - - assert_bytes_eq!(o.body, b""); - assert_eq!(o.e_tag.unwrap(), etag); - // We do not return version ID - // We should check if Amazon is returning one when versioning is not enabled - // assert_eq!(o.version_id.unwrap(), _version); - assert_eq!(o.content_type.unwrap(), content_type); - assert!(o.last_modified.is_some()); - assert_eq!(o.content_length, 0); - assert_eq!(o.parts_count, 0); - assert_eq!(o.tag_count, 0); - } - - { - let etag = "\"46cf18a9b447991b450cad3facf5937e\""; - - let _ = ctx - .custom_request - .builder(bucket.clone()) - .method(Method::PUT) - //.path(CTRL_KEY.to_owned()) at the moment custom_request does not encode url so this - //fail - .path("abc".to_owned()) - .vhost_style(true) - .body(BODY.to_vec()) - .body_signature(BodySignature::Streaming(16)) - .send() - .await - .unwrap(); - - // assert_eq!(r.e_tag.unwrap().as_str(), etag); - // assert!(r.version_id.is_some()); - - let o = ctx - .client - .get_object() - .bucket(&bucket) - //.key(CTRL_KEY) - .key("abc") - .send() - .await - .unwrap(); - - assert_bytes_eq!(o.body, BODY); - assert_eq!(o.e_tag.unwrap(), etag); - assert!(o.last_modified.is_some()); - assert_eq!(o.content_length, 62); - assert_eq!(o.parts_count, 0); - assert_eq!(o.tag_count, 0); - } -} - -#[tokio::test] -async fn test_create_bucket_streaming() { - let ctx = common::context(); - let bucket = "createbucket-streaming"; - - { - // create bucket - let _ = ctx - .custom_request - .builder(bucket.to_owned()) - .method(Method::PUT) - .body_signature(BodySignature::Streaming(10)) - .send() - .await - .unwrap(); - - // test if the bucket exists and works properly - let etag = "\"d41d8cd98f00b204e9800998ecf8427e\""; - let content_type = "text/csv"; - let _ = ctx - .client - .put_object() - .bucket(bucket) - .key(STD_KEY) - .content_type(content_type) - .send() - .await - .unwrap(); - - let o = ctx - .client - .get_object() - .bucket(bucket) - .key(STD_KEY) - .send() - .await - .unwrap(); - - assert_eq!(o.e_tag.unwrap(), etag); - } -} - -#[tokio::test] -async fn test_put_website_streaming() { - let ctx = common::context(); - let bucket = ctx.create_bucket("putwebsite-streaming"); - - { - let website_config = r#" - - - err/error.html - - - home.html - -"#; - - let mut query = HashMap::new(); - query.insert("website".to_owned(), None); - let _ = ctx - .custom_request - .builder(bucket.clone()) - .method(Method::PUT) - .query_params(query) - .body(website_config.as_bytes().to_vec()) - .body_signature(BodySignature::Streaming(10)) - .send() - .await - .unwrap(); - - let o = ctx - .client - .get_bucket_website() - .bucket(&bucket) - .send() - .await - .unwrap(); - - assert_eq!(o.index_document.unwrap().suffix.unwrap(), "home.html"); - assert_eq!(o.error_document.unwrap().key.unwrap(), "err/error.html"); - } -} diff --git a/src/garage/tests/website.rs b/src/garage/tests/website.rs deleted file mode 100644 index 963d11ea..00000000 --- a/src/garage/tests/website.rs +++ /dev/null @@ -1,342 +0,0 @@ -use crate::common; -use crate::common::ext::*; -use aws_sdk_s3::{ - model::{CorsConfiguration, CorsRule, ErrorDocument, IndexDocument, WebsiteConfiguration}, - types::ByteStream, -}; -use http::Request; -use hyper::{ - body::{to_bytes, Body}, - Client, -}; - -const BODY: &[u8; 16] = b"
<h1>bonjour</h1>
"; -const BODY_ERR: &[u8; 6] = b"erreur"; - -#[tokio::test] -async fn test_website() { - const BCKT_NAME: &str = "my-website"; - let ctx = common::context(); - let bucket = ctx.create_bucket(BCKT_NAME); - - let data = ByteStream::from_static(BODY); - - ctx.client - .put_object() - .bucket(&bucket) - .key("index.html") - .body(data) - .send() - .await - .unwrap(); - - let client = Client::new(); - - let req = || { - Request::builder() - .method("GET") - .uri(format!( - "http://127.0.0.1:{}/", - common::garage::DEFAULT_PORT + 2 - )) - .header("Host", format!("{}.web.garage", BCKT_NAME)) - .body(Body::empty()) - .unwrap() - }; - - let mut resp = client.request(req()).await.unwrap(); - - assert_eq!(resp.status(), 404); - assert_ne!( - to_bytes(resp.body_mut()).await.unwrap().as_ref(), - BODY.as_ref() - ); /* check that we do not leak body */ - - ctx.garage - .command() - .args(["bucket", "website", "--allow", BCKT_NAME]) - .quiet() - .expect_success_status("Could not allow website on bucket"); - - resp = client.request(req()).await.unwrap(); - assert_eq!(resp.status(), 200); - assert_eq!( - to_bytes(resp.body_mut()).await.unwrap().as_ref(), - BODY.as_ref() - ); - - ctx.garage - .command() - .args(["bucket", "website", "--deny", BCKT_NAME]) - .quiet() - .expect_success_status("Could not deny website on bucket"); - - resp = client.request(req()).await.unwrap(); - assert_eq!(resp.status(), 404); - assert_ne!( - to_bytes(resp.body_mut()).await.unwrap().as_ref(), - BODY.as_ref() - ); /* check that we do not leak body */ -} - -#[tokio::test] -async fn test_website_s3_api() { - const BCKT_NAME: &str = "my-cors"; - let ctx = common::context(); - let bucket = ctx.create_bucket(BCKT_NAME); - - let data = ByteStream::from_static(BODY); - - ctx.client - .put_object() - .bucket(&bucket) - .key("site/home.html") - .body(data) - .send() - .await - .unwrap(); - - ctx.client - .put_object() - .bucket(&bucket) - .key("err/error.html") - .body(ByteStream::from_static(BODY_ERR)) - .send() - .await - .unwrap(); - - let conf = WebsiteConfiguration::builder() - .index_document(IndexDocument::builder().suffix("home.html").build()) - .error_document(ErrorDocument::builder().key("err/error.html").build()) - .build(); - - ctx.client - .put_bucket_website() - .bucket(&bucket) - .website_configuration(conf) - .send() - .await - .unwrap(); - - let cors = CorsConfiguration::builder() - .cors_rules( - CorsRule::builder() - .id("main-rule") - .allowed_headers("*") - .allowed_methods("GET") - .allowed_methods("PUT") - .allowed_origins("*") - .build(), - ) - .build(); - - ctx.client - .put_bucket_cors() - .bucket(&bucket) - .cors_configuration(cors) - .send() - .await - .unwrap(); - - { - let cors_res = ctx - .client - .get_bucket_cors() - .bucket(&bucket) - .send() - .await - .unwrap(); - - let main_rule = cors_res.cors_rules().unwrap().iter().next().unwrap(); - - assert_eq!(main_rule.id.as_ref().unwrap(), "main-rule"); - assert_eq!( - main_rule.allowed_headers.as_ref().unwrap(), - &vec!["*".to_string()] - ); - assert_eq!( - main_rule.allowed_origins.as_ref().unwrap(), - &vec!["*".to_string()] - ); - assert_eq!( - main_rule.allowed_methods.as_ref().unwrap(), - &vec!["GET".to_string(), "PUT".to_string()] - ); - } - - let client = Client::new(); - - // Test direct requests with CORS - { - let req = Request::builder() - .method("GET") - .uri(format!( - "http://127.0.0.1:{}/site/", - common::garage::DEFAULT_PORT + 2 - )) - .header("Host", format!("{}.web.garage", BCKT_NAME)) - .header("Origin", "https://example.com") - 
.body(Body::empty()) - .unwrap(); - - let mut resp = client.request(req).await.unwrap(); - - assert_eq!(resp.status(), 200); - assert_eq!( - resp.headers().get("access-control-allow-origin").unwrap(), - "*" - ); - assert_eq!( - to_bytes(resp.body_mut()).await.unwrap().as_ref(), - BODY.as_ref() - ); - } - - // Test ErrorDocument on 404 - { - let req = Request::builder() - .method("GET") - .uri(format!( - "http://127.0.0.1:{}/wrong.html", - common::garage::DEFAULT_PORT + 2 - )) - .header("Host", format!("{}.web.garage", BCKT_NAME)) - .body(Body::empty()) - .unwrap(); - - let mut resp = client.request(req).await.unwrap(); - - assert_eq!(resp.status(), 404); - assert_eq!( - to_bytes(resp.body_mut()).await.unwrap().as_ref(), - BODY_ERR.as_ref() - ); - } - - // Test CORS with an allowed preflight request - { - let req = Request::builder() - .method("OPTIONS") - .uri(format!( - "http://127.0.0.1:{}/site/", - common::garage::DEFAULT_PORT + 2 - )) - .header("Host", format!("{}.web.garage", BCKT_NAME)) - .header("Origin", "https://example.com") - .header("Access-Control-Request-Method", "PUT") - .body(Body::empty()) - .unwrap(); - - let mut resp = client.request(req).await.unwrap(); - - assert_eq!(resp.status(), 200); - assert_eq!( - resp.headers().get("access-control-allow-origin").unwrap(), - "*" - ); - assert_ne!( - to_bytes(resp.body_mut()).await.unwrap().as_ref(), - BODY.as_ref() - ); - } - - // Test CORS with a forbidden preflight request - { - let req = Request::builder() - .method("OPTIONS") - .uri(format!( - "http://127.0.0.1:{}/site/", - common::garage::DEFAULT_PORT + 2 - )) - .header("Host", format!("{}.web.garage", BCKT_NAME)) - .header("Origin", "https://example.com") - .header("Access-Control-Request-Method", "DELETE") - .body(Body::empty()) - .unwrap(); - - let mut resp = client.request(req).await.unwrap(); - - assert_eq!(resp.status(), 403); - assert_ne!( - to_bytes(resp.body_mut()).await.unwrap().as_ref(), - BODY.as_ref() - ); - } - - //@TODO test CORS on the S3 endpoint. We need to handle auth manually to check it. - - // Delete cors - ctx.client - .delete_bucket_cors() - .bucket(&bucket) - .send() - .await - .unwrap(); - - // Check CORS are deleted from the API - // @FIXME check what is the expected behavior when GetBucketCors is called on a bucket without - // any CORS. 
- assert!(ctx - .client - .get_bucket_cors() - .bucket(&bucket) - .send() - .await - .is_err()); - - // Test CORS are not sent anymore on a previously allowed request - { - let req = Request::builder() - .method("OPTIONS") - .uri(format!( - "http://127.0.0.1:{}/site/", - common::garage::DEFAULT_PORT + 2 - )) - .header("Host", format!("{}.web.garage", BCKT_NAME)) - .header("Origin", "https://example.com") - .header("Access-Control-Request-Method", "PUT") - .body(Body::empty()) - .unwrap(); - - let mut resp = client.request(req).await.unwrap(); - - assert_eq!(resp.status(), 403); - assert_ne!( - to_bytes(resp.body_mut()).await.unwrap().as_ref(), - BODY.as_ref() - ); - } - - // Disallow website from the API - ctx.client - .delete_bucket_website() - .bucket(&bucket) - .send() - .await - .unwrap(); - - // Check that the website is not served anymore - { - let req = Request::builder() - .method("GET") - .uri(format!( - "http://127.0.0.1:{}/site/", - common::garage::DEFAULT_PORT + 2 - )) - .header("Host", format!("{}.web.garage", BCKT_NAME)) - .body(Body::empty()) - .unwrap(); - - let mut resp = client.request(req).await.unwrap(); - - assert_eq!(resp.status(), 404); - assert_ne!( - to_bytes(resp.body_mut()).await.unwrap().as_ref(), - BODY_ERR.as_ref() - ); - assert_ne!( - to_bytes(resp.body_mut()).await.unwrap().as_ref(), - BODY.as_ref() - ); - } -} diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 007cec89..133fe44e 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -22,8 +22,10 @@ garage_model_050 = { package = "garage_model", version = "0.5.1" } async-trait = "0.1.7" arc-swap = "1.0" +blake2 = "0.9" err-derive = "0.3" hex = "0.4" +base64 = "0.13" tracing = "0.1.30" rand = "0.8" zstd = { version = "0.9", default-features = false } @@ -42,3 +44,6 @@ opentelemetry = "0.17" #netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" } #netapp = { version = "0.4", path = "../../../netapp" } netapp = "0.4" + +[features] +k2v = [ "garage_util/k2v" ] diff --git a/src/model/block_ref_table.rs b/src/model/block_ref_table.rs deleted file mode 100644 index b6945403..00000000 --- a/src/model/block_ref_table.rs +++ /dev/null @@ -1,74 +0,0 @@ -use serde::{Deserialize, Serialize}; -use std::sync::Arc; - -use garage_util::data::*; - -use garage_table::crdt::Crdt; -use garage_table::*; - -use garage_block::manager::*; - -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] -pub struct BlockRef { - /// Hash (blake2 sum) of the block, used as partition key - pub block: Hash, - - /// Id of the Version for the object containing this block, used as sorting key - pub version: Uuid, - - // Keep track of deleted status - /// Is the Version that contains this block deleted - pub deleted: crdt::Bool, -} - -impl Entry for BlockRef { - fn partition_key(&self) -> &Hash { - &self.block - } - fn sort_key(&self) -> &Uuid { - &self.version - } - fn is_tombstone(&self) -> bool { - self.deleted.get() - } -} - -impl Crdt for BlockRef { - fn merge(&mut self, other: &Self) { - self.deleted.merge(&other.deleted); - } -} - -pub struct BlockRefTable { - pub block_manager: Arc, -} - -impl TableSchema for BlockRefTable { - const TABLE_NAME: &'static str = "block_ref"; - - type P = Hash; - type S = Uuid; - type E = BlockRef; - type Filter = DeletedFilter; - - fn updated(&self, old: Option, new: Option) { - #[allow(clippy::or_fun_call)] - let block = &old.as_ref().or(new.as_ref()).unwrap().block; - let was_before = old.as_ref().map(|x| !x.deleted.get()).unwrap_or(false); - let is_after = 
new.as_ref().map(|x| !x.deleted.get()).unwrap_or(false); - if is_after && !was_before { - if let Err(e) = self.block_manager.block_incref(block) { - warn!("block_incref failed for block {:?}: {}", block, e); - } - } - if was_before && !is_after { - if let Err(e) = self.block_manager.block_decref(block) { - warn!("block_decref failed for block {:?}: {}", block, e); - } - } - } - - fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { - filter.apply(entry.deleted.get()) - } -} diff --git a/src/model/garage.rs b/src/model/garage.rs index abdb920a..03e21f8a 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -13,13 +13,19 @@ use garage_table::replication::TableFullReplication; use garage_table::replication::TableShardedReplication; use garage_table::*; -use crate::block_ref_table::*; +use crate::s3::block_ref_table::*; +use crate::s3::object_table::*; +use crate::s3::version_table::*; + use crate::bucket_alias_table::*; use crate::bucket_table::*; use crate::helper; use crate::key_table::*; -use crate::object_table::*; -use crate::version_table::*; + +#[cfg(feature = "k2v")] +use crate::index_counter::*; +#[cfg(feature = "k2v")] +use crate::k2v::{counter_table::*, item_table::*, poll::*, rpc::*}; /// An entire Garage full of data pub struct Garage { @@ -35,16 +41,32 @@ pub struct Garage { /// The block manager pub block_manager: Arc, - /// Table containing informations about buckets + /// Table containing buckets pub bucket_table: Arc>, - /// Table containing informations about bucket aliases + /// Table containing bucket aliases pub bucket_alias_table: Arc>, - /// Table containing informations about api keys + /// Table containing api keys pub key_table: Arc>, + /// Table containing S3 objects pub object_table: Arc>, + /// Table containing S3 object versions pub version_table: Arc>, + /// Table containing S3 block references (not blocks themselves) pub block_ref_table: Arc>, + + #[cfg(feature = "k2v")] + pub k2v: GarageK2V, +} + +#[cfg(feature = "k2v")] +pub struct GarageK2V { + /// Table containing K2V items + pub item_table: Arc>, + /// Indexing table containing K2V item counters + pub counter_table: Arc>, + /// K2V RPC handler + pub rpc: Arc, } impl Garage { @@ -95,6 +117,21 @@ impl Garage { system.clone(), ); + // ---- admin tables ---- + info!("Initialize bucket_table..."); + let bucket_table = Table::new(BucketTable, control_rep_param.clone(), system.clone(), &db); + + info!("Initialize bucket_alias_table..."); + let bucket_alias_table = Table::new( + BucketAliasTable, + control_rep_param.clone(), + system.clone(), + &db, + ); + info!("Initialize key_table_table..."); + let key_table = Table::new(KeyTable, control_rep_param, system.clone(), &db); + + // ---- S3 tables ---- info!("Initialize block_ref_table..."); let block_ref_table = Table::new( BlockRefTable { @@ -117,29 +154,20 @@ impl Garage { ); info!("Initialize object_table..."); + #[allow(clippy::redundant_clone)] let object_table = Table::new( ObjectTable { background: background.clone(), version_table: version_table.clone(), }, - meta_rep_param, - system.clone(), - &db, - ); - - info!("Initialize bucket_table..."); - let bucket_table = Table::new(BucketTable, control_rep_param.clone(), system.clone(), &db); - - info!("Initialize bucket_alias_table..."); - let bucket_alias_table = Table::new( - BucketAliasTable, - control_rep_param.clone(), + meta_rep_param.clone(), system.clone(), &db, ); - info!("Initialize key_table_table..."); - let key_table = Table::new(KeyTable, control_rep_param, system.clone(), 
&db); + // ---- K2V ---- + #[cfg(feature = "k2v")] + let k2v = GarageK2V::new(system.clone(), &db, meta_rep_param); info!("Initialize Garage..."); @@ -155,6 +183,8 @@ impl Garage { object_table, version_table, block_ref_table, + #[cfg(feature = "k2v")] + k2v, }) } @@ -162,3 +192,30 @@ impl Garage { helper::bucket::BucketHelper(self) } } + +#[cfg(feature = "k2v")] +impl GarageK2V { + fn new(system: Arc, db: &sled::Db, meta_rep_param: TableShardedReplication) -> Self { + info!("Initialize K2V counter table..."); + let counter_table = IndexCounter::new(system.clone(), meta_rep_param.clone(), db); + info!("Initialize K2V subscription manager..."); + let subscriptions = Arc::new(SubscriptionManager::new()); + info!("Initialize K2V item table..."); + let item_table = Table::new( + K2VItemTable { + counter_table: counter_table.clone(), + subscriptions: subscriptions.clone(), + }, + meta_rep_param, + system.clone(), + db, + ); + let rpc = K2VRpcHandler::new(system, item_table.clone(), subscriptions); + + Self { + item_table, + counter_table, + rpc, + } + } +} diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index 706faf26..54d2f97b 100644 --- a/src/model/helper/bucket.rs +++ b/src/model/helper/bucket.rs @@ -1,4 +1,4 @@ -use garage_table::util::EmptyKey; +use garage_table::util::*; use garage_util::crdt::*; use garage_util::data::*; use garage_util::error::{Error as GarageError, OkOrMessage}; @@ -116,6 +116,7 @@ impl<'a> BucketHelper<'a> { None, Some(KeyFilter::MatchesAndNotDeleted(pattern.to_string())), 10, + EnumerationOrder::Forward, ) .await? .into_iter() diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs new file mode 100644 index 00000000..123154d4 --- /dev/null +++ b/src/model/index_counter.rs @@ -0,0 +1,305 @@ +use std::collections::{hash_map, BTreeMap, HashMap}; +use std::marker::PhantomData; +use std::sync::Arc; +use std::time::Duration; + +use serde::{Deserialize, Serialize}; +use tokio::sync::{mpsc, watch}; + +use garage_rpc::ring::Ring; +use garage_rpc::system::System; +use garage_util::data::*; +use garage_util::error::*; + +use garage_table::crdt::*; +use garage_table::replication::TableShardedReplication; +use garage_table::*; + +pub trait CounterSchema: Clone + PartialEq + Send + Sync + 'static { + const NAME: &'static str; + type P: PartitionKey + Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync; + type S: SortKey + Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync; +} + +/// A counter entry in the global table +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub struct CounterEntry { + pub pk: T::P, + pub sk: T::S, + pub values: BTreeMap, +} + +impl Entry for CounterEntry { + fn partition_key(&self) -> &T::P { + &self.pk + } + fn sort_key(&self) -> &T::S { + &self.sk + } + fn is_tombstone(&self) -> bool { + self.values + .iter() + .all(|(_, v)| v.node_values.iter().all(|(_, (_, v))| *v == 0)) + } +} + +impl CounterEntry { + pub fn filtered_values(&self, ring: &Ring) -> HashMap { + let nodes = &ring.layout.node_id_vec[..]; + self.filtered_values_with_nodes(nodes) + } + + pub fn filtered_values_with_nodes(&self, nodes: &[Uuid]) -> HashMap { + let mut ret = HashMap::new(); + for (name, vals) in self.values.iter() { + let new_vals = vals + .node_values + .iter() + .filter(|(n, _)| nodes.contains(n)) + .map(|(_, (_, v))| *v) + .collect::>(); + if !new_vals.is_empty() { + ret.insert( + name.clone(), + new_vals.iter().fold(i64::MIN, |a, b| std::cmp::max(a, *b)), + ); + } + } + + ret + } +} + 
+/// A counter entry in the global table +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub struct CounterValue { + pub node_values: BTreeMap, +} + +impl Crdt for CounterEntry { + fn merge(&mut self, other: &Self) { + for (name, e2) in other.values.iter() { + if let Some(e) = self.values.get_mut(name) { + e.merge(e2); + } else { + self.values.insert(name.clone(), e2.clone()); + } + } + } +} + +impl Crdt for CounterValue { + fn merge(&mut self, other: &Self) { + for (node, (t2, e2)) in other.node_values.iter() { + if let Some((t, e)) = self.node_values.get_mut(node) { + if t2 > t { + *e = *e2; + } + } else { + self.node_values.insert(*node, (*t2, *e2)); + } + } + } +} + +pub struct CounterTable { + _phantom_t: PhantomData, +} + +impl TableSchema for CounterTable { + const TABLE_NAME: &'static str = T::NAME; + + type P = T::P; + type S = T::S; + type E = CounterEntry; + type Filter = (DeletedFilter, Vec); + + fn updated(&self, _old: Option<&Self::E>, _new: Option<&Self::E>) { + // nothing for now + } + + fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { + if filter.0 == DeletedFilter::Any { + return true; + } + + let is_tombstone = entry + .filtered_values_with_nodes(&filter.1[..]) + .iter() + .all(|(_, v)| *v == 0); + filter.0.apply(is_tombstone) + } +} + +// ---- + +pub struct IndexCounter { + this_node: Uuid, + local_counter: sled::Tree, + propagate_tx: mpsc::UnboundedSender<(T::P, T::S, LocalCounterEntry)>, + pub table: Arc, TableShardedReplication>>, +} + +impl IndexCounter { + pub fn new( + system: Arc, + replication: TableShardedReplication, + db: &sled::Db, + ) -> Arc { + let background = system.background.clone(); + + let (propagate_tx, propagate_rx) = mpsc::unbounded_channel(); + + let this = Arc::new(Self { + this_node: system.id, + local_counter: db + .open_tree(format!("local_counter:{}", T::NAME)) + .expect("Unable to open local counter tree"), + propagate_tx, + table: Table::new( + CounterTable { + _phantom_t: Default::default(), + }, + replication, + system, + db, + ), + }); + + let this2 = this.clone(); + background.spawn_worker( + format!("{} index counter propagator", T::NAME), + move |must_exit| this2.clone().propagate_loop(propagate_rx, must_exit), + ); + this + } + + pub fn count(&self, pk: &T::P, sk: &T::S, counts: &[(&str, i64)]) -> Result<(), Error> { + let tree_key = self.table.data.tree_key(pk, sk); + + let new_entry = self.local_counter.transaction(|tx| { + let mut entry = match tx.get(&tree_key[..])? { + Some(old_bytes) => { + rmp_serde::decode::from_read_ref::<_, LocalCounterEntry>(&old_bytes) + .map_err(Error::RmpDecode) + .map_err(sled::transaction::ConflictableTransactionError::Abort)? + } + None => LocalCounterEntry { + values: BTreeMap::new(), + }, + }; + + for (s, inc) in counts.iter() { + let mut ent = entry.values.entry(s.to_string()).or_insert((0, 0)); + ent.0 += 1; + ent.1 += *inc; + } + + let new_entry_bytes = rmp_to_vec_all_named(&entry) + .map_err(Error::RmpEncode) + .map_err(sled::transaction::ConflictableTransactionError::Abort)?; + tx.insert(&tree_key[..], new_entry_bytes)?; + + Ok(entry) + })?; + + if let Err(e) = self.propagate_tx.send((pk.clone(), sk.clone(), new_entry)) { + error!( + "Could not propagate updated counter values, failed to send to channel: {}", + e + ); + } + + Ok(()) + } + + async fn propagate_loop( + self: Arc, + mut propagate_rx: mpsc::UnboundedReceiver<(T::P, T::S, LocalCounterEntry)>, + must_exit: watch::Receiver, + ) { + // This loop batches updates to counters to be sent all at once. 
+ // They are sent once the propagate_rx channel has been emptied (or is closed). + let mut buf = HashMap::new(); + let mut errors = 0; + + loop { + let (ent, closed) = match propagate_rx.try_recv() { + Ok(ent) => (Some(ent), false), + Err(mpsc::error::TryRecvError::Empty) if buf.is_empty() => { + match propagate_rx.recv().await { + Some(ent) => (Some(ent), false), + None => (None, true), + } + } + Err(mpsc::error::TryRecvError::Empty) => (None, false), + Err(mpsc::error::TryRecvError::Disconnected) => (None, true), + }; + + if let Some((pk, sk, counters)) = ent { + let tree_key = self.table.data.tree_key(&pk, &sk); + let dist_entry = counters.into_counter_entry::(self.this_node, pk, sk); + match buf.entry(tree_key) { + hash_map::Entry::Vacant(e) => { + e.insert(dist_entry); + } + hash_map::Entry::Occupied(mut e) => { + e.get_mut().merge(&dist_entry); + } + } + // As long as we can add entries, loop back and add them to batch + // before sending batch to other nodes + continue; + } + + if !buf.is_empty() { + let entries = buf.iter().map(|(_k, v)| v); + if let Err(e) = self.table.insert_many(entries).await { + errors += 1; + if errors >= 2 && *must_exit.borrow() { + error!("({}) Could not propagate {} counter values: {}, these counters will not be updated correctly.", T::NAME, buf.len(), e); + break; + } + warn!("({}) Could not propagate {} counter values: {}, retrying in 5 seconds (retry #{})", T::NAME, buf.len(), e, errors); + tokio::time::sleep(Duration::from_secs(5)).await; + continue; + } + + buf.clear(); + errors = 0; + } + + if closed || *must_exit.borrow() { + break; + } + } + } +} + +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +struct LocalCounterEntry { + values: BTreeMap, +} + +impl LocalCounterEntry { + fn into_counter_entry( + self, + this_node: Uuid, + pk: T::P, + sk: T::S, + ) -> CounterEntry { + CounterEntry { + pk, + sk, + values: self + .values + .into_iter() + .map(|(name, (ts, v))| { + let mut node_values = BTreeMap::new(); + node_values.insert(this_node, (ts, v)); + (name, CounterValue { node_values }) + }) + .collect(), + } + } +} diff --git a/src/model/k2v/causality.rs b/src/model/k2v/causality.rs new file mode 100644 index 00000000..8c76a32b --- /dev/null +++ b/src/model/k2v/causality.rs @@ -0,0 +1,96 @@ +use std::collections::BTreeMap; +use std::convert::TryInto; + +use serde::{Deserialize, Serialize}; + +use garage_util::data::*; + +/// Node IDs used in K2V are u64 integers that are the abbreviation +/// of full Garage node IDs which are 256-bit UUIDs. 
+pub type K2VNodeId = u64; + +pub fn make_node_id(node_id: Uuid) -> K2VNodeId { + let mut tmp = [0u8; 8]; + tmp.copy_from_slice(&node_id.as_slice()[..8]); + u64::from_be_bytes(tmp) +} + +#[derive(PartialEq, Debug, Serialize, Deserialize)] +pub struct CausalContext { + pub vector_clock: BTreeMap, +} + +impl CausalContext { + /// Empty causality context + pub fn new_empty() -> Self { + Self { + vector_clock: BTreeMap::new(), + } + } + /// Make binary representation and encode in base64 + pub fn serialize(&self) -> String { + let mut ints = Vec::with_capacity(2 * self.vector_clock.len()); + for (node, time) in self.vector_clock.iter() { + ints.push(*node); + ints.push(*time); + } + let checksum = ints.iter().fold(0, |acc, v| acc ^ *v); + + let mut bytes = u64::to_be_bytes(checksum).to_vec(); + for i in ints { + bytes.extend(u64::to_be_bytes(i)); + } + + base64::encode_config(bytes, base64::URL_SAFE_NO_PAD) + } + /// Parse from base64-encoded binary representation + pub fn parse(s: &str) -> Result { + let bytes = base64::decode_config(s, base64::URL_SAFE_NO_PAD) + .map_err(|e| format!("bad causality token base64: {}", e))?; + if bytes.len() % 16 != 8 || bytes.len() < 8 { + return Err("bad causality token length".into()); + } + + let checksum = u64::from_be_bytes(bytes[..8].try_into().unwrap()); + let mut ret = CausalContext { + vector_clock: BTreeMap::new(), + }; + + for i in 0..(bytes.len() / 16) { + let node_id = u64::from_be_bytes(bytes[8 + i * 16..16 + i * 16].try_into().unwrap()); + let time = u64::from_be_bytes(bytes[16 + i * 16..24 + i * 16].try_into().unwrap()); + ret.vector_clock.insert(node_id, time); + } + + let check = ret.vector_clock.iter().fold(0, |acc, (n, t)| acc ^ *n ^ *t); + + if check != checksum { + return Err("bad causality token checksum".into()); + } + + Ok(ret) + } + /// Check if this causal context contains newer items than another one + pub fn is_newer_than(&self, other: &Self) -> bool { + self.vector_clock + .iter() + .any(|(k, v)| v > other.vector_clock.get(k).unwrap_or(&0)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_causality_token_serialization() { + let ct = CausalContext { + vector_clock: [(4, 42), (1928131023, 76), (0xefc0c1c47f9de433, 2)] + .iter() + .cloned() + .collect(), + }; + + assert_eq!(CausalContext::parse(&ct.serialize()).unwrap(), ct); + } +} diff --git a/src/model/k2v/counter_table.rs b/src/model/k2v/counter_table.rs new file mode 100644 index 00000000..4856eb2b --- /dev/null +++ b/src/model/k2v/counter_table.rs @@ -0,0 +1,20 @@ +use garage_util::data::*; + +use crate::index_counter::*; + +pub const ENTRIES: &str = "entries"; +pub const CONFLICTS: &str = "conflicts"; +pub const VALUES: &str = "values"; +pub const BYTES: &str = "bytes"; + +#[derive(PartialEq, Clone)] +pub struct K2VCounterTable; + +impl CounterSchema for K2VCounterTable { + const NAME: &'static str = "k2v_index_counter"; + + // Partition key = bucket id + type P = Uuid; + // Sort key = K2V item's partition key + type S = String; +} diff --git a/src/model/k2v/item_table.rs b/src/model/k2v/item_table.rs new file mode 100644 index 00000000..8b7cc08a --- /dev/null +++ b/src/model/k2v/item_table.rs @@ -0,0 +1,291 @@ +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; +use std::sync::Arc; + +use garage_util::data::*; + +use garage_table::crdt::*; +use garage_table::*; + +use crate::index_counter::*; +use crate::k2v::causality::*; +use crate::k2v::counter_table::*; +use crate::k2v::poll::*; + +#[derive(PartialEq, Clone, Debug, Serialize, 
Deserialize)] +pub struct K2VItem { + pub partition: K2VItemPartition, + pub sort_key: String, + + items: BTreeMap, +} + +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize, Hash, Eq)] +pub struct K2VItemPartition { + pub bucket_id: Uuid, + pub partition_key: String, +} + +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +struct DvvsEntry { + t_discard: u64, + values: Vec<(u64, DvvsValue)>, +} + +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub enum DvvsValue { + Value(#[serde(with = "serde_bytes")] Vec), + Deleted, +} + +impl K2VItem { + /// Creates a new K2VItem when no previous entry existed in the db + pub fn new(bucket_id: Uuid, partition_key: String, sort_key: String) -> Self { + Self { + partition: K2VItemPartition { + bucket_id, + partition_key, + }, + sort_key, + items: BTreeMap::new(), + } + } + /// Updates a K2VItem with a new value or a deletion event + pub fn update( + &mut self, + this_node: Uuid, + context: &Option, + new_value: DvvsValue, + ) { + if let Some(context) = context { + for (node, t_discard) in context.vector_clock.iter() { + if let Some(e) = self.items.get_mut(node) { + e.t_discard = std::cmp::max(e.t_discard, *t_discard); + } else { + self.items.insert( + *node, + DvvsEntry { + t_discard: *t_discard, + values: vec![], + }, + ); + } + } + } + + self.discard(); + + let node_id = make_node_id(this_node); + let e = self.items.entry(node_id).or_insert(DvvsEntry { + t_discard: 0, + values: vec![], + }); + let t_prev = e.max_time(); + e.values.push((t_prev + 1, new_value)); + } + + /// Extract the causality context of a K2V Item + pub fn causal_context(&self) -> CausalContext { + let mut cc = CausalContext::new_empty(); + for (node, ent) in self.items.iter() { + cc.vector_clock.insert(*node, ent.max_time()); + } + cc + } + + /// Extract the list of values + pub fn values(&'_ self) -> Vec<&'_ DvvsValue> { + let mut ret = vec![]; + for (_, ent) in self.items.iter() { + for (_, v) in ent.values.iter() { + if !ret.contains(&v) { + ret.push(v); + } + } + } + ret + } + + fn discard(&mut self) { + for (_, ent) in self.items.iter_mut() { + ent.discard(); + } + } + + // returns counters: (non-deleted entries, conflict entries, non-tombstone values, bytes used) + fn stats(&self) -> (i64, i64, i64, i64) { + let values = self.values(); + + let n_entries = if self.is_tombstone() { 0 } else { 1 }; + let n_conflicts = if values.len() > 1 { 1 } else { 0 }; + let n_values = values + .iter() + .filter(|v| matches!(v, DvvsValue::Value(_))) + .count() as i64; + let n_bytes = values + .iter() + .map(|v| match v { + DvvsValue::Deleted => 0, + DvvsValue::Value(v) => v.len() as i64, + }) + .sum(); + + (n_entries, n_conflicts, n_values, n_bytes) + } +} + +impl DvvsEntry { + fn max_time(&self) -> u64 { + self.values + .iter() + .fold(self.t_discard, |acc, (vts, _)| std::cmp::max(acc, *vts)) + } + + fn discard(&mut self) { + self.values = std::mem::take(&mut self.values) + .into_iter() + .filter(|(t, _)| *t > self.t_discard) + .collect::>(); + } +} + +impl Crdt for K2VItem { + fn merge(&mut self, other: &Self) { + for (node, e2) in other.items.iter() { + if let Some(e) = self.items.get_mut(node) { + e.merge(e2); + } else { + self.items.insert(*node, e2.clone()); + } + } + } +} + +impl Crdt for DvvsEntry { + fn merge(&mut self, other: &Self) { + self.t_discard = std::cmp::max(self.t_discard, other.t_discard); + self.discard(); + + let t_max = self.max_time(); + for (vt, vv) in other.values.iter() { + if *vt > t_max { + self.values.push((*vt, vv.clone())); + } + } + 
} +} + +impl PartitionKey for K2VItemPartition { + fn hash(&self) -> Hash { + use blake2::{Blake2b, Digest}; + + let mut hasher = Blake2b::new(); + hasher.update(self.bucket_id.as_slice()); + hasher.update(self.partition_key.as_bytes()); + let mut hash = [0u8; 32]; + hash.copy_from_slice(&hasher.finalize()[..32]); + hash.into() + } +} + +impl Entry for K2VItem { + fn partition_key(&self) -> &K2VItemPartition { + &self.partition + } + fn sort_key(&self) -> &String { + &self.sort_key + } + fn is_tombstone(&self) -> bool { + self.values() + .iter() + .all(|v| matches!(v, DvvsValue::Deleted)) + } +} + +pub struct K2VItemTable { + pub(crate) counter_table: Arc>, + pub(crate) subscriptions: Arc, +} + +#[derive(Clone, Copy, Debug, Serialize, Deserialize)] +pub struct ItemFilter { + pub exclude_only_tombstones: bool, + pub conflicts_only: bool, +} + +impl TableSchema for K2VItemTable { + const TABLE_NAME: &'static str = "k2v_item"; + + type P = K2VItemPartition; + type S = String; + type E = K2VItem; + type Filter = ItemFilter; + + fn updated(&self, old: Option<&Self::E>, new: Option<&Self::E>) { + // 1. Count + let (old_entries, old_conflicts, old_values, old_bytes) = match old { + None => (0, 0, 0, 0), + Some(e) => e.stats(), + }; + let (new_entries, new_conflicts, new_values, new_bytes) = match new { + None => (0, 0, 0, 0), + Some(e) => e.stats(), + }; + + let count_pk = old + .map(|e| e.partition.bucket_id) + .unwrap_or_else(|| new.unwrap().partition.bucket_id); + let count_sk = old + .map(|e| &e.partition.partition_key) + .unwrap_or_else(|| &new.unwrap().partition.partition_key); + + if let Err(e) = self.counter_table.count( + &count_pk, + count_sk, + &[ + (ENTRIES, new_entries - old_entries), + (CONFLICTS, new_conflicts - old_conflicts), + (VALUES, new_values - old_values), + (BYTES, new_bytes - old_bytes), + ], + ) { + error!("Could not update K2V counter for bucket {:?} partition {}; counts will now be inconsistent. {}", count_pk, count_sk, e); + } + + // 2. 
Notify + if let Some(new_ent) = new { + self.subscriptions.notify(new_ent); + } + } + + #[allow(clippy::nonminimal_bool)] + fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { + let v = entry.values(); + !(filter.conflicts_only && v.len() < 2) + && !(filter.exclude_only_tombstones && entry.is_tombstone()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_dvvsentry_merge_simple() { + let e1 = DvvsEntry { + t_discard: 4, + values: vec![ + (5, DvvsValue::Value(vec![15])), + (6, DvvsValue::Value(vec![16])), + ], + }; + let e2 = DvvsEntry { + t_discard: 5, + values: vec![(6, DvvsValue::Value(vec![16])), (7, DvvsValue::Deleted)], + }; + + let mut e3 = e1.clone(); + e3.merge(&e2); + assert_eq!(e2, e3); + } +} diff --git a/src/model/k2v/mod.rs b/src/model/k2v/mod.rs new file mode 100644 index 00000000..664172a6 --- /dev/null +++ b/src/model/k2v/mod.rs @@ -0,0 +1,7 @@ +pub mod causality; + +pub mod counter_table; +pub mod item_table; + +pub mod poll; +pub mod rpc; diff --git a/src/model/k2v/poll.rs b/src/model/k2v/poll.rs new file mode 100644 index 00000000..93105207 --- /dev/null +++ b/src/model/k2v/poll.rs @@ -0,0 +1,50 @@ +use std::collections::HashMap; +use std::sync::Mutex; + +use serde::{Deserialize, Serialize}; +use tokio::sync::broadcast; + +use crate::k2v::item_table::*; + +#[derive(Debug, Hash, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct PollKey { + pub partition: K2VItemPartition, + pub sort_key: String, +} + +#[derive(Default)] +pub struct SubscriptionManager { + subscriptions: Mutex>>, +} + +impl SubscriptionManager { + pub fn new() -> Self { + Self::default() + } + + pub fn subscribe(&self, key: &PollKey) -> broadcast::Receiver { + let mut subs = self.subscriptions.lock().unwrap(); + if let Some(s) = subs.get(key) { + s.subscribe() + } else { + let (tx, rx) = broadcast::channel(8); + subs.insert(key.clone(), tx); + rx + } + } + + pub fn notify(&self, item: &K2VItem) { + let key = PollKey { + partition: item.partition.clone(), + sort_key: item.sort_key.clone(), + }; + let mut subs = self.subscriptions.lock().unwrap(); + if let Some(s) = subs.get(&key) { + if s.send(item.clone()).is_err() { + // no more subscribers, remove channel from here + // (we will re-create it later if we need to subscribe again) + subs.remove(&key); + } + } + } +} diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs new file mode 100644 index 00000000..90101d0f --- /dev/null +++ b/src/model/k2v/rpc.rs @@ -0,0 +1,343 @@ +//! Module that implements RPCs specific to K2V. +//! This is necessary for insertions into the K2V store, +//! as they have to be transmitted to one of the nodes responsible +//! for storing the entry to be processed (the API entry +//! node does not process the entry directly, as this would +//! mean the vector clock gets much larger than needed). 
+ +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; + +use async_trait::async_trait; +use futures::stream::FuturesUnordered; +use futures::StreamExt; +use serde::{Deserialize, Serialize}; +use tokio::select; + +use garage_util::crdt::*; +use garage_util::data::*; +use garage_util::error::*; + +use garage_rpc::system::System; +use garage_rpc::*; + +use garage_table::replication::{TableReplication, TableShardedReplication}; +use garage_table::table::TABLE_RPC_TIMEOUT; +use garage_table::{PartitionKey, Table}; + +use crate::k2v::causality::*; +use crate::k2v::item_table::*; +use crate::k2v::poll::*; + +/// RPC messages for K2V +#[derive(Debug, Serialize, Deserialize)] +enum K2VRpc { + Ok, + InsertItem(InsertedItem), + InsertManyItems(Vec), + PollItem { + key: PollKey, + causal_context: CausalContext, + timeout_msec: u64, + }, + PollItemResponse(Option), +} + +#[derive(Debug, Serialize, Deserialize)] +struct InsertedItem { + partition: K2VItemPartition, + sort_key: String, + causal_context: Option, + value: DvvsValue, +} + +impl Rpc for K2VRpc { + type Response = Result; +} + +/// The block manager, handling block exchange between nodes, and block storage on local node +pub struct K2VRpcHandler { + system: Arc, + item_table: Arc>, + endpoint: Arc>, + subscriptions: Arc, +} + +impl K2VRpcHandler { + pub fn new( + system: Arc, + item_table: Arc>, + subscriptions: Arc, + ) -> Arc { + let endpoint = system.netapp.endpoint("garage_model/k2v/Rpc".to_string()); + + let rpc_handler = Arc::new(Self { + system, + item_table, + endpoint, + subscriptions, + }); + rpc_handler.endpoint.set_handler(rpc_handler.clone()); + + rpc_handler + } + + // ---- public interface ---- + + pub async fn insert( + &self, + bucket_id: Uuid, + partition_key: String, + sort_key: String, + causal_context: Option, + value: DvvsValue, + ) -> Result<(), Error> { + let partition = K2VItemPartition { + bucket_id, + partition_key, + }; + let mut who = self + .item_table + .data + .replication + .write_nodes(&partition.hash()); + who.sort(); + + self.system + .rpc + .try_call_many( + &self.endpoint, + &who[..], + K2VRpc::InsertItem(InsertedItem { + partition, + sort_key, + causal_context, + value, + }), + RequestStrategy::with_priority(PRIO_NORMAL) + .with_quorum(1) + .with_timeout(TABLE_RPC_TIMEOUT) + .interrupt_after_quorum(true), + ) + .await?; + + Ok(()) + } + + pub async fn insert_batch( + &self, + bucket_id: Uuid, + items: Vec<(String, String, Option, DvvsValue)>, + ) -> Result<(), Error> { + let n_items = items.len(); + + let mut call_list: HashMap<_, Vec<_>> = HashMap::new(); + + for (partition_key, sort_key, causal_context, value) in items { + let partition = K2VItemPartition { + bucket_id, + partition_key, + }; + let mut who = self + .item_table + .data + .replication + .write_nodes(&partition.hash()); + who.sort(); + + call_list.entry(who).or_default().push(InsertedItem { + partition, + sort_key, + causal_context, + value, + }); + } + + debug!( + "K2V insert_batch: {} requests to insert {} items", + call_list.len(), + n_items + ); + let call_futures = call_list.into_iter().map(|(nodes, items)| async move { + let resp = self + .system + .rpc + .try_call_many( + &self.endpoint, + &nodes[..], + K2VRpc::InsertManyItems(items), + RequestStrategy::with_priority(PRIO_NORMAL) + .with_quorum(1) + .with_timeout(TABLE_RPC_TIMEOUT) + .interrupt_after_quorum(true), + ) + .await?; + Ok::<_, Error>((nodes, resp)) + }); + + let mut resps = call_futures.collect::>(); + while let Some(resp) = resps.next().await 
{ + resp?; + } + + Ok(()) + } + + pub async fn poll( + &self, + bucket_id: Uuid, + partition_key: String, + sort_key: String, + causal_context: CausalContext, + timeout_msec: u64, + ) -> Result, Error> { + let poll_key = PollKey { + partition: K2VItemPartition { + bucket_id, + partition_key, + }, + sort_key, + }; + let nodes = self + .item_table + .data + .replication + .write_nodes(&poll_key.partition.hash()); + + let resps = self + .system + .rpc + .try_call_many( + &self.endpoint, + &nodes[..], + K2VRpc::PollItem { + key: poll_key, + causal_context, + timeout_msec, + }, + RequestStrategy::with_priority(PRIO_NORMAL) + .with_quorum(self.item_table.data.replication.read_quorum()) + .with_timeout(Duration::from_millis(timeout_msec) + TABLE_RPC_TIMEOUT), + ) + .await?; + + let mut resp: Option = None; + for v in resps { + match v { + K2VRpc::PollItemResponse(Some(x)) => { + if let Some(y) = &mut resp { + y.merge(&x); + } else { + resp = Some(x); + } + } + K2VRpc::PollItemResponse(None) => { + return Ok(None); + } + v => return Err(Error::unexpected_rpc_message(v)), + } + } + + Ok(resp) + } + + // ---- internal handlers ---- + + async fn handle_insert(&self, item: &InsertedItem) -> Result { + let new = self.local_insert(item)?; + + // Propagate to rest of network + if let Some(updated) = new { + self.item_table.insert(&updated).await?; + } + + Ok(K2VRpc::Ok) + } + + async fn handle_insert_many(&self, items: &[InsertedItem]) -> Result { + let mut updated_vec = vec![]; + + for item in items { + let new = self.local_insert(item)?; + + if let Some(updated) = new { + updated_vec.push(updated); + } + } + + // Propagate to rest of network + if !updated_vec.is_empty() { + self.item_table.insert_many(&updated_vec).await?; + } + + Ok(K2VRpc::Ok) + } + + fn local_insert(&self, item: &InsertedItem) -> Result, Error> { + let tree_key = self + .item_table + .data + .tree_key(&item.partition, &item.sort_key); + + self.item_table + .data + .update_entry_with(&tree_key[..], |ent| { + let mut ent = ent.unwrap_or_else(|| { + K2VItem::new( + item.partition.bucket_id, + item.partition.partition_key.clone(), + item.sort_key.clone(), + ) + }); + ent.update(self.system.id, &item.causal_context, item.value.clone()); + ent + }) + } + + async fn handle_poll(&self, key: &PollKey, ct: &CausalContext) -> Result { + let mut chan = self.subscriptions.subscribe(key); + + let mut value = self + .item_table + .data + .read_entry(&key.partition, &key.sort_key)? + .map(|bytes| self.item_table.data.decode_entry(&bytes[..])) + .transpose()? + .unwrap_or_else(|| { + K2VItem::new( + key.partition.bucket_id, + key.partition.partition_key.clone(), + key.sort_key.clone(), + ) + }); + + while !value.causal_context().is_newer_than(ct) { + value = chan.recv().await?; + } + + Ok(value) + } +} + +#[async_trait] +impl EndpointHandler for K2VRpcHandler { + async fn handle(self: &Arc, message: &K2VRpc, _from: NodeID) -> Result { + match message { + K2VRpc::InsertItem(item) => self.handle_insert(item).await, + K2VRpc::InsertManyItems(items) => self.handle_insert_many(&items[..]).await, + K2VRpc::PollItem { + key, + causal_context, + timeout_msec, + } => { + let delay = tokio::time::sleep(Duration::from_millis(*timeout_msec)); + select! 
{ + ret = self.handle_poll(key, causal_context) => ret.map(Some).map(K2VRpc::PollItemResponse), + _ = delay => Ok(K2VRpc::PollItemResponse(None)), + } + } + m => Err(Error::unexpected_rpc_message(m)), + } + } +} diff --git a/src/model/lib.rs b/src/model/lib.rs index 05a4cdc7..7c9d9270 100644 --- a/src/model/lib.rs +++ b/src/model/lib.rs @@ -3,12 +3,15 @@ extern crate tracing; pub mod permission; -pub mod block_ref_table; +pub mod index_counter; + pub mod bucket_alias_table; pub mod bucket_table; pub mod key_table; -pub mod object_table; -pub mod version_table; + +#[cfg(feature = "k2v")] +pub mod k2v; +pub mod s3; pub mod garage; pub mod helper; diff --git a/src/model/object_table.rs b/src/model/object_table.rs deleted file mode 100644 index da53878e..00000000 --- a/src/model/object_table.rs +++ /dev/null @@ -1,334 +0,0 @@ -use serde::{Deserialize, Serialize}; -use std::collections::BTreeMap; -use std::sync::Arc; - -use garage_util::background::BackgroundRunner; -use garage_util::data::*; - -use garage_table::crdt::*; -use garage_table::replication::TableShardedReplication; -use garage_table::*; - -use crate::version_table::*; - -use garage_model_050::object_table as old; - -/// An object -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] -pub struct Object { - /// The bucket in which the object is stored, used as partition key - pub bucket_id: Uuid, - - /// The key at which the object is stored in its bucket, used as sorting key - pub key: String, - - /// The list of currenty stored versions of the object - versions: Vec, -} - -impl Object { - /// Initialize an Object struct from parts - pub fn new(bucket_id: Uuid, key: String, versions: Vec) -> Self { - let mut ret = Self { - bucket_id, - key, - versions: vec![], - }; - for v in versions { - ret.add_version(v) - .expect("Twice the same ObjectVersion in Object constructor"); - } - ret - } - - /// Adds a version if it wasn't already present - #[allow(clippy::result_unit_err)] - pub fn add_version(&mut self, new: ObjectVersion) -> Result<(), ()> { - match self - .versions - .binary_search_by(|v| v.cmp_key().cmp(&new.cmp_key())) - { - Err(i) => { - self.versions.insert(i, new); - Ok(()) - } - Ok(_) => Err(()), - } - } - - /// Get a list of currently stored versions of `Object` - pub fn versions(&self) -> &[ObjectVersion] { - &self.versions[..] 
- } -} - -/// Informations about a version of an object -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] -pub struct ObjectVersion { - /// Id of the version - pub uuid: Uuid, - /// Timestamp of when the object was created - pub timestamp: u64, - /// State of the version - pub state: ObjectVersionState, -} - -/// State of an object version -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] -pub enum ObjectVersionState { - /// The version is being received - Uploading(ObjectVersionHeaders), - /// The version is fully received - Complete(ObjectVersionData), - /// The version uploaded containded errors or the upload was explicitly aborted - Aborted, -} - -impl Crdt for ObjectVersionState { - fn merge(&mut self, other: &Self) { - use ObjectVersionState::*; - match other { - Aborted => { - *self = Aborted; - } - Complete(b) => match self { - Aborted => {} - Complete(a) => { - a.merge(b); - } - Uploading(_) => { - *self = Complete(b.clone()); - } - }, - Uploading(_) => {} - } - } -} - -/// Data stored in object version -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] -pub enum ObjectVersionData { - /// The object was deleted, this Version is a tombstone to mark it as such - DeleteMarker, - /// The object is short, it's stored inlined - Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec), - /// The object is not short, Hash of first block is stored here, next segments hashes are - /// stored in the version table - FirstBlock(ObjectVersionMeta, Hash), -} - -impl AutoCrdt for ObjectVersionData { - const WARN_IF_DIFFERENT: bool = true; -} - -/// Metadata about the object version -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] -pub struct ObjectVersionMeta { - /// Headers to send to the client - pub headers: ObjectVersionHeaders, - /// Size of the object - pub size: u64, - /// etag of the object - pub etag: String, -} - -/// Additional headers for an object -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] -pub struct ObjectVersionHeaders { - /// Content type of the object - pub content_type: String, - /// Any other http headers to send - pub other: BTreeMap, -} - -impl ObjectVersion { - fn cmp_key(&self) -> (u64, Uuid) { - (self.timestamp, self.uuid) - } - - /// Is the object version currently being uploaded - pub fn is_uploading(&self) -> bool { - matches!(self.state, ObjectVersionState::Uploading(_)) - } - - /// Is the object version completely received - pub fn is_complete(&self) -> bool { - matches!(self.state, ObjectVersionState::Complete(_)) - } - - /// Is the object version available (received and not a tombstone) - pub fn is_data(&self) -> bool { - match self.state { - ObjectVersionState::Complete(ObjectVersionData::DeleteMarker) => false, - ObjectVersionState::Complete(_) => true, - _ => false, - } - } -} - -impl Entry for Object { - fn partition_key(&self) -> &Uuid { - &self.bucket_id - } - fn sort_key(&self) -> &String { - &self.key - } - fn is_tombstone(&self) -> bool { - self.versions.len() == 1 - && self.versions[0].state - == ObjectVersionState::Complete(ObjectVersionData::DeleteMarker) - } -} - -impl Crdt for Object { - fn merge(&mut self, other: &Self) { - // Merge versions from other into here - for other_v in other.versions.iter() { - match self - .versions - .binary_search_by(|v| v.cmp_key().cmp(&other_v.cmp_key())) - { - Ok(i) => { - self.versions[i].state.merge(&other_v.state); - } - Err(i) => { - self.versions.insert(i, other_v.clone()); - } - } - } - - 
// Remove versions which are obsolete, i.e. those that come - // before the last version which .is_complete(). - let last_complete = self - .versions - .iter() - .enumerate() - .rev() - .find(|(_, v)| v.is_complete()) - .map(|(vi, _)| vi); - - if let Some(last_vi) = last_complete { - self.versions = self.versions.drain(last_vi..).collect::>(); - } - } -} - -pub struct ObjectTable { - pub background: Arc, - pub version_table: Arc>, -} - -#[derive(Clone, Copy, Debug, Serialize, Deserialize)] -pub enum ObjectFilter { - IsData, - IsUploading, -} - -impl TableSchema for ObjectTable { - const TABLE_NAME: &'static str = "object"; - - type P = Uuid; - type S = String; - type E = Object; - type Filter = ObjectFilter; - - fn updated(&self, old: Option, new: Option) { - let version_table = self.version_table.clone(); - self.background.spawn(async move { - if let (Some(old_v), Some(new_v)) = (old, new) { - // Propagate deletion of old versions - for v in old_v.versions.iter() { - let newly_deleted = match new_v - .versions - .binary_search_by(|nv| nv.cmp_key().cmp(&v.cmp_key())) - { - Err(_) => true, - Ok(i) => { - new_v.versions[i].state == ObjectVersionState::Aborted - && v.state != ObjectVersionState::Aborted - } - }; - if newly_deleted { - let deleted_version = - Version::new(v.uuid, old_v.bucket_id, old_v.key.clone(), true); - version_table.insert(&deleted_version).await?; - } - } - } - Ok(()) - }) - } - - fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { - match filter { - ObjectFilter::IsData => entry.versions.iter().any(|v| v.is_data()), - ObjectFilter::IsUploading => entry.versions.iter().any(|v| v.is_uploading()), - } - } - - fn try_migrate(bytes: &[u8]) -> Option { - let old_obj = rmp_serde::decode::from_read_ref::<_, old::Object>(bytes).ok()?; - Some(migrate_object(old_obj)) - } -} - -// vvvvvvvv migration code, stupid stuff vvvvvvvvvvvv -// (we just want to change bucket into bucket_id by hashing it) - -fn migrate_object(o: old::Object) -> Object { - let versions = o - .versions() - .iter() - .cloned() - .map(migrate_object_version) - .collect(); - Object { - bucket_id: blake2sum(o.bucket.as_bytes()), - key: o.key, - versions, - } -} - -fn migrate_object_version(v: old::ObjectVersion) -> ObjectVersion { - ObjectVersion { - uuid: Uuid::try_from(v.uuid.as_slice()).unwrap(), - timestamp: v.timestamp, - state: match v.state { - old::ObjectVersionState::Uploading(h) => { - ObjectVersionState::Uploading(migrate_object_version_headers(h)) - } - old::ObjectVersionState::Complete(d) => { - ObjectVersionState::Complete(migrate_object_version_data(d)) - } - old::ObjectVersionState::Aborted => ObjectVersionState::Aborted, - }, - } -} - -fn migrate_object_version_headers(h: old::ObjectVersionHeaders) -> ObjectVersionHeaders { - ObjectVersionHeaders { - content_type: h.content_type, - other: h.other, - } -} - -fn migrate_object_version_data(d: old::ObjectVersionData) -> ObjectVersionData { - match d { - old::ObjectVersionData::DeleteMarker => ObjectVersionData::DeleteMarker, - old::ObjectVersionData::Inline(m, b) => { - ObjectVersionData::Inline(migrate_object_version_meta(m), b) - } - old::ObjectVersionData::FirstBlock(m, h) => ObjectVersionData::FirstBlock( - migrate_object_version_meta(m), - Hash::try_from(h.as_slice()).unwrap(), - ), - } -} - -fn migrate_object_version_meta(m: old::ObjectVersionMeta) -> ObjectVersionMeta { - ObjectVersionMeta { - headers: migrate_object_version_headers(m.headers), - size: m.size, - etag: m.etag, - } -} diff --git a/src/model/s3/block_ref_table.rs 
b/src/model/s3/block_ref_table.rs new file mode 100644 index 00000000..9b3991bf --- /dev/null +++ b/src/model/s3/block_ref_table.rs @@ -0,0 +1,74 @@ +use serde::{Deserialize, Serialize}; +use std::sync::Arc; + +use garage_util::data::*; + +use garage_table::crdt::Crdt; +use garage_table::*; + +use garage_block::manager::*; + +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub struct BlockRef { + /// Hash (blake2 sum) of the block, used as partition key + pub block: Hash, + + /// Id of the Version for the object containing this block, used as sorting key + pub version: Uuid, + + // Keep track of deleted status + /// Is the Version that contains this block deleted + pub deleted: crdt::Bool, +} + +impl Entry for BlockRef { + fn partition_key(&self) -> &Hash { + &self.block + } + fn sort_key(&self) -> &Uuid { + &self.version + } + fn is_tombstone(&self) -> bool { + self.deleted.get() + } +} + +impl Crdt for BlockRef { + fn merge(&mut self, other: &Self) { + self.deleted.merge(&other.deleted); + } +} + +pub struct BlockRefTable { + pub block_manager: Arc, +} + +impl TableSchema for BlockRefTable { + const TABLE_NAME: &'static str = "block_ref"; + + type P = Hash; + type S = Uuid; + type E = BlockRef; + type Filter = DeletedFilter; + + fn updated(&self, old: Option<&Self::E>, new: Option<&Self::E>) { + #[allow(clippy::or_fun_call)] + let block = &old.or(new).unwrap().block; + let was_before = old.map(|x| !x.deleted.get()).unwrap_or(false); + let is_after = new.map(|x| !x.deleted.get()).unwrap_or(false); + if is_after && !was_before { + if let Err(e) = self.block_manager.block_incref(block) { + warn!("block_incref failed for block {:?}: {}", block, e); + } + } + if was_before && !is_after { + if let Err(e) = self.block_manager.block_decref(block) { + warn!("block_decref failed for block {:?}: {}", block, e); + } + } + } + + fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { + filter.apply(entry.deleted.get()) + } +} diff --git a/src/model/s3/mod.rs b/src/model/s3/mod.rs new file mode 100644 index 00000000..4e94337d --- /dev/null +++ b/src/model/s3/mod.rs @@ -0,0 +1,3 @@ +pub mod block_ref_table; +pub mod object_table; +pub mod version_table; diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs new file mode 100644 index 00000000..3d9a89f7 --- /dev/null +++ b/src/model/s3/object_table.rs @@ -0,0 +1,337 @@ +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; +use std::sync::Arc; + +use garage_util::background::BackgroundRunner; +use garage_util::data::*; + +use garage_table::crdt::*; +use garage_table::replication::TableShardedReplication; +use garage_table::*; + +use crate::s3::version_table::*; + +use garage_model_050::object_table as old; + +/// An object +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub struct Object { + /// The bucket in which the object is stored, used as partition key + pub bucket_id: Uuid, + + /// The key at which the object is stored in its bucket, used as sorting key + pub key: String, + + /// The list of currenty stored versions of the object + versions: Vec, +} + +impl Object { + /// Initialize an Object struct from parts + pub fn new(bucket_id: Uuid, key: String, versions: Vec) -> Self { + let mut ret = Self { + bucket_id, + key, + versions: vec![], + }; + for v in versions { + ret.add_version(v) + .expect("Twice the same ObjectVersion in Object constructor"); + } + ret + } + + /// Adds a version if it wasn't already present + #[allow(clippy::result_unit_err)] + pub fn add_version(&mut 
self, new: ObjectVersion) -> Result<(), ()> { + match self + .versions + .binary_search_by(|v| v.cmp_key().cmp(&new.cmp_key())) + { + Err(i) => { + self.versions.insert(i, new); + Ok(()) + } + Ok(_) => Err(()), + } + } + + /// Get a list of currently stored versions of `Object` + pub fn versions(&self) -> &[ObjectVersion] { + &self.versions[..] + } +} + +/// Informations about a version of an object +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub struct ObjectVersion { + /// Id of the version + pub uuid: Uuid, + /// Timestamp of when the object was created + pub timestamp: u64, + /// State of the version + pub state: ObjectVersionState, +} + +/// State of an object version +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub enum ObjectVersionState { + /// The version is being received + Uploading(ObjectVersionHeaders), + /// The version is fully received + Complete(ObjectVersionData), + /// The version uploaded containded errors or the upload was explicitly aborted + Aborted, +} + +impl Crdt for ObjectVersionState { + fn merge(&mut self, other: &Self) { + use ObjectVersionState::*; + match other { + Aborted => { + *self = Aborted; + } + Complete(b) => match self { + Aborted => {} + Complete(a) => { + a.merge(b); + } + Uploading(_) => { + *self = Complete(b.clone()); + } + }, + Uploading(_) => {} + } + } +} + +/// Data stored in object version +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] +pub enum ObjectVersionData { + /// The object was deleted, this Version is a tombstone to mark it as such + DeleteMarker, + /// The object is short, it's stored inlined + Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec), + /// The object is not short, Hash of first block is stored here, next segments hashes are + /// stored in the version table + FirstBlock(ObjectVersionMeta, Hash), +} + +impl AutoCrdt for ObjectVersionData { + const WARN_IF_DIFFERENT: bool = true; +} + +/// Metadata about the object version +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] +pub struct ObjectVersionMeta { + /// Headers to send to the client + pub headers: ObjectVersionHeaders, + /// Size of the object + pub size: u64, + /// etag of the object + pub etag: String, +} + +/// Additional headers for an object +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] +pub struct ObjectVersionHeaders { + /// Content type of the object + pub content_type: String, + /// Any other http headers to send + pub other: BTreeMap, +} + +impl ObjectVersion { + fn cmp_key(&self) -> (u64, Uuid) { + (self.timestamp, self.uuid) + } + + /// Is the object version currently being uploaded + pub fn is_uploading(&self) -> bool { + matches!(self.state, ObjectVersionState::Uploading(_)) + } + + /// Is the object version completely received + pub fn is_complete(&self) -> bool { + matches!(self.state, ObjectVersionState::Complete(_)) + } + + /// Is the object version available (received and not a tombstone) + pub fn is_data(&self) -> bool { + match self.state { + ObjectVersionState::Complete(ObjectVersionData::DeleteMarker) => false, + ObjectVersionState::Complete(_) => true, + _ => false, + } + } +} + +impl Entry for Object { + fn partition_key(&self) -> &Uuid { + &self.bucket_id + } + fn sort_key(&self) -> &String { + &self.key + } + fn is_tombstone(&self) -> bool { + self.versions.len() == 1 + && self.versions[0].state + == ObjectVersionState::Complete(ObjectVersionData::DeleteMarker) + } +} + +impl Crdt for Object { + 
fn merge(&mut self, other: &Self) { + // Merge versions from other into here + for other_v in other.versions.iter() { + match self + .versions + .binary_search_by(|v| v.cmp_key().cmp(&other_v.cmp_key())) + { + Ok(i) => { + self.versions[i].state.merge(&other_v.state); + } + Err(i) => { + self.versions.insert(i, other_v.clone()); + } + } + } + + // Remove versions which are obsolete, i.e. those that come + // before the last version which .is_complete(). + let last_complete = self + .versions + .iter() + .enumerate() + .rev() + .find(|(_, v)| v.is_complete()) + .map(|(vi, _)| vi); + + if let Some(last_vi) = last_complete { + self.versions = self.versions.drain(last_vi..).collect::>(); + } + } +} + +pub struct ObjectTable { + pub background: Arc, + pub version_table: Arc>, +} + +#[derive(Clone, Copy, Debug, Serialize, Deserialize)] +pub enum ObjectFilter { + IsData, + IsUploading, +} + +impl TableSchema for ObjectTable { + const TABLE_NAME: &'static str = "object"; + + type P = Uuid; + type S = String; + type E = Object; + type Filter = ObjectFilter; + + fn updated(&self, old: Option<&Self::E>, new: Option<&Self::E>) { + let version_table = self.version_table.clone(); + let old = old.cloned(); + let new = new.cloned(); + + self.background.spawn(async move { + if let (Some(old_v), Some(new_v)) = (old, new) { + // Propagate deletion of old versions + for v in old_v.versions.iter() { + let newly_deleted = match new_v + .versions + .binary_search_by(|nv| nv.cmp_key().cmp(&v.cmp_key())) + { + Err(_) => true, + Ok(i) => { + new_v.versions[i].state == ObjectVersionState::Aborted + && v.state != ObjectVersionState::Aborted + } + }; + if newly_deleted { + let deleted_version = + Version::new(v.uuid, old_v.bucket_id, old_v.key.clone(), true); + version_table.insert(&deleted_version).await?; + } + } + } + Ok(()) + }) + } + + fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { + match filter { + ObjectFilter::IsData => entry.versions.iter().any(|v| v.is_data()), + ObjectFilter::IsUploading => entry.versions.iter().any(|v| v.is_uploading()), + } + } + + fn try_migrate(bytes: &[u8]) -> Option { + let old_obj = rmp_serde::decode::from_read_ref::<_, old::Object>(bytes).ok()?; + Some(migrate_object(old_obj)) + } +} + +// vvvvvvvv migration code, stupid stuff vvvvvvvvvvvv +// (we just want to change bucket into bucket_id by hashing it) + +fn migrate_object(o: old::Object) -> Object { + let versions = o + .versions() + .iter() + .cloned() + .map(migrate_object_version) + .collect(); + Object { + bucket_id: blake2sum(o.bucket.as_bytes()), + key: o.key, + versions, + } +} + +fn migrate_object_version(v: old::ObjectVersion) -> ObjectVersion { + ObjectVersion { + uuid: Uuid::try_from(v.uuid.as_slice()).unwrap(), + timestamp: v.timestamp, + state: match v.state { + old::ObjectVersionState::Uploading(h) => { + ObjectVersionState::Uploading(migrate_object_version_headers(h)) + } + old::ObjectVersionState::Complete(d) => { + ObjectVersionState::Complete(migrate_object_version_data(d)) + } + old::ObjectVersionState::Aborted => ObjectVersionState::Aborted, + }, + } +} + +fn migrate_object_version_headers(h: old::ObjectVersionHeaders) -> ObjectVersionHeaders { + ObjectVersionHeaders { + content_type: h.content_type, + other: h.other, + } +} + +fn migrate_object_version_data(d: old::ObjectVersionData) -> ObjectVersionData { + match d { + old::ObjectVersionData::DeleteMarker => ObjectVersionData::DeleteMarker, + old::ObjectVersionData::Inline(m, b) => { + 
ObjectVersionData::Inline(migrate_object_version_meta(m), b) + } + old::ObjectVersionData::FirstBlock(m, h) => ObjectVersionData::FirstBlock( + migrate_object_version_meta(m), + Hash::try_from(h.as_slice()).unwrap(), + ), + } +} + +fn migrate_object_version_meta(m: old::ObjectVersionMeta) -> ObjectVersionMeta { + ObjectVersionMeta { + headers: migrate_object_version_headers(m.headers), + size: m.size, + etag: m.etag, + } +} diff --git a/src/model/s3/version_table.rs b/src/model/s3/version_table.rs new file mode 100644 index 00000000..ad096772 --- /dev/null +++ b/src/model/s3/version_table.rs @@ -0,0 +1,207 @@ +use serde::{Deserialize, Serialize}; +use std::sync::Arc; + +use garage_util::background::BackgroundRunner; +use garage_util::data::*; + +use garage_table::crdt::*; +use garage_table::replication::TableShardedReplication; +use garage_table::*; + +use crate::s3::block_ref_table::*; + +use garage_model_050::version_table as old; + +/// A version of an object +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub struct Version { + /// UUID of the version, used as partition key + pub uuid: Uuid, + + // Actual data: the blocks for this version + // In the case of a multipart upload, also store the etags + // of individual parts and check them when doing CompleteMultipartUpload + /// Is this version deleted + pub deleted: crdt::Bool, + /// list of blocks of data composing the version + pub blocks: crdt::Map, + /// Etag of each part in case of a multipart upload, empty otherwise + pub parts_etags: crdt::Map, + + // Back link to bucket+key so that we can figure if + // this was deleted later on + /// Bucket in which the related object is stored + pub bucket_id: Uuid, + /// Key in which the related object is stored + pub key: String, +} + +impl Version { + pub fn new(uuid: Uuid, bucket_id: Uuid, key: String, deleted: bool) -> Self { + Self { + uuid, + deleted: deleted.into(), + blocks: crdt::Map::new(), + parts_etags: crdt::Map::new(), + bucket_id, + key, + } + } + + pub fn has_part_number(&self, part_number: u64) -> bool { + let case1 = self + .parts_etags + .items() + .binary_search_by(|(k, _)| k.cmp(&part_number)) + .is_ok(); + let case2 = self + .blocks + .items() + .binary_search_by(|(k, _)| k.part_number.cmp(&part_number)) + .is_ok(); + case1 || case2 + } +} + +#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)] +pub struct VersionBlockKey { + /// Number of the part + pub part_number: u64, + /// Offset of this sub-segment in its part + pub offset: u64, +} + +impl Ord for VersionBlockKey { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.part_number + .cmp(&other.part_number) + .then(self.offset.cmp(&other.offset)) + } +} + +impl PartialOrd for VersionBlockKey { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +/// Informations about a single block +#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize, Deserialize)] +pub struct VersionBlock { + /// Blake2 sum of the block + pub hash: Hash, + /// Size of the block + pub size: u64, +} + +impl AutoCrdt for VersionBlock { + const WARN_IF_DIFFERENT: bool = true; +} + +impl Entry for Version { + fn partition_key(&self) -> &Uuid { + &self.uuid + } + fn sort_key(&self) -> &EmptyKey { + &EmptyKey + } + fn is_tombstone(&self) -> bool { + self.deleted.get() + } +} + +impl Crdt for Version { + fn merge(&mut self, other: &Self) { + self.deleted.merge(&other.deleted); + + if self.deleted.get() { + self.blocks.clear(); + self.parts_etags.clear(); + } else { + 
self.blocks.merge(&other.blocks); + self.parts_etags.merge(&other.parts_etags); + } + } +} + +pub struct VersionTable { + pub background: Arc, + pub block_ref_table: Arc>, +} + +impl TableSchema for VersionTable { + const TABLE_NAME: &'static str = "version"; + + type P = Uuid; + type S = EmptyKey; + type E = Version; + type Filter = DeletedFilter; + + fn updated(&self, old: Option<&Self::E>, new: Option<&Self::E>) { + let block_ref_table = self.block_ref_table.clone(); + let old = old.cloned(); + let new = new.cloned(); + + self.background.spawn(async move { + if let (Some(old_v), Some(new_v)) = (old, new) { + // Propagate deletion of version blocks + if new_v.deleted.get() && !old_v.deleted.get() { + let deleted_block_refs = old_v + .blocks + .items() + .iter() + .map(|(_k, vb)| BlockRef { + block: vb.hash, + version: old_v.uuid, + deleted: true.into(), + }) + .collect::>(); + block_ref_table.insert_many(&deleted_block_refs[..]).await?; + } + } + Ok(()) + }) + } + + fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { + filter.apply(entry.deleted.get()) + } + + fn try_migrate(bytes: &[u8]) -> Option { + let old = rmp_serde::decode::from_read_ref::<_, old::Version>(bytes).ok()?; + + let blocks = old + .blocks + .items() + .iter() + .map(|(k, v)| { + ( + VersionBlockKey { + part_number: k.part_number, + offset: k.offset, + }, + VersionBlock { + hash: Hash::try_from(v.hash.as_slice()).unwrap(), + size: v.size, + }, + ) + }) + .collect::>(); + + let parts_etags = old + .parts_etags + .items() + .iter() + .map(|(k, v)| (*k, v.clone())) + .collect::>(); + + Some(Version { + uuid: Hash::try_from(old.uuid.as_slice()).unwrap(), + deleted: crdt::Bool::new(old.deleted.get()), + blocks, + parts_etags, + bucket_id: blake2sum(old.bucket.as_bytes()), + key: old.key, + }) + } +} diff --git a/src/model/version_table.rs b/src/model/version_table.rs deleted file mode 100644 index 839b1f4f..00000000 --- a/src/model/version_table.rs +++ /dev/null @@ -1,204 +0,0 @@ -use serde::{Deserialize, Serialize}; -use std::sync::Arc; - -use garage_util::background::BackgroundRunner; -use garage_util::data::*; - -use garage_table::crdt::*; -use garage_table::replication::TableShardedReplication; -use garage_table::*; - -use crate::block_ref_table::*; - -use garage_model_050::version_table as old; - -/// A version of an object -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] -pub struct Version { - /// UUID of the version, used as partition key - pub uuid: Uuid, - - // Actual data: the blocks for this version - // In the case of a multipart upload, also store the etags - // of individual parts and check them when doing CompleteMultipartUpload - /// Is this version deleted - pub deleted: crdt::Bool, - /// list of blocks of data composing the version - pub blocks: crdt::Map, - /// Etag of each part in case of a multipart upload, empty otherwise - pub parts_etags: crdt::Map, - - // Back link to bucket+key so that we can figure if - // this was deleted later on - /// Bucket in which the related object is stored - pub bucket_id: Uuid, - /// Key in which the related object is stored - pub key: String, -} - -impl Version { - pub fn new(uuid: Uuid, bucket_id: Uuid, key: String, deleted: bool) -> Self { - Self { - uuid, - deleted: deleted.into(), - blocks: crdt::Map::new(), - parts_etags: crdt::Map::new(), - bucket_id, - key, - } - } - - pub fn has_part_number(&self, part_number: u64) -> bool { - let case1 = self - .parts_etags - .items() - .binary_search_by(|(k, _)| k.cmp(&part_number)) - .is_ok(); - let 
case2 = self - .blocks - .items() - .binary_search_by(|(k, _)| k.part_number.cmp(&part_number)) - .is_ok(); - case1 || case2 - } -} - -#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)] -pub struct VersionBlockKey { - /// Number of the part - pub part_number: u64, - /// Offset of this sub-segment in its part - pub offset: u64, -} - -impl Ord for VersionBlockKey { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.part_number - .cmp(&other.part_number) - .then(self.offset.cmp(&other.offset)) - } -} - -impl PartialOrd for VersionBlockKey { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -/// Informations about a single block -#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize, Deserialize)] -pub struct VersionBlock { - /// Blake2 sum of the block - pub hash: Hash, - /// Size of the block - pub size: u64, -} - -impl AutoCrdt for VersionBlock { - const WARN_IF_DIFFERENT: bool = true; -} - -impl Entry for Version { - fn partition_key(&self) -> &Uuid { - &self.uuid - } - fn sort_key(&self) -> &EmptyKey { - &EmptyKey - } - fn is_tombstone(&self) -> bool { - self.deleted.get() - } -} - -impl Crdt for Version { - fn merge(&mut self, other: &Self) { - self.deleted.merge(&other.deleted); - - if self.deleted.get() { - self.blocks.clear(); - self.parts_etags.clear(); - } else { - self.blocks.merge(&other.blocks); - self.parts_etags.merge(&other.parts_etags); - } - } -} - -pub struct VersionTable { - pub background: Arc, - pub block_ref_table: Arc>, -} - -impl TableSchema for VersionTable { - const TABLE_NAME: &'static str = "version"; - - type P = Uuid; - type S = EmptyKey; - type E = Version; - type Filter = DeletedFilter; - - fn updated(&self, old: Option, new: Option) { - let block_ref_table = self.block_ref_table.clone(); - self.background.spawn(async move { - if let (Some(old_v), Some(new_v)) = (old, new) { - // Propagate deletion of version blocks - if new_v.deleted.get() && !old_v.deleted.get() { - let deleted_block_refs = old_v - .blocks - .items() - .iter() - .map(|(_k, vb)| BlockRef { - block: vb.hash, - version: old_v.uuid, - deleted: true.into(), - }) - .collect::>(); - block_ref_table.insert_many(&deleted_block_refs[..]).await?; - } - } - Ok(()) - }) - } - - fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { - filter.apply(entry.deleted.get()) - } - - fn try_migrate(bytes: &[u8]) -> Option { - let old = rmp_serde::decode::from_read_ref::<_, old::Version>(bytes).ok()?; - - let blocks = old - .blocks - .items() - .iter() - .map(|(k, v)| { - ( - VersionBlockKey { - part_number: k.part_number, - offset: k.offset, - }, - VersionBlock { - hash: Hash::try_from(v.hash.as_slice()).unwrap(), - size: v.size, - }, - ) - }) - .collect::>(); - - let parts_etags = old - .parts_etags - .items() - .iter() - .map(|(k, v)| (*k, v.clone())) - .collect::>(); - - Some(Version { - uuid: Hash::try_from(old.uuid.as_slice()).unwrap(), - deleted: crdt::Bool::new(old.deleted.get()), - blocks, - parts_etags, - bucket_id: blake2sum(old.bucket.as_bytes()), - key: old.key, - }) - } -} diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index 46d0dc1e..bed7f44a 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -52,5 +52,6 @@ netapp = { version = "0.4.4", features = ["telemetry"] } hyper = { version = "0.14", features = ["client", "http1", "runtime", "tcp"] } + [features] kubernetes-discovery = [ "kube", "k8s-openapi", "openssl", "schemars" ] diff --git a/src/table/data.rs b/src/table/data.rs index 
ff7965f5..5cb10066 100644 --- a/src/table/data.rs +++ b/src/table/data.rs @@ -1,8 +1,9 @@ use core::borrow::Borrow; +use std::convert::TryInto; use std::sync::Arc; use serde_bytes::ByteBuf; -use sled::Transactional; +use sled::{IVec, Transactional}; use tokio::sync::Notify; use garage_util::data::*; @@ -16,12 +17,13 @@ use crate::gc::GcTodoEntry; use crate::metrics::*; use crate::replication::*; use crate::schema::*; +use crate::util::*; pub struct TableData { system: Arc, - pub(crate) instance: F, - pub(crate) replication: R, + pub instance: F, + pub replication: R, pub store: sled::Tree, @@ -83,18 +85,48 @@ where pub fn read_range( &self, - p: &F::P, - s: &Option, + partition_key: &F::P, + start: &Option, + filter: &Option, + limit: usize, + enumeration_order: EnumerationOrder, + ) -> Result>, Error> { + let partition_hash = partition_key.hash(); + match enumeration_order { + EnumerationOrder::Forward => { + let first_key = match start { + None => partition_hash.to_vec(), + Some(sk) => self.tree_key(partition_key, sk), + }; + let range = self.store.range(first_key..); + self.read_range_aux(partition_hash, range, filter, limit) + } + EnumerationOrder::Reverse => match start { + Some(sk) => { + let last_key = self.tree_key(partition_key, sk); + let range = self.store.range(..=last_key).rev(); + self.read_range_aux(partition_hash, range, filter, limit) + } + None => { + let mut last_key = partition_hash.to_vec(); + let lower = u128::from_be_bytes(last_key[16..32].try_into().unwrap()); + last_key[16..32].copy_from_slice(&u128::to_be_bytes(lower + 1)); + let range = self.store.range(..last_key).rev(); + self.read_range_aux(partition_hash, range, filter, limit) + } + }, + } + } + + fn read_range_aux( + &self, + partition_hash: Hash, + range: impl Iterator>, filter: &Option, limit: usize, ) -> Result>, Error> { - let partition_hash = p.hash(); - let first_key = match s { - None => partition_hash.to_vec(), - Some(sk) => self.tree_key(p, sk), - }; let mut ret = vec![]; - for item in self.store.range(first_key..) { + for item in range { let (key, value) = item?; if &key[..32] != partition_hash.as_slice() { break; @@ -136,17 +168,31 @@ where let update = self.decode_entry(update_bytes)?; let tree_key = self.tree_key(update.partition_key(), update.sort_key()); + self.update_entry_with(&tree_key[..], |ent| match ent { + Some(mut ent) => { + ent.merge(&update); + ent + } + None => update.clone(), + })?; + Ok(()) + } + + pub fn update_entry_with( + &self, + tree_key: &[u8], + f: impl Fn(Option) -> F::E, + ) -> Result, Error> { let changed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| { - let (old_entry, old_bytes, new_entry) = match store.get(&tree_key)? { + let (old_entry, old_bytes, new_entry) = match store.get(tree_key)? 
{ Some(old_bytes) => { let old_entry = self .decode_entry(&old_bytes) .map_err(sled::transaction::ConflictableTransactionError::Abort)?; - let mut new_entry = old_entry.clone(); - new_entry.merge(&update); + let new_entry = f(Some(old_entry.clone())); (Some(old_entry), Some(old_bytes), new_entry) } - None => (None, None, update.clone()), + None => (None, None, f(None)), }; // Scenario 1: the value changed, so of course there is a change @@ -163,8 +209,8 @@ where if value_changed || encoding_changed { let new_bytes_hash = blake2sum(&new_bytes[..]); - mkl_todo.insert(tree_key.clone(), new_bytes_hash.as_slice())?; - store.insert(tree_key.clone(), new_bytes)?; + mkl_todo.insert(tree_key.to_vec(), new_bytes_hash.as_slice())?; + store.insert(tree_key.to_vec(), new_bytes)?; Ok(Some((old_entry, new_entry, new_bytes_hash))) } else { Ok(None) @@ -175,7 +221,7 @@ where self.metrics.internal_update_counter.add(1); let is_tombstone = new_entry.is_tombstone(); - self.instance.updated(old_entry, Some(new_entry)); + self.instance.updated(old_entry.as_ref(), Some(&new_entry)); self.merkle_todo_notify.notify_one(); if is_tombstone { // We are only responsible for GC'ing this item if we are the @@ -187,12 +233,14 @@ where let pk_hash = Hash::try_from(&tree_key[..32]).unwrap(); let nodes = self.replication.write_nodes(&pk_hash); if nodes.first() == Some(&self.system.id) { - GcTodoEntry::new(tree_key, new_bytes_hash).save(&self.gc_todo)?; + GcTodoEntry::new(tree_key.to_vec(), new_bytes_hash).save(&self.gc_todo)?; } } - } - Ok(()) + Ok(Some(new_entry)) + } else { + Ok(None) + } } pub(crate) fn delete_if_equal(self: &Arc, k: &[u8], v: &[u8]) -> Result { @@ -211,7 +259,7 @@ where self.metrics.internal_delete_counter.add(1); let old_entry = self.decode_entry(v)?; - self.instance.updated(Some(old_entry), None); + self.instance.updated(Some(&old_entry), None); self.merkle_todo_notify.notify_one(); } Ok(removed) @@ -235,7 +283,7 @@ where if let Some(old_v) = removed { let old_entry = self.decode_entry(&old_v[..])?; - self.instance.updated(Some(old_entry), None); + self.instance.updated(Some(&old_entry), None); self.merkle_todo_notify.notify_one(); Ok(true) } else { @@ -245,13 +293,13 @@ where // ---- Utility functions ---- - pub(crate) fn tree_key(&self, p: &F::P, s: &F::S) -> Vec { + pub fn tree_key(&self, p: &F::P, s: &F::S) -> Vec { let mut ret = p.hash().to_vec(); ret.extend(s.sort_key()); ret } - pub(crate) fn decode_entry(&self, bytes: &[u8]) -> Result { + pub fn decode_entry(&self, bytes: &[u8]) -> Result { match rmp_serde::decode::from_read_ref::<_, F::E>(bytes) { Ok(x) => Ok(x), Err(e) => match F::try_migrate(bytes) { diff --git a/src/table/schema.rs b/src/table/schema.rs index eba918a2..37327037 100644 --- a/src/table/schema.rs +++ b/src/table/schema.rs @@ -86,7 +86,7 @@ pub trait TableSchema: Send + Sync { // as the update itself is an unchangeable fact that will never go back // due to CRDT logic. Typically errors in propagation of info should be logged // to stderr. 
- fn updated(&self, _old: Option, _new: Option) {} + fn updated(&self, _old: Option<&Self::E>, _new: Option<&Self::E>) {} fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool; } diff --git a/src/table/table.rs b/src/table/table.rs index 7f87a449..2a167604 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -1,4 +1,5 @@ -use std::collections::{BTreeMap, HashMap}; +use std::borrow::Borrow; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use std::time::Duration; @@ -26,8 +27,9 @@ use crate::merkle::*; use crate::replication::*; use crate::schema::*; use crate::sync::*; +use crate::util::*; -const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10); +pub const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10); pub struct Table { pub system: Arc, @@ -45,7 +47,13 @@ pub(crate) enum TableRpc { ReadEntryResponse(Option), // Read range: read all keys in partition P, possibly starting at a certain sort key offset - ReadRange(F::P, Option, Option, usize), + ReadRange { + partition: F::P, + begin_sort_key: Option, + filter: Option, + limit: usize, + enumeration_order: EnumerationOrder, + }, Update(Vec>), } @@ -123,9 +131,13 @@ where Ok(()) } - pub async fn insert_many(&self, entries: &[F::E]) -> Result<(), Error> { + pub async fn insert_many(&self, entries: I) -> Result<(), Error> + where + I: IntoIterator + Send + Sync, + IE: Borrow + Send + Sync, + { let tracer = opentelemetry::global::tracer("garage_table"); - let span = tracer.start(format!("{} insert_many {}", F::TABLE_NAME, entries.len())); + let span = tracer.start(format!("{} insert_many", F::TABLE_NAME)); self.insert_many_internal(entries) .bound_record_duration(&self.data.metrics.put_request_duration) @@ -137,10 +149,15 @@ where Ok(()) } - async fn insert_many_internal(&self, entries: &[F::E]) -> Result<(), Error> { + async fn insert_many_internal(&self, entries: I) -> Result<(), Error> + where + I: IntoIterator + Send + Sync, + IE: Borrow + Send + Sync, + { let mut call_list: HashMap<_, Vec<_>> = HashMap::new(); - for entry in entries.iter() { + for entry in entries.into_iter() { + let entry = entry.borrow(); let hash = entry.partition_key().hash(); let who = self.data.replication.write_nodes(&hash); let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(entry)?)); @@ -261,12 +278,19 @@ where begin_sort_key: Option, filter: Option, limit: usize, + enumeration_order: EnumerationOrder, ) -> Result, Error> { let tracer = opentelemetry::global::tracer("garage_table"); let span = tracer.start(format!("{} get_range", F::TABLE_NAME)); let res = self - .get_range_internal(partition_key, begin_sort_key, filter, limit) + .get_range_internal( + partition_key, + begin_sort_key, + filter, + limit, + enumeration_order, + ) .bound_record_duration(&self.data.metrics.get_request_duration) .with_context(Context::current_with_span(span)) .await?; @@ -282,11 +306,18 @@ where begin_sort_key: Option, filter: Option, limit: usize, + enumeration_order: EnumerationOrder, ) -> Result, Error> { let hash = partition_key.hash(); let who = self.data.replication.read_nodes(&hash); - let rpc = TableRpc::::ReadRange(partition_key.clone(), begin_sort_key, filter, limit); + let rpc = TableRpc::::ReadRange { + partition: partition_key.clone(), + begin_sort_key, + filter, + limit, + enumeration_order, + }; let resps = self .system @@ -302,44 +333,65 @@ where ) .await?; - let mut ret = BTreeMap::new(); - let mut to_repair = BTreeMap::new(); + let mut ret: BTreeMap, F::E> = BTreeMap::new(); + let mut to_repair = 
BTreeSet::new(); for resp in resps { if let TableRpc::Update(entries) = resp { for entry_bytes in entries.iter() { let entry = self.data.decode_entry(entry_bytes.as_slice())?; let entry_key = self.data.tree_key(entry.partition_key(), entry.sort_key()); - match ret.remove(&entry_key) { - None => { - ret.insert(entry_key, Some(entry)); - } - Some(Some(mut prev)) => { - let must_repair = prev != entry; - prev.merge(&entry); - if must_repair { - to_repair.insert(entry_key.clone(), Some(prev.clone())); + match ret.get_mut(&entry_key) { + Some(e) => { + if *e != entry { + e.merge(&entry); + to_repair.insert(entry_key.clone()); } - ret.insert(entry_key, Some(prev)); } - Some(None) => unreachable!(), + None => { + ret.insert(entry_key, entry); + } } } + } else { + return Err(Error::unexpected_rpc_message(resp)); } } + if !to_repair.is_empty() { let self2 = self.clone(); + let to_repair = to_repair + .into_iter() + .map(|k| ret.get(&k).unwrap().clone()) + .collect::>(); self.system.background.spawn_cancellable(async move { - for (_, v) in to_repair.iter_mut() { - self2.repair_on_read(&who[..], v.take().unwrap()).await?; + for v in to_repair { + self2.repair_on_read(&who[..], v).await?; } Ok(()) }); } - let ret_vec = ret - .iter_mut() - .take(limit) - .map(|(_k, v)| v.take().unwrap()) - .collect::>(); + + // At this point, the `ret` btreemap might contain more than `limit` + // items, because nodes might have returned us each `limit` items + // but for different keys. We have to take only the first `limit` items + // in this map, in the specified enumeration order, for two reasons: + // 1. To return to the user no more than the number of items that they requested + // 2. To return only items for which we have a read quorum: we do not know + // that we have a read quorum for the items after the first `limit` + // of them + let ret_vec = match enumeration_order { + EnumerationOrder::Forward => ret + .into_iter() + .take(limit) + .map(|(_k, v)| v) + .collect::>(), + EnumerationOrder::Reverse => ret + .into_iter() + .rev() + .take(limit) + .map(|(_k, v)| v) + .collect::>(), + }; Ok(ret_vec) } @@ -378,8 +430,20 @@ where let value = self.data.read_entry(key, sort_key)?; Ok(TableRpc::ReadEntryResponse(value)) } - TableRpc::ReadRange(key, begin_sort_key, filter, limit) => { - let values = self.data.read_range(key, begin_sort_key, filter, *limit)?; + TableRpc::ReadRange { + partition, + begin_sort_key, + filter, + limit, + enumeration_order, + } => { + let values = self.data.read_range( + partition, + begin_sort_key, + filter, + *limit, + *enumeration_order, + )?; Ok(TableRpc::Update(values)) } TableRpc::Update(pairs) => { diff --git a/src/table/util.rs b/src/table/util.rs index 2a5c3afe..20595a94 100644 --- a/src/table/util.rs +++ b/src/table/util.rs @@ -17,7 +17,7 @@ impl PartitionKey for EmptyKey { } } -#[derive(Clone, Copy, Debug, Serialize, Deserialize)] +#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq)] pub enum DeletedFilter { Any, Deleted, @@ -33,3 +33,19 @@ impl DeletedFilter { } } } + +#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub enum EnumerationOrder { + Forward, + Reverse, +} + +impl EnumerationOrder { + pub fn from_reverse(reverse: bool) -> Self { + if reverse { + Self::Reverse + } else { + Self::Forward + } + } +} diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index f13c1589..95cde531 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -41,3 +41,6 @@ http = "0.2" hyper = "0.14" opentelemetry = { version = "0.17", features = [ 
"rt-tokio", "metrics", "trace" ] } + +[features] +k2v = [] diff --git a/src/util/config.rs b/src/util/config.rs index e4d96476..4d66bfe4 100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -73,7 +73,11 @@ pub struct Config { pub sled_flush_every_ms: u64, /// Configuration for S3 api - pub s3_api: ApiConfig, + pub s3_api: S3ApiConfig, + + /// Configuration for K2V api + #[cfg(feature = "k2v")] + pub k2v_api: Option, /// Configuration for serving files as normal web server pub s3_web: WebConfig, @@ -85,7 +89,7 @@ pub struct Config { /// Configuration for S3 api #[derive(Deserialize, Debug, Clone)] -pub struct ApiConfig { +pub struct S3ApiConfig { /// Address and port to bind for api serving pub api_bind_addr: SocketAddr, /// S3 region to use @@ -95,6 +99,14 @@ pub struct ApiConfig { pub root_domain: Option, } +/// Configuration for K2V api +#[cfg(feature = "k2v")] +#[derive(Deserialize, Debug, Clone)] +pub struct K2VApiConfig { + /// Address and port to bind for api serving + pub api_bind_addr: SocketAddr, +} + /// Configuration for serving files as normal web server #[derive(Deserialize, Debug, Clone)] pub struct WebConfig { diff --git a/src/util/error.rs b/src/util/error.rs index bdb3a69b..8734a0c8 100644 --- a/src/util/error.rs +++ b/src/util/error.rs @@ -44,6 +44,9 @@ pub enum Error { #[error(display = "Tokio semaphore acquire error: {}", _0)] TokioSemAcquire(#[error(source)] tokio::sync::AcquireError), + #[error(display = "Tokio broadcast receive error: {}", _0)] + TokioBcastRecv(#[error(source)] tokio::sync::broadcast::error::RecvError), + #[error(display = "Remote error: {}", _0)] RemoteError(String), diff --git a/src/web/web_server.rs b/src/web/web_server.rs index c3d691d0..867adc51 100644 --- a/src/web/web_server.rs +++ b/src/web/web_server.rs @@ -20,8 +20,8 @@ use crate::error::*; use garage_api::error::{Error as ApiError, OkOrBadRequest, OkOrInternalError}; use garage_api::helpers::{authority_to_host, host_to_bucket}; -use garage_api::s3_cors::{add_cors_headers, find_matching_cors_rule, handle_options_for_bucket}; -use garage_api::s3_get::{handle_get, handle_head}; +use garage_api::s3::cors::{add_cors_headers, find_matching_cors_rule, handle_options_for_bucket}; +use garage_api::s3::get::{handle_get, handle_head}; use garage_model::garage::Garage; -- cgit v1.2.3 From 7b474855e3a8491fcdde69d12d3fbae27f520383 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 5 May 2022 10:56:44 +0200 Subject: Make background runner terminate correctly --- src/util/background.rs | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'src') diff --git a/src/util/background.rs b/src/util/background.rs index bfdaaf1e..d35425f5 100644 --- a/src/util/background.rs +++ b/src/util/background.rs @@ -6,7 +6,9 @@ use std::time::Duration; use futures::future::*; use futures::select; -use tokio::sync::{mpsc, watch, Mutex}; +use futures::stream::FuturesUnordered; +use futures::StreamExt; +use tokio::sync::{mpsc, mpsc::error::TryRecvError, watch, Mutex}; use crate::error::Error; @@ -30,26 +32,31 @@ impl BackgroundRunner { let stop_signal_2 = stop_signal.clone(); let await_all_done = tokio::spawn(async move { + let mut workers = FuturesUnordered::new(); + let mut shutdown_timer = 0; loop { - let wkr = { - select! 
{ - item = worker_out.recv().fuse() => { - match item { - Some(x) => x, - None => break, - } + let closed = match worker_out.try_recv() { + Ok(wkr) => { + workers.push(wkr); + false + } + Err(TryRecvError::Empty) => false, + Err(TryRecvError::Disconnected) => true, + }; + select! { + res = workers.next() => { + if let Some(Err(e)) = res { + error!("Worker exited with error: {}", e); } - _ = tokio::time::sleep(Duration::from_secs(5)).fuse() => { - if *stop_signal_2.borrow() { + } + _ = tokio::time::sleep(Duration::from_secs(1)).fuse() => { + if closed || *stop_signal_2.borrow() { + shutdown_timer += 1; + if shutdown_timer >= 10 { break; - } else { - continue; } } } - }; - if let Err(e) = wkr.await { - error!("Error while awaiting for worker: {}", e); } } }); -- cgit v1.2.3 From c692f55d5ce2c3ed08db7fbc4844debcc0aeb134 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 17 May 2022 11:50:23 +0200 Subject: K2V: Fix `end` parameter and add tests (fix #305) --- src/api/k2v/range.rs | 6 ++- src/garage/tests/k2v/batch.rs | 89 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 93 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/api/k2v/range.rs b/src/api/k2v/range.rs index cd019723..295c34aa 100644 --- a/src/api/k2v/range.rs +++ b/src/api/k2v/range.rs @@ -74,7 +74,11 @@ where } } if let Some(e) = end { - if entry.sort_key() == e { + let is_finished = match enumeration_order { + EnumerationOrder::Forward => entry.sort_key() >= e, + EnumerationOrder::Reverse => entry.sort_key() <= e, + }; + if is_finished { return Ok((entries, false, None)); } } diff --git a/src/garage/tests/k2v/batch.rs b/src/garage/tests/k2v/batch.rs index 1182a298..acae1910 100644 --- a/src/garage/tests/k2v/batch.rs +++ b/src/garage/tests/k2v/batch.rs @@ -92,7 +92,9 @@ async fn test_batch() { br#"[ {"partitionKey": "root"}, {"partitionKey": "root", "start": "c"}, + {"partitionKey": "root", "start": "c", "end": "dynamite"}, {"partitionKey": "root", "start": "c", "reverse": true, "end": "a"}, + {"partitionKey": "root", "start": "c", "reverse": true, "end": "azerty"}, {"partitionKey": "root", "limit": 1}, {"partitionKey": "root", "prefix": "d"} ]"# @@ -147,6 +149,24 @@ async fn test_batch() { "more": false, "nextStart": null, }, + { + "partitionKey": "root", + "prefix": null, + "start": "c", + "end": "dynamite", + "limit": null, + "reverse": false, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap())]}, + {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1").unwrap())]}, + {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap())]}, + ], + "more": false, + "nextStart": null, + }, { "partitionKey": "root", "prefix": null, @@ -164,6 +184,23 @@ async fn test_batch() { "more": false, "nextStart": null, }, + { + "partitionKey": "root", + "prefix": null, + "start": "c", + "end": "azerty", + "limit": null, + "reverse": true, + "conflictsOnly": false, + "tombstones": false, + "singleItem": false, + "items": [ + {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap())]}, + {"sk": "b", "ct": ct.get("b").unwrap(), "v": [base64::encode(values.get("b").unwrap())]}, + ], + "more": false, + "nextStart": null, + }, { "partitionKey": "root", "prefix": null, @@ -465,6 +502,34 @@ async fn test_batch() { ]) ); + // update our known tombstones + for sk in ["a", "b", "d.1", "d.2"] { + let res = ctx + .k2v + 
.request + .builder(bucket.clone()) + .path("root") + .query_param("sort_key", Some(sk)) + .signed_header("accept", "application/octet-stream") + .send() + .await + .unwrap(); + assert_eq!(res.status(), 204); + assert_eq!( + res.headers().get("content-type").unwrap().to_str().unwrap(), + "application/octet-stream" + ); + ct.insert( + sk, + res.headers() + .get("x-garage-causality-token") + .unwrap() + .to_str() + .unwrap() + .to_string(), + ); + } + let res = ctx .k2v .request @@ -473,7 +538,8 @@ async fn test_batch() { .body( br#"[ {"partitionKey": "root"}, - {"partitionKey": "root", "reverse": true} + {"partitionKey": "root", "reverse": true}, + {"partitionKey": "root", "tombstones": true} ]"# .to_vec(), ) @@ -520,6 +586,27 @@ async fn test_batch() { "more": false, "nextStart": null, }, + { + "partitionKey": "root", + "prefix": null, + "start": null, + "end": null, + "limit": null, + "reverse": false, + "conflictsOnly": false, + "tombstones": true, + "singleItem": false, + "items": [ + {"sk": "a", "ct": ct.get("a").unwrap(), "v": [null]}, + {"sk": "b", "ct": ct.get("b").unwrap(), "v": [null]}, + {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap()), base64::encode(values.get("c'").unwrap())]}, + {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [null]}, + {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [null]}, + {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]}, + ], + "more": false, + "nextStart": null, + }, ]) ); } -- cgit v1.2.3 From 64c193e3dbb536d5d3c2881bc9aebbb3e4e6272e Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Wed, 18 May 2022 22:24:09 +0200 Subject: Add a K2V client library and CLI (#303) lib.rs could use getting split in modules, but I'm not sure how exactly Co-authored-by: trinity-1686a Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/303 Co-authored-by: trinity-1686a Co-committed-by: trinity-1686a --- src/garage/cli/cmd.rs | 1 + src/garage/cli/layout.rs | 1 + src/garage/cli/util.rs | 30 +- src/k2v-client/Cargo.toml | 27 ++ src/k2v-client/README.md | 25 ++ src/k2v-client/src/bin/k2v-cli.rs | 466 +++++++++++++++++++++++++++++++ src/k2v-client/src/error.rs | 22 ++ src/k2v-client/src/lib.rs | 566 ++++++++++++++++++++++++++++++++++++++ src/util/formater.rs | 28 ++ src/util/lib.rs | 1 + 10 files changed, 1138 insertions(+), 29 deletions(-) create mode 100644 src/k2v-client/Cargo.toml create mode 100644 src/k2v-client/README.md create mode 100644 src/k2v-client/src/bin/k2v-cli.rs create mode 100644 src/k2v-client/src/error.rs create mode 100644 src/k2v-client/src/lib.rs create mode 100644 src/util/formater.rs (limited to 'src') diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 2a799868..b2dd8f14 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -1,6 +1,7 @@ use std::collections::HashSet; use garage_util::error::*; +use garage_util::formater::format_table; use garage_rpc::layout::*; use garage_rpc::system::*; diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 88941d78..0247c32b 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -1,6 +1,7 @@ use garage_util::crdt::Crdt; use garage_util::data::*; use garage_util::error::*; +use garage_util::formater::format_table; use garage_rpc::layout::*; use garage_rpc::system::*; diff --git a/src/garage/cli/util.rs b/src/garage/cli/util.rs index fe11ad44..6d73be3a 100644 --- a/src/garage/cli/util.rs +++ b/src/garage/cli/util.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use 
garage_util::crdt::*; use garage_util::data::Uuid; use garage_util::error::*; +use garage_util::formater::format_table; use garage_model::bucket_table::*; use garage_model::key_table::*; @@ -173,35 +174,6 @@ pub fn print_bucket_info(bucket: &Bucket, relevant_keys: &HashMap) }; } -pub fn format_table(data: Vec) { - let data = data - .iter() - .map(|s| s.split('\t').collect::>()) - .collect::>(); - - let columns = data.iter().map(|row| row.len()).fold(0, std::cmp::max); - let mut column_size = vec![0; columns]; - - let mut out = String::new(); - - for row in data.iter() { - for (i, col) in row.iter().enumerate() { - column_size[i] = std::cmp::max(column_size[i], col.chars().count()); - } - } - - for row in data.iter() { - for (col, col_len) in row[..row.len() - 1].iter().zip(column_size.iter()) { - out.push_str(col); - (0..col_len - col.chars().count() + 2).for_each(|_| out.push(' ')); - } - out.push_str(row[row.len() - 1]); - out.push('\n'); - } - - print!("{}", out); -} - pub fn find_matching_node( cand: impl std::iter::Iterator, pattern: &str, diff --git a/src/k2v-client/Cargo.toml b/src/k2v-client/Cargo.toml new file mode 100644 index 00000000..84c6b8b2 --- /dev/null +++ b/src/k2v-client/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "k2v-client" +version = "0.1.0" +edition = "2018" + +[dependencies] +base64 = "0.13.0" +http = "0.2.6" +rusoto_core = "0.48.0" +rusoto_credential = "0.48.0" +rusoto_signature = "0.48.0" +serde = "1.0.137" +serde_json = "1.0.81" +thiserror = "1.0.31" +tokio = "1.17.0" + +# cli deps +clap = { version = "3.1.18", optional = true, features = ["derive", "env"] } +garage_util = { path = "../util", optional = true } + + +[features] +cli = ["clap", "tokio/fs", "tokio/io-std", "garage_util"] + +[[bin]] +name = "k2v-cli" +required-features = ["cli"] diff --git a/src/k2v-client/README.md b/src/k2v-client/README.md new file mode 100644 index 00000000..db454805 --- /dev/null +++ b/src/k2v-client/README.md @@ -0,0 +1,25 @@ +Example usage: +```sh +# all these values can be provided on the cli instead +export AWS_ACCESS_KEY_ID=GK123456 +export AWS_SECRET_ACCESS_KEY=0123..789 +export AWS_REGION=garage +export K2V_ENDPOINT=http://172.30.2.1:3903 +export K2V_BUCKET=my-bucket + +cargo run --features=cli -- read-range my-partition-key --all + +cargo run --features=cli -- insert my-partition-key my-sort-key --text "my string1" +cargo run --features=cli -- insert my-partition-key my-sort-key --text "my string2" +cargo run --features=cli -- insert my-partition-key my-sort-key2 --text "my string" + +cargo run --features=cli -- read-range my-partition-key --all + +causality=$(cargo run --features=cli -- read my-partition-key my-sort-key2 -b | head -n1) +cargo run --features=cli -- delete my-partition-key my-sort-key2 -c $causality + +causality=$(cargo run --features=cli -- read my-partition-key my-sort-key -b | head -n1) +cargo run --features=cli -- insert my-partition-key my-sort-key --text "my string3" -c $causality + +cargo run --features=cli -- read-range my-partition-key --all +``` diff --git a/src/k2v-client/src/bin/k2v-cli.rs b/src/k2v-client/src/bin/k2v-cli.rs new file mode 100644 index 00000000..38c39361 --- /dev/null +++ b/src/k2v-client/src/bin/k2v-cli.rs @@ -0,0 +1,466 @@ +use k2v_client::*; + +use garage_util::formater::format_table; + +use rusoto_core::credential::AwsCredentials; +use rusoto_core::Region; + +use clap::{Parser, Subcommand}; + +/// K2V command line interface +#[derive(Parser, Debug)] +#[clap(author, version, about, long_about = None)] +struct Args { + /// 
Name of the region to use + #[clap(short, long, env = "AWS_REGION", default_value = "garage")] + region: String, + /// Url of the endpoint to connect to + #[clap(short, long, env = "K2V_ENDPOINT")] + endpoint: String, + /// Access key ID + #[clap(short, long, env = "AWS_ACCESS_KEY_ID")] + key_id: String, + /// Access key ID + #[clap(short, long, env = "AWS_SECRET_ACCESS_KEY")] + secret: String, + /// Bucket name + #[clap(short, long, env = "K2V_BUCKET")] + bucket: String, + #[clap(subcommand)] + command: Command, +} + +#[derive(Subcommand, Debug)] +enum Command { + /// Insert a single value + Insert { + /// Partition key to insert to + partition_key: String, + /// Sort key to insert to + sort_key: String, + /// Causality of the insertion + #[clap(short, long)] + causality: Option, + /// Value to insert + #[clap(flatten)] + value: Value, + }, + /// Read a single value + Read { + /// Partition key to read from + partition_key: String, + /// Sort key to read from + sort_key: String, + /// Output formating + #[clap(flatten)] + output_kind: ReadOutputKind, + }, + /// Delete a single value + Delete { + /// Partition key to delete from + partition_key: String, + /// Sort key to delete from + sort_key: String, + /// Causality information + #[clap(short, long)] + causality: String, + }, + /// List partition keys + ReadIndex { + /// Output formating + #[clap(flatten)] + output_kind: BatchOutputKind, + /// Output only partition keys matching this filter + #[clap(flatten)] + filter: Filter, + }, + /// Read a range of sort keys + ReadRange { + /// Partition key to read from + partition_key: String, + /// Output formating + #[clap(flatten)] + output_kind: BatchOutputKind, + /// Output only sort keys matching this filter + #[clap(flatten)] + filter: Filter, + }, + /// Delete a range of sort keys + DeleteRange { + /// Partition key to delete from + partition_key: String, + /// Output formating + #[clap(flatten)] + output_kind: BatchOutputKind, + /// Delete only sort keys matching this filter + #[clap(flatten)] + filter: Filter, + }, +} + +/// Where to read a value from +#[derive(Parser, Debug)] +#[clap(group = clap::ArgGroup::new("value").multiple(false).required(true))] +struct Value { + /// Read value from a file. use - to read from stdin + #[clap(short, long, group = "value")] + file: Option, + /// Read a base64 value from commandline + #[clap(short, long, group = "value")] + b64: Option, + /// Read a raw (UTF-8) value from the commandline + #[clap(short, long, group = "value")] + text: Option, +} + +impl Value { + async fn to_data(&self) -> Result, Error> { + if let Some(ref text) = self.text { + Ok(text.as_bytes().to_vec()) + } else if let Some(ref b64) = self.b64 { + base64::decode(b64).map_err(|_| Error::Message("invalid base64 input".into())) + } else if let Some(ref path) = self.file { + use tokio::io::AsyncReadExt; + if path == "-" { + let mut file = tokio::io::stdin(); + let mut vec = Vec::new(); + file.read_to_end(&mut vec).await?; + Ok(vec) + } else { + let mut file = tokio::fs::File::open(path).await?; + let mut vec = Vec::new(); + file.read_to_end(&mut vec).await?; + Ok(vec) + } + } else { + unreachable!("Value must have one option set") + } + } +} + +#[derive(Parser, Debug)] +#[clap(group = clap::ArgGroup::new("output-kind").multiple(false).required(false))] +struct ReadOutputKind { + /// Base64 output. Conflicts are line separated, first line is causality token + #[clap(short, long, group = "output-kind")] + b64: bool, + /// Raw output. 
Conflicts generate error, causality token is not returned + #[clap(short, long, group = "output-kind")] + raw: bool, + /// Human formated output + #[clap(short = 'H', long, group = "output-kind")] + human: bool, + /// JSON formated output + #[clap(short, long, group = "output-kind")] + json: bool, +} + +impl ReadOutputKind { + fn display_output(&self, val: CausalValue) -> ! { + use std::io::Write; + use std::process::exit; + + if self.json { + let stdout = std::io::stdout(); + serde_json::to_writer_pretty(stdout, &val).unwrap(); + exit(0); + } + + if self.raw { + let mut val = val.value; + if val.len() != 1 { + eprintln!( + "Raw mode can only read non-concurent values, found {} values, expected 1", + val.len() + ); + exit(1); + } + let val = val.pop().unwrap(); + match val { + K2vValue::Value(v) => { + std::io::stdout().write_all(&v).unwrap(); + exit(0); + } + K2vValue::Tombstone => { + eprintln!("Expected value, found tombstone"); + exit(2); + } + } + } + + let causality: String = val.causality.into(); + if self.b64 { + println!("{}", causality); + for val in val.value { + match val { + K2vValue::Value(v) => { + println!("{}", base64::encode(&v)) + } + K2vValue::Tombstone => { + println!(); + } + } + } + exit(0); + } + + // human + println!("causality: {}", causality); + println!("values:"); + for val in val.value { + match val { + K2vValue::Value(v) => { + if let Ok(string) = std::str::from_utf8(&v) { + println!(" utf-8: {}", string); + } else { + println!(" base64: {}", base64::encode(&v)); + } + } + K2vValue::Tombstone => { + println!(" tombstone"); + } + } + } + exit(0); + } +} + +#[derive(Parser, Debug)] +#[clap(group = clap::ArgGroup::new("output-kind").multiple(false).required(false))] +struct BatchOutputKind { + /// Human formated output + #[clap(short = 'H', long, group = "output-kind")] + human: bool, + /// JSON formated output + #[clap(short, long, group = "output-kind")] + json: bool, +} + +/// Filter for batch operations +#[derive(Parser, Debug)] +#[clap(group = clap::ArgGroup::new("filter").multiple(true).required(true))] +struct Filter { + /// Match only keys starting with this prefix + #[clap(short, long, group = "filter")] + prefix: Option, + /// Match only keys lexicographically after this key (including this key itself) + #[clap(short, long, group = "filter")] + start: Option, + /// Match only keys lexicographically before this key (excluding this key) + #[clap(short, long, group = "filter")] + end: Option, + /// Only match the first X keys + #[clap(short, long)] + limit: Option, + /// Return keys in reverse order + #[clap(short, long)] + reverse: bool, + /// Return only keys where conflict happened + #[clap(short, long)] + conflicts_only: bool, + /// Also include keys storing only tombstones + #[clap(short, long)] + tombstones: bool, + /// Return any key + #[clap(short, long, group = "filter")] + all: bool, +} + +impl Filter { + fn k2v_filter(&self) -> k2v_client::Filter<'_> { + k2v_client::Filter { + start: self.start.as_deref(), + end: self.end.as_deref(), + prefix: self.prefix.as_deref(), + limit: self.limit, + reverse: self.reverse, + } + } +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let args = Args::parse(); + + let region = Region::Custom { + name: args.region, + endpoint: args.endpoint, + }; + + let creds = AwsCredentials::new(args.key_id, args.secret, None, None); + + let client = K2vClient::new(region, args.bucket, creds, None)?; + + match args.command { + Command::Insert { + partition_key, + sort_key, + causality, + value, + } => { + client 
+ .insert_item( + &partition_key, + &sort_key, + value.to_data().await?, + causality.map(Into::into), + ) + .await?; + } + Command::Delete { + partition_key, + sort_key, + causality, + } => { + client + .delete_item(&partition_key, &sort_key, causality.into()) + .await?; + } + Command::Read { + partition_key, + sort_key, + output_kind, + } => { + let res = client.read_item(&partition_key, &sort_key).await?; + output_kind.display_output(res); + } + Command::ReadIndex { + output_kind, + filter, + } => { + if filter.conflicts_only || filter.tombstones { + return Err(Error::Message( + "conlicts-only and tombstones are invalid for read-index".into(), + )); + } + let res = client.read_index(filter.k2v_filter()).await?; + if output_kind.json { + let values = res + .items + .into_iter() + .map(|(k, v)| { + let mut value = serde_json::to_value(v).unwrap(); + value + .as_object_mut() + .unwrap() + .insert("sort_key".to_owned(), k.into()); + value + }) + .collect::>(); + let json = serde_json::json!({ + "next_key": res.next_start, + "values": values, + }); + + let stdout = std::io::stdout(); + serde_json::to_writer_pretty(stdout, &json).unwrap(); + } else { + if let Some(next) = res.next_start { + println!("next key: {}", next); + } + + let mut to_print = Vec::new(); + to_print.push(format!("key:\tentries\tconflicts\tvalues\tbytes")); + for (k, v) in res.items { + to_print.push(format!( + "{}\t{}\t{}\t{}\t{}", + k, v.entries, v.conflicts, v.values, v.bytes + )); + } + format_table(to_print); + } + } + Command::ReadRange { + partition_key, + output_kind, + filter, + } => { + let op = BatchReadOp { + partition_key: &partition_key, + filter: filter.k2v_filter(), + conflicts_only: filter.conflicts_only, + tombstones: filter.tombstones, + single_item: false, + }; + let mut res = client.read_batch(&[op]).await?; + let res = res.pop().unwrap(); + if output_kind.json { + let values = res + .items + .into_iter() + .map(|(k, v)| { + let mut value = serde_json::to_value(v).unwrap(); + value + .as_object_mut() + .unwrap() + .insert("sort_key".to_owned(), k.into()); + value + }) + .collect::>(); + let json = serde_json::json!({ + "next_key": res.next_start, + "values": values, + }); + + let stdout = std::io::stdout(); + serde_json::to_writer_pretty(stdout, &json).unwrap(); + } else { + if let Some(next) = res.next_start { + println!("next key: {}", next); + } + for (key, values) in res.items { + println!("key: {}", key); + let causality: String = values.causality.into(); + println!("causality: {}", causality); + for value in values.value { + match value { + K2vValue::Value(v) => { + if let Ok(string) = std::str::from_utf8(&v) { + println!(" value(utf-8): {}", string); + } else { + println!(" value(base64): {}", base64::encode(&v)); + } + } + K2vValue::Tombstone => { + println!(" tombstone"); + } + } + } + } + } + } + Command::DeleteRange { + partition_key, + output_kind, + filter, + } => { + let op = BatchDeleteOp { + partition_key: &partition_key, + prefix: filter.prefix.as_deref(), + start: filter.start.as_deref(), + end: filter.end.as_deref(), + single_item: false, + }; + if filter.reverse + || filter.conflicts_only + || filter.tombstones + || filter.limit.is_some() + { + return Err(Error::Message( + "limit, conlicts-only, reverse and tombstones are invalid for delete-range" + .into(), + )); + } + + let res = client.delete_batch(&[op]).await?; + + if output_kind.json { + println!("{}", res[0]); + } else { + println!("deleted {} keys", res[0]); + } + } + } + + Ok(()) +} diff --git a/src/k2v-client/src/error.rs 
b/src/k2v-client/src/error.rs new file mode 100644 index 00000000..62357934 --- /dev/null +++ b/src/k2v-client/src/error.rs @@ -0,0 +1,22 @@ +use std::borrow::Cow; + +use thiserror::Error; + +/// Errors returned by this crate +#[derive(Error, Debug)] +pub enum Error { + #[error("received invalid response: {0}")] + InvalidResponse(Cow<'static, str>), + #[error("not found")] + NotFound, + #[error("io error: {0}")] + IoError(#[from] std::io::Error), + #[error("rusoto tls error: {0}")] + RusotoTls(#[from] rusoto_core::request::TlsError), + #[error("rusoto http error: {0}")] + RusotoHttp(#[from] rusoto_core::HttpDispatchError), + #[error("deserialization error: {0}")] + Deserialization(#[from] serde_json::Error), + #[error("{0}")] + Message(Cow<'static, str>), +} diff --git a/src/k2v-client/src/lib.rs b/src/k2v-client/src/lib.rs new file mode 100644 index 00000000..ba1cd6ea --- /dev/null +++ b/src/k2v-client/src/lib.rs @@ -0,0 +1,566 @@ +use std::collections::BTreeMap; +use std::time::Duration; + +use http::header::{ACCEPT, CONTENT_LENGTH, CONTENT_TYPE}; +use http::status::StatusCode; +use http::HeaderMap; + +use rusoto_core::{ByteStream, DispatchSignedRequest, HttpClient}; +use rusoto_credential::AwsCredentials; +use rusoto_signature::region::Region; +use rusoto_signature::signature::SignedRequest; +use serde::de::Error as DeError; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +use tokio::io::AsyncReadExt; + +mod error; + +pub use error::Error; + +const DEFAULT_TIMEOUT: Duration = Duration::from_secs(5); +const DEFAULT_POLL_TIMEOUT: Duration = Duration::from_secs(300); +const SERVICE: &str = "k2v"; +const GARAGE_CAUSALITY_TOKEN: &str = "X-Garage-Causality-Token"; + +/// Client used to query a K2V server. +pub struct K2vClient { + region: Region, + bucket: String, + creds: AwsCredentials, + client: HttpClient, +} + +impl K2vClient { + /// Create a new K2V client. + pub fn new( + region: Region, + bucket: String, + creds: AwsCredentials, + user_agent: Option, + ) -> Result { + let mut client = HttpClient::new()?; + if let Some(ua) = user_agent { + client.local_agent_prepend(ua); + } else { + client.local_agent_prepend(format!("k2v/{}", env!("CARGO_PKG_VERSION"))); + } + Ok(K2vClient { + region, + bucket, + creds, + client, + }) + } + + /// Perform a ReadItem request, reading the value(s) stored for a single pk+sk. 
+ pub async fn read_item( + &self, + partition_key: &str, + sort_key: &str, + ) -> Result { + let mut req = SignedRequest::new( + "GET", + SERVICE, + &self.region, + &format!("/{}/{}", self.bucket, partition_key), + ); + req.add_param("sort_key", sort_key); + req.add_header(ACCEPT, "application/octet-stream, application/json"); + + let res = self.dispatch(req, None).await?; + + let causality = res + .causality_token + .ok_or_else(|| Error::InvalidResponse("missing causality token".into()))?; + + if res.status == StatusCode::NO_CONTENT { + return Ok(CausalValue { + causality, + value: vec![K2vValue::Tombstone], + }); + } + + match res.content_type.as_deref() { + Some("application/octet-stream") => Ok(CausalValue { + causality, + value: vec![K2vValue::Value(res.body)], + }), + Some("application/json") => { + let value = serde_json::from_slice(&res.body)?; + Ok(CausalValue { causality, value }) + } + Some(ct) => Err(Error::InvalidResponse( + format!("invalid content type: {}", ct).into(), + )), + None => Err(Error::InvalidResponse("missing content type".into())), + } + } + + /// Perform a PollItem request, waiting for the value(s) stored for a single pk+sk to be + /// updated. + pub async fn poll_item( + &self, + partition_key: &str, + sort_key: &str, + causality: CausalityToken, + timeout: Option, + ) -> Result, Error> { + let timeout = timeout.unwrap_or(DEFAULT_POLL_TIMEOUT); + + let mut req = SignedRequest::new( + "GET", + SERVICE, + &self.region, + &format!("/{}/{}", self.bucket, partition_key), + ); + req.add_param("sort_key", sort_key); + req.add_param("causality_token", &causality.0); + req.add_param("timeout", &timeout.as_secs().to_string()); + req.add_header(ACCEPT, "application/octet-stream, application/json"); + + let res = self.dispatch(req, Some(timeout + DEFAULT_TIMEOUT)).await?; + + let causality = res + .causality_token + .ok_or_else(|| Error::InvalidResponse("missing causality token".into()))?; + + if res.status == StatusCode::NOT_MODIFIED { + return Ok(None); + } + + if res.status == StatusCode::NO_CONTENT { + return Ok(Some(CausalValue { + causality, + value: vec![K2vValue::Tombstone], + })); + } + + match res.content_type.as_deref() { + Some("application/octet-stream") => Ok(Some(CausalValue { + causality, + value: vec![K2vValue::Value(res.body)], + })), + Some("application/json") => { + let value = serde_json::from_slice(&res.body)?; + Ok(Some(CausalValue { causality, value })) + } + Some(ct) => Err(Error::InvalidResponse( + format!("invalid content type: {}", ct).into(), + )), + None => Err(Error::InvalidResponse("missing content type".into())), + } + } + + /// Perform an InsertItem request, inserting a value for a single pk+sk. + pub async fn insert_item( + &self, + partition_key: &str, + sort_key: &str, + value: Vec, + causality: Option, + ) -> Result<(), Error> { + let mut req = SignedRequest::new( + "PUT", + SERVICE, + &self.region, + &format!("/{}/{}", self.bucket, partition_key), + ); + req.add_param("sort_key", sort_key); + req.set_payload(Some(value)); + + if let Some(causality) = causality { + req.add_header(GARAGE_CAUSALITY_TOKEN, &causality.0); + } + + self.dispatch(req, None).await?; + Ok(()) + } + + /// Perform a DeleteItem request, deleting the value(s) stored for a single pk+sk. 
+ pub async fn delete_item( + &self, + partition_key: &str, + sort_key: &str, + causality: CausalityToken, + ) -> Result<(), Error> { + let mut req = SignedRequest::new( + "DELETE", + SERVICE, + &self.region, + &format!("/{}/{}", self.bucket, partition_key), + ); + req.add_param("sort_key", sort_key); + req.add_header(GARAGE_CAUSALITY_TOKEN, &causality.0); + + self.dispatch(req, None).await?; + Ok(()) + } + + /// Perform a ReadIndex request, listing partition key which have at least one associated + /// sort key, and which matches the filter. + pub async fn read_index( + &self, + filter: Filter<'_>, + ) -> Result, Error> { + let mut req = + SignedRequest::new("GET", SERVICE, &self.region, &format!("/{}", self.bucket)); + filter.insert_params(&mut req); + + let res = self.dispatch(req, None).await?; + + let resp: ReadIndexResponse = serde_json::from_slice(&res.body)?; + + let items = resp + .partition_keys + .into_iter() + .map(|ReadIndexItem { pk, info }| (pk, info)) + .collect(); + + Ok(PaginatedRange { + items, + next_start: resp.next_start, + }) + } + + /// Perform an InsertBatch request, inserting multiple values at once. Note: this operation is + /// *not* atomic: it is possible for some sub-operations to fails and others to success. In + /// that case, failure is reported. + pub async fn insert_batch(&self, operations: &[BatchInsertOp<'_>]) -> Result<(), Error> { + let mut req = + SignedRequest::new("POST", SERVICE, &self.region, &format!("/{}", self.bucket)); + + let payload = serde_json::to_vec(operations)?; + req.set_payload(Some(payload)); + self.dispatch(req, None).await?; + Ok(()) + } + + /// Perform a ReadBatch request, reading multiple values or range of values at once. + pub async fn read_batch( + &self, + operations: &[BatchReadOp<'_>], + ) -> Result>, Error> { + let mut req = + SignedRequest::new("POST", SERVICE, &self.region, &format!("/{}", self.bucket)); + req.add_param("search", ""); + + let payload = serde_json::to_vec(operations)?; + req.set_payload(Some(payload)); + let res = self.dispatch(req, None).await?; + + let resp: Vec = serde_json::from_slice(&res.body)?; + + Ok(resp + .into_iter() + .map(|e| PaginatedRange { + items: e + .items + .into_iter() + .map(|BatchReadItem { sk, ct, v }| { + ( + sk, + CausalValue { + causality: ct, + value: v, + }, + ) + }) + .collect(), + next_start: e.next_start, + }) + .collect()) + } + + /// Perform a DeleteBatch request, deleting mutiple values or range of values at once, without + /// providing causality information. 
+ pub async fn delete_batch(&self, operations: &[BatchDeleteOp<'_>]) -> Result, Error> { + let mut req = + SignedRequest::new("POST", SERVICE, &self.region, &format!("/{}", self.bucket)); + req.add_param("delete", ""); + + let payload = serde_json::to_vec(operations)?; + req.set_payload(Some(payload)); + let res = self.dispatch(req, None).await?; + + let resp: Vec = serde_json::from_slice(&res.body)?; + + Ok(resp.into_iter().map(|r| r.deleted_items).collect()) + } + + async fn dispatch( + &self, + mut req: SignedRequest, + timeout: Option, + ) -> Result { + req.sign(&self.creds); + let mut res = self + .client + .dispatch(req, Some(timeout.unwrap_or(DEFAULT_TIMEOUT))) + .await?; + + let causality_token = res + .headers + .remove(GARAGE_CAUSALITY_TOKEN) + .map(CausalityToken); + let content_type = res.headers.remove(CONTENT_TYPE); + + let body = match res.status { + StatusCode::OK => read_body(&mut res.headers, res.body).await?, + StatusCode::NO_CONTENT => Vec::new(), + StatusCode::NOT_FOUND => return Err(Error::NotFound), + StatusCode::NOT_MODIFIED => Vec::new(), + _ => { + return Err(Error::InvalidResponse( + format!("invalid error code: {}", res.status).into(), + )) + } + }; + + Ok(Response { + body, + status: res.status, + causality_token, + content_type, + }) + } +} + +async fn read_body(headers: &mut HeaderMap, body: ByteStream) -> Result, Error> { + let body_len = headers + .get(CONTENT_LENGTH) + .and_then(|h| h.parse().ok()) + .unwrap_or(0); + let mut res = Vec::with_capacity(body_len); + body.into_async_read().read_to_end(&mut res).await?; + Ok(res) +} + +/// An opaque token used to convey causality between operations. +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +#[serde(transparent)] +pub struct CausalityToken(String); + +impl From for CausalityToken { + fn from(v: String) -> Self { + CausalityToken(v) + } +} + +impl From for String { + fn from(v: CausalityToken) -> Self { + v.0 + } +} + +/// A value in K2V. can be either a binary value, or a tombstone. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum K2vValue { + Tombstone, + Value(Vec), +} + +impl From> for K2vValue { + fn from(v: Vec) -> Self { + K2vValue::Value(v) + } +} + +impl From>> for K2vValue { + fn from(v: Option>) -> Self { + match v { + Some(v) => K2vValue::Value(v), + None => K2vValue::Tombstone, + } + } +} + +impl<'de> Deserialize<'de> for K2vValue { + fn deserialize(d: D) -> Result + where + D: Deserializer<'de>, + { + let val: Option<&str> = Option::deserialize(d)?; + Ok(match val { + Some(s) => { + K2vValue::Value(base64::decode(s).map_err(|_| DeError::custom("invalid base64"))?) + } + None => K2vValue::Tombstone, + }) + } +} + +impl Serialize for K2vValue { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self { + K2vValue::Tombstone => serializer.serialize_none(), + K2vValue::Value(v) => { + let b64 = base64::encode(v); + serializer.serialize_str(&b64) + } + } + } +} + +/// A set of K2vValue and associated causality information. +#[derive(Debug, Clone, Serialize)] +pub struct CausalValue { + pub causality: CausalityToken, + pub value: Vec, +} + +/// Result of paginated requests. +#[derive(Debug, Clone)] +pub struct PaginatedRange { + pub items: BTreeMap, + pub next_start: Option, +} + +/// Filter for batch operations. 
+#[derive(Debug, Default, Clone, Deserialize, Serialize)] +pub struct Filter<'a> { + pub start: Option<&'a str>, + pub end: Option<&'a str>, + pub prefix: Option<&'a str>, + pub limit: Option, + #[serde(default)] + pub reverse: bool, +} + +impl<'a> Filter<'a> { + fn insert_params(&self, req: &mut SignedRequest) { + if let Some(start) = &self.start { + req.add_param("start", start); + } + if let Some(end) = &self.end { + req.add_param("end", end); + } + if let Some(prefix) = &self.prefix { + req.add_param("prefix", prefix); + } + if let Some(limit) = &self.limit { + req.add_param("limit", &limit.to_string()); + } + if self.reverse { + req.add_param("reverse", "true"); + } + } +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +struct ReadIndexResponse<'a> { + #[serde(flatten, borrow)] + #[allow(dead_code)] + filter: Filter<'a>, + partition_keys: Vec, + #[allow(dead_code)] + more: bool, + next_start: Option, +} + +#[derive(Debug, Clone, Deserialize)] +struct ReadIndexItem { + pk: String, + #[serde(flatten)] + info: PartitionInfo, +} + +/// Information about data stored with a given partition key. +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct PartitionInfo { + pub entries: u64, + pub conflicts: u64, + pub values: u64, + pub bytes: u64, +} + +/// Single sub-operation of an InsertBatch. +#[derive(Debug, Clone, Serialize)] +pub struct BatchInsertOp<'a> { + #[serde(rename = "pk")] + pub partition_key: &'a str, + #[serde(rename = "sk")] + pub sort_key: &'a str, + #[serde(rename = "ct")] + pub causality: Option, + #[serde(rename = "v")] + pub value: K2vValue, +} + +/// Single sub-operation of a ReadBatch. +#[derive(Debug, Default, Clone, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct BatchReadOp<'a> { + pub partition_key: &'a str, + #[serde(flatten, borrow)] + pub filter: Filter<'a>, + #[serde(default)] + pub single_item: bool, + #[serde(default)] + pub conflicts_only: bool, + #[serde(default)] + pub tombstones: bool, +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +struct BatchReadResponse<'a> { + #[serde(flatten, borrow)] + #[allow(dead_code)] + op: BatchReadOp<'a>, + items: Vec, + #[allow(dead_code)] + more: bool, + next_start: Option, +} + +#[derive(Debug, Clone, Deserialize)] +struct BatchReadItem { + sk: String, + ct: CausalityToken, + v: Vec, +} + +/// Single sub-operation of a DeleteBatch +#[derive(Debug, Clone, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct BatchDeleteOp<'a> { + pub partition_key: &'a str, + pub prefix: Option<&'a str>, + pub start: Option<&'a str>, + pub end: Option<&'a str>, + #[serde(default)] + pub single_item: bool, +} + +impl<'a> BatchDeleteOp<'a> { + pub fn new(partition_key: &'a str) -> Self { + BatchDeleteOp { + partition_key, + prefix: None, + start: None, + end: None, + single_item: false, + } + } +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +struct BatchDeleteResponse<'a> { + #[serde(flatten, borrow)] + #[allow(dead_code)] + filter: BatchDeleteOp<'a>, + deleted_items: u64, +} + +struct Response { + body: Vec, + status: StatusCode, + causality_token: Option, + content_type: Option, +} diff --git a/src/util/formater.rs b/src/util/formater.rs new file mode 100644 index 00000000..95324f9a --- /dev/null +++ b/src/util/formater.rs @@ -0,0 +1,28 @@ +pub fn format_table(data: Vec) { + let data = data + .iter() + .map(|s| s.split('\t').collect::>()) + .collect::>(); + + let columns = data.iter().map(|row| 
row.len()).fold(0, std::cmp::max); + let mut column_size = vec![0; columns]; + + let mut out = String::new(); + + for row in data.iter() { + for (i, col) in row.iter().enumerate() { + column_size[i] = std::cmp::max(column_size[i], col.chars().count()); + } + } + + for row in data.iter() { + for (col, col_len) in row[..row.len() - 1].iter().zip(column_size.iter()) { + out.push_str(col); + (0..col_len - col.chars().count() + 2).for_each(|_| out.push(' ')); + } + out.push_str(row[row.len() - 1]); + out.push('\n'); + } + + print!("{}", out); +} diff --git a/src/util/lib.rs b/src/util/lib.rs index e83fc2e6..d8ffdd0b 100644 --- a/src/util/lib.rs +++ b/src/util/lib.rs @@ -8,6 +8,7 @@ pub mod config; pub mod crdt; pub mod data; pub mod error; +pub mod formater; pub mod metrics; pub mod persister; pub mod sled_counter; -- cgit v1.2.3 From 382e74c798263d042b1c6ca3788c866a8c69c4f4 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 24 May 2022 12:16:39 +0200 Subject: First version of admin API (#298) **Spec:** - [x] Start writing - [x] Specify all layout endpoints - [x] Specify all endpoints for operations on keys - [x] Specify all endpoints for operations on key/bucket permissions - [x] Specify all endpoints for operations on buckets - [x] Specify all endpoints for operations on bucket aliases View rendered spec at **Code:** - [x] Refactor code for admin api to use common api code that was created for K2V **General endpoints:** - [x] Metrics - [x] GetClusterStatus - [x] ConnectClusterNodes - [x] GetClusterLayout - [x] UpdateClusterLayout - [x] ApplyClusterLayout - [x] RevertClusterLayout **Key-related endpoints:** - [x] ListKeys - [x] CreateKey - [x] ImportKey - [x] GetKeyInfo - [x] UpdateKey - [x] DeleteKey **Bucket-related endpoints:** - [x] ListBuckets - [x] CreateBucket - [x] GetBucketInfo - [x] DeleteBucket - [x] PutBucketWebsite - [x] DeleteBucketWebsite **Operations on key/bucket permissions:** - [x] BucketAllowKey - [x] BucketDenyKey **Operations on bucket aliases:** - [x] GlobalAliasBucket - [x] GlobalUnaliasBucket - [x] LocalAliasBucket - [x] LocalUnaliasBucket **And also:** - [x] Separate error type for the admin API (this PR includes a quite big refactoring of error handling) - [x] Add management of website access - [ ] Check that nothing is missing wrt what can be done using the CLI - [ ] Improve formatting of the spec - [x] Make sure everyone is cool with the API design Fix #231 Fix #295 Co-authored-by: Alex Auvolat Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/298 Co-authored-by: Alex Co-committed-by: Alex --- src/admin/Cargo.toml | 29 --- src/admin/lib.rs | 6 - src/admin/metrics.rs | 146 ----------- src/admin/tracing_setup.rs | 37 --- src/api/Cargo.toml | 3 + src/api/admin/api_server.rs | 199 +++++++++++++++ src/api/admin/bucket.rs | 549 +++++++++++++++++++++++++++++++++++++++++ src/api/admin/cluster.rs | 198 +++++++++++++++ src/api/admin/error.rs | 96 +++++++ src/api/admin/key.rs | 264 ++++++++++++++++++++ src/api/admin/mod.rs | 7 + src/api/admin/router.rs | 149 +++++++++++ src/api/common_error.rs | 177 +++++++++++++ src/api/error.rs | 284 --------------------- src/api/generic_server.rs | 21 +- src/api/helpers.rs | 53 ++-- src/api/k2v/api_server.rs | 33 +-- src/api/k2v/batch.rs | 19 +- src/api/k2v/error.rs | 134 ++++++++++ src/api/k2v/index.rs | 2 +- src/api/k2v/item.rs | 2 +- src/api/k2v/mod.rs | 1 + src/api/k2v/range.rs | 4 +- src/api/k2v/router.rs | 6 +- src/api/lib.rs | 6 +- src/api/router_macros.rs | 33 ++- src/api/s3/api_server.rs | 28 +-- src/api/s3/bucket.rs | 24 
+- src/api/s3/copy.rs | 25 +- src/api/s3/cors.rs | 34 +-- src/api/s3/delete.rs | 2 +- src/api/s3/error.rs | 207 ++++++++++++++++ src/api/s3/get.rs | 10 +- src/api/s3/list.rs | 14 +- src/api/s3/mod.rs | 1 + src/api/s3/post_object.rs | 62 +++-- src/api/s3/put.rs | 18 +- src/api/s3/router.rs | 5 +- src/api/s3/website.rs | 49 ++-- src/api/s3/xml.rs | 2 +- src/api/signature/error.rs | 36 +++ src/api/signature/mod.rs | 7 +- src/api/signature/payload.rs | 20 +- src/api/signature/streaming.rs | 8 +- src/garage/Cargo.toml | 7 +- src/garage/admin.rs | 77 ++---- src/garage/cli/layout.rs | 47 +--- src/garage/main.rs | 2 + src/garage/server.rs | 36 +-- src/garage/tracing_setup.rs | 37 +++ src/model/garage.rs | 4 + src/model/helper/bucket.rs | 152 ++++++------ src/model/helper/error.rs | 10 + src/model/helper/key.rs | 102 ++++++++ src/model/helper/mod.rs | 1 + src/rpc/Cargo.toml | 2 +- src/rpc/layout.rs | 56 +++++ src/rpc/system.rs | 132 ++++++---- src/util/config.rs | 4 + src/util/crdt/lww_map.rs | 5 + src/web/error.rs | 36 +-- src/web/web_server.rs | 10 +- 62 files changed, 2712 insertions(+), 1018 deletions(-) delete mode 100644 src/admin/Cargo.toml delete mode 100644 src/admin/lib.rs delete mode 100644 src/admin/metrics.rs delete mode 100644 src/admin/tracing_setup.rs create mode 100644 src/api/admin/api_server.rs create mode 100644 src/api/admin/bucket.rs create mode 100644 src/api/admin/cluster.rs create mode 100644 src/api/admin/error.rs create mode 100644 src/api/admin/key.rs create mode 100644 src/api/admin/mod.rs create mode 100644 src/api/admin/router.rs create mode 100644 src/api/common_error.rs delete mode 100644 src/api/error.rs create mode 100644 src/api/k2v/error.rs create mode 100644 src/api/s3/error.rs create mode 100644 src/api/signature/error.rs create mode 100644 src/garage/tracing_setup.rs create mode 100644 src/model/helper/key.rs (limited to 'src') diff --git a/src/admin/Cargo.toml b/src/admin/Cargo.toml deleted file mode 100644 index 2db4bb08..00000000 --- a/src/admin/Cargo.toml +++ /dev/null @@ -1,29 +0,0 @@ -[package] -name = "garage_admin" -version = "0.7.0" -authors = ["Maximilien Richer "] -edition = "2018" -license = "AGPL-3.0" -description = "Administration and metrics REST HTTP server for Garage" -repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage" - -[lib] -path = "lib.rs" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -garage_util = { version = "0.7.0", path = "../util" } - -hex = "0.4" - -futures = "0.3" -futures-util = "0.3" -http = "0.2" -hyper = "0.14" -tracing = "0.1.30" - -opentelemetry = { version = "0.17", features = [ "rt-tokio" ] } -opentelemetry-prometheus = "0.10" -opentelemetry-otlp = "0.10" -prometheus = "0.13" diff --git a/src/admin/lib.rs b/src/admin/lib.rs deleted file mode 100644 index b5b0775b..00000000 --- a/src/admin/lib.rs +++ /dev/null @@ -1,6 +0,0 @@ -//! 
Crate for handling the admin and metric HTTP APIs -#[macro_use] -extern crate tracing; - -pub mod metrics; -pub mod tracing_setup; diff --git a/src/admin/metrics.rs b/src/admin/metrics.rs deleted file mode 100644 index 7edc36c6..00000000 --- a/src/admin/metrics.rs +++ /dev/null @@ -1,146 +0,0 @@ -use std::convert::Infallible; -use std::net::SocketAddr; -use std::sync::Arc; -use std::time::SystemTime; - -use futures::future::*; -use hyper::{ - header::CONTENT_TYPE, - service::{make_service_fn, service_fn}, - Body, Method, Request, Response, Server, -}; - -use opentelemetry::{ - global, - metrics::{BoundCounter, BoundValueRecorder}, - trace::{FutureExt, TraceContextExt, Tracer}, - Context, -}; -use opentelemetry_prometheus::PrometheusExporter; - -use prometheus::{Encoder, TextEncoder}; - -use garage_util::error::Error as GarageError; -use garage_util::metrics::*; - -// serve_req on metric endpoint -async fn serve_req( - req: Request, - admin_server: Arc, -) -> Result, hyper::Error> { - debug!("Receiving request at path {}", req.uri()); - let request_start = SystemTime::now(); - - admin_server.metrics.http_counter.add(1); - - let response = match (req.method(), req.uri().path()) { - (&Method::GET, "/metrics") => { - let mut buffer = vec![]; - let encoder = TextEncoder::new(); - - let tracer = opentelemetry::global::tracer("garage"); - let metric_families = tracer.in_span("admin/gather_metrics", |_| { - admin_server.exporter.registry().gather() - }); - - encoder.encode(&metric_families, &mut buffer).unwrap(); - admin_server - .metrics - .http_body_gauge - .record(buffer.len() as u64); - - Response::builder() - .status(200) - .header(CONTENT_TYPE, encoder.format_type()) - .body(Body::from(buffer)) - .unwrap() - } - _ => Response::builder() - .status(404) - .body(Body::from("Not implemented")) - .unwrap(), - }; - - admin_server - .metrics - .http_req_histogram - .record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64())); - Ok(response) -} - -// AdminServer hold the admin server internal admin_server and the metric exporter -pub struct AdminServer { - exporter: PrometheusExporter, - metrics: AdminServerMetrics, -} - -// GarageMetricadmin_server holds the metrics counter definition for Garage -// FIXME: we would rather have that split up among the different libraries? 
-struct AdminServerMetrics { - http_counter: BoundCounter, - http_body_gauge: BoundValueRecorder, - http_req_histogram: BoundValueRecorder, -} - -impl AdminServer { - /// init initilialize the AdminServer and background metric server - pub fn init() -> AdminServer { - let exporter = opentelemetry_prometheus::exporter().init(); - let meter = global::meter("garage/admin_server"); - AdminServer { - exporter, - metrics: AdminServerMetrics { - http_counter: meter - .u64_counter("admin.http_requests_total") - .with_description("Total number of HTTP requests made.") - .init() - .bind(&[]), - http_body_gauge: meter - .u64_value_recorder("admin.http_response_size_bytes") - .with_description("The metrics HTTP response sizes in bytes.") - .init() - .bind(&[]), - http_req_histogram: meter - .f64_value_recorder("admin.http_request_duration_seconds") - .with_description("The HTTP request latencies in seconds.") - .init() - .bind(&[]), - }, - } - } - /// run execute the admin server on the designated HTTP port and listen for requests - pub async fn run( - self, - bind_addr: SocketAddr, - shutdown_signal: impl Future, - ) -> Result<(), GarageError> { - let admin_server = Arc::new(self); - // For every connection, we must make a `Service` to handle all - // incoming HTTP requests on said connection. - let make_svc = make_service_fn(move |_conn| { - let admin_server = admin_server.clone(); - // This is the `Service` that will handle the connection. - // `service_fn` is a helper to convert a function that - // returns a Response into a `Service`. - async move { - Ok::<_, Infallible>(service_fn(move |req| { - let tracer = opentelemetry::global::tracer("garage"); - let span = tracer - .span_builder("admin/request") - .with_trace_id(gen_trace_id()) - .start(&tracer); - - serve_req(req, admin_server.clone()) - .with_context(Context::current_with_span(span)) - })) - } - }); - - let server = Server::bind(&bind_addr).serve(make_svc); - let graceful = server.with_graceful_shutdown(shutdown_signal); - info!("Admin server listening on http://{}", bind_addr); - - graceful.await?; - Ok(()) - } -} diff --git a/src/admin/tracing_setup.rs b/src/admin/tracing_setup.rs deleted file mode 100644 index 55fc4094..00000000 --- a/src/admin/tracing_setup.rs +++ /dev/null @@ -1,37 +0,0 @@ -use std::time::Duration; - -use opentelemetry::sdk::{ - trace::{self, IdGenerator, Sampler}, - Resource, -}; -use opentelemetry::KeyValue; -use opentelemetry_otlp::WithExportConfig; - -use garage_util::data::*; -use garage_util::error::*; - -pub fn init_tracing(export_to: &str, node_id: Uuid) -> Result<(), Error> { - let node_id = hex::encode(&node_id.as_slice()[..8]); - - opentelemetry_otlp::new_pipeline() - .tracing() - .with_exporter( - opentelemetry_otlp::new_exporter() - .tonic() - .with_endpoint(export_to) - .with_timeout(Duration::from_secs(3)), - ) - .with_trace_config( - trace::config() - .with_id_generator(IdGenerator::default()) - .with_sampler(Sampler::AlwaysOn) - .with_resource(Resource::new(vec![ - KeyValue::new("service.name", "garage"), - KeyValue::new("service.instance.id", node_id), - ])), - ) - .install_batch(opentelemetry::runtime::Tokio) - .ok_or_message("Unable to initialize tracing")?; - - Ok(()) -} diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index 29b26e5e..db77cf38 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -54,6 +54,9 @@ quick-xml = { version = "0.21", features = [ "serialize" ] } url = "2.1" opentelemetry = "0.17" +opentelemetry-prometheus = "0.10" +opentelemetry-otlp = "0.10" +prometheus = 
"0.13" [features] k2v = [ "garage_util/k2v", "garage_model/k2v" ] diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs new file mode 100644 index 00000000..57e3e5cf --- /dev/null +++ b/src/api/admin/api_server.rs @@ -0,0 +1,199 @@ +use std::sync::Arc; + +use async_trait::async_trait; + +use futures::future::Future; +use http::header::{ + ACCESS_CONTROL_ALLOW_METHODS, ACCESS_CONTROL_ALLOW_ORIGIN, ALLOW, CONTENT_TYPE, +}; +use hyper::{Body, Request, Response}; + +use opentelemetry::trace::{SpanRef, Tracer}; +use opentelemetry_prometheus::PrometheusExporter; +use prometheus::{Encoder, TextEncoder}; + +use garage_model::garage::Garage; +use garage_util::error::Error as GarageError; + +use crate::generic_server::*; + +use crate::admin::bucket::*; +use crate::admin::cluster::*; +use crate::admin::error::*; +use crate::admin::key::*; +use crate::admin::router::{Authorization, Endpoint}; + +pub struct AdminApiServer { + garage: Arc, + exporter: PrometheusExporter, + metrics_token: Option, + admin_token: Option, +} + +impl AdminApiServer { + pub fn new(garage: Arc) -> Self { + let exporter = opentelemetry_prometheus::exporter().init(); + let cfg = &garage.config.admin; + let metrics_token = cfg + .metrics_token + .as_ref() + .map(|tok| format!("Bearer {}", tok)); + let admin_token = cfg + .admin_token + .as_ref() + .map(|tok| format!("Bearer {}", tok)); + Self { + garage, + exporter, + metrics_token, + admin_token, + } + } + + pub async fn run(self, shutdown_signal: impl Future) -> Result<(), GarageError> { + if let Some(bind_addr) = self.garage.config.admin.api_bind_addr { + let region = self.garage.config.s3_api.s3_region.clone(); + ApiServer::new(region, self) + .run_server(bind_addr, shutdown_signal) + .await + } else { + Ok(()) + } + } + + fn handle_options(&self, _req: &Request) -> Result, Error> { + Ok(Response::builder() + .status(204) + .header(ALLOW, "OPTIONS, GET, POST") + .header(ACCESS_CONTROL_ALLOW_METHODS, "OPTIONS, GET, POST") + .header(ACCESS_CONTROL_ALLOW_ORIGIN, "*") + .body(Body::empty())?) + } + + fn handle_metrics(&self) -> Result, Error> { + let mut buffer = vec![]; + let encoder = TextEncoder::new(); + + let tracer = opentelemetry::global::tracer("garage"); + let metric_families = tracer.in_span("admin/gather_metrics", |_| { + self.exporter.registry().gather() + }); + + encoder + .encode(&metric_families, &mut buffer) + .ok_or_internal_error("Could not serialize metrics")?; + + Ok(Response::builder() + .status(200) + .header(CONTENT_TYPE, encoder.format_type()) + .body(Body::from(buffer))?) 
+ } +} + +#[async_trait] +impl ApiHandler for AdminApiServer { + const API_NAME: &'static str = "admin"; + const API_NAME_DISPLAY: &'static str = "Admin"; + + type Endpoint = Endpoint; + type Error = Error; + + fn parse_endpoint(&self, req: &Request) -> Result { + Endpoint::from_request(req) + } + + async fn handle( + &self, + req: Request, + endpoint: Endpoint, + ) -> Result, Error> { + let expected_auth_header = + match endpoint.authorization_type() { + Authorization::MetricsToken => self.metrics_token.as_ref(), + Authorization::AdminToken => match &self.admin_token { + None => return Err(Error::forbidden( + "Admin token isn't configured, admin API access is disabled for security.", + )), + Some(t) => Some(t), + }, + }; + + if let Some(h) = expected_auth_header { + match req.headers().get("Authorization") { + None => return Err(Error::forbidden("Authorization token must be provided")), + Some(v) => { + let authorized = v.to_str().map(|hv| hv.trim() == h).unwrap_or(false); + if !authorized { + return Err(Error::forbidden("Invalid authorization token provided")); + } + } + } + } + + match endpoint { + Endpoint::Options => self.handle_options(&req), + Endpoint::Metrics => self.handle_metrics(), + Endpoint::GetClusterStatus => handle_get_cluster_status(&self.garage).await, + Endpoint::ConnectClusterNodes => handle_connect_cluster_nodes(&self.garage, req).await, + // Layout + Endpoint::GetClusterLayout => handle_get_cluster_layout(&self.garage).await, + Endpoint::UpdateClusterLayout => handle_update_cluster_layout(&self.garage, req).await, + Endpoint::ApplyClusterLayout => handle_apply_cluster_layout(&self.garage, req).await, + Endpoint::RevertClusterLayout => handle_revert_cluster_layout(&self.garage, req).await, + // Keys + Endpoint::ListKeys => handle_list_keys(&self.garage).await, + Endpoint::GetKeyInfo { id, search } => { + handle_get_key_info(&self.garage, id, search).await + } + Endpoint::CreateKey => handle_create_key(&self.garage, req).await, + Endpoint::ImportKey => handle_import_key(&self.garage, req).await, + Endpoint::UpdateKey { id } => handle_update_key(&self.garage, id, req).await, + Endpoint::DeleteKey { id } => handle_delete_key(&self.garage, id).await, + // Buckets + Endpoint::ListBuckets => handle_list_buckets(&self.garage).await, + Endpoint::GetBucketInfo { id, global_alias } => { + handle_get_bucket_info(&self.garage, id, global_alias).await + } + Endpoint::CreateBucket => handle_create_bucket(&self.garage, req).await, + Endpoint::DeleteBucket { id } => handle_delete_bucket(&self.garage, id).await, + Endpoint::PutBucketWebsite { id } => { + handle_put_bucket_website(&self.garage, id, req).await + } + Endpoint::DeleteBucketWebsite { id } => { + handle_delete_bucket_website(&self.garage, id).await + } + // Bucket-key permissions + Endpoint::BucketAllowKey => { + handle_bucket_change_key_perm(&self.garage, req, true).await + } + Endpoint::BucketDenyKey => { + handle_bucket_change_key_perm(&self.garage, req, false).await + } + // Bucket aliasing + Endpoint::GlobalAliasBucket { id, alias } => { + handle_global_alias_bucket(&self.garage, id, alias).await + } + Endpoint::GlobalUnaliasBucket { id, alias } => { + handle_global_unalias_bucket(&self.garage, id, alias).await + } + Endpoint::LocalAliasBucket { + id, + access_key_id, + alias, + } => handle_local_alias_bucket(&self.garage, id, access_key_id, alias).await, + Endpoint::LocalUnaliasBucket { + id, + access_key_id, + alias, + } => handle_local_unalias_bucket(&self.garage, id, access_key_id, alias).await, + } + } +} + +impl 
ApiEndpoint for Endpoint { + fn name(&self) -> &'static str { + Endpoint::name(self) + } + + fn add_span_attributes(&self, _span: SpanRef<'_>) {} +} diff --git a/src/api/admin/bucket.rs b/src/api/admin/bucket.rs new file mode 100644 index 00000000..849d28ac --- /dev/null +++ b/src/api/admin/bucket.rs @@ -0,0 +1,549 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use hyper::{Body, Request, Response, StatusCode}; +use serde::{Deserialize, Serialize}; + +use garage_util::crdt::*; +use garage_util::data::*; +use garage_util::error::Error as GarageError; +use garage_util::time::*; + +use garage_table::*; + +use garage_model::bucket_alias_table::*; +use garage_model::bucket_table::*; +use garage_model::garage::Garage; +use garage_model::permission::*; + +use crate::admin::error::*; +use crate::admin::key::ApiBucketKeyPerm; +use crate::common_error::CommonError; +use crate::helpers::parse_json_body; + +pub async fn handle_list_buckets(garage: &Arc) -> Result, Error> { + let buckets = garage + .bucket_table + .get_range( + &EmptyKey, + None, + Some(DeletedFilter::NotDeleted), + 10000, + EnumerationOrder::Forward, + ) + .await?; + + let res = buckets + .into_iter() + .map(|b| { + let state = b.state.as_option().unwrap(); + ListBucketResultItem { + id: hex::encode(b.id), + global_aliases: state + .aliases + .items() + .iter() + .filter(|(_, _, a)| *a) + .map(|(n, _, _)| n.to_string()) + .collect::>(), + local_aliases: state + .local_aliases + .items() + .iter() + .filter(|(_, _, a)| *a) + .map(|((k, n), _, _)| BucketLocalAlias { + access_key_id: k.to_string(), + alias: n.to_string(), + }) + .collect::>(), + } + }) + .collect::>(); + + let resp_json = serde_json::to_string_pretty(&res).map_err(GarageError::from)?; + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::from(resp_json))?) +} + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +struct ListBucketResultItem { + id: String, + global_aliases: Vec, + local_aliases: Vec, +} + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +struct BucketLocalAlias { + access_key_id: String, + alias: String, +} + +pub async fn handle_get_bucket_info( + garage: &Arc, + id: Option, + global_alias: Option, +) -> Result, Error> { + let bucket_id = match (id, global_alias) { + (Some(id), None) => parse_bucket_id(&id)?, + (None, Some(ga)) => garage + .bucket_helper() + .resolve_global_bucket_name(&ga) + .await? + .ok_or_else(|| HelperError::NoSuchBucket(ga.to_string()))?, + _ => { + return Err(Error::bad_request( + "Either id or globalAlias must be provided (but not both)", + )); + } + }; + + bucket_info_results(garage, bucket_id).await +} + +async fn bucket_info_results( + garage: &Arc, + bucket_id: Uuid, +) -> Result, Error> { + let bucket = garage + .bucket_helper() + .get_existing_bucket(bucket_id) + .await?; + + let mut relevant_keys = HashMap::new(); + for (k, _) in bucket + .state + .as_option() + .unwrap() + .authorized_keys + .items() + .iter() + { + if let Some(key) = garage + .key_table + .get(&EmptyKey, k) + .await? + .filter(|k| !k.is_deleted()) + { + if !key.state.is_deleted() { + relevant_keys.insert(k.clone(), key); + } + } + } + for ((k, _), _, _) in bucket + .state + .as_option() + .unwrap() + .local_aliases + .items() + .iter() + { + if relevant_keys.contains_key(k) { + continue; + } + if let Some(key) = garage.key_table.get(&EmptyKey, k).await? 
{ + if !key.state.is_deleted() { + relevant_keys.insert(k.clone(), key); + } + } + } + + let state = bucket.state.as_option().unwrap(); + + let res = + GetBucketInfoResult { + id: hex::encode(&bucket.id), + global_aliases: state + .aliases + .items() + .iter() + .filter(|(_, _, a)| *a) + .map(|(n, _, _)| n.to_string()) + .collect::>(), + website_access: state.website_config.get().is_some(), + website_config: state.website_config.get().clone().map(|wsc| { + GetBucketInfoWebsiteResult { + index_document: wsc.index_document, + error_document: wsc.error_document, + } + }), + keys: relevant_keys + .into_iter() + .map(|(_, key)| { + let p = key.state.as_option().unwrap(); + GetBucketInfoKey { + access_key_id: key.key_id, + name: p.name.get().to_string(), + permissions: p + .authorized_buckets + .get(&bucket.id) + .map(|p| ApiBucketKeyPerm { + read: p.allow_read, + write: p.allow_write, + owner: p.allow_owner, + }) + .unwrap_or_default(), + bucket_local_aliases: p + .local_aliases + .items() + .iter() + .filter(|(_, _, b)| *b == Some(bucket.id)) + .map(|(n, _, _)| n.to_string()) + .collect::>(), + } + }) + .collect::>(), + }; + + let resp_json = serde_json::to_string_pretty(&res).map_err(GarageError::from)?; + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::from(resp_json))?) +} + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +struct GetBucketInfoResult { + id: String, + global_aliases: Vec, + website_access: bool, + #[serde(default)] + website_config: Option, + keys: Vec, +} + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +struct GetBucketInfoWebsiteResult { + index_document: String, + error_document: Option, +} + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +struct GetBucketInfoKey { + access_key_id: String, + name: String, + permissions: ApiBucketKeyPerm, + bucket_local_aliases: Vec, +} + +pub async fn handle_create_bucket( + garage: &Arc, + req: Request, +) -> Result, Error> { + let req = parse_json_body::(req).await?; + + if let Some(ga) = &req.global_alias { + if !is_valid_bucket_name(ga) { + return Err(Error::bad_request(format!( + "{}: {}", + ga, INVALID_BUCKET_NAME_MESSAGE + ))); + } + + if let Some(alias) = garage.bucket_alias_table.get(&EmptyKey, ga).await? 
{ + if alias.state.get().is_some() { + return Err(CommonError::BucketAlreadyExists.into()); + } + } + } + + if let Some(la) = &req.local_alias { + if !is_valid_bucket_name(&la.alias) { + return Err(Error::bad_request(format!( + "{}: {}", + la.alias, INVALID_BUCKET_NAME_MESSAGE + ))); + } + + let key = garage + .key_helper() + .get_existing_key(&la.access_key_id) + .await?; + let state = key.state.as_option().unwrap(); + if matches!(state.local_aliases.get(&la.alias), Some(_)) { + return Err(Error::bad_request("Local alias already exists")); + } + } + + let bucket = Bucket::new(); + garage.bucket_table.insert(&bucket).await?; + + if let Some(ga) = &req.global_alias { + garage + .bucket_helper() + .set_global_bucket_alias(bucket.id, ga) + .await?; + } + + if let Some(la) = &req.local_alias { + garage + .bucket_helper() + .set_local_bucket_alias(bucket.id, &la.access_key_id, &la.alias) + .await?; + + if la.allow.read || la.allow.write || la.allow.owner { + garage + .bucket_helper() + .set_bucket_key_permissions( + bucket.id, + &la.access_key_id, + BucketKeyPerm { + timestamp: now_msec(), + allow_read: la.allow.read, + allow_write: la.allow.write, + allow_owner: la.allow.owner, + }, + ) + .await?; + } + } + + bucket_info_results(garage, bucket.id).await +} + +#[derive(Deserialize)] +#[serde(rename_all = "camelCase")] +struct CreateBucketRequest { + global_alias: Option, + local_alias: Option, +} + +#[derive(Deserialize)] +#[serde(rename_all = "camelCase")] +struct CreateBucketLocalAlias { + access_key_id: String, + alias: String, + #[serde(default)] + allow: ApiBucketKeyPerm, +} + +pub async fn handle_delete_bucket( + garage: &Arc, + id: String, +) -> Result, Error> { + let helper = garage.bucket_helper(); + + let bucket_id = parse_bucket_id(&id)?; + + let mut bucket = helper.get_existing_bucket(bucket_id).await?; + let state = bucket.state.as_option().unwrap(); + + // Check bucket is empty + if !helper.is_bucket_empty(bucket_id).await? { + return Err(CommonError::BucketNotEmpty.into()); + } + + // --- done checking, now commit --- + // 1. delete authorization from keys that had access + for (key_id, perm) in bucket.authorized_keys() { + if perm.is_any() { + helper + .set_bucket_key_permissions(bucket.id, key_id, BucketKeyPerm::NO_PERMISSIONS) + .await?; + } + } + // 2. delete all local aliases + for ((key_id, alias), _, active) in state.local_aliases.items().iter() { + if *active { + helper + .unset_local_bucket_alias(bucket.id, key_id, alias) + .await?; + } + } + // 3. delete all global aliases + for (alias, _, active) in state.aliases.items().iter() { + if *active { + helper.purge_global_bucket_alias(bucket.id, alias).await?; + } + } + + // 4. delete bucket + bucket.state = Deletable::delete(); + garage.bucket_table.insert(&bucket).await?; + + Ok(Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::empty())?) 
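For reference, a request body accepted by `handle_create_bucket` above could look as follows. The sketch only shows the JSON shape: field names follow the camelCase renames on `CreateBucketRequest`, `CreateBucketLocalAlias` and `ApiBucketKeyPerm`, and the alias and access key id are placeholders.

```rust
fn create_bucket_body() -> serde_json::Value {
    // POST /v0/bucket -- both aliases are optional and may be given together.
    serde_json::json!({
        "globalAlias": "my-bucket",
        "localAlias": {
            "accessKeyId": "GK31c2f218a2e44f485b94239e",
            "alias": "my-bucket",
            "allow": { "read": true, "write": true, "owner": false }
        }
    })
}
```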
+} + +// ---- BUCKET WEBSITE CONFIGURATION ---- + +pub async fn handle_put_bucket_website( + garage: &Arc, + id: String, + req: Request, +) -> Result, Error> { + let req = parse_json_body::(req).await?; + let bucket_id = parse_bucket_id(&id)?; + + let mut bucket = garage + .bucket_helper() + .get_existing_bucket(bucket_id) + .await?; + + let state = bucket.state.as_option_mut().unwrap(); + state.website_config.update(Some(WebsiteConfig { + index_document: req.index_document, + error_document: req.error_document, + })); + + garage.bucket_table.insert(&bucket).await?; + + bucket_info_results(garage, bucket_id).await +} + +#[derive(Deserialize)] +#[serde(rename_all = "camelCase")] +struct PutBucketWebsiteRequest { + index_document: String, + #[serde(default)] + error_document: Option, +} + +pub async fn handle_delete_bucket_website( + garage: &Arc, + id: String, +) -> Result, Error> { + let bucket_id = parse_bucket_id(&id)?; + + let mut bucket = garage + .bucket_helper() + .get_existing_bucket(bucket_id) + .await?; + + let state = bucket.state.as_option_mut().unwrap(); + state.website_config.update(None); + + garage.bucket_table.insert(&bucket).await?; + + bucket_info_results(garage, bucket_id).await +} + +// ---- BUCKET/KEY PERMISSIONS ---- + +pub async fn handle_bucket_change_key_perm( + garage: &Arc, + req: Request, + new_perm_flag: bool, +) -> Result, Error> { + let req = parse_json_body::(req).await?; + + let bucket_id = parse_bucket_id(&req.bucket_id)?; + + let bucket = garage + .bucket_helper() + .get_existing_bucket(bucket_id) + .await?; + let state = bucket.state.as_option().unwrap(); + + let key = garage + .key_helper() + .get_existing_key(&req.access_key_id) + .await?; + + let mut perm = state + .authorized_keys + .get(&key.key_id) + .cloned() + .unwrap_or(BucketKeyPerm::NO_PERMISSIONS); + + if req.permissions.read { + perm.allow_read = new_perm_flag; + } + if req.permissions.write { + perm.allow_write = new_perm_flag; + } + if req.permissions.owner { + perm.allow_owner = new_perm_flag; + } + + garage + .bucket_helper() + .set_bucket_key_permissions(bucket.id, &key.key_id, perm) + .await?; + + bucket_info_results(garage, bucket.id).await +} + +#[derive(Deserialize)] +#[serde(rename_all = "camelCase")] +struct BucketKeyPermChangeRequest { + bucket_id: String, + access_key_id: String, + permissions: ApiBucketKeyPerm, +} + +// ---- BUCKET ALIASES ---- + +pub async fn handle_global_alias_bucket( + garage: &Arc, + bucket_id: String, + alias: String, +) -> Result, Error> { + let bucket_id = parse_bucket_id(&bucket_id)?; + + garage + .bucket_helper() + .set_global_bucket_alias(bucket_id, &alias) + .await?; + + bucket_info_results(garage, bucket_id).await +} + +pub async fn handle_global_unalias_bucket( + garage: &Arc, + bucket_id: String, + alias: String, +) -> Result, Error> { + let bucket_id = parse_bucket_id(&bucket_id)?; + + garage + .bucket_helper() + .unset_global_bucket_alias(bucket_id, &alias) + .await?; + + bucket_info_results(garage, bucket_id).await +} + +pub async fn handle_local_alias_bucket( + garage: &Arc, + bucket_id: String, + access_key_id: String, + alias: String, +) -> Result, Error> { + let bucket_id = parse_bucket_id(&bucket_id)?; + + garage + .bucket_helper() + .set_local_bucket_alias(bucket_id, &access_key_id, &alias) + .await?; + + bucket_info_results(garage, bucket_id).await +} + +pub async fn handle_local_unalias_bucket( + garage: &Arc, + bucket_id: String, + access_key_id: String, + alias: String, +) -> Result, Error> { + let bucket_id = 
parse_bucket_id(&bucket_id)?; + + garage + .bucket_helper() + .unset_local_bucket_alias(bucket_id, &access_key_id, &alias) + .await?; + + bucket_info_results(garage, bucket_id).await +} + +// ---- HELPER ---- + +fn parse_bucket_id(id: &str) -> Result { + let id_hex = hex::decode(&id).ok_or_bad_request("Invalid bucket id")?; + Ok(Uuid::try_from(&id_hex).ok_or_bad_request("Invalid bucket id")?) +} diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs new file mode 100644 index 00000000..3401be42 --- /dev/null +++ b/src/api/admin/cluster.rs @@ -0,0 +1,198 @@ +use std::collections::HashMap; +use std::net::SocketAddr; +use std::sync::Arc; + +use hyper::{Body, Request, Response, StatusCode}; +use serde::{Deserialize, Serialize}; + +use garage_util::crdt::*; +use garage_util::data::*; +use garage_util::error::Error as GarageError; + +use garage_rpc::layout::*; + +use garage_model::garage::Garage; + +use crate::admin::error::*; +use crate::helpers::parse_json_body; + +pub async fn handle_get_cluster_status(garage: &Arc) -> Result, Error> { + let res = GetClusterStatusResponse { + node: hex::encode(garage.system.id), + garage_version: garage.system.garage_version(), + known_nodes: garage + .system + .get_known_nodes() + .into_iter() + .map(|i| { + ( + hex::encode(i.id), + KnownNodeResp { + addr: i.addr, + is_up: i.is_up, + last_seen_secs_ago: i.last_seen_secs_ago, + hostname: i.status.hostname, + }, + ) + }) + .collect(), + layout: get_cluster_layout(garage), + }; + + let resp_json = serde_json::to_string_pretty(&res).map_err(GarageError::from)?; + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::from(resp_json))?) +} + +pub async fn handle_connect_cluster_nodes( + garage: &Arc, + req: Request, +) -> Result, Error> { + let req = parse_json_body::>(req).await?; + + let res = futures::future::join_all(req.iter().map(|node| garage.system.connect(node))) + .await + .into_iter() + .map(|r| match r { + Ok(()) => ConnectClusterNodesResponse { + success: true, + error: None, + }, + Err(e) => ConnectClusterNodesResponse { + success: false, + error: Some(format!("{}", e)), + }, + }) + .collect::>(); + + let resp_json = serde_json::to_string_pretty(&res).map_err(GarageError::from)?; + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::from(resp_json))?) +} + +pub async fn handle_get_cluster_layout(garage: &Arc) -> Result, Error> { + let res = get_cluster_layout(garage); + let resp_json = serde_json::to_string_pretty(&res).map_err(GarageError::from)?; + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::from(resp_json))?) 
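The body expected by `handle_connect_cluster_nodes` above is just a JSON array of node addresses, and the response mirrors it with one `{success, error}` entry per input. A sketch, assuming the usual `<node id>@<host>:<port>` address form; ids, hosts and the port are placeholders:

```rust
fn connect_nodes_body() -> serde_json::Value {
    // POST /v0/connect
    serde_json::json!([
        "563e1ac825ee3323aa441e72c26d1030d6d4414aeb3dd25287c531e7fc2bc95d@10.0.0.2:3901",
        "86f0f26ae4afbd59aaf9cfb059eefac844951efd5b8caeec0d53f4ed6c85f332@10.0.0.3:3901"
    ])
}
```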
+} + +fn get_cluster_layout(garage: &Arc) -> GetClusterLayoutResponse { + let layout = garage.system.get_cluster_layout(); + + GetClusterLayoutResponse { + version: layout.version, + roles: layout + .roles + .items() + .iter() + .filter(|(_, _, v)| v.0.is_some()) + .map(|(k, _, v)| (hex::encode(k), v.0.clone())) + .collect(), + staged_role_changes: layout + .staging + .items() + .iter() + .filter(|(k, _, v)| layout.roles.get(k) != Some(v)) + .map(|(k, _, v)| (hex::encode(k), v.0.clone())) + .collect(), + } +} + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +struct GetClusterStatusResponse { + node: String, + garage_version: &'static str, + known_nodes: HashMap, + layout: GetClusterLayoutResponse, +} + +#[derive(Serialize)] +struct ConnectClusterNodesResponse { + success: bool, + error: Option, +} + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +struct GetClusterLayoutResponse { + version: u64, + roles: HashMap>, + staged_role_changes: HashMap>, +} + +#[derive(Serialize)] +struct KnownNodeResp { + addr: SocketAddr, + is_up: bool, + last_seen_secs_ago: Option, + hostname: String, +} + +pub async fn handle_update_cluster_layout( + garage: &Arc, + req: Request, +) -> Result, Error> { + let updates = parse_json_body::(req).await?; + + let mut layout = garage.system.get_cluster_layout(); + + let mut roles = layout.roles.clone(); + roles.merge(&layout.staging); + + for (node, role) in updates { + let node = hex::decode(node).ok_or_bad_request("Invalid node identifier")?; + let node = Uuid::try_from(&node).ok_or_bad_request("Invalid node identifier")?; + + layout + .staging + .merge(&roles.update_mutator(node, NodeRoleV(role))); + } + + garage.system.update_cluster_layout(&layout).await?; + + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::empty())?) +} + +pub async fn handle_apply_cluster_layout( + garage: &Arc, + req: Request, +) -> Result, Error> { + let param = parse_json_body::(req).await?; + + let layout = garage.system.get_cluster_layout(); + let layout = layout.apply_staged_changes(Some(param.version))?; + garage.system.update_cluster_layout(&layout).await?; + + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::empty())?) +} + +pub async fn handle_revert_cluster_layout( + garage: &Arc, + req: Request, +) -> Result, Error> { + let param = parse_json_body::(req).await?; + + let layout = garage.system.get_cluster_layout(); + let layout = layout.revert_staged_changes(Some(param.version))?; + garage.system.update_cluster_layout(&layout).await?; + + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::empty())?) 
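Putting the three layout handlers above together, a typical staged-change flow from a client could look like the sketch below. The JSON is illustrative: node ids are placeholders, and the role object is assumed to serialize with `NodeRole`'s `zone`/`capacity`/`tags` fields (that type comes from `garage_rpc::layout`, not from this file).

```rust
fn layout_change_bodies() -> (serde_json::Value, serde_json::Value) {
    // 1. POST /v0/layout: stage role changes, keyed by node id in hex.
    //    A null value would stage the removal of that node's role.
    let update = serde_json::json!({
        "e2ee7984ee65b260682086ec70026165903c86e601a4a5a501c1900afe28d84b": {
            "zone": "dc1",
            "capacity": 100,
            "tags": ["site1"]
        }
    });

    // 2. POST /v0/layout/apply (or /v0/layout/revert): confirm with the layout
    //    version number, as deserialized into ApplyRevertLayoutRequest below.
    let apply = serde_json::json!({ "version": 2 });

    (update, apply)
}
```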
+} + +type UpdateClusterLayoutRequest = HashMap>; + +#[derive(Deserialize)] +struct ApplyRevertLayoutRequest { + version: u64, +} diff --git a/src/api/admin/error.rs b/src/api/admin/error.rs new file mode 100644 index 00000000..c4613cb3 --- /dev/null +++ b/src/api/admin/error.rs @@ -0,0 +1,96 @@ +use err_derive::Error; +use hyper::header::HeaderValue; +use hyper::{Body, HeaderMap, StatusCode}; + +pub use garage_model::helper::error::Error as HelperError; + +use crate::common_error::CommonError; +pub use crate::common_error::{CommonErrorDerivative, OkOrBadRequest, OkOrInternalError}; +use crate::generic_server::ApiError; +use crate::helpers::CustomApiErrorBody; + +/// Errors of this crate +#[derive(Debug, Error)] +pub enum Error { + #[error(display = "{}", _0)] + /// Error from common error + Common(CommonError), + + // Category: cannot process + /// The API access key does not exist + #[error(display = "Access key not found: {}", _0)] + NoSuchAccessKey(String), + + /// In Import key, the key already exists + #[error( + display = "Key {} already exists in data store. Even if it is deleted, we can't let you create a new key with the same ID. Sorry.", + _0 + )] + KeyAlreadyExists(String), +} + +impl From for Error +where + CommonError: From, +{ + fn from(err: T) -> Self { + Error::Common(CommonError::from(err)) + } +} + +impl CommonErrorDerivative for Error {} + +impl From for Error { + fn from(err: HelperError) -> Self { + match err { + HelperError::Internal(i) => Self::Common(CommonError::InternalError(i)), + HelperError::BadRequest(b) => Self::Common(CommonError::BadRequest(b)), + HelperError::InvalidBucketName(n) => Self::Common(CommonError::InvalidBucketName(n)), + HelperError::NoSuchBucket(n) => Self::Common(CommonError::NoSuchBucket(n)), + HelperError::NoSuchAccessKey(n) => Self::NoSuchAccessKey(n), + } + } +} + +impl Error { + fn code(&self) -> &'static str { + match self { + Error::Common(c) => c.aws_code(), + Error::NoSuchAccessKey(_) => "NoSuchAccessKey", + Error::KeyAlreadyExists(_) => "KeyAlreadyExists", + } + } +} + +impl ApiError for Error { + /// Get the HTTP status code that best represents the meaning of the error for the client + fn http_status_code(&self) -> StatusCode { + match self { + Error::Common(c) => c.http_status_code(), + Error::NoSuchAccessKey(_) => StatusCode::NOT_FOUND, + Error::KeyAlreadyExists(_) => StatusCode::CONFLICT, + } + } + + fn add_http_headers(&self, _header_map: &mut HeaderMap) { + // nothing + } + + fn http_body(&self, garage_region: &str, path: &str) -> Body { + let error = CustomApiErrorBody { + code: self.code().to_string(), + message: format!("{}", self), + path: path.to_string(), + region: garage_region.to_string(), + }; + Body::from(serde_json::to_string_pretty(&error).unwrap_or_else(|_| { + r#" +{ + "code": "InternalError", + "message": "JSON encoding of error failed" +} + "# + .into() + })) + } +} diff --git a/src/api/admin/key.rs b/src/api/admin/key.rs new file mode 100644 index 00000000..f30b5dbb --- /dev/null +++ b/src/api/admin/key.rs @@ -0,0 +1,264 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use hyper::{Body, Request, Response, StatusCode}; +use serde::{Deserialize, Serialize}; + +use garage_util::error::Error as GarageError; + +use garage_table::*; + +use garage_model::garage::Garage; +use garage_model::key_table::*; + +use crate::admin::error::*; +use crate::helpers::parse_json_body; + +pub async fn handle_list_keys(garage: &Arc) -> Result, Error> { + let res = garage + .key_table + .get_range( + &EmptyKey, + None, + 
Some(KeyFilter::Deleted(DeletedFilter::NotDeleted)), + 10000, + EnumerationOrder::Forward, + ) + .await? + .iter() + .map(|k| ListKeyResultItem { + id: k.key_id.to_string(), + name: k.params().unwrap().name.get().clone(), + }) + .collect::>(); + + let resp_json = serde_json::to_string_pretty(&res).map_err(GarageError::from)?; + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::from(resp_json))?) +} + +#[derive(Serialize)] +struct ListKeyResultItem { + id: String, + name: String, +} + +pub async fn handle_get_key_info( + garage: &Arc, + id: Option, + search: Option, +) -> Result, Error> { + let key = if let Some(id) = id { + garage.key_helper().get_existing_key(&id).await? + } else if let Some(search) = search { + garage + .key_helper() + .get_existing_matching_key(&search) + .await? + } else { + unreachable!(); + }; + + key_info_results(garage, key).await +} + +pub async fn handle_create_key( + garage: &Arc, + req: Request, +) -> Result, Error> { + let req = parse_json_body::(req).await?; + + let key = Key::new(&req.name); + garage.key_table.insert(&key).await?; + + key_info_results(garage, key).await +} + +#[derive(Deserialize)] +struct CreateKeyRequest { + name: String, +} + +pub async fn handle_import_key( + garage: &Arc, + req: Request, +) -> Result, Error> { + let req = parse_json_body::(req).await?; + + let prev_key = garage.key_table.get(&EmptyKey, &req.access_key_id).await?; + if prev_key.is_some() { + return Err(Error::KeyAlreadyExists(req.access_key_id.to_string())); + } + + let imported_key = Key::import(&req.access_key_id, &req.secret_access_key, &req.name); + garage.key_table.insert(&imported_key).await?; + + key_info_results(garage, imported_key).await +} + +#[derive(Deserialize)] +#[serde(rename_all = "camelCase")] +struct ImportKeyRequest { + access_key_id: String, + secret_access_key: String, + name: String, +} + +pub async fn handle_update_key( + garage: &Arc, + id: String, + req: Request, +) -> Result, Error> { + let req = parse_json_body::(req).await?; + + let mut key = garage.key_helper().get_existing_key(&id).await?; + + let key_state = key.state.as_option_mut().unwrap(); + + if let Some(new_name) = req.name { + key_state.name.update(new_name); + } + if let Some(allow) = req.allow { + if allow.create_bucket { + key_state.allow_create_bucket.update(true); + } + } + if let Some(deny) = req.deny { + if deny.create_bucket { + key_state.allow_create_bucket.update(false); + } + } + + garage.key_table.insert(&key).await?; + + key_info_results(garage, key).await +} + +#[derive(Deserialize)] +struct UpdateKeyRequest { + name: Option, + allow: Option, + deny: Option, +} + +pub async fn handle_delete_key(garage: &Arc, id: String) -> Result, Error> { + let mut key = garage.key_helper().get_existing_key(&id).await?; + + key.state.as_option().unwrap(); + + garage.key_helper().delete_key(&mut key).await?; + + Ok(Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::empty())?) +} + +async fn key_info_results(garage: &Arc, key: Key) -> Result, Error> { + let mut relevant_buckets = HashMap::new(); + + let key_state = key.state.as_option().unwrap(); + + for id in key_state + .authorized_buckets + .items() + .iter() + .map(|(id, _)| id) + .chain( + key_state + .local_aliases + .items() + .iter() + .filter_map(|(_, _, v)| v.as_ref()), + ) { + if !relevant_buckets.contains_key(id) { + if let Some(b) = garage.bucket_table.get(&EmptyKey, id).await? 
{ + if b.state.as_option().is_some() { + relevant_buckets.insert(*id, b); + } + } + } + } + + let res = GetKeyInfoResult { + name: key_state.name.get().clone(), + access_key_id: key.key_id.clone(), + secret_access_key: key_state.secret_key.clone(), + permissions: KeyPerm { + create_bucket: *key_state.allow_create_bucket.get(), + }, + buckets: relevant_buckets + .into_iter() + .map(|(_, bucket)| { + let state = bucket.state.as_option().unwrap(); + KeyInfoBucketResult { + id: hex::encode(bucket.id), + global_aliases: state + .aliases + .items() + .iter() + .filter(|(_, _, a)| *a) + .map(|(n, _, _)| n.to_string()) + .collect::>(), + local_aliases: state + .local_aliases + .items() + .iter() + .filter(|((k, _), _, a)| *a && *k == key.key_id) + .map(|((_, n), _, _)| n.to_string()) + .collect::>(), + permissions: key_state + .authorized_buckets + .get(&bucket.id) + .map(|p| ApiBucketKeyPerm { + read: p.allow_read, + write: p.allow_write, + owner: p.allow_owner, + }) + .unwrap_or_default(), + } + }) + .collect::>(), + }; + + let resp_json = serde_json::to_string_pretty(&res).map_err(GarageError::from)?; + Ok(Response::builder() + .status(StatusCode::OK) + .body(Body::from(resp_json))?) +} + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +struct GetKeyInfoResult { + name: String, + access_key_id: String, + secret_access_key: String, + permissions: KeyPerm, + buckets: Vec, +} + +#[derive(Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +struct KeyPerm { + #[serde(default)] + create_bucket: bool, +} + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +struct KeyInfoBucketResult { + id: String, + global_aliases: Vec, + local_aliases: Vec, + permissions: ApiBucketKeyPerm, +} + +#[derive(Serialize, Deserialize, Default)] +pub(crate) struct ApiBucketKeyPerm { + #[serde(default)] + pub(crate) read: bool, + #[serde(default)] + pub(crate) write: bool, + #[serde(default)] + pub(crate) owner: bool, +} diff --git a/src/api/admin/mod.rs b/src/api/admin/mod.rs new file mode 100644 index 00000000..c4857c10 --- /dev/null +++ b/src/api/admin/mod.rs @@ -0,0 +1,7 @@ +pub mod api_server; +mod error; +mod router; + +mod bucket; +mod cluster; +mod key; diff --git a/src/api/admin/router.rs b/src/api/admin/router.rs new file mode 100644 index 00000000..93639873 --- /dev/null +++ b/src/api/admin/router.rs @@ -0,0 +1,149 @@ +use std::borrow::Cow; + +use hyper::{Method, Request}; + +use crate::admin::error::*; +use crate::router_macros::*; + +pub enum Authorization { + MetricsToken, + AdminToken, +} + +router_match! {@func + +/// List of all Admin API endpoints. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Endpoint { + Options, + Metrics, + GetClusterStatus, + ConnectClusterNodes, + // Layout + GetClusterLayout, + UpdateClusterLayout, + ApplyClusterLayout, + RevertClusterLayout, + // Keys + ListKeys, + CreateKey, + ImportKey, + GetKeyInfo { + id: Option, + search: Option, + }, + DeleteKey { + id: String, + }, + UpdateKey { + id: String, + }, + // Buckets + ListBuckets, + CreateBucket, + GetBucketInfo { + id: Option, + global_alias: Option, + }, + DeleteBucket { + id: String, + }, + PutBucketWebsite { + id: String, + }, + DeleteBucketWebsite { + id: String, + }, + // Bucket-Key Permissions + BucketAllowKey, + BucketDenyKey, + // Bucket aliases + GlobalAliasBucket { + id: String, + alias: String, + }, + GlobalUnaliasBucket { + id: String, + alias: String, + }, + LocalAliasBucket { + id: String, + access_key_id: String, + alias: String, + }, + LocalUnaliasBucket { + id: String, + access_key_id: String, + alias: String, + }, +}} + +impl Endpoint { + /// Determine which S3 endpoint a request is for using the request, and a bucket which was + /// possibly extracted from the Host header. + /// Returns Self plus bucket name, if endpoint is not Endpoint::ListBuckets + pub fn from_request(req: &Request) -> Result { + let uri = req.uri(); + let path = uri.path(); + let query = uri.query(); + + let mut query = QueryParameters::from_query(query.unwrap_or_default())?; + + let res = router_match!(@gen_path_parser (req.method(), path, query) [ + OPTIONS _ => Options, + GET "/metrics" => Metrics, + GET "/v0/status" => GetClusterStatus, + POST "/v0/connect" => ConnectClusterNodes, + // Layout endpoints + GET "/v0/layout" => GetClusterLayout, + POST "/v0/layout" => UpdateClusterLayout, + POST "/v0/layout/apply" => ApplyClusterLayout, + POST "/v0/layout/revert" => RevertClusterLayout, + // API key endpoints + GET "/v0/key" if id => GetKeyInfo (query_opt::id, query_opt::search), + GET "/v0/key" if search => GetKeyInfo (query_opt::id, query_opt::search), + POST "/v0/key" if id => UpdateKey (query::id), + POST "/v0/key" => CreateKey, + POST "/v0/key/import" => ImportKey, + DELETE "/v0/key" if id => DeleteKey (query::id), + GET "/v0/key" => ListKeys, + // Bucket endpoints + GET "/v0/bucket" if id => GetBucketInfo (query_opt::id, query_opt::global_alias), + GET "/v0/bucket" if global_alias => GetBucketInfo (query_opt::id, query_opt::global_alias), + GET "/v0/bucket" => ListBuckets, + POST "/v0/bucket" => CreateBucket, + DELETE "/v0/bucket" if id => DeleteBucket (query::id), + PUT "/v0/bucket/website" if id => PutBucketWebsite (query::id), + DELETE "/v0/bucket/website" if id => DeleteBucketWebsite (query::id), + // Bucket-key permissions + POST "/v0/bucket/allow" => BucketAllowKey, + POST "/v0/bucket/deny" => BucketDenyKey, + // Bucket aliases + PUT "/v0/bucket/alias/global" => GlobalAliasBucket (query::id, query::alias), + DELETE "/v0/bucket/alias/global" => GlobalUnaliasBucket (query::id, query::alias), + PUT "/v0/bucket/alias/local" => LocalAliasBucket (query::id, query::access_key_id, query::alias), + DELETE "/v0/bucket/alias/local" => LocalUnaliasBucket (query::id, query::access_key_id, query::alias), + ]); + + if let Some(message) = query.nonempty_message() { + debug!("Unused query parameter: {}", message) + } + + Ok(res) + } + /// Get the kind of authorization which is required to perform the operation. 
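To make the routing table above concrete, here is how one request is resolved by `from_request`. A sketch, assumed to live next to this router so that `Endpoint`, `Error` and the hyper imports are in scope; the alias value is a placeholder.

```rust
fn resolve_example() -> Result<(), Error> {
    // GET /v0/bucket?globalAlias=... carries its parameters in the query string,
    // so an empty body is enough to exercise the router.
    let req = Request::builder()
        .method(Method::GET)
        .uri("/v0/bucket?globalAlias=my-bucket")
        .body(hyper::Body::empty())
        .expect("valid request");

    let endpoint = Endpoint::from_request(&req)?;
    assert_eq!(
        endpoint,
        Endpoint::GetBucketInfo {
            id: None,
            global_alias: Some("my-bucket".to_string()),
        }
    );
    Ok(())
}
```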
+ pub fn authorization_type(&self) -> Authorization { + match self { + Self::Metrics => Authorization::MetricsToken, + _ => Authorization::AdminToken, + } + } +} + +generateQueryParameters! { + "id" => id, + "search" => search, + "globalAlias" => global_alias, + "alias" => alias, + "accessKeyId" => access_key_id +} diff --git a/src/api/common_error.rs b/src/api/common_error.rs new file mode 100644 index 00000000..20f9f266 --- /dev/null +++ b/src/api/common_error.rs @@ -0,0 +1,177 @@ +use err_derive::Error; +use hyper::StatusCode; + +use garage_util::error::Error as GarageError; + +/// Errors of this crate +#[derive(Debug, Error)] +pub enum CommonError { + // ---- INTERNAL ERRORS ---- + /// Error related to deeper parts of Garage + #[error(display = "Internal error: {}", _0)] + InternalError(#[error(source)] GarageError), + + /// Error related to Hyper + #[error(display = "Internal error (Hyper error): {}", _0)] + Hyper(#[error(source)] hyper::Error), + + /// Error related to HTTP + #[error(display = "Internal error (HTTP error): {}", _0)] + Http(#[error(source)] http::Error), + + // ---- GENERIC CLIENT ERRORS ---- + /// Proper authentication was not provided + #[error(display = "Forbidden: {}", _0)] + Forbidden(String), + + /// Generic bad request response with custom message + #[error(display = "Bad request: {}", _0)] + BadRequest(String), + + // ---- SPECIFIC ERROR CONDITIONS ---- + // These have to be error codes referenced in the S3 spec here: + // https://docs.aws.amazon.com/AmazonS3/latest/API/ErrorResponses.html#ErrorCodeList + /// The bucket requested don't exists + #[error(display = "Bucket not found: {}", _0)] + NoSuchBucket(String), + + /// Tried to create a bucket that already exist + #[error(display = "Bucket already exists")] + BucketAlreadyExists, + + /// Tried to delete a non-empty bucket + #[error(display = "Tried to delete a non-empty bucket")] + BucketNotEmpty, + + // Category: bad request + /// Bucket name is not valid according to AWS S3 specs + #[error(display = "Invalid bucket name: {}", _0)] + InvalidBucketName(String), +} + +impl CommonError { + pub fn http_status_code(&self) -> StatusCode { + match self { + CommonError::InternalError( + GarageError::Timeout + | GarageError::RemoteError(_) + | GarageError::Quorum(_, _, _, _), + ) => StatusCode::SERVICE_UNAVAILABLE, + CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => { + StatusCode::INTERNAL_SERVER_ERROR + } + CommonError::BadRequest(_) => StatusCode::BAD_REQUEST, + CommonError::Forbidden(_) => StatusCode::FORBIDDEN, + CommonError::NoSuchBucket(_) => StatusCode::NOT_FOUND, + CommonError::BucketNotEmpty | CommonError::BucketAlreadyExists => StatusCode::CONFLICT, + CommonError::InvalidBucketName(_) => StatusCode::BAD_REQUEST, + } + } + + pub fn aws_code(&self) -> &'static str { + match self { + CommonError::Forbidden(_) => "AccessDenied", + CommonError::InternalError( + GarageError::Timeout + | GarageError::RemoteError(_) + | GarageError::Quorum(_, _, _, _), + ) => "ServiceUnavailable", + CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => { + "InternalError" + } + CommonError::BadRequest(_) => "InvalidRequest", + CommonError::NoSuchBucket(_) => "NoSuchBucket", + CommonError::BucketAlreadyExists => "BucketAlreadyExists", + CommonError::BucketNotEmpty => "BucketNotEmpty", + CommonError::InvalidBucketName(_) => "InvalidBucketName", + } + } + + pub fn bad_request(msg: M) -> Self { + CommonError::BadRequest(msg.to_string()) + } +} + +pub trait 
CommonErrorDerivative: From { + fn internal_error(msg: M) -> Self { + Self::from(CommonError::InternalError(GarageError::Message( + msg.to_string(), + ))) + } + + fn bad_request(msg: M) -> Self { + Self::from(CommonError::BadRequest(msg.to_string())) + } + + fn forbidden(msg: M) -> Self { + Self::from(CommonError::Forbidden(msg.to_string())) + } +} + +/// Trait to map error to the Bad Request error code +pub trait OkOrBadRequest { + type S; + fn ok_or_bad_request>(self, reason: M) -> Result; +} + +impl OkOrBadRequest for Result +where + E: std::fmt::Display, +{ + type S = T; + fn ok_or_bad_request>(self, reason: M) -> Result { + match self { + Ok(x) => Ok(x), + Err(e) => Err(CommonError::BadRequest(format!( + "{}: {}", + reason.as_ref(), + e + ))), + } + } +} + +impl OkOrBadRequest for Option { + type S = T; + fn ok_or_bad_request>(self, reason: M) -> Result { + match self { + Some(x) => Ok(x), + None => Err(CommonError::BadRequest(reason.as_ref().to_string())), + } + } +} + +/// Trait to map an error to an Internal Error code +pub trait OkOrInternalError { + type S; + fn ok_or_internal_error>(self, reason: M) -> Result; +} + +impl OkOrInternalError for Result +where + E: std::fmt::Display, +{ + type S = T; + fn ok_or_internal_error>(self, reason: M) -> Result { + match self { + Ok(x) => Ok(x), + Err(e) => Err(CommonError::InternalError(GarageError::Message(format!( + "{}: {}", + reason.as_ref(), + e + )))), + } + } +} + +impl OkOrInternalError for Option { + type S = T; + fn ok_or_internal_error>(self, reason: M) -> Result { + match self { + Some(x) => Ok(x), + None => Err(CommonError::InternalError(GarageError::Message( + reason.as_ref().to_string(), + ))), + } + } +} diff --git a/src/api/error.rs b/src/api/error.rs deleted file mode 100644 index 4b7254d2..00000000 --- a/src/api/error.rs +++ /dev/null @@ -1,284 +0,0 @@ -use std::convert::TryInto; - -use err_derive::Error; -use hyper::header::HeaderValue; -use hyper::{HeaderMap, StatusCode}; - -use garage_model::helper::error::Error as HelperError; -use garage_util::error::Error as GarageError; - -use crate::s3::xml as s3_xml; - -/// Errors of this crate -#[derive(Debug, Error)] -pub enum Error { - // Category: internal error - /// Error related to deeper parts of Garage - #[error(display = "Internal error: {}", _0)] - InternalError(#[error(source)] GarageError), - - /// Error related to Hyper - #[error(display = "Internal error (Hyper error): {}", _0)] - Hyper(#[error(source)] hyper::Error), - - /// Error related to HTTP - #[error(display = "Internal error (HTTP error): {}", _0)] - Http(#[error(source)] http::Error), - - // Category: cannot process - /// No proper api key was used, or the signature was invalid - #[error(display = "Forbidden: {}", _0)] - Forbidden(String), - - /// Authorization Header Malformed - #[error(display = "Authorization header malformed, expected scope: {}", _0)] - AuthorizationHeaderMalformed(String), - - /// The object requested don't exists - #[error(display = "Key not found")] - NoSuchKey, - - /// The bucket requested don't exists - #[error(display = "Bucket not found")] - NoSuchBucket, - - /// The multipart upload requested don't exists - #[error(display = "Upload not found")] - NoSuchUpload, - - /// Tried to create a bucket that already exist - #[error(display = "Bucket already exists")] - BucketAlreadyExists, - - /// Tried to delete a non-empty bucket - #[error(display = "Tried to delete a non-empty bucket")] - BucketNotEmpty, - - /// Precondition failed (e.g. 
x-amz-copy-source-if-match) - #[error(display = "At least one of the preconditions you specified did not hold")] - PreconditionFailed, - - /// Parts specified in CMU request do not match parts actually uploaded - #[error(display = "Parts given to CompleteMultipartUpload do not match uploaded parts")] - InvalidPart, - - /// Parts given to CompleteMultipartUpload were not in ascending order - #[error(display = "Parts given to CompleteMultipartUpload were not in ascending order")] - InvalidPartOrder, - - /// In CompleteMultipartUpload: not enough data - /// (here we are more lenient than AWS S3) - #[error(display = "Proposed upload is smaller than the minimum allowed object size")] - EntityTooSmall, - - // Category: bad request - /// The request contained an invalid UTF-8 sequence in its path or in other parameters - #[error(display = "Invalid UTF-8: {}", _0)] - InvalidUtf8Str(#[error(source)] std::str::Utf8Error), - - /// The request used an invalid path - #[error(display = "Invalid UTF-8: {}", _0)] - InvalidUtf8String(#[error(source)] std::string::FromUtf8Error), - - /// Some base64 encoded data was badly encoded - #[error(display = "Invalid base64: {}", _0)] - InvalidBase64(#[error(source)] base64::DecodeError), - - /// The client sent invalid XML data - #[error(display = "Invalid XML: {}", _0)] - InvalidXml(String), - - /// The client sent a header with invalid value - #[error(display = "Invalid header value: {}", _0)] - InvalidHeader(#[error(source)] hyper::header::ToStrError), - - /// The client sent a range header with invalid value - #[error(display = "Invalid HTTP range: {:?}", _0)] - InvalidRange(#[error(from)] (http_range::HttpRangeParseError, u64)), - - /// The client sent an invalid request - #[error(display = "Bad request: {}", _0)] - BadRequest(String), - - /// The client asked for an invalid return format (invalid Accept header) - #[error(display = "Not acceptable: {}", _0)] - NotAcceptable(String), - - /// The client sent a request for an action not supported by garage - #[error(display = "Unimplemented action: {}", _0)] - NotImplemented(String), -} - -impl From for Error { - fn from(err: roxmltree::Error) -> Self { - Self::InvalidXml(format!("{}", err)) - } -} - -impl From for Error { - fn from(err: quick_xml::de::DeError) -> Self { - Self::InvalidXml(format!("{}", err)) - } -} - -impl From for Error { - fn from(err: HelperError) -> Self { - match err { - HelperError::Internal(i) => Self::InternalError(i), - HelperError::BadRequest(b) => Self::BadRequest(b), - } - } -} - -impl From for Error { - fn from(err: multer::Error) -> Self { - Self::BadRequest(err.to_string()) - } -} - -impl Error { - /// Get the HTTP status code that best represents the meaning of the error for the client - pub fn http_status_code(&self) -> StatusCode { - match self { - Error::NoSuchKey | Error::NoSuchBucket | Error::NoSuchUpload => StatusCode::NOT_FOUND, - Error::BucketNotEmpty | Error::BucketAlreadyExists => StatusCode::CONFLICT, - Error::PreconditionFailed => StatusCode::PRECONDITION_FAILED, - Error::Forbidden(_) => StatusCode::FORBIDDEN, - Error::NotAcceptable(_) => StatusCode::NOT_ACCEPTABLE, - Error::InternalError( - GarageError::Timeout - | GarageError::RemoteError(_) - | GarageError::Quorum(_, _, _, _), - ) => StatusCode::SERVICE_UNAVAILABLE, - Error::InternalError(_) | Error::Hyper(_) | Error::Http(_) => { - StatusCode::INTERNAL_SERVER_ERROR - } - Error::InvalidRange(_) => StatusCode::RANGE_NOT_SATISFIABLE, - Error::NotImplemented(_) => StatusCode::NOT_IMPLEMENTED, - _ => 
StatusCode::BAD_REQUEST, - } - } - - pub fn aws_code(&self) -> &'static str { - match self { - Error::NoSuchKey => "NoSuchKey", - Error::NoSuchBucket => "NoSuchBucket", - Error::NoSuchUpload => "NoSuchUpload", - Error::BucketAlreadyExists => "BucketAlreadyExists", - Error::BucketNotEmpty => "BucketNotEmpty", - Error::PreconditionFailed => "PreconditionFailed", - Error::InvalidPart => "InvalidPart", - Error::InvalidPartOrder => "InvalidPartOrder", - Error::EntityTooSmall => "EntityTooSmall", - Error::Forbidden(_) => "AccessDenied", - Error::AuthorizationHeaderMalformed(_) => "AuthorizationHeaderMalformed", - Error::NotImplemented(_) => "NotImplemented", - Error::InternalError( - GarageError::Timeout - | GarageError::RemoteError(_) - | GarageError::Quorum(_, _, _, _), - ) => "ServiceUnavailable", - Error::InternalError(_) | Error::Hyper(_) | Error::Http(_) => "InternalError", - _ => "InvalidRequest", - } - } - - pub fn aws_xml(&self, garage_region: &str, path: &str) -> String { - let error = s3_xml::Error { - code: s3_xml::Value(self.aws_code().to_string()), - message: s3_xml::Value(format!("{}", self)), - resource: Some(s3_xml::Value(path.to_string())), - region: Some(s3_xml::Value(garage_region.to_string())), - }; - s3_xml::to_xml_with_header(&error).unwrap_or_else(|_| { - r#" - - - InternalError - XML encoding of error failed - - "# - .into() - }) - } - - pub fn add_headers(&self, header_map: &mut HeaderMap) { - use hyper::header; - #[allow(clippy::single_match)] - match self { - Error::InvalidRange((_, len)) => { - header_map.append( - header::CONTENT_RANGE, - format!("bytes */{}", len) - .try_into() - .expect("header value only contain ascii"), - ); - } - _ => (), - } - } -} - -/// Trait to map error to the Bad Request error code -pub trait OkOrBadRequest { - type S; - fn ok_or_bad_request>(self, reason: M) -> Result; -} - -impl OkOrBadRequest for Result -where - E: std::fmt::Display, -{ - type S = T; - fn ok_or_bad_request>(self, reason: M) -> Result { - match self { - Ok(x) => Ok(x), - Err(e) => Err(Error::BadRequest(format!("{}: {}", reason.as_ref(), e))), - } - } -} - -impl OkOrBadRequest for Option { - type S = T; - fn ok_or_bad_request>(self, reason: M) -> Result { - match self { - Some(x) => Ok(x), - None => Err(Error::BadRequest(reason.as_ref().to_string())), - } - } -} - -/// Trait to map an error to an Internal Error code -pub trait OkOrInternalError { - type S; - fn ok_or_internal_error>(self, reason: M) -> Result; -} - -impl OkOrInternalError for Result -where - E: std::fmt::Display, -{ - type S = T; - fn ok_or_internal_error>(self, reason: M) -> Result { - match self { - Ok(x) => Ok(x), - Err(e) => Err(Error::InternalError(GarageError::Message(format!( - "{}: {}", - reason.as_ref(), - e - )))), - } - } -} - -impl OkOrInternalError for Option { - type S = T; - fn ok_or_internal_error>(self, reason: M) -> Result { - match self { - Some(x) => Ok(x), - None => Err(Error::InternalError(GarageError::Message( - reason.as_ref().to_string(), - ))), - } - } -} diff --git a/src/api/generic_server.rs b/src/api/generic_server.rs index 9281e596..77278908 100644 --- a/src/api/generic_server.rs +++ b/src/api/generic_server.rs @@ -5,9 +5,11 @@ use async_trait::async_trait; use futures::future::Future; +use hyper::header::HeaderValue; use hyper::server::conn::AddrStream; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Request, Response, Server}; +use hyper::{HeaderMap, StatusCode}; use opentelemetry::{ global, @@ -19,26 +21,31 @@ use opentelemetry::{ use 
garage_util::error::Error as GarageError; use garage_util::metrics::{gen_trace_id, RecordDuration}; -use crate::error::*; - pub(crate) trait ApiEndpoint: Send + Sync + 'static { fn name(&self) -> &'static str; fn add_span_attributes(&self, span: SpanRef<'_>); } +pub trait ApiError: std::error::Error + Send + Sync + 'static { + fn http_status_code(&self) -> StatusCode; + fn add_http_headers(&self, header_map: &mut HeaderMap); + fn http_body(&self, garage_region: &str, path: &str) -> Body; +} + #[async_trait] pub(crate) trait ApiHandler: Send + Sync + 'static { const API_NAME: &'static str; const API_NAME_DISPLAY: &'static str; type Endpoint: ApiEndpoint; + type Error: ApiError; - fn parse_endpoint(&self, r: &Request) -> Result; + fn parse_endpoint(&self, r: &Request) -> Result; async fn handle( &self, req: Request, endpoint: Self::Endpoint, - ) -> Result, Error>; + ) -> Result, Self::Error>; } pub(crate) struct ApiServer { @@ -142,13 +149,13 @@ impl ApiServer { Ok(x) } Err(e) => { - let body: Body = Body::from(e.aws_xml(&self.region, uri.path())); + let body: Body = e.http_body(&self.region, uri.path()); let mut http_error_builder = Response::builder() .status(e.http_status_code()) .header("Content-Type", "application/xml"); if let Some(header_map) = http_error_builder.headers_mut() { - e.add_headers(header_map) + e.add_http_headers(header_map) } let http_error = http_error_builder.body(body)?; @@ -163,7 +170,7 @@ impl ApiServer { } } - async fn handler_stage2(&self, req: Request) -> Result, Error> { + async fn handler_stage2(&self, req: Request) -> Result, A::Error> { let endpoint = self.api_handler.parse_endpoint(&req)?; debug!("Endpoint: {}", endpoint.name()); diff --git a/src/api/helpers.rs b/src/api/helpers.rs index a994b82f..9fb12dbe 100644 --- a/src/api/helpers.rs +++ b/src/api/helpers.rs @@ -1,11 +1,8 @@ +use hyper::{Body, Request}; use idna::domain_to_unicode; +use serde::{Deserialize, Serialize}; -use garage_util::data::*; - -use garage_model::garage::Garage; -use garage_model::key_table::Key; - -use crate::error::*; +use crate::common_error::{CommonError as Error, *}; /// What kind of authorization is required to perform a given action #[derive(Debug, Clone, PartialEq, Eq)] @@ -50,7 +47,7 @@ pub fn authority_to_host(authority: &str) -> Result { let mut iter = authority.chars().enumerate(); let (_, first_char) = iter .next() - .ok_or_else(|| Error::BadRequest("Authority is empty".to_string()))?; + .ok_or_else(|| Error::bad_request("Authority is empty".to_string()))?; let split = match first_char { '[' => { @@ -58,7 +55,7 @@ pub fn authority_to_host(authority: &str) -> Result { match iter.next() { Some((_, ']')) => iter.next(), _ => { - return Err(Error::BadRequest(format!( + return Err(Error::bad_request(format!( "Authority {} has an illegal format", authority ))) @@ -71,7 +68,7 @@ pub fn authority_to_host(authority: &str) -> Result { let authority = match split { Some((i, ':')) => Ok(&authority[..i]), None => Ok(authority), - Some((_, _)) => Err(Error::BadRequest(format!( + Some((_, _)) => Err(Error::bad_request(format!( "Authority {} has an illegal format", authority ))), @@ -79,28 +76,6 @@ pub fn authority_to_host(authority: &str) -> Result { authority.map(|h| domain_to_unicode(h).0) } -#[allow(clippy::ptr_arg)] -pub async fn resolve_bucket( - garage: &Garage, - bucket_name: &String, - api_key: &Key, -) -> Result { - let api_key_params = api_key - .state - .as_option() - .ok_or_internal_error("Key should not be deleted at this point")?; - - if let Some(Some(bucket_id)) = 
api_key_params.local_aliases.get(bucket_name) { - Ok(*bucket_id) - } else { - Ok(garage - .bucket_helper() - .resolve_global_bucket_name(bucket_name) - .await? - .ok_or(Error::NoSuchBucket)?) - } -} - /// Extract the bucket name and the key name from an HTTP path and possibly a bucket provided in /// the host header of the request /// @@ -132,7 +107,7 @@ pub fn parse_bucket_key<'a>( None => (path, None), }; if bucket.is_empty() { - return Err(Error::BadRequest("No bucket specified".to_string())); + return Err(Error::bad_request("No bucket specified")); } Ok((bucket, key)) } @@ -163,6 +138,12 @@ pub fn key_after_prefix(pfx: &str) -> Option { None } +pub async fn parse_json_body Deserialize<'de>>(req: Request) -> Result { + let body = hyper::body::to_bytes(req.into_body()).await?; + let resp: T = serde_json::from_slice(&body).ok_or_bad_request("Invalid JSON")?; + Ok(resp) +} + #[cfg(test)] mod tests { use super::*; @@ -298,3 +279,11 @@ mod tests { ); } } + +#[derive(Serialize)] +pub(crate) struct CustomApiErrorBody { + pub(crate) code: String, + pub(crate) message: String, + pub(crate) region: String, + pub(crate) path: String, +} diff --git a/src/api/k2v/api_server.rs b/src/api/k2v/api_server.rs index 5f5e9030..eb0fbdd7 100644 --- a/src/api/k2v/api_server.rs +++ b/src/api/k2v/api_server.rs @@ -7,13 +7,12 @@ use hyper::{Body, Method, Request, Response}; use opentelemetry::{trace::SpanRef, KeyValue}; -use garage_table::util::*; use garage_util::error::Error as GarageError; use garage_model::garage::Garage; -use crate::error::*; use crate::generic_server::*; +use crate::k2v::error::*; use crate::signature::payload::check_payload_signature; use crate::signature::streaming::*; @@ -60,6 +59,7 @@ impl ApiHandler for K2VApiServer { const API_NAME_DISPLAY: &'static str = "K2V"; type Endpoint = K2VApiEndpoint; + type Error = Error; fn parse_endpoint(&self, req: &Request) -> Result { let (endpoint, bucket_name) = Endpoint::from_request(req)?; @@ -83,13 +83,14 @@ impl ApiHandler for K2VApiServer { // The OPTIONS method is procesed early, before we even check for an API key if let Endpoint::Options = endpoint { - return handle_options_s3api(garage, &req, Some(bucket_name)).await; + return Ok(handle_options_s3api(garage, &req, Some(bucket_name)) + .await + .ok_or_bad_request("Error handling OPTIONS")?); } let (api_key, mut content_sha256) = check_payload_signature(&garage, "k2v", &req).await?; - let api_key = api_key.ok_or_else(|| { - Error::Forbidden("Garage does not support anonymous access yet".to_string()) - })?; + let api_key = api_key + .ok_or_else(|| Error::forbidden("Garage does not support anonymous access yet"))?; let req = parse_streaming_body( &api_key, @@ -99,13 +100,14 @@ impl ApiHandler for K2VApiServer { "k2v", )?; - let bucket_id = resolve_bucket(&garage, &bucket_name, &api_key).await?; + let bucket_id = garage + .bucket_helper() + .resolve_bucket(&bucket_name, &api_key) + .await?; let bucket = garage - .bucket_table - .get(&EmptyKey, &bucket_id) - .await? - .filter(|b| !b.state.is_deleted()) - .ok_or(Error::NoSuchBucket)?; + .bucket_helper() + .get_existing_bucket(bucket_id) + .await?; let allowed = match endpoint.authorization_type() { Authorization::Read => api_key.allow_read(&bucket_id), @@ -115,9 +117,7 @@ impl ApiHandler for K2VApiServer { }; if !allowed { - return Err(Error::Forbidden( - "Operation is not allowed for this key.".to_string(), - )); + return Err(Error::forbidden("Operation is not allowed for this key.")); } // Look up what CORS rule might apply to response. 
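
The K2V handler above now builds its errors through the CommonErrorDerivative helpers (Error::forbidden, Error::bad_request) instead of constructing Error::Forbidden(String) variants by hand. A minimal, self-contained sketch of that pattern follows; the names CommonError and CommonErrorDerivative and the message strings mirror the patch, while ApiSpecificError is a purely illustrative stand-in for the per-API error enums (k2v::error::Error, s3::error::Error).

#[derive(Debug)]
enum CommonError {
    Forbidden(String),
    BadRequest(String),
}

// Default methods give every implementor cheap constructors; the only
// requirement is a From<CommonError> conversion.
trait CommonErrorDerivative: From<CommonError> {
    fn forbidden<M: ToString>(msg: M) -> Self {
        Self::from(CommonError::Forbidden(msg.to_string()))
    }
    fn bad_request<M: ToString>(msg: M) -> Self {
        Self::from(CommonError::BadRequest(msg.to_string()))
    }
}

// Stand-in for a per-API error type such as k2v::error::Error.
#[derive(Debug)]
enum ApiSpecificError {
    Common(CommonError),
    NoSuchKey,
}

impl From<CommonError> for ApiSpecificError {
    fn from(e: CommonError) -> Self {
        ApiSpecificError::Common(e)
    }
}

impl CommonErrorDerivative for ApiSpecificError {}

fn main() {
    // The short forms used throughout the patch:
    let forbidden = ApiSpecificError::forbidden("Operation is not allowed for this key.");
    let bad = ApiSpecificError::bad_request("Missing bucket name");
    println!("{:?} / {:?} / {:?}", forbidden, bad, ApiSpecificError::NoSuchKey);
}

Because From<CommonError> (and therefore Sized) is a supertrait, the default methods can return Self directly, so each per-API error type obtains these constructors from a one-line impl CommonErrorDerivative for ... {}.
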
@@ -125,7 +125,8 @@ impl ApiHandler for K2VApiServer { // are always preflighted, i.e. the browser should make // an OPTIONS call before to check it is allowed let matching_cors_rule = match *req.method() { - Method::GET | Method::HEAD | Method::POST => find_matching_cors_rule(&bucket, &req)?, + Method::GET | Method::HEAD | Method::POST => find_matching_cors_rule(&bucket, &req) + .ok_or_internal_error("Error looking up CORS rule")?, _ => None, }; diff --git a/src/api/k2v/batch.rs b/src/api/k2v/batch.rs index 4ecddeb9..db9901cf 100644 --- a/src/api/k2v/batch.rs +++ b/src/api/k2v/batch.rs @@ -12,7 +12,8 @@ use garage_model::garage::Garage; use garage_model::k2v::causality::*; use garage_model::k2v::item_table::*; -use crate::error::*; +use crate::helpers::*; +use crate::k2v::error::*; use crate::k2v::range::read_range; pub async fn handle_insert_batch( @@ -20,9 +21,7 @@ pub async fn handle_insert_batch( bucket_id: Uuid, req: Request, ) -> Result, Error> { - let body = hyper::body::to_bytes(req.into_body()).await?; - let items: Vec = - serde_json::from_slice(&body).ok_or_bad_request("Invalid JSON")?; + let items = parse_json_body::>(req).await?; let mut items2 = vec![]; for it in items { @@ -52,9 +51,7 @@ pub async fn handle_read_batch( bucket_id: Uuid, req: Request, ) -> Result, Error> { - let body = hyper::body::to_bytes(req.into_body()).await?; - let queries: Vec = - serde_json::from_slice(&body).ok_or_bad_request("Invalid JSON")?; + let queries = parse_json_body::>(req).await?; let resp_results = futures::future::join_all( queries @@ -91,7 +88,7 @@ async fn handle_read_batch_query( let (items, more, next_start) = if query.single_item { if query.prefix.is_some() || query.end.is_some() || query.limit.is_some() || query.reverse { - return Err(Error::BadRequest("Batch query parameters 'prefix', 'end', 'limit' and 'reverse' must not be set when singleItem is true.".into())); + return Err(Error::bad_request("Batch query parameters 'prefix', 'end', 'limit' and 'reverse' must not be set when singleItem is true.")); } let sk = query .start @@ -149,9 +146,7 @@ pub async fn handle_delete_batch( bucket_id: Uuid, req: Request, ) -> Result, Error> { - let body = hyper::body::to_bytes(req.into_body()).await?; - let queries: Vec = - serde_json::from_slice(&body).ok_or_bad_request("Invalid JSON")?; + let queries = parse_json_body::>(req).await?; let resp_results = futures::future::join_all( queries @@ -188,7 +183,7 @@ async fn handle_delete_batch_query( let deleted_items = if query.single_item { if query.prefix.is_some() || query.end.is_some() { - return Err(Error::BadRequest("Batch query parameters 'prefix' and 'end' must not be set when singleItem is true.".into())); + return Err(Error::bad_request("Batch query parameters 'prefix' and 'end' must not be set when singleItem is true.")); } let sk = query .start diff --git a/src/api/k2v/error.rs b/src/api/k2v/error.rs new file mode 100644 index 00000000..4c55d8b5 --- /dev/null +++ b/src/api/k2v/error.rs @@ -0,0 +1,134 @@ +use err_derive::Error; +use hyper::header::HeaderValue; +use hyper::{Body, HeaderMap, StatusCode}; + +use garage_model::helper::error::Error as HelperError; + +use crate::common_error::CommonError; +pub use crate::common_error::{CommonErrorDerivative, OkOrBadRequest, OkOrInternalError}; +use crate::generic_server::ApiError; +use crate::helpers::CustomApiErrorBody; +use crate::signature::error::Error as SignatureError; + +/// Errors of this crate +#[derive(Debug, Error)] +pub enum Error { + #[error(display = "{}", _0)] + /// Error from 
common error + Common(CommonError), + + // Category: cannot process + /// Authorization Header Malformed + #[error(display = "Authorization header malformed, expected scope: {}", _0)] + AuthorizationHeaderMalformed(String), + + /// The object requested don't exists + #[error(display = "Key not found")] + NoSuchKey, + + /// Some base64 encoded data was badly encoded + #[error(display = "Invalid base64: {}", _0)] + InvalidBase64(#[error(source)] base64::DecodeError), + + /// The client sent a header with invalid value + #[error(display = "Invalid header value: {}", _0)] + InvalidHeader(#[error(source)] hyper::header::ToStrError), + + /// The client asked for an invalid return format (invalid Accept header) + #[error(display = "Not acceptable: {}", _0)] + NotAcceptable(String), + + /// The request contained an invalid UTF-8 sequence in its path or in other parameters + #[error(display = "Invalid UTF-8: {}", _0)] + InvalidUtf8Str(#[error(source)] std::str::Utf8Error), +} + +impl From for Error +where + CommonError: From, +{ + fn from(err: T) -> Self { + Error::Common(CommonError::from(err)) + } +} + +impl CommonErrorDerivative for Error {} + +impl From for Error { + fn from(err: HelperError) -> Self { + match err { + HelperError::Internal(i) => Self::Common(CommonError::InternalError(i)), + HelperError::BadRequest(b) => Self::Common(CommonError::BadRequest(b)), + HelperError::InvalidBucketName(n) => Self::Common(CommonError::InvalidBucketName(n)), + HelperError::NoSuchBucket(n) => Self::Common(CommonError::NoSuchBucket(n)), + e => Self::Common(CommonError::BadRequest(format!("{}", e))), + } + } +} + +impl From for Error { + fn from(err: SignatureError) -> Self { + match err { + SignatureError::Common(c) => Self::Common(c), + SignatureError::AuthorizationHeaderMalformed(c) => { + Self::AuthorizationHeaderMalformed(c) + } + SignatureError::InvalidUtf8Str(i) => Self::InvalidUtf8Str(i), + SignatureError::InvalidHeader(h) => Self::InvalidHeader(h), + } + } +} + +impl Error { + /// This returns a keyword for the corresponding error. 
+ /// Here, these keywords are not necessarily those from AWS S3, + /// as we are building a custom API + fn code(&self) -> &'static str { + match self { + Error::Common(c) => c.aws_code(), + Error::NoSuchKey => "NoSuchKey", + Error::NotAcceptable(_) => "NotAcceptable", + Error::AuthorizationHeaderMalformed(_) => "AuthorizationHeaderMalformed", + Error::InvalidBase64(_) => "InvalidBase64", + Error::InvalidHeader(_) => "InvalidHeaderValue", + Error::InvalidUtf8Str(_) => "InvalidUtf8String", + } + } +} + +impl ApiError for Error { + /// Get the HTTP status code that best represents the meaning of the error for the client + fn http_status_code(&self) -> StatusCode { + match self { + Error::Common(c) => c.http_status_code(), + Error::NoSuchKey => StatusCode::NOT_FOUND, + Error::NotAcceptable(_) => StatusCode::NOT_ACCEPTABLE, + Error::AuthorizationHeaderMalformed(_) + | Error::InvalidBase64(_) + | Error::InvalidHeader(_) + | Error::InvalidUtf8Str(_) => StatusCode::BAD_REQUEST, + } + } + + fn add_http_headers(&self, _header_map: &mut HeaderMap) { + // nothing + } + + fn http_body(&self, garage_region: &str, path: &str) -> Body { + let error = CustomApiErrorBody { + code: self.code().to_string(), + message: format!("{}", self), + path: path.to_string(), + region: garage_region.to_string(), + }; + Body::from(serde_json::to_string_pretty(&error).unwrap_or_else(|_| { + r#" +{ + "code": "InternalError", + "message": "JSON encoding of error failed" +} + "# + .into() + })) + } +} diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs index 896dbcf0..d5db906d 100644 --- a/src/api/k2v/index.rs +++ b/src/api/k2v/index.rs @@ -12,7 +12,7 @@ use garage_table::util::*; use garage_model::garage::Garage; use garage_model::k2v::counter_table::{BYTES, CONFLICTS, ENTRIES, VALUES}; -use crate::error::*; +use crate::k2v::error::*; use crate::k2v::range::read_range; pub async fn handle_read_index( diff --git a/src/api/k2v/item.rs b/src/api/k2v/item.rs index 1860863e..836d386f 100644 --- a/src/api/k2v/item.rs +++ b/src/api/k2v/item.rs @@ -10,7 +10,7 @@ use garage_model::garage::Garage; use garage_model::k2v::causality::*; use garage_model::k2v::item_table::*; -use crate::error::*; +use crate::k2v::error::*; pub const X_GARAGE_CAUSALITY_TOKEN: &str = "X-Garage-Causality-Token"; diff --git a/src/api/k2v/mod.rs b/src/api/k2v/mod.rs index ee210ad5..b6a8c5cf 100644 --- a/src/api/k2v/mod.rs +++ b/src/api/k2v/mod.rs @@ -1,4 +1,5 @@ pub mod api_server; +mod error; mod router; mod batch; diff --git a/src/api/k2v/range.rs b/src/api/k2v/range.rs index 295c34aa..bb9d3be5 100644 --- a/src/api/k2v/range.rs +++ b/src/api/k2v/range.rs @@ -7,8 +7,8 @@ use std::sync::Arc; use garage_table::replication::TableShardedReplication; use garage_table::*; -use crate::error::*; use crate::helpers::key_after_prefix; +use crate::k2v::error::*; /// Read range in a Garage table. 
/// Returns (entries, more?, nextStart) @@ -31,7 +31,7 @@ where (None, Some(s)) => (Some(s.clone()), false), (Some(p), Some(s)) => { if !s.starts_with(p) { - return Err(Error::BadRequest(format!( + return Err(Error::bad_request(format!( "Start key '{}' does not start with prefix '{}'", s, p ))); diff --git a/src/api/k2v/router.rs b/src/api/k2v/router.rs index f948ffce..50e6965b 100644 --- a/src/api/k2v/router.rs +++ b/src/api/k2v/router.rs @@ -1,4 +1,4 @@ -use crate::error::*; +use crate::k2v::error::*; use std::borrow::Cow; @@ -62,7 +62,7 @@ impl Endpoint { .unwrap_or((path.to_owned(), "")); if bucket.is_empty() { - return Err(Error::BadRequest("Missing bucket name".to_owned())); + return Err(Error::bad_request("Missing bucket name")); } if *req.method() == Method::OPTIONS { @@ -83,7 +83,7 @@ impl Endpoint { Method::PUT => Self::from_put(partition_key, &mut query)?, Method::DELETE => Self::from_delete(partition_key, &mut query)?, _ if req.method() == method_search => Self::from_search(partition_key, &mut query)?, - _ => return Err(Error::BadRequest("Unknown method".to_owned())), + _ => return Err(Error::bad_request("Unknown method")), }; if let Some(message) = query.nonempty_message() { diff --git a/src/api/lib.rs b/src/api/lib.rs index 0078f7b5..370dfd7a 100644 --- a/src/api/lib.rs +++ b/src/api/lib.rs @@ -2,16 +2,16 @@ #[macro_use] extern crate tracing; -pub mod error; -pub use error::Error; +pub mod common_error; mod encoding; -mod generic_server; +pub mod generic_server; pub mod helpers; mod router_macros; /// This mode is public only to help testing. Don't expect stability here pub mod signature; +pub mod admin; #[cfg(feature = "k2v")] pub mod k2v; pub mod s3; diff --git a/src/api/router_macros.rs b/src/api/router_macros.rs index 8471407c..4c593300 100644 --- a/src/api/router_macros.rs +++ b/src/api/router_macros.rs @@ -23,6 +23,29 @@ macro_rules! router_match { _ => None } }}; + (@gen_path_parser ($method:expr, $reqpath:expr, $query:expr) + [ + $($meth:ident $path:pat $(if $required:ident)? => $api:ident $(($($conv:ident :: $param:ident),*))?,)* + ]) => {{ + { + use Endpoint::*; + match ($method, $reqpath) { + $( + (&Method::$meth, $path) if true $(&& $query.$required.is_some())? => $api { + $($( + $param: router_match!(@@parse_param $query, $conv, $param), + )*)? + }, + )* + (m, p) => { + return Err(Error::bad_request(format!( + "Unknown API endpoint: {} {}", + m, p + ))) + } + } + } + }}; (@gen_parser ($keyword:expr, $key:ident, $query:expr, $header:expr), key: [$($kw_k:ident $(if $required_k:ident)? $(header $header_k:expr)? => $api_k:ident $(($($conv_k:ident :: $param_k:ident),*))?,)*], no_key: [$($kw_nk:ident $(if $required_nk:ident)? $(if_header $header_nk:expr)? => $api_nk:ident $(($($conv_nk:ident :: $param_nk:ident),*))?,)*]) => {{ @@ -55,7 +78,7 @@ macro_rules! router_match { )*)? }), )* - (kw, _) => Err(Error::BadRequest(format!("Invalid endpoint: {}", kw))) + (kw, _) => Err(Error::bad_request(format!("Invalid endpoint: {}", kw))) } }}; @@ -74,14 +97,14 @@ macro_rules! router_match { .take() .map(|param| param.parse()) .transpose() - .map_err(|_| Error::BadRequest("Failed to parse query parameter".to_owned()))? + .map_err(|_| Error::bad_request("Failed to parse query parameter"))? }}; (@@parse_param $query:expr, parse, $param:ident) => {{ // extract and parse mandatory query parameter // both missing and un-parseable parameters are reported as errors $query.$param.take().ok_or_bad_request("Missing argument for endpoint")? 
.parse() - .map_err(|_| Error::BadRequest("Failed to parse query parameter".to_owned()))? + .map_err(|_| Error::bad_request("Failed to parse query parameter"))? }}; (@func $(#[$doc:meta])* @@ -150,7 +173,7 @@ macro_rules! generateQueryParameters { false } else if v.as_ref().is_empty() { if res.keyword.replace(k).is_some() { - return Err(Error::BadRequest("Multiple keywords".to_owned())); + return Err(Error::bad_request("Multiple keywords")); } continue; } else { @@ -160,7 +183,7 @@ macro_rules! generateQueryParameters { } }; if repeated { - return Err(Error::BadRequest(format!( + return Err(Error::bad_request(format!( "Query parameter repeated: '{}'", k ))); diff --git a/src/api/s3/api_server.rs b/src/api/s3/api_server.rs index 78a69d53..ecc417ab 100644 --- a/src/api/s3/api_server.rs +++ b/src/api/s3/api_server.rs @@ -8,14 +8,13 @@ use hyper::{Body, Method, Request, Response}; use opentelemetry::{trace::SpanRef, KeyValue}; -use garage_table::util::*; use garage_util::error::Error as GarageError; use garage_model::garage::Garage; use garage_model::key_table::Key; -use crate::error::*; use crate::generic_server::*; +use crate::s3::error::*; use crate::signature::payload::check_payload_signature; use crate::signature::streaming::*; @@ -75,6 +74,7 @@ impl ApiHandler for S3ApiServer { const API_NAME_DISPLAY: &'static str = "S3"; type Endpoint = S3ApiEndpoint; + type Error = Error; fn parse_endpoint(&self, req: &Request) -> Result { let authority = req @@ -122,9 +122,8 @@ impl ApiHandler for S3ApiServer { } let (api_key, mut content_sha256) = check_payload_signature(&garage, "s3", &req).await?; - let api_key = api_key.ok_or_else(|| { - Error::Forbidden("Garage does not support anonymous access yet".to_string()) - })?; + let api_key = api_key + .ok_or_else(|| Error::forbidden("Garage does not support anonymous access yet"))?; let req = parse_streaming_body( &api_key, @@ -148,13 +147,14 @@ impl ApiHandler for S3ApiServer { return handle_create_bucket(&garage, req, content_sha256, api_key, bucket_name).await; } - let bucket_id = resolve_bucket(&garage, &bucket_name, &api_key).await?; + let bucket_id = garage + .bucket_helper() + .resolve_bucket(&bucket_name, &api_key) + .await?; let bucket = garage - .bucket_table - .get(&EmptyKey, &bucket_id) - .await? - .filter(|b| !b.state.is_deleted()) - .ok_or(Error::NoSuchBucket)?; + .bucket_helper() + .get_existing_bucket(bucket_id) + .await?; let allowed = match endpoint.authorization_type() { Authorization::Read => api_key.allow_read(&bucket_id), @@ -164,9 +164,7 @@ impl ApiHandler for S3ApiServer { }; if !allowed { - return Err(Error::Forbidden( - "Operation is not allowed for this key.".to_string(), - )); + return Err(Error::forbidden("Operation is not allowed for this key.")); } // Look up what CORS rule might apply to response. 
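
Both API servers now hand their errors back to the shared generic_server, which only talks to them through the new ApiError trait: an HTTP status code, optional extra headers, and a serialized body. The following is an illustrative, dependency-free sketch of that contract, with hyper's StatusCode, HeaderMap and Body replaced by plain standard-library types; the type JsonishError and the render_error helper are hypothetical.

use std::collections::HashMap;
use std::fmt;

// Simplified analogue of crate::generic_server::ApiError.
trait ApiError: fmt::Display {
    fn http_status_code(&self) -> u16;
    fn add_http_headers(&self, header_map: &mut HashMap<String, String>);
    fn http_body(&self, garage_region: &str, path: &str) -> String;
}

struct JsonishError {
    code: &'static str,
    message: String,
}

impl fmt::Display for JsonishError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}: {}", self.code, self.message)
    }
}

impl ApiError for JsonishError {
    fn http_status_code(&self) -> u16 {
        400
    }
    fn add_http_headers(&self, _header_map: &mut HashMap<String, String>) {
        // This error carries no extra headers.
    }
    fn http_body(&self, garage_region: &str, path: &str) -> String {
        // JSON rendering in the spirit of CustomApiErrorBody.
        format!(
            "{{\"code\":\"{}\",\"message\":\"{}\",\"region\":\"{}\",\"path\":\"{}\"}}",
            self.code, self.message, garage_region, path
        )
    }
}

// Roughly what the generic server does once a handler returns Err(e).
fn render_error<E: ApiError>(e: &E, region: &str, path: &str) -> (u16, String) {
    (e.http_status_code(), e.http_body(region, path))
}

fn main() {
    let e = JsonishError { code: "InvalidRequest", message: "Missing bucket name".into() };
    let (status, body) = render_error(&e, "garage", "/bucket/key");
    println!("{} {}", status, body);
}

In the real crate, the S3 implementation renders an XML body via s3_xml, the K2V implementation renders a JSON CustomApiErrorBody, and only the S3 InvalidRange error adds an extra header (Content-Range).
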
@@ -309,7 +307,7 @@ impl ApiHandler for S3ApiServer { ) .await } else { - Err(Error::BadRequest(format!( + Err(Error::bad_request(format!( "Invalid endpoint: list-type={}", list_type ))) diff --git a/src/api/s3/bucket.rs b/src/api/s3/bucket.rs index 93048a8c..2071fe55 100644 --- a/src/api/s3/bucket.rs +++ b/src/api/s3/bucket.rs @@ -8,13 +8,13 @@ use garage_model::bucket_table::Bucket; use garage_model::garage::Garage; use garage_model::key_table::Key; use garage_model::permission::BucketKeyPerm; -use garage_model::s3::object_table::ObjectFilter; use garage_table::util::*; use garage_util::crdt::*; use garage_util::data::*; use garage_util::time::*; -use crate::error::*; +use crate::common_error::CommonError; +use crate::s3::error::*; use crate::s3::xml as s3_xml; use crate::signature::verify_signed_content; @@ -130,7 +130,7 @@ pub async fn handle_create_bucket( if let Some(location_constraint) = cmd { if location_constraint != garage.config.s3_api.s3_region { - return Err(Error::BadRequest(format!( + return Err(Error::bad_request(format!( "Cannot satisfy location constraint `{}`: buckets can only be created in region `{}`", location_constraint, garage.config.s3_api.s3_region @@ -158,12 +158,12 @@ pub async fn handle_create_bucket( // otherwise return a forbidden error. let kp = api_key.bucket_permissions(&bucket_id); if !(kp.allow_write || kp.allow_owner) { - return Err(Error::BucketAlreadyExists); + return Err(CommonError::BucketAlreadyExists.into()); } } else { // Create the bucket! if !is_valid_bucket_name(&bucket_name) { - return Err(Error::BadRequest(format!( + return Err(Error::bad_request(format!( "{}: {}", bucket_name, INVALID_BUCKET_NAME_MESSAGE ))); @@ -228,18 +228,8 @@ pub async fn handle_delete_bucket( // Delete bucket // Check bucket is empty - let objects = garage - .object_table - .get_range( - &bucket_id, - None, - Some(ObjectFilter::IsData), - 10, - EnumerationOrder::Forward, - ) - .await?; - if !objects.is_empty() { - return Err(Error::BucketNotEmpty); + if !garage.bucket_helper().is_bucket_empty(bucket_id).await? { + return Err(CommonError::BucketNotEmpty.into()); } // --- done checking, now commit --- diff --git a/src/api/s3/copy.rs b/src/api/s3/copy.rs index 4e94d887..0fc16993 100644 --- a/src/api/s3/copy.rs +++ b/src/api/s3/copy.rs @@ -18,8 +18,8 @@ use garage_model::s3::block_ref_table::*; use garage_model::s3::object_table::*; use garage_model::s3::version_table::*; -use crate::error::*; -use crate::helpers::{parse_bucket_key, resolve_bucket}; +use crate::helpers::parse_bucket_key; +use crate::s3::error::*; use crate::s3::put::{decode_upload_id, get_headers}; use crate::s3::xml::{self as s3_xml, xmlns_tag}; @@ -201,8 +201,8 @@ pub async fn handle_upload_part_copy( let mut ranges = http_range::HttpRange::parse(range_str, source_version_meta.size) .map_err(|e| (e, source_version_meta.size))?; if ranges.len() != 1 { - return Err(Error::BadRequest( - "Invalid x-amz-copy-source-range header: exactly 1 range must be given".into(), + return Err(Error::bad_request( + "Invalid x-amz-copy-source-range header: exactly 1 range must be given", )); } else { ranges.pop().unwrap() @@ -230,8 +230,8 @@ pub async fn handle_upload_part_copy( // This is only for small files, we don't bother handling this. 
// (in AWS UploadPartCopy works for parts at least 5MB which // is never the case of an inline object) - return Err(Error::BadRequest( - "Source object is too small (minimum part size is 5Mb)".into(), + return Err(Error::bad_request( + "Source object is too small (minimum part size is 5Mb)", )); } ObjectVersionData::FirstBlock(_meta, _first_block_hash) => (), @@ -250,7 +250,7 @@ pub async fn handle_upload_part_copy( // Check this part number hasn't yet been uploaded if let Some(dv) = dest_version { if dv.has_part_number(part_number) { - return Err(Error::BadRequest(format!( + return Err(Error::bad_request(format!( "Part number {} has already been uploaded", part_number ))); @@ -413,10 +413,13 @@ async fn get_copy_source( let copy_source = percent_encoding::percent_decode_str(copy_source).decode_utf8()?; let (source_bucket, source_key) = parse_bucket_key(©_source, None)?; - let source_bucket_id = resolve_bucket(garage, &source_bucket.to_string(), api_key).await?; + let source_bucket_id = garage + .bucket_helper() + .resolve_bucket(&source_bucket.to_string(), api_key) + .await?; if !api_key.allow_read(&source_bucket_id) { - return Err(Error::Forbidden(format!( + return Err(Error::forbidden(format!( "Reading from bucket {} not allowed for this key", source_bucket ))); @@ -536,8 +539,8 @@ impl CopyPreconditionHeaders { (None, None, None, Some(ims)) => v_date > *ims, (None, None, None, None) => true, _ => { - return Err(Error::BadRequest( - "Invalid combination of x-amz-copy-source-if-xxxxx headers".into(), + return Err(Error::bad_request( + "Invalid combination of x-amz-copy-source-if-xxxxx headers", )) } }; diff --git a/src/api/s3/cors.rs b/src/api/s3/cors.rs index 37ea2e43..c7273464 100644 --- a/src/api/s3/cors.rs +++ b/src/api/s3/cors.rs @@ -9,13 +9,12 @@ use hyper::{header::HeaderName, Body, Method, Request, Response, StatusCode}; use serde::{Deserialize, Serialize}; -use crate::error::*; +use crate::s3::error::*; use crate::s3::xml::{to_xml_with_header, xmlns_tag, IntValue, Value}; use crate::signature::verify_signed_content; use garage_model::bucket_table::{Bucket, CorsRule as GarageCorsRule}; use garage_model::garage::Garage; -use garage_table::*; use garage_util::data::*; pub async fn handle_get_cors(bucket: &Bucket) -> Result, Error> { @@ -48,14 +47,11 @@ pub async fn handle_delete_cors( bucket_id: Uuid, ) -> Result, Error> { let mut bucket = garage - .bucket_table - .get(&EmptyKey, &bucket_id) - .await? - .ok_or(Error::NoSuchBucket)?; + .bucket_helper() + .get_existing_bucket(bucket_id) + .await?; - let param = bucket - .params_mut() - .ok_or_internal_error("Bucket should not be deleted at this point")?; + let param = bucket.params_mut().unwrap(); param.cors_config.update(None); garage.bucket_table.insert(&bucket).await?; @@ -78,14 +74,11 @@ pub async fn handle_put_cors( } let mut bucket = garage - .bucket_table - .get(&EmptyKey, &bucket_id) - .await? - .ok_or(Error::NoSuchBucket)?; + .bucket_helper() + .get_existing_bucket(bucket_id) + .await?; - let param = bucket - .params_mut() - .ok_or_internal_error("Bucket should not be deleted at this point")?; + let param = bucket.params_mut().unwrap(); let conf: CorsConfiguration = from_reader(&body as &[u8])?; conf.validate()?; @@ -119,12 +112,7 @@ pub async fn handle_options_s3api( let helper = garage.bucket_helper(); let bucket_id = helper.resolve_global_bucket_name(&bn).await?; if let Some(id) = bucket_id { - let bucket = garage - .bucket_table - .get(&EmptyKey, &id) - .await? 
- .filter(|b| !b.state.is_deleted()) - .ok_or(Error::NoSuchBucket)?; + let bucket = garage.bucket_helper().get_existing_bucket(id).await?; handle_options_for_bucket(req, &bucket) } else { // If there is a bucket name in the request, but that name @@ -185,7 +173,7 @@ pub fn handle_options_for_bucket( } } - Err(Error::Forbidden("This CORS request is not allowed.".into())) + Err(Error::forbidden("This CORS request is not allowed.")) } pub fn find_matching_cors_rule<'a>( diff --git a/src/api/s3/delete.rs b/src/api/s3/delete.rs index 1e3f1249..5065b285 100644 --- a/src/api/s3/delete.rs +++ b/src/api/s3/delete.rs @@ -8,7 +8,7 @@ use garage_util::time::*; use garage_model::garage::Garage; use garage_model::s3::object_table::*; -use crate::error::*; +use crate::s3::error::*; use crate::s3::xml as s3_xml; use crate::signature::verify_signed_content; diff --git a/src/api/s3/error.rs b/src/api/s3/error.rs new file mode 100644 index 00000000..ac632540 --- /dev/null +++ b/src/api/s3/error.rs @@ -0,0 +1,207 @@ +use std::convert::TryInto; + +use err_derive::Error; +use hyper::header::HeaderValue; +use hyper::{Body, HeaderMap, StatusCode}; + +use garage_model::helper::error::Error as HelperError; + +use crate::common_error::CommonError; +pub use crate::common_error::{CommonErrorDerivative, OkOrBadRequest, OkOrInternalError}; +use crate::generic_server::ApiError; +use crate::s3::xml as s3_xml; +use crate::signature::error::Error as SignatureError; + +/// Errors of this crate +#[derive(Debug, Error)] +pub enum Error { + #[error(display = "{}", _0)] + /// Error from common error + Common(CommonError), + + // Category: cannot process + /// Authorization Header Malformed + #[error(display = "Authorization header malformed, expected scope: {}", _0)] + AuthorizationHeaderMalformed(String), + + /// The object requested don't exists + #[error(display = "Key not found")] + NoSuchKey, + + /// The multipart upload requested don't exists + #[error(display = "Upload not found")] + NoSuchUpload, + + /// Precondition failed (e.g. 
x-amz-copy-source-if-match) + #[error(display = "At least one of the preconditions you specified did not hold")] + PreconditionFailed, + + /// Parts specified in CMU request do not match parts actually uploaded + #[error(display = "Parts given to CompleteMultipartUpload do not match uploaded parts")] + InvalidPart, + + /// Parts given to CompleteMultipartUpload were not in ascending order + #[error(display = "Parts given to CompleteMultipartUpload were not in ascending order")] + InvalidPartOrder, + + /// In CompleteMultipartUpload: not enough data + /// (here we are more lenient than AWS S3) + #[error(display = "Proposed upload is smaller than the minimum allowed object size")] + EntityTooSmall, + + // Category: bad request + /// The request contained an invalid UTF-8 sequence in its path or in other parameters + #[error(display = "Invalid UTF-8: {}", _0)] + InvalidUtf8Str(#[error(source)] std::str::Utf8Error), + + /// The request used an invalid path + #[error(display = "Invalid UTF-8: {}", _0)] + InvalidUtf8String(#[error(source)] std::string::FromUtf8Error), + + /// The client sent invalid XML data + #[error(display = "Invalid XML: {}", _0)] + InvalidXml(String), + + /// The client sent a header with invalid value + #[error(display = "Invalid header value: {}", _0)] + InvalidHeader(#[error(source)] hyper::header::ToStrError), + + /// The client sent a range header with invalid value + #[error(display = "Invalid HTTP range: {:?}", _0)] + InvalidRange(#[error(from)] (http_range::HttpRangeParseError, u64)), + + /// The client sent a request for an action not supported by garage + #[error(display = "Unimplemented action: {}", _0)] + NotImplemented(String), +} + +impl From for Error +where + CommonError: From, +{ + fn from(err: T) -> Self { + Error::Common(CommonError::from(err)) + } +} + +impl CommonErrorDerivative for Error {} + +impl From for Error { + fn from(err: HelperError) -> Self { + match err { + HelperError::Internal(i) => Self::Common(CommonError::InternalError(i)), + HelperError::BadRequest(b) => Self::Common(CommonError::BadRequest(b)), + HelperError::InvalidBucketName(n) => Self::Common(CommonError::InvalidBucketName(n)), + HelperError::NoSuchBucket(n) => Self::Common(CommonError::NoSuchBucket(n)), + e => Self::bad_request(format!("{}", e)), + } + } +} + +impl From for Error { + fn from(err: roxmltree::Error) -> Self { + Self::InvalidXml(format!("{}", err)) + } +} + +impl From for Error { + fn from(err: quick_xml::de::DeError) -> Self { + Self::InvalidXml(format!("{}", err)) + } +} + +impl From for Error { + fn from(err: SignatureError) -> Self { + match err { + SignatureError::Common(c) => Self::Common(c), + SignatureError::AuthorizationHeaderMalformed(c) => { + Self::AuthorizationHeaderMalformed(c) + } + SignatureError::InvalidUtf8Str(i) => Self::InvalidUtf8Str(i), + SignatureError::InvalidHeader(h) => Self::InvalidHeader(h), + } + } +} + +impl From for Error { + fn from(err: multer::Error) -> Self { + Self::bad_request(err) + } +} + +impl Error { + pub fn aws_code(&self) -> &'static str { + match self { + Error::Common(c) => c.aws_code(), + Error::NoSuchKey => "NoSuchKey", + Error::NoSuchUpload => "NoSuchUpload", + Error::PreconditionFailed => "PreconditionFailed", + Error::InvalidPart => "InvalidPart", + Error::InvalidPartOrder => "InvalidPartOrder", + Error::EntityTooSmall => "EntityTooSmall", + Error::AuthorizationHeaderMalformed(_) => "AuthorizationHeaderMalformed", + Error::NotImplemented(_) => "NotImplemented", + Error::InvalidXml(_) => "MalformedXML", + 
Error::InvalidRange(_) => "InvalidRange", + Error::InvalidUtf8Str(_) | Error::InvalidUtf8String(_) | Error::InvalidHeader(_) => { + "InvalidRequest" + } + } + } +} + +impl ApiError for Error { + /// Get the HTTP status code that best represents the meaning of the error for the client + fn http_status_code(&self) -> StatusCode { + match self { + Error::Common(c) => c.http_status_code(), + Error::NoSuchKey | Error::NoSuchUpload => StatusCode::NOT_FOUND, + Error::PreconditionFailed => StatusCode::PRECONDITION_FAILED, + Error::InvalidRange(_) => StatusCode::RANGE_NOT_SATISFIABLE, + Error::NotImplemented(_) => StatusCode::NOT_IMPLEMENTED, + Error::AuthorizationHeaderMalformed(_) + | Error::InvalidPart + | Error::InvalidPartOrder + | Error::EntityTooSmall + | Error::InvalidXml(_) + | Error::InvalidUtf8Str(_) + | Error::InvalidUtf8String(_) + | Error::InvalidHeader(_) => StatusCode::BAD_REQUEST, + } + } + + fn add_http_headers(&self, header_map: &mut HeaderMap) { + use hyper::header; + #[allow(clippy::single_match)] + match self { + Error::InvalidRange((_, len)) => { + header_map.append( + header::CONTENT_RANGE, + format!("bytes */{}", len) + .try_into() + .expect("header value only contain ascii"), + ); + } + _ => (), + } + } + + fn http_body(&self, garage_region: &str, path: &str) -> Body { + let error = s3_xml::Error { + code: s3_xml::Value(self.aws_code().to_string()), + message: s3_xml::Value(format!("{}", self)), + resource: Some(s3_xml::Value(path.to_string())), + region: Some(s3_xml::Value(garage_region.to_string())), + }; + Body::from(s3_xml::to_xml_with_header(&error).unwrap_or_else(|_| { + r#" + + + InternalError + XML encoding of error failed + + "# + .into() + })) + } +} diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index 3edf22a6..7fa1a177 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -17,7 +17,7 @@ use garage_model::garage::Garage; use garage_model::s3::object_table::*; use garage_model::s3::version_table::*; -use crate::error::*; +use crate::s3::error::*; const X_AMZ_MP_PARTS_COUNT: &str = "x-amz-mp-parts-count"; @@ -210,8 +210,8 @@ pub async fn handle_get( match (part_number, parse_range_header(req, last_v_meta.size)?) { (Some(_), Some(_)) => { - return Err(Error::BadRequest( - "Cannot specify both partNumber and Range header".into(), + return Err(Error::bad_request( + "Cannot specify both partNumber and Range header", )); } (Some(pn), None) => { @@ -302,9 +302,9 @@ async fn handle_get_range( let body: Body = Body::from(bytes[begin as usize..end as usize].to_vec()); Ok(resp_builder.body(body)?) } else { - None.ok_or_internal_error( + Err(Error::internal_error( "Requested range not present in inline bytes when it should have been", - ) + )) } } ObjectVersionData::FirstBlock(_meta, _first_block_hash) => { diff --git a/src/api/s3/list.rs b/src/api/s3/list.rs index e2848c57..e5f486c8 100644 --- a/src/api/s3/list.rs +++ b/src/api/s3/list.rs @@ -16,8 +16,8 @@ use garage_model::s3::version_table::Version; use garage_table::{EmptyKey, EnumerationOrder}; use crate::encoding::*; -use crate::error::*; use crate::helpers::key_after_prefix; +use crate::s3::error::*; use crate::s3::put as s3_put; use crate::s3::xml as s3_xml; @@ -582,13 +582,19 @@ impl ListObjectsQuery { // representing the key to start with. 
(Some(token), _) => match &token[..1] { "[" => Ok(RangeBegin::IncludingKey { - key: String::from_utf8(base64::decode(token[1..].as_bytes())?)?, + key: String::from_utf8( + base64::decode(token[1..].as_bytes()) + .ok_or_bad_request("Invalid continuation token")?, + )?, fallback_key: None, }), "]" => Ok(RangeBegin::AfterKey { - key: String::from_utf8(base64::decode(token[1..].as_bytes())?)?, + key: String::from_utf8( + base64::decode(token[1..].as_bytes()) + .ok_or_bad_request("Invalid continuation token")?, + )?, }), - _ => Err(Error::BadRequest("Invalid continuation token".to_string())), + _ => Err(Error::bad_request("Invalid continuation token")), }, // StartAfter has defined semantics in the spec: diff --git a/src/api/s3/mod.rs b/src/api/s3/mod.rs index 3f5c1915..7b56d4d8 100644 --- a/src/api/s3/mod.rs +++ b/src/api/s3/mod.rs @@ -1,4 +1,5 @@ pub mod api_server; +pub mod error; mod bucket; mod copy; diff --git a/src/api/s3/post_object.rs b/src/api/s3/post_object.rs index 86fa7880..dc640f43 100644 --- a/src/api/s3/post_object.rs +++ b/src/api/s3/post_object.rs @@ -14,8 +14,7 @@ use serde::Deserialize; use garage_model::garage::Garage; -use crate::error::*; -use crate::helpers::resolve_bucket; +use crate::s3::error::*; use crate::s3::put::{get_headers, save_stream}; use crate::s3::xml as s3_xml; use crate::signature::payload::{parse_date, verify_v4}; @@ -48,9 +47,7 @@ pub async fn handle_post_object( let field = if let Some(field) = multipart.next_field().await? { field } else { - return Err(Error::BadRequest( - "Request did not contain a file".to_owned(), - )); + return Err(Error::bad_request("Request did not contain a file")); }; let name: HeaderName = if let Some(Ok(name)) = field.name().map(TryInto::try_into) { name @@ -66,14 +63,14 @@ pub async fn handle_post_object( "tag" => (/* tag need to be reencoded, but we don't support them yet anyway */), "acl" => { if params.insert("x-amz-acl", content).is_some() { - return Err(Error::BadRequest( - "Field 'acl' provided more than one time".to_string(), + return Err(Error::bad_request( + "Field 'acl' provided more than one time", )); } } _ => { if params.insert(&name, content).is_some() { - return Err(Error::BadRequest(format!( + return Err(Error::bad_request(format!( "Field '{}' provided more than one time", name ))); @@ -90,9 +87,7 @@ pub async fn handle_post_object( .to_str()?; let credential = params .get("x-amz-credential") - .ok_or_else(|| { - Error::Forbidden("Garage does not support anonymous access yet".to_string()) - })? + .ok_or_else(|| Error::forbidden("Garage does not support anonymous access yet"))? .to_str()?; let policy = params .get("policy") @@ -129,15 +124,16 @@ pub async fn handle_post_object( ) .await?; - let bucket_id = resolve_bucket(&garage, &bucket, &api_key).await?; + let bucket_id = garage + .bucket_helper() + .resolve_bucket(&bucket, &api_key) + .await?; if !api_key.allow_write(&bucket_id) { - return Err(Error::Forbidden( - "Operation is not allowed for this key.".to_string(), - )); + return Err(Error::forbidden("Operation is not allowed for this key.")); } - let decoded_policy = base64::decode(&policy)?; + let decoded_policy = base64::decode(&policy).ok_or_bad_request("Invalid policy")?; let decoded_policy: Policy = serde_json::from_slice(&decoded_policy).ok_or_bad_request("Invalid policy")?; @@ -145,9 +141,7 @@ pub async fn handle_post_object( .ok_or_bad_request("Invalid expiration date")? 
.into(); if Utc::now() - expiration > Duration::zero() { - return Err(Error::BadRequest( - "Expiration date is in the paste".to_string(), - )); + return Err(Error::bad_request("Expiration date is in the paste")); } let mut conditions = decoded_policy.into_conditions()?; @@ -159,7 +153,7 @@ pub async fn handle_post_object( "policy" | "x-amz-signature" => (), // this is always accepted, as it's required to validate other fields "content-type" => { let conds = conditions.params.remove("content-type").ok_or_else(|| { - Error::BadRequest(format!("Key '{}' is not allowed in policy", param_key)) + Error::bad_request(format!("Key '{}' is not allowed in policy", param_key)) })?; for cond in conds { let ok = match cond { @@ -169,7 +163,7 @@ pub async fn handle_post_object( } }; if !ok { - return Err(Error::BadRequest(format!( + return Err(Error::bad_request(format!( "Key '{}' has value not allowed in policy", param_key ))); @@ -178,7 +172,7 @@ pub async fn handle_post_object( } "key" => { let conds = conditions.params.remove("key").ok_or_else(|| { - Error::BadRequest(format!("Key '{}' is not allowed in policy", param_key)) + Error::bad_request(format!("Key '{}' is not allowed in policy", param_key)) })?; for cond in conds { let ok = match cond { @@ -186,7 +180,7 @@ pub async fn handle_post_object( Operation::StartsWith(s) => key.starts_with(&s), }; if !ok { - return Err(Error::BadRequest(format!( + return Err(Error::bad_request(format!( "Key '{}' has value not allowed in policy", param_key ))); @@ -201,7 +195,7 @@ pub async fn handle_post_object( continue; } let conds = conditions.params.remove(¶m_key).ok_or_else(|| { - Error::BadRequest(format!("Key '{}' is not allowed in policy", param_key)) + Error::bad_request(format!("Key '{}' is not allowed in policy", param_key)) })?; for cond in conds { let ok = match cond { @@ -209,7 +203,7 @@ pub async fn handle_post_object( Operation::StartsWith(s) => value.to_str()?.starts_with(s.as_str()), }; if !ok { - return Err(Error::BadRequest(format!( + return Err(Error::bad_request(format!( "Key '{}' has value not allowed in policy", param_key ))); @@ -220,7 +214,7 @@ pub async fn handle_post_object( } if let Some((param_key, _)) = conditions.params.iter().next() { - return Err(Error::BadRequest(format!( + return Err(Error::bad_request(format!( "Key '{}' is required in policy, but no value was provided", param_key ))); @@ -326,7 +320,7 @@ impl Policy { match condition { PolicyCondition::Equal(map) => { if map.len() != 1 { - return Err(Error::BadRequest("Invalid policy item".to_owned())); + return Err(Error::bad_request("Invalid policy item")); } let (mut k, v) = map.into_iter().next().expect("size was verified"); k.make_ascii_lowercase(); @@ -334,7 +328,7 @@ impl Policy { } PolicyCondition::OtherOp([cond, mut key, value]) => { if key.remove(0) != '$' { - return Err(Error::BadRequest("Invalid policy item".to_owned())); + return Err(Error::bad_request("Invalid policy item")); } key.make_ascii_lowercase(); match cond.as_str() { @@ -347,7 +341,7 @@ impl Policy { .or_default() .push(Operation::StartsWith(value)); } - _ => return Err(Error::BadRequest("Invalid policy item".to_owned())), + _ => return Err(Error::bad_request("Invalid policy item")), } } PolicyCondition::SizeRange(key, min, max) => { @@ -355,7 +349,7 @@ impl Policy { length.0 = length.0.max(min); length.1 = length.1.min(max); } else { - return Err(Error::BadRequest("Invalid policy item".to_owned())); + return Err(Error::bad_request("Invalid policy item")); } } } @@ -420,15 +414,15 @@ where self.read += 
bytes.len() as u64; // optimization to fail early when we know before the end it's too long if self.length.end() < &self.read { - return Poll::Ready(Some(Err(Error::BadRequest( - "File size does not match policy".to_owned(), + return Poll::Ready(Some(Err(Error::bad_request( + "File size does not match policy", )))); } } Poll::Ready(None) => { if !self.length.contains(&self.read) { - return Poll::Ready(Some(Err(Error::BadRequest( - "File size does not match policy".to_owned(), + return Poll::Ready(Some(Err(Error::bad_request( + "File size does not match policy", )))); } } diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index 89aa8d84..8b06ef3f 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -19,7 +19,7 @@ use garage_model::s3::block_ref_table::*; use garage_model::s3::object_table::*; use garage_model::s3::version_table::*; -use crate::error::*; +use crate::s3::error::*; use crate::s3::xml as s3_xml; use crate::signature::verify_signed_content; @@ -183,8 +183,8 @@ fn ensure_checksum_matches( ) -> Result<(), Error> { if let Some(expected_sha256) = content_sha256 { if expected_sha256 != data_sha256sum { - return Err(Error::BadRequest( - "Unable to validate x-amz-content-sha256".to_string(), + return Err(Error::bad_request( + "Unable to validate x-amz-content-sha256", )); } else { trace!("Successfully validated x-amz-content-sha256"); @@ -192,9 +192,7 @@ fn ensure_checksum_matches( } if let Some(expected_md5) = content_md5 { if expected_md5.trim_matches('"') != base64::encode(data_md5sum) { - return Err(Error::BadRequest( - "Unable to validate content-md5".to_string(), - )); + return Err(Error::bad_request("Unable to validate content-md5")); } else { trace!("Successfully validated content-md5"); } @@ -428,7 +426,7 @@ pub async fn handle_put_part( // Check part hasn't already been uploaded if let Some(v) = version { if v.has_part_number(part_number) { - return Err(Error::BadRequest(format!( + return Err(Error::bad_request(format!( "Part number {} has already been uploaded", part_number ))); @@ -513,7 +511,7 @@ pub async fn handle_complete_multipart_upload( let version = version.ok_or(Error::NoSuchKey)?; if version.blocks.is_empty() { - return Err(Error::BadRequest("No data was uploaded".to_string())); + return Err(Error::bad_request("No data was uploaded")); } let headers = match object_version.state { @@ -574,8 +572,8 @@ pub async fn handle_complete_multipart_upload( .map(|x| x.part_number) .eq(block_parts.into_iter()); if !same_parts { - return Err(Error::BadRequest( - "Part numbers in block list and part list do not match. This can happen if a part was partially uploaded. Please abort the multipart upload and try again.".into(), + return Err(Error::bad_request( + "Part numbers in block list and part list do not match. This can happen if a part was partially uploaded. Please abort the multipart upload and try again." )); } diff --git a/src/api/s3/router.rs b/src/api/s3/router.rs index 0525c649..44f581ff 100644 --- a/src/api/s3/router.rs +++ b/src/api/s3/router.rs @@ -1,5 +1,3 @@ -use crate::error::{Error, OkOrBadRequest}; - use std::borrow::Cow; use hyper::header::HeaderValue; @@ -7,6 +5,7 @@ use hyper::{HeaderMap, Method, Request}; use crate::helpers::Authorization; use crate::router_macros::{generateQueryParameters, router_match}; +use crate::s3::error::*; router_match! 
{@func @@ -343,7 +342,7 @@ impl Endpoint { Method::POST => Self::from_post(key, &mut query)?, Method::PUT => Self::from_put(key, &mut query, req.headers())?, Method::DELETE => Self::from_delete(key, &mut query)?, - _ => return Err(Error::BadRequest("Unknown method".to_owned())), + _ => return Err(Error::bad_request("Unknown method")), }; if let Some(message) = query.nonempty_message() { diff --git a/src/api/s3/website.rs b/src/api/s3/website.rs index 561130dc..77738971 100644 --- a/src/api/s3/website.rs +++ b/src/api/s3/website.rs @@ -4,13 +4,12 @@ use std::sync::Arc; use hyper::{Body, Request, Response, StatusCode}; use serde::{Deserialize, Serialize}; -use crate::error::*; +use crate::s3::error::*; use crate::s3::xml::{to_xml_with_header, xmlns_tag, IntValue, Value}; use crate::signature::verify_signed_content; use garage_model::bucket_table::*; use garage_model::garage::Garage; -use garage_table::*; use garage_util::data::*; pub async fn handle_get_website(bucket: &Bucket) -> Result, Error> { @@ -47,14 +46,11 @@ pub async fn handle_delete_website( bucket_id: Uuid, ) -> Result, Error> { let mut bucket = garage - .bucket_table - .get(&EmptyKey, &bucket_id) - .await? - .ok_or(Error::NoSuchBucket)?; + .bucket_helper() + .get_existing_bucket(bucket_id) + .await?; - let param = bucket - .params_mut() - .ok_or_internal_error("Bucket should not be deleted at this point")?; + let param = bucket.params_mut().unwrap(); param.website_config.update(None); garage.bucket_table.insert(&bucket).await?; @@ -77,14 +73,11 @@ pub async fn handle_put_website( } let mut bucket = garage - .bucket_table - .get(&EmptyKey, &bucket_id) - .await? - .ok_or(Error::NoSuchBucket)?; + .bucket_helper() + .get_existing_bucket(bucket_id) + .await?; - let param = bucket - .params_mut() - .ok_or_internal_error("Bucket should not be deleted at this point")?; + let param = bucket.params_mut().unwrap(); let conf: WebsiteConfiguration = from_reader(&body as &[u8])?; conf.validate()?; @@ -176,8 +169,8 @@ impl WebsiteConfiguration { || self.index_document.is_some() || self.routing_rules.is_some()) { - return Err(Error::BadRequest( - "Bad XML: can't have RedirectAllRequestsTo and other fields".to_owned(), + return Err(Error::bad_request( + "Bad XML: can't have RedirectAllRequestsTo and other fields", )); } if let Some(ref ed) = self.error_document { @@ -222,8 +215,8 @@ impl WebsiteConfiguration { impl Key { pub fn validate(&self) -> Result<(), Error> { if self.key.0.is_empty() { - Err(Error::BadRequest( - "Bad XML: error document specified but empty".to_owned(), + Err(Error::bad_request( + "Bad XML: error document specified but empty", )) } else { Ok(()) @@ -234,8 +227,8 @@ impl Key { impl Suffix { pub fn validate(&self) -> Result<(), Error> { if self.suffix.0.is_empty() | self.suffix.0.contains('/') { - Err(Error::BadRequest( - "Bad XML: index document is empty or contains /".to_owned(), + Err(Error::bad_request( + "Bad XML: index document is empty or contains /", )) } else { Ok(()) @@ -247,7 +240,7 @@ impl Target { pub fn validate(&self) -> Result<(), Error> { if let Some(ref protocol) = self.protocol { if protocol.0 != "http" && protocol.0 != "https" { - return Err(Error::BadRequest("Bad XML: invalid protocol".to_owned())); + return Err(Error::bad_request("Bad XML: invalid protocol")); } } Ok(()) @@ -269,19 +262,19 @@ impl Redirect { pub fn validate(&self, has_prefix: bool) -> Result<(), Error> { if self.replace_prefix.is_some() { if self.replace_full.is_some() { - return Err(Error::BadRequest( - "Bad XML: both 
ReplaceKeyPrefixWith and ReplaceKeyWith are set".to_owned(), + return Err(Error::bad_request( + "Bad XML: both ReplaceKeyPrefixWith and ReplaceKeyWith are set", )); } if !has_prefix { - return Err(Error::BadRequest( - "Bad XML: ReplaceKeyPrefixWith is set, but KeyPrefixEquals isn't".to_owned(), + return Err(Error::bad_request( + "Bad XML: ReplaceKeyPrefixWith is set, but KeyPrefixEquals isn't", )); } } if let Some(ref protocol) = self.protocol { if protocol.0 != "http" && protocol.0 != "https" { - return Err(Error::BadRequest("Bad XML: invalid protocol".to_owned())); + return Err(Error::bad_request("Bad XML: invalid protocol")); } } // TODO there are probably more invalide cases, but which ones? diff --git a/src/api/s3/xml.rs b/src/api/s3/xml.rs index 75ec4559..111657a0 100644 --- a/src/api/s3/xml.rs +++ b/src/api/s3/xml.rs @@ -1,7 +1,7 @@ use quick_xml::se::to_string; use serde::{Deserialize, Serialize, Serializer}; -use crate::Error as ApiError; +use crate::s3::error::Error as ApiError; pub fn to_xml_with_header(x: &T) -> Result { let mut xml = r#""#.to_string(); diff --git a/src/api/signature/error.rs b/src/api/signature/error.rs new file mode 100644 index 00000000..f5a067bd --- /dev/null +++ b/src/api/signature/error.rs @@ -0,0 +1,36 @@ +use err_derive::Error; + +use crate::common_error::CommonError; +pub use crate::common_error::{CommonErrorDerivative, OkOrBadRequest, OkOrInternalError}; + +/// Errors of this crate +#[derive(Debug, Error)] +pub enum Error { + #[error(display = "{}", _0)] + /// Error from common error + Common(CommonError), + + /// Authorization Header Malformed + #[error(display = "Authorization header malformed, expected scope: {}", _0)] + AuthorizationHeaderMalformed(String), + + // Category: bad request + /// The request contained an invalid UTF-8 sequence in its path or in other parameters + #[error(display = "Invalid UTF-8: {}", _0)] + InvalidUtf8Str(#[error(source)] std::str::Utf8Error), + + /// The client sent a header with invalid value + #[error(display = "Invalid header value: {}", _0)] + InvalidHeader(#[error(source)] hyper::header::ToStrError), +} + +impl From for Error +where + CommonError: From, +{ + fn from(err: T) -> Self { + Error::Common(CommonError::from(err)) + } +} + +impl CommonErrorDerivative for Error {} diff --git a/src/api/signature/mod.rs b/src/api/signature/mod.rs index 5646f4fa..dd5b590c 100644 --- a/src/api/signature/mod.rs +++ b/src/api/signature/mod.rs @@ -4,11 +4,12 @@ use sha2::Sha256; use garage_util::data::{sha256sum, Hash}; -use crate::error::*; - +pub mod error; pub mod payload; pub mod streaming; +use error::*; + pub const SHORT_DATE: &str = "%Y%m%d"; pub const LONG_DATETIME: &str = "%Y%m%dT%H%M%SZ"; @@ -16,7 +17,7 @@ type HmacSha256 = Hmac; pub fn verify_signed_content(expected_sha256: Hash, body: &[u8]) -> Result<(), Error> { if expected_sha256 != sha256sum(body) { - return Err(Error::BadRequest( + return Err(Error::bad_request( "Request content hash does not match signed hash".to_string(), )); } diff --git a/src/api/signature/payload.rs b/src/api/signature/payload.rs index 9137dd2d..4c7934e5 100644 --- a/src/api/signature/payload.rs +++ b/src/api/signature/payload.rs @@ -15,7 +15,7 @@ use super::LONG_DATETIME; use super::{compute_scope, signing_hmac}; use crate::encoding::uri_encode; -use crate::error::*; +use crate::signature::error::*; pub async fn check_payload_signature( garage: &Garage, @@ -105,7 +105,7 @@ fn parse_authorization( let (auth_kind, rest) = authorization.split_at(first_space); if auth_kind != 
"AWS4-HMAC-SHA256" { - return Err(Error::BadRequest("Unsupported authorization method".into())); + return Err(Error::bad_request("Unsupported authorization method")); } let mut auth_params = HashMap::new(); @@ -129,10 +129,11 @@ fn parse_authorization( let date = headers .get("x-amz-date") .ok_or_bad_request("Missing X-Amz-Date field") + .map_err(Error::from) .and_then(|d| parse_date(d))?; if Utc::now() - date > Duration::hours(24) { - return Err(Error::BadRequest("Date is too old".to_string())); + return Err(Error::bad_request("Date is too old".to_string())); } let auth = Authorization { @@ -156,7 +157,7 @@ fn parse_query_authorization( headers: &HashMap, ) -> Result { if algorithm != "AWS4-HMAC-SHA256" { - return Err(Error::BadRequest( + return Err(Error::bad_request( "Unsupported authorization method".to_string(), )); } @@ -179,10 +180,10 @@ fn parse_query_authorization( .get("x-amz-expires") .ok_or_bad_request("X-Amz-Expires not found in query parameters")? .parse() - .map_err(|_| Error::BadRequest("X-Amz-Expires is not a number".to_string()))?; + .map_err(|_| Error::bad_request("X-Amz-Expires is not a number".to_string()))?; if duration > 7 * 24 * 3600 { - return Err(Error::BadRequest( + return Err(Error::bad_request( "X-Amz-Exprires may not exceed a week".to_string(), )); } @@ -190,10 +191,11 @@ fn parse_query_authorization( let date = headers .get("x-amz-date") .ok_or_bad_request("Missing X-Amz-Date field") + .map_err(Error::from) .and_then(|d| parse_date(d))?; if Utc::now() - date > Duration::seconds(duration) { - return Err(Error::BadRequest("Date is too old".to_string())); + return Err(Error::bad_request("Date is too old".to_string())); } Ok(Authorization { @@ -301,7 +303,7 @@ pub async fn verify_v4( .get(&EmptyKey, &key_id) .await? .filter(|k| !k.state.is_deleted()) - .ok_or_else(|| Error::Forbidden(format!("No such key: {}", &key_id)))?; + .ok_or_else(|| Error::forbidden(format!("No such key: {}", &key_id)))?; let key_p = key.params().unwrap(); let mut hmac = signing_hmac( @@ -314,7 +316,7 @@ pub async fn verify_v4( hmac.update(payload); let our_signature = hex::encode(hmac.finalize().into_bytes()); if signature != our_signature { - return Err(Error::Forbidden("Invalid signature".to_string())); + return Err(Error::forbidden("Invalid signature".to_string())); } Ok(key) diff --git a/src/api/signature/streaming.rs b/src/api/signature/streaming.rs index ded9d993..c8358c4f 100644 --- a/src/api/signature/streaming.rs +++ b/src/api/signature/streaming.rs @@ -12,7 +12,7 @@ use garage_util::data::Hash; use super::{compute_scope, sha256sum, HmacSha256, LONG_DATETIME}; -use crate::error::*; +use crate::signature::error::*; pub fn parse_streaming_body( api_key: &Key, @@ -87,7 +87,7 @@ fn compute_streaming_payload_signature( let mut hmac = signing_hmac.clone(); hmac.update(string_to_sign.as_bytes()); - Hash::try_from(&hmac.finalize().into_bytes()).ok_or_internal_error("Invalid signature") + Ok(Hash::try_from(&hmac.finalize().into_bytes()).ok_or_internal_error("Invalid signature")?) 
} mod payload { @@ -163,10 +163,10 @@ impl From for Error { match err { SignedPayloadStreamError::Stream(e) => e, SignedPayloadStreamError::InvalidSignature => { - Error::BadRequest("Invalid payload signature".into()) + Error::bad_request("Invalid payload signature") } SignedPayloadStreamError::Message(e) => { - Error::BadRequest(format!("Chunk format error: {}", e)) + Error::bad_request(format!("Chunk format error: {}", e)) } } } diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 3b69d7bc..902f67f8 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -27,10 +27,8 @@ garage_rpc = { version = "0.7.0", path = "../rpc" } garage_table = { version = "0.7.0", path = "../table" } garage_util = { version = "0.7.0", path = "../util" } garage_web = { version = "0.7.0", path = "../web" } -garage_admin = { version = "0.7.0", path = "../admin" } bytes = "1.0" -git-version = "0.3.4" hex = "0.4" tracing = { version = "0.1.30", features = ["log-always"] } pretty_env_logger = "0.4" @@ -54,6 +52,11 @@ tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi #netapp = { version = "0.4", path = "../../../netapp" } netapp = "0.4" +opentelemetry = { version = "0.17", features = [ "rt-tokio" ] } +opentelemetry-prometheus = "0.10" +opentelemetry-otlp = "0.10" +prometheus = "0.13" + [dev-dependencies] aws-sdk-s3 = "0.8" chrono = "0.4" diff --git a/src/garage/admin.rs b/src/garage/admin.rs index af0c3f22..bc1f494a 100644 --- a/src/garage/admin.rs +++ b/src/garage/admin.rs @@ -22,7 +22,6 @@ use garage_model::helper::error::{Error, OkOrBadRequest}; use garage_model::key_table::*; use garage_model::migrate::Migrate; use garage_model::permission::*; -use garage_model::s3::object_table::ObjectFilter; use crate::cli::*; use crate::repair::Repair; @@ -213,18 +212,7 @@ impl AdminRpcHandler { } // Check bucket is empty - let objects = self - .garage - .object_table - .get_range( - &bucket_id, - None, - Some(ObjectFilter::IsData), - 10, - EnumerationOrder::Forward, - ) - .await?; - if !objects.is_empty() { + if !helper.is_bucket_empty(bucket_id).await? { return Err(Error::BadRequest(format!( "Bucket {} is not empty", query.name @@ -261,6 +249,7 @@ impl AdminRpcHandler { async fn handle_alias_bucket(&self, query: &AliasBucketOpt) -> Result { let helper = self.garage.bucket_helper(); + let key_helper = self.garage.key_helper(); let bucket_id = helper .resolve_global_bucket_name(&query.existing_bucket) @@ -268,7 +257,7 @@ impl AdminRpcHandler { .ok_or_bad_request("Bucket not found")?; if let Some(key_pattern) = &query.local { - let key = helper.get_existing_matching_key(key_pattern).await?; + let key = key_helper.get_existing_matching_key(key_pattern).await?; helper .set_local_bucket_alias(bucket_id, &key.key_id, &query.new_name) @@ -290,9 +279,10 @@ impl AdminRpcHandler { async fn handle_unalias_bucket(&self, query: &UnaliasBucketOpt) -> Result { let helper = self.garage.bucket_helper(); + let key_helper = self.garage.key_helper(); if let Some(key_pattern) = &query.local { - let key = helper.get_existing_matching_key(key_pattern).await?; + let key = key_helper.get_existing_matching_key(key_pattern).await?; let bucket_id = key .state @@ -331,12 +321,15 @@ impl AdminRpcHandler { async fn handle_bucket_allow(&self, query: &PermBucketOpt) -> Result { let helper = self.garage.bucket_helper(); + let key_helper = self.garage.key_helper(); let bucket_id = helper .resolve_global_bucket_name(&query.bucket) .await? 
.ok_or_bad_request("Bucket not found")?; - let key = helper.get_existing_matching_key(&query.key_pattern).await?; + let key = key_helper + .get_existing_matching_key(&query.key_pattern) + .await?; let allow_read = query.read || key.allow_read(&bucket_id); let allow_write = query.write || key.allow_write(&bucket_id); @@ -363,12 +356,15 @@ impl AdminRpcHandler { async fn handle_bucket_deny(&self, query: &PermBucketOpt) -> Result { let helper = self.garage.bucket_helper(); + let key_helper = self.garage.key_helper(); let bucket_id = helper .resolve_global_bucket_name(&query.bucket) .await? .ok_or_bad_request("Bucket not found")?; - let key = helper.get_existing_matching_key(&query.key_pattern).await?; + let key = key_helper + .get_existing_matching_key(&query.key_pattern) + .await?; let allow_read = !query.read && key.allow_read(&bucket_id); let allow_write = !query.write && key.allow_write(&bucket_id); @@ -469,7 +465,7 @@ impl AdminRpcHandler { async fn handle_key_info(&self, query: &KeyOpt) -> Result { let key = self .garage - .bucket_helper() + .key_helper() .get_existing_matching_key(&query.key_pattern) .await?; self.key_info_result(key).await @@ -484,7 +480,7 @@ impl AdminRpcHandler { async fn handle_rename_key(&self, query: &KeyRenameOpt) -> Result { let mut key = self .garage - .bucket_helper() + .key_helper() .get_existing_matching_key(&query.key_pattern) .await?; key.params_mut() @@ -496,9 +492,11 @@ impl AdminRpcHandler { } async fn handle_delete_key(&self, query: &KeyDeleteOpt) -> Result { - let helper = self.garage.bucket_helper(); + let key_helper = self.garage.key_helper(); - let mut key = helper.get_existing_matching_key(&query.key_pattern).await?; + let mut key = key_helper + .get_existing_matching_key(&query.key_pattern) + .await?; if !query.yes { return Err(Error::BadRequest( @@ -506,32 +504,7 @@ impl AdminRpcHandler { )); } - let state = key.state.as_option_mut().unwrap(); - - // --- done checking, now commit --- - // (the step at unset_local_bucket_alias will fail if a bucket - // does not have another alias, the deletion will be - // interrupted in the middle if that happens) - - // 1. Delete local aliases - for (alias, _, to) in state.local_aliases.items().iter() { - if let Some(bucket_id) = to { - helper - .unset_local_bucket_alias(*bucket_id, &key.key_id, alias) - .await?; - } - } - - // 2. Remove permissions on all authorized buckets - for (ab_id, _auth) in state.authorized_buckets.items().iter() { - helper - .set_bucket_key_permissions(*ab_id, &key.key_id, BucketKeyPerm::NO_PERMISSIONS) - .await?; - } - - // 3. 
Actually delete key - key.state = Deletable::delete(); - self.garage.key_table.insert(&key).await?; + key_helper.delete_key(&mut key).await?; Ok(AdminRpc::Ok(format!( "Key {} was deleted successfully.", @@ -542,7 +515,7 @@ impl AdminRpcHandler { async fn handle_allow_key(&self, query: &KeyPermOpt) -> Result { let mut key = self .garage - .bucket_helper() + .key_helper() .get_existing_matching_key(&query.key_pattern) .await?; if query.create_bucket { @@ -555,7 +528,7 @@ impl AdminRpcHandler { async fn handle_deny_key(&self, query: &KeyPermOpt) -> Result { let mut key = self .garage - .bucket_helper() + .key_helper() .get_existing_matching_key(&query.key_pattern) .await?; if query.create_bucket { @@ -696,11 +669,7 @@ impl AdminRpcHandler { writeln!( &mut ret, "\nGarage version: {}", - option_env!("GIT_VERSION").unwrap_or(git_version::git_version!( - prefix = "git:", - cargo_prefix = "cargo:", - fallback = "unknown" - )) + self.garage.system.garage_version(), ) .unwrap(); diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index 0247c32b..db0af57c 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -1,5 +1,4 @@ use garage_util::crdt::Crdt; -use garage_util::data::*; use garage_util::error::*; use garage_util::formater::format_table; @@ -212,31 +211,9 @@ pub async fn cmd_apply_layout( rpc_host: NodeID, apply_opt: ApplyLayoutOpt, ) -> Result<(), Error> { - let mut layout = fetch_layout(rpc_cli, rpc_host).await?; - - match apply_opt.version { - None => { - println!("Please pass the --version flag to ensure that you are writing the correct version of the cluster layout."); - println!("To know the correct value of the --version flag, invoke `garage layout show` and review the proposed changes."); - return Err(Error::Message("--version flag is missing".into())); - } - Some(v) => { - if v != layout.version + 1 { - return Err(Error::Message("Invalid value of --version flag".into())); - } - } - } - - layout.roles.merge(&layout.staging); - - if !layout.calculate_partition_assignation() { - return Err(Error::Message("Could not calculate new assignation of partitions to nodes. 
This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into())); - } + let layout = fetch_layout(rpc_cli, rpc_host).await?; - layout.staging.clear(); - layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]); - - layout.version += 1; + let layout = layout.apply_staged_changes(apply_opt.version)?; send_layout(rpc_cli, rpc_host, layout).await?; @@ -251,25 +228,9 @@ pub async fn cmd_revert_layout( rpc_host: NodeID, revert_opt: RevertLayoutOpt, ) -> Result<(), Error> { - let mut layout = fetch_layout(rpc_cli, rpc_host).await?; - - match revert_opt.version { - None => { - println!("Please pass the --version flag to ensure that you are writing the correct version of the cluster layout."); - println!("To know the correct value of the --version flag, invoke `garage layout show` and review the proposed changes."); - return Err(Error::Message("--version flag is missing".into())); - } - Some(v) => { - if v != layout.version + 1 { - return Err(Error::Message("Invalid value of --version flag".into())); - } - } - } - - layout.staging.clear(); - layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]); + let layout = fetch_layout(rpc_cli, rpc_host).await?; - layout.version += 1; + let layout = layout.revert_staged_changes(revert_opt.version)?; send_layout(rpc_cli, rpc_host, layout).await?; diff --git a/src/garage/main.rs b/src/garage/main.rs index e898e680..bd09b6ea 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -8,6 +8,7 @@ mod admin; mod cli; mod repair; mod server; +mod tracing_setup; use std::net::SocketAddr; use std::path::PathBuf; @@ -141,6 +142,7 @@ async fn cli_command(opt: Opt) -> Result<(), Error> { match cli_command_dispatch(opt.cmd, &system_rpc_endpoint, &admin_rpc_endpoint, id).await { Err(HelperError::Internal(i)) => Err(Error::Message(format!("Internal error: {}", i))), Err(HelperError::BadRequest(b)) => Err(Error::Message(b)), + Err(e) => Err(Error::Message(format!("{}", e))), Ok(x) => Ok(x), } } diff --git a/src/garage/server.rs b/src/garage/server.rs index 24bb25b3..b58ad286 100644 --- a/src/garage/server.rs +++ b/src/garage/server.rs @@ -6,8 +6,7 @@ use garage_util::background::*; use garage_util::config::*; use garage_util::error::Error; -use garage_admin::metrics::*; -use garage_admin::tracing_setup::*; +use garage_api::admin::api_server::AdminApiServer; use garage_api::s3::api_server::S3ApiServer; use garage_model::garage::Garage; use garage_web::run_web_server; @@ -16,6 +15,7 @@ use garage_web::run_web_server; use garage_api::k2v::api_server::K2VApiServer; use crate::admin::*; +use crate::tracing_setup::*; async fn wait_from(mut chan: watch::Receiver) { while !*chan.borrow() { @@ -39,9 +39,6 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { .open() .expect("Unable to open sled DB"); - info!("Initialize admin web server and metric backend..."); - let admin_server_init = AdminServer::init(); - info!("Initializing background runner..."); let watch_cancel = netapp::util::watch_ctrl_c(); let (background, await_background_done) = BackgroundRunner::new(16, watch_cancel.clone()); @@ -54,6 +51,9 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { init_tracing(&export_to, garage.system.id)?; } + info!("Initialize Admin API server and metrics collector..."); + let admin_server = AdminApiServer::new(garage.clone()); + let run_system = tokio::spawn(garage.system.clone().run(watch_cancel.clone())); 
info!("Create admin RPC handler..."); @@ -80,39 +80,41 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { wait_from(watch_cancel.clone()), )); - let admin_server = if let Some(admin_bind_addr) = config.admin.api_bind_addr { - info!("Configure and run admin web server..."); - Some(tokio::spawn( - admin_server_init.run(admin_bind_addr, wait_from(watch_cancel.clone())), - )) - } else { - None - }; + info!("Launching Admin API server..."); + let admin_server = tokio::spawn(admin_server.run(wait_from(watch_cancel.clone()))); // Stuff runs // When a cancel signal is sent, stuff stops if let Err(e) = s3_api_server.await? { warn!("S3 API server exited with error: {}", e); + } else { + info!("S3 API server exited without error."); } #[cfg(feature = "k2v")] if let Err(e) = k2v_api_server.await? { warn!("K2V API server exited with error: {}", e); + } else { + info!("K2V API server exited without error."); } if let Err(e) = web_server.await? { warn!("Web server exited with error: {}", e); + } else { + info!("Web server exited without error."); } - if let Some(a) = admin_server { - if let Err(e) = a.await? { - warn!("Admin web server exited with error: {}", e); - } + if let Err(e) = admin_server.await? { + warn!("Admin web server exited with error: {}", e); + } else { + info!("Admin API server exited without error."); } // Remove RPC handlers for system to break reference cycles garage.system.netapp.drop_all_handlers(); + opentelemetry::global::shutdown_tracer_provider(); // Await for netapp RPC system to end run_system.await?; + info!("Netapp exited"); // Drop all references so that stuff can terminate properly drop(garage); diff --git a/src/garage/tracing_setup.rs b/src/garage/tracing_setup.rs new file mode 100644 index 00000000..55fc4094 --- /dev/null +++ b/src/garage/tracing_setup.rs @@ -0,0 +1,37 @@ +use std::time::Duration; + +use opentelemetry::sdk::{ + trace::{self, IdGenerator, Sampler}, + Resource, +}; +use opentelemetry::KeyValue; +use opentelemetry_otlp::WithExportConfig; + +use garage_util::data::*; +use garage_util::error::*; + +pub fn init_tracing(export_to: &str, node_id: Uuid) -> Result<(), Error> { + let node_id = hex::encode(&node_id.as_slice()[..8]); + + opentelemetry_otlp::new_pipeline() + .tracing() + .with_exporter( + opentelemetry_otlp::new_exporter() + .tonic() + .with_endpoint(export_to) + .with_timeout(Duration::from_secs(3)), + ) + .with_trace_config( + trace::config() + .with_id_generator(IdGenerator::default()) + .with_sampler(Sampler::AlwaysOn) + .with_resource(Resource::new(vec![ + KeyValue::new("service.name", "garage"), + KeyValue::new("service.instance.id", node_id), + ])), + ) + .install_batch(opentelemetry::runtime::Tokio) + .ok_or_message("Unable to initialize tracing")?; + + Ok(()) +} diff --git a/src/model/garage.rs b/src/model/garage.rs index 03e21f8a..2f99bd68 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -191,6 +191,10 @@ impl Garage { pub fn bucket_helper(&self) -> helper::bucket::BucketHelper { helper::bucket::BucketHelper(self) } + + pub fn key_helper(&self) -> helper::key::KeyHelper { + helper::key::KeyHelper(self) + } } #[cfg(feature = "k2v")] diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs index 54d2f97b..130ba5be 100644 --- a/src/model/helper/bucket.rs +++ b/src/model/helper/bucket.rs @@ -1,15 +1,18 @@ -use garage_table::util::*; use garage_util::crdt::*; use garage_util::data::*; use garage_util::error::{Error as GarageError, OkOrMessage}; use garage_util::time::*; +use garage_table::util::*; + 
use crate::bucket_alias_table::*; use crate::bucket_table::*; use crate::garage::Garage; use crate::helper::error::*; -use crate::key_table::{Key, KeyFilter}; +use crate::helper::key::KeyHelper; +use crate::key_table::*; use crate::permission::BucketKeyPerm; +use crate::s3::object_table::ObjectFilter; pub struct BucketHelper<'a>(pub(crate) &'a Garage); @@ -49,6 +52,23 @@ impl<'a> BucketHelper<'a> { } } + #[allow(clippy::ptr_arg)] + pub async fn resolve_bucket(&self, bucket_name: &String, api_key: &Key) -> Result { + let api_key_params = api_key + .state + .as_option() + .ok_or_message("Key should not be deleted at this point")?; + + if let Some(Some(bucket_id)) = api_key_params.local_aliases.get(bucket_name) { + Ok(*bucket_id) + } else { + Ok(self + .resolve_global_bucket_name(bucket_name) + .await? + .ok_or_else(|| Error::NoSuchBucket(bucket_name.to_string()))?) + } + } + /// Returns a Bucket if it is present in bucket table, /// even if it is in deleted state. Querying a non-existing /// bucket ID returns an internal error. @@ -71,64 +91,7 @@ impl<'a> BucketHelper<'a> { .get(&EmptyKey, &bucket_id) .await? .filter(|b| !b.is_deleted()) - .ok_or_bad_request(format!( - "Bucket {:?} does not exist or has been deleted", - bucket_id - )) - } - - /// Returns a Key if it is present in key table, - /// even if it is in deleted state. Querying a non-existing - /// key ID returns an internal error. - pub async fn get_internal_key(&self, key_id: &String) -> Result { - Ok(self - .0 - .key_table - .get(&EmptyKey, key_id) - .await? - .ok_or_message(format!("Key {} does not exist", key_id))?) - } - - /// Returns a Key if it is present in key table, - /// only if it is in non-deleted state. - /// Querying a non-existing key ID or a deleted key - /// returns a bad request error. - pub async fn get_existing_key(&self, key_id: &String) -> Result { - self.0 - .key_table - .get(&EmptyKey, key_id) - .await? - .filter(|b| !b.state.is_deleted()) - .ok_or_bad_request(format!("Key {} does not exist or has been deleted", key_id)) - } - - /// Returns a Key if it is present in key table, - /// looking it up by key ID or by a match on its name, - /// only if it is in non-deleted state. - /// Querying a non-existing key ID or a deleted key - /// returns a bad request error. - pub async fn get_existing_matching_key(&self, pattern: &str) -> Result { - let candidates = self - .0 - .key_table - .get_range( - &EmptyKey, - None, - Some(KeyFilter::MatchesAndNotDeleted(pattern.to_string())), - 10, - EnumerationOrder::Forward, - ) - .await? - .into_iter() - .collect::>(); - if candidates.len() != 1 { - Err(Error::BadRequest(format!( - "{} matching keys", - candidates.len() - ))) - } else { - Ok(candidates.into_iter().next().unwrap()) - } + .ok_or_else(|| Error::NoSuchBucket(hex::encode(bucket_id))) } /// Sets a new alias for a bucket in global namespace. 
@@ -142,10 +105,7 @@ impl<'a> BucketHelper<'a> { alias_name: &String, ) -> Result<(), Error> { if !is_valid_bucket_name(alias_name) { - return Err(Error::BadRequest(format!( - "{}: {}", - alias_name, INVALID_BUCKET_NAME_MESSAGE - ))); + return Err(Error::InvalidBucketName(alias_name.to_string())); } let mut bucket = self.get_existing_bucket(bucket_id).await?; @@ -176,7 +136,7 @@ impl<'a> BucketHelper<'a> { let alias = match alias { None => BucketAlias::new(alias_name.clone(), alias_ts, Some(bucket_id)) - .ok_or_bad_request(format!("{}: {}", alias_name, INVALID_BUCKET_NAME_MESSAGE))?, + .ok_or_else(|| Error::InvalidBucketName(alias_name.clone()))?, Some(mut a) => { a.state = Lww::raw(alias_ts, Some(bucket_id)); a @@ -264,7 +224,7 @@ impl<'a> BucketHelper<'a> { .bucket_alias_table .get(&EmptyKey, alias_name) .await? - .ok_or_message(format!("Alias {} not found", alias_name))?; + .ok_or_else(|| Error::NoSuchBucket(alias_name.to_string()))?; // Checks ok, remove alias let alias_ts = match bucket.state.as_option() { @@ -303,15 +263,14 @@ impl<'a> BucketHelper<'a> { key_id: &String, alias_name: &String, ) -> Result<(), Error> { + let key_helper = KeyHelper(self.0); + if !is_valid_bucket_name(alias_name) { - return Err(Error::BadRequest(format!( - "{}: {}", - alias_name, INVALID_BUCKET_NAME_MESSAGE - ))); + return Err(Error::InvalidBucketName(alias_name.to_string())); } let mut bucket = self.get_existing_bucket(bucket_id).await?; - let mut key = self.get_existing_key(key_id).await?; + let mut key = key_helper.get_existing_key(key_id).await?; let mut key_param = key.state.as_option_mut().unwrap(); @@ -360,8 +319,10 @@ impl<'a> BucketHelper<'a> { key_id: &String, alias_name: &String, ) -> Result<(), Error> { + let key_helper = KeyHelper(self.0); + let mut bucket = self.get_existing_bucket(bucket_id).await?; - let mut key = self.get_existing_key(key_id).await?; + let mut key = key_helper.get_existing_key(key_id).await?; let mut bucket_p = bucket.state.as_option_mut().unwrap(); @@ -429,8 +390,10 @@ impl<'a> BucketHelper<'a> { key_id: &String, mut perm: BucketKeyPerm, ) -> Result<(), Error> { + let key_helper = KeyHelper(self.0); + let mut bucket = self.get_internal_bucket(bucket_id).await?; - let mut key = self.get_internal_key(key_id).await?; + let mut key = key_helper.get_internal_key(key_id).await?; if let Some(bstate) = bucket.state.as_option() { if let Some(kp) = bstate.authorized_keys.get(key_id) { @@ -466,4 +429,47 @@ impl<'a> BucketHelper<'a> { Ok(()) } + + pub async fn is_bucket_empty(&self, bucket_id: Uuid) -> Result { + let objects = self + .0 + .object_table + .get_range( + &bucket_id, + None, + Some(ObjectFilter::IsData), + 10, + EnumerationOrder::Forward, + ) + .await?; + if !objects.is_empty() { + return Ok(false); + } + + #[cfg(feature = "k2v")] + { + use garage_rpc::ring::Ring; + use std::sync::Arc; + + let ring: Arc = self.0.system.ring.borrow().clone(); + let k2vindexes = self + .0 + .k2v + .counter_table + .table + .get_range( + &bucket_id, + None, + Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())), + 10, + EnumerationOrder::Forward, + ) + .await?; + if !k2vindexes.is_empty() { + return Ok(false); + } + } + + Ok(true) + } } diff --git a/src/model/helper/error.rs b/src/model/helper/error.rs index 30b2ba32..3ca8f55c 100644 --- a/src/model/helper/error.rs +++ b/src/model/helper/error.rs @@ -10,6 +10,16 @@ pub enum Error { #[error(display = "Bad request: {}", _0)] BadRequest(String), + + /// Bucket name is not valid according to AWS S3 specs + #[error(display = 
"Invalid bucket name: {}", _0)] + InvalidBucketName(String), + + #[error(display = "Access key not found: {}", _0)] + NoSuchAccessKey(String), + + #[error(display = "Bucket not found: {}", _0)] + NoSuchBucket(String), } impl From for Error { diff --git a/src/model/helper/key.rs b/src/model/helper/key.rs new file mode 100644 index 00000000..c1a8e974 --- /dev/null +++ b/src/model/helper/key.rs @@ -0,0 +1,102 @@ +use garage_table::util::*; +use garage_util::crdt::*; +use garage_util::error::OkOrMessage; + +use crate::garage::Garage; +use crate::helper::bucket::BucketHelper; +use crate::helper::error::*; +use crate::key_table::{Key, KeyFilter}; +use crate::permission::BucketKeyPerm; + +pub struct KeyHelper<'a>(pub(crate) &'a Garage); + +#[allow(clippy::ptr_arg)] +impl<'a> KeyHelper<'a> { + /// Returns a Key if it is present in key table, + /// even if it is in deleted state. Querying a non-existing + /// key ID returns an internal error. + pub async fn get_internal_key(&self, key_id: &String) -> Result { + Ok(self + .0 + .key_table + .get(&EmptyKey, key_id) + .await? + .ok_or_message(format!("Key {} does not exist", key_id))?) + } + + /// Returns a Key if it is present in key table, + /// only if it is in non-deleted state. + /// Querying a non-existing key ID or a deleted key + /// returns a bad request error. + pub async fn get_existing_key(&self, key_id: &String) -> Result { + self.0 + .key_table + .get(&EmptyKey, key_id) + .await? + .filter(|b| !b.state.is_deleted()) + .ok_or_else(|| Error::NoSuchAccessKey(key_id.to_string())) + } + + /// Returns a Key if it is present in key table, + /// looking it up by key ID or by a match on its name, + /// only if it is in non-deleted state. + /// Querying a non-existing key ID or a deleted key + /// returns a bad request error. + pub async fn get_existing_matching_key(&self, pattern: &str) -> Result { + let candidates = self + .0 + .key_table + .get_range( + &EmptyKey, + None, + Some(KeyFilter::MatchesAndNotDeleted(pattern.to_string())), + 10, + EnumerationOrder::Forward, + ) + .await? + .into_iter() + .collect::>(); + if candidates.len() != 1 { + Err(Error::BadRequest(format!( + "{} matching keys", + candidates.len() + ))) + } else { + Ok(candidates.into_iter().next().unwrap()) + } + } + + /// Deletes an API access key + pub async fn delete_key(&self, key: &mut Key) -> Result<(), Error> { + let bucket_helper = BucketHelper(self.0); + + let state = key.state.as_option_mut().unwrap(); + + // --- done checking, now commit --- + // (the step at unset_local_bucket_alias will fail if a bucket + // does not have another alias, the deletion will be + // interrupted in the middle if that happens) + + // 1. Delete local aliases + for (alias, _, to) in state.local_aliases.items().iter() { + if let Some(bucket_id) = to { + bucket_helper + .unset_local_bucket_alias(*bucket_id, &key.key_id, alias) + .await?; + } + } + + // 2. Remove permissions on all authorized buckets + for (ab_id, _auth) in state.authorized_buckets.items().iter() { + bucket_helper + .set_bucket_key_permissions(*ab_id, &key.key_id, BucketKeyPerm::NO_PERMISSIONS) + .await?; + } + + // 3. 
Actually delete key + key.state = Deletable::delete(); + self.0.key_table.insert(key).await?; + + Ok(()) + } +} diff --git a/src/model/helper/mod.rs b/src/model/helper/mod.rs index 2f4e8898..dd947c86 100644 --- a/src/model/helper/mod.rs +++ b/src/model/helper/mod.rs @@ -1,2 +1,3 @@ pub mod bucket; pub mod error; +pub mod key; diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index bed7f44a..73328993 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -15,11 +15,11 @@ path = "lib.rs" [dependencies] garage_util = { version = "0.7.0", path = "../util" } -garage_admin = { version = "0.7.0", path = "../admin" } arc-swap = "1.0" bytes = "1.0" gethostname = "0.2" +git-version = "0.3.4" hex = "0.4" tracing = "0.1.30" rand = "0.8" diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs index b9c02c21..f517f36f 100644 --- a/src/rpc/layout.rs +++ b/src/rpc/layout.rs @@ -5,6 +5,7 @@ use serde::{Deserialize, Serialize}; use garage_util::crdt::{AutoCrdt, Crdt, LwwMap}; use garage_util::data::*; +use garage_util::error::*; use crate::ring::*; @@ -100,6 +101,61 @@ impl ClusterLayout { } } + pub fn apply_staged_changes(mut self, version: Option) -> Result { + match version { + None => { + let error = r#" +Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. +To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. + "#; + return Err(Error::Message(error.into())); + } + Some(v) => { + if v != self.version + 1 { + return Err(Error::Message("Invalid new layout version".into())); + } + } + } + + self.roles.merge(&self.staging); + self.roles.retain(|(_, _, v)| v.0.is_some()); + + if !self.calculate_partition_assignation() { + return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into())); + } + + self.staging.clear(); + self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); + + self.version += 1; + + Ok(self) + } + + pub fn revert_staged_changes(mut self, version: Option) -> Result { + match version { + None => { + let error = r#" +Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout. +To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes. 
+ "#; + return Err(Error::Message(error.into())); + } + Some(v) => { + if v != self.version + 1 { + return Err(Error::Message("Invalid new layout version".into())); + } + } + } + + self.staging.clear(); + self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]); + + self.version += 1; + + Ok(self) + } + /// Returns a list of IDs of nodes that currently have /// a role in the cluster pub fn node_ids(&self) -> &[Uuid] { diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 68d94ea5..1d7c3ea4 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -312,6 +312,84 @@ impl System { ); } + // ---- Administrative operations (directly available and + // also available through RPC) ---- + + pub fn garage_version(&self) -> &'static str { + option_env!("GIT_VERSION").unwrap_or(git_version::git_version!( + prefix = "git:", + cargo_prefix = "cargo:", + fallback = "unknown" + )) + } + + pub fn get_known_nodes(&self) -> Vec { + let node_status = self.node_status.read().unwrap(); + let known_nodes = self + .fullmesh + .get_peer_list() + .iter() + .map(|n| KnownNodeInfo { + id: n.id.into(), + addr: n.addr, + is_up: n.is_up(), + last_seen_secs_ago: n.last_seen.map(|t| (Instant::now() - t).as_secs()), + status: node_status + .get(&n.id.into()) + .cloned() + .map(|(_, st)| st) + .unwrap_or(NodeStatus { + hostname: "?".to_string(), + replication_factor: 0, + cluster_layout_version: 0, + cluster_layout_staging_hash: Hash::from([0u8; 32]), + }), + }) + .collect::>(); + known_nodes + } + + pub fn get_cluster_layout(&self) -> ClusterLayout { + self.ring.borrow().layout.clone() + } + + pub async fn update_cluster_layout( + self: &Arc, + layout: &ClusterLayout, + ) -> Result<(), Error> { + self.handle_advertise_cluster_layout(layout).await?; + Ok(()) + } + + pub async fn connect(&self, node: &str) -> Result<(), Error> { + let (pubkey, addrs) = parse_and_resolve_peer_addr(node).ok_or_else(|| { + Error::Message(format!( + "Unable to parse or resolve node specification: {}", + node + )) + })?; + let mut errors = vec![]; + for ip in addrs.iter() { + match self + .netapp + .clone() + .try_connect(*ip, pubkey) + .await + .err_context(CONNECT_ERROR_MESSAGE) + { + Ok(()) => return Ok(()), + Err(e) => { + errors.push((*ip, e)); + } + } + } + if errors.len() == 1 { + Err(Error::Message(errors[0].1.to_string())) + } else { + Err(Error::Message(format!("{:?}", errors))) + } + } + // ---- INTERNALS ---- async fn advertise_to_consul(self: Arc) -> Result<(), Error> { @@ -384,32 +462,11 @@ impl System { self.local_status.swap(Arc::new(new_si)); } + // --- RPC HANDLERS --- + async fn handle_connect(&self, node: &str) -> Result { - let (pubkey, addrs) = parse_and_resolve_peer_addr(node).ok_or_else(|| { - Error::Message(format!( - "Unable to parse or resolve node specification: {}", - node - )) - })?; - let mut errors = vec![]; - for ip in addrs.iter() { - match self - .netapp - .clone() - .try_connect(*ip, pubkey) - .await - .err_context(CONNECT_ERROR_MESSAGE) - { - Ok(()) => return Ok(SystemRpc::Ok), - Err(e) => { - errors.push((*ip, e)); - } - } - } - return Err(Error::Message(format!( - "Could not connect to specified peers. 
Errors: {:?}", - errors - ))); + self.connect(node).await?; + Ok(SystemRpc::Ok) } fn handle_pull_cluster_layout(&self) -> SystemRpc { @@ -418,28 +475,7 @@ impl System { } fn handle_get_known_nodes(&self) -> SystemRpc { - let node_status = self.node_status.read().unwrap(); - let known_nodes = self - .fullmesh - .get_peer_list() - .iter() - .map(|n| KnownNodeInfo { - id: n.id.into(), - addr: n.addr, - is_up: n.is_up(), - last_seen_secs_ago: n.last_seen.map(|t| (Instant::now() - t).as_secs()), - status: node_status - .get(&n.id.into()) - .cloned() - .map(|(_, st)| st) - .unwrap_or(NodeStatus { - hostname: "?".to_string(), - replication_factor: 0, - cluster_layout_version: 0, - cluster_layout_staging_hash: Hash::from([0u8; 32]), - }), - }) - .collect::>(); + let known_nodes = self.get_known_nodes(); SystemRpc::ReturnKnownNodes(known_nodes) } @@ -476,7 +512,7 @@ impl System { } async fn handle_advertise_cluster_layout( - self: Arc, + self: &Arc, adv: &ClusterLayout, ) -> Result { let update_ring = self.update_ring.lock().await; diff --git a/src/util/config.rs b/src/util/config.rs index 4d66bfe4..99ebce31 100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -121,6 +121,10 @@ pub struct WebConfig { pub struct AdminConfig { /// Address and port to bind for admin API serving pub api_bind_addr: Option, + /// Bearer token to use to scrape metrics + pub metrics_token: Option, + /// Bearer token to use to access Admin API endpoints + pub admin_token: Option, /// OTLP server to where to export traces pub trace_sink: Option, } diff --git a/src/util/crdt/lww_map.rs b/src/util/crdt/lww_map.rs index c155c3a8..91d24c7f 100644 --- a/src/util/crdt/lww_map.rs +++ b/src/util/crdt/lww_map.rs @@ -140,6 +140,11 @@ where self.vals.clear(); } + /// Retain only values that match a certain predicate + pub fn retain(&mut self, pred: impl FnMut(&(K, u64, V)) -> bool) { + self.vals.retain(pred); + } + /// Get a reference to the value assigned to a key pub fn get(&self, k: &K) -> Option<&V> { match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(k)) { diff --git a/src/web/error.rs b/src/web/error.rs index 55990e9d..bd8f17b5 100644 --- a/src/web/error.rs +++ b/src/web/error.rs @@ -2,57 +2,47 @@ use err_derive::Error; use hyper::header::HeaderValue; use hyper::{HeaderMap, StatusCode}; -use garage_util::error::Error as GarageError; +use garage_api::generic_server::ApiError; /// Errors of this crate #[derive(Debug, Error)] pub enum Error { /// An error received from the API crate #[error(display = "API error: {}", _0)] - ApiError(#[error(source)] garage_api::Error), - - // Category: internal error - /// Error internal to garage - #[error(display = "Internal error: {}", _0)] - InternalError(#[error(source)] GarageError), + ApiError(garage_api::s3::error::Error), /// The file does not exist #[error(display = "Not found")] NotFound, - /// The request contained an invalid UTF-8 sequence in its path or in other parameters - #[error(display = "Invalid UTF-8: {}", _0)] - InvalidUtf8(#[error(source)] std::str::Utf8Error), - - /// The client send a header with invalid value - #[error(display = "Invalid header value: {}", _0)] - InvalidHeader(#[error(source)] hyper::header::ToStrError), - /// The client sent a request without host, or with unsupported method #[error(display = "Bad request: {}", _0)] BadRequest(String), } +impl From for Error +where + garage_api::s3::error::Error: From, +{ + fn from(err: T) -> Self { + Error::ApiError(garage_api::s3::error::Error::from(err)) + } +} + impl Error { /// Transform errors into http 
status code pub fn http_status_code(&self) -> StatusCode { match self { Error::NotFound => StatusCode::NOT_FOUND, Error::ApiError(e) => e.http_status_code(), - Error::InternalError( - GarageError::Timeout - | GarageError::RemoteError(_) - | GarageError::Quorum(_, _, _, _), - ) => StatusCode::SERVICE_UNAVAILABLE, - Error::InternalError(_) => StatusCode::INTERNAL_SERVER_ERROR, - _ => StatusCode::BAD_REQUEST, + Error::BadRequest(_) => StatusCode::BAD_REQUEST, } } pub fn add_headers(&self, header_map: &mut HeaderMap) { #[allow(clippy::single_match)] match self { - Error::ApiError(e) => e.add_headers(header_map), + Error::ApiError(e) => e.add_http_headers(header_map), _ => (), } } diff --git a/src/web/web_server.rs b/src/web/web_server.rs index 867adc51..c30d8957 100644 --- a/src/web/web_server.rs +++ b/src/web/web_server.rs @@ -18,9 +18,11 @@ use opentelemetry::{ use crate::error::*; -use garage_api::error::{Error as ApiError, OkOrBadRequest, OkOrInternalError}; use garage_api::helpers::{authority_to_host, host_to_bucket}; use garage_api::s3::cors::{add_cors_headers, find_matching_cors_rule, handle_options_for_bucket}; +use garage_api::s3::error::{ + CommonErrorDerivative, Error as ApiError, OkOrBadRequest, OkOrInternalError, +}; use garage_api::s3::get::{handle_get, handle_head}; use garage_model::garage::Garage; @@ -207,7 +209,7 @@ async fn serve_file(garage: Arc, req: &Request) -> Result handle_options_for_bucket(req, &bucket), Method::HEAD => handle_head(garage.clone(), req, bucket_id, &key, None).await, Method::GET => handle_get(garage.clone(), req, bucket_id, &key, None).await, - _ => Err(ApiError::BadRequest("HTTP method not supported".into())), + _ => Err(ApiError::bad_request("HTTP method not supported")), } .map_err(Error::from); @@ -290,9 +292,7 @@ fn path_to_key<'a>(path: &'a str, index: &str) -> Result, Error> { let path_utf8 = percent_encoding::percent_decode_str(path).decode_utf8()?; if !path_utf8.starts_with('/') { - return Err(Error::BadRequest( - "Path must start with a / (slash)".to_string(), - )); + return Err(Error::BadRequest("Path must start with a / (slash)".into())); } match path_utf8.chars().last() { -- cgit v1.2.3 From b2a2d3859fefd53dab0b87274d5aed1f6bb608a3 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 24 May 2022 12:48:05 +0200 Subject: K2V client improvements (#307) - [x] Better distinguish error types - [x] Parse error messages received from server - [x] Remove `src/` folder layer, we don't have that for other crates Co-authored-by: Alex Auvolat Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/307 Co-authored-by: Alex Co-committed-by: Alex --- src/k2v-client/Cargo.toml | 5 + src/k2v-client/bin/k2v-cli.rs | 466 +++++++++++++++++++++++++++++ src/k2v-client/error.rs | 29 ++ src/k2v-client/lib.rs | 611 ++++++++++++++++++++++++++++++++++++++ src/k2v-client/src/bin/k2v-cli.rs | 466 ----------------------------- src/k2v-client/src/error.rs | 22 -- src/k2v-client/src/lib.rs | 566 ----------------------------------- 7 files changed, 1111 insertions(+), 1054 deletions(-) create mode 100644 src/k2v-client/bin/k2v-cli.rs create mode 100644 src/k2v-client/error.rs create mode 100644 src/k2v-client/lib.rs delete mode 100644 src/k2v-client/src/bin/k2v-cli.rs delete mode 100644 src/k2v-client/src/error.rs delete mode 100644 src/k2v-client/src/lib.rs (limited to 'src') diff --git a/src/k2v-client/Cargo.toml b/src/k2v-client/Cargo.toml index 84c6b8b2..224414ab 100644 --- a/src/k2v-client/Cargo.toml +++ b/src/k2v-client/Cargo.toml @@ -6,6 +6,7 @@ edition = "2018" 
[dependencies] base64 = "0.13.0" http = "0.2.6" +log = "0.4" rusoto_core = "0.48.0" rusoto_credential = "0.48.0" rusoto_signature = "0.48.0" @@ -22,6 +23,10 @@ garage_util = { path = "../util", optional = true } [features] cli = ["clap", "tokio/fs", "tokio/io-std", "garage_util"] +[lib] +path = "lib.rs" + [[bin]] name = "k2v-cli" +path = "bin/k2v-cli.rs" required-features = ["cli"] diff --git a/src/k2v-client/bin/k2v-cli.rs b/src/k2v-client/bin/k2v-cli.rs new file mode 100644 index 00000000..38c39361 --- /dev/null +++ b/src/k2v-client/bin/k2v-cli.rs @@ -0,0 +1,466 @@ +use k2v_client::*; + +use garage_util::formater::format_table; + +use rusoto_core::credential::AwsCredentials; +use rusoto_core::Region; + +use clap::{Parser, Subcommand}; + +/// K2V command line interface +#[derive(Parser, Debug)] +#[clap(author, version, about, long_about = None)] +struct Args { + /// Name of the region to use + #[clap(short, long, env = "AWS_REGION", default_value = "garage")] + region: String, + /// Url of the endpoint to connect to + #[clap(short, long, env = "K2V_ENDPOINT")] + endpoint: String, + /// Access key ID + #[clap(short, long, env = "AWS_ACCESS_KEY_ID")] + key_id: String, + /// Access key ID + #[clap(short, long, env = "AWS_SECRET_ACCESS_KEY")] + secret: String, + /// Bucket name + #[clap(short, long, env = "K2V_BUCKET")] + bucket: String, + #[clap(subcommand)] + command: Command, +} + +#[derive(Subcommand, Debug)] +enum Command { + /// Insert a single value + Insert { + /// Partition key to insert to + partition_key: String, + /// Sort key to insert to + sort_key: String, + /// Causality of the insertion + #[clap(short, long)] + causality: Option, + /// Value to insert + #[clap(flatten)] + value: Value, + }, + /// Read a single value + Read { + /// Partition key to read from + partition_key: String, + /// Sort key to read from + sort_key: String, + /// Output formating + #[clap(flatten)] + output_kind: ReadOutputKind, + }, + /// Delete a single value + Delete { + /// Partition key to delete from + partition_key: String, + /// Sort key to delete from + sort_key: String, + /// Causality information + #[clap(short, long)] + causality: String, + }, + /// List partition keys + ReadIndex { + /// Output formating + #[clap(flatten)] + output_kind: BatchOutputKind, + /// Output only partition keys matching this filter + #[clap(flatten)] + filter: Filter, + }, + /// Read a range of sort keys + ReadRange { + /// Partition key to read from + partition_key: String, + /// Output formating + #[clap(flatten)] + output_kind: BatchOutputKind, + /// Output only sort keys matching this filter + #[clap(flatten)] + filter: Filter, + }, + /// Delete a range of sort keys + DeleteRange { + /// Partition key to delete from + partition_key: String, + /// Output formating + #[clap(flatten)] + output_kind: BatchOutputKind, + /// Delete only sort keys matching this filter + #[clap(flatten)] + filter: Filter, + }, +} + +/// Where to read a value from +#[derive(Parser, Debug)] +#[clap(group = clap::ArgGroup::new("value").multiple(false).required(true))] +struct Value { + /// Read value from a file. 
use - to read from stdin + #[clap(short, long, group = "value")] + file: Option, + /// Read a base64 value from commandline + #[clap(short, long, group = "value")] + b64: Option, + /// Read a raw (UTF-8) value from the commandline + #[clap(short, long, group = "value")] + text: Option, +} + +impl Value { + async fn to_data(&self) -> Result, Error> { + if let Some(ref text) = self.text { + Ok(text.as_bytes().to_vec()) + } else if let Some(ref b64) = self.b64 { + base64::decode(b64).map_err(|_| Error::Message("invalid base64 input".into())) + } else if let Some(ref path) = self.file { + use tokio::io::AsyncReadExt; + if path == "-" { + let mut file = tokio::io::stdin(); + let mut vec = Vec::new(); + file.read_to_end(&mut vec).await?; + Ok(vec) + } else { + let mut file = tokio::fs::File::open(path).await?; + let mut vec = Vec::new(); + file.read_to_end(&mut vec).await?; + Ok(vec) + } + } else { + unreachable!("Value must have one option set") + } + } +} + +#[derive(Parser, Debug)] +#[clap(group = clap::ArgGroup::new("output-kind").multiple(false).required(false))] +struct ReadOutputKind { + /// Base64 output. Conflicts are line separated, first line is causality token + #[clap(short, long, group = "output-kind")] + b64: bool, + /// Raw output. Conflicts generate error, causality token is not returned + #[clap(short, long, group = "output-kind")] + raw: bool, + /// Human formated output + #[clap(short = 'H', long, group = "output-kind")] + human: bool, + /// JSON formated output + #[clap(short, long, group = "output-kind")] + json: bool, +} + +impl ReadOutputKind { + fn display_output(&self, val: CausalValue) -> ! { + use std::io::Write; + use std::process::exit; + + if self.json { + let stdout = std::io::stdout(); + serde_json::to_writer_pretty(stdout, &val).unwrap(); + exit(0); + } + + if self.raw { + let mut val = val.value; + if val.len() != 1 { + eprintln!( + "Raw mode can only read non-concurent values, found {} values, expected 1", + val.len() + ); + exit(1); + } + let val = val.pop().unwrap(); + match val { + K2vValue::Value(v) => { + std::io::stdout().write_all(&v).unwrap(); + exit(0); + } + K2vValue::Tombstone => { + eprintln!("Expected value, found tombstone"); + exit(2); + } + } + } + + let causality: String = val.causality.into(); + if self.b64 { + println!("{}", causality); + for val in val.value { + match val { + K2vValue::Value(v) => { + println!("{}", base64::encode(&v)) + } + K2vValue::Tombstone => { + println!(); + } + } + } + exit(0); + } + + // human + println!("causality: {}", causality); + println!("values:"); + for val in val.value { + match val { + K2vValue::Value(v) => { + if let Ok(string) = std::str::from_utf8(&v) { + println!(" utf-8: {}", string); + } else { + println!(" base64: {}", base64::encode(&v)); + } + } + K2vValue::Tombstone => { + println!(" tombstone"); + } + } + } + exit(0); + } +} + +#[derive(Parser, Debug)] +#[clap(group = clap::ArgGroup::new("output-kind").multiple(false).required(false))] +struct BatchOutputKind { + /// Human formated output + #[clap(short = 'H', long, group = "output-kind")] + human: bool, + /// JSON formated output + #[clap(short, long, group = "output-kind")] + json: bool, +} + +/// Filter for batch operations +#[derive(Parser, Debug)] +#[clap(group = clap::ArgGroup::new("filter").multiple(true).required(true))] +struct Filter { + /// Match only keys starting with this prefix + #[clap(short, long, group = "filter")] + prefix: Option, + /// Match only keys lexicographically after this key (including this key itself) + 
#[clap(short, long, group = "filter")] + start: Option, + /// Match only keys lexicographically before this key (excluding this key) + #[clap(short, long, group = "filter")] + end: Option, + /// Only match the first X keys + #[clap(short, long)] + limit: Option, + /// Return keys in reverse order + #[clap(short, long)] + reverse: bool, + /// Return only keys where conflict happened + #[clap(short, long)] + conflicts_only: bool, + /// Also include keys storing only tombstones + #[clap(short, long)] + tombstones: bool, + /// Return any key + #[clap(short, long, group = "filter")] + all: bool, +} + +impl Filter { + fn k2v_filter(&self) -> k2v_client::Filter<'_> { + k2v_client::Filter { + start: self.start.as_deref(), + end: self.end.as_deref(), + prefix: self.prefix.as_deref(), + limit: self.limit, + reverse: self.reverse, + } + } +} + +#[tokio::main] +async fn main() -> Result<(), Error> { + let args = Args::parse(); + + let region = Region::Custom { + name: args.region, + endpoint: args.endpoint, + }; + + let creds = AwsCredentials::new(args.key_id, args.secret, None, None); + + let client = K2vClient::new(region, args.bucket, creds, None)?; + + match args.command { + Command::Insert { + partition_key, + sort_key, + causality, + value, + } => { + client + .insert_item( + &partition_key, + &sort_key, + value.to_data().await?, + causality.map(Into::into), + ) + .await?; + } + Command::Delete { + partition_key, + sort_key, + causality, + } => { + client + .delete_item(&partition_key, &sort_key, causality.into()) + .await?; + } + Command::Read { + partition_key, + sort_key, + output_kind, + } => { + let res = client.read_item(&partition_key, &sort_key).await?; + output_kind.display_output(res); + } + Command::ReadIndex { + output_kind, + filter, + } => { + if filter.conflicts_only || filter.tombstones { + return Err(Error::Message( + "conlicts-only and tombstones are invalid for read-index".into(), + )); + } + let res = client.read_index(filter.k2v_filter()).await?; + if output_kind.json { + let values = res + .items + .into_iter() + .map(|(k, v)| { + let mut value = serde_json::to_value(v).unwrap(); + value + .as_object_mut() + .unwrap() + .insert("sort_key".to_owned(), k.into()); + value + }) + .collect::>(); + let json = serde_json::json!({ + "next_key": res.next_start, + "values": values, + }); + + let stdout = std::io::stdout(); + serde_json::to_writer_pretty(stdout, &json).unwrap(); + } else { + if let Some(next) = res.next_start { + println!("next key: {}", next); + } + + let mut to_print = Vec::new(); + to_print.push(format!("key:\tentries\tconflicts\tvalues\tbytes")); + for (k, v) in res.items { + to_print.push(format!( + "{}\t{}\t{}\t{}\t{}", + k, v.entries, v.conflicts, v.values, v.bytes + )); + } + format_table(to_print); + } + } + Command::ReadRange { + partition_key, + output_kind, + filter, + } => { + let op = BatchReadOp { + partition_key: &partition_key, + filter: filter.k2v_filter(), + conflicts_only: filter.conflicts_only, + tombstones: filter.tombstones, + single_item: false, + }; + let mut res = client.read_batch(&[op]).await?; + let res = res.pop().unwrap(); + if output_kind.json { + let values = res + .items + .into_iter() + .map(|(k, v)| { + let mut value = serde_json::to_value(v).unwrap(); + value + .as_object_mut() + .unwrap() + .insert("sort_key".to_owned(), k.into()); + value + }) + .collect::>(); + let json = serde_json::json!({ + "next_key": res.next_start, + "values": values, + }); + + let stdout = std::io::stdout(); + serde_json::to_writer_pretty(stdout, 
&json).unwrap(); + } else { + if let Some(next) = res.next_start { + println!("next key: {}", next); + } + for (key, values) in res.items { + println!("key: {}", key); + let causality: String = values.causality.into(); + println!("causality: {}", causality); + for value in values.value { + match value { + K2vValue::Value(v) => { + if let Ok(string) = std::str::from_utf8(&v) { + println!(" value(utf-8): {}", string); + } else { + println!(" value(base64): {}", base64::encode(&v)); + } + } + K2vValue::Tombstone => { + println!(" tombstone"); + } + } + } + } + } + } + Command::DeleteRange { + partition_key, + output_kind, + filter, + } => { + let op = BatchDeleteOp { + partition_key: &partition_key, + prefix: filter.prefix.as_deref(), + start: filter.start.as_deref(), + end: filter.end.as_deref(), + single_item: false, + }; + if filter.reverse + || filter.conflicts_only + || filter.tombstones + || filter.limit.is_some() + { + return Err(Error::Message( + "limit, conlicts-only, reverse and tombstones are invalid for delete-range" + .into(), + )); + } + + let res = client.delete_batch(&[op]).await?; + + if output_kind.json { + println!("{}", res[0]); + } else { + println!("deleted {} keys", res[0]); + } + } + } + + Ok(()) +} diff --git a/src/k2v-client/error.rs b/src/k2v-client/error.rs new file mode 100644 index 00000000..37c221f2 --- /dev/null +++ b/src/k2v-client/error.rs @@ -0,0 +1,29 @@ +use std::borrow::Cow; + +use thiserror::Error; + +/// Errors returned by this crate +#[derive(Error, Debug)] +pub enum Error { + #[error("{0}, {1}: {2} (path = {3})")] + Remote( + http::StatusCode, + Cow<'static, str>, + Cow<'static, str>, + Cow<'static, str>, + ), + #[error("received invalid response: {0}")] + InvalidResponse(Cow<'static, str>), + #[error("not found")] + NotFound, + #[error("io error: {0}")] + IoError(#[from] std::io::Error), + #[error("rusoto tls error: {0}")] + RusotoTls(#[from] rusoto_core::request::TlsError), + #[error("rusoto http error: {0}")] + RusotoHttp(#[from] rusoto_core::HttpDispatchError), + #[error("deserialization error: {0}")] + Deserialization(#[from] serde_json::Error), + #[error("{0}")] + Message(Cow<'static, str>), +} diff --git a/src/k2v-client/lib.rs b/src/k2v-client/lib.rs new file mode 100644 index 00000000..95974d7a --- /dev/null +++ b/src/k2v-client/lib.rs @@ -0,0 +1,611 @@ +use std::collections::BTreeMap; +use std::time::Duration; + +use http::header::{ACCEPT, CONTENT_LENGTH, CONTENT_TYPE}; +use http::status::StatusCode; +use http::HeaderMap; +use log::{debug, error}; + +use rusoto_core::{ByteStream, DispatchSignedRequest, HttpClient}; +use rusoto_credential::AwsCredentials; +use rusoto_signature::region::Region; +use rusoto_signature::signature::SignedRequest; +use serde::de::Error as DeError; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +use tokio::io::AsyncReadExt; + +mod error; + +pub use error::Error; + +const DEFAULT_TIMEOUT: Duration = Duration::from_secs(5); +const DEFAULT_POLL_TIMEOUT: Duration = Duration::from_secs(300); +const SERVICE: &str = "k2v"; +const GARAGE_CAUSALITY_TOKEN: &str = "X-Garage-Causality-Token"; + +/// Client used to query a K2V server. +pub struct K2vClient { + region: Region, + bucket: String, + creds: AwsCredentials, + client: HttpClient, +} + +impl K2vClient { + /// Create a new K2V client. 
+ pub fn new( + region: Region, + bucket: String, + creds: AwsCredentials, + user_agent: Option, + ) -> Result { + let mut client = HttpClient::new()?; + if let Some(ua) = user_agent { + client.local_agent_prepend(ua); + } else { + client.local_agent_prepend(format!("k2v/{}", env!("CARGO_PKG_VERSION"))); + } + Ok(K2vClient { + region, + bucket, + creds, + client, + }) + } + + /// Perform a ReadItem request, reading the value(s) stored for a single pk+sk. + pub async fn read_item( + &self, + partition_key: &str, + sort_key: &str, + ) -> Result { + let mut req = SignedRequest::new( + "GET", + SERVICE, + &self.region, + &format!("/{}/{}", self.bucket, partition_key), + ); + req.add_param("sort_key", sort_key); + req.add_header(ACCEPT, "application/octet-stream, application/json"); + + let res = self.dispatch(req, None).await?; + + let causality = res + .causality_token + .ok_or_else(|| Error::InvalidResponse("missing causality token".into()))?; + + if res.status == StatusCode::NO_CONTENT { + return Ok(CausalValue { + causality, + value: vec![K2vValue::Tombstone], + }); + } + + match res.content_type.as_deref() { + Some("application/octet-stream") => Ok(CausalValue { + causality, + value: vec![K2vValue::Value(res.body)], + }), + Some("application/json") => { + let value = serde_json::from_slice(&res.body)?; + Ok(CausalValue { causality, value }) + } + Some(ct) => Err(Error::InvalidResponse( + format!("invalid content type: {}", ct).into(), + )), + None => Err(Error::InvalidResponse("missing content type".into())), + } + } + + /// Perform a PollItem request, waiting for the value(s) stored for a single pk+sk to be + /// updated. + pub async fn poll_item( + &self, + partition_key: &str, + sort_key: &str, + causality: CausalityToken, + timeout: Option, + ) -> Result, Error> { + let timeout = timeout.unwrap_or(DEFAULT_POLL_TIMEOUT); + + let mut req = SignedRequest::new( + "GET", + SERVICE, + &self.region, + &format!("/{}/{}", self.bucket, partition_key), + ); + req.add_param("sort_key", sort_key); + req.add_param("causality_token", &causality.0); + req.add_param("timeout", &timeout.as_secs().to_string()); + req.add_header(ACCEPT, "application/octet-stream, application/json"); + + let res = self.dispatch(req, Some(timeout + DEFAULT_TIMEOUT)).await?; + + let causality = res + .causality_token + .ok_or_else(|| Error::InvalidResponse("missing causality token".into()))?; + + if res.status == StatusCode::NOT_MODIFIED { + return Ok(None); + } + + if res.status == StatusCode::NO_CONTENT { + return Ok(Some(CausalValue { + causality, + value: vec![K2vValue::Tombstone], + })); + } + + match res.content_type.as_deref() { + Some("application/octet-stream") => Ok(Some(CausalValue { + causality, + value: vec![K2vValue::Value(res.body)], + })), + Some("application/json") => { + let value = serde_json::from_slice(&res.body)?; + Ok(Some(CausalValue { causality, value })) + } + Some(ct) => Err(Error::InvalidResponse( + format!("invalid content type: {}", ct).into(), + )), + None => Err(Error::InvalidResponse("missing content type".into())), + } + } + + /// Perform an InsertItem request, inserting a value for a single pk+sk. 
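// A minimal usage sketch (not part of the patch): a read/modify/write cycle with
// the methods of this impl block (`read_item` above, `insert_item` just below).
// `client` is assumed to be a K2vClient built as sketched earlier; partition key,
// sort key and payload are placeholders. Reading a key that does not exist yet
// returns Error::NotFound, which is not handled here.
async fn append_to_value(client: &K2vClient) -> Result<(), Error> {
    // The CausalValue returned by read_item carries the causality token that
    // must be echoed back on the next write to the same pk+sk.
    let cur = client.read_item("mailbox", "msg1").await?;
    // Concurrent writes may have left several values; keep the first
    // non-tombstone one, if any.
    let mut data = cur
        .value
        .iter()
        .find_map(|v| match v {
            K2vValue::Value(d) => Some(d.clone()),
            K2vValue::Tombstone => None,
        })
        .unwrap_or_default();
    data.extend_from_slice(b" (edited)");
    // Writing back with the token we just read lets the server track causality.
    client
        .insert_item("mailbox", "msg1", data, Some(cur.causality))
        .await?;
    Ok(())
}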
+ pub async fn insert_item( + &self, + partition_key: &str, + sort_key: &str, + value: Vec, + causality: Option, + ) -> Result<(), Error> { + let mut req = SignedRequest::new( + "PUT", + SERVICE, + &self.region, + &format!("/{}/{}", self.bucket, partition_key), + ); + req.add_param("sort_key", sort_key); + req.set_payload(Some(value)); + + if let Some(causality) = causality { + req.add_header(GARAGE_CAUSALITY_TOKEN, &causality.0); + } + + self.dispatch(req, None).await?; + Ok(()) + } + + /// Perform a DeleteItem request, deleting the value(s) stored for a single pk+sk. + pub async fn delete_item( + &self, + partition_key: &str, + sort_key: &str, + causality: CausalityToken, + ) -> Result<(), Error> { + let mut req = SignedRequest::new( + "DELETE", + SERVICE, + &self.region, + &format!("/{}/{}", self.bucket, partition_key), + ); + req.add_param("sort_key", sort_key); + req.add_header(GARAGE_CAUSALITY_TOKEN, &causality.0); + + self.dispatch(req, None).await?; + Ok(()) + } + + /// Perform a ReadIndex request, listing partition key which have at least one associated + /// sort key, and which matches the filter. + pub async fn read_index( + &self, + filter: Filter<'_>, + ) -> Result, Error> { + let mut req = + SignedRequest::new("GET", SERVICE, &self.region, &format!("/{}", self.bucket)); + filter.insert_params(&mut req); + + let res = self.dispatch(req, None).await?; + + let resp: ReadIndexResponse = serde_json::from_slice(&res.body)?; + + let items = resp + .partition_keys + .into_iter() + .map(|ReadIndexItem { pk, info }| (pk, info)) + .collect(); + + Ok(PaginatedRange { + items, + next_start: resp.next_start, + }) + } + + /// Perform an InsertBatch request, inserting multiple values at once. Note: this operation is + /// *not* atomic: it is possible for some sub-operations to fails and others to success. In + /// that case, failure is reported. + pub async fn insert_batch(&self, operations: &[BatchInsertOp<'_>]) -> Result<(), Error> { + let mut req = + SignedRequest::new("POST", SERVICE, &self.region, &format!("/{}", self.bucket)); + + let payload = serde_json::to_vec(operations)?; + req.set_payload(Some(payload)); + self.dispatch(req, None).await?; + Ok(()) + } + + /// Perform a ReadBatch request, reading multiple values or range of values at once. + pub async fn read_batch( + &self, + operations: &[BatchReadOp<'_>], + ) -> Result>, Error> { + let mut req = + SignedRequest::new("POST", SERVICE, &self.region, &format!("/{}", self.bucket)); + req.add_param("search", ""); + + let payload = serde_json::to_vec(operations)?; + req.set_payload(Some(payload)); + let res = self.dispatch(req, None).await?; + + let resp: Vec = serde_json::from_slice(&res.body)?; + + Ok(resp + .into_iter() + .map(|e| PaginatedRange { + items: e + .items + .into_iter() + .map(|BatchReadItem { sk, ct, v }| { + ( + sk, + CausalValue { + causality: ct, + value: v, + }, + ) + }) + .collect(), + next_start: e.next_start, + }) + .collect()) + } + + /// Perform a DeleteBatch request, deleting mutiple values or range of values at once, without + /// providing causality information. 
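// A minimal usage sketch (not part of the patch): the batch entry points of this
// impl block (`insert_batch` and `read_batch` above, `delete_batch` just below),
// together with the BatchInsertOp, BatchReadOp, BatchDeleteOp and Filter types
// defined further down in this file. Keys, values and the limit are placeholders.
async fn batch_roundtrip(client: &K2vClient) -> Result<(), Error> {
    // Insert two items in one request; as noted above, this is not atomic.
    client
        .insert_batch(&[
            BatchInsertOp {
                partition_key: "mailbox",
                sort_key: "msg1",
                causality: None,
                value: K2vValue::from(b"hello".to_vec()),
            },
            BatchInsertOp {
                partition_key: "mailbox",
                sort_key: "msg2",
                causality: None,
                value: K2vValue::from(b"world".to_vec()),
            },
        ])
        .await?;

    // Read back at most 100 entries under the "msg" prefix.
    let mut pages = client
        .read_batch(&[BatchReadOp {
            partition_key: "mailbox",
            filter: Filter {
                prefix: Some("msg"),
                limit: Some(100),
                ..Default::default()
            },
            ..Default::default()
        }])
        .await?;
    let page = pages.pop().expect("one response per BatchReadOp");
    for (sort_key, causal_value) in page.items {
        println!("{}: {} value(s)", sort_key, causal_value.value.len());
    }

    // Delete the whole partition, without causality information.
    let deleted = client
        .delete_batch(&[BatchDeleteOp::new("mailbox")])
        .await?;
    println!("deleted {} keys", deleted[0]);
    Ok(())
}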
+ pub async fn delete_batch(&self, operations: &[BatchDeleteOp<'_>]) -> Result, Error> { + let mut req = + SignedRequest::new("POST", SERVICE, &self.region, &format!("/{}", self.bucket)); + req.add_param("delete", ""); + + let payload = serde_json::to_vec(operations)?; + req.set_payload(Some(payload)); + let res = self.dispatch(req, None).await?; + + let resp: Vec = serde_json::from_slice(&res.body)?; + + Ok(resp.into_iter().map(|r| r.deleted_items).collect()) + } + + async fn dispatch( + &self, + mut req: SignedRequest, + timeout: Option, + ) -> Result { + req.sign(&self.creds); + let mut res = self + .client + .dispatch(req, Some(timeout.unwrap_or(DEFAULT_TIMEOUT))) + .await?; + + let causality_token = res + .headers + .remove(GARAGE_CAUSALITY_TOKEN) + .map(CausalityToken); + let content_type = res.headers.remove(CONTENT_TYPE); + + let body = match res.status { + StatusCode::OK => read_body(&mut res.headers, res.body).await?, + StatusCode::NO_CONTENT => Vec::new(), + StatusCode::NOT_FOUND => return Err(Error::NotFound), + StatusCode::NOT_MODIFIED => Vec::new(), + s => { + let err_body = read_body(&mut res.headers, res.body) + .await + .unwrap_or_default(); + let err_body_str = std::str::from_utf8(&err_body) + .map(String::from) + .unwrap_or_else(|_| base64::encode(&err_body)); + + if s.is_client_error() || s.is_server_error() { + error!("Error response {}: {}", res.status, err_body_str); + let err = match serde_json::from_slice::(&err_body) { + Ok(err) => Error::Remote( + res.status, + err.code.into(), + err.message.into(), + err.path.into(), + ), + Err(_) => Error::Remote( + res.status, + "unknown".into(), + err_body_str.into(), + "?".into(), + ), + }; + return Err(err); + } else { + let msg = format!( + "Unexpected response code {}. Response body: {}", + res.status, err_body_str + ); + error!("{}", msg); + return Err(Error::InvalidResponse(msg.into())); + } + } + }; + debug!( + "Response body: {}", + std::str::from_utf8(&body) + .map(String::from) + .unwrap_or_else(|_| base64::encode(&body)) + ); + + Ok(Response { + body, + status: res.status, + causality_token, + content_type, + }) + } +} + +async fn read_body(headers: &mut HeaderMap, body: ByteStream) -> Result, Error> { + let body_len = headers + .get(CONTENT_LENGTH) + .and_then(|h| h.parse().ok()) + .unwrap_or(0); + let mut res = Vec::with_capacity(body_len); + body.into_async_read().read_to_end(&mut res).await?; + Ok(res) +} + +/// An opaque token used to convey causality between operations. +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +#[serde(transparent)] +pub struct CausalityToken(String); + +impl From for CausalityToken { + fn from(v: String) -> Self { + CausalityToken(v) + } +} + +impl From for String { + fn from(v: CausalityToken) -> Self { + v.0 + } +} + +/// A value in K2V. can be either a binary value, or a tombstone. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum K2vValue { + Tombstone, + Value(Vec), +} + +impl From> for K2vValue { + fn from(v: Vec) -> Self { + K2vValue::Value(v) + } +} + +impl From>> for K2vValue { + fn from(v: Option>) -> Self { + match v { + Some(v) => K2vValue::Value(v), + None => K2vValue::Tombstone, + } + } +} + +impl<'de> Deserialize<'de> for K2vValue { + fn deserialize(d: D) -> Result + where + D: Deserializer<'de>, + { + let val: Option<&str> = Option::deserialize(d)?; + Ok(match val { + Some(s) => { + K2vValue::Value(base64::decode(s).map_err(|_| DeError::custom("invalid base64"))?) 
+ } + None => K2vValue::Tombstone, + }) + } +} + +impl Serialize for K2vValue { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self { + K2vValue::Tombstone => serializer.serialize_none(), + K2vValue::Value(v) => { + let b64 = base64::encode(v); + serializer.serialize_str(&b64) + } + } + } +} + +/// A set of K2vValue and associated causality information. +#[derive(Debug, Clone, Serialize)] +pub struct CausalValue { + pub causality: CausalityToken, + pub value: Vec, +} + +/// Result of paginated requests. +#[derive(Debug, Clone)] +pub struct PaginatedRange { + pub items: BTreeMap, + pub next_start: Option, +} + +/// Filter for batch operations. +#[derive(Debug, Default, Clone, Deserialize, Serialize)] +pub struct Filter<'a> { + pub start: Option<&'a str>, + pub end: Option<&'a str>, + pub prefix: Option<&'a str>, + pub limit: Option, + #[serde(default)] + pub reverse: bool, +} + +impl<'a> Filter<'a> { + fn insert_params(&self, req: &mut SignedRequest) { + if let Some(start) = &self.start { + req.add_param("start", start); + } + if let Some(end) = &self.end { + req.add_param("end", end); + } + if let Some(prefix) = &self.prefix { + req.add_param("prefix", prefix); + } + if let Some(limit) = &self.limit { + req.add_param("limit", &limit.to_string()); + } + if self.reverse { + req.add_param("reverse", "true"); + } + } +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +struct ReadIndexResponse<'a> { + #[serde(flatten, borrow)] + #[allow(dead_code)] + filter: Filter<'a>, + partition_keys: Vec, + #[allow(dead_code)] + more: bool, + next_start: Option, +} + +#[derive(Debug, Clone, Deserialize)] +struct ReadIndexItem { + pk: String, + #[serde(flatten)] + info: PartitionInfo, +} + +/// Information about data stored with a given partition key. +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct PartitionInfo { + pub entries: u64, + pub conflicts: u64, + pub values: u64, + pub bytes: u64, +} + +/// Single sub-operation of an InsertBatch. +#[derive(Debug, Clone, Serialize)] +pub struct BatchInsertOp<'a> { + #[serde(rename = "pk")] + pub partition_key: &'a str, + #[serde(rename = "sk")] + pub sort_key: &'a str, + #[serde(rename = "ct")] + pub causality: Option, + #[serde(rename = "v")] + pub value: K2vValue, +} + +/// Single sub-operation of a ReadBatch. 
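// An indicative sketch (not part of the patch) of the wire format produced by the
// serde attributes above: BatchInsertOp fields are renamed to pk/sk/ct/v, a
// K2vValue::Value is base64-encoded, and a missing causality token becomes null
// (see the Serialize impl for K2vValue above). The exact JSON string below is
// given only as an illustration.
#[test]
fn batch_insert_op_wire_format() {
    let op = BatchInsertOp {
        partition_key: "mailbox",
        sort_key: "msg1",
        causality: None,
        value: K2vValue::from(b"hello".to_vec()),
    };
    let json = serde_json::to_string(&op).unwrap();
    // "hello" is "aGVsbG8=" in base64.
    assert_eq!(json, r#"{"pk":"mailbox","sk":"msg1","ct":null,"v":"aGVsbG8="}"#);
}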
+#[derive(Debug, Default, Clone, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct BatchReadOp<'a> { + pub partition_key: &'a str, + #[serde(flatten, borrow)] + pub filter: Filter<'a>, + #[serde(default)] + pub single_item: bool, + #[serde(default)] + pub conflicts_only: bool, + #[serde(default)] + pub tombstones: bool, +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +struct BatchReadResponse<'a> { + #[serde(flatten, borrow)] + #[allow(dead_code)] + op: BatchReadOp<'a>, + items: Vec, + #[allow(dead_code)] + more: bool, + next_start: Option, +} + +#[derive(Debug, Clone, Deserialize)] +struct BatchReadItem { + sk: String, + ct: CausalityToken, + v: Vec, +} + +/// Single sub-operation of a DeleteBatch +#[derive(Debug, Clone, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct BatchDeleteOp<'a> { + pub partition_key: &'a str, + pub prefix: Option<&'a str>, + pub start: Option<&'a str>, + pub end: Option<&'a str>, + #[serde(default)] + pub single_item: bool, +} + +impl<'a> BatchDeleteOp<'a> { + pub fn new(partition_key: &'a str) -> Self { + BatchDeleteOp { + partition_key, + prefix: None, + start: None, + end: None, + single_item: false, + } + } +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +struct BatchDeleteResponse<'a> { + #[serde(flatten, borrow)] + #[allow(dead_code)] + filter: BatchDeleteOp<'a>, + deleted_items: u64, +} + +#[derive(Deserialize)] +struct ErrorResponse { + code: String, + message: String, + #[allow(dead_code)] + region: String, + path: String, +} + +struct Response { + body: Vec, + status: StatusCode, + causality_token: Option, + content_type: Option, +} diff --git a/src/k2v-client/src/bin/k2v-cli.rs b/src/k2v-client/src/bin/k2v-cli.rs deleted file mode 100644 index 38c39361..00000000 --- a/src/k2v-client/src/bin/k2v-cli.rs +++ /dev/null @@ -1,466 +0,0 @@ -use k2v_client::*; - -use garage_util::formater::format_table; - -use rusoto_core::credential::AwsCredentials; -use rusoto_core::Region; - -use clap::{Parser, Subcommand}; - -/// K2V command line interface -#[derive(Parser, Debug)] -#[clap(author, version, about, long_about = None)] -struct Args { - /// Name of the region to use - #[clap(short, long, env = "AWS_REGION", default_value = "garage")] - region: String, - /// Url of the endpoint to connect to - #[clap(short, long, env = "K2V_ENDPOINT")] - endpoint: String, - /// Access key ID - #[clap(short, long, env = "AWS_ACCESS_KEY_ID")] - key_id: String, - /// Access key ID - #[clap(short, long, env = "AWS_SECRET_ACCESS_KEY")] - secret: String, - /// Bucket name - #[clap(short, long, env = "K2V_BUCKET")] - bucket: String, - #[clap(subcommand)] - command: Command, -} - -#[derive(Subcommand, Debug)] -enum Command { - /// Insert a single value - Insert { - /// Partition key to insert to - partition_key: String, - /// Sort key to insert to - sort_key: String, - /// Causality of the insertion - #[clap(short, long)] - causality: Option, - /// Value to insert - #[clap(flatten)] - value: Value, - }, - /// Read a single value - Read { - /// Partition key to read from - partition_key: String, - /// Sort key to read from - sort_key: String, - /// Output formating - #[clap(flatten)] - output_kind: ReadOutputKind, - }, - /// Delete a single value - Delete { - /// Partition key to delete from - partition_key: String, - /// Sort key to delete from - sort_key: String, - /// Causality information - #[clap(short, long)] - causality: String, - }, - /// List partition keys - ReadIndex { 
- /// Output formating - #[clap(flatten)] - output_kind: BatchOutputKind, - /// Output only partition keys matching this filter - #[clap(flatten)] - filter: Filter, - }, - /// Read a range of sort keys - ReadRange { - /// Partition key to read from - partition_key: String, - /// Output formating - #[clap(flatten)] - output_kind: BatchOutputKind, - /// Output only sort keys matching this filter - #[clap(flatten)] - filter: Filter, - }, - /// Delete a range of sort keys - DeleteRange { - /// Partition key to delete from - partition_key: String, - /// Output formating - #[clap(flatten)] - output_kind: BatchOutputKind, - /// Delete only sort keys matching this filter - #[clap(flatten)] - filter: Filter, - }, -} - -/// Where to read a value from -#[derive(Parser, Debug)] -#[clap(group = clap::ArgGroup::new("value").multiple(false).required(true))] -struct Value { - /// Read value from a file. use - to read from stdin - #[clap(short, long, group = "value")] - file: Option, - /// Read a base64 value from commandline - #[clap(short, long, group = "value")] - b64: Option, - /// Read a raw (UTF-8) value from the commandline - #[clap(short, long, group = "value")] - text: Option, -} - -impl Value { - async fn to_data(&self) -> Result, Error> { - if let Some(ref text) = self.text { - Ok(text.as_bytes().to_vec()) - } else if let Some(ref b64) = self.b64 { - base64::decode(b64).map_err(|_| Error::Message("invalid base64 input".into())) - } else if let Some(ref path) = self.file { - use tokio::io::AsyncReadExt; - if path == "-" { - let mut file = tokio::io::stdin(); - let mut vec = Vec::new(); - file.read_to_end(&mut vec).await?; - Ok(vec) - } else { - let mut file = tokio::fs::File::open(path).await?; - let mut vec = Vec::new(); - file.read_to_end(&mut vec).await?; - Ok(vec) - } - } else { - unreachable!("Value must have one option set") - } - } -} - -#[derive(Parser, Debug)] -#[clap(group = clap::ArgGroup::new("output-kind").multiple(false).required(false))] -struct ReadOutputKind { - /// Base64 output. Conflicts are line separated, first line is causality token - #[clap(short, long, group = "output-kind")] - b64: bool, - /// Raw output. Conflicts generate error, causality token is not returned - #[clap(short, long, group = "output-kind")] - raw: bool, - /// Human formated output - #[clap(short = 'H', long, group = "output-kind")] - human: bool, - /// JSON formated output - #[clap(short, long, group = "output-kind")] - json: bool, -} - -impl ReadOutputKind { - fn display_output(&self, val: CausalValue) -> ! 
{ - use std::io::Write; - use std::process::exit; - - if self.json { - let stdout = std::io::stdout(); - serde_json::to_writer_pretty(stdout, &val).unwrap(); - exit(0); - } - - if self.raw { - let mut val = val.value; - if val.len() != 1 { - eprintln!( - "Raw mode can only read non-concurent values, found {} values, expected 1", - val.len() - ); - exit(1); - } - let val = val.pop().unwrap(); - match val { - K2vValue::Value(v) => { - std::io::stdout().write_all(&v).unwrap(); - exit(0); - } - K2vValue::Tombstone => { - eprintln!("Expected value, found tombstone"); - exit(2); - } - } - } - - let causality: String = val.causality.into(); - if self.b64 { - println!("{}", causality); - for val in val.value { - match val { - K2vValue::Value(v) => { - println!("{}", base64::encode(&v)) - } - K2vValue::Tombstone => { - println!(); - } - } - } - exit(0); - } - - // human - println!("causality: {}", causality); - println!("values:"); - for val in val.value { - match val { - K2vValue::Value(v) => { - if let Ok(string) = std::str::from_utf8(&v) { - println!(" utf-8: {}", string); - } else { - println!(" base64: {}", base64::encode(&v)); - } - } - K2vValue::Tombstone => { - println!(" tombstone"); - } - } - } - exit(0); - } -} - -#[derive(Parser, Debug)] -#[clap(group = clap::ArgGroup::new("output-kind").multiple(false).required(false))] -struct BatchOutputKind { - /// Human formated output - #[clap(short = 'H', long, group = "output-kind")] - human: bool, - /// JSON formated output - #[clap(short, long, group = "output-kind")] - json: bool, -} - -/// Filter for batch operations -#[derive(Parser, Debug)] -#[clap(group = clap::ArgGroup::new("filter").multiple(true).required(true))] -struct Filter { - /// Match only keys starting with this prefix - #[clap(short, long, group = "filter")] - prefix: Option, - /// Match only keys lexicographically after this key (including this key itself) - #[clap(short, long, group = "filter")] - start: Option, - /// Match only keys lexicographically before this key (excluding this key) - #[clap(short, long, group = "filter")] - end: Option, - /// Only match the first X keys - #[clap(short, long)] - limit: Option, - /// Return keys in reverse order - #[clap(short, long)] - reverse: bool, - /// Return only keys where conflict happened - #[clap(short, long)] - conflicts_only: bool, - /// Also include keys storing only tombstones - #[clap(short, long)] - tombstones: bool, - /// Return any key - #[clap(short, long, group = "filter")] - all: bool, -} - -impl Filter { - fn k2v_filter(&self) -> k2v_client::Filter<'_> { - k2v_client::Filter { - start: self.start.as_deref(), - end: self.end.as_deref(), - prefix: self.prefix.as_deref(), - limit: self.limit, - reverse: self.reverse, - } - } -} - -#[tokio::main] -async fn main() -> Result<(), Error> { - let args = Args::parse(); - - let region = Region::Custom { - name: args.region, - endpoint: args.endpoint, - }; - - let creds = AwsCredentials::new(args.key_id, args.secret, None, None); - - let client = K2vClient::new(region, args.bucket, creds, None)?; - - match args.command { - Command::Insert { - partition_key, - sort_key, - causality, - value, - } => { - client - .insert_item( - &partition_key, - &sort_key, - value.to_data().await?, - causality.map(Into::into), - ) - .await?; - } - Command::Delete { - partition_key, - sort_key, - causality, - } => { - client - .delete_item(&partition_key, &sort_key, causality.into()) - .await?; - } - Command::Read { - partition_key, - sort_key, - output_kind, - } => { - let res = 
client.read_item(&partition_key, &sort_key).await?; - output_kind.display_output(res); - } - Command::ReadIndex { - output_kind, - filter, - } => { - if filter.conflicts_only || filter.tombstones { - return Err(Error::Message( - "conlicts-only and tombstones are invalid for read-index".into(), - )); - } - let res = client.read_index(filter.k2v_filter()).await?; - if output_kind.json { - let values = res - .items - .into_iter() - .map(|(k, v)| { - let mut value = serde_json::to_value(v).unwrap(); - value - .as_object_mut() - .unwrap() - .insert("sort_key".to_owned(), k.into()); - value - }) - .collect::>(); - let json = serde_json::json!({ - "next_key": res.next_start, - "values": values, - }); - - let stdout = std::io::stdout(); - serde_json::to_writer_pretty(stdout, &json).unwrap(); - } else { - if let Some(next) = res.next_start { - println!("next key: {}", next); - } - - let mut to_print = Vec::new(); - to_print.push(format!("key:\tentries\tconflicts\tvalues\tbytes")); - for (k, v) in res.items { - to_print.push(format!( - "{}\t{}\t{}\t{}\t{}", - k, v.entries, v.conflicts, v.values, v.bytes - )); - } - format_table(to_print); - } - } - Command::ReadRange { - partition_key, - output_kind, - filter, - } => { - let op = BatchReadOp { - partition_key: &partition_key, - filter: filter.k2v_filter(), - conflicts_only: filter.conflicts_only, - tombstones: filter.tombstones, - single_item: false, - }; - let mut res = client.read_batch(&[op]).await?; - let res = res.pop().unwrap(); - if output_kind.json { - let values = res - .items - .into_iter() - .map(|(k, v)| { - let mut value = serde_json::to_value(v).unwrap(); - value - .as_object_mut() - .unwrap() - .insert("sort_key".to_owned(), k.into()); - value - }) - .collect::>(); - let json = serde_json::json!({ - "next_key": res.next_start, - "values": values, - }); - - let stdout = std::io::stdout(); - serde_json::to_writer_pretty(stdout, &json).unwrap(); - } else { - if let Some(next) = res.next_start { - println!("next key: {}", next); - } - for (key, values) in res.items { - println!("key: {}", key); - let causality: String = values.causality.into(); - println!("causality: {}", causality); - for value in values.value { - match value { - K2vValue::Value(v) => { - if let Ok(string) = std::str::from_utf8(&v) { - println!(" value(utf-8): {}", string); - } else { - println!(" value(base64): {}", base64::encode(&v)); - } - } - K2vValue::Tombstone => { - println!(" tombstone"); - } - } - } - } - } - } - Command::DeleteRange { - partition_key, - output_kind, - filter, - } => { - let op = BatchDeleteOp { - partition_key: &partition_key, - prefix: filter.prefix.as_deref(), - start: filter.start.as_deref(), - end: filter.end.as_deref(), - single_item: false, - }; - if filter.reverse - || filter.conflicts_only - || filter.tombstones - || filter.limit.is_some() - { - return Err(Error::Message( - "limit, conlicts-only, reverse and tombstones are invalid for delete-range" - .into(), - )); - } - - let res = client.delete_batch(&[op]).await?; - - if output_kind.json { - println!("{}", res[0]); - } else { - println!("deleted {} keys", res[0]); - } - } - } - - Ok(()) -} diff --git a/src/k2v-client/src/error.rs b/src/k2v-client/src/error.rs deleted file mode 100644 index 62357934..00000000 --- a/src/k2v-client/src/error.rs +++ /dev/null @@ -1,22 +0,0 @@ -use std::borrow::Cow; - -use thiserror::Error; - -/// Errors returned by this crate -#[derive(Error, Debug)] -pub enum Error { - #[error("received invalid response: {0}")] - InvalidResponse(Cow<'static, str>), - 
#[error("not found")] - NotFound, - #[error("io error: {0}")] - IoError(#[from] std::io::Error), - #[error("rusoto tls error: {0}")] - RusotoTls(#[from] rusoto_core::request::TlsError), - #[error("rusoto http error: {0}")] - RusotoHttp(#[from] rusoto_core::HttpDispatchError), - #[error("deserialization error: {0}")] - Deserialization(#[from] serde_json::Error), - #[error("{0}")] - Message(Cow<'static, str>), -} diff --git a/src/k2v-client/src/lib.rs b/src/k2v-client/src/lib.rs deleted file mode 100644 index ba1cd6ea..00000000 --- a/src/k2v-client/src/lib.rs +++ /dev/null @@ -1,566 +0,0 @@ -use std::collections::BTreeMap; -use std::time::Duration; - -use http::header::{ACCEPT, CONTENT_LENGTH, CONTENT_TYPE}; -use http::status::StatusCode; -use http::HeaderMap; - -use rusoto_core::{ByteStream, DispatchSignedRequest, HttpClient}; -use rusoto_credential::AwsCredentials; -use rusoto_signature::region::Region; -use rusoto_signature::signature::SignedRequest; -use serde::de::Error as DeError; -use serde::{Deserialize, Deserializer, Serialize, Serializer}; - -use tokio::io::AsyncReadExt; - -mod error; - -pub use error::Error; - -const DEFAULT_TIMEOUT: Duration = Duration::from_secs(5); -const DEFAULT_POLL_TIMEOUT: Duration = Duration::from_secs(300); -const SERVICE: &str = "k2v"; -const GARAGE_CAUSALITY_TOKEN: &str = "X-Garage-Causality-Token"; - -/// Client used to query a K2V server. -pub struct K2vClient { - region: Region, - bucket: String, - creds: AwsCredentials, - client: HttpClient, -} - -impl K2vClient { - /// Create a new K2V client. - pub fn new( - region: Region, - bucket: String, - creds: AwsCredentials, - user_agent: Option, - ) -> Result { - let mut client = HttpClient::new()?; - if let Some(ua) = user_agent { - client.local_agent_prepend(ua); - } else { - client.local_agent_prepend(format!("k2v/{}", env!("CARGO_PKG_VERSION"))); - } - Ok(K2vClient { - region, - bucket, - creds, - client, - }) - } - - /// Perform a ReadItem request, reading the value(s) stored for a single pk+sk. - pub async fn read_item( - &self, - partition_key: &str, - sort_key: &str, - ) -> Result { - let mut req = SignedRequest::new( - "GET", - SERVICE, - &self.region, - &format!("/{}/{}", self.bucket, partition_key), - ); - req.add_param("sort_key", sort_key); - req.add_header(ACCEPT, "application/octet-stream, application/json"); - - let res = self.dispatch(req, None).await?; - - let causality = res - .causality_token - .ok_or_else(|| Error::InvalidResponse("missing causality token".into()))?; - - if res.status == StatusCode::NO_CONTENT { - return Ok(CausalValue { - causality, - value: vec![K2vValue::Tombstone], - }); - } - - match res.content_type.as_deref() { - Some("application/octet-stream") => Ok(CausalValue { - causality, - value: vec![K2vValue::Value(res.body)], - }), - Some("application/json") => { - let value = serde_json::from_slice(&res.body)?; - Ok(CausalValue { causality, value }) - } - Some(ct) => Err(Error::InvalidResponse( - format!("invalid content type: {}", ct).into(), - )), - None => Err(Error::InvalidResponse("missing content type".into())), - } - } - - /// Perform a PollItem request, waiting for the value(s) stored for a single pk+sk to be - /// updated. 
- pub async fn poll_item( - &self, - partition_key: &str, - sort_key: &str, - causality: CausalityToken, - timeout: Option, - ) -> Result, Error> { - let timeout = timeout.unwrap_or(DEFAULT_POLL_TIMEOUT); - - let mut req = SignedRequest::new( - "GET", - SERVICE, - &self.region, - &format!("/{}/{}", self.bucket, partition_key), - ); - req.add_param("sort_key", sort_key); - req.add_param("causality_token", &causality.0); - req.add_param("timeout", &timeout.as_secs().to_string()); - req.add_header(ACCEPT, "application/octet-stream, application/json"); - - let res = self.dispatch(req, Some(timeout + DEFAULT_TIMEOUT)).await?; - - let causality = res - .causality_token - .ok_or_else(|| Error::InvalidResponse("missing causality token".into()))?; - - if res.status == StatusCode::NOT_MODIFIED { - return Ok(None); - } - - if res.status == StatusCode::NO_CONTENT { - return Ok(Some(CausalValue { - causality, - value: vec![K2vValue::Tombstone], - })); - } - - match res.content_type.as_deref() { - Some("application/octet-stream") => Ok(Some(CausalValue { - causality, - value: vec![K2vValue::Value(res.body)], - })), - Some("application/json") => { - let value = serde_json::from_slice(&res.body)?; - Ok(Some(CausalValue { causality, value })) - } - Some(ct) => Err(Error::InvalidResponse( - format!("invalid content type: {}", ct).into(), - )), - None => Err(Error::InvalidResponse("missing content type".into())), - } - } - - /// Perform an InsertItem request, inserting a value for a single pk+sk. - pub async fn insert_item( - &self, - partition_key: &str, - sort_key: &str, - value: Vec, - causality: Option, - ) -> Result<(), Error> { - let mut req = SignedRequest::new( - "PUT", - SERVICE, - &self.region, - &format!("/{}/{}", self.bucket, partition_key), - ); - req.add_param("sort_key", sort_key); - req.set_payload(Some(value)); - - if let Some(causality) = causality { - req.add_header(GARAGE_CAUSALITY_TOKEN, &causality.0); - } - - self.dispatch(req, None).await?; - Ok(()) - } - - /// Perform a DeleteItem request, deleting the value(s) stored for a single pk+sk. - pub async fn delete_item( - &self, - partition_key: &str, - sort_key: &str, - causality: CausalityToken, - ) -> Result<(), Error> { - let mut req = SignedRequest::new( - "DELETE", - SERVICE, - &self.region, - &format!("/{}/{}", self.bucket, partition_key), - ); - req.add_param("sort_key", sort_key); - req.add_header(GARAGE_CAUSALITY_TOKEN, &causality.0); - - self.dispatch(req, None).await?; - Ok(()) - } - - /// Perform a ReadIndex request, listing partition key which have at least one associated - /// sort key, and which matches the filter. - pub async fn read_index( - &self, - filter: Filter<'_>, - ) -> Result, Error> { - let mut req = - SignedRequest::new("GET", SERVICE, &self.region, &format!("/{}", self.bucket)); - filter.insert_params(&mut req); - - let res = self.dispatch(req, None).await?; - - let resp: ReadIndexResponse = serde_json::from_slice(&res.body)?; - - let items = resp - .partition_keys - .into_iter() - .map(|ReadIndexItem { pk, info }| (pk, info)) - .collect(); - - Ok(PaginatedRange { - items, - next_start: resp.next_start, - }) - } - - /// Perform an InsertBatch request, inserting multiple values at once. Note: this operation is - /// *not* atomic: it is possible for some sub-operations to fails and others to success. In - /// that case, failure is reported. 
- pub async fn insert_batch(&self, operations: &[BatchInsertOp<'_>]) -> Result<(), Error> { - let mut req = - SignedRequest::new("POST", SERVICE, &self.region, &format!("/{}", self.bucket)); - - let payload = serde_json::to_vec(operations)?; - req.set_payload(Some(payload)); - self.dispatch(req, None).await?; - Ok(()) - } - - /// Perform a ReadBatch request, reading multiple values or range of values at once. - pub async fn read_batch( - &self, - operations: &[BatchReadOp<'_>], - ) -> Result>, Error> { - let mut req = - SignedRequest::new("POST", SERVICE, &self.region, &format!("/{}", self.bucket)); - req.add_param("search", ""); - - let payload = serde_json::to_vec(operations)?; - req.set_payload(Some(payload)); - let res = self.dispatch(req, None).await?; - - let resp: Vec = serde_json::from_slice(&res.body)?; - - Ok(resp - .into_iter() - .map(|e| PaginatedRange { - items: e - .items - .into_iter() - .map(|BatchReadItem { sk, ct, v }| { - ( - sk, - CausalValue { - causality: ct, - value: v, - }, - ) - }) - .collect(), - next_start: e.next_start, - }) - .collect()) - } - - /// Perform a DeleteBatch request, deleting mutiple values or range of values at once, without - /// providing causality information. - pub async fn delete_batch(&self, operations: &[BatchDeleteOp<'_>]) -> Result, Error> { - let mut req = - SignedRequest::new("POST", SERVICE, &self.region, &format!("/{}", self.bucket)); - req.add_param("delete", ""); - - let payload = serde_json::to_vec(operations)?; - req.set_payload(Some(payload)); - let res = self.dispatch(req, None).await?; - - let resp: Vec = serde_json::from_slice(&res.body)?; - - Ok(resp.into_iter().map(|r| r.deleted_items).collect()) - } - - async fn dispatch( - &self, - mut req: SignedRequest, - timeout: Option, - ) -> Result { - req.sign(&self.creds); - let mut res = self - .client - .dispatch(req, Some(timeout.unwrap_or(DEFAULT_TIMEOUT))) - .await?; - - let causality_token = res - .headers - .remove(GARAGE_CAUSALITY_TOKEN) - .map(CausalityToken); - let content_type = res.headers.remove(CONTENT_TYPE); - - let body = match res.status { - StatusCode::OK => read_body(&mut res.headers, res.body).await?, - StatusCode::NO_CONTENT => Vec::new(), - StatusCode::NOT_FOUND => return Err(Error::NotFound), - StatusCode::NOT_MODIFIED => Vec::new(), - _ => { - return Err(Error::InvalidResponse( - format!("invalid error code: {}", res.status).into(), - )) - } - }; - - Ok(Response { - body, - status: res.status, - causality_token, - content_type, - }) - } -} - -async fn read_body(headers: &mut HeaderMap, body: ByteStream) -> Result, Error> { - let body_len = headers - .get(CONTENT_LENGTH) - .and_then(|h| h.parse().ok()) - .unwrap_or(0); - let mut res = Vec::with_capacity(body_len); - body.into_async_read().read_to_end(&mut res).await?; - Ok(res) -} - -/// An opaque token used to convey causality between operations. -#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] -#[serde(transparent)] -pub struct CausalityToken(String); - -impl From for CausalityToken { - fn from(v: String) -> Self { - CausalityToken(v) - } -} - -impl From for String { - fn from(v: CausalityToken) -> Self { - v.0 - } -} - -/// A value in K2V. can be either a binary value, or a tombstone. 
-#[derive(Debug, Clone, PartialEq, Eq)] -pub enum K2vValue { - Tombstone, - Value(Vec), -} - -impl From> for K2vValue { - fn from(v: Vec) -> Self { - K2vValue::Value(v) - } -} - -impl From>> for K2vValue { - fn from(v: Option>) -> Self { - match v { - Some(v) => K2vValue::Value(v), - None => K2vValue::Tombstone, - } - } -} - -impl<'de> Deserialize<'de> for K2vValue { - fn deserialize(d: D) -> Result - where - D: Deserializer<'de>, - { - let val: Option<&str> = Option::deserialize(d)?; - Ok(match val { - Some(s) => { - K2vValue::Value(base64::decode(s).map_err(|_| DeError::custom("invalid base64"))?) - } - None => K2vValue::Tombstone, - }) - } -} - -impl Serialize for K2vValue { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - match self { - K2vValue::Tombstone => serializer.serialize_none(), - K2vValue::Value(v) => { - let b64 = base64::encode(v); - serializer.serialize_str(&b64) - } - } - } -} - -/// A set of K2vValue and associated causality information. -#[derive(Debug, Clone, Serialize)] -pub struct CausalValue { - pub causality: CausalityToken, - pub value: Vec, -} - -/// Result of paginated requests. -#[derive(Debug, Clone)] -pub struct PaginatedRange { - pub items: BTreeMap, - pub next_start: Option, -} - -/// Filter for batch operations. -#[derive(Debug, Default, Clone, Deserialize, Serialize)] -pub struct Filter<'a> { - pub start: Option<&'a str>, - pub end: Option<&'a str>, - pub prefix: Option<&'a str>, - pub limit: Option, - #[serde(default)] - pub reverse: bool, -} - -impl<'a> Filter<'a> { - fn insert_params(&self, req: &mut SignedRequest) { - if let Some(start) = &self.start { - req.add_param("start", start); - } - if let Some(end) = &self.end { - req.add_param("end", end); - } - if let Some(prefix) = &self.prefix { - req.add_param("prefix", prefix); - } - if let Some(limit) = &self.limit { - req.add_param("limit", &limit.to_string()); - } - if self.reverse { - req.add_param("reverse", "true"); - } - } -} - -#[derive(Debug, Clone, Deserialize)] -#[serde(rename_all = "camelCase")] -struct ReadIndexResponse<'a> { - #[serde(flatten, borrow)] - #[allow(dead_code)] - filter: Filter<'a>, - partition_keys: Vec, - #[allow(dead_code)] - more: bool, - next_start: Option, -} - -#[derive(Debug, Clone, Deserialize)] -struct ReadIndexItem { - pk: String, - #[serde(flatten)] - info: PartitionInfo, -} - -/// Information about data stored with a given partition key. -#[derive(Debug, Clone, Deserialize, Serialize)] -pub struct PartitionInfo { - pub entries: u64, - pub conflicts: u64, - pub values: u64, - pub bytes: u64, -} - -/// Single sub-operation of an InsertBatch. -#[derive(Debug, Clone, Serialize)] -pub struct BatchInsertOp<'a> { - #[serde(rename = "pk")] - pub partition_key: &'a str, - #[serde(rename = "sk")] - pub sort_key: &'a str, - #[serde(rename = "ct")] - pub causality: Option, - #[serde(rename = "v")] - pub value: K2vValue, -} - -/// Single sub-operation of a ReadBatch. 
-#[derive(Debug, Default, Clone, Deserialize, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct BatchReadOp<'a> { - pub partition_key: &'a str, - #[serde(flatten, borrow)] - pub filter: Filter<'a>, - #[serde(default)] - pub single_item: bool, - #[serde(default)] - pub conflicts_only: bool, - #[serde(default)] - pub tombstones: bool, -} - -#[derive(Debug, Clone, Deserialize)] -#[serde(rename_all = "camelCase")] -struct BatchReadResponse<'a> { - #[serde(flatten, borrow)] - #[allow(dead_code)] - op: BatchReadOp<'a>, - items: Vec, - #[allow(dead_code)] - more: bool, - next_start: Option, -} - -#[derive(Debug, Clone, Deserialize)] -struct BatchReadItem { - sk: String, - ct: CausalityToken, - v: Vec, -} - -/// Single sub-operation of a DeleteBatch -#[derive(Debug, Clone, Deserialize, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct BatchDeleteOp<'a> { - pub partition_key: &'a str, - pub prefix: Option<&'a str>, - pub start: Option<&'a str>, - pub end: Option<&'a str>, - #[serde(default)] - pub single_item: bool, -} - -impl<'a> BatchDeleteOp<'a> { - pub fn new(partition_key: &'a str) -> Self { - BatchDeleteOp { - partition_key, - prefix: None, - start: None, - end: None, - single_item: false, - } - } -} - -#[derive(Debug, Clone, Deserialize)] -#[serde(rename_all = "camelCase")] -struct BatchDeleteResponse<'a> { - #[serde(flatten, borrow)] - #[allow(dead_code)] - filter: BatchDeleteOp<'a>, - deleted_items: u64, -} - -struct Response { - body: Vec, - status: StatusCode, - causality_token: Option, - content_type: Option, -} -- cgit v1.2.3 From ff06d3f0829464863e64ed55471f2caa13bed191 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 25 May 2022 17:05:56 +0200 Subject: Fix Content-Type headers for {admin,k2v} errors and admin responses Fix #315 --- src/api/admin/bucket.rs | 13 +++---------- src/api/admin/cluster.rs | 19 +++++-------------- src/api/admin/error.rs | 5 +++-- src/api/admin/key.rs | 14 +++----------- src/api/generic_server.rs | 4 +--- src/api/helpers.rs | 10 +++++++++- src/api/k2v/error.rs | 5 +++-- src/api/s3/error.rs | 3 +++ 8 files changed, 30 insertions(+), 43 deletions(-) (limited to 'src') diff --git a/src/api/admin/bucket.rs b/src/api/admin/bucket.rs index 849d28ac..7f9a813f 100644 --- a/src/api/admin/bucket.rs +++ b/src/api/admin/bucket.rs @@ -6,7 +6,6 @@ use serde::{Deserialize, Serialize}; use garage_util::crdt::*; use garage_util::data::*; -use garage_util::error::Error as GarageError; use garage_util::time::*; use garage_table::*; @@ -19,7 +18,7 @@ use garage_model::permission::*; use crate::admin::error::*; use crate::admin::key::ApiBucketKeyPerm; use crate::common_error::CommonError; -use crate::helpers::parse_json_body; +use crate::helpers::{json_ok_response, parse_json_body}; pub async fn handle_list_buckets(garage: &Arc) -> Result, Error> { let buckets = garage @@ -60,10 +59,7 @@ pub async fn handle_list_buckets(garage: &Arc) -> Result, }) .collect::>(); - let resp_json = serde_json::to_string_pretty(&res).map_err(GarageError::from)?; - Ok(Response::builder() - .status(StatusCode::OK) - .body(Body::from(resp_json))?) + Ok(json_ok_response(&res)?) } #[derive(Serialize)] @@ -197,10 +193,7 @@ async fn bucket_info_results( .collect::>(), }; - let resp_json = serde_json::to_string_pretty(&res).map_err(GarageError::from)?; - Ok(Response::builder() - .status(StatusCode::OK) - .body(Body::from(resp_json))?) + Ok(json_ok_response(&res)?) 
} #[derive(Serialize)] diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 3401be42..6d01317d 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -7,14 +7,13 @@ use serde::{Deserialize, Serialize}; use garage_util::crdt::*; use garage_util::data::*; -use garage_util::error::Error as GarageError; use garage_rpc::layout::*; use garage_model::garage::Garage; use crate::admin::error::*; -use crate::helpers::parse_json_body; +use crate::helpers::{json_ok_response, parse_json_body}; pub async fn handle_get_cluster_status(garage: &Arc) -> Result, Error> { let res = GetClusterStatusResponse { @@ -39,10 +38,7 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result>(); - let resp_json = serde_json::to_string_pretty(&res).map_err(GarageError::from)?; - Ok(Response::builder() - .status(StatusCode::OK) - .body(Body::from(resp_json))?) + Ok(json_ok_response(&res)?) } pub async fn handle_get_cluster_layout(garage: &Arc) -> Result, Error> { let res = get_cluster_layout(garage); - let resp_json = serde_json::to_string_pretty(&res).map_err(GarageError::from)?; - Ok(Response::builder() - .status(StatusCode::OK) - .body(Body::from(resp_json))?) + + Ok(json_ok_response(&res)?) } fn get_cluster_layout(garage: &Arc) -> GetClusterLayoutResponse { diff --git a/src/api/admin/error.rs b/src/api/admin/error.rs index c4613cb3..ed1a07bd 100644 --- a/src/api/admin/error.rs +++ b/src/api/admin/error.rs @@ -72,8 +72,9 @@ impl ApiError for Error { } } - fn add_http_headers(&self, _header_map: &mut HeaderMap) { - // nothing + fn add_http_headers(&self, header_map: &mut HeaderMap) { + use hyper::header; + header_map.append(header::CONTENT_TYPE, "application/json".parse().unwrap()); } fn http_body(&self, garage_region: &str, path: &str) -> Body { diff --git a/src/api/admin/key.rs b/src/api/admin/key.rs index f30b5dbb..2bbabb7b 100644 --- a/src/api/admin/key.rs +++ b/src/api/admin/key.rs @@ -4,15 +4,13 @@ use std::sync::Arc; use hyper::{Body, Request, Response, StatusCode}; use serde::{Deserialize, Serialize}; -use garage_util::error::Error as GarageError; - use garage_table::*; use garage_model::garage::Garage; use garage_model::key_table::*; use crate::admin::error::*; -use crate::helpers::parse_json_body; +use crate::helpers::{json_ok_response, parse_json_body}; pub async fn handle_list_keys(garage: &Arc) -> Result, Error> { let res = garage @@ -32,10 +30,7 @@ pub async fn handle_list_keys(garage: &Arc) -> Result, Er }) .collect::>(); - let resp_json = serde_json::to_string_pretty(&res).map_err(GarageError::from)?; - Ok(Response::builder() - .status(StatusCode::OK) - .body(Body::from(resp_json))?) + Ok(json_ok_response(&res)?) } #[derive(Serialize)] @@ -221,10 +216,7 @@ async fn key_info_results(garage: &Arc, key: Key) -> Result>(), }; - let resp_json = serde_json::to_string_pretty(&res).map_err(GarageError::from)?; - Ok(Response::builder() - .status(StatusCode::OK) - .body(Body::from(resp_json))?) + Ok(json_ok_response(&res)?) 
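// A minimal sketch (not part of the patch) of the pattern this commit introduces:
// admin and K2V handlers now go through the new json_ok_response helper (added to
// src/api/helpers.rs below), which serializes the response and sets the
// Content-Type: application/json header that was previously missing.
// ExampleResponse and handle_example are hypothetical names, not Garage code;
// Error stands for the admin API error type used by the surrounding handlers.
#[derive(serde::Serialize)]
struct ExampleResponse {
    ok: bool,
}

fn handle_example() -> Result<Response<Body>, Error> {
    let res = ExampleResponse { ok: true };
    // One call replaces the manual serde_json::to_string_pretty + Response::builder dance.
    Ok(json_ok_response(&res)?)
}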
} #[derive(Serialize)] diff --git a/src/api/generic_server.rs b/src/api/generic_server.rs index 77278908..a48be1bc 100644 --- a/src/api/generic_server.rs +++ b/src/api/generic_server.rs @@ -150,9 +150,7 @@ impl ApiServer { } Err(e) => { let body: Body = e.http_body(&self.region, uri.path()); - let mut http_error_builder = Response::builder() - .status(e.http_status_code()) - .header("Content-Type", "application/xml"); + let mut http_error_builder = Response::builder().status(e.http_status_code()); if let Some(header_map) = http_error_builder.headers_mut() { e.add_http_headers(header_map) diff --git a/src/api/helpers.rs b/src/api/helpers.rs index 9fb12dbe..642dbc42 100644 --- a/src/api/helpers.rs +++ b/src/api/helpers.rs @@ -1,4 +1,4 @@ -use hyper::{Body, Request}; +use hyper::{Body, Request, Response}; use idna::domain_to_unicode; use serde::{Deserialize, Serialize}; @@ -144,6 +144,14 @@ pub async fn parse_json_body Deserialize<'de>>(req: Request) - Ok(resp) } +pub fn json_ok_response(res: &T) -> Result, Error> { + let resp_json = serde_json::to_string_pretty(res).map_err(garage_util::error::Error::from)?; + Ok(Response::builder() + .status(hyper::StatusCode::OK) + .header(http::header::CONTENT_TYPE, "application/json") + .body(Body::from(resp_json))?) +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/api/k2v/error.rs b/src/api/k2v/error.rs index 4c55d8b5..42491466 100644 --- a/src/api/k2v/error.rs +++ b/src/api/k2v/error.rs @@ -110,8 +110,9 @@ impl ApiError for Error { } } - fn add_http_headers(&self, _header_map: &mut HeaderMap) { - // nothing + fn add_http_headers(&self, header_map: &mut HeaderMap) { + use hyper::header; + header_map.append(header::CONTENT_TYPE, "application/json".parse().unwrap()); } fn http_body(&self, garage_region: &str, path: &str) -> Body { diff --git a/src/api/s3/error.rs b/src/api/s3/error.rs index ac632540..67009d63 100644 --- a/src/api/s3/error.rs +++ b/src/api/s3/error.rs @@ -172,6 +172,9 @@ impl ApiError for Error { fn add_http_headers(&self, header_map: &mut HeaderMap) { use hyper::header; + + header_map.append(header::CONTENT_TYPE, "application/xml".parse().unwrap()); + #[allow(clippy::single_match)] match self { Error::InvalidRange((_, len)) => { -- cgit v1.2.3 From b44d3fc796484a50cd6854f20c9b46e5fddedc9d Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 8 Jun 2022 10:01:44 +0200 Subject: Abstract database behind generic interface and implement alternative drivers (#322) - [x] Design interface - [x] Implement Sled backend - [x] Re-implement the SledCountedTree hack ~~on Sled backend~~ on all backends (i.e. over the abstraction) - [x] Convert Garage code to use generic interface - [x] Proof-read converted Garage code - [ ] Test everything well - [x] Implement sqlite backend - [x] Implement LMDB backend - [ ] (Implement Persy backend?) - [ ] (Implement other backends? (like RocksDB, ...)) - [x] Implement backend choice in config file and garage server module - [x] Add CLI for converting between DB formats - Exploit the new interface to put more things in transactions - [x] `.updated()` trigger on Garage tables Fix #284 **Bugs** - [x] When exporting sqlite, trees iterate empty?? - [x] LMDB doesn't work **Known issues for various back-ends** - Sled: - Eats all my RAM and also all my disk space - `.len()` has to traverse the whole table - Is actually quite slow on some operations - And is actually pretty bad code... - Sqlite: - Requires a lock to be taken on all operations. 
The lock is also taken when iterating on a table with `.iter()`, and the lock isn't released until the iterator is dropped. This means that we must be VERY carefull to not do anything else inside a `.iter()` loop or else we will have a deadlock! Most such cases have been eliminated from the Garage codebase, but there might still be some that remain. If your Garage-over-Sqlite seems to hang/freeze, this is the reason. - (adapter uses a bunch of unsafe code) - Heed (LMDB): - Not suited for 32-bit machines as it has to map the whole DB in memory. - (adpater uses a tiny bit of unsafe code) **My recommendation:** avoid 32-bit machines and use LMDB as much as possible. **Converting databases** is actually quite easy. For example from Sled to LMDB: ```bash cd src/db cargo run --features cli --bin convert -- -i path/to/garage/meta/db -a sled -o path/to/garage/meta/db.lmdb -b lmdb ``` Then, just add this to your `config.toml`: ```toml db_engine = "lmdb" ``` Co-authored-by: Alex Auvolat Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/322 Co-authored-by: Alex Co-committed-by: Alex --- src/api/admin/cluster.rs | 2 + src/block/Cargo.toml | 3 +- src/block/manager.rs | 127 +++++++--- src/block/metrics.rs | 4 +- src/block/rc.rs | 49 ++-- src/db/Cargo.toml | 36 +++ src/db/bin/convert.rs | 76 ++++++ src/db/counted_tree_hack.rs | 127 ++++++++++ src/db/lib.rs | 400 ++++++++++++++++++++++++++++++++ src/db/lmdb_adapter.rs | 329 ++++++++++++++++++++++++++ src/db/sled_adapter.rs | 260 +++++++++++++++++++++ src/db/sqlite_adapter.rs | 500 ++++++++++++++++++++++++++++++++++++++++ src/db/test.rs | 106 +++++++++ src/garage/Cargo.toml | 3 +- src/garage/admin.rs | 45 ++-- src/garage/repair.rs | 50 ++-- src/garage/server.rs | 54 ++++- src/garage/tests/bucket.rs | 8 +- src/model/Cargo.toml | 3 +- src/model/garage.rs | 8 +- src/model/index_counter.rs | 62 +++-- src/model/k2v/item_table.rs | 24 +- src/model/migrate.rs | 6 +- src/model/s3/block_ref_table.rs | 21 +- src/model/s3/object_table.rs | 12 +- src/model/s3/version_table.rs | 13 +- src/table/Cargo.toml | 3 +- src/table/data.rs | 113 ++++----- src/table/gc.rs | 41 ++-- src/table/merkle.rs | 101 ++++---- src/table/metrics.rs | 21 +- src/table/schema.rs | 19 +- src/table/sync.rs | 16 +- src/table/table.rs | 4 +- src/util/Cargo.toml | 4 +- src/util/config.rs | 11 +- src/util/error.rs | 12 +- src/util/lib.rs | 2 +- src/util/sled_counter.rs | 100 -------- 39 files changed, 2362 insertions(+), 413 deletions(-) create mode 100644 src/db/Cargo.toml create mode 100644 src/db/bin/convert.rs create mode 100644 src/db/counted_tree_hack.rs create mode 100644 src/db/lib.rs create mode 100644 src/db/lmdb_adapter.rs create mode 100644 src/db/sled_adapter.rs create mode 100644 src/db/sqlite_adapter.rs create mode 100644 src/db/test.rs delete mode 100644 src/util/sled_counter.rs (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 6d01317d..4b7716a3 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -19,6 +19,7 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result) -> GetClusterLayoutResponse { struct GetClusterStatusResponse { node: String, garage_version: &'static str, + db_engine: String, known_nodes: HashMap, layout: GetClusterLayoutResponse, } diff --git a/src/block/Cargo.toml b/src/block/Cargo.toml index 9cba69ee..80346aca 100644 --- a/src/block/Cargo.toml +++ b/src/block/Cargo.toml @@ -14,6 +14,7 @@ path = "lib.rs" # See more keys and their definitions at 
https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +garage_db = { version = "0.8.0", path = "../db" } garage_rpc = { version = "0.7.0", path = "../rpc" } garage_util = { version = "0.7.0", path = "../util" } garage_table = { version = "0.7.0", path = "../table" } @@ -27,8 +28,6 @@ tracing = "0.1.30" rand = "0.8" zstd = { version = "0.9", default-features = false } -sled = "0.34" - rmp-serde = "0.15" serde = { version = "1.0", default-features = false, features = ["derive", "rc"] } serde_bytes = "0.11" diff --git a/src/block/manager.rs b/src/block/manager.rs index 9b2d9cad..32ba0431 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -1,3 +1,5 @@ +use core::ops::Bound; + use std::convert::TryInto; use std::path::{Path, PathBuf}; use std::sync::Arc; @@ -17,10 +19,12 @@ use opentelemetry::{ Context, KeyValue, }; +use garage_db as db; +use garage_db::counted_tree_hack::CountedTree; + use garage_util::data::*; use garage_util::error::*; use garage_util::metrics::RecordDuration; -use garage_util::sled_counter::SledCountedTree; use garage_util::time::*; use garage_util::tranquilizer::Tranquilizer; @@ -91,9 +95,9 @@ pub struct BlockManager { rc: BlockRc, - resync_queue: SledCountedTree, + resync_queue: CountedTree, resync_notify: Notify, - resync_errors: SledCountedTree, + resync_errors: CountedTree, system: Arc, endpoint: Arc>, @@ -108,7 +112,7 @@ struct BlockManagerLocked(); impl BlockManager { pub fn new( - db: &sled::Db, + db: &db::Db, data_dir: PathBuf, compression_level: Option, background_tranquility: u32, @@ -123,12 +127,14 @@ impl BlockManager { let resync_queue = db .open_tree("block_local_resync_queue") .expect("Unable to open block_local_resync_queue tree"); - let resync_queue = SledCountedTree::new(resync_queue); + let resync_queue = + CountedTree::new(resync_queue).expect("Could not count block_local_resync_queue"); let resync_errors = db .open_tree("block_local_resync_errors") .expect("Unable to open block_local_resync_errors tree"); - let resync_errors = SledCountedTree::new(resync_errors); + let resync_errors = + CountedTree::new(resync_errors).expect("Could not count block_local_resync_errors"); let endpoint = system .netapp @@ -219,11 +225,44 @@ impl BlockManager { /// to fix any mismatch between the two. pub async fn repair_data_store(&self, must_exit: &watch::Receiver) -> Result<(), Error> { // 1. Repair blocks from RC table. - for (i, entry) in self.rc.rc.iter().enumerate() { - let (hash, _) = entry?; - let hash = Hash::try_from(&hash[..]).unwrap(); - self.put_to_resync(&hash, Duration::from_secs(0))?; - if i & 0xFF == 0 && *must_exit.borrow() { + let mut next_start: Option = None; + loop { + // We have to do this complicated two-step process where we first read a bunch + // of hashes from the RC table, and then insert them in the to-resync queue, + // because of SQLite. Basically, as long as we have an iterator on a DB table, + // we can't do anything else on the DB. The naive approach (which we had previously) + // of just iterating on the RC table and inserting items one to one in the resync + // queue can't work here, it would just provoke a deadlock in the SQLite adapter code. + // This is mostly because the Rust bindings for SQLite assume a worst-case scenario + // where SQLite is not compiled in thread-safe mode, so we have to wrap everything + // in a mutex (see db/sqlite_adapter.rs and discussion in PR #322). 
+ let mut batch_of_hashes = vec![]; + let start_bound = match next_start.as_ref() { + None => Bound::Unbounded, + Some(x) => Bound::Excluded(x.as_slice()), + }; + for entry in self + .rc + .rc + .range::<&[u8], _>((start_bound, Bound::Unbounded))? + { + let (hash, _) = entry?; + let hash = Hash::try_from(&hash[..]).unwrap(); + batch_of_hashes.push(hash); + if batch_of_hashes.len() >= 1000 { + break; + } + } + if batch_of_hashes.is_empty() { + break; + } + + for hash in batch_of_hashes.into_iter() { + self.put_to_resync(&hash, Duration::from_secs(0))?; + next_start = Some(hash) + } + + if *must_exit.borrow() { return Ok(()); } } @@ -264,46 +303,69 @@ impl BlockManager { } /// Get lenght of resync queue - pub fn resync_queue_len(&self) -> usize { - self.resync_queue.len() + pub fn resync_queue_len(&self) -> Result { + // This currently can't return an error because the CountedTree hack + // doesn't error on .len(), but this will change when we remove the hack + // (hopefully someday!) + Ok(self.resync_queue.len()) } /// Get number of blocks that have an error - pub fn resync_errors_len(&self) -> usize { - self.resync_errors.len() + pub fn resync_errors_len(&self) -> Result { + // (see resync_queue_len comment) + Ok(self.resync_errors.len()) } /// Get number of items in the refcount table - pub fn rc_len(&self) -> usize { - self.rc.rc.len() + pub fn rc_len(&self) -> Result { + Ok(self.rc.rc.len()?) } //// ----- Managing the reference counter ---- /// Increment the number of time a block is used, putting it to resynchronization if it is /// required, but not known - pub fn block_incref(&self, hash: &Hash) -> Result<(), Error> { - if self.rc.block_incref(hash)? { + pub fn block_incref( + self: &Arc, + tx: &mut db::Transaction, + hash: Hash, + ) -> db::TxOpResult<()> { + if self.rc.block_incref(tx, &hash)? { // When the reference counter is incremented, there is // normally a node that is responsible for sending us the // data of the block. However that operation may fail, // so in all cases we add the block here to the todo list // to check later that it arrived correctly, and if not // we will fecth it from someone. - self.put_to_resync(hash, 2 * BLOCK_RW_TIMEOUT)?; + let this = self.clone(); + tokio::spawn(async move { + if let Err(e) = this.put_to_resync(&hash, 2 * BLOCK_RW_TIMEOUT) { + error!("Block {:?} could not be put in resync queue: {}.", hash, e); + } + }); } Ok(()) } /// Decrement the number of time a block is used - pub fn block_decref(&self, hash: &Hash) -> Result<(), Error> { - if self.rc.block_decref(hash)? { + pub fn block_decref( + self: &Arc, + tx: &mut db::Transaction, + hash: Hash, + ) -> db::TxOpResult<()> { + if self.rc.block_decref(tx, &hash)? { // When the RC is decremented, it might drop to zero, // indicating that we don't need the block. // There is a delay before we garbage collect it; // make sure that it is handled in the resync loop // after that delay has passed. 
- self.put_to_resync(hash, BLOCK_GC_DELAY + Duration::from_secs(10))?; + let this = self.clone(); + tokio::spawn(async move { + if let Err(e) = this.put_to_resync(&hash, BLOCK_GC_DELAY + Duration::from_secs(10)) + { + error!("Block {:?} could not be put in resync queue: {}.", hash, e); + } + }); } Ok(()) } @@ -503,12 +565,12 @@ impl BlockManager { }); } - fn put_to_resync(&self, hash: &Hash, delay: Duration) -> Result<(), sled::Error> { + fn put_to_resync(&self, hash: &Hash, delay: Duration) -> db::Result<()> { let when = now_msec() + delay.as_millis() as u64; self.put_to_resync_at(hash, when) } - fn put_to_resync_at(&self, hash: &Hash, when: u64) -> Result<(), sled::Error> { + fn put_to_resync_at(&self, hash: &Hash, when: u64) -> db::Result<()> { trace!("Put resync_queue: {} {:?}", when, hash); let mut key = u64::to_be_bytes(when).to_vec(); key.extend(hash.as_ref()); @@ -547,13 +609,8 @@ impl BlockManager { // - Ok(true) -> a block was processed (successfully or not) // - Ok(false) -> no block was processed, but we are ready for the next iteration // - Err(_) -> a Sled error occurred when reading/writing from resync_queue/resync_errors - async fn resync_iter( - &self, - must_exit: &mut watch::Receiver, - ) -> Result { - if let Some(first_pair_res) = self.resync_queue.iter().next() { - let (time_bytes, hash_bytes) = first_pair_res?; - + async fn resync_iter(&self, must_exit: &mut watch::Receiver) -> Result { + if let Some((time_bytes, hash_bytes)) = self.resync_queue.first()? { let time_msec = u64::from_be_bytes(time_bytes[0..8].try_into().unwrap()); let now = now_msec(); @@ -561,7 +618,7 @@ impl BlockManager { let hash = Hash::try_from(&hash_bytes[..]).unwrap(); if let Some(ec) = self.resync_errors.get(hash.as_slice())? { - let ec = ErrorCounter::decode(ec); + let ec = ErrorCounter::decode(&ec); if now < ec.next_try() { // if next retry after an error is not yet, // don't do resync and return early, but still @@ -602,7 +659,7 @@ impl BlockManager { warn!("Error when resyncing {:?}: {}", hash, e); let err_counter = match self.resync_errors.get(hash.as_slice())? 
{ - Some(ec) => ErrorCounter::decode(ec).add1(now + 1), + Some(ec) => ErrorCounter::decode(&ec).add1(now + 1), None => ErrorCounter::new(now + 1), }; @@ -966,7 +1023,7 @@ impl ErrorCounter { } } - fn decode(data: sled::IVec) -> Self { + fn decode(data: &[u8]) -> Self { Self { errors: u64::from_be_bytes(data[0..8].try_into().unwrap()), last_try: u64::from_be_bytes(data[8..16].try_into().unwrap()), diff --git a/src/block/metrics.rs b/src/block/metrics.rs index f0f541a3..477add66 100644 --- a/src/block/metrics.rs +++ b/src/block/metrics.rs @@ -1,6 +1,6 @@ use opentelemetry::{global, metrics::*}; -use garage_util::sled_counter::SledCountedTree; +use garage_db::counted_tree_hack::CountedTree; /// TableMetrics reference all counter used for metrics pub struct BlockManagerMetrics { @@ -23,7 +23,7 @@ pub struct BlockManagerMetrics { } impl BlockManagerMetrics { - pub fn new(resync_queue: SledCountedTree, resync_errors: SledCountedTree) -> Self { + pub fn new(resync_queue: CountedTree, resync_errors: CountedTree) -> Self { let meter = global::meter("garage_model/block"); Self { _resync_queue_len: meter diff --git a/src/block/rc.rs b/src/block/rc.rs index ec3ea44e..ce6defad 100644 --- a/src/block/rc.rs +++ b/src/block/rc.rs @@ -1,5 +1,7 @@ use std::convert::TryInto; +use garage_db as db; + use garage_util::data::*; use garage_util::error::*; use garage_util::time::*; @@ -7,31 +9,41 @@ use garage_util::time::*; use crate::manager::BLOCK_GC_DELAY; pub struct BlockRc { - pub(crate) rc: sled::Tree, + pub(crate) rc: db::Tree, } impl BlockRc { - pub(crate) fn new(rc: sled::Tree) -> Self { + pub(crate) fn new(rc: db::Tree) -> Self { Self { rc } } /// Increment the reference counter associated to a hash. /// Returns true if the RC goes from zero to nonzero. - pub(crate) fn block_incref(&self, hash: &Hash) -> Result { - let old_rc = self - .rc - .fetch_and_update(&hash, |old| RcEntry::parse_opt(old).increment().serialize())?; - let old_rc = RcEntry::parse_opt(old_rc); + pub(crate) fn block_incref( + &self, + tx: &mut db::Transaction, + hash: &Hash, + ) -> db::TxOpResult { + let old_rc = RcEntry::parse_opt(tx.get(&self.rc, &hash)?); + match old_rc.increment().serialize() { + Some(x) => tx.insert(&self.rc, &hash, x)?, + None => unreachable!(), + }; Ok(old_rc.is_zero()) } /// Decrement the reference counter associated to a hash. /// Returns true if the RC is now zero. - pub(crate) fn block_decref(&self, hash: &Hash) -> Result { - let new_rc = self - .rc - .update_and_fetch(&hash, |old| RcEntry::parse_opt(old).decrement().serialize())?; - let new_rc = RcEntry::parse_opt(new_rc); + pub(crate) fn block_decref( + &self, + tx: &mut db::Transaction, + hash: &Hash, + ) -> db::TxOpResult { + let new_rc = RcEntry::parse_opt(tx.get(&self.rc, &hash)?).decrement(); + match new_rc.serialize() { + Some(x) => tx.insert(&self.rc, &hash, x)?, + None => tx.remove(&self.rc, &hash)?, + }; Ok(matches!(new_rc, RcEntry::Deletable { .. 
})) } @@ -44,12 +56,15 @@ impl BlockRc { /// deletion time has passed pub(crate) fn clear_deleted_block_rc(&self, hash: &Hash) -> Result<(), Error> { let now = now_msec(); - self.rc.update_and_fetch(&hash, |rcval| { - let updated = match RcEntry::parse_opt(rcval) { - RcEntry::Deletable { at_time } if now > at_time => RcEntry::Absent, - v => v, + self.rc.db().transaction(|mut tx| { + let rcval = RcEntry::parse_opt(tx.get(&self.rc, &hash)?); + match rcval { + RcEntry::Deletable { at_time } if now > at_time => { + tx.remove(&self.rc, &hash)?; + } + _ => (), }; - updated.serialize() + tx.commit(()) })?; Ok(()) } diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml new file mode 100644 index 00000000..6d8f64be --- /dev/null +++ b/src/db/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "garage_db" +version = "0.8.0" +authors = ["Alex Auvolat "] +edition = "2018" +license = "AGPL-3.0" +description = "Abstraction over multiple key/value storage engines that supports transactions" +repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage" +readme = "../../README.md" + +[lib] +path = "lib.rs" + +[[bin]] +name = "convert" +path = "bin/convert.rs" +required-features = ["cli"] + +[dependencies] +err-derive = "0.3" +hexdump = "0.1" +log = "0.4" + +heed = "0.11" +rusqlite = { version = "0.27", features = ["bundled"] } +sled = "0.34" + +# cli deps +clap = { version = "3.1.18", optional = true, features = ["derive", "env"] } +pretty_env_logger = { version = "0.4", optional = true } + +[dev-dependencies] +mktemp = "0.4" + +[features] +cli = ["clap", "pretty_env_logger"] diff --git a/src/db/bin/convert.rs b/src/db/bin/convert.rs new file mode 100644 index 00000000..9e45e61f --- /dev/null +++ b/src/db/bin/convert.rs @@ -0,0 +1,76 @@ +use std::path::PathBuf; + +use garage_db::*; + +use clap::Parser; + +/// K2V command line interface +#[derive(Parser, Debug)] +#[clap(author, version, about, long_about = None)] +struct Args { + /// Input DB path + #[clap(short = 'i')] + input_path: PathBuf, + /// Input DB engine + #[clap(short = 'a')] + input_engine: String, + + /// Output DB path + #[clap(short = 'o')] + output_path: PathBuf, + /// Output DB engine + #[clap(short = 'b')] + output_engine: String, +} + +fn main() { + let args = Args::parse(); + pretty_env_logger::init(); + + match do_conversion(args) { + Ok(()) => println!("Success!"), + Err(e) => eprintln!("Error: {}", e), + } +} + +fn do_conversion(args: Args) -> Result<()> { + let input = open_db(args.input_path, args.input_engine)?; + let output = open_db(args.output_path, args.output_engine)?; + output.import(&input)?; + Ok(()) +} + +fn open_db(path: PathBuf, engine: String) -> Result { + match engine.as_str() { + "sled" => { + let db = sled_adapter::sled::Config::default().path(&path).open()?; + Ok(sled_adapter::SledDb::init(db)) + } + "sqlite" | "sqlite3" | "rusqlite" => { + let db = sqlite_adapter::rusqlite::Connection::open(&path)?; + Ok(sqlite_adapter::SqliteDb::init(db)) + } + "lmdb" | "heed" => { + std::fs::create_dir_all(&path).map_err(|e| { + Error(format!("Unable to create LMDB data directory: {}", e).into()) + })?; + + let map_size = if u32::MAX as usize == usize::MAX { + eprintln!( + "LMDB is not recommended on 32-bit systems, database size will be limited" + ); + 1usize << 30 // 1GB for 32-bit systems + } else { + 1usize << 40 // 1TB for 64-bit systems + }; + + let db = lmdb_adapter::heed::EnvOpenOptions::new() + .max_dbs(100) + .map_size(map_size) + .open(&path) + .unwrap(); + Ok(lmdb_adapter::LmdbDb::init(db)) + } + e => 
Err(Error(format!("Invalid DB engine: {}", e).into())), + } +} diff --git a/src/db/counted_tree_hack.rs b/src/db/counted_tree_hack.rs new file mode 100644 index 00000000..bbe943a2 --- /dev/null +++ b/src/db/counted_tree_hack.rs @@ -0,0 +1,127 @@ +//! This hack allows a db tree to keep in RAM a counter of the number of entries +//! it contains, which is used to call .len() on it. This is usefull only for +//! the sled backend where .len() otherwise would have to traverse the whole +//! tree to count items. For sqlite and lmdb, this is mostly useless (but +//! hopefully not harmfull!). Note that a CountedTree cannot be part of a +//! transaction. + +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, +}; + +use crate::{Result, Tree, TxError, Value, ValueIter}; + +#[derive(Clone)] +pub struct CountedTree(Arc); + +struct CountedTreeInternal { + tree: Tree, + len: AtomicUsize, +} + +impl CountedTree { + pub fn new(tree: Tree) -> Result { + let len = tree.len()?; + Ok(Self(Arc::new(CountedTreeInternal { + tree, + len: AtomicUsize::new(len), + }))) + } + + pub fn len(&self) -> usize { + self.0.len.load(Ordering::SeqCst) + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn get>(&self, key: K) -> Result> { + self.0.tree.get(key) + } + + pub fn first(&self) -> Result> { + self.0.tree.first() + } + + pub fn iter(&self) -> Result> { + self.0.tree.iter() + } + + // ---- writing functions ---- + + pub fn insert(&self, key: K, value: V) -> Result> + where + K: AsRef<[u8]>, + V: AsRef<[u8]>, + { + let old_val = self.0.tree.insert(key, value)?; + if old_val.is_none() { + self.0.len.fetch_add(1, Ordering::SeqCst); + } + Ok(old_val) + } + + pub fn remove>(&self, key: K) -> Result> { + let old_val = self.0.tree.remove(key)?; + if old_val.is_some() { + self.0.len.fetch_sub(1, Ordering::SeqCst); + } + Ok(old_val) + } + + pub fn compare_and_swap( + &self, + key: K, + expected_old: Option, + new: Option, + ) -> Result + where + K: AsRef<[u8]>, + OV: AsRef<[u8]>, + NV: AsRef<[u8]>, + { + let old_some = expected_old.is_some(); + let new_some = new.is_some(); + + let tx_res = self.0.tree.db().transaction(|mut tx| { + let old_val = tx.get(&self.0.tree, &key)?; + let is_same = match (&old_val, &expected_old) { + (None, None) => true, + (Some(x), Some(y)) if x == y.as_ref() => true, + _ => false, + }; + if is_same { + match &new { + Some(v) => { + tx.insert(&self.0.tree, &key, v)?; + } + None => { + tx.remove(&self.0.tree, &key)?; + } + } + tx.commit(()) + } else { + tx.abort(()) + } + }); + + match tx_res { + Ok(()) => { + match (old_some, new_some) { + (false, true) => { + self.0.len.fetch_add(1, Ordering::SeqCst); + } + (true, false) => { + self.0.len.fetch_sub(1, Ordering::SeqCst); + } + _ => (), + } + Ok(true) + } + Err(TxError::Abort(())) => Ok(false), + Err(TxError::Db(e)) => Err(e), + } + } +} diff --git a/src/db/lib.rs b/src/db/lib.rs new file mode 100644 index 00000000..e9d3ea18 --- /dev/null +++ b/src/db/lib.rs @@ -0,0 +1,400 @@ +pub mod lmdb_adapter; +pub mod sled_adapter; +pub mod sqlite_adapter; + +pub mod counted_tree_hack; + +#[cfg(test)] +pub mod test; + +use core::ops::{Bound, RangeBounds}; + +use std::borrow::Cow; +use std::cell::Cell; +use std::sync::Arc; + +use err_derive::Error; + +#[derive(Clone)] +pub struct Db(pub(crate) Arc); + +pub struct Transaction<'a>(&'a mut dyn ITx); + +#[derive(Clone)] +pub struct Tree(Arc, usize); + +pub type Value = Vec; +pub type ValueIter<'a> = Box> + 'a>; +pub type TxValueIter<'a> = Box> + 'a>; + +// ---- + +#[derive(Debug, Error)] 
+#[error(display = "{}", _0)] +pub struct Error(pub Cow<'static, str>); + +pub type Result = std::result::Result; + +#[derive(Debug, Error)] +#[error(display = "{}", _0)] +pub struct TxOpError(pub(crate) Error); +pub type TxOpResult = std::result::Result; + +pub enum TxError { + Abort(E), + Db(Error), +} +pub type TxResult = std::result::Result>; + +impl From for TxError { + fn from(e: TxOpError) -> TxError { + TxError::Db(e.0) + } +} + +pub fn unabort(res: TxResult) -> TxOpResult> { + match res { + Ok(v) => Ok(Ok(v)), + Err(TxError::Abort(e)) => Ok(Err(e)), + Err(TxError::Db(e)) => Err(TxOpError(e)), + } +} + +// ---- + +impl Db { + pub fn engine(&self) -> String { + self.0.engine() + } + + pub fn open_tree>(&self, name: S) -> Result { + let tree_id = self.0.open_tree(name.as_ref())?; + Ok(Tree(self.0.clone(), tree_id)) + } + + pub fn list_trees(&self) -> Result> { + self.0.list_trees() + } + + pub fn transaction(&self, fun: F) -> TxResult + where + F: Fn(Transaction<'_>) -> TxResult, + { + let f = TxFn { + function: fun, + result: Cell::new(None), + }; + let tx_res = self.0.transaction(&f); + let ret = f + .result + .into_inner() + .expect("Transaction did not store result"); + + match tx_res { + Ok(()) => { + assert!(matches!(ret, Ok(_))); + ret + } + Err(TxError::Abort(())) => { + assert!(matches!(ret, Err(TxError::Abort(_)))); + ret + } + Err(TxError::Db(e2)) => match ret { + // Ok was stored -> the error occured when finalizing + // transaction + Ok(_) => Err(TxError::Db(e2)), + // An error was already stored: that's the one we want to + // return + Err(TxError::Db(e)) => Err(TxError::Db(e)), + _ => unreachable!(), + }, + } + } + + pub fn import(&self, other: &Db) -> Result<()> { + let existing_trees = self.list_trees()?; + if !existing_trees.is_empty() { + return Err(Error( + format!( + "destination database already contains data: {:?}", + existing_trees + ) + .into(), + )); + } + + let tree_names = other.list_trees()?; + for name in tree_names { + let tree = self.open_tree(&name)?; + if tree.len()? > 0 { + return Err(Error(format!("tree {} already contains data", name).into())); + } + + let ex_tree = other.open_tree(&name)?; + + let tx_res = self.transaction(|mut tx| { + let mut i = 0; + for item in ex_tree.iter().map_err(TxError::Abort)? { + let (k, v) = item.map_err(TxError::Abort)?; + tx.insert(&tree, k, v)?; + i += 1; + if i % 1000 == 0 { + println!("{}: imported {}", name, i); + } + } + tx.commit(i) + }); + let total = match tx_res { + Err(TxError::Db(e)) => return Err(e), + Err(TxError::Abort(e)) => return Err(e), + Ok(x) => x, + }; + + println!("{}: finished importing, {} items", name, total); + } + Ok(()) + } +} + +#[allow(clippy::len_without_is_empty)] +impl Tree { + #[inline] + pub fn db(&self) -> Db { + Db(self.0.clone()) + } + + #[inline] + pub fn get>(&self, key: T) -> Result> { + self.0.get(self.1, key.as_ref()) + } + #[inline] + pub fn len(&self) -> Result { + self.0.len(self.1) + } + + #[inline] + pub fn first(&self) -> Result> { + self.iter()?.next().transpose() + } + #[inline] + pub fn get_gt>(&self, from: T) -> Result> { + self.range((Bound::Excluded(from), Bound::Unbounded))? 
+ .next() + .transpose() + } + + /// Returns the old value if there was one + #[inline] + pub fn insert, U: AsRef<[u8]>>( + &self, + key: T, + value: U, + ) -> Result> { + self.0.insert(self.1, key.as_ref(), value.as_ref()) + } + /// Returns the old value if there was one + #[inline] + pub fn remove>(&self, key: T) -> Result> { + self.0.remove(self.1, key.as_ref()) + } + + #[inline] + pub fn iter(&self) -> Result> { + self.0.iter(self.1) + } + #[inline] + pub fn iter_rev(&self) -> Result> { + self.0.iter_rev(self.1) + } + + #[inline] + pub fn range(&self, range: R) -> Result> + where + K: AsRef<[u8]>, + R: RangeBounds, + { + let sb = range.start_bound(); + let eb = range.end_bound(); + self.0.range(self.1, get_bound(sb), get_bound(eb)) + } + #[inline] + pub fn range_rev(&self, range: R) -> Result> + where + K: AsRef<[u8]>, + R: RangeBounds, + { + let sb = range.start_bound(); + let eb = range.end_bound(); + self.0.range_rev(self.1, get_bound(sb), get_bound(eb)) + } +} + +#[allow(clippy::len_without_is_empty)] +impl<'a> Transaction<'a> { + #[inline] + pub fn get>(&self, tree: &Tree, key: T) -> TxOpResult> { + self.0.get(tree.1, key.as_ref()) + } + #[inline] + pub fn len(&self, tree: &Tree) -> TxOpResult { + self.0.len(tree.1) + } + + /// Returns the old value if there was one + #[inline] + pub fn insert, U: AsRef<[u8]>>( + &mut self, + tree: &Tree, + key: T, + value: U, + ) -> TxOpResult> { + self.0.insert(tree.1, key.as_ref(), value.as_ref()) + } + /// Returns the old value if there was one + #[inline] + pub fn remove>(&mut self, tree: &Tree, key: T) -> TxOpResult> { + self.0.remove(tree.1, key.as_ref()) + } + + #[inline] + pub fn iter(&self, tree: &Tree) -> TxOpResult> { + self.0.iter(tree.1) + } + #[inline] + pub fn iter_rev(&self, tree: &Tree) -> TxOpResult> { + self.0.iter_rev(tree.1) + } + + #[inline] + pub fn range(&self, tree: &Tree, range: R) -> TxOpResult> + where + K: AsRef<[u8]>, + R: RangeBounds, + { + let sb = range.start_bound(); + let eb = range.end_bound(); + self.0.range(tree.1, get_bound(sb), get_bound(eb)) + } + #[inline] + pub fn range_rev(&self, tree: &Tree, range: R) -> TxOpResult> + where + K: AsRef<[u8]>, + R: RangeBounds, + { + let sb = range.start_bound(); + let eb = range.end_bound(); + self.0.range_rev(tree.1, get_bound(sb), get_bound(eb)) + } + + // ---- + + #[inline] + pub fn abort(self, e: E) -> TxResult { + Err(TxError::Abort(e)) + } + + #[inline] + pub fn commit(self, r: R) -> TxResult { + Ok(r) + } +} + +// ---- Internal interfaces + +pub(crate) trait IDb: Send + Sync { + fn engine(&self) -> String; + fn open_tree(&self, name: &str) -> Result; + fn list_trees(&self) -> Result>; + + fn get(&self, tree: usize, key: &[u8]) -> Result>; + fn len(&self, tree: usize) -> Result; + + fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result>; + fn remove(&self, tree: usize, key: &[u8]) -> Result>; + + fn iter(&self, tree: usize) -> Result>; + fn iter_rev(&self, tree: usize) -> Result>; + + fn range<'r>( + &self, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, + ) -> Result>; + fn range_rev<'r>( + &self, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, + ) -> Result>; + + fn transaction(&self, f: &dyn ITxFn) -> TxResult<(), ()>; +} + +pub(crate) trait ITx { + fn get(&self, tree: usize, key: &[u8]) -> TxOpResult>; + fn len(&self, tree: usize) -> TxOpResult; + + fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult>; + fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult>; + + fn iter(&self, tree: 
usize) -> TxOpResult>; + fn iter_rev(&self, tree: usize) -> TxOpResult>; + + fn range<'r>( + &self, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, + ) -> TxOpResult>; + fn range_rev<'r>( + &self, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, + ) -> TxOpResult>; +} + +pub(crate) trait ITxFn { + fn try_on(&self, tx: &mut dyn ITx) -> TxFnResult; +} + +pub(crate) enum TxFnResult { + Ok, + Abort, + DbErr, +} + +struct TxFn +where + F: Fn(Transaction<'_>) -> TxResult, +{ + function: F, + result: Cell>>, +} + +impl ITxFn for TxFn +where + F: Fn(Transaction<'_>) -> TxResult, +{ + fn try_on(&self, tx: &mut dyn ITx) -> TxFnResult { + let res = (self.function)(Transaction(tx)); + let res2 = match &res { + Ok(_) => TxFnResult::Ok, + Err(TxError::Abort(_)) => TxFnResult::Abort, + Err(TxError::Db(_)) => TxFnResult::DbErr, + }; + self.result.set(Some(res)); + res2 + } +} + +// ---- + +fn get_bound>(b: Bound<&K>) -> Bound<&[u8]> { + match b { + Bound::Included(v) => Bound::Included(v.as_ref()), + Bound::Excluded(v) => Bound::Excluded(v.as_ref()), + Bound::Unbounded => Bound::Unbounded, + } +} diff --git a/src/db/lmdb_adapter.rs b/src/db/lmdb_adapter.rs new file mode 100644 index 00000000..74622919 --- /dev/null +++ b/src/db/lmdb_adapter.rs @@ -0,0 +1,329 @@ +use core::ops::Bound; +use core::ptr::NonNull; + +use std::collections::HashMap; +use std::convert::TryInto; +use std::sync::{Arc, RwLock}; + +use heed::types::ByteSlice; +use heed::{BytesDecode, Env, RoTxn, RwTxn, UntypedDatabase as Database}; + +use crate::{ + Db, Error, IDb, ITx, ITxFn, Result, TxError, TxFnResult, TxOpError, TxOpResult, TxResult, + TxValueIter, Value, ValueIter, +}; + +pub use heed; + +// -- err + +impl From for Error { + fn from(e: heed::Error) -> Error { + Error(format!("LMDB: {}", e).into()) + } +} + +impl From for TxOpError { + fn from(e: heed::Error) -> TxOpError { + TxOpError(e.into()) + } +} + +// -- db + +pub struct LmdbDb { + db: heed::Env, + trees: RwLock<(Vec, HashMap)>, +} + +impl LmdbDb { + pub fn init(db: Env) -> Db { + let s = Self { + db, + trees: RwLock::new((Vec::new(), HashMap::new())), + }; + Db(Arc::new(s)) + } + + fn get_tree(&self, i: usize) -> Result { + self.trees + .read() + .unwrap() + .0 + .get(i) + .cloned() + .ok_or_else(|| Error("invalid tree id".into())) + } +} + +impl IDb for LmdbDb { + fn engine(&self) -> String { + "LMDB (using Heed crate)".into() + } + + fn open_tree(&self, name: &str) -> Result { + let mut trees = self.trees.write().unwrap(); + if let Some(i) = trees.1.get(name) { + Ok(*i) + } else { + let tree = self.db.create_database(Some(name))?; + let i = trees.0.len(); + trees.0.push(tree); + trees.1.insert(name.to_string(), i); + Ok(i) + } + } + + fn list_trees(&self) -> Result> { + let tree0 = match self.db.open_database::(None)? { + Some(x) => x, + None => return Ok(vec![]), + }; + + let mut ret = vec![]; + let tx = self.db.read_txn()?; + for item in tree0.iter(&tx)? { + let (tree_name, _) = item?; + ret.push(tree_name.to_string()); + } + drop(tx); + + let mut ret2 = vec![]; + for tree_name in ret { + if self + .db + .open_database::(Some(&tree_name))? 
+ .is_some() + { + ret2.push(tree_name); + } + } + + Ok(ret2) + } + + // ---- + + fn get(&self, tree: usize, key: &[u8]) -> Result> { + let tree = self.get_tree(tree)?; + + let tx = self.db.read_txn()?; + let val = tree.get(&tx, key)?; + match val { + None => Ok(None), + Some(v) => Ok(Some(v.to_vec())), + } + } + + fn len(&self, tree: usize) -> Result { + let tree = self.get_tree(tree)?; + let tx = self.db.read_txn()?; + Ok(tree.len(&tx)?.try_into().unwrap()) + } + + fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result> { + let tree = self.get_tree(tree)?; + let mut tx = self.db.write_txn()?; + let old_val = tree.get(&tx, key)?.map(Vec::from); + tree.put(&mut tx, key, value)?; + tx.commit()?; + Ok(old_val) + } + + fn remove(&self, tree: usize, key: &[u8]) -> Result> { + let tree = self.get_tree(tree)?; + let mut tx = self.db.write_txn()?; + let old_val = tree.get(&tx, key)?.map(Vec::from); + tree.delete(&mut tx, key)?; + tx.commit()?; + Ok(old_val) + } + + fn iter(&self, tree: usize) -> Result> { + let tree = self.get_tree(tree)?; + let tx = self.db.read_txn()?; + TxAndIterator::make(tx, |tx| Ok(tree.iter(tx)?)) + } + + fn iter_rev(&self, tree: usize) -> Result> { + let tree = self.get_tree(tree)?; + let tx = self.db.read_txn()?; + TxAndIterator::make(tx, |tx| Ok(tree.rev_iter(tx)?)) + } + + fn range<'r>( + &self, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, + ) -> Result> { + let tree = self.get_tree(tree)?; + let tx = self.db.read_txn()?; + TxAndIterator::make(tx, |tx| Ok(tree.range(tx, &(low, high))?)) + } + fn range_rev<'r>( + &self, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, + ) -> Result> { + let tree = self.get_tree(tree)?; + let tx = self.db.read_txn()?; + TxAndIterator::make(tx, |tx| Ok(tree.rev_range(tx, &(low, high))?)) + } + + // ---- + + fn transaction(&self, f: &dyn ITxFn) -> TxResult<(), ()> { + let trees = self.trees.read().unwrap(); + let mut tx = LmdbTx { + trees: &trees.0[..], + tx: self + .db + .write_txn() + .map_err(Error::from) + .map_err(TxError::Db)?, + }; + + let res = f.try_on(&mut tx); + match res { + TxFnResult::Ok => { + tx.tx.commit().map_err(Error::from).map_err(TxError::Db)?; + Ok(()) + } + TxFnResult::Abort => { + tx.tx.abort().map_err(Error::from).map_err(TxError::Db)?; + Err(TxError::Abort(())) + } + TxFnResult::DbErr => { + tx.tx.abort().map_err(Error::from).map_err(TxError::Db)?; + Err(TxError::Db(Error( + "(this message will be discarded)".into(), + ))) + } + } + } +} + +// ---- + +struct LmdbTx<'a> { + trees: &'a [Database], + tx: RwTxn<'a, 'a>, +} + +impl<'a> LmdbTx<'a> { + fn get_tree(&self, i: usize) -> TxOpResult<&Database> { + self.trees.get(i).ok_or_else(|| { + TxOpError(Error( + "invalid tree id (it might have been openned after the transaction started)".into(), + )) + }) + } +} + +impl<'a> ITx for LmdbTx<'a> { + fn get(&self, tree: usize, key: &[u8]) -> TxOpResult> { + let tree = self.get_tree(tree)?; + match tree.get(&self.tx, key)? 
{ + Some(v) => Ok(Some(v.to_vec())), + None => Ok(None), + } + } + fn len(&self, _tree: usize) -> TxOpResult { + unimplemented!(".len() in transaction not supported with LMDB backend") + } + + fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult> { + let tree = *self.get_tree(tree)?; + let old_val = tree.get(&self.tx, key)?.map(Vec::from); + tree.put(&mut self.tx, key, value)?; + Ok(old_val) + } + fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult> { + let tree = *self.get_tree(tree)?; + let old_val = tree.get(&self.tx, key)?.map(Vec::from); + tree.delete(&mut self.tx, key)?; + Ok(old_val) + } + + fn iter(&self, _tree: usize) -> TxOpResult> { + unimplemented!("Iterators in transactions not supported with LMDB backend"); + } + fn iter_rev(&self, _tree: usize) -> TxOpResult> { + unimplemented!("Iterators in transactions not supported with LMDB backend"); + } + + fn range<'r>( + &self, + _tree: usize, + _low: Bound<&'r [u8]>, + _high: Bound<&'r [u8]>, + ) -> TxOpResult> { + unimplemented!("Iterators in transactions not supported with LMDB backend"); + } + fn range_rev<'r>( + &self, + _tree: usize, + _low: Bound<&'r [u8]>, + _high: Bound<&'r [u8]>, + ) -> TxOpResult> { + unimplemented!("Iterators in transactions not supported with LMDB backend"); + } +} + +// ---- + +type IteratorItem<'a> = heed::Result<( + >::DItem, + >::DItem, +)>; + +struct TxAndIterator<'a, I> +where + I: Iterator> + 'a, +{ + tx: RoTxn<'a>, + iter: Option, +} + +impl<'a, I> TxAndIterator<'a, I> +where + I: Iterator> + 'a, +{ + fn make(tx: RoTxn<'a>, iterfun: F) -> Result> + where + F: FnOnce(&'a RoTxn<'a>) -> Result, + { + let mut res = TxAndIterator { tx, iter: None }; + + let tx = unsafe { NonNull::from(&res.tx).as_ref() }; + res.iter = Some(iterfun(tx)?); + + Ok(Box::new(res)) + } +} + +impl<'a, I> Drop for TxAndIterator<'a, I> +where + I: Iterator> + 'a, +{ + fn drop(&mut self) { + drop(self.iter.take()); + } +} + +impl<'a, I> Iterator for TxAndIterator<'a, I> +where + I: Iterator> + 'a, +{ + type Item = Result<(Value, Value)>; + + fn next(&mut self) -> Option { + match self.iter.as_mut().unwrap().next() { + None => None, + Some(Err(e)) => Some(Err(e.into())), + Some(Ok((k, v))) => Some(Ok((k.to_vec(), v.to_vec()))), + } + } +} diff --git a/src/db/sled_adapter.rs b/src/db/sled_adapter.rs new file mode 100644 index 00000000..982f8d82 --- /dev/null +++ b/src/db/sled_adapter.rs @@ -0,0 +1,260 @@ +use core::ops::Bound; + +use std::cell::Cell; +use std::collections::HashMap; +use std::sync::{Arc, RwLock}; + +use sled::transaction::{ + ConflictableTransactionError, TransactionError, Transactional, TransactionalTree, + UnabortableTransactionError, +}; + +use crate::{ + Db, Error, IDb, ITx, ITxFn, Result, TxError, TxFnResult, TxOpError, TxOpResult, TxResult, + TxValueIter, Value, ValueIter, +}; + +pub use sled; + +// -- err + +impl From for Error { + fn from(e: sled::Error) -> Error { + Error(format!("Sled: {}", e).into()) + } +} + +impl From for TxOpError { + fn from(e: sled::Error) -> TxOpError { + TxOpError(e.into()) + } +} + +// -- db + +pub struct SledDb { + db: sled::Db, + trees: RwLock<(Vec, HashMap)>, +} + +impl SledDb { + pub fn init(db: sled::Db) -> Db { + let s = Self { + db, + trees: RwLock::new((Vec::new(), HashMap::new())), + }; + Db(Arc::new(s)) + } + + fn get_tree(&self, i: usize) -> Result { + self.trees + .read() + .unwrap() + .0 + .get(i) + .cloned() + .ok_or_else(|| Error("invalid tree id".into())) + } +} + +impl IDb for SledDb { + fn engine(&self) -> String { + "Sled".into() + 
} + + fn open_tree(&self, name: &str) -> Result { + let mut trees = self.trees.write().unwrap(); + if let Some(i) = trees.1.get(name) { + Ok(*i) + } else { + let tree = self.db.open_tree(name)?; + let i = trees.0.len(); + trees.0.push(tree); + trees.1.insert(name.to_string(), i); + Ok(i) + } + } + + fn list_trees(&self) -> Result> { + let mut trees = vec![]; + for name in self.db.tree_names() { + let name = std::str::from_utf8(&name) + .map_err(|e| Error(format!("{}", e).into()))? + .to_string(); + if name != "__sled__default" { + trees.push(name); + } + } + Ok(trees) + } + + // ---- + + fn get(&self, tree: usize, key: &[u8]) -> Result> { + let tree = self.get_tree(tree)?; + let val = tree.get(key)?; + Ok(val.map(|x| x.to_vec())) + } + + fn len(&self, tree: usize) -> Result { + let tree = self.get_tree(tree)?; + Ok(tree.len()) + } + + fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result> { + let tree = self.get_tree(tree)?; + let old_val = tree.insert(key, value)?; + Ok(old_val.map(|x| x.to_vec())) + } + + fn remove(&self, tree: usize, key: &[u8]) -> Result> { + let tree = self.get_tree(tree)?; + let old_val = tree.remove(key)?; + Ok(old_val.map(|x| x.to_vec())) + } + + fn iter(&self, tree: usize) -> Result> { + let tree = self.get_tree(tree)?; + Ok(Box::new(tree.iter().map(|v| { + v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into) + }))) + } + + fn iter_rev(&self, tree: usize) -> Result> { + let tree = self.get_tree(tree)?; + Ok(Box::new(tree.iter().rev().map(|v| { + v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into) + }))) + } + + fn range<'r>( + &self, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, + ) -> Result> { + let tree = self.get_tree(tree)?; + Ok(Box::new(tree.range::<&'r [u8], _>((low, high)).map(|v| { + v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into) + }))) + } + fn range_rev<'r>( + &self, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, + ) -> Result> { + let tree = self.get_tree(tree)?; + Ok(Box::new(tree.range::<&'r [u8], _>((low, high)).rev().map( + |v| v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into), + ))) + } + + // ---- + + fn transaction(&self, f: &dyn ITxFn) -> TxResult<(), ()> { + let trees = self.trees.read().unwrap(); + let res = trees.0.transaction(|txtrees| { + let mut tx = SledTx { + trees: txtrees, + err: Cell::new(None), + }; + match f.try_on(&mut tx) { + TxFnResult::Ok => { + assert!(tx.err.into_inner().is_none()); + Ok(()) + } + TxFnResult::Abort => { + assert!(tx.err.into_inner().is_none()); + Err(ConflictableTransactionError::Abort(())) + } + TxFnResult::DbErr => { + let e = tx.err.into_inner().expect("No DB error"); + Err(e.into()) + } + } + }); + match res { + Ok(()) => Ok(()), + Err(TransactionError::Abort(())) => Err(TxError::Abort(())), + Err(TransactionError::Storage(s)) => Err(TxError::Db(s.into())), + } + } +} + +// ---- + +struct SledTx<'a> { + trees: &'a [TransactionalTree], + err: Cell>, +} + +impl<'a> SledTx<'a> { + fn get_tree(&self, i: usize) -> TxOpResult<&TransactionalTree> { + self.trees.get(i).ok_or_else(|| { + TxOpError(Error( + "invalid tree id (it might have been openned after the transaction started)".into(), + )) + }) + } + + fn save_error( + &self, + v: std::result::Result, + ) -> TxOpResult { + match v { + Ok(x) => Ok(x), + Err(e) => { + let txt = format!("{}", e); + self.err.set(Some(e)); + Err(TxOpError(Error(txt.into()))) + } + } + } +} + +impl<'a> ITx for SledTx<'a> { + fn get(&self, tree: usize, key: &[u8]) -> TxOpResult> { + let tree = 
self.get_tree(tree)?; + let tmp = self.save_error(tree.get(key))?; + Ok(tmp.map(|x| x.to_vec())) + } + fn len(&self, _tree: usize) -> TxOpResult { + unimplemented!(".len() in transaction not supported with Sled backend") + } + + fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult> { + let tree = self.get_tree(tree)?; + let old_val = self.save_error(tree.insert(key, value))?; + Ok(old_val.map(|x| x.to_vec())) + } + fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult> { + let tree = self.get_tree(tree)?; + let old_val = self.save_error(tree.remove(key))?; + Ok(old_val.map(|x| x.to_vec())) + } + + fn iter(&self, _tree: usize) -> TxOpResult> { + unimplemented!("Iterators in transactions not supported with Sled backend"); + } + fn iter_rev(&self, _tree: usize) -> TxOpResult> { + unimplemented!("Iterators in transactions not supported with Sled backend"); + } + + fn range<'r>( + &self, + _tree: usize, + _low: Bound<&'r [u8]>, + _high: Bound<&'r [u8]>, + ) -> TxOpResult> { + unimplemented!("Iterators in transactions not supported with Sled backend"); + } + fn range_rev<'r>( + &self, + _tree: usize, + _low: Bound<&'r [u8]>, + _high: Bound<&'r [u8]>, + ) -> TxOpResult> { + unimplemented!("Iterators in transactions not supported with Sled backend"); + } +} diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs new file mode 100644 index 00000000..14bf35ff --- /dev/null +++ b/src/db/sqlite_adapter.rs @@ -0,0 +1,500 @@ +use core::ops::Bound; + +use std::borrow::BorrowMut; +use std::marker::PhantomPinned; +use std::pin::Pin; +use std::ptr::NonNull; +use std::sync::{Arc, Mutex, MutexGuard}; + +use log::trace; + +use rusqlite::{params, Connection, Rows, Statement, Transaction}; + +use crate::{ + Db, Error, IDb, ITx, ITxFn, Result, TxError, TxFnResult, TxOpError, TxOpResult, TxResult, + TxValueIter, Value, ValueIter, +}; + +pub use rusqlite; + +// --- err + +impl From for Error { + fn from(e: rusqlite::Error) -> Error { + Error(format!("Sqlite: {}", e).into()) + } +} + +impl From for TxOpError { + fn from(e: rusqlite::Error) -> TxOpError { + TxOpError(e.into()) + } +} + +// -- db + +pub struct SqliteDb(Mutex); + +struct SqliteDbInner { + db: Connection, + trees: Vec, +} + +impl SqliteDb { + pub fn init(db: rusqlite::Connection) -> Db { + let s = Self(Mutex::new(SqliteDbInner { + db, + trees: Vec::new(), + })); + Db(Arc::new(s)) + } +} + +impl SqliteDbInner { + fn get_tree(&self, i: usize) -> Result<&'_ str> { + self.trees + .get(i) + .map(String::as_str) + .ok_or_else(|| Error("invalid tree id".into())) + } + + fn internal_get(&self, tree: &str, key: &[u8]) -> Result> { + let mut stmt = self + .db + .prepare(&format!("SELECT v FROM {} WHERE k = ?1", tree))?; + let mut res_iter = stmt.query([key])?; + match res_iter.next()? 
{ + None => Ok(None), + Some(v) => Ok(Some(v.get::<_, Vec>(0)?)), + } + } +} + +impl IDb for SqliteDb { + fn engine(&self) -> String { + format!("sqlite3 v{} (using rusqlite crate)", rusqlite::version()) + } + + fn open_tree(&self, name: &str) -> Result { + let name = format!("tree_{}", name.replace(':', "_COLON_")); + let mut this = self.0.lock().unwrap(); + + if let Some(i) = this.trees.iter().position(|x| x == &name) { + Ok(i) + } else { + trace!("create table {}", name); + this.db.execute( + &format!( + "CREATE TABLE IF NOT EXISTS {} ( + k BLOB PRIMARY KEY, + v BLOB + )", + name + ), + [], + )?; + trace!("table created: {}, unlocking", name); + + let i = this.trees.len(); + this.trees.push(name.to_string()); + Ok(i) + } + } + + fn list_trees(&self) -> Result> { + let mut trees = vec![]; + + trace!("list_trees: lock db"); + let this = self.0.lock().unwrap(); + trace!("list_trees: lock acquired"); + + let mut stmt = this.db.prepare( + "SELECT name FROM sqlite_schema WHERE type = 'table' AND name LIKE 'tree_%'", + )?; + let mut rows = stmt.query([])?; + while let Some(row) = rows.next()? { + let name = row.get::<_, String>(0)?; + let name = name.replace("_COLON_", ":"); + let name = name.strip_prefix("tree_").unwrap().to_string(); + trees.push(name); + } + Ok(trees) + } + + // ---- + + fn get(&self, tree: usize, key: &[u8]) -> Result> { + trace!("get {}: lock db", tree); + let this = self.0.lock().unwrap(); + trace!("get {}: lock acquired", tree); + + let tree = this.get_tree(tree)?; + this.internal_get(tree, key) + } + + fn len(&self, tree: usize) -> Result { + trace!("len {}: lock db", tree); + let this = self.0.lock().unwrap(); + trace!("len {}: lock acquired", tree); + + let tree = this.get_tree(tree)?; + let mut stmt = this.db.prepare(&format!("SELECT COUNT(*) FROM {}", tree))?; + let mut res_iter = stmt.query([])?; + match res_iter.next()? 
{ + None => Ok(0), + Some(v) => Ok(v.get::<_, usize>(0)?), + } + } + + fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result> { + trace!("insert {}: lock db", tree); + let this = self.0.lock().unwrap(); + trace!("insert {}: lock acquired", tree); + + let tree = this.get_tree(tree)?; + let old_val = this.internal_get(tree, key)?; + + let sql = match &old_val { + Some(_) => format!("UPDATE {} SET v = ?2 WHERE k = ?1", tree), + None => format!("INSERT INTO {} (k, v) VALUES (?1, ?2)", tree), + }; + let n = this.db.execute(&sql, params![key, value])?; + assert_eq!(n, 1); + + Ok(old_val) + } + + fn remove(&self, tree: usize, key: &[u8]) -> Result> { + trace!("remove {}: lock db", tree); + let this = self.0.lock().unwrap(); + trace!("remove {}: lock acquired", tree); + + let tree = this.get_tree(tree)?; + let old_val = this.internal_get(tree, key)?; + + if old_val.is_some() { + let n = this + .db + .execute(&format!("DELETE FROM {} WHERE k = ?1", tree), params![key])?; + assert_eq!(n, 1); + } + + Ok(old_val) + } + + fn iter(&self, tree: usize) -> Result> { + trace!("iter {}: lock db", tree); + let this = self.0.lock().unwrap(); + trace!("iter {}: lock acquired", tree); + + let tree = this.get_tree(tree)?; + let sql = format!("SELECT k, v FROM {} ORDER BY k ASC", tree); + DbValueIterator::make(this, &sql, []) + } + + fn iter_rev(&self, tree: usize) -> Result> { + trace!("iter_rev {}: lock db", tree); + let this = self.0.lock().unwrap(); + trace!("iter_rev {}: lock acquired", tree); + + let tree = this.get_tree(tree)?; + let sql = format!("SELECT k, v FROM {} ORDER BY k DESC", tree); + DbValueIterator::make(this, &sql, []) + } + + fn range<'r>( + &self, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, + ) -> Result> { + trace!("range {}: lock db", tree); + let this = self.0.lock().unwrap(); + trace!("range {}: lock acquired", tree); + + let tree = this.get_tree(tree)?; + + let (bounds_sql, params) = bounds_sql(low, high); + let sql = format!("SELECT k, v FROM {} {} ORDER BY k ASC", tree, bounds_sql); + + let params = params + .iter() + .map(|x| x as &dyn rusqlite::ToSql) + .collect::>(); + + DbValueIterator::make::<&[&dyn rusqlite::ToSql]>(this, &sql, params.as_ref()) + } + fn range_rev<'r>( + &self, + tree: usize, + low: Bound<&'r [u8]>, + high: Bound<&'r [u8]>, + ) -> Result> { + trace!("range_rev {}: lock db", tree); + let this = self.0.lock().unwrap(); + trace!("range_rev {}: lock acquired", tree); + + let tree = this.get_tree(tree)?; + + let (bounds_sql, params) = bounds_sql(low, high); + let sql = format!("SELECT k, v FROM {} {} ORDER BY k DESC", tree, bounds_sql); + + let params = params + .iter() + .map(|x| x as &dyn rusqlite::ToSql) + .collect::>(); + + DbValueIterator::make::<&[&dyn rusqlite::ToSql]>(this, &sql, params.as_ref()) + } + + // ---- + + fn transaction(&self, f: &dyn ITxFn) -> TxResult<(), ()> { + trace!("transaction: lock db"); + let mut this = self.0.lock().unwrap(); + trace!("transaction: lock acquired"); + + let this_mut_ref: &mut SqliteDbInner = this.borrow_mut(); + + let mut tx = SqliteTx { + tx: this_mut_ref + .db + .transaction() + .map_err(Error::from) + .map_err(TxError::Db)?, + trees: &this_mut_ref.trees, + }; + let res = match f.try_on(&mut tx) { + TxFnResult::Ok => { + tx.tx.commit().map_err(Error::from).map_err(TxError::Db)?; + Ok(()) + } + TxFnResult::Abort => { + tx.tx.rollback().map_err(Error::from).map_err(TxError::Db)?; + Err(TxError::Abort(())) + } + TxFnResult::DbErr => { + 
tx.tx.rollback().map_err(Error::from).map_err(TxError::Db)?; + Err(TxError::Db(Error( + "(this message will be discarded)".into(), + ))) + } + }; + + trace!("transaction done"); + res + } +} + +// ---- + +struct SqliteTx<'a> { + tx: Transaction<'a>, + trees: &'a [String], +} + +impl<'a> SqliteTx<'a> { + fn get_tree(&self, i: usize) -> TxOpResult<&'_ str> { + self.trees.get(i).map(String::as_ref).ok_or_else(|| { + TxOpError(Error( + "invalid tree id (it might have been openned after the transaction started)".into(), + )) + }) + } + + fn internal_get(&self, tree: &str, key: &[u8]) -> TxOpResult> { + let mut stmt = self + .tx + .prepare(&format!("SELECT v FROM {} WHERE k = ?1", tree))?; + let mut res_iter = stmt.query([key])?; + match res_iter.next()? { + None => Ok(None), + Some(v) => Ok(Some(v.get::<_, Vec>(0)?)), + } + } +} + +impl<'a> ITx for SqliteTx<'a> { + fn get(&self, tree: usize, key: &[u8]) -> TxOpResult> { + let tree = self.get_tree(tree)?; + self.internal_get(tree, key) + } + fn len(&self, tree: usize) -> TxOpResult { + let tree = self.get_tree(tree)?; + let mut stmt = self.tx.prepare(&format!("SELECT COUNT(*) FROM {}", tree))?; + let mut res_iter = stmt.query([])?; + match res_iter.next()? { + None => Ok(0), + Some(v) => Ok(v.get::<_, usize>(0)?), + } + } + + fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult> { + let tree = self.get_tree(tree)?; + let old_val = self.internal_get(tree, key)?; + + let sql = match &old_val { + Some(_) => format!("UPDATE {} SET v = ?2 WHERE k = ?1", tree), + None => format!("INSERT INTO {} (k, v) VALUES (?1, ?2)", tree), + }; + let n = self.tx.execute(&sql, params![key, value])?; + assert_eq!(n, 1); + + Ok(old_val) + } + fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult> { + let tree = self.get_tree(tree)?; + let old_val = self.internal_get(tree, key)?; + + if old_val.is_some() { + let n = self + .tx + .execute(&format!("DELETE FROM {} WHERE k = ?1", tree), params![key])?; + assert_eq!(n, 1); + } + + Ok(old_val) + } + + fn iter(&self, _tree: usize) -> TxOpResult> { + unimplemented!(); + } + fn iter_rev(&self, _tree: usize) -> TxOpResult> { + unimplemented!(); + } + + fn range<'r>( + &self, + _tree: usize, + _low: Bound<&'r [u8]>, + _high: Bound<&'r [u8]>, + ) -> TxOpResult> { + unimplemented!(); + } + fn range_rev<'r>( + &self, + _tree: usize, + _low: Bound<&'r [u8]>, + _high: Bound<&'r [u8]>, + ) -> TxOpResult> { + unimplemented!(); + } +} + +// ---- + +struct DbValueIterator<'a> { + db: MutexGuard<'a, SqliteDbInner>, + stmt: Option>, + iter: Option>, + _pin: PhantomPinned, +} + +impl<'a> DbValueIterator<'a> { + fn make( + db: MutexGuard<'a, SqliteDbInner>, + sql: &str, + args: P, + ) -> Result> { + let res = DbValueIterator { + db, + stmt: None, + iter: None, + _pin: PhantomPinned, + }; + let mut boxed = Box::pin(res); + trace!("make iterator with sql: {}", sql); + + unsafe { + let db = NonNull::from(&boxed.db); + let stmt = db.as_ref().db.prepare(sql)?; + + let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut boxed); + Pin::get_unchecked_mut(mut_ref).stmt = Some(stmt); + + let mut stmt = NonNull::from(&boxed.stmt); + let iter = stmt.as_mut().as_mut().unwrap().query(args)?; + + let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut boxed); + Pin::get_unchecked_mut(mut_ref).iter = Some(iter); + } + + Ok(Box::new(DbValueIteratorPin(boxed))) + } +} + +impl<'a> Drop for DbValueIterator<'a> { + fn drop(&mut self) { + trace!("drop iter"); + drop(self.iter.take()); + drop(self.stmt.take()); + } +} + 
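// Note (annotation, not part of the diff): DbValueIterator is self-referential — `stmt`
// borrows from the connection behind the MutexGuard in `db`, and `iter` borrows from
// `stmt` — so make() builds it behind Pin<Box<..>> and wires the fields up through
// NonNull pointers. The Drop impl above tears the fields down in dependency order (rows
// first, then the statement) before the guard itself is released, so no borrow outlives
// its referent. DbValueIteratorPin below simply wraps the pinned box so it can be
// handed back as an ordinary boxed Iterator (ValueIter).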
+struct DbValueIteratorPin<'a>(Pin>>); + +impl<'a> Iterator for DbValueIteratorPin<'a> { + type Item = Result<(Value, Value)>; + + fn next(&mut self) -> Option { + let next = unsafe { + let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut self.0); + Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() + }; + let row = match next { + Err(e) => return Some(Err(e.into())), + Ok(None) => return None, + Ok(Some(r)) => r, + }; + let k = match row.get::<_, Vec>(0) { + Err(e) => return Some(Err(e.into())), + Ok(x) => x, + }; + let v = match row.get::<_, Vec>(1) { + Err(e) => return Some(Err(e.into())), + Ok(y) => y, + }; + Some(Ok((k, v))) + } +} + +// ---- + +fn bounds_sql<'r>(low: Bound<&'r [u8]>, high: Bound<&'r [u8]>) -> (String, Vec>) { + let mut sql = String::new(); + let mut params: Vec> = vec![]; + + match low { + Bound::Included(b) => { + sql.push_str(" WHERE k >= ?1"); + params.push(b.to_vec()); + } + Bound::Excluded(b) => { + sql.push_str(" WHERE k > ?1"); + params.push(b.to_vec()); + } + Bound::Unbounded => (), + }; + + match high { + Bound::Included(b) => { + if !params.is_empty() { + sql.push_str(" AND k <= ?2"); + } else { + sql.push_str(" WHERE k <= ?1"); + } + params.push(b.to_vec()); + } + Bound::Excluded(b) => { + if !params.is_empty() { + sql.push_str(" AND k < ?2"); + } else { + sql.push_str(" WHERE k < ?1"); + } + params.push(b.to_vec()); + } + Bound::Unbounded => (), + } + + (sql, params) +} diff --git a/src/db/test.rs b/src/db/test.rs new file mode 100644 index 00000000..cfcee643 --- /dev/null +++ b/src/db/test.rs @@ -0,0 +1,106 @@ +use crate::*; + +use crate::lmdb_adapter::LmdbDb; +use crate::sled_adapter::SledDb; +use crate::sqlite_adapter::SqliteDb; + +fn test_suite(db: Db) { + let tree = db.open_tree("tree").unwrap(); + + let ka: &[u8] = &b"test"[..]; + let kb: &[u8] = &b"zwello"[..]; + let kint: &[u8] = &b"tz"[..]; + let va: &[u8] = &b"plop"[..]; + let vb: &[u8] = &b"plip"[..]; + let vc: &[u8] = &b"plup"[..]; + + assert!(tree.insert(ka, va).unwrap().is_none()); + assert_eq!(tree.get(ka).unwrap().unwrap(), va); + + let res = db.transaction::<_, (), _>(|mut tx| { + assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), va); + + assert_eq!(tx.insert(&tree, ka, vb).unwrap().unwrap(), va); + + assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), vb); + + tx.commit(12) + }); + assert!(matches!(res, Ok(12))); + assert_eq!(tree.get(ka).unwrap().unwrap(), vb); + + let res = db.transaction::<(), _, _>(|mut tx| { + assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), vb); + + assert_eq!(tx.insert(&tree, ka, vc).unwrap().unwrap(), vb); + + assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), vc); + + tx.abort(42) + }); + assert!(matches!(res, Err(TxError::Abort(42)))); + assert_eq!(tree.get(ka).unwrap().unwrap(), vb); + + let mut iter = tree.iter().unwrap(); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); + assert!(iter.next().is_none()); + drop(iter); + + assert!(tree.insert(kb, vc).unwrap().is_none()); + assert_eq!(tree.get(kb).unwrap().unwrap(), vc); + + let mut iter = tree.iter().unwrap(); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc)); + assert!(iter.next().is_none()); + drop(iter); + + let mut iter = tree.range(kint..).unwrap(); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc)); + assert!(iter.next().is_none()); + 
drop(iter); + + let mut iter = tree.range_rev(..kint).unwrap(); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); + assert!(iter.next().is_none()); + drop(iter); + + let mut iter = tree.iter_rev().unwrap(); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc)); + let next = iter.next().unwrap().unwrap(); + assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb)); + assert!(iter.next().is_none()); + drop(iter); +} + +#[test] +fn test_lmdb_db() { + let path = mktemp::Temp::new_dir().unwrap(); + let db = heed::EnvOpenOptions::new() + .max_dbs(100) + .open(&path) + .unwrap(); + let db = LmdbDb::init(db); + test_suite(db); + drop(path); +} + +#[test] +fn test_sled_db() { + let path = mktemp::Temp::new_dir().unwrap(); + let db = SledDb::init(sled::open(path.to_path_buf()).unwrap()); + test_suite(db); + drop(path); +} + +#[test] +fn test_sqlite_db() { + let db = SqliteDb::init(rusqlite::Connection::open_in_memory().unwrap()); + test_suite(db); +} diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 902f67f8..eb643160 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -21,6 +21,7 @@ path = "tests/lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +garage_db = { version = "0.8.0", path = "../db" } garage_api = { version = "0.7.0", path = "../api" } garage_model = { version = "0.7.0", path = "../model" } garage_rpc = { version = "0.7.0", path = "../rpc" } @@ -36,8 +37,6 @@ rand = "0.8" async-trait = "0.1.7" sodiumoxide = { version = "0.2.5-0", package = "kuska-sodiumoxide" } -sled = "0.34" - rmp-serde = "0.15" serde = { version = "1.0", default-features = false, features = ["derive", "rc"] } serde_bytes = "0.11" diff --git a/src/garage/admin.rs b/src/garage/admin.rs index bc1f494a..c662aa00 100644 --- a/src/garage/admin.rs +++ b/src/garage/admin.rs @@ -660,11 +660,11 @@ impl AdminRpcHandler { } Ok(AdminRpc::Ok(ret)) } else { - Ok(AdminRpc::Ok(self.gather_stats_local(opt))) + Ok(AdminRpc::Ok(self.gather_stats_local(opt)?)) } } - fn gather_stats_local(&self, opt: StatsOpt) -> String { + fn gather_stats_local(&self, opt: StatsOpt) -> Result { let mut ret = String::new(); writeln!( &mut ret, @@ -672,6 +672,7 @@ impl AdminRpcHandler { self.garage.system.garage_version(), ) .unwrap(); + writeln!(&mut ret, "\nDatabase engine: {}", self.garage.db.engine()).unwrap(); // Gather ring statistics let ring = self.garage.system.ring.borrow().clone(); @@ -689,59 +690,71 @@ impl AdminRpcHandler { writeln!(&mut ret, " {:?} {}", n, c).unwrap(); } - self.gather_table_stats(&mut ret, &self.garage.bucket_table, &opt); - self.gather_table_stats(&mut ret, &self.garage.key_table, &opt); - self.gather_table_stats(&mut ret, &self.garage.object_table, &opt); - self.gather_table_stats(&mut ret, &self.garage.version_table, &opt); - self.gather_table_stats(&mut ret, &self.garage.block_ref_table, &opt); + self.gather_table_stats(&mut ret, &self.garage.bucket_table, &opt)?; + self.gather_table_stats(&mut ret, &self.garage.key_table, &opt)?; + self.gather_table_stats(&mut ret, &self.garage.object_table, &opt)?; + self.gather_table_stats(&mut ret, &self.garage.version_table, &opt)?; + self.gather_table_stats(&mut ret, &self.garage.block_ref_table, &opt)?; writeln!(&mut ret, "\nBlock manager stats:").unwrap(); if opt.detailed { writeln!( &mut ret, " number of RC entries (~= number of blocks): {}", - self.garage.block_manager.rc_len() + 
self.garage.block_manager.rc_len()? ) .unwrap(); } writeln!( &mut ret, " resync queue length: {}", - self.garage.block_manager.resync_queue_len() + self.garage.block_manager.resync_queue_len()? ) .unwrap(); writeln!( &mut ret, " blocks with resync errors: {}", - self.garage.block_manager.resync_errors_len() + self.garage.block_manager.resync_errors_len()? ) .unwrap(); - ret + Ok(ret) } - fn gather_table_stats(&self, to: &mut String, t: &Arc>, opt: &StatsOpt) + fn gather_table_stats( + &self, + to: &mut String, + t: &Arc>, + opt: &StatsOpt, + ) -> Result<(), Error> where F: TableSchema + 'static, R: TableReplication + 'static, { writeln!(to, "\nTable stats for {}", F::TABLE_NAME).unwrap(); if opt.detailed { - writeln!(to, " number of items: {}", t.data.store.len()).unwrap(); + writeln!( + to, + " number of items: {}", + t.data.store.len().map_err(GarageError::from)? + ) + .unwrap(); writeln!( to, " Merkle tree size: {}", - t.merkle_updater.merkle_tree_len() + t.merkle_updater.merkle_tree_len()? ) .unwrap(); } writeln!( to, " Merkle updater todo queue length: {}", - t.merkle_updater.todo_len() + t.merkle_updater.todo_len()? ) .unwrap(); - writeln!(to, " GC todo queue length: {}", t.data.gc_todo_len()).unwrap(); + writeln!(to, " GC todo queue length: {}", t.data.gc_todo_len()?).unwrap(); + + Ok(()) } } diff --git a/src/garage/repair.rs b/src/garage/repair.rs index 830eac71..17e14b8b 100644 --- a/src/garage/repair.rs +++ b/src/garage/repair.rs @@ -64,13 +64,23 @@ impl Repair { async fn repair_versions(&self, must_exit: &watch::Receiver) -> Result<(), Error> { let mut pos = vec![]; + let mut i = 0; - while let Some((item_key, item_bytes)) = - self.garage.version_table.data.store.get_gt(&pos)? - { - pos = item_key.to_vec(); + while !*must_exit.borrow() { + let item_bytes = match self.garage.version_table.data.store.get_gt(pos)? { + Some((k, v)) => { + pos = k; + v + } + None => break, + }; + + i += 1; + if i % 1000 == 0 { + info!("repair_versions: {}", i); + } - let version = rmp_serde::decode::from_read_ref::<_, Version>(item_bytes.as_ref())?; + let version = rmp_serde::decode::from_read_ref::<_, Version>(&item_bytes)?; if version.deleted.get() { continue; } @@ -98,23 +108,30 @@ impl Repair { )) .await?; } - - if *must_exit.borrow() { - break; - } } + info!("repair_versions: finished, done {}", i); Ok(()) } async fn repair_block_ref(&self, must_exit: &watch::Receiver) -> Result<(), Error> { let mut pos = vec![]; + let mut i = 0; - while let Some((item_key, item_bytes)) = - self.garage.block_ref_table.data.store.get_gt(&pos)? - { - pos = item_key.to_vec(); + while !*must_exit.borrow() { + let item_bytes = match self.garage.block_ref_table.data.store.get_gt(pos)? 
{ + Some((k, v)) => { + pos = k; + v + } + None => break, + }; + + i += 1; + if i % 1000 == 0 { + info!("repair_block_ref: {}", i); + } - let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(item_bytes.as_ref())?; + let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(&item_bytes)?; if block_ref.deleted.get() { continue; } @@ -139,11 +156,8 @@ impl Repair { }) .await?; } - - if *must_exit.borrow() { - break; - } } + info!("repair_block_ref: finished, done {}", i); Ok(()) } } diff --git a/src/garage/server.rs b/src/garage/server.rs index b58ad286..697d3358 100644 --- a/src/garage/server.rs +++ b/src/garage/server.rs @@ -2,6 +2,8 @@ use std::path::PathBuf; use tokio::sync::watch; +use garage_db as db; + use garage_util::background::*; use garage_util::config::*; use garage_util::error::Error; @@ -31,13 +33,51 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { info!("Opening database..."); let mut db_path = config.metadata_dir.clone(); - db_path.push("db"); - let db = sled::Config::default() - .path(&db_path) - .cache_capacity(config.sled_cache_capacity) - .flush_every_ms(Some(config.sled_flush_every_ms)) - .open() - .expect("Unable to open sled DB"); + std::fs::create_dir_all(&db_path).expect("Unable to create Garage meta data directory"); + let db = match config.db_engine.as_str() { + "sled" => { + db_path.push("db"); + info!("Opening Sled database at: {}", db_path.display()); + let db = db::sled_adapter::sled::Config::default() + .path(&db_path) + .cache_capacity(config.sled_cache_capacity) + .flush_every_ms(Some(config.sled_flush_every_ms)) + .open() + .expect("Unable to open sled DB"); + db::sled_adapter::SledDb::init(db) + } + "sqlite" | "sqlite3" | "rusqlite" => { + db_path.push("db.sqlite"); + info!("Opening Sqlite database at: {}", db_path.display()); + let db = db::sqlite_adapter::rusqlite::Connection::open(db_path) + .expect("Unable to open sqlite DB"); + db::sqlite_adapter::SqliteDb::init(db) + } + "lmdb" | "heed" => { + db_path.push("db.lmdb"); + info!("Opening LMDB database at: {}", db_path.display()); + std::fs::create_dir_all(&db_path).expect("Unable to create LMDB data directory"); + let map_size = if u32::MAX as usize == usize::MAX { + warn!("LMDB is not recommended on 32-bit systems, database size will be limited"); + 1usize << 30 // 1GB for 32-bit systems + } else { + 1usize << 40 // 1TB for 64-bit systems + }; + + let db = db::lmdb_adapter::heed::EnvOpenOptions::new() + .max_dbs(100) + .map_size(map_size) + .open(&db_path) + .expect("Unable to open LMDB DB"); + db::lmdb_adapter::LmdbDb::init(db) + } + e => { + return Err(Error::Message(format!( + "Unsupported DB engine: {} (options: sled, sqlite, lmdb)", + e + ))); + } + }; info!("Initializing background runner..."); let watch_cancel = netapp::util::watch_ctrl_c(); diff --git a/src/garage/tests/bucket.rs b/src/garage/tests/bucket.rs index ff5cc8da..b32af068 100644 --- a/src/garage/tests/bucket.rs +++ b/src/garage/tests/bucket.rs @@ -29,8 +29,7 @@ async fn test_bucket_all() { .unwrap() .iter() .filter(|x| x.name.as_ref().is_some()) - .find(|x| x.name.as_ref().unwrap() == "hello") - .is_some()); + .any(|x| x.name.as_ref().unwrap() == "hello")); } { // Get its location @@ -75,13 +74,12 @@ async fn test_bucket_all() { { // Check bucket is deleted with List buckets let r = ctx.client.list_buckets().send().await.unwrap(); - assert!(r + assert!(!r .buckets .as_ref() .unwrap() .iter() .filter(|x| x.name.as_ref().is_some()) - .find(|x| x.name.as_ref().unwrap() == "hello") - .is_none()); + 
.any(|x| x.name.as_ref().unwrap() == "hello")); } } diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 133fe44e..d908dc01 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -14,6 +14,7 @@ path = "lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +garage_db = { version = "0.8.0", path = "../db" } garage_rpc = { version = "0.7.0", path = "../rpc" } garage_table = { version = "0.7.0", path = "../table" } garage_block = { version = "0.7.0", path = "../block" } @@ -30,8 +31,6 @@ tracing = "0.1.30" rand = "0.8" zstd = { version = "0.9", default-features = false } -sled = "0.34" - rmp-serde = "0.15" serde = { version = "1.0", default-features = false, features = ["derive", "rc"] } serde_bytes = "0.11" diff --git a/src/model/garage.rs b/src/model/garage.rs index 2f99bd68..280f3dc7 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -2,6 +2,8 @@ use std::sync::Arc; use netapp::NetworkKey; +use garage_db as db; + use garage_util::background::*; use garage_util::config::*; @@ -33,7 +35,7 @@ pub struct Garage { pub config: Config, /// The local database - pub db: sled::Db, + pub db: db::Db, /// A background job runner pub background: Arc, /// The membership manager @@ -71,7 +73,7 @@ pub struct GarageK2V { impl Garage { /// Create and run garage - pub fn new(config: Config, db: sled::Db, background: Arc) -> Arc { + pub fn new(config: Config, db: db::Db, background: Arc) -> Arc { let network_key = NetworkKey::from_slice( &hex::decode(&config.rpc_secret).expect("Invalid RPC secret key")[..], ) @@ -199,7 +201,7 @@ impl Garage { #[cfg(feature = "k2v")] impl GarageK2V { - fn new(system: Arc, db: &sled::Db, meta_rep_param: TableShardedReplication) -> Self { + fn new(system: Arc, db: &db::Db, meta_rep_param: TableShardedReplication) -> Self { info!("Initialize K2V counter table..."); let counter_table = IndexCounter::new(system.clone(), meta_rep_param.clone(), db); info!("Initialize K2V subscription manager..."); diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs index 123154d4..2602d5d9 100644 --- a/src/model/index_counter.rs +++ b/src/model/index_counter.rs @@ -6,6 +6,8 @@ use std::time::Duration; use serde::{Deserialize, Serialize}; use tokio::sync::{mpsc, watch}; +use garage_db as db; + use garage_rpc::ring::Ring; use garage_rpc::system::System; use garage_util::data::*; @@ -114,10 +116,6 @@ impl TableSchema for CounterTable { type E = CounterEntry; type Filter = (DeletedFilter, Vec); - fn updated(&self, _old: Option<&Self::E>, _new: Option<&Self::E>) { - // nothing for now - } - fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { if filter.0 == DeletedFilter::Any { return true; @@ -135,7 +133,7 @@ impl TableSchema for CounterTable { pub struct IndexCounter { this_node: Uuid, - local_counter: sled::Tree, + local_counter: db::Tree, propagate_tx: mpsc::UnboundedSender<(T::P, T::S, LocalCounterEntry)>, pub table: Arc, TableShardedReplication>>, } @@ -144,7 +142,7 @@ impl IndexCounter { pub fn new( system: Arc, replication: TableShardedReplication, - db: &sled::Db, + db: &db::Db, ) -> Arc { let background = system.background.clone(); @@ -174,36 +172,36 @@ impl IndexCounter { this } - pub fn count(&self, pk: &T::P, sk: &T::S, counts: &[(&str, i64)]) -> Result<(), Error> { + pub fn count( + &self, + tx: &mut db::Transaction, + pk: &T::P, + sk: &T::S, + counts: &[(&str, i64)], + ) -> db::TxResult<(), Error> { let tree_key = self.table.data.tree_key(pk, sk); - let new_entry = 
self.local_counter.transaction(|tx| { - let mut entry = match tx.get(&tree_key[..])? { - Some(old_bytes) => { - rmp_serde::decode::from_read_ref::<_, LocalCounterEntry>(&old_bytes) - .map_err(Error::RmpDecode) - .map_err(sled::transaction::ConflictableTransactionError::Abort)? - } - None => LocalCounterEntry { - values: BTreeMap::new(), - }, - }; - - for (s, inc) in counts.iter() { - let mut ent = entry.values.entry(s.to_string()).or_insert((0, 0)); - ent.0 += 1; - ent.1 += *inc; - } - - let new_entry_bytes = rmp_to_vec_all_named(&entry) - .map_err(Error::RmpEncode) - .map_err(sled::transaction::ConflictableTransactionError::Abort)?; - tx.insert(&tree_key[..], new_entry_bytes)?; + let mut entry = match tx.get(&self.local_counter, &tree_key[..])? { + Some(old_bytes) => rmp_serde::decode::from_read_ref::<_, LocalCounterEntry>(&old_bytes) + .map_err(Error::RmpDecode) + .map_err(db::TxError::Abort)?, + None => LocalCounterEntry { + values: BTreeMap::new(), + }, + }; + + for (s, inc) in counts.iter() { + let mut ent = entry.values.entry(s.to_string()).or_insert((0, 0)); + ent.0 += 1; + ent.1 += *inc; + } - Ok(entry) - })?; + let new_entry_bytes = rmp_to_vec_all_named(&entry) + .map_err(Error::RmpEncode) + .map_err(db::TxError::Abort)?; + tx.insert(&self.local_counter, &tree_key[..], new_entry_bytes)?; - if let Err(e) = self.propagate_tx.send((pk.clone(), sk.clone(), new_entry)) { + if let Err(e) = self.propagate_tx.send((pk.clone(), sk.clone(), entry)) { error!( "Could not propagate updated counter values, failed to send to channel: {}", e diff --git a/src/model/k2v/item_table.rs b/src/model/k2v/item_table.rs index 8b7cc08a..991fe66d 100644 --- a/src/model/k2v/item_table.rs +++ b/src/model/k2v/item_table.rs @@ -2,6 +2,7 @@ use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; use std::sync::Arc; +use garage_db as db; use garage_util::data::*; use garage_table::crdt::*; @@ -221,7 +222,12 @@ impl TableSchema for K2VItemTable { type E = K2VItem; type Filter = ItemFilter; - fn updated(&self, old: Option<&Self::E>, new: Option<&Self::E>) { + fn updated( + &self, + tx: &mut db::Transaction, + old: Option<&Self::E>, + new: Option<&Self::E>, + ) -> db::TxOpResult<()> { // 1. Count let (old_entries, old_conflicts, old_values, old_bytes) = match old { None => (0, 0, 0, 0), @@ -239,7 +245,8 @@ impl TableSchema for K2VItemTable { .map(|e| &e.partition.partition_key) .unwrap_or_else(|| &new.unwrap().partition.partition_key); - if let Err(e) = self.counter_table.count( + let counter_res = self.counter_table.count( + tx, &count_pk, count_sk, &[ @@ -248,14 +255,23 @@ impl TableSchema for K2VItemTable { (VALUES, new_values - old_values), (BYTES, new_bytes - old_bytes), ], - ) { - error!("Could not update K2V counter for bucket {:?} partition {}; counts will now be inconsistent. {}", count_pk, count_sk, e); + ); + if let Err(e) = db::unabort(counter_res)? { + // This result can be returned by `counter_table.count()` for instance + // if messagepack serialization or deserialization fails at some step. + // Warn admin but ignore this error for now, that's all we can do. + error!( + "Unable to update K2V item counter for bucket {:?} partition {}: {}. Index values will be wrong!", + count_pk, count_sk, e + ); } // 2. 
Notify if let Some(new_ent) = new { self.subscriptions.notify(new_ent); } + + Ok(()) } #[allow(clippy::nonminimal_bool)] diff --git a/src/model/migrate.rs b/src/model/migrate.rs index 7e61957a..25acb4b0 100644 --- a/src/model/migrate.rs +++ b/src/model/migrate.rs @@ -25,11 +25,15 @@ impl Migrate { .open_tree("bucket:table") .map_err(GarageError::from)?; - for res in tree.iter() { + let mut old_buckets = vec![]; + for res in tree.iter().map_err(GarageError::from)? { let (_k, v) = res.map_err(GarageError::from)?; let bucket = rmp_serde::decode::from_read_ref::<_, old_bucket::Bucket>(&v[..]) .map_err(GarageError::from)?; + old_buckets.push(bucket); + } + for bucket in old_buckets { if let old_bucket::BucketState::Present(p) = bucket.state.get() { self.migrate_buckets050_do_bucket(&bucket, p).await?; } diff --git a/src/model/s3/block_ref_table.rs b/src/model/s3/block_ref_table.rs index 9b3991bf..9589b4aa 100644 --- a/src/model/s3/block_ref_table.rs +++ b/src/model/s3/block_ref_table.rs @@ -1,6 +1,8 @@ use serde::{Deserialize, Serialize}; use std::sync::Arc; +use garage_db as db; + use garage_util::data::*; use garage_table::crdt::Crdt; @@ -51,21 +53,22 @@ impl TableSchema for BlockRefTable { type E = BlockRef; type Filter = DeletedFilter; - fn updated(&self, old: Option<&Self::E>, new: Option<&Self::E>) { - #[allow(clippy::or_fun_call)] - let block = &old.or(new).unwrap().block; + fn updated( + &self, + tx: &mut db::Transaction, + old: Option<&Self::E>, + new: Option<&Self::E>, + ) -> db::TxOpResult<()> { + let block = old.or(new).unwrap().block; let was_before = old.map(|x| !x.deleted.get()).unwrap_or(false); let is_after = new.map(|x| !x.deleted.get()).unwrap_or(false); if is_after && !was_before { - if let Err(e) = self.block_manager.block_incref(block) { - warn!("block_incref failed for block {:?}: {}", block, e); - } + self.block_manager.block_incref(tx, block)?; } if was_before && !is_after { - if let Err(e) = self.block_manager.block_decref(block) { - warn!("block_decref failed for block {:?}: {}", block, e); - } + self.block_manager.block_decref(tx, block)?; } + Ok(()) } fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index 3d9a89f7..62f5d8d9 100644 --- a/src/model/s3/object_table.rs +++ b/src/model/s3/object_table.rs @@ -2,6 +2,8 @@ use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; use std::sync::Arc; +use garage_db as db; + use garage_util::background::BackgroundRunner; use garage_util::data::*; @@ -232,7 +234,12 @@ impl TableSchema for ObjectTable { type E = Object; type Filter = ObjectFilter; - fn updated(&self, old: Option<&Self::E>, new: Option<&Self::E>) { + fn updated( + &self, + _tx: &mut db::Transaction, + old: Option<&Self::E>, + new: Option<&Self::E>, + ) -> db::TxOpResult<()> { let version_table = self.version_table.clone(); let old = old.cloned(); let new = new.cloned(); @@ -259,7 +266,8 @@ impl TableSchema for ObjectTable { } } Ok(()) - }) + }); + Ok(()) } fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { diff --git a/src/model/s3/version_table.rs b/src/model/s3/version_table.rs index ad096772..881c245a 100644 --- a/src/model/s3/version_table.rs +++ b/src/model/s3/version_table.rs @@ -1,6 +1,8 @@ use serde::{Deserialize, Serialize}; use std::sync::Arc; +use garage_db as db; + use garage_util::background::BackgroundRunner; use garage_util::data::*; @@ -137,7 +139,12 @@ impl TableSchema for VersionTable { type E = Version; type Filter = 
DeletedFilter; - fn updated(&self, old: Option<&Self::E>, new: Option<&Self::E>) { + fn updated( + &self, + _tx: &mut db::Transaction, + old: Option<&Self::E>, + new: Option<&Self::E>, + ) -> db::TxOpResult<()> { let block_ref_table = self.block_ref_table.clone(); let old = old.cloned(); let new = new.cloned(); @@ -160,7 +167,9 @@ impl TableSchema for VersionTable { } } Ok(()) - }) + }); + + Ok(()) } fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml index ed1a213f..6de37cda 100644 --- a/src/table/Cargo.toml +++ b/src/table/Cargo.toml @@ -14,6 +14,7 @@ path = "lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +garage_db = { version = "0.8.0", path = "../db" } garage_rpc = { version = "0.7.0", path = "../rpc" } garage_util = { version = "0.7.0", path = "../util" } @@ -25,8 +26,6 @@ hexdump = "0.1" tracing = "0.1.30" rand = "0.8" -sled = "0.34" - rmp-serde = "0.15" serde = { version = "1.0", default-features = false, features = ["derive", "rc"] } serde_bytes = "0.11" diff --git a/src/table/data.rs b/src/table/data.rs index 5cb10066..3212e82b 100644 --- a/src/table/data.rs +++ b/src/table/data.rs @@ -3,12 +3,13 @@ use std::convert::TryInto; use std::sync::Arc; use serde_bytes::ByteBuf; -use sled::{IVec, Transactional}; use tokio::sync::Notify; +use garage_db as db; +use garage_db::counted_tree_hack::CountedTree; + use garage_util::data::*; use garage_util::error::*; -use garage_util::sled_counter::SledCountedTree; use garage_rpc::system::System; @@ -25,12 +26,12 @@ pub struct TableData { pub instance: F, pub replication: R, - pub store: sled::Tree, + pub store: db::Tree, - pub(crate) merkle_tree: sled::Tree, - pub(crate) merkle_todo: sled::Tree, + pub(crate) merkle_tree: db::Tree, + pub(crate) merkle_todo: db::Tree, pub(crate) merkle_todo_notify: Notify, - pub(crate) gc_todo: SledCountedTree, + pub(crate) gc_todo: CountedTree, pub(crate) metrics: TableMetrics, } @@ -40,7 +41,7 @@ where F: TableSchema, R: TableReplication, { - pub fn new(system: Arc, instance: F, replication: R, db: &sled::Db) -> Arc { + pub fn new(system: Arc, instance: F, replication: R, db: &db::Db) -> Arc { let store = db .open_tree(&format!("{}:table", F::TABLE_NAME)) .expect("Unable to open DB tree"); @@ -55,7 +56,7 @@ where let gc_todo = db .open_tree(&format!("{}:gc_todo_v2", F::TABLE_NAME)) .expect("Unable to open DB tree"); - let gc_todo = SledCountedTree::new(gc_todo); + let gc_todo = CountedTree::new(gc_todo).expect("Cannot count gc_todo_v2"); let metrics = TableMetrics::new(F::TABLE_NAME, merkle_todo.clone(), gc_todo.clone()); @@ -98,30 +99,30 @@ where None => partition_hash.to_vec(), Some(sk) => self.tree_key(partition_key, sk), }; - let range = self.store.range(first_key..); + let range = self.store.range(first_key..)?; self.read_range_aux(partition_hash, range, filter, limit) } EnumerationOrder::Reverse => match start { Some(sk) => { let last_key = self.tree_key(partition_key, sk); - let range = self.store.range(..=last_key).rev(); + let range = self.store.range_rev(..=last_key)?; self.read_range_aux(partition_hash, range, filter, limit) } None => { let mut last_key = partition_hash.to_vec(); let lower = u128::from_be_bytes(last_key[16..32].try_into().unwrap()); last_key[16..32].copy_from_slice(&u128::to_be_bytes(lower + 1)); - let range = self.store.range(..last_key).rev(); + let range = self.store.range_rev(..last_key)?; self.read_range_aux(partition_hash, range, 
filter, limit) } }, } } - fn read_range_aux( + fn read_range_aux<'a>( &self, partition_hash: Hash, - range: impl Iterator>, + range: db::ValueIter<'a>, filter: &Option, limit: usize, ) -> Result>, Error> { @@ -139,7 +140,7 @@ where } }; if keep { - ret.push(Arc::new(ByteBuf::from(value.as_ref()))); + ret.push(Arc::new(ByteBuf::from(value))); } if ret.len() >= limit { break; @@ -183,12 +184,10 @@ where tree_key: &[u8], f: impl Fn(Option) -> F::E, ) -> Result, Error> { - let changed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| { - let (old_entry, old_bytes, new_entry) = match store.get(tree_key)? { + let changed = self.store.db().transaction(|mut tx| { + let (old_entry, old_bytes, new_entry) = match tx.get(&self.store, tree_key)? { Some(old_bytes) => { - let old_entry = self - .decode_entry(&old_bytes) - .map_err(sled::transaction::ConflictableTransactionError::Abort)?; + let old_entry = self.decode_entry(&old_bytes).map_err(db::TxError::Abort)?; let new_entry = f(Some(old_entry.clone())); (Some(old_entry), Some(old_bytes), new_entry) } @@ -204,24 +203,28 @@ where // the associated Merkle tree entry. let new_bytes = rmp_to_vec_all_named(&new_entry) .map_err(Error::RmpEncode) - .map_err(sled::transaction::ConflictableTransactionError::Abort)?; + .map_err(db::TxError::Abort)?; let encoding_changed = Some(&new_bytes[..]) != old_bytes.as_ref().map(|x| &x[..]); + drop(old_bytes); if value_changed || encoding_changed { let new_bytes_hash = blake2sum(&new_bytes[..]); - mkl_todo.insert(tree_key.to_vec(), new_bytes_hash.as_slice())?; - store.insert(tree_key.to_vec(), new_bytes)?; - Ok(Some((old_entry, new_entry, new_bytes_hash))) + tx.insert(&self.merkle_todo, tree_key, new_bytes_hash.as_slice())?; + tx.insert(&self.store, tree_key, new_bytes)?; + + self.instance + .updated(&mut tx, old_entry.as_ref(), Some(&new_entry))?; + + Ok(Some((new_entry, new_bytes_hash))) } else { Ok(None) } })?; - if let Some((old_entry, new_entry, new_bytes_hash)) = changed { + if let Some((new_entry, new_bytes_hash)) = changed { self.metrics.internal_update_counter.add(1); let is_tombstone = new_entry.is_tombstone(); - self.instance.updated(old_entry.as_ref(), Some(&new_entry)); self.merkle_todo_notify.notify_one(); if is_tombstone { // We are only responsible for GC'ing this item if we are the @@ -244,22 +247,23 @@ where } pub(crate) fn delete_if_equal(self: &Arc, k: &[u8], v: &[u8]) -> Result { - let removed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| { - if let Some(cur_v) = store.get(k)? { - if cur_v == v { - store.remove(k)?; - mkl_todo.insert(k, vec![])?; - return Ok(true); + let removed = self + .store + .db() + .transaction(|mut tx| match tx.get(&self.store, k)? { + Some(cur_v) if cur_v == v => { + tx.remove(&self.store, k)?; + tx.insert(&self.merkle_todo, k, vec![])?; + + let old_entry = self.decode_entry(v).map_err(db::TxError::Abort)?; + self.instance.updated(&mut tx, Some(&old_entry), None)?; + Ok(true) } - } - Ok(false) - })?; + _ => Ok(false), + })?; if removed { self.metrics.internal_delete_counter.add(1); - - let old_entry = self.decode_entry(v)?; - self.instance.updated(Some(&old_entry), None); self.merkle_todo_notify.notify_one(); } Ok(removed) @@ -270,25 +274,26 @@ where k: &[u8], vhash: Hash, ) -> Result { - let removed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| { - if let Some(cur_v) = store.get(k)? 
{ - if blake2sum(&cur_v[..]) == vhash { - store.remove(k)?; - mkl_todo.insert(k, vec![])?; - return Ok(Some(cur_v)); + let removed = self + .store + .db() + .transaction(|mut tx| match tx.get(&self.store, k)? { + Some(cur_v) if blake2sum(&cur_v[..]) == vhash => { + tx.remove(&self.store, k)?; + tx.insert(&self.merkle_todo, k, vec![])?; + + let old_entry = self.decode_entry(&cur_v[..]).map_err(db::TxError::Abort)?; + self.instance.updated(&mut tx, Some(&old_entry), None)?; + Ok(true) } - } - Ok(None) - })?; + _ => Ok(false), + })?; - if let Some(old_v) = removed { - let old_entry = self.decode_entry(&old_v[..])?; - self.instance.updated(Some(&old_entry), None); + if removed { + self.metrics.internal_delete_counter.add(1); self.merkle_todo_notify.notify_one(); - Ok(true) - } else { - Ok(false) } + Ok(removed) } // ---- Utility functions ---- @@ -315,7 +320,7 @@ where } } - pub fn gc_todo_len(&self) -> usize { - self.gc_todo.len() + pub fn gc_todo_len(&self) -> Result { + Ok(self.gc_todo.len()) } } diff --git a/src/table/gc.rs b/src/table/gc.rs index 2a05b6ae..e7fbbcb0 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -12,9 +12,10 @@ use futures::select; use futures_util::future::*; use tokio::sync::watch; +use garage_db::counted_tree_hack::CountedTree; + use garage_util::data::*; use garage_util::error::*; -use garage_util::sled_counter::SledCountedTree; use garage_util::time::*; use garage_rpc::system::System; @@ -100,18 +101,16 @@ where async fn gc_loop_iter(&self) -> Result, Error> { let now = now_msec(); - let mut entries = vec![]; - let mut excluded = vec![]; - // List entries in the GC todo list // These entries are put there when a tombstone is inserted in the table // (see update_entry in data.rs) - for entry_kv in self.data.gc_todo.iter() { + let mut candidates = vec![]; + for entry_kv in self.data.gc_todo.iter()? { let (k, vhash) = entry_kv?; - let mut todo_entry = GcTodoEntry::parse(&k, &vhash); + let todo_entry = GcTodoEntry::parse(&k, &vhash); if todo_entry.deletion_time() > now { - if entries.is_empty() && excluded.is_empty() { + if candidates.is_empty() { // If the earliest entry in the todo list shouldn't yet be processed, // return a duration to wait in the loop return Ok(Some(Duration::from_millis( @@ -123,15 +122,23 @@ where } } - let vhash = Hash::try_from(&vhash[..]).unwrap(); + candidates.push(todo_entry); + if candidates.len() >= 2 * TABLE_GC_BATCH_SIZE { + break; + } + } + let mut entries = vec![]; + let mut excluded = vec![]; + for mut todo_entry in candidates { // Check if the tombstone is still the current value of the entry. // If not, we don't actually want to GC it, and we will remove it // from the gc_todo table later (below). + let vhash = todo_entry.value_hash; todo_entry.value = self .data .store - .get(&k[..])? + .get(&todo_entry.key[..])? 
.filter(|v| blake2sum(&v[..]) == vhash) .map(|v| v.to_vec()); @@ -353,17 +360,17 @@ impl GcTodoEntry { } /// Parses a GcTodoEntry from a (k, v) pair stored in the gc_todo tree - pub(crate) fn parse(sled_k: &[u8], sled_v: &[u8]) -> Self { + pub(crate) fn parse(db_k: &[u8], db_v: &[u8]) -> Self { Self { - tombstone_timestamp: u64::from_be_bytes(sled_k[0..8].try_into().unwrap()), - key: sled_k[8..].to_vec(), - value_hash: Hash::try_from(sled_v).unwrap(), + tombstone_timestamp: u64::from_be_bytes(db_k[0..8].try_into().unwrap()), + key: db_k[8..].to_vec(), + value_hash: Hash::try_from(db_v).unwrap(), value: None, } } /// Saves the GcTodoEntry in the gc_todo tree - pub(crate) fn save(&self, gc_todo_tree: &SledCountedTree) -> Result<(), Error> { + pub(crate) fn save(&self, gc_todo_tree: &CountedTree) -> Result<(), Error> { gc_todo_tree.insert(self.todo_table_key(), self.value_hash.as_slice())?; Ok(()) } @@ -373,9 +380,9 @@ impl GcTodoEntry { /// This is usefull to remove a todo entry only under the condition /// that it has not changed since the time it was read, i.e. /// what we have to do is still the same - pub(crate) fn remove_if_equal(&self, gc_todo_tree: &SledCountedTree) -> Result<(), Error> { - let _ = gc_todo_tree.compare_and_swap::<_, _, Vec>( - &self.todo_table_key()[..], + pub(crate) fn remove_if_equal(&self, gc_todo_tree: &CountedTree) -> Result<(), Error> { + gc_todo_tree.compare_and_swap::<_, _, &[u8]>( + &self.todo_table_key(), Some(self.value_hash), None, )?; diff --git a/src/table/merkle.rs b/src/table/merkle.rs index 93bf7e47..7685b193 100644 --- a/src/table/merkle.rs +++ b/src/table/merkle.rs @@ -4,11 +4,10 @@ use std::time::Duration; use futures::select; use futures_util::future::*; use serde::{Deserialize, Serialize}; -use sled::transaction::{ - ConflictableTransactionError, ConflictableTransactionResult, TransactionalTree, -}; use tokio::sync::watch; +use garage_db as db; + use garage_util::background::BackgroundRunner; use garage_util::data::*; use garage_util::error::Error; @@ -90,35 +89,35 @@ where async fn updater_loop(self: Arc, mut must_exit: watch::Receiver) { while !*must_exit.borrow() { - if let Some(x) = self.data.merkle_todo.iter().next() { - match x { - Ok((key, valhash)) => { - if let Err(e) = self.update_item(&key[..], &valhash[..]) { - warn!( - "({}) Error while updating Merkle tree item: {}", - F::TABLE_NAME, - e - ); - } - } - Err(e) => { - warn!( - "({}) Error while iterating on Merkle todo tree: {}", - F::TABLE_NAME, - e - ); - tokio::time::sleep(Duration::from_secs(10)).await; + match self.updater_loop_iter() { + Ok(true) => (), + Ok(false) => { + select! { + _ = self.data.merkle_todo_notify.notified().fuse() => {}, + _ = must_exit.changed().fuse() => {}, } } - } else { - select! { - _ = self.data.merkle_todo_notify.notified().fuse() => {}, - _ = must_exit.changed().fuse() => {}, + Err(e) => { + warn!( + "({}) Error while updating Merkle tree item: {}", + F::TABLE_NAME, + e + ); + tokio::time::sleep(Duration::from_secs(10)).await; } } } } + fn updater_loop_iter(&self) -> Result { + if let Some((key, valhash)) = self.data.merkle_todo.first()? 
{ + self.update_item(&key, &valhash)?; + Ok(true) + } else { + Ok(false) + } + } + fn update_item(&self, k: &[u8], vhash_by: &[u8]) -> Result<(), Error> { let khash = blake2sum(k); @@ -137,13 +136,16 @@ where }; self.data .merkle_tree - .transaction(|tx| self.update_item_rec(tx, k, &khash, &key, new_vhash))?; + .db() + .transaction(|mut tx| self.update_item_rec(&mut tx, k, &khash, &key, new_vhash))?; - let deleted = self - .data - .merkle_todo - .compare_and_swap::<_, _, Vec>(k, Some(vhash_by), None)? - .is_ok(); + let deleted = self.data.merkle_todo.db().transaction(|mut tx| { + let remove = matches!(tx.get(&self.data.merkle_todo, k)?, Some(ov) if ov == vhash_by); + if remove { + tx.remove(&self.data.merkle_todo, k)?; + } + Ok(remove) + })?; if !deleted { debug!( @@ -157,12 +159,12 @@ where fn update_item_rec( &self, - tx: &TransactionalTree, + tx: &mut db::Transaction<'_>, k: &[u8], khash: &Hash, key: &MerkleNodeKey, new_vhash: Option, - ) -> ConflictableTransactionResult, Error> { + ) -> db::TxResult, Error> { let i = key.prefix.len(); // Read node at current position (defined by the prefix stored in key) @@ -203,7 +205,7 @@ where } MerkleNode::Intermediate(_) => Some(MerkleNode::Intermediate(children)), x @ MerkleNode::Leaf(_, _) => { - tx.remove(key_sub.encode())?; + tx.remove(&self.data.merkle_tree, key_sub.encode())?; Some(x) } } @@ -283,28 +285,27 @@ where fn read_node_txn( &self, - tx: &TransactionalTree, + tx: &mut db::Transaction<'_>, k: &MerkleNodeKey, - ) -> ConflictableTransactionResult { - let ent = tx.get(k.encode())?; - MerkleNode::decode_opt(ent).map_err(ConflictableTransactionError::Abort) + ) -> db::TxResult { + let ent = tx.get(&self.data.merkle_tree, k.encode())?; + MerkleNode::decode_opt(&ent).map_err(db::TxError::Abort) } fn put_node_txn( &self, - tx: &TransactionalTree, + tx: &mut db::Transaction<'_>, k: &MerkleNodeKey, v: &MerkleNode, - ) -> ConflictableTransactionResult { + ) -> db::TxResult { trace!("Put Merkle node: {:?} => {:?}", k, v); if *v == MerkleNode::Empty { - tx.remove(k.encode())?; + tx.remove(&self.data.merkle_tree, k.encode())?; Ok(self.empty_node_hash) } else { - let vby = rmp_to_vec_all_named(v) - .map_err(|e| ConflictableTransactionError::Abort(e.into()))?; + let vby = rmp_to_vec_all_named(v).map_err(|e| db::TxError::Abort(e.into()))?; let rethash = blake2sum(&vby[..]); - tx.insert(k.encode(), vby)?; + tx.insert(&self.data.merkle_tree, k.encode(), vby)?; Ok(rethash) } } @@ -312,15 +313,15 @@ where // Access a node in the Merkle tree, used by the sync protocol pub(crate) fn read_node(&self, k: &MerkleNodeKey) -> Result { let ent = self.data.merkle_tree.get(k.encode())?; - MerkleNode::decode_opt(ent) + MerkleNode::decode_opt(&ent) } - pub fn merkle_tree_len(&self) -> usize { - self.data.merkle_tree.len() + pub fn merkle_tree_len(&self) -> Result { + Ok(self.data.merkle_tree.len()?) } - pub fn todo_len(&self) -> usize { - self.data.merkle_todo.len() + pub fn todo_len(&self) -> Result { + Ok(self.data.merkle_todo.len()?) 
} } @@ -347,7 +348,7 @@ impl MerkleNodeKey { } impl MerkleNode { - fn decode_opt(ent: Option) -> Result { + fn decode_opt(ent: &Option) -> Result { match ent { None => Ok(MerkleNode::Empty), Some(v) => Ok(rmp_serde::decode::from_read_ref::<_, MerkleNode>(&v[..])?), diff --git a/src/table/metrics.rs b/src/table/metrics.rs index 752a2a6d..3a1783e0 100644 --- a/src/table/metrics.rs +++ b/src/table/metrics.rs @@ -1,6 +1,7 @@ use opentelemetry::{global, metrics::*, KeyValue}; -use garage_util::sled_counter::SledCountedTree; +use garage_db as db; +use garage_db::counted_tree_hack::CountedTree; /// TableMetrics reference all counter used for metrics pub struct TableMetrics { @@ -19,21 +20,19 @@ pub struct TableMetrics { pub(crate) sync_items_received: Counter, } impl TableMetrics { - pub fn new( - table_name: &'static str, - merkle_todo: sled::Tree, - gc_todo: SledCountedTree, - ) -> Self { + pub fn new(table_name: &'static str, merkle_todo: db::Tree, gc_todo: CountedTree) -> Self { let meter = global::meter(table_name); TableMetrics { _merkle_todo_len: meter .u64_value_observer( "table.merkle_updater_todo_queue_length", move |observer| { - observer.observe( - merkle_todo.len() as u64, - &[KeyValue::new("table_name", table_name)], - ) + if let Ok(v) = merkle_todo.len() { + observer.observe( + v as u64, + &[KeyValue::new("table_name", table_name)], + ); + } }, ) .with_description("Merkle tree updater TODO queue length") @@ -45,7 +44,7 @@ impl TableMetrics { observer.observe( gc_todo.len() as u64, &[KeyValue::new("table_name", table_name)], - ) + ); }, ) .with_description("Table garbage collector TODO queue length") diff --git a/src/table/schema.rs b/src/table/schema.rs index 37327037..74f57798 100644 --- a/src/table/schema.rs +++ b/src/table/schema.rs @@ -1,5 +1,6 @@ use serde::{Deserialize, Serialize}; +use garage_db as db; use garage_util::data::*; use crate::crdt::Crdt; @@ -82,11 +83,19 @@ pub trait TableSchema: Send + Sync { None } - // Updated triggers some stuff downstream, but it is not supposed to block or fail, - // as the update itself is an unchangeable fact that will never go back - // due to CRDT logic. Typically errors in propagation of info should be logged - // to stderr. - fn updated(&self, _old: Option<&Self::E>, _new: Option<&Self::E>) {} + /// Actions triggered by data changing in a table. If such actions + /// include updates to the local database that should be applied + /// atomically with the item update itself, a db transaction is + /// provided on which these changes should be done. + /// This function can return a DB error but that's all. + fn updated( + &self, + _tx: &mut db::Transaction, + _old: Option<&Self::E>, + _new: Option<&Self::E>, + ) -> db::TxOpResult<()> { + Ok(()) + } fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool; } diff --git a/src/table/sync.rs b/src/table/sync.rs index 08069ad0..4c83e991 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -258,9 +258,9 @@ where while !*must_exit.borrow() { let mut items = Vec::new(); - for item in self.data.store.range(begin.to_vec()..end.to_vec()) { + for item in self.data.store.range(begin.to_vec()..end.to_vec())? 
{ let (key, value) = item?; - items.push((key.to_vec(), Arc::new(ByteBuf::from(value.as_ref())))); + items.push((key.to_vec(), Arc::new(ByteBuf::from(value)))); if items.len() >= 1024 { break; @@ -603,8 +603,16 @@ impl SyncTodo { let retain = nodes.contains(&my_id); if !retain { // Check if we have some data to send, otherwise skip - if data.store.range(begin..end).next().is_none() { - continue; + match data.store.range(begin..end) { + Ok(mut iter) => { + if iter.next().is_none() { + continue; + } + } + Err(e) => { + warn!("DB error in add_full_sync: {}", e); + continue; + } } } diff --git a/src/table/table.rs b/src/table/table.rs index 2a167604..3c211728 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -13,6 +13,8 @@ use opentelemetry::{ Context, }; +use garage_db as db; + use garage_util::data::*; use garage_util::error::Error; use garage_util::metrics::RecordDuration; @@ -69,7 +71,7 @@ where { // =============== PUBLIC INTERFACE FUNCTIONS (new, insert, get, etc) =============== - pub fn new(instance: F, replication: R, system: Arc, db: &sled::Db) -> Arc { + pub fn new(instance: F, replication: R, system: Arc, db: &db::Db) -> Arc { let endpoint = system .netapp .endpoint(format!("garage_table/table.rs/Rpc:{}", F::TABLE_NAME)); diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index 95cde531..5d073436 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -14,6 +14,8 @@ path = "lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +garage_db = { version = "0.8.0", path = "../db" } + blake2 = "0.9" err-derive = "0.3" xxhash-rust = { version = "0.8", default-features = false, features = ["xxh3"] } @@ -22,8 +24,6 @@ tracing = "0.1.30" rand = "0.8" sha2 = "0.9" -sled = "0.34" - chrono = "0.4" rmp-serde = "0.15" serde = { version = "1.0", default-features = false, features = ["derive", "rc"] } diff --git a/src/util/config.rs b/src/util/config.rs index 99ebce31..e8ef4fdd 100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -64,14 +64,19 @@ pub struct Config { #[serde(default)] pub kubernetes_skip_crd: bool, + // -- DB + /// Database engine to use for metadata (options: sled, sqlite, lmdb) + #[serde(default = "default_db_engine")] + pub db_engine: String, + /// Sled cache size, in bytes #[serde(default = "default_sled_cache_capacity")] pub sled_cache_capacity: u64, - /// Sled flush interval in milliseconds #[serde(default = "default_sled_flush_every_ms")] pub sled_flush_every_ms: u64, + // -- APIs /// Configuration for S3 api pub s3_api: S3ApiConfig, @@ -129,6 +134,10 @@ pub struct AdminConfig { pub trace_sink: Option, } +fn default_db_engine() -> String { + "sled".into() +} + fn default_sled_cache_capacity() -> u64 { 128 * 1024 * 1024 } diff --git a/src/util/error.rs b/src/util/error.rs index 8734a0c8..9995c746 100644 --- a/src/util/error.rs +++ b/src/util/error.rs @@ -26,8 +26,8 @@ pub enum Error { #[error(display = "Netapp error: {}", _0)] Netapp(#[error(source)] netapp::error::Error), - #[error(display = "Sled error: {}", _0)] - Sled(#[error(source)] sled::Error), + #[error(display = "DB error: {}", _0)] + Db(#[error(source)] garage_db::Error), #[error(display = "Messagepack encode error: {}", _0)] RmpEncode(#[error(source)] rmp_serde::encode::Error), @@ -78,11 +78,11 @@ impl Error { } } -impl From> for Error { - fn from(e: sled::transaction::TransactionError) -> Error { +impl From> for Error { + fn from(e: garage_db::TxError) -> Error { match e { - sled::transaction::TransactionError::Abort(x) => 
x, - sled::transaction::TransactionError::Storage(x) => Error::Sled(x), + garage_db::TxError::Abort(x) => x, + garage_db::TxError::Db(x) => Error::Db(x), } } } diff --git a/src/util/lib.rs b/src/util/lib.rs index d8ffdd0b..8ca6e310 100644 --- a/src/util/lib.rs +++ b/src/util/lib.rs @@ -11,7 +11,7 @@ pub mod error; pub mod formater; pub mod metrics; pub mod persister; -pub mod sled_counter; +//pub mod sled_counter; pub mod time; pub mod token_bucket; pub mod tranquilizer; diff --git a/src/util/sled_counter.rs b/src/util/sled_counter.rs deleted file mode 100644 index bc54cea0..00000000 --- a/src/util/sled_counter.rs +++ /dev/null @@ -1,100 +0,0 @@ -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, -}; - -use sled::{CompareAndSwapError, IVec, Iter, Result, Tree}; - -#[derive(Clone)] -pub struct SledCountedTree(Arc); - -struct SledCountedTreeInternal { - tree: Tree, - len: AtomicUsize, -} - -impl SledCountedTree { - pub fn new(tree: Tree) -> Self { - let len = tree.len(); - Self(Arc::new(SledCountedTreeInternal { - tree, - len: AtomicUsize::new(len), - })) - } - - pub fn len(&self) -> usize { - self.0.len.load(Ordering::Relaxed) - } - - pub fn is_empty(&self) -> bool { - self.0.tree.is_empty() - } - - pub fn get>(&self, key: K) -> Result> { - self.0.tree.get(key) - } - - pub fn iter(&self) -> Iter { - self.0.tree.iter() - } - - // ---- writing functions ---- - - pub fn insert(&self, key: K, value: V) -> Result> - where - K: AsRef<[u8]>, - V: Into, - { - let res = self.0.tree.insert(key, value); - if res == Ok(None) { - self.0.len.fetch_add(1, Ordering::Relaxed); - } - res - } - - pub fn remove>(&self, key: K) -> Result> { - let res = self.0.tree.remove(key); - if matches!(res, Ok(Some(_))) { - self.0.len.fetch_sub(1, Ordering::Relaxed); - } - res - } - - pub fn pop_min(&self) -> Result> { - let res = self.0.tree.pop_min(); - if let Ok(Some(_)) = &res { - self.0.len.fetch_sub(1, Ordering::Relaxed); - }; - res - } - - pub fn compare_and_swap( - &self, - key: K, - old: Option, - new: Option, - ) -> Result> - where - K: AsRef<[u8]>, - OV: AsRef<[u8]>, - NV: Into, - { - let old_some = old.is_some(); - let new_some = new.is_some(); - - let res = self.0.tree.compare_and_swap(key, old, new); - - if res == Ok(Ok(())) { - match (old_some, new_some) { - (false, true) => { - self.0.len.fetch_add(1, Ordering::Relaxed); - } - (true, false) => { - self.0.len.fetch_sub(1, Ordering::Relaxed); - } - _ => (), - } - } - res - } -} -- cgit v1.2.3 From 138e13071be37d873344cd03e316c87ff8057ea0 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 9 Jun 2022 14:55:20 +0200 Subject: Fix garage_db build on 32-bit systems --- src/db/bin/convert.rs | 9 +-------- src/db/lmdb_adapter.rs | 14 ++++++++++++++ src/garage/server.rs | 7 +------ 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'src') diff --git a/src/db/bin/convert.rs b/src/db/bin/convert.rs index 9e45e61f..bbde2048 100644 --- a/src/db/bin/convert.rs +++ b/src/db/bin/convert.rs @@ -55,14 +55,7 @@ fn open_db(path: PathBuf, engine: String) -> Result { Error(format!("Unable to create LMDB data directory: {}", e).into()) })?; - let map_size = if u32::MAX as usize == usize::MAX { - eprintln!( - "LMDB is not recommended on 32-bit systems, database size will be limited" - ); - 1usize << 30 // 1GB for 32-bit systems - } else { - 1usize << 40 // 1TB for 64-bit systems - }; + let map_size = lmdb_adapter::recommended_map_size(); let db = lmdb_adapter::heed::EnvOpenOptions::new() .max_dbs(100) diff --git a/src/db/lmdb_adapter.rs 
b/src/db/lmdb_adapter.rs index 74622919..62fcc3e6 100644 --- a/src/db/lmdb_adapter.rs +++ b/src/db/lmdb_adapter.rs @@ -327,3 +327,17 @@ where } } } + +// ---- + +#[cfg(target_pointer_width = "64")] +pub fn recommended_map_size() -> usize { + 1usize << 40 +} + +#[cfg(target_pointer_width = "32")] +pub fn recommended_map_size() -> usize { + use log::warn; + warn!("LMDB is not recommended on 32-bit systems, database size will be limited"); + 1usize << 30 +} diff --git a/src/garage/server.rs b/src/garage/server.rs index 697d3358..7aa6185f 100644 --- a/src/garage/server.rs +++ b/src/garage/server.rs @@ -57,12 +57,7 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { db_path.push("db.lmdb"); info!("Opening LMDB database at: {}", db_path.display()); std::fs::create_dir_all(&db_path).expect("Unable to create LMDB data directory"); - let map_size = if u32::MAX as usize == usize::MAX { - warn!("LMDB is not recommended on 32-bit systems, database size will be limited"); - 1usize << 30 // 1GB for 32-bit systems - } else { - 1usize << 40 // 1TB for 64-bit systems - }; + let map_size = garage_db::lmdb_adapter::recommended_map_size(); let db = db::lmdb_adapter::heed::EnvOpenOptions::new() .max_dbs(100) -- cgit v1.2.3 From d544a0e0e03c9b69b226fb5bba2ce27a7af270ca Mon Sep 17 00:00:00 2001 From: Quentin Dufour Date: Mon, 13 Jun 2022 10:13:31 +0200 Subject: Send CORS headers for all requests --- src/api/s3/api_server.rs | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'src') diff --git a/src/api/s3/api_server.rs b/src/api/s3/api_server.rs index ecc417ab..d1d6288c 100644 --- a/src/api/s3/api_server.rs +++ b/src/api/s3/api_server.rs @@ -4,7 +4,7 @@ use async_trait::async_trait; use futures::future::Future; use hyper::header; -use hyper::{Body, Method, Request, Response}; +use hyper::{Body, Request, Response}; use opentelemetry::{trace::SpanRef, KeyValue}; @@ -167,14 +167,7 @@ impl ApiHandler for S3ApiServer { return Err(Error::forbidden("Operation is not allowed for this key.")); } - // Look up what CORS rule might apply to response. - // Requests for methods different than GET, HEAD or POST - // are always preflighted, i.e. the browser should make - // an OPTIONS call before to check it is allowed - let matching_cors_rule = match *req.method() { - Method::GET | Method::HEAD | Method::POST => find_matching_cors_rule(&bucket, &req)?, - _ => None, - }; + let matching_cors_rule = find_matching_cors_rule(&bucket, &req)?; let resp = match endpoint { Endpoint::HeadObject { -- cgit v1.2.3 From 77e3fd6db2c9cd3a10889bd071e95ef839cfbefc Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 15 Jun 2022 20:20:28 +0200 Subject: improve internal item counter mechanisms and implement bucket quotas (#326) - [x] Refactoring of internal counting API - [x] Repair procedure for counters (it's an offline procedure!!!) 
- [x] New counter for objects in buckets - [x] Add quotas to buckets struct - [x] Add CLI to manage bucket quotas - [x] Add admin API to manage bucket quotas - [x] Apply quotas by adding checks on put operations - [x] Proof-read Co-authored-by: Alex Auvolat Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/326 Co-authored-by: Alex Co-committed-by: Alex --- src/api/admin/api_server.rs | 7 +- src/api/admin/bucket.rs | 96 +++++++++++----- src/api/admin/router.rs | 8 +- src/api/k2v/index.rs | 2 +- src/api/s3/api_server.rs | 4 +- src/api/s3/post_object.rs | 15 ++- src/api/s3/put.rs | 129 ++++++++++++++++----- src/db/lib.rs | 6 + src/db/lmdb_adapter.rs | 8 ++ src/db/sled_adapter.rs | 6 + src/db/sqlite_adapter.rs | 10 ++ src/garage/Cargo.toml | 1 + src/garage/admin.rs | 82 +++++++++++++- src/garage/cli/cmd.rs | 8 +- src/garage/cli/structs.rs | 47 +++++++- src/garage/cli/util.rs | 47 +++++++- src/garage/main.rs | 18 +-- src/garage/repair.rs | 163 --------------------------- src/garage/repair/mod.rs | 2 + src/garage/repair/offline.rs | 55 +++++++++ src/garage/repair/online.rs | 163 +++++++++++++++++++++++++++ src/garage/server.rs | 50 +-------- src/model/bucket_table.rs | 19 +++- src/model/garage.rs | 67 +++++++++-- src/model/index_counter.rs | 250 +++++++++++++++++++++++++++++++++-------- src/model/k2v/counter_table.rs | 20 ---- src/model/k2v/item_table.rs | 102 +++++++++-------- src/model/k2v/mod.rs | 1 - src/model/migrate.rs | 1 + src/model/s3/object_table.rs | 61 +++++++++- 30 files changed, 1015 insertions(+), 433 deletions(-) delete mode 100644 src/garage/repair.rs create mode 100644 src/garage/repair/mod.rs create mode 100644 src/garage/repair/offline.rs create mode 100644 src/garage/repair/online.rs delete mode 100644 src/model/k2v/counter_table.rs (limited to 'src') diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs index 57e3e5cf..c3b16715 100644 --- a/src/api/admin/api_server.rs +++ b/src/api/admin/api_server.rs @@ -156,12 +156,7 @@ impl ApiHandler for AdminApiServer { } Endpoint::CreateBucket => handle_create_bucket(&self.garage, req).await, Endpoint::DeleteBucket { id } => handle_delete_bucket(&self.garage, id).await, - Endpoint::PutBucketWebsite { id } => { - handle_put_bucket_website(&self.garage, id, req).await - } - Endpoint::DeleteBucketWebsite { id } => { - handle_delete_bucket_website(&self.garage, id).await - } + Endpoint::UpdateBucket { id } => handle_update_bucket(&self.garage, id, req).await, // Bucket-key permissions Endpoint::BucketAllowKey => { handle_bucket_change_key_perm(&self.garage, req, true).await diff --git a/src/api/admin/bucket.rs b/src/api/admin/bucket.rs index 7f9a813f..ac8a8a40 100644 --- a/src/api/admin/bucket.rs +++ b/src/api/admin/bucket.rs @@ -14,6 +14,7 @@ use garage_model::bucket_alias_table::*; use garage_model::bucket_table::*; use garage_model::garage::Garage; use garage_model::permission::*; +use garage_model::s3::object_table::*; use crate::admin::error::*; use crate::admin::key::ApiBucketKeyPerm; @@ -77,6 +78,13 @@ struct BucketLocalAlias { alias: String, } +#[derive(Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +struct ApiBucketQuotas { + max_size: Option, + max_objects: Option, +} + pub async fn handle_get_bucket_info( garage: &Arc, id: Option, @@ -108,6 +116,14 @@ async fn bucket_info_results( .get_existing_bucket(bucket_id) .await?; + let counters = garage + .object_counter_table + .table + .get(&bucket_id, &EmptyKey) + .await? 
+ .map(|x| x.filtered_values(&garage.system.ring.borrow())) + .unwrap_or_default(); + let mut relevant_keys = HashMap::new(); for (k, _) in bucket .state @@ -148,6 +164,7 @@ async fn bucket_info_results( let state = bucket.state.as_option().unwrap(); + let quotas = state.quotas.get(); let res = GetBucketInfoResult { id: hex::encode(&bucket.id), @@ -191,6 +208,16 @@ async fn bucket_info_results( } }) .collect::>(), + objects: counters.get(OBJECTS).cloned().unwrap_or_default(), + bytes: counters.get(BYTES).cloned().unwrap_or_default(), + unfinshed_uploads: counters + .get(UNFINISHED_UPLOADS) + .cloned() + .unwrap_or_default(), + quotas: ApiBucketQuotas { + max_size: quotas.max_size, + max_objects: quotas.max_objects, + }, }; Ok(json_ok_response(&res)?) @@ -205,6 +232,10 @@ struct GetBucketInfoResult { #[serde(default)] website_config: Option, keys: Vec, + objects: i64, + bytes: i64, + unfinshed_uploads: i64, + quotas: ApiBucketQuotas, } #[derive(Serialize)] @@ -363,14 +394,12 @@ pub async fn handle_delete_bucket( .body(Body::empty())?) } -// ---- BUCKET WEBSITE CONFIGURATION ---- - -pub async fn handle_put_bucket_website( +pub async fn handle_update_bucket( garage: &Arc, id: String, req: Request, ) -> Result, Error> { - let req = parse_json_body::(req).await?; + let req = parse_json_body::(req).await?; let bucket_id = parse_bucket_id(&id)?; let mut bucket = garage @@ -379,10 +408,31 @@ pub async fn handle_put_bucket_website( .await?; let state = bucket.state.as_option_mut().unwrap(); - state.website_config.update(Some(WebsiteConfig { - index_document: req.index_document, - error_document: req.error_document, - })); + + if let Some(wa) = req.website_access { + if wa.enabled { + state.website_config.update(Some(WebsiteConfig { + index_document: wa.index_document.ok_or_bad_request( + "Please specify indexDocument when enabling website access.", + )?, + error_document: wa.error_document, + })); + } else { + if wa.index_document.is_some() || wa.error_document.is_some() { + return Err(Error::bad_request( + "Cannot specify indexDocument or errorDocument when disabling website access.", + )); + } + state.website_config.update(None); + } + } + + if let Some(q) = req.quotas { + state.quotas.update(BucketQuotas { + max_size: q.max_size, + max_objects: q.max_objects, + }); + } garage.bucket_table.insert(&bucket).await?; @@ -391,29 +441,17 @@ pub async fn handle_put_bucket_website( #[derive(Deserialize)] #[serde(rename_all = "camelCase")] -struct PutBucketWebsiteRequest { - index_document: String, - #[serde(default)] - error_document: Option, +struct UpdateBucketRequest { + website_access: Option, + quotas: Option, } -pub async fn handle_delete_bucket_website( - garage: &Arc, - id: String, -) -> Result, Error> { - let bucket_id = parse_bucket_id(&id)?; - - let mut bucket = garage - .bucket_helper() - .get_existing_bucket(bucket_id) - .await?; - - let state = bucket.state.as_option_mut().unwrap(); - state.website_config.update(None); - - garage.bucket_table.insert(&bucket).await?; - - bucket_info_results(garage, bucket_id).await +#[derive(Deserialize)] +#[serde(rename_all = "camelCase")] +struct UpdateBucketWebsiteAccess { + enabled: bool, + index_document: Option, + error_document: Option, } // ---- BUCKET/KEY PERMISSIONS ---- diff --git a/src/api/admin/router.rs b/src/api/admin/router.rs index 93639873..3eee8b67 100644 --- a/src/api/admin/router.rs +++ b/src/api/admin/router.rs @@ -48,10 +48,7 @@ pub enum Endpoint { DeleteBucket { id: String, }, - PutBucketWebsite { - id: String, - }, - 
DeleteBucketWebsite { + UpdateBucket { id: String, }, // Bucket-Key Permissions @@ -113,8 +110,7 @@ impl Endpoint { GET "/v0/bucket" => ListBuckets, POST "/v0/bucket" => CreateBucket, DELETE "/v0/bucket" if id => DeleteBucket (query::id), - PUT "/v0/bucket/website" if id => PutBucketWebsite (query::id), - DELETE "/v0/bucket/website" if id => DeleteBucketWebsite (query::id), + PUT "/v0/bucket" if id => UpdateBucket (query::id), // Bucket-key permissions POST "/v0/bucket/allow" => BucketAllowKey, POST "/v0/bucket/deny" => BucketDenyKey, diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs index d5db906d..210950bf 100644 --- a/src/api/k2v/index.rs +++ b/src/api/k2v/index.rs @@ -10,7 +10,7 @@ use garage_rpc::ring::Ring; use garage_table::util::*; use garage_model::garage::Garage; -use garage_model::k2v::counter_table::{BYTES, CONFLICTS, ENTRIES, VALUES}; +use garage_model::k2v::item_table::{BYTES, CONFLICTS, ENTRIES, VALUES}; use crate::k2v::error::*; use crate::k2v::range::read_range; diff --git a/src/api/s3/api_server.rs b/src/api/s3/api_server.rs index d1d6288c..78dfeeac 100644 --- a/src/api/s3/api_server.rs +++ b/src/api/s3/api_server.rs @@ -212,7 +212,7 @@ impl ApiHandler for S3ApiServer { .await } Endpoint::PutObject { key } => { - handle_put(garage, req, bucket_id, &key, content_sha256).await + handle_put(garage, req, &bucket, &key, content_sha256).await } Endpoint::AbortMultipartUpload { key, upload_id } => { handle_abort_multipart_upload(garage, bucket_id, &key, &upload_id).await @@ -226,7 +226,7 @@ impl ApiHandler for S3ApiServer { garage, req, &bucket_name, - bucket_id, + &bucket, &key, &upload_id, content_sha256, diff --git a/src/api/s3/post_object.rs b/src/api/s3/post_object.rs index dc640f43..d063faa4 100644 --- a/src/api/s3/post_object.rs +++ b/src/api/s3/post_object.rs @@ -22,7 +22,7 @@ use crate::signature::payload::{parse_date, verify_v4}; pub async fn handle_post_object( garage: Arc, req: Request, - bucket: String, + bucket_name: String, ) -> Result, Error> { let boundary = req .headers() @@ -126,13 +126,18 @@ pub async fn handle_post_object( let bucket_id = garage .bucket_helper() - .resolve_bucket(&bucket, &api_key) + .resolve_bucket(&bucket_name, &api_key) .await?; if !api_key.allow_write(&bucket_id) { return Err(Error::forbidden("Operation is not allowed for this key.")); } + let bucket = garage + .bucket_helper() + .get_existing_bucket(bucket_id) + .await?; + let decoded_policy = base64::decode(&policy).ok_or_bad_request("Invalid policy")?; let decoded_policy: Policy = serde_json::from_slice(&decoded_policy).ok_or_bad_request("Invalid policy")?; @@ -227,7 +232,7 @@ pub async fn handle_post_object( garage, headers, StreamLimiter::new(stream, conditions.content_length), - bucket_id, + &bucket, &key, None, None, @@ -244,7 +249,7 @@ pub async fn handle_post_object( { target .query_pairs_mut() - .append_pair("bucket", &bucket) + .append_pair("bucket", &bucket_name) .append_pair("key", &key) .append_pair("etag", &etag); let target = target.to_string(); @@ -289,7 +294,7 @@ pub async fn handle_post_object( let xml = s3_xml::PostObject { xmlns: (), location: s3_xml::Value(location), - bucket: s3_xml::Value(bucket), + bucket: s3_xml::Value(bucket_name), key: s3_xml::Value(key), etag: s3_xml::Value(etag), }; diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index 8b06ef3f..9ef37421 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, BTreeSet, VecDeque}; +use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque}; use 
std::sync::Arc; use futures::prelude::*; @@ -14,7 +14,9 @@ use garage_util::error::Error as GarageError; use garage_util::time::*; use garage_block::manager::INLINE_THRESHOLD; +use garage_model::bucket_table::Bucket; use garage_model::garage::Garage; +use garage_model::index_counter::CountedItem; use garage_model::s3::block_ref_table::*; use garage_model::s3::object_table::*; use garage_model::s3::version_table::*; @@ -26,7 +28,7 @@ use crate::signature::verify_signed_content; pub async fn handle_put( garage: Arc, req: Request, - bucket_id: Uuid, + bucket: &Bucket, key: &str, content_sha256: Option, ) -> Result, Error> { @@ -46,7 +48,7 @@ pub async fn handle_put( garage, headers, body, - bucket_id, + bucket, key, content_md5, content_sha256, @@ -59,7 +61,7 @@ pub(crate) async fn save_stream> + Unpin>( garage: Arc, headers: ObjectVersionHeaders, body: S, - bucket_id: Uuid, + bucket: &Bucket, key: &str, content_md5: Option, content_sha256: Option, @@ -80,6 +82,7 @@ pub(crate) async fn save_stream> + Unpin>( let data_md5sum_hex = hex::encode(data_md5sum); let data_sha256sum = sha256sum(&first_block[..]); + let size = first_block.len() as u64; ensure_checksum_matches( data_md5sum.as_slice(), @@ -88,20 +91,22 @@ pub(crate) async fn save_stream> + Unpin>( content_sha256, )?; + check_quotas(&garage, bucket, key, size).await?; + let object_version = ObjectVersion { uuid: version_uuid, timestamp: version_timestamp, state: ObjectVersionState::Complete(ObjectVersionData::Inline( ObjectVersionMeta { headers, - size: first_block.len() as u64, + size, etag: data_md5sum_hex.clone(), }, first_block, )), }; - let object = Object::new(bucket_id, key.into(), vec![object_version]); + let object = Object::new(bucket.id, key.into(), vec![object_version]); garage.object_table.insert(&object).await?; return Ok((version_uuid, data_md5sum_hex)); @@ -114,36 +119,42 @@ pub(crate) async fn save_stream> + Unpin>( timestamp: version_timestamp, state: ObjectVersionState::Uploading(headers.clone()), }; - let object = Object::new(bucket_id, key.into(), vec![object_version.clone()]); + let object = Object::new(bucket.id, key.into(), vec![object_version.clone()]); garage.object_table.insert(&object).await?; // Initialize corresponding entry in version table // Write this entry now, even with empty block list, // to prevent block_ref entries from being deleted (they can be deleted // if the reference a version that isn't found in the version table) - let version = Version::new(version_uuid, bucket_id, key.into(), false); + let version = Version::new(version_uuid, bucket.id, key.into(), false); garage.version_table.insert(&version).await?; // Transfer data and verify checksum let first_block_hash = blake2sum(&first_block[..]); - let tx_result = read_and_put_blocks( - &garage, - &version, - 1, - first_block, - first_block_hash, - &mut chunker, - ) - .await - .and_then(|(total_size, data_md5sum, data_sha256sum)| { + + let tx_result = (|| async { + let (total_size, data_md5sum, data_sha256sum) = read_and_put_blocks( + &garage, + &version, + 1, + first_block, + first_block_hash, + &mut chunker, + ) + .await?; + ensure_checksum_matches( data_md5sum.as_slice(), data_sha256sum, content_md5.as_deref(), content_sha256, - ) - .map(|()| (total_size, data_md5sum)) - }); + )?; + + check_quotas(&garage, bucket, key, total_size).await?; + + Ok((total_size, data_md5sum)) + })() + .await; // If something went wrong, clean up let (total_size, md5sum_arr) = match tx_result { @@ -151,7 +162,7 @@ pub(crate) async fn save_stream> + Unpin>( Err(e) 
=> { // Mark object as aborted, this will free the blocks further down object_version.state = ObjectVersionState::Aborted; - let object = Object::new(bucket_id, key.into(), vec![object_version.clone()]); + let object = Object::new(bucket.id, key.into(), vec![object_version.clone()]); garage.object_table.insert(&object).await?; return Err(e); } @@ -167,7 +178,7 @@ pub(crate) async fn save_stream> + Unpin>( }, first_block_hash, )); - let object = Object::new(bucket_id, key.into(), vec![object_version]); + let object = Object::new(bucket.id, key.into(), vec![object_version]); garage.object_table.insert(&object).await?; Ok((version_uuid, md5sum_hex)) @@ -200,6 +211,64 @@ fn ensure_checksum_matches( Ok(()) } +/// Check that inserting this object with this size doesn't exceed bucket quotas +async fn check_quotas( + garage: &Arc, + bucket: &Bucket, + key: &str, + size: u64, +) -> Result<(), Error> { + let quotas = bucket.state.as_option().unwrap().quotas.get(); + if quotas.max_objects.is_none() && quotas.max_size.is_none() { + return Ok(()); + }; + + let key = key.to_string(); + let (prev_object, counters) = futures::try_join!( + garage.object_table.get(&bucket.id, &key), + garage.object_counter_table.table.get(&bucket.id, &EmptyKey), + )?; + + let counters = counters + .map(|x| x.filtered_values(&garage.system.ring.borrow())) + .unwrap_or_default(); + + let (prev_cnt_obj, prev_cnt_size) = match prev_object { + Some(o) => { + let prev_cnt = o.counts().into_iter().collect::>(); + ( + prev_cnt.get(OBJECTS).cloned().unwrap_or_default(), + prev_cnt.get(BYTES).cloned().unwrap_or_default(), + ) + } + None => (0, 0), + }; + let cnt_obj_diff = 1 - prev_cnt_obj; + let cnt_size_diff = size as i64 - prev_cnt_size; + + if let Some(mo) = quotas.max_objects { + let current_objects = counters.get(OBJECTS).cloned().unwrap_or_default(); + if cnt_obj_diff > 0 && current_objects + cnt_obj_diff > mo as i64 { + return Err(Error::forbidden(format!( + "Object quota is reached, maximum objects for this bucket: {}", + mo + ))); + } + } + + if let Some(ms) = quotas.max_size { + let current_size = counters.get(BYTES).cloned().unwrap_or_default(); + if cnt_size_diff > 0 && current_size + cnt_size_diff > ms as i64 { + return Err(Error::forbidden(format!( + "Bucket size quota is reached, maximum total size of objects for this bucket: {}. 
The bucket is already {} bytes, and this object would add {} bytes.", + ms, current_size, size + ))); + } + } + + Ok(()) +} + async fn read_and_put_blocks> + Unpin>( garage: &Garage, version: &Version, @@ -473,7 +542,7 @@ pub async fn handle_complete_multipart_upload( garage: Arc, req: Request, bucket_name: &str, - bucket_id: Uuid, + bucket: &Bucket, key: &str, upload_id: &str, content_sha256: Option, @@ -497,7 +566,7 @@ pub async fn handle_complete_multipart_upload( // Get object and version let key = key.to_string(); let (object, version) = futures::try_join!( - garage.object_table.get(&bucket_id, &key), + garage.object_table.get(&bucket.id, &key), garage.version_table.get(&version_uuid, &EmptyKey), )?; @@ -590,6 +659,14 @@ pub async fn handle_complete_multipart_upload( // Calculate total size of final object let total_size = version.blocks.items().iter().map(|x| x.1.size).sum(); + if let Err(e) = check_quotas(&garage, bucket, &key, total_size).await { + object_version.state = ObjectVersionState::Aborted; + let final_object = Object::new(bucket.id, key.clone(), vec![object_version]); + garage.object_table.insert(&final_object).await?; + + return Err(e); + } + // Write final object version object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock( ObjectVersionMeta { @@ -600,7 +677,7 @@ pub async fn handle_complete_multipart_upload( version.blocks.items()[0].1.hash, )); - let final_object = Object::new(bucket_id, key.clone(), vec![object_version]); + let final_object = Object::new(bucket.id, key.clone(), vec![object_version]); garage.object_table.insert(&final_object).await?; // Send response saying ok we're done diff --git a/src/db/lib.rs b/src/db/lib.rs index e9d3ea18..8188c715 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -197,6 +197,11 @@ impl Tree { pub fn remove>(&self, key: T) -> Result> { self.0.remove(self.1, key.as_ref()) } + /// Clears all values from the tree + #[inline] + pub fn clear(&self) -> Result<()> { + self.0.clear(self.1) + } #[inline] pub fn iter(&self) -> Result> { @@ -311,6 +316,7 @@ pub(crate) trait IDb: Send + Sync { fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result>; fn remove(&self, tree: usize, key: &[u8]) -> Result>; + fn clear(&self, tree: usize) -> Result<()>; fn iter(&self, tree: usize) -> Result>; fn iter_rev(&self, tree: usize) -> Result>; diff --git a/src/db/lmdb_adapter.rs b/src/db/lmdb_adapter.rs index 62fcc3e6..fdb254c6 100644 --- a/src/db/lmdb_adapter.rs +++ b/src/db/lmdb_adapter.rs @@ -139,6 +139,14 @@ impl IDb for LmdbDb { Ok(old_val) } + fn clear(&self, tree: usize) -> Result<()> { + let tree = self.get_tree(tree)?; + let mut tx = self.db.write_txn()?; + tree.clear(&mut tx)?; + tx.commit()?; + Ok(()) + } + fn iter(&self, tree: usize) -> Result> { let tree = self.get_tree(tree)?; let tx = self.db.read_txn()?; diff --git a/src/db/sled_adapter.rs b/src/db/sled_adapter.rs index 982f8d82..cf61867d 100644 --- a/src/db/sled_adapter.rs +++ b/src/db/sled_adapter.rs @@ -113,6 +113,12 @@ impl IDb for SledDb { Ok(old_val.map(|x| x.to_vec())) } + fn clear(&self, tree: usize) -> Result<()> { + let tree = self.get_tree(tree)?; + tree.clear()?; + Ok(()) + } + fn iter(&self, tree: usize) -> Result> { let tree = self.get_tree(tree)?; Ok(Box::new(tree.iter().map(|v| { diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs index 14bf35ff..68d96ca0 100644 --- a/src/db/sqlite_adapter.rs +++ b/src/db/sqlite_adapter.rs @@ -182,6 +182,16 @@ impl IDb for SqliteDb { Ok(old_val) } + fn clear(&self, tree: usize) -> 
Result<()> { + trace!("clear {}: lock db", tree); + let this = self.0.lock().unwrap(); + trace!("clear {}: lock acquired", tree); + + let tree = this.get_tree(tree)?; + this.db.execute(&format!("DELETE FROM {}", tree), [])?; + Ok(()) + } + fn iter(&self, tree: usize) -> Result> { trace!("iter {}: lock db", tree); let this = self.0.lock().unwrap(); diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index eb643160..640e6975 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -30,6 +30,7 @@ garage_util = { version = "0.7.0", path = "../util" } garage_web = { version = "0.7.0", path = "../web" } bytes = "1.0" +bytesize = "1.1" hex = "0.4" tracing = { version = "0.1.30", features = ["log-always"] } pretty_env_logger = "0.4" diff --git a/src/garage/admin.rs b/src/garage/admin.rs index c662aa00..48914655 100644 --- a/src/garage/admin.rs +++ b/src/garage/admin.rs @@ -24,11 +24,12 @@ use garage_model::migrate::Migrate; use garage_model::permission::*; use crate::cli::*; -use crate::repair::Repair; +use crate::repair::online::OnlineRepair; pub const ADMIN_RPC_PATH: &str = "garage/admin_rpc.rs/Rpc"; #[derive(Debug, Serialize, Deserialize)] +#[allow(clippy::large_enum_variant)] pub enum AdminRpc { BucketOperation(BucketOperation), KeyOperation(KeyOperation), @@ -39,7 +40,11 @@ pub enum AdminRpc { // Replies Ok(String), BucketList(Vec), - BucketInfo(Bucket, HashMap), + BucketInfo { + bucket: Bucket, + relevant_keys: HashMap, + counters: HashMap, + }, KeyList(Vec<(String, String)>), KeyInfo(Key, HashMap), } @@ -72,6 +77,7 @@ impl AdminRpcHandler { BucketOperation::Allow(query) => self.handle_bucket_allow(query).await, BucketOperation::Deny(query) => self.handle_bucket_deny(query).await, BucketOperation::Website(query) => self.handle_bucket_website(query).await, + BucketOperation::SetQuotas(query) => self.handle_bucket_set_quotas(query).await, } } @@ -87,6 +93,7 @@ impl AdminRpcHandler { EnumerationOrder::Forward, ) .await?; + Ok(AdminRpc::BucketList(buckets)) } @@ -104,6 +111,15 @@ impl AdminRpcHandler { .get_existing_bucket(bucket_id) .await?; + let counters = self + .garage + .object_counter_table + .table + .get(&bucket_id, &EmptyKey) + .await? + .map(|x| x.filtered_values(&self.garage.system.ring.borrow())) + .unwrap_or_default(); + let mut relevant_keys = HashMap::new(); for (k, _) in bucket .state @@ -139,7 +155,11 @@ impl AdminRpcHandler { } } - Ok(AdminRpc::BucketInfo(bucket, relevant_keys)) + Ok(AdminRpc::BucketInfo { + bucket, + relevant_keys, + counters, + }) } #[allow(clippy::ptr_arg)] @@ -431,6 +451,60 @@ impl AdminRpcHandler { Ok(AdminRpc::Ok(msg)) } + async fn handle_bucket_set_quotas(&self, query: &SetQuotasOpt) -> Result { + let bucket_id = self + .garage + .bucket_helper() + .resolve_global_bucket_name(&query.bucket) + .await? 
+ .ok_or_bad_request("Bucket not found")?; + + let mut bucket = self + .garage + .bucket_helper() + .get_existing_bucket(bucket_id) + .await?; + let bucket_state = bucket.state.as_option_mut().unwrap(); + + if query.max_size.is_none() && query.max_objects.is_none() { + return Err(Error::BadRequest( + "You must specify either --max-size or --max-objects (or both) for this command to do something.".to_string(), + )); + } + + let mut quotas = bucket_state.quotas.get().clone(); + + match query.max_size.as_ref().map(String::as_ref) { + Some("none") => quotas.max_size = None, + Some(v) => { + let bs = v + .parse::() + .ok_or_bad_request(format!("Invalid size specified: {}", v))?; + quotas.max_size = Some(bs.as_u64()); + } + _ => (), + } + + match query.max_objects.as_ref().map(String::as_ref) { + Some("none") => quotas.max_objects = None, + Some(v) => { + let mo = v + .parse::() + .ok_or_bad_request(format!("Invalid number specified: {}", v))?; + quotas.max_objects = Some(mo); + } + _ => (), + } + + bucket_state.quotas.update(quotas); + self.garage.bucket_table.insert(&bucket).await?; + + Ok(AdminRpc::Ok(format!( + "Quotas updated for {}", + &query.bucket + ))) + } + async fn handle_key_cmd(&self, cmd: &KeyOperation) -> Result { match cmd { KeyOperation::List => self.handle_list_keys().await, @@ -619,7 +693,7 @@ impl AdminRpcHandler { ))) } } else { - let repair = Repair { + let repair = OnlineRepair { garage: self.garage.clone(), }; self.garage diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index b2dd8f14..3a0bd956 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -169,8 +169,12 @@ pub async fn cmd_admin( AdminRpc::BucketList(bl) => { print_bucket_list(bl); } - AdminRpc::BucketInfo(bucket, rk) => { - print_bucket_info(&bucket, &rk); + AdminRpc::BucketInfo { + bucket, + relevant_keys, + counters, + } => { + print_bucket_info(&bucket, &relevant_keys, &counters); } AdminRpc::KeyList(kl) => { print_key_list(kl); diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index a0c49aeb..4f2efe19 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -33,10 +33,15 @@ pub enum Command { #[structopt(name = "migrate")] Migrate(MigrateOpt), - /// Start repair of node data + /// Start repair of node data on remote node #[structopt(name = "repair")] Repair(RepairOpt), + /// Offline reparation of node data (these repairs must be run offline + /// directly on the server node) + #[structopt(name = "offline-repair")] + OfflineRepair(OfflineRepairOpt), + /// Gather node statistics #[structopt(name = "stats")] Stats(StatsOpt), @@ -175,6 +180,10 @@ pub enum BucketOperation { /// Expose as website or not #[structopt(name = "website")] Website(WebsiteOpt), + + /// Set the quotas for this bucket + #[structopt(name = "set-quotas")] + SetQuotas(SetQuotasOpt), } #[derive(Serialize, Deserialize, StructOpt, Debug)] @@ -261,6 +270,21 @@ pub struct PermBucketOpt { pub bucket: String, } +#[derive(Serialize, Deserialize, StructOpt, Debug)] +pub struct SetQuotasOpt { + /// Bucket name + pub bucket: String, + + /// Set a maximum size for the bucket (specify a size e.g. 
in MiB or GiB, + /// or `none` for no size restriction) + #[structopt(long = "max-size")] + pub max_size: Option, + + /// Set a maximum number of objects for the bucket (or `none` for no restriction) + #[structopt(long = "max-objects")] + pub max_objects: Option, +} + #[derive(Serialize, Deserialize, StructOpt, Debug)] pub enum KeyOperation { /// List keys @@ -405,6 +429,27 @@ pub enum RepairWhat { }, } +#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)] +pub struct OfflineRepairOpt { + /// Confirm the launch of the repair operation + #[structopt(long = "yes")] + pub yes: bool, + + #[structopt(subcommand)] + pub what: OfflineRepairWhat, +} + +#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] +pub enum OfflineRepairWhat { + /// Repair K2V item counters + #[cfg(feature = "k2v")] + #[structopt(name = "k2v_item_counters")] + K2VItemCounters, + /// Repair object counters + #[structopt(name = "object_counters")] + ObjectCounters, +} + #[derive(Serialize, Deserialize, StructOpt, Debug, Clone)] pub struct StatsOpt { /// Gather statistics from all nodes diff --git a/src/garage/cli/util.rs b/src/garage/cli/util.rs index 6d73be3a..329e8a3e 100644 --- a/src/garage/cli/util.rs +++ b/src/garage/cli/util.rs @@ -7,6 +7,7 @@ use garage_util::formater::format_table; use garage_model::bucket_table::*; use garage_model::key_table::*; +use garage_model::s3::object_table::{BYTES, OBJECTS, UNFINISHED_UPLOADS}; pub fn print_bucket_list(bl: Vec) { println!("List of buckets:"); @@ -29,11 +30,12 @@ pub fn print_bucket_list(bl: Vec) { [((k, n), _, _)] => format!("{}:{}", k, n), s => format!("[{} local aliases]", s.len()), }; + table.push(format!( "\t{}\t{}\t{}", aliases.join(","), local_aliases_n, - hex::encode(bucket.id) + hex::encode(bucket.id), )); } format_table(table); @@ -121,7 +123,11 @@ pub fn print_key_info(key: &Key, relevant_buckets: &HashMap) { } } -pub fn print_bucket_info(bucket: &Bucket, relevant_keys: &HashMap) { +pub fn print_bucket_info( + bucket: &Bucket, + relevant_keys: &HashMap, + counters: &HashMap, +) { let key_name = |k| { relevant_keys .get(k) @@ -133,7 +139,42 @@ pub fn print_bucket_info(bucket: &Bucket, relevant_keys: &HashMap) match &bucket.state { Deletable::Deleted => println!("Bucket is deleted."), Deletable::Present(p) => { - println!("Website access: {}", p.website_config.get().is_some()); + let size = + bytesize::ByteSize::b(counters.get(BYTES).cloned().unwrap_or_default() as u64); + println!( + "\nSize: {} ({})", + size.to_string_as(true), + size.to_string_as(false) + ); + println!( + "Objects: {}", + counters.get(OBJECTS).cloned().unwrap_or_default() + ); + println!( + "Unfinished multipart uploads: {}", + counters + .get(UNFINISHED_UPLOADS) + .cloned() + .unwrap_or_default() + ); + + println!("\nWebsite access: {}", p.website_config.get().is_some()); + + let quotas = p.quotas.get(); + if quotas.max_size.is_some() || quotas.max_objects.is_some() { + println!("\nQuotas:"); + if let Some(ms) = quotas.max_size { + let ms = bytesize::ByteSize::b(ms); + println!( + " maximum size: {} ({})", + ms.to_string_as(true), + ms.to_string_as(false) + ); + } + if let Some(mo) = quotas.max_objects { + println!(" maximum number of objects: {}", mo); + } + } println!("\nGlobal aliases:"); for (alias, _, active) in p.aliases.items().iter() { diff --git a/src/garage/main.rs b/src/garage/main.rs index bd09b6ea..3fa5c3c0 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -61,17 +61,17 @@ async fn main() { pretty_env_logger::init(); 
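For reference, the new quota arguments above are plain strings: `none` clears the limit, anything else is parsed by the `bytesize` crate (added to the garage Cargo.toml in this patch), and the same crate pretty-prints the totals in `bucket info`. A rough, self-contained sketch of that round-trip, assuming bytesize 1.1's `FromStr`, `as_u64` and `to_string_as` APIs (the helper names are invented for illustration; the real logic lives in `handle_bucket_set_quotas` and `print_bucket_info`):

    fn parse_max_size(arg: &str) -> Result<Option<u64>, String> {
        if arg == "none" {
            return Ok(None); // "none" lifts the size restriction
        }
        let bs: bytesize::ByteSize = arg
            .parse()
            .map_err(|e| format!("Invalid size specified: {} ({})", arg, e))?;
        Ok(Some(bs.as_u64())) // quotas are stored as a raw byte count
    }

    fn display_size(bytes: u64) -> String {
        let s = bytesize::ByteSize::b(bytes);
        // print both binary (MiB/GiB) and decimal (MB/GB) renderings, as `bucket info` does
        format!("{} ({})", s.to_string_as(true), s.to_string_as(false))
    }
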
sodiumoxide::init().expect("Unable to init sodiumoxide"); - let opt = Opt::from_args(); + // Abort on panic (same behavior as in Go) + std::panic::set_hook(Box::new(|panic_info| { + error!("{}", panic_info.to_string()); + std::process::abort(); + })); + let opt = Opt::from_args(); let res = match opt.cmd { - Command::Server => { - // Abort on panic (same behavior as in Go) - std::panic::set_hook(Box::new(|panic_info| { - error!("{}", panic_info.to_string()); - std::process::abort(); - })); - - server::run_server(opt.config_file).await + Command::Server => server::run_server(opt.config_file).await, + Command::OfflineRepair(repair_opt) => { + repair::offline::offline_repair(opt.config_file, repair_opt).await } Command::Node(NodeOperation::NodeId(node_id_opt)) => { node_id_command(opt.config_file, node_id_opt.quiet) diff --git a/src/garage/repair.rs b/src/garage/repair.rs deleted file mode 100644 index 17e14b8b..00000000 --- a/src/garage/repair.rs +++ /dev/null @@ -1,163 +0,0 @@ -use std::sync::Arc; - -use tokio::sync::watch; - -use garage_model::garage::Garage; -use garage_model::s3::block_ref_table::*; -use garage_model::s3::object_table::*; -use garage_model::s3::version_table::*; -use garage_table::*; -use garage_util::error::Error; - -use crate::*; - -pub struct Repair { - pub garage: Arc, -} - -impl Repair { - pub async fn repair_worker(&self, opt: RepairOpt, must_exit: watch::Receiver) { - if let Err(e) = self.repair_worker_aux(opt, must_exit).await { - warn!("Repair worker failed with error: {}", e); - } - } - - async fn repair_worker_aux( - &self, - opt: RepairOpt, - must_exit: watch::Receiver, - ) -> Result<(), Error> { - match opt.what { - RepairWhat::Tables => { - info!("Launching a full sync of tables"); - self.garage.bucket_table.syncer.add_full_sync(); - self.garage.object_table.syncer.add_full_sync(); - self.garage.version_table.syncer.add_full_sync(); - self.garage.block_ref_table.syncer.add_full_sync(); - self.garage.key_table.syncer.add_full_sync(); - } - RepairWhat::Versions => { - info!("Repairing the versions table"); - self.repair_versions(&must_exit).await?; - } - RepairWhat::BlockRefs => { - info!("Repairing the block refs table"); - self.repair_block_ref(&must_exit).await?; - } - RepairWhat::Blocks => { - info!("Repairing the stored blocks"); - self.garage - .block_manager - .repair_data_store(&must_exit) - .await?; - } - RepairWhat::Scrub { tranquility } => { - info!("Verifying integrity of stored blocks"); - self.garage - .block_manager - .scrub_data_store(&must_exit, tranquility) - .await?; - } - } - Ok(()) - } - - async fn repair_versions(&self, must_exit: &watch::Receiver) -> Result<(), Error> { - let mut pos = vec![]; - let mut i = 0; - - while !*must_exit.borrow() { - let item_bytes = match self.garage.version_table.data.store.get_gt(pos)? 
{ - Some((k, v)) => { - pos = k; - v - } - None => break, - }; - - i += 1; - if i % 1000 == 0 { - info!("repair_versions: {}", i); - } - - let version = rmp_serde::decode::from_read_ref::<_, Version>(&item_bytes)?; - if version.deleted.get() { - continue; - } - let object = self - .garage - .object_table - .get(&version.bucket_id, &version.key) - .await?; - let version_exists = match object { - Some(o) => o - .versions() - .iter() - .any(|x| x.uuid == version.uuid && x.state != ObjectVersionState::Aborted), - None => false, - }; - if !version_exists { - info!("Repair versions: marking version as deleted: {:?}", version); - self.garage - .version_table - .insert(&Version::new( - version.uuid, - version.bucket_id, - version.key, - true, - )) - .await?; - } - } - info!("repair_versions: finished, done {}", i); - Ok(()) - } - - async fn repair_block_ref(&self, must_exit: &watch::Receiver) -> Result<(), Error> { - let mut pos = vec![]; - let mut i = 0; - - while !*must_exit.borrow() { - let item_bytes = match self.garage.block_ref_table.data.store.get_gt(pos)? { - Some((k, v)) => { - pos = k; - v - } - None => break, - }; - - i += 1; - if i % 1000 == 0 { - info!("repair_block_ref: {}", i); - } - - let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(&item_bytes)?; - if block_ref.deleted.get() { - continue; - } - let version = self - .garage - .version_table - .get(&block_ref.version, &EmptyKey) - .await?; - // The version might not exist if it has been GC'ed - let ref_exists = version.map(|v| !v.deleted.get()).unwrap_or(false); - if !ref_exists { - info!( - "Repair block ref: marking block_ref as deleted: {:?}", - block_ref - ); - self.garage - .block_ref_table - .insert(&BlockRef { - block: block_ref.block, - version: block_ref.version, - deleted: true.into(), - }) - .await?; - } - } - info!("repair_block_ref: finished, done {}", i); - Ok(()) - } -} diff --git a/src/garage/repair/mod.rs b/src/garage/repair/mod.rs new file mode 100644 index 00000000..4699ace5 --- /dev/null +++ b/src/garage/repair/mod.rs @@ -0,0 +1,2 @@ +pub mod offline; +pub mod online; diff --git a/src/garage/repair/offline.rs b/src/garage/repair/offline.rs new file mode 100644 index 00000000..7760a8bd --- /dev/null +++ b/src/garage/repair/offline.rs @@ -0,0 +1,55 @@ +use std::path::PathBuf; + +use tokio::sync::watch; + +use garage_util::background::*; +use garage_util::config::*; +use garage_util::error::*; + +use garage_model::garage::Garage; + +use crate::cli::structs::*; + +pub async fn offline_repair(config_file: PathBuf, opt: OfflineRepairOpt) -> Result<(), Error> { + if !opt.yes { + return Err(Error::Message( + "Please add the --yes flag to launch repair operation".into(), + )); + } + + info!("Loading configuration..."); + let config = read_config(config_file)?; + + info!("Initializing background runner..."); + let (done_tx, done_rx) = watch::channel(false); + let (background, await_background_done) = BackgroundRunner::new(16, done_rx); + + info!("Initializing Garage main data store..."); + let garage = Garage::new(config.clone(), background)?; + + info!("Launching repair operation..."); + match opt.what { + #[cfg(feature = "k2v")] + OfflineRepairWhat::K2VItemCounters => { + garage + .k2v + .counter_table + .offline_recount_all(&garage.k2v.item_table)?; + } + OfflineRepairWhat::ObjectCounters => { + garage + .object_counter_table + .offline_recount_all(&garage.object_table)?; + } + } + + info!("Repair operation finished, shutting down Garage internals..."); + done_tx.send(true).unwrap(); + drop(garage); + + 
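The offline repair entry point above reuses the regular background-runner plumbing: a `tokio::sync::watch` channel carries the "done" flag that background workers watch, the main task flips it once the recount has finished, and only then is the runner awaited. A minimal, self-contained sketch of that shutdown handshake (the worker body and the sleep-based polling are illustrative assumptions, not the actual `BackgroundRunner` internals):

    use std::time::Duration;
    use tokio::sync::watch;

    async fn run_then_shut_down() {
        let (done_tx, mut done_rx) = watch::channel(false);

        let worker = tokio::spawn(async move {
            while !*done_rx.borrow() {
                // ... perform one unit of background work ...
                tokio::select! {
                    _ = tokio::time::sleep(Duration::from_millis(100)) => (),
                    _ = done_rx.changed() => (), // wake up early when shutdown is signalled
                }
            }
        });

        // ... the foreground job (here: the offline recount) runs to completion ...

        done_tx.send(true).unwrap(); // signal the workers
        worker.await.unwrap();       // and wait for them to terminate cleanly
    }
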
await_background_done.await?; + + info!("Cleaning up..."); + + Ok(()) +} diff --git a/src/garage/repair/online.rs b/src/garage/repair/online.rs new file mode 100644 index 00000000..d6a71742 --- /dev/null +++ b/src/garage/repair/online.rs @@ -0,0 +1,163 @@ +use std::sync::Arc; + +use tokio::sync::watch; + +use garage_model::garage::Garage; +use garage_model::s3::block_ref_table::*; +use garage_model::s3::object_table::*; +use garage_model::s3::version_table::*; +use garage_table::*; +use garage_util::error::Error; + +use crate::*; + +pub struct OnlineRepair { + pub garage: Arc, +} + +impl OnlineRepair { + pub async fn repair_worker(&self, opt: RepairOpt, must_exit: watch::Receiver) { + if let Err(e) = self.repair_worker_aux(opt, must_exit).await { + warn!("Repair worker failed with error: {}", e); + } + } + + async fn repair_worker_aux( + &self, + opt: RepairOpt, + must_exit: watch::Receiver, + ) -> Result<(), Error> { + match opt.what { + RepairWhat::Tables => { + info!("Launching a full sync of tables"); + self.garage.bucket_table.syncer.add_full_sync(); + self.garage.object_table.syncer.add_full_sync(); + self.garage.version_table.syncer.add_full_sync(); + self.garage.block_ref_table.syncer.add_full_sync(); + self.garage.key_table.syncer.add_full_sync(); + } + RepairWhat::Versions => { + info!("Repairing the versions table"); + self.repair_versions(&must_exit).await?; + } + RepairWhat::BlockRefs => { + info!("Repairing the block refs table"); + self.repair_block_ref(&must_exit).await?; + } + RepairWhat::Blocks => { + info!("Repairing the stored blocks"); + self.garage + .block_manager + .repair_data_store(&must_exit) + .await?; + } + RepairWhat::Scrub { tranquility } => { + info!("Verifying integrity of stored blocks"); + self.garage + .block_manager + .scrub_data_store(&must_exit, tranquility) + .await?; + } + } + Ok(()) + } + + async fn repair_versions(&self, must_exit: &watch::Receiver) -> Result<(), Error> { + let mut pos = vec![]; + let mut i = 0; + + while !*must_exit.borrow() { + let item_bytes = match self.garage.version_table.data.store.get_gt(pos)? { + Some((k, v)) => { + pos = k; + v + } + None => break, + }; + + i += 1; + if i % 1000 == 0 { + info!("repair_versions: {}", i); + } + + let version = rmp_serde::decode::from_read_ref::<_, Version>(&item_bytes)?; + if version.deleted.get() { + continue; + } + let object = self + .garage + .object_table + .get(&version.bucket_id, &version.key) + .await?; + let version_exists = match object { + Some(o) => o + .versions() + .iter() + .any(|x| x.uuid == version.uuid && x.state != ObjectVersionState::Aborted), + None => false, + }; + if !version_exists { + info!("Repair versions: marking version as deleted: {:?}", version); + self.garage + .version_table + .insert(&Version::new( + version.uuid, + version.bucket_id, + version.key, + true, + )) + .await?; + } + } + info!("repair_versions: finished, done {}", i); + Ok(()) + } + + async fn repair_block_ref(&self, must_exit: &watch::Receiver) -> Result<(), Error> { + let mut pos = vec![]; + let mut i = 0; + + while !*must_exit.borrow() { + let item_bytes = match self.garage.block_ref_table.data.store.get_gt(pos)? 
{ + Some((k, v)) => { + pos = k; + v + } + None => break, + }; + + i += 1; + if i % 1000 == 0 { + info!("repair_block_ref: {}", i); + } + + let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(&item_bytes)?; + if block_ref.deleted.get() { + continue; + } + let version = self + .garage + .version_table + .get(&block_ref.version, &EmptyKey) + .await?; + // The version might not exist if it has been GC'ed + let ref_exists = version.map(|v| !v.deleted.get()).unwrap_or(false); + if !ref_exists { + info!( + "Repair block ref: marking block_ref as deleted: {:?}", + block_ref + ); + self.garage + .block_ref_table + .insert(&BlockRef { + block: block_ref.block, + version: block_ref.version, + deleted: true.into(), + }) + .await?; + } + } + info!("repair_block_ref: finished, done {}", i); + Ok(()) + } +} diff --git a/src/garage/server.rs b/src/garage/server.rs index 7aa6185f..6321357a 100644 --- a/src/garage/server.rs +++ b/src/garage/server.rs @@ -2,8 +2,6 @@ use std::path::PathBuf; use tokio::sync::watch; -use garage_db as db; - use garage_util::background::*; use garage_util::config::*; use garage_util::error::Error; @@ -29,57 +27,14 @@ async fn wait_from(mut chan: watch::Receiver) { pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { info!("Loading configuration..."); - let config = read_config(config_file).expect("Unable to read config file"); - - info!("Opening database..."); - let mut db_path = config.metadata_dir.clone(); - std::fs::create_dir_all(&db_path).expect("Unable to create Garage meta data directory"); - let db = match config.db_engine.as_str() { - "sled" => { - db_path.push("db"); - info!("Opening Sled database at: {}", db_path.display()); - let db = db::sled_adapter::sled::Config::default() - .path(&db_path) - .cache_capacity(config.sled_cache_capacity) - .flush_every_ms(Some(config.sled_flush_every_ms)) - .open() - .expect("Unable to open sled DB"); - db::sled_adapter::SledDb::init(db) - } - "sqlite" | "sqlite3" | "rusqlite" => { - db_path.push("db.sqlite"); - info!("Opening Sqlite database at: {}", db_path.display()); - let db = db::sqlite_adapter::rusqlite::Connection::open(db_path) - .expect("Unable to open sqlite DB"); - db::sqlite_adapter::SqliteDb::init(db) - } - "lmdb" | "heed" => { - db_path.push("db.lmdb"); - info!("Opening LMDB database at: {}", db_path.display()); - std::fs::create_dir_all(&db_path).expect("Unable to create LMDB data directory"); - let map_size = garage_db::lmdb_adapter::recommended_map_size(); - - let db = db::lmdb_adapter::heed::EnvOpenOptions::new() - .max_dbs(100) - .map_size(map_size) - .open(&db_path) - .expect("Unable to open LMDB DB"); - db::lmdb_adapter::LmdbDb::init(db) - } - e => { - return Err(Error::Message(format!( - "Unsupported DB engine: {} (options: sled, sqlite, lmdb)", - e - ))); - } - }; + let config = read_config(config_file)?; info!("Initializing background runner..."); let watch_cancel = netapp::util::watch_ctrl_c(); let (background, await_background_done) = BackgroundRunner::new(16, watch_cancel.clone()); info!("Initializing Garage main data store..."); - let garage = Garage::new(config.clone(), db, background); + let garage = Garage::new(config.clone(), background)?; info!("Initialize tracing..."); if let Some(export_to) = config.admin.trace_sink { @@ -89,6 +44,7 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { info!("Initialize Admin API server and metrics collector..."); let admin_server = AdminApiServer::new(garage.clone()); + info!("Launching internal Garage cluster 
communications..."); let run_system = tokio::spawn(garage.system.clone().run(watch_cancel.clone())); info!("Create admin RPC handler..."); diff --git a/src/model/bucket_table.rs b/src/model/bucket_table.rs index 7c7b9f30..130eb6a6 100644 --- a/src/model/bucket_table.rs +++ b/src/model/bucket_table.rs @@ -1,6 +1,6 @@ use serde::{Deserialize, Serialize}; -use garage_table::crdt::Crdt; +use garage_table::crdt::*; use garage_table::*; use garage_util::data::*; use garage_util::time::*; @@ -44,6 +44,9 @@ pub struct BucketParams { pub website_config: crdt::Lww>, /// CORS rules pub cors_config: crdt::Lww>>, + /// Bucket quotas + #[serde(default)] + pub quotas: crdt::Lww, } #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] @@ -62,6 +65,18 @@ pub struct CorsRule { pub expose_headers: Vec, } +#[derive(Default, PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] +pub struct BucketQuotas { + /// Maximum size in bytes (bucket size = sum of sizes of objects in the bucket) + pub max_size: Option, + /// Maximum number of non-deleted objects in the bucket + pub max_objects: Option, +} + +impl AutoCrdt for BucketQuotas { + const WARN_IF_DIFFERENT: bool = true; +} + impl BucketParams { /// Create an empty BucketParams with no authorized keys and no website accesss pub fn new() -> Self { @@ -72,6 +87,7 @@ impl BucketParams { local_aliases: crdt::LwwMap::new(), website_config: crdt::Lww::new(None), cors_config: crdt::Lww::new(None), + quotas: crdt::Lww::new(BucketQuotas::default()), } } } @@ -86,6 +102,7 @@ impl Crdt for BucketParams { self.website_config.merge(&o.website_config); self.cors_config.merge(&o.cors_config); + self.quotas.merge(&o.quotas); } } diff --git a/src/model/garage.rs b/src/model/garage.rs index 280f3dc7..15769a17 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -6,6 +6,7 @@ use garage_db as db; use garage_util::background::*; use garage_util::config::*; +use garage_util::error::Error; use garage_rpc::system::System; @@ -22,12 +23,11 @@ use crate::s3::version_table::*; use crate::bucket_alias_table::*; use crate::bucket_table::*; use crate::helper; +use crate::index_counter::*; use crate::key_table::*; #[cfg(feature = "k2v")] -use crate::index_counter::*; -#[cfg(feature = "k2v")] -use crate::k2v::{counter_table::*, item_table::*, poll::*, rpc::*}; +use crate::k2v::{item_table::*, poll::*, rpc::*}; /// An entire Garage full of data pub struct Garage { @@ -52,6 +52,8 @@ pub struct Garage { /// Table containing S3 objects pub object_table: Arc>, + /// Counting table containing object counters + pub object_counter_table: Arc>, /// Table containing S3 object versions pub version_table: Arc>, /// Table containing S3 block references (not blocks themselves) @@ -66,14 +68,57 @@ pub struct GarageK2V { /// Table containing K2V items pub item_table: Arc>, /// Indexing table containing K2V item counters - pub counter_table: Arc>, + pub counter_table: Arc>, /// K2V RPC handler pub rpc: Arc, } impl Garage { /// Create and run garage - pub fn new(config: Config, db: db::Db, background: Arc) -> Arc { + pub fn new(config: Config, background: Arc) -> Result, Error> { + info!("Opening database..."); + let mut db_path = config.metadata_dir.clone(); + std::fs::create_dir_all(&db_path).expect("Unable to create Garage meta data directory"); + let db = match config.db_engine.as_str() { + "sled" => { + db_path.push("db"); + info!("Opening Sled database at: {}", db_path.display()); + let db = db::sled_adapter::sled::Config::default() + .path(&db_path) + 
.cache_capacity(config.sled_cache_capacity) + .flush_every_ms(Some(config.sled_flush_every_ms)) + .open() + .expect("Unable to open sled DB"); + db::sled_adapter::SledDb::init(db) + } + "sqlite" | "sqlite3" | "rusqlite" => { + db_path.push("db.sqlite"); + info!("Opening Sqlite database at: {}", db_path.display()); + let db = db::sqlite_adapter::rusqlite::Connection::open(db_path) + .expect("Unable to open sqlite DB"); + db::sqlite_adapter::SqliteDb::init(db) + } + "lmdb" | "heed" => { + db_path.push("db.lmdb"); + info!("Opening LMDB database at: {}", db_path.display()); + std::fs::create_dir_all(&db_path).expect("Unable to create LMDB data directory"); + let map_size = garage_db::lmdb_adapter::recommended_map_size(); + + let db = db::lmdb_adapter::heed::EnvOpenOptions::new() + .max_dbs(100) + .map_size(map_size) + .open(&db_path) + .expect("Unable to open LMDB DB"); + db::lmdb_adapter::LmdbDb::init(db) + } + e => { + return Err(Error::Message(format!( + "Unsupported DB engine: {} (options: sled, sqlite, lmdb)", + e + ))); + } + }; + let network_key = NetworkKey::from_slice( &hex::decode(&config.rpc_secret).expect("Invalid RPC secret key")[..], ) @@ -155,12 +200,16 @@ impl Garage { &db, ); + info!("Initialize object counter table..."); + let object_counter_table = IndexCounter::new(system.clone(), meta_rep_param.clone(), &db); + info!("Initialize object_table..."); #[allow(clippy::redundant_clone)] let object_table = Table::new( ObjectTable { background: background.clone(), version_table: version_table.clone(), + object_counter_table: object_counter_table.clone(), }, meta_rep_param.clone(), system.clone(), @@ -171,9 +220,8 @@ impl Garage { #[cfg(feature = "k2v")] let k2v = GarageK2V::new(system.clone(), &db, meta_rep_param); - info!("Initialize Garage..."); - - Arc::new(Self { + // -- done -- + Ok(Arc::new(Self { config, db, background, @@ -183,11 +231,12 @@ impl Garage { bucket_alias_table, key_table, object_table, + object_counter_table, version_table, block_ref_table, #[cfg(feature = "k2v")] k2v, - }) + })) } pub fn bucket_helper(&self) -> helper::bucket::BucketHelper { diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs index 2602d5d9..36e8172b 100644 --- a/src/model/index_counter.rs +++ b/src/model/index_counter.rs @@ -1,3 +1,4 @@ +use core::ops::Bound; use std::collections::{hash_map, BTreeMap, HashMap}; use std::marker::PhantomData; use std::sync::Arc; @@ -12,30 +13,36 @@ use garage_rpc::ring::Ring; use garage_rpc::system::System; use garage_util::data::*; use garage_util::error::*; +use garage_util::time::*; use garage_table::crdt::*; -use garage_table::replication::TableShardedReplication; +use garage_table::replication::*; use garage_table::*; -pub trait CounterSchema: Clone + PartialEq + Send + Sync + 'static { - const NAME: &'static str; - type P: PartitionKey + Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync; - type S: SortKey + Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync; +pub trait CountedItem: Clone + PartialEq + Send + Sync + 'static { + const COUNTER_TABLE_NAME: &'static str; + + type CP: PartitionKey + Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync; + type CS: SortKey + Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync; + + fn counter_partition_key(&self) -> &Self::CP; + fn counter_sort_key(&self) -> &Self::CS; + fn counts(&self) -> Vec<(&'static str, i64)>; } /// A counter entry in the global table -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] 
-pub struct CounterEntry { - pub pk: T::P, - pub sk: T::S, +#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)] +pub struct CounterEntry { + pub pk: T::CP, + pub sk: T::CS, pub values: BTreeMap, } -impl Entry for CounterEntry { - fn partition_key(&self) -> &T::P { +impl Entry for CounterEntry { + fn partition_key(&self) -> &T::CP { &self.pk } - fn sort_key(&self) -> &T::S { + fn sort_key(&self) -> &T::CS { &self.sk } fn is_tombstone(&self) -> bool { @@ -45,7 +52,7 @@ impl Entry for CounterEntry { } } -impl CounterEntry { +impl CounterEntry { pub fn filtered_values(&self, ring: &Ring) -> HashMap { let nodes = &ring.layout.node_id_vec[..]; self.filtered_values_with_nodes(nodes) @@ -78,7 +85,7 @@ pub struct CounterValue { pub node_values: BTreeMap, } -impl Crdt for CounterEntry { +impl Crdt for CounterEntry { fn merge(&mut self, other: &Self) { for (name, e2) in other.values.iter() { if let Some(e) = self.values.get_mut(name) { @@ -104,15 +111,15 @@ impl Crdt for CounterValue { } } -pub struct CounterTable { +pub struct CounterTable { _phantom_t: PhantomData, } -impl TableSchema for CounterTable { - const TABLE_NAME: &'static str = T::NAME; +impl TableSchema for CounterTable { + const TABLE_NAME: &'static str = T::COUNTER_TABLE_NAME; - type P = T::P; - type S = T::S; + type P = T::CP; + type S = T::CS; type E = CounterEntry; type Filter = (DeletedFilter, Vec); @@ -131,14 +138,14 @@ impl TableSchema for CounterTable { // ---- -pub struct IndexCounter { +pub struct IndexCounter { this_node: Uuid, local_counter: db::Tree, - propagate_tx: mpsc::UnboundedSender<(T::P, T::S, LocalCounterEntry)>, + propagate_tx: mpsc::UnboundedSender<(T::CP, T::CS, LocalCounterEntry)>, pub table: Arc, TableShardedReplication>>, } -impl IndexCounter { +impl IndexCounter { pub fn new( system: Arc, replication: TableShardedReplication, @@ -151,7 +158,7 @@ impl IndexCounter { let this = Arc::new(Self { this_node: system.id, local_counter: db - .open_tree(format!("local_counter:{}", T::NAME)) + .open_tree(format!("local_counter_v2:{}", T::COUNTER_TABLE_NAME)) .expect("Unable to open local counter tree"), propagate_tx, table: Table::new( @@ -166,7 +173,7 @@ impl IndexCounter { let this2 = this.clone(); background.spawn_worker( - format!("{} index counter propagator", T::NAME), + format!("{} index counter propagator", T::COUNTER_TABLE_NAME), move |must_exit| this2.clone().propagate_loop(propagate_rx, must_exit), ); this @@ -175,24 +182,45 @@ impl IndexCounter { pub fn count( &self, tx: &mut db::Transaction, - pk: &T::P, - sk: &T::S, - counts: &[(&str, i64)], + old: Option<&T>, + new: Option<&T>, ) -> db::TxResult<(), Error> { + let pk = old + .map(|e| e.counter_partition_key()) + .unwrap_or_else(|| new.unwrap().counter_partition_key()); + let sk = old + .map(|e| e.counter_sort_key()) + .unwrap_or_else(|| new.unwrap().counter_sort_key()); + + // calculate counter differences + let mut counts = HashMap::new(); + for (k, v) in old.map(|x| x.counts()).unwrap_or_default() { + *counts.entry(k).or_insert(0) -= v; + } + for (k, v) in new.map(|x| x.counts()).unwrap_or_default() { + *counts.entry(k).or_insert(0) += v; + } + + // update local counter table let tree_key = self.table.data.tree_key(pk, sk); let mut entry = match tx.get(&self.local_counter, &tree_key[..])? 
{ - Some(old_bytes) => rmp_serde::decode::from_read_ref::<_, LocalCounterEntry>(&old_bytes) - .map_err(Error::RmpDecode) - .map_err(db::TxError::Abort)?, + Some(old_bytes) => { + rmp_serde::decode::from_read_ref::<_, LocalCounterEntry>(&old_bytes) + .map_err(Error::RmpDecode) + .map_err(db::TxError::Abort)? + } None => LocalCounterEntry { + pk: pk.clone(), + sk: sk.clone(), values: BTreeMap::new(), }, }; + let now = now_msec(); for (s, inc) in counts.iter() { let mut ent = entry.values.entry(s.to_string()).or_insert((0, 0)); - ent.0 += 1; + ent.0 = std::cmp::max(ent.0 + 1, now); ent.1 += *inc; } @@ -213,7 +241,7 @@ impl IndexCounter { async fn propagate_loop( self: Arc, - mut propagate_rx: mpsc::UnboundedReceiver<(T::P, T::S, LocalCounterEntry)>, + mut propagate_rx: mpsc::UnboundedReceiver<(T::CP, T::CS, LocalCounterEntry)>, must_exit: watch::Receiver, ) { // This loop batches updates to counters to be sent all at once. @@ -236,7 +264,7 @@ impl IndexCounter { if let Some((pk, sk, counters)) = ent { let tree_key = self.table.data.tree_key(&pk, &sk); - let dist_entry = counters.into_counter_entry::(self.this_node, pk, sk); + let dist_entry = counters.into_counter_entry(self.this_node); match buf.entry(tree_key) { hash_map::Entry::Vacant(e) => { e.insert(dist_entry); @@ -255,10 +283,10 @@ impl IndexCounter { if let Err(e) = self.table.insert_many(entries).await { errors += 1; if errors >= 2 && *must_exit.borrow() { - error!("({}) Could not propagate {} counter values: {}, these counters will not be updated correctly.", T::NAME, buf.len(), e); + error!("({}) Could not propagate {} counter values: {}, these counters will not be updated correctly.", T::COUNTER_TABLE_NAME, buf.len(), e); break; } - warn!("({}) Could not propagate {} counter values: {}, retrying in 5 seconds (retry #{})", T::NAME, buf.len(), e, errors); + warn!("({}) Could not propagate {} counter values: {}, retrying in 5 seconds (retry #{})", T::COUNTER_TABLE_NAME, buf.len(), e, errors); tokio::time::sleep(Duration::from_secs(5)).await; continue; } @@ -272,23 +300,155 @@ impl IndexCounter { } } } + + pub fn offline_recount_all( + &self, + counted_table: &Arc>, + ) -> Result<(), Error> + where + TS: TableSchema, + TR: TableReplication, + { + let save_counter_entry = |entry: CounterEntry| -> Result<(), Error> { + let entry_k = self + .table + .data + .tree_key(entry.partition_key(), entry.sort_key()); + self.table + .data + .update_entry_with(&entry_k, |ent| match ent { + Some(mut ent) => { + ent.merge(&entry); + ent + } + None => entry.clone(), + })?; + Ok(()) + }; + + // 1. Set all old local counters to zero + let now = now_msec(); + let mut next_start: Option> = None; + loop { + let low_bound = match next_start.take() { + Some(v) => Bound::Excluded(v), + None => Bound::Unbounded, + }; + let mut batch = vec![]; + for item in self.local_counter.range((low_bound, Bound::Unbounded))? { + batch.push(item?); + if batch.len() > 1000 { + break; + } + } + + if batch.is_empty() { + break; + } + + info!("zeroing old counters... 
({})", hex::encode(&batch[0].0)); + for (local_counter_k, local_counter) in batch { + let mut local_counter = + rmp_serde::decode::from_read_ref::<_, LocalCounterEntry>(&local_counter)?; + + for (_, tv) in local_counter.values.iter_mut() { + tv.0 = std::cmp::max(tv.0 + 1, now); + tv.1 = 0; + } + + let local_counter_bytes = rmp_to_vec_all_named(&local_counter)?; + self.local_counter + .insert(&local_counter_k, &local_counter_bytes)?; + + let counter_entry = local_counter.into_counter_entry(self.this_node); + save_counter_entry(counter_entry)?; + + next_start = Some(local_counter_k); + } + } + + // 2. Recount all table entries + let now = now_msec(); + let mut next_start: Option> = None; + loop { + let low_bound = match next_start.take() { + Some(v) => Bound::Excluded(v), + None => Bound::Unbounded, + }; + let mut batch = vec![]; + for item in counted_table + .data + .store + .range((low_bound, Bound::Unbounded))? + { + batch.push(item?); + if batch.len() > 1000 { + break; + } + } + + if batch.is_empty() { + break; + } + + info!("counting entries... ({})", hex::encode(&batch[0].0)); + for (counted_entry_k, counted_entry) in batch { + let counted_entry = counted_table.data.decode_entry(&counted_entry)?; + + let pk = counted_entry.counter_partition_key(); + let sk = counted_entry.counter_sort_key(); + let counts = counted_entry.counts(); + + let local_counter_key = self.table.data.tree_key(pk, sk); + let mut local_counter = match self.local_counter.get(&local_counter_key)? { + Some(old_bytes) => { + let ent = rmp_serde::decode::from_read_ref::<_, LocalCounterEntry>( + &old_bytes, + )?; + assert!(ent.pk == *pk); + assert!(ent.sk == *sk); + ent + } + None => LocalCounterEntry { + pk: pk.clone(), + sk: sk.clone(), + values: BTreeMap::new(), + }, + }; + for (s, v) in counts.iter() { + let mut tv = local_counter.values.entry(s.to_string()).or_insert((0, 0)); + tv.0 = std::cmp::max(tv.0 + 1, now); + tv.1 += v; + } + + let local_counter_bytes = rmp_to_vec_all_named(&local_counter)?; + self.local_counter + .insert(&local_counter_key, local_counter_bytes)?; + + let counter_entry = local_counter.into_counter_entry(self.this_node); + save_counter_entry(counter_entry)?; + + next_start = Some(counted_entry_k); + } + } + + // Done + Ok(()) + } } #[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] -struct LocalCounterEntry { +struct LocalCounterEntry { + pk: T::CP, + sk: T::CS, values: BTreeMap, } -impl LocalCounterEntry { - fn into_counter_entry( - self, - this_node: Uuid, - pk: T::P, - sk: T::S, - ) -> CounterEntry { +impl LocalCounterEntry { + fn into_counter_entry(self, this_node: Uuid) -> CounterEntry { CounterEntry { - pk, - sk, + pk: self.pk, + sk: self.sk, values: self .values .into_iter() diff --git a/src/model/k2v/counter_table.rs b/src/model/k2v/counter_table.rs deleted file mode 100644 index 4856eb2b..00000000 --- a/src/model/k2v/counter_table.rs +++ /dev/null @@ -1,20 +0,0 @@ -use garage_util::data::*; - -use crate::index_counter::*; - -pub const ENTRIES: &str = "entries"; -pub const CONFLICTS: &str = "conflicts"; -pub const VALUES: &str = "values"; -pub const BYTES: &str = "bytes"; - -#[derive(PartialEq, Clone)] -pub struct K2VCounterTable; - -impl CounterSchema for K2VCounterTable { - const NAME: &'static str = "k2v_index_counter"; - - // Partition key = bucket id - type P = Uuid; - // Sort key = K2V item's partition key - type S = String; -} diff --git a/src/model/k2v/item_table.rs b/src/model/k2v/item_table.rs index 991fe66d..baa1db4b 100644 --- a/src/model/k2v/item_table.rs +++ 
b/src/model/k2v/item_table.rs @@ -10,9 +10,13 @@ use garage_table::*; use crate::index_counter::*; use crate::k2v::causality::*; -use crate::k2v::counter_table::*; use crate::k2v::poll::*; +pub const ENTRIES: &str = "entries"; +pub const CONFLICTS: &str = "conflicts"; +pub const VALUES: &str = "values"; +pub const BYTES: &str = "bytes"; + #[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] pub struct K2VItem { pub partition: K2VItemPartition, @@ -112,27 +116,6 @@ impl K2VItem { ent.discard(); } } - - // returns counters: (non-deleted entries, conflict entries, non-tombstone values, bytes used) - fn stats(&self) -> (i64, i64, i64, i64) { - let values = self.values(); - - let n_entries = if self.is_tombstone() { 0 } else { 1 }; - let n_conflicts = if values.len() > 1 { 1 } else { 0 }; - let n_values = values - .iter() - .filter(|v| matches!(v, DvvsValue::Value(_))) - .count() as i64; - let n_bytes = values - .iter() - .map(|v| match v { - DvvsValue::Deleted => 0, - DvvsValue::Value(v) => v.len() as i64, - }) - .sum(); - - (n_entries, n_conflicts, n_values, n_bytes) - } } impl DvvsEntry { @@ -204,7 +187,7 @@ impl Entry for K2VItem { } pub struct K2VItemTable { - pub(crate) counter_table: Arc>, + pub(crate) counter_table: Arc>, pub(crate) subscriptions: Arc, } @@ -229,40 +212,14 @@ impl TableSchema for K2VItemTable { new: Option<&Self::E>, ) -> db::TxOpResult<()> { // 1. Count - let (old_entries, old_conflicts, old_values, old_bytes) = match old { - None => (0, 0, 0, 0), - Some(e) => e.stats(), - }; - let (new_entries, new_conflicts, new_values, new_bytes) = match new { - None => (0, 0, 0, 0), - Some(e) => e.stats(), - }; - - let count_pk = old - .map(|e| e.partition.bucket_id) - .unwrap_or_else(|| new.unwrap().partition.bucket_id); - let count_sk = old - .map(|e| &e.partition.partition_key) - .unwrap_or_else(|| &new.unwrap().partition.partition_key); - - let counter_res = self.counter_table.count( - tx, - &count_pk, - count_sk, - &[ - (ENTRIES, new_entries - old_entries), - (CONFLICTS, new_conflicts - old_conflicts), - (VALUES, new_values - old_values), - (BYTES, new_bytes - old_bytes), - ], - ); + let counter_res = self.counter_table.count(tx, old, new); if let Err(e) = db::unabort(counter_res)? { // This result can be returned by `counter_table.count()` for instance // if messagepack serialization or deserialization fails at some step. // Warn admin but ignore this error for now, that's all we can do. error!( - "Unable to update K2V item counter for bucket {:?} partition {}: {}. Index values will be wrong!", - count_pk, count_sk, e + "Unable to update K2V item counter: {}. 
Index values will be wrong!", + e ); } @@ -282,6 +239,47 @@ impl TableSchema for K2VItemTable { } } +impl CountedItem for K2VItem { + const COUNTER_TABLE_NAME: &'static str = "k2v_index_counter_v2"; + + // Partition key = bucket id + type CP = Uuid; + // Sort key = K2V item's partition key + type CS = String; + + fn counter_partition_key(&self) -> &Uuid { + &self.partition.bucket_id + } + fn counter_sort_key(&self) -> &String { + &self.partition.partition_key + } + + fn counts(&self) -> Vec<(&'static str, i64)> { + let values = self.values(); + + let n_entries = if self.is_tombstone() { 0 } else { 1 }; + let n_conflicts = if values.len() > 1 { 1 } else { 0 }; + let n_values = values + .iter() + .filter(|v| matches!(v, DvvsValue::Value(_))) + .count() as i64; + let n_bytes = values + .iter() + .map(|v| match v { + DvvsValue::Deleted => 0, + DvvsValue::Value(v) => v.len() as i64, + }) + .sum(); + + vec![ + (ENTRIES, n_entries), + (CONFLICTS, n_conflicts), + (VALUES, n_values), + (BYTES, n_bytes), + ] + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/model/k2v/mod.rs b/src/model/k2v/mod.rs index 664172a6..f6a96151 100644 --- a/src/model/k2v/mod.rs +++ b/src/model/k2v/mod.rs @@ -1,6 +1,5 @@ pub mod causality; -pub mod counter_table; pub mod item_table; pub mod poll; diff --git a/src/model/migrate.rs b/src/model/migrate.rs index 25acb4b0..5fc67069 100644 --- a/src/model/migrate.rs +++ b/src/model/migrate.rs @@ -77,6 +77,7 @@ impl Migrate { local_aliases: LwwMap::new(), website_config: Lww::new(website), cors_config: Lww::new(None), + quotas: Lww::new(Default::default()), }), }) .await?; diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index 62f5d8d9..a3914c36 100644 --- a/src/model/s3/object_table.rs +++ b/src/model/s3/object_table.rs @@ -11,10 +11,15 @@ use garage_table::crdt::*; use garage_table::replication::TableShardedReplication; use garage_table::*; +use crate::index_counter::*; use crate::s3::version_table::*; use garage_model_050::object_table as old; +pub const OBJECTS: &str = "objects"; +pub const UNFINISHED_UPLOADS: &str = "unfinished_uploads"; +pub const BYTES: &str = "bytes"; + /// An object #[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] pub struct Object { @@ -218,6 +223,7 @@ impl Crdt for Object { pub struct ObjectTable { pub background: Arc, pub version_table: Arc>, + pub object_counter_table: Arc>, } #[derive(Clone, Copy, Debug, Serialize, Deserialize)] @@ -236,10 +242,20 @@ impl TableSchema for ObjectTable { fn updated( &self, - _tx: &mut db::Transaction, + tx: &mut db::Transaction, old: Option<&Self::E>, new: Option<&Self::E>, ) -> db::TxOpResult<()> { + // 1. Count + let counter_res = self.object_counter_table.count(tx, old, new); + if let Err(e) = db::unabort(counter_res)? { + error!( + "Unable to update object counter: {}. Index values will be wrong!", + e + ); + } + + // 2. 
Spawn threads that propagates deletions to version table let version_table = self.version_table.clone(); let old = old.cloned(); let new = new.cloned(); @@ -283,6 +299,49 @@ impl TableSchema for ObjectTable { } } +impl CountedItem for Object { + const COUNTER_TABLE_NAME: &'static str = "bucket_object_counter"; + + // Partition key = bucket id + type CP = Uuid; + // Sort key = nothing + type CS = EmptyKey; + + fn counter_partition_key(&self) -> &Uuid { + &self.bucket_id + } + fn counter_sort_key(&self) -> &EmptyKey { + &EmptyKey + } + + fn counts(&self) -> Vec<(&'static str, i64)> { + let versions = self.versions(); + let n_objects = if versions.iter().any(|v| v.is_data()) { + 1 + } else { + 0 + }; + let n_unfinished_uploads = versions + .iter() + .filter(|v| matches!(v.state, ObjectVersionState::Uploading(_))) + .count(); + let n_bytes = versions + .iter() + .map(|v| match &v.state { + ObjectVersionState::Complete(ObjectVersionData::Inline(meta, _)) + | ObjectVersionState::Complete(ObjectVersionData::FirstBlock(meta, _)) => meta.size, + _ => 0, + }) + .sum::(); + + vec![ + (OBJECTS, n_objects), + (UNFINISHED_UPLOADS, n_unfinished_uploads as i64), + (BYTES, n_bytes as i64), + ] + } +} + // vvvvvvvv migration code, stupid stuff vvvvvvvvvvvv // (we just want to change bucket into bucket_id by hashing it) -- cgit v1.2.3 From 0850bac874029f0b8b278d75537dd037e5db57da Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 4 Jul 2022 12:45:32 +0200 Subject: Add `poll` command to `k2v-cli` (#335) Co-authored-by: Alex Auvolat Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/335 Co-authored-by: Alex Co-committed-by: Alex --- src/k2v-client/bin/k2v-cli.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'src') diff --git a/src/k2v-client/bin/k2v-cli.rs b/src/k2v-client/bin/k2v-cli.rs index 38c39361..884e7438 100644 --- a/src/k2v-client/bin/k2v-cli.rs +++ b/src/k2v-client/bin/k2v-cli.rs @@ -55,6 +55,19 @@ enum Command { #[clap(flatten)] output_kind: ReadOutputKind, }, + /// Watch changes on a single value + Poll { + /// Partition key to delete from + partition_key: String, + /// Sort key to delete from + sort_key: String, + /// Causality information + #[clap(short, long)] + causality: String, + /// Output formating + #[clap(flatten)] + output_kind: ReadOutputKind, + }, /// Delete a single value Delete { /// Partition key to delete from @@ -324,6 +337,21 @@ async fn main() -> Result<(), Error> { let res = client.read_item(&partition_key, &sort_key).await?; output_kind.display_output(res); } + Command::Poll { + partition_key, + sort_key, + causality, + output_kind, + } => { + let res_opt = client + .poll_item(&partition_key, &sort_key, causality.into(), None) + .await?; + if let Some(res) = res_opt { + output_kind.display_output(res); + } else { + println!("Delay expired and value didn't change."); + } + } Command::ReadIndex { output_kind, filter, -- cgit v1.2.3 From b6d59ec19a3d41ce581716cf0dda5d47c2785843 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 4 Jul 2022 14:00:02 +0200 Subject: Fix poll item when item didn't change --- src/k2v-client/bin/k2v-cli.rs | 9 ++++++++- src/k2v-client/lib.rs | 8 ++++---- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'src') diff --git a/src/k2v-client/bin/k2v-cli.rs b/src/k2v-client/bin/k2v-cli.rs index 884e7438..925ebeb8 100644 --- a/src/k2v-client/bin/k2v-cli.rs +++ b/src/k2v-client/bin/k2v-cli.rs @@ -1,3 +1,5 @@ +use std::time::Duration; + use k2v_client::*; use garage_util::formater::format_table; @@ -64,6 
+66,9 @@ enum Command { /// Causality information #[clap(short, long)] causality: String, + /// Timeout, in seconds + #[clap(short, long)] + timeout: Option, /// Output formating #[clap(flatten)] output_kind: ReadOutputKind, @@ -341,10 +346,12 @@ async fn main() -> Result<(), Error> { partition_key, sort_key, causality, + timeout, output_kind, } => { + let timeout = timeout.map(Duration::from_secs); let res_opt = client - .poll_item(&partition_key, &sort_key, causality.into(), None) + .poll_item(&partition_key, &sort_key, causality.into(), timeout) .await?; if let Some(res) = res_opt { output_kind.display_output(res); diff --git a/src/k2v-client/lib.rs b/src/k2v-client/lib.rs index 95974d7a..c2606af4 100644 --- a/src/k2v-client/lib.rs +++ b/src/k2v-client/lib.rs @@ -122,14 +122,14 @@ impl K2vClient { let res = self.dispatch(req, Some(timeout + DEFAULT_TIMEOUT)).await?; - let causality = res - .causality_token - .ok_or_else(|| Error::InvalidResponse("missing causality token".into()))?; - if res.status == StatusCode::NOT_MODIFIED { return Ok(None); } + let causality = res + .causality_token + .ok_or_else(|| Error::InvalidResponse("missing causality token".into()))?; + if res.status == StatusCode::NO_CONTENT { return Ok(Some(CausalValue { causality, -- cgit v1.2.3 From fe3fa83de74b79ffeeb2042c58b9360defa65431 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 4 Jul 2022 18:27:25 +0200 Subject: Publish k2v-client crate to crates.io (#337) Co-authored-by: Alex Auvolat Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/337 Co-authored-by: Alex Co-committed-by: Alex --- src/k2v-client/Cargo.toml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/k2v-client/Cargo.toml b/src/k2v-client/Cargo.toml index 224414ab..2f8a2679 100644 --- a/src/k2v-client/Cargo.toml +++ b/src/k2v-client/Cargo.toml @@ -1,7 +1,12 @@ [package] name = "k2v-client" -version = "0.1.0" +version = "0.0.1" +authors = ["Trinity Pointard ", "Alex Auvolat "] edition = "2018" +license = "AGPL-3.0" +description = "Client library for the Garage K2V protocol" +repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage" +readme = "../../README.md" [dependencies] base64 = "0.13.0" @@ -17,7 +22,7 @@ tokio = "1.17.0" # cli deps clap = { version = "3.1.18", optional = true, features = ["derive", "env"] } -garage_util = { path = "../util", optional = true } +garage_util = { version = "0.7.0", path = "../util", optional = true } [features] -- cgit v1.2.3 From aab34bfe5415e9584432bf32e29a151dc5af9ebd Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 4 Jul 2022 12:53:47 +0200 Subject: add delays in k2v test_items_and_indices --- src/garage/tests/k2v/item.rs | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'src') diff --git a/src/garage/tests/k2v/item.rs b/src/garage/tests/k2v/item.rs index bf2b01f8..32537336 100644 --- a/src/garage/tests/k2v/item.rs +++ b/src/garage/tests/k2v/item.rs @@ -1,3 +1,5 @@ +use std::time::Duration; + use crate::common; use assert_json_diff::assert_json_eq; @@ -86,6 +88,7 @@ async fn test_items_and_indices() { assert_eq!(res_body, content); // ReadIndex -- now there should be some stuff + tokio::time::sleep(Duration::from_secs(1)).await; let res = ctx .k2v .request @@ -154,6 +157,7 @@ async fn test_items_and_indices() { assert_eq!(res_body, content2); // ReadIndex -- now there should be some stuff + tokio::time::sleep(Duration::from_secs(1)).await; let res = ctx .k2v .request @@ -222,6 +226,7 @@ async fn test_items_and_indices() { ); // ReadIndex -- now 
there should be some stuff + tokio::time::sleep(Duration::from_secs(1)).await; let res = ctx .k2v .request @@ -290,6 +295,7 @@ async fn test_items_and_indices() { assert_eq!(res.status(), 204); // ReadIndex -- now there should be some stuff + tokio::time::sleep(Duration::from_secs(1)).await; let res = ctx .k2v .request -- cgit v1.2.3 From 4f38cadf6e2963a652ed28327d1c2ccfa2ebb2b7 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 8 Jul 2022 13:30:26 +0200 Subject: Background task manager (#332) - [x] New background worker trait - [x] Adapt all current workers to use new API - [x] Command to list currently running workers, and whether they are active, idle, or dead - [x] Error reporting - Optimizations - [x] Merkle updater: several items per iteration - [ ] Use `tokio::task::spawn_blocking` where appropriate so that CPU-intensive tasks don't block other things going on - scrub: - [x] have only one worker with a channel to start/pause/cancel - [x] automatic scrub - [x] ability to view and change tranquility from CLI - [x] persistence of a few info - [ ] Testing Co-authored-by: Alex Auvolat Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/332 Co-authored-by: Alex Co-committed-by: Alex --- src/block/Cargo.toml | 1 + src/block/lib.rs | 1 + src/block/manager.rs | 329 ++++++++++------------------ src/block/repair.rs | 444 ++++++++++++++++++++++++++++++++++++++ src/db/Cargo.toml | 2 +- src/db/sqlite_adapter.rs | 2 +- src/garage/Cargo.toml | 2 + src/garage/admin.rs | 29 ++- src/garage/cli/cmd.rs | 8 +- src/garage/cli/structs.rs | 55 ++++- src/garage/cli/util.rs | 58 +++++ src/garage/repair/online.rs | 222 +++++++++++-------- src/model/index_counter.rs | 169 +++++++++------ src/rpc/system.rs | 6 +- src/table/gc.rs | 89 +++++--- src/table/merkle.rs | 87 +++++--- src/table/sync.rs | 198 ++++++++--------- src/util/Cargo.toml | 1 + src/util/background.rs | 160 -------------- src/util/background/job_worker.rs | 48 +++++ src/util/background/mod.rs | 117 ++++++++++ src/util/background/worker.rs | 261 ++++++++++++++++++++++ src/util/lib.rs | 1 - src/util/tranquilizer.rs | 25 ++- 24 files changed, 1606 insertions(+), 709 deletions(-) create mode 100644 src/block/repair.rs delete mode 100644 src/util/background.rs create mode 100644 src/util/background/job_worker.rs create mode 100644 src/util/background/mod.rs create mode 100644 src/util/background/worker.rs (limited to 'src') diff --git a/src/block/Cargo.toml b/src/block/Cargo.toml index 80346aca..2555a44a 100644 --- a/src/block/Cargo.toml +++ b/src/block/Cargo.toml @@ -21,6 +21,7 @@ garage_table = { version = "0.7.0", path = "../table" } opentelemetry = "0.17" +arc-swap = "1.5" async-trait = "0.1.7" bytes = "1.0" hex = "0.4" diff --git a/src/block/lib.rs b/src/block/lib.rs index dc685657..ebdb95d8 100644 --- a/src/block/lib.rs +++ b/src/block/lib.rs @@ -2,6 +2,7 @@ extern crate tracing; pub mod manager; +pub mod repair; mod block; mod metrics; diff --git a/src/block/manager.rs b/src/block/manager.rs index 32ba0431..017ba9da 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -1,18 +1,17 @@ -use core::ops::Bound; - use std::convert::TryInto; -use std::path::{Path, PathBuf}; +use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; +use arc_swap::ArcSwapOption; use async_trait::async_trait; use serde::{Deserialize, Serialize}; use futures::future::*; -use futures::select; use tokio::fs; use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tokio::sync::{watch, Mutex, Notify}; +use tokio::select; +use tokio::sync::{mpsc, watch, 
Mutex, Notify}; use opentelemetry::{ trace::{FutureExt as OtelFutureExt, TraceContextExt, Tracer}, @@ -22,6 +21,7 @@ use opentelemetry::{ use garage_db as db; use garage_db::counted_tree_hack::CountedTree; +use garage_util::background::*; use garage_util::data::*; use garage_util::error::*; use garage_util::metrics::RecordDuration; @@ -36,6 +36,7 @@ use garage_table::replication::{TableReplication, TableShardedReplication}; use crate::block::*; use crate::metrics::*; use crate::rc::*; +use crate::repair::*; /// Size under which data will be stored inlined in database instead of as files pub const INLINE_THRESHOLD: usize = 3072; @@ -93,16 +94,18 @@ pub struct BlockManager { mutation_lock: Mutex, - rc: BlockRc, + pub(crate) rc: BlockRc, resync_queue: CountedTree, resync_notify: Notify, resync_errors: CountedTree, - system: Arc, + pub(crate) system: Arc, endpoint: Arc>, metrics: BlockManagerMetrics, + + tx_scrub_command: ArcSwapOption>, } // This custom struct contains functions that must only be ran @@ -110,6 +113,12 @@ pub struct BlockManager { // it INSIDE a Mutex. struct BlockManagerLocked(); +enum ResyncIterResult { + BusyDidSomething, + BusyDidNothing, + IdleFor(Duration), +} + impl BlockManager { pub fn new( db: &db::Db, @@ -157,10 +166,11 @@ impl BlockManager { system, endpoint, metrics, + tx_scrub_command: ArcSwapOption::new(None), }); block_manager.endpoint.set_handler(block_manager.clone()); - block_manager.clone().spawn_background_worker(); + block_manager.clone().spawn_background_workers(); block_manager } @@ -218,90 +228,6 @@ impl BlockManager { Ok(()) } - /// Launch the repair procedure on the data store - /// - /// This will list all blocks locally present, as well as those - /// that are required because of refcount > 0, and will try - /// to fix any mismatch between the two. - pub async fn repair_data_store(&self, must_exit: &watch::Receiver) -> Result<(), Error> { - // 1. Repair blocks from RC table. - let mut next_start: Option = None; - loop { - // We have to do this complicated two-step process where we first read a bunch - // of hashes from the RC table, and then insert them in the to-resync queue, - // because of SQLite. Basically, as long as we have an iterator on a DB table, - // we can't do anything else on the DB. The naive approach (which we had previously) - // of just iterating on the RC table and inserting items one to one in the resync - // queue can't work here, it would just provoke a deadlock in the SQLite adapter code. - // This is mostly because the Rust bindings for SQLite assume a worst-case scenario - // where SQLite is not compiled in thread-safe mode, so we have to wrap everything - // in a mutex (see db/sqlite_adapter.rs and discussion in PR #322). - let mut batch_of_hashes = vec![]; - let start_bound = match next_start.as_ref() { - None => Bound::Unbounded, - Some(x) => Bound::Excluded(x.as_slice()), - }; - for entry in self - .rc - .rc - .range::<&[u8], _>((start_bound, Bound::Unbounded))? - { - let (hash, _) = entry?; - let hash = Hash::try_from(&hash[..]).unwrap(); - batch_of_hashes.push(hash); - if batch_of_hashes.len() >= 1000 { - break; - } - } - if batch_of_hashes.is_empty() { - break; - } - - for hash in batch_of_hashes.into_iter() { - self.put_to_resync(&hash, Duration::from_secs(0))?; - next_start = Some(hash) - } - - if *must_exit.borrow() { - return Ok(()); - } - } - - // 2. Repair blocks actually on disk - // Lists all blocks on disk and adds them to the resync queue. 
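The `ResyncIterResult` enum introduced just above decouples one resync step from the loop that drives it: a single iteration reports whether it processed a block, dropped a stale queue entry, or found nothing due before a given delay, and the caller throttles accordingly. A self-contained sketch of the control flow this enables (with a local copy of the enum; the real worker presumably also folds in tranquility settings and the resync notification, which are omitted here):

    use std::future::Future;
    use std::time::Duration;

    enum IterResult {
        BusyDidSomething,  // a block was processed: keep going, throttling if needed
        BusyDidNothing,    // a stale entry was dropped: retry immediately
        IdleFor(Duration), // nothing due yet: sleep until the next deadline
    }

    async fn drive<F, Fut>(mut one_iteration: F)
    where
        F: FnMut() -> Fut,
        Fut: Future<Output = Result<IterResult, String>>,
    {
        loop {
            match one_iteration().await {
                Ok(IterResult::BusyDidSomething) => { /* optionally tranquilize here */ }
                Ok(IterResult::BusyDidNothing) => continue,
                Ok(IterResult::IdleFor(delay)) => tokio::time::sleep(delay).await,
                Err(e) => {
                    eprintln!("resync iteration failed: {}", e);
                    tokio::time::sleep(Duration::from_secs(1)).await;
                }
            }
        }
    }
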
- // This allows us to find blocks we are storing but don't actually need, - // so that we can offload them if necessary and then delete them locally. - self.for_each_file( - (), - move |_, hash| async move { - self.put_to_resync(&hash, Duration::from_secs(0)) - .map_err(Into::into) - }, - must_exit, - ) - .await - } - - /// Verify integrity of each block on disk. Use `speed_limit` to limit the load generated by - /// this function. - pub async fn scrub_data_store( - &self, - must_exit: &watch::Receiver, - tranquility: u32, - ) -> Result<(), Error> { - let tranquilizer = Tranquilizer::new(30); - self.for_each_file( - tranquilizer, - move |mut tranquilizer, hash| async move { - let _ = self.read_block(&hash).await; - tranquilizer.tranquilize(tranquility).await; - Ok(tranquilizer) - }, - must_exit, - ) - .await - } - /// Get lenght of resync queue pub fn resync_queue_len(&self) -> Result { // This currently can't return an error because the CountedTree hack @@ -321,6 +247,17 @@ impl BlockManager { Ok(self.rc.rc.len()?) } + /// Send command to start/stop/manager scrub worker + pub async fn send_scrub_command(&self, cmd: ScrubWorkerCommand) { + let _ = self + .tx_scrub_command + .load() + .as_ref() + .unwrap() + .send(cmd) + .await; + } + //// ----- Managing the reference counter ---- /// Increment the number of time a block is used, putting it to resynchronization if it is @@ -390,7 +327,7 @@ impl BlockManager { } /// Read block from disk, verifying it's integrity - async fn read_block(&self, hash: &Hash) -> Result { + pub(crate) async fn read_block(&self, hash: &Hash) -> Result { let data = self .read_block_internal(hash) .bound_record_duration(&self.metrics.block_read_duration) @@ -554,18 +491,23 @@ impl BlockManager { // for times that are earlier than the exponential back-off delay // is a natural condition that is handled properly). - fn spawn_background_worker(self: Arc) { + fn spawn_background_workers(self: Arc) { // Launch a background workers for background resync loop processing let background = self.system.background.clone(); + let worker = ResyncWorker::new(self.clone()); tokio::spawn(async move { tokio::time::sleep(Duration::from_secs(10)).await; - background.spawn_worker("block resync worker".into(), move |must_exit| { - self.resync_loop(must_exit) - }); + background.spawn_worker(worker); }); + + // Launch a background worker for data store scrubs + let (scrub_tx, scrub_rx) = mpsc::channel(1); + self.tx_scrub_command.store(Some(Arc::new(scrub_tx))); + let scrub_worker = ScrubWorker::new(self.clone(), scrub_rx); + self.system.background.spawn_worker(scrub_worker); } - fn put_to_resync(&self, hash: &Hash, delay: Duration) -> db::Result<()> { + pub(crate) fn put_to_resync(&self, hash: &Hash, delay: Duration) -> db::Result<()> { let when = now_msec() + delay.as_millis() as u64; self.put_to_resync_at(hash, when) } @@ -579,37 +521,7 @@ impl BlockManager { Ok(()) } - async fn resync_loop(self: Arc, mut must_exit: watch::Receiver) { - let mut tranquilizer = Tranquilizer::new(30); - - while !*must_exit.borrow() { - match self.resync_iter(&mut must_exit).await { - Ok(true) => { - tranquilizer.tranquilize(self.background_tranquility).await; - } - Ok(false) => { - tranquilizer.reset(); - } - Err(e) => { - // The errors that we have here are only Sled errors - // We don't really know how to handle them so just ¯\_(ツ)_/¯ - // (there is kind of an assumption that Sled won't error on us, - // if it does there is not much we can do -- TODO should we just panic?) 
- error!( - "Could not do a resync iteration: {} (this is a very bad error)", - e - ); - tranquilizer.reset(); - } - } - } - } - - // The result of resync_iter is: - // - Ok(true) -> a block was processed (successfully or not) - // - Ok(false) -> no block was processed, but we are ready for the next iteration - // - Err(_) -> a Sled error occurred when reading/writing from resync_queue/resync_errors - async fn resync_iter(&self, must_exit: &mut watch::Receiver) -> Result { + async fn resync_iter(&self) -> Result { if let Some((time_bytes, hash_bytes)) = self.resync_queue.first()? { let time_msec = u64::from_be_bytes(time_bytes[0..8].try_into().unwrap()); let now = now_msec(); @@ -629,7 +541,7 @@ impl BlockManager { // (we want to do the remove after the insert to ensure // that the item is not lost if we crash in-between) self.resync_queue.remove(time_bytes)?; - return Ok(false); + return Ok(ResyncIterResult::BusyDidNothing); } } @@ -676,15 +588,11 @@ impl BlockManager { self.resync_queue.remove(time_bytes)?; } - Ok(true) + Ok(ResyncIterResult::BusyDidSomething) } else { - let delay = tokio::time::sleep(Duration::from_millis(time_msec - now)); - select! { - _ = delay.fuse() => {}, - _ = self.resync_notify.notified().fuse() => {}, - _ = must_exit.changed().fuse() => {}, - } - Ok(false) + Ok(ResyncIterResult::IdleFor(Duration::from_millis( + time_msec - now, + ))) } } else { // Here we wait either for a notification that an item has been @@ -693,13 +601,7 @@ impl BlockManager { // between the time we checked the queue and the first poll // to resync_notify.notified(): if that happens, we'll just loop // back 10 seconds later, which is fine. - let delay = tokio::time::sleep(Duration::from_secs(10)); - select! { - _ = delay.fuse() => {}, - _ = self.resync_notify.notified().fuse() => {}, - _ = must_exit.changed().fuse() => {}, - } - Ok(false) + Ok(ResyncIterResult::IdleFor(Duration::from_secs(10))) } } @@ -814,72 +716,6 @@ impl BlockManager { Ok(()) } - - // ---- Utility: iteration on files in the data directory ---- - - async fn for_each_file( - &self, - state: State, - mut f: F, - must_exit: &watch::Receiver, - ) -> Result<(), Error> - where - F: FnMut(State, Hash) -> Fut + Send, - Fut: Future> + Send, - State: Send, - { - self.for_each_file_rec(&self.data_dir, state, &mut f, must_exit) - .await - .map(|_| ()) - } - - fn for_each_file_rec<'a, F, Fut, State>( - &'a self, - path: &'a Path, - mut state: State, - f: &'a mut F, - must_exit: &'a watch::Receiver, - ) -> BoxFuture<'a, Result> - where - F: FnMut(State, Hash) -> Fut + Send, - Fut: Future> + Send, - State: Send + 'a, - { - async move { - let mut ls_data_dir = fs::read_dir(path).await?; - while let Some(data_dir_ent) = ls_data_dir.next_entry().await? 
{ - if *must_exit.borrow() { - break; - } - - let name = data_dir_ent.file_name(); - let name = if let Ok(n) = name.into_string() { - n - } else { - continue; - }; - let ent_type = data_dir_ent.file_type().await?; - - let name = name.strip_suffix(".zst").unwrap_or(&name); - if name.len() == 2 && hex::decode(&name).is_ok() && ent_type.is_dir() { - state = self - .for_each_file_rec(&data_dir_ent.path(), state, f, must_exit) - .await?; - } else if name.len() == 64 { - let hash_bytes = if let Ok(h) = hex::decode(&name) { - h - } else { - continue; - }; - let mut hash = [0u8; 32]; - hash.copy_from_slice(&hash_bytes[..]); - state = f(state, hash.into()).await?; - } - } - Ok(state) - } - .boxed() - } } #[async_trait] @@ -898,6 +734,77 @@ impl EndpointHandler for BlockManager { } } +struct ResyncWorker { + manager: Arc, + tranquilizer: Tranquilizer, + next_delay: Duration, +} + +impl ResyncWorker { + fn new(manager: Arc) -> Self { + Self { + manager, + tranquilizer: Tranquilizer::new(30), + next_delay: Duration::from_secs(10), + } + } +} + +#[async_trait] +impl Worker for ResyncWorker { + fn name(&self) -> String { + "Block resync worker".into() + } + + fn info(&self) -> Option { + let mut ret = vec![]; + let qlen = self.manager.resync_queue_len().unwrap_or(0); + let elen = self.manager.resync_errors_len().unwrap_or(0); + if qlen > 0 { + ret.push(format!("{} blocks in queue", qlen)); + } + if elen > 0 { + ret.push(format!("{} blocks in error state", elen)); + } + if !ret.is_empty() { + Some(ret.join(", ")) + } else { + None + } + } + + async fn work(&mut self, _must_exit: &mut watch::Receiver) -> Result { + self.tranquilizer.reset(); + match self.manager.resync_iter().await { + Ok(ResyncIterResult::BusyDidSomething) => Ok(self + .tranquilizer + .tranquilize_worker(self.manager.background_tranquility)), + Ok(ResyncIterResult::BusyDidNothing) => Ok(WorkerState::Busy), + Ok(ResyncIterResult::IdleFor(delay)) => { + self.next_delay = delay; + Ok(WorkerState::Idle) + } + Err(e) => { + // The errors that we have here are only Sled errors + // We don't really know how to handle them so just ¯\_(ツ)_/¯ + // (there is kind of an assumption that Sled won't error on us, + // if it does there is not much we can do -- TODO should we just panic?) + // Here we just give the error to the worker manager, + // it will print it to the logs and increment a counter + Err(e.into()) + } + } + } + + async fn wait_for_work(&mut self, _must_exit: &watch::Receiver) -> WorkerState { + select! 
{ + _ = tokio::time::sleep(self.next_delay) => (), + _ = self.manager.resync_notify.notified() => (), + }; + WorkerState::Busy + } +} + struct BlockStatus { exists: bool, needed: RcEntry, diff --git a/src/block/repair.rs b/src/block/repair.rs new file mode 100644 index 00000000..07ff6772 --- /dev/null +++ b/src/block/repair.rs @@ -0,0 +1,444 @@ +use core::ops::Bound; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Duration; + +use async_trait::async_trait; +use serde::{Deserialize, Serialize}; +use tokio::fs; +use tokio::select; +use tokio::sync::mpsc; +use tokio::sync::watch; + +use garage_util::background::*; +use garage_util::data::*; +use garage_util::error::*; +use garage_util::persister::Persister; +use garage_util::time::*; +use garage_util::tranquilizer::Tranquilizer; + +use crate::manager::*; + +const SCRUB_INTERVAL: Duration = Duration::from_secs(3600 * 24 * 30); // full scrub every 30 days + +pub struct RepairWorker { + manager: Arc, + next_start: Option, + block_iter: Option, +} + +impl RepairWorker { + pub fn new(manager: Arc) -> Self { + Self { + manager, + next_start: None, + block_iter: None, + } + } +} + +#[async_trait] +impl Worker for RepairWorker { + fn name(&self) -> String { + "Block repair worker".into() + } + + fn info(&self) -> Option { + match self.block_iter.as_ref() { + None => { + let idx_bytes = self + .next_start + .as_ref() + .map(|x| x.as_slice()) + .unwrap_or(&[]); + let idx_bytes = if idx_bytes.len() > 4 { + &idx_bytes[..4] + } else { + idx_bytes + }; + Some(format!("Phase 1: {}", hex::encode(idx_bytes))) + } + Some(bi) => Some(format!("Phase 2: {:.2}% done", bi.progress() * 100.)), + } + } + + async fn work(&mut self, _must_exit: &mut watch::Receiver) -> Result { + match self.block_iter.as_mut() { + None => { + // Phase 1: Repair blocks from RC table. + + // We have to do this complicated two-step process where we first read a bunch + // of hashes from the RC table, and then insert them in the to-resync queue, + // because of SQLite. Basically, as long as we have an iterator on a DB table, + // we can't do anything else on the DB. The naive approach (which we had previously) + // of just iterating on the RC table and inserting items one to one in the resync + // queue can't work here, it would just provoke a deadlock in the SQLite adapter code. + // This is mostly because the Rust bindings for SQLite assume a worst-case scenario + // where SQLite is not compiled in thread-safe mode, so we have to wrap everything + // in a mutex (see db/sqlite_adapter.rs and discussion in PR #322). + // TODO: maybe do this with tokio::task::spawn_blocking ? + let mut batch_of_hashes = vec![]; + let start_bound = match self.next_start.as_ref() { + None => Bound::Unbounded, + Some(x) => Bound::Excluded(x.as_slice()), + }; + for entry in self + .manager + .rc + .rc + .range::<&[u8], _>((start_bound, Bound::Unbounded))? + { + let (hash, _) = entry?; + let hash = Hash::try_from(&hash[..]).unwrap(); + batch_of_hashes.push(hash); + if batch_of_hashes.len() >= 1000 { + break; + } + } + if batch_of_hashes.is_empty() { + // move on to phase 2 + self.block_iter = Some(BlockStoreIterator::new(&self.manager)); + return Ok(WorkerState::Busy); + } + + for hash in batch_of_hashes.into_iter() { + self.manager.put_to_resync(&hash, Duration::from_secs(0))?; + self.next_start = Some(hash) + } + + Ok(WorkerState::Busy) + } + Some(bi) => { + // Phase 2: Repair blocks actually on disk + // Lists all blocks on disk and adds them to the resync queue. 
+ // This allows us to find blocks we are storing but don't actually need, + // so that we can offload them if necessary and then delete them locally. + if let Some(hash) = bi.next().await? { + self.manager.put_to_resync(&hash, Duration::from_secs(0))?; + Ok(WorkerState::Busy) + } else { + Ok(WorkerState::Done) + } + } + } + } + + async fn wait_for_work(&mut self, _must_exit: &watch::Receiver) -> WorkerState { + unreachable!() + } +} + +// ---- + +pub struct ScrubWorker { + manager: Arc, + rx_cmd: mpsc::Receiver, + + work: ScrubWorkerState, + tranquilizer: Tranquilizer, + + persister: Persister, + persisted: ScrubWorkerPersisted, +} + +#[derive(Serialize, Deserialize)] +struct ScrubWorkerPersisted { + tranquility: u32, + time_last_complete_scrub: u64, + corruptions_detected: u64, +} + +enum ScrubWorkerState { + Running(BlockStoreIterator), + Paused(BlockStoreIterator, u64), // u64 = time when to resume scrub + Finished, +} + +impl Default for ScrubWorkerState { + fn default() -> Self { + ScrubWorkerState::Finished + } +} + +#[derive(Debug)] +pub enum ScrubWorkerCommand { + Start, + Pause(Duration), + Resume, + Cancel, + SetTranquility(u32), +} + +impl ScrubWorker { + pub fn new(manager: Arc, rx_cmd: mpsc::Receiver) -> Self { + let persister = Persister::new(&manager.system.metadata_dir, "scrub_info"); + let persisted = match persister.load() { + Ok(v) => v, + Err(_) => ScrubWorkerPersisted { + time_last_complete_scrub: 0, + tranquility: 4, + corruptions_detected: 0, + }, + }; + Self { + manager, + rx_cmd, + work: ScrubWorkerState::Finished, + tranquilizer: Tranquilizer::new(30), + persister, + persisted, + } + } + + async fn handle_cmd(&mut self, cmd: ScrubWorkerCommand) { + match cmd { + ScrubWorkerCommand::Start => { + self.work = match std::mem::take(&mut self.work) { + ScrubWorkerState::Finished => { + let iterator = BlockStoreIterator::new(&self.manager); + ScrubWorkerState::Running(iterator) + } + work => { + error!("Cannot start scrub worker: already running!"); + work + } + }; + } + ScrubWorkerCommand::Pause(dur) => { + self.work = match std::mem::take(&mut self.work) { + ScrubWorkerState::Running(it) | ScrubWorkerState::Paused(it, _) => { + ScrubWorkerState::Paused(it, now_msec() + dur.as_millis() as u64) + } + work => { + error!("Cannot pause scrub worker: not running!"); + work + } + }; + } + ScrubWorkerCommand::Resume => { + self.work = match std::mem::take(&mut self.work) { + ScrubWorkerState::Paused(it, _) => ScrubWorkerState::Running(it), + work => { + error!("Cannot resume scrub worker: not paused!"); + work + } + }; + } + ScrubWorkerCommand::Cancel => { + self.work = match std::mem::take(&mut self.work) { + ScrubWorkerState::Running(_) | ScrubWorkerState::Paused(_, _) => { + ScrubWorkerState::Finished + } + work => { + error!("Cannot cancel scrub worker: not running!"); + work + } + } + } + ScrubWorkerCommand::SetTranquility(t) => { + self.persisted.tranquility = t; + if let Err(e) = self.persister.save_async(&self.persisted).await { + error!("Could not save new tranquilitiy value: {}", e); + } + } + } + } +} + +#[async_trait] +impl Worker for ScrubWorker { + fn name(&self) -> String { + "Block scrub worker".into() + } + + fn info(&self) -> Option { + let s = match &self.work { + ScrubWorkerState::Running(bsi) => format!( + "{:.2}% done (tranquility = {})", + bsi.progress() * 100., + self.persisted.tranquility + ), + ScrubWorkerState::Paused(bsi, rt) => { + format!( + "Paused, {:.2}% done, resumes at {}", + bsi.progress() * 100., + msec_to_rfc3339(*rt) + ) + } + 
ScrubWorkerState::Finished => format!( + "Last completed scrub: {}", + msec_to_rfc3339(self.persisted.time_last_complete_scrub) + ), + }; + Some(format!( + "{} ; corruptions detected: {}", + s, self.persisted.corruptions_detected + )) + } + + async fn work(&mut self, _must_exit: &mut watch::Receiver) -> Result { + match self.rx_cmd.try_recv() { + Ok(cmd) => self.handle_cmd(cmd).await, + Err(mpsc::error::TryRecvError::Disconnected) => return Ok(WorkerState::Done), + Err(mpsc::error::TryRecvError::Empty) => (), + }; + + match &mut self.work { + ScrubWorkerState::Running(bsi) => { + self.tranquilizer.reset(); + if let Some(hash) = bsi.next().await? { + match self.manager.read_block(&hash).await { + Err(Error::CorruptData(_)) => { + error!("Found corrupt data block during scrub: {:?}", hash); + self.persisted.corruptions_detected += 1; + self.persister.save_async(&self.persisted).await?; + } + Err(e) => return Err(e), + _ => (), + }; + Ok(self + .tranquilizer + .tranquilize_worker(self.persisted.tranquility)) + } else { + self.persisted.time_last_complete_scrub = now_msec(); + self.persister.save_async(&self.persisted).await?; + self.work = ScrubWorkerState::Finished; + self.tranquilizer.clear(); + Ok(WorkerState::Idle) + } + } + _ => Ok(WorkerState::Idle), + } + } + + async fn wait_for_work(&mut self, _must_exit: &watch::Receiver) -> WorkerState { + let (wait_until, command) = match &self.work { + ScrubWorkerState::Running(_) => return WorkerState::Busy, + ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume), + ScrubWorkerState::Finished => ( + self.persisted.time_last_complete_scrub + SCRUB_INTERVAL.as_millis() as u64, + ScrubWorkerCommand::Start, + ), + }; + + let now = now_msec(); + if now >= wait_until { + self.handle_cmd(command).await; + return WorkerState::Busy; + } + let delay = Duration::from_millis(wait_until - now); + select! { + _ = tokio::time::sleep(delay) => self.handle_cmd(command).await, + cmd = self.rx_cmd.recv() => if let Some(cmd) = cmd { + self.handle_cmd(cmd).await; + } else { + return WorkerState::Done; + } + } + + match &self.work { + ScrubWorkerState::Running(_) => WorkerState::Busy, + _ => WorkerState::Idle, + } + } +} + +// ---- + +struct BlockStoreIterator { + path: Vec, +} + +enum ReadingDir { + Pending(PathBuf), + Read { + subpaths: Vec, + pos: usize, + }, +} + +impl BlockStoreIterator { + fn new(manager: &BlockManager) -> Self { + let root_dir = manager.data_dir.clone(); + Self { + path: vec![ReadingDir::Pending(root_dir)], + } + } + + /// Returns progress done, between 0 and 1 + fn progress(&self) -> f32 { + if self.path.is_empty() { + 1.0 + } else { + let mut ret = 0.0; + let mut next_div = 1; + for p in self.path.iter() { + match p { + ReadingDir::Pending(_) => break, + ReadingDir::Read { subpaths, pos } => { + next_div *= subpaths.len(); + ret += ((*pos - 1) as f32) / (next_div as f32); + } + } + } + ret + } + } + + async fn next(&mut self) -> Result, Error> { + loop { + let last_path = match self.path.last_mut() { + None => return Ok(None), + Some(lp) => lp, + }; + + if let ReadingDir::Pending(path) = last_path { + let mut reader = fs::read_dir(&path).await?; + let mut subpaths = vec![]; + while let Some(ent) = reader.next_entry().await? 
{ + subpaths.push(ent); + } + *last_path = ReadingDir::Read { subpaths, pos: 0 }; + } + + let (subpaths, pos) = match *last_path { + ReadingDir::Read { + ref subpaths, + ref mut pos, + } => (subpaths, pos), + ReadingDir::Pending(_) => unreachable!(), + }; + + let data_dir_ent = match subpaths.get(*pos) { + None => { + self.path.pop(); + continue; + } + Some(ent) => { + *pos += 1; + ent + } + }; + + let name = data_dir_ent.file_name(); + let name = if let Ok(n) = name.into_string() { + n + } else { + continue; + }; + let ent_type = data_dir_ent.file_type().await?; + + let name = name.strip_suffix(".zst").unwrap_or(&name); + if name.len() == 2 && hex::decode(&name).is_ok() && ent_type.is_dir() { + let path = data_dir_ent.path(); + self.path.push(ReadingDir::Pending(path)); + } else if name.len() == 64 { + if let Ok(h) = hex::decode(&name) { + let mut hash = [0u8; 32]; + hash.copy_from_slice(&h); + return Ok(Some(hash.into())); + } + } + } + } +} diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml index 6d8f64be..f697054b 100644 --- a/src/db/Cargo.toml +++ b/src/db/Cargo.toml @@ -19,7 +19,7 @@ required-features = ["cli"] [dependencies] err-derive = "0.3" hexdump = "0.1" -log = "0.4" +tracing = "0.1.30" heed = "0.11" rusqlite = { version = "0.27", features = ["bundled"] } diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs index 68d96ca0..97a78b07 100644 --- a/src/db/sqlite_adapter.rs +++ b/src/db/sqlite_adapter.rs @@ -6,7 +6,7 @@ use std::pin::Pin; use std::ptr::NonNull; use std::sync::{Arc, Mutex, MutexGuard}; -use log::trace; +use tracing::trace; use rusqlite::{params, Connection, Rows, Statement, Transaction}; diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 640e6975..8948e750 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -23,6 +23,7 @@ path = "tests/lib.rs" [dependencies] garage_db = { version = "0.8.0", path = "../db" } garage_api = { version = "0.7.0", path = "../api" } +garage_block = { version = "0.7.0", path = "../block" } garage_model = { version = "0.7.0", path = "../model" } garage_rpc = { version = "0.7.0", path = "../rpc" } garage_table = { version = "0.7.0", path = "../table" } @@ -31,6 +32,7 @@ garage_web = { version = "0.7.0", path = "../web" } bytes = "1.0" bytesize = "1.1" +timeago = "0.3" hex = "0.4" tracing = { version = "0.1.30", features = ["log-always"] } pretty_env_logger = "0.4" diff --git a/src/garage/admin.rs b/src/garage/admin.rs index 48914655..71ee608c 100644 --- a/src/garage/admin.rs +++ b/src/garage/admin.rs @@ -24,7 +24,7 @@ use garage_model::migrate::Migrate; use garage_model::permission::*; use crate::cli::*; -use crate::repair::online::OnlineRepair; +use crate::repair::online::launch_online_repair; pub const ADMIN_RPC_PATH: &str = "garage/admin_rpc.rs/Rpc"; @@ -36,6 +36,7 @@ pub enum AdminRpc { LaunchRepair(RepairOpt), Migrate(MigrateOpt), Stats(StatsOpt), + Worker(WorkerOpt), // Replies Ok(String), @@ -47,6 +48,10 @@ pub enum AdminRpc { }, KeyList(Vec<(String, String)>), KeyInfo(Key, HashMap), + WorkerList( + HashMap, + WorkerListOpt, + ), } impl Rpc for AdminRpc { @@ -693,15 +698,7 @@ impl AdminRpcHandler { ))) } } else { - let repair = OnlineRepair { - garage: self.garage.clone(), - }; - self.garage - .system - .background - .spawn_worker("Repair worker".into(), move |must_exit| async move { - repair.repair_worker(opt, must_exit).await - }); + launch_online_repair(self.garage.clone(), opt).await; Ok(AdminRpc::Ok(format!( "Repair launched on {:?}", self.garage.system.id @@ -830,6 +827,17 @@ impl 
AdminRpcHandler { Ok(()) } + + // ---- + + async fn handle_worker_cmd(&self, opt: WorkerOpt) -> Result { + match opt.cmd { + WorkerCmd::List { opt } => { + let workers = self.garage.background.get_worker_info(); + Ok(AdminRpc::WorkerList(workers, opt)) + } + } + } } #[async_trait] @@ -845,6 +853,7 @@ impl EndpointHandler for AdminRpcHandler { AdminRpc::Migrate(opt) => self.handle_migrate(opt.clone()).await, AdminRpc::LaunchRepair(opt) => self.handle_launch_repair(opt.clone()).await, AdminRpc::Stats(opt) => self.handle_stats(opt.clone()).await, + AdminRpc::Worker(opt) => self.handle_worker_cmd(opt.clone()).await, m => Err(GarageError::unexpected_rpc_message(m).into()), } } diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 3a0bd956..1aa2c2ff 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -1,4 +1,5 @@ use std::collections::HashSet; +use std::time::Duration; use garage_util::error::*; use garage_util::formater::format_table; @@ -39,6 +40,7 @@ pub async fn cli_command_dispatch( cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::LaunchRepair(ro)).await } Command::Stats(so) => cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::Stats(so)).await, + Command::Worker(wo) => cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::Worker(wo)).await, _ => unreachable!(), } } @@ -100,6 +102,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()]; for adv in status.iter().filter(|adv| !adv.is_up) { if let Some(NodeRoleV(Some(cfg))) = layout.roles.get(&adv.id) { + let tf = timeago::Formatter::new(); failed_nodes.push(format!( "{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}", id = adv.id, @@ -110,7 +113,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> capacity = cfg.capacity_string(), last_seen = adv .last_seen_secs_ago - .map(|s| format!("{}s ago", s)) + .map(|s| tf.convert(Duration::from_secs(s))) .unwrap_or_else(|| "never seen".into()), )); } @@ -182,6 +185,9 @@ pub async fn cmd_admin( AdminRpc::KeyInfo(key, rb) => { print_key_info(&key, &rb); } + AdminRpc::WorkerList(wi, wlo) => { + print_worker_info(wi, wlo); + } r => { error!("Unexpected response: {:?}", r); } diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 4f2efe19..bc44b5ef 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -45,6 +45,10 @@ pub enum Command { /// Gather node statistics #[structopt(name = "stats")] Stats(StatsOpt), + + /// Manage background workers + #[structopt(name = "worker")] + Worker(WorkerOpt), } #[derive(StructOpt, Debug)] @@ -423,8 +427,29 @@ pub enum RepairWhat { /// Verify integrity of all blocks on disc (extremely slow, i/o intensive) #[structopt(name = "scrub")] Scrub { - /// Tranquility factor (see tranquilizer documentation) - #[structopt(name = "tranquility", default_value = "2")] + #[structopt(subcommand)] + cmd: ScrubCmd, + }, +} + +#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] +pub enum ScrubCmd { + /// Start scrub + #[structopt(name = "start")] + Start, + /// Pause scrub (it will resume automatically after 24 hours) + #[structopt(name = "pause")] + Pause, + /// Resume paused scrub + #[structopt(name = "resume")] + Resume, + /// Cancel scrub in progress + #[structopt(name = "cancel")] + Cancel, + /// Set tranquility level for in-progress and future scrubs + #[structopt(name = "set-tranquility")] + SetTranquility { + #[structopt()] tranquility: u32, }, } @@ -460,3 +485,29 @@ pub struct 
StatsOpt { #[structopt(short = "d", long = "detailed")] pub detailed: bool, } + +#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)] +pub struct WorkerOpt { + #[structopt(subcommand)] + pub cmd: WorkerCmd, +} + +#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] +pub enum WorkerCmd { + /// List all workers on Garage node + #[structopt(name = "list")] + List { + #[structopt(flatten)] + opt: WorkerListOpt, + }, +} + +#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone, Copy)] +pub struct WorkerListOpt { + /// Show only busy workers + #[structopt(short = "b", long = "busy")] + pub busy: bool, + /// Show only workers with errors + #[structopt(short = "e", long = "errors")] + pub errors: bool, +} diff --git a/src/garage/cli/util.rs b/src/garage/cli/util.rs index 329e8a3e..396938ae 100644 --- a/src/garage/cli/util.rs +++ b/src/garage/cli/util.rs @@ -1,14 +1,19 @@ use std::collections::HashMap; +use std::time::Duration; +use garage_util::background::*; use garage_util::crdt::*; use garage_util::data::Uuid; use garage_util::error::*; use garage_util::formater::format_table; +use garage_util::time::*; use garage_model::bucket_table::*; use garage_model::key_table::*; use garage_model::s3::object_table::{BYTES, OBJECTS, UNFINISHED_UPLOADS}; +use crate::cli::structs::WorkerListOpt; + pub fn print_bucket_list(bl: Vec) { println!("List of buckets:"); @@ -235,3 +240,56 @@ pub fn find_matching_node( Ok(candidates[0]) } } + +pub fn print_worker_info(wi: HashMap, wlo: WorkerListOpt) { + let mut wi = wi.into_iter().collect::>(); + wi.sort_by_key(|(tid, info)| { + ( + match info.state { + WorkerState::Busy | WorkerState::Throttled(_) => 0, + WorkerState::Idle => 1, + WorkerState::Done => 2, + }, + *tid, + ) + }); + + let mut table = vec![]; + for (tid, info) in wi.iter() { + if wlo.busy && !matches!(info.state, WorkerState::Busy | WorkerState::Throttled(_)) { + continue; + } + if wlo.errors && info.errors == 0 { + continue; + } + + table.push(format!("{}\t{}\t{}", tid, info.state, info.name)); + if let Some(i) = &info.info { + table.push(format!("\t\t {}", i)); + } + let tf = timeago::Formatter::new(); + let (err_ago, err_msg) = info + .last_error + .as_ref() + .map(|(m, t)| { + ( + tf.convert(Duration::from_millis(now_msec() - t)), + m.as_str(), + ) + }) + .unwrap_or(("(?) 
ago".into(), "(?)")); + if info.consecutive_errors > 0 { + table.push(format!( + "\t\t {} consecutive errors ({} total), last {}", + info.consecutive_errors, info.errors, err_ago, + )); + table.push(format!("\t\t {}", err_msg)); + } else if info.errors > 0 { + table.push(format!("\t\t ({} errors, last {})", info.errors, err_ago,)); + if wlo.errors { + table.push(format!("\t\t {}", err_msg)); + } + } + } + format_table(table); +} diff --git a/src/garage/repair/online.rs b/src/garage/repair/online.rs index d6a71742..e33cf097 100644 --- a/src/garage/repair/online.rs +++ b/src/garage/repair/online.rs @@ -1,89 +1,110 @@ use std::sync::Arc; +use std::time::Duration; +use async_trait::async_trait; use tokio::sync::watch; +use garage_block::repair::ScrubWorkerCommand; use garage_model::garage::Garage; use garage_model::s3::block_ref_table::*; use garage_model::s3::object_table::*; use garage_model::s3::version_table::*; use garage_table::*; +use garage_util::background::*; use garage_util::error::Error; use crate::*; -pub struct OnlineRepair { - pub garage: Arc, -} - -impl OnlineRepair { - pub async fn repair_worker(&self, opt: RepairOpt, must_exit: watch::Receiver) { - if let Err(e) = self.repair_worker_aux(opt, must_exit).await { - warn!("Repair worker failed with error: {}", e); +pub async fn launch_online_repair(garage: Arc, opt: RepairOpt) { + match opt.what { + RepairWhat::Tables => { + info!("Launching a full sync of tables"); + garage.bucket_table.syncer.add_full_sync(); + garage.object_table.syncer.add_full_sync(); + garage.version_table.syncer.add_full_sync(); + garage.block_ref_table.syncer.add_full_sync(); + garage.key_table.syncer.add_full_sync(); + } + RepairWhat::Versions => { + info!("Repairing the versions table"); + garage + .background + .spawn_worker(RepairVersionsWorker::new(garage.clone())); + } + RepairWhat::BlockRefs => { + info!("Repairing the block refs table"); + garage + .background + .spawn_worker(RepairBlockrefsWorker::new(garage.clone())); + } + RepairWhat::Blocks => { + info!("Repairing the stored blocks"); + garage + .background + .spawn_worker(garage_block::repair::RepairWorker::new( + garage.block_manager.clone(), + )); + } + RepairWhat::Scrub { cmd } => { + let cmd = match cmd { + ScrubCmd::Start => ScrubWorkerCommand::Start, + ScrubCmd::Pause => ScrubWorkerCommand::Pause(Duration::from_secs(3600 * 24)), + ScrubCmd::Resume => ScrubWorkerCommand::Resume, + ScrubCmd::Cancel => ScrubWorkerCommand::Cancel, + ScrubCmd::SetTranquility { tranquility } => { + ScrubWorkerCommand::SetTranquility(tranquility) + } + }; + info!("Sending command to scrub worker: {:?}", cmd); + garage.block_manager.send_scrub_command(cmd).await; } } +} - async fn repair_worker_aux( - &self, - opt: RepairOpt, - must_exit: watch::Receiver, - ) -> Result<(), Error> { - match opt.what { - RepairWhat::Tables => { - info!("Launching a full sync of tables"); - self.garage.bucket_table.syncer.add_full_sync(); - self.garage.object_table.syncer.add_full_sync(); - self.garage.version_table.syncer.add_full_sync(); - self.garage.block_ref_table.syncer.add_full_sync(); - self.garage.key_table.syncer.add_full_sync(); - } - RepairWhat::Versions => { - info!("Repairing the versions table"); - self.repair_versions(&must_exit).await?; - } - RepairWhat::BlockRefs => { - info!("Repairing the block refs table"); - self.repair_block_ref(&must_exit).await?; - } - RepairWhat::Blocks => { - info!("Repairing the stored blocks"); - self.garage - .block_manager - .repair_data_store(&must_exit) - .await?; - } - 
RepairWhat::Scrub { tranquility } => { - info!("Verifying integrity of stored blocks"); - self.garage - .block_manager - .scrub_data_store(&must_exit, tranquility) - .await?; - } +// ---- + +struct RepairVersionsWorker { + garage: Arc, + pos: Vec, + counter: usize, +} + +impl RepairVersionsWorker { + fn new(garage: Arc) -> Self { + Self { + garage, + pos: vec![], + counter: 0, } - Ok(()) } +} - async fn repair_versions(&self, must_exit: &watch::Receiver) -> Result<(), Error> { - let mut pos = vec![]; - let mut i = 0; +#[async_trait] +impl Worker for RepairVersionsWorker { + fn name(&self) -> String { + "Version repair worker".into() + } - while !*must_exit.borrow() { - let item_bytes = match self.garage.version_table.data.store.get_gt(pos)? { - Some((k, v)) => { - pos = k; - v - } - None => break, - }; + fn info(&self) -> Option { + Some(format!("{} items done", self.counter)) + } - i += 1; - if i % 1000 == 0 { - info!("repair_versions: {}", i); + async fn work(&mut self, _must_exit: &mut watch::Receiver) -> Result { + let item_bytes = match self.garage.version_table.data.store.get_gt(&self.pos)? { + Some((k, v)) => { + self.pos = k; + v } - - let version = rmp_serde::decode::from_read_ref::<_, Version>(&item_bytes)?; - if version.deleted.get() { - continue; + None => { + info!("repair_versions: finished, done {}", self.counter); + return Ok(WorkerState::Done); } + }; + + self.counter += 1; + + let version = rmp_serde::decode::from_read_ref::<_, Version>(&item_bytes)?; + if !version.deleted.get() { let object = self .garage .object_table @@ -109,32 +130,59 @@ impl OnlineRepair { .await?; } } - info!("repair_versions: finished, done {}", i); - Ok(()) + + Ok(WorkerState::Busy) } - async fn repair_block_ref(&self, must_exit: &watch::Receiver) -> Result<(), Error> { - let mut pos = vec![]; - let mut i = 0; + async fn wait_for_work(&mut self, _must_exit: &watch::Receiver) -> WorkerState { + unreachable!() + } +} - while !*must_exit.borrow() { - let item_bytes = match self.garage.block_ref_table.data.store.get_gt(pos)? { - Some((k, v)) => { - pos = k; - v - } - None => break, - }; +// ---- - i += 1; - if i % 1000 == 0 { - info!("repair_block_ref: {}", i); - } +struct RepairBlockrefsWorker { + garage: Arc, + pos: Vec, + counter: usize, +} - let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(&item_bytes)?; - if block_ref.deleted.get() { - continue; +impl RepairBlockrefsWorker { + fn new(garage: Arc) -> Self { + Self { + garage, + pos: vec![], + counter: 0, + } + } +} + +#[async_trait] +impl Worker for RepairBlockrefsWorker { + fn name(&self) -> String { + "Block refs repair worker".into() + } + + fn info(&self) -> Option { + Some(format!("{} items done", self.counter)) + } + + async fn work(&mut self, _must_exit: &mut watch::Receiver) -> Result { + let item_bytes = match self.garage.block_ref_table.data.store.get_gt(&self.pos)? 
{ + Some((k, v)) => { + self.pos = k; + v } + None => { + info!("repair_block_ref: finished, done {}", self.counter); + return Ok(WorkerState::Done); + } + }; + + self.counter += 1; + + let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(&item_bytes)?; + if !block_ref.deleted.get() { let version = self .garage .version_table @@ -157,7 +205,11 @@ impl OnlineRepair { .await?; } } - info!("repair_block_ref: finished, done {}", i); - Ok(()) + + Ok(WorkerState::Busy) + } + + async fn wait_for_work(&mut self, _must_exit: &watch::Receiver) -> WorkerState { + unreachable!() } } diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs index 36e8172b..26833390 100644 --- a/src/model/index_counter.rs +++ b/src/model/index_counter.rs @@ -2,8 +2,8 @@ use core::ops::Bound; use std::collections::{hash_map, BTreeMap, HashMap}; use std::marker::PhantomData; use std::sync::Arc; -use std::time::Duration; +use async_trait::async_trait; use serde::{Deserialize, Serialize}; use tokio::sync::{mpsc, watch}; @@ -11,6 +11,7 @@ use garage_db as db; use garage_rpc::ring::Ring; use garage_rpc::system::System; +use garage_util::background::*; use garage_util::data::*; use garage_util::error::*; use garage_util::time::*; @@ -171,11 +172,13 @@ impl IndexCounter { ), }); - let this2 = this.clone(); - background.spawn_worker( - format!("{} index counter propagator", T::COUNTER_TABLE_NAME), - move |must_exit| this2.clone().propagate_loop(propagate_rx, must_exit), - ); + background.spawn_worker(IndexPropagatorWorker { + index_counter: this.clone(), + propagate_rx, + buf: HashMap::new(), + errors: 0, + }); + this } @@ -239,68 +242,6 @@ impl IndexCounter { Ok(()) } - async fn propagate_loop( - self: Arc, - mut propagate_rx: mpsc::UnboundedReceiver<(T::CP, T::CS, LocalCounterEntry)>, - must_exit: watch::Receiver, - ) { - // This loop batches updates to counters to be sent all at once. - // They are sent once the propagate_rx channel has been emptied (or is closed). 
- let mut buf = HashMap::new(); - let mut errors = 0; - - loop { - let (ent, closed) = match propagate_rx.try_recv() { - Ok(ent) => (Some(ent), false), - Err(mpsc::error::TryRecvError::Empty) if buf.is_empty() => { - match propagate_rx.recv().await { - Some(ent) => (Some(ent), false), - None => (None, true), - } - } - Err(mpsc::error::TryRecvError::Empty) => (None, false), - Err(mpsc::error::TryRecvError::Disconnected) => (None, true), - }; - - if let Some((pk, sk, counters)) = ent { - let tree_key = self.table.data.tree_key(&pk, &sk); - let dist_entry = counters.into_counter_entry(self.this_node); - match buf.entry(tree_key) { - hash_map::Entry::Vacant(e) => { - e.insert(dist_entry); - } - hash_map::Entry::Occupied(mut e) => { - e.get_mut().merge(&dist_entry); - } - } - // As long as we can add entries, loop back and add them to batch - // before sending batch to other nodes - continue; - } - - if !buf.is_empty() { - let entries = buf.iter().map(|(_k, v)| v); - if let Err(e) = self.table.insert_many(entries).await { - errors += 1; - if errors >= 2 && *must_exit.borrow() { - error!("({}) Could not propagate {} counter values: {}, these counters will not be updated correctly.", T::COUNTER_TABLE_NAME, buf.len(), e); - break; - } - warn!("({}) Could not propagate {} counter values: {}, retrying in 5 seconds (retry #{})", T::COUNTER_TABLE_NAME, buf.len(), e, errors); - tokio::time::sleep(Duration::from_secs(5)).await; - continue; - } - - buf.clear(); - errors = 0; - } - - if closed || *must_exit.borrow() { - break; - } - } - } - pub fn offline_recount_all( &self, counted_table: &Arc>, @@ -437,6 +378,98 @@ impl IndexCounter { } } +struct IndexPropagatorWorker { + index_counter: Arc>, + propagate_rx: mpsc::UnboundedReceiver<(T::CP, T::CS, LocalCounterEntry)>, + + buf: HashMap, CounterEntry>, + errors: usize, +} + +impl IndexPropagatorWorker { + fn add_ent(&mut self, pk: T::CP, sk: T::CS, counters: LocalCounterEntry) { + let tree_key = self.index_counter.table.data.tree_key(&pk, &sk); + let dist_entry = counters.into_counter_entry(self.index_counter.this_node); + match self.buf.entry(tree_key) { + hash_map::Entry::Vacant(e) => { + e.insert(dist_entry); + } + hash_map::Entry::Occupied(mut e) => { + e.get_mut().merge(&dist_entry); + } + } + } +} + +#[async_trait] +impl Worker for IndexPropagatorWorker { + fn name(&self) -> String { + format!("{} index counter propagator", T::COUNTER_TABLE_NAME) + } + + fn info(&self) -> Option { + if !self.buf.is_empty() { + Some(format!("{} items in queue", self.buf.len())) + } else { + None + } + } + + async fn work(&mut self, must_exit: &mut watch::Receiver) -> Result { + // This loop batches updates to counters to be sent all at once. + // They are sent once the propagate_rx channel has been emptied (or is closed). 
+ let closed = loop { + match self.propagate_rx.try_recv() { + Ok((pk, sk, counters)) => { + self.add_ent(pk, sk, counters); + } + Err(mpsc::error::TryRecvError::Empty) => break false, + Err(mpsc::error::TryRecvError::Disconnected) => break true, + } + }; + + if !self.buf.is_empty() { + let entries_k = self.buf.keys().take(100).cloned().collect::>(); + let entries = entries_k.iter().map(|k| self.buf.get(k).unwrap()); + if let Err(e) = self.index_counter.table.insert_many(entries).await { + self.errors += 1; + if self.errors >= 2 && *must_exit.borrow() { + error!("({}) Could not propagate {} counter values: {}, these counters will not be updated correctly.", T::COUNTER_TABLE_NAME, self.buf.len(), e); + return Ok(WorkerState::Done); + } + // Propagate error up to worker manager, it will log it, increment a counter, + // and sleep for a certain delay (with exponential backoff), waiting for + // things to go back to normal + return Err(e); + } else { + for k in entries_k { + self.buf.remove(&k); + } + self.errors = 0; + } + + return Ok(WorkerState::Busy); + } else if closed { + return Ok(WorkerState::Done); + } else { + return Ok(WorkerState::Idle); + } + } + + async fn wait_for_work(&mut self, _must_exit: &watch::Receiver) -> WorkerState { + match self.propagate_rx.recv().await { + Some((pk, sk, counters)) => { + self.add_ent(pk, sk, counters); + WorkerState::Busy + } + None => match self.buf.is_empty() { + false => WorkerState::Busy, + true => WorkerState::Done, + }, + } + } +} + #[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] struct LocalCounterEntry { pk: T::CP, diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 1d7c3ea4..f9f2970b 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::io::{Read, Write}; use std::net::{IpAddr, SocketAddr}; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::sync::{Arc, RwLock}; use std::time::{Duration, Instant}; @@ -104,6 +104,9 @@ pub struct System { /// The job runner of this node pub background: Arc, + + /// Path to metadata directory + pub metadata_dir: PathBuf, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -295,6 +298,7 @@ impl System { ring, update_ring: Mutex::new(update_ring), background, + metadata_dir: config.metadata_dir.clone(), }); sys.system_endpoint.set_handler(sys.clone()); sys diff --git a/src/table/gc.rs b/src/table/gc.rs index e7fbbcb0..12218d97 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -8,12 +8,11 @@ use serde::{Deserialize, Serialize}; use serde_bytes::ByteBuf; use futures::future::join_all; -use futures::select; -use futures_util::future::*; use tokio::sync::watch; use garage_db::counted_tree_hack::CountedTree; +use garage_util::background::*; use garage_util::data::*; use garage_util::error::*; use garage_util::time::*; @@ -69,35 +68,11 @@ where gc.endpoint.set_handler(gc.clone()); - let gc1 = gc.clone(); - system.background.spawn_worker( - format!("GC loop for {}", F::TABLE_NAME), - move |must_exit: watch::Receiver| gc1.gc_loop(must_exit), - ); + system.background.spawn_worker(GcWorker::new(gc.clone())); gc } - async fn gc_loop(self: Arc, mut must_exit: watch::Receiver) { - while !*must_exit.borrow() { - match self.gc_loop_iter().await { - Ok(None) => { - // Stuff was done, loop immediately - } - Ok(Some(wait_delay)) => { - // Nothing was done, wait specified delay. - select! 
{ - _ = tokio::time::sleep(wait_delay).fuse() => {}, - _ = must_exit.changed().fuse() => {}, - } - } - Err(e) => { - warn!("({}) Error doing GC: {}", F::TABLE_NAME, e); - } - } - } - } - async fn gc_loop_iter(&self) -> Result, Error> { let now = now_msec(); @@ -328,6 +303,66 @@ where } } +struct GcWorker +where + F: TableSchema + 'static, + R: TableReplication + 'static, +{ + gc: Arc>, + wait_delay: Duration, +} + +impl GcWorker +where + F: TableSchema + 'static, + R: TableReplication + 'static, +{ + fn new(gc: Arc>) -> Self { + Self { + gc, + wait_delay: Duration::from_secs(0), + } + } +} + +#[async_trait] +impl Worker for GcWorker +where + F: TableSchema + 'static, + R: TableReplication + 'static, +{ + fn name(&self) -> String { + format!("{} GC", F::TABLE_NAME) + } + + fn info(&self) -> Option { + let l = self.gc.data.gc_todo_len().unwrap_or(0); + if l > 0 { + Some(format!("{} items in queue", l)) + } else { + None + } + } + + async fn work(&mut self, _must_exit: &mut watch::Receiver) -> Result { + match self.gc.gc_loop_iter().await? { + None => Ok(WorkerState::Busy), + Some(delay) => { + self.wait_delay = delay; + Ok(WorkerState::Idle) + } + } + } + + async fn wait_for_work(&mut self, must_exit: &watch::Receiver) -> WorkerState { + if *must_exit.borrow() { + return WorkerState::Done; + } + tokio::time::sleep(self.wait_delay).await; + WorkerState::Busy + } +} + /// An entry stored in the gc_todo Sled tree associated with the table /// Contains helper function for parsing, saving, and removing /// such entry in Sled diff --git a/src/table/merkle.rs b/src/table/merkle.rs index 7685b193..a5c29723 100644 --- a/src/table/merkle.rs +++ b/src/table/merkle.rs @@ -1,14 +1,13 @@ use std::sync::Arc; use std::time::Duration; -use futures::select; -use futures_util::future::*; +use async_trait::async_trait; use serde::{Deserialize, Serialize}; use tokio::sync::watch; use garage_db as db; -use garage_util::background::BackgroundRunner; +use garage_util::background::*; use garage_util::data::*; use garage_util::error::Error; @@ -78,43 +77,17 @@ where empty_node_hash, }); - let ret2 = ret.clone(); - background.spawn_worker( - format!("Merkle tree updater for {}", F::TABLE_NAME), - |must_exit: watch::Receiver| ret2.updater_loop(must_exit), - ); + background.spawn_worker(MerkleWorker(ret.clone())); ret } - async fn updater_loop(self: Arc, mut must_exit: watch::Receiver) { - while !*must_exit.borrow() { - match self.updater_loop_iter() { - Ok(true) => (), - Ok(false) => { - select! { - _ = self.data.merkle_todo_notify.notified().fuse() => {}, - _ = must_exit.changed().fuse() => {}, - } - } - Err(e) => { - warn!( - "({}) Error while updating Merkle tree item: {}", - F::TABLE_NAME, - e - ); - tokio::time::sleep(Duration::from_secs(10)).await; - } - } - } - } - - fn updater_loop_iter(&self) -> Result { + fn updater_loop_iter(&self) -> Result { if let Some((key, valhash)) = self.data.merkle_todo.first()? 
{ self.update_item(&key, &valhash)?; - Ok(true) + Ok(WorkerState::Busy) } else { - Ok(false) + Ok(WorkerState::Idle) } } @@ -325,6 +298,54 @@ where } } +struct MerkleWorker(Arc>) +where + F: TableSchema + 'static, + R: TableReplication + 'static; + +#[async_trait] +impl Worker for MerkleWorker +where + F: TableSchema + 'static, + R: TableReplication + 'static, +{ + fn name(&self) -> String { + format!("{} Merkle tree updater", F::TABLE_NAME) + } + + fn info(&self) -> Option { + let l = self.0.todo_len().unwrap_or(0); + if l > 0 { + Some(format!("{} items in queue", l)) + } else { + None + } + } + + async fn work(&mut self, _must_exit: &mut watch::Receiver) -> Result { + let updater = self.0.clone(); + tokio::task::spawn_blocking(move || { + for _i in 0..100 { + let s = updater.updater_loop_iter(); + if !matches!(s, Ok(WorkerState::Busy)) { + return s; + } + } + Ok(WorkerState::Busy) + }) + .await + .unwrap() + } + + async fn wait_for_work(&mut self, must_exit: &watch::Receiver) -> WorkerState { + if *must_exit.borrow() { + return WorkerState::Done; + } + tokio::time::sleep(Duration::from_secs(10)).await; + WorkerState::Busy + } +} + impl MerkleNodeKey { fn encode(&self) -> Vec { let mut ret = Vec::with_capacity(2 + self.prefix.len()); diff --git a/src/table/sync.rs b/src/table/sync.rs index 4c83e991..b3756a5e 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -1,17 +1,17 @@ use std::collections::VecDeque; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use std::time::{Duration, Instant}; use async_trait::async_trait; -use futures::select; -use futures_util::future::*; use futures_util::stream::*; use opentelemetry::KeyValue; use rand::Rng; use serde::{Deserialize, Serialize}; use serde_bytes::ByteBuf; +use tokio::select; use tokio::sync::{mpsc, watch}; +use garage_util::background::*; use garage_util::data::*; use garage_util::error::Error; @@ -34,7 +34,7 @@ pub struct TableSyncer data: Arc>, merkle: Arc>, - todo: Mutex, + add_full_sync_tx: mpsc::UnboundedSender<()>, endpoint: Arc>, } @@ -52,10 +52,6 @@ impl Rpc for SyncRpc { type Response = Result; } -struct SyncTodo { - todo: Vec, -} - #[derive(Debug, Clone)] struct TodoPartition { partition: Partition, @@ -80,118 +76,40 @@ where .netapp .endpoint(format!("garage_table/sync.rs/Rpc:{}", F::TABLE_NAME)); - let todo = SyncTodo { todo: vec![] }; + let (add_full_sync_tx, add_full_sync_rx) = mpsc::unbounded_channel(); let syncer = Arc::new(Self { system: system.clone(), data, merkle, - todo: Mutex::new(todo), + add_full_sync_tx, endpoint, }); syncer.endpoint.set_handler(syncer.clone()); - let (busy_tx, busy_rx) = mpsc::unbounded_channel(); - - let s1 = syncer.clone(); - system.background.spawn_worker( - format!("table sync watcher for {}", F::TABLE_NAME), - move |must_exit: watch::Receiver| s1.watcher_task(must_exit, busy_rx), - ); - - let s2 = syncer.clone(); - system.background.spawn_worker( - format!("table syncer for {}", F::TABLE_NAME), - move |must_exit: watch::Receiver| s2.syncer_task(must_exit, busy_tx), - ); - - let s3 = syncer.clone(); - tokio::spawn(async move { - tokio::time::sleep(Duration::from_secs(20)).await; - s3.add_full_sync(); + system.background.spawn_worker(SyncWorker { + syncer: syncer.clone(), + ring_recv: system.ring.clone(), + ring: system.ring.borrow().clone(), + add_full_sync_rx, + todo: vec![], + next_full_sync: Instant::now() + Duration::from_secs(20), }); syncer } - async fn watcher_task( - self: Arc, - mut must_exit: watch::Receiver, - mut busy_rx: mpsc::UnboundedReceiver, - ) { - let mut prev_ring: 
Arc = self.system.ring.borrow().clone(); - let mut ring_recv: watch::Receiver> = self.system.ring.clone(); - let mut nothing_to_do_since = Some(Instant::now()); - - while !*must_exit.borrow() { - select! { - _ = ring_recv.changed().fuse() => { - let new_ring = ring_recv.borrow(); - if !Arc::ptr_eq(&new_ring, &prev_ring) { - debug!("({}) Ring changed, adding full sync to syncer todo list", F::TABLE_NAME); - self.add_full_sync(); - prev_ring = new_ring.clone(); - } - } - busy_opt = busy_rx.recv().fuse() => { - if let Some(busy) = busy_opt { - if busy { - nothing_to_do_since = None; - } else if nothing_to_do_since.is_none() { - nothing_to_do_since = Some(Instant::now()); - } - } - } - _ = must_exit.changed().fuse() => {}, - _ = tokio::time::sleep(Duration::from_secs(1)).fuse() => { - if nothing_to_do_since.map(|t| Instant::now() - t >= ANTI_ENTROPY_INTERVAL).unwrap_or(false) { - nothing_to_do_since = None; - debug!("({}) Interval passed, adding full sync to syncer todo list", F::TABLE_NAME); - self.add_full_sync(); - } - } - } - } - } - pub fn add_full_sync(&self) { - self.todo - .lock() - .unwrap() - .add_full_sync(&self.data, &self.system); - } - - async fn syncer_task( - self: Arc, - mut must_exit: watch::Receiver, - busy_tx: mpsc::UnboundedSender, - ) { - while !*must_exit.borrow() { - let task = self.todo.lock().unwrap().pop_task(); - if let Some(partition) = task { - busy_tx.send(true).unwrap(); - let res = self - .clone() - .sync_partition(&partition, &mut must_exit) - .await; - if let Err(e) = res { - warn!( - "({}) Error while syncing {:?}: {}", - F::TABLE_NAME, - partition, - e - ); - } - } else { - busy_tx.send(false).unwrap(); - tokio::time::sleep(Duration::from_secs(1)).await; - } + if self.add_full_sync_tx.send(()).is_err() { + error!("({}) Could not add full sync", F::TABLE_NAME); } } + // ---- + async fn sync_partition( - self: Arc, + self: &Arc, partition: &TodoPartition, must_exit: &mut watch::Receiver, ) -> Result<(), Error> { @@ -577,12 +495,22 @@ where } } -impl SyncTodo { - fn add_full_sync( - &mut self, - data: &TableData, - system: &System, - ) { +// -------- Sync Worker --------- + +struct SyncWorker { + syncer: Arc>, + ring_recv: watch::Receiver>, + ring: Arc, + add_full_sync_rx: mpsc::UnboundedReceiver<()>, + todo: Vec, + next_full_sync: Instant, +} + +impl SyncWorker { + fn add_full_sync(&mut self) { + let system = &self.syncer.system; + let data = &self.syncer.data; + let my_id = system.id; self.todo.clear(); @@ -623,6 +551,8 @@ impl SyncTodo { retain, }); } + + self.next_full_sync = Instant::now() + ANTI_ENTROPY_INTERVAL; } fn pop_task(&mut self) -> Option { @@ -641,6 +571,62 @@ impl SyncTodo { } } +#[async_trait] +impl Worker for SyncWorker { + fn name(&self) -> String { + format!("{} sync", F::TABLE_NAME) + } + + fn info(&self) -> Option { + let l = self.todo.len(); + if l > 0 { + Some(format!("{} partitions remaining", l)) + } else { + None + } + } + + async fn work(&mut self, must_exit: &mut watch::Receiver) -> Result { + if let Some(partition) = self.pop_task() { + self.syncer.sync_partition(&partition, must_exit).await?; + Ok(WorkerState::Busy) + } else { + Ok(WorkerState::Idle) + } + } + + async fn wait_for_work(&mut self, must_exit: &watch::Receiver) -> WorkerState { + if *must_exit.borrow() { + return WorkerState::Done; + } + select! 
{ + s = self.add_full_sync_rx.recv() => { + if let Some(()) = s { + self.add_full_sync(); + } + }, + _ = self.ring_recv.changed() => { + let new_ring = self.ring_recv.borrow(); + if !Arc::ptr_eq(&new_ring, &self.ring) { + self.ring = new_ring.clone(); + drop(new_ring); + debug!("({}) Ring changed, adding full sync to syncer todo list", F::TABLE_NAME); + self.add_full_sync(); + } + }, + _ = tokio::time::sleep(self.next_full_sync - Instant::now()) => { + self.add_full_sync(); + } + } + match self.todo.is_empty() { + false => WorkerState::Busy, + true => WorkerState::Idle, + } + } +} + +// ---- UTIL ---- + fn hash_of(x: &T) -> Result { Ok(blake2sum(&rmp_to_vec_all_named(x)?[..])) } diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index 5d073436..57c70ffb 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -16,6 +16,7 @@ path = "lib.rs" [dependencies] garage_db = { version = "0.8.0", path = "../db" } +async-trait = "0.1" blake2 = "0.9" err-derive = "0.3" xxhash-rust = { version = "0.8", default-features = false, features = ["xxh3"] } diff --git a/src/util/background.rs b/src/util/background.rs deleted file mode 100644 index d35425f5..00000000 --- a/src/util/background.rs +++ /dev/null @@ -1,160 +0,0 @@ -//! Job runner for futures and async functions -use core::future::Future; -use std::pin::Pin; -use std::sync::Arc; -use std::time::Duration; - -use futures::future::*; -use futures::select; -use futures::stream::FuturesUnordered; -use futures::StreamExt; -use tokio::sync::{mpsc, mpsc::error::TryRecvError, watch, Mutex}; - -use crate::error::Error; - -type JobOutput = Result<(), Error>; -type Job = Pin + Send>>; - -/// Job runner for futures and async functions -pub struct BackgroundRunner { - stop_signal: watch::Receiver, - queue_in: mpsc::UnboundedSender<(Job, bool)>, - worker_in: mpsc::UnboundedSender>, -} - -impl BackgroundRunner { - /// Create a new BackgroundRunner - pub fn new( - n_runners: usize, - stop_signal: watch::Receiver, - ) -> (Arc, tokio::task::JoinHandle<()>) { - let (worker_in, mut worker_out) = mpsc::unbounded_channel(); - - let stop_signal_2 = stop_signal.clone(); - let await_all_done = tokio::spawn(async move { - let mut workers = FuturesUnordered::new(); - let mut shutdown_timer = 0; - loop { - let closed = match worker_out.try_recv() { - Ok(wkr) => { - workers.push(wkr); - false - } - Err(TryRecvError::Empty) => false, - Err(TryRecvError::Disconnected) => true, - }; - select! { - res = workers.next() => { - if let Some(Err(e)) = res { - error!("Worker exited with error: {}", e); - } - } - _ = tokio::time::sleep(Duration::from_secs(1)).fuse() => { - if closed || *stop_signal_2.borrow() { - shutdown_timer += 1; - if shutdown_timer >= 10 { - break; - } - } - } - } - } - }); - - let (queue_in, queue_out) = mpsc::unbounded_channel(); - let queue_out = Arc::new(Mutex::new(queue_out)); - - for i in 0..n_runners { - let queue_out = queue_out.clone(); - let stop_signal = stop_signal.clone(); - - worker_in - .send(tokio::spawn(async move { - loop { - let (job, cancellable) = { - select! { - item = wait_job(&queue_out).fuse() => match item { - // We received a task, process it - Some(x) => x, - // We received a signal that no more tasks will ever be sent - // because the sending side was dropped. Exit now. - None => break, - }, - _ = tokio::time::sleep(Duration::from_secs(5)).fuse() => { - if *stop_signal.borrow() { - // Nothing has been going on for 5 secs, and we are shutting - // down. Exit now. 
- break; - } else { - // Nothing is going on but we don't want to exit. - continue; - } - } - } - }; - if cancellable && *stop_signal.borrow() { - continue; - } - if let Err(e) = job.await { - error!("Job failed: {}", e) - } - } - info!("Background worker {} exiting", i); - })) - .unwrap(); - } - - let bgrunner = Arc::new(Self { - stop_signal, - queue_in, - worker_in, - }); - (bgrunner, await_all_done) - } - - /// Spawn a task to be run in background - pub fn spawn(&self, job: T) - where - T: Future + Send + 'static, - { - let boxed: Job = Box::pin(job); - self.queue_in - .send((boxed, false)) - .map_err(|_| "could not put job in queue") - .unwrap(); - } - - /// Spawn a task to be run in background. It may get discarded before running if spawned while - /// the runner is stopping - pub fn spawn_cancellable(&self, job: T) - where - T: Future + Send + 'static, - { - let boxed: Job = Box::pin(job); - self.queue_in - .send((boxed, true)) - .map_err(|_| "could not put job in queue") - .unwrap(); - } - - pub fn spawn_worker(&self, name: String, worker: F) - where - F: FnOnce(watch::Receiver) -> T + Send + 'static, - T: Future + Send + 'static, - { - let stop_signal = self.stop_signal.clone(); - let task = tokio::spawn(async move { - info!("Worker started: {}", name); - worker(stop_signal).await; - info!("Worker exited: {}", name); - }); - self.worker_in - .send(task) - .map_err(|_| "could not put job in queue") - .unwrap(); - } -} - -async fn wait_job(q: &Mutex>) -> Option<(Job, bool)> { - q.lock().await.recv().await -} diff --git a/src/util/background/job_worker.rs b/src/util/background/job_worker.rs new file mode 100644 index 00000000..2568ea11 --- /dev/null +++ b/src/util/background/job_worker.rs @@ -0,0 +1,48 @@ +//! Job worker: a generic worker that just processes incoming +//! jobs one by one + +use std::sync::Arc; + +use async_trait::async_trait; +use tokio::sync::{mpsc, Mutex}; + +use crate::background::worker::*; +use crate::background::*; + +pub(crate) struct JobWorker { + pub(crate) index: usize, + pub(crate) job_chan: Arc>>, + pub(crate) next_job: Option, +} + +#[async_trait] +impl Worker for JobWorker { + fn name(&self) -> String { + format!("Job worker #{}", self.index) + } + + async fn work(&mut self, _must_exit: &mut watch::Receiver) -> Result { + match self.next_job.take() { + None => return Ok(WorkerState::Idle), + Some(job) => { + job.await?; + Ok(WorkerState::Busy) + } + } + } + + async fn wait_for_work(&mut self, must_exit: &watch::Receiver) -> WorkerState { + loop { + match self.job_chan.lock().await.recv().await { + Some((job, cancellable)) => { + if cancellable && *must_exit.borrow() { + continue; + } + self.next_job = Some(job); + return WorkerState::Busy; + } + None => return WorkerState::Done, + } + } + } +} diff --git a/src/util/background/mod.rs b/src/util/background/mod.rs new file mode 100644 index 00000000..619f5068 --- /dev/null +++ b/src/util/background/mod.rs @@ -0,0 +1,117 @@ +//! 
Job runner for futures and async functions + +pub mod job_worker; +pub mod worker; + +use core::future::Future; + +use std::collections::HashMap; +use std::pin::Pin; +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; +use tokio::sync::{mpsc, watch, Mutex}; + +use crate::error::Error; +use worker::WorkerProcessor; +pub use worker::{Worker, WorkerState}; + +pub(crate) type JobOutput = Result<(), Error>; +pub(crate) type Job = Pin + Send>>; + +/// Job runner for futures and async functions +pub struct BackgroundRunner { + send_job: mpsc::UnboundedSender<(Job, bool)>, + send_worker: mpsc::UnboundedSender>, + worker_info: Arc>>, +} + +#[derive(Clone, Serialize, Deserialize, Debug)] +pub struct WorkerInfo { + pub name: String, + pub info: Option, + pub state: WorkerState, + pub errors: usize, + pub consecutive_errors: usize, + pub last_error: Option<(String, u64)>, +} + +impl BackgroundRunner { + /// Create a new BackgroundRunner + pub fn new( + n_runners: usize, + stop_signal: watch::Receiver, + ) -> (Arc, tokio::task::JoinHandle<()>) { + let (send_worker, worker_out) = mpsc::unbounded_channel::>(); + + let worker_info = Arc::new(std::sync::Mutex::new(HashMap::new())); + let mut worker_processor = + WorkerProcessor::new(worker_out, stop_signal, worker_info.clone()); + + let await_all_done = tokio::spawn(async move { + worker_processor.run().await; + }); + + let (send_job, queue_out) = mpsc::unbounded_channel(); + let queue_out = Arc::new(Mutex::new(queue_out)); + + for i in 0..n_runners { + let queue_out = queue_out.clone(); + + send_worker + .send(Box::new(job_worker::JobWorker { + index: i, + job_chan: queue_out.clone(), + next_job: None, + })) + .ok() + .unwrap(); + } + + let bgrunner = Arc::new(Self { + send_job, + send_worker, + worker_info, + }); + (bgrunner, await_all_done) + } + + pub fn get_worker_info(&self) -> HashMap { + self.worker_info.lock().unwrap().clone() + } + + /// Spawn a task to be run in background + pub fn spawn(&self, job: T) + where + T: Future + Send + 'static, + { + let boxed: Job = Box::pin(job); + self.send_job + .send((boxed, false)) + .ok() + .expect("Could not put job in queue"); + } + + /// Spawn a task to be run in background. 
It may get discarded before running if spawned while + /// the runner is stopping + pub fn spawn_cancellable(&self, job: T) + where + T: Future + Send + 'static, + { + let boxed: Job = Box::pin(job); + self.send_job + .send((boxed, true)) + .ok() + .expect("Could not put job in queue"); + } + + pub fn spawn_worker(&self, worker: W) + where + W: Worker + 'static, + { + self.send_worker + .send(Box::new(worker)) + .ok() + .expect("Could not put worker in queue"); + } +} diff --git a/src/util/background/worker.rs b/src/util/background/worker.rs new file mode 100644 index 00000000..7f573a07 --- /dev/null +++ b/src/util/background/worker.rs @@ -0,0 +1,261 @@ +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use async_trait::async_trait; +use futures::future::*; +use futures::stream::FuturesUnordered; +use futures::StreamExt; +use serde::{Deserialize, Serialize}; +use tokio::select; +use tokio::sync::{mpsc, watch}; +use tracing::*; + +use crate::background::WorkerInfo; +use crate::error::Error; +use crate::time::now_msec; + +#[derive(PartialEq, Copy, Clone, Serialize, Deserialize, Debug)] +pub enum WorkerState { + Busy, + Throttled(f32), + Idle, + Done, +} + +impl std::fmt::Display for WorkerState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + WorkerState::Busy => write!(f, "Busy"), + WorkerState::Throttled(t) => write!(f, "Thr:{:.3}", t), + WorkerState::Idle => write!(f, "Idle"), + WorkerState::Done => write!(f, "Done"), + } + } +} + +#[async_trait] +pub trait Worker: Send { + fn name(&self) -> String; + + fn info(&self) -> Option { + None + } + + /// Work: do a basic unit of work, if one is available (otherwise, should return + /// WorkerState::Idle immediately). We will do our best to not interrupt this future in the + /// middle of processing, it will only be interrupted at the last minute when Garage is trying + /// to exit and this hasn't returned yet. This function may return an error to indicate that + /// its unit of work could not be processed due to an error: the error will be logged and + /// .work() will be called again after a short delay. + async fn work(&mut self, must_exit: &mut watch::Receiver) -> Result; + + /// Wait for work: await for some task to become available. This future can be interrupted in + /// the middle for any reason. This future doesn't have to await on must_exit.changed(), we + /// are doing it for you. Therefore it only receives a read refernce to must_exit which allows + /// it to check if we are exiting. + async fn wait_for_work(&mut self, must_exit: &watch::Receiver) -> WorkerState; +} + +pub(crate) struct WorkerProcessor { + stop_signal: watch::Receiver, + worker_chan: mpsc::UnboundedReceiver>, + worker_info: Arc>>, +} + +impl WorkerProcessor { + pub(crate) fn new( + worker_chan: mpsc::UnboundedReceiver>, + stop_signal: watch::Receiver, + worker_info: Arc>>, + ) -> Self { + Self { + stop_signal, + worker_chan, + worker_info, + } + } + + pub(crate) async fn run(&mut self) { + let mut workers = FuturesUnordered::new(); + let mut next_task_id = 1; + + while !*self.stop_signal.borrow() { + let await_next_worker = async { + if workers.is_empty() { + futures::future::pending().await + } else { + workers.next().await + } + }; + select! 
{ + new_worker_opt = self.worker_chan.recv() => { + if let Some(new_worker) = new_worker_opt { + let task_id = next_task_id; + next_task_id += 1; + let stop_signal = self.stop_signal.clone(); + let stop_signal_worker = self.stop_signal.clone(); + let mut worker = WorkerHandler { + task_id, + stop_signal, + stop_signal_worker, + worker: new_worker, + state: WorkerState::Busy, + errors: 0, + consecutive_errors: 0, + last_error: None, + }; + workers.push(async move { + worker.step().await; + worker + }.boxed()); + } + } + worker = await_next_worker => { + if let Some(mut worker) = worker { + trace!("{} (TID {}): {:?}", worker.worker.name(), worker.task_id, worker.state); + + // Save worker info + let mut wi = self.worker_info.lock().unwrap(); + match wi.get_mut(&worker.task_id) { + Some(i) => { + i.state = worker.state; + i.info = worker.worker.info(); + i.errors = worker.errors; + i.consecutive_errors = worker.consecutive_errors; + if worker.last_error.is_some() { + i.last_error = worker.last_error.take(); + } + } + None => { + wi.insert(worker.task_id, WorkerInfo { + name: worker.worker.name(), + state: worker.state, + info: worker.worker.info(), + errors: worker.errors, + consecutive_errors: worker.consecutive_errors, + last_error: worker.last_error.take(), + }); + } + } + + if worker.state == WorkerState::Done { + info!("Worker {} (TID {}) exited", worker.worker.name(), worker.task_id); + } else { + workers.push(async move { + worker.step().await; + worker + }.boxed()); + } + } + } + _ = self.stop_signal.changed() => (), + } + } + + // We are exiting, drain everything + let drain_half_time = Instant::now() + Duration::from_secs(5); + let drain_everything = async move { + while let Some(mut worker) = workers.next().await { + if worker.state == WorkerState::Done { + info!( + "Worker {} (TID {}) exited", + worker.worker.name(), + worker.task_id + ); + } else if Instant::now() > drain_half_time { + warn!("Worker {} (TID {}) interrupted between two iterations in state {:?} (this should be fine)", worker.worker.name(), worker.task_id, worker.state); + } else { + workers.push( + async move { + worker.step().await; + worker + } + .boxed(), + ); + } + } + }; + + select! { + _ = drain_everything => { + info!("All workers exited peacefully \\o/"); + } + _ = tokio::time::sleep(Duration::from_secs(9)) => { + error!("Some workers could not exit in time, we are cancelling some things in the middle"); + } + } + } +} + +struct WorkerHandler { + task_id: usize, + stop_signal: watch::Receiver, + stop_signal_worker: watch::Receiver, + worker: Box, + state: WorkerState, + errors: usize, + consecutive_errors: usize, + last_error: Option<(String, u64)>, +} + +impl WorkerHandler { + async fn step(&mut self) { + match self.state { + WorkerState::Busy => match self.worker.work(&mut self.stop_signal).await { + Ok(s) => { + self.state = s; + self.consecutive_errors = 0; + } + Err(e) => { + error!( + "Error in worker {} (TID {}): {}", + self.worker.name(), + self.task_id, + e + ); + self.errors += 1; + self.consecutive_errors += 1; + self.last_error = Some((format!("{}", e), now_msec())); + // Sleep a bit so that error won't repeat immediately, exponential backoff + // strategy (min 1sec, max ~60sec) + self.state = WorkerState::Throttled( + (1.5f32).powf(std::cmp::min(10, self.consecutive_errors - 1) as f32), + ); + } + }, + WorkerState::Throttled(delay) => { + // Sleep for given delay and go back to busy state + if !*self.stop_signal.borrow() { + select! 
{ + _ = tokio::time::sleep(Duration::from_secs_f32(delay)) => (), + _ = self.stop_signal.changed() => (), + } + } + self.state = WorkerState::Busy; + } + WorkerState::Idle => { + if *self.stop_signal.borrow() { + select! { + new_st = self.worker.wait_for_work(&self.stop_signal_worker) => { + self.state = new_st; + } + _ = tokio::time::sleep(Duration::from_secs(1)) => { + // stay in Idle state + } + } + } else { + select! { + new_st = self.worker.wait_for_work(&self.stop_signal_worker) => { + self.state = new_st; + } + _ = self.stop_signal.changed() => { + // stay in Idle state + } + } + } + } + WorkerState::Done => unreachable!(), + } + } +} diff --git a/src/util/lib.rs b/src/util/lib.rs index 8ca6e310..fce151af 100644 --- a/src/util/lib.rs +++ b/src/util/lib.rs @@ -11,7 +11,6 @@ pub mod error; pub mod formater; pub mod metrics; pub mod persister; -//pub mod sled_counter; pub mod time; pub mod token_bucket; pub mod tranquilizer; diff --git a/src/util/tranquilizer.rs b/src/util/tranquilizer.rs index 28711387..fdb2918b 100644 --- a/src/util/tranquilizer.rs +++ b/src/util/tranquilizer.rs @@ -3,6 +3,8 @@ use std::time::{Duration, Instant}; use tokio::time::sleep; +use crate::background::WorkerState; + /// A tranquilizer is a helper object that is used to make /// background operations not take up too much time. /// @@ -33,7 +35,7 @@ impl Tranquilizer { } } - pub async fn tranquilize(&mut self, tranquility: u32) { + fn tranquilize_internal(&mut self, tranquility: u32) -> Option { let observation = Instant::now() - self.last_step_begin; self.observations.push_back(observation); @@ -45,13 +47,32 @@ impl Tranquilizer { if !self.observations.is_empty() { let delay = (tranquility * self.sum_observations) / (self.observations.len() as u32); + Some(delay) + } else { + None + } + } + + pub async fn tranquilize(&mut self, tranquility: u32) { + if let Some(delay) = self.tranquilize_internal(tranquility) { sleep(delay).await; + self.reset(); } + } - self.reset(); + #[must_use] + pub fn tranquilize_worker(&mut self, tranquility: u32) -> WorkerState { + match self.tranquilize_internal(tranquility) { + Some(delay) => WorkerState::Throttled(delay.as_secs_f32()), + None => WorkerState::Busy, + } } pub fn reset(&mut self) { self.last_step_begin = Instant::now(); } + + pub fn clear(&mut self) { + self.observations.clear(); + } } -- cgit v1.2.3 From ac03fa7937d9da29d2358343a499fe9d15ac5f7c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 15 Jul 2022 18:31:19 +0200 Subject: Uniformize tracing::* imports (hopefully fixes 32-bit build) --- src/db/lib.rs | 3 +++ src/db/lmdb_adapter.rs | 1 - src/db/sqlite_adapter.rs | 2 -- src/util/background/worker.rs | 1 - 4 files changed, 3 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/db/lib.rs b/src/db/lib.rs index 8188c715..f185114e 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -1,3 +1,6 @@ +#[macro_use] +extern crate tracing; + pub mod lmdb_adapter; pub mod sled_adapter; pub mod sqlite_adapter; diff --git a/src/db/lmdb_adapter.rs b/src/db/lmdb_adapter.rs index fdb254c6..c036c990 100644 --- a/src/db/lmdb_adapter.rs +++ b/src/db/lmdb_adapter.rs @@ -345,7 +345,6 @@ pub fn recommended_map_size() -> usize { #[cfg(target_pointer_width = "32")] pub fn recommended_map_size() -> usize { - use log::warn; warn!("LMDB is not recommended on 32-bit systems, database size will be limited"); 1usize << 30 } diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs index 97a78b07..886fda6e 100644 --- a/src/db/sqlite_adapter.rs +++ b/src/db/sqlite_adapter.rs 
@@ -6,8 +6,6 @@ use std::pin::Pin; use std::ptr::NonNull; use std::sync::{Arc, Mutex, MutexGuard}; -use tracing::trace; - use rusqlite::{params, Connection, Rows, Statement, Transaction}; use crate::{ diff --git a/src/util/background/worker.rs b/src/util/background/worker.rs index 7f573a07..f5e3addb 100644 --- a/src/util/background/worker.rs +++ b/src/util/background/worker.rs @@ -9,7 +9,6 @@ use futures::StreamExt; use serde::{Deserialize, Serialize}; use tokio::select; use tokio::sync::{mpsc, watch}; -use tracing::*; use crate::background::WorkerInfo; use crate::error::Error; -- cgit v1.2.3 From 1b2e1296eb99630e969e585ede0424072adc2d0c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 18 Jul 2022 17:18:47 +0200 Subject: Compute hashes on dedicated threads --- src/api/Cargo.toml | 8 +++---- src/api/s3/copy.rs | 5 ++++- src/api/s3/put.rs | 32 +++++++++++++++++----------- src/api/signature/mod.rs | 14 ++++++------ src/block/block.rs | 17 +++++++++------ src/block/manager.rs | 6 ++++-- src/garage/Cargo.toml | 2 +- src/util/Cargo.toml | 4 +++- src/util/async_hash.rs | 55 ++++++++++++++++++++++++++++++++++++++++++++++++ src/util/lib.rs | 1 + 10 files changed, 110 insertions(+), 34 deletions(-) create mode 100644 src/util/async_hash.rs (limited to 'src') diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index db77cf38..901cb959 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -24,15 +24,15 @@ async-trait = "0.1.7" base64 = "0.13" bytes = "1.0" chrono = "0.4" -crypto-mac = "0.10" +crypto-common = "0.1" err-derive = "0.3" hex = "0.4" -hmac = "0.10" +hmac = "0.12" idna = "0.2" tracing = "0.1.30" -md-5 = "0.9" +md-5 = "0.10" nom = "7.1" -sha2 = "0.9" +sha2 = "0.10" futures = "0.3" futures-util = "0.3" diff --git a/src/api/s3/copy.rs b/src/api/s3/copy.rs index 0fc16993..4415a037 100644 --- a/src/api/s3/copy.rs +++ b/src/api/s3/copy.rs @@ -365,7 +365,10 @@ pub async fn handle_upload_part_copy( // we need to insert that data as a new block. 
async move { if must_upload { - garage2.block_manager.rpc_put_block(final_hash, data).await + garage2 + .block_manager + .rpc_put_block(final_hash, data.into()) + .await } else { Ok(()) } diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index 9ef37421..fbfa6f0d 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -9,6 +9,7 @@ use md5::{digest::generic_array::*, Digest as Md5Digest, Md5}; use sha2::Sha256; use garage_table::*; +use garage_util::async_hash::*; use garage_util::data::*; use garage_util::error::Error as GarageError; use garage_util::time::*; @@ -130,7 +131,8 @@ pub(crate) async fn save_stream> + Unpin>( garage.version_table.insert(&version).await?; // Transfer data and verify checksum - let first_block_hash = blake2sum(&first_block[..]); + let first_block = Bytes::from(first_block); + let first_block_hash = async_blake2sum(first_block.clone()).await; let tx_result = (|| async { let (total_size, data_md5sum, data_sha256sum) = read_and_put_blocks( @@ -273,14 +275,16 @@ async fn read_and_put_blocks> + Unpin>( garage: &Garage, version: &Version, part_number: u64, - first_block: Vec, + first_block: Bytes, first_block_hash: Hash, chunker: &mut StreamChunker, ) -> Result<(u64, GenericArray, Hash), Error> { - let mut md5hasher = Md5::new(); - let mut sha256hasher = Sha256::new(); - md5hasher.update(&first_block[..]); - sha256hasher.update(&first_block[..]); + let first_block = Bytes::from(first_block); + + let md5hasher = AsyncHasher::::new(); + let sha256hasher = AsyncHasher::::new(); + md5hasher.update(first_block.clone()); + sha256hasher.update(first_block.clone()); let mut next_offset = first_block.len(); let mut put_curr_version_block = put_block_meta( @@ -302,9 +306,10 @@ async fn read_and_put_blocks> + Unpin>( chunker.next(), )?; if let Some(block) = next_block { - md5hasher.update(&block[..]); - sha256hasher.update(&block[..]); - let block_hash = blake2sum(&block[..]); + let block = Bytes::from(block); + md5hasher.update(block.clone()); + sha256hasher.update(block.clone()); + let block_hash = async_blake2sum(block.clone()).await; let block_len = block.len(); put_curr_version_block = put_block_meta( garage, @@ -322,9 +327,9 @@ async fn read_and_put_blocks> + Unpin>( } let total_size = next_offset as u64; - let data_md5sum = md5hasher.finalize(); + let data_md5sum = md5hasher.finalize().await; - let data_sha256sum = sha256hasher.finalize(); + let data_sha256sum = sha256hasher.finalize().await; let data_sha256sum = Hash::try_from(&data_sha256sum[..]).unwrap(); Ok((total_size, data_md5sum, data_sha256sum)) @@ -504,7 +509,10 @@ pub async fn handle_put_part( // Copy block to store let version = Version::new(version_uuid, bucket_id, key, false); - let first_block_hash = blake2sum(&first_block[..]); + + let first_block = Bytes::from(first_block); + let first_block_hash = async_blake2sum(first_block.clone()).await; + let (_, data_md5sum, data_sha256sum) = read_and_put_blocks( &garage, &version, diff --git a/src/api/signature/mod.rs b/src/api/signature/mod.rs index dd5b590c..4b8b990f 100644 --- a/src/api/signature/mod.rs +++ b/src/api/signature/mod.rs @@ -1,5 +1,5 @@ use chrono::{DateTime, Utc}; -use hmac::{Hmac, Mac, NewMac}; +use hmac::{Hmac, Mac}; use sha2::Sha256; use garage_util::data::{sha256sum, Hash}; @@ -29,17 +29,17 @@ pub fn signing_hmac( secret_key: &str, region: &str, service: &str, -) -> Result { +) -> Result { let secret = String::from("AWS4") + secret_key; - let mut date_hmac = HmacSha256::new_varkey(secret.as_bytes())?; + let mut date_hmac = 
HmacSha256::new_from_slice(secret.as_bytes())?; date_hmac.update(datetime.format(SHORT_DATE).to_string().as_bytes()); - let mut region_hmac = HmacSha256::new_varkey(&date_hmac.finalize().into_bytes())?; + let mut region_hmac = HmacSha256::new_from_slice(&date_hmac.finalize().into_bytes())?; region_hmac.update(region.as_bytes()); - let mut service_hmac = HmacSha256::new_varkey(®ion_hmac.finalize().into_bytes())?; + let mut service_hmac = HmacSha256::new_from_slice(®ion_hmac.finalize().into_bytes())?; service_hmac.update(service.as_bytes()); - let mut signing_hmac = HmacSha256::new_varkey(&service_hmac.finalize().into_bytes())?; + let mut signing_hmac = HmacSha256::new_from_slice(&service_hmac.finalize().into_bytes())?; signing_hmac.update(b"aws4_request"); - let hmac = HmacSha256::new_varkey(&signing_hmac.finalize().into_bytes())?; + let hmac = HmacSha256::new_from_slice(&signing_hmac.finalize().into_bytes())?; Ok(hmac) } diff --git a/src/block/block.rs b/src/block/block.rs index 4d3fbcb8..f17bd2c0 100644 --- a/src/block/block.rs +++ b/src/block/block.rs @@ -1,3 +1,4 @@ +use bytes::Bytes; use serde::{Deserialize, Serialize}; use zstd::stream::{decode_all as zstd_decode, Encoder}; @@ -61,13 +62,17 @@ impl DataBlock { } } - pub fn from_buffer(data: Vec, level: Option) -> DataBlock { - if let Some(level) = level { - if let Ok(data) = zstd_encode(&data[..], level) { - return DataBlock::Compressed(data); + pub async fn from_buffer(data: Bytes, level: Option) -> DataBlock { + tokio::task::spawn_blocking(move || { + if let Some(level) = level { + if let Ok(data) = zstd_encode(&data[..], level) { + return DataBlock::Compressed(data); + } } - } - DataBlock::Plain(data) + DataBlock::Plain(data.to_vec()) // TODO: remove to_vec here + }) + .await + .unwrap() } } diff --git a/src/block/manager.rs b/src/block/manager.rs index 017ba9da..890c247d 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -5,6 +5,7 @@ use std::time::Duration; use arc_swap::ArcSwapOption; use async_trait::async_trait; +use bytes::Bytes; use serde::{Deserialize, Serialize}; use futures::future::*; @@ -211,14 +212,15 @@ impl BlockManager { } /// Send block to nodes that should have it - pub async fn rpc_put_block(&self, hash: Hash, data: Vec) -> Result<(), Error> { + pub async fn rpc_put_block(&self, hash: Hash, data: Bytes) -> Result<(), Error> { let who = self.replication.write_nodes(&hash); - let data = DataBlock::from_buffer(data, self.compression_level); + let data = DataBlock::from_buffer(data, self.compression_level).await; self.system .rpc .try_call_many( &self.endpoint, &who[..], + // TODO: remove to_vec() here BlockRpc::PutBlock { hash, data }, RequestStrategy::with_priority(PRIO_NORMAL) .with_quorum(self.replication.write_quorum()) diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 8948e750..80802a16 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -65,7 +65,7 @@ chrono = "0.4" http = "0.2" hmac = "0.10" hyper = { version = "0.14", features = ["client", "http1", "runtime"] } -sha2 = "0.9" +sha2 = "0.10" static_init = "1.0" assert-json-diff = "2.0" diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index 57c70ffb..7d79f21a 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -18,12 +18,14 @@ garage_db = { version = "0.8.0", path = "../db" } async-trait = "0.1" blake2 = "0.9" +bytes = "1.0" +digest = "0.10" err-derive = "0.3" xxhash-rust = { version = "0.8", default-features = false, features = ["xxh3"] } hex = "0.4" tracing = "0.1.30" rand = "0.8" -sha2 = "0.9" +sha2 = 
"0.10" chrono = "0.4" rmp-serde = "0.15" diff --git a/src/util/async_hash.rs b/src/util/async_hash.rs new file mode 100644 index 00000000..67776eb9 --- /dev/null +++ b/src/util/async_hash.rs @@ -0,0 +1,55 @@ +use bytes::Bytes; +use digest::Digest; + +use tokio::sync::mpsc; +use tokio::task::JoinHandle; + +use crate::data::*; + +/// Compute the sha256 of a slice, +/// spawning on a tokio thread for CPU-intensive processing +/// The argument has to be an owned Bytes, as it is moved out to a new thread. +pub async fn async_sha256sum(data: Bytes) -> Hash { + tokio::task::spawn_blocking(move || sha256sum(&data)) + .await + .unwrap() +} + +/// Compute the blake2sum of a slice, +/// spawning on a tokio thread for CPU-intensive processing. +/// The argument has to be an owned Bytes, as it is moved out to a new thread. +pub async fn async_blake2sum(data: Bytes) -> Hash { + tokio::task::spawn_blocking(move || blake2sum(&data)) + .await + .unwrap() +} + +// ---- + +pub struct AsyncHasher { + sendblk: mpsc::UnboundedSender, + task: JoinHandle>, +} + +impl AsyncHasher { + pub fn new() -> Self { + let (sendblk, mut recvblk) = mpsc::unbounded_channel::(); + let task = tokio::task::spawn_blocking(move || { + let mut digest = D::new(); + while let Some(blk) = recvblk.blocking_recv() { + digest.update(&blk[..]); + } + digest.finalize() + }); + Self { sendblk, task } + } + + pub fn update(&self, b: Bytes) { + self.sendblk.send(b).unwrap() + } + + pub async fn finalize(self) -> digest::Output { + drop(self.sendblk); + self.task.await.unwrap() + } +} diff --git a/src/util/lib.rs b/src/util/lib.rs index fce151af..7152f92a 100644 --- a/src/util/lib.rs +++ b/src/util/lib.rs @@ -3,6 +3,7 @@ #[macro_use] extern crate tracing; +pub mod async_hash; pub mod background; pub mod config; pub mod crdt; -- cgit v1.2.3 From 2f111e6b3d772b10c8ed6279ce0c82d22852afd1 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 18 Jul 2022 18:40:57 +0200 Subject: Performance improvements: - reduce contention on mutation_lock by having 256 of them - better lmdb defaults --- src/api/s3/put.rs | 30 +++++++++++++++++++++++++----- src/block/manager.rs | 27 +++++++++++++++------------ src/model/garage.rs | 15 ++++++++++----- src/util/async_hash.rs | 15 +++++++++------ 4 files changed, 59 insertions(+), 28 deletions(-) (limited to 'src') diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index fbfa6f0d..a182f04d 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -8,6 +8,11 @@ use hyper::{Request, Response}; use md5::{digest::generic_array::*, Digest as Md5Digest, Md5}; use sha2::Sha256; +use opentelemetry::{ + trace::{FutureExt as OtelFutureExt, TraceContextExt, Tracer}, + Context, +}; + use garage_table::*; use garage_util::async_hash::*; use garage_util::data::*; @@ -279,12 +284,21 @@ async fn read_and_put_blocks> + Unpin>( first_block_hash: Hash, chunker: &mut StreamChunker, ) -> Result<(u64, GenericArray, Hash), Error> { + let tracer = opentelemetry::global::tracer("garage"); + let first_block = Bytes::from(first_block); let md5hasher = AsyncHasher::::new(); let sha256hasher = AsyncHasher::::new(); - md5hasher.update(first_block.clone()); - sha256hasher.update(first_block.clone()); + + futures::future::join( + md5hasher.update(first_block.clone()), + sha256hasher.update(first_block.clone()), + ) + .with_context(Context::current_with_span( + tracer.start("Hash first block (md5, sha256)"), + )) + .await; let mut next_offset = first_block.len(); let mut put_curr_version_block = put_block_meta( @@ -307,9 +321,15 @@ async fn 
read_and_put_blocks> + Unpin>( )?; if let Some(block) = next_block { let block = Bytes::from(block); - md5hasher.update(block.clone()); - sha256hasher.update(block.clone()); - let block_hash = async_blake2sum(block.clone()).await; + let (_, _, block_hash) = futures::future::join3( + md5hasher.update(block.clone()), + sha256hasher.update(block.clone()), + async_blake2sum(block.clone()), + ) + .with_context(Context::current_with_span( + tracer.start("Hash block (md5, sha256, blake2)"), + )) + .await; let block_len = block.len(); put_curr_version_block = put_block_meta( garage, diff --git a/src/block/manager.rs b/src/block/manager.rs index 890c247d..be53ec6e 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -93,7 +93,7 @@ pub struct BlockManager { compression_level: Option, background_tranquility: u32, - mutation_lock: Mutex, + mutation_lock: [Mutex; 256], pub(crate) rc: BlockRc, @@ -150,8 +150,6 @@ impl BlockManager { .netapp .endpoint("garage_block/manager.rs/Rpc".to_string()); - let manager_locked = BlockManagerLocked(); - let metrics = BlockManagerMetrics::new(resync_queue.clone(), resync_errors.clone()); let block_manager = Arc::new(Self { @@ -159,7 +157,7 @@ impl BlockManager { data_dir, compression_level, background_tranquility, - mutation_lock: Mutex::new(manager_locked), + mutation_lock: [(); 256].map(|_| Mutex::new(BlockManagerLocked())), rc, resync_queue, resync_notify: Notify::new(), @@ -313,14 +311,21 @@ impl BlockManager { /// Write a block to disk async fn write_block(&self, hash: &Hash, data: &DataBlock) -> Result { + let tracer = opentelemetry::global::tracer("garage"); + let write_size = data.inner_buffer().len() as u64; - let res = self - .mutation_lock + let res = self.mutation_lock[hash.as_slice()[0] as usize] .lock() + .with_context(Context::current_with_span( + tracer.start("Acquire mutation_lock"), + )) .await .write_block(hash, data, self) .bound_record_duration(&self.metrics.block_write_duration) + .with_context(Context::current_with_span( + tracer.start("BlockManagerLocked::write_block"), + )) .await?; self.metrics.bytes_written.add(write_size); @@ -370,7 +375,7 @@ impl BlockManager { if data.verify(*hash).is_err() { self.metrics.corruption_counter.add(1); - self.mutation_lock + self.mutation_lock[hash.as_slice()[0] as usize] .lock() .await .move_block_to_corrupted(hash, self) @@ -384,8 +389,7 @@ impl BlockManager { /// Check if this node should have a block, but don't actually have it async fn need_block(&self, hash: &Hash) -> Result { - let BlockStatus { exists, needed } = self - .mutation_lock + let BlockStatus { exists, needed } = self.mutation_lock[hash.as_slice()[0] as usize] .lock() .await .check_block_status(hash, self) @@ -608,8 +612,7 @@ impl BlockManager { } async fn resync_block(&self, hash: &Hash) -> Result<(), Error> { - let BlockStatus { exists, needed } = self - .mutation_lock + let BlockStatus { exists, needed } = self.mutation_lock[hash.as_slice()[0] as usize] .lock() .await .check_block_status(hash, self) @@ -694,7 +697,7 @@ impl BlockManager { who.len() ); - self.mutation_lock + self.mutation_lock[hash.as_slice()[0] as usize] .lock() .await .delete_if_unneeded(hash, self) diff --git a/src/model/garage.rs b/src/model/garage.rs index 15769a17..0d239df6 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -104,11 +104,16 @@ impl Garage { std::fs::create_dir_all(&db_path).expect("Unable to create LMDB data directory"); let map_size = garage_db::lmdb_adapter::recommended_map_size(); - let db = 
db::lmdb_adapter::heed::EnvOpenOptions::new() - .max_dbs(100) - .map_size(map_size) - .open(&db_path) - .expect("Unable to open LMDB DB"); + use db::lmdb_adapter::heed; + let mut env_builder = heed::EnvOpenOptions::new(); + env_builder.max_dbs(100); + env_builder.max_readers(500); + env_builder.map_size(map_size); + unsafe { + env_builder.flag(heed::flags::Flags::MdbNoSync); + env_builder.flag(heed::flags::Flags::MdbNoMetaSync); + } + let db = env_builder.open(&db_path).expect("Unable to open LMDB DB"); db::lmdb_adapter::LmdbDb::init(db) } e => { diff --git a/src/util/async_hash.rs b/src/util/async_hash.rs index 67776eb9..be0535de 100644 --- a/src/util/async_hash.rs +++ b/src/util/async_hash.rs @@ -1,7 +1,7 @@ use bytes::Bytes; use digest::Digest; -use tokio::sync::mpsc; +use tokio::sync::{mpsc, oneshot}; use tokio::task::JoinHandle; use crate::data::*; @@ -27,25 +27,28 @@ pub async fn async_blake2sum(data: Bytes) -> Hash { // ---- pub struct AsyncHasher { - sendblk: mpsc::UnboundedSender, + sendblk: mpsc::UnboundedSender<(Bytes, oneshot::Sender<()>)>, task: JoinHandle>, } impl AsyncHasher { pub fn new() -> Self { - let (sendblk, mut recvblk) = mpsc::unbounded_channel::(); + let (sendblk, mut recvblk) = mpsc::unbounded_channel::<(Bytes, oneshot::Sender<()>)>(); let task = tokio::task::spawn_blocking(move || { let mut digest = D::new(); - while let Some(blk) = recvblk.blocking_recv() { + while let Some((blk, ch)) = recvblk.blocking_recv() { digest.update(&blk[..]); + let _ = ch.send(()); } digest.finalize() }); Self { sendblk, task } } - pub fn update(&self, b: Bytes) { - self.sendblk.send(b).unwrap() + pub async fn update(&self, b: Bytes) { + let (tx, rx) = oneshot::channel(); + self.sendblk.send((b, tx)).unwrap(); + let _ = rx.await; } pub async fn finalize(self) -> digest::Output { -- cgit v1.2.3 From 0176da3ad2aae9d18cb04feb452e0243cfb940fc Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 22 Jul 2022 18:37:20 +0200 Subject: Make clippy happy --- src/util/async_hash.rs | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'src') diff --git a/src/util/async_hash.rs b/src/util/async_hash.rs index be0535de..fa8ee7ff 100644 --- a/src/util/async_hash.rs +++ b/src/util/async_hash.rs @@ -56,3 +56,9 @@ impl AsyncHasher { self.task.await.unwrap() } } + +impl Default for AsyncHasher { + fn default() -> Self { + Self::new() + } +} -- cgit v1.2.3 From 2cad656a0332b19481ce779f5026b07c6ed8198f Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 22 Jul 2022 18:40:06 +0200 Subject: More make clippy happy --- src/api/s3/put.rs | 2 -- 1 file changed, 2 deletions(-) (limited to 'src') diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index a182f04d..2c51909f 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -286,8 +286,6 @@ async fn read_and_put_blocks> + Unpin>( ) -> Result<(u64, GenericArray, Hash), Error> { let tracer = opentelemetry::global::tracer("garage"); - let first_block = Bytes::from(first_block); - let md5hasher = AsyncHasher::::new(); let sha256hasher = AsyncHasher::::new(); -- cgit v1.2.3 From 381eb9a5a1dc530ce864ac2b3dc8eb0d454f1bc9 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 22 Jul 2022 18:55:52 +0200 Subject: Fix tests --- src/garage/Cargo.toml | 2 +- src/garage/tests/lib.rs | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 80802a16..2cb8ec46 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -63,7 +63,7 @@ prometheus = "0.13" aws-sdk-s3 = "0.8" chrono = 
"0.4" http = "0.2" -hmac = "0.10" +hmac = "0.12" hyper = { version = "0.14", features = ["client", "http1", "runtime"] } sha2 = "0.10" diff --git a/src/garage/tests/lib.rs b/src/garage/tests/lib.rs index 0106ad10..99aa1d58 100644 --- a/src/garage/tests/lib.rs +++ b/src/garage/tests/lib.rs @@ -3,5 +3,8 @@ mod common; mod admin; mod bucket; + +#[cfg(feature="k2v")] mod k2v; + mod s3; -- cgit v1.2.3 From ff4771c36ac06761a50364d53dc65f65ca6750f9 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 22 Jul 2022 18:56:53 +0200 Subject: cargo fmt --- src/garage/tests/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/garage/tests/lib.rs b/src/garage/tests/lib.rs index 99aa1d58..24d794c3 100644 --- a/src/garage/tests/lib.rs +++ b/src/garage/tests/lib.rs @@ -4,7 +4,7 @@ mod common; mod admin; mod bucket; -#[cfg(feature="k2v")] +#[cfg(feature = "k2v")] mod k2v; mod s3; -- cgit v1.2.3 From ad35b18bb146fcbf5e817c10837c6e835b1af5b7 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 25 Jul 2022 11:59:55 +0200 Subject: Faster chunker --- src/api/s3/put.rs | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) (limited to 'src') diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index 2c51909f..e6698bfa 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -387,7 +387,8 @@ struct StreamChunker>> { stream: S, read_all: bool, block_size: usize, - buf: VecDeque, + buf: VecDeque, + buf_len: usize, } impl> + Unpin> StreamChunker { @@ -396,29 +397,50 @@ impl> + Unpin> StreamChunker { stream, read_all: false, block_size, - buf: VecDeque::with_capacity(2 * block_size), + buf: VecDeque::with_capacity(8), + buf_len: 0, } } async fn next(&mut self) -> Result>, Error> { - while !self.read_all && self.buf.len() < self.block_size { + while !self.read_all && self.buf_len < self.block_size { if let Some(block) = self.stream.next().await { let bytes = block?; trace!("Body next: {} bytes", bytes.len()); - self.buf.extend(bytes); + self.buf_len += bytes.len(); + self.buf.push_back(bytes); } else { self.read_all = true; } } - if self.buf.is_empty() { + if self.buf_len == 0 { Ok(None) - } else if self.buf.len() <= self.block_size { - let block = self.buf.drain(..).collect::>(); - Ok(Some(block)) } else { - let block = self.buf.drain(..self.block_size).collect::>(); - Ok(Some(block)) + let mut slices = Vec::with_capacity(self.buf.len()); + let mut taken = 0; + while self.buf_len > 0 && taken < self.block_size { + let front = self.buf.pop_front().unwrap(); + if taken + front.len() <= self.block_size { + taken += front.len(); + self.buf_len -= front.len(); + slices.push(front); + } else { + let front_take = self.block_size - taken; + slices.push(front.slice(..front_take)); + self.buf.push_front(front.slice(front_take..)); + self.buf_len -= front_take; + break; + } + } + Ok(Some( + slices + .iter() + .map(|x| &x[..]) + .collect::>() + .concat() + .into(), + )) } } } -- cgit v1.2.3 From 16f6a1a65d4b973ea13cd00bbfdd7e225041e447 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 25 Jul 2022 12:06:06 +0200 Subject: fix clippy --- src/api/s3/put.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'src') diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index e6698bfa..dc0530df 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -434,12 +434,7 @@ impl> + Unpin> StreamChunker { } } Ok(Some( - slices - .iter() - .map(|x| &x[..]) - .collect::>() - .concat() - .into(), + slices.iter().map(|x| 
&x[..]).collect::>().concat(), )) } } -- cgit v1.2.3 From 8e7e680afe39f48fe15f365c9ef3fee57596e119 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 22 Jul 2022 15:20:00 +0200 Subject: First adaptation to WIP netapp with streaming body --- src/block/manager.rs | 19 ++++++------- src/garage/Cargo.toml | 5 ++-- src/garage/admin.rs | 4 +-- src/garage/cli/cmd.rs | 6 ++-- src/garage/cli/layout.rs | 6 ++-- src/model/Cargo.toml | 5 ++-- src/rpc/Cargo.toml | 5 ++-- src/rpc/rpc_helper.rs | 71 ++++++++++++++++++++---------------------------- src/rpc/system.rs | 7 +++-- src/table/schema.rs | 2 +- src/util/Cargo.toml | 5 ++-- 11 files changed, 60 insertions(+), 75 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index be53ec6e..408de148 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -8,7 +8,6 @@ use async_trait::async_trait; use bytes::Bytes; use serde::{Deserialize, Serialize}; -use futures::future::*; use tokio::fs; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::select; @@ -637,24 +636,24 @@ impl BlockManager { } who.retain(|id| *id != self.system.id); - let msg = Arc::new(BlockRpc::NeedBlockQuery(*hash)); - let who_needs_fut = who.iter().map(|to| { - self.system.rpc.call_arc( + let who_needs_resps = self + .system + .rpc + .call_many( &self.endpoint, - *to, - msg.clone(), + &who, + BlockRpc::NeedBlockQuery(*hash), RequestStrategy::with_priority(PRIO_BACKGROUND) .with_timeout(NEED_BLOCK_QUERY_TIMEOUT), ) - }); - let who_needs_resps = join_all(who_needs_fut).await; + .await?; let mut need_nodes = vec![]; - for (node, needed) in who.iter().zip(who_needs_resps.into_iter()) { + for (node, needed) in who_needs_resps.into_iter() { match needed.err_context("NeedBlockQuery RPC")? { BlockRpc::NeedBlockReply(needed) => { if needed { - need_nodes.push(*node); + need_nodes.push(node); } } m => { diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 2cb8ec46..5a872c7a 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -50,9 +50,8 @@ futures = "0.3" futures-util = "0.3" tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] } -#netapp = { version = "0.3.2", git = "https://git.deuxfleurs.fr/lx/netapp" } -#netapp = { version = "0.4", path = "../../../netapp" } -netapp = "0.4" +#netapp = "0.4" +netapp = { version = "0.4.4", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } opentelemetry = { version = "0.17", features = [ "rt-tokio" ] } opentelemetry-prometheus = "0.10" diff --git a/src/garage/admin.rs b/src/garage/admin.rs index 71ee608c..64a448fc 100644 --- a/src/garage/admin.rs +++ b/src/garage/admin.rs @@ -681,7 +681,7 @@ impl AdminRpcHandler { .endpoint .call( &node, - &AdminRpc::LaunchRepair(opt_to_send.clone()), + AdminRpc::LaunchRepair(opt_to_send.clone()), PRIO_NORMAL, ) .await; @@ -721,7 +721,7 @@ impl AdminRpcHandler { let node_id = (*node).into(); match self .endpoint - .call(&node_id, &AdminRpc::Stats(opt), PRIO_NORMAL) + .call(&node_id, AdminRpc::Stats(opt), PRIO_NORMAL) .await? 
{ Ok(AdminRpc::Ok(s)) => writeln!(&mut ret, "{}", s).unwrap(), diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs index 1aa2c2ff..c8b96489 100644 --- a/src/garage/cli/cmd.rs +++ b/src/garage/cli/cmd.rs @@ -47,7 +47,7 @@ pub async fn cli_command_dispatch( pub async fn cmd_status(rpc_cli: &Endpoint, rpc_host: NodeID) -> Result<(), Error> { let status = match rpc_cli - .call(&rpc_host, &SystemRpc::GetKnownNodes, PRIO_NORMAL) + .call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL) .await?? { SystemRpc::ReturnKnownNodes(nodes) => nodes, @@ -149,7 +149,7 @@ pub async fn cmd_connect( args: ConnectNodeOpt, ) -> Result<(), Error> { match rpc_cli - .call(&rpc_host, &SystemRpc::Connect(args.node), PRIO_NORMAL) + .call(&rpc_host, SystemRpc::Connect(args.node), PRIO_NORMAL) .await?? { SystemRpc::Ok => { @@ -165,7 +165,7 @@ pub async fn cmd_admin( rpc_host: NodeID, args: AdminRpc, ) -> Result<(), HelperError> { - match rpc_cli.call(&rpc_host, &args, PRIO_NORMAL).await?? { + match rpc_cli.call(&rpc_host, args, PRIO_NORMAL).await?? { AdminRpc::Ok(msg) => { println!("{}", msg); } diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs index db0af57c..3884bb92 100644 --- a/src/garage/cli/layout.rs +++ b/src/garage/cli/layout.rs @@ -36,7 +36,7 @@ pub async fn cmd_assign_role( args: AssignRoleOpt, ) -> Result<(), Error> { let status = match rpc_cli - .call(&rpc_host, &SystemRpc::GetKnownNodes, PRIO_NORMAL) + .call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL) .await?? { SystemRpc::ReturnKnownNodes(nodes) => nodes, @@ -245,7 +245,7 @@ pub async fn fetch_layout( rpc_host: NodeID, ) -> Result { match rpc_cli - .call(&rpc_host, &SystemRpc::PullClusterLayout, PRIO_NORMAL) + .call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL) .await?? { SystemRpc::AdvertiseClusterLayout(t) => Ok(t), @@ -261,7 +261,7 @@ pub async fn send_layout( rpc_cli .call( &rpc_host, - &SystemRpc::AdvertiseClusterLayout(layout), + SystemRpc::AdvertiseClusterLayout(layout), PRIO_NORMAL, ) .await??; diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index d908dc01..a97bce4d 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -40,9 +40,8 @@ futures-util = "0.3" tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] } opentelemetry = "0.17" -#netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" } -#netapp = { version = "0.4", path = "../../../netapp" } -netapp = "0.4" +#netapp = "0.4" +netapp = { version = "0.4.4", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } [features] k2v = [ "garage_util/k2v" ] diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index 73328993..5d5151cd 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -46,9 +46,8 @@ tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi tokio-stream = { version = "0.1", features = ["net"] } opentelemetry = "0.17" -#netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" } -#netapp = { version = "0.4", path = "../../../netapp", features = ["telemetry"] } -netapp = { version = "0.4.4", features = ["telemetry"] } +#netapp = { version = "0.4.4", features = ["telemetry"] } +netapp = { version = "0.4.4", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } hyper = { version = "0.14", features = ["client", "http1", "runtime", "tcp"] } diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs 
index 34717d3b..079cdc70 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -15,9 +15,9 @@ use opentelemetry::{ Context, }; -pub use netapp::endpoint::{Endpoint, EndpointHandler, Message as Rpc}; +pub use netapp::endpoint::{Endpoint, EndpointHandler}; +pub use netapp::message::{Message as Rpc, *}; use netapp::peering::fullmesh::FullMeshPeeringStrategy; -pub use netapp::proto::*; pub use netapp::{NetApp, NodeID}; use garage_util::background::BackgroundRunner; @@ -30,10 +30,8 @@ use crate::ring::Ring; const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); -// Try to never have more than 200MB of outgoing requests -// buffered at the same time. Other requests are queued until -// space is freed. -const REQUEST_BUFFER_SIZE: usize = 200 * 1024 * 1024; +// Don't allow more than 100 concurrent outgoing RPCs. +const MAX_CONCURRENT_REQUESTS: usize = 100; /// Strategy to apply when making RPC #[derive(Copy, Clone)] @@ -95,7 +93,7 @@ impl RpcHelper { background: Arc, ring: watch::Receiver>, ) -> Self { - let sem = Arc::new(Semaphore::new(REQUEST_BUFFER_SIZE)); + let sem = Arc::new(Semaphore::new(MAX_CONCURRENT_REQUESTS)); let metrics = RpcMetrics::new(sem.clone()); @@ -109,29 +107,16 @@ impl RpcHelper { })) } - pub async fn call( + pub async fn call( &self, endpoint: &Endpoint, to: Uuid, - msg: M, - strat: RequestStrategy, - ) -> Result - where - M: Rpc>, - H: EndpointHandler, - { - self.call_arc(endpoint, to, Arc::new(msg), strat).await - } - - pub async fn call_arc( - &self, - endpoint: &Endpoint, - to: Uuid, - msg: Arc, + msg: N, strat: RequestStrategy, ) -> Result where M: Rpc>, + N: IntoReq + Send, H: EndpointHandler, { let metric_tags = [ @@ -140,11 +125,10 @@ impl RpcHelper { KeyValue::new("to", format!("{:?}", to)), ]; - let msg_size = rmp_to_vec_all_named(&msg)?.len() as u32; let permit = self .0 .request_buffer_semaphore - .acquire_many(msg_size) + .acquire() .record_duration(&self.0.metrics.rpc_queueing_time, &metric_tags) .await?; @@ -152,7 +136,7 @@ impl RpcHelper { let node_id = to.into(); let rpc_call = endpoint - .call(&node_id, msg, strat.rs_priority) + .call_streaming(&node_id, msg, strat.rs_priority) .record_duration(&self.0.metrics.rpc_duration, &metric_tags); select! 
{ @@ -162,7 +146,7 @@ impl RpcHelper { if res.is_err() { self.0.metrics.rpc_netapp_error_counter.add(1, &metric_tags); } - let res = res?; + let res = res?.into_msg(); if res.is_err() { self.0.metrics.rpc_garage_error_counter.add(1, &metric_tags); @@ -178,37 +162,41 @@ impl RpcHelper { } } - pub async fn call_many( + pub async fn call_many( &self, endpoint: &Endpoint, to: &[Uuid], - msg: M, + msg: N, strat: RequestStrategy, - ) -> Vec<(Uuid, Result)> + ) -> Result)>, Error> where M: Rpc>, + N: IntoReq, H: EndpointHandler, { - let msg = Arc::new(msg); + let msg = msg.into_req().map_err(netapp::error::Error::from)?; + let resps = join_all( to.iter() - .map(|to| self.call_arc(endpoint, *to, msg.clone(), strat)), + .map(|to| self.call(endpoint, *to, msg.clone(), strat)), ) .await; - to.iter() + Ok(to + .iter() .cloned() .zip(resps.into_iter()) - .collect::>() + .collect::>()) } - pub async fn broadcast( + pub async fn broadcast( &self, endpoint: &Endpoint, - msg: M, + msg: N, strat: RequestStrategy, - ) -> Vec<(Uuid, Result)> + ) -> Result)>, Error> where M: Rpc>, + N: IntoReq, H: EndpointHandler, { let to = self @@ -262,20 +250,21 @@ impl RpcHelper { .await } - async fn try_call_many_internal( + async fn try_call_many_internal( &self, endpoint: &Arc>, to: &[Uuid], - msg: M, + msg: N, strategy: RequestStrategy, quorum: usize, ) -> Result, Error> where M: Rpc> + 'static, + N: IntoReq, H: EndpointHandler + 'static, S: Send + 'static, { - let msg = Arc::new(msg); + let msg = msg.into_req().map_err(netapp::error::Error::from)?; // Build future for each request // They are not started now: they are added below in a FuturesUnordered @@ -285,7 +274,7 @@ impl RpcHelper { let msg = msg.clone(); let endpoint2 = endpoint.clone(); (to, async move { - self2.call_arc(&endpoint2, to, msg, strategy).await + self2.call(&endpoint2, to, msg, strategy).await }) }); diff --git a/src/rpc/system.rs b/src/rpc/system.rs index f9f2970b..04ef2f69 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -16,8 +16,8 @@ use tokio::sync::watch; use tokio::sync::Mutex; use netapp::endpoint::{Endpoint, EndpointHandler}; +use netapp::message::*; use netapp::peering::fullmesh::FullMeshPeeringStrategy; -use netapp::proto::*; use netapp::util::parse_and_resolve_peer_addr; use netapp::{NetApp, NetworkKey, NodeID, NodeKey}; @@ -544,7 +544,7 @@ impl System { SystemRpc::AdvertiseClusterLayout(layout), RequestStrategy::with_priority(PRIO_HIGH), ) - .await; + .await?; Ok(()) }); self.background.spawn(self.clone().save_cluster_layout()); @@ -559,7 +559,8 @@ impl System { self.update_local_status(); let local_status: NodeStatus = self.local_status.load().as_ref().clone(); - self.rpc + let _ = self + .rpc .broadcast( &self.system_endpoint, SystemRpc::AdvertiseStatus(local_status), diff --git a/src/table/schema.rs b/src/table/schema.rs index 74f57798..f37e98d8 100644 --- a/src/table/schema.rs +++ b/src/table/schema.rs @@ -60,7 +60,7 @@ pub trait Entry: } /// Trait for the schema used in a table -pub trait TableSchema: Send + Sync { +pub trait TableSchema: Send + Sync + 'static { /// The name of the table in the database const TABLE_NAME: &'static str; diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index 7d79f21a..89064592 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -36,9 +36,8 @@ toml = "0.5" futures = "0.3" tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] } -#netapp = { version = "0.3.0", git = 
"https://git.deuxfleurs.fr/lx/netapp" } -#netapp = { version = "0.4", path = "../../../netapp" } -netapp = "0.4" +#netapp = "0.4" +netapp = { version = "0.4.4", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } http = "0.2" hyper = "0.14" -- cgit v1.2.3 From a35d4da721db3550a2833d8576d4283bc999e8df Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 22 Jul 2022 16:45:45 +0200 Subject: update netapp to 0.5 --- src/garage/Cargo.toml | 2 +- src/model/Cargo.toml | 2 +- src/rpc/Cargo.toml | 2 +- src/util/Cargo.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 5a872c7a..1e96f1f3 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -51,7 +51,7 @@ futures-util = "0.3" tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] } #netapp = "0.4" -netapp = { version = "0.4.4", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } +netapp = { version = "0.5", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } opentelemetry = { version = "0.17", features = [ "rt-tokio" ] } opentelemetry-prometheus = "0.10" diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index a97bce4d..73011e0d 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -41,7 +41,7 @@ tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi opentelemetry = "0.17" #netapp = "0.4" -netapp = { version = "0.4.4", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } +netapp = { version = "0.5", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } [features] k2v = [ "garage_util/k2v" ] diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index 5d5151cd..5986b7bf 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -47,7 +47,7 @@ tokio-stream = { version = "0.1", features = ["net"] } opentelemetry = "0.17" #netapp = { version = "0.4.4", features = ["telemetry"] } -netapp = { version = "0.4.4", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } +netapp = { version = "0.5.0", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } hyper = { version = "0.14", features = ["client", "http1", "runtime", "tcp"] } diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index 89064592..a70f68b9 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -37,7 +37,7 @@ futures = "0.3" tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] } #netapp = "0.4" -netapp = { version = "0.4.4", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } +netapp = { version = "0.5", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } http = "0.2" hyper = "0.14" -- cgit v1.2.3 From 605a630333c8ee60c55fe011a375c01277bba173 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 22 Jul 2022 18:20:27 +0200 Subject: Use streaming in block manager --- src/api/s3/copy.rs | 12 ++- src/api/s3/get.rs | 29 ++++-- src/block/Cargo.toml | 3 + src/block/block.rs | 37 ++++++-- src/block/manager.rs | 249 +++++++++++++++++++++++++++++++++++++++----------- 
src/rpc/rpc_helper.rs | 24 +++-- 6 files changed, 267 insertions(+), 87 deletions(-) (limited to 'src') diff --git a/src/api/s3/copy.rs b/src/api/s3/copy.rs index 4415a037..54a565e0 100644 --- a/src/api/s3/copy.rs +++ b/src/api/s3/copy.rs @@ -5,6 +5,7 @@ use std::time::{Duration, SystemTime, UNIX_EPOCH}; use futures::{stream, stream::Stream, StreamExt, TryFutureExt}; use md5::{Digest as Md5Digest, Md5}; +use bytes::Bytes; use hyper::{Body, Request, Response}; use serde::Serialize; @@ -311,7 +312,7 @@ pub async fn handle_upload_part_copy( stream::once(async move { let data = garage3.block_manager.rpc_get_block(&block_hash).await?; match range_to_copy { - Some(r) => Ok((data[r].to_vec(), None)), + Some(r) => Ok((data.slice(r), None)), None => Ok((data, Some(block_hash))), } }) @@ -556,7 +557,7 @@ impl CopyPreconditionHeaders { } } -type BlockStreamItemOk = (Vec, Option); +type BlockStreamItemOk = (Bytes, Option); type BlockStreamItem = Result; struct Defragmenter> { @@ -589,7 +590,7 @@ impl> Defragmenter { if self.buffer.is_empty() { let (next_block, next_block_hash) = self.block_stream.next().await.unwrap()?; - self.buffer = next_block; + self.buffer = next_block.to_vec(); // TODO TOO MUCH COPY self.hash = next_block_hash; } else if self.buffer.len() + peeked_next_block.len() > self.block_size { break; @@ -600,7 +601,10 @@ impl> Defragmenter { } } - Ok((std::mem::take(&mut self.buffer), self.hash.take())) + Ok(( + Bytes::from(std::mem::take(&mut self.buffer)), + self.hash.take(), + )) } } diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index 7fa1a177..7d118f89 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -242,10 +242,13 @@ pub async fn handle_get( Ok(resp_builder.body(body)?) } ObjectVersionData::FirstBlock(_, first_block_hash) => { - let read_first_block = garage.block_manager.rpc_get_block(first_block_hash); + let read_first_block = garage + .block_manager + .rpc_get_block_streaming(first_block_hash); let get_next_blocks = garage.version_table.get(&last_v.uuid, &EmptyKey); - let (first_block, version) = futures::try_join!(read_first_block, get_next_blocks)?; + let (first_block_stream, version) = + futures::try_join!(read_first_block, get_next_blocks)?; let version = version.ok_or(Error::NoSuchKey)?; let mut blocks = version @@ -254,24 +257,32 @@ pub async fn handle_get( .iter() .map(|(_, vb)| (vb.hash, None)) .collect::>(); - blocks[0].1 = Some(first_block); + blocks[0].1 = Some(first_block_stream); let body_stream = futures::stream::iter(blocks) - .map(move |(hash, data_opt)| { + .map(move |(hash, stream_opt)| { let garage = garage.clone(); async move { - if let Some(data) = data_opt { - Ok(Bytes::from(data)) + if let Some(stream) = stream_opt { + stream } else { garage .block_manager - .rpc_get_block(&hash) + .rpc_get_block_streaming(&hash) .await - .map(Bytes::from) + .unwrap_or_else(|_| { + Box::pin(futures::stream::once(async move { + Err(std::io::Error::new( + std::io::ErrorKind::Other, + "Could not get next block", + )) + })) + }) } } }) - .buffered(2); + .buffered(3) + .flatten(); let body = hyper::body::Body::wrap_stream(body_stream); Ok(resp_builder.body(body)?) 
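The streaming GET path above assembles the response body by mapping each block hash to a byte stream, keeping up to three block fetches in flight, and flattening the streams in order. A minimal standalone sketch of that pattern with the futures crate follows; fetch_block and the integer "hashes" are placeholders for illustration, not Garage's actual API.

use bytes::Bytes;
use futures::stream::{self, Stream, StreamExt};

// Stand-in for fetching one block as a stream of byte chunks
// (in Garage this role is played by rpc_get_block_streaming).
async fn fetch_block(hash: u64) -> impl Stream<Item = Result<Bytes, std::io::Error>> {
    stream::once(async move { Ok(Bytes::from(format!("data of block {}", hash))) })
}

// Turn an ordered list of block hashes into a single body stream:
// start up to 3 fetches ahead of the consumer, then concatenate
// the per-block streams in their original order.
fn body_stream(hashes: Vec<u64>) -> impl Stream<Item = Result<Bytes, std::io::Error>> {
    stream::iter(hashes)
        .map(fetch_block)
        .buffered(3)
        .flatten()
}

Because buffered() preserves item order while overlapping the futures, the body bytes stay in sequence even though the next blocks are being prefetched concurrently.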
diff --git a/src/block/Cargo.toml b/src/block/Cargo.toml index 2555a44a..3e6f7bc0 100644 --- a/src/block/Cargo.toml +++ b/src/block/Cargo.toml @@ -27,6 +27,8 @@ bytes = "1.0" hex = "0.4" tracing = "0.1.30" rand = "0.8" + +async-compression = { version = "0.3", features = ["tokio", "zstd"] } zstd = { version = "0.9", default-features = false } rmp-serde = "0.15" @@ -36,3 +38,4 @@ serde_bytes = "0.11" futures = "0.3" futures-util = "0.3" tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] } +tokio-util = { version = "0.6", features = ["io"] } diff --git a/src/block/block.rs b/src/block/block.rs index f17bd2c0..935aa900 100644 --- a/src/block/block.rs +++ b/src/block/block.rs @@ -5,13 +5,18 @@ use zstd::stream::{decode_all as zstd_decode, Encoder}; use garage_util::data::*; use garage_util::error::*; +#[derive(Debug, Serialize, Deserialize, Copy, Clone)] +pub enum DataBlockHeader { + Plain, + Compressed, +} + /// A possibly compressed block of data -#[derive(Debug, Serialize, Deserialize)] pub enum DataBlock { /// Uncompressed data - Plain(#[serde(with = "serde_bytes")] Vec), + Plain(Bytes), /// Data compressed with zstd - Compressed(#[serde(with = "serde_bytes")] Vec), + Compressed(Bytes), } impl DataBlock { @@ -31,7 +36,7 @@ impl DataBlock { /// Get the buffer, possibly decompressing it, and verify it's integrity. /// For Plain block, data is compared to hash, for Compressed block, zstd checksumming system /// is used instead. - pub fn verify_get(self, hash: Hash) -> Result, Error> { + pub fn verify_get(self, hash: Hash) -> Result { match self { DataBlock::Plain(data) => { if blake2sum(&data) == hash { @@ -40,9 +45,9 @@ impl DataBlock { Err(Error::CorruptData(hash)) } } - DataBlock::Compressed(data) => { - zstd_decode(&data[..]).map_err(|_| Error::CorruptData(hash)) - } + DataBlock::Compressed(data) => zstd_decode(&data[..]) + .map_err(|_| Error::CorruptData(hash)) + .map(Bytes::from), } } @@ -66,14 +71,28 @@ impl DataBlock { tokio::task::spawn_blocking(move || { if let Some(level) = level { if let Ok(data) = zstd_encode(&data[..], level) { - return DataBlock::Compressed(data); + return DataBlock::Compressed(data.into()); } } - DataBlock::Plain(data.to_vec()) // TODO: remove to_vec here + DataBlock::Plain(data) }) .await .unwrap() } + + pub fn into_parts(self) -> (DataBlockHeader, Bytes) { + match self { + DataBlock::Plain(data) => (DataBlockHeader::Plain, data), + DataBlock::Compressed(data) => (DataBlockHeader::Compressed, data), + } + } + + pub fn from_parts(h: DataBlockHeader, bytes: Bytes) -> Self { + match h { + DataBlockHeader::Plain => DataBlock::Plain(bytes), + DataBlockHeader::Compressed => DataBlock::Compressed(bytes), + } + } } fn zstd_encode(mut source: R, level: i32) -> std::io::Result> { diff --git a/src/block/manager.rs b/src/block/manager.rs index 408de148..bb01c300 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -1,5 +1,6 @@ use std::convert::TryInto; use std::path::PathBuf; +use std::pin::Pin; use std::sync::Arc; use std::time::Duration; @@ -8,8 +9,10 @@ use async_trait::async_trait; use bytes::Bytes; use serde::{Deserialize, Serialize}; +use futures::{Stream, TryStreamExt}; +use futures_util::stream::StreamExt; use tokio::fs; -use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tokio::io::{AsyncReadExt, AsyncWriteExt, BufReader}; use tokio::select; use tokio::sync::{mpsc, watch, Mutex, Notify}; @@ -18,6 +21,8 @@ use opentelemetry::{ Context, KeyValue, }; +use 
garage_rpc::rpc_helper::netapp::stream::{stream_asyncread, ByteStream}; + use garage_db as db; use garage_db::counted_tree_hack::CountedTree; @@ -70,7 +75,7 @@ pub enum BlockRpc { /// block PutBlock { hash: Hash, - data: DataBlock, + header: DataBlockHeader, }, /// Ask other node if they should have this block, but don't actually have it NeedBlockQuery(Hash), @@ -174,56 +179,146 @@ impl BlockManager { } /// Ask nodes that might have a (possibly compressed) block for it + /// Return it as a stream with a header + async fn rpc_get_raw_block_streaming( + &self, + hash: &Hash, + ) -> Result<(DataBlockHeader, ByteStream), Error> { + let who = self.replication.read_nodes(hash); + + for node in who.iter() { + let node_id = NodeID::from(*node); + let rpc = + self.endpoint + .call_streaming(&node_id, BlockRpc::GetBlock(*hash), PRIO_NORMAL); + tokio::select! { + res = rpc => { + let res = match res { + Ok(res) => res, + Err(e) => { + debug!("Node {:?} returned error: {}", node, e); + continue; + } + }; + let (header, stream) = match res.into_parts() { + (Ok(BlockRpc::PutBlock { hash: _, header }), Some(stream)) => (header, stream), + _ => { + debug!("Node {:?} returned a malformed response", node); + continue; + } + }; + return Ok((header, stream)); + } + _ = tokio::time::sleep(BLOCK_RW_TIMEOUT) => { + debug!("Node {:?} didn't return block in time, trying next.", node); + } + }; + } + + Err(Error::Message(format!( + "Unable to read block {:?}: no node returned a valid block", + hash + ))) + } + + /// Ask nodes that might have a (possibly compressed) block for it + /// Return its entire body async fn rpc_get_raw_block(&self, hash: &Hash) -> Result { let who = self.replication.read_nodes(hash); - let resps = self - .system - .rpc - .try_call_many( - &self.endpoint, - &who[..], - BlockRpc::GetBlock(*hash), - RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(1) - .with_timeout(BLOCK_RW_TIMEOUT) - .interrupt_after_quorum(true), - ) - .await?; - for resp in resps { - if let BlockRpc::PutBlock { data, .. } = resp { - return Ok(data); - } + for node in who.iter() { + let node_id = NodeID::from(*node); + let rpc = + self.endpoint + .call_streaming(&node_id, BlockRpc::GetBlock(*hash), PRIO_NORMAL); + tokio::select! 
{ + res = rpc => { + let res = match res { + Ok(res) => res, + Err(e) => { + debug!("Node {:?} returned error: {}", node, e); + continue; + } + }; + let (header, stream) = match res.into_parts() { + (Ok(BlockRpc::PutBlock { hash: _, header }), Some(stream)) => (header, stream), + _ => { + debug!("Node {:?} returned a malformed response", node); + continue; + } + }; + match read_stream_to_end(stream).await { + Ok(bytes) => return Ok(DataBlock::from_parts(header, bytes)), + Err(e) => { + debug!("Error reading stream from node {:?}: {}", node, e); + } + } + } + _ = tokio::time::sleep(BLOCK_RW_TIMEOUT) => { + debug!("Node {:?} didn't return block in time, trying next.", node); + } + }; } + Err(Error::Message(format!( - "Unable to read block {:?}: no valid blocks returned", + "Unable to read block {:?}: no node returned a valid block", hash ))) } // ---- Public interface ---- + /// Ask nodes that might have a block for it, + /// return it as a stream + pub async fn rpc_get_block_streaming( + &self, + hash: &Hash, + ) -> Result< + Pin> + Send + Sync + 'static>>, + Error, + > { + let (header, stream) = self.rpc_get_raw_block_streaming(hash).await?; + match header { + DataBlockHeader::Plain => Ok(Box::pin(stream.map_err(|_| { + std::io::Error::new(std::io::ErrorKind::Other, "netapp stream error") + }))), + DataBlockHeader::Compressed => { + // Too many things, I hate it. + let reader = stream_asyncread(stream); + let reader = BufReader::new(reader); + let reader = async_compression::tokio::bufread::ZstdDecoder::new(reader); + Ok(Box::pin(tokio_util::io::ReaderStream::new(reader))) + } + } + } + /// Ask nodes that might have a block for it - pub async fn rpc_get_block(&self, hash: &Hash) -> Result, Error> { + pub async fn rpc_get_block(&self, hash: &Hash) -> Result { self.rpc_get_raw_block(hash).await?.verify_get(*hash) } /// Send block to nodes that should have it pub async fn rpc_put_block(&self, hash: Hash, data: Bytes) -> Result<(), Error> { let who = self.replication.write_nodes(&hash); - let data = DataBlock::from_buffer(data, self.compression_level).await; + + let (header, bytes) = DataBlock::from_buffer(data, self.compression_level) + .await + .into_parts(); + let put_block_rpc = + Req::new(BlockRpc::PutBlock { hash, header })?.with_stream_from_buffer(bytes); + self.system .rpc .try_call_many( &self.endpoint, &who[..], - // TODO: remove to_vec() here - BlockRpc::PutBlock { hash, data }, + put_block_rpc, RequestStrategy::with_priority(PRIO_NORMAL) .with_quorum(self.replication.write_quorum()) .with_timeout(BLOCK_RW_TIMEOUT), ) .await?; + Ok(()) } @@ -308,13 +403,25 @@ impl BlockManager { // ---- Reading and writing blocks locally ---- + async fn handle_put_block( + &self, + hash: Hash, + header: DataBlockHeader, + stream: Option, + ) -> Result<(), Error> { + let stream = stream.ok_or_message("missing stream")?; + let bytes = read_stream_to_end(stream).await?; + let data = DataBlock::from_parts(header, bytes); + self.write_block(&hash, &data).await + } + /// Write a block to disk - async fn write_block(&self, hash: &Hash, data: &DataBlock) -> Result { + async fn write_block(&self, hash: &Hash, data: &DataBlock) -> Result<(), Error> { let tracer = opentelemetry::global::tracer("garage"); let write_size = data.inner_buffer().len() as u64; - let res = self.mutation_lock[hash.as_slice()[0] as usize] + self.mutation_lock[hash.as_slice()[0] as usize] .lock() .with_context(Context::current_with_span( tracer.start("Acquire mutation_lock"), @@ -329,21 +436,31 @@ impl BlockManager { 
self.metrics.bytes_written.add(write_size); - Ok(res) + Ok(()) } - /// Read block from disk, verifying it's integrity - pub(crate) async fn read_block(&self, hash: &Hash) -> Result { - let data = self - .read_block_internal(hash) - .bound_record_duration(&self.metrics.block_read_duration) - .await?; + async fn handle_get_block(&self, hash: &Hash) -> Resp { + let block = match self.read_block(hash).await { + Ok(data) => data, + Err(e) => return Resp::new(Err(e)), + }; + + let (header, data) = block.into_parts(); - self.metrics - .bytes_read - .add(data.inner_buffer().len() as u64); + self.metrics.bytes_read.add(data.len() as u64); - Ok(BlockRpc::PutBlock { hash: *hash, data }) + Resp::new(Ok(BlockRpc::PutBlock { + hash: *hash, + header, + })) + .with_stream_from_buffer(data) + } + + /// Read block from disk, verifying it's integrity + pub(crate) async fn read_block(&self, hash: &Hash) -> Result { + self.read_block_internal(hash) + .bound_record_duration(&self.metrics.block_read_duration) + .await } async fn read_block_internal(&self, hash: &Hash) -> Result { @@ -366,9 +483,9 @@ impl BlockManager { drop(f); let data = if compressed { - DataBlock::Compressed(data) + DataBlock::Compressed(data.into()) } else { - DataBlock::Plain(data) + DataBlock::Plain(data.into()) }; if data.verify(*hash).is_err() { @@ -675,7 +792,13 @@ impl BlockManager { .add(1, &[KeyValue::new("to", format!("{:?}", node))]); } - let put_block_message = self.read_block(hash).await?; + let block = self.read_block(hash).await?; + let (header, bytes) = block.into_parts(); + let put_block_message = Req::new(BlockRpc::PutBlock { + hash: *hash, + header, + })? + .with_stream_from_buffer(bytes); self.system .rpc .try_call_many( @@ -723,17 +846,19 @@ impl BlockManager { } #[async_trait] -impl EndpointHandler for BlockManager { - async fn handle( - self: &Arc, - message: &BlockRpc, - _from: NodeID, - ) -> Result { - match message { - BlockRpc::PutBlock { hash, data } => self.write_block(hash, data).await, - BlockRpc::GetBlock(h) => self.read_block(h).await, - BlockRpc::NeedBlockQuery(h) => self.need_block(h).await.map(BlockRpc::NeedBlockReply), - m => Err(Error::unexpected_rpc_message(m)), +impl StreamingEndpointHandler for BlockManager { + async fn handle(self: &Arc, mut message: Req, _from: NodeID) -> Resp { + match message.msg() { + BlockRpc::PutBlock { hash, header } => Resp::new( + self.handle_put_block(*hash, *header, message.take_stream()) + .await + .map(|_| BlockRpc::Ok), + ), + BlockRpc::GetBlock(h) => self.handle_get_block(h).await, + BlockRpc::NeedBlockQuery(h) => { + Resp::new(self.need_block(h).await.map(BlockRpc::NeedBlockReply)) + } + m => Resp::new(Err(Error::unexpected_rpc_message(m))), } } } @@ -831,7 +956,7 @@ impl BlockManagerLocked { hash: &Hash, data: &DataBlock, mgr: &BlockManager, - ) -> Result { + ) -> Result<(), Error> { let compressed = data.is_compressed(); let data = data.inner_buffer(); @@ -842,8 +967,8 @@ impl BlockManagerLocked { fs::create_dir_all(&directory).await?; let to_delete = match (mgr.is_block_compressed(hash).await, compressed) { - (Ok(true), _) => return Ok(BlockRpc::Ok), - (Ok(false), false) => return Ok(BlockRpc::Ok), + (Ok(true), _) => return Ok(()), + (Ok(false), false) => return Ok(()), (Ok(false), true) => { let path_to_delete = path.clone(); path.set_extension("zst"); @@ -882,7 +1007,7 @@ impl BlockManagerLocked { dir.sync_all().await?; drop(dir); - Ok(BlockRpc::Ok) + Ok(()) } async fn move_block_to_corrupted(&self, hash: &Hash, mgr: &BlockManager) -> Result<(), Error> { @@ -963,3 
+1088,17 @@ impl ErrorCounter { self.last_try + self.delay_msec() } } + +async fn read_stream_to_end(mut stream: ByteStream) -> Result { + let mut parts: Vec = vec![]; + while let Some(part) = stream.next().await { + parts.push(part.ok_or_message("error in stream")?); + } + + Ok(parts + .iter() + .map(|x| &x[..]) + .collect::>() + .concat() + .into()) +} diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 079cdc70..6e098446 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -15,10 +15,13 @@ use opentelemetry::{ Context, }; -pub use netapp::endpoint::{Endpoint, EndpointHandler}; -pub use netapp::message::{Message as Rpc, *}; +pub use netapp::endpoint::{Endpoint, EndpointHandler, StreamingEndpointHandler}; +use netapp::message::IntoReq; +pub use netapp::message::{ + Message as Rpc, Req, RequestPriority, Resp, PRIO_BACKGROUND, PRIO_HIGH, PRIO_NORMAL, +}; use netapp::peering::fullmesh::FullMeshPeeringStrategy; -pub use netapp::{NetApp, NodeID}; +pub use netapp::{self, NetApp, NodeID}; use garage_util::background::BackgroundRunner; use garage_util::data::*; @@ -117,7 +120,7 @@ impl RpcHelper { where M: Rpc>, N: IntoReq + Send, - H: EndpointHandler, + H: StreamingEndpointHandler, { let metric_tags = [ KeyValue::new("rpc_endpoint", endpoint.path().to_string()), @@ -172,7 +175,7 @@ impl RpcHelper { where M: Rpc>, N: IntoReq, - H: EndpointHandler, + H: StreamingEndpointHandler, { let msg = msg.into_req().map_err(netapp::error::Error::from)?; @@ -197,7 +200,7 @@ impl RpcHelper { where M: Rpc>, N: IntoReq, - H: EndpointHandler, + H: StreamingEndpointHandler, { let to = self .0 @@ -211,16 +214,17 @@ impl RpcHelper { /// Make a RPC call to multiple servers, returning either a Vec of responses, /// or an error if quorum could not be reached due to too many errors - pub async fn try_call_many( + pub async fn try_call_many( &self, endpoint: &Arc>, to: &[Uuid], - msg: M, + msg: N, strategy: RequestStrategy, ) -> Result, Error> where M: Rpc> + 'static, - H: EndpointHandler + 'static, + N: IntoReq, + H: StreamingEndpointHandler + 'static, S: Send + 'static, { let quorum = strategy.rs_quorum.unwrap_or(to.len()); @@ -261,7 +265,7 @@ impl RpcHelper { where M: Rpc> + 'static, N: IntoReq, - H: EndpointHandler + 'static, + H: StreamingEndpointHandler + 'static, S: Send + 'static, { let msg = msg.into_req().map_err(netapp::error::Error::from)?; -- cgit v1.2.3 From 68087ee13dc22dbaeb0c1fa8dcb4bdbaa82098a6 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 22 Jul 2022 19:06:56 +0200 Subject: Fix clippy --- src/api/s3/copy.rs | 5 +---- src/api/s3/get.rs | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'src') diff --git a/src/api/s3/copy.rs b/src/api/s3/copy.rs index 54a565e0..b54cbd23 100644 --- a/src/api/s3/copy.rs +++ b/src/api/s3/copy.rs @@ -366,10 +366,7 @@ pub async fn handle_upload_part_copy( // we need to insert that data as a new block. 
async move { if must_upload { - garage2 - .block_manager - .rpc_put_block(final_hash, data.into()) - .await + garage2.block_manager.rpc_put_block(final_hash, data).await } else { Ok(()) } diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index 7d118f89..c7621ade 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -450,7 +450,6 @@ fn body_from_blocks_range( let garage = garage.clone(); async move { let data = garage.block_manager.rpc_get_block(&block.hash).await?; - let data = Bytes::from(data); let start_in_block = if true_offset > begin { 0 } else { -- cgit v1.2.3 From e935861854deed5d1ca66767fc51d9849201a4dd Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 25 Jul 2022 18:19:35 +0200 Subject: Factor out node request order selection logic & use in manager --- src/block/manager.rs | 2 ++ src/rpc/rpc_helper.rs | 95 ++++++++++++++++++++++++++++++--------------------- 2 files changed, 58 insertions(+), 39 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index bb01c300..80c52510 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -185,6 +185,7 @@ impl BlockManager { hash: &Hash, ) -> Result<(DataBlockHeader, ByteStream), Error> { let who = self.replication.read_nodes(hash); + //let who = self.system.rpc.request_order(&who); for node in who.iter() { let node_id = NodeID::from(*node); @@ -225,6 +226,7 @@ impl BlockManager { /// Return its entire body async fn rpc_get_raw_block(&self, hash: &Hash) -> Result { let who = self.replication.read_nodes(hash); + //let who = self.system.rpc.request_order(&who); for node in who.iter() { let node_id = NodeID::from(*node); diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 6e098446..ddabd636 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -292,47 +292,19 @@ impl RpcHelper { // to reach a quorum, priorizing nodes with the lowest latency. // When there are errors, we start new requests to compensate. - // Retrieve some status variables that we will use to sort requests - let peer_list = self.0.fullmesh.get_peer_list(); - let ring: Arc = self.0.ring.borrow().clone(); - let our_zone = match ring.layout.node_role(&self.0.our_node_id) { - Some(pc) => &pc.zone, - None => "", - }; - - // Augment requests with some information used to sort them. - // The tuples are as follows: - // (is another node?, is another zone?, latency, node ID, request future) - // We store all of these tuples in a vec that we can sort. - // By sorting this vec, we priorize ourself, then nodes in the same zone, - // and within a same zone we priorize nodes with the lowest latency. 
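As an aside, the prioritization described in the comment above falls out of Rust's lexicographic tuple ordering, in which false sorts before true; a minimal, self-contained sketch of the resulting order (node labels and ping values below are made up for illustration, this is not the real code path):

use std::time::Duration;

fn main() {
    // Key: (is another node?, is another zone?, average ping)
    let mut nodes = vec![
        (true, true, Duration::from_millis(80), "remote-zone"),
        (false, false, Duration::from_millis(0), "self"),
        (true, false, Duration::from_millis(120), "same-zone-slow"),
        (true, false, Duration::from_millis(5), "same-zone-fast"),
    ];
    // false sorts before true, so "self" comes first, then nodes in the same
    // zone ordered by latency, then nodes in other zones.
    nodes.sort_by_key(|(diffnode, diffzone, ping, _)| (*diffnode, *diffzone, *ping));
    let order: Vec<_> = nodes.iter().map(|(_, _, _, name)| *name).collect();
    assert_eq!(order, vec!["self", "same-zone-fast", "same-zone-slow", "remote-zone"]);
}
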
- let mut requests = requests - .map(|(to, fut)| { - let peer_zone = match ring.layout.node_role(&to) { - Some(pc) => &pc.zone, - None => "", - }; - let peer_avg_ping = peer_list - .iter() - .find(|x| x.id.as_ref() == to.as_slice()) - .and_then(|pi| pi.avg_ping) - .unwrap_or_else(|| Duration::from_secs(1)); - ( - to != self.0.our_node_id, - peer_zone != our_zone, - peer_avg_ping, - to, - fut, - ) - }) + // Reorder requests to priorize closeness / low latency + let request_order = self.request_order(to); + let mut ord_requests = vec![(); request_order.len()] + .into_iter() + .map(|_| None) .collect::>(); - - // Sort requests by (priorize ourself, priorize same zone, priorize low latency) - requests - .sort_by_key(|(diffnode, diffzone, ping, _to, _fut)| (*diffnode, *diffzone, *ping)); + for (to, fut) in requests { + let i = request_order.iter().position(|x| *x == to).unwrap(); + ord_requests[i] = Some((to, fut)); + } // Make an iterator to take requests in their sorted order - let mut requests = requests.into_iter(); + let mut requests = ord_requests.into_iter().map(Option::unwrap); // resp_stream will contain all of the requests that are currently in flight. // (for the moment none, they will be added in the loop below) @@ -343,7 +315,7 @@ impl RpcHelper { // If the current set of requests that are running is not enough to possibly // reach quorum, start some new requests. while successes.len() + resp_stream.len() < quorum { - if let Some((_, _, _, req_to, fut)) = requests.next() { + if let Some((req_to, fut)) = requests.next() { let tracer = opentelemetry::global::tracer("garage"); let span = tracer.start(format!("RPC to {:?}", req_to)); resp_stream.push(tokio::spawn( @@ -413,4 +385,49 @@ impl RpcHelper { Err(Error::Quorum(quorum, successes.len(), to.len(), errors)) } } + + pub fn request_order(&self, nodes: &[Uuid]) -> Vec { + // Retrieve some status variables that we will use to sort requests + let peer_list = self.0.fullmesh.get_peer_list(); + let ring: Arc = self.0.ring.borrow().clone(); + let our_zone = match ring.layout.node_role(&self.0.our_node_id) { + Some(pc) => &pc.zone, + None => "", + }; + + // Augment requests with some information used to sort them. + // The tuples are as follows: + // (is another node?, is another zone?, latency, node ID, request future) + // We store all of these tuples in a vec that we can sort. + // By sorting this vec, we priorize ourself, then nodes in the same zone, + // and within a same zone we priorize nodes with the lowest latency. + let mut nodes = nodes + .iter() + .map(|to| { + let peer_zone = match ring.layout.node_role(&to) { + Some(pc) => &pc.zone, + None => "", + }; + let peer_avg_ping = peer_list + .iter() + .find(|x| x.id.as_ref() == to.as_slice()) + .and_then(|pi| pi.avg_ping) + .unwrap_or_else(|| Duration::from_secs(1)); + ( + *to != self.0.our_node_id, + peer_zone != our_zone, + peer_avg_ping, + *to, + ) + }) + .collect::>(); + + // Sort requests by (priorize ourself, priorize same zone, priorize low latency) + nodes.sort_by_key(|(diffnode, diffzone, ping, _to)| (*diffnode, *diffzone, *ping)); + + nodes + .into_iter() + .map(|(_, _, _, to)| to) + .collect::>() + } } -- cgit v1.2.3 From 2c7bae935ac68acab831fe86e5330d3c9a84a953 Mon Sep 17 00:00:00 2001 From: Quentin Dufour Date: Wed, 10 Aug 2022 12:18:44 +0200 Subject: Configure structopt to report the right version By default, structopt reports the value provided by the env var CARGO_PKG_VERSION, feeded by Cargo when reading Cargo.toml. 
However for Garage we use a versioning based on git, so we often report a version that is behind the real version. In this commit, we create garage_util::version::garage() that reports the right version and configure all structopt subcommands to call this function instead of using the env var. --- src/garage/cli/structs.rs | 101 +++++++++++++++++++++++----------------------- src/garage/main.rs | 3 +- src/rpc/Cargo.toml | 1 - src/rpc/system.rs | 7 +--- src/util/Cargo.toml | 2 + src/util/lib.rs | 1 + src/util/version.rs | 7 ++++ 7 files changed, 65 insertions(+), 57 deletions(-) create mode 100644 src/util/version.rs (limited to 'src') diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index bc44b5ef..9274f80f 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -1,64 +1,65 @@ use serde::{Deserialize, Serialize}; +use garage_util::version; use structopt::StructOpt; #[derive(StructOpt, Debug)] pub enum Command { /// Run Garage server - #[structopt(name = "server")] + #[structopt(name = "server", version = version::garage())] Server, /// Get network status - #[structopt(name = "status")] + #[structopt(name = "status", version = version::garage())] Status, /// Operations on individual Garage nodes - #[structopt(name = "node")] + #[structopt(name = "node", version = version::garage())] Node(NodeOperation), /// Operations on the assignation of node roles in the cluster layout - #[structopt(name = "layout")] + #[structopt(name = "layout", version = version::garage())] Layout(LayoutOperation), /// Operations on buckets - #[structopt(name = "bucket")] + #[structopt(name = "bucket", version = version::garage())] Bucket(BucketOperation), /// Operations on S3 access keys - #[structopt(name = "key")] + #[structopt(name = "key", version = version::garage())] Key(KeyOperation), /// Run migrations from previous Garage version /// (DO NOT USE WITHOUT READING FULL DOCUMENTATION) - #[structopt(name = "migrate")] + #[structopt(name = "migrate", version = version::garage())] Migrate(MigrateOpt), /// Start repair of node data on remote node - #[structopt(name = "repair")] + #[structopt(name = "repair", version = version::garage())] Repair(RepairOpt), /// Offline reparation of node data (these repairs must be run offline /// directly on the server node) - #[structopt(name = "offline-repair")] + #[structopt(name = "offline-repair", version = version::garage())] OfflineRepair(OfflineRepairOpt), /// Gather node statistics - #[structopt(name = "stats")] + #[structopt(name = "stats", version = version::garage())] Stats(StatsOpt), /// Manage background workers - #[structopt(name = "worker")] + #[structopt(name = "worker", version = version::garage())] Worker(WorkerOpt), } #[derive(StructOpt, Debug)] pub enum NodeOperation { /// Print identifier (public key) of this Garage node - #[structopt(name = "id")] + #[structopt(name = "id", version = version::garage())] NodeId(NodeIdOpt), /// Connect to Garage node that is currently isolated from the system - #[structopt(name = "connect")] + #[structopt(name = "connect", version = version::garage())] Connect(ConnectNodeOpt), } @@ -79,23 +80,23 @@ pub struct ConnectNodeOpt { #[derive(StructOpt, Debug)] pub enum LayoutOperation { /// Assign role to Garage node - #[structopt(name = "assign")] + #[structopt(name = "assign", version = version::garage())] Assign(AssignRoleOpt), /// Remove role from Garage cluster node - #[structopt(name = "remove")] + #[structopt(name = "remove", version = version::garage())] Remove(RemoveRoleOpt), /// Show roles 
currently assigned to nodes and changes staged for commit - #[structopt(name = "show")] + #[structopt(name = "show", version = version::garage())] Show, /// Apply staged changes to cluster layout - #[structopt(name = "apply")] + #[structopt(name = "apply", version = version::garage())] Apply(ApplyLayoutOpt), /// Revert staged changes to cluster layout - #[structopt(name = "revert")] + #[structopt(name = "revert", version = version::garage())] Revert(RevertLayoutOpt), } @@ -150,43 +151,43 @@ pub struct RevertLayoutOpt { #[derive(Serialize, Deserialize, StructOpt, Debug)] pub enum BucketOperation { /// List buckets - #[structopt(name = "list")] + #[structopt(name = "list", version = version::garage())] List, /// Get bucket info - #[structopt(name = "info")] + #[structopt(name = "info", version = version::garage())] Info(BucketOpt), /// Create bucket - #[structopt(name = "create")] + #[structopt(name = "create", version = version::garage())] Create(BucketOpt), /// Delete bucket - #[structopt(name = "delete")] + #[structopt(name = "delete", version = version::garage())] Delete(DeleteBucketOpt), /// Alias bucket under new name - #[structopt(name = "alias")] + #[structopt(name = "alias", version = version::garage())] Alias(AliasBucketOpt), /// Remove bucket alias - #[structopt(name = "unalias")] + #[structopt(name = "unalias", version = version::garage())] Unalias(UnaliasBucketOpt), /// Allow key to read or write to bucket - #[structopt(name = "allow")] + #[structopt(name = "allow", version = version::garage())] Allow(PermBucketOpt), /// Deny key from reading or writing to bucket - #[structopt(name = "deny")] + #[structopt(name = "deny", version = version::garage())] Deny(PermBucketOpt), /// Expose as website or not - #[structopt(name = "website")] + #[structopt(name = "website", version = version::garage())] Website(WebsiteOpt), /// Set the quotas for this bucket - #[structopt(name = "set-quotas")] + #[structopt(name = "set-quotas", version = version::garage())] SetQuotas(SetQuotasOpt), } @@ -292,35 +293,35 @@ pub struct SetQuotasOpt { #[derive(Serialize, Deserialize, StructOpt, Debug)] pub enum KeyOperation { /// List keys - #[structopt(name = "list")] + #[structopt(name = "list", version = version::garage())] List, /// Get key info - #[structopt(name = "info")] + #[structopt(name = "info", version = version::garage())] Info(KeyOpt), /// Create new key - #[structopt(name = "new")] + #[structopt(name = "new", version = version::garage())] New(KeyNewOpt), /// Rename key - #[structopt(name = "rename")] + #[structopt(name = "rename", version = version::garage())] Rename(KeyRenameOpt), /// Delete key - #[structopt(name = "delete")] + #[structopt(name = "delete", version = version::garage())] Delete(KeyDeleteOpt), /// Set permission flags for key - #[structopt(name = "allow")] + #[structopt(name = "allow", version = version::garage())] Allow(KeyPermOpt), /// Unset permission flags for key - #[structopt(name = "deny")] + #[structopt(name = "deny", version = version::garage())] Deny(KeyPermOpt), /// Import key - #[structopt(name = "import")] + #[structopt(name = "import", version = version::garage())] Import(KeyImportOpt), } @@ -392,7 +393,7 @@ pub struct MigrateOpt { #[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] pub enum MigrateWhat { /// Migrate buckets and permissions from v0.5.0 - #[structopt(name = "buckets050")] + #[structopt(name = "buckets050", version = version::garage())] Buckets050, } @@ -413,19 +414,19 @@ pub struct RepairOpt { #[derive(Serialize, Deserialize, 
StructOpt, Debug, Eq, PartialEq, Clone)] pub enum RepairWhat { /// Only do a full sync of metadata tables - #[structopt(name = "tables")] + #[structopt(name = "tables", version = version::garage())] Tables, /// Only repair (resync/rebalance) the set of stored blocks - #[structopt(name = "blocks")] + #[structopt(name = "blocks", version = version::garage())] Blocks, /// Only redo the propagation of object deletions to the version table (slow) - #[structopt(name = "versions")] + #[structopt(name = "versions", version = version::garage())] Versions, /// Only redo the propagation of version deletions to the block ref table (extremely slow) - #[structopt(name = "block_refs")] + #[structopt(name = "block_refs", version = version::garage())] BlockRefs, /// Verify integrity of all blocks on disc (extremely slow, i/o intensive) - #[structopt(name = "scrub")] + #[structopt(name = "scrub", version = version::garage())] Scrub { #[structopt(subcommand)] cmd: ScrubCmd, @@ -435,19 +436,19 @@ pub enum RepairWhat { #[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] pub enum ScrubCmd { /// Start scrub - #[structopt(name = "start")] + #[structopt(name = "start", version = version::garage())] Start, /// Pause scrub (it will resume automatically after 24 hours) - #[structopt(name = "pause")] + #[structopt(name = "pause", version = version::garage())] Pause, /// Resume paused scrub - #[structopt(name = "resume")] + #[structopt(name = "resume", version = version::garage())] Resume, /// Cancel scrub in progress - #[structopt(name = "cancel")] + #[structopt(name = "cancel", version = version::garage())] Cancel, /// Set tranquility level for in-progress and future scrubs - #[structopt(name = "set-tranquility")] + #[structopt(name = "set-tranquility", version = version::garage())] SetTranquility { #[structopt()] tranquility: u32, @@ -468,10 +469,10 @@ pub struct OfflineRepairOpt { pub enum OfflineRepairWhat { /// Repair K2V item counters #[cfg(feature = "k2v")] - #[structopt(name = "k2v_item_counters")] + #[structopt(name = "k2v_item_counters", version = version::garage())] K2VItemCounters, /// Repair object counters - #[structopt(name = "object_counters")] + #[structopt(name = "object_counters", version = version::garage())] ObjectCounters, } @@ -495,7 +496,7 @@ pub struct WorkerOpt { #[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] pub enum WorkerCmd { /// List all workers on Garage node - #[structopt(name = "list")] + #[structopt(name = "list", version = version::garage())] List { #[structopt(flatten)] opt: WorkerListOpt, diff --git a/src/garage/main.rs b/src/garage/main.rs index 3fa5c3c0..89888884 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -22,6 +22,7 @@ use garage_util::error::*; use garage_rpc::system::*; use garage_rpc::*; +use garage_util::version; use garage_model::helper::error::Error as HelperError; @@ -29,7 +30,7 @@ use admin::*; use cli::*; #[derive(StructOpt, Debug)] -#[structopt(name = "garage")] +#[structopt(name = "garage", version = version::garage(), about = "S3-compatible object store for self-hosted geo-distributed deployments")] struct Opt { /// Host to connect to for admin operations, in the format: /// @: diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index 73328993..80a1975c 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -19,7 +19,6 @@ garage_util = { version = "0.7.0", path = "../util" } arc-swap = "1.0" bytes = "1.0" gethostname = "0.2" -git-version = "0.3.4" hex = "0.4" tracing = "0.1.30" rand = "0.8" diff 
--git a/src/rpc/system.rs b/src/rpc/system.rs index f9f2970b..fbfbbf56 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -27,6 +27,7 @@ use garage_util::data::*; use garage_util::error::*; use garage_util::persister::Persister; use garage_util::time::*; +use garage_util::version; use crate::consul::*; #[cfg(feature = "kubernetes-discovery")] @@ -320,11 +321,7 @@ impl System { // also available through RPC) ---- pub fn garage_version(&self) -> &'static str { - option_env!("GIT_VERSION").unwrap_or(git_version::git_version!( - prefix = "git:", - cargo_prefix = "cargo:", - fallback = "unknown" - )) + version::garage() } pub fn get_known_nodes(&self) -> Vec { diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index 57c70ffb..783fb3fc 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -24,6 +24,7 @@ hex = "0.4" tracing = "0.1.30" rand = "0.8" sha2 = "0.9" +git-version = "0.3.4" chrono = "0.4" rmp-serde = "0.15" @@ -43,5 +44,6 @@ hyper = "0.14" opentelemetry = { version = "0.17", features = [ "rt-tokio", "metrics", "trace" ] } + [features] k2v = [] diff --git a/src/util/lib.rs b/src/util/lib.rs index fce151af..47c85c3a 100644 --- a/src/util/lib.rs +++ b/src/util/lib.rs @@ -14,3 +14,4 @@ pub mod persister; pub mod time; pub mod token_bucket; pub mod tranquilizer; +pub mod version; diff --git a/src/util/version.rs b/src/util/version.rs new file mode 100644 index 00000000..8882d035 --- /dev/null +++ b/src/util/version.rs @@ -0,0 +1,7 @@ +pub fn garage() -> &'static str { + option_env!("GIT_VERSION").unwrap_or(git_version::git_version!( + prefix = "git:", + cargo_prefix = "cargo:", + fallback = "unknown" + )) +} -- cgit v1.2.3 From 322dafc761295df45c081183c5fc059a750a3249 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 29 Aug 2022 17:32:45 +0200 Subject: Try to fix clippy --- src/rpc/rpc_helper.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index ddabd636..216fffd4 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -404,7 +404,7 @@ impl RpcHelper { let mut nodes = nodes .iter() .map(|to| { - let peer_zone = match ring.layout.node_role(&to) { + let peer_zone = match ring.layout.node_role(to) { Some(pc) => &pc.zone, None => "", }; -- cgit v1.2.3 From dd5304f6fc2b4af3556549a3b587f588407dfa71 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 31 Aug 2022 14:24:41 +0200 Subject: Replace logging crate pretty_env_logger by tracing_subscriber::fmt --- src/garage/Cargo.toml | 2 +- src/garage/main.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 2cb8ec46..6eb4c5d2 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -35,7 +35,7 @@ bytesize = "1.1" timeago = "0.3" hex = "0.4" tracing = { version = "0.1.30", features = ["log-always"] } -pretty_env_logger = "0.4" +tracing-subscriber = "0.3" rand = "0.8" async-trait = "0.1.7" sodiumoxide = { version = "0.2.5-0", package = "kuska-sodiumoxide" } diff --git a/src/garage/main.rs b/src/garage/main.rs index 89888884..65abfd48 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -59,7 +59,7 @@ async fn main() { if std::env::var("RUST_LOG").is_err() { std::env::set_var("RUST_LOG", "netapp=info,garage=info") } - pretty_env_logger::init(); + tracing_subscriber::fmt::init(); sodiumoxide::init().expect("Unable to init sodiumoxide"); // Abort on panic (same behavior as in Go) -- cgit v1.2.3 From 
44cd98d2e4eb981f29c3124c7ab3ddf55ccb3848 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 31 Aug 2022 14:28:17 +0200 Subject: Tracing-subscriber: write to stderr --- src/garage/main.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/garage/main.rs b/src/garage/main.rs index 65abfd48..cc441727 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -59,7 +59,9 @@ async fn main() { if std::env::var("RUST_LOG").is_err() { std::env::set_var("RUST_LOG", "netapp=info,garage=info") } - tracing_subscriber::fmt::init(); + tracing_subscriber::fmt() + .with_writer(std::io::stderr) + .init(); sodiumoxide::init().expect("Unable to init sodiumoxide"); // Abort on panic (same behavior as in Go) -- cgit v1.2.3 From efbca67ce43891f4cfe696bbd182f6726e8fdc73 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 31 Aug 2022 14:39:12 +0200 Subject: Add env filter to tracing subscriber --- src/garage/Cargo.toml | 2 +- src/garage/main.rs | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 6eb4c5d2..4de377aa 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -35,7 +35,7 @@ bytesize = "1.1" timeago = "0.3" hex = "0.4" tracing = { version = "0.1.30", features = ["log-always"] } -tracing-subscriber = "0.3" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } rand = "0.8" async-trait = "0.1.7" sodiumoxide = { version = "0.2.5-0", package = "kuska-sodiumoxide" } diff --git a/src/garage/main.rs b/src/garage/main.rs index cc441727..f6e694f3 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -61,6 +61,7 @@ async fn main() { } tracing_subscriber::fmt() .with_writer(std::io::stderr) + .with_env_filter(tracing_subscriber::filter::EnvFilter::from_default_env()) .init(); sodiumoxide::init().expect("Unable to init sodiumoxide"); -- cgit v1.2.3 From 70231d68b27054c2185b73b5ceee1c445baaaa2d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 31 Aug 2022 19:44:27 +0200 Subject: Fix bytes_read counter --- src/block/manager.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index 80c52510..b8fe4c74 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -449,8 +449,6 @@ impl BlockManager { let (header, data) = block.into_parts(); - self.metrics.bytes_read.add(data.len() as u64); - Resp::new(Ok(BlockRpc::PutBlock { hash: *hash, header, @@ -460,9 +458,16 @@ impl BlockManager { /// Read block from disk, verifying it's integrity pub(crate) async fn read_block(&self, hash: &Hash) -> Result { - self.read_block_internal(hash) + let data = self + .read_block_internal(hash) .bound_record_duration(&self.metrics.block_read_duration) - .await + .await?; + + self.metrics + .bytes_read + .add(data.inner_buffer().len() as u64); + + Ok(data) } async fn read_block_internal(&self, hash: &Hash) -> Result { -- cgit v1.2.3 From bc977f9a7a7a5bd87ccf5fe96d64b397591f8ba0 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 1 Sep 2022 12:58:20 +0200 Subject: Update to Netapp with OrderTag support and exploit OrderTags --- src/api/s3/copy.rs | 10 ++++++++-- src/api/s3/get.rs | 21 ++++++++++++++------ src/block/manager.rs | 55 ++++++++++++++++++++++++++++++++++++--------------- src/rpc/rpc_helper.rs | 2 +- 4 files changed, 63 insertions(+), 25 deletions(-) (limited to 'src') diff --git a/src/api/s3/copy.rs b/src/api/s3/copy.rs index b54cbd23..10cf5935 100644 --- 
a/src/api/s3/copy.rs +++ b/src/api/s3/copy.rs @@ -9,6 +9,7 @@ use bytes::Bytes; use hyper::{Body, Request, Response}; use serde::Serialize; +use garage_rpc::rpc_helper::OrderTag; use garage_table::*; use garage_util::data::*; use garage_util::time::*; @@ -306,11 +307,16 @@ pub async fn handle_upload_part_copy( // if and only if the block returned is a block that already existed // in the Garage data store (thus we don't need to save it again). let garage2 = garage.clone(); + let order_stream = OrderTag::stream(); let source_blocks = stream::iter(blocks_to_copy) - .flat_map(|(block_hash, range_to_copy)| { + .enumerate() + .flat_map(|(i, (block_hash, range_to_copy))| { let garage3 = garage2.clone(); stream::once(async move { - let data = garage3.block_manager.rpc_get_block(&block_hash).await?; + let data = garage3 + .block_manager + .rpc_get_block(&block_hash, Some(order_stream.order(i as u64))) + .await?; match range_to_copy { Some(r) => Ok((data.slice(r), None)), None => Ok((data, Some(block_hash))), diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index c7621ade..dfc284fe 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -10,6 +10,7 @@ use http::header::{ use hyper::body::Bytes; use hyper::{Body, Request, Response, StatusCode}; +use garage_rpc::rpc_helper::OrderTag; use garage_table::EmptyKey; use garage_util::data::*; @@ -242,9 +243,11 @@ pub async fn handle_get( Ok(resp_builder.body(body)?) } ObjectVersionData::FirstBlock(_, first_block_hash) => { + let order_stream = OrderTag::stream(); + let read_first_block = garage .block_manager - .rpc_get_block_streaming(first_block_hash); + .rpc_get_block_streaming(first_block_hash, Some(order_stream.order(0))); let get_next_blocks = garage.version_table.get(&last_v.uuid, &EmptyKey); let (first_block_stream, version) = @@ -260,7 +263,8 @@ pub async fn handle_get( blocks[0].1 = Some(first_block_stream); let body_stream = futures::stream::iter(blocks) - .map(move |(hash, stream_opt)| { + .enumerate() + .map(move |(i, (hash, stream_opt))| { let garage = garage.clone(); async move { if let Some(stream) = stream_opt { @@ -268,7 +272,7 @@ pub async fn handle_get( } else { garage .block_manager - .rpc_get_block_streaming(&hash) + .rpc_get_block_streaming(&hash, Some(order_stream.order(i as u64))) .await .unwrap_or_else(|_| { Box::pin(futures::stream::once(async move { @@ -281,7 +285,7 @@ pub async fn handle_get( } } }) - .buffered(3) + .buffered(2) .flatten(); let body = hyper::body::Body::wrap_stream(body_stream); @@ -445,11 +449,16 @@ fn body_from_blocks_range( true_offset += b.size; } + let order_stream = OrderTag::stream(); let body_stream = futures::stream::iter(blocks) - .map(move |(block, true_offset)| { + .enumerate() + .map(move |(i, (block, true_offset))| { let garage = garage.clone(); async move { - let data = garage.block_manager.rpc_get_block(&block.hash).await?; + let data = garage + .block_manager + .rpc_get_block(&block.hash, Some(order_stream.order(i as u64))) + .await?; let start_in_block = if true_offset > begin { 0 } else { diff --git a/src/block/manager.rs b/src/block/manager.rs index b8fe4c74..b9f6fc0f 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -33,6 +33,7 @@ use garage_util::metrics::RecordDuration; use garage_util::time::*; use garage_util::tranquilizer::Tranquilizer; +use garage_rpc::rpc_helper::OrderTag; use garage_rpc::system::System; use garage_rpc::*; @@ -70,7 +71,7 @@ pub(crate) const BLOCK_GC_DELAY: Duration = Duration::from_secs(600); pub enum BlockRpc { Ok, /// Message to ask for a block of 
data, by hash - GetBlock(Hash), + GetBlock(Hash, Option), /// Message to send a block of data, either because requested, of for first delivery of new /// block PutBlock { @@ -183,15 +184,18 @@ impl BlockManager { async fn rpc_get_raw_block_streaming( &self, hash: &Hash, + order_tag: Option, ) -> Result<(DataBlockHeader, ByteStream), Error> { let who = self.replication.read_nodes(hash); //let who = self.system.rpc.request_order(&who); for node in who.iter() { let node_id = NodeID::from(*node); - let rpc = - self.endpoint - .call_streaming(&node_id, BlockRpc::GetBlock(*hash), PRIO_NORMAL); + let rpc = self.endpoint.call_streaming( + &node_id, + BlockRpc::GetBlock(*hash, order_tag), + PRIO_NORMAL, + ); tokio::select! { res = rpc => { let res = match res { @@ -224,15 +228,21 @@ impl BlockManager { /// Ask nodes that might have a (possibly compressed) block for it /// Return its entire body - async fn rpc_get_raw_block(&self, hash: &Hash) -> Result { + async fn rpc_get_raw_block( + &self, + hash: &Hash, + order_tag: Option, + ) -> Result { let who = self.replication.read_nodes(hash); //let who = self.system.rpc.request_order(&who); for node in who.iter() { let node_id = NodeID::from(*node); - let rpc = - self.endpoint - .call_streaming(&node_id, BlockRpc::GetBlock(*hash), PRIO_NORMAL); + let rpc = self.endpoint.call_streaming( + &node_id, + BlockRpc::GetBlock(*hash, order_tag), + PRIO_NORMAL, + ); tokio::select! { res = rpc => { let res = match res { @@ -275,11 +285,12 @@ impl BlockManager { pub async fn rpc_get_block_streaming( &self, hash: &Hash, + order_tag: Option, ) -> Result< Pin> + Send + Sync + 'static>>, Error, > { - let (header, stream) = self.rpc_get_raw_block_streaming(hash).await?; + let (header, stream) = self.rpc_get_raw_block_streaming(hash, order_tag).await?; match header { DataBlockHeader::Plain => Ok(Box::pin(stream.map_err(|_| { std::io::Error::new(std::io::ErrorKind::Other, "netapp stream error") @@ -295,8 +306,14 @@ impl BlockManager { } /// Ask nodes that might have a block for it - pub async fn rpc_get_block(&self, hash: &Hash) -> Result { - self.rpc_get_raw_block(hash).await?.verify_get(*hash) + pub async fn rpc_get_block( + &self, + hash: &Hash, + order_tag: Option, + ) -> Result { + self.rpc_get_raw_block(hash, order_tag) + .await? 
+ .verify_get(*hash) } /// Send block to nodes that should have it @@ -441,7 +458,7 @@ impl BlockManager { Ok(()) } - async fn handle_get_block(&self, hash: &Hash) -> Resp { + async fn handle_get_block(&self, hash: &Hash, order_tag: Option) -> Resp { let block = match self.read_block(hash).await { Ok(data) => data, Err(e) => return Resp::new(Err(e)), @@ -449,11 +466,17 @@ impl BlockManager { let (header, data) = block.into_parts(); - Resp::new(Ok(BlockRpc::PutBlock { + let resp = Resp::new(Ok(BlockRpc::PutBlock { hash: *hash, header, })) - .with_stream_from_buffer(data) + .with_stream_from_buffer(data); + + if let Some(order_tag) = order_tag { + resp.with_order_tag(order_tag) + } else { + resp + } } /// Read block from disk, verifying it's integrity @@ -841,7 +864,7 @@ impl BlockManager { hash ); - let block_data = self.rpc_get_raw_block(hash).await?; + let block_data = self.rpc_get_raw_block(hash, None).await?; self.metrics.resync_recv_counter.add(1); @@ -861,7 +884,7 @@ impl StreamingEndpointHandler for BlockManager { .await .map(|_| BlockRpc::Ok), ), - BlockRpc::GetBlock(h) => self.handle_get_block(h).await, + BlockRpc::GetBlock(h, order_tag) => self.handle_get_block(h, *order_tag).await, BlockRpc::NeedBlockQuery(h) => { Resp::new(self.need_block(h).await.map(BlockRpc::NeedBlockReply)) } diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 216fffd4..6c79c502 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -18,7 +18,7 @@ use opentelemetry::{ pub use netapp::endpoint::{Endpoint, EndpointHandler, StreamingEndpointHandler}; use netapp::message::IntoReq; pub use netapp::message::{ - Message as Rpc, Req, RequestPriority, Resp, PRIO_BACKGROUND, PRIO_HIGH, PRIO_NORMAL, + Message as Rpc, OrderTag, Req, RequestPriority, Resp, PRIO_BACKGROUND, PRIO_HIGH, PRIO_NORMAL, }; use netapp::peering::fullmesh::FullMeshPeeringStrategy; pub use netapp::{self, NetApp, NodeID}; -- cgit v1.2.3 From df094bd8075332bb765b8b44c9b19cf2485e9ca8 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 1 Sep 2022 16:30:44 +0200 Subject: Less strict timeouts --- src/block/manager.rs | 8 ++++++-- src/rpc/rpc_helper.rs | 2 +- src/rpc/system.rs | 6 +++--- src/table/gc.rs | 3 ++- src/table/sync.rs | 3 ++- src/table/table.rs | 2 +- 6 files changed, 15 insertions(+), 9 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index b9f6fc0f..00438648 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -48,10 +48,14 @@ use crate::repair::*; pub const INLINE_THRESHOLD: usize = 3072; // Timeout for RPCs that read and write blocks to remote nodes -const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(30); +const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(60); // Timeout for RPCs that ask other nodes whether they need a copy // of a given block before we delete it locally -const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(5); +// The timeout here is relatively low because we don't want to block +// the entire resync loop when some nodes are not responding. +// Nothing will be deleted if the nodes don't answer the queries, +// we will just retry later. 
+const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(15); // The delay between the time where a resync operation fails // and the time when it is retried, with exponential backoff diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 6c79c502..e9575261 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -31,7 +31,7 @@ use garage_util::metrics::RecordDuration; use crate::metrics::RpcMetrics; use crate::ring::Ring; -const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10); +const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30); // Don't allow more than 100 concurrent outgoing RPCs. const MAX_CONCURRENT_REQUESTS: usize = 100; diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 5858660e..d7ef2140 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -38,7 +38,7 @@ use crate::rpc_helper::*; const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60); const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10); -const PING_TIMEOUT: Duration = Duration::from_secs(2); +const SYSTEM_RPC_TIMEOUT: Duration = Duration::from_secs(15); /// Version tag used for version check upon Netapp connection pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650007; // garage 0x0007 @@ -561,7 +561,7 @@ impl System { .broadcast( &self.system_endpoint, SystemRpc::AdvertiseStatus(local_status), - RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT), + RequestStrategy::with_priority(PRIO_HIGH).with_timeout(SYSTEM_RPC_TIMEOUT), ) .await; @@ -685,7 +685,7 @@ impl System { &self.system_endpoint, peer, SystemRpc::PullClusterLayout, - RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT), + RequestStrategy::with_priority(PRIO_HIGH).with_timeout(SYSTEM_RPC_TIMEOUT), ) .await; if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp { diff --git a/src/table/gc.rs b/src/table/gc.rs index 12218d97..6cae9701 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -25,7 +25,8 @@ use crate::replication::*; use crate::schema::*; const TABLE_GC_BATCH_SIZE: usize = 1024; -const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(30); +// Same timeout as NEED_BLOCK_QUERY_TIMEOUT in block manager +const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(15); // GC delay for table entries: 1 day (24 hours) // (the delay before the entry is added in the GC todo list diff --git a/src/table/sync.rs b/src/table/sync.rs index b3756a5e..62b88a58 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -24,7 +24,8 @@ use crate::merkle::*; use crate::replication::*; use crate::*; -const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30); +// Sync RPC can contain a lot of data, so have a 1min timeout +const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(60); // Do anti-entropy every 10 minutes const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60); diff --git a/src/table/table.rs b/src/table/table.rs index 3c211728..51f3837f 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -31,7 +31,7 @@ use crate::schema::*; use crate::sync::*; use crate::util::*; -pub const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10); +pub const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(30); pub struct Table { pub system: Arc, -- cgit v1.2.3 From 99b532b85bf35b5acf621c229fb991825f3d994c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 1 Sep 2022 16:35:43 +0200 Subject: Apply PRIO_SECONDARY to block data transfers --- src/block/manager.rs | 6 +++--- src/rpc/rpc_helper.rs | 2 +- 2 files changed, 4 insertions(+), 4 
deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index 00438648..a9def3b0 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -198,7 +198,7 @@ impl BlockManager { let rpc = self.endpoint.call_streaming( &node_id, BlockRpc::GetBlock(*hash, order_tag), - PRIO_NORMAL, + PRIO_NORMAL | PRIO_SECONDARY, ); tokio::select! { res = rpc => { @@ -245,7 +245,7 @@ impl BlockManager { let rpc = self.endpoint.call_streaming( &node_id, BlockRpc::GetBlock(*hash, order_tag), - PRIO_NORMAL, + PRIO_NORMAL | PRIO_SECONDARY, ); tokio::select! { res = rpc => { @@ -336,7 +336,7 @@ impl BlockManager { &self.endpoint, &who[..], put_block_rpc, - RequestStrategy::with_priority(PRIO_NORMAL) + RequestStrategy::with_priority(PRIO_NORMAL | PRIO_SECONDARY) .with_quorum(self.replication.write_quorum()) .with_timeout(BLOCK_RW_TIMEOUT), ) diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index e9575261..aa204c5e 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -18,7 +18,7 @@ use opentelemetry::{ pub use netapp::endpoint::{Endpoint, EndpointHandler, StreamingEndpointHandler}; use netapp::message::IntoReq; pub use netapp::message::{ - Message as Rpc, OrderTag, Req, RequestPriority, Resp, PRIO_BACKGROUND, PRIO_HIGH, PRIO_NORMAL, + Message as Rpc, OrderTag, Req, RequestPriority, Resp, PRIO_BACKGROUND, PRIO_HIGH, PRIO_NORMAL, PRIO_SECONDARY }; use netapp::peering::fullmesh::FullMeshPeeringStrategy; pub use netapp::{self, NetApp, NodeID}; -- cgit v1.2.3 From 1ef87ac4cb676113e86fc16a9eb27546d9a737bd Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 2 Sep 2022 13:38:29 +0200 Subject: cargo fmt --- src/rpc/rpc_helper.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index aa204c5e..19abb4c5 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -18,7 +18,8 @@ use opentelemetry::{ pub use netapp::endpoint::{Endpoint, EndpointHandler, StreamingEndpointHandler}; use netapp::message::IntoReq; pub use netapp::message::{ - Message as Rpc, OrderTag, Req, RequestPriority, Resp, PRIO_BACKGROUND, PRIO_HIGH, PRIO_NORMAL, PRIO_SECONDARY + Message as Rpc, OrderTag, Req, RequestPriority, Resp, PRIO_BACKGROUND, PRIO_HIGH, PRIO_NORMAL, + PRIO_SECONDARY, }; use netapp::peering::fullmesh::FullMeshPeeringStrategy; pub use netapp::{self, NetApp, NodeID}; -- cgit v1.2.3 From 13b5f28c7e8dec12b1db61735931b3830a3c893f Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 2 Sep 2022 13:46:42 +0200 Subject: Make use of BytesBuf from new Netapp --- src/api/s3/put.rs | 43 ++++++++++--------------------------------- 1 file changed, 10 insertions(+), 33 deletions(-) (limited to 'src') diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index dc0530df..97b8e4e3 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque}; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use futures::prelude::*; @@ -13,6 +13,7 @@ use opentelemetry::{ Context, }; +use garage_rpc::netapp::bytes_buf::BytesBuf; use garage_table::*; use garage_util::async_hash::*; use garage_util::data::*; @@ -108,7 +109,7 @@ pub(crate) async fn save_stream> + Unpin>( size, etag: data_md5sum_hex.clone(), }, - first_block, + first_block.to_vec(), )), }; @@ -136,7 +137,6 @@ pub(crate) async fn save_stream> + Unpin>( garage.version_table.insert(&version).await?; // Transfer data and verify checksum - let first_block = 
Bytes::from(first_block); let first_block_hash = async_blake2sum(first_block.clone()).await; let tx_result = (|| async { @@ -318,7 +318,6 @@ async fn read_and_put_blocks> + Unpin>( chunker.next(), )?; if let Some(block) = next_block { - let block = Bytes::from(block); let (_, _, block_hash) = futures::future::join3( md5hasher.update(block.clone()), sha256hasher.update(block.clone()), @@ -387,8 +386,7 @@ struct StreamChunker>> { stream: S, read_all: bool, block_size: usize, - buf: VecDeque, - buf_len: usize, + buf: BytesBuf, } impl> + Unpin> StreamChunker { @@ -397,45 +395,25 @@ impl> + Unpin> StreamChunker { stream, read_all: false, block_size, - buf: VecDeque::with_capacity(8), - buf_len: 0, + buf: BytesBuf::new(), } } - async fn next(&mut self) -> Result>, Error> { - while !self.read_all && self.buf_len < self.block_size { + async fn next(&mut self) -> Result, Error> { + while !self.read_all && self.buf.len() < self.block_size { if let Some(block) = self.stream.next().await { let bytes = block?; trace!("Body next: {} bytes", bytes.len()); - self.buf_len += bytes.len(); - self.buf.push_back(bytes); + self.buf.extend(bytes); } else { self.read_all = true; } } - if self.buf_len == 0 { + if self.buf.is_empty() { Ok(None) } else { - let mut slices = Vec::with_capacity(self.buf.len()); - let mut taken = 0; - while self.buf_len > 0 && taken < self.block_size { - let front = self.buf.pop_front().unwrap(); - if taken + front.len() <= self.block_size { - taken += front.len(); - self.buf_len -= front.len(); - slices.push(front); - } else { - let front_take = self.block_size - taken; - slices.push(front.slice(..front_take)); - self.buf.push_front(front.slice(front_take..)); - self.buf_len -= front_take; - break; - } - } - Ok(Some( - slices.iter().map(|x| &x[..]).collect::>().concat(), - )) + Ok(Some(self.buf.take_max(self.block_size))) } } } @@ -545,7 +523,6 @@ pub async fn handle_put_part( // Copy block to store let version = Version::new(version_uuid, bucket_id, key, false); - let first_block = Bytes::from(first_block); let first_block_hash = async_blake2sum(first_block.clone()).await; let (_, data_md5sum, data_sha256sum) = read_and_put_blocks( -- cgit v1.2.3 From 6226f5ceca7828d096890c3dbc5b9fbc3f7c4b14 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 2 Sep 2022 14:33:12 +0200 Subject: Update to netapp 0.4.5 - fixed ping --- src/rpc/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index 80a1975c..5757fe8d 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -47,7 +47,7 @@ opentelemetry = "0.17" #netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" } #netapp = { version = "0.4", path = "../../../netapp", features = ["telemetry"] } -netapp = { version = "0.4.4", features = ["telemetry"] } +netapp = { version = "0.4.5", features = ["telemetry"] } hyper = { version = "0.14", features = ["client", "http1", "runtime", "tcp"] } -- cgit v1.2.3 From 943d76c583f5740b1d922275a673233a27fe1693 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 2 Sep 2022 15:34:21 +0200 Subject: Ability to dynamically set resync tranquility --- src/block/manager.rs | 62 ++++++++++++++++++++++++++++++++++++++--------- src/block/repair.rs | 26 +++++++++++++++++--- src/garage/admin.rs | 19 +++++++++++++++ src/garage/cli/structs.rs | 16 ++++++++++++ src/model/garage.rs | 1 - src/util/config.rs | 7 ------ 6 files changed, 108 insertions(+), 23 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs 
b/src/block/manager.rs index 017ba9da..ef48107f 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -3,7 +3,7 @@ use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; -use arc_swap::ArcSwapOption; +use arc_swap::{ArcSwap, ArcSwapOption}; use async_trait::async_trait; use serde::{Deserialize, Serialize}; @@ -25,6 +25,7 @@ use garage_util::background::*; use garage_util::data::*; use garage_util::error::*; use garage_util::metrics::RecordDuration; +use garage_util::persister::Persister; use garage_util::time::*; use garage_util::tranquilizer::Tranquilizer; @@ -55,6 +56,10 @@ const RESYNC_RETRY_DELAY: Duration = Duration::from_secs(60); // The maximum retry delay is 60 seconds * 2^6 = 60 seconds << 6 = 64 minutes (~1 hour) const RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER: u64 = 6; +// Resync tranquility is initially set to 2, but can be changed in the CLI +// and the updated version is persisted over Garage restarts +const INITIAL_RESYNC_TRANQUILITY: u32 = 2; + // The delay between the moment when the reference counter // drops to zero, and the moment where we allow ourselves // to delete the block locally. @@ -90,7 +95,6 @@ pub struct BlockManager { pub data_dir: PathBuf, compression_level: Option, - background_tranquility: u32, mutation_lock: Mutex, @@ -100,6 +104,9 @@ pub struct BlockManager { resync_notify: Notify, resync_errors: CountedTree, + resync_persister: Persister, + resync_persisted: ArcSwap, + pub(crate) system: Arc, endpoint: Arc>, @@ -124,7 +131,6 @@ impl BlockManager { db: &db::Db, data_dir: PathBuf, compression_level: Option, - background_tranquility: u32, replication: TableShardedReplication, system: Arc, ) -> Arc { @@ -145,6 +151,14 @@ impl BlockManager { let resync_errors = CountedTree::new(resync_errors).expect("Could not count block_local_resync_errors"); + let resync_persister = Persister::new(&system.metadata_dir, "resync_cfg"); + let resync_persisted = match resync_persister.load() { + Ok(v) => v, + Err(_) => ResyncPersistedConfig { + tranquility: INITIAL_RESYNC_TRANQUILITY, + }, + }; + let endpoint = system .netapp .endpoint("garage_block/manager.rs/Rpc".to_string()); @@ -157,12 +171,13 @@ impl BlockManager { replication, data_dir, compression_level, - background_tranquility, mutation_lock: Mutex::new(manager_locked), rc, resync_queue, resync_notify: Notify::new(), resync_errors, + resync_persister, + resync_persisted: ArcSwap::new(Arc::new(resync_persisted)), system, endpoint, metrics, @@ -716,6 +731,23 @@ impl BlockManager { Ok(()) } + + async fn update_resync_persisted( + &self, + update: impl Fn(&mut ResyncPersistedConfig), + ) -> Result<(), Error> { + let mut cfg: ResyncPersistedConfig = *self.resync_persisted.load().as_ref(); + update(&mut cfg); + self.resync_persister.save_async(&cfg).await?; + self.resync_persisted.store(Arc::new(cfg)); + self.resync_notify.notify_one(); + Ok(()) + } + + pub async fn set_resync_tranquility(&self, tranquility: u32) -> Result<(), Error> { + self.update_resync_persisted(|cfg| cfg.tranquility = tranquility) + .await + } } #[async_trait] @@ -734,6 +766,11 @@ impl EndpointHandler for BlockManager { } } +#[derive(Serialize, Deserialize, Clone, Copy)] +struct ResyncPersistedConfig { + tranquility: u32, +} + struct ResyncWorker { manager: Arc, tranquilizer: Tranquilizer, @@ -758,19 +795,22 @@ impl Worker for ResyncWorker { fn info(&self) -> Option { let mut ret = vec![]; + ret.push(format!( + "tranquility = {}", + self.manager.resync_persisted.load().tranquility + )); + let qlen = 
self.manager.resync_queue_len().unwrap_or(0); - let elen = self.manager.resync_errors_len().unwrap_or(0); if qlen > 0 { ret.push(format!("{} blocks in queue", qlen)); } + + let elen = self.manager.resync_errors_len().unwrap_or(0); if elen > 0 { ret.push(format!("{} blocks in error state", elen)); } - if !ret.is_empty() { - Some(ret.join(", ")) - } else { - None - } + + Some(ret.join(", ")) } async fn work(&mut self, _must_exit: &mut watch::Receiver) -> Result { @@ -778,7 +818,7 @@ impl Worker for ResyncWorker { match self.manager.resync_iter().await { Ok(ResyncIterResult::BusyDidSomething) => Ok(self .tranquilizer - .tranquilize_worker(self.manager.background_tranquility)), + .tranquilize_worker(self.manager.resync_persisted.load().tranquility)), Ok(ResyncIterResult::BusyDidNothing) => Ok(WorkerState::Busy), Ok(ResyncIterResult::IdleFor(delay)) => { self.next_delay = delay; diff --git a/src/block/repair.rs b/src/block/repair.rs index 07ff6772..18e1de95 100644 --- a/src/block/repair.rs +++ b/src/block/repair.rs @@ -19,7 +19,17 @@ use garage_util::tranquilizer::Tranquilizer; use crate::manager::*; -const SCRUB_INTERVAL: Duration = Duration::from_secs(3600 * 24 * 30); // full scrub every 30 days +// Full scrub every 30 days +const SCRUB_INTERVAL: Duration = Duration::from_secs(3600 * 24 * 30); +// Scrub tranquility is initially set to 4, but can be changed in the CLI +// and the updated version is persisted over Garage restarts +const INITIAL_SCRUB_TRANQUILITY: u32 = 4; + +// ---- ---- ---- +// FIRST KIND OF REPAIR: FINDING MISSING BLOCKS/USELESS BLOCKS +// This is a one-shot repair operation that can be launched, +// checks everything, and then exits. +// ---- ---- ---- pub struct RepairWorker { manager: Arc, @@ -128,7 +138,13 @@ impl Worker for RepairWorker { } } -// ---- +// ---- ---- ---- +// SECOND KIND OF REPAIR: SCRUBBING THE DATASTORE +// This is significantly more complex than the process above, +// as it is a continuously-running task that triggers automatically +// every SCRUB_INTERVAL, but can also be triggered manually +// and whose parameter (esp. speed) can be controlled at runtime. 
+// ---- ---- ---- pub struct ScrubWorker { manager: Arc, @@ -176,7 +192,7 @@ impl ScrubWorker { Ok(v) => v, Err(_) => ScrubWorkerPersisted { time_last_complete_scrub: 0, - tranquility: 4, + tranquility: INITIAL_SCRUB_TRANQUILITY, corruptions_detected: 0, }, }; @@ -343,7 +359,9 @@ impl Worker for ScrubWorker { } } -// ---- +// ---- ---- ---- +// UTILITY FOR ENUMERATING THE BLOCK STORE +// ---- ---- ---- struct BlockStoreIterator { path: Vec, diff --git a/src/garage/admin.rs b/src/garage/admin.rs index 71ee608c..1d80889c 100644 --- a/src/garage/admin.rs +++ b/src/garage/admin.rs @@ -15,6 +15,8 @@ use garage_table::*; use garage_rpc::*; +use garage_block::repair::ScrubWorkerCommand; + use garage_model::bucket_alias_table::*; use garage_model::bucket_table::*; use garage_model::garage::Garage; @@ -836,6 +838,23 @@ impl AdminRpcHandler { let workers = self.garage.background.get_worker_info(); Ok(AdminRpc::WorkerList(workers, opt)) } + WorkerCmd::Set { opt } => match opt { + WorkerSetCmd::ScrubTranquility { tranquility } => { + let scrub_command = ScrubWorkerCommand::SetTranquility(tranquility); + self.garage + .block_manager + .send_scrub_command(scrub_command) + .await; + Ok(AdminRpc::Ok("Scrub tranquility updated".into())) + } + WorkerSetCmd::ResyncTranquility { tranquility } => { + self.garage + .block_manager + .set_resync_tranquility(tranquility) + .await?; + Ok(AdminRpc::Ok("Resync tranquility updated".into())) + } + }, } } } diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 9274f80f..1fba934f 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -501,6 +501,12 @@ pub enum WorkerCmd { #[structopt(flatten)] opt: WorkerListOpt, }, + /// Set worker parameter + #[structopt(name = "set", version = version::garage())] + Set { + #[structopt(subcommand)] + opt: WorkerSetCmd, + }, } #[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone, Copy)] @@ -512,3 +518,13 @@ pub struct WorkerListOpt { #[structopt(short = "e", long = "errors")] pub errors: bool, } + +#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] +pub enum WorkerSetCmd { + /// Set tranquility of scrub operations + #[structopt(name = "scrub-tranquility", version = version::garage())] + ScrubTranquility { tranquility: u32 }, + /// Set tranquility of resync operations + #[structopt(name = "resync-tranquility", version = version::garage())] + ResyncTranquility { tranquility: u32 }, +} diff --git a/src/model/garage.rs b/src/model/garage.rs index 15769a17..4dd24582 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -159,7 +159,6 @@ impl Garage { &db, config.data_dir.clone(), config.compression_level, - config.block_manager_background_tranquility, data_rep_param, system.clone(), ); diff --git a/src/util/config.rs b/src/util/config.rs index e8ef4fdd..a2bb8fb3 100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -23,10 +23,6 @@ pub struct Config { #[serde(default = "default_block_size")] pub block_size: usize, - /// Size of data blocks to save to disk - #[serde(default = "default_block_manager_background_tranquility")] - pub block_manager_background_tranquility: u32, - /// Replication mode. 
Supported values: /// - none, 1 -> no replication /// - 2 -> 2-way replication @@ -147,9 +143,6 @@ fn default_sled_flush_every_ms() -> u64 { fn default_block_size() -> usize { 1048576 } -fn default_block_manager_background_tranquility() -> u32 { - 2 -} /// Read and parse configuration pub fn read_config(config_file: PathBuf) -> Result { -- cgit v1.2.3 From 47be652a1fe08a8e6dab6aa2c4a41d8eb119f392 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 2 Sep 2022 16:47:15 +0200 Subject: block manager: refactor: split resync into separate file --- src/block/lib.rs | 1 + src/block/manager.rs | 598 +++++---------------------------------------------- src/block/repair.rs | 8 +- src/block/resync.rs | 536 +++++++++++++++++++++++++++++++++++++++++++++ src/garage/admin.rs | 7 +- 5 files changed, 595 insertions(+), 555 deletions(-) create mode 100644 src/block/resync.rs (limited to 'src') diff --git a/src/block/lib.rs b/src/block/lib.rs index ebdb95d8..d2814f77 100644 --- a/src/block/lib.rs +++ b/src/block/lib.rs @@ -3,6 +3,7 @@ extern crate tracing; pub mod manager; pub mod repair; +pub mod resync; mod block; mod metrics; diff --git a/src/block/manager.rs b/src/block/manager.rs index ef48107f..efb5349c 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -1,33 +1,19 @@ -use std::convert::TryInto; use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; -use arc_swap::{ArcSwap, ArcSwapOption}; use async_trait::async_trait; use serde::{Deserialize, Serialize}; -use futures::future::*; use tokio::fs; use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tokio::select; -use tokio::sync::{mpsc, watch, Mutex, Notify}; - -use opentelemetry::{ - trace::{FutureExt as OtelFutureExt, TraceContextExt, Tracer}, - Context, KeyValue, -}; +use tokio::sync::{mpsc, Mutex}; use garage_db as db; -use garage_db::counted_tree_hack::CountedTree; -use garage_util::background::*; use garage_util::data::*; use garage_util::error::*; use garage_util::metrics::RecordDuration; -use garage_util::persister::Persister; -use garage_util::time::*; -use garage_util::tranquilizer::Tranquilizer; use garage_rpc::system::System; use garage_rpc::*; @@ -38,27 +24,13 @@ use crate::block::*; use crate::metrics::*; use crate::rc::*; use crate::repair::*; +use crate::resync::*; /// Size under which data will be stored inlined in database instead of as files pub const INLINE_THRESHOLD: usize = 3072; // Timeout for RPCs that read and write blocks to remote nodes -const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(30); -// Timeout for RPCs that ask other nodes whether they need a copy -// of a given block before we delete it locally -const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(5); - -// The delay between the time where a resync operation fails -// and the time when it is retried, with exponential backoff -// (multiplied by 2, 4, 8, 16, etc. for every consecutive failure). 
-const RESYNC_RETRY_DELAY: Duration = Duration::from_secs(60); -// The minimum retry delay is 60 seconds = 1 minute -// The maximum retry delay is 60 seconds * 2^6 = 60 seconds << 6 = 64 minutes (~1 hour) -const RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER: u64 = 6; - -// Resync tranquility is initially set to 2, but can be changed in the CLI -// and the updated version is persisted over Garage restarts -const INITIAL_RESYNC_TRANQUILITY: u32 = 2; +pub(crate) const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(30); // The delay between the moment when the reference counter // drops to zero, and the moment where we allow ourselves @@ -96,35 +68,23 @@ pub struct BlockManager { compression_level: Option, - mutation_lock: Mutex, + pub(crate) mutation_lock: Mutex, pub(crate) rc: BlockRc, - - resync_queue: CountedTree, - resync_notify: Notify, - resync_errors: CountedTree, - - resync_persister: Persister, - resync_persisted: ArcSwap, + pub resync: BlockResyncManager, pub(crate) system: Arc, - endpoint: Arc>, + pub(crate) endpoint: Arc>, - metrics: BlockManagerMetrics, + pub(crate) metrics: BlockManagerMetrics, - tx_scrub_command: ArcSwapOption>, + tx_scrub_command: mpsc::Sender, } // This custom struct contains functions that must only be ran // when the lock is held. We ensure that it is the case by storing // it INSIDE a Mutex. -struct BlockManagerLocked(); - -enum ResyncIterResult { - BusyDidSomething, - BusyDidNothing, - IdleFor(Duration), -} +pub(crate) struct BlockManagerLocked(); impl BlockManager { pub fn new( @@ -139,25 +99,7 @@ impl BlockManager { .expect("Unable to open block_local_rc tree"); let rc = BlockRc::new(rc); - let resync_queue = db - .open_tree("block_local_resync_queue") - .expect("Unable to open block_local_resync_queue tree"); - let resync_queue = - CountedTree::new(resync_queue).expect("Could not count block_local_resync_queue"); - - let resync_errors = db - .open_tree("block_local_resync_errors") - .expect("Unable to open block_local_resync_errors tree"); - let resync_errors = - CountedTree::new(resync_errors).expect("Could not count block_local_resync_errors"); - - let resync_persister = Persister::new(&system.metadata_dir, "resync_cfg"); - let resync_persisted = match resync_persister.load() { - Ok(v) => v, - Err(_) => ResyncPersistedConfig { - tranquility: INITIAL_RESYNC_TRANQUILITY, - }, - }; + let resync = BlockResyncManager::new(db, &system); let endpoint = system .netapp @@ -165,7 +107,9 @@ impl BlockManager { let manager_locked = BlockManagerLocked(); - let metrics = BlockManagerMetrics::new(resync_queue.clone(), resync_errors.clone()); + let metrics = BlockManagerMetrics::new(resync.queue.clone(), resync.errors.clone()); + + let (scrub_tx, scrub_rx) = mpsc::channel(1); let block_manager = Arc::new(Self { replication, @@ -173,25 +117,31 @@ impl BlockManager { compression_level, mutation_lock: Mutex::new(manager_locked), rc, - resync_queue, - resync_notify: Notify::new(), - resync_errors, - resync_persister, - resync_persisted: ArcSwap::new(Arc::new(resync_persisted)), + resync, system, endpoint, metrics, - tx_scrub_command: ArcSwapOption::new(None), + tx_scrub_command: scrub_tx, }); block_manager.endpoint.set_handler(block_manager.clone()); - block_manager.clone().spawn_background_workers(); + // Spawn one resync worker + let background = block_manager.system.background.clone(); + let worker = ResyncWorker::new(block_manager.clone()); + tokio::spawn(async move { + tokio::time::sleep(Duration::from_secs(10)).await; + background.spawn_worker(worker); + }); + + // Spawn 
scrub worker + let scrub_worker = ScrubWorker::new(block_manager.clone(), scrub_rx); + block_manager.system.background.spawn_worker(scrub_worker); block_manager } /// Ask nodes that might have a (possibly compressed) block for it - async fn rpc_get_raw_block(&self, hash: &Hash) -> Result { + pub(crate) async fn rpc_get_raw_block(&self, hash: &Hash) -> Result { let who = self.replication.read_nodes(hash); let resps = self .system @@ -243,20 +193,6 @@ impl BlockManager { Ok(()) } - /// Get lenght of resync queue - pub fn resync_queue_len(&self) -> Result { - // This currently can't return an error because the CountedTree hack - // doesn't error on .len(), but this will change when we remove the hack - // (hopefully someday!) - Ok(self.resync_queue.len()) - } - - /// Get number of blocks that have an error - pub fn resync_errors_len(&self) -> Result { - // (see resync_queue_len comment) - Ok(self.resync_errors.len()) - } - /// Get number of items in the refcount table pub fn rc_len(&self) -> Result { Ok(self.rc.rc.len()?) @@ -264,13 +200,7 @@ impl BlockManager { /// Send command to start/stop/manager scrub worker pub async fn send_scrub_command(&self, cmd: ScrubWorkerCommand) { - let _ = self - .tx_scrub_command - .load() - .as_ref() - .unwrap() - .send(cmd) - .await; + let _ = self.tx_scrub_command.send(cmd).await; } //// ----- Managing the reference counter ---- @@ -291,7 +221,7 @@ impl BlockManager { // we will fecth it from someone. let this = self.clone(); tokio::spawn(async move { - if let Err(e) = this.put_to_resync(&hash, 2 * BLOCK_RW_TIMEOUT) { + if let Err(e) = this.resync.put_to_resync(&hash, 2 * BLOCK_RW_TIMEOUT) { error!("Block {:?} could not be put in resync queue: {}.", hash, e); } }); @@ -313,7 +243,9 @@ impl BlockManager { // after that delay has passed. let this = self.clone(); tokio::spawn(async move { - if let Err(e) = this.put_to_resync(&hash, BLOCK_GC_DELAY + Duration::from_secs(10)) + if let Err(e) = this + .resync + .put_to_resync(&hash, BLOCK_GC_DELAY + Duration::from_secs(10)) { error!("Block {:?} could not be put in resync queue: {}.", hash, e); } @@ -325,7 +257,11 @@ impl BlockManager { // ---- Reading and writing blocks locally ---- /// Write a block to disk - async fn write_block(&self, hash: &Hash, data: &DataBlock) -> Result { + pub(crate) async fn write_block( + &self, + hash: &Hash, + data: &DataBlock, + ) -> Result { let write_size = data.inner_buffer().len() as u64; let res = self @@ -361,7 +297,7 @@ impl BlockManager { Ok(c) => c, Err(e) => { // Not found but maybe we should have had it ?? - self.put_to_resync(hash, 2 * BLOCK_RW_TIMEOUT)?; + self.resync.put_to_resync(hash, 2 * BLOCK_RW_TIMEOUT)?; return Err(Into::into(e)); } }; @@ -388,7 +324,7 @@ impl BlockManager { .await .move_block_to_corrupted(hash, self) .await?; - self.put_to_resync(hash, Duration::from_millis(0))?; + self.resync.put_to_resync(hash, Duration::from_millis(0))?; return Err(Error::CorruptData(*hash)); } @@ -432,322 +368,6 @@ impl BlockManager { path.set_extension(""); fs::metadata(&path).await.map(|_| false).map_err(Into::into) } - - // ---- Resync loop ---- - - // This part manages a queue of blocks that need to be - // "resynchronized", i.e. that need to have a check that - // they are at present if we need them, or that they are - // deleted once the garbage collection delay has passed. - // - // Here are some explanations on how the resync queue works. 
- // There are two Sled trees that are used to have information - // about the status of blocks that need to be resynchronized: - // - // - resync_queue: a tree that is ordered first by a timestamp - // (in milliseconds since Unix epoch) that is the time at which - // the resync must be done, and second by block hash. - // The key in this tree is just: - // concat(timestamp (8 bytes), hash (32 bytes)) - // The value is the same 32-byte hash. - // - // - resync_errors: a tree that indicates for each block - // if the last resync resulted in an error, and if so, - // the following two informations (see the ErrorCounter struct): - // - how many consecutive resync errors for this block? - // - when was the last try? - // These two informations are used to implement an - // exponential backoff retry strategy. - // The key in this tree is the 32-byte hash of the block, - // and the value is the encoded ErrorCounter value. - // - // We need to have these two trees, because the resync queue - // is not just a queue of items to process, but a set of items - // that are waiting a specific delay until we can process them - // (the delay being necessary both internally for the exponential - // backoff strategy, and exposed as a parameter when adding items - // to the queue, e.g. to wait until the GC delay has passed). - // This is why we need one tree ordered by time, and one - // ordered by identifier of item to be processed (block hash). - // - // When the worker wants to process an item it takes from - // resync_queue, it checks in resync_errors that if there is an - // exponential back-off delay to await, it has passed before we - // process the item. If not, the item in the queue is skipped - // (but added back for later processing after the time of the - // delay). - // - // An alternative that would have seemed natural is to - // only add items to resync_queue with a processing time that is - // after the delay, but there are several issues with this: - // - This requires to synchronize updates to resync_queue and - // resync_errors (with the current model, there is only one thread, - // the worker thread, that accesses resync_errors, - // so no need to synchronize) by putting them both in a lock. - // This would mean that block_incref might need to take a lock - // before doing its thing, meaning it has much more chances of - // not completing successfully if something bad happens to Garage. - // Currently Garage is not able to recover from block_incref that - // doesn't complete successfully, because it is necessary to ensure - // the consistency between the state of the block manager and - // information in the BlockRef table. - // - If a resync fails, we put that block in the resync_errors table, - // and also add it back to resync_queue to be processed after - // the exponential back-off delay, - // but maybe the block is already scheduled to be resynced again - // at another time that is before the exponential back-off delay, - // and we have no way to check that easily. This means that - // in all cases, we need to check the resync_errors table - // in the resync loop at the time when a block is popped from - // the resync_queue. - // Overall, the current design is therefore simpler and more robust - // because it tolerates inconsistencies between the resync_queue - // and resync_errors table (items being scheduled in resync_queue - // for times that are earlier than the exponential back-off delay - // is a natural condition that is handled properly). 
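To make the backoff schedule described in the comment above concrete: with RESYNC_RETRY_DELAY = 60 seconds and RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER = 6, the delay computed by ErrorCounter::delay_msec (shown further down in this patch) works out to 1, 2, 4, 8, 16, 32 and then a capped 64 minutes for the 1st, 2nd, ... 7th-and-later consecutive failures of a block resync. A minimal standalone restatement of that formula, given purely as an illustration and not as part of the diff itself:

    // Sketch of the resync retry backoff, assuming a 60 s base delay and a
    // maximum backoff power of 6, mirroring ErrorCounter::delay_msec.
    fn resync_retry_delay_msec(consecutive_errors: u64) -> u64 {
        // 1 error -> 1 min, 2 -> 2 min, 3 -> 4 min, ..., 7 or more -> 64 min (capped)
        60_000u64 << consecutive_errors.saturating_sub(1).min(6)
    }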
- - fn spawn_background_workers(self: Arc) { - // Launch a background workers for background resync loop processing - let background = self.system.background.clone(); - let worker = ResyncWorker::new(self.clone()); - tokio::spawn(async move { - tokio::time::sleep(Duration::from_secs(10)).await; - background.spawn_worker(worker); - }); - - // Launch a background worker for data store scrubs - let (scrub_tx, scrub_rx) = mpsc::channel(1); - self.tx_scrub_command.store(Some(Arc::new(scrub_tx))); - let scrub_worker = ScrubWorker::new(self.clone(), scrub_rx); - self.system.background.spawn_worker(scrub_worker); - } - - pub(crate) fn put_to_resync(&self, hash: &Hash, delay: Duration) -> db::Result<()> { - let when = now_msec() + delay.as_millis() as u64; - self.put_to_resync_at(hash, when) - } - - fn put_to_resync_at(&self, hash: &Hash, when: u64) -> db::Result<()> { - trace!("Put resync_queue: {} {:?}", when, hash); - let mut key = u64::to_be_bytes(when).to_vec(); - key.extend(hash.as_ref()); - self.resync_queue.insert(key, hash.as_ref())?; - self.resync_notify.notify_waiters(); - Ok(()) - } - - async fn resync_iter(&self) -> Result { - if let Some((time_bytes, hash_bytes)) = self.resync_queue.first()? { - let time_msec = u64::from_be_bytes(time_bytes[0..8].try_into().unwrap()); - let now = now_msec(); - - if now >= time_msec { - let hash = Hash::try_from(&hash_bytes[..]).unwrap(); - - if let Some(ec) = self.resync_errors.get(hash.as_slice())? { - let ec = ErrorCounter::decode(&ec); - if now < ec.next_try() { - // if next retry after an error is not yet, - // don't do resync and return early, but still - // make sure the item is still in queue at expected time - self.put_to_resync_at(&hash, ec.next_try())?; - // ec.next_try() > now >= time_msec, so this remove - // is not removing the one we added just above - // (we want to do the remove after the insert to ensure - // that the item is not lost if we crash in-between) - self.resync_queue.remove(time_bytes)?; - return Ok(ResyncIterResult::BusyDidNothing); - } - } - - let tracer = opentelemetry::global::tracer("garage"); - let trace_id = gen_uuid(); - let span = tracer - .span_builder("Resync block") - .with_trace_id( - opentelemetry::trace::TraceId::from_hex(&hex::encode( - &trace_id.as_slice()[..16], - )) - .unwrap(), - ) - .with_attributes(vec![KeyValue::new("block", format!("{:?}", hash))]) - .start(&tracer); - - let res = self - .resync_block(&hash) - .with_context(Context::current_with_span(span)) - .bound_record_duration(&self.metrics.resync_duration) - .await; - - self.metrics.resync_counter.add(1); - - if let Err(e) = &res { - self.metrics.resync_error_counter.add(1); - warn!("Error when resyncing {:?}: {}", hash, e); - - let err_counter = match self.resync_errors.get(hash.as_slice())? 
{ - Some(ec) => ErrorCounter::decode(&ec).add1(now + 1), - None => ErrorCounter::new(now + 1), - }; - - self.resync_errors - .insert(hash.as_slice(), err_counter.encode())?; - - self.put_to_resync_at(&hash, err_counter.next_try())?; - // err_counter.next_try() >= now + 1 > now, - // the entry we remove from the queue is not - // the entry we inserted with put_to_resync_at - self.resync_queue.remove(time_bytes)?; - } else { - self.resync_errors.remove(hash.as_slice())?; - self.resync_queue.remove(time_bytes)?; - } - - Ok(ResyncIterResult::BusyDidSomething) - } else { - Ok(ResyncIterResult::IdleFor(Duration::from_millis( - time_msec - now, - ))) - } - } else { - // Here we wait either for a notification that an item has been - // added to the queue, or for a constant delay of 10 secs to expire. - // The delay avoids a race condition where the notification happens - // between the time we checked the queue and the first poll - // to resync_notify.notified(): if that happens, we'll just loop - // back 10 seconds later, which is fine. - Ok(ResyncIterResult::IdleFor(Duration::from_secs(10))) - } - } - - async fn resync_block(&self, hash: &Hash) -> Result<(), Error> { - let BlockStatus { exists, needed } = self - .mutation_lock - .lock() - .await - .check_block_status(hash, self) - .await?; - - if exists != needed.is_needed() || exists != needed.is_nonzero() { - debug!( - "Resync block {:?}: exists {}, nonzero rc {}, deletable {}", - hash, - exists, - needed.is_nonzero(), - needed.is_deletable(), - ); - } - - if exists && needed.is_deletable() { - info!("Resync block {:?}: offloading and deleting", hash); - - let mut who = self.replication.write_nodes(hash); - if who.len() < self.replication.write_quorum() { - return Err(Error::Message("Not trying to offload block because we don't have a quorum of nodes to write to".to_string())); - } - who.retain(|id| *id != self.system.id); - - let msg = Arc::new(BlockRpc::NeedBlockQuery(*hash)); - let who_needs_fut = who.iter().map(|to| { - self.system.rpc.call_arc( - &self.endpoint, - *to, - msg.clone(), - RequestStrategy::with_priority(PRIO_BACKGROUND) - .with_timeout(NEED_BLOCK_QUERY_TIMEOUT), - ) - }); - let who_needs_resps = join_all(who_needs_fut).await; - - let mut need_nodes = vec![]; - for (node, needed) in who.iter().zip(who_needs_resps.into_iter()) { - match needed.err_context("NeedBlockQuery RPC")? 
{ - BlockRpc::NeedBlockReply(needed) => { - if needed { - need_nodes.push(*node); - } - } - m => { - return Err(Error::unexpected_rpc_message(m)); - } - } - } - - if !need_nodes.is_empty() { - trace!( - "Block {:?} needed by {} nodes, sending", - hash, - need_nodes.len() - ); - - for node in need_nodes.iter() { - self.metrics - .resync_send_counter - .add(1, &[KeyValue::new("to", format!("{:?}", node))]); - } - - let put_block_message = self.read_block(hash).await?; - self.system - .rpc - .try_call_many( - &self.endpoint, - &need_nodes[..], - put_block_message, - RequestStrategy::with_priority(PRIO_BACKGROUND) - .with_quorum(need_nodes.len()) - .with_timeout(BLOCK_RW_TIMEOUT), - ) - .await - .err_context("PutBlock RPC")?; - } - info!( - "Deleting unneeded block {:?}, offload finished ({} / {})", - hash, - need_nodes.len(), - who.len() - ); - - self.mutation_lock - .lock() - .await - .delete_if_unneeded(hash, self) - .await?; - - self.rc.clear_deleted_block_rc(hash)?; - } - - if needed.is_nonzero() && !exists { - info!( - "Resync block {:?}: fetching absent but needed block (refcount > 0)", - hash - ); - - let block_data = self.rpc_get_raw_block(hash).await?; - - self.metrics.resync_recv_counter.add(1); - - self.write_block(hash, &block_data).await?; - } - - Ok(()) - } - - async fn update_resync_persisted( - &self, - update: impl Fn(&mut ResyncPersistedConfig), - ) -> Result<(), Error> { - let mut cfg: ResyncPersistedConfig = *self.resync_persisted.load().as_ref(); - update(&mut cfg); - self.resync_persister.save_async(&cfg).await?; - self.resync_persisted.store(Arc::new(cfg)); - self.resync_notify.notify_one(); - Ok(()) - } - - pub async fn set_resync_tranquility(&self, tranquility: u32) -> Result<(), Error> { - self.update_resync_persisted(|cfg| cfg.tranquility = tranquility) - .await - } } #[async_trait] @@ -766,92 +386,13 @@ impl EndpointHandler for BlockManager { } } -#[derive(Serialize, Deserialize, Clone, Copy)] -struct ResyncPersistedConfig { - tranquility: u32, -} - -struct ResyncWorker { - manager: Arc, - tranquilizer: Tranquilizer, - next_delay: Duration, -} - -impl ResyncWorker { - fn new(manager: Arc) -> Self { - Self { - manager, - tranquilizer: Tranquilizer::new(30), - next_delay: Duration::from_secs(10), - } - } -} - -#[async_trait] -impl Worker for ResyncWorker { - fn name(&self) -> String { - "Block resync worker".into() - } - - fn info(&self) -> Option { - let mut ret = vec![]; - ret.push(format!( - "tranquility = {}", - self.manager.resync_persisted.load().tranquility - )); - - let qlen = self.manager.resync_queue_len().unwrap_or(0); - if qlen > 0 { - ret.push(format!("{} blocks in queue", qlen)); - } - - let elen = self.manager.resync_errors_len().unwrap_or(0); - if elen > 0 { - ret.push(format!("{} blocks in error state", elen)); - } - - Some(ret.join(", ")) - } - - async fn work(&mut self, _must_exit: &mut watch::Receiver) -> Result { - self.tranquilizer.reset(); - match self.manager.resync_iter().await { - Ok(ResyncIterResult::BusyDidSomething) => Ok(self - .tranquilizer - .tranquilize_worker(self.manager.resync_persisted.load().tranquility)), - Ok(ResyncIterResult::BusyDidNothing) => Ok(WorkerState::Busy), - Ok(ResyncIterResult::IdleFor(delay)) => { - self.next_delay = delay; - Ok(WorkerState::Idle) - } - Err(e) => { - // The errors that we have here are only Sled errors - // We don't really know how to handle them so just ¯\_(ツ)_/¯ - // (there is kind of an assumption that Sled won't error on us, - // if it does there is not much we can do -- TODO should we just 
panic?) - // Here we just give the error to the worker manager, - // it will print it to the logs and increment a counter - Err(e.into()) - } - } - } - - async fn wait_for_work(&mut self, _must_exit: &watch::Receiver) -> WorkerState { - select! { - _ = tokio::time::sleep(self.next_delay) => (), - _ = self.manager.resync_notify.notified() => (), - }; - WorkerState::Busy - } -} - -struct BlockStatus { - exists: bool, - needed: RcEntry, +pub(crate) struct BlockStatus { + pub(crate) exists: bool, + pub(crate) needed: RcEntry, } impl BlockManagerLocked { - async fn check_block_status( + pub(crate) async fn check_block_status( &self, hash: &Hash, mgr: &BlockManager, @@ -938,7 +479,11 @@ impl BlockManagerLocked { Ok(()) } - async fn delete_if_unneeded(&self, hash: &Hash, mgr: &BlockManager) -> Result<(), Error> { + pub(crate) async fn delete_if_unneeded( + &self, + hash: &Hash, + mgr: &BlockManager, + ) -> Result<(), Error> { let BlockStatus { exists, needed } = self.check_block_status(hash, mgr).await?; if exists && needed.is_deletable() { @@ -952,50 +497,3 @@ impl BlockManagerLocked { Ok(()) } } - -/// Counts the number of errors when resyncing a block, -/// and the time of the last try. -/// Used to implement exponential backoff. -#[derive(Clone, Copy, Debug)] -struct ErrorCounter { - errors: u64, - last_try: u64, -} - -impl ErrorCounter { - fn new(now: u64) -> Self { - Self { - errors: 1, - last_try: now, - } - } - - fn decode(data: &[u8]) -> Self { - Self { - errors: u64::from_be_bytes(data[0..8].try_into().unwrap()), - last_try: u64::from_be_bytes(data[8..16].try_into().unwrap()), - } - } - fn encode(&self) -> Vec { - [ - u64::to_be_bytes(self.errors), - u64::to_be_bytes(self.last_try), - ] - .concat() - } - - fn add1(self, now: u64) -> Self { - Self { - errors: self.errors + 1, - last_try: now, - } - } - - fn delay_msec(&self) -> u64 { - (RESYNC_RETRY_DELAY.as_millis() as u64) - << std::cmp::min(self.errors - 1, RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER) - } - fn next_try(&self) -> u64 { - self.last_try + self.delay_msec() - } -} diff --git a/src/block/repair.rs b/src/block/repair.rs index 18e1de95..e2884b69 100644 --- a/src/block/repair.rs +++ b/src/block/repair.rs @@ -112,7 +112,9 @@ impl Worker for RepairWorker { } for hash in batch_of_hashes.into_iter() { - self.manager.put_to_resync(&hash, Duration::from_secs(0))?; + self.manager + .resync + .put_to_resync(&hash, Duration::from_secs(0))?; self.next_start = Some(hash) } @@ -124,7 +126,9 @@ impl Worker for RepairWorker { // This allows us to find blocks we are storing but don't actually need, // so that we can offload them if necessary and then delete them locally. if let Some(hash) = bi.next().await? 
{ - self.manager.put_to_resync(&hash, Duration::from_secs(0))?; + self.manager + .resync + .put_to_resync(&hash, Duration::from_secs(0))?; Ok(WorkerState::Busy) } else { Ok(WorkerState::Done) diff --git a/src/block/resync.rs b/src/block/resync.rs new file mode 100644 index 00000000..2a8184b7 --- /dev/null +++ b/src/block/resync.rs @@ -0,0 +1,536 @@ +use std::convert::TryInto; +use std::sync::Arc; +use std::time::Duration; + +use arc_swap::ArcSwap; +use async_trait::async_trait; +use serde::{Deserialize, Serialize}; + +use futures::future::*; +use tokio::select; +use tokio::sync::{watch, Notify}; + +use opentelemetry::{ + trace::{FutureExt as OtelFutureExt, TraceContextExt, Tracer}, + Context, KeyValue, +}; + +use garage_db as db; +use garage_db::counted_tree_hack::CountedTree; + +use garage_util::background::*; +use garage_util::data::*; +use garage_util::error::*; +use garage_util::metrics::RecordDuration; +use garage_util::persister::Persister; +use garage_util::time::*; +use garage_util::tranquilizer::Tranquilizer; + +use garage_rpc::system::System; +use garage_rpc::*; + +use garage_table::replication::TableReplication; + +use crate::manager::*; + +// Timeout for RPCs that ask other nodes whether they need a copy +// of a given block before we delete it locally +pub(crate) const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(5); + +// The delay between the time where a resync operation fails +// and the time when it is retried, with exponential backoff +// (multiplied by 2, 4, 8, 16, etc. for every consecutive failure). +pub(crate) const RESYNC_RETRY_DELAY: Duration = Duration::from_secs(60); +// The minimum retry delay is 60 seconds = 1 minute +// The maximum retry delay is 60 seconds * 2^6 = 60 seconds << 6 = 64 minutes (~1 hour) +pub(crate) const RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER: u64 = 6; +// Resync tranquility is initially set to 2, but can be changed in the CLI +// and the updated version is persisted over Garage restarts +const INITIAL_RESYNC_TRANQUILITY: u32 = 2; + +pub struct BlockResyncManager { + pub(crate) queue: CountedTree, + pub(crate) notify: Notify, + pub(crate) errors: CountedTree, + + persister: Persister, + persisted: ArcSwap, +} + +#[derive(Serialize, Deserialize, Clone, Copy)] +struct ResyncPersistedConfig { + tranquility: u32, +} + +enum ResyncIterResult { + BusyDidSomething, + BusyDidNothing, + IdleFor(Duration), +} + +impl BlockResyncManager { + pub(crate) fn new(db: &db::Db, system: &System) -> Self { + let queue = db + .open_tree("block_local_resync_queue") + .expect("Unable to open block_local_resync_queue tree"); + let queue = CountedTree::new(queue).expect("Could not count block_local_resync_queue"); + + let errors = db + .open_tree("block_local_resync_errors") + .expect("Unable to open block_local_resync_errors tree"); + let errors = CountedTree::new(errors).expect("Could not count block_local_resync_errors"); + + let persister = Persister::new(&system.metadata_dir, "resync_cfg"); + let persisted = match persister.load() { + Ok(v) => v, + Err(_) => ResyncPersistedConfig { + tranquility: INITIAL_RESYNC_TRANQUILITY, + }, + }; + + Self { + queue, + notify: Notify::new(), + errors, + persister, + persisted: ArcSwap::new(Arc::new(persisted)), + } + } + + /// Get lenght of resync queue + pub fn queue_len(&self) -> Result { + // This currently can't return an error because the CountedTree hack + // doesn't error on .len(), but this will change when we remove the hack + // (hopefully someday!) 
+ Ok(self.queue.len()) + } + + /// Get number of blocks that have an error + pub fn errors_len(&self) -> Result { + // (see queue_len comment) + Ok(self.errors.len()) + } + + // ---- Resync loop ---- + + // This part manages a queue of blocks that need to be + // "resynchronized", i.e. that need to have a check that + // they are at present if we need them, or that they are + // deleted once the garbage collection delay has passed. + // + // Here are some explanations on how the resync queue works. + // There are two Sled trees that are used to have information + // about the status of blocks that need to be resynchronized: + // + // - resync.queue: a tree that is ordered first by a timestamp + // (in milliseconds since Unix epoch) that is the time at which + // the resync must be done, and second by block hash. + // The key in this tree is just: + // concat(timestamp (8 bytes), hash (32 bytes)) + // The value is the same 32-byte hash. + // + // - resync.errors: a tree that indicates for each block + // if the last resync resulted in an error, and if so, + // the following two informations (see the ErrorCounter struct): + // - how many consecutive resync errors for this block? + // - when was the last try? + // These two informations are used to implement an + // exponential backoff retry strategy. + // The key in this tree is the 32-byte hash of the block, + // and the value is the encoded ErrorCounter value. + // + // We need to have these two trees, because the resync queue + // is not just a queue of items to process, but a set of items + // that are waiting a specific delay until we can process them + // (the delay being necessary both internally for the exponential + // backoff strategy, and exposed as a parameter when adding items + // to the queue, e.g. to wait until the GC delay has passed). + // This is why we need one tree ordered by time, and one + // ordered by identifier of item to be processed (block hash). + // + // When the worker wants to process an item it takes from + // resync.queue, it checks in resync.errors that if there is an + // exponential back-off delay to await, it has passed before we + // process the item. If not, the item in the queue is skipped + // (but added back for later processing after the time of the + // delay). + // + // An alternative that would have seemed natural is to + // only add items to resync.queue with a processing time that is + // after the delay, but there are several issues with this: + // - This requires to synchronize updates to resync.queue and + // resync.errors (with the current model, there is only one thread, + // the worker thread, that accesses resync.errors, + // so no need to synchronize) by putting them both in a lock. + // This would mean that block_incref might need to take a lock + // before doing its thing, meaning it has much more chances of + // not completing successfully if something bad happens to Garage. + // Currently Garage is not able to recover from block_incref that + // doesn't complete successfully, because it is necessary to ensure + // the consistency between the state of the block manager and + // information in the BlockRef table. + // - If a resync fails, we put that block in the resync.errors table, + // and also add it back to resync.queue to be processed after + // the exponential back-off delay, + // but maybe the block is already scheduled to be resynced again + // at another time that is before the exponential back-off delay, + // and we have no way to check that easily. 
This means that + // in all cases, we need to check the resync.errors table + // in the resync loop at the time when a block is popped from + // the resync.queue. + // Overall, the current design is therefore simpler and more robust + // because it tolerates inconsistencies between the resync.queue + // and resync.errors table (items being scheduled in resync.queue + // for times that are earlier than the exponential back-off delay + // is a natural condition that is handled properly). + + pub(crate) fn put_to_resync(&self, hash: &Hash, delay: Duration) -> db::Result<()> { + let when = now_msec() + delay.as_millis() as u64; + self.put_to_resync_at(hash, when) + } + + pub(crate) fn put_to_resync_at(&self, hash: &Hash, when: u64) -> db::Result<()> { + trace!("Put resync_queue: {} {:?}", when, hash); + let mut key = u64::to_be_bytes(when).to_vec(); + key.extend(hash.as_ref()); + self.queue.insert(key, hash.as_ref())?; + self.notify.notify_waiters(); + Ok(()) + } + + async fn resync_iter(&self, manager: &BlockManager) -> Result { + if let Some((time_bytes, hash_bytes)) = self.queue.first()? { + let time_msec = u64::from_be_bytes(time_bytes[0..8].try_into().unwrap()); + let now = now_msec(); + + if now >= time_msec { + let hash = Hash::try_from(&hash_bytes[..]).unwrap(); + + if let Some(ec) = self.errors.get(hash.as_slice())? { + let ec = ErrorCounter::decode(&ec); + if now < ec.next_try() { + // if next retry after an error is not yet, + // don't do resync and return early, but still + // make sure the item is still in queue at expected time + self.put_to_resync_at(&hash, ec.next_try())?; + // ec.next_try() > now >= time_msec, so this remove + // is not removing the one we added just above + // (we want to do the remove after the insert to ensure + // that the item is not lost if we crash in-between) + self.queue.remove(time_bytes)?; + return Ok(ResyncIterResult::BusyDidNothing); + } + } + + let tracer = opentelemetry::global::tracer("garage"); + let trace_id = gen_uuid(); + let span = tracer + .span_builder("Resync block") + .with_trace_id( + opentelemetry::trace::TraceId::from_hex(&hex::encode( + &trace_id.as_slice()[..16], + )) + .unwrap(), + ) + .with_attributes(vec![KeyValue::new("block", format!("{:?}", hash))]) + .start(&tracer); + + let res = self + .resync_block(manager, &hash) + .with_context(Context::current_with_span(span)) + .bound_record_duration(&manager.metrics.resync_duration) + .await; + + manager.metrics.resync_counter.add(1); + + if let Err(e) = &res { + manager.metrics.resync_error_counter.add(1); + warn!("Error when resyncing {:?}: {}", hash, e); + + let err_counter = match self.errors.get(hash.as_slice())? { + Some(ec) => ErrorCounter::decode(&ec).add1(now + 1), + None => ErrorCounter::new(now + 1), + }; + + self.errors.insert(hash.as_slice(), err_counter.encode())?; + + self.put_to_resync_at(&hash, err_counter.next_try())?; + // err_counter.next_try() >= now + 1 > now, + // the entry we remove from the queue is not + // the entry we inserted with put_to_resync_at + self.queue.remove(time_bytes)?; + } else { + self.errors.remove(hash.as_slice())?; + self.queue.remove(time_bytes)?; + } + + Ok(ResyncIterResult::BusyDidSomething) + } else { + Ok(ResyncIterResult::IdleFor(Duration::from_millis( + time_msec - now, + ))) + } + } else { + // Here we wait either for a notification that an item has been + // added to the queue, or for a constant delay of 10 secs to expire. 
+ // The delay avoids a race condition where the notification happens + // between the time we checked the queue and the first poll + // to resync_notify.notified(): if that happens, we'll just loop + // back 10 seconds later, which is fine. + Ok(ResyncIterResult::IdleFor(Duration::from_secs(10))) + } + } + + async fn resync_block(&self, manager: &BlockManager, hash: &Hash) -> Result<(), Error> { + let BlockStatus { exists, needed } = manager + .mutation_lock + .lock() + .await + .check_block_status(hash, manager) + .await?; + + if exists != needed.is_needed() || exists != needed.is_nonzero() { + debug!( + "Resync block {:?}: exists {}, nonzero rc {}, deletable {}", + hash, + exists, + needed.is_nonzero(), + needed.is_deletable(), + ); + } + + if exists && needed.is_deletable() { + info!("Resync block {:?}: offloading and deleting", hash); + + let mut who = manager.replication.write_nodes(hash); + if who.len() < manager.replication.write_quorum() { + return Err(Error::Message("Not trying to offload block because we don't have a quorum of nodes to write to".to_string())); + } + who.retain(|id| *id != manager.system.id); + + let msg = Arc::new(BlockRpc::NeedBlockQuery(*hash)); + let who_needs_fut = who.iter().map(|to| { + manager.system.rpc.call_arc( + &manager.endpoint, + *to, + msg.clone(), + RequestStrategy::with_priority(PRIO_BACKGROUND) + .with_timeout(NEED_BLOCK_QUERY_TIMEOUT), + ) + }); + let who_needs_resps = join_all(who_needs_fut).await; + + let mut need_nodes = vec![]; + for (node, needed) in who.iter().zip(who_needs_resps.into_iter()) { + match needed.err_context("NeedBlockQuery RPC")? { + BlockRpc::NeedBlockReply(needed) => { + if needed { + need_nodes.push(*node); + } + } + m => { + return Err(Error::unexpected_rpc_message(m)); + } + } + } + + if !need_nodes.is_empty() { + trace!( + "Block {:?} needed by {} nodes, sending", + hash, + need_nodes.len() + ); + + for node in need_nodes.iter() { + manager + .metrics + .resync_send_counter + .add(1, &[KeyValue::new("to", format!("{:?}", node))]); + } + + let put_block_message = manager.read_block(hash).await?; + manager + .system + .rpc + .try_call_many( + &manager.endpoint, + &need_nodes[..], + put_block_message, + RequestStrategy::with_priority(PRIO_BACKGROUND) + .with_quorum(need_nodes.len()) + .with_timeout(BLOCK_RW_TIMEOUT), + ) + .await + .err_context("PutBlock RPC")?; + } + info!( + "Deleting unneeded block {:?}, offload finished ({} / {})", + hash, + need_nodes.len(), + who.len() + ); + + manager + .mutation_lock + .lock() + .await + .delete_if_unneeded(hash, manager) + .await?; + + manager.rc.clear_deleted_block_rc(hash)?; + } + + if needed.is_nonzero() && !exists { + info!( + "Resync block {:?}: fetching absent but needed block (refcount > 0)", + hash + ); + + let block_data = manager.rpc_get_raw_block(hash).await?; + + manager.metrics.resync_recv_counter.add(1); + + manager.write_block(hash, &block_data).await?; + } + + Ok(()) + } + + async fn update_persisted( + &self, + update: impl Fn(&mut ResyncPersistedConfig), + ) -> Result<(), Error> { + let mut cfg: ResyncPersistedConfig = *self.persisted.load().as_ref(); + update(&mut cfg); + self.persister.save_async(&cfg).await?; + self.persisted.store(Arc::new(cfg)); + self.notify.notify_one(); + Ok(()) + } + + pub async fn set_tranquility(&self, tranquility: u32) -> Result<(), Error> { + self.update_persisted(|cfg| cfg.tranquility = tranquility) + .await + } +} + +pub(crate) struct ResyncWorker { + manager: Arc, + tranquilizer: Tranquilizer, + next_delay: Duration, +} + +impl 
ResyncWorker { + pub(crate) fn new(manager: Arc) -> Self { + Self { + manager, + tranquilizer: Tranquilizer::new(30), + next_delay: Duration::from_secs(10), + } + } +} + +#[async_trait] +impl Worker for ResyncWorker { + fn name(&self) -> String { + "Block resync worker".into() + } + + fn info(&self) -> Option { + let mut ret = vec![]; + ret.push(format!( + "tranquility = {}", + self.manager.resync.persisted.load().tranquility + )); + + let qlen = self.manager.resync.queue_len().unwrap_or(0); + if qlen > 0 { + ret.push(format!("{} blocks in queue", qlen)); + } + + let elen = self.manager.resync.errors_len().unwrap_or(0); + if elen > 0 { + ret.push(format!("{} blocks in error state", elen)); + } + + Some(ret.join(", ")) + } + + async fn work(&mut self, _must_exit: &mut watch::Receiver) -> Result { + self.tranquilizer.reset(); + match self.manager.resync.resync_iter(&self.manager).await { + Ok(ResyncIterResult::BusyDidSomething) => Ok(self + .tranquilizer + .tranquilize_worker(self.manager.resync.persisted.load().tranquility)), + Ok(ResyncIterResult::BusyDidNothing) => Ok(WorkerState::Busy), + Ok(ResyncIterResult::IdleFor(delay)) => { + self.next_delay = delay; + Ok(WorkerState::Idle) + } + Err(e) => { + // The errors that we have here are only Sled errors + // We don't really know how to handle them so just ¯\_(ツ)_/¯ + // (there is kind of an assumption that Sled won't error on us, + // if it does there is not much we can do -- TODO should we just panic?) + // Here we just give the error to the worker manager, + // it will print it to the logs and increment a counter + Err(e.into()) + } + } + } + + async fn wait_for_work(&mut self, _must_exit: &watch::Receiver) -> WorkerState { + select! { + _ = tokio::time::sleep(self.next_delay) => (), + _ = self.manager.resync.notify.notified() => (), + }; + WorkerState::Busy + } +} + +/// Counts the number of errors when resyncing a block, +/// and the time of the last try. +/// Used to implement exponential backoff. +#[derive(Clone, Copy, Debug)] +struct ErrorCounter { + errors: u64, + last_try: u64, +} + +impl ErrorCounter { + fn new(now: u64) -> Self { + Self { + errors: 1, + last_try: now, + } + } + + fn decode(data: &[u8]) -> Self { + Self { + errors: u64::from_be_bytes(data[0..8].try_into().unwrap()), + last_try: u64::from_be_bytes(data[8..16].try_into().unwrap()), + } + } + fn encode(&self) -> Vec { + [ + u64::to_be_bytes(self.errors), + u64::to_be_bytes(self.last_try), + ] + .concat() + } + + fn add1(self, now: u64) -> Self { + Self { + errors: self.errors + 1, + last_try: now, + } + } + + fn delay_msec(&self) -> u64 { + (RESYNC_RETRY_DELAY.as_millis() as u64) + << std::cmp::min(self.errors - 1, RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER) + } + fn next_try(&self) -> u64 { + self.last_try + self.delay_msec() + } +} diff --git a/src/garage/admin.rs b/src/garage/admin.rs index 1d80889c..9f4764df 100644 --- a/src/garage/admin.rs +++ b/src/garage/admin.rs @@ -781,13 +781,13 @@ impl AdminRpcHandler { writeln!( &mut ret, " resync queue length: {}", - self.garage.block_manager.resync_queue_len()? + self.garage.block_manager.resync.queue_len()? ) .unwrap(); writeln!( &mut ret, " blocks with resync errors: {}", - self.garage.block_manager.resync_errors_len()? + self.garage.block_manager.resync.errors_len()? 
) .unwrap(); @@ -850,7 +850,8 @@ impl AdminRpcHandler { WorkerSetCmd::ResyncTranquility { tranquility } => { self.garage .block_manager - .set_resync_tranquility(tranquility) + .resync + .set_tranquility(tranquility) .await?; Ok(AdminRpc::Ok("Resync tranquility updated".into())) } -- cgit v1.2.3 From 5e8baa433d743a06ab3ee90f375f24c3c36fc236 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 2 Sep 2022 16:52:22 +0200 Subject: Make BlockManagerLocked fully private again --- src/block/manager.rs | 35 ++++++++++++++++++++++------------- src/block/resync.rs | 14 ++------------ 2 files changed, 24 insertions(+), 25 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index efb5349c..62ef96b9 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -68,7 +68,7 @@ pub struct BlockManager { compression_level: Option, - pub(crate) mutation_lock: Mutex, + mutation_lock: Mutex, pub(crate) rc: BlockRc, pub resync: BlockResyncManager, @@ -84,7 +84,7 @@ pub struct BlockManager { // This custom struct contains functions that must only be ran // when the lock is held. We ensure that it is the case by storing // it INSIDE a Mutex. -pub(crate) struct BlockManagerLocked(); +struct BlockManagerLocked(); impl BlockManager { pub fn new( @@ -331,17 +331,30 @@ impl BlockManager { Ok(data) } - /// Check if this node should have a block, but don't actually have it - async fn need_block(&self, hash: &Hash) -> Result { - let BlockStatus { exists, needed } = self - .mutation_lock + /// Check if this node has a block and whether it needs it + pub(crate) async fn check_block_status(&self, hash: &Hash) -> Result { + self.mutation_lock .lock() .await .check_block_status(hash, self) - .await?; + .await + } + + /// Check if this node should have a block, but don't actually have it + async fn need_block(&self, hash: &Hash) -> Result { + let BlockStatus { exists, needed } = self.check_block_status(hash).await?; Ok(needed.is_nonzero() && !exists) } + /// Delete block if it is not needed anymore + pub(crate) async fn delete_if_unneeded(&self, hash: &Hash) -> Result<(), Error> { + self.mutation_lock + .lock() + .await + .delete_if_unneeded(hash, self) + .await + } + /// Utility: gives the path of the directory in which a block should be found fn block_dir(&self, hash: &Hash) -> PathBuf { let mut path = self.data_dir.clone(); @@ -392,7 +405,7 @@ pub(crate) struct BlockStatus { } impl BlockManagerLocked { - pub(crate) async fn check_block_status( + async fn check_block_status( &self, hash: &Hash, mgr: &BlockManager, @@ -479,11 +492,7 @@ impl BlockManagerLocked { Ok(()) } - pub(crate) async fn delete_if_unneeded( - &self, - hash: &Hash, - mgr: &BlockManager, - ) -> Result<(), Error> { + async fn delete_if_unneeded(&self, hash: &Hash, mgr: &BlockManager) -> Result<(), Error> { let BlockStatus { exists, needed } = self.check_block_status(hash, mgr).await?; if exists && needed.is_deletable() { diff --git a/src/block/resync.rs b/src/block/resync.rs index 2a8184b7..dab08338 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -282,12 +282,7 @@ impl BlockResyncManager { } async fn resync_block(&self, manager: &BlockManager, hash: &Hash) -> Result<(), Error> { - let BlockStatus { exists, needed } = manager - .mutation_lock - .lock() - .await - .check_block_status(hash, manager) - .await?; + let BlockStatus { exists, needed } = manager.check_block_status(hash).await?; if exists != needed.is_needed() || exists != needed.is_nonzero() { debug!( @@ -370,12 +365,7 @@ impl BlockResyncManager 
{ who.len() ); - manager - .mutation_lock - .lock() - .await - .delete_if_unneeded(hash, manager) - .await?; + manager.delete_if_unneeded(hash).await?; manager.rc.clear_deleted_block_rc(hash)?; } -- cgit v1.2.3 From 5d4b937a00882b9bf8b36f7430f3d1fe9db58903 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 2 Sep 2022 17:18:13 +0200 Subject: Ability to have up to 4 concurrently working resync workers --- src/block/manager.rs | 12 +++---- src/block/resync.rs | 92 +++++++++++++++++++++++++++++++++++++++-------- src/garage/admin.rs | 8 +++++ src/garage/cli/structs.rs | 5 ++- 4 files changed, 95 insertions(+), 22 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index 62ef96b9..9240db25 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -125,13 +125,11 @@ impl BlockManager { }); block_manager.endpoint.set_handler(block_manager.clone()); - // Spawn one resync worker - let background = block_manager.system.background.clone(); - let worker = ResyncWorker::new(block_manager.clone()); - tokio::spawn(async move { - tokio::time::sleep(Duration::from_secs(10)).await; - background.spawn_worker(worker); - }); + // Spawn a bunch of resync workers + for index in 0..MAX_RESYNC_WORKERS { + let worker = ResyncWorker::new(index, block_manager.clone()); + block_manager.system.background.spawn_worker(worker); + } // Spawn scrub worker let scrub_worker = ScrubWorker::new(block_manager.clone(), scrub_rx); diff --git a/src/block/resync.rs b/src/block/resync.rs index dab08338..0f358d48 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -1,5 +1,6 @@ +use std::collections::HashSet; use std::convert::TryInto; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use std::time::Duration; use arc_swap::ArcSwap; @@ -44,6 +45,9 @@ pub(crate) const RESYNC_RETRY_DELAY: Duration = Duration::from_secs(60); // The minimum retry delay is 60 seconds = 1 minute // The maximum retry delay is 60 seconds * 2^6 = 60 seconds << 6 = 64 minutes (~1 hour) pub(crate) const RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER: u64 = 6; + +// No more than 4 resync workers can be running in the system +pub(crate) const MAX_RESYNC_WORKERS: usize = 4; // Resync tranquility is initially set to 2, but can be changed in the CLI // and the updated version is persisted over Garage restarts const INITIAL_RESYNC_TRANQUILITY: u32 = 2; @@ -53,12 +57,15 @@ pub struct BlockResyncManager { pub(crate) notify: Notify, pub(crate) errors: CountedTree, + busy_set: BusySet, + persister: Persister, persisted: ArcSwap, } #[derive(Serialize, Deserialize, Clone, Copy)] struct ResyncPersistedConfig { + n_workers: usize, tranquility: u32, } @@ -68,6 +75,14 @@ enum ResyncIterResult { IdleFor(Duration), } +type BusySet = Arc>>>; + +struct BusyBlock { + time_bytes: Vec, + hash_bytes: Vec, + busy_set: BusySet, +} + impl BlockResyncManager { pub(crate) fn new(db: &db::Db, system: &System) -> Self { let queue = db @@ -84,6 +99,7 @@ impl BlockResyncManager { let persisted = match persister.load() { Ok(v) => v, Err(_) => ResyncPersistedConfig { + n_workers: 1, tranquility: INITIAL_RESYNC_TRANQUILITY, }, }; @@ -92,6 +108,7 @@ impl BlockResyncManager { queue, notify: Notify::new(), errors, + busy_set: Arc::new(Mutex::new(HashSet::new())), persister, persisted: ArcSwap::new(Arc::new(persisted)), } @@ -199,12 +216,12 @@ impl BlockResyncManager { } async fn resync_iter(&self, manager: &BlockManager) -> Result { - if let Some((time_bytes, hash_bytes)) = self.queue.first()? 
{ - let time_msec = u64::from_be_bytes(time_bytes[0..8].try_into().unwrap()); + if let Some(block) = self.get_block_to_resync()? { + let time_msec = u64::from_be_bytes(block.time_bytes[0..8].try_into().unwrap()); let now = now_msec(); if now >= time_msec { - let hash = Hash::try_from(&hash_bytes[..]).unwrap(); + let hash = Hash::try_from(&block.hash_bytes[..]).unwrap(); if let Some(ec) = self.errors.get(hash.as_slice())? { let ec = ErrorCounter::decode(&ec); @@ -217,7 +234,7 @@ impl BlockResyncManager { // is not removing the one we added just above // (we want to do the remove after the insert to ensure // that the item is not lost if we crash in-between) - self.queue.remove(time_bytes)?; + self.queue.remove(&block.time_bytes)?; return Ok(ResyncIterResult::BusyDidNothing); } } @@ -258,10 +275,10 @@ impl BlockResyncManager { // err_counter.next_try() >= now + 1 > now, // the entry we remove from the queue is not // the entry we inserted with put_to_resync_at - self.queue.remove(time_bytes)?; + self.queue.remove(&block.time_bytes)?; } else { self.errors.remove(hash.as_slice())?; - self.queue.remove(time_bytes)?; + self.queue.remove(&block.time_bytes)?; } Ok(ResyncIterResult::BusyDidSomething) @@ -281,6 +298,22 @@ impl BlockResyncManager { } } + fn get_block_to_resync(&self) -> Result, db::Error> { + let mut busy = self.busy_set.lock().unwrap(); + for it in self.queue.iter()? { + let (time_bytes, hash_bytes) = it?; + if !busy.contains(&time_bytes) { + busy.insert(time_bytes.clone()); + return Ok(Some(BusyBlock { + time_bytes, + hash_bytes, + busy_set: self.busy_set.clone(), + })); + } + } + return Ok(None); + } + async fn resync_block(&self, manager: &BlockManager, hash: &Hash) -> Result<(), Error> { let BlockStatus { exists, needed } = manager.check_block_status(hash).await?; @@ -394,25 +427,44 @@ impl BlockResyncManager { update(&mut cfg); self.persister.save_async(&cfg).await?; self.persisted.store(Arc::new(cfg)); - self.notify.notify_one(); + self.notify.notify_waiters(); Ok(()) } + pub async fn set_n_workers(&self, n_workers: usize) -> Result<(), Error> { + if n_workers < 1 || n_workers > MAX_RESYNC_WORKERS { + return Err(Error::Message(format!( + "Invalid number of resync workers, must be between 1 and {}", + MAX_RESYNC_WORKERS + ))); + } + self.update_persisted(|cfg| cfg.n_workers = n_workers).await + } + pub async fn set_tranquility(&self, tranquility: u32) -> Result<(), Error> { self.update_persisted(|cfg| cfg.tranquility = tranquility) .await } } +impl Drop for BusyBlock { + fn drop(&mut self) { + let mut busy = self.busy_set.lock().unwrap(); + busy.remove(&self.time_bytes); + } +} + pub(crate) struct ResyncWorker { + index: usize, manager: Arc, tranquilizer: Tranquilizer, next_delay: Duration, } impl ResyncWorker { - pub(crate) fn new(manager: Arc) -> Self { + pub(crate) fn new(index: usize, manager: Arc) -> Self { Self { + index, manager, tranquilizer: Tranquilizer::new(30), next_delay: Duration::from_secs(10), @@ -423,15 +475,18 @@ impl ResyncWorker { #[async_trait] impl Worker for ResyncWorker { fn name(&self) -> String { - "Block resync worker".into() + format!("Block resync worker #{}", self.index + 1) } fn info(&self) -> Option { + let persisted = self.manager.resync.persisted.load(); + + if self.index >= persisted.n_workers { + return Some("(unused)".into()); + } + let mut ret = vec![]; - ret.push(format!( - "tranquility = {}", - self.manager.resync.persisted.load().tranquility - )); + ret.push(format!("tranquility = {}", persisted.tranquility)); let qlen = 
self.manager.resync.queue_len().unwrap_or(0); if qlen > 0 { @@ -447,6 +502,10 @@ impl Worker for ResyncWorker { } async fn work(&mut self, _must_exit: &mut watch::Receiver) -> Result { + if self.index >= self.manager.resync.persisted.load().n_workers { + return Ok(WorkerState::Idle); + } + self.tranquilizer.reset(); match self.manager.resync.resync_iter(&self.manager).await { Ok(ResyncIterResult::BusyDidSomething) => Ok(self @@ -470,10 +529,15 @@ impl Worker for ResyncWorker { } async fn wait_for_work(&mut self, _must_exit: &watch::Receiver) -> WorkerState { + while self.index >= self.manager.resync.persisted.load().n_workers { + self.manager.resync.notify.notified().await + } + select! { _ = tokio::time::sleep(self.next_delay) => (), _ = self.manager.resync.notify.notified() => (), }; + WorkerState::Busy } } diff --git a/src/garage/admin.rs b/src/garage/admin.rs index 9f4764df..76261050 100644 --- a/src/garage/admin.rs +++ b/src/garage/admin.rs @@ -847,6 +847,14 @@ impl AdminRpcHandler { .await; Ok(AdminRpc::Ok("Scrub tranquility updated".into())) } + WorkerSetCmd::ResyncNWorkers { n_workers } => { + self.garage + .block_manager + .resync + .set_n_workers(n_workers) + .await?; + Ok(AdminRpc::Ok("Number of resync workers updated".into())) + } WorkerSetCmd::ResyncTranquility { tranquility } => { self.garage .block_manager diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 1fba934f..0388cef5 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -524,7 +524,10 @@ pub enum WorkerSetCmd { /// Set tranquility of scrub operations #[structopt(name = "scrub-tranquility", version = version::garage())] ScrubTranquility { tranquility: u32 }, - /// Set tranquility of resync operations + /// Set number of concurrent block resync workers + #[structopt(name = "resync-n-workers", version = version::garage())] + ResyncNWorkers { n_workers: usize }, + /// Set tranquility of block resync operations #[structopt(name = "resync-tranquility", version = version::garage())] ResyncTranquility { tranquility: u32 }, } -- cgit v1.2.3 From e1751c8a9cb2a0d91b5aed636ee72ca4fa31ca68 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 2 Sep 2022 17:24:26 +0200 Subject: fix clippy --- src/block/resync.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/block/resync.rs b/src/block/resync.rs index 0f358d48..39e4d50f 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -311,7 +311,7 @@ impl BlockResyncManager { })); } } - return Ok(None); + Ok(None) } async fn resync_block(&self, manager: &BlockManager, hash: &Hash) -> Result<(), Error> { @@ -432,7 +432,7 @@ impl BlockResyncManager { } pub async fn set_n_workers(&self, n_workers: usize) -> Result<(), Error> { - if n_workers < 1 || n_workers > MAX_RESYNC_WORKERS { + if !(1..=MAX_RESYNC_WORKERS).contains(&n_workers) { return Err(Error::Message(format!( "Invalid number of resync workers, must be between 1 and {}", MAX_RESYNC_WORKERS -- cgit v1.2.3 From a6e40b75eabf0d6a863a91ae17f7d0ae20582d9e Mon Sep 17 00:00:00 2001 From: Jakub Jirutka Date: Sat, 3 Sep 2022 18:37:24 +0200 Subject: Add feature "system-libs" to enable linking against system libraries If this feature is enabled, libsodium-sys and zstd-sys will link dynamically against system-provided libraries instead of building and linking statically the bundled (possibly outdated and vulnerable) copies of them. This feature is intended mainly for linux package maintainers. 
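As a rough sketch of the packager workflow this enables (the exact invocation is an assumption and is not part of this patch), a distribution build opting into dynamically linked libsodium and zstd might look like:

    cargo build --release --features system-libs

The follow-up patch below additionally introduces a bundled-libs default feature, after which such builds are expected to pass --no-default-features as well.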
--- src/block/Cargo.toml | 4 ++++ src/garage/Cargo.toml | 1 + src/rpc/Cargo.toml | 1 + 3 files changed, 6 insertions(+) (limited to 'src') diff --git a/src/block/Cargo.toml b/src/block/Cargo.toml index 2555a44a..ca0360b5 100644 --- a/src/block/Cargo.toml +++ b/src/block/Cargo.toml @@ -36,3 +36,7 @@ serde_bytes = "0.11" futures = "0.3" futures-util = "0.3" tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] } + + +[features] +system-libs = [ "zstd/pkg-config" ] diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 8948e750..6cc93fc0 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -76,3 +76,4 @@ base64 = "0.13" [features] kubernetes-discovery = [ "garage_rpc/kubernetes-discovery" ] k2v = [ "garage_util/k2v", "garage_api/k2v" ] +system-libs = [ "garage_block/system-libs", "garage_rpc/system-libs", "sodiumoxide/use-pkg-config" ] diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index 80a1975c..309e3fc2 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -54,3 +54,4 @@ hyper = { version = "0.14", features = ["client", "http1", "runtime", "tcp"] } [features] kubernetes-discovery = [ "kube", "k8s-openapi", "openssl", "schemars" ] +system-libs = [ "sodiumoxide/use-pkg-config" ] -- cgit v1.2.3 From 7511ba5530d56a446fefe2372409d9c2ceea17c5 Mon Sep 17 00:00:00 2001 From: Jakub Jirutka Date: Sat, 3 Sep 2022 19:05:32 +0200 Subject: Allow linking against system-provided libsqlite Unfortunately, rusqlite uses the opposite logic for enabling/disabling bundled libraries compared to the others (libsodium-sys, zstd-sys). Cargo features are very limited and don't allow enabling feature A in a dependency iff feature B is disabled. Note that lmdb-rkv-sys doesn't need any special treatment because it automatically links against system liblmdb if found via pkgconf. Linux distros should build garage with `--no-default-features --features system-libs` to disable bundled-libs and enable system-libs. --- src/db/Cargo.toml | 3 ++- src/garage/Cargo.toml | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml index f697054b..230fbaf9 100644 --- a/src/db/Cargo.toml +++ b/src/db/Cargo.toml @@ -22,7 +22,7 @@ hexdump = "0.1" tracing = "0.1.30" heed = "0.11" -rusqlite = { version = "0.27", features = ["bundled"] } +rusqlite = "0.27" sled = "0.34" # cli deps @@ -33,4 +33,5 @@ pretty_env_logger = { version = "0.4", optional = true } mktemp = "0.4" [features] +bundled-libs = [ "rusqlite/bundled" ] cli = ["clap", "pretty_env_logger"] diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 6cc93fc0..e19aac50 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -74,6 +74,14 @@ base64 = "0.13" [features] +default = [ "bundled-libs" ] kubernetes-discovery = [ "garage_rpc/kubernetes-discovery" ] k2v = [ "garage_util/k2v", "garage_api/k2v" ] + +# NOTE: bundled-libs and system-libs should be treat as mutually exclusive; +# exactly one of them should be enabled. + +# Use bundled libsqlite instead of linking against system-provided. +bundled-libs = [ "garage_db/bundled-libs" ] +# Link against system-provided libsodium and libzstd.
system-libs = [ "garage_block/system-libs", "garage_rpc/system-libs", "sodiumoxide/use-pkg-config" ] -- cgit v1.2.3 From 729a910e14bc44925175ea8240d0c16fdfc18103 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 5 Sep 2022 16:40:13 +0200 Subject: Remove Heed default features --- src/db/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml index 230fbaf9..44f0be56 100644 --- a/src/db/Cargo.toml +++ b/src/db/Cargo.toml @@ -21,7 +21,7 @@ err-derive = "0.3" hexdump = "0.1" tracing = "0.1.30" -heed = "0.11" +heed = { version = "0.11", default-features = false, features = ["lmdb"] } rusqlite = "0.27" sled = "0.34" -- cgit v1.2.3 From e7af006c1c8211bf83b5d8abb7490ef270dd8345 Mon Sep 17 00:00:00 2001 From: Jakub Jirutka Date: Sat, 3 Sep 2022 23:40:44 +0200 Subject: Make OTLP exporter optional via feature "telemetry-otlp" opentelemetry-otlp adds 48 (!) extra dependencies and increases the size of the garage binary by ~11 % (with fat LTO). --- src/api/Cargo.toml | 3 ++- src/garage/Cargo.toml | 4 +++- src/garage/main.rs | 1 + src/garage/server.rs | 8 +++++++- 4 files changed, 13 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index db77cf38..782054bd 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -55,8 +55,9 @@ url = "2.1" opentelemetry = "0.17" opentelemetry-prometheus = "0.10" -opentelemetry-otlp = "0.10" +opentelemetry-otlp = { version = "0.10", optional = true } prometheus = "0.13" [features] k2v = [ "garage_util/k2v", "garage_model/k2v" ] +telemetry-otlp = ["opentelemetry-otlp"] diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index e19aac50..8573e2fc 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -56,7 +56,7 @@ netapp = "0.4" opentelemetry = { version = "0.17", features = [ "rt-tokio" ] } opentelemetry-prometheus = "0.10" -opentelemetry-otlp = "0.10" +opentelemetry-otlp = { version = "0.10", optional = true } prometheus = "0.13" [dev-dependencies] aws-sdk-s3 = "0.8" @@ -77,6 +77,8 @@ base64 = "0.13" [features] default = [ "bundled-libs" ] kubernetes-discovery = [ "garage_rpc/kubernetes-discovery" ] k2v = [ "garage_util/k2v", "garage_api/k2v" ] +# Exporter for the OpenTelemetry Collector. +telemetry-otlp = [ "opentelemetry-otlp", "garage_api/telemetry-otlp" ] # NOTE: bundled-libs and system-libs should be treat as mutually exclusive; # exactly one of them should be enabled.
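For illustration only (the invocation below is an assumption, not something prescribed by this patch), a deployment that still wants the OTLP trace exporter now has to request the feature explicitly:

    cargo build --release --features telemetry-otlp

A default build simply leaves the feature out and avoids the roughly 48 extra transitive dependencies mentioned in the commit message above.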
diff --git a/src/garage/main.rs b/src/garage/main.rs index 89888884..8f0b377e 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -8,6 +8,7 @@ mod admin; mod cli; mod repair; mod server; +#[cfg(feature = "telemetry-otlp")] mod tracing_setup; use std::net::SocketAddr; diff --git a/src/garage/server.rs b/src/garage/server.rs index 6321357a..d328c044 100644 --- a/src/garage/server.rs +++ b/src/garage/server.rs @@ -15,6 +15,7 @@ use garage_web::run_web_server; use garage_api::k2v::api_server::K2VApiServer; use crate::admin::*; +#[cfg(feature = "telemetry-otlp")] use crate::tracing_setup::*; async fn wait_from(mut chan: watch::Receiver) { @@ -36,9 +37,14 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { info!("Initializing Garage main data store..."); let garage = Garage::new(config.clone(), background)?; - info!("Initialize tracing..."); if let Some(export_to) = config.admin.trace_sink { + info!("Initialize tracing..."); + + #[cfg(feature = "telemetry-otlp")] init_tracing(&export_to, garage.system.id)?; + + #[cfg(not(feature = "telemetry-otlp"))] + warn!("Garage was built without OTLP exporter, admin.trace_sink is ignored."); } info!("Initialize Admin API server and metrics collector..."); -- cgit v1.2.3 From ea36b9ff904a8300afb8fb1601cde88c915a810f Mon Sep 17 00:00:00 2001 From: Jakub Jirutka Date: Sun, 4 Sep 2022 00:43:48 +0200 Subject: Allow building without Prometheus exporter (/metrics endpoint) prometheus and opentelemetry-prometheus add 7 extra dependencies in total and increase the size of the garage binary by ~7 % (with fat LTO). --- src/api/Cargo.toml | 5 +++-- src/api/admin/api_server.rs | 55 +++++++++++++++++++++++++++------------------ src/garage/Cargo.toml | 8 ++++--- 3 files changed, 41 insertions(+), 27 deletions(-) (limited to 'src') diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index 782054bd..ce2d11c0 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -54,10 +54,11 @@ quick-xml = { version = "0.21", features = [ "serialize" ] } url = "2.1" opentelemetry = "0.17" -opentelemetry-prometheus = "0.10" +opentelemetry-prometheus = { version = "0.10", optional = true } opentelemetry-otlp = { version = "0.10", optional = true } -prometheus = "0.13" +prometheus = { version = "0.13", optional = true } [features] k2v = [ "garage_util/k2v", "garage_model/k2v" ] +metrics = [ "opentelemetry-prometheus", "prometheus" ] telemetry-otlp = ["opentelemetry-otlp"] diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs index c3b16715..d871d4e2 100644 --- a/src/api/admin/api_server.rs +++ b/src/api/admin/api_server.rs @@ -3,13 +3,14 @@ use std::sync::Arc; use async_trait::async_trait; use futures::future::Future; -use http::header::{ - ACCESS_CONTROL_ALLOW_METHODS, ACCESS_CONTROL_ALLOW_ORIGIN, ALLOW, CONTENT_TYPE, -}; +use http::header::{ACCESS_CONTROL_ALLOW_METHODS, ACCESS_CONTROL_ALLOW_ORIGIN, ALLOW}; use hyper::{Body, Request, Response}; -use opentelemetry::trace::{SpanRef, Tracer}; +use opentelemetry::trace::SpanRef; + +#[cfg(feature = "metrics")] use opentelemetry_prometheus::PrometheusExporter; +#[cfg(feature = "metrics")] use prometheus::{Encoder, TextEncoder}; use garage_model::garage::Garage; @@ -25,6 +26,7 @@ use crate::admin::router::{Authorization, Endpoint}; pub struct AdminApiServer { garage: Arc, + #[cfg(feature = "metrics")] exporter: PrometheusExporter, metrics_token: Option, admin_token: Option, } impl AdminApiServer { pub fn new(garage: Arc) -> Self { - let exporter =
opentelemetry_prometheus::exporter().init(); let cfg = &garage.config.admin; let metrics_token = cfg .metrics_token @@ -44,7 +45,8 @@ impl AdminApiServer { .map(|tok| format!("Bearer {}", tok)); Self { garage, - exporter, + #[cfg(feature = "metrics")] + exporter: opentelemetry_prometheus::exporter().init(), metrics_token, admin_token, } @@ -71,22 +73,31 @@ impl AdminApiServer { } fn handle_metrics(&self) -> Result, Error> { - let mut buffer = vec![]; - let encoder = TextEncoder::new(); - - let tracer = opentelemetry::global::tracer("garage"); - let metric_families = tracer.in_span("admin/gather_metrics", |_| { - self.exporter.registry().gather() - }); - - encoder - .encode(&metric_families, &mut buffer) - .ok_or_internal_error("Could not serialize metrics")?; - - Ok(Response::builder() - .status(200) - .header(CONTENT_TYPE, encoder.format_type()) - .body(Body::from(buffer))?) + #[cfg(feature = "metrics")] + { + use opentelemetry::trace::Tracer; + + let mut buffer = vec![]; + let encoder = TextEncoder::new(); + + let tracer = opentelemetry::global::tracer("garage"); + let metric_families = tracer.in_span("admin/gather_metrics", |_| { + self.exporter.registry().gather() + }); + + encoder + .encode(&metric_families, &mut buffer) + .ok_or_internal_error("Could not serialize metrics")?; + + Ok(Response::builder() + .status(200) + .header(http::header::CONTENT_TYPE, encoder.format_type()) + .body(Body::from(buffer))?) + } + #[cfg(not(feature = "metrics"))] + Err(Error::bad_request( + "Garage was built without the metrics feature".to_string(), + )) } } diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 8573e2fc..553ac57a 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -55,9 +55,9 @@ tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi netapp = "0.4" opentelemetry = { version = "0.17", features = [ "rt-tokio" ] } -opentelemetry-prometheus = "0.10" +opentelemetry-prometheus = { version = "0.10", optional = true } opentelemetry-otlp = { version = "0.10", optional = true } -prometheus = "0.13" +prometheus = { version = "0.13", optional = true } [dev-dependencies] aws-sdk-s3 = "0.8" @@ -74,9 +74,11 @@ base64 = "0.13" [features] -default = [ "bundled-libs" ] +default = [ "bundled-libs", "metrics" ] kubernetes-discovery = [ "garage_rpc/kubernetes-discovery" ] k2v = [ "garage_util/k2v", "garage_api/k2v" ] +# Prometheus exporter (/metrics endpoint). +metrics = [ "garage_api/metrics", "opentelemetry-prometheus", "prometheus" ] # Exporter for the OpenTelemetry Collector. 
telemetry-otlp = [ "opentelemetry-otlp", "garage_api/telemetry-otlp" ] -- cgit v1.2.3 From 454d8474ef2b1364750ee96dacae0d69df583f93 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 6 Sep 2022 15:43:50 +0200 Subject: Fix clippy --- src/garage/server.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/garage/server.rs b/src/garage/server.rs index d328c044..0851738d 100644 --- a/src/garage/server.rs +++ b/src/garage/server.rs @@ -37,11 +37,11 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { info!("Initializing Garage main data store..."); let garage = Garage::new(config.clone(), background)?; - if let Some(export_to) = config.admin.trace_sink { + if config.admin.trace_sink.is_some() { info!("Initialize tracing..."); #[cfg(feature = "telemetry-otlp")] - init_tracing(&export_to, garage.system.id)?; + init_tracing(config.admin.trace_sink.as_ref().unwrap(), garage.system.id)?; #[cfg(not(feature = "telemetry-otlp"))] warn!("Garage was built without OTLP exporter, admin.trace_sink is ignored."); -- cgit v1.2.3 From 48ffaaadfc790142ed9556f5227913fa8c32d2ed Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 6 Sep 2022 16:47:56 +0200 Subject: Bump versions to 0.8.0 (compatibility is broken already) --- src/api/Cargo.toml | 12 ++++++------ src/block/Cargo.toml | 8 ++++---- src/garage/Cargo.toml | 16 ++++++++-------- src/k2v-client/Cargo.toml | 2 +- src/model/Cargo.toml | 10 +++++----- src/rpc/Cargo.toml | 4 ++-- src/table/Cargo.toml | 6 +++--- src/util/Cargo.toml | 2 +- src/web/Cargo.toml | 10 +++++----- 9 files changed, 35 insertions(+), 35 deletions(-) (limited to 'src') diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index ce2d11c0..106f9014 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_api" -version = "0.7.0" +version = "0.8.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" @@ -14,11 +14,11 @@ path = "lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -garage_model = { version = "0.7.0", path = "../model" } -garage_table = { version = "0.7.0", path = "../table" } -garage_block = { version = "0.7.0", path = "../block" } -garage_util = { version = "0.7.0", path = "../util" } -garage_rpc = { version = "0.7.0", path = "../rpc" } +garage_model = { version = "0.8.0", path = "../model" } +garage_table = { version = "0.8.0", path = "../table" } +garage_block = { version = "0.8.0", path = "../block" } +garage_util = { version = "0.8.0", path = "../util" } +garage_rpc = { version = "0.8.0", path = "../rpc" } async-trait = "0.1.7" base64 = "0.13" diff --git a/src/block/Cargo.toml b/src/block/Cargo.toml index ca0360b5..8cf5a01c 100644 --- a/src/block/Cargo.toml +++ b/src/block/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_block" -version = "0.7.0" +version = "0.8.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" @@ -15,9 +15,9 @@ path = "lib.rs" [dependencies] garage_db = { version = "0.8.0", path = "../db" } -garage_rpc = { version = "0.7.0", path = "../rpc" } -garage_util = { version = "0.7.0", path = "../util" } -garage_table = { version = "0.7.0", path = "../table" } +garage_rpc = { version = "0.8.0", path = "../rpc" } +garage_util = { version = "0.8.0", path = "../util" } +garage_table = { version = "0.8.0", path = "../table" } opentelemetry = "0.17" diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 553ac57a..78579995 100644 --- 
a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage" -version = "0.7.0" +version = "0.8.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" @@ -22,13 +22,13 @@ path = "tests/lib.rs" [dependencies] garage_db = { version = "0.8.0", path = "../db" } -garage_api = { version = "0.7.0", path = "../api" } -garage_block = { version = "0.7.0", path = "../block" } -garage_model = { version = "0.7.0", path = "../model" } -garage_rpc = { version = "0.7.0", path = "../rpc" } -garage_table = { version = "0.7.0", path = "../table" } -garage_util = { version = "0.7.0", path = "../util" } -garage_web = { version = "0.7.0", path = "../web" } +garage_api = { version = "0.8.0", path = "../api" } +garage_block = { version = "0.8.0", path = "../block" } +garage_model = { version = "0.8.0", path = "../model" } +garage_rpc = { version = "0.8.0", path = "../rpc" } +garage_table = { version = "0.8.0", path = "../table" } +garage_util = { version = "0.8.0", path = "../util" } +garage_web = { version = "0.8.0", path = "../web" } bytes = "1.0" bytesize = "1.1" diff --git a/src/k2v-client/Cargo.toml b/src/k2v-client/Cargo.toml index 2f8a2679..0f0b76ae 100644 --- a/src/k2v-client/Cargo.toml +++ b/src/k2v-client/Cargo.toml @@ -22,7 +22,7 @@ tokio = "1.17.0" # cli deps clap = { version = "3.1.18", optional = true, features = ["derive", "env"] } -garage_util = { version = "0.7.0", path = "../util", optional = true } +garage_util = { version = "0.8.0", path = "../util", optional = true } [features] diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index d908dc01..7b831538 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_model" -version = "0.7.0" +version = "0.8.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" @@ -15,10 +15,10 @@ path = "lib.rs" [dependencies] garage_db = { version = "0.8.0", path = "../db" } -garage_rpc = { version = "0.7.0", path = "../rpc" } -garage_table = { version = "0.7.0", path = "../table" } -garage_block = { version = "0.7.0", path = "../block" } -garage_util = { version = "0.7.0", path = "../util" } +garage_rpc = { version = "0.8.0", path = "../rpc" } +garage_table = { version = "0.8.0", path = "../table" } +garage_block = { version = "0.8.0", path = "../block" } +garage_util = { version = "0.8.0", path = "../util" } garage_model_050 = { package = "garage_model", version = "0.5.1" } async-trait = "0.1.7" diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index 309e3fc2..21841a02 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_rpc" -version = "0.7.0" +version = "0.8.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" @@ -14,7 +14,7 @@ path = "lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -garage_util = { version = "0.7.0", path = "../util" } +garage_util = { version = "0.8.0", path = "../util" } arc-swap = "1.0" bytes = "1.0" diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml index 6de37cda..ae52e8d7 100644 --- a/src/table/Cargo.toml +++ b/src/table/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_table" -version = "0.7.0" +version = "0.8.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" @@ -15,8 +15,8 @@ path = "lib.rs" [dependencies] garage_db = { version = "0.8.0", path = "../db" } -garage_rpc = { version = "0.7.0", path = "../rpc" } -garage_util = { version = "0.7.0", path = "../util" } 
+garage_rpc = { version = "0.8.0", path = "../rpc" } +garage_util = { version = "0.8.0", path = "../util" } opentelemetry = "0.17" diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index 783fb3fc..5f3e5c57 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_util" -version = "0.7.0" +version = "0.8.0" authors = ["Alex Auvolat "] edition = "2018" license = "AGPL-3.0" diff --git a/src/web/Cargo.toml b/src/web/Cargo.toml index 59a1231d..7bf70c55 100644 --- a/src/web/Cargo.toml +++ b/src/web/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "garage_web" -version = "0.7.0" +version = "0.8.0" authors = ["Alex Auvolat ", "Quentin Dufour "] edition = "2018" license = "AGPL-3.0" @@ -14,10 +14,10 @@ path = "lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -garage_api = { version = "0.7.0", path = "../api" } -garage_model = { version = "0.7.0", path = "../model" } -garage_util = { version = "0.7.0", path = "../util" } -garage_table = { version = "0.7.0", path = "../table" } +garage_api = { version = "0.8.0", path = "../api" } +garage_model = { version = "0.8.0", path = "../model" } +garage_util = { version = "0.8.0", path = "../util" } +garage_table = { version = "0.8.0", path = "../table" } err-derive = "0.3" tracing = "0.1.30" -- cgit v1.2.3 From b886c75450e3ee6a7c2b0a8265d7ada20a4d9d75 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 6 Sep 2022 17:09:43 +0200 Subject: Make all DB engines optional build features --- src/db/Cargo.toml | 8 +++++--- src/db/lib.rs | 4 ++++ src/garage/Cargo.toml | 12 ++++++++++-- src/model/Cargo.toml | 3 +++ src/model/garage.rs | 30 ++++++++++++++++++++++++++++-- 5 files changed, 50 insertions(+), 7 deletions(-) (limited to 'src') diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml index 44f0be56..62dda2ca 100644 --- a/src/db/Cargo.toml +++ b/src/db/Cargo.toml @@ -21,9 +21,9 @@ err-derive = "0.3" hexdump = "0.1" tracing = "0.1.30" -heed = { version = "0.11", default-features = false, features = ["lmdb"] } -rusqlite = "0.27" -sled = "0.34" +heed = { version = "0.11", default-features = false, features = ["lmdb"], optional = true } +rusqlite = { version = "0.27", optional = true } +sled = { version = "0.34", optional = true } # cli deps clap = { version = "3.1.18", optional = true, features = ["derive", "env"] } @@ -35,3 +35,5 @@ mktemp = "0.4" [features] bundled-libs = [ "rusqlite/bundled" ] cli = ["clap", "pretty_env_logger"] +lmdb = [ "heed" ] +sqlite = [ "rusqlite" ] diff --git a/src/db/lib.rs b/src/db/lib.rs index f185114e..5304c195 100644 --- a/src/db/lib.rs +++ b/src/db/lib.rs @@ -1,8 +1,12 @@ #[macro_use] +#[cfg(feature = "sqlite")] extern crate tracing; +#[cfg(feature = "lmdb")] pub mod lmdb_adapter; +#[cfg(feature = "sled")] pub mod sled_adapter; +#[cfg(feature = "sqlite")] pub mod sqlite_adapter; pub mod counted_tree_hack; diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 78579995..00b16ded 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -74,9 +74,17 @@ base64 = "0.13" [features] -default = [ "bundled-libs", "metrics" ] -kubernetes-discovery = [ "garage_rpc/kubernetes-discovery" ] +default = [ "bundled-libs", "metrics", "sled" ] + k2v = [ "garage_util/k2v", "garage_api/k2v" ] + +# Database engines, Sled is still our default even though we don't like it +sled = [ "garage_model/sled" ] +lmdb = [ "garage_model/lmdb" ] +sqlite = [ "garage_model/sqlite" ] + +# Automatic registration and discovery via Kubernetes 
API +kubernetes-discovery = [ "garage_rpc/kubernetes-discovery" ] # Prometheus exporter (/metrics endpoint). metrics = [ "garage_api/metrics", "opentelemetry-prometheus", "prometheus" ] # Exporter for the OpenTelemetry Collector. diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 7b831538..cb0017b2 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -46,3 +46,6 @@ netapp = "0.4" [features] k2v = [ "garage_util/k2v" ] +lmdb = [ "garage_db/lmdb" ] +sled = [ "garage_db/sled" ] +sqlite = [ "garage_db/sqlite" ] diff --git a/src/model/garage.rs b/src/model/garage.rs index 15769a17..19eecb1e 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -80,6 +80,8 @@ impl Garage { let mut db_path = config.metadata_dir.clone(); std::fs::create_dir_all(&db_path).expect("Unable to create Garage meta data directory"); let db = match config.db_engine.as_str() { + // ---- Sled DB ---- + #[cfg(feature = "sled")] "sled" => { db_path.push("db"); info!("Opening Sled database at: {}", db_path.display()); @@ -91,6 +93,10 @@ impl Garage { .expect("Unable to open sled DB"); db::sled_adapter::SledDb::init(db) } + #[cfg(not(feature = "sled"))] + "sled" => return Err(Error::Message("sled db not available in this build".into())), + // ---- Sqlite DB ---- + #[cfg(feature = "sqlite")] "sqlite" | "sqlite3" | "rusqlite" => { db_path.push("db.sqlite"); info!("Opening Sqlite database at: {}", db_path.display()); @@ -98,6 +104,14 @@ impl Garage { .expect("Unable to open sqlite DB"); db::sqlite_adapter::SqliteDb::init(db) } + #[cfg(not(feature = "sqlite"))] + "sqlite" | "sqlite3" | "rusqlite" => { + return Err(Error::Message( + "sqlite db not available in this build".into(), + )) + } + // ---- LMDB DB ---- + #[cfg(feature = "lmdb")] "lmdb" | "heed" => { db_path.push("db.lmdb"); info!("Opening LMDB database at: {}", db_path.display()); @@ -111,10 +125,22 @@ impl Garage { .expect("Unable to open LMDB DB"); db::lmdb_adapter::LmdbDb::init(db) } + #[cfg(not(feature = "lmdb"))] + "lmdb" | "heed" => return Err(Error::Message("lmdb db not available in this build".into())), + // ---- Unavailable DB engine ---- e => { return Err(Error::Message(format!( - "Unsupported DB engine: {} (options: sled, sqlite, lmdb)", - e + "Unsupported DB engine: {} (options: {})", + e, + vec![ + #[cfg(feature = "sled")] + "sled", + #[cfg(feature = "sqlite")] + "sqlite", + #[cfg(feature = "lmdb")] + "lmdb", + ] + .join(", ") ))); } }; -- cgit v1.2.3 From 431dee050f9dd1454ac89d20de214f973cbb387f Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 6 Sep 2022 17:25:44 +0200 Subject: Remove opentelemetry-otlp dep in api/ --- src/api/Cargo.toml | 2 -- src/garage/Cargo.toml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'src') diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index 106f9014..eb87a4fb 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -55,10 +55,8 @@ url = "2.1" opentelemetry = "0.17" opentelemetry-prometheus = { version = "0.10", optional = true } -opentelemetry-otlp = { version = "0.10", optional = true } prometheus = { version = "0.13", optional = true } [features] k2v = [ "garage_util/k2v", "garage_model/k2v" ] metrics = [ "opentelemetry-prometheus", "prometheus" ] -telemetry-otlp = ["opentelemetry-otlp"] diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index 00b16ded..b08e9439 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -88,7 +88,7 @@ kubernetes-discovery = [ "garage_rpc/kubernetes-discovery" ] # Prometheus exporter (/metrics endpoint). 
metrics = [ "garage_api/metrics", "opentelemetry-prometheus", "prometheus" ] # Exporter for the OpenTelemetry Collector. -telemetry-otlp = [ "opentelemetry-otlp", "garage_api/telemetry-otlp" ] +telemetry-otlp = [ "opentelemetry-otlp" ] # NOTE: bundled-libs and system-libs should be treat as mutually exclusive; # exactly one of them should be enabled. -- cgit v1.2.3 From 1e92e9f78251ed72e3e5eb27ed3f389f9f53c488 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 6 Sep 2022 17:29:46 +0200 Subject: Disable k2v tests when feature is disabled --- src/garage/tests/lib.rs | 1 + 1 file changed, 1 insertion(+) (limited to 'src') diff --git a/src/garage/tests/lib.rs b/src/garage/tests/lib.rs index 0106ad10..d15639b9 100644 --- a/src/garage/tests/lib.rs +++ b/src/garage/tests/lib.rs @@ -3,5 +3,6 @@ mod common; mod admin; mod bucket; +#[cfg(feature = "k2v")] mod k2v; mod s3; -- cgit v1.2.3 From 0f5689c16920479066277db2880e2ca87f7ca602 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 6 Sep 2022 17:52:50 +0200 Subject: Include code from v0.5.1 directly to remove dependencies --- src/model/Cargo.toml | 1 - src/model/key_table.rs | 2 +- src/model/lib.rs | 3 + src/model/migrate.rs | 2 +- src/model/prev/mod.rs | 1 + src/model/prev/v051/bucket_table.rs | 63 +++++++++++++++ src/model/prev/v051/key_table.rs | 51 ++++++++++++ src/model/prev/v051/mod.rs | 4 + src/model/prev/v051/object_table.rs | 150 +++++++++++++++++++++++++++++++++++ src/model/prev/v051/version_table.rs | 79 ++++++++++++++++++ src/model/s3/object_table.rs | 2 +- src/model/s3/version_table.rs | 2 +- 12 files changed, 355 insertions(+), 5 deletions(-) create mode 100644 src/model/prev/mod.rs create mode 100644 src/model/prev/v051/bucket_table.rs create mode 100644 src/model/prev/v051/key_table.rs create mode 100644 src/model/prev/v051/mod.rs create mode 100644 src/model/prev/v051/object_table.rs create mode 100644 src/model/prev/v051/version_table.rs (limited to 'src') diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index cb0017b2..bbcfe89c 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -19,7 +19,6 @@ garage_rpc = { version = "0.8.0", path = "../rpc" } garage_table = { version = "0.8.0", path = "../table" } garage_block = { version = "0.8.0", path = "../block" } garage_util = { version = "0.8.0", path = "../util" } -garage_model_050 = { package = "garage_model", version = "0.5.1" } async-trait = "0.1.7" arc-swap = "1.0" diff --git a/src/model/key_table.rs b/src/model/key_table.rs index 330e83f0..7288f6e4 100644 --- a/src/model/key_table.rs +++ b/src/model/key_table.rs @@ -6,7 +6,7 @@ use garage_util::data::*; use crate::permission::BucketKeyPerm; -use garage_model_050::key_table as old; +use crate::prev::v051::key_table as old; /// An api key #[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] diff --git a/src/model/lib.rs b/src/model/lib.rs index 7c9d9270..4f20ea46 100644 --- a/src/model/lib.rs +++ b/src/model/lib.rs @@ -1,6 +1,9 @@ #[macro_use] extern crate tracing; +// For migration from previous versions +pub(crate) mod prev; + pub mod permission; pub mod index_counter; diff --git a/src/model/migrate.rs b/src/model/migrate.rs index 5fc67069..cd6ad26a 100644 --- a/src/model/migrate.rs +++ b/src/model/migrate.rs @@ -5,7 +5,7 @@ use garage_util::data::*; use garage_util::error::Error as GarageError; use garage_util::time::*; -use garage_model_050::bucket_table as old_bucket; +use crate::prev::v051::bucket_table as old_bucket; use crate::bucket_alias_table::*; use crate::bucket_table::*; diff --git 
a/src/model/prev/mod.rs b/src/model/prev/mod.rs new file mode 100644 index 00000000..68bb1502 --- /dev/null +++ b/src/model/prev/mod.rs @@ -0,0 +1 @@ +pub(crate) mod v051; diff --git a/src/model/prev/v051/bucket_table.rs b/src/model/prev/v051/bucket_table.rs new file mode 100644 index 00000000..0c52b6ea --- /dev/null +++ b/src/model/prev/v051/bucket_table.rs @@ -0,0 +1,63 @@ +use serde::{Deserialize, Serialize}; + +use garage_table::crdt::Crdt; +use garage_table::*; + +use super::key_table::PermissionSet; + +/// A bucket is a collection of objects +/// +/// Its parameters are not directly accessible as: +/// - It must be possible to merge paramaters, hence the use of a LWW CRDT. +/// - A bucket has 2 states, Present or Deleted and parameters make sense only if present. +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub struct Bucket { + /// Name of the bucket + pub name: String, + /// State, and configuration if not deleted, of the bucket + pub state: crdt::Lww, +} + +/// State of a bucket +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub enum BucketState { + /// The bucket is deleted + Deleted, + /// The bucket exists + Present(BucketParams), +} + +impl Crdt for BucketState { + fn merge(&mut self, o: &Self) { + match o { + BucketState::Deleted => *self = BucketState::Deleted, + BucketState::Present(other_params) => { + if let BucketState::Present(params) = self { + params.merge(other_params); + } + } + } + } +} + +/// Configuration for a bucket +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub struct BucketParams { + /// Map of key with access to the bucket, and what kind of access they give + pub authorized_keys: crdt::LwwMap, + /// Is the bucket served as http + pub website: crdt::Lww, +} + +impl Crdt for BucketParams { + fn merge(&mut self, o: &Self) { + self.authorized_keys.merge(&o.authorized_keys); + self.website.merge(&o.website); + } +} + +impl Crdt for Bucket { + fn merge(&mut self, other: &Self) { + self.state.merge(&other.state); + } +} diff --git a/src/model/prev/v051/key_table.rs b/src/model/prev/v051/key_table.rs new file mode 100644 index 00000000..dab6caa7 --- /dev/null +++ b/src/model/prev/v051/key_table.rs @@ -0,0 +1,51 @@ +use serde::{Deserialize, Serialize}; + +use garage_table::crdt::*; +use garage_table::*; + +/// An api key +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub struct Key { + /// The id of the key (immutable), used as partition key + pub key_id: String, + + /// The secret_key associated + pub secret_key: String, + + /// Name for the key + pub name: crdt::Lww, + + /// Is the key deleted + pub deleted: crdt::Bool, + + /// Buckets in which the key is authorized. 
Empty if `Key` is deleted + // CRDT interaction: deleted implies authorized_buckets is empty + pub authorized_buckets: crdt::LwwMap, +} + +/// Permission given to a key in a bucket +#[derive(PartialOrd, Ord, PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] +pub struct PermissionSet { + /// The key can be used to read the bucket + pub allow_read: bool, + /// The key can be used to write in the bucket + pub allow_write: bool, +} + +impl AutoCrdt for PermissionSet { + const WARN_IF_DIFFERENT: bool = true; +} + +impl Crdt for Key { + fn merge(&mut self, other: &Self) { + self.name.merge(&other.name); + self.deleted.merge(&other.deleted); + + if self.deleted.get() { + self.authorized_buckets.clear(); + } else { + self.authorized_buckets.merge(&other.authorized_buckets); + } + } +} + diff --git a/src/model/prev/v051/mod.rs b/src/model/prev/v051/mod.rs new file mode 100644 index 00000000..7a954752 --- /dev/null +++ b/src/model/prev/v051/mod.rs @@ -0,0 +1,4 @@ +pub(crate) mod bucket_table; +pub(crate) mod key_table; +pub(crate) mod object_table; +pub(crate) mod version_table; diff --git a/src/model/prev/v051/object_table.rs b/src/model/prev/v051/object_table.rs new file mode 100644 index 00000000..fe35d683 --- /dev/null +++ b/src/model/prev/v051/object_table.rs @@ -0,0 +1,150 @@ +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; + +use garage_util::data::*; + +use garage_table::crdt::*; + +/// An object +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub struct Object { + /// The bucket in which the object is stored, used as partition key + pub bucket: String, + + /// The key at which the object is stored in its bucket, used as sorting key + pub key: String, + + /// The list of currenty stored versions of the object + versions: Vec, +} + +impl Object { + /// Get a list of currently stored versions of `Object` + pub fn versions(&self) -> &[ObjectVersion] { + &self.versions[..] 
+ } +} + +/// Informations about a version of an object +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub struct ObjectVersion { + /// Id of the version + pub uuid: Uuid, + /// Timestamp of when the object was created + pub timestamp: u64, + /// State of the version + pub state: ObjectVersionState, +} + +/// State of an object version +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub enum ObjectVersionState { + /// The version is being received + Uploading(ObjectVersionHeaders), + /// The version is fully received + Complete(ObjectVersionData), + /// The version uploaded containded errors or the upload was explicitly aborted + Aborted, +} + +impl Crdt for ObjectVersionState { + fn merge(&mut self, other: &Self) { + use ObjectVersionState::*; + match other { + Aborted => { + *self = Aborted; + } + Complete(b) => match self { + Aborted => {} + Complete(a) => { + a.merge(b); + } + Uploading(_) => { + *self = Complete(b.clone()); + } + }, + Uploading(_) => {} + } + } +} + +/// Data stored in object version +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] +pub enum ObjectVersionData { + /// The object was deleted, this Version is a tombstone to mark it as such + DeleteMarker, + /// The object is short, it's stored inlined + Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec), + /// The object is not short, Hash of first block is stored here, next segments hashes are + /// stored in the version table + FirstBlock(ObjectVersionMeta, Hash), +} + +impl AutoCrdt for ObjectVersionData { + const WARN_IF_DIFFERENT: bool = true; +} + +/// Metadata about the object version +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] +pub struct ObjectVersionMeta { + /// Headers to send to the client + pub headers: ObjectVersionHeaders, + /// Size of the object + pub size: u64, + /// etag of the object + pub etag: String, +} + +/// Additional headers for an object +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)] +pub struct ObjectVersionHeaders { + /// Content type of the object + pub content_type: String, + /// Any other http headers to send + pub other: BTreeMap, +} + +impl ObjectVersion { + fn cmp_key(&self) -> (u64, Uuid) { + (self.timestamp, self.uuid) + } + + /// Is the object version completely received + pub fn is_complete(&self) -> bool { + matches!(self.state, ObjectVersionState::Complete(_)) + } +} + +impl Crdt for Object { + fn merge(&mut self, other: &Self) { + // Merge versions from other into here + for other_v in other.versions.iter() { + match self + .versions + .binary_search_by(|v| v.cmp_key().cmp(&other_v.cmp_key())) + { + Ok(i) => { + self.versions[i].state.merge(&other_v.state); + } + Err(i) => { + self.versions.insert(i, other_v.clone()); + } + } + } + + // Remove versions which are obsolete, i.e. those that come + // before the last version which .is_complete(). 
+ let last_complete = self + .versions + .iter() + .enumerate() + .rev() + .find(|(_, v)| v.is_complete()) + .map(|(vi, _)| vi); + + if let Some(last_vi) = last_complete { + self.versions = self.versions.drain(last_vi..).collect::>(); + } + } +} + diff --git a/src/model/prev/v051/version_table.rs b/src/model/prev/v051/version_table.rs new file mode 100644 index 00000000..1e658f91 --- /dev/null +++ b/src/model/prev/v051/version_table.rs @@ -0,0 +1,79 @@ +use serde::{Deserialize, Serialize}; + +use garage_util::data::*; + +use garage_table::crdt::*; +use garage_table::*; + +/// A version of an object +#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +pub struct Version { + /// UUID of the version, used as partition key + pub uuid: Uuid, + + // Actual data: the blocks for this version + // In the case of a multipart upload, also store the etags + // of individual parts and check them when doing CompleteMultipartUpload + /// Is this version deleted + pub deleted: crdt::Bool, + /// list of blocks of data composing the version + pub blocks: crdt::Map, + /// Etag of each part in case of a multipart upload, empty otherwise + pub parts_etags: crdt::Map, + + // Back link to bucket+key so that we can figure if + // this was deleted later on + /// Bucket in which the related object is stored + pub bucket: String, + /// Key in which the related object is stored + pub key: String, +} + +#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)] +pub struct VersionBlockKey { + /// Number of the part + pub part_number: u64, + /// Offset of this sub-segment in its part + pub offset: u64, +} + +impl Ord for VersionBlockKey { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.part_number + .cmp(&other.part_number) + .then(self.offset.cmp(&other.offset)) + } +} + +impl PartialOrd for VersionBlockKey { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +/// Informations about a single block +#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize, Deserialize)] +pub struct VersionBlock { + /// Blake2 sum of the block + pub hash: Hash, + /// Size of the block + pub size: u64, +} + +impl AutoCrdt for VersionBlock { + const WARN_IF_DIFFERENT: bool = true; +} + +impl Crdt for Version { + fn merge(&mut self, other: &Self) { + self.deleted.merge(&other.deleted); + + if self.deleted.get() { + self.blocks.clear(); + self.parts_etags.clear(); + } else { + self.blocks.merge(&other.blocks); + self.parts_etags.merge(&other.parts_etags); + } + } +} diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index a3914c36..a151f1b1 100644 --- a/src/model/s3/object_table.rs +++ b/src/model/s3/object_table.rs @@ -14,7 +14,7 @@ use garage_table::*; use crate::index_counter::*; use crate::s3::version_table::*; -use garage_model_050::object_table as old; +use crate::prev::v051::object_table as old; pub const OBJECTS: &str = "objects"; pub const UNFINISHED_UPLOADS: &str = "unfinished_uploads"; diff --git a/src/model/s3/version_table.rs b/src/model/s3/version_table.rs index 881c245a..b545e66a 100644 --- a/src/model/s3/version_table.rs +++ b/src/model/s3/version_table.rs @@ -12,7 +12,7 @@ use garage_table::*; use crate::s3::block_ref_table::*; -use garage_model_050::version_table as old; +use crate::prev::v051::version_table as old; /// A version of an object #[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] -- cgit v1.2.3 From 6f02c36a89d93b04944a3f0882b6f6b703d9c012 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 6 Sep 2022 
17:59:41 +0200 Subject: cargo fmt --- src/model/prev/v051/key_table.rs | 1 - src/model/prev/v051/object_table.rs | 1 - 2 files changed, 2 deletions(-) (limited to 'src') diff --git a/src/model/prev/v051/key_table.rs b/src/model/prev/v051/key_table.rs index dab6caa7..fee24741 100644 --- a/src/model/prev/v051/key_table.rs +++ b/src/model/prev/v051/key_table.rs @@ -48,4 +48,3 @@ impl Crdt for Key { } } } - diff --git a/src/model/prev/v051/object_table.rs b/src/model/prev/v051/object_table.rs index fe35d683..cb59b309 100644 --- a/src/model/prev/v051/object_table.rs +++ b/src/model/prev/v051/object_table.rs @@ -147,4 +147,3 @@ impl Crdt for Object { } } } - -- cgit v1.2.3 From c2cc08852bcbd94bad5c15c39e7145c0496d7241 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 6 Sep 2022 19:31:42 +0200 Subject: Reenable node ordering --- src/block/manager.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index a9def3b0..66a454b0 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -9,7 +9,7 @@ use async_trait::async_trait; use bytes::Bytes; use serde::{Deserialize, Serialize}; -use futures::{Stream, TryStreamExt}; +use futures::Stream; use futures_util::stream::StreamExt; use tokio::fs; use tokio::io::{AsyncReadExt, AsyncWriteExt, BufReader}; @@ -191,7 +191,7 @@ impl BlockManager { order_tag: Option, ) -> Result<(DataBlockHeader, ByteStream), Error> { let who = self.replication.read_nodes(hash); - //let who = self.system.rpc.request_order(&who); + let who = self.system.rpc.request_order(&who); for node in who.iter() { let node_id = NodeID::from(*node); @@ -238,7 +238,7 @@ impl BlockManager { order_tag: Option, ) -> Result { let who = self.replication.read_nodes(hash); - //let who = self.system.rpc.request_order(&who); + let who = self.system.rpc.request_order(&who); for node in who.iter() { let node_id = NodeID::from(*node); @@ -296,9 +296,7 @@ impl BlockManager { > { let (header, stream) = self.rpc_get_raw_block_streaming(hash, order_tag).await?; match header { - DataBlockHeader::Plain => Ok(Box::pin(stream.map_err(|_| { - std::io::Error::new(std::io::ErrorKind::Other, "netapp stream error") - }))), + DataBlockHeader::Plain => Ok(Box::pin(stream)), DataBlockHeader::Compressed => { // Too many things, I hate it. 
let reader = stream_asyncread(stream); -- cgit v1.2.3 From 907054775dc71a10a92ab96112889db9113130ab Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 6 Sep 2022 22:25:23 +0200 Subject: Faster copy, better get error message --- src/api/s3/copy.rs | 12 +++++------- src/api/s3/get.rs | 4 ++-- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'src') diff --git a/src/api/s3/copy.rs b/src/api/s3/copy.rs index 10cf5935..a1a8c9a4 100644 --- a/src/api/s3/copy.rs +++ b/src/api/s3/copy.rs @@ -9,6 +9,7 @@ use bytes::Bytes; use hyper::{Body, Request, Response}; use serde::Serialize; +use garage_rpc::netapp::bytes_buf::BytesBuf; use garage_rpc::rpc_helper::OrderTag; use garage_table::*; use garage_util::data::*; @@ -566,7 +567,7 @@ type BlockStreamItem = Result; struct Defragmenter> { block_size: usize, block_stream: Pin>>, - buffer: Vec, + buffer: BytesBuf, hash: Option, } @@ -575,7 +576,7 @@ impl> Defragmenter { Self { block_size, block_stream, - buffer: vec![], + buffer: BytesBuf::new(), hash: None, } } @@ -593,7 +594,7 @@ impl> Defragmenter { if self.buffer.is_empty() { let (next_block, next_block_hash) = self.block_stream.next().await.unwrap()?; - self.buffer = next_block.to_vec(); // TODO TOO MUCH COPY + self.buffer.extend(next_block); self.hash = next_block_hash; } else if self.buffer.len() + peeked_next_block.len() > self.block_size { break; @@ -604,10 +605,7 @@ impl> Defragmenter { } } - Ok(( - Bytes::from(std::mem::take(&mut self.buffer)), - self.hash.take(), - )) + Ok((self.buffer.take_all(), self.hash.take())) } } diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index dfc284fe..dd95f6e7 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -274,11 +274,11 @@ pub async fn handle_get( .block_manager .rpc_get_block_streaming(&hash, Some(order_stream.order(i as u64))) .await - .unwrap_or_else(|_| { + .unwrap_or_else(|e| { Box::pin(futures::stream::once(async move { Err(std::io::Error::new( std::io::ErrorKind::Other, - "Could not get next block", + format!("Could not get block {}: {}", i, e), )) })) }) -- cgit v1.2.3 From db61f41030678c5756c844c8aa41a210c658769e Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 7 Sep 2022 11:59:56 +0200 Subject: Move GIT_VERSION injection later in build chain to reduce build times --- src/api/admin/cluster.rs | 2 +- src/garage/admin.rs | 2 +- src/garage/cli/structs.rs | 104 +++++++++++++++++++++++----------------------- src/garage/main.rs | 4 +- src/model/Cargo.toml | 1 + src/model/lib.rs | 1 + src/model/version.rs | 7 ++++ src/rpc/system.rs | 11 ++--- src/util/Cargo.toml | 1 - src/util/lib.rs | 1 - src/util/version.rs | 7 ---- 11 files changed, 69 insertions(+), 72 deletions(-) create mode 100644 src/model/version.rs delete mode 100644 src/util/version.rs (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 4b7716a3..8e6dfb3f 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -18,7 +18,7 @@ use crate::helpers::{json_ok_response, parse_json_body}; pub async fn handle_get_cluster_status(garage: &Arc) -> Result, Error> { let res = GetClusterStatusResponse { node: hex::encode(garage.system.id), - garage_version: garage.system.garage_version(), + garage_version: garage_model::version::garage_version(), db_engine: garage.db.engine(), known_nodes: garage .system diff --git a/src/garage/admin.rs b/src/garage/admin.rs index 71ee608c..f4c182fe 100644 --- a/src/garage/admin.rs +++ b/src/garage/admin.rs @@ -740,7 +740,7 @@ impl AdminRpcHandler { writeln!( &mut ret, "\nGarage version: {}", - 
self.garage.system.garage_version(), + garage_model::version::garage_version(), ) .unwrap(); writeln!(&mut ret, "\nDatabase engine: {}", self.garage.db.engine()).unwrap(); diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 9274f80f..018c8119 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -1,65 +1,65 @@ use serde::{Deserialize, Serialize}; - -use garage_util::version; use structopt::StructOpt; +use garage_model::version::garage_version; + #[derive(StructOpt, Debug)] pub enum Command { /// Run Garage server - #[structopt(name = "server", version = version::garage())] + #[structopt(name = "server", version = garage_version())] Server, /// Get network status - #[structopt(name = "status", version = version::garage())] + #[structopt(name = "status", version = garage_version())] Status, /// Operations on individual Garage nodes - #[structopt(name = "node", version = version::garage())] + #[structopt(name = "node", version = garage_version())] Node(NodeOperation), /// Operations on the assignation of node roles in the cluster layout - #[structopt(name = "layout", version = version::garage())] + #[structopt(name = "layout", version = garage_version())] Layout(LayoutOperation), /// Operations on buckets - #[structopt(name = "bucket", version = version::garage())] + #[structopt(name = "bucket", version = garage_version())] Bucket(BucketOperation), /// Operations on S3 access keys - #[structopt(name = "key", version = version::garage())] + #[structopt(name = "key", version = garage_version())] Key(KeyOperation), /// Run migrations from previous Garage version /// (DO NOT USE WITHOUT READING FULL DOCUMENTATION) - #[structopt(name = "migrate", version = version::garage())] + #[structopt(name = "migrate", version = garage_version())] Migrate(MigrateOpt), /// Start repair of node data on remote node - #[structopt(name = "repair", version = version::garage())] + #[structopt(name = "repair", version = garage_version())] Repair(RepairOpt), /// Offline reparation of node data (these repairs must be run offline /// directly on the server node) - #[structopt(name = "offline-repair", version = version::garage())] + #[structopt(name = "offline-repair", version = garage_version())] OfflineRepair(OfflineRepairOpt), /// Gather node statistics - #[structopt(name = "stats", version = version::garage())] + #[structopt(name = "stats", version = garage_version())] Stats(StatsOpt), /// Manage background workers - #[structopt(name = "worker", version = version::garage())] + #[structopt(name = "worker", version = garage_version())] Worker(WorkerOpt), } #[derive(StructOpt, Debug)] pub enum NodeOperation { /// Print identifier (public key) of this Garage node - #[structopt(name = "id", version = version::garage())] + #[structopt(name = "id", version = garage_version())] NodeId(NodeIdOpt), /// Connect to Garage node that is currently isolated from the system - #[structopt(name = "connect", version = version::garage())] + #[structopt(name = "connect", version = garage_version())] Connect(ConnectNodeOpt), } @@ -80,23 +80,23 @@ pub struct ConnectNodeOpt { #[derive(StructOpt, Debug)] pub enum LayoutOperation { /// Assign role to Garage node - #[structopt(name = "assign", version = version::garage())] + #[structopt(name = "assign", version = garage_version())] Assign(AssignRoleOpt), /// Remove role from Garage cluster node - #[structopt(name = "remove", version = version::garage())] + #[structopt(name = "remove", version = garage_version())] Remove(RemoveRoleOpt), /// Show roles 
currently assigned to nodes and changes staged for commit - #[structopt(name = "show", version = version::garage())] + #[structopt(name = "show", version = garage_version())] Show, /// Apply staged changes to cluster layout - #[structopt(name = "apply", version = version::garage())] + #[structopt(name = "apply", version = garage_version())] Apply(ApplyLayoutOpt), /// Revert staged changes to cluster layout - #[structopt(name = "revert", version = version::garage())] + #[structopt(name = "revert", version = garage_version())] Revert(RevertLayoutOpt), } @@ -151,43 +151,43 @@ pub struct RevertLayoutOpt { #[derive(Serialize, Deserialize, StructOpt, Debug)] pub enum BucketOperation { /// List buckets - #[structopt(name = "list", version = version::garage())] + #[structopt(name = "list", version = garage_version())] List, /// Get bucket info - #[structopt(name = "info", version = version::garage())] + #[structopt(name = "info", version = garage_version())] Info(BucketOpt), /// Create bucket - #[structopt(name = "create", version = version::garage())] + #[structopt(name = "create", version = garage_version())] Create(BucketOpt), /// Delete bucket - #[structopt(name = "delete", version = version::garage())] + #[structopt(name = "delete", version = garage_version())] Delete(DeleteBucketOpt), /// Alias bucket under new name - #[structopt(name = "alias", version = version::garage())] + #[structopt(name = "alias", version = garage_version())] Alias(AliasBucketOpt), /// Remove bucket alias - #[structopt(name = "unalias", version = version::garage())] + #[structopt(name = "unalias", version = garage_version())] Unalias(UnaliasBucketOpt), /// Allow key to read or write to bucket - #[structopt(name = "allow", version = version::garage())] + #[structopt(name = "allow", version = garage_version())] Allow(PermBucketOpt), /// Deny key from reading or writing to bucket - #[structopt(name = "deny", version = version::garage())] + #[structopt(name = "deny", version = garage_version())] Deny(PermBucketOpt), /// Expose as website or not - #[structopt(name = "website", version = version::garage())] + #[structopt(name = "website", version = garage_version())] Website(WebsiteOpt), /// Set the quotas for this bucket - #[structopt(name = "set-quotas", version = version::garage())] + #[structopt(name = "set-quotas", version = garage_version())] SetQuotas(SetQuotasOpt), } @@ -293,35 +293,35 @@ pub struct SetQuotasOpt { #[derive(Serialize, Deserialize, StructOpt, Debug)] pub enum KeyOperation { /// List keys - #[structopt(name = "list", version = version::garage())] + #[structopt(name = "list", version = garage_version())] List, /// Get key info - #[structopt(name = "info", version = version::garage())] + #[structopt(name = "info", version = garage_version())] Info(KeyOpt), /// Create new key - #[structopt(name = "new", version = version::garage())] + #[structopt(name = "new", version = garage_version())] New(KeyNewOpt), /// Rename key - #[structopt(name = "rename", version = version::garage())] + #[structopt(name = "rename", version = garage_version())] Rename(KeyRenameOpt), /// Delete key - #[structopt(name = "delete", version = version::garage())] + #[structopt(name = "delete", version = garage_version())] Delete(KeyDeleteOpt), /// Set permission flags for key - #[structopt(name = "allow", version = version::garage())] + #[structopt(name = "allow", version = garage_version())] Allow(KeyPermOpt), /// Unset permission flags for key - #[structopt(name = "deny", version = version::garage())] + #[structopt(name = "deny", 
version = garage_version())] Deny(KeyPermOpt), /// Import key - #[structopt(name = "import", version = version::garage())] + #[structopt(name = "import", version = garage_version())] Import(KeyImportOpt), } @@ -393,7 +393,7 @@ pub struct MigrateOpt { #[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] pub enum MigrateWhat { /// Migrate buckets and permissions from v0.5.0 - #[structopt(name = "buckets050", version = version::garage())] + #[structopt(name = "buckets050", version = garage_version())] Buckets050, } @@ -414,19 +414,19 @@ pub struct RepairOpt { #[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] pub enum RepairWhat { /// Only do a full sync of metadata tables - #[structopt(name = "tables", version = version::garage())] + #[structopt(name = "tables", version = garage_version())] Tables, /// Only repair (resync/rebalance) the set of stored blocks - #[structopt(name = "blocks", version = version::garage())] + #[structopt(name = "blocks", version = garage_version())] Blocks, /// Only redo the propagation of object deletions to the version table (slow) - #[structopt(name = "versions", version = version::garage())] + #[structopt(name = "versions", version = garage_version())] Versions, /// Only redo the propagation of version deletions to the block ref table (extremely slow) - #[structopt(name = "block_refs", version = version::garage())] + #[structopt(name = "block_refs", version = garage_version())] BlockRefs, /// Verify integrity of all blocks on disc (extremely slow, i/o intensive) - #[structopt(name = "scrub", version = version::garage())] + #[structopt(name = "scrub", version = garage_version())] Scrub { #[structopt(subcommand)] cmd: ScrubCmd, @@ -436,19 +436,19 @@ pub enum RepairWhat { #[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] pub enum ScrubCmd { /// Start scrub - #[structopt(name = "start", version = version::garage())] + #[structopt(name = "start", version = garage_version())] Start, /// Pause scrub (it will resume automatically after 24 hours) - #[structopt(name = "pause", version = version::garage())] + #[structopt(name = "pause", version = garage_version())] Pause, /// Resume paused scrub - #[structopt(name = "resume", version = version::garage())] + #[structopt(name = "resume", version = garage_version())] Resume, /// Cancel scrub in progress - #[structopt(name = "cancel", version = version::garage())] + #[structopt(name = "cancel", version = garage_version())] Cancel, /// Set tranquility level for in-progress and future scrubs - #[structopt(name = "set-tranquility", version = version::garage())] + #[structopt(name = "set-tranquility", version = garage_version())] SetTranquility { #[structopt()] tranquility: u32, @@ -469,10 +469,10 @@ pub struct OfflineRepairOpt { pub enum OfflineRepairWhat { /// Repair K2V item counters #[cfg(feature = "k2v")] - #[structopt(name = "k2v_item_counters", version = version::garage())] + #[structopt(name = "k2v_item_counters", version = garage_version())] K2VItemCounters, /// Repair object counters - #[structopt(name = "object_counters", version = version::garage())] + #[structopt(name = "object_counters", version = garage_version())] ObjectCounters, } @@ -496,7 +496,7 @@ pub struct WorkerOpt { #[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] pub enum WorkerCmd { /// List all workers on Garage node - #[structopt(name = "list", version = version::garage())] + #[structopt(name = "list", version = garage_version())] List { #[structopt(flatten)] 
opt: WorkerListOpt, diff --git a/src/garage/main.rs b/src/garage/main.rs index 8f0b377e..94c9bf61 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -23,15 +23,15 @@ use garage_util::error::*; use garage_rpc::system::*; use garage_rpc::*; -use garage_util::version; use garage_model::helper::error::Error as HelperError; +use garage_model::version::garage_version; use admin::*; use cli::*; #[derive(StructOpt, Debug)] -#[structopt(name = "garage", version = version::garage(), about = "S3-compatible object store for self-hosted geo-distributed deployments")] +#[structopt(name = "garage", version = garage_version(), about = "S3-compatible object store for self-hosted geo-distributed deployments")] struct Opt { /// Host to connect to for admin operations, in the format: /// @: diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index bbcfe89c..c41d3f16 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -24,6 +24,7 @@ async-trait = "0.1.7" arc-swap = "1.0" blake2 = "0.9" err-derive = "0.3" +git-version = "0.3.4" hex = "0.4" base64 = "0.13" tracing = "0.1.30" diff --git a/src/model/lib.rs b/src/model/lib.rs index 4f20ea46..43db01c5 100644 --- a/src/model/lib.rs +++ b/src/model/lib.rs @@ -19,3 +19,4 @@ pub mod s3; pub mod garage; pub mod helper; pub mod migrate; +pub mod version; diff --git a/src/model/version.rs b/src/model/version.rs new file mode 100644 index 00000000..cdb3ea62 --- /dev/null +++ b/src/model/version.rs @@ -0,0 +1,7 @@ +pub fn garage_version() -> &'static str { + option_env!("GIT_VERSION").unwrap_or(git_version::git_version!( + prefix = "git:", + cargo_prefix = "cargo:", + fallback = "unknown" + )) +} diff --git a/src/rpc/system.rs b/src/rpc/system.rs index fbfbbf56..d621f59f 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -27,7 +27,6 @@ use garage_util::data::*; use garage_util::error::*; use garage_util::persister::Persister; use garage_util::time::*; -use garage_util::version; use crate::consul::*; #[cfg(feature = "kubernetes-discovery")] @@ -40,8 +39,10 @@ const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60); const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10); const PING_TIMEOUT: Duration = Duration::from_secs(2); -/// Version tag used for version check upon Netapp connection -pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650007; // garage 0x0007 +/// Version tag used for version check upon Netapp connection. +/// Cluster nodes with different version tags are deemed +/// incompatible and will refuse to connect. 
+pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650008; // garage 0x0008 /// RPC endpoint used for calls related to membership pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc"; @@ -320,10 +321,6 @@ impl System { // ---- Administrative operations (directly available and // also available through RPC) ---- - pub fn garage_version(&self) -> &'static str { - version::garage() - } - pub fn get_known_nodes(&self) -> Vec { let node_status = self.node_status.read().unwrap(); let known_nodes = self diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index 5f3e5c57..af57008e 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -24,7 +24,6 @@ hex = "0.4" tracing = "0.1.30" rand = "0.8" sha2 = "0.9" -git-version = "0.3.4" chrono = "0.4" rmp-serde = "0.15" diff --git a/src/util/lib.rs b/src/util/lib.rs index 47c85c3a..fce151af 100644 --- a/src/util/lib.rs +++ b/src/util/lib.rs @@ -14,4 +14,3 @@ pub mod persister; pub mod time; pub mod token_bucket; pub mod tranquilizer; -pub mod version; diff --git a/src/util/version.rs b/src/util/version.rs deleted file mode 100644 index 8882d035..00000000 --- a/src/util/version.rs +++ /dev/null @@ -1,7 +0,0 @@ -pub fn garage() -> &'static str { - option_env!("GIT_VERSION").unwrap_or(git_version::git_version!( - prefix = "git:", - cargo_prefix = "cargo:", - fallback = "unknown" - )) -} -- cgit v1.2.3 From 28d86e76021bed674ca78684b9522cfb664a8ae2 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 7 Sep 2022 17:05:21 +0200 Subject: Report build features in garage --help --- src/api/admin/cluster.rs | 2 ++ src/garage/admin.rs | 5 ++++- src/garage/main.rs | 36 +++++++++++++++++++++++++++++++++--- src/model/Cargo.toml | 1 + src/model/version.rs | 16 ++++++++++++++++ 5 files changed, 56 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 8e6dfb3f..010382f2 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -19,6 +19,7 @@ pub async fn handle_get_cluster_status(garage: &Arc) -> Result) -> GetClusterLayoutResponse { struct GetClusterStatusResponse { node: String, garage_version: &'static str, + garage_features: Option<&'static [&'static str]>, db_engine: String, known_nodes: HashMap, layout: GetClusterLayoutResponse, diff --git a/src/garage/admin.rs b/src/garage/admin.rs index f4c182fe..8854a58d 100644 --- a/src/garage/admin.rs +++ b/src/garage/admin.rs @@ -739,8 +739,11 @@ impl AdminRpcHandler { let mut ret = String::new(); writeln!( &mut ret, - "\nGarage version: {}", + "\nGarage version: {} [features: {}]", garage_model::version::garage_version(), + garage_model::version::garage_features() + .map(|list| list.join(", ")) + .unwrap_or_else(|| "(unknown)".into()), ) .unwrap(); writeln!(&mut ret, "\nDatabase engine: {}", self.garage.db.engine()).unwrap(); diff --git a/src/garage/main.rs b/src/garage/main.rs index 94c9bf61..7d00811a 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -25,13 +25,15 @@ use garage_rpc::system::*; use garage_rpc::*; use garage_model::helper::error::Error as HelperError; -use garage_model::version::garage_version; use admin::*; use cli::*; #[derive(StructOpt, Debug)] -#[structopt(name = "garage", version = garage_version(), about = "S3-compatible object store for self-hosted geo-distributed deployments")] +#[structopt( + name = "garage", + about = "S3-compatible object store for self-hosted geo-distributed deployments" +)] struct Opt { /// Host to connect to for admin operations, in the format: /// @: @@ -69,7 +71,35 
@@ async fn main() { std::process::abort(); })); - let opt = Opt::from_args(); + // Parse opt + let features = &[ + #[cfg(feature = "k2v")] + "k2v", + #[cfg(feature = "sled")] + "sled", + #[cfg(feature = "lmdb")] + "lmdb", + #[cfg(feature = "sqlite")] + "sqlite", + #[cfg(feature = "kubernetes-discovery")] + "kubernetes-discovery", + #[cfg(feature = "metrics")] + "metrics", + #[cfg(feature = "telemetry-otlp")] + "telemetry-otlp", + #[cfg(feature = "bundled-libs")] + "bundled-libs", + #[cfg(feature = "system-libs")] + "system-libs", + ][..]; + let version = format!( + "{} [features: {}]", + garage_model::version::garage_version(), + features.join(", ") + ); + garage_model::version::init_features(features); + let opt = Opt::from_clap(&Opt::clap().version(version.as_str()).get_matches()); + let res = match opt.cmd { Command::Server => server::run_server(opt.config_file).await, Command::OfflineRepair(repair_opt) => { diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index c41d3f16..101c97d3 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -26,6 +26,7 @@ blake2 = "0.9" err-derive = "0.3" git-version = "0.3.4" hex = "0.4" +lazy_static = "1.4" base64 = "0.13" tracing = "0.1.30" rand = "0.8" diff --git a/src/model/version.rs b/src/model/version.rs index cdb3ea62..af6aa809 100644 --- a/src/model/version.rs +++ b/src/model/version.rs @@ -1,3 +1,11 @@ +use std::sync::Arc; + +use arc_swap::ArcSwapOption; + +lazy_static::lazy_static! { + static ref FEATURES: ArcSwapOption<&'static [&'static str]> = ArcSwapOption::new(None); +} + pub fn garage_version() -> &'static str { option_env!("GIT_VERSION").unwrap_or(git_version::git_version!( prefix = "git:", @@ -5,3 +13,11 @@ pub fn garage_version() -> &'static str { fallback = "unknown" )) } + +pub fn garage_features() -> Option<&'static [&'static str]> { + FEATURES.load().as_ref().map(|f| &f[..]) +} + +pub fn init_features(features: &'static [&'static str]) { + FEATURES.store(Some(Arc::new(features))); +} -- cgit v1.2.3 From 2559f63e9bb58a66da70f33e852ebbd5f909876e Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 7 Sep 2022 17:54:16 +0200 Subject: Make all HTTP services optionnal --- src/api/admin/api_server.rs | 19 +- src/api/k2v/api_server.rs | 14 +- src/api/s3/api_server.rs | 14 +- src/garage/server.rs | 112 +++++++----- src/util/config.rs | 6 +- src/web/lib.rs | 2 +- src/web/web_server.rs | 409 +++++++++++++++++++++++--------------------- 7 files changed, 301 insertions(+), 275 deletions(-) (limited to 'src') diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs index d871d4e2..fb0078cc 100644 --- a/src/api/admin/api_server.rs +++ b/src/api/admin/api_server.rs @@ -1,3 +1,4 @@ +use std::net::SocketAddr; use std::sync::Arc; use async_trait::async_trait; @@ -52,15 +53,15 @@ impl AdminApiServer { } } - pub async fn run(self, shutdown_signal: impl Future) -> Result<(), GarageError> { - if let Some(bind_addr) = self.garage.config.admin.api_bind_addr { - let region = self.garage.config.s3_api.s3_region.clone(); - ApiServer::new(region, self) - .run_server(bind_addr, shutdown_signal) - .await - } else { - Ok(()) - } + pub async fn run( + self, + bind_addr: SocketAddr, + shutdown_signal: impl Future, + ) -> Result<(), GarageError> { + let region = self.garage.config.s3_api.s3_region.clone(); + ApiServer::new(region, self) + .run_server(bind_addr, shutdown_signal) + .await } fn handle_options(&self, _req: &Request) -> Result, Error> { diff --git a/src/api/k2v/api_server.rs b/src/api/k2v/api_server.rs index 
eb0fbdd7..084867b5 100644 --- a/src/api/k2v/api_server.rs +++ b/src/api/k2v/api_server.rs @@ -1,3 +1,4 @@ +use std::net::SocketAddr; use std::sync::Arc; use async_trait::async_trait; @@ -36,20 +37,13 @@ pub(crate) struct K2VApiEndpoint { impl K2VApiServer { pub async fn run( garage: Arc, + bind_addr: SocketAddr, + s3_region: String, shutdown_signal: impl Future, ) -> Result<(), GarageError> { - if let Some(cfg) = &garage.config.k2v_api { - let bind_addr = cfg.api_bind_addr; - - ApiServer::new( - garage.config.s3_api.s3_region.clone(), - K2VApiServer { garage }, - ) + ApiServer::new(s3_region, K2VApiServer { garage }) .run_server(bind_addr, shutdown_signal) .await - } else { - Ok(()) - } } } diff --git a/src/api/s3/api_server.rs b/src/api/s3/api_server.rs index 78dfeeac..27837297 100644 --- a/src/api/s3/api_server.rs +++ b/src/api/s3/api_server.rs @@ -1,3 +1,4 @@ +use std::net::SocketAddr; use std::sync::Arc; use async_trait::async_trait; @@ -43,16 +44,13 @@ pub(crate) struct S3ApiEndpoint { impl S3ApiServer { pub async fn run( garage: Arc, + addr: SocketAddr, + s3_region: String, shutdown_signal: impl Future, ) -> Result<(), GarageError> { - let addr = garage.config.s3_api.api_bind_addr; - - ApiServer::new( - garage.config.s3_api.s3_region.clone(), - S3ApiServer { garage }, - ) - .run_server(addr, shutdown_signal) - .await + ApiServer::new(s3_region, S3ApiServer { garage }) + .run_server(addr, shutdown_signal) + .await } async fn handle_request_without_bucket( diff --git a/src/garage/server.rs b/src/garage/server.rs index 0851738d..fb6d2279 100644 --- a/src/garage/server.rs +++ b/src/garage/server.rs @@ -9,7 +9,7 @@ use garage_util::error::Error; use garage_api::admin::api_server::AdminApiServer; use garage_api::s3::api_server::S3ApiServer; use garage_model::garage::Garage; -use garage_web::run_web_server; +use garage_web::WebServer; #[cfg(feature = "k2v")] use garage_api::k2v::api_server::K2VApiServer; @@ -30,6 +30,8 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { info!("Loading configuration..."); let config = read_config(config_file)?; + // ---- Initialize Garage internals ---- + info!("Initializing background runner..."); let watch_cancel = netapp::util::watch_ctrl_c(); let (background, await_background_done) = BackgroundRunner::new(16, watch_cancel.clone()); @@ -44,7 +46,7 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { init_tracing(config.admin.trace_sink.as_ref().unwrap(), garage.system.id)?; #[cfg(not(feature = "telemetry-otlp"))] - warn!("Garage was built without OTLP exporter, admin.trace_sink is ignored."); + error!("Garage was built without OTLP exporter, admin.trace_sink is ignored."); } info!("Initialize Admin API server and metrics collector..."); @@ -56,53 +58,73 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { info!("Create admin RPC handler..."); AdminRpcHandler::new(garage.clone()); - info!("Initializing S3 API server..."); - let s3_api_server = tokio::spawn(S3ApiServer::run( - garage.clone(), - wait_from(watch_cancel.clone()), - )); - - #[cfg(feature = "k2v")] - let k2v_api_server = { - info!("Initializing K2V API server..."); - tokio::spawn(K2VApiServer::run( - garage.clone(), - wait_from(watch_cancel.clone()), - )) - }; - - info!("Initializing web server..."); - let web_server = tokio::spawn(run_web_server( - garage.clone(), - wait_from(watch_cancel.clone()), - )); - - info!("Launching Admin API server..."); - let admin_server = tokio::spawn(admin_server.run(wait_from(watch_cancel.clone()))); - - 
// Stuff runs + // ---- Launch public-facing API servers ---- + + let mut servers = vec![]; + + if let Some(s3_bind_addr) = &config.s3_api.api_bind_addr { + info!("Initializing S3 API server..."); + servers.push(( + "S3 API", + tokio::spawn(S3ApiServer::run( + garage.clone(), + *s3_bind_addr, + config.s3_api.s3_region.clone(), + wait_from(watch_cancel.clone()), + )), + )); + } - // When a cancel signal is sent, stuff stops - if let Err(e) = s3_api_server.await? { - warn!("S3 API server exited with error: {}", e); - } else { - info!("S3 API server exited without error."); + if config.k2v_api.is_some() { + #[cfg(feature = "k2v")] + { + info!("Initializing K2V API server..."); + servers.push(( + "K2V API", + tokio::spawn(K2VApiServer::run( + garage.clone(), + config.k2v_api.as_ref().unwrap().api_bind_addr, + config.s3_api.s3_region.clone(), + wait_from(watch_cancel.clone()), + )), + )); + } + #[cfg(not(feature = "k2v"))] + error!("K2V is not enabled in this build, cannot start K2V API server"); } - #[cfg(feature = "k2v")] - if let Err(e) = k2v_api_server.await? { - warn!("K2V API server exited with error: {}", e); - } else { - info!("K2V API server exited without error."); + + if let Some(web_config) = &config.s3_web { + info!("Initializing web server..."); + servers.push(( + "Web", + tokio::spawn(WebServer::run( + garage.clone(), + web_config.bind_addr, + web_config.root_domain.clone(), + wait_from(watch_cancel.clone()), + )), + )); } - if let Err(e) = web_server.await? { - warn!("Web server exited with error: {}", e); - } else { - info!("Web server exited without error."); + + if let Some(admin_bind_addr) = &config.admin.api_bind_addr { + info!("Launching Admin API server..."); + servers.push(( + "Admin", + tokio::spawn(admin_server.run(*admin_bind_addr, wait_from(watch_cancel.clone()))), + )); } - if let Err(e) = admin_server.await? { - warn!("Admin web server exited with error: {}", e); - } else { - info!("Admin API server exited without error."); + + // Stuff runs + + // When a cancel signal is sent, stuff stops + + // Collect stuff + for (desc, join_handle) in servers { + if let Err(e) = join_handle.await? { + error!("{} server exited with error: {}", desc, e); + } else { + info!("{} server exited without error.", desc); + } } // Remove RPC handlers for system to break reference cycles diff --git a/src/util/config.rs b/src/util/config.rs index e8ef4fdd..46c5cb9d 100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -81,11 +81,10 @@ pub struct Config { pub s3_api: S3ApiConfig, /// Configuration for K2V api - #[cfg(feature = "k2v")] pub k2v_api: Option, /// Configuration for serving files as normal web server - pub s3_web: WebConfig, + pub s3_web: Option, /// Configuration for the admin API endpoint #[serde(default = "Default::default")] @@ -96,7 +95,7 @@ pub struct Config { #[derive(Deserialize, Debug, Clone)] pub struct S3ApiConfig { /// Address and port to bind for api serving - pub api_bind_addr: SocketAddr, + pub api_bind_addr: Option, /// S3 region to use pub s3_region: String, /// Suffix to remove from domain name to find bucket. 
If None, @@ -105,7 +104,6 @@ pub struct S3ApiConfig { } /// Configuration for K2V api -#[cfg(feature = "k2v")] #[derive(Deserialize, Debug, Clone)] pub struct K2VApiConfig { /// Address and port to bind for api serving diff --git a/src/web/lib.rs b/src/web/lib.rs index 9b7c8573..7207c365 100644 --- a/src/web/lib.rs +++ b/src/web/lib.rs @@ -6,4 +6,4 @@ mod error; pub use error::Error; mod web_server; -pub use web_server::run_web_server; +pub use web_server::WebServer; diff --git a/src/web/web_server.rs b/src/web/web_server.rs index c30d8957..c2322073 100644 --- a/src/web/web_server.rs +++ b/src/web/web_server.rs @@ -57,90 +57,226 @@ impl WebMetrics { } } -/// Run a web server -pub async fn run_web_server( +pub struct WebServer { garage: Arc, - shutdown_signal: impl Future, -) -> Result<(), GarageError> { - let addr = &garage.config.s3_web.bind_addr; + metrics: Arc, + root_domain: String, +} - let metrics = Arc::new(WebMetrics::new()); +impl WebServer { + /// Run a web server + pub async fn run( + garage: Arc, + addr: SocketAddr, + root_domain: String, + shutdown_signal: impl Future, + ) -> Result<(), GarageError> { + let metrics = Arc::new(WebMetrics::new()); + let web_server = Arc::new(WebServer { + garage, + metrics, + root_domain, + }); + + let service = make_service_fn(|conn: &AddrStream| { + let web_server = web_server.clone(); + + let client_addr = conn.remote_addr(); + async move { + Ok::<_, Error>(service_fn(move |req: Request| { + let web_server = web_server.clone(); + + web_server.handle_request(req, client_addr) + })) + } + }); - let service = make_service_fn(|conn: &AddrStream| { - let garage = garage.clone(); - let metrics = metrics.clone(); + let server = Server::bind(&addr).serve(service); + let graceful = server.with_graceful_shutdown(shutdown_signal); + info!("Web server listening on http://{}", addr); - let client_addr = conn.remote_addr(); - async move { - Ok::<_, Error>(service_fn(move |req: Request| { - let garage = garage.clone(); - let metrics = metrics.clone(); + graceful.await?; + Ok(()) + } - handle_request(garage, metrics, req, client_addr) - })) + async fn handle_request( + self: Arc, + req: Request, + addr: SocketAddr, + ) -> Result, Infallible> { + info!("{} {} {}", addr, req.method(), req.uri()); + + // Lots of instrumentation + let tracer = opentelemetry::global::tracer("garage"); + let span = tracer + .span_builder(format!("Web {} request", req.method())) + .with_trace_id(gen_trace_id()) + .with_attributes(vec![ + KeyValue::new("method", format!("{}", req.method())), + KeyValue::new("uri", req.uri().to_string()), + ]) + .start(&tracer); + + let metrics_tags = &[KeyValue::new("method", req.method().to_string())]; + + // The actual handler + let res = self + .serve_file(&req) + .with_context(Context::current_with_span(span)) + .record_duration(&self.metrics.request_duration, &metrics_tags[..]) + .await; + + // More instrumentation + self.metrics.request_counter.add(1, &metrics_tags[..]); + + // Returning the result + match res { + Ok(res) => { + debug!("{} {} {}", req.method(), res.status(), req.uri()); + Ok(res) + } + Err(error) => { + info!( + "{} {} {} {}", + req.method(), + error.http_status_code(), + req.uri(), + error + ); + self.metrics.error_counter.add( + 1, + &[ + metrics_tags[0].clone(), + KeyValue::new("status_code", error.http_status_code().to_string()), + ], + ); + Ok(error_to_res(error)) + } } - }); + } - let server = Server::bind(addr).serve(service); - let graceful = server.with_graceful_shutdown(shutdown_signal); - info!("Web server listening 
on http://{}", addr); + async fn serve_file(self: &Arc, req: &Request) -> Result, Error> { + // Get http authority string (eg. [::1]:3902 or garage.tld:80) + let authority = req + .headers() + .get(HOST) + .ok_or_bad_request("HOST header required")? + .to_str()?; + + // Get bucket + let host = authority_to_host(authority)?; + + let bucket_name = host_to_bucket(&host, &self.root_domain).unwrap_or(&host); + let bucket_id = self + .garage + .bucket_alias_table + .get(&EmptyKey, &bucket_name.to_string()) + .await? + .and_then(|x| x.state.take()) + .ok_or(Error::NotFound)?; + + // Check bucket isn't deleted and has website access enabled + let bucket = self + .garage + .bucket_table + .get(&EmptyKey, &bucket_id) + .await? + .ok_or(Error::NotFound)?; + + let website_config = bucket + .params() + .ok_or(Error::NotFound)? + .website_config + .get() + .as_ref() + .ok_or(Error::NotFound)?; + + // Get path + let path = req.uri().path().to_string(); + let index = &website_config.index_document; + let key = path_to_key(&path, index)?; + + debug!( + "Selected bucket: \"{}\" {:?}, selected key: \"{}\"", + bucket_name, bucket_id, key + ); + + let ret_doc = match *req.method() { + Method::OPTIONS => handle_options_for_bucket(req, &bucket), + Method::HEAD => handle_head(self.garage.clone(), req, bucket_id, &key, None).await, + Method::GET => handle_get(self.garage.clone(), req, bucket_id, &key, None).await, + _ => Err(ApiError::bad_request("HTTP method not supported")), + } + .map_err(Error::from); + + match ret_doc { + Err(error) => { + // For a HEAD or OPTIONS method, and for non-4xx errors, + // we don't return the error document as content, + // we return above and just return the error message + // by relying on err_to_res that is called when we return an Err. + if *req.method() == Method::HEAD + || *req.method() == Method::OPTIONS + || !error.http_status_code().is_client_error() + { + return Err(error); + } - graceful.await?; - Ok(()) -} + // If no error document is set: just return the error directly + let error_document = match &website_config.error_document { + Some(ed) => ed.trim_start_matches('/').to_owned(), + None => return Err(error), + }; + + // We want to return the error document + // Create a fake HTTP request with path = the error document + let req2 = Request::builder() + .uri(format!("http://{}/{}", host, &error_document)) + .body(Body::empty()) + .unwrap(); + + match handle_get(self.garage.clone(), &req2, bucket_id, &error_document, None).await + { + Ok(mut error_doc) => { + // The error won't be logged back in handle_request, + // so log it here + info!( + "{} {} {} {}", + req.method(), + req.uri(), + error.http_status_code(), + error + ); + + *error_doc.status_mut() = error.http_status_code(); + error.add_headers(error_doc.headers_mut()); + + // Preserve error message in a special header + for error_line in error.to_string().split('\n') { + if let Ok(v) = HeaderValue::from_bytes(error_line.as_bytes()) { + error_doc.headers_mut().append("X-Garage-Error", v); + } + } -async fn handle_request( - garage: Arc, - metrics: Arc, - req: Request, - addr: SocketAddr, -) -> Result, Infallible> { - info!("{} {} {}", addr, req.method(), req.uri()); - - // Lots of instrumentation - let tracer = opentelemetry::global::tracer("garage"); - let span = tracer - .span_builder(format!("Web {} request", req.method())) - .with_trace_id(gen_trace_id()) - .with_attributes(vec![ - KeyValue::new("method", format!("{}", req.method())), - KeyValue::new("uri", req.uri().to_string()), - ]) - .start(&tracer); - - 
let metrics_tags = &[KeyValue::new("method", req.method().to_string())]; - - // The actual handler - let res = serve_file(garage, &req) - .with_context(Context::current_with_span(span)) - .record_duration(&metrics.request_duration, &metrics_tags[..]) - .await; - - // More instrumentation - metrics.request_counter.add(1, &metrics_tags[..]); - - // Returning the result - match res { - Ok(res) => { - debug!("{} {} {}", req.method(), res.status(), req.uri()); - Ok(res) - } - Err(error) => { - info!( - "{} {} {} {}", - req.method(), - error.http_status_code(), - req.uri(), - error - ); - metrics.error_counter.add( - 1, - &[ - metrics_tags[0].clone(), - KeyValue::new("status_code", error.http_status_code().to_string()), - ], - ); - Ok(error_to_res(error)) + Ok(error_doc) + } + Err(error_doc_error) => { + warn!( + "Couldn't get error document {} for bucket {:?}: {}", + error_document, bucket_id, error_doc_error + ); + Err(error) + } + } + } + Ok(mut resp) => { + // Maybe add CORS headers + if let Some(rule) = find_matching_cors_rule(&bucket, req)? { + add_cors_headers(&mut resp, rule) + .ok_or_internal_error("Invalid bucket CORS configuration")?; + } + Ok(resp) + } } } } @@ -160,129 +296,6 @@ fn error_to_res(e: Error) -> Response { http_error } -async fn serve_file(garage: Arc, req: &Request) -> Result, Error> { - // Get http authority string (eg. [::1]:3902 or garage.tld:80) - let authority = req - .headers() - .get(HOST) - .ok_or_bad_request("HOST header required")? - .to_str()?; - - // Get bucket - let host = authority_to_host(authority)?; - let root = &garage.config.s3_web.root_domain; - - let bucket_name = host_to_bucket(&host, root).unwrap_or(&host); - let bucket_id = garage - .bucket_alias_table - .get(&EmptyKey, &bucket_name.to_string()) - .await? - .and_then(|x| x.state.take()) - .ok_or(Error::NotFound)?; - - // Check bucket isn't deleted and has website access enabled - let bucket = garage - .bucket_table - .get(&EmptyKey, &bucket_id) - .await? - .ok_or(Error::NotFound)?; - - let website_config = bucket - .params() - .ok_or(Error::NotFound)? - .website_config - .get() - .as_ref() - .ok_or(Error::NotFound)?; - - // Get path - let path = req.uri().path().to_string(); - let index = &website_config.index_document; - let key = path_to_key(&path, index)?; - - debug!( - "Selected bucket: \"{}\" {:?}, selected key: \"{}\"", - bucket_name, bucket_id, key - ); - - let ret_doc = match *req.method() { - Method::OPTIONS => handle_options_for_bucket(req, &bucket), - Method::HEAD => handle_head(garage.clone(), req, bucket_id, &key, None).await, - Method::GET => handle_get(garage.clone(), req, bucket_id, &key, None).await, - _ => Err(ApiError::bad_request("HTTP method not supported")), - } - .map_err(Error::from); - - match ret_doc { - Err(error) => { - // For a HEAD or OPTIONS method, and for non-4xx errors, - // we don't return the error document as content, - // we return above and just return the error message - // by relying on err_to_res that is called when we return an Err. 
- if *req.method() == Method::HEAD
- || *req.method() == Method::OPTIONS
- || !error.http_status_code().is_client_error()
- {
- return Err(error);
- }
-
- // If no error document is set: just return the error directly
- let error_document = match &website_config.error_document {
- Some(ed) => ed.trim_start_matches('/').to_owned(),
- None => return Err(error),
- };
-
- // We want to return the error document
- // Create a fake HTTP request with path = the error document
- let req2 = Request::builder()
- .uri(format!("http://{}/{}", host, &error_document))
- .body(Body::empty())
- .unwrap();
-
- match handle_get(garage, &req2, bucket_id, &error_document, None).await {
- Ok(mut error_doc) => {
- // The error won't be logged back in handle_request,
- // so log it here
- info!(
- "{} {} {} {}",
- req.method(),
- req.uri(),
- error.http_status_code(),
- error
- );
-
- *error_doc.status_mut() = error.http_status_code();
- error.add_headers(error_doc.headers_mut());
-
- // Preserve error message in a special header
- for error_line in error.to_string().split('\n') {
- if let Ok(v) = HeaderValue::from_bytes(error_line.as_bytes()) {
- error_doc.headers_mut().append("X-Garage-Error", v);
- }
- }
-
- Ok(error_doc)
- }
- Err(error_doc_error) => {
- warn!(
- "Couldn't get error document {} for bucket {:?}: {}",
- error_document, bucket_id, error_doc_error
- );
- Err(error)
- }
- }
- }
- Ok(mut resp) => {
- // Maybe add CORS headers
- if let Some(rule) = find_matching_cors_rule(&bucket, req)? {
- add_cors_headers(&mut resp, rule)
- .ok_or_internal_error("Invalid bucket CORS configuration")?;
- }
- Ok(resp)
- }
- }
-}
-
/// Path to key
///
/// Convert the provided path to the internal key
-- cgit v1.2.3

From 2e00809af58c142c86fae5f4bad85c4ef5e57872 Mon Sep 17 00:00:00 2001
From: Alex Auvolat
Date: Wed, 7 Sep 2022 17:57:12 +0200
Subject: Error messages when system-libs XOR bundled-libs != 1

---
src/garage/main.rs | 6 ++++++
1 file changed, 6 insertions(+)
(limited to 'src')

diff --git a/src/garage/main.rs b/src/garage/main.rs
index 7d00811a..751dd941 100644
--- a/src/garage/main.rs
+++ b/src/garage/main.rs
@@ -11,6 +11,12 @@ mod server;
#[cfg(feature = "telemetry-otlp")]
mod tracing_setup;
+#[cfg(not(any(feature = "bundled-libs", feature = "system-libs")))]
+compile_error!("Either bundled-libs or system-libs Cargo feature must be enabled");
+
+#[cfg(all(feature = "bundled-libs", feature = "system-libs"))]
+compile_error!("Only one of bundled-libs and system-libs Cargo features must be enabled");
+
use std::net::SocketAddr;
use std::path::PathBuf;
-- cgit v1.2.3

From 14492044394a875475b2159d51234ac1e35531bf Mon Sep 17 00:00:00 2001
From: Alex Auvolat
Date: Wed, 7 Sep 2022 18:02:13 +0200
Subject: Add warnings when features are not included in build

---
src/db/lib.rs | 3 +++
src/garage/server.rs | 5 +++++
2 files changed, 8 insertions(+)
(limited to 'src')

diff --git a/src/db/lib.rs b/src/db/lib.rs
index 5304c195..d96586be 100644
--- a/src/db/lib.rs
+++ b/src/db/lib.rs
@@ -2,6 +2,9 @@
#[cfg(feature = "sqlite")]
extern crate tracing;
+#[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))]
+compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite.");
+
#[cfg(feature = "lmdb")]
pub mod lmdb_adapter;
#[cfg(feature = "sled")]
diff --git a/src/garage/server.rs b/src/garage/server.rs
index fb6d2279..3c96ff22 100644
--- a/src/garage/server.rs
+++ b/src/garage/server.rs
@@ -114,6 +114,11 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
)); } + #[cfg(not(feature = "metrics"))] + if config.admin_api.metrics_token.is_some() { + warn!("This Garage version is built without the metrics feature"); + } + // Stuff runs // When a cancel signal is sent, stuff stops -- cgit v1.2.3 From 107853334bd045e145e3149c63172a9c0260b8db Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 7 Sep 2022 18:10:19 +0200 Subject: Fix build error --- src/garage/server.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/garage/server.rs b/src/garage/server.rs index 3c96ff22..aeef02a2 100644 --- a/src/garage/server.rs +++ b/src/garage/server.rs @@ -115,7 +115,7 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { } #[cfg(not(feature = "metrics"))] - if config.admin_api.metrics_token.is_some() { + if config.admin.metrics_token.is_some() { warn!("This Garage version is built without the metrics feature"); } -- cgit v1.2.3 From 06df301de5ab2068ee55c8663eebafb0d9a26978 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 7 Sep 2022 18:16:01 +0200 Subject: Fix merge --- src/garage/cli/structs.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 105e17f8..825fe859 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -502,7 +502,7 @@ pub enum WorkerCmd { opt: WorkerListOpt, }, /// Set worker parameter - #[structopt(name = "set", version = version::garage())] + #[structopt(name = "set", version = garage_version())] Set { #[structopt(subcommand)] opt: WorkerSetCmd, @@ -522,12 +522,12 @@ pub struct WorkerListOpt { #[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)] pub enum WorkerSetCmd { /// Set tranquility of scrub operations - #[structopt(name = "scrub-tranquility", version = version::garage())] + #[structopt(name = "scrub-tranquility", version = garage_version())] ScrubTranquility { tranquility: u32 }, /// Set number of concurrent block resync workers - #[structopt(name = "resync-n-workers", version = version::garage())] + #[structopt(name = "resync-n-workers", version = garage_version())] ResyncNWorkers { n_workers: usize }, /// Set tranquility of block resync operations - #[structopt(name = "resync-tranquility", version = version::garage())] + #[structopt(name = "resync-tranquility", version = garage_version())] ResyncTranquility { tranquility: u32 }, } -- cgit v1.2.3 From f310fce34b0273f9f75e7a6ea665f51003a1f795 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 7 Sep 2022 18:30:15 +0200 Subject: Inject GIT_VERSION even later --- src/garage/main.rs | 9 +++++++-- src/model/version.rs | 17 +++++++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) (limited to 'src') diff --git a/src/garage/main.rs b/src/garage/main.rs index 751dd941..1a4a939a 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -77,7 +77,7 @@ async fn main() { std::process::abort(); })); - // Parse opt + // Initialize version and features info let features = &[ #[cfg(feature = "k2v")] "k2v", @@ -98,12 +98,17 @@ async fn main() { #[cfg(feature = "system-libs")] "system-libs", ][..]; + if let Some(git_version) = option_env!("GIT_VERSION") { + garage_model::version::init_version(git_version); + } + garage_model::version::init_features(features); + + // Parse arguments let version = format!( "{} [features: {}]", garage_model::version::garage_version(), features.join(", ") ); - garage_model::version::init_features(features); let opt = 
Opt::from_clap(&Opt::clap().version(version.as_str()).get_matches()); let res = match opt.cmd { diff --git a/src/model/version.rs b/src/model/version.rs index af6aa809..b515dccc 100644 --- a/src/model/version.rs +++ b/src/model/version.rs @@ -1,23 +1,28 @@ use std::sync::Arc; -use arc_swap::ArcSwapOption; +use arc_swap::{ArcSwap, ArcSwapOption}; lazy_static::lazy_static! { + static ref VERSION: ArcSwap<&'static str> = ArcSwap::new(Arc::new(git_version::git_version!( + prefix = "git:", + cargo_prefix = "cargo:", + fallback = "unknown" + ))); static ref FEATURES: ArcSwapOption<&'static [&'static str]> = ArcSwapOption::new(None); } pub fn garage_version() -> &'static str { - option_env!("GIT_VERSION").unwrap_or(git_version::git_version!( - prefix = "git:", - cargo_prefix = "cargo:", - fallback = "unknown" - )) + &VERSION.load() } pub fn garage_features() -> Option<&'static [&'static str]> { FEATURES.load().as_ref().map(|f| &f[..]) } +pub fn init_version(version: &'static str) { + VERSION.store(Arc::new(version)); +} + pub fn init_features(features: &'static [&'static str]) { FEATURES.store(Some(Arc::new(features))); } -- cgit v1.2.3 From ceb1f0229a9c8b9f8255b4a4c70272627f0c34d7 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 7 Sep 2022 18:36:46 +0200 Subject: Move version back into util --- src/api/admin/cluster.rs | 4 ++-- src/garage/admin.rs | 4 ++-- src/garage/cli/structs.rs | 2 +- src/garage/main.rs | 6 +++--- src/model/Cargo.toml | 2 -- src/model/lib.rs | 1 - src/model/version.rs | 28 ---------------------------- src/util/Cargo.toml | 3 +++ src/util/lib.rs | 1 + src/util/version.rs | 28 ++++++++++++++++++++++++++++ 10 files changed, 40 insertions(+), 39 deletions(-) delete mode 100644 src/model/version.rs create mode 100644 src/util/version.rs (limited to 'src') diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs index 010382f2..99c6e332 100644 --- a/src/api/admin/cluster.rs +++ b/src/api/admin/cluster.rs @@ -18,8 +18,8 @@ use crate::helpers::{json_ok_response, parse_json_body}; pub async fn handle_get_cluster_status(garage: &Arc) -> Result, Error> { let res = GetClusterStatusResponse { node: hex::encode(garage.system.id), - garage_version: garage_model::version::garage_version(), - garage_features: garage_model::version::garage_features(), + garage_version: garage_util::version::garage_version(), + garage_features: garage_util::version::garage_features(), db_engine: garage.db.engine(), known_nodes: garage .system diff --git a/src/garage/admin.rs b/src/garage/admin.rs index 9c4ecd9d..b4d2d1a1 100644 --- a/src/garage/admin.rs +++ b/src/garage/admin.rs @@ -742,8 +742,8 @@ impl AdminRpcHandler { writeln!( &mut ret, "\nGarage version: {} [features: {}]", - garage_model::version::garage_version(), - garage_model::version::garage_features() + garage_util::version::garage_version(), + garage_util::version::garage_features() .map(|list| list.join(", ")) .unwrap_or_else(|| "(unknown)".into()), ) diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs index 825fe859..06548e89 100644 --- a/src/garage/cli/structs.rs +++ b/src/garage/cli/structs.rs @@ -1,7 +1,7 @@ use serde::{Deserialize, Serialize}; use structopt::StructOpt; -use garage_model::version::garage_version; +use garage_util::version::garage_version; #[derive(StructOpt, Debug)] pub enum Command { diff --git a/src/garage/main.rs b/src/garage/main.rs index 1a4a939a..77d5db24 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -99,14 +99,14 @@ async fn main() { "system-libs", ][..]; if let 
Some(git_version) = option_env!("GIT_VERSION") { - garage_model::version::init_version(git_version); + garage_util::version::init_version(git_version); } - garage_model::version::init_features(features); + garage_util::version::init_features(features); // Parse arguments let version = format!( "{} [features: {}]", - garage_model::version::garage_version(), + garage_util::version::garage_version(), features.join(", ") ); let opt = Opt::from_clap(&Opt::clap().version(version.as_str()).get_matches()); diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index 101c97d3..bbcfe89c 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -24,9 +24,7 @@ async-trait = "0.1.7" arc-swap = "1.0" blake2 = "0.9" err-derive = "0.3" -git-version = "0.3.4" hex = "0.4" -lazy_static = "1.4" base64 = "0.13" tracing = "0.1.30" rand = "0.8" diff --git a/src/model/lib.rs b/src/model/lib.rs index 43db01c5..4f20ea46 100644 --- a/src/model/lib.rs +++ b/src/model/lib.rs @@ -19,4 +19,3 @@ pub mod s3; pub mod garage; pub mod helper; pub mod migrate; -pub mod version; diff --git a/src/model/version.rs b/src/model/version.rs deleted file mode 100644 index b515dccc..00000000 --- a/src/model/version.rs +++ /dev/null @@ -1,28 +0,0 @@ -use std::sync::Arc; - -use arc_swap::{ArcSwap, ArcSwapOption}; - -lazy_static::lazy_static! { - static ref VERSION: ArcSwap<&'static str> = ArcSwap::new(Arc::new(git_version::git_version!( - prefix = "git:", - cargo_prefix = "cargo:", - fallback = "unknown" - ))); - static ref FEATURES: ArcSwapOption<&'static [&'static str]> = ArcSwapOption::new(None); -} - -pub fn garage_version() -> &'static str { - &VERSION.load() -} - -pub fn garage_features() -> Option<&'static [&'static str]> { - FEATURES.load().as_ref().map(|f| &f[..]) -} - -pub fn init_version(version: &'static str) { - VERSION.store(Arc::new(version)); -} - -pub fn init_features(features: &'static [&'static str]) { - FEATURES.store(Some(Arc::new(features))); -} diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index af57008e..163c1b77 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -16,11 +16,14 @@ path = "lib.rs" [dependencies] garage_db = { version = "0.8.0", path = "../db" } +arc-swap = "1.0" async-trait = "0.1" blake2 = "0.9" err-derive = "0.3" +git-version = "0.3.4" xxhash-rust = { version = "0.8", default-features = false, features = ["xxh3"] } hex = "0.4" +lazy_static = "1.4" tracing = "0.1.30" rand = "0.8" sha2 = "0.9" diff --git a/src/util/lib.rs b/src/util/lib.rs index fce151af..47c85c3a 100644 --- a/src/util/lib.rs +++ b/src/util/lib.rs @@ -14,3 +14,4 @@ pub mod persister; pub mod time; pub mod token_bucket; pub mod tranquilizer; +pub mod version; diff --git a/src/util/version.rs b/src/util/version.rs new file mode 100644 index 00000000..b515dccc --- /dev/null +++ b/src/util/version.rs @@ -0,0 +1,28 @@ +use std::sync::Arc; + +use arc_swap::{ArcSwap, ArcSwapOption}; + +lazy_static::lazy_static! 
{ + static ref VERSION: ArcSwap<&'static str> = ArcSwap::new(Arc::new(git_version::git_version!( + prefix = "git:", + cargo_prefix = "cargo:", + fallback = "unknown" + ))); + static ref FEATURES: ArcSwapOption<&'static [&'static str]> = ArcSwapOption::new(None); +} + +pub fn garage_version() -> &'static str { + &VERSION.load() +} + +pub fn garage_features() -> Option<&'static [&'static str]> { + FEATURES.load().as_ref().map(|f| &f[..]) +} + +pub fn init_version(version: &'static str) { + VERSION.store(Arc::new(version)); +} + +pub fn init_features(features: &'static [&'static str]) { + FEATURES.store(Some(Arc::new(features))); +} -- cgit v1.2.3 From f91fab8582728f176f446a4a2e039d22f752167b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 12 Sep 2022 16:23:43 +0200 Subject: Simplify+improve async hasher by using bounded channel --- src/util/async_hash.rs | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'src') diff --git a/src/util/async_hash.rs b/src/util/async_hash.rs index fa8ee7ff..5631ea6b 100644 --- a/src/util/async_hash.rs +++ b/src/util/async_hash.rs @@ -1,7 +1,7 @@ use bytes::Bytes; use digest::Digest; -use tokio::sync::{mpsc, oneshot}; +use tokio::sync::mpsc; use tokio::task::JoinHandle; use crate::data::*; @@ -27,18 +27,17 @@ pub async fn async_blake2sum(data: Bytes) -> Hash { // ---- pub struct AsyncHasher { - sendblk: mpsc::UnboundedSender<(Bytes, oneshot::Sender<()>)>, + sendblk: mpsc::Sender, task: JoinHandle>, } impl AsyncHasher { pub fn new() -> Self { - let (sendblk, mut recvblk) = mpsc::unbounded_channel::<(Bytes, oneshot::Sender<()>)>(); + let (sendblk, mut recvblk) = mpsc::channel::(1); let task = tokio::task::spawn_blocking(move || { let mut digest = D::new(); - while let Some((blk, ch)) = recvblk.blocking_recv() { + while let Some(blk) = recvblk.blocking_recv() { digest.update(&blk[..]); - let _ = ch.send(()); } digest.finalize() }); @@ -46,9 +45,7 @@ impl AsyncHasher { } pub async fn update(&self, b: Bytes) { - let (tx, rx) = oneshot::channel(); - self.sendblk.send((b, tx)).unwrap(); - let _ = rx.await; + self.sendblk.send(b).await.unwrap(); } pub async fn finalize(self) -> digest::Output { -- cgit v1.2.3 From b823151a0bba7ee6c5f0f96c6b06355572528d94 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 12 Sep 2022 16:57:38 +0200 Subject: improvements in block manager --- src/block/manager.rs | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index b9cd09e7..ec694fc8 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -11,7 +11,7 @@ use futures::Stream; use futures_util::stream::StreamExt; use tokio::fs; use tokio::io::{AsyncReadExt, AsyncWriteExt, BufReader}; -use tokio::sync::{mpsc, Mutex}; +use tokio::sync::{mpsc, Mutex, MutexGuard}; use opentelemetry::{ trace::{FutureExt as OtelFutureExt, TraceContextExt, Tracer}, @@ -261,7 +261,7 @@ impl BlockManager { > { let (header, stream) = self.rpc_get_raw_block_streaming(hash, order_tag).await?; match header { - DataBlockHeader::Plain => Ok(Box::pin(stream)), + DataBlockHeader::Plain => Ok(stream), DataBlockHeader::Compressed => { // Too many things, I hate it. 
let reader = stream_asyncread(stream); @@ -389,11 +389,7 @@ impl BlockManager { let write_size = data.inner_buffer().len() as u64; - self.mutation_lock[hash.as_slice()[0] as usize] - .lock() - .with_context(Context::current_with_span( - tracer.start("Acquire mutation_lock"), - )) + self.lock_mutate(hash) .await .write_block(hash, data, self) .bound_record_duration(&self.metrics.block_write_duration) @@ -470,8 +466,7 @@ impl BlockManager { if data.verify(*hash).is_err() { self.metrics.corruption_counter.add(1); - self.mutation_lock[hash.as_slice()[0] as usize] - .lock() + self.lock_mutate(hash) .await .move_block_to_corrupted(hash, self) .await?; @@ -484,8 +479,7 @@ impl BlockManager { /// Check if this node has a block and whether it needs it pub(crate) async fn check_block_status(&self, hash: &Hash) -> Result { - self.mutation_lock[hash.as_slice()[0] as usize] - .lock() + self.lock_mutate(hash) .await .check_block_status(hash, self) .await @@ -499,8 +493,7 @@ impl BlockManager { /// Delete block if it is not needed anymore pub(crate) async fn delete_if_unneeded(&self, hash: &Hash) -> Result<(), Error> { - self.mutation_lock[hash.as_slice()[0] as usize] - .lock() + self.lock_mutate(hash) .await .delete_if_unneeded(hash, self) .await @@ -532,6 +525,16 @@ impl BlockManager { path.set_extension(""); fs::metadata(&path).await.map(|_| false).map_err(Into::into) } + + async fn lock_mutate(&self, hash: &Hash) -> MutexGuard<'_, BlockManagerLocked> { + let tracer = opentelemetry::global::tracer("garage"); + self.mutation_lock[hash.as_slice()[0] as usize] + .lock() + .with_context(Context::current_with_span( + tracer.start("Acquire mutation_lock"), + )) + .await + } } #[async_trait] -- cgit v1.2.3 From 28a4af73ca8c0f82314157939fc98c46f338e84a Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 13 Sep 2022 13:11:44 +0200 Subject: Use netapp 0.5 published from crates.io --- src/garage/Cargo.toml | 3 +-- src/model/Cargo.toml | 3 +-- src/rpc/Cargo.toml | 3 +-- src/util/Cargo.toml | 3 +-- 4 files changed, 4 insertions(+), 8 deletions(-) (limited to 'src') diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index dcb3b78e..5ce40ff2 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -50,8 +50,7 @@ futures = "0.3" futures-util = "0.3" tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] } -#netapp = "0.4" -netapp = { version = "0.5", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } +netapp = "0.5" opentelemetry = { version = "0.17", features = [ "rt-tokio" ] } opentelemetry-prometheus = { version = "0.10", optional = true } diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml index d6e2adfe..2c2e2bfe 100644 --- a/src/model/Cargo.toml +++ b/src/model/Cargo.toml @@ -39,8 +39,7 @@ futures-util = "0.3" tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] } opentelemetry = "0.17" -#netapp = "0.4" -netapp = { version = "0.5", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } +netapp = "0.5" [features] k2v = [ "garage_util/k2v" ] diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index d7136401..079cfe34 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -45,8 +45,7 @@ tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi tokio-stream = { version = "0.1", features = 
["net"] } opentelemetry = "0.17" -#netapp = { version = "0.4.5", features = ["telemetry"] } -netapp = { version = "0.5.0", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } +netapp = { version = "0.5.0", features = ["telemetry"] } hyper = { version = "0.14", features = ["client", "http1", "runtime", "tcp"] } diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml index c648e13b..8e978fc2 100644 --- a/src/util/Cargo.toml +++ b/src/util/Cargo.toml @@ -39,8 +39,7 @@ toml = "0.5" futures = "0.3" tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] } -#netapp = "0.4" -netapp = { version = "0.5", git = "https://git.deuxfleurs.fr/lx/netapp", branch = "stream-body", features = ["telemetry"] } +netapp = "0.5" http = "0.2" hyper = "0.14" -- cgit v1.2.3 From ff30891999b5be5421b80b89da1037e943179d2d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 13 Sep 2022 15:13:07 +0200 Subject: Use streaming block API for get with Range requests --- src/api/s3/get.rs | 93 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 33 deletions(-) (limited to 'src') diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index dd95f6e7..ae4c287d 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -7,10 +7,9 @@ use http::header::{ ACCEPT_RANGES, CONTENT_LENGTH, CONTENT_RANGE, CONTENT_TYPE, ETAG, IF_MODIFIED_SINCE, IF_NONE_MATCH, LAST_MODIFIED, RANGE, }; -use hyper::body::Bytes; use hyper::{Body, Request, Response, StatusCode}; -use garage_rpc::rpc_helper::OrderTag; +use garage_rpc::rpc_helper::{netapp::stream::ByteStream, OrderTag}; use garage_table::EmptyKey; use garage_util::data::*; @@ -274,14 +273,7 @@ pub async fn handle_get( .block_manager .rpc_get_block_streaming(&hash, Some(order_stream.order(i as u64))) .await - .unwrap_or_else(|e| { - Box::pin(futures::stream::once(async move { - Err(std::io::Error::new( - std::io::ErrorKind::Other, - format!("Could not get block {}: {}", i, e), - )) - })) - }) + .unwrap_or_else(|e| error_stream(i, e)) } } }) @@ -437,44 +429,79 @@ fn body_from_blocks_range( all_blocks.len(), 4 + ((end - begin) / std::cmp::max(all_blocks[0].1.size as u64, 1024)) as usize, )); - let mut true_offset = 0; + let mut block_offset: u64 = 0; for (_, b) in all_blocks.iter() { - if true_offset >= end { + if block_offset >= end { break; } // Keep only blocks that have an intersection with the requested range - if true_offset < end && true_offset + b.size > begin { - blocks.push((*b, true_offset)); + if block_offset < end && block_offset + b.size > begin { + blocks.push((*b, block_offset)); } - true_offset += b.size; + block_offset += b.size as u64; } let order_stream = OrderTag::stream(); let body_stream = futures::stream::iter(blocks) .enumerate() - .map(move |(i, (block, true_offset))| { + .map(move |(i, (block, block_offset))| { let garage = garage.clone(); async move { - let data = garage + garage .block_manager - .rpc_get_block(&block.hash, Some(order_stream.order(i as u64))) - .await?; - let start_in_block = if true_offset > begin { - 0 - } else { - begin - true_offset - }; - let end_in_block = if true_offset + block.size < end { - block.size - } else { - end - true_offset - }; - Result::::Ok( - data.slice(start_in_block as usize..end_in_block as usize), - ) + .rpc_get_block_streaming(&block.hash, Some(order_stream.order(i as u64))) + .await + .unwrap_or_else(|e| error_stream(i, e)) + .scan(block_offset, move 
|chunk_offset, chunk| { + let r = match chunk { + Ok(chunk_bytes) => { + let chunk_len = chunk_bytes.len() as u64; + let r = if *chunk_offset >= end { + // The current chunk is after the part we want to read. + // Returning None here will stop the scan, the rest of the + // stream will be ignored + None + } else if *chunk_offset + chunk_len <= begin { + // The current chunk is before the part we want to read. + // We return a None that will be removed by the filter_map + // below. + Some(None) + } else { + // The chunk has an intersection with the requested range + let start_in_chunk = if *chunk_offset > begin { + 0 + } else { + begin - *chunk_offset + }; + let end_in_chunk = if *chunk_offset + chunk_len < end { + chunk_len + } else { + end - *chunk_offset + }; + Some(Some(Ok(chunk_bytes + .slice(start_in_chunk as usize..end_in_chunk as usize)))) + }; + *chunk_offset += chunk_bytes.len() as u64; + r + } + Err(e) => Some(Some(Err(e))), + }; + futures::future::ready(r) + }) + .filter_map(futures::future::ready) } }) - .buffered(2); + .buffered(2) + .flatten(); hyper::body::Body::wrap_stream(body_stream) } + +fn error_stream(i: usize, e: garage_util::error::Error) -> ByteStream { + Box::pin(futures::stream::once(async move { + Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!("Could not get block {}: {}", i, e), + )) + })) +} -- cgit v1.2.3 From 07febd3ecd0d491ed01b7ca43846aa43e423b2a1 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 13 Sep 2022 15:57:27 +0200 Subject: Ensure data dir is created immediately when Garage starts (fix #349) --- src/model/garage.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/model/garage.rs b/src/model/garage.rs index 66c359e7..ec1ec956 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -6,7 +6,7 @@ use garage_db as db; use garage_util::background::*; use garage_util::config::*; -use garage_util::error::Error; +use garage_util::error::*; use garage_rpc::system::System; @@ -76,9 +76,14 @@ pub struct GarageK2V { impl Garage { /// Create and run garage pub fn new(config: Config, background: Arc) -> Result, Error> { + // Create meta dir and data dir if they don't exist already + std::fs::create_dir_all(&config.metadata_dir) + .ok_or_message("Unable to create Garage metadata directory")?; + std::fs::create_dir_all(&config.data_dir) + .ok_or_message("Unable to create Garage data directory")?; + info!("Opening database..."); let mut db_path = config.metadata_dir.clone(); - std::fs::create_dir_all(&db_path).expect("Unable to create Garage meta data directory"); let db = match config.db_engine.as_str() { // ---- Sled DB ---- #[cfg(feature = "sled")] -- cgit v1.2.3 From 44733474bb6c9021c92b59e5c349b4b7ef71409a Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 13 Sep 2022 16:01:55 +0200 Subject: Remove/change println! 
in server code (fix #358) --- src/api/s3/bucket.rs | 1 - src/api/s3/copy.rs | 1 - src/rpc/kubernetes.rs | 2 +- src/table/table.rs | 1 - 4 files changed, 1 insertion(+), 4 deletions(-) (limited to 'src') diff --git a/src/api/s3/bucket.rs b/src/api/s3/bucket.rs index 2071fe55..3ac6a6ec 100644 --- a/src/api/s3/bucket.rs +++ b/src/api/s3/bucket.rs @@ -295,7 +295,6 @@ fn parse_create_bucket_xml(xml_bytes: &[u8]) -> Option> { let mut ret = None; for item in cbc.children() { - println!("{:?}", item); if item.has_tag_name("LocationConstraint") { if ret != None { return None; diff --git a/src/api/s3/copy.rs b/src/api/s3/copy.rs index a1a8c9a4..c5a0fc11 100644 --- a/src/api/s3/copy.rs +++ b/src/api/s3/copy.rs @@ -662,7 +662,6 @@ mod tests { last_modified: s3_xml::Value("2011-04-11T20:34:56.000Z".into()), etag: s3_xml::Value("\"9b2cf535f27731c974343645a3985328\"".into()), }; - println!("{}", to_xml_with_header(&v)?); assert_eq!(to_xml_with_header(&v)?, expected_retval); diff --git a/src/rpc/kubernetes.rs b/src/rpc/kubernetes.rs index 939a0eed..197245aa 100644 --- a/src/rpc/kubernetes.rs +++ b/src/rpc/kubernetes.rs @@ -56,7 +56,7 @@ pub async fn get_kubernetes_nodes( let mut ret = Vec::with_capacity(nodes.items.len()); for node in nodes { - println!("Found Pod: {:?}", node.metadata.name); + info!("Found Pod: {:?}", node.metadata.name); let pubkey = &node .metadata diff --git a/src/table/table.rs b/src/table/table.rs index 51f3837f..8e801be6 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -113,7 +113,6 @@ where async fn insert_internal(&self, e: &F::E) -> Result<(), Error> { let hash = e.partition_key().hash(); let who = self.data.replication.write_nodes(&hash); - //eprintln!("insert who: {:?}", who); let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(e)?)); let rpc = TableRpc::::Update(vec![e_enc]); -- cgit v1.2.3 From 38be811b1cd20d9223b481c0ea91cc7e3ee795dc Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 13 Sep 2022 16:08:00 +0200 Subject: Fix clippy lint that says we should implement Eq --- src/api/s3/copy.rs | 4 ++-- src/api/s3/xml.rs | 42 ++++++++++++++++++------------------ src/model/bucket_alias_table.rs | 2 +- src/model/bucket_table.rs | 4 ++-- src/model/index_counter.rs | 2 +- src/model/key_table.rs | 4 ++-- src/model/prev/v051/bucket_table.rs | 6 +++--- src/model/prev/v051/key_table.rs | 2 +- src/model/prev/v051/object_table.rs | 6 +++--- src/model/prev/v051/version_table.rs | 2 +- src/model/s3/block_ref_table.rs | 2 +- src/model/s3/object_table.rs | 6 +++--- src/model/s3/version_table.rs | 2 +- src/util/crdt/bool.rs | 2 +- src/util/crdt/deletable.rs | 2 +- src/util/crdt/lww.rs | 2 +- src/util/crdt/lww_map.rs | 2 +- src/util/crdt/map.rs | 2 +- 18 files changed, 47 insertions(+), 47 deletions(-) (limited to 'src') diff --git a/src/api/s3/copy.rs b/src/api/s3/copy.rs index c5a0fc11..7eb6459d 100644 --- a/src/api/s3/copy.rs +++ b/src/api/s3/copy.rs @@ -609,7 +609,7 @@ impl> Defragmenter { } } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct CopyObjectResult { #[serde(rename = "LastModified")] pub last_modified: s3_xml::Value, @@ -617,7 +617,7 @@ pub struct CopyObjectResult { pub etag: s3_xml::Value, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct CopyPartResult { #[serde(serialize_with = "xmlns_tag")] pub xmlns: (), diff --git a/src/api/s3/xml.rs b/src/api/s3/xml.rs index 111657a0..06f11288 100644 --- a/src/api/s3/xml.rs +++ b/src/api/s3/xml.rs @@ -25,7 +25,7 @@ impl From<&str> 
for Value { #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] pub struct IntValue(#[serde(rename = "$value")] pub i64); -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct Bucket { #[serde(rename = "CreationDate")] pub creation_date: Value, @@ -33,7 +33,7 @@ pub struct Bucket { pub name: Value, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct Owner { #[serde(rename = "DisplayName")] pub display_name: Value, @@ -41,13 +41,13 @@ pub struct Owner { pub id: Value, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct BucketList { #[serde(rename = "Bucket")] pub entries: Vec, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct ListAllMyBucketsResult { #[serde(rename = "Buckets")] pub buckets: BucketList, @@ -55,7 +55,7 @@ pub struct ListAllMyBucketsResult { pub owner: Owner, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct LocationConstraint { #[serde(serialize_with = "xmlns_tag")] pub xmlns: (), @@ -63,7 +63,7 @@ pub struct LocationConstraint { pub region: String, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct Deleted { #[serde(rename = "Key")] pub key: Value, @@ -73,7 +73,7 @@ pub struct Deleted { pub delete_marker_version_id: Value, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct Error { #[serde(rename = "Code")] pub code: Value, @@ -85,7 +85,7 @@ pub struct Error { pub region: Option, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct DeleteError { #[serde(rename = "Code")] pub code: Value, @@ -97,7 +97,7 @@ pub struct DeleteError { pub version_id: Option, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct DeleteResult { #[serde(serialize_with = "xmlns_tag")] pub xmlns: (), @@ -107,7 +107,7 @@ pub struct DeleteResult { pub errors: Vec, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct InitiateMultipartUploadResult { #[serde(serialize_with = "xmlns_tag")] pub xmlns: (), @@ -119,7 +119,7 @@ pub struct InitiateMultipartUploadResult { pub upload_id: Value, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct CompleteMultipartUploadResult { #[serde(serialize_with = "xmlns_tag")] pub xmlns: (), @@ -133,7 +133,7 @@ pub struct CompleteMultipartUploadResult { pub etag: Value, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct Initiator { #[serde(rename = "DisplayName")] pub display_name: Value, @@ -141,7 +141,7 @@ pub struct Initiator { pub id: Value, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct ListMultipartItem { #[serde(rename = "Initiated")] pub initiated: Value, @@ -157,7 +157,7 @@ pub struct ListMultipartItem { pub storage_class: Value, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct ListMultipartUploadsResult { #[serde(serialize_with = "xmlns_tag")] pub xmlns: (), @@ -187,7 +187,7 @@ pub struct ListMultipartUploadsResult { pub encoding_type: Option, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct PartItem { #[serde(rename = "ETag")] pub etag: Value, @@ -199,7 +199,7 @@ pub 
struct PartItem { pub size: IntValue, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct ListPartsResult { #[serde(serialize_with = "xmlns_tag")] pub xmlns: (), @@ -227,7 +227,7 @@ pub struct ListPartsResult { pub storage_class: Value, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct ListBucketItem { #[serde(rename = "Key")] pub key: Value, @@ -241,13 +241,13 @@ pub struct ListBucketItem { pub storage_class: Value, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct CommonPrefix { #[serde(rename = "Prefix")] pub prefix: Value, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct ListBucketResult { #[serde(serialize_with = "xmlns_tag")] pub xmlns: (), @@ -281,7 +281,7 @@ pub struct ListBucketResult { pub common_prefixes: Vec, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct VersioningConfiguration { #[serde(serialize_with = "xmlns_tag")] pub xmlns: (), @@ -289,7 +289,7 @@ pub struct VersioningConfiguration { pub status: Option, } -#[derive(Debug, Serialize, PartialEq)] +#[derive(Debug, Serialize, PartialEq, Eq)] pub struct PostObject { #[serde(serialize_with = "xmlns_tag")] pub xmlns: (), diff --git a/src/model/bucket_alias_table.rs b/src/model/bucket_alias_table.rs index fce03d04..fcd1536e 100644 --- a/src/model/bucket_alias_table.rs +++ b/src/model/bucket_alias_table.rs @@ -7,7 +7,7 @@ use garage_table::*; /// The bucket alias table holds the names given to buckets /// in the global namespace. -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct BucketAlias { name: String, pub state: crdt::Lww>, diff --git a/src/model/bucket_table.rs b/src/model/bucket_table.rs index 130eb6a6..7be42702 100644 --- a/src/model/bucket_table.rs +++ b/src/model/bucket_table.rs @@ -12,7 +12,7 @@ use crate::permission::BucketKeyPerm; /// Its parameters are not directly accessible as: /// - It must be possible to merge paramaters, hence the use of a LWW CRDT. /// - A bucket has 2 states, Present or Deleted and parameters make sense only if present. 
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct Bucket { /// ID of the bucket pub id: Uuid, @@ -21,7 +21,7 @@ pub struct Bucket { } /// Configuration for a bucket -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct BucketParams { /// Bucket's creation date pub creation_date: u64, diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs index 26833390..e6394f0c 100644 --- a/src/model/index_counter.rs +++ b/src/model/index_counter.rs @@ -81,7 +81,7 @@ impl CounterEntry { } /// A counter entry in the global table -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct CounterValue { pub node_values: BTreeMap, } diff --git a/src/model/key_table.rs b/src/model/key_table.rs index 7288f6e4..9d2fc783 100644 --- a/src/model/key_table.rs +++ b/src/model/key_table.rs @@ -9,7 +9,7 @@ use crate::permission::BucketKeyPerm; use crate::prev::v051::key_table as old; /// An api key -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct Key { /// The id of the key (immutable), used as partition key pub key_id: String, @@ -19,7 +19,7 @@ pub struct Key { } /// Configuration for a key -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct KeyParams { /// The secret_key associated (immutable) pub secret_key: String, diff --git a/src/model/prev/v051/bucket_table.rs b/src/model/prev/v051/bucket_table.rs index 0c52b6ea..628a49dd 100644 --- a/src/model/prev/v051/bucket_table.rs +++ b/src/model/prev/v051/bucket_table.rs @@ -10,7 +10,7 @@ use super::key_table::PermissionSet; /// Its parameters are not directly accessible as: /// - It must be possible to merge paramaters, hence the use of a LWW CRDT. /// - A bucket has 2 states, Present or Deleted and parameters make sense only if present. 
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct Bucket { /// Name of the bucket pub name: String, @@ -19,7 +19,7 @@ pub struct Bucket { } /// State of a bucket -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub enum BucketState { /// The bucket is deleted Deleted, @@ -41,7 +41,7 @@ impl Crdt for BucketState { } /// Configuration for a bucket -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct BucketParams { /// Map of key with access to the bucket, and what kind of access they give pub authorized_keys: crdt::LwwMap, diff --git a/src/model/prev/v051/key_table.rs b/src/model/prev/v051/key_table.rs index fee24741..37516b1c 100644 --- a/src/model/prev/v051/key_table.rs +++ b/src/model/prev/v051/key_table.rs @@ -4,7 +4,7 @@ use garage_table::crdt::*; use garage_table::*; /// An api key -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct Key { /// The id of the key (immutable), used as partition key pub key_id: String, diff --git a/src/model/prev/v051/object_table.rs b/src/model/prev/v051/object_table.rs index cb59b309..e79e5787 100644 --- a/src/model/prev/v051/object_table.rs +++ b/src/model/prev/v051/object_table.rs @@ -6,7 +6,7 @@ use garage_util::data::*; use garage_table::crdt::*; /// An object -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct Object { /// The bucket in which the object is stored, used as partition key pub bucket: String, @@ -26,7 +26,7 @@ impl Object { } /// Informations about a version of an object -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct ObjectVersion { /// Id of the version pub uuid: Uuid, @@ -37,7 +37,7 @@ pub struct ObjectVersion { } /// State of an object version -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub enum ObjectVersionState { /// The version is being received Uploading(ObjectVersionHeaders), diff --git a/src/model/prev/v051/version_table.rs b/src/model/prev/v051/version_table.rs index 1e658f91..c11c62d5 100644 --- a/src/model/prev/v051/version_table.rs +++ b/src/model/prev/v051/version_table.rs @@ -6,7 +6,7 @@ use garage_table::crdt::*; use garage_table::*; /// A version of an object -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct Version { /// UUID of the version, used as partition key pub uuid: Uuid, diff --git a/src/model/s3/block_ref_table.rs b/src/model/s3/block_ref_table.rs index 9589b4aa..c7017409 100644 --- a/src/model/s3/block_ref_table.rs +++ b/src/model/s3/block_ref_table.rs @@ -10,7 +10,7 @@ use garage_table::*; use garage_block::manager::*; -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct BlockRef { /// Hash (blake2 sum) of the block, used as partition key pub block: Hash, diff --git a/src/model/s3/object_table.rs b/src/model/s3/object_table.rs index a151f1b1..26ff57f6 100644 --- a/src/model/s3/object_table.rs +++ b/src/model/s3/object_table.rs @@ -21,7 +21,7 @@ pub const UNFINISHED_UPLOADS: &str 
= "unfinished_uploads"; pub const BYTES: &str = "bytes"; /// An object -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct Object { /// The bucket in which the object is stored, used as partition key pub bucket_id: Uuid, @@ -70,7 +70,7 @@ impl Object { } /// Informations about a version of an object -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct ObjectVersion { /// Id of the version pub uuid: Uuid, @@ -81,7 +81,7 @@ pub struct ObjectVersion { } /// State of an object version -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub enum ObjectVersionState { /// The version is being received Uploading(ObjectVersionHeaders), diff --git a/src/model/s3/version_table.rs b/src/model/s3/version_table.rs index b545e66a..6bc2ecd1 100644 --- a/src/model/s3/version_table.rs +++ b/src/model/s3/version_table.rs @@ -15,7 +15,7 @@ use crate::s3::block_ref_table::*; use crate::prev::v051::version_table as old; /// A version of an object -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct Version { /// UUID of the version, used as partition key pub uuid: Uuid, diff --git a/src/util/crdt/bool.rs b/src/util/crdt/bool.rs index 53af8f82..111eb5f1 100644 --- a/src/util/crdt/bool.rs +++ b/src/util/crdt/bool.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use crate::crdt::crdt::*; /// Boolean, where `true` is an absorbing state -#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq)] +#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq)] pub struct Bool(bool); impl Bool { diff --git a/src/util/crdt/deletable.rs b/src/util/crdt/deletable.rs index c76f5cbb..e771aceb 100644 --- a/src/util/crdt/deletable.rs +++ b/src/util/crdt/deletable.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use crate::crdt::crdt::*; /// Deletable object (once deleted, cannot go back) -#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq)] +#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq)] pub enum Deletable { Present(T), Deleted, diff --git a/src/util/crdt/lww.rs b/src/util/crdt/lww.rs index 254abe8e..958844c9 100644 --- a/src/util/crdt/lww.rs +++ b/src/util/crdt/lww.rs @@ -37,7 +37,7 @@ use crate::crdt::crdt::*; /// /// This scheme is used by AWS S3 or Soundcloud and often without knowing /// in enterprise when reconciliating databases with ad-hoc scripts. -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] pub struct Lww { ts: u64, v: T, diff --git a/src/util/crdt/lww_map.rs b/src/util/crdt/lww_map.rs index 91d24c7f..88113856 100644 --- a/src/util/crdt/lww_map.rs +++ b/src/util/crdt/lww_map.rs @@ -23,7 +23,7 @@ use crate::crdt::crdt::*; /// However, note that even if we were using a more efficient data structure such as a `BTreeMap`, /// the serialization cost `O(n)` would still have to be paid at each modification, so we are /// actually not losing anything here. 
-#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] pub struct LwwMap { vals: Vec<(K, u64, V)>, } diff --git a/src/util/crdt/map.rs b/src/util/crdt/map.rs index f9ed19b6..5d1e1520 100644 --- a/src/util/crdt/map.rs +++ b/src/util/crdt/map.rs @@ -16,7 +16,7 @@ use crate::crdt::crdt::*; /// However, note that even if we were using a more efficient data structure such as a `BTreeMap`, /// the serialization cost `O(n)` would still have to be paid at each modification, so we are /// actually not losing anything here. -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] pub struct Map { vals: Vec<(K, V)>, } -- cgit v1.2.3 From ab722cb40f5aacf661a280b7eb025acd3aefc1bb Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 13 Sep 2022 16:22:23 +0200 Subject: Add checks on replication_factor of layouts we use (fix #363, fix #364) --- src/model/garage.rs | 2 +- src/rpc/system.rs | 30 +++++++++++++++++++++++++----- 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'src') diff --git a/src/model/garage.rs b/src/model/garage.rs index ec1ec956..75012952 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -169,7 +169,7 @@ impl Garage { background.clone(), replication_mode.replication_factor(), &config, - ); + )?; let data_rep_param = TableShardedReplication { system: system.clone(), diff --git a/src/rpc/system.rs b/src/rpc/system.rs index c0e70c61..228b66a4 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -198,7 +198,7 @@ impl System { background: Arc, replication_factor: usize, config: &Config, - ) -> Arc { + ) -> Result, Error> { let node_key = gen_node_key(&config.metadata_dir).expect("Unable to read or generate node ID"); info!( @@ -206,11 +206,21 @@ impl System { hex::encode(&node_key.public_key()[..8]) ); - let persist_cluster_layout = Persister::new(&config.metadata_dir, "cluster_layout"); + let persist_cluster_layout: Persister = + Persister::new(&config.metadata_dir, "cluster_layout"); let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list"); let cluster_layout = match persist_cluster_layout.load() { - Ok(x) => x, + Ok(x) => { + if x.replication_factor != replication_factor { + return Err(Error::Message(format!( + "Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.", + x.replication_factor, + replication_factor + ))); + } + x + } Err(e) => { info!( "No valid previous cluster layout stored ({}), starting fresh.", @@ -303,7 +313,7 @@ impl System { metadata_dir: config.metadata_dir.clone(), }); sys.system_endpoint.set_handler(sys.clone()); - sys + Ok(sys) } /// Perform bootstraping, starting the ping loop @@ -485,7 +495,7 @@ impl System { let local_info = self.local_status.load(); if local_info.replication_factor < info.replication_factor { - error!("Some node have a higher replication factor ({}) than this one ({}). This is not supported and might lead to bugs", + error!("Some node have a higher replication factor ({}) than this one ({}). This is not supported and will lead to data corruption. 
Shutting down for safety.", info.replication_factor, local_info.replication_factor); std::process::exit(1); @@ -513,6 +523,16 @@ impl System { self: &Arc, adv: &ClusterLayout, ) -> Result { + if adv.replication_factor != self.replication_factor { + let msg = format!( + "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.", + adv.replication_factor, + self.replication_factor + ); + error!("{}", msg); + return Err(Error::Message(msg)); + } + let update_ring = self.update_ring.lock().await; let mut layout: ClusterLayout = self.ring.borrow().layout.clone(); -- cgit v1.2.3 From e46dc2a8ef8a12e49aed3883b34b538b5f65ca31 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 14 Sep 2022 16:09:38 +0200 Subject: Allow for hostnames in bootstrap_peers and rpc_public_addr (fix #353) --- src/garage/main.rs | 8 +++++- src/rpc/Cargo.toml | 2 +- src/rpc/system.rs | 73 ++++++++++++++++++++++++++++++++++++++++++------------ src/util/config.rs | 28 +++------------------ 4 files changed, 68 insertions(+), 43 deletions(-) (limited to 'src') diff --git a/src/garage/main.rs b/src/garage/main.rs index 0eca24ae..e5cba553 100644 --- a/src/garage/main.rs +++ b/src/garage/main.rs @@ -162,7 +162,13 @@ async fn cli_command(opt: Opt) -> Result<(), Error> { } else { let node_id = garage_rpc::system::read_node_id(&config.as_ref().unwrap().metadata_dir) .err_context(READ_KEY_ERROR)?; - if let Some(a) = config.as_ref().and_then(|c| c.rpc_public_addr) { + if let Some(a) = config.as_ref().and_then(|c| c.rpc_public_addr.as_ref()) { + use std::net::ToSocketAddrs; + let a = a + .to_socket_addrs() + .ok_or_message("unable to resolve rpc_public_addr specified in config file")? 
+ .next() + .ok_or_message("unable to resolve rpc_public_addr specified in config file")?; (node_id, a) } else { let default_addr = SocketAddr::new( diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index 079cfe34..e51f1f73 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -45,7 +45,7 @@ tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi tokio-stream = { version = "0.1", features = ["net"] } opentelemetry = "0.17" -netapp = { version = "0.5.0", features = ["telemetry"] } +netapp = { version = "0.5.1", features = ["telemetry"] } hyper = { version = "0.14", features = ["client", "http1", "runtime", "tcp"] } diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 228b66a4..2c6136a8 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -18,7 +18,7 @@ use tokio::sync::Mutex; use netapp::endpoint::{Endpoint, EndpointHandler}; use netapp::message::*; use netapp::peering::fullmesh::FullMeshPeeringStrategy; -use netapp::util::parse_and_resolve_peer_addr; +use netapp::util::parse_and_resolve_peer_addr_async; use netapp::{NetApp, NetworkKey, NodeID, NodeKey}; use garage_util::background::BackgroundRunner; @@ -92,7 +92,7 @@ pub struct System { rpc_listen_addr: SocketAddr, rpc_public_addr: Option, - bootstrap_peers: Vec<(NodeID, SocketAddr)>, + bootstrap_peers: Vec, consul_discovery: Option, #[cfg(feature = "kubernetes-discovery")] @@ -242,8 +242,29 @@ impl System { let ring = Ring::new(cluster_layout, replication_factor); let (update_ring, ring) = watch::channel(Arc::new(ring)); - let rpc_public_addr = match config.rpc_public_addr { - Some(a) => Some(a), + let rpc_public_addr = match &config.rpc_public_addr { + Some(a_str) => { + use std::net::ToSocketAddrs; + match a_str.to_socket_addrs() { + Err(e) => { + error!( + "Cannot resolve rpc_public_addr {} from config file: {}.", + a_str, e + ); + None + } + Ok(a) => { + let a = a.collect::>(); + if a.is_empty() { + error!("rpc_public_addr {} resolve to no known IP address", a_str); + } + if a.len() > 1 { + warn!("Multiple possible resolutions for rpc_public_addr: {:?}. 
Taking the first one.", a); + } + a.into_iter().next() + } + } + } None => { let addr = get_default_ip().map(|ip| SocketAddr::new(ip, config.rpc_bind_addr.port())); @@ -253,13 +274,12 @@ impl System { addr } }; + if rpc_public_addr.is_none() { + warn!("This Garage node does not know its publicly reachable RPC address, this might hamper intra-cluster communication."); + } let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key); - let fullmesh = FullMeshPeeringStrategy::new( - netapp.clone(), - config.bootstrap_peers.clone(), - rpc_public_addr, - ); + let fullmesh = FullMeshPeeringStrategy::new(netapp.clone(), vec![], rpc_public_addr); let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into()); @@ -370,12 +390,14 @@ impl System { } pub async fn connect(&self, node: &str) -> Result<(), Error> { - let (pubkey, addrs) = parse_and_resolve_peer_addr(node).ok_or_else(|| { - Error::Message(format!( - "Unable to parse or resolve node specification: {}", - node - )) - })?; + let (pubkey, addrs) = parse_and_resolve_peer_addr_async(node) + .await + .ok_or_else(|| { + Error::Message(format!( + "Unable to parse or resolve node specification: {}", + node + )) + })?; let mut errors = vec![]; for ip in addrs.iter() { match self @@ -604,7 +626,7 @@ impl System { if not_configured || no_peers || bad_peers { info!("Doing a bootstrap/discovery step (not_configured: {}, no_peers: {}, bad_peers: {})", not_configured, no_peers, bad_peers); - let mut ping_list = self.bootstrap_peers.clone(); + let mut ping_list = resolve_peers(&self.bootstrap_peers).await; // Add peer list from list stored on disk if let Ok(peers) = self.persist_peer_list.load_async().await { @@ -735,6 +757,25 @@ fn get_default_ip() -> Option { .map(|a| a.ip()) } +async fn resolve_peers(peers: &[String]) -> Vec<(NodeID, SocketAddr)> { + let mut ret = vec![]; + + for peer in peers.iter() { + match parse_and_resolve_peer_addr_async(peer).await { + Some((pubkey, addrs)) => { + for ip in addrs { + ret.push((pubkey, ip)); + } + } + None => { + warn!("Unable to parse and/or resolve peer hostname {}", peer); + } + } + } + + ret +} + struct ConsulDiscoveryParam { consul_host: String, service_name: String, diff --git a/src/util/config.rs b/src/util/config.rs index cccad101..5e113e13 100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -3,12 +3,8 @@ use std::io::Read; use std::net::SocketAddr; use std::path::PathBuf; -use serde::de::Error as SerdeError; use serde::{de, Deserialize}; -use netapp::util::parse_and_resolve_peer_addr; -use netapp::NodeID; - use crate::error::Error; /// Represent the whole configuration @@ -43,11 +39,11 @@ pub struct Config { /// Address to bind for RPC pub rpc_bind_addr: SocketAddr, /// Public IP address of this node - pub rpc_public_addr: Option, + pub rpc_public_addr: Option, /// Bootstrap peers RPC address - #[serde(deserialize_with = "deserialize_vec_addr", default)] - pub bootstrap_peers: Vec<(NodeID, SocketAddr)>, + #[serde(default)] + pub bootstrap_peers: Vec, /// Consul host to connect to to discover more peers pub consul_host: Option, /// Consul service name to use @@ -154,24 +150,6 @@ pub fn read_config(config_file: PathBuf) -> Result { Ok(toml::from_str(&config)?) } -fn deserialize_vec_addr<'de, D>(deserializer: D) -> Result, D::Error> -where - D: de::Deserializer<'de>, -{ - let mut ret = vec![]; - - for peer in >::deserialize(deserializer)? 
{ - let (pubkey, addrs) = parse_and_resolve_peer_addr(peer).ok_or_else(|| { - D::Error::custom(format!("Unable to parse or resolve peer: {}", peer)) - })?; - for ip in addrs { - ret.push((pubkey, ip)); - } - } - - Ok(ret) -} - fn default_compression() -> Option { Some(1) } -- cgit v1.2.3 From 76f42a1a2b5cf088968a0730cf6de31b75f7a055 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 14 Sep 2022 17:07:55 +0200 Subject: Properly return HTTP 204 when deleting non-existent object (fix #227) --- src/api/s3/delete.rs | 15 +++++++-------- src/garage/tests/s3/objects.rs | 9 +++++++++ 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'src') diff --git a/src/api/s3/delete.rs b/src/api/s3/delete.rs index 5065b285..b337155f 100644 --- a/src/api/s3/delete.rs +++ b/src/api/s3/delete.rs @@ -64,14 +64,13 @@ pub async fn handle_delete( bucket_id: Uuid, key: &str, ) -> Result, Error> { - let (_deleted_version, delete_marker_version) = - handle_delete_internal(&garage, bucket_id, key).await?; - - Ok(Response::builder() - .header("x-amz-version-id", hex::encode(delete_marker_version)) - .status(StatusCode::NO_CONTENT) - .body(Body::from(vec![])) - .unwrap()) + match handle_delete_internal(&garage, bucket_id, key).await { + Ok(_) | Err(Error::NoSuchKey) => Ok(Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::from(vec![])) + .unwrap()), + Err(e) => Err(e), + } } pub async fn handle_delete_objects( diff --git a/src/garage/tests/s3/objects.rs b/src/garage/tests/s3/objects.rs index e1175b81..65f9e867 100644 --- a/src/garage/tests/s3/objects.rs +++ b/src/garage/tests/s3/objects.rs @@ -263,4 +263,13 @@ async fn test_deleteobject() { .unwrap(); assert!(l.contents.is_none()); + + // Deleting a non-existing object shouldn't be a problem + ctx.client + .delete_object() + .bucket(&bucket) + .key("l-0") + .send() + .await + .unwrap(); } -- cgit v1.2.3 From 5d4b6f2173344d59d59c7f6336c5d21799f8b37d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 19 Sep 2022 12:16:38 +0200 Subject: Faster GetObject workflow for getting entire objects --- src/api/Cargo.toml | 1 + src/api/s3/get.rs | 86 ++++++++++++++++++++++++++++++++---------------------- 2 files changed, 52 insertions(+), 35 deletions(-) (limited to 'src') diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index cdfabcb8..7c3ed43b 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -38,6 +38,7 @@ futures = "0.3" futures-util = "0.3" pin-project = "1.0" tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] } +tokio-stream = "0.1" form_urlencoded = "1.0.0" http = "0.2" diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index ae4c287d..2a99551a 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -2,16 +2,19 @@ use std::sync::Arc; use std::time::{Duration, UNIX_EPOCH}; -use futures::stream::*; +use futures::future; +use futures::stream::{self, StreamExt}; use http::header::{ ACCEPT_RANGES, CONTENT_LENGTH, CONTENT_RANGE, CONTENT_TYPE, ETAG, IF_MODIFIED_SINCE, IF_NONE_MATCH, LAST_MODIFIED, RANGE, }; use hyper::{Body, Request, Response, StatusCode}; +use tokio::sync::mpsc; use garage_rpc::rpc_helper::{netapp::stream::ByteStream, OrderTag}; use garage_table::EmptyKey; use garage_util::data::*; +use garage_util::error::OkOrMessage; use garage_model::garage::Garage; use garage_model::s3::object_table::*; @@ -242,43 +245,56 @@ pub async fn handle_get( Ok(resp_builder.body(body)?) 
} ObjectVersionData::FirstBlock(_, first_block_hash) => { - let order_stream = OrderTag::stream(); - - let read_first_block = garage - .block_manager - .rpc_get_block_streaming(first_block_hash, Some(order_stream.order(0))); - let get_next_blocks = garage.version_table.get(&last_v.uuid, &EmptyKey); + let (tx, rx) = mpsc::channel(2); - let (first_block_stream, version) = - futures::try_join!(read_first_block, get_next_blocks)?; - let version = version.ok_or(Error::NoSuchKey)?; + let order_stream = OrderTag::stream(); + let first_block_hash = *first_block_hash; + let version_uuid = last_v.uuid; + + tokio::spawn(async move { + match async { + let garage2 = garage.clone(); + let version_fut = tokio::spawn(async move { + garage2.version_table.get(&version_uuid, &EmptyKey).await + }); + + let stream_block_0 = garage + .block_manager + .rpc_get_block_streaming(&first_block_hash, Some(order_stream.order(0))) + .await?; + tx.send(stream_block_0) + .await + .ok_or_message("channel closed")?; + + let version = version_fut.await.unwrap()?.ok_or(Error::NoSuchKey)?; + for (i, (_, vb)) in version.blocks.items().iter().enumerate().skip(1) { + let stream_block_i = garage + .block_manager + .rpc_get_block_streaming(&vb.hash, Some(order_stream.order(i as u64))) + .await?; + tx.send(stream_block_i) + .await + .ok_or_message("channel closed")?; + } - let mut blocks = version - .blocks - .items() - .iter() - .map(|(_, vb)| (vb.hash, None)) - .collect::>(); - blocks[0].1 = Some(first_block_stream); - - let body_stream = futures::stream::iter(blocks) - .enumerate() - .map(move |(i, (hash, stream_opt))| { - let garage = garage.clone(); - async move { - if let Some(stream) = stream_opt { - stream - } else { - garage - .block_manager - .rpc_get_block_streaming(&hash, Some(order_stream.order(i as u64))) - .await - .unwrap_or_else(|e| error_stream(i, e)) - } + Ok::<(), Error>(()) + } + .await + { + Ok(()) => (), + Err(e) => { + let err = std::io::Error::new( + std::io::ErrorKind::Other, + format!("Error while getting object data: {}", e), + ); + let _ = tx + .send(Box::pin(stream::once(future::ready(Err(err))))) + .await; } - }) - .buffered(2) - .flatten(); + } + }); + + let body_stream = tokio_stream::wrappers::ReceiverStream::new(rx).flatten(); let body = hyper::body::Body::wrap_stream(body_stream); Ok(resp_builder.body(body)?) 
-- cgit v1.2.3 From 56592e18538b379ccaaa7b7c1990a599ac83b191 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 19 Sep 2022 20:12:19 +0200 Subject: RPC performance changes - configurable ping timeout - single, much higher, configurable RPC timeout - no more concurrency semaphore --- src/block/manager.rs | 18 ++++++++-------- src/block/resync.rs | 14 ++----------- src/model/k2v/rpc.rs | 36 +++++++++++++++----------------- src/rpc/Cargo.toml | 2 +- src/rpc/metrics.rs | 19 +---------------- src/rpc/rpc_helper.rs | 57 +++++++++++++++++++++++++-------------------------- src/rpc/system.rs | 16 +++++++++++---- src/table/gc.rs | 10 ++------- src/table/sync.rs | 16 ++++----------- src/table/table.rs | 14 +++---------- src/util/config.rs | 5 +++++ 11 files changed, 84 insertions(+), 123 deletions(-) (limited to 'src') diff --git a/src/block/manager.rs b/src/block/manager.rs index ec694fc8..7f439b96 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -41,9 +41,6 @@ use crate::resync::*; /// Size under which data will be stored inlined in database instead of as files pub const INLINE_THRESHOLD: usize = 3072; -// Timeout for RPCs that read and write blocks to remote nodes -pub(crate) const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(60); - // The delay between the moment when the reference counter // drops to zero, and the moment where we allow ourselves // to delete the block locally. @@ -183,7 +180,7 @@ impl BlockManager { }; return Ok((header, stream)); } - _ = tokio::time::sleep(BLOCK_RW_TIMEOUT) => { + _ = tokio::time::sleep(self.system.rpc.rpc_timeout()) => { debug!("Node {:?} didn't return block in time, trying next.", node); } }; @@ -235,7 +232,7 @@ impl BlockManager { } } } - _ = tokio::time::sleep(BLOCK_RW_TIMEOUT) => { + _ = tokio::time::sleep(self.system.rpc.rpc_timeout()) => { debug!("Node {:?} didn't return block in time, trying next.", node); } }; @@ -300,8 +297,7 @@ impl BlockManager { &who[..], put_block_rpc, RequestStrategy::with_priority(PRIO_NORMAL | PRIO_SECONDARY) - .with_quorum(self.replication.write_quorum()) - .with_timeout(BLOCK_RW_TIMEOUT), + .with_quorum(self.replication.write_quorum()), ) .await?; @@ -336,7 +332,10 @@ impl BlockManager { // we will fecth it from someone. let this = self.clone(); tokio::spawn(async move { - if let Err(e) = this.resync.put_to_resync(&hash, 2 * BLOCK_RW_TIMEOUT) { + if let Err(e) = this + .resync + .put_to_resync(&hash, 2 * this.system.rpc.rpc_timeout()) + { error!("Block {:?} could not be put in resync queue: {}.", hash, e); } }); @@ -444,7 +443,8 @@ impl BlockManager { Ok(c) => c, Err(e) => { // Not found but maybe we should have had it ?? - self.resync.put_to_resync(hash, 2 * BLOCK_RW_TIMEOUT)?; + self.resync + .put_to_resync(hash, 2 * self.system.rpc.rpc_timeout())?; return Err(Into::into(e)); } }; diff --git a/src/block/resync.rs b/src/block/resync.rs index bde3e98c..ada3ac54 100644 --- a/src/block/resync.rs +++ b/src/block/resync.rs @@ -33,14 +33,6 @@ use garage_table::replication::TableReplication; use crate::manager::*; -// Timeout for RPCs that ask other nodes whether they need a copy -// of a given block before we delete it locally -// The timeout here is relatively low because we don't want to block -// the entire resync loop when some nodes are not responding. -// Nothing will be deleted if the nodes don't answer the queries, -// we will just retry later. 
-const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(15); - // The delay between the time where a resync operation fails // and the time when it is retried, with exponential backoff // (multiplied by 2, 4, 8, 16, etc. for every consecutive failure). @@ -346,8 +338,7 @@ impl BlockResyncManager { &manager.endpoint, &who, BlockRpc::NeedBlockQuery(*hash), - RequestStrategy::with_priority(PRIO_BACKGROUND) - .with_timeout(NEED_BLOCK_QUERY_TIMEOUT), + RequestStrategy::with_priority(PRIO_BACKGROUND), ) .await?; @@ -394,8 +385,7 @@ impl BlockResyncManager { &need_nodes[..], put_block_message, RequestStrategy::with_priority(PRIO_BACKGROUND) - .with_quorum(need_nodes.len()) - .with_timeout(BLOCK_RW_TIMEOUT), + .with_quorum(need_nodes.len()), ) .await .err_context("PutBlock RPC")?; diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs index 90101d0f..a74df277 100644 --- a/src/model/k2v/rpc.rs +++ b/src/model/k2v/rpc.rs @@ -23,7 +23,6 @@ use garage_rpc::system::System; use garage_rpc::*; use garage_table::replication::{TableReplication, TableShardedReplication}; -use garage_table::table::TABLE_RPC_TIMEOUT; use garage_table::{PartitionKey, Table}; use crate::k2v::causality::*; @@ -117,7 +116,6 @@ impl K2VRpcHandler { }), RequestStrategy::with_priority(PRIO_NORMAL) .with_quorum(1) - .with_timeout(TABLE_RPC_TIMEOUT) .interrupt_after_quorum(true), ) .await?; @@ -169,7 +167,6 @@ impl K2VRpcHandler { K2VRpc::InsertManyItems(items), RequestStrategy::with_priority(PRIO_NORMAL) .with_quorum(1) - .with_timeout(TABLE_RPC_TIMEOUT) .interrupt_after_quorum(true), ) .await?; @@ -205,22 +202,23 @@ impl K2VRpcHandler { .replication .write_nodes(&poll_key.partition.hash()); - let resps = self - .system - .rpc - .try_call_many( - &self.endpoint, - &nodes[..], - K2VRpc::PollItem { - key: poll_key, - causal_context, - timeout_msec, - }, - RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(self.item_table.data.replication.read_quorum()) - .with_timeout(Duration::from_millis(timeout_msec) + TABLE_RPC_TIMEOUT), - ) - .await?; + let rpc = self.system.rpc.try_call_many( + &self.endpoint, + &nodes[..], + K2VRpc::PollItem { + key: poll_key, + causal_context, + timeout_msec, + }, + RequestStrategy::with_priority(PRIO_NORMAL) + .with_quorum(self.item_table.data.replication.read_quorum()) + .without_timeout(), + ); + let timeout_duration = Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout(); + let resps = select! 
{ + r = rpc => r?, + _ = tokio::time::sleep(timeout_duration) => return Ok(None), + }; let mut resp: Option = None; for v in resps { diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml index e51f1f73..d61acea4 100644 --- a/src/rpc/Cargo.toml +++ b/src/rpc/Cargo.toml @@ -45,7 +45,7 @@ tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi tokio-stream = { version = "0.1", features = ["net"] } opentelemetry = "0.17" -netapp = { version = "0.5.1", features = ["telemetry"] } +netapp = { version = "0.5.2", features = ["telemetry"] } hyper = { version = "0.14", features = ["client", "http1", "runtime", "tcp"] } diff --git a/src/rpc/metrics.rs b/src/rpc/metrics.rs index c900518c..61f8fa79 100644 --- a/src/rpc/metrics.rs +++ b/src/rpc/metrics.rs @@ -1,31 +1,18 @@ -use std::sync::Arc; - use opentelemetry::{global, metrics::*}; -use tokio::sync::Semaphore; /// TableMetrics reference all counter used for metrics pub struct RpcMetrics { - pub(crate) _rpc_available_permits: ValueObserver, - pub(crate) rpc_counter: Counter, pub(crate) rpc_timeout_counter: Counter, pub(crate) rpc_netapp_error_counter: Counter, pub(crate) rpc_garage_error_counter: Counter, pub(crate) rpc_duration: ValueRecorder, - pub(crate) rpc_queueing_time: ValueRecorder, } impl RpcMetrics { - pub fn new(sem: Arc) -> Self { + pub fn new() -> Self { let meter = global::meter("garage_rpc"); RpcMetrics { - _rpc_available_permits: meter - .u64_value_observer("rpc.available_permits", move |observer| { - observer.observe(sem.available_permits() as u64, &[]) - }) - .with_description("Number of available RPC permits") - .init(), - rpc_counter: meter .u64_counter("rpc.request_counter") .with_description("Number of RPC requests emitted") @@ -46,10 +33,6 @@ impl RpcMetrics { .f64_value_recorder("rpc.duration") .with_description("Duration of RPCs") .init(), - rpc_queueing_time: meter - .f64_value_recorder("rpc.queueing_time") - .with_description("Time RPC requests were queued for before being sent") - .init(), } } } diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 19abb4c5..857ed620 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -7,7 +7,7 @@ use futures::stream::futures_unordered::FuturesUnordered; use futures::stream::StreamExt; use futures_util::future::FutureExt; use tokio::select; -use tokio::sync::{watch, Semaphore}; +use tokio::sync::watch; use opentelemetry::KeyValue; use opentelemetry::{ @@ -32,32 +32,30 @@ use garage_util::metrics::RecordDuration; use crate::metrics::RpcMetrics; use crate::ring::Ring; -const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30); - -// Don't allow more than 100 concurrent outgoing RPCs. 
-const MAX_CONCURRENT_REQUESTS: usize = 100; +// Default RPC timeout = 5 minutes +const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300); /// Strategy to apply when making RPC #[derive(Copy, Clone)] pub struct RequestStrategy { - /// Max time to wait for reponse - pub rs_timeout: Duration, /// Min number of response to consider the request successful pub rs_quorum: Option, /// Should requests be dropped after enough response are received pub rs_interrupt_after_quorum: bool, /// Request priority pub rs_priority: RequestPriority, + /// Deactivate timeout for this request + pub rs_no_timeout: bool, } impl RequestStrategy { /// Create a RequestStrategy with default timeout and not interrupting when quorum reached pub fn with_priority(prio: RequestPriority) -> Self { RequestStrategy { - rs_timeout: DEFAULT_TIMEOUT, rs_quorum: None, rs_interrupt_after_quorum: false, rs_priority: prio, + rs_no_timeout: false, } } /// Set quorum to be reached for request @@ -65,17 +63,17 @@ impl RequestStrategy { self.rs_quorum = Some(quorum); self } - /// Set timeout of the strategy - pub fn with_timeout(mut self, timeout: Duration) -> Self { - self.rs_timeout = timeout; - self - } /// Set if requests can be dropped after quorum has been reached /// In general true for read requests, and false for write pub fn interrupt_after_quorum(mut self, interrupt: bool) -> Self { self.rs_interrupt_after_quorum = interrupt; self } + /// Deactivate timeout for this request + pub fn without_timeout(mut self) -> Self { + self.rs_no_timeout = true; + self + } } #[derive(Clone)] @@ -86,8 +84,8 @@ struct RpcHelperInner { fullmesh: Arc, background: Arc, ring: watch::Receiver>, - request_buffer_semaphore: Arc, metrics: RpcMetrics, + rpc_timeout: Duration, } impl RpcHelper { @@ -96,21 +94,24 @@ impl RpcHelper { fullmesh: Arc, background: Arc, ring: watch::Receiver>, + rpc_timeout: Option, ) -> Self { - let sem = Arc::new(Semaphore::new(MAX_CONCURRENT_REQUESTS)); - - let metrics = RpcMetrics::new(sem.clone()); + let metrics = RpcMetrics::new(); Self(Arc::new(RpcHelperInner { our_node_id, fullmesh, background, ring, - request_buffer_semaphore: sem, metrics, + rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT), })) } + pub fn rpc_timeout(&self) -> Duration { + self.0.rpc_timeout + } + pub async fn call( &self, endpoint: &Endpoint, @@ -129,13 +130,6 @@ impl RpcHelper { KeyValue::new("to", format!("{:?}", to)), ]; - let permit = self - .0 - .request_buffer_semaphore - .acquire() - .record_duration(&self.0.metrics.rpc_queueing_time, &metric_tags) - .await?; - self.0.metrics.rpc_counter.add(1, &metric_tags); let node_id = to.into(); @@ -143,10 +137,16 @@ impl RpcHelper { .call_streaming(&node_id, msg, strat.rs_priority) .record_duration(&self.0.metrics.rpc_duration, &metric_tags); + let timeout = async { + if strat.rs_no_timeout { + futures::future::pending().await + } else { + tokio::time::sleep(self.0.rpc_timeout).await + } + }; + select! { res = rpc_call => { - drop(permit); - if res.is_err() { self.0.metrics.rpc_netapp_error_counter.add(1, &metric_tags); } @@ -158,8 +158,7 @@ impl RpcHelper { Ok(res?) 
} - _ = tokio::time::sleep(strat.rs_timeout) => { - drop(permit); + () = timeout => { self.0.metrics.rpc_timeout_counter.add(1, &metric_tags); Err(Error::Timeout) } diff --git a/src/rpc/system.rs b/src/rpc/system.rs index 2c6136a8..f8121193 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -37,7 +37,6 @@ use crate::rpc_helper::*; const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60); const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10); -const SYSTEM_RPC_TIMEOUT: Duration = Duration::from_secs(15); /// Version tag used for version check upon Netapp connection. /// Cluster nodes with different version tags are deemed @@ -280,6 +279,9 @@ impl System { let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key); let fullmesh = FullMeshPeeringStrategy::new(netapp.clone(), vec![], rpc_public_addr); + if let Some(ping_timeout) = config.rpc_ping_timeout_msec { + fullmesh.set_ping_timeout_millis(ping_timeout); + } let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into()); @@ -317,7 +319,13 @@ impl System { node_status: RwLock::new(HashMap::new()), netapp: netapp.clone(), fullmesh: fullmesh.clone(), - rpc: RpcHelper::new(netapp.id.into(), fullmesh, background.clone(), ring.clone()), + rpc: RpcHelper::new( + netapp.id.into(), + fullmesh, + background.clone(), + ring.clone(), + config.rpc_timeout_msec.map(Duration::from_millis), + ), system_endpoint, replication_factor, rpc_listen_addr: config.rpc_bind_addr, @@ -600,7 +608,7 @@ impl System { .broadcast( &self.system_endpoint, SystemRpc::AdvertiseStatus(local_status), - RequestStrategy::with_priority(PRIO_HIGH).with_timeout(SYSTEM_RPC_TIMEOUT), + RequestStrategy::with_priority(PRIO_HIGH), ) .await; @@ -724,7 +732,7 @@ impl System { &self.system_endpoint, peer, SystemRpc::PullClusterLayout, - RequestStrategy::with_priority(PRIO_HIGH).with_timeout(SYSTEM_RPC_TIMEOUT), + RequestStrategy::with_priority(PRIO_HIGH), ) .await; if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp { diff --git a/src/table/gc.rs b/src/table/gc.rs index 6cae9701..83e7eeff 100644 --- a/src/table/gc.rs +++ b/src/table/gc.rs @@ -25,8 +25,6 @@ use crate::replication::*; use crate::schema::*; const TABLE_GC_BATCH_SIZE: usize = 1024; -// Same timeout as NEED_BLOCK_QUERY_TIMEOUT in block manager -const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(15); // GC delay for table entries: 1 day (24 hours) // (the delay before the entry is added in the GC todo list @@ -237,9 +235,7 @@ where &self.endpoint, &nodes[..], GcRpc::Update(updates), - RequestStrategy::with_priority(PRIO_BACKGROUND) - .with_quorum(nodes.len()) - .with_timeout(TABLE_GC_RPC_TIMEOUT), + RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()), ) .await .err_context("GC: send tombstones")?; @@ -260,9 +256,7 @@ where &self.endpoint, &nodes[..], GcRpc::DeleteIfEqualHash(deletes), - RequestStrategy::with_priority(PRIO_BACKGROUND) - .with_quorum(nodes.len()) - .with_timeout(TABLE_GC_RPC_TIMEOUT), + RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()), ) .await .err_context("GC: remote delete tombstones")?; diff --git a/src/table/sync.rs b/src/table/sync.rs index 62b88a58..76402d28 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -24,9 +24,6 @@ use crate::merkle::*; use crate::replication::*; use crate::*; -// Sync RPC can contain a lot of data, so have a 1min timeout -const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(60); - // Do anti-entropy every 10 minutes const ANTI_ENTROPY_INTERVAL: Duration = 
Duration::from_secs(10 * 60); @@ -248,9 +245,7 @@ where &self.endpoint, nodes, SyncRpc::Items(values), - RequestStrategy::with_priority(PRIO_BACKGROUND) - .with_quorum(nodes.len()) - .with_timeout(TABLE_SYNC_RPC_TIMEOUT), + RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()), ) .await?; @@ -311,8 +306,7 @@ where &self.endpoint, who, SyncRpc::RootCkHash(partition.partition, root_ck_hash), - RequestStrategy::with_priority(PRIO_BACKGROUND) - .with_timeout(TABLE_SYNC_RPC_TIMEOUT), + RequestStrategy::with_priority(PRIO_BACKGROUND), ) .await?; @@ -368,8 +362,7 @@ where &self.endpoint, who, SyncRpc::GetNode(key.clone()), - RequestStrategy::with_priority(PRIO_BACKGROUND) - .with_timeout(TABLE_SYNC_RPC_TIMEOUT), + RequestStrategy::with_priority(PRIO_BACKGROUND), ) .await? { @@ -445,8 +438,7 @@ where &self.endpoint, who, SyncRpc::Items(values), - RequestStrategy::with_priority(PRIO_BACKGROUND) - .with_timeout(TABLE_SYNC_RPC_TIMEOUT), + RequestStrategy::with_priority(PRIO_BACKGROUND), ) .await?; if let SyncRpc::Ok = rpc_resp { diff --git a/src/table/table.rs b/src/table/table.rs index 8e801be6..8a66c420 100644 --- a/src/table/table.rs +++ b/src/table/table.rs @@ -1,7 +1,6 @@ use std::borrow::Borrow; use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; -use std::time::Duration; use async_trait::async_trait; use futures::stream::*; @@ -31,8 +30,6 @@ use crate::schema::*; use crate::sync::*; use crate::util::*; -pub const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(30); - pub struct Table { pub system: Arc, pub data: Arc>, @@ -124,8 +121,7 @@ where &who[..], rpc, RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(self.data.replication.write_quorum()) - .with_timeout(TABLE_RPC_TIMEOUT), + .with_quorum(self.data.replication.write_quorum()), ) .await?; @@ -177,7 +173,7 @@ where &self.endpoint, node, rpc, - RequestStrategy::with_priority(PRIO_NORMAL).with_timeout(TABLE_RPC_TIMEOUT), + RequestStrategy::with_priority(PRIO_NORMAL), ) .await?; Ok::<_, Error>((node, resp)) @@ -234,7 +230,6 @@ where rpc, RequestStrategy::with_priority(PRIO_NORMAL) .with_quorum(self.data.replication.read_quorum()) - .with_timeout(TABLE_RPC_TIMEOUT) .interrupt_after_quorum(true), ) .await?; @@ -329,7 +324,6 @@ where rpc, RequestStrategy::with_priority(PRIO_NORMAL) .with_quorum(self.data.replication.read_quorum()) - .with_timeout(TABLE_RPC_TIMEOUT) .interrupt_after_quorum(true), ) .await?; @@ -406,9 +400,7 @@ where &self.endpoint, who, TableRpc::::Update(vec![what_enc]), - RequestStrategy::with_priority(PRIO_NORMAL) - .with_quorum(who.len()) - .with_timeout(TABLE_RPC_TIMEOUT), + RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(who.len()), ) .await?; Ok(()) diff --git a/src/util/config.rs b/src/util/config.rs index 5e113e13..2d4b4f57 100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -41,6 +41,11 @@ pub struct Config { /// Public IP address of this node pub rpc_public_addr: Option, + /// Timeout for Netapp's ping messagess + pub rpc_ping_timeout_msec: Option, + /// Timeout for Netapp RPC calls + pub rpc_timeout_msec: Option, + /// Bootstrap peers RPC address #[serde(default)] pub bootstrap_peers: Vec, -- cgit v1.2.3 From 1f7b050b7dd975642c8cd5d8a7562d347cfa528d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 20 Sep 2022 11:49:48 +0200 Subject: Change a warn! into a debug! 
--- src/table/Cargo.toml | 1 + src/table/sync.rs | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml index ae52e8d7..38c6b41c 100644 --- a/src/table/Cargo.toml +++ b/src/table/Cargo.toml @@ -22,6 +22,7 @@ opentelemetry = "0.17" async-trait = "0.1.7" bytes = "1.0" +hex = "0.4" hexdump = "0.1" tracing = "0.1.30" rand = "0.8" diff --git a/src/table/sync.rs b/src/table/sync.rs index 62b88a58..28e99dd3 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -351,11 +351,11 @@ where // Just send that item directly if let Some(val) = self.data.store.get(&ik[..])? { if blake2sum(&val[..]) != ivhash { - warn!("({}) Hashes differ between stored value and Merkle tree, key: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", F::TABLE_NAME, ik); + debug!("({}) Hashes differ between stored value and Merkle tree, key: {} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", F::TABLE_NAME, hex::encode(ik)); } todo_items.push(val.to_vec()); } else { - warn!("({}) Item from Merkle tree not found in store: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", F::TABLE_NAME, ik); + debug!("({}) Item from Merkle tree not found in store: {} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", F::TABLE_NAME, hex::encode(ik)); } } MerkleNode::Intermediate(l) => { -- cgit v1.2.3 From ded444f6c96f8ab991e762f65760b42e4d64246c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 20 Sep 2022 16:01:41 +0200 Subject: Ability to have custom timeouts in request strategy (not used) --- src/model/k2v/causality.rs | 2 +- src/model/k2v/item_table.rs | 8 ++++---- src/rpc/rpc_helper.rs | 30 +++++++++++++++++++++--------- 3 files changed, 26 insertions(+), 14 deletions(-) (limited to 'src') diff --git a/src/model/k2v/causality.rs b/src/model/k2v/causality.rs index 8c76a32b..9a692870 100644 --- a/src/model/k2v/causality.rs +++ b/src/model/k2v/causality.rs @@ -15,7 +15,7 @@ pub fn make_node_id(node_id: Uuid) -> K2VNodeId { u64::from_be_bytes(tmp) } -#[derive(PartialEq, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Debug, Serialize, Deserialize)] pub struct CausalContext { pub vector_clock: BTreeMap, } diff --git a/src/model/k2v/item_table.rs b/src/model/k2v/item_table.rs index baa1db4b..7860cb17 100644 --- a/src/model/k2v/item_table.rs +++ b/src/model/k2v/item_table.rs @@ -17,7 +17,7 @@ pub const CONFLICTS: &str = "conflicts"; pub const VALUES: &str = "values"; pub const BYTES: &str = "bytes"; -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub struct K2VItem { pub partition: K2VItemPartition, pub sort_key: String, @@ -25,19 +25,19 @@ pub struct K2VItem { items: BTreeMap, } -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize, Hash, Eq)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize, Hash)] pub struct K2VItemPartition { pub bucket_id: Uuid, pub partition_key: String, } -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] struct DvvsEntry { t_discard: u64, values: Vec<(u64, DvvsValue)>, } -#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)] pub enum DvvsValue { 
Value(#[serde(with = "serde_bytes")] Vec), Deleted, diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs index 857ed620..949aced6 100644 --- a/src/rpc/rpc_helper.rs +++ b/src/rpc/rpc_helper.rs @@ -44,8 +44,15 @@ pub struct RequestStrategy { pub rs_interrupt_after_quorum: bool, /// Request priority pub rs_priority: RequestPriority, - /// Deactivate timeout for this request - pub rs_no_timeout: bool, + /// Custom timeout for this request + rs_timeout: Timeout, +} + +#[derive(Copy, Clone)] +enum Timeout { + None, + Default, + Custom(Duration), } impl RequestStrategy { @@ -55,7 +62,7 @@ impl RequestStrategy { rs_quorum: None, rs_interrupt_after_quorum: false, rs_priority: prio, - rs_no_timeout: false, + rs_timeout: Timeout::Default, } } /// Set quorum to be reached for request @@ -71,7 +78,12 @@ impl RequestStrategy { } /// Deactivate timeout for this request pub fn without_timeout(mut self) -> Self { - self.rs_no_timeout = true; + self.rs_timeout = Timeout::None; + self + } + /// Set custom timeout for this request + pub fn with_custom_timeout(mut self, timeout: Duration) -> Self { + self.rs_timeout = Timeout::Custom(timeout); self } } @@ -138,10 +150,10 @@ impl RpcHelper { .record_duration(&self.0.metrics.rpc_duration, &metric_tags); let timeout = async { - if strat.rs_no_timeout { - futures::future::pending().await - } else { - tokio::time::sleep(self.0.rpc_timeout).await + match strat.rs_timeout { + Timeout::None => futures::future::pending().await, + Timeout::Default => tokio::time::sleep(self.0.rpc_timeout).await, + Timeout::Custom(t) => tokio::time::sleep(t).await, } }; @@ -412,7 +424,7 @@ impl RpcHelper { .iter() .find(|x| x.id.as_ref() == to.as_slice()) .and_then(|pi| pi.avg_ping) - .unwrap_or_else(|| Duration::from_secs(1)); + .unwrap_or_else(|| Duration::from_secs(10)); ( *to != self.0.our_node_id, peer_zone != our_zone, -- cgit v1.2.3 From 782630fc27b41b9ae58d1417cace2995c99856fc Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 20 Sep 2022 17:45:18 +0200 Subject: Initialize metrics exporter earlier (fix #389) --- src/api/admin/api_server.rs | 7 +++++-- src/garage/server.rs | 9 ++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs index fb0078cc..0816bda1 100644 --- a/src/api/admin/api_server.rs +++ b/src/api/admin/api_server.rs @@ -34,7 +34,10 @@ pub struct AdminApiServer { } impl AdminApiServer { - pub fn new(garage: Arc) -> Self { + pub fn new( + garage: Arc, + #[cfg(feature = "metrics")] exporter: PrometheusExporter, + ) -> Self { let cfg = &garage.config.admin; let metrics_token = cfg .metrics_token @@ -47,7 +50,7 @@ impl AdminApiServer { Self { garage, #[cfg(feature = "metrics")] - exporter: opentelemetry_prometheus::exporter().init(), + exporter, metrics_token, admin_token, } diff --git a/src/garage/server.rs b/src/garage/server.rs index aeef02a2..28710a8e 100644 --- a/src/garage/server.rs +++ b/src/garage/server.rs @@ -32,6 +32,9 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { // ---- Initialize Garage internals ---- + #[cfg(feature = "metrics")] + let metrics_exporter = opentelemetry_prometheus::exporter().init(); + info!("Initializing background runner..."); let watch_cancel = netapp::util::watch_ctrl_c(); let (background, await_background_done) = BackgroundRunner::new(16, watch_cancel.clone()); @@ -50,7 +53,11 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { } info!("Initialize Admin API server and metrics 
collector..."); - let admin_server = AdminApiServer::new(garage.clone()); + let admin_server = AdminApiServer::new( + garage.clone(), + #[cfg(feature = "metrics")] + metrics_exporter, + ); info!("Launching internal Garage cluster communications..."); let run_system = tokio::spawn(garage.system.clone().run(watch_cancel.clone())); -- cgit v1.2.3 From 1778e4b3187af979cd67098e555455be422c9500 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 26 Sep 2022 16:20:30 +0200 Subject: Fix span name for api server requests --- src/api/generic_server.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/api/generic_server.rs b/src/api/generic_server.rs index a48be1bc..62fe4e5a 100644 --- a/src/api/generic_server.rs +++ b/src/api/generic_server.rs @@ -174,7 +174,11 @@ impl ApiServer { let current_context = Context::current(); let current_span = current_context.span(); - current_span.update_name::(format!("S3 API {}", endpoint.name())); + current_span.update_name::(format!( + "{} API {}", + A::API_NAME_DISPLAY, + endpoint.name() + )); current_span.set_attribute(KeyValue::new("endpoint", endpoint.name())); endpoint.add_span_attributes(current_span); -- cgit v1.2.3 From 1f97ce37e682dff13472d6402f6115e8c1bbb0d7 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 28 Sep 2022 10:41:59 +0200 Subject: Shutdown properly on SIGTERM/SIGHUP and on Windows signals --- src/garage/server.rs | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/garage/server.rs b/src/garage/server.rs index 28710a8e..d4099a97 100644 --- a/src/garage/server.rs +++ b/src/garage/server.rs @@ -36,7 +36,7 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { let metrics_exporter = opentelemetry_prometheus::exporter().init(); info!("Initializing background runner..."); - let watch_cancel = netapp::util::watch_ctrl_c(); + let watch_cancel = watch_shutdown_signal(); let (background, await_background_done) = BackgroundRunner::new(16, watch_cancel.clone()); info!("Initializing Garage main data store..."); @@ -157,3 +157,44 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { Ok(()) } + +#[cfg(unix)] +fn watch_shutdown_signal() -> watch::Receiver { + use tokio::signal::unix::*; + + let (send_cancel, watch_cancel) = watch::channel(false); + tokio::spawn(async move { + let mut sigint = signal(SignalKind::interrupt()).expect("Failed to install SIGINT handler"); + let mut sigterm = + signal(SignalKind::terminate()).expect("Failed to install SIGTERM handler"); + let mut sighup = signal(SignalKind::hangup()).expect("Failed to install SIGHUP handler"); + tokio::select! { + _ = sigint.recv() => info!("Received SIGINT, shutting down."), + _ = sigterm.recv() => info!("Received SIGTERM, shutting down."), + _ = sighup.recv() => info!("Received SIGHUP, shutting down."), + } + send_cancel.send(true).unwrap(); + }); + watch_cancel +} + +#[cfg(windows)] +fn watch_shutdown_signal() -> watch::Receiver { + use tokio::signal::windows::*; + + let (send_cancel, watch_cancel) = watch::channel(false); + tokio::spawn(async move { + let mut sigint = ctrl_c().expect("Failed to install Ctrl-C handler"); + let mut sigclose = ctrl_close().expect("Failed to install Ctrl-Close handler"); + let mut siglogoff = ctrl_logoff().expect("Failed to install Ctrl-Logoff handler"); + let mut sigsdown = ctrl_shutdown().expect("Failed to install Ctrl-Shutdown handler"); + tokio::select! 
{ + _ = sigint.recv() => info!("Received Ctrl-C, shutting down."), + _ = sigclose.recv() => info!("Received Ctrl-Close, shutting down."), + _ = siglogoff.recv() => info!("Received Ctrl-Logoff, shutting down."), + _ = sigsdown.recv() => info!("Received Ctrl-Shutdown, shutting down."), + } + send_cancel.send(true).unwrap(); + }); + watch_cancel +} -- cgit v1.2.3 From ad917ffd3f76316e48b89ff17e2f8a600a269481 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 29 Sep 2022 15:53:54 +0200 Subject: Fix instant substractions that might have panicked --- src/rpc/system.rs | 4 +++- src/table/sync.rs | 2 +- src/util/metrics.rs | 16 +++++++++++----- src/util/tranquilizer.rs | 2 +- 4 files changed, 16 insertions(+), 8 deletions(-) (limited to 'src') diff --git a/src/rpc/system.rs b/src/rpc/system.rs index f8121193..9e0bfa11 100644 --- a/src/rpc/system.rs +++ b/src/rpc/system.rs @@ -369,7 +369,9 @@ impl System { id: n.id.into(), addr: n.addr, is_up: n.is_up(), - last_seen_secs_ago: n.last_seen.map(|t| (Instant::now() - t).as_secs()), + last_seen_secs_ago: n + .last_seen + .map(|t| (Instant::now().saturating_duration_since(t)).as_secs()), status: node_status .get(&n.id.into()) .cloned() diff --git a/src/table/sync.rs b/src/table/sync.rs index e34aa8d7..9d79d856 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -607,7 +607,7 @@ impl Worker for SyncWor self.add_full_sync(); } }, - _ = tokio::time::sleep(self.next_full_sync - Instant::now()) => { + _ = tokio::time::sleep_until(self.next_full_sync.into()) => { self.add_full_sync(); } } diff --git a/src/util/metrics.rs b/src/util/metrics.rs index 1b05eabe..b882a886 100644 --- a/src/util/metrics.rs +++ b/src/util/metrics.rs @@ -1,4 +1,4 @@ -use std::time::SystemTime; +use std::time::Instant; use futures::{future::BoxFuture, Future, FutureExt}; use rand::Rng; @@ -28,10 +28,12 @@ where attributes: &'a [KeyValue], ) -> BoxFuture<'a, Self::Output> { async move { - let request_start = SystemTime::now(); + let request_start = Instant::now(); let res = self.await; r.record( - request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()), + Instant::now() + .saturating_duration_since(request_start) + .as_secs_f64(), attributes, ); res @@ -41,9 +43,13 @@ where fn bound_record_duration(self, r: &'a BoundValueRecorder) -> BoxFuture<'a, Self::Output> { async move { - let request_start = SystemTime::now(); + let request_start = Instant::now(); let res = self.await; - r.record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64())); + r.record( + Instant::now() + .saturating_duration_since(request_start) + .as_secs_f64(), + ); res } .boxed() diff --git a/src/util/tranquilizer.rs b/src/util/tranquilizer.rs index fdb2918b..8a96cbb3 100644 --- a/src/util/tranquilizer.rs +++ b/src/util/tranquilizer.rs @@ -36,7 +36,7 @@ impl Tranquilizer { } fn tranquilize_internal(&mut self, tranquility: u32) -> Option { - let observation = Instant::now() - self.last_step_begin; + let observation = Instant::now().saturating_duration_since(self.last_step_begin); self.observations.push_back(observation); self.sum_observations += observation; -- cgit v1.2.3
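Note on the last patch above ("Fix instant substractions that might have panicked"): it replaces bare `Instant` arithmetic with `saturating_duration_since`. The following self-contained Rust sketch is illustrative only and not part of the patch series; the helper name is hypothetical. It shows the pattern the commit adopts: `Instant::now() - earlier` (or `duration_since`) could panic on older std versions or platforms where the "earlier" instant turns out to be ahead of the current one, while the saturating variant clamps to a zero duration instead.

    use std::time::{Duration, Instant};

    // Hypothetical helper (for illustration only): how many seconds ago was a
    // peer last seen, without risking a panic if `last_seen` happens to be
    // slightly ahead of `Instant::now()`.
    fn secs_since(last_seen: Instant) -> u64 {
        // saturating_duration_since() returns a zero Duration instead of
        // panicking when `last_seen` is later than the current instant.
        Instant::now().saturating_duration_since(last_seen).as_secs()
    }

    fn main() {
        let t = Instant::now();
        std::thread::sleep(Duration::from_millis(10));
        // Prints 0: less than a full second has elapsed.
        println!("last seen {}s ago", secs_since(t));
    }

The same reasoning applies to the `tokio::time::sleep_until(self.next_full_sync.into())` change in src/table/sync.rs: computing `self.next_full_sync - Instant::now()` could panic once the deadline is already in the past, whereas `sleep_until` with a past deadline simply returns immediately.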