author    Mendes <mendes.oulamara@pm.me>  2022-10-04 18:14:49 +0200
committer Mendes <mendes.oulamara@pm.me>  2022-10-04 18:14:49 +0200
commit    829f815a897b04986559910bbcbf53625adcdf20 (patch)
tree      6db3c27cff2aded754a641d1f2b05c83be701267 /src
parent    99f96b9564c9c841dc6c56f1255a6e70ff884d46 (diff)
parent    a096ced35562bd0a8877a1ee2f755be1edafe343 (diff)
Merge remote-tracking branch 'origin/main' into optimal-layout
Diffstat (limited to 'src')
-rw-r--r--  src/admin/Cargo.toml | 29
-rw-r--r--  src/admin/lib.rs | 6
-rw-r--r--  src/admin/metrics.rs | 146
-rw-r--r--  src/api/Cargo.toml | 27
-rw-r--r--  src/api/admin/api_server.rs | 209
-rw-r--r--  src/api/admin/bucket.rs | 580
-rw-r--r--  src/api/admin/cluster.rs | 193
-rw-r--r--  src/api/admin/error.rs | 97
-rw-r--r--  src/api/admin/key.rs | 256
-rw-r--r--  src/api/admin/mod.rs | 7
-rw-r--r--  src/api/admin/router.rs | 145
-rw-r--r--  src/api/api_server.rs | 645
-rw-r--r--  src/api/common_error.rs | 177
-rw-r--r--  src/api/generic_server.rs | 211
-rw-r--r--  src/api/helpers.rs | 191
-rw-r--r--  src/api/k2v/api_server.rs | 190
-rw-r--r--  src/api/k2v/batch.rs | 363
-rw-r--r--  src/api/k2v/error.rs | 135
-rw-r--r--  src/api/k2v/index.rs | 100
-rw-r--r--  src/api/k2v/item.rs | 230
-rw-r--r--  src/api/k2v/mod.rs | 9
-rw-r--r--  src/api/k2v/range.rs | 100
-rw-r--r--  src/api/k2v/router.rs | 252
-rw-r--r--  src/api/lib.rs | 26
-rw-r--r--  src/api/router_macros.rs | 213
-rw-r--r--  src/api/s3/api_server.rs | 390
-rw-r--r--  src/api/s3/bucket.rs (renamed from src/api/s3_bucket.rs) | 21
-rw-r--r--  src/api/s3/copy.rs (renamed from src/api/s3_copy.rs) | 66
-rw-r--r--  src/api/s3/cors.rs (renamed from src/api/s3_cors.rs) | 36
-rw-r--r--  src/api/s3/delete.rs (renamed from src/api/s3_delete.rs) | 21
-rw-r--r--  src/api/s3/error.rs (renamed from src/api/error.rs) | 243
-rw-r--r--  src/api/s3/get.rs (renamed from src/api/s3_get.rs) | 180
-rw-r--r--  src/api/s3/list.rs (renamed from src/api/s3_list.rs) | 106
-rw-r--r--  src/api/s3/mod.rs | 15
-rw-r--r--  src/api/s3/post_object.rs (renamed from src/api/s3_post_object.rs) | 89
-rw-r--r--  src/api/s3/put.rs (renamed from src/api/s3_put.rs) | 217
-rw-r--r--  src/api/s3/router.rs (renamed from src/api/s3_router.rs) | 225
-rw-r--r--  src/api/s3/website.rs (renamed from src/api/s3_website.rs) | 51
-rw-r--r--  src/api/s3/xml.rs (renamed from src/api/s3_xml.rs) | 44
-rw-r--r--  src/api/signature/error.rs | 36
-rw-r--r--  src/api/signature/mod.rs | 30
-rw-r--r--  src/api/signature/payload.rs | 35
-rw-r--r--  src/api/signature/streaming.rs | 71
-rw-r--r--  src/block/Cargo.toml | 18
-rw-r--r--  src/block/block.rs | 48
-rw-r--r--  src/block/lib.rs | 2
-rw-r--r--  src/block/manager.rs | 941
-rw-r--r--  src/block/metrics.rs | 4
-rw-r--r--  src/block/rc.rs | 49
-rw-r--r--  src/block/repair.rs | 466
-rw-r--r--  src/block/resync.rs | 589
-rw-r--r--  src/db/Cargo.toml | 39
-rw-r--r--  src/db/bin/convert.rs | 69
-rw-r--r--  src/db/counted_tree_hack.rs | 127
-rw-r--r--  src/db/lib.rs | 416
-rw-r--r--  src/db/lmdb_adapter.rs | 350
-rw-r--r--  src/db/sled_adapter.rs | 266
-rw-r--r--  src/db/sqlite_adapter.rs | 508
-rw-r--r--  src/db/test.rs | 106
-rw-r--r--  src/garage/Cargo.toml | 66
-rw-r--r--  src/garage/admin.rs | 269
-rw-r--r--  src/garage/cli/cmd.rs | 30
-rw-r--r--  src/garage/cli/layout.rs | 79
-rw-r--r--  src/garage/cli/structs.rs | 202
-rw-r--r--  src/garage/cli/util.rs | 137
-rw-r--r--  src/garage/main.rs | 78
-rw-r--r--  src/garage/repair.rs | 149
-rw-r--r--  src/garage/repair/mod.rs | 2
-rw-r--r--  src/garage/repair/offline.rs | 55
-rw-r--r--  src/garage/repair/online.rs | 215
-rw-r--r--  src/garage/server.rs | 188
-rw-r--r--  src/garage/tests/bucket.rs | 8
-rw-r--r--  src/garage/tests/common/client.rs | 2
-rw-r--r--  src/garage/tests/common/custom_requester.rs | 55
-rw-r--r--  src/garage/tests/common/garage.rs | 34
-rw-r--r--  src/garage/tests/common/mod.rs | 11
-rw-r--r--  src/garage/tests/k2v/batch.rs | 612
-rw-r--r--  src/garage/tests/k2v/errorcodes.rs | 141
-rw-r--r--  src/garage/tests/k2v/item.rs | 725
-rw-r--r--  src/garage/tests/k2v/mod.rs | 18
-rw-r--r--  src/garage/tests/k2v/poll.rs | 98
-rw-r--r--  src/garage/tests/k2v/simple.rs | 40
-rw-r--r--  src/garage/tests/lib.rs | 11
-rw-r--r--  src/garage/tests/s3/list.rs (renamed from src/garage/tests/list.rs) | 0
-rw-r--r--  src/garage/tests/s3/mod.rs | 6
-rw-r--r--  src/garage/tests/s3/multipart.rs (renamed from src/garage/tests/multipart.rs) | 0
-rw-r--r--  src/garage/tests/s3/objects.rs (renamed from src/garage/tests/objects.rs) | 9
-rw-r--r--  src/garage/tests/s3/simple.rs (renamed from src/garage/tests/simple.rs) | 0
-rw-r--r--  src/garage/tests/s3/streaming_signature.rs (renamed from src/garage/tests/streaming_signature.rs) | 0
-rw-r--r--  src/garage/tests/s3/website.rs (renamed from src/garage/tests/website.rs) | 32
-rw-r--r--  src/garage/tracing_setup.rs (renamed from src/admin/tracing_setup.rs) | 0
-rw-r--r--  src/k2v-client/Cargo.toml | 37
-rw-r--r--  src/k2v-client/README.md | 25
-rw-r--r--  src/k2v-client/bin/k2v-cli.rs | 501
-rw-r--r--  src/k2v-client/error.rs | 29
-rw-r--r--  src/k2v-client/lib.rs | 611
-rw-r--r--  src/model/Cargo.toml | 26
-rw-r--r--  src/model/bucket_alias_table.rs | 2
-rw-r--r--  src/model/bucket_table.rs | 23
-rw-r--r--  src/model/garage.rs | 203
-rw-r--r--  src/model/helper/bucket.rs | 151
-rw-r--r--  src/model/helper/error.rs | 10
-rw-r--r--  src/model/helper/key.rs | 102
-rw-r--r--  src/model/helper/mod.rs | 1
-rw-r--r--  src/model/index_counter.rs | 496
-rw-r--r--  src/model/k2v/causality.rs | 96
-rw-r--r--  src/model/k2v/item_table.rs | 305
-rw-r--r--  src/model/k2v/mod.rs | 6
-rw-r--r--  src/model/k2v/poll.rs | 50
-rw-r--r--  src/model/k2v/rpc.rs | 341
-rw-r--r--  src/model/key_table.rs | 6
-rw-r--r--  src/model/lib.rs | 12
-rw-r--r--  src/model/migrate.rs | 9
-rw-r--r--  src/model/prev/mod.rs | 1
-rw-r--r--  src/model/prev/v051/bucket_table.rs | 63
-rw-r--r--  src/model/prev/v051/key_table.rs | 50
-rw-r--r--  src/model/prev/v051/mod.rs | 4
-rw-r--r--  src/model/prev/v051/object_table.rs | 149
-rw-r--r--  src/model/prev/v051/version_table.rs | 79
-rw-r--r--  src/model/s3/block_ref_table.rs (renamed from src/model/block_ref_table.rs) | 27
-rw-r--r--  src/model/s3/mod.rs | 3
-rw-r--r--  src/model/s3/object_table.rs (renamed from src/model/object_table.rs) | 84
-rw-r--r--  src/model/s3/version_table.rs (renamed from src/model/version_table.rs) | 22
-rw-r--r--  src/rpc/Cargo.toml | 11
-rw-r--r--  src/rpc/kubernetes.rs | 2
-rw-r--r--  src/rpc/layout.rs | 56
-rw-r--r--  src/rpc/metrics.rs | 19
-rw-r--r--  src/rpc/rpc_helper.rs | 244
-rw-r--r--  src/rpc/system.rs | 252
-rw-r--r--  src/table/Cargo.toml | 10
-rw-r--r--  src/table/data.rs | 187
-rw-r--r--  src/table/gc.rs | 139
-rw-r--r--  src/table/merkle.rs | 150
-rw-r--r--  src/table/metrics.rs | 21
-rw-r--r--  src/table/schema.rs | 21
-rw-r--r--  src/table/sync.rs | 233
-rw-r--r--  src/table/table.rs | 143
-rw-r--r--  src/table/util.rs | 18
-rw-r--r--  src/util/Cargo.toml | 22
-rw-r--r--  src/util/async_hash.rs | 61
-rw-r--r--  src/util/background.rs | 153
-rw-r--r--  src/util/background/job_worker.rs | 48
-rw-r--r--  src/util/background/mod.rs | 117
-rw-r--r--  src/util/background/worker.rs | 260
-rw-r--r--  src/util/config.rs | 73
-rw-r--r--  src/util/crdt/bool.rs | 2
-rw-r--r--  src/util/crdt/deletable.rs | 2
-rw-r--r--  src/util/crdt/lww.rs | 2
-rw-r--r--  src/util/crdt/lww_map.rs | 7
-rw-r--r--  src/util/crdt/map.rs | 2
-rw-r--r--  src/util/error.rs | 15
-rw-r--r--  src/util/formater.rs | 28
-rw-r--r--  src/util/lib.rs | 4
-rw-r--r--  src/util/metrics.rs | 16
-rw-r--r--  src/util/sled_counter.rs | 100
-rw-r--r--  src/util/tranquilizer.rs | 27
-rw-r--r--  src/util/version.rs | 28
-rw-r--r--  src/web/Cargo.toml | 10
-rw-r--r--  src/web/error.rs | 36
-rw-r--r--  src/web/lib.rs | 2
-rw-r--r--  src/web/web_server.rs | 421
161 files changed, 16081 insertions(+), 4068 deletions(-)
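
The diffstat above is computed against the merge's first parent. Assuming a local clone of the Garage repository with both commits available, it should be reproducible with:

git diff --stat 99f96b9564c9 829f815a897b -- src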
diff --git a/src/admin/Cargo.toml b/src/admin/Cargo.toml
deleted file mode 100644
index 2db4bb08..00000000
--- a/src/admin/Cargo.toml
+++ /dev/null
@@ -1,29 +0,0 @@
-[package]
-name = "garage_admin"
-version = "0.7.0"
-authors = ["Maximilien Richer <code@mricher.fr>"]
-edition = "2018"
-license = "AGPL-3.0"
-description = "Administration and metrics REST HTTP server for Garage"
-repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
-
-[lib]
-path = "lib.rs"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
-[dependencies]
-garage_util = { version = "0.7.0", path = "../util" }
-
-hex = "0.4"
-
-futures = "0.3"
-futures-util = "0.3"
-http = "0.2"
-hyper = "0.14"
-tracing = "0.1.30"
-
-opentelemetry = { version = "0.17", features = [ "rt-tokio" ] }
-opentelemetry-prometheus = "0.10"
-opentelemetry-otlp = "0.10"
-prometheus = "0.13"
diff --git a/src/admin/lib.rs b/src/admin/lib.rs
deleted file mode 100644
index b5b0775b..00000000
--- a/src/admin/lib.rs
+++ /dev/null
@@ -1,6 +0,0 @@
-//! Crate for handling the admin and metric HTTP APIs
-#[macro_use]
-extern crate tracing;
-
-pub mod metrics;
-pub mod tracing_setup;
diff --git a/src/admin/metrics.rs b/src/admin/metrics.rs
deleted file mode 100644
index 7edc36c6..00000000
--- a/src/admin/metrics.rs
+++ /dev/null
@@ -1,146 +0,0 @@
-use std::convert::Infallible;
-use std::net::SocketAddr;
-use std::sync::Arc;
-use std::time::SystemTime;
-
-use futures::future::*;
-use hyper::{
- header::CONTENT_TYPE,
- service::{make_service_fn, service_fn},
- Body, Method, Request, Response, Server,
-};
-
-use opentelemetry::{
- global,
- metrics::{BoundCounter, BoundValueRecorder},
- trace::{FutureExt, TraceContextExt, Tracer},
- Context,
-};
-use opentelemetry_prometheus::PrometheusExporter;
-
-use prometheus::{Encoder, TextEncoder};
-
-use garage_util::error::Error as GarageError;
-use garage_util::metrics::*;
-
-// serve_req handles requests on the metrics endpoint
-async fn serve_req(
- req: Request<Body>,
- admin_server: Arc<AdminServer>,
-) -> Result<Response<Body>, hyper::Error> {
- debug!("Receiving request at path {}", req.uri());
- let request_start = SystemTime::now();
-
- admin_server.metrics.http_counter.add(1);
-
- let response = match (req.method(), req.uri().path()) {
- (&Method::GET, "/metrics") => {
- let mut buffer = vec![];
- let encoder = TextEncoder::new();
-
- let tracer = opentelemetry::global::tracer("garage");
- let metric_families = tracer.in_span("admin/gather_metrics", |_| {
- admin_server.exporter.registry().gather()
- });
-
- encoder.encode(&metric_families, &mut buffer).unwrap();
- admin_server
- .metrics
- .http_body_gauge
- .record(buffer.len() as u64);
-
- Response::builder()
- .status(200)
- .header(CONTENT_TYPE, encoder.format_type())
- .body(Body::from(buffer))
- .unwrap()
- }
- _ => Response::builder()
- .status(404)
- .body(Body::from("Not implemented"))
- .unwrap(),
- };
-
- admin_server
- .metrics
- .http_req_histogram
- .record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()));
- Ok(response)
-}
-
-// AdminServer holds the admin server's internal state and the metrics exporter
-pub struct AdminServer {
- exporter: PrometheusExporter,
- metrics: AdminServerMetrics,
-}
-
-// AdminServerMetrics holds the metrics counter definitions for Garage
-// FIXME: we would rather have that split up among the different libraries?
-struct AdminServerMetrics {
- http_counter: BoundCounter<u64>,
- http_body_gauge: BoundValueRecorder<u64>,
- http_req_histogram: BoundValueRecorder<f64>,
-}
-
-impl AdminServer {
- /// init initializes the AdminServer and the background metrics server
- pub fn init() -> AdminServer {
- let exporter = opentelemetry_prometheus::exporter().init();
- let meter = global::meter("garage/admin_server");
- AdminServer {
- exporter,
- metrics: AdminServerMetrics {
- http_counter: meter
- .u64_counter("admin.http_requests_total")
- .with_description("Total number of HTTP requests made.")
- .init()
- .bind(&[]),
- http_body_gauge: meter
- .u64_value_recorder("admin.http_response_size_bytes")
- .with_description("The metrics HTTP response sizes in bytes.")
- .init()
- .bind(&[]),
- http_req_histogram: meter
- .f64_value_recorder("admin.http_request_duration_seconds")
- .with_description("The HTTP request latencies in seconds.")
- .init()
- .bind(&[]),
- },
- }
- }
- /// run executes the admin server on the designated HTTP port and listens for requests
- pub async fn run(
- self,
- bind_addr: SocketAddr,
- shutdown_signal: impl Future<Output = ()>,
- ) -> Result<(), GarageError> {
- let admin_server = Arc::new(self);
- // For every connection, we must make a `Service` to handle all
- // incoming HTTP requests on said connection.
- let make_svc = make_service_fn(move |_conn| {
- let admin_server = admin_server.clone();
- // This is the `Service` that will handle the connection.
- // `service_fn` is a helper to convert a function that
- // returns a Response into a `Service`.
- async move {
- Ok::<_, Infallible>(service_fn(move |req| {
- let tracer = opentelemetry::global::tracer("garage");
- let span = tracer
- .span_builder("admin/request")
- .with_trace_id(gen_trace_id())
- .start(&tracer);
-
- serve_req(req, admin_server.clone())
- .with_context(Context::current_with_span(span))
- }))
- }
- });
-
- let server = Server::bind(&bind_addr).serve(make_svc);
- let graceful = server.with_graceful_shutdown(shutdown_signal);
- info!("Admin server listening on http://{}", bind_addr);
-
- graceful.await?;
- Ok(())
- }
-}
diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml
index 5e96b081..7c3ed43b 100644
--- a/src/api/Cargo.toml
+++ b/src/api/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "garage_api"
-version = "0.7.0"
+version = "0.8.0"
authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018"
license = "AGPL-3.0"
@@ -14,28 +14,31 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
-garage_model = { version = "0.7.0", path = "../model" }
-garage_table = { version = "0.7.0", path = "../table" }
-garage_block = { version = "0.7.0", path = "../block" }
-garage_util = { version = "0.7.0", path = "../util" }
+garage_model = { version = "0.8.0", path = "../model" }
+garage_table = { version = "0.8.0", path = "../table" }
+garage_block = { version = "0.8.0", path = "../block" }
+garage_util = { version = "0.8.0", path = "../util" }
+garage_rpc = { version = "0.8.0", path = "../rpc" }
+async-trait = "0.1.7"
base64 = "0.13"
bytes = "1.0"
chrono = "0.4"
-crypto-mac = "0.10"
+crypto-common = "0.1"
err-derive = "0.3"
hex = "0.4"
-hmac = "0.10"
+hmac = "0.12"
idna = "0.2"
tracing = "0.1.30"
-md-5 = "0.9"
+md-5 = "0.10"
nom = "7.1"
-sha2 = "0.9"
+sha2 = "0.10"
futures = "0.3"
futures-util = "0.3"
pin-project = "1.0"
tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
+tokio-stream = "0.1"
form_urlencoded = "1.0.0"
http = "0.2"
@@ -52,3 +55,9 @@ quick-xml = { version = "0.21", features = [ "serialize" ] }
url = "2.1"
opentelemetry = "0.17"
+opentelemetry-prometheus = { version = "0.10", optional = true }
+prometheus = { version = "0.13", optional = true }
+
+[features]
+k2v = [ "garage_util/k2v", "garage_model/k2v" ]
+metrics = [ "opentelemetry-prometheus", "prometheus" ]
diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs
new file mode 100644
index 00000000..0816bda1
--- /dev/null
+++ b/src/api/admin/api_server.rs
@@ -0,0 +1,209 @@
+use std::net::SocketAddr;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+
+use futures::future::Future;
+use http::header::{ACCESS_CONTROL_ALLOW_METHODS, ACCESS_CONTROL_ALLOW_ORIGIN, ALLOW};
+use hyper::{Body, Request, Response};
+
+use opentelemetry::trace::SpanRef;
+
+#[cfg(feature = "metrics")]
+use opentelemetry_prometheus::PrometheusExporter;
+#[cfg(feature = "metrics")]
+use prometheus::{Encoder, TextEncoder};
+
+use garage_model::garage::Garage;
+use garage_util::error::Error as GarageError;
+
+use crate::generic_server::*;
+
+use crate::admin::bucket::*;
+use crate::admin::cluster::*;
+use crate::admin::error::*;
+use crate::admin::key::*;
+use crate::admin::router::{Authorization, Endpoint};
+
+pub struct AdminApiServer {
+ garage: Arc<Garage>,
+ #[cfg(feature = "metrics")]
+ exporter: PrometheusExporter,
+ metrics_token: Option<String>,
+ admin_token: Option<String>,
+}
+
+impl AdminApiServer {
+ pub fn new(
+ garage: Arc<Garage>,
+ #[cfg(feature = "metrics")] exporter: PrometheusExporter,
+ ) -> Self {
+ let cfg = &garage.config.admin;
+ let metrics_token = cfg
+ .metrics_token
+ .as_ref()
+ .map(|tok| format!("Bearer {}", tok));
+ let admin_token = cfg
+ .admin_token
+ .as_ref()
+ .map(|tok| format!("Bearer {}", tok));
+ Self {
+ garage,
+ #[cfg(feature = "metrics")]
+ exporter,
+ metrics_token,
+ admin_token,
+ }
+ }
+
+ pub async fn run(
+ self,
+ bind_addr: SocketAddr,
+ shutdown_signal: impl Future<Output = ()>,
+ ) -> Result<(), GarageError> {
+ let region = self.garage.config.s3_api.s3_region.clone();
+ ApiServer::new(region, self)
+ .run_server(bind_addr, shutdown_signal)
+ .await
+ }
+
+ fn handle_options(&self, _req: &Request<Body>) -> Result<Response<Body>, Error> {
+ Ok(Response::builder()
+ .status(204)
+ .header(ALLOW, "OPTIONS, GET, POST")
+ .header(ACCESS_CONTROL_ALLOW_METHODS, "OPTIONS, GET, POST")
+ .header(ACCESS_CONTROL_ALLOW_ORIGIN, "*")
+ .body(Body::empty())?)
+ }
+
+ fn handle_metrics(&self) -> Result<Response<Body>, Error> {
+ #[cfg(feature = "metrics")]
+ {
+ use opentelemetry::trace::Tracer;
+
+ let mut buffer = vec![];
+ let encoder = TextEncoder::new();
+
+ let tracer = opentelemetry::global::tracer("garage");
+ let metric_families = tracer.in_span("admin/gather_metrics", |_| {
+ self.exporter.registry().gather()
+ });
+
+ encoder
+ .encode(&metric_families, &mut buffer)
+ .ok_or_internal_error("Could not serialize metrics")?;
+
+ Ok(Response::builder()
+ .status(200)
+ .header(http::header::CONTENT_TYPE, encoder.format_type())
+ .body(Body::from(buffer))?)
+ }
+ #[cfg(not(feature = "metrics"))]
+ Err(Error::bad_request(
+ "Garage was built without the metrics feature".to_string(),
+ ))
+ }
+}
+
+#[async_trait]
+impl ApiHandler for AdminApiServer {
+ const API_NAME: &'static str = "admin";
+ const API_NAME_DISPLAY: &'static str = "Admin";
+
+ type Endpoint = Endpoint;
+ type Error = Error;
+
+ fn parse_endpoint(&self, req: &Request<Body>) -> Result<Endpoint, Error> {
+ Endpoint::from_request(req)
+ }
+
+ async fn handle(
+ &self,
+ req: Request<Body>,
+ endpoint: Endpoint,
+ ) -> Result<Response<Body>, Error> {
+ let expected_auth_header =
+ match endpoint.authorization_type() {
+ Authorization::MetricsToken => self.metrics_token.as_ref(),
+ Authorization::AdminToken => match &self.admin_token {
+ None => return Err(Error::forbidden(
+ "Admin token isn't configured, admin API access is disabled for security.",
+ )),
+ Some(t) => Some(t),
+ },
+ };
+
+ if let Some(h) = expected_auth_header {
+ match req.headers().get("Authorization") {
+ None => return Err(Error::forbidden("Authorization token must be provided")),
+ Some(v) => {
+ let authorized = v.to_str().map(|hv| hv.trim() == h).unwrap_or(false);
+ if !authorized {
+ return Err(Error::forbidden("Invalid authorization token provided"));
+ }
+ }
+ }
+ }
+
+ match endpoint {
+ Endpoint::Options => self.handle_options(&req),
+ Endpoint::Metrics => self.handle_metrics(),
+ Endpoint::GetClusterStatus => handle_get_cluster_status(&self.garage).await,
+ Endpoint::ConnectClusterNodes => handle_connect_cluster_nodes(&self.garage, req).await,
+ // Layout
+ Endpoint::GetClusterLayout => handle_get_cluster_layout(&self.garage).await,
+ Endpoint::UpdateClusterLayout => handle_update_cluster_layout(&self.garage, req).await,
+ Endpoint::ApplyClusterLayout => handle_apply_cluster_layout(&self.garage, req).await,
+ Endpoint::RevertClusterLayout => handle_revert_cluster_layout(&self.garage, req).await,
+ // Keys
+ Endpoint::ListKeys => handle_list_keys(&self.garage).await,
+ Endpoint::GetKeyInfo { id, search } => {
+ handle_get_key_info(&self.garage, id, search).await
+ }
+ Endpoint::CreateKey => handle_create_key(&self.garage, req).await,
+ Endpoint::ImportKey => handle_import_key(&self.garage, req).await,
+ Endpoint::UpdateKey { id } => handle_update_key(&self.garage, id, req).await,
+ Endpoint::DeleteKey { id } => handle_delete_key(&self.garage, id).await,
+ // Buckets
+ Endpoint::ListBuckets => handle_list_buckets(&self.garage).await,
+ Endpoint::GetBucketInfo { id, global_alias } => {
+ handle_get_bucket_info(&self.garage, id, global_alias).await
+ }
+ Endpoint::CreateBucket => handle_create_bucket(&self.garage, req).await,
+ Endpoint::DeleteBucket { id } => handle_delete_bucket(&self.garage, id).await,
+ Endpoint::UpdateBucket { id } => handle_update_bucket(&self.garage, id, req).await,
+ // Bucket-key permissions
+ Endpoint::BucketAllowKey => {
+ handle_bucket_change_key_perm(&self.garage, req, true).await
+ }
+ Endpoint::BucketDenyKey => {
+ handle_bucket_change_key_perm(&self.garage, req, false).await
+ }
+ // Bucket aliasing
+ Endpoint::GlobalAliasBucket { id, alias } => {
+ handle_global_alias_bucket(&self.garage, id, alias).await
+ }
+ Endpoint::GlobalUnaliasBucket { id, alias } => {
+ handle_global_unalias_bucket(&self.garage, id, alias).await
+ }
+ Endpoint::LocalAliasBucket {
+ id,
+ access_key_id,
+ alias,
+ } => handle_local_alias_bucket(&self.garage, id, access_key_id, alias).await,
+ Endpoint::LocalUnaliasBucket {
+ id,
+ access_key_id,
+ alias,
+ } => handle_local_unalias_bucket(&self.garage, id, access_key_id, alias).await,
+ }
+ }
+}
+
+impl ApiEndpoint for Endpoint {
+ fn name(&self) -> &'static str {
+ Endpoint::name(self)
+ }
+
+ fn add_span_attributes(&self, _span: SpanRef<'_>) {}
+}
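
AdminApiServer::new reads its two bearer tokens from garage.config.admin. A minimal sketch of the corresponding configuration section follows; metrics_token and admin_token match the fields read above, while api_bind_addr is an assumed name for the address passed to run:

[admin]
api_bind_addr = "0.0.0.0:3903"
metrics_token = "<metrics bearer token>"
admin_token = "<admin bearer token>"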
diff --git a/src/api/admin/bucket.rs b/src/api/admin/bucket.rs
new file mode 100644
index 00000000..ac8a8a40
--- /dev/null
+++ b/src/api/admin/bucket.rs
@@ -0,0 +1,580 @@
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use hyper::{Body, Request, Response, StatusCode};
+use serde::{Deserialize, Serialize};
+
+use garage_util::crdt::*;
+use garage_util::data::*;
+use garage_util::time::*;
+
+use garage_table::*;
+
+use garage_model::bucket_alias_table::*;
+use garage_model::bucket_table::*;
+use garage_model::garage::Garage;
+use garage_model::permission::*;
+use garage_model::s3::object_table::*;
+
+use crate::admin::error::*;
+use crate::admin::key::ApiBucketKeyPerm;
+use crate::common_error::CommonError;
+use crate::helpers::{json_ok_response, parse_json_body};
+
+pub async fn handle_list_buckets(garage: &Arc<Garage>) -> Result<Response<Body>, Error> {
+ let buckets = garage
+ .bucket_table
+ .get_range(
+ &EmptyKey,
+ None,
+ Some(DeletedFilter::NotDeleted),
+ 10000,
+ EnumerationOrder::Forward,
+ )
+ .await?;
+
+ let res = buckets
+ .into_iter()
+ .map(|b| {
+ let state = b.state.as_option().unwrap();
+ ListBucketResultItem {
+ id: hex::encode(b.id),
+ global_aliases: state
+ .aliases
+ .items()
+ .iter()
+ .filter(|(_, _, a)| *a)
+ .map(|(n, _, _)| n.to_string())
+ .collect::<Vec<_>>(),
+ local_aliases: state
+ .local_aliases
+ .items()
+ .iter()
+ .filter(|(_, _, a)| *a)
+ .map(|((k, n), _, _)| BucketLocalAlias {
+ access_key_id: k.to_string(),
+ alias: n.to_string(),
+ })
+ .collect::<Vec<_>>(),
+ }
+ })
+ .collect::<Vec<_>>();
+
+ Ok(json_ok_response(&res)?)
+}
+
+#[derive(Serialize)]
+#[serde(rename_all = "camelCase")]
+struct ListBucketResultItem {
+ id: String,
+ global_aliases: Vec<String>,
+ local_aliases: Vec<BucketLocalAlias>,
+}
+
+#[derive(Serialize)]
+#[serde(rename_all = "camelCase")]
+struct BucketLocalAlias {
+ access_key_id: String,
+ alias: String,
+}
+
+#[derive(Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct ApiBucketQuotas {
+ max_size: Option<u64>,
+ max_objects: Option<u64>,
+}
+
+pub async fn handle_get_bucket_info(
+ garage: &Arc<Garage>,
+ id: Option<String>,
+ global_alias: Option<String>,
+) -> Result<Response<Body>, Error> {
+ let bucket_id = match (id, global_alias) {
+ (Some(id), None) => parse_bucket_id(&id)?,
+ (None, Some(ga)) => garage
+ .bucket_helper()
+ .resolve_global_bucket_name(&ga)
+ .await?
+ .ok_or_else(|| HelperError::NoSuchBucket(ga.to_string()))?,
+ _ => {
+ return Err(Error::bad_request(
+ "Either id or globalAlias must be provided (but not both)",
+ ));
+ }
+ };
+
+ bucket_info_results(garage, bucket_id).await
+}
+
+async fn bucket_info_results(
+ garage: &Arc<Garage>,
+ bucket_id: Uuid,
+) -> Result<Response<Body>, Error> {
+ let bucket = garage
+ .bucket_helper()
+ .get_existing_bucket(bucket_id)
+ .await?;
+
+ let counters = garage
+ .object_counter_table
+ .table
+ .get(&bucket_id, &EmptyKey)
+ .await?
+ .map(|x| x.filtered_values(&garage.system.ring.borrow()))
+ .unwrap_or_default();
+
+ let mut relevant_keys = HashMap::new();
+ for (k, _) in bucket
+ .state
+ .as_option()
+ .unwrap()
+ .authorized_keys
+ .items()
+ .iter()
+ {
+ if let Some(key) = garage
+ .key_table
+ .get(&EmptyKey, k)
+ .await?
+ .filter(|k| !k.is_deleted())
+ {
+ if !key.state.is_deleted() {
+ relevant_keys.insert(k.clone(), key);
+ }
+ }
+ }
+ for ((k, _), _, _) in bucket
+ .state
+ .as_option()
+ .unwrap()
+ .local_aliases
+ .items()
+ .iter()
+ {
+ if relevant_keys.contains_key(k) {
+ continue;
+ }
+ if let Some(key) = garage.key_table.get(&EmptyKey, k).await? {
+ if !key.state.is_deleted() {
+ relevant_keys.insert(k.clone(), key);
+ }
+ }
+ }
+
+ let state = bucket.state.as_option().unwrap();
+
+ let quotas = state.quotas.get();
+ let res =
+ GetBucketInfoResult {
+ id: hex::encode(&bucket.id),
+ global_aliases: state
+ .aliases
+ .items()
+ .iter()
+ .filter(|(_, _, a)| *a)
+ .map(|(n, _, _)| n.to_string())
+ .collect::<Vec<_>>(),
+ website_access: state.website_config.get().is_some(),
+ website_config: state.website_config.get().clone().map(|wsc| {
+ GetBucketInfoWebsiteResult {
+ index_document: wsc.index_document,
+ error_document: wsc.error_document,
+ }
+ }),
+ keys: relevant_keys
+ .into_iter()
+ .map(|(_, key)| {
+ let p = key.state.as_option().unwrap();
+ GetBucketInfoKey {
+ access_key_id: key.key_id,
+ name: p.name.get().to_string(),
+ permissions: p
+ .authorized_buckets
+ .get(&bucket.id)
+ .map(|p| ApiBucketKeyPerm {
+ read: p.allow_read,
+ write: p.allow_write,
+ owner: p.allow_owner,
+ })
+ .unwrap_or_default(),
+ bucket_local_aliases: p
+ .local_aliases
+ .items()
+ .iter()
+ .filter(|(_, _, b)| *b == Some(bucket.id))
+ .map(|(n, _, _)| n.to_string())
+ .collect::<Vec<_>>(),
+ }
+ })
+ .collect::<Vec<_>>(),
+ objects: counters.get(OBJECTS).cloned().unwrap_or_default(),
+ bytes: counters.get(BYTES).cloned().unwrap_or_default(),
+ unfinshed_uploads: counters
+ .get(UNFINISHED_UPLOADS)
+ .cloned()
+ .unwrap_or_default(),
+ quotas: ApiBucketQuotas {
+ max_size: quotas.max_size,
+ max_objects: quotas.max_objects,
+ },
+ };
+
+ Ok(json_ok_response(&res)?)
+}
+
+#[derive(Serialize)]
+#[serde(rename_all = "camelCase")]
+struct GetBucketInfoResult {
+ id: String,
+ global_aliases: Vec<String>,
+ website_access: bool,
+ #[serde(default)]
+ website_config: Option<GetBucketInfoWebsiteResult>,
+ keys: Vec<GetBucketInfoKey>,
+ objects: i64,
+ bytes: i64,
+ unfinshed_uploads: i64,
+ quotas: ApiBucketQuotas,
+}
+
+#[derive(Serialize)]
+#[serde(rename_all = "camelCase")]
+struct GetBucketInfoWebsiteResult {
+ index_document: String,
+ error_document: Option<String>,
+}
+
+#[derive(Serialize)]
+#[serde(rename_all = "camelCase")]
+struct GetBucketInfoKey {
+ access_key_id: String,
+ name: String,
+ permissions: ApiBucketKeyPerm,
+ bucket_local_aliases: Vec<String>,
+}
+
+pub async fn handle_create_bucket(
+ garage: &Arc<Garage>,
+ req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+ let req = parse_json_body::<CreateBucketRequest>(req).await?;
+
+ if let Some(ga) = &req.global_alias {
+ if !is_valid_bucket_name(ga) {
+ return Err(Error::bad_request(format!(
+ "{}: {}",
+ ga, INVALID_BUCKET_NAME_MESSAGE
+ )));
+ }
+
+ if let Some(alias) = garage.bucket_alias_table.get(&EmptyKey, ga).await? {
+ if alias.state.get().is_some() {
+ return Err(CommonError::BucketAlreadyExists.into());
+ }
+ }
+ }
+
+ if let Some(la) = &req.local_alias {
+ if !is_valid_bucket_name(&la.alias) {
+ return Err(Error::bad_request(format!(
+ "{}: {}",
+ la.alias, INVALID_BUCKET_NAME_MESSAGE
+ )));
+ }
+
+ let key = garage
+ .key_helper()
+ .get_existing_key(&la.access_key_id)
+ .await?;
+ let state = key.state.as_option().unwrap();
+ if matches!(state.local_aliases.get(&la.alias), Some(_)) {
+ return Err(Error::bad_request("Local alias already exists"));
+ }
+ }
+
+ let bucket = Bucket::new();
+ garage.bucket_table.insert(&bucket).await?;
+
+ if let Some(ga) = &req.global_alias {
+ garage
+ .bucket_helper()
+ .set_global_bucket_alias(bucket.id, ga)
+ .await?;
+ }
+
+ if let Some(la) = &req.local_alias {
+ garage
+ .bucket_helper()
+ .set_local_bucket_alias(bucket.id, &la.access_key_id, &la.alias)
+ .await?;
+
+ if la.allow.read || la.allow.write || la.allow.owner {
+ garage
+ .bucket_helper()
+ .set_bucket_key_permissions(
+ bucket.id,
+ &la.access_key_id,
+ BucketKeyPerm {
+ timestamp: now_msec(),
+ allow_read: la.allow.read,
+ allow_write: la.allow.write,
+ allow_owner: la.allow.owner,
+ },
+ )
+ .await?;
+ }
+ }
+
+ bucket_info_results(garage, bucket.id).await
+}
+
+#[derive(Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct CreateBucketRequest {
+ global_alias: Option<String>,
+ local_alias: Option<CreateBucketLocalAlias>,
+}
+
+#[derive(Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct CreateBucketLocalAlias {
+ access_key_id: String,
+ alias: String,
+ #[serde(default)]
+ allow: ApiBucketKeyPerm,
+}
+
+pub async fn handle_delete_bucket(
+ garage: &Arc<Garage>,
+ id: String,
+) -> Result<Response<Body>, Error> {
+ let helper = garage.bucket_helper();
+
+ let bucket_id = parse_bucket_id(&id)?;
+
+ let mut bucket = helper.get_existing_bucket(bucket_id).await?;
+ let state = bucket.state.as_option().unwrap();
+
+ // Check bucket is empty
+ if !helper.is_bucket_empty(bucket_id).await? {
+ return Err(CommonError::BucketNotEmpty.into());
+ }
+
+ // --- done checking, now commit ---
+ // 1. delete authorization from keys that had access
+ for (key_id, perm) in bucket.authorized_keys() {
+ if perm.is_any() {
+ helper
+ .set_bucket_key_permissions(bucket.id, key_id, BucketKeyPerm::NO_PERMISSIONS)
+ .await?;
+ }
+ }
+ // 2. delete all local aliases
+ for ((key_id, alias), _, active) in state.local_aliases.items().iter() {
+ if *active {
+ helper
+ .unset_local_bucket_alias(bucket.id, key_id, alias)
+ .await?;
+ }
+ }
+ // 3. delete all global aliases
+ for (alias, _, active) in state.aliases.items().iter() {
+ if *active {
+ helper.purge_global_bucket_alias(bucket.id, alias).await?;
+ }
+ }
+
+ // 4. delete bucket
+ bucket.state = Deletable::delete();
+ garage.bucket_table.insert(&bucket).await?;
+
+ Ok(Response::builder()
+ .status(StatusCode::NO_CONTENT)
+ .body(Body::empty())?)
+}
+
+pub async fn handle_update_bucket(
+ garage: &Arc<Garage>,
+ id: String,
+ req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+ let req = parse_json_body::<UpdateBucketRequest>(req).await?;
+ let bucket_id = parse_bucket_id(&id)?;
+
+ let mut bucket = garage
+ .bucket_helper()
+ .get_existing_bucket(bucket_id)
+ .await?;
+
+ let state = bucket.state.as_option_mut().unwrap();
+
+ if let Some(wa) = req.website_access {
+ if wa.enabled {
+ state.website_config.update(Some(WebsiteConfig {
+ index_document: wa.index_document.ok_or_bad_request(
+ "Please specify indexDocument when enabling website access.",
+ )?,
+ error_document: wa.error_document,
+ }));
+ } else {
+ if wa.index_document.is_some() || wa.error_document.is_some() {
+ return Err(Error::bad_request(
+ "Cannot specify indexDocument or errorDocument when disabling website access.",
+ ));
+ }
+ state.website_config.update(None);
+ }
+ }
+
+ if let Some(q) = req.quotas {
+ state.quotas.update(BucketQuotas {
+ max_size: q.max_size,
+ max_objects: q.max_objects,
+ });
+ }
+
+ garage.bucket_table.insert(&bucket).await?;
+
+ bucket_info_results(garage, bucket_id).await
+}
+
+#[derive(Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct UpdateBucketRequest {
+ website_access: Option<UpdateBucketWebsiteAccess>,
+ quotas: Option<ApiBucketQuotas>,
+}
+
+#[derive(Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct UpdateBucketWebsiteAccess {
+ enabled: bool,
+ index_document: Option<String>,
+ error_document: Option<String>,
+}
+
+// ---- BUCKET/KEY PERMISSIONS ----
+
+pub async fn handle_bucket_change_key_perm(
+ garage: &Arc<Garage>,
+ req: Request<Body>,
+ new_perm_flag: bool,
+) -> Result<Response<Body>, Error> {
+ let req = parse_json_body::<BucketKeyPermChangeRequest>(req).await?;
+
+ let bucket_id = parse_bucket_id(&req.bucket_id)?;
+
+ let bucket = garage
+ .bucket_helper()
+ .get_existing_bucket(bucket_id)
+ .await?;
+ let state = bucket.state.as_option().unwrap();
+
+ let key = garage
+ .key_helper()
+ .get_existing_key(&req.access_key_id)
+ .await?;
+
+ let mut perm = state
+ .authorized_keys
+ .get(&key.key_id)
+ .cloned()
+ .unwrap_or(BucketKeyPerm::NO_PERMISSIONS);
+
+ if req.permissions.read {
+ perm.allow_read = new_perm_flag;
+ }
+ if req.permissions.write {
+ perm.allow_write = new_perm_flag;
+ }
+ if req.permissions.owner {
+ perm.allow_owner = new_perm_flag;
+ }
+
+ garage
+ .bucket_helper()
+ .set_bucket_key_permissions(bucket.id, &key.key_id, perm)
+ .await?;
+
+ bucket_info_results(garage, bucket.id).await
+}
+
+#[derive(Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct BucketKeyPermChangeRequest {
+ bucket_id: String,
+ access_key_id: String,
+ permissions: ApiBucketKeyPerm,
+}
+
+// ---- BUCKET ALIASES ----
+
+pub async fn handle_global_alias_bucket(
+ garage: &Arc<Garage>,
+ bucket_id: String,
+ alias: String,
+) -> Result<Response<Body>, Error> {
+ let bucket_id = parse_bucket_id(&bucket_id)?;
+
+ garage
+ .bucket_helper()
+ .set_global_bucket_alias(bucket_id, &alias)
+ .await?;
+
+ bucket_info_results(garage, bucket_id).await
+}
+
+pub async fn handle_global_unalias_bucket(
+ garage: &Arc<Garage>,
+ bucket_id: String,
+ alias: String,
+) -> Result<Response<Body>, Error> {
+ let bucket_id = parse_bucket_id(&bucket_id)?;
+
+ garage
+ .bucket_helper()
+ .unset_global_bucket_alias(bucket_id, &alias)
+ .await?;
+
+ bucket_info_results(garage, bucket_id).await
+}
+
+pub async fn handle_local_alias_bucket(
+ garage: &Arc<Garage>,
+ bucket_id: String,
+ access_key_id: String,
+ alias: String,
+) -> Result<Response<Body>, Error> {
+ let bucket_id = parse_bucket_id(&bucket_id)?;
+
+ garage
+ .bucket_helper()
+ .set_local_bucket_alias(bucket_id, &access_key_id, &alias)
+ .await?;
+
+ bucket_info_results(garage, bucket_id).await
+}
+
+pub async fn handle_local_unalias_bucket(
+ garage: &Arc<Garage>,
+ bucket_id: String,
+ access_key_id: String,
+ alias: String,
+) -> Result<Response<Body>, Error> {
+ let bucket_id = parse_bucket_id(&bucket_id)?;
+
+ garage
+ .bucket_helper()
+ .unset_local_bucket_alias(bucket_id, &access_key_id, &alias)
+ .await?;
+
+ bucket_info_results(garage, bucket_id).await
+}
+
+// ---- HELPER ----
+
+fn parse_bucket_id(id: &str) -> Result<Uuid, Error> {
+ let id_hex = hex::decode(&id).ok_or_bad_request("Invalid bucket id")?;
+ Ok(Uuid::try_from(&id_hex).ok_or_bad_request("Invalid bucket id")?)
+}
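
As a usage sketch, the CreateBucketRequest structure above (with serde's camelCase renaming) deserializes a POST /v0/bucket body such as the following; the alias names and access key id are placeholders:

{
  "globalAlias": "my-bucket",
  "localAlias": {
    "accessKeyId": "<access key id>",
    "alias": "my-bucket",
    "allow": { "read": true, "write": true, "owner": false }
  }
}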
diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs
new file mode 100644
index 00000000..99c6e332
--- /dev/null
+++ b/src/api/admin/cluster.rs
@@ -0,0 +1,193 @@
+use std::collections::HashMap;
+use std::net::SocketAddr;
+use std::sync::Arc;
+
+use hyper::{Body, Request, Response, StatusCode};
+use serde::{Deserialize, Serialize};
+
+use garage_util::crdt::*;
+use garage_util::data::*;
+
+use garage_rpc::layout::*;
+
+use garage_model::garage::Garage;
+
+use crate::admin::error::*;
+use crate::helpers::{json_ok_response, parse_json_body};
+
+pub async fn handle_get_cluster_status(garage: &Arc<Garage>) -> Result<Response<Body>, Error> {
+ let res = GetClusterStatusResponse {
+ node: hex::encode(garage.system.id),
+ garage_version: garage_util::version::garage_version(),
+ garage_features: garage_util::version::garage_features(),
+ db_engine: garage.db.engine(),
+ known_nodes: garage
+ .system
+ .get_known_nodes()
+ .into_iter()
+ .map(|i| {
+ (
+ hex::encode(i.id),
+ KnownNodeResp {
+ addr: i.addr,
+ is_up: i.is_up,
+ last_seen_secs_ago: i.last_seen_secs_ago,
+ hostname: i.status.hostname,
+ },
+ )
+ })
+ .collect(),
+ layout: get_cluster_layout(garage),
+ };
+
+ Ok(json_ok_response(&res)?)
+}
+
+pub async fn handle_connect_cluster_nodes(
+ garage: &Arc<Garage>,
+ req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+ let req = parse_json_body::<Vec<String>>(req).await?;
+
+ let res = futures::future::join_all(req.iter().map(|node| garage.system.connect(node)))
+ .await
+ .into_iter()
+ .map(|r| match r {
+ Ok(()) => ConnectClusterNodesResponse {
+ success: true,
+ error: None,
+ },
+ Err(e) => ConnectClusterNodesResponse {
+ success: false,
+ error: Some(format!("{}", e)),
+ },
+ })
+ .collect::<Vec<_>>();
+
+ Ok(json_ok_response(&res)?)
+}
+
+pub async fn handle_get_cluster_layout(garage: &Arc<Garage>) -> Result<Response<Body>, Error> {
+ let res = get_cluster_layout(garage);
+
+ Ok(json_ok_response(&res)?)
+}
+
+fn get_cluster_layout(garage: &Arc<Garage>) -> GetClusterLayoutResponse {
+ let layout = garage.system.get_cluster_layout();
+
+ GetClusterLayoutResponse {
+ version: layout.version,
+ roles: layout
+ .roles
+ .items()
+ .iter()
+ .filter(|(_, _, v)| v.0.is_some())
+ .map(|(k, _, v)| (hex::encode(k), v.0.clone()))
+ .collect(),
+ staged_role_changes: layout
+ .staging
+ .items()
+ .iter()
+ .filter(|(k, _, v)| layout.roles.get(k) != Some(v))
+ .map(|(k, _, v)| (hex::encode(k), v.0.clone()))
+ .collect(),
+ }
+}
+
+#[derive(Serialize)]
+#[serde(rename_all = "camelCase")]
+struct GetClusterStatusResponse {
+ node: String,
+ garage_version: &'static str,
+ garage_features: Option<&'static [&'static str]>,
+ db_engine: String,
+ known_nodes: HashMap<String, KnownNodeResp>,
+ layout: GetClusterLayoutResponse,
+}
+
+#[derive(Serialize)]
+struct ConnectClusterNodesResponse {
+ success: bool,
+ error: Option<String>,
+}
+
+#[derive(Serialize)]
+#[serde(rename_all = "camelCase")]
+struct GetClusterLayoutResponse {
+ version: u64,
+ roles: HashMap<String, Option<NodeRole>>,
+ staged_role_changes: HashMap<String, Option<NodeRole>>,
+}
+
+#[derive(Serialize)]
+struct KnownNodeResp {
+ addr: SocketAddr,
+ is_up: bool,
+ last_seen_secs_ago: Option<u64>,
+ hostname: String,
+}
+
+pub async fn handle_update_cluster_layout(
+ garage: &Arc<Garage>,
+ req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+ let updates = parse_json_body::<UpdateClusterLayoutRequest>(req).await?;
+
+ let mut layout = garage.system.get_cluster_layout();
+
+ let mut roles = layout.roles.clone();
+ roles.merge(&layout.staging);
+
+ for (node, role) in updates {
+ let node = hex::decode(node).ok_or_bad_request("Invalid node identifier")?;
+ let node = Uuid::try_from(&node).ok_or_bad_request("Invalid node identifier")?;
+
+ layout
+ .staging
+ .merge(&roles.update_mutator(node, NodeRoleV(role)));
+ }
+
+ garage.system.update_cluster_layout(&layout).await?;
+
+ Ok(Response::builder()
+ .status(StatusCode::OK)
+ .body(Body::empty())?)
+}
+
+pub async fn handle_apply_cluster_layout(
+ garage: &Arc<Garage>,
+ req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+ let param = parse_json_body::<ApplyRevertLayoutRequest>(req).await?;
+
+ let layout = garage.system.get_cluster_layout();
+ let layout = layout.apply_staged_changes(Some(param.version))?;
+ garage.system.update_cluster_layout(&layout).await?;
+
+ Ok(Response::builder()
+ .status(StatusCode::OK)
+ .body(Body::empty())?)
+}
+
+pub async fn handle_revert_cluster_layout(
+ garage: &Arc<Garage>,
+ req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+ let param = parse_json_body::<ApplyRevertLayoutRequest>(req).await?;
+
+ let layout = garage.system.get_cluster_layout();
+ let layout = layout.revert_staged_changes(Some(param.version))?;
+ garage.system.update_cluster_layout(&layout).await?;
+
+ Ok(Response::builder()
+ .status(StatusCode::OK)
+ .body(Body::empty())?)
+}
+
+type UpdateClusterLayoutRequest = HashMap<String, Option<NodeRole>>;
+
+#[derive(Deserialize)]
+struct ApplyRevertLayoutRequest {
+ version: u64,
+}
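
Since UpdateClusterLayoutRequest is a HashMap<String, Option<NodeRole>>, a POST /v0/layout body maps hex node identifiers to either a new role or null to clear a staged role. A sketch, assuming NodeRole serializes with zone, capacity and tags fields:

{
  "<node id in hex>": { "zone": "dc1", "capacity": 4, "tags": ["fast-disks"] },
  "<other node id in hex>": null
}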
diff --git a/src/api/admin/error.rs b/src/api/admin/error.rs
new file mode 100644
index 00000000..ed1a07bd
--- /dev/null
+++ b/src/api/admin/error.rs
@@ -0,0 +1,97 @@
+use err_derive::Error;
+use hyper::header::HeaderValue;
+use hyper::{Body, HeaderMap, StatusCode};
+
+pub use garage_model::helper::error::Error as HelperError;
+
+use crate::common_error::CommonError;
+pub use crate::common_error::{CommonErrorDerivative, OkOrBadRequest, OkOrInternalError};
+use crate::generic_server::ApiError;
+use crate::helpers::CustomApiErrorBody;
+
+/// Errors of this crate
+#[derive(Debug, Error)]
+pub enum Error {
+ #[error(display = "{}", _0)]
+ /// Error from common error
+ Common(CommonError),
+
+ // Category: cannot process
+ /// The API access key does not exist
+ #[error(display = "Access key not found: {}", _0)]
+ NoSuchAccessKey(String),
+
+ /// In Import key, the key already exists
+ #[error(
+ display = "Key {} already exists in data store. Even if it is deleted, we can't let you create a new key with the same ID. Sorry.",
+ _0
+ )]
+ KeyAlreadyExists(String),
+}
+
+impl<T> From<T> for Error
+where
+ CommonError: From<T>,
+{
+ fn from(err: T) -> Self {
+ Error::Common(CommonError::from(err))
+ }
+}
+
+impl CommonErrorDerivative for Error {}
+
+impl From<HelperError> for Error {
+ fn from(err: HelperError) -> Self {
+ match err {
+ HelperError::Internal(i) => Self::Common(CommonError::InternalError(i)),
+ HelperError::BadRequest(b) => Self::Common(CommonError::BadRequest(b)),
+ HelperError::InvalidBucketName(n) => Self::Common(CommonError::InvalidBucketName(n)),
+ HelperError::NoSuchBucket(n) => Self::Common(CommonError::NoSuchBucket(n)),
+ HelperError::NoSuchAccessKey(n) => Self::NoSuchAccessKey(n),
+ }
+ }
+}
+
+impl Error {
+ fn code(&self) -> &'static str {
+ match self {
+ Error::Common(c) => c.aws_code(),
+ Error::NoSuchAccessKey(_) => "NoSuchAccessKey",
+ Error::KeyAlreadyExists(_) => "KeyAlreadyExists",
+ }
+ }
+}
+
+impl ApiError for Error {
+ /// Get the HTTP status code that best represents the meaning of the error for the client
+ fn http_status_code(&self) -> StatusCode {
+ match self {
+ Error::Common(c) => c.http_status_code(),
+ Error::NoSuchAccessKey(_) => StatusCode::NOT_FOUND,
+ Error::KeyAlreadyExists(_) => StatusCode::CONFLICT,
+ }
+ }
+
+ fn add_http_headers(&self, header_map: &mut HeaderMap<HeaderValue>) {
+ use hyper::header;
+ header_map.append(header::CONTENT_TYPE, "application/json".parse().unwrap());
+ }
+
+ fn http_body(&self, garage_region: &str, path: &str) -> Body {
+ let error = CustomApiErrorBody {
+ code: self.code().to_string(),
+ message: format!("{}", self),
+ path: path.to_string(),
+ region: garage_region.to_string(),
+ };
+ Body::from(serde_json::to_string_pretty(&error).unwrap_or_else(|_| {
+ r#"
+{
+ "code": "InternalError",
+ "message": "JSON encoding of error failed"
+}
+ "#
+ .into()
+ }))
+ }
+}
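
For illustration, http_body above renders errors as a JSON document built from CustomApiErrorBody; assuming its fields serialize under these names, a NoSuchAccessKey error would look roughly like:

{
  "code": "NoSuchAccessKey",
  "message": "Access key not found: <key id>",
  "path": "/v0/key",
  "region": "garage"
}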
diff --git a/src/api/admin/key.rs b/src/api/admin/key.rs
new file mode 100644
index 00000000..2bbabb7b
--- /dev/null
+++ b/src/api/admin/key.rs
@@ -0,0 +1,256 @@
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use hyper::{Body, Request, Response, StatusCode};
+use serde::{Deserialize, Serialize};
+
+use garage_table::*;
+
+use garage_model::garage::Garage;
+use garage_model::key_table::*;
+
+use crate::admin::error::*;
+use crate::helpers::{json_ok_response, parse_json_body};
+
+pub async fn handle_list_keys(garage: &Arc<Garage>) -> Result<Response<Body>, Error> {
+ let res = garage
+ .key_table
+ .get_range(
+ &EmptyKey,
+ None,
+ Some(KeyFilter::Deleted(DeletedFilter::NotDeleted)),
+ 10000,
+ EnumerationOrder::Forward,
+ )
+ .await?
+ .iter()
+ .map(|k| ListKeyResultItem {
+ id: k.key_id.to_string(),
+ name: k.params().unwrap().name.get().clone(),
+ })
+ .collect::<Vec<_>>();
+
+ Ok(json_ok_response(&res)?)
+}
+
+#[derive(Serialize)]
+struct ListKeyResultItem {
+ id: String,
+ name: String,
+}
+
+pub async fn handle_get_key_info(
+ garage: &Arc<Garage>,
+ id: Option<String>,
+ search: Option<String>,
+) -> Result<Response<Body>, Error> {
+ let key = if let Some(id) = id {
+ garage.key_helper().get_existing_key(&id).await?
+ } else if let Some(search) = search {
+ garage
+ .key_helper()
+ .get_existing_matching_key(&search)
+ .await?
+ } else {
+ unreachable!();
+ };
+
+ key_info_results(garage, key).await
+}
+
+pub async fn handle_create_key(
+ garage: &Arc<Garage>,
+ req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+ let req = parse_json_body::<CreateKeyRequest>(req).await?;
+
+ let key = Key::new(&req.name);
+ garage.key_table.insert(&key).await?;
+
+ key_info_results(garage, key).await
+}
+
+#[derive(Deserialize)]
+struct CreateKeyRequest {
+ name: String,
+}
+
+pub async fn handle_import_key(
+ garage: &Arc<Garage>,
+ req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+ let req = parse_json_body::<ImportKeyRequest>(req).await?;
+
+ let prev_key = garage.key_table.get(&EmptyKey, &req.access_key_id).await?;
+ if prev_key.is_some() {
+ return Err(Error::KeyAlreadyExists(req.access_key_id.to_string()));
+ }
+
+ let imported_key = Key::import(&req.access_key_id, &req.secret_access_key, &req.name);
+ garage.key_table.insert(&imported_key).await?;
+
+ key_info_results(garage, imported_key).await
+}
+
+#[derive(Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct ImportKeyRequest {
+ access_key_id: String,
+ secret_access_key: String,
+ name: String,
+}
+
+pub async fn handle_update_key(
+ garage: &Arc<Garage>,
+ id: String,
+ req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+ let req = parse_json_body::<UpdateKeyRequest>(req).await?;
+
+ let mut key = garage.key_helper().get_existing_key(&id).await?;
+
+ let key_state = key.state.as_option_mut().unwrap();
+
+ if let Some(new_name) = req.name {
+ key_state.name.update(new_name);
+ }
+ if let Some(allow) = req.allow {
+ if allow.create_bucket {
+ key_state.allow_create_bucket.update(true);
+ }
+ }
+ if let Some(deny) = req.deny {
+ if deny.create_bucket {
+ key_state.allow_create_bucket.update(false);
+ }
+ }
+
+ garage.key_table.insert(&key).await?;
+
+ key_info_results(garage, key).await
+}
+
+#[derive(Deserialize)]
+struct UpdateKeyRequest {
+ name: Option<String>,
+ allow: Option<KeyPerm>,
+ deny: Option<KeyPerm>,
+}
+
+pub async fn handle_delete_key(garage: &Arc<Garage>, id: String) -> Result<Response<Body>, Error> {
+ let mut key = garage.key_helper().get_existing_key(&id).await?;
+
+ key.state.as_option().unwrap();
+
+ garage.key_helper().delete_key(&mut key).await?;
+
+ Ok(Response::builder()
+ .status(StatusCode::NO_CONTENT)
+ .body(Body::empty())?)
+}
+
+async fn key_info_results(garage: &Arc<Garage>, key: Key) -> Result<Response<Body>, Error> {
+ let mut relevant_buckets = HashMap::new();
+
+ let key_state = key.state.as_option().unwrap();
+
+ for id in key_state
+ .authorized_buckets
+ .items()
+ .iter()
+ .map(|(id, _)| id)
+ .chain(
+ key_state
+ .local_aliases
+ .items()
+ .iter()
+ .filter_map(|(_, _, v)| v.as_ref()),
+ ) {
+ if !relevant_buckets.contains_key(id) {
+ if let Some(b) = garage.bucket_table.get(&EmptyKey, id).await? {
+ if b.state.as_option().is_some() {
+ relevant_buckets.insert(*id, b);
+ }
+ }
+ }
+ }
+
+ let res = GetKeyInfoResult {
+ name: key_state.name.get().clone(),
+ access_key_id: key.key_id.clone(),
+ secret_access_key: key_state.secret_key.clone(),
+ permissions: KeyPerm {
+ create_bucket: *key_state.allow_create_bucket.get(),
+ },
+ buckets: relevant_buckets
+ .into_iter()
+ .map(|(_, bucket)| {
+ let state = bucket.state.as_option().unwrap();
+ KeyInfoBucketResult {
+ id: hex::encode(bucket.id),
+ global_aliases: state
+ .aliases
+ .items()
+ .iter()
+ .filter(|(_, _, a)| *a)
+ .map(|(n, _, _)| n.to_string())
+ .collect::<Vec<_>>(),
+ local_aliases: state
+ .local_aliases
+ .items()
+ .iter()
+ .filter(|((k, _), _, a)| *a && *k == key.key_id)
+ .map(|((_, n), _, _)| n.to_string())
+ .collect::<Vec<_>>(),
+ permissions: key_state
+ .authorized_buckets
+ .get(&bucket.id)
+ .map(|p| ApiBucketKeyPerm {
+ read: p.allow_read,
+ write: p.allow_write,
+ owner: p.allow_owner,
+ })
+ .unwrap_or_default(),
+ }
+ })
+ .collect::<Vec<_>>(),
+ };
+
+ Ok(json_ok_response(&res)?)
+}
+
+#[derive(Serialize)]
+#[serde(rename_all = "camelCase")]
+struct GetKeyInfoResult {
+ name: String,
+ access_key_id: String,
+ secret_access_key: String,
+ permissions: KeyPerm,
+ buckets: Vec<KeyInfoBucketResult>,
+}
+
+#[derive(Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct KeyPerm {
+ #[serde(default)]
+ create_bucket: bool,
+}
+
+#[derive(Serialize)]
+#[serde(rename_all = "camelCase")]
+struct KeyInfoBucketResult {
+ id: String,
+ global_aliases: Vec<String>,
+ local_aliases: Vec<String>,
+ permissions: ApiBucketKeyPerm,
+}
+
+#[derive(Serialize, Deserialize, Default)]
+pub(crate) struct ApiBucketKeyPerm {
+ #[serde(default)]
+ pub(crate) read: bool,
+ #[serde(default)]
+ pub(crate) write: bool,
+ #[serde(default)]
+ pub(crate) owner: bool,
+}
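
A usage sketch for POST /v0/key/import: ImportKeyRequest (camelCase) expects a body like the following, with placeholder credentials:

{
  "accessKeyId": "<access key id>",
  "secretAccessKey": "<secret access key>",
  "name": "imported-key"
}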
diff --git a/src/api/admin/mod.rs b/src/api/admin/mod.rs
new file mode 100644
index 00000000..c4857c10
--- /dev/null
+++ b/src/api/admin/mod.rs
@@ -0,0 +1,7 @@
+pub mod api_server;
+mod error;
+mod router;
+
+mod bucket;
+mod cluster;
+mod key;
diff --git a/src/api/admin/router.rs b/src/api/admin/router.rs
new file mode 100644
index 00000000..3eee8b67
--- /dev/null
+++ b/src/api/admin/router.rs
@@ -0,0 +1,145 @@
+use std::borrow::Cow;
+
+use hyper::{Method, Request};
+
+use crate::admin::error::*;
+use crate::router_macros::*;
+
+pub enum Authorization {
+ MetricsToken,
+ AdminToken,
+}
+
+router_match! {@func
+
+/// List of all Admin API endpoints.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Endpoint {
+ Options,
+ Metrics,
+ GetClusterStatus,
+ ConnectClusterNodes,
+ // Layout
+ GetClusterLayout,
+ UpdateClusterLayout,
+ ApplyClusterLayout,
+ RevertClusterLayout,
+ // Keys
+ ListKeys,
+ CreateKey,
+ ImportKey,
+ GetKeyInfo {
+ id: Option<String>,
+ search: Option<String>,
+ },
+ DeleteKey {
+ id: String,
+ },
+ UpdateKey {
+ id: String,
+ },
+ // Buckets
+ ListBuckets,
+ CreateBucket,
+ GetBucketInfo {
+ id: Option<String>,
+ global_alias: Option<String>,
+ },
+ DeleteBucket {
+ id: String,
+ },
+ UpdateBucket {
+ id: String,
+ },
+ // Bucket-Key Permissions
+ BucketAllowKey,
+ BucketDenyKey,
+ // Bucket aliases
+ GlobalAliasBucket {
+ id: String,
+ alias: String,
+ },
+ GlobalUnaliasBucket {
+ id: String,
+ alias: String,
+ },
+ LocalAliasBucket {
+ id: String,
+ access_key_id: String,
+ alias: String,
+ },
+ LocalUnaliasBucket {
+ id: String,
+ access_key_id: String,
+ alias: String,
+ },
+}}
+
+impl Endpoint {
+ /// Determine which admin API endpoint a request is for, using the request
+ /// method, path and query parameters.
+ pub fn from_request<T>(req: &Request<T>) -> Result<Self, Error> {
+ let uri = req.uri();
+ let path = uri.path();
+ let query = uri.query();
+
+ let mut query = QueryParameters::from_query(query.unwrap_or_default())?;
+
+ let res = router_match!(@gen_path_parser (req.method(), path, query) [
+ OPTIONS _ => Options,
+ GET "/metrics" => Metrics,
+ GET "/v0/status" => GetClusterStatus,
+ POST "/v0/connect" => ConnectClusterNodes,
+ // Layout endpoints
+ GET "/v0/layout" => GetClusterLayout,
+ POST "/v0/layout" => UpdateClusterLayout,
+ POST "/v0/layout/apply" => ApplyClusterLayout,
+ POST "/v0/layout/revert" => RevertClusterLayout,
+ // API key endpoints
+ GET "/v0/key" if id => GetKeyInfo (query_opt::id, query_opt::search),
+ GET "/v0/key" if search => GetKeyInfo (query_opt::id, query_opt::search),
+ POST "/v0/key" if id => UpdateKey (query::id),
+ POST "/v0/key" => CreateKey,
+ POST "/v0/key/import" => ImportKey,
+ DELETE "/v0/key" if id => DeleteKey (query::id),
+ GET "/v0/key" => ListKeys,
+ // Bucket endpoints
+ GET "/v0/bucket" if id => GetBucketInfo (query_opt::id, query_opt::global_alias),
+ GET "/v0/bucket" if global_alias => GetBucketInfo (query_opt::id, query_opt::global_alias),
+ GET "/v0/bucket" => ListBuckets,
+ POST "/v0/bucket" => CreateBucket,
+ DELETE "/v0/bucket" if id => DeleteBucket (query::id),
+ PUT "/v0/bucket" if id => UpdateBucket (query::id),
+ // Bucket-key permissions
+ POST "/v0/bucket/allow" => BucketAllowKey,
+ POST "/v0/bucket/deny" => BucketDenyKey,
+ // Bucket aliases
+ PUT "/v0/bucket/alias/global" => GlobalAliasBucket (query::id, query::alias),
+ DELETE "/v0/bucket/alias/global" => GlobalUnaliasBucket (query::id, query::alias),
+ PUT "/v0/bucket/alias/local" => LocalAliasBucket (query::id, query::access_key_id, query::alias),
+ DELETE "/v0/bucket/alias/local" => LocalUnaliasBucket (query::id, query::access_key_id, query::alias),
+ ]);
+
+ if let Some(message) = query.nonempty_message() {
+ debug!("Unused query parameter: {}", message)
+ }
+
+ Ok(res)
+ }
+ /// Get the kind of authorization which is required to perform the operation.
+ pub fn authorization_type(&self) -> Authorization {
+ match self {
+ Self::Metrics => Authorization::MetricsToken,
+ _ => Authorization::AdminToken,
+ }
+ }
+}
+
+generateQueryParameters! {
+ "id" => id,
+ "search" => search,
+ "globalAlias" => global_alias,
+ "alias" => alias,
+ "accessKeyId" => access_key_id
+}
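
Tying the routing table above to the token check in api_server.rs, an authorized admin request is an ordinary HTTP request carrying a bearer token; a sketch, with placeholder host, port, token and bucket id:

GET /v0/bucket?id=<bucket id in hex> HTTP/1.1
Host: localhost:3903
Authorization: Bearer <admin token>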
diff --git a/src/api/api_server.rs b/src/api/api_server.rs
deleted file mode 100644
index e7b86d9e..00000000
--- a/src/api/api_server.rs
+++ /dev/null
@@ -1,645 +0,0 @@
-use std::net::SocketAddr;
-use std::sync::Arc;
-
-use chrono::{DateTime, NaiveDateTime, Utc};
-use futures::future::Future;
-use futures::prelude::*;
-use hyper::header;
-use hyper::server::conn::AddrStream;
-use hyper::service::{make_service_fn, service_fn};
-use hyper::{Body, Method, Request, Response, Server};
-
-use opentelemetry::{
- global,
- metrics::{Counter, ValueRecorder},
- trace::{FutureExt, TraceContextExt, Tracer},
- Context, KeyValue,
-};
-
-use garage_util::data::*;
-use garage_util::error::Error as GarageError;
-use garage_util::metrics::{gen_trace_id, RecordDuration};
-
-use garage_model::garage::Garage;
-use garage_model::key_table::Key;
-
-use garage_table::util::*;
-
-use crate::error::*;
-use crate::signature::compute_scope;
-use crate::signature::payload::check_payload_signature;
-use crate::signature::streaming::SignedPayloadStream;
-use crate::signature::LONG_DATETIME;
-
-use crate::helpers::*;
-use crate::s3_bucket::*;
-use crate::s3_copy::*;
-use crate::s3_cors::*;
-use crate::s3_delete::*;
-use crate::s3_get::*;
-use crate::s3_list::*;
-use crate::s3_post_object::handle_post_object;
-use crate::s3_put::*;
-use crate::s3_router::{Authorization, Endpoint};
-use crate::s3_website::*;
-
-struct ApiMetrics {
- request_counter: Counter<u64>,
- error_counter: Counter<u64>,
- request_duration: ValueRecorder<f64>,
-}
-
-impl ApiMetrics {
- fn new() -> Self {
- let meter = global::meter("garage/api");
- Self {
- request_counter: meter
- .u64_counter("api.request_counter")
- .with_description("Number of API calls to the various S3 API endpoints")
- .init(),
- error_counter: meter
- .u64_counter("api.error_counter")
- .with_description(
- "Number of API calls to the various S3 API endpoints that resulted in errors",
- )
- .init(),
- request_duration: meter
- .f64_value_recorder("api.request_duration")
- .with_description("Duration of API calls to the various S3 API endpoints")
- .init(),
- }
- }
-}
-
-/// Run the S3 API server
-pub async fn run_api_server(
- garage: Arc<Garage>,
- shutdown_signal: impl Future<Output = ()>,
-) -> Result<(), GarageError> {
- let addr = &garage.config.s3_api.api_bind_addr;
-
- let metrics = Arc::new(ApiMetrics::new());
-
- let service = make_service_fn(|conn: &AddrStream| {
- let garage = garage.clone();
- let metrics = metrics.clone();
-
- let client_addr = conn.remote_addr();
- async move {
- Ok::<_, GarageError>(service_fn(move |req: Request<Body>| {
- let garage = garage.clone();
- let metrics = metrics.clone();
-
- handler(garage, metrics, req, client_addr)
- }))
- }
- });
-
- let server = Server::bind(addr).serve(service);
-
- let graceful = server.with_graceful_shutdown(shutdown_signal);
- info!("API server listening on http://{}", addr);
-
- graceful.await?;
- Ok(())
-}
-
-async fn handler(
- garage: Arc<Garage>,
- metrics: Arc<ApiMetrics>,
- req: Request<Body>,
- addr: SocketAddr,
-) -> Result<Response<Body>, GarageError> {
- let uri = req.uri().clone();
- info!("{} {} {}", addr, req.method(), uri);
- debug!("{:?}", req);
-
- let tracer = opentelemetry::global::tracer("garage");
- let span = tracer
- .span_builder("S3 API call (unknown)")
- .with_trace_id(gen_trace_id())
- .with_attributes(vec![
- KeyValue::new("method", format!("{}", req.method())),
- KeyValue::new("uri", req.uri().to_string()),
- ])
- .start(&tracer);
-
- let res = handler_stage2(garage.clone(), metrics, req)
- .with_context(Context::current_with_span(span))
- .await;
-
- match res {
- Ok(x) => {
- debug!("{} {:?}", x.status(), x.headers());
- Ok(x)
- }
- Err(e) => {
- let body: Body = Body::from(e.aws_xml(&garage.config.s3_api.s3_region, uri.path()));
- let mut http_error_builder = Response::builder()
- .status(e.http_status_code())
- .header("Content-Type", "application/xml");
-
- if let Some(header_map) = http_error_builder.headers_mut() {
- e.add_headers(header_map)
- }
-
- let http_error = http_error_builder.body(body)?;
-
- if e.http_status_code().is_server_error() {
- warn!("Response: error {}, {}", e.http_status_code(), e);
- } else {
- info!("Response: error {}, {}", e.http_status_code(), e);
- }
- Ok(http_error)
- }
- }
-}
-
-async fn handler_stage2(
- garage: Arc<Garage>,
- metrics: Arc<ApiMetrics>,
- req: Request<Body>,
-) -> Result<Response<Body>, Error> {
- let authority = req
- .headers()
- .get(header::HOST)
- .ok_or_bad_request("Host header required")?
- .to_str()?;
-
- let host = authority_to_host(authority)?;
-
- let bucket_name = garage
- .config
- .s3_api
- .root_domain
- .as_ref()
- .and_then(|root_domain| host_to_bucket(&host, root_domain));
-
- let (endpoint, bucket_name) = Endpoint::from_request(&req, bucket_name.map(ToOwned::to_owned))?;
- debug!("Endpoint: {:?}", endpoint);
-
- let current_context = Context::current();
- let current_span = current_context.span();
- current_span.update_name::<String>(format!("S3 API {}", endpoint.name()));
- current_span.set_attribute(KeyValue::new("endpoint", endpoint.name()));
- current_span.set_attribute(KeyValue::new(
- "bucket",
- bucket_name.clone().unwrap_or_default(),
- ));
-
- let metrics_tags = &[KeyValue::new("api_endpoint", endpoint.name())];
-
- let res = handler_stage3(garage, req, endpoint, bucket_name)
- .record_duration(&metrics.request_duration, &metrics_tags[..])
- .await;
-
- metrics.request_counter.add(1, &metrics_tags[..]);
-
- let status_code = match &res {
- Ok(r) => r.status(),
- Err(e) => e.http_status_code(),
- };
- if status_code.is_client_error() || status_code.is_server_error() {
- metrics.error_counter.add(
- 1,
- &[
- metrics_tags[0].clone(),
- KeyValue::new("status_code", status_code.as_str().to_string()),
- ],
- );
- }
-
- res
-}
-
-async fn handler_stage3(
- garage: Arc<Garage>,
- req: Request<Body>,
- endpoint: Endpoint,
- bucket_name: Option<String>,
-) -> Result<Response<Body>, Error> {
- // Some endpoints are processed early, before we even check for an API key
- if let Endpoint::PostObject = endpoint {
- return handle_post_object(garage, req, bucket_name.unwrap()).await;
- }
- if let Endpoint::Options = endpoint {
- return handle_options_s3api(garage, &req, bucket_name).await;
- }
-
- let (api_key, mut content_sha256) = check_payload_signature(&garage, &req).await?;
- let api_key = api_key.ok_or_else(|| {
- Error::Forbidden("Garage does not support anonymous access yet".to_string())
- })?;
-
- let req = match req.headers().get("x-amz-content-sha256") {
- Some(header) if header == "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" => {
- let signature = content_sha256
- .take()
- .ok_or_bad_request("No signature provided")?;
-
- let secret_key = &api_key
- .state
- .as_option()
- .ok_or_internal_error("Deleted key state")?
- .secret_key;
-
- let date = req
- .headers()
- .get("x-amz-date")
- .ok_or_bad_request("Missing X-Amz-Date field")?
- .to_str()?;
- let date: NaiveDateTime = NaiveDateTime::parse_from_str(date, LONG_DATETIME)
- .ok_or_bad_request("Invalid date")?;
- let date: DateTime<Utc> = DateTime::from_utc(date, Utc);
-
- let scope = compute_scope(&date, &garage.config.s3_api.s3_region);
- let signing_hmac = crate::signature::signing_hmac(
- &date,
- secret_key,
- &garage.config.s3_api.s3_region,
- "s3",
- )
- .ok_or_internal_error("Unable to build signing HMAC")?;
-
- req.map(move |body| {
- Body::wrap_stream(
- SignedPayloadStream::new(
- body.map_err(Error::from),
- signing_hmac,
- date,
- &scope,
- signature,
- )
- .map_err(Error::from),
- )
- })
- }
- _ => req,
- };
-
- let bucket_name = match bucket_name {
- None => return handle_request_without_bucket(garage, req, api_key, endpoint).await,
- Some(bucket) => bucket.to_string(),
- };
-
- // Special code path for CreateBucket API endpoint
- if let Endpoint::CreateBucket {} = endpoint {
- return handle_create_bucket(&garage, req, content_sha256, api_key, bucket_name).await;
- }
-
- let bucket_id = resolve_bucket(&garage, &bucket_name, &api_key).await?;
- let bucket = garage
- .bucket_table
- .get(&EmptyKey, &bucket_id)
- .await?
- .filter(|b| !b.state.is_deleted())
- .ok_or(Error::NoSuchBucket)?;
-
- let allowed = match endpoint.authorization_type() {
- Authorization::Read => api_key.allow_read(&bucket_id),
- Authorization::Write => api_key.allow_write(&bucket_id),
- Authorization::Owner => api_key.allow_owner(&bucket_id),
- _ => unreachable!(),
- };
-
- if !allowed {
- return Err(Error::Forbidden(
- "Operation is not allowed for this key.".to_string(),
- ));
- }
-
- // Look up what CORS rule might apply to response.
- // Requests for methods different than GET, HEAD or POST
- // are always preflighted, i.e. the browser should make
- // an OPTIONS call before to check it is allowed
- let matching_cors_rule = match *req.method() {
- Method::GET | Method::HEAD | Method::POST => find_matching_cors_rule(&bucket, &req)?,
- _ => None,
- };
-
- let resp = match endpoint {
- Endpoint::HeadObject {
- key, part_number, ..
- } => handle_head(garage, &req, bucket_id, &key, part_number).await,
- Endpoint::GetObject {
- key, part_number, ..
- } => handle_get(garage, &req, bucket_id, &key, part_number).await,
- Endpoint::UploadPart {
- key,
- part_number,
- upload_id,
- } => {
- handle_put_part(
- garage,
- req,
- bucket_id,
- &key,
- part_number,
- &upload_id,
- content_sha256,
- )
- .await
- }
- Endpoint::CopyObject { key } => handle_copy(garage, &api_key, &req, bucket_id, &key).await,
- Endpoint::UploadPartCopy {
- key,
- part_number,
- upload_id,
- } => {
- handle_upload_part_copy(
- garage,
- &api_key,
- &req,
- bucket_id,
- &key,
- part_number,
- &upload_id,
- )
- .await
- }
- Endpoint::PutObject { key } => {
- handle_put(garage, req, bucket_id, &key, content_sha256).await
- }
- Endpoint::AbortMultipartUpload { key, upload_id } => {
- handle_abort_multipart_upload(garage, bucket_id, &key, &upload_id).await
- }
- Endpoint::DeleteObject { key, .. } => handle_delete(garage, bucket_id, &key).await,
- Endpoint::CreateMultipartUpload { key } => {
- handle_create_multipart_upload(garage, &req, &bucket_name, bucket_id, &key).await
- }
- Endpoint::CompleteMultipartUpload { key, upload_id } => {
- handle_complete_multipart_upload(
- garage,
- req,
- &bucket_name,
- bucket_id,
- &key,
- &upload_id,
- content_sha256,
- )
- .await
- }
- Endpoint::CreateBucket {} => unreachable!(),
- Endpoint::HeadBucket {} => {
- let empty_body: Body = Body::from(vec![]);
- let response = Response::builder().body(empty_body).unwrap();
- Ok(response)
- }
- Endpoint::DeleteBucket {} => {
- handle_delete_bucket(&garage, bucket_id, bucket_name, api_key).await
- }
- Endpoint::GetBucketLocation {} => handle_get_bucket_location(garage),
- Endpoint::GetBucketVersioning {} => handle_get_bucket_versioning(),
- Endpoint::ListObjects {
- delimiter,
- encoding_type,
- marker,
- max_keys,
- prefix,
- } => {
- handle_list(
- garage,
- &ListObjectsQuery {
- common: ListQueryCommon {
- bucket_name,
- bucket_id,
- delimiter: delimiter.map(|d| d.to_string()),
- page_size: max_keys.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
- prefix: prefix.unwrap_or_default(),
- urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false),
- },
- is_v2: false,
- marker,
- continuation_token: None,
- start_after: None,
- },
- )
- .await
- }
- Endpoint::ListObjectsV2 {
- delimiter,
- encoding_type,
- max_keys,
- prefix,
- continuation_token,
- start_after,
- list_type,
- ..
- } => {
- if list_type == "2" {
- handle_list(
- garage,
- &ListObjectsQuery {
- common: ListQueryCommon {
- bucket_name,
- bucket_id,
- delimiter: delimiter.map(|d| d.to_string()),
- page_size: max_keys.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
- urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false),
- prefix: prefix.unwrap_or_default(),
- },
- is_v2: true,
- marker: None,
- continuation_token,
- start_after,
- },
- )
- .await
- } else {
- Err(Error::BadRequest(format!(
- "Invalid endpoint: list-type={}",
- list_type
- )))
- }
- }
- Endpoint::ListMultipartUploads {
- delimiter,
- encoding_type,
- key_marker,
- max_uploads,
- prefix,
- upload_id_marker,
- } => {
- handle_list_multipart_upload(
- garage,
- &ListMultipartUploadsQuery {
- common: ListQueryCommon {
- bucket_name,
- bucket_id,
- delimiter: delimiter.map(|d| d.to_string()),
- page_size: max_uploads.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
- prefix: prefix.unwrap_or_default(),
- urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false),
- },
- key_marker,
- upload_id_marker,
- },
- )
- .await
- }
- Endpoint::ListParts {
- key,
- max_parts,
- part_number_marker,
- upload_id,
- } => {
- handle_list_parts(
- garage,
- &ListPartsQuery {
- bucket_name,
- bucket_id,
- key,
- upload_id,
- part_number_marker: part_number_marker.map(|p| p.clamp(1, 10000)),
- max_parts: max_parts.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
- },
- )
- .await
- }
- Endpoint::DeleteObjects {} => {
- handle_delete_objects(garage, bucket_id, req, content_sha256).await
- }
- Endpoint::GetBucketWebsite {} => handle_get_website(&bucket).await,
- Endpoint::PutBucketWebsite {} => {
- handle_put_website(garage, bucket_id, req, content_sha256).await
- }
- Endpoint::DeleteBucketWebsite {} => handle_delete_website(garage, bucket_id).await,
- Endpoint::GetBucketCors {} => handle_get_cors(&bucket).await,
- Endpoint::PutBucketCors {} => handle_put_cors(garage, bucket_id, req, content_sha256).await,
- Endpoint::DeleteBucketCors {} => handle_delete_cors(garage, bucket_id).await,
- endpoint => Err(Error::NotImplemented(endpoint.name().to_owned())),
- };
-
- // If request was a success and we have a CORS rule that applies to it,
- // add the corresponding CORS headers to the response
- let mut resp_ok = resp?;
- if let Some(rule) = matching_cors_rule {
- add_cors_headers(&mut resp_ok, rule)
- .ok_or_internal_error("Invalid bucket CORS configuration")?;
- }
-
- Ok(resp_ok)
-}
-
-async fn handle_request_without_bucket(
- garage: Arc<Garage>,
- _req: Request<Body>,
- api_key: Key,
- endpoint: Endpoint,
-) -> Result<Response<Body>, Error> {
- match endpoint {
- Endpoint::ListBuckets => handle_list_buckets(&garage, &api_key).await,
- endpoint => Err(Error::NotImplemented(endpoint.name().to_owned())),
- }
-}
-
-#[allow(clippy::ptr_arg)]
-pub async fn resolve_bucket(
- garage: &Garage,
- bucket_name: &String,
- api_key: &Key,
-) -> Result<Uuid, Error> {
- let api_key_params = api_key
- .state
- .as_option()
- .ok_or_internal_error("Key should not be deleted at this point")?;
-
- if let Some(Some(bucket_id)) = api_key_params.local_aliases.get(bucket_name) {
- Ok(*bucket_id)
- } else {
- Ok(garage
- .bucket_helper()
- .resolve_global_bucket_name(bucket_name)
- .await?
- .ok_or(Error::NoSuchBucket)?)
- }
-}
-
-/// Extract the bucket name and the key name from an HTTP path and possibly a bucket provided in
-/// the host header of the request
-///
-/// S3 internally manages only buckets and keys. This function splits
-/// an HTTP path to get the corresponding bucket name and key.
-pub fn parse_bucket_key<'a>(
- path: &'a str,
- host_bucket: Option<&'a str>,
-) -> Result<(&'a str, Option<&'a str>), Error> {
- let path = path.trim_start_matches('/');
-
- if let Some(bucket) = host_bucket {
- if !path.is_empty() {
- return Ok((bucket, Some(path)));
- } else {
- return Ok((bucket, None));
- }
- }
-
- let (bucket, key) = match path.find('/') {
- Some(i) => {
- let key = &path[i + 1..];
- if !key.is_empty() {
- (&path[..i], Some(key))
- } else {
- (&path[..i], None)
- }
- }
- None => (path, None),
- };
- if bucket.is_empty() {
- return Err(Error::BadRequest("No bucket specified".to_string()));
- }
- Ok((bucket, key))
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn parse_bucket_containing_a_key() -> Result<(), Error> {
- let (bucket, key) = parse_bucket_key("/my_bucket/a/super/file.jpg", None)?;
- assert_eq!(bucket, "my_bucket");
- assert_eq!(key.expect("key must be set"), "a/super/file.jpg");
- Ok(())
- }
-
- #[test]
- fn parse_bucket_containing_no_key() -> Result<(), Error> {
- let (bucket, key) = parse_bucket_key("/my_bucket/", None)?;
- assert_eq!(bucket, "my_bucket");
- assert!(key.is_none());
- let (bucket, key) = parse_bucket_key("/my_bucket", None)?;
- assert_eq!(bucket, "my_bucket");
- assert!(key.is_none());
- Ok(())
- }
-
- #[test]
- fn parse_bucket_containing_no_bucket() {
- let parsed = parse_bucket_key("", None);
- assert!(parsed.is_err());
- let parsed = parse_bucket_key("/", None);
- assert!(parsed.is_err());
- let parsed = parse_bucket_key("////", None);
- assert!(parsed.is_err());
- }
-
- #[test]
- fn parse_bucket_with_vhost_and_key() -> Result<(), Error> {
- let (bucket, key) = parse_bucket_key("/a/super/file.jpg", Some("my-bucket"))?;
- assert_eq!(bucket, "my-bucket");
- assert_eq!(key.expect("key must be set"), "a/super/file.jpg");
- Ok(())
- }
-
- #[test]
- fn parse_bucket_with_vhost_no_key() -> Result<(), Error> {
- let (bucket, key) = parse_bucket_key("", Some("my-bucket"))?;
- assert_eq!(bucket, "my-bucket");
- assert!(key.is_none());
- let (bucket, key) = parse_bucket_key("/", Some("my-bucket"))?;
- assert_eq!(bucket, "my-bucket");
- assert!(key.is_none());
- Ok(())
- }
-}
diff --git a/src/api/common_error.rs b/src/api/common_error.rs
new file mode 100644
index 00000000..20f9f266
--- /dev/null
+++ b/src/api/common_error.rs
@@ -0,0 +1,177 @@
+use err_derive::Error;
+use hyper::StatusCode;
+
+use garage_util::error::Error as GarageError;
+
+/// Errors shared by all the APIs of this crate
+#[derive(Debug, Error)]
+pub enum CommonError {
+ // ---- INTERNAL ERRORS ----
+ /// Error related to deeper parts of Garage
+ #[error(display = "Internal error: {}", _0)]
+ InternalError(#[error(source)] GarageError),
+
+ /// Error related to Hyper
+ #[error(display = "Internal error (Hyper error): {}", _0)]
+ Hyper(#[error(source)] hyper::Error),
+
+ /// Error related to HTTP
+ #[error(display = "Internal error (HTTP error): {}", _0)]
+ Http(#[error(source)] http::Error),
+
+ // ---- GENERIC CLIENT ERRORS ----
+ /// Proper authentication was not provided
+ #[error(display = "Forbidden: {}", _0)]
+ Forbidden(String),
+
+ /// Generic bad request response with custom message
+ #[error(display = "Bad request: {}", _0)]
+ BadRequest(String),
+
+ // ---- SPECIFIC ERROR CONDITIONS ----
+ // These have to be error codes referenced in the S3 spec here:
+ // https://docs.aws.amazon.com/AmazonS3/latest/API/ErrorResponses.html#ErrorCodeList
+	/// The requested bucket does not exist
+ #[error(display = "Bucket not found: {}", _0)]
+ NoSuchBucket(String),
+
+	/// Tried to create a bucket that already exists
+ #[error(display = "Bucket already exists")]
+ BucketAlreadyExists,
+
+ /// Tried to delete a non-empty bucket
+ #[error(display = "Tried to delete a non-empty bucket")]
+ BucketNotEmpty,
+
+ // Category: bad request
+ /// Bucket name is not valid according to AWS S3 specs
+ #[error(display = "Invalid bucket name: {}", _0)]
+ InvalidBucketName(String),
+}
+
+impl CommonError {
+ pub fn http_status_code(&self) -> StatusCode {
+ match self {
+ CommonError::InternalError(
+ GarageError::Timeout
+ | GarageError::RemoteError(_)
+ | GarageError::Quorum(_, _, _, _),
+ ) => StatusCode::SERVICE_UNAVAILABLE,
+ CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => {
+ StatusCode::INTERNAL_SERVER_ERROR
+ }
+ CommonError::BadRequest(_) => StatusCode::BAD_REQUEST,
+ CommonError::Forbidden(_) => StatusCode::FORBIDDEN,
+ CommonError::NoSuchBucket(_) => StatusCode::NOT_FOUND,
+ CommonError::BucketNotEmpty | CommonError::BucketAlreadyExists => StatusCode::CONFLICT,
+ CommonError::InvalidBucketName(_) => StatusCode::BAD_REQUEST,
+ }
+ }
+
+ pub fn aws_code(&self) -> &'static str {
+ match self {
+ CommonError::Forbidden(_) => "AccessDenied",
+ CommonError::InternalError(
+ GarageError::Timeout
+ | GarageError::RemoteError(_)
+ | GarageError::Quorum(_, _, _, _),
+ ) => "ServiceUnavailable",
+ CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => {
+ "InternalError"
+ }
+ CommonError::BadRequest(_) => "InvalidRequest",
+ CommonError::NoSuchBucket(_) => "NoSuchBucket",
+ CommonError::BucketAlreadyExists => "BucketAlreadyExists",
+ CommonError::BucketNotEmpty => "BucketNotEmpty",
+ CommonError::InvalidBucketName(_) => "InvalidBucketName",
+ }
+ }
+
+ pub fn bad_request<M: ToString>(msg: M) -> Self {
+ CommonError::BadRequest(msg.to_string())
+ }
+}
+
+pub trait CommonErrorDerivative: From<CommonError> {
+ fn internal_error<M: ToString>(msg: M) -> Self {
+ Self::from(CommonError::InternalError(GarageError::Message(
+ msg.to_string(),
+ )))
+ }
+
+ fn bad_request<M: ToString>(msg: M) -> Self {
+ Self::from(CommonError::BadRequest(msg.to_string()))
+ }
+
+ fn forbidden<M: ToString>(msg: M) -> Self {
+ Self::from(CommonError::Forbidden(msg.to_string()))
+ }
+}
+
+/// Trait to map error to the Bad Request error code
+pub trait OkOrBadRequest {
+ type S;
+ fn ok_or_bad_request<M: AsRef<str>>(self, reason: M) -> Result<Self::S, CommonError>;
+}
+
+impl<T, E> OkOrBadRequest for Result<T, E>
+where
+ E: std::fmt::Display,
+{
+ type S = T;
+ fn ok_or_bad_request<M: AsRef<str>>(self, reason: M) -> Result<T, CommonError> {
+ match self {
+ Ok(x) => Ok(x),
+ Err(e) => Err(CommonError::BadRequest(format!(
+ "{}: {}",
+ reason.as_ref(),
+ e
+ ))),
+ }
+ }
+}
+
+impl<T> OkOrBadRequest for Option<T> {
+ type S = T;
+ fn ok_or_bad_request<M: AsRef<str>>(self, reason: M) -> Result<T, CommonError> {
+ match self {
+ Some(x) => Ok(x),
+ None => Err(CommonError::BadRequest(reason.as_ref().to_string())),
+ }
+ }
+}
+
+/// Trait to map an error to an Internal Error code
+pub trait OkOrInternalError {
+ type S;
+ fn ok_or_internal_error<M: AsRef<str>>(self, reason: M) -> Result<Self::S, CommonError>;
+}
+
+impl<T, E> OkOrInternalError for Result<T, E>
+where
+ E: std::fmt::Display,
+{
+ type S = T;
+ fn ok_or_internal_error<M: AsRef<str>>(self, reason: M) -> Result<T, CommonError> {
+ match self {
+ Ok(x) => Ok(x),
+ Err(e) => Err(CommonError::InternalError(GarageError::Message(format!(
+ "{}: {}",
+ reason.as_ref(),
+ e
+ )))),
+ }
+ }
+}
+
+impl<T> OkOrInternalError for Option<T> {
+ type S = T;
+ fn ok_or_internal_error<M: AsRef<str>>(self, reason: M) -> Result<T, CommonError> {
+ match self {
+ Some(x) => Ok(x),
+ None => Err(CommonError::InternalError(GarageError::Message(
+ reason.as_ref().to_string(),
+ ))),
+ }
+ }
+}
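
The two ok_or_* traits are blanket-implemented for Option and for any Result whose error type implements Display, so handlers can attach an HTTP-level meaning to a failure inline. A minimal sketch of a call site; parse_limit is a hypothetical helper, not part of this diff:

use crate::common_error::{CommonError, OkOrBadRequest};

fn parse_limit(query_param: Option<&str>) -> Result<u64, CommonError> {
	// Option -> 400 Bad Request carrying the given reason
	let raw = query_param.ok_or_bad_request("Missing 'limit' parameter")?;
	// Result<_, E: Display> -> 400 Bad Request, reason prefixed to the inner error
	raw.parse().ok_or_bad_request("Invalid 'limit' parameter")
}

ok_or_internal_error works the same way but maps the failure to a 500 through GarageError::Message.
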
diff --git a/src/api/generic_server.rs b/src/api/generic_server.rs
new file mode 100644
index 00000000..62fe4e5a
--- /dev/null
+++ b/src/api/generic_server.rs
@@ -0,0 +1,211 @@
+use std::net::SocketAddr;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+
+use futures::future::Future;
+
+use hyper::header::HeaderValue;
+use hyper::server::conn::AddrStream;
+use hyper::service::{make_service_fn, service_fn};
+use hyper::{Body, Request, Response, Server};
+use hyper::{HeaderMap, StatusCode};
+
+use opentelemetry::{
+ global,
+ metrics::{Counter, ValueRecorder},
+ trace::{FutureExt, SpanRef, TraceContextExt, Tracer},
+ Context, KeyValue,
+};
+
+use garage_util::error::Error as GarageError;
+use garage_util::metrics::{gen_trace_id, RecordDuration};
+
+pub(crate) trait ApiEndpoint: Send + Sync + 'static {
+ fn name(&self) -> &'static str;
+ fn add_span_attributes(&self, span: SpanRef<'_>);
+}
+
+pub trait ApiError: std::error::Error + Send + Sync + 'static {
+ fn http_status_code(&self) -> StatusCode;
+ fn add_http_headers(&self, header_map: &mut HeaderMap<HeaderValue>);
+ fn http_body(&self, garage_region: &str, path: &str) -> Body;
+}
+
+#[async_trait]
+pub(crate) trait ApiHandler: Send + Sync + 'static {
+ const API_NAME: &'static str;
+ const API_NAME_DISPLAY: &'static str;
+
+ type Endpoint: ApiEndpoint;
+ type Error: ApiError;
+
+ fn parse_endpoint(&self, r: &Request<Body>) -> Result<Self::Endpoint, Self::Error>;
+ async fn handle(
+ &self,
+ req: Request<Body>,
+ endpoint: Self::Endpoint,
+ ) -> Result<Response<Body>, Self::Error>;
+}
+
+pub(crate) struct ApiServer<A: ApiHandler> {
+ region: String,
+ api_handler: A,
+
+ // Metrics
+ request_counter: Counter<u64>,
+ error_counter: Counter<u64>,
+ request_duration: ValueRecorder<f64>,
+}
+
+impl<A: ApiHandler> ApiServer<A> {
+ pub fn new(region: String, api_handler: A) -> Arc<Self> {
+ let meter = global::meter("garage/api");
+ Arc::new(Self {
+ region,
+ api_handler,
+ request_counter: meter
+ .u64_counter(format!("api.{}.request_counter", A::API_NAME))
+ .with_description(format!(
+ "Number of API calls to the various {} API endpoints",
+ A::API_NAME_DISPLAY
+ ))
+ .init(),
+ error_counter: meter
+ .u64_counter(format!("api.{}.error_counter", A::API_NAME))
+ .with_description(format!(
+ "Number of API calls to the various {} API endpoints that resulted in errors",
+ A::API_NAME_DISPLAY
+ ))
+ .init(),
+ request_duration: meter
+ .f64_value_recorder(format!("api.{}.request_duration", A::API_NAME))
+ .with_description(format!(
+ "Duration of API calls to the various {} API endpoints",
+ A::API_NAME_DISPLAY
+ ))
+ .init(),
+ })
+ }
+
+ pub async fn run_server(
+ self: Arc<Self>,
+ bind_addr: SocketAddr,
+ shutdown_signal: impl Future<Output = ()>,
+ ) -> Result<(), GarageError> {
+ let service = make_service_fn(|conn: &AddrStream| {
+ let this = self.clone();
+
+ let client_addr = conn.remote_addr();
+ async move {
+ Ok::<_, GarageError>(service_fn(move |req: Request<Body>| {
+ let this = this.clone();
+
+ this.handler(req, client_addr)
+ }))
+ }
+ });
+
+ let server = Server::bind(&bind_addr).serve(service);
+
+ let graceful = server.with_graceful_shutdown(shutdown_signal);
+ info!(
+ "{} API server listening on http://{}",
+ A::API_NAME_DISPLAY,
+ bind_addr
+ );
+
+ graceful.await?;
+ Ok(())
+ }
+
+ async fn handler(
+ self: Arc<Self>,
+ req: Request<Body>,
+ addr: SocketAddr,
+ ) -> Result<Response<Body>, GarageError> {
+ let uri = req.uri().clone();
+ info!("{} {} {}", addr, req.method(), uri);
+ debug!("{:?}", req);
+
+ let tracer = opentelemetry::global::tracer("garage");
+ let span = tracer
+ .span_builder(format!("{} API call (unknown)", A::API_NAME_DISPLAY))
+ .with_trace_id(gen_trace_id())
+ .with_attributes(vec![
+ KeyValue::new("method", format!("{}", req.method())),
+ KeyValue::new("uri", req.uri().to_string()),
+ ])
+ .start(&tracer);
+
+ let res = self
+ .handler_stage2(req)
+ .with_context(Context::current_with_span(span))
+ .await;
+
+ match res {
+ Ok(x) => {
+ debug!("{} {:?}", x.status(), x.headers());
+ Ok(x)
+ }
+ Err(e) => {
+ let body: Body = e.http_body(&self.region, uri.path());
+ let mut http_error_builder = Response::builder().status(e.http_status_code());
+
+ if let Some(header_map) = http_error_builder.headers_mut() {
+ e.add_http_headers(header_map)
+ }
+
+ let http_error = http_error_builder.body(body)?;
+
+ if e.http_status_code().is_server_error() {
+ warn!("Response: error {}, {}", e.http_status_code(), e);
+ } else {
+ info!("Response: error {}, {}", e.http_status_code(), e);
+ }
+ Ok(http_error)
+ }
+ }
+ }
+
+ async fn handler_stage2(&self, req: Request<Body>) -> Result<Response<Body>, A::Error> {
+ let endpoint = self.api_handler.parse_endpoint(&req)?;
+ debug!("Endpoint: {}", endpoint.name());
+
+ let current_context = Context::current();
+ let current_span = current_context.span();
+ current_span.update_name::<String>(format!(
+ "{} API {}",
+ A::API_NAME_DISPLAY,
+ endpoint.name()
+ ));
+ current_span.set_attribute(KeyValue::new("endpoint", endpoint.name()));
+ endpoint.add_span_attributes(current_span);
+
+ let metrics_tags = &[KeyValue::new("api_endpoint", endpoint.name())];
+
+ let res = self
+ .api_handler
+ .handle(req, endpoint)
+ .record_duration(&self.request_duration, &metrics_tags[..])
+ .await;
+
+ self.request_counter.add(1, &metrics_tags[..]);
+
+ let status_code = match &res {
+ Ok(r) => r.status(),
+ Err(e) => e.http_status_code(),
+ };
+ if status_code.is_client_error() || status_code.is_server_error() {
+ self.error_counter.add(
+ 1,
+ &[
+ metrics_tags[0].clone(),
+ KeyValue::new("status_code", status_code.as_str().to_string()),
+ ],
+ );
+ }
+
+ res
+ }
+}
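
To make the ApiHandler contract concrete, here is a hedged in-crate sketch of the smallest possible API built on ApiServer; everything named Ping* is hypothetical, while the real S3 and K2V handlers added below follow the same shape:

use async_trait::async_trait;
use hyper::header::HeaderValue;
use hyper::{Body, HeaderMap, Request, Response, StatusCode};
use opentelemetry::trace::SpanRef;

use crate::generic_server::{ApiEndpoint, ApiError, ApiHandler};

#[derive(Debug)]
struct PingError;

impl std::fmt::Display for PingError {
	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
		write!(f, "ping failed")
	}
}

impl std::error::Error for PingError {}

impl ApiError for PingError {
	fn http_status_code(&self) -> StatusCode {
		StatusCode::INTERNAL_SERVER_ERROR
	}
	fn add_http_headers(&self, _header_map: &mut HeaderMap<HeaderValue>) {}
	fn http_body(&self, _garage_region: &str, _path: &str) -> Body {
		Body::from("ping failed")
	}
}

struct PingEndpoint;

impl ApiEndpoint for PingEndpoint {
	fn name(&self) -> &'static str {
		"Ping"
	}
	fn add_span_attributes(&self, _span: SpanRef<'_>) {}
}

struct PingApiServer;

#[async_trait]
impl ApiHandler for PingApiServer {
	// API_NAME determines the metric names: api.ping.request_counter, etc.
	const API_NAME: &'static str = "ping";
	// API_NAME_DISPLAY appears in log lines and trace span names
	const API_NAME_DISPLAY: &'static str = "Ping";

	type Endpoint = PingEndpoint;
	type Error = PingError;

	fn parse_endpoint(&self, _req: &Request<Body>) -> Result<PingEndpoint, PingError> {
		Ok(PingEndpoint)
	}

	async fn handle(
		&self,
		_req: Request<Body>,
		_endpoint: PingEndpoint,
	) -> Result<Response<Body>, PingError> {
		Ok(Response::new(Body::from("pong")))
	}
}

Running it then reduces to ApiServer::new(region, PingApiServer).run_server(bind_addr, shutdown_signal).await, which wires in logging, tracing and the three per-endpoint metrics for free.
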
diff --git a/src/api/helpers.rs b/src/api/helpers.rs
index c2709bb3..642dbc42 100644
--- a/src/api/helpers.rs
+++ b/src/api/helpers.rs
@@ -1,5 +1,21 @@
-use crate::Error;
+use hyper::{Body, Request, Response};
use idna::domain_to_unicode;
+use serde::{Deserialize, Serialize};
+
+use crate::common_error::{CommonError as Error, *};
+
+/// What kind of authorization is required to perform a given action
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Authorization {
+ /// No authorization is required
+ None,
+ /// Having Read permission on bucket
+ Read,
+ /// Having Write permission on bucket
+ Write,
+ /// Having Owner permission on bucket
+ Owner,
+}
/// Host to bucket
///
@@ -31,7 +47,7 @@ pub fn authority_to_host(authority: &str) -> Result<String, Error> {
let mut iter = authority.chars().enumerate();
let (_, first_char) = iter
.next()
- .ok_or_else(|| Error::BadRequest("Authority is empty".to_string()))?;
+ .ok_or_else(|| Error::bad_request("Authority is empty".to_string()))?;
let split = match first_char {
'[' => {
@@ -39,7 +55,7 @@ pub fn authority_to_host(authority: &str) -> Result<String, Error> {
match iter.next() {
Some((_, ']')) => iter.next(),
_ => {
- return Err(Error::BadRequest(format!(
+ return Err(Error::bad_request(format!(
"Authority {} has an illegal format",
authority
)))
@@ -52,7 +68,7 @@ pub fn authority_to_host(authority: &str) -> Result<String, Error> {
let authority = match split {
Some((i, ':')) => Ok(&authority[..i]),
None => Ok(authority),
- Some((_, _)) => Err(Error::BadRequest(format!(
+ Some((_, _)) => Err(Error::bad_request(format!(
"Authority {} has an illegal format",
authority
))),
@@ -60,11 +76,135 @@ pub fn authority_to_host(authority: &str) -> Result<String, Error> {
authority.map(|h| domain_to_unicode(h).0)
}
+/// Extract the bucket name and the key name from an HTTP path and possibly a bucket provided in
+/// the host header of the request
+///
+/// S3 internally manages only buckets and keys. This function splits
+/// an HTTP path to get the corresponding bucket name and key.
+pub fn parse_bucket_key<'a>(
+ path: &'a str,
+ host_bucket: Option<&'a str>,
+) -> Result<(&'a str, Option<&'a str>), Error> {
+ let path = path.trim_start_matches('/');
+
+ if let Some(bucket) = host_bucket {
+ if !path.is_empty() {
+ return Ok((bucket, Some(path)));
+ } else {
+ return Ok((bucket, None));
+ }
+ }
+
+ let (bucket, key) = match path.find('/') {
+ Some(i) => {
+ let key = &path[i + 1..];
+ if !key.is_empty() {
+ (&path[..i], Some(key))
+ } else {
+ (&path[..i], None)
+ }
+ }
+ None => (path, None),
+ };
+ if bucket.is_empty() {
+ return Err(Error::bad_request("No bucket specified"));
+ }
+ Ok((bucket, key))
+}
+
+const UTF8_BEFORE_LAST_CHAR: char = '\u{10FFFE}';
+
+/// Compute the smallest key that sorts strictly after every key starting with `pfx`, if one exists
+pub fn key_after_prefix(pfx: &str) -> Option<String> {
+ let mut next = pfx.to_string();
+ while !next.is_empty() {
+ let tail = next.pop().unwrap();
+ if tail >= char::MAX {
+ continue;
+ }
+
+		// Circumvent a limitation of RangeFrom that overflows earlier than needed
+ // See: https://doc.rust-lang.org/core/ops/struct.RangeFrom.html
+ let new_tail = if tail == UTF8_BEFORE_LAST_CHAR {
+ char::MAX
+ } else {
+ (tail..).nth(1).unwrap()
+ };
+
+ next.push(new_tail);
+ return Some(next);
+ }
+
+ None
+}
+
+pub async fn parse_json_body<T: for<'de> Deserialize<'de>>(req: Request<Body>) -> Result<T, Error> {
+ let body = hyper::body::to_bytes(req.into_body()).await?;
+ let resp: T = serde_json::from_slice(&body).ok_or_bad_request("Invalid JSON")?;
+ Ok(resp)
+}
+
+pub fn json_ok_response<T: Serialize>(res: &T) -> Result<Response<Body>, Error> {
+ let resp_json = serde_json::to_string_pretty(res).map_err(garage_util::error::Error::from)?;
+ Ok(Response::builder()
+ .status(hyper::StatusCode::OK)
+ .header(http::header::CONTENT_TYPE, "application/json")
+ .body(Body::from(resp_json))?)
+}
+
#[cfg(test)]
mod tests {
use super::*;
#[test]
+ fn parse_bucket_containing_a_key() -> Result<(), Error> {
+ let (bucket, key) = parse_bucket_key("/my_bucket/a/super/file.jpg", None)?;
+ assert_eq!(bucket, "my_bucket");
+ assert_eq!(key.expect("key must be set"), "a/super/file.jpg");
+ Ok(())
+ }
+
+ #[test]
+ fn parse_bucket_containing_no_key() -> Result<(), Error> {
+ let (bucket, key) = parse_bucket_key("/my_bucket/", None)?;
+ assert_eq!(bucket, "my_bucket");
+ assert!(key.is_none());
+ let (bucket, key) = parse_bucket_key("/my_bucket", None)?;
+ assert_eq!(bucket, "my_bucket");
+ assert!(key.is_none());
+ Ok(())
+ }
+
+ #[test]
+ fn parse_bucket_containing_no_bucket() {
+ let parsed = parse_bucket_key("", None);
+ assert!(parsed.is_err());
+ let parsed = parse_bucket_key("/", None);
+ assert!(parsed.is_err());
+ let parsed = parse_bucket_key("////", None);
+ assert!(parsed.is_err());
+ }
+
+ #[test]
+ fn parse_bucket_with_vhost_and_key() -> Result<(), Error> {
+ let (bucket, key) = parse_bucket_key("/a/super/file.jpg", Some("my-bucket"))?;
+ assert_eq!(bucket, "my-bucket");
+ assert_eq!(key.expect("key must be set"), "a/super/file.jpg");
+ Ok(())
+ }
+
+ #[test]
+ fn parse_bucket_with_vhost_no_key() -> Result<(), Error> {
+ let (bucket, key) = parse_bucket_key("", Some("my-bucket"))?;
+ assert_eq!(bucket, "my-bucket");
+ assert!(key.is_none());
+ let (bucket, key) = parse_bucket_key("/", Some("my-bucket"))?;
+ assert_eq!(bucket, "my-bucket");
+ assert!(key.is_none());
+ Ok(())
+ }
+
+ #[test]
fn authority_to_host_with_port() -> Result<(), Error> {
let domain = authority_to_host("[::1]:3902")?;
assert_eq!(domain, "[::1]");
@@ -111,4 +251,47 @@ mod tests {
assert_eq!(host_to_bucket("not-garage.tld", "garage.tld"), None);
assert_eq!(host_to_bucket("not-garage.tld", ".garage.tld"), None);
}
+
+ #[test]
+ fn test_key_after_prefix() {
+ use std::iter::FromIterator;
+
+ assert_eq!(UTF8_BEFORE_LAST_CHAR as u32, (char::MAX as u32) - 1);
+ assert_eq!(key_after_prefix("a/b/").unwrap().as_str(), "a/b0");
+ assert_eq!(key_after_prefix("€").unwrap().as_str(), "₭");
+ assert_eq!(
+ key_after_prefix("􏿽").unwrap().as_str(),
+ String::from(char::from_u32(0x10FFFE).unwrap())
+ );
+
+ // When the last character is the biggest UTF8 char
+ let a = String::from_iter(['a', char::MAX].iter());
+ assert_eq!(key_after_prefix(a.as_str()).unwrap().as_str(), "b");
+
+ // When all characters are the biggest UTF8 char
+ let b = String::from_iter([char::MAX; 3].iter());
+ assert!(key_after_prefix(b.as_str()).is_none());
+
+		// Check that the successor of U+D7FF skips over the surrogate range
+ let c = String::from('\u{D7FF}');
+ assert_eq!(
+ key_after_prefix(c.as_str()).unwrap().as_str(),
+ String::from('\u{E000}')
+ );
+
+ // Check the character before the biggest one
+ let d = String::from('\u{10FFFE}');
+ assert_eq!(
+ key_after_prefix(d.as_str()).unwrap().as_str(),
+ String::from(char::MAX)
+ );
+ }
+}
+
+#[derive(Serialize)]
+pub(crate) struct CustomApiErrorBody {
+ pub(crate) code: String,
+ pub(crate) message: String,
+ pub(crate) region: String,
+ pub(crate) path: String,
}
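
The two JSON helpers are designed to be used as a pair in custom-API handlers: parse the request with a 400 on malformed JSON, then answer with a typed 200. A small sketch, assuming a hypothetical Echo request/response type:

use hyper::{Body, Request, Response};
use serde::{Deserialize, Serialize};

use crate::common_error::CommonError;
use crate::helpers::{json_ok_response, parse_json_body};

#[derive(Deserialize, Serialize)]
struct Echo {
	message: String,
}

async fn handle_echo(req: Request<Body>) -> Result<Response<Body>, CommonError> {
	// Malformed JSON is rejected here with a 400 Bad Request
	let body: Echo = parse_json_body(req).await?;
	// Serializes the value and sets Content-Type: application/json
	json_ok_response(&body)
}
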
diff --git a/src/api/k2v/api_server.rs b/src/api/k2v/api_server.rs
new file mode 100644
index 00000000..084867b5
--- /dev/null
+++ b/src/api/k2v/api_server.rs
@@ -0,0 +1,190 @@
+use std::net::SocketAddr;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+
+use futures::future::Future;
+use hyper::{Body, Method, Request, Response};
+
+use opentelemetry::{trace::SpanRef, KeyValue};
+
+use garage_util::error::Error as GarageError;
+
+use garage_model::garage::Garage;
+
+use crate::generic_server::*;
+use crate::k2v::error::*;
+
+use crate::signature::payload::check_payload_signature;
+use crate::signature::streaming::*;
+
+use crate::helpers::*;
+use crate::k2v::batch::*;
+use crate::k2v::index::*;
+use crate::k2v::item::*;
+use crate::k2v::router::Endpoint;
+use crate::s3::cors::*;
+
+pub struct K2VApiServer {
+ garage: Arc<Garage>,
+}
+
+pub(crate) struct K2VApiEndpoint {
+ bucket_name: String,
+ endpoint: Endpoint,
+}
+
+impl K2VApiServer {
+ pub async fn run(
+ garage: Arc<Garage>,
+ bind_addr: SocketAddr,
+ s3_region: String,
+ shutdown_signal: impl Future<Output = ()>,
+ ) -> Result<(), GarageError> {
+ ApiServer::new(s3_region, K2VApiServer { garage })
+ .run_server(bind_addr, shutdown_signal)
+ .await
+ }
+}
+
+#[async_trait]
+impl ApiHandler for K2VApiServer {
+ const API_NAME: &'static str = "k2v";
+ const API_NAME_DISPLAY: &'static str = "K2V";
+
+ type Endpoint = K2VApiEndpoint;
+ type Error = Error;
+
+ fn parse_endpoint(&self, req: &Request<Body>) -> Result<K2VApiEndpoint, Error> {
+ let (endpoint, bucket_name) = Endpoint::from_request(req)?;
+
+ Ok(K2VApiEndpoint {
+ bucket_name,
+ endpoint,
+ })
+ }
+
+ async fn handle(
+ &self,
+ req: Request<Body>,
+ endpoint: K2VApiEndpoint,
+ ) -> Result<Response<Body>, Error> {
+ let K2VApiEndpoint {
+ bucket_name,
+ endpoint,
+ } = endpoint;
+ let garage = self.garage.clone();
+
+		// The OPTIONS method is processed early, before we even check for an API key
+ if let Endpoint::Options = endpoint {
+ return Ok(handle_options_s3api(garage, &req, Some(bucket_name))
+ .await
+ .ok_or_bad_request("Error handling OPTIONS")?);
+ }
+
+ let (api_key, mut content_sha256) = check_payload_signature(&garage, "k2v", &req).await?;
+ let api_key = api_key
+ .ok_or_else(|| Error::forbidden("Garage does not support anonymous access yet"))?;
+
+ let req = parse_streaming_body(
+ &api_key,
+ req,
+ &mut content_sha256,
+ &garage.config.s3_api.s3_region,
+ "k2v",
+ )?;
+
+ let bucket_id = garage
+ .bucket_helper()
+ .resolve_bucket(&bucket_name, &api_key)
+ .await?;
+ let bucket = garage
+ .bucket_helper()
+ .get_existing_bucket(bucket_id)
+ .await?;
+
+ let allowed = match endpoint.authorization_type() {
+ Authorization::Read => api_key.allow_read(&bucket_id),
+ Authorization::Write => api_key.allow_write(&bucket_id),
+ Authorization::Owner => api_key.allow_owner(&bucket_id),
+ _ => unreachable!(),
+ };
+
+ if !allowed {
+ return Err(Error::forbidden("Operation is not allowed for this key."));
+ }
+
+		// Look up what CORS rule might apply to the response.
+		// Requests with methods other than GET, HEAD or POST
+		// are always preflighted, i.e. the browser makes a
+		// preliminary OPTIONS call to check that the request is allowed
+ let matching_cors_rule = match *req.method() {
+ Method::GET | Method::HEAD | Method::POST => find_matching_cors_rule(&bucket, &req)
+ .ok_or_internal_error("Error looking up CORS rule")?,
+ _ => None,
+ };
+
+ let resp = match endpoint {
+ Endpoint::DeleteItem {
+ partition_key,
+ sort_key,
+ } => handle_delete_item(garage, req, bucket_id, &partition_key, &sort_key).await,
+ Endpoint::InsertItem {
+ partition_key,
+ sort_key,
+ } => handle_insert_item(garage, req, bucket_id, &partition_key, &sort_key).await,
+ Endpoint::ReadItem {
+ partition_key,
+ sort_key,
+ } => handle_read_item(garage, &req, bucket_id, &partition_key, &sort_key).await,
+ Endpoint::PollItem {
+ partition_key,
+ sort_key,
+ causality_token,
+ timeout,
+ } => {
+ handle_poll_item(
+ garage,
+ &req,
+ bucket_id,
+ partition_key,
+ sort_key,
+ causality_token,
+ timeout,
+ )
+ .await
+ }
+ Endpoint::ReadIndex {
+ prefix,
+ start,
+ end,
+ limit,
+ reverse,
+ } => handle_read_index(garage, bucket_id, prefix, start, end, limit, reverse).await,
+ Endpoint::InsertBatch {} => handle_insert_batch(garage, bucket_id, req).await,
+ Endpoint::ReadBatch {} => handle_read_batch(garage, bucket_id, req).await,
+ Endpoint::DeleteBatch {} => handle_delete_batch(garage, bucket_id, req).await,
+ Endpoint::Options => unreachable!(),
+ };
+
+ // If request was a success and we have a CORS rule that applies to it,
+ // add the corresponding CORS headers to the response
+ let mut resp_ok = resp?;
+ if let Some(rule) = matching_cors_rule {
+ add_cors_headers(&mut resp_ok, rule)
+ .ok_or_internal_error("Invalid bucket CORS configuration")?;
+ }
+
+ Ok(resp_ok)
+ }
+}
+
+impl ApiEndpoint for K2VApiEndpoint {
+ fn name(&self) -> &'static str {
+ self.endpoint.name()
+ }
+
+ fn add_span_attributes(&self, span: SpanRef<'_>) {
+ span.set_attribute(KeyValue::new("bucket", self.bucket_name.clone()));
+ }
+}
diff --git a/src/api/k2v/batch.rs b/src/api/k2v/batch.rs
new file mode 100644
index 00000000..db9901cf
--- /dev/null
+++ b/src/api/k2v/batch.rs
@@ -0,0 +1,363 @@
+use std::sync::Arc;
+
+use hyper::{Body, Request, Response, StatusCode};
+use serde::{Deserialize, Serialize};
+
+use garage_util::data::*;
+use garage_util::error::Error as GarageError;
+
+use garage_table::{EnumerationOrder, TableSchema};
+
+use garage_model::garage::Garage;
+use garage_model::k2v::causality::*;
+use garage_model::k2v::item_table::*;
+
+use crate::helpers::*;
+use crate::k2v::error::*;
+use crate::k2v::range::read_range;
+
+pub async fn handle_insert_batch(
+ garage: Arc<Garage>,
+ bucket_id: Uuid,
+ req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+ let items = parse_json_body::<Vec<InsertBatchItem>>(req).await?;
+
+ let mut items2 = vec![];
+ for it in items {
+ let ct = it
+ .ct
+ .map(|s| CausalContext::parse(&s))
+ .transpose()
+ .ok_or_bad_request("Invalid causality token")?;
+ let v = match it.v {
+ Some(vs) => {
+ DvvsValue::Value(base64::decode(vs).ok_or_bad_request("Invalid base64 value")?)
+ }
+ None => DvvsValue::Deleted,
+ };
+ items2.push((it.pk, it.sk, ct, v));
+ }
+
+ garage.k2v.rpc.insert_batch(bucket_id, items2).await?;
+
+ Ok(Response::builder()
+ .status(StatusCode::OK)
+ .body(Body::empty())?)
+}
+
+pub async fn handle_read_batch(
+ garage: Arc<Garage>,
+ bucket_id: Uuid,
+ req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+ let queries = parse_json_body::<Vec<ReadBatchQuery>>(req).await?;
+
+ let resp_results = futures::future::join_all(
+ queries
+ .into_iter()
+ .map(|q| handle_read_batch_query(&garage, bucket_id, q)),
+ )
+ .await;
+
+ let mut resps: Vec<ReadBatchResponse> = vec![];
+ for resp in resp_results {
+ resps.push(resp?);
+ }
+
+ let resp_json = serde_json::to_string_pretty(&resps).map_err(GarageError::from)?;
+ Ok(Response::builder()
+ .status(StatusCode::OK)
+ .body(Body::from(resp_json))?)
+}
+
+async fn handle_read_batch_query(
+ garage: &Arc<Garage>,
+ bucket_id: Uuid,
+ query: ReadBatchQuery,
+) -> Result<ReadBatchResponse, Error> {
+ let partition = K2VItemPartition {
+ bucket_id,
+ partition_key: query.partition_key.clone(),
+ };
+
+ let filter = ItemFilter {
+ exclude_only_tombstones: !query.tombstones,
+ conflicts_only: query.conflicts_only,
+ };
+
+ let (items, more, next_start) = if query.single_item {
+ if query.prefix.is_some() || query.end.is_some() || query.limit.is_some() || query.reverse {
+ return Err(Error::bad_request("Batch query parameters 'prefix', 'end', 'limit' and 'reverse' must not be set when singleItem is true."));
+ }
+ let sk = query
+ .start
+ .as_ref()
+ .ok_or_bad_request("start should be specified if single_item is set")?;
+ let item = garage
+ .k2v
+ .item_table
+ .get(&partition, sk)
+ .await?
+ .filter(|e| K2VItemTable::matches_filter(e, &filter));
+ match item {
+ Some(i) => (vec![ReadBatchResponseItem::from(i)], false, None),
+ None => (vec![], false, None),
+ }
+ } else {
+ let (items, more, next_start) = read_range(
+ &garage.k2v.item_table,
+ &partition,
+ &query.prefix,
+ &query.start,
+ &query.end,
+ query.limit,
+ Some(filter),
+ EnumerationOrder::from_reverse(query.reverse),
+ )
+ .await?;
+
+ let items = items
+ .into_iter()
+ .map(ReadBatchResponseItem::from)
+ .collect::<Vec<_>>();
+
+ (items, more, next_start)
+ };
+
+ Ok(ReadBatchResponse {
+ partition_key: query.partition_key,
+ prefix: query.prefix,
+ start: query.start,
+ end: query.end,
+ limit: query.limit,
+ reverse: query.reverse,
+ single_item: query.single_item,
+ conflicts_only: query.conflicts_only,
+ tombstones: query.tombstones,
+ items,
+ more,
+ next_start,
+ })
+}
+
+pub async fn handle_delete_batch(
+ garage: Arc<Garage>,
+ bucket_id: Uuid,
+ req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+ let queries = parse_json_body::<Vec<DeleteBatchQuery>>(req).await?;
+
+ let resp_results = futures::future::join_all(
+ queries
+ .into_iter()
+ .map(|q| handle_delete_batch_query(&garage, bucket_id, q)),
+ )
+ .await;
+
+ let mut resps: Vec<DeleteBatchResponse> = vec![];
+ for resp in resp_results {
+ resps.push(resp?);
+ }
+
+ let resp_json = serde_json::to_string_pretty(&resps).map_err(GarageError::from)?;
+ Ok(Response::builder()
+ .status(StatusCode::OK)
+ .body(Body::from(resp_json))?)
+}
+
+async fn handle_delete_batch_query(
+ garage: &Arc<Garage>,
+ bucket_id: Uuid,
+ query: DeleteBatchQuery,
+) -> Result<DeleteBatchResponse, Error> {
+ let partition = K2VItemPartition {
+ bucket_id,
+ partition_key: query.partition_key.clone(),
+ };
+
+ let filter = ItemFilter {
+ exclude_only_tombstones: true,
+ conflicts_only: false,
+ };
+
+ let deleted_items = if query.single_item {
+ if query.prefix.is_some() || query.end.is_some() {
+ return Err(Error::bad_request("Batch query parameters 'prefix' and 'end' must not be set when singleItem is true."));
+ }
+ let sk = query
+ .start
+ .as_ref()
+ .ok_or_bad_request("start should be specified if single_item is set")?;
+ let item = garage
+ .k2v
+ .item_table
+ .get(&partition, sk)
+ .await?
+ .filter(|e| K2VItemTable::matches_filter(e, &filter));
+ match item {
+ Some(i) => {
+ let cc = i.causal_context();
+ garage
+ .k2v
+ .rpc
+ .insert(
+ bucket_id,
+ i.partition.partition_key,
+ i.sort_key,
+ Some(cc),
+ DvvsValue::Deleted,
+ )
+ .await?;
+ 1
+ }
+ None => 0,
+ }
+ } else {
+ let (items, more, _next_start) = read_range(
+ &garage.k2v.item_table,
+ &partition,
+ &query.prefix,
+ &query.start,
+ &query.end,
+ None,
+ Some(filter),
+ EnumerationOrder::Forward,
+ )
+ .await?;
+ assert!(!more);
+
+		// Delete all matching items by overwriting them with tombstones
+ let items = items
+ .into_iter()
+ .map(|i| {
+ let cc = i.causal_context();
+ (
+ i.partition.partition_key,
+ i.sort_key,
+ Some(cc),
+ DvvsValue::Deleted,
+ )
+ })
+ .collect::<Vec<_>>();
+ let n = items.len();
+
+ garage.k2v.rpc.insert_batch(bucket_id, items).await?;
+
+ n
+ };
+
+ Ok(DeleteBatchResponse {
+ partition_key: query.partition_key,
+ prefix: query.prefix,
+ start: query.start,
+ end: query.end,
+ single_item: query.single_item,
+ deleted_items,
+ })
+}
+
+#[derive(Deserialize)]
+struct InsertBatchItem {
+ pk: String,
+ sk: String,
+ ct: Option<String>,
+ v: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct ReadBatchQuery {
+ #[serde(rename = "partitionKey")]
+ partition_key: String,
+ #[serde(default)]
+ prefix: Option<String>,
+ #[serde(default)]
+ start: Option<String>,
+ #[serde(default)]
+ end: Option<String>,
+ #[serde(default)]
+ limit: Option<u64>,
+ #[serde(default)]
+ reverse: bool,
+ #[serde(default, rename = "singleItem")]
+ single_item: bool,
+ #[serde(default, rename = "conflictsOnly")]
+ conflicts_only: bool,
+ #[serde(default)]
+ tombstones: bool,
+}
+
+#[derive(Serialize)]
+struct ReadBatchResponse {
+ #[serde(rename = "partitionKey")]
+ partition_key: String,
+ prefix: Option<String>,
+ start: Option<String>,
+ end: Option<String>,
+ limit: Option<u64>,
+ reverse: bool,
+ #[serde(rename = "singleItem")]
+ single_item: bool,
+ #[serde(rename = "conflictsOnly")]
+ conflicts_only: bool,
+ tombstones: bool,
+
+ items: Vec<ReadBatchResponseItem>,
+ more: bool,
+ #[serde(rename = "nextStart")]
+ next_start: Option<String>,
+}
+
+#[derive(Serialize)]
+struct ReadBatchResponseItem {
+ sk: String,
+ ct: String,
+ v: Vec<Option<String>>,
+}
+
+impl ReadBatchResponseItem {
+ fn from(i: K2VItem) -> Self {
+ let ct = i.causal_context().serialize();
+ let v = i
+ .values()
+ .iter()
+ .map(|v| match v {
+ DvvsValue::Value(x) => Some(base64::encode(x)),
+ DvvsValue::Deleted => None,
+ })
+ .collect::<Vec<_>>();
+ Self {
+ sk: i.sort_key,
+ ct,
+ v,
+ }
+ }
+}
+
+#[derive(Deserialize)]
+struct DeleteBatchQuery {
+ #[serde(rename = "partitionKey")]
+ partition_key: String,
+ #[serde(default)]
+ prefix: Option<String>,
+ #[serde(default)]
+ start: Option<String>,
+ #[serde(default)]
+ end: Option<String>,
+ #[serde(default, rename = "singleItem")]
+ single_item: bool,
+}
+
+#[derive(Serialize)]
+struct DeleteBatchResponse {
+ #[serde(rename = "partitionKey")]
+ partition_key: String,
+ prefix: Option<String>,
+ start: Option<String>,
+ end: Option<String>,
+ #[serde(rename = "singleItem")]
+ single_item: bool,
+
+ #[serde(rename = "deletedItems")]
+ deleted_items: usize,
+}
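
The serde structures above fix the wire format of the batch endpoints. As a hedged sketch, a client-side InsertBatch payload matching InsertBatchItem could be assembled like this (partition key, sort keys and the causality token are placeholders):

use serde_json::json;

fn insert_batch_payload() -> serde_json::Value {
	json!([
		// Write a value: "v" is base64-encoded, "ct" may be null on a first write
		{ "pk": "mailbox", "sk": "msg1", "ct": null, "v": base64::encode(b"hello") },
		// Delete: a null "v" records a tombstone; "ct" comes from a previous read
		{ "pk": "mailbox", "sk": "msg2", "ct": "<causality token>", "v": null },
	])
}
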
diff --git a/src/api/k2v/error.rs b/src/api/k2v/error.rs
new file mode 100644
index 00000000..42491466
--- /dev/null
+++ b/src/api/k2v/error.rs
@@ -0,0 +1,135 @@
+use err_derive::Error;
+use hyper::header::HeaderValue;
+use hyper::{Body, HeaderMap, StatusCode};
+
+use garage_model::helper::error::Error as HelperError;
+
+use crate::common_error::CommonError;
+pub use crate::common_error::{CommonErrorDerivative, OkOrBadRequest, OkOrInternalError};
+use crate::generic_server::ApiError;
+use crate::helpers::CustomApiErrorBody;
+use crate::signature::error::Error as SignatureError;
+
+/// Errors of the K2V API
+#[derive(Debug, Error)]
+pub enum Error {
+ #[error(display = "{}", _0)]
+	/// Error common to all APIs of this crate (see CommonError)
+ Common(CommonError),
+
+ // Category: cannot process
+ /// Authorization Header Malformed
+	/// The Authorization header is malformed
+ AuthorizationHeaderMalformed(String),
+
+	/// The requested key does not exist
+ #[error(display = "Key not found")]
+ NoSuchKey,
+
+	/// Some base64-encoded data could not be decoded
+ #[error(display = "Invalid base64: {}", _0)]
+ InvalidBase64(#[error(source)] base64::DecodeError),
+
+ /// The client sent a header with invalid value
+ #[error(display = "Invalid header value: {}", _0)]
+ InvalidHeader(#[error(source)] hyper::header::ToStrError),
+
+ /// The client asked for an invalid return format (invalid Accept header)
+ #[error(display = "Not acceptable: {}", _0)]
+ NotAcceptable(String),
+
+ /// The request contained an invalid UTF-8 sequence in its path or in other parameters
+ #[error(display = "Invalid UTF-8: {}", _0)]
+ InvalidUtf8Str(#[error(source)] std::str::Utf8Error),
+}
+
+impl<T> From<T> for Error
+where
+ CommonError: From<T>,
+{
+ fn from(err: T) -> Self {
+ Error::Common(CommonError::from(err))
+ }
+}
+
+impl CommonErrorDerivative for Error {}
+
+impl From<HelperError> for Error {
+ fn from(err: HelperError) -> Self {
+ match err {
+ HelperError::Internal(i) => Self::Common(CommonError::InternalError(i)),
+ HelperError::BadRequest(b) => Self::Common(CommonError::BadRequest(b)),
+ HelperError::InvalidBucketName(n) => Self::Common(CommonError::InvalidBucketName(n)),
+ HelperError::NoSuchBucket(n) => Self::Common(CommonError::NoSuchBucket(n)),
+ e => Self::Common(CommonError::BadRequest(format!("{}", e))),
+ }
+ }
+}
+
+impl From<SignatureError> for Error {
+ fn from(err: SignatureError) -> Self {
+ match err {
+ SignatureError::Common(c) => Self::Common(c),
+ SignatureError::AuthorizationHeaderMalformed(c) => {
+ Self::AuthorizationHeaderMalformed(c)
+ }
+ SignatureError::InvalidUtf8Str(i) => Self::InvalidUtf8Str(i),
+ SignatureError::InvalidHeader(h) => Self::InvalidHeader(h),
+ }
+ }
+}
+
+impl Error {
+ /// This returns a keyword for the corresponding error.
+ /// Here, these keywords are not necessarily those from AWS S3,
+ /// as we are building a custom API
+ fn code(&self) -> &'static str {
+ match self {
+ Error::Common(c) => c.aws_code(),
+ Error::NoSuchKey => "NoSuchKey",
+ Error::NotAcceptable(_) => "NotAcceptable",
+ Error::AuthorizationHeaderMalformed(_) => "AuthorizationHeaderMalformed",
+ Error::InvalidBase64(_) => "InvalidBase64",
+ Error::InvalidHeader(_) => "InvalidHeaderValue",
+ Error::InvalidUtf8Str(_) => "InvalidUtf8String",
+ }
+ }
+}
+
+impl ApiError for Error {
+ /// Get the HTTP status code that best represents the meaning of the error for the client
+ fn http_status_code(&self) -> StatusCode {
+ match self {
+ Error::Common(c) => c.http_status_code(),
+ Error::NoSuchKey => StatusCode::NOT_FOUND,
+ Error::NotAcceptable(_) => StatusCode::NOT_ACCEPTABLE,
+ Error::AuthorizationHeaderMalformed(_)
+ | Error::InvalidBase64(_)
+ | Error::InvalidHeader(_)
+ | Error::InvalidUtf8Str(_) => StatusCode::BAD_REQUEST,
+ }
+ }
+
+ fn add_http_headers(&self, header_map: &mut HeaderMap<HeaderValue>) {
+ use hyper::header;
+ header_map.append(header::CONTENT_TYPE, "application/json".parse().unwrap());
+ }
+
+ fn http_body(&self, garage_region: &str, path: &str) -> Body {
+ let error = CustomApiErrorBody {
+ code: self.code().to_string(),
+ message: format!("{}", self),
+ path: path.to_string(),
+ region: garage_region.to_string(),
+ };
+ Body::from(serde_json::to_string_pretty(&error).unwrap_or_else(|_| {
+ r#"
+{
+ "code": "InternalError",
+ "message": "JSON encoding of error failed"
+}
+ "#
+ .into()
+ }))
+ }
+}
diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs
new file mode 100644
index 00000000..210950bf
--- /dev/null
+++ b/src/api/k2v/index.rs
@@ -0,0 +1,100 @@
+use std::sync::Arc;
+
+use hyper::{Body, Response, StatusCode};
+use serde::Serialize;
+
+use garage_util::data::*;
+use garage_util::error::Error as GarageError;
+
+use garage_rpc::ring::Ring;
+use garage_table::util::*;
+
+use garage_model::garage::Garage;
+use garage_model::k2v::item_table::{BYTES, CONFLICTS, ENTRIES, VALUES};
+
+use crate::k2v::error::*;
+use crate::k2v::range::read_range;
+
+pub async fn handle_read_index(
+ garage: Arc<Garage>,
+ bucket_id: Uuid,
+ prefix: Option<String>,
+ start: Option<String>,
+ end: Option<String>,
+ limit: Option<u64>,
+ reverse: Option<bool>,
+) -> Result<Response<Body>, Error> {
+ let reverse = reverse.unwrap_or(false);
+
+ let ring: Arc<Ring> = garage.system.ring.borrow().clone();
+
+ let (partition_keys, more, next_start) = read_range(
+ &garage.k2v.counter_table.table,
+ &bucket_id,
+ &prefix,
+ &start,
+ &end,
+ limit,
+ Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())),
+ EnumerationOrder::from_reverse(reverse),
+ )
+ .await?;
+
+ let s_entries = ENTRIES.to_string();
+ let s_conflicts = CONFLICTS.to_string();
+ let s_values = VALUES.to_string();
+ let s_bytes = BYTES.to_string();
+
+ let resp = ReadIndexResponse {
+ prefix,
+ start,
+ end,
+ limit,
+ reverse,
+ partition_keys: partition_keys
+ .into_iter()
+ .map(|part| {
+ let vals = part.filtered_values(&ring);
+ ReadIndexResponseEntry {
+ pk: part.sk,
+ entries: *vals.get(&s_entries).unwrap_or(&0),
+ conflicts: *vals.get(&s_conflicts).unwrap_or(&0),
+ values: *vals.get(&s_values).unwrap_or(&0),
+ bytes: *vals.get(&s_bytes).unwrap_or(&0),
+ }
+ })
+ .collect::<Vec<_>>(),
+ more,
+ next_start,
+ };
+
+ let resp_json = serde_json::to_string_pretty(&resp).map_err(GarageError::from)?;
+ Ok(Response::builder()
+ .status(StatusCode::OK)
+ .body(Body::from(resp_json))?)
+}
+
+#[derive(Serialize)]
+struct ReadIndexResponse {
+ prefix: Option<String>,
+ start: Option<String>,
+ end: Option<String>,
+ limit: Option<u64>,
+ reverse: bool,
+
+ #[serde(rename = "partitionKeys")]
+ partition_keys: Vec<ReadIndexResponseEntry>,
+
+ more: bool,
+ #[serde(rename = "nextStart")]
+ next_start: Option<String>,
+}
+
+#[derive(Serialize)]
+struct ReadIndexResponseEntry {
+ pk: String,
+ entries: i64,
+ conflicts: i64,
+ values: i64,
+ bytes: i64,
+}
diff --git a/src/api/k2v/item.rs b/src/api/k2v/item.rs
new file mode 100644
index 00000000..836d386f
--- /dev/null
+++ b/src/api/k2v/item.rs
@@ -0,0 +1,230 @@
+use std::sync::Arc;
+
+use http::header;
+
+use hyper::{Body, Request, Response, StatusCode};
+
+use garage_util::data::*;
+
+use garage_model::garage::Garage;
+use garage_model::k2v::causality::*;
+use garage_model::k2v::item_table::*;
+
+use crate::k2v::error::*;
+
+pub const X_GARAGE_CAUSALITY_TOKEN: &str = "X-Garage-Causality-Token";
+
+pub enum ReturnFormat {
+ Json,
+ Binary,
+ Either,
+}
+
+impl ReturnFormat {
+ pub fn from(req: &Request<Body>) -> Result<Self, Error> {
+ let accept = match req.headers().get(header::ACCEPT) {
+ Some(a) => a.to_str()?,
+ None => return Ok(Self::Json),
+ };
+
+ let accept = accept.split(',').map(|s| s.trim()).collect::<Vec<_>>();
+ let accept_json = accept.contains(&"application/json") || accept.contains(&"*/*");
+ let accept_binary = accept.contains(&"application/octet-stream") || accept.contains(&"*/*");
+
+ match (accept_json, accept_binary) {
+ (true, true) => Ok(Self::Either),
+ (true, false) => Ok(Self::Json),
+ (false, true) => Ok(Self::Binary),
+ (false, false) => Err(Error::NotAcceptable("Invalid Accept: header value, must contain either application/json or application/octet-stream (or both)".into())),
+ }
+ }
+
+ pub fn make_response(&self, item: &K2VItem) -> Result<Response<Body>, Error> {
+ let vals = item.values();
+
+ if vals.is_empty() {
+ return Err(Error::NoSuchKey);
+ }
+
+ let ct = item.causal_context().serialize();
+ match self {
+ Self::Binary if vals.len() > 1 => Ok(Response::builder()
+ .header(X_GARAGE_CAUSALITY_TOKEN, ct)
+ .status(StatusCode::CONFLICT)
+ .body(Body::empty())?),
+ Self::Binary => {
+ assert!(vals.len() == 1);
+ Self::make_binary_response(ct, vals[0])
+ }
+ Self::Either if vals.len() == 1 => Self::make_binary_response(ct, vals[0]),
+ _ => Self::make_json_response(ct, &vals[..]),
+ }
+ }
+
+ fn make_binary_response(ct: String, v: &DvvsValue) -> Result<Response<Body>, Error> {
+ match v {
+ DvvsValue::Deleted => Ok(Response::builder()
+ .header(X_GARAGE_CAUSALITY_TOKEN, ct)
+ .header(header::CONTENT_TYPE, "application/octet-stream")
+ .status(StatusCode::NO_CONTENT)
+ .body(Body::empty())?),
+ DvvsValue::Value(v) => Ok(Response::builder()
+ .header(X_GARAGE_CAUSALITY_TOKEN, ct)
+ .header(header::CONTENT_TYPE, "application/octet-stream")
+ .status(StatusCode::OK)
+ .body(Body::from(v.to_vec()))?),
+ }
+ }
+
+ fn make_json_response(ct: String, v: &[&DvvsValue]) -> Result<Response<Body>, Error> {
+ let items = v
+ .iter()
+ .map(|v| match v {
+ DvvsValue::Deleted => serde_json::Value::Null,
+ DvvsValue::Value(v) => serde_json::Value::String(base64::encode(v)),
+ })
+ .collect::<Vec<_>>();
+ let json_body =
+ serde_json::to_string_pretty(&items).ok_or_internal_error("JSON encoding error")?;
+ Ok(Response::builder()
+ .header(X_GARAGE_CAUSALITY_TOKEN, ct)
+ .header(header::CONTENT_TYPE, "application/json")
+ .status(StatusCode::OK)
+ .body(Body::from(json_body))?)
+ }
+}
+
+/// Handle ReadItem request
+#[allow(clippy::ptr_arg)]
+pub async fn handle_read_item(
+ garage: Arc<Garage>,
+ req: &Request<Body>,
+ bucket_id: Uuid,
+ partition_key: &str,
+ sort_key: &String,
+) -> Result<Response<Body>, Error> {
+ let format = ReturnFormat::from(req)?;
+
+ let item = garage
+ .k2v
+ .item_table
+ .get(
+ &K2VItemPartition {
+ bucket_id,
+ partition_key: partition_key.to_string(),
+ },
+ sort_key,
+ )
+ .await?
+ .ok_or(Error::NoSuchKey)?;
+
+ format.make_response(&item)
+}
+
+pub async fn handle_insert_item(
+ garage: Arc<Garage>,
+ req: Request<Body>,
+ bucket_id: Uuid,
+ partition_key: &str,
+ sort_key: &str,
+) -> Result<Response<Body>, Error> {
+ let causal_context = req
+ .headers()
+ .get(X_GARAGE_CAUSALITY_TOKEN)
+ .map(|s| s.to_str())
+ .transpose()?
+ .map(CausalContext::parse)
+ .transpose()
+ .ok_or_bad_request("Invalid causality token")?;
+
+ let body = hyper::body::to_bytes(req.into_body()).await?;
+ let value = DvvsValue::Value(body.to_vec());
+
+ garage
+ .k2v
+ .rpc
+ .insert(
+ bucket_id,
+ partition_key.to_string(),
+ sort_key.to_string(),
+ causal_context,
+ value,
+ )
+ .await?;
+
+ Ok(Response::builder()
+ .status(StatusCode::OK)
+ .body(Body::empty())?)
+}
+
+pub async fn handle_delete_item(
+ garage: Arc<Garage>,
+ req: Request<Body>,
+ bucket_id: Uuid,
+ partition_key: &str,
+ sort_key: &str,
+) -> Result<Response<Body>, Error> {
+ let causal_context = req
+ .headers()
+ .get(X_GARAGE_CAUSALITY_TOKEN)
+ .map(|s| s.to_str())
+ .transpose()?
+ .map(CausalContext::parse)
+ .transpose()
+ .ok_or_bad_request("Invalid causality token")?;
+
+ let value = DvvsValue::Deleted;
+
+ garage
+ .k2v
+ .rpc
+ .insert(
+ bucket_id,
+ partition_key.to_string(),
+ sort_key.to_string(),
+ causal_context,
+ value,
+ )
+ .await?;
+
+ Ok(Response::builder()
+ .status(StatusCode::NO_CONTENT)
+ .body(Body::empty())?)
+}
+
+/// Handle PollItem request
+#[allow(clippy::ptr_arg)]
+pub async fn handle_poll_item(
+ garage: Arc<Garage>,
+ req: &Request<Body>,
+ bucket_id: Uuid,
+ partition_key: String,
+ sort_key: String,
+ causality_token: String,
+ timeout_secs: Option<u64>,
+) -> Result<Response<Body>, Error> {
+ let format = ReturnFormat::from(req)?;
+
+ let causal_context =
+ CausalContext::parse(&causality_token).ok_or_bad_request("Invalid causality token")?;
+
+ let item = garage
+ .k2v
+ .rpc
+ .poll(
+ bucket_id,
+ partition_key,
+ sort_key,
+ causal_context,
+ timeout_secs.unwrap_or(300) * 1000,
+ )
+ .await?;
+
+ if let Some(item) = item {
+ format.make_response(&item)
+ } else {
+ Ok(Response::builder()
+ .status(StatusCode::NOT_MODIFIED)
+ .body(Body::empty())?)
+ }
+}
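
The three handlers above form K2V's read-modify-write cycle: a read returns the current causality token in the X-Garage-Causality-Token response header, and a write that carries that token back supersedes the version that was read instead of creating a concurrent sibling. A minimal client-side sketch with hyper, assuming a placeholder local address and eliding the AWS signature v4 authentication a real request needs:

use hyper::{Body, Client, Method, Request};

async fn read_modify_write() -> Result<(), Box<dyn std::error::Error>> {
    let client = Client::new();

    // 1. Read the item; the causality token comes back as a response header.
    let read = Request::builder()
        .method(Method::GET)
        .uri("http://127.0.0.1:3904/mybucket/mypartition?sort_key=mykey")
        .header("Accept", "application/octet-stream")
        .body(Body::empty())?;
    let resp = client.request(read).await?;
    let token = resp
        .headers()
        .get("X-Garage-Causality-Token")
        .ok_or("missing causality token")?
        .to_str()?
        .to_string();

    // 2. Write back with the token: this version is a successor of the one
    //    we just read, not a concurrent sibling.
    let write = Request::builder()
        .method(Method::PUT)
        .uri("http://127.0.0.1:3904/mybucket/mypartition?sort_key=mykey")
        .header("X-Garage-Causality-Token", token)
        .body(Body::from("new value"))?;
    client.request(write).await?;
    Ok(())
}
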
diff --git a/src/api/k2v/mod.rs b/src/api/k2v/mod.rs
new file mode 100644
index 00000000..b6a8c5cf
--- /dev/null
+++ b/src/api/k2v/mod.rs
@@ -0,0 +1,9 @@
+pub mod api_server;
+mod error;
+mod router;
+
+mod batch;
+mod index;
+mod item;
+
+mod range;
diff --git a/src/api/k2v/range.rs b/src/api/k2v/range.rs
new file mode 100644
index 00000000..bb9d3be5
--- /dev/null
+++ b/src/api/k2v/range.rs
@@ -0,0 +1,100 @@
+//! Utility module for retrieving ranges of items in Garage tables
+//! Implements parameters (prefix, start, end, limit) as specified
+//! for endpoints ReadIndex, ReadBatch and DeleteBatch
+
+use std::sync::Arc;
+
+use garage_table::replication::TableShardedReplication;
+use garage_table::*;
+
+use crate::helpers::key_after_prefix;
+use crate::k2v::error::*;
+
+/// Read range in a Garage table.
+/// Returns (entries, more?, nextStart)
+#[allow(clippy::too_many_arguments)]
+pub(crate) async fn read_range<F>(
+ table: &Arc<Table<F, TableShardedReplication>>,
+ partition_key: &F::P,
+ prefix: &Option<String>,
+ start: &Option<String>,
+ end: &Option<String>,
+ limit: Option<u64>,
+ filter: Option<F::Filter>,
+ enumeration_order: EnumerationOrder,
+) -> Result<(Vec<F::E>, bool, Option<String>), Error>
+where
+ F: TableSchema<S = String> + 'static,
+{
+ let (mut start, mut start_ignore) = match (prefix, start) {
+ (None, None) => (None, false),
+ (None, Some(s)) => (Some(s.clone()), false),
+ (Some(p), Some(s)) => {
+ if !s.starts_with(p) {
+ return Err(Error::bad_request(format!(
+ "Start key '{}' does not start with prefix '{}'",
+ s, p
+ )));
+ }
+ (Some(s.clone()), false)
+ }
+ (Some(p), None) if enumeration_order == EnumerationOrder::Reverse => {
+ let start = key_after_prefix(p)
+ .ok_or_internal_error("Sorry, can't list this prefix in reverse order")?;
+ (Some(start), true)
+ }
+ (Some(p), None) => (Some(p.clone()), false),
+ };
+
+ let mut entries = vec![];
+ loop {
+ let n_get = std::cmp::min(
+ 1000,
+ limit.map(|x| x as usize).unwrap_or(usize::MAX - 10) - entries.len() + 2,
+ );
+ let get_ret = table
+ .get_range(
+ partition_key,
+ start.clone(),
+ filter.clone(),
+ n_get,
+ enumeration_order,
+ )
+ .await?;
+
+ let get_ret_len = get_ret.len();
+
+ for entry in get_ret {
+ if start_ignore && Some(entry.sort_key()) == start.as_ref() {
+ continue;
+ }
+ if let Some(p) = prefix {
+ if !entry.sort_key().starts_with(p) {
+ return Ok((entries, false, None));
+ }
+ }
+ if let Some(e) = end {
+ let is_finished = match enumeration_order {
+ EnumerationOrder::Forward => entry.sort_key() >= e,
+ EnumerationOrder::Reverse => entry.sort_key() <= e,
+ };
+ if is_finished {
+ return Ok((entries, false, None));
+ }
+ }
+ if let Some(l) = limit {
+ if entries.len() >= l as usize {
+ return Ok((entries, true, Some(entry.sort_key().clone())));
+ }
+ }
+ entries.push(entry);
+ }
+
+ if get_ret_len < n_get {
+ return Ok((entries, false, None));
+ }
+
+ start = Some(entries.last().unwrap().sort_key().clone());
+ start_ignore = true;
+ }
+}
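
read_range is generic over garage_table internals, so its pagination contract is easier to see on a toy model: next_start, when returned, is the first key of the following page, and the caller feeds it back unchanged (it is included, not skipped) to resume the listing. A self-contained sketch of that contract, not the real implementation:

// Toy model of the (entries, more, next_start) contract of read_range.
fn page(items: &[String], start: Option<&str>, limit: usize)
    -> (Vec<String>, bool, Option<String>)
{
    let from = match start {
        Some(s) => items.partition_point(|k| k.as_str() < s),
        None => 0,
    };
    let rest = &items[from..];
    if rest.len() > limit {
        // next_start is the first key *not* returned in this page.
        (rest[..limit].to_vec(), true, Some(rest[limit].clone()))
    } else {
        (rest.to_vec(), false, None)
    }
}

fn main() {
    let items: Vec<String> = (0..10).map(|i| format!("key{:02}", i)).collect();
    let mut start: Option<String> = None;
    loop {
        let (entries, more, next) = page(&items, start.as_deref(), 4);
        println!("{:?}", entries); // pages of 4, 4, then 2 entries
        if !more {
            break;
        }
        start = next; // resume at the first key of the following page
    }
}
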
diff --git a/src/api/k2v/router.rs b/src/api/k2v/router.rs
new file mode 100644
index 00000000..50e6965b
--- /dev/null
+++ b/src/api/k2v/router.rs
@@ -0,0 +1,252 @@
+use crate::k2v::error::*;
+
+use std::borrow::Cow;
+
+use hyper::{Method, Request};
+
+use crate::helpers::Authorization;
+use crate::router_macros::{generateQueryParameters, router_match};
+
+router_match! {@func
+
+/// List of all K2V API endpoints.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Endpoint {
+ DeleteBatch {
+ },
+ DeleteItem {
+ partition_key: String,
+ sort_key: String,
+ },
+ InsertBatch {
+ },
+ InsertItem {
+ partition_key: String,
+ sort_key: String,
+ },
+ Options,
+ PollItem {
+ partition_key: String,
+ sort_key: String,
+ causality_token: String,
+ timeout: Option<u64>,
+ },
+ ReadBatch {
+ },
+ ReadIndex {
+ prefix: Option<String>,
+ start: Option<String>,
+ end: Option<String>,
+ limit: Option<u64>,
+ reverse: Option<bool>,
+ },
+ ReadItem {
+ partition_key: String,
+ sort_key: String,
+ },
+}}
+
+impl Endpoint {
+ /// Determine which K2V endpoint a request is for, using the request path and
+ /// query string. The bucket name is the first path segment.
+ /// Returns Self plus the bucket name.
+ pub fn from_request<T>(req: &Request<T>) -> Result<(Self, String), Error> {
+ let uri = req.uri();
+ let path = uri.path().trim_start_matches('/');
+ let query = uri.query();
+
+ let (bucket, partition_key) = path
+ .split_once('/')
+ .map(|(b, p)| (b.to_owned(), p.trim_start_matches('/')))
+ .unwrap_or((path.to_owned(), ""));
+
+ if bucket.is_empty() {
+ return Err(Error::bad_request("Missing bucket name"));
+ }
+
+ if *req.method() == Method::OPTIONS {
+ return Ok((Self::Options, bucket));
+ }
+
+ let partition_key = percent_encoding::percent_decode_str(partition_key)
+ .decode_utf8()?
+ .into_owned();
+
+ let mut query = QueryParameters::from_query(query.unwrap_or_default())?;
+
+ let method_search = Method::from_bytes(b"SEARCH").unwrap();
+ let res = match *req.method() {
+ Method::GET => Self::from_get(partition_key, &mut query)?,
+ //&Method::HEAD => Self::from_head(partition_key, &mut query)?,
+ Method::POST => Self::from_post(partition_key, &mut query)?,
+ Method::PUT => Self::from_put(partition_key, &mut query)?,
+ Method::DELETE => Self::from_delete(partition_key, &mut query)?,
+ _ if req.method() == method_search => Self::from_search(partition_key, &mut query)?,
+ _ => return Err(Error::bad_request("Unknown method")),
+ };
+
+ if let Some(message) = query.nonempty_message() {
+ debug!("Unused query parameter: {}", message)
+ }
+ Ok((res, bucket))
+ }
+
+ /// Determine which endpoint a request is for, knowing it is a GET.
+ fn from_get(partition_key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
+ router_match! {
+ @gen_parser
+ (query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None),
+ key: [
+ EMPTY if causality_token => PollItem (query::sort_key, query::causality_token, opt_parse::timeout),
+ EMPTY => ReadItem (query::sort_key),
+ ],
+ no_key: [
+ EMPTY => ReadIndex (query_opt::prefix, query_opt::start, query_opt::end, opt_parse::limit, opt_parse::reverse),
+ ]
+ }
+ }
+
+ /// Determine which endpoint a request is for, knowing it is a SEARCH.
+ fn from_search(partition_key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
+ router_match! {
+ @gen_parser
+ (query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None),
+ key: [
+ ],
+ no_key: [
+ EMPTY => ReadBatch,
+ ]
+ }
+ }
+
+ /*
+ /// Determine which endpoint a request is for, knowing it is a HEAD.
+ fn from_head(partition_key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
+ router_match! {
+ @gen_parser
+ (query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None),
+ key: [
+ EMPTY => HeadObject(opt_parse::part_number, query_opt::version_id),
+ ],
+ no_key: [
+ EMPTY => HeadBucket,
+ ]
+ }
+ }
+ */
+
+ /// Determine which endpoint a request is for, knowing it is a POST.
+ fn from_post(partition_key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
+ router_match! {
+ @gen_parser
+ (query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None),
+ key: [
+ ],
+ no_key: [
+ EMPTY => InsertBatch,
+ DELETE => DeleteBatch,
+ SEARCH => ReadBatch,
+ ]
+ }
+ }
+
+ /// Determine which endpoint a request is for, knowing it is a PUT.
+ fn from_put(partition_key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
+ router_match! {
+ @gen_parser
+ (query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None),
+ key: [
+ EMPTY => InsertItem (query::sort_key),
+ ],
+ no_key: [
+ ]
+ }
+ }
+
+ /// Determine which endpoint a request is for, knowing it is a DELETE.
+ fn from_delete(partition_key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
+ router_match! {
+ @gen_parser
+ (query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None),
+ key: [
+ EMPTY => DeleteItem (query::sort_key),
+ ],
+ no_key: [
+ ]
+ }
+ }
+
+ /// Get the partition key the request targets. Returns None for requests which don't use a partition key.
+ #[allow(dead_code)]
+ pub fn get_partition_key(&self) -> Option<&str> {
+ router_match! {
+ @extract
+ self,
+ partition_key,
+ [
+ DeleteItem,
+ InsertItem,
+ PollItem,
+ ReadItem,
+ ]
+ }
+ }
+
+ /// Get the sort key the request targets. Returns None for requests which don't use a sort key.
+ #[allow(dead_code)]
+ pub fn get_sort_key(&self) -> Option<&str> {
+ router_match! {
+ @extract
+ self,
+ sort_key,
+ [
+ DeleteItem,
+ InsertItem,
+ PollItem,
+ ReadItem,
+ ]
+ }
+ }
+
+ /// Get the kind of authorization which is required to perform the operation.
+ pub fn authorization_type(&self) -> Authorization {
+ let readonly = router_match! {
+ @match
+ self,
+ [
+ PollItem,
+ ReadBatch,
+ ReadIndex,
+ ReadItem,
+ ]
+ };
+ if readonly {
+ Authorization::Read
+ } else {
+ Authorization::Write
+ }
+ }
+}
+
+// parameter name => struct field
+generateQueryParameters! {
+ "prefix" => prefix,
+ "start" => start,
+ "causality_token" => causality_token,
+ "end" => end,
+ "limit" => limit,
+ "reverse" => reverse,
+ "sort_key" => sort_key,
+ "timeout" => timeout
+}
+
+mod keywords {
+ //! This module contains all query parameters with no associated value
+ //! used to differentiate endpoints.
+ pub const EMPTY: &str = "";
+
+ pub const DELETE: &str = "delete";
+ pub const SEARCH: &str = "search";
+}
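
A hypothetical unit-test sketch for this parser (it would have to live alongside this module, since the router is crate-private): a plain GET with a sort_key resolves to ReadItem, and adding causality_token turns the same request into PollItem.

use hyper::{Method, Request};

#[test]
fn parse_read_and_poll() {
    // GET /{bucket}/{partition_key}?sort_key=... -> ReadItem
    let req = Request::builder()
        .method(Method::GET)
        .uri("/mybucket/part?sort_key=k")
        .body(())
        .unwrap();
    let (endpoint, bucket) = Endpoint::from_request(&req).unwrap();
    assert_eq!(bucket, "mybucket");
    assert!(matches!(endpoint, Endpoint::ReadItem { .. }));

    // Same request plus causality_token (and an optional timeout) -> PollItem
    let req = Request::builder()
        .method(Method::GET)
        .uri("/mybucket/part?sort_key=k&causality_token=opaquetoken123&timeout=60")
        .body(())
        .unwrap();
    let (endpoint, _) = Endpoint::from_request(&req).unwrap();
    assert!(matches!(endpoint, Endpoint::PollItem { timeout: Some(60), .. }));
}
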
diff --git a/src/api/lib.rs b/src/api/lib.rs
index de60ec53..370dfd7a 100644
--- a/src/api/lib.rs
+++ b/src/api/lib.rs
@@ -2,26 +2,16 @@
#[macro_use]
extern crate tracing;
-pub mod error;
-pub use error::Error;
+pub mod common_error;
mod encoding;
-
-mod api_server;
-pub use api_server::run_api_server;
-
+pub mod generic_server;
+pub mod helpers;
+mod router_macros;
 /// This module is public only to help testing. Don't expect stability here
pub mod signature;
-pub mod helpers;
-mod s3_bucket;
-mod s3_copy;
-pub mod s3_cors;
-mod s3_delete;
-pub mod s3_get;
-mod s3_list;
-mod s3_post_object;
-mod s3_put;
-mod s3_router;
-mod s3_website;
-mod s3_xml;
+pub mod admin;
+#[cfg(feature = "k2v")]
+pub mod k2v;
+pub mod s3;
diff --git a/src/api/router_macros.rs b/src/api/router_macros.rs
new file mode 100644
index 00000000..4c593300
--- /dev/null
+++ b/src/api/router_macros.rs
@@ -0,0 +1,213 @@
+/// This macro is used to generate very repetitive match {} blocks in this module.
+/// It is _not_ made to be used anywhere else.
+macro_rules! router_match {
+ (@match $enum:expr , [ $($endpoint:ident,)* ]) => {{
+ // usage: router_match {@match my_enum, [ VariantWithField1, VariantWithField2 ..] }
+ // returns true if the variant was one of the listed variants, false otherwise.
+ use Endpoint::*;
+ match $enum {
+ $(
+ $endpoint { .. } => true,
+ )*
+ _ => false
+ }
+ }};
+ (@extract $enum:expr , $param:ident, [ $($endpoint:ident,)* ]) => {{
+ // usage: router_match {@extract my_enum, field_name, [ VariantWithField1, VariantWithField2 ..] }
+ // returns Some(field_value), or None if the variant was not one of the listed variants.
+ use Endpoint::*;
+ match $enum {
+ $(
+ $endpoint {$param, ..} => Some($param),
+ )*
+ _ => None
+ }
+ }};
+ (@gen_path_parser ($method:expr, $reqpath:expr, $query:expr)
+ [
+ $($meth:ident $path:pat $(if $required:ident)? => $api:ident $(($($conv:ident :: $param:ident),*))?,)*
+ ]) => {{
+ {
+ use Endpoint::*;
+ match ($method, $reqpath) {
+ $(
+ (&Method::$meth, $path) if true $(&& $query.$required.is_some())? => $api {
+ $($(
+ $param: router_match!(@@parse_param $query, $conv, $param),
+ )*)?
+ },
+ )*
+ (m, p) => {
+ return Err(Error::bad_request(format!(
+ "Unknown API endpoint: {} {}",
+ m, p
+ )))
+ }
+ }
+ }
+ }};
+ (@gen_parser ($keyword:expr, $key:ident, $query:expr, $header:expr),
+ key: [$($kw_k:ident $(if $required_k:ident)? $(header $header_k:expr)? => $api_k:ident $(($($conv_k:ident :: $param_k:ident),*))?,)*],
+ no_key: [$($kw_nk:ident $(if $required_nk:ident)? $(if_header $header_nk:expr)? => $api_nk:ident $(($($conv_nk:ident :: $param_nk:ident),*))?,)*]) => {{
+ // usage: router_match {@gen_parser (keyword, key, query, header),
+ // key: [
+ // SOME_KEYWORD => VariantWithKey,
+ // ...
+ // ],
+ // no_key: [
+ // SOME_KEYWORD => VariantWithoutKey,
+ // ...
+ // ]
+ // }
+ // See in from_{method} for more detailed usage.
+ use Endpoint::*;
+ use keywords::*;
+ match ($keyword, !$key.is_empty()){
+ $(
+ ($kw_k, true) if true $(&& $query.$required_k.is_some())? $(&& $header.contains_key($header_k))? => Ok($api_k {
+ $key,
+ $($(
+ $param_k: router_match!(@@parse_param $query, $conv_k, $param_k),
+ )*)?
+ }),
+ )*
+ $(
+ ($kw_nk, false) $(if $query.$required_nk.is_some())? $(if $header.contains($header_nk))? => Ok($api_nk {
+ $($(
+ $param_nk: router_match!(@@parse_param $query, $conv_nk, $param_nk),
+ )*)?
+ }),
+ )*
+ (kw, _) => Err(Error::bad_request(format!("Invalid endpoint: {}", kw)))
+ }
+ }};
+
+ (@@parse_param $query:expr, query_opt, $param:ident) => {{
+ // extract optional query parameter
+ $query.$param.take().map(|param| param.into_owned())
+ }};
+ (@@parse_param $query:expr, query, $param:ident) => {{
+ // extract mandatory query parameter
+ $query.$param.take().ok_or_bad_request("Missing argument for endpoint")?.into_owned()
+ }};
+ (@@parse_param $query:expr, opt_parse, $param:ident) => {{
+ // extract and parse optional query parameter
+ // a missing parameter is fine; a parse error, however, is reported as an error
+ $query.$param
+ .take()
+ .map(|param| param.parse())
+ .transpose()
+ .map_err(|_| Error::bad_request("Failed to parse query parameter"))?
+ }};
+ (@@parse_param $query:expr, parse, $param:ident) => {{
+ // extract and parse mandatory query parameter
+ // both missing and un-parseable parameters are reported as errors
+ $query.$param.take().ok_or_bad_request("Missing argument for endpoint")?
+ .parse()
+ .map_err(|_| Error::bad_request("Failed to parse query parameter"))?
+ }};
+ (@func
+ $(#[$doc:meta])*
+ pub enum Endpoint {
+ $(
+ $(#[$outer:meta])*
+ $variant:ident $({
+ $($name:ident: $ty:ty,)*
+ })?,
+ )*
+ }) => {
+ $(#[$doc])*
+ pub enum Endpoint {
+ $(
+ $(#[$outer])*
+ $variant $({
+ $($name: $ty, )*
+ })?,
+ )*
+ }
+ impl Endpoint {
+ pub fn name(&self) -> &'static str {
+ match self {
+ $(Endpoint::$variant $({ $($name: _,)* .. })? => stringify!($variant),)*
+ }
+ }
+ }
+ };
+ (@if ($($cond:tt)+) then ($($then:tt)*) else ($($else:tt)*)) => {
+ $($then)*
+ };
+ (@if () then ($($then:tt)*) else ($($else:tt)*)) => {
+ $($else)*
+ };
+}
+
+/// This macro is used to generate part of the code in this module. It must be called only once, and
+/// is useless outside of this module.
+macro_rules! generateQueryParameters {
+ ( $($rest:expr => $name:ident),* ) => {
+ /// Struct containing all query parameters used in endpoints. Think of it as a HashMap,
+ /// but with statically known keys.
+ #[derive(Debug, Default)]
+ struct QueryParameters<'a> {
+ keyword: Option<Cow<'a, str>>,
+ $(
+ $name: Option<Cow<'a, str>>,
+ )*
+ }
+
+ impl<'a> QueryParameters<'a> {
+ /// Build this struct from the query part of an URI.
+ fn from_query(query: &'a str) -> Result<Self, Error> {
+ let mut res: Self = Default::default();
+ for (k, v) in url::form_urlencoded::parse(query.as_bytes()) {
+ let repeated = match k.as_ref() {
+ $(
+ $rest => if !v.is_empty() {
+ res.$name.replace(v).is_some()
+ } else {
+ false
+ },
+ )*
+ _ => {
+ if k.starts_with("response-") || k.starts_with("X-Amz-") {
+ false
+ } else if v.as_ref().is_empty() {
+ if res.keyword.replace(k).is_some() {
+ return Err(Error::bad_request("Multiple keywords"));
+ }
+ continue;
+ } else {
+ debug!("Received an unknown query parameter: '{}'", k);
+ false
+ }
+ }
+ };
+ if repeated {
+ return Err(Error::bad_request(format!(
+ "Query parameter repeated: '{}'",
+ k
+ )));
+ }
+ }
+ Ok(res)
+ }
+
+ /// Get an error message in case not all parameters were used when extracting them to
+ /// build an Endpoint variant
+ fn nonempty_message(&self) -> Option<&str> {
+ if self.keyword.is_some() {
+ Some("Keyword not used")
+ } $(
+ else if self.$name.is_some() {
+ Some(concat!("'", $rest, "'"))
+ }
+ )* else {
+ None
+ }
+ }
+ }
+ }
+}
+
+pub(crate) use generateQueryParameters;
+pub(crate) use router_match;
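
The keyword convention that the generated from_query relies on may not be obvious: a query key with an empty value (e.g. ?delete) becomes the single keyword that selects the endpoint, keys with values fill named fields, and any repeated key is an error. A standalone sketch of that rule, assuming only the url crate:

fn main() {
    let query = "delete&sort_key=abc&timeout=30";
    let mut keyword: Option<String> = None;
    for (k, v) in url::form_urlencoded::parse(query.as_bytes()) {
        if v.is_empty() {
            // A bare key is the endpoint-selecting keyword; two is an error.
            assert!(keyword.replace(k.into_owned()).is_none(), "multiple keywords");
        } else {
            // A key with a value fills the matching struct field.
            println!("param {} = {}", k, v);
        }
    }
    println!("keyword = {:?}", keyword); // Some("delete") -> Endpoint::DeleteBatch
}
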
diff --git a/src/api/s3/api_server.rs b/src/api/s3/api_server.rs
new file mode 100644
index 00000000..27837297
--- /dev/null
+++ b/src/api/s3/api_server.rs
@@ -0,0 +1,390 @@
+use std::net::SocketAddr;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+
+use futures::future::Future;
+use hyper::header;
+use hyper::{Body, Request, Response};
+
+use opentelemetry::{trace::SpanRef, KeyValue};
+
+use garage_util::error::Error as GarageError;
+
+use garage_model::garage::Garage;
+use garage_model::key_table::Key;
+
+use crate::generic_server::*;
+use crate::s3::error::*;
+
+use crate::signature::payload::check_payload_signature;
+use crate::signature::streaming::*;
+
+use crate::helpers::*;
+use crate::s3::bucket::*;
+use crate::s3::copy::*;
+use crate::s3::cors::*;
+use crate::s3::delete::*;
+use crate::s3::get::*;
+use crate::s3::list::*;
+use crate::s3::post_object::handle_post_object;
+use crate::s3::put::*;
+use crate::s3::router::Endpoint;
+use crate::s3::website::*;
+
+pub struct S3ApiServer {
+ garage: Arc<Garage>,
+}
+
+pub(crate) struct S3ApiEndpoint {
+ bucket_name: Option<String>,
+ endpoint: Endpoint,
+}
+
+impl S3ApiServer {
+ pub async fn run(
+ garage: Arc<Garage>,
+ addr: SocketAddr,
+ s3_region: String,
+ shutdown_signal: impl Future<Output = ()>,
+ ) -> Result<(), GarageError> {
+ ApiServer::new(s3_region, S3ApiServer { garage })
+ .run_server(addr, shutdown_signal)
+ .await
+ }
+
+ async fn handle_request_without_bucket(
+ &self,
+ _req: Request<Body>,
+ api_key: Key,
+ endpoint: Endpoint,
+ ) -> Result<Response<Body>, Error> {
+ match endpoint {
+ Endpoint::ListBuckets => handle_list_buckets(&self.garage, &api_key).await,
+ endpoint => Err(Error::NotImplemented(endpoint.name().to_owned())),
+ }
+ }
+}
+
+#[async_trait]
+impl ApiHandler for S3ApiServer {
+ const API_NAME: &'static str = "s3";
+ const API_NAME_DISPLAY: &'static str = "S3";
+
+ type Endpoint = S3ApiEndpoint;
+ type Error = Error;
+
+ fn parse_endpoint(&self, req: &Request<Body>) -> Result<S3ApiEndpoint, Error> {
+ let authority = req
+ .headers()
+ .get(header::HOST)
+ .ok_or_bad_request("Host header required")?
+ .to_str()?;
+
+ let host = authority_to_host(authority)?;
+
+ let bucket_name = self
+ .garage
+ .config
+ .s3_api
+ .root_domain
+ .as_ref()
+ .and_then(|root_domain| host_to_bucket(&host, root_domain));
+
+ let (endpoint, bucket_name) =
+ Endpoint::from_request(req, bucket_name.map(ToOwned::to_owned))?;
+
+ Ok(S3ApiEndpoint {
+ bucket_name,
+ endpoint,
+ })
+ }
+
+ async fn handle(
+ &self,
+ req: Request<Body>,
+ endpoint: S3ApiEndpoint,
+ ) -> Result<Response<Body>, Error> {
+ let S3ApiEndpoint {
+ bucket_name,
+ endpoint,
+ } = endpoint;
+ let garage = self.garage.clone();
+
+ // Some endpoints are processed early, before we even check for an API key
+ if let Endpoint::PostObject = endpoint {
+ return handle_post_object(garage, req, bucket_name.unwrap()).await;
+ }
+ if let Endpoint::Options = endpoint {
+ return handle_options_s3api(garage, &req, bucket_name).await;
+ }
+
+ let (api_key, mut content_sha256) = check_payload_signature(&garage, "s3", &req).await?;
+ let api_key = api_key
+ .ok_or_else(|| Error::forbidden("Garage does not support anonymous access yet"))?;
+
+ let req = parse_streaming_body(
+ &api_key,
+ req,
+ &mut content_sha256,
+ &garage.config.s3_api.s3_region,
+ "s3",
+ )?;
+
+ let bucket_name = match bucket_name {
+ None => {
+ return self
+ .handle_request_without_bucket(req, api_key, endpoint)
+ .await
+ }
+ Some(bucket) => bucket.to_string(),
+ };
+
+ // Special code path for CreateBucket API endpoint
+ if let Endpoint::CreateBucket {} = endpoint {
+ return handle_create_bucket(&garage, req, content_sha256, api_key, bucket_name).await;
+ }
+
+ let bucket_id = garage
+ .bucket_helper()
+ .resolve_bucket(&bucket_name, &api_key)
+ .await?;
+ let bucket = garage
+ .bucket_helper()
+ .get_existing_bucket(bucket_id)
+ .await?;
+
+ let allowed = match endpoint.authorization_type() {
+ Authorization::Read => api_key.allow_read(&bucket_id),
+ Authorization::Write => api_key.allow_write(&bucket_id),
+ Authorization::Owner => api_key.allow_owner(&bucket_id),
+ _ => unreachable!(),
+ };
+
+ if !allowed {
+ return Err(Error::forbidden("Operation is not allowed for this key."));
+ }
+
+ let matching_cors_rule = find_matching_cors_rule(&bucket, &req)?;
+
+ let resp = match endpoint {
+ Endpoint::HeadObject {
+ key, part_number, ..
+ } => handle_head(garage, &req, bucket_id, &key, part_number).await,
+ Endpoint::GetObject {
+ key, part_number, ..
+ } => handle_get(garage, &req, bucket_id, &key, part_number).await,
+ Endpoint::UploadPart {
+ key,
+ part_number,
+ upload_id,
+ } => {
+ handle_put_part(
+ garage,
+ req,
+ bucket_id,
+ &key,
+ part_number,
+ &upload_id,
+ content_sha256,
+ )
+ .await
+ }
+ Endpoint::CopyObject { key } => {
+ handle_copy(garage, &api_key, &req, bucket_id, &key).await
+ }
+ Endpoint::UploadPartCopy {
+ key,
+ part_number,
+ upload_id,
+ } => {
+ handle_upload_part_copy(
+ garage,
+ &api_key,
+ &req,
+ bucket_id,
+ &key,
+ part_number,
+ &upload_id,
+ )
+ .await
+ }
+ Endpoint::PutObject { key } => {
+ handle_put(garage, req, &bucket, &key, content_sha256).await
+ }
+ Endpoint::AbortMultipartUpload { key, upload_id } => {
+ handle_abort_multipart_upload(garage, bucket_id, &key, &upload_id).await
+ }
+ Endpoint::DeleteObject { key, .. } => handle_delete(garage, bucket_id, &key).await,
+ Endpoint::CreateMultipartUpload { key } => {
+ handle_create_multipart_upload(garage, &req, &bucket_name, bucket_id, &key).await
+ }
+ Endpoint::CompleteMultipartUpload { key, upload_id } => {
+ handle_complete_multipart_upload(
+ garage,
+ req,
+ &bucket_name,
+ &bucket,
+ &key,
+ &upload_id,
+ content_sha256,
+ )
+ .await
+ }
+ Endpoint::CreateBucket {} => unreachable!(),
+ Endpoint::HeadBucket {} => {
+ let empty_body: Body = Body::from(vec![]);
+ let response = Response::builder().body(empty_body).unwrap();
+ Ok(response)
+ }
+ Endpoint::DeleteBucket {} => {
+ handle_delete_bucket(&garage, bucket_id, bucket_name, api_key).await
+ }
+ Endpoint::GetBucketLocation {} => handle_get_bucket_location(garage),
+ Endpoint::GetBucketVersioning {} => handle_get_bucket_versioning(),
+ Endpoint::ListObjects {
+ delimiter,
+ encoding_type,
+ marker,
+ max_keys,
+ prefix,
+ } => {
+ handle_list(
+ garage,
+ &ListObjectsQuery {
+ common: ListQueryCommon {
+ bucket_name,
+ bucket_id,
+ delimiter: delimiter.map(|d| d.to_string()),
+ page_size: max_keys.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
+ prefix: prefix.unwrap_or_default(),
+ urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false),
+ },
+ is_v2: false,
+ marker,
+ continuation_token: None,
+ start_after: None,
+ },
+ )
+ .await
+ }
+ Endpoint::ListObjectsV2 {
+ delimiter,
+ encoding_type,
+ max_keys,
+ prefix,
+ continuation_token,
+ start_after,
+ list_type,
+ ..
+ } => {
+ if list_type == "2" {
+ handle_list(
+ garage,
+ &ListObjectsQuery {
+ common: ListQueryCommon {
+ bucket_name,
+ bucket_id,
+ delimiter: delimiter.map(|d| d.to_string()),
+ page_size: max_keys.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
+ urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false),
+ prefix: prefix.unwrap_or_default(),
+ },
+ is_v2: true,
+ marker: None,
+ continuation_token,
+ start_after,
+ },
+ )
+ .await
+ } else {
+ Err(Error::bad_request(format!(
+ "Invalid endpoint: list-type={}",
+ list_type
+ )))
+ }
+ }
+ Endpoint::ListMultipartUploads {
+ delimiter,
+ encoding_type,
+ key_marker,
+ max_uploads,
+ prefix,
+ upload_id_marker,
+ } => {
+ handle_list_multipart_upload(
+ garage,
+ &ListMultipartUploadsQuery {
+ common: ListQueryCommon {
+ bucket_name,
+ bucket_id,
+ delimiter: delimiter.map(|d| d.to_string()),
+ page_size: max_uploads.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
+ prefix: prefix.unwrap_or_default(),
+ urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false),
+ },
+ key_marker,
+ upload_id_marker,
+ },
+ )
+ .await
+ }
+ Endpoint::ListParts {
+ key,
+ max_parts,
+ part_number_marker,
+ upload_id,
+ } => {
+ handle_list_parts(
+ garage,
+ &ListPartsQuery {
+ bucket_name,
+ bucket_id,
+ key,
+ upload_id,
+ part_number_marker: part_number_marker.map(|p| p.clamp(1, 10000)),
+ max_parts: max_parts.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
+ },
+ )
+ .await
+ }
+ Endpoint::DeleteObjects {} => {
+ handle_delete_objects(garage, bucket_id, req, content_sha256).await
+ }
+ Endpoint::GetBucketWebsite {} => handle_get_website(&bucket).await,
+ Endpoint::PutBucketWebsite {} => {
+ handle_put_website(garage, bucket_id, req, content_sha256).await
+ }
+ Endpoint::DeleteBucketWebsite {} => handle_delete_website(garage, bucket_id).await,
+ Endpoint::GetBucketCors {} => handle_get_cors(&bucket).await,
+ Endpoint::PutBucketCors {} => {
+ handle_put_cors(garage, bucket_id, req, content_sha256).await
+ }
+ Endpoint::DeleteBucketCors {} => handle_delete_cors(garage, bucket_id).await,
+ endpoint => Err(Error::NotImplemented(endpoint.name().to_owned())),
+ };
+
+ // If request was a success and we have a CORS rule that applies to it,
+ // add the corresponding CORS headers to the response
+ let mut resp_ok = resp?;
+ if let Some(rule) = matching_cors_rule {
+ add_cors_headers(&mut resp_ok, rule)
+ .ok_or_internal_error("Invalid bucket CORS configuration")?;
+ }
+
+ Ok(resp_ok)
+ }
+}
+
+impl ApiEndpoint for S3ApiEndpoint {
+ fn name(&self) -> &'static str {
+ self.endpoint.name()
+ }
+
+ fn add_span_attributes(&self, span: SpanRef<'_>) {
+ span.set_attribute(KeyValue::new(
+ "bucket",
+ self.bucket_name.clone().unwrap_or_default(),
+ ));
+ }
+}
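
For context, a hedged sketch of how this server is started from a binary, assuming an already-constructed Arc<Garage> (construction elided), a tokio runtime, and the usual defaults of region "garage" and port 3900:

use std::sync::Arc;

use garage_model::garage::Garage;
use garage_util::error::Error as GarageError;

use crate::s3::api_server::S3ApiServer;

async fn serve_s3(garage: Arc<Garage>) -> Result<(), GarageError> {
    let addr = "[::]:3900".parse().unwrap();
    // run() takes any Future<Output = ()> as the graceful-shutdown trigger.
    let shutdown = async {
        tokio::signal::ctrl_c().await.expect("failed to listen for ctrl-c");
    };
    S3ApiServer::run(garage, addr, "garage".to_string(), shutdown).await
}
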
diff --git a/src/api/s3_bucket.rs b/src/api/s3/bucket.rs
index 8a5407d3..3ac6a6ec 100644
--- a/src/api/s3_bucket.rs
+++ b/src/api/s3/bucket.rs
@@ -7,15 +7,15 @@ use garage_model::bucket_alias_table::*;
use garage_model::bucket_table::Bucket;
use garage_model::garage::Garage;
use garage_model::key_table::Key;
-use garage_model::object_table::ObjectFilter;
use garage_model::permission::BucketKeyPerm;
use garage_table::util::*;
use garage_util::crdt::*;
use garage_util::data::*;
use garage_util::time::*;
-use crate::error::*;
-use crate::s3_xml;
+use crate::common_error::CommonError;
+use crate::s3::error::*;
+use crate::s3::xml as s3_xml;
use crate::signature::verify_signed_content;
pub fn handle_get_bucket_location(garage: Arc<Garage>) -> Result<Response<Body>, Error> {
@@ -130,7 +130,7 @@ pub async fn handle_create_bucket(
if let Some(location_constraint) = cmd {
if location_constraint != garage.config.s3_api.s3_region {
- return Err(Error::BadRequest(format!(
+ return Err(Error::bad_request(format!(
"Cannot satisfy location constraint `{}`: buckets can only be created in region `{}`",
location_constraint,
garage.config.s3_api.s3_region
@@ -158,12 +158,12 @@ pub async fn handle_create_bucket(
// otherwise return a forbidden error.
let kp = api_key.bucket_permissions(&bucket_id);
if !(kp.allow_write || kp.allow_owner) {
- return Err(Error::BucketAlreadyExists);
+ return Err(CommonError::BucketAlreadyExists.into());
}
} else {
// Create the bucket!
if !is_valid_bucket_name(&bucket_name) {
- return Err(Error::BadRequest(format!(
+ return Err(Error::bad_request(format!(
"{}: {}",
bucket_name, INVALID_BUCKET_NAME_MESSAGE
)));
@@ -228,12 +228,8 @@ pub async fn handle_delete_bucket(
// Delete bucket
// Check bucket is empty
- let objects = garage
- .object_table
- .get_range(&bucket_id, None, Some(ObjectFilter::IsData), 10)
- .await?;
- if !objects.is_empty() {
- return Err(Error::BucketNotEmpty);
+ if !garage.bucket_helper().is_bucket_empty(bucket_id).await? {
+ return Err(CommonError::BucketNotEmpty.into());
}
// --- done checking, now commit ---
@@ -299,7 +295,6 @@ fn parse_create_bucket_xml(xml_bytes: &[u8]) -> Option<Option<String>> {
let mut ret = None;
for item in cbc.children() {
- println!("{:?}", item);
if item.has_tag_name("LocationConstraint") {
if ret != None {
return None;
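
For reference, the CreateBucket body that parse_create_bucket_xml walks looks like the snippet below; the helper shown is a hypothetical standalone equivalent written with roxmltree (already a dependency of this module), not the actual function:

// Extract the LocationConstraint from a CreateBucket request body, if any.
fn location_constraint(xml: &str) -> Option<String> {
    let doc = roxmltree::Document::parse(xml).ok()?;
    let root = doc.root_element();
    if !root.has_tag_name("CreateBucketConfiguration") {
        return None;
    }
    root.children()
        .find(|n| n.has_tag_name("LocationConstraint"))
        .and_then(|n| n.text())
        .map(str::to_string)
}

fn main() {
    let body = r#"<CreateBucketConfiguration>
        <LocationConstraint>garage</LocationConstraint>
    </CreateBucketConfiguration>"#;
    assert_eq!(location_constraint(body), Some("garage".to_string()));
}
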
diff --git a/src/api/s3_copy.rs b/src/api/s3/copy.rs
index fc4707e2..7eb6459d 100644
--- a/src/api/s3_copy.rs
+++ b/src/api/s3/copy.rs
@@ -5,23 +5,26 @@ use std::time::{Duration, SystemTime, UNIX_EPOCH};
use futures::{stream, stream::Stream, StreamExt, TryFutureExt};
use md5::{Digest as Md5Digest, Md5};
+use bytes::Bytes;
use hyper::{Body, Request, Response};
use serde::Serialize;
+use garage_rpc::netapp::bytes_buf::BytesBuf;
+use garage_rpc::rpc_helper::OrderTag;
use garage_table::*;
use garage_util::data::*;
use garage_util::time::*;
-use garage_model::block_ref_table::*;
use garage_model::garage::Garage;
use garage_model::key_table::Key;
-use garage_model::object_table::*;
-use garage_model::version_table::*;
+use garage_model::s3::block_ref_table::*;
+use garage_model::s3::object_table::*;
+use garage_model::s3::version_table::*;
-use crate::api_server::{parse_bucket_key, resolve_bucket};
-use crate::error::*;
-use crate::s3_put::{decode_upload_id, get_headers};
-use crate::s3_xml::{self, xmlns_tag};
+use crate::helpers::parse_bucket_key;
+use crate::s3::error::*;
+use crate::s3::put::{decode_upload_id, get_headers};
+use crate::s3::xml::{self as s3_xml, xmlns_tag};
pub async fn handle_copy(
garage: Arc<Garage>,
@@ -201,8 +204,8 @@ pub async fn handle_upload_part_copy(
let mut ranges = http_range::HttpRange::parse(range_str, source_version_meta.size)
.map_err(|e| (e, source_version_meta.size))?;
if ranges.len() != 1 {
- return Err(Error::BadRequest(
- "Invalid x-amz-copy-source-range header: exactly 1 range must be given".into(),
+ return Err(Error::bad_request(
+ "Invalid x-amz-copy-source-range header: exactly 1 range must be given",
));
} else {
ranges.pop().unwrap()
@@ -230,8 +233,8 @@ pub async fn handle_upload_part_copy(
 // This only happens for small files; we don't bother handling it.
 // (In AWS, UploadPartCopy only works for parts of at least 5MB,
 // which is never the case for an inline object.)
- return Err(Error::BadRequest(
- "Source object is too small (minimum part size is 5Mb)".into(),
+ return Err(Error::bad_request(
+ "Source object is too small (minimum part size is 5Mb)",
));
}
ObjectVersionData::FirstBlock(_meta, _first_block_hash) => (),
@@ -250,7 +253,7 @@ pub async fn handle_upload_part_copy(
// Check this part number hasn't yet been uploaded
if let Some(dv) = dest_version {
if dv.has_part_number(part_number) {
- return Err(Error::BadRequest(format!(
+ return Err(Error::bad_request(format!(
"Part number {} has already been uploaded",
part_number
)));
@@ -305,13 +308,18 @@ pub async fn handle_upload_part_copy(
// if and only if the block returned is a block that already existed
// in the Garage data store (thus we don't need to save it again).
let garage2 = garage.clone();
+ let order_stream = OrderTag::stream();
let source_blocks = stream::iter(blocks_to_copy)
- .flat_map(|(block_hash, range_to_copy)| {
+ .enumerate()
+ .flat_map(|(i, (block_hash, range_to_copy))| {
let garage3 = garage2.clone();
stream::once(async move {
- let data = garage3.block_manager.rpc_get_block(&block_hash).await?;
+ let data = garage3
+ .block_manager
+ .rpc_get_block(&block_hash, Some(order_stream.order(i as u64)))
+ .await?;
match range_to_copy {
- Some(r) => Ok((data[r].to_vec(), None)),
+ Some(r) => Ok((data.slice(r), None)),
None => Ok((data, Some(block_hash))),
}
})
@@ -413,10 +421,13 @@ async fn get_copy_source(
let copy_source = percent_encoding::percent_decode_str(copy_source).decode_utf8()?;
let (source_bucket, source_key) = parse_bucket_key(&copy_source, None)?;
- let source_bucket_id = resolve_bucket(garage, &source_bucket.to_string(), api_key).await?;
+ let source_bucket_id = garage
+ .bucket_helper()
+ .resolve_bucket(&source_bucket.to_string(), api_key)
+ .await?;
if !api_key.allow_read(&source_bucket_id) {
- return Err(Error::Forbidden(format!(
+ return Err(Error::forbidden(format!(
"Reading from bucket {} not allowed for this key",
source_bucket
)));
@@ -536,8 +547,8 @@ impl CopyPreconditionHeaders {
(None, None, None, Some(ims)) => v_date > *ims,
(None, None, None, None) => true,
_ => {
- return Err(Error::BadRequest(
- "Invalid combination of x-amz-copy-source-if-xxxxx headers".into(),
+ return Err(Error::bad_request(
+ "Invalid combination of x-amz-copy-source-if-xxxxx headers",
))
}
};
@@ -550,13 +561,13 @@ impl CopyPreconditionHeaders {
}
}
-type BlockStreamItemOk = (Vec<u8>, Option<Hash>);
+type BlockStreamItemOk = (Bytes, Option<Hash>);
type BlockStreamItem = Result<BlockStreamItemOk, garage_util::error::Error>;
struct Defragmenter<S: Stream<Item = BlockStreamItem>> {
block_size: usize,
block_stream: Pin<Box<stream::Peekable<S>>>,
- buffer: Vec<u8>,
+ buffer: BytesBuf,
hash: Option<Hash>,
}
@@ -565,7 +576,7 @@ impl<S: Stream<Item = BlockStreamItem>> Defragmenter<S> {
Self {
block_size,
block_stream,
- buffer: vec![],
+ buffer: BytesBuf::new(),
hash: None,
}
}
@@ -583,7 +594,7 @@ impl<S: Stream<Item = BlockStreamItem>> Defragmenter<S> {
if self.buffer.is_empty() {
let (next_block, next_block_hash) = self.block_stream.next().await.unwrap()?;
- self.buffer = next_block;
+ self.buffer.extend(next_block);
self.hash = next_block_hash;
} else if self.buffer.len() + peeked_next_block.len() > self.block_size {
break;
@@ -594,11 +605,11 @@ impl<S: Stream<Item = BlockStreamItem>> Defragmenter<S> {
}
}
- Ok((std::mem::take(&mut self.buffer), self.hash.take()))
+ Ok((self.buffer.take_all(), self.hash.take()))
}
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct CopyObjectResult {
#[serde(rename = "LastModified")]
pub last_modified: s3_xml::Value,
@@ -606,7 +617,7 @@ pub struct CopyObjectResult {
pub etag: s3_xml::Value,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct CopyPartResult {
#[serde(serialize_with = "xmlns_tag")]
pub xmlns: (),
@@ -619,7 +630,7 @@ pub struct CopyPartResult {
#[cfg(test)]
mod tests {
use super::*;
- use crate::s3_xml::to_xml_with_header;
+ use crate::s3::xml::to_xml_with_header;
#[test]
fn copy_object_result() -> Result<(), Error> {
@@ -651,7 +662,6 @@ mod tests {
last_modified: s3_xml::Value("2011-04-11T20:34:56.000Z".into()),
etag: s3_xml::Value("\"9b2cf535f27731c974343645a3985328\"".into()),
};
- println!("{}", to_xml_with_header(&v)?);
assert_eq!(to_xml_with_header(&v)?, expected_retval);
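
The Defragmenter changes above replace the Vec<u8> buffer with netapp's BytesBuf so that accumulated chunks are not recopied. Its coalescing policy, modeled here as a toy sketch on plain vectors: consecutive source blocks are merged until appending the next one would overflow block_size (in the real code a block also keeps its hash, enabling deduplicated copies, only when it passes through alone):

// Toy model of the Defragmenter coalescing policy; not the real code.
fn coalesce(blocks: &[Vec<u8>], block_size: usize) -> Vec<Vec<u8>> {
    let mut out = Vec::new();
    let mut buf: Vec<u8> = Vec::new();
    for b in blocks {
        // Flush the buffer when the next block would not fit anymore.
        if !buf.is_empty() && buf.len() + b.len() > block_size {
            out.push(std::mem::take(&mut buf));
        }
        // An oversized block lands alone in an empty buffer, untouched.
        buf.extend_from_slice(b);
    }
    if !buf.is_empty() {
        out.push(buf);
    }
    out
}
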
diff --git a/src/api/s3_cors.rs b/src/api/s3/cors.rs
index ab77e23a..c7273464 100644
--- a/src/api/s3_cors.rs
+++ b/src/api/s3/cors.rs
@@ -9,13 +9,12 @@ use hyper::{header::HeaderName, Body, Method, Request, Response, StatusCode};
use serde::{Deserialize, Serialize};
-use crate::error::*;
-use crate::s3_xml::{to_xml_with_header, xmlns_tag, IntValue, Value};
+use crate::s3::error::*;
+use crate::s3::xml::{to_xml_with_header, xmlns_tag, IntValue, Value};
use crate::signature::verify_signed_content;
use garage_model::bucket_table::{Bucket, CorsRule as GarageCorsRule};
use garage_model::garage::Garage;
-use garage_table::*;
use garage_util::data::*;
pub async fn handle_get_cors(bucket: &Bucket) -> Result<Response<Body>, Error> {
@@ -48,14 +47,11 @@ pub async fn handle_delete_cors(
bucket_id: Uuid,
) -> Result<Response<Body>, Error> {
let mut bucket = garage
- .bucket_table
- .get(&EmptyKey, &bucket_id)
- .await?
- .ok_or(Error::NoSuchBucket)?;
+ .bucket_helper()
+ .get_existing_bucket(bucket_id)
+ .await?;
- let param = bucket
- .params_mut()
- .ok_or_internal_error("Bucket should not be deleted at this point")?;
+ let param = bucket.params_mut().unwrap();
param.cors_config.update(None);
garage.bucket_table.insert(&bucket).await?;
@@ -78,14 +74,11 @@ pub async fn handle_put_cors(
}
let mut bucket = garage
- .bucket_table
- .get(&EmptyKey, &bucket_id)
- .await?
- .ok_or(Error::NoSuchBucket)?;
+ .bucket_helper()
+ .get_existing_bucket(bucket_id)
+ .await?;
- let param = bucket
- .params_mut()
- .ok_or_internal_error("Bucket should not be deleted at this point")?;
+ let param = bucket.params_mut().unwrap();
let conf: CorsConfiguration = from_reader(&body as &[u8])?;
conf.validate()?;
@@ -119,12 +112,7 @@ pub async fn handle_options_s3api(
let helper = garage.bucket_helper();
let bucket_id = helper.resolve_global_bucket_name(&bn).await?;
if let Some(id) = bucket_id {
- let bucket = garage
- .bucket_table
- .get(&EmptyKey, &id)
- .await?
- .filter(|b| !b.state.is_deleted())
- .ok_or(Error::NoSuchBucket)?;
+ let bucket = garage.bucket_helper().get_existing_bucket(id).await?;
handle_options_for_bucket(req, &bucket)
} else {
// If there is a bucket name in the request, but that name
@@ -185,7 +173,7 @@ pub fn handle_options_for_bucket(
}
}
- Err(Error::Forbidden("This CORS request is not allowed.".into()))
+ Err(Error::forbidden("This CORS request is not allowed."))
}
pub fn find_matching_cors_rule<'a>(
diff --git a/src/api/s3_delete.rs b/src/api/s3/delete.rs
index b243d982..b337155f 100644
--- a/src/api/s3_delete.rs
+++ b/src/api/s3/delete.rs
@@ -6,10 +6,10 @@ use garage_util::data::*;
use garage_util::time::*;
use garage_model::garage::Garage;
-use garage_model::object_table::*;
+use garage_model::s3::object_table::*;
-use crate::error::*;
-use crate::s3_xml;
+use crate::s3::error::*;
+use crate::s3::xml as s3_xml;
use crate::signature::verify_signed_content;
async fn handle_delete_internal(
@@ -64,14 +64,13 @@ pub async fn handle_delete(
bucket_id: Uuid,
key: &str,
) -> Result<Response<Body>, Error> {
- let (_deleted_version, delete_marker_version) =
- handle_delete_internal(&garage, bucket_id, key).await?;
-
- Ok(Response::builder()
- .header("x-amz-version-id", hex::encode(delete_marker_version))
- .status(StatusCode::NO_CONTENT)
- .body(Body::from(vec![]))
- .unwrap())
+ match handle_delete_internal(&garage, bucket_id, key).await {
+ Ok(_) | Err(Error::NoSuchKey) => Ok(Response::builder()
+ .status(StatusCode::NO_CONTENT)
+ .body(Body::from(vec![]))
+ .unwrap()),
+ Err(e) => Err(e),
+ }
}
pub async fn handle_delete_objects(
diff --git a/src/api/error.rs b/src/api/s3/error.rs
index f53ed1fd..67009d63 100644
--- a/src/api/error.rs
+++ b/src/api/s3/error.rs
@@ -2,34 +2,24 @@ use std::convert::TryInto;
use err_derive::Error;
use hyper::header::HeaderValue;
-use hyper::{HeaderMap, StatusCode};
+use hyper::{Body, HeaderMap, StatusCode};
use garage_model::helper::error::Error as HelperError;
-use garage_util::error::Error as GarageError;
-use crate::s3_xml;
+use crate::common_error::CommonError;
+pub use crate::common_error::{CommonErrorDerivative, OkOrBadRequest, OkOrInternalError};
+use crate::generic_server::ApiError;
+use crate::s3::xml as s3_xml;
+use crate::signature::error::Error as SignatureError;
/// Errors of this crate
#[derive(Debug, Error)]
pub enum Error {
- // Category: internal error
- /// Error related to deeper parts of Garage
- #[error(display = "Internal error: {}", _0)]
- InternalError(#[error(source)] GarageError),
-
- /// Error related to Hyper
- #[error(display = "Internal error (Hyper error): {}", _0)]
- Hyper(#[error(source)] hyper::Error),
-
- /// Error related to HTTP
- #[error(display = "Internal error (HTTP error): {}", _0)]
- Http(#[error(source)] http::Error),
+ #[error(display = "{}", _0)]
+ /// Error from common error
+ Common(CommonError),
// Category: cannot process
- /// No proper api key was used, or the signature was invalid
- #[error(display = "Forbidden: {}", _0)]
- Forbidden(String),
-
/// Authorization Header Malformed
#[error(display = "Authorization header malformed, expected scope: {}", _0)]
AuthorizationHeaderMalformed(String),
@@ -38,22 +28,10 @@ pub enum Error {
#[error(display = "Key not found")]
NoSuchKey,
- /// The bucket requested don't exists
- #[error(display = "Bucket not found")]
- NoSuchBucket,
-
 /// The requested multipart upload does not exist
#[error(display = "Upload not found")]
NoSuchUpload,
- /// Tried to create a bucket that already exist
- #[error(display = "Bucket already exists")]
- BucketAlreadyExists,
-
- /// Tried to delete a non-empty bucket
- #[error(display = "Tried to delete a non-empty bucket")]
- BucketNotEmpty,
-
/// Precondition failed (e.g. x-amz-copy-source-if-match)
#[error(display = "At least one of the preconditions you specified did not hold")]
PreconditionFailed,
@@ -80,10 +58,6 @@ pub enum Error {
#[error(display = "Invalid UTF-8: {}", _0)]
InvalidUtf8String(#[error(source)] std::string::FromUtf8Error),
- /// Some base64 encoded data was badly encoded
- #[error(display = "Invalid base64: {}", _0)]
- InvalidBase64(#[error(source)] base64::DecodeError),
-
/// The client sent invalid XML data
#[error(display = "Invalid XML: {}", _0)]
InvalidXml(String),
@@ -96,15 +70,34 @@ pub enum Error {
#[error(display = "Invalid HTTP range: {:?}", _0)]
InvalidRange(#[error(from)] (http_range::HttpRangeParseError, u64)),
- /// The client sent an invalid request
- #[error(display = "Bad request: {}", _0)]
- BadRequest(String),
-
/// The client sent a request for an action not supported by garage
#[error(display = "Unimplemented action: {}", _0)]
NotImplemented(String),
}
+impl<T> From<T> for Error
+where
+ CommonError: From<T>,
+{
+ fn from(err: T) -> Self {
+ Error::Common(CommonError::from(err))
+ }
+}
+
+impl CommonErrorDerivative for Error {}
+
+impl From<HelperError> for Error {
+ fn from(err: HelperError) -> Self {
+ match err {
+ HelperError::Internal(i) => Self::Common(CommonError::InternalError(i)),
+ HelperError::BadRequest(b) => Self::Common(CommonError::BadRequest(b)),
+ HelperError::InvalidBucketName(n) => Self::Common(CommonError::InvalidBucketName(n)),
+ HelperError::NoSuchBucket(n) => Self::Common(CommonError::NoSuchBucket(n)),
+ e => Self::bad_request(format!("{}", e)),
+ }
+ }
+}
+
impl From<roxmltree::Error> for Error {
fn from(err: roxmltree::Error) -> Self {
Self::InvalidXml(format!("{}", err))
@@ -117,88 +110,71 @@ impl From<quick_xml::de::DeError> for Error {
}
}
-impl From<HelperError> for Error {
- fn from(err: HelperError) -> Self {
+impl From<SignatureError> for Error {
+ fn from(err: SignatureError) -> Self {
match err {
- HelperError::Internal(i) => Self::InternalError(i),
- HelperError::BadRequest(b) => Self::BadRequest(b),
+ SignatureError::Common(c) => Self::Common(c),
+ SignatureError::AuthorizationHeaderMalformed(c) => {
+ Self::AuthorizationHeaderMalformed(c)
+ }
+ SignatureError::InvalidUtf8Str(i) => Self::InvalidUtf8Str(i),
+ SignatureError::InvalidHeader(h) => Self::InvalidHeader(h),
}
}
}
impl From<multer::Error> for Error {
fn from(err: multer::Error) -> Self {
- Self::BadRequest(err.to_string())
+ Self::bad_request(err)
}
}
impl Error {
- /// Get the HTTP status code that best represents the meaning of the error for the client
- pub fn http_status_code(&self) -> StatusCode {
- match self {
- Error::NoSuchKey | Error::NoSuchBucket | Error::NoSuchUpload => StatusCode::NOT_FOUND,
- Error::BucketNotEmpty | Error::BucketAlreadyExists => StatusCode::CONFLICT,
- Error::PreconditionFailed => StatusCode::PRECONDITION_FAILED,
- Error::Forbidden(_) => StatusCode::FORBIDDEN,
- Error::InternalError(
- GarageError::Timeout
- | GarageError::RemoteError(_)
- | GarageError::Quorum(_, _, _, _),
- ) => StatusCode::SERVICE_UNAVAILABLE,
- Error::InternalError(_) | Error::Hyper(_) | Error::Http(_) => {
- StatusCode::INTERNAL_SERVER_ERROR
- }
- Error::InvalidRange(_) => StatusCode::RANGE_NOT_SATISFIABLE,
- Error::NotImplemented(_) => StatusCode::NOT_IMPLEMENTED,
- _ => StatusCode::BAD_REQUEST,
- }
- }
-
pub fn aws_code(&self) -> &'static str {
match self {
+ Error::Common(c) => c.aws_code(),
Error::NoSuchKey => "NoSuchKey",
- Error::NoSuchBucket => "NoSuchBucket",
Error::NoSuchUpload => "NoSuchUpload",
- Error::BucketAlreadyExists => "BucketAlreadyExists",
- Error::BucketNotEmpty => "BucketNotEmpty",
Error::PreconditionFailed => "PreconditionFailed",
Error::InvalidPart => "InvalidPart",
Error::InvalidPartOrder => "InvalidPartOrder",
Error::EntityTooSmall => "EntityTooSmall",
- Error::Forbidden(_) => "AccessDenied",
Error::AuthorizationHeaderMalformed(_) => "AuthorizationHeaderMalformed",
Error::NotImplemented(_) => "NotImplemented",
- Error::InternalError(
- GarageError::Timeout
- | GarageError::RemoteError(_)
- | GarageError::Quorum(_, _, _, _),
- ) => "ServiceUnavailable",
- Error::InternalError(_) | Error::Hyper(_) | Error::Http(_) => "InternalError",
- _ => "InvalidRequest",
+ Error::InvalidXml(_) => "MalformedXML",
+ Error::InvalidRange(_) => "InvalidRange",
+ Error::InvalidUtf8Str(_) | Error::InvalidUtf8String(_) | Error::InvalidHeader(_) => {
+ "InvalidRequest"
+ }
}
}
+}
- pub fn aws_xml(&self, garage_region: &str, path: &str) -> String {
- let error = s3_xml::Error {
- code: s3_xml::Value(self.aws_code().to_string()),
- message: s3_xml::Value(format!("{}", self)),
- resource: Some(s3_xml::Value(path.to_string())),
- region: Some(s3_xml::Value(garage_region.to_string())),
- };
- s3_xml::to_xml_with_header(&error).unwrap_or_else(|_| {
- r#"
-<?xml version="1.0" encoding="UTF-8"?>
-<Error>
- <Code>InternalError</Code>
- <Message>XML encoding of error failed</Message>
-</Error>
- "#
- .into()
- })
+impl ApiError for Error {
+ /// Get the HTTP status code that best represents the meaning of the error for the client
+ fn http_status_code(&self) -> StatusCode {
+ match self {
+ Error::Common(c) => c.http_status_code(),
+ Error::NoSuchKey | Error::NoSuchUpload => StatusCode::NOT_FOUND,
+ Error::PreconditionFailed => StatusCode::PRECONDITION_FAILED,
+ Error::InvalidRange(_) => StatusCode::RANGE_NOT_SATISFIABLE,
+ Error::NotImplemented(_) => StatusCode::NOT_IMPLEMENTED,
+ Error::AuthorizationHeaderMalformed(_)
+ | Error::InvalidPart
+ | Error::InvalidPartOrder
+ | Error::EntityTooSmall
+ | Error::InvalidXml(_)
+ | Error::InvalidUtf8Str(_)
+ | Error::InvalidUtf8String(_)
+ | Error::InvalidHeader(_) => StatusCode::BAD_REQUEST,
+ }
}
- pub fn add_headers(&self, header_map: &mut HeaderMap<HeaderValue>) {
+ fn add_http_headers(&self, header_map: &mut HeaderMap<HeaderValue>) {
use hyper::header;
+
+ header_map.append(header::CONTENT_TYPE, "application/xml".parse().unwrap());
+
#[allow(clippy::single_match)]
match self {
Error::InvalidRange((_, len)) => {
@@ -212,68 +188,23 @@ impl Error {
_ => (),
}
}
-}
-
-/// Trait to map error to the Bad Request error code
-pub trait OkOrBadRequest {
- type S;
- fn ok_or_bad_request<M: AsRef<str>>(self, reason: M) -> Result<Self::S, Error>;
-}
-
-impl<T, E> OkOrBadRequest for Result<T, E>
-where
- E: std::fmt::Display,
-{
- type S = T;
- fn ok_or_bad_request<M: AsRef<str>>(self, reason: M) -> Result<T, Error> {
- match self {
- Ok(x) => Ok(x),
- Err(e) => Err(Error::BadRequest(format!("{}: {}", reason.as_ref(), e))),
- }
- }
-}
-
-impl<T> OkOrBadRequest for Option<T> {
- type S = T;
- fn ok_or_bad_request<M: AsRef<str>>(self, reason: M) -> Result<T, Error> {
- match self {
- Some(x) => Ok(x),
- None => Err(Error::BadRequest(reason.as_ref().to_string())),
- }
- }
-}
-
-/// Trait to map an error to an Internal Error code
-pub trait OkOrInternalError {
- type S;
- fn ok_or_internal_error<M: AsRef<str>>(self, reason: M) -> Result<Self::S, Error>;
-}
-
-impl<T, E> OkOrInternalError for Result<T, E>
-where
- E: std::fmt::Display,
-{
- type S = T;
- fn ok_or_internal_error<M: AsRef<str>>(self, reason: M) -> Result<T, Error> {
- match self {
- Ok(x) => Ok(x),
- Err(e) => Err(Error::InternalError(GarageError::Message(format!(
- "{}: {}",
- reason.as_ref(),
- e
- )))),
- }
- }
-}
-impl<T> OkOrInternalError for Option<T> {
- type S = T;
- fn ok_or_internal_error<M: AsRef<str>>(self, reason: M) -> Result<T, Error> {
- match self {
- Some(x) => Ok(x),
- None => Err(Error::InternalError(GarageError::Message(
- reason.as_ref().to_string(),
- ))),
- }
+ fn http_body(&self, garage_region: &str, path: &str) -> Body {
+ let error = s3_xml::Error {
+ code: s3_xml::Value(self.aws_code().to_string()),
+ message: s3_xml::Value(format!("{}", self)),
+ resource: Some(s3_xml::Value(path.to_string())),
+ region: Some(s3_xml::Value(garage_region.to_string())),
+ };
+ Body::from(s3_xml::to_xml_with_header(&error).unwrap_or_else(|_| {
+ r#"
+<?xml version="1.0" encoding="UTF-8"?>
+<Error>
+ <Code>InternalError</Code>
+ <Message>XML encoding of error failed</Message>
+</Error>
+ "#
+ .into()
+ }))
}
}
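
A quick hypothetical test of the mapping after this split, assuming it sits inside the crate next to these modules: S3-specific variants keep their dedicated codes, and everything shared is delegated to CommonError.

use hyper::StatusCode;

use crate::generic_server::ApiError;
use crate::s3::error::Error;

#[test]
fn status_mapping() {
    assert_eq!(Error::NoSuchKey.aws_code(), "NoSuchKey");
    assert_eq!(Error::NoSuchKey.http_status_code(), StatusCode::NOT_FOUND);
    assert_eq!(
        Error::PreconditionFailed.http_status_code(),
        StatusCode::PRECONDITION_FAILED
    );
}
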
diff --git a/src/api/s3_get.rs b/src/api/s3/get.rs
index 7f647e15..2a99551a 100644
--- a/src/api/s3_get.rs
+++ b/src/api/s3/get.rs
@@ -2,22 +2,25 @@
use std::sync::Arc;
use std::time::{Duration, UNIX_EPOCH};
-use futures::stream::*;
+use futures::future;
+use futures::stream::{self, StreamExt};
use http::header::{
ACCEPT_RANGES, CONTENT_LENGTH, CONTENT_RANGE, CONTENT_TYPE, ETAG, IF_MODIFIED_SINCE,
IF_NONE_MATCH, LAST_MODIFIED, RANGE,
};
-use hyper::body::Bytes;
use hyper::{Body, Request, Response, StatusCode};
+use tokio::sync::mpsc;
+use garage_rpc::rpc_helper::{netapp::stream::ByteStream, OrderTag};
use garage_table::EmptyKey;
use garage_util::data::*;
+use garage_util::error::OkOrMessage;
use garage_model::garage::Garage;
-use garage_model::object_table::*;
-use garage_model::version_table::*;
+use garage_model::s3::object_table::*;
+use garage_model::s3::version_table::*;
-use crate::error::*;
+use crate::s3::error::*;
const X_AMZ_MP_PARTS_COUNT: &str = "x-amz-mp-parts-count";
@@ -210,8 +213,8 @@ pub async fn handle_get(
match (part_number, parse_range_header(req, last_v_meta.size)?) {
(Some(_), Some(_)) => {
- return Err(Error::BadRequest(
- "Cannot specify both partNumber and Range header".into(),
+ return Err(Error::bad_request(
+ "Cannot specify both partNumber and Range header",
));
}
(Some(pn), None) => {
@@ -242,36 +245,56 @@ pub async fn handle_get(
Ok(resp_builder.body(body)?)
}
ObjectVersionData::FirstBlock(_, first_block_hash) => {
- let read_first_block = garage.block_manager.rpc_get_block(first_block_hash);
- let get_next_blocks = garage.version_table.get(&last_v.uuid, &EmptyKey);
-
- let (first_block, version) = futures::try_join!(read_first_block, get_next_blocks)?;
- let version = version.ok_or(Error::NoSuchKey)?;
+ let (tx, rx) = mpsc::channel(2);
+
+ let order_stream = OrderTag::stream();
+ let first_block_hash = *first_block_hash;
+ let version_uuid = last_v.uuid;
+
+ tokio::spawn(async move {
+ match async {
+ let garage2 = garage.clone();
+ let version_fut = tokio::spawn(async move {
+ garage2.version_table.get(&version_uuid, &EmptyKey).await
+ });
+
+ let stream_block_0 = garage
+ .block_manager
+ .rpc_get_block_streaming(&first_block_hash, Some(order_stream.order(0)))
+ .await?;
+ tx.send(stream_block_0)
+ .await
+ .ok_or_message("channel closed")?;
+
+ let version = version_fut.await.unwrap()?.ok_or(Error::NoSuchKey)?;
+ for (i, (_, vb)) in version.blocks.items().iter().enumerate().skip(1) {
+ let stream_block_i = garage
+ .block_manager
+ .rpc_get_block_streaming(&vb.hash, Some(order_stream.order(i as u64)))
+ .await?;
+ tx.send(stream_block_i)
+ .await
+ .ok_or_message("channel closed")?;
+ }
- let mut blocks = version
- .blocks
- .items()
- .iter()
- .map(|(_, vb)| (vb.hash, None))
- .collect::<Vec<_>>();
- blocks[0].1 = Some(first_block);
-
- let body_stream = futures::stream::iter(blocks)
- .map(move |(hash, data_opt)| {
- let garage = garage.clone();
- async move {
- if let Some(data) = data_opt {
- Ok(Bytes::from(data))
- } else {
- garage
- .block_manager
- .rpc_get_block(&hash)
- .await
- .map(Bytes::from)
- }
+ Ok::<(), Error>(())
+ }
+ .await
+ {
+ Ok(()) => (),
+ Err(e) => {
+ let err = std::io::Error::new(
+ std::io::ErrorKind::Other,
+ format!("Error while getting object data: {}", e),
+ );
+ let _ = tx
+ .send(Box::pin(stream::once(future::ready(Err(err)))))
+ .await;
}
- })
- .buffered(2);
+ }
+ });
+
+ let body_stream = tokio_stream::wrappers::ReceiverStream::new(rx).flatten();
let body = hyper::body::Body::wrap_stream(body_stream);
Ok(resp_builder.body(body)?)
@@ -302,9 +325,9 @@ async fn handle_get_range(
let body: Body = Body::from(bytes[begin as usize..end as usize].to_vec());
Ok(resp_builder.body(body)?)
} else {
- None.ok_or_internal_error(
+ Err(Error::internal_error(
"Requested range not present in inline bytes when it should have been",
- )
+ ))
}
}
ObjectVersionData::FirstBlock(_meta, _first_block_hash) => {
@@ -422,40 +445,79 @@ fn body_from_blocks_range(
all_blocks.len(),
4 + ((end - begin) / std::cmp::max(all_blocks[0].1.size as u64, 1024)) as usize,
));
- let mut true_offset = 0;
+ let mut block_offset: u64 = 0;
for (_, b) in all_blocks.iter() {
- if true_offset >= end {
+ if block_offset >= end {
break;
}
// Keep only blocks that have an intersection with the requested range
- if true_offset < end && true_offset + b.size > begin {
- blocks.push((*b, true_offset));
+ if block_offset < end && block_offset + b.size > begin {
+ blocks.push((*b, block_offset));
}
- true_offset += b.size;
+ block_offset += b.size as u64;
}
+ let order_stream = OrderTag::stream();
let body_stream = futures::stream::iter(blocks)
- .map(move |(block, true_offset)| {
+ .enumerate()
+ .map(move |(i, (block, block_offset))| {
let garage = garage.clone();
async move {
- let data = garage.block_manager.rpc_get_block(&block.hash).await?;
- let data = Bytes::from(data);
- let start_in_block = if true_offset > begin {
- 0
- } else {
- begin - true_offset
- };
- let end_in_block = if true_offset + block.size < end {
- block.size
- } else {
- end - true_offset
- };
- Result::<Bytes, Error>::Ok(
- data.slice(start_in_block as usize..end_in_block as usize),
- )
+ garage
+ .block_manager
+ .rpc_get_block_streaming(&block.hash, Some(order_stream.order(i as u64)))
+ .await
+ .unwrap_or_else(|e| error_stream(i, e))
+ .scan(block_offset, move |chunk_offset, chunk| {
+ let r = match chunk {
+ Ok(chunk_bytes) => {
+ let chunk_len = chunk_bytes.len() as u64;
+ let r = if *chunk_offset >= end {
+ // The current chunk is after the part we want to read.
+ // Returning None here will stop the scan, the rest of the
+ // stream will be ignored
+ None
+ } else if *chunk_offset + chunk_len <= begin {
+ // The current chunk is before the part we want to read.
+ // We return a None that will be removed by the filter_map
+ // below.
+ Some(None)
+ } else {
+ // The chunk has an intersection with the requested range
+ let start_in_chunk = if *chunk_offset > begin {
+ 0
+ } else {
+ begin - *chunk_offset
+ };
+ let end_in_chunk = if *chunk_offset + chunk_len < end {
+ chunk_len
+ } else {
+ end - *chunk_offset
+ };
+ Some(Some(Ok(chunk_bytes
+ .slice(start_in_chunk as usize..end_in_chunk as usize))))
+ };
+ *chunk_offset += chunk_bytes.len() as u64;
+ r
+ }
+ Err(e) => Some(Some(Err(e))),
+ };
+ futures::future::ready(r)
+ })
+ .filter_map(futures::future::ready)
}
})
- .buffered(2);
+ .buffered(2)
+ .flatten();
hyper::body::Body::wrap_stream(body_stream)
}
+
+fn error_stream(i: usize, e: garage_util::error::Error) -> ByteStream {
+ Box::pin(futures::stream::once(async move {
+ Err(std::io::Error::new(
+ std::io::ErrorKind::Other,
+ format!("Could not get block {}: {}", i, e),
+ ))
+ }))
+}
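
The scan above interleaves the range arithmetic with stream state, which makes it hard to eyeball. The intersection rule itself, extracted into a self-contained sketch over a plain slice (same semantics: a half-open byte range [begin, end) against a chunk starting at a given offset):

// Keep only the part of `chunk` that intersects the requested range.
fn slice_for_range(chunk: &[u8], offset: u64, begin: u64, end: u64) -> Option<&[u8]> {
    let len = chunk.len() as u64;
    if offset >= end || offset + len <= begin {
        return None; // chunk lies entirely outside the requested range
    }
    let start = begin.saturating_sub(offset); // same as: if offset > begin { 0 } else { begin - offset }
    let stop = std::cmp::min(len, end - offset);
    Some(&chunk[start as usize..stop as usize])
}

fn main() {
    let chunk = b"abcdefghij"; // 10 bytes located at offset 100
    assert_eq!(slice_for_range(chunk, 100, 103, 107), Some(&b"defg"[..]));
    assert_eq!(slice_for_range(chunk, 100, 0, 50), None);
}
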
diff --git a/src/api/s3_list.rs b/src/api/s3/list.rs
index 5852fc1b..e5f486c8 100644
--- a/src/api/s3_list.rs
+++ b/src/api/s3/list.rs
@@ -10,15 +10,16 @@ use garage_util::error::Error as GarageError;
use garage_util::time::*;
use garage_model::garage::Garage;
-use garage_model::object_table::*;
-use garage_model::version_table::Version;
+use garage_model::s3::object_table::*;
+use garage_model::s3::version_table::Version;
-use garage_table::EmptyKey;
+use garage_table::{EmptyKey, EnumerationOrder};
use crate::encoding::*;
-use crate::error::*;
-use crate::s3_put;
-use crate::s3_xml;
+use crate::helpers::key_after_prefix;
+use crate::s3::error::*;
+use crate::s3::put as s3_put;
+use crate::s3::xml as s3_xml;
const DUMMY_NAME: &str = "Dummy Key";
const DUMMY_KEY: &str = "GKDummyKey";
@@ -66,8 +67,14 @@ pub async fn handle_list(
let io = |bucket, key, count| {
let t = &garage.object_table;
async move {
- t.get_range(&bucket, key, Some(ObjectFilter::IsData), count)
- .await
+ t.get_range(
+ &bucket,
+ key,
+ Some(ObjectFilter::IsData),
+ count,
+ EnumerationOrder::Forward,
+ )
+ .await
}
};
@@ -165,8 +172,14 @@ pub async fn handle_list_multipart_upload(
let io = |bucket, key, count| {
let t = &garage.object_table;
async move {
- t.get_range(&bucket, key, Some(ObjectFilter::IsUploading), count)
- .await
+ t.get_range(
+ &bucket,
+ key,
+ Some(ObjectFilter::IsUploading),
+ count,
+ EnumerationOrder::Forward,
+ )
+ .await
}
};
@@ -569,13 +582,19 @@ impl ListObjectsQuery {
// representing the key to start with.
(Some(token), _) => match &token[..1] {
"[" => Ok(RangeBegin::IncludingKey {
- key: String::from_utf8(base64::decode(token[1..].as_bytes())?)?,
+ key: String::from_utf8(
+ base64::decode(token[1..].as_bytes())
+ .ok_or_bad_request("Invalid continuation token")?,
+ )?,
fallback_key: None,
}),
"]" => Ok(RangeBegin::AfterKey {
- key: String::from_utf8(base64::decode(token[1..].as_bytes())?)?,
+ key: String::from_utf8(
+ base64::decode(token[1..].as_bytes())
+ .ok_or_bad_request("Invalid continuation token")?,
+ )?,
}),
- _ => Err(Error::BadRequest("Invalid continuation token".to_string())),
+ _ => Err(Error::bad_request("Invalid continuation token")),
},
// StartAfter has defined semantics in the spec:
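
As the match above shows, the "[" and "]" prefixes distinguish IncludingKey from AfterKey continuation tokens, and the remainder is the base64-encoded key. A quick round-trip sketch of that format, assuming the same pre-0.21 base64 API used in the diff:

    fn encode_token(including: bool, key: &str) -> String {
        format!("{}{}", if including { "[" } else { "]" }, base64::encode(key))
    }

    fn decode_token(token: &str) -> Option<(bool, String)> {
        let including = match token.get(..1)? {
            "[" => true,
            "]" => false,
            _ => return None, // corresponds to the "Invalid continuation token" error above
        };
        let key = String::from_utf8(base64::decode(&token[1..]).ok()?).ok()?;
        Some((including, key))
    }

    fn main() {
        let token = encode_token(true, "photos/2022/");
        assert_eq!(decode_token(&token), Some((true, "photos/2022/".to_string())));
    }
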
@@ -923,39 +942,13 @@ fn uriencode_maybe(s: &str, yes: bool) -> s3_xml::Value {
}
}
-const UTF8_BEFORE_LAST_CHAR: char = '\u{10FFFE}';
-
-/// Compute the key after the prefix
-fn key_after_prefix(pfx: &str) -> Option<String> {
- let mut next = pfx.to_string();
- while !next.is_empty() {
- let tail = next.pop().unwrap();
- if tail >= char::MAX {
- continue;
- }
-
- // Circumvent a limitation of RangeFrom that overflow earlier than needed
- // See: https://doc.rust-lang.org/core/ops/struct.RangeFrom.html
- let new_tail = if tail == UTF8_BEFORE_LAST_CHAR {
- char::MAX
- } else {
- (tail..).nth(1).unwrap()
- };
-
- next.push(new_tail);
- return Some(next);
- }
-
- None
-}
-
/*
* Unit tests of this module
*/
#[cfg(test)]
mod tests {
use super::*;
- use garage_model::version_table::*;
+ use garage_model::s3::version_table::*;
use garage_util::*;
use std::iter::FromIterator;
@@ -1003,39 +996,6 @@ mod tests {
}
#[test]
- fn test_key_after_prefix() {
- assert_eq!(UTF8_BEFORE_LAST_CHAR as u32, (char::MAX as u32) - 1);
- assert_eq!(key_after_prefix("a/b/").unwrap().as_str(), "a/b0");
- assert_eq!(key_after_prefix("€").unwrap().as_str(), "₭");
- assert_eq!(
- key_after_prefix("􏿽").unwrap().as_str(),
- String::from(char::from_u32(0x10FFFE).unwrap())
- );
-
- // When the last character is the biggest UTF8 char
- let a = String::from_iter(['a', char::MAX].iter());
- assert_eq!(key_after_prefix(a.as_str()).unwrap().as_str(), "b");
-
- // When all characters are the biggest UTF8 char
- let b = String::from_iter([char::MAX; 3].iter());
- assert!(key_after_prefix(b.as_str()).is_none());
-
- // Check utf8 surrogates
- let c = String::from('\u{D7FF}');
- assert_eq!(
- key_after_prefix(c.as_str()).unwrap().as_str(),
- String::from('\u{E000}')
- );
-
- // Check the character before the biggest one
- let d = String::from('\u{10FFFE}');
- assert_eq!(
- key_after_prefix(d.as_str()).unwrap().as_str(),
- String::from(char::MAX)
- );
- }
-
- #[test]
fn test_common_prefixes() {
let mut query = query();
let objs = objs();
diff --git a/src/api/s3/mod.rs b/src/api/s3/mod.rs
new file mode 100644
index 00000000..7b56d4d8
--- /dev/null
+++ b/src/api/s3/mod.rs
@@ -0,0 +1,15 @@
+pub mod api_server;
+pub mod error;
+
+mod bucket;
+mod copy;
+pub mod cors;
+mod delete;
+pub mod get;
+mod list;
+mod post_object;
+mod put;
+mod website;
+
+mod router;
+pub mod xml;
diff --git a/src/api/s3_post_object.rs b/src/api/s3/post_object.rs
index 585e0304..d063faa4 100644
--- a/src/api/s3_post_object.rs
+++ b/src/api/s3/post_object.rs
@@ -14,16 +14,15 @@ use serde::Deserialize;
use garage_model::garage::Garage;
-use crate::api_server::resolve_bucket;
-use crate::error::*;
-use crate::s3_put::{get_headers, save_stream};
-use crate::s3_xml;
+use crate::s3::error::*;
+use crate::s3::put::{get_headers, save_stream};
+use crate::s3::xml as s3_xml;
use crate::signature::payload::{parse_date, verify_v4};
pub async fn handle_post_object(
garage: Arc<Garage>,
req: Request<Body>,
- bucket: String,
+ bucket_name: String,
) -> Result<Response<Body>, Error> {
let boundary = req
.headers()
@@ -48,9 +47,7 @@ pub async fn handle_post_object(
let field = if let Some(field) = multipart.next_field().await? {
field
} else {
- return Err(Error::BadRequest(
- "Request did not contain a file".to_owned(),
- ));
+ return Err(Error::bad_request("Request did not contain a file"));
};
let name: HeaderName = if let Some(Ok(name)) = field.name().map(TryInto::try_into) {
name
@@ -66,14 +63,14 @@ pub async fn handle_post_object(
"tag" => (/* tag need to be reencoded, but we don't support them yet anyway */),
"acl" => {
if params.insert("x-amz-acl", content).is_some() {
- return Err(Error::BadRequest(
- "Field 'acl' provided more than one time".to_string(),
+ return Err(Error::bad_request(
+ "Field 'acl' provided more than one time",
));
}
}
_ => {
if params.insert(&name, content).is_some() {
- return Err(Error::BadRequest(format!(
+ return Err(Error::bad_request(format!(
"Field '{}' provided more than one time",
name
)));
@@ -90,9 +87,7 @@ pub async fn handle_post_object(
.to_str()?;
let credential = params
.get("x-amz-credential")
- .ok_or_else(|| {
- Error::Forbidden("Garage does not support anonymous access yet".to_string())
- })?
+ .ok_or_else(|| Error::forbidden("Garage does not support anonymous access yet"))?
.to_str()?;
let policy = params
.get("policy")
@@ -119,17 +114,31 @@ pub async fn handle_post_object(
};
let date = parse_date(date)?;
- let api_key = verify_v4(&garage, credential, &date, signature, policy.as_bytes()).await?;
+ let api_key = verify_v4(
+ &garage,
+ "s3",
+ credential,
+ &date,
+ signature,
+ policy.as_bytes(),
+ )
+ .await?;
- let bucket_id = resolve_bucket(&garage, &bucket, &api_key).await?;
+ let bucket_id = garage
+ .bucket_helper()
+ .resolve_bucket(&bucket_name, &api_key)
+ .await?;
if !api_key.allow_write(&bucket_id) {
- return Err(Error::Forbidden(
- "Operation is not allowed for this key.".to_string(),
- ));
+ return Err(Error::forbidden("Operation is not allowed for this key."));
}
- let decoded_policy = base64::decode(&policy)?;
+ let bucket = garage
+ .bucket_helper()
+ .get_existing_bucket(bucket_id)
+ .await?;
+
+ let decoded_policy = base64::decode(&policy).ok_or_bad_request("Invalid policy")?;
let decoded_policy: Policy =
serde_json::from_slice(&decoded_policy).ok_or_bad_request("Invalid policy")?;
@@ -137,9 +146,7 @@ pub async fn handle_post_object(
.ok_or_bad_request("Invalid expiration date")?
.into();
if Utc::now() - expiration > Duration::zero() {
- return Err(Error::BadRequest(
- "Expiration date is in the paste".to_string(),
- ));
+ return Err(Error::bad_request("Expiration date is in the past"));
}
let mut conditions = decoded_policy.into_conditions()?;
@@ -151,7 +158,7 @@ pub async fn handle_post_object(
"policy" | "x-amz-signature" => (), // this is always accepted, as it's required to validate other fields
"content-type" => {
let conds = conditions.params.remove("content-type").ok_or_else(|| {
- Error::BadRequest(format!("Key '{}' is not allowed in policy", param_key))
+ Error::bad_request(format!("Key '{}' is not allowed in policy", param_key))
})?;
for cond in conds {
let ok = match cond {
@@ -161,7 +168,7 @@ pub async fn handle_post_object(
}
};
if !ok {
- return Err(Error::BadRequest(format!(
+ return Err(Error::bad_request(format!(
"Key '{}' has value not allowed in policy",
param_key
)));
@@ -170,7 +177,7 @@ pub async fn handle_post_object(
}
"key" => {
let conds = conditions.params.remove("key").ok_or_else(|| {
- Error::BadRequest(format!("Key '{}' is not allowed in policy", param_key))
+ Error::bad_request(format!("Key '{}' is not allowed in policy", param_key))
})?;
for cond in conds {
let ok = match cond {
@@ -178,7 +185,7 @@ pub async fn handle_post_object(
Operation::StartsWith(s) => key.starts_with(&s),
};
if !ok {
- return Err(Error::BadRequest(format!(
+ return Err(Error::bad_request(format!(
"Key '{}' has value not allowed in policy",
param_key
)));
@@ -193,7 +200,7 @@ pub async fn handle_post_object(
continue;
}
let conds = conditions.params.remove(&param_key).ok_or_else(|| {
- Error::BadRequest(format!("Key '{}' is not allowed in policy", param_key))
+ Error::bad_request(format!("Key '{}' is not allowed in policy", param_key))
})?;
for cond in conds {
let ok = match cond {
@@ -201,7 +208,7 @@ pub async fn handle_post_object(
Operation::StartsWith(s) => value.to_str()?.starts_with(s.as_str()),
};
if !ok {
- return Err(Error::BadRequest(format!(
+ return Err(Error::bad_request(format!(
"Key '{}' has value not allowed in policy",
param_key
)));
@@ -212,7 +219,7 @@ pub async fn handle_post_object(
}
if let Some((param_key, _)) = conditions.params.iter().next() {
- return Err(Error::BadRequest(format!(
+ return Err(Error::bad_request(format!(
"Key '{}' is required in policy, but no value was provided",
param_key
)));
@@ -225,7 +232,7 @@ pub async fn handle_post_object(
garage,
headers,
StreamLimiter::new(stream, conditions.content_length),
- bucket_id,
+ &bucket,
&key,
None,
None,
@@ -242,7 +249,7 @@ pub async fn handle_post_object(
{
target
.query_pairs_mut()
- .append_pair("bucket", &bucket)
+ .append_pair("bucket", &bucket_name)
.append_pair("key", &key)
.append_pair("etag", &etag);
let target = target.to_string();
@@ -287,7 +294,7 @@ pub async fn handle_post_object(
let xml = s3_xml::PostObject {
xmlns: (),
location: s3_xml::Value(location),
- bucket: s3_xml::Value(bucket),
+ bucket: s3_xml::Value(bucket_name),
key: s3_xml::Value(key),
etag: s3_xml::Value(etag),
};
@@ -318,7 +325,7 @@ impl Policy {
match condition {
PolicyCondition::Equal(map) => {
if map.len() != 1 {
- return Err(Error::BadRequest("Invalid policy item".to_owned()));
+ return Err(Error::bad_request("Invalid policy item"));
}
let (mut k, v) = map.into_iter().next().expect("size was verified");
k.make_ascii_lowercase();
@@ -326,7 +333,7 @@ impl Policy {
}
PolicyCondition::OtherOp([cond, mut key, value]) => {
if key.remove(0) != '$' {
- return Err(Error::BadRequest("Invalid policy item".to_owned()));
+ return Err(Error::bad_request("Invalid policy item"));
}
key.make_ascii_lowercase();
match cond.as_str() {
@@ -339,7 +346,7 @@ impl Policy {
.or_default()
.push(Operation::StartsWith(value));
}
- _ => return Err(Error::BadRequest("Invalid policy item".to_owned())),
+ _ => return Err(Error::bad_request("Invalid policy item")),
}
}
PolicyCondition::SizeRange(key, min, max) => {
@@ -347,7 +354,7 @@ impl Policy {
length.0 = length.0.max(min);
length.1 = length.1.min(max);
} else {
- return Err(Error::BadRequest("Invalid policy item".to_owned()));
+ return Err(Error::bad_request("Invalid policy item"));
}
}
}
@@ -412,15 +419,15 @@ where
self.read += bytes.len() as u64;
// optimization to fail early when we know before the end it's too long
if self.length.end() < &self.read {
- return Poll::Ready(Some(Err(Error::BadRequest(
- "File size does not match policy".to_owned(),
+ return Poll::Ready(Some(Err(Error::bad_request(
+ "File size does not match policy",
))));
}
}
Poll::Ready(None) => {
if !self.length.contains(&self.read) {
- return Poll::Ready(Some(Err(Error::BadRequest(
- "File size does not match policy".to_owned(),
+ return Poll::Ready(Some(Err(Error::bad_request(
+ "File size does not match policy",
))));
}
}
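
The PolicyCondition variants validated above correspond to the three condition shapes an S3 POST policy document may contain. An illustrative policy of that shape (all field values are hypothetical):

    fn main() {
        // { "field": "value" }              -> PolicyCondition::Equal
        // [ "starts-with", "$field", ... ]  -> PolicyCondition::OtherOp
        // [ "content-length-range", a, b ]  -> PolicyCondition::SizeRange
        let policy_json = r#"{
            "expiration": "2022-10-05T00:00:00Z",
            "conditions": [
                { "acl": "public-read" },
                [ "starts-with", "$key", "user/uploads/" ],
                [ "content-length-range", 1048576, 10485760 ]
            ]
        }"#;
        println!("{}", policy_json);
    }
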
diff --git a/src/api/s3_put.rs b/src/api/s3/put.rs
index ed0bf00b..97b8e4e3 100644
--- a/src/api/s3_put.rs
+++ b/src/api/s3/put.rs
@@ -1,4 +1,4 @@
-use std::collections::{BTreeMap, BTreeSet, VecDeque};
+use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::sync::Arc;
use futures::prelude::*;
@@ -8,25 +8,34 @@ use hyper::{Request, Response};
use md5::{digest::generic_array::*, Digest as Md5Digest, Md5};
use sha2::Sha256;
+use opentelemetry::{
+ trace::{FutureExt as OtelFutureExt, TraceContextExt, Tracer},
+ Context,
+};
+
+use garage_rpc::netapp::bytes_buf::BytesBuf;
use garage_table::*;
+use garage_util::async_hash::*;
use garage_util::data::*;
use garage_util::error::Error as GarageError;
use garage_util::time::*;
use garage_block::manager::INLINE_THRESHOLD;
-use garage_model::block_ref_table::*;
+use garage_model::bucket_table::Bucket;
use garage_model::garage::Garage;
-use garage_model::object_table::*;
-use garage_model::version_table::*;
+use garage_model::index_counter::CountedItem;
+use garage_model::s3::block_ref_table::*;
+use garage_model::s3::object_table::*;
+use garage_model::s3::version_table::*;
-use crate::error::*;
-use crate::s3_xml;
+use crate::s3::error::*;
+use crate::s3::xml as s3_xml;
use crate::signature::verify_signed_content;
pub async fn handle_put(
garage: Arc<Garage>,
req: Request<Body>,
- bucket_id: Uuid,
+ bucket: &Bucket,
key: &str,
content_sha256: Option<Hash>,
) -> Result<Response<Body>, Error> {
@@ -46,7 +55,7 @@ pub async fn handle_put(
garage,
headers,
body,
- bucket_id,
+ bucket,
key,
content_md5,
content_sha256,
@@ -59,7 +68,7 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
garage: Arc<Garage>,
headers: ObjectVersionHeaders,
body: S,
- bucket_id: Uuid,
+ bucket: &Bucket,
key: &str,
content_md5: Option<String>,
content_sha256: Option<FixedBytes32>,
@@ -80,6 +89,7 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
let data_md5sum_hex = hex::encode(data_md5sum);
let data_sha256sum = sha256sum(&first_block[..]);
+ let size = first_block.len() as u64;
ensure_checksum_matches(
data_md5sum.as_slice(),
@@ -88,20 +98,22 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
content_sha256,
)?;
+ check_quotas(&garage, bucket, key, size).await?;
+
let object_version = ObjectVersion {
uuid: version_uuid,
timestamp: version_timestamp,
state: ObjectVersionState::Complete(ObjectVersionData::Inline(
ObjectVersionMeta {
headers,
- size: first_block.len() as u64,
+ size,
etag: data_md5sum_hex.clone(),
},
- first_block,
+ first_block.to_vec(),
)),
};
- let object = Object::new(bucket_id, key.into(), vec![object_version]);
+ let object = Object::new(bucket.id, key.into(), vec![object_version]);
garage.object_table.insert(&object).await?;
return Ok((version_uuid, data_md5sum_hex));
@@ -114,36 +126,42 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
timestamp: version_timestamp,
state: ObjectVersionState::Uploading(headers.clone()),
};
- let object = Object::new(bucket_id, key.into(), vec![object_version.clone()]);
+ let object = Object::new(bucket.id, key.into(), vec![object_version.clone()]);
garage.object_table.insert(&object).await?;
// Initialize corresponding entry in version table
// Write this entry now, even with an empty block list,
// to prevent block_ref entries from being deleted (they can be deleted
// if they reference a version that isn't found in the version table)
- let version = Version::new(version_uuid, bucket_id, key.into(), false);
+ let version = Version::new(version_uuid, bucket.id, key.into(), false);
garage.version_table.insert(&version).await?;
// Transfer data and verify checksum
- let first_block_hash = blake2sum(&first_block[..]);
- let tx_result = read_and_put_blocks(
- &garage,
- &version,
- 1,
- first_block,
- first_block_hash,
- &mut chunker,
- )
- .await
- .and_then(|(total_size, data_md5sum, data_sha256sum)| {
+ let first_block_hash = async_blake2sum(first_block.clone()).await;
+
+ let tx_result = (|| async {
+ let (total_size, data_md5sum, data_sha256sum) = read_and_put_blocks(
+ &garage,
+ &version,
+ 1,
+ first_block,
+ first_block_hash,
+ &mut chunker,
+ )
+ .await?;
+
ensure_checksum_matches(
data_md5sum.as_slice(),
data_sha256sum,
content_md5.as_deref(),
content_sha256,
- )
- .map(|()| (total_size, data_md5sum))
- });
+ )?;
+
+ check_quotas(&garage, bucket, key, total_size).await?;
+
+ Ok((total_size, data_md5sum))
+ })()
+ .await;
// If something went wrong, clean up
let (total_size, md5sum_arr) = match tx_result {
@@ -151,7 +169,7 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
Err(e) => {
// Mark object as aborted, this will free the blocks further down
object_version.state = ObjectVersionState::Aborted;
- let object = Object::new(bucket_id, key.into(), vec![object_version.clone()]);
+ let object = Object::new(bucket.id, key.into(), vec![object_version.clone()]);
garage.object_table.insert(&object).await?;
return Err(e);
}
@@ -167,7 +185,7 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
},
first_block_hash,
));
- let object = Object::new(bucket_id, key.into(), vec![object_version]);
+ let object = Object::new(bucket.id, key.into(), vec![object_version]);
garage.object_table.insert(&object).await?;
Ok((version_uuid, md5sum_hex))
@@ -183,8 +201,8 @@ fn ensure_checksum_matches(
) -> Result<(), Error> {
if let Some(expected_sha256) = content_sha256 {
if expected_sha256 != data_sha256sum {
- return Err(Error::BadRequest(
- "Unable to validate x-amz-content-sha256".to_string(),
+ return Err(Error::bad_request(
+ "Unable to validate x-amz-content-sha256",
));
} else {
trace!("Successfully validated x-amz-content-sha256");
@@ -192,9 +210,7 @@ fn ensure_checksum_matches(
}
if let Some(expected_md5) = content_md5 {
if expected_md5.trim_matches('"') != base64::encode(data_md5sum) {
- return Err(Error::BadRequest(
- "Unable to validate content-md5".to_string(),
- ));
+ return Err(Error::bad_request("Unable to validate content-md5"));
} else {
trace!("Successfully validated content-md5");
}
@@ -202,18 +218,85 @@ fn ensure_checksum_matches(
Ok(())
}
+/// Check that inserting this object with this size doesn't exceed bucket quotas
+async fn check_quotas(
+ garage: &Arc<Garage>,
+ bucket: &Bucket,
+ key: &str,
+ size: u64,
+) -> Result<(), Error> {
+ let quotas = bucket.state.as_option().unwrap().quotas.get();
+ if quotas.max_objects.is_none() && quotas.max_size.is_none() {
+ return Ok(());
+ };
+
+ let key = key.to_string();
+ let (prev_object, counters) = futures::try_join!(
+ garage.object_table.get(&bucket.id, &key),
+ garage.object_counter_table.table.get(&bucket.id, &EmptyKey),
+ )?;
+
+ let counters = counters
+ .map(|x| x.filtered_values(&garage.system.ring.borrow()))
+ .unwrap_or_default();
+
+ let (prev_cnt_obj, prev_cnt_size) = match prev_object {
+ Some(o) => {
+ let prev_cnt = o.counts().into_iter().collect::<HashMap<_, _>>();
+ (
+ prev_cnt.get(OBJECTS).cloned().unwrap_or_default(),
+ prev_cnt.get(BYTES).cloned().unwrap_or_default(),
+ )
+ }
+ None => (0, 0),
+ };
+ let cnt_obj_diff = 1 - prev_cnt_obj;
+ let cnt_size_diff = size as i64 - prev_cnt_size;
+
+ if let Some(mo) = quotas.max_objects {
+ let current_objects = counters.get(OBJECTS).cloned().unwrap_or_default();
+ if cnt_obj_diff > 0 && current_objects + cnt_obj_diff > mo as i64 {
+ return Err(Error::forbidden(format!(
+ "Object quota is reached, maximum objects for this bucket: {}",
+ mo
+ )));
+ }
+ }
+
+ if let Some(ms) = quotas.max_size {
+ let current_size = counters.get(BYTES).cloned().unwrap_or_default();
+ if cnt_size_diff > 0 && current_size + cnt_size_diff > ms as i64 {
+ return Err(Error::forbidden(format!(
+ "Bucket size quota is reached, maximum total size of objects for this bucket: {}. The bucket is already {} bytes, and this object would add {} bytes.",
+ ms, current_size, size
+ )));
+ }
+ }
+
+ Ok(())
+}
+
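Note that the diffs are computed against the previous version of the same key: overwriting an existing 8 MiB object with a 3 MiB one gives cnt_obj_diff = 1 - 1 = 0 and a negative cnt_size_diff, so neither guard can trip. The quota checks only reject writes that would push a counter past its limit.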
async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
garage: &Garage,
version: &Version,
part_number: u64,
- first_block: Vec<u8>,
+ first_block: Bytes,
first_block_hash: Hash,
chunker: &mut StreamChunker<S>,
) -> Result<(u64, GenericArray<u8, typenum::U16>, Hash), Error> {
- let mut md5hasher = Md5::new();
- let mut sha256hasher = Sha256::new();
- md5hasher.update(&first_block[..]);
- sha256hasher.update(&first_block[..]);
+ let tracer = opentelemetry::global::tracer("garage");
+
+ let md5hasher = AsyncHasher::<Md5>::new();
+ let sha256hasher = AsyncHasher::<Sha256>::new();
+
+ futures::future::join(
+ md5hasher.update(first_block.clone()),
+ sha256hasher.update(first_block.clone()),
+ )
+ .with_context(Context::current_with_span(
+ tracer.start("Hash first block (md5, sha256)"),
+ ))
+ .await;
let mut next_offset = first_block.len();
let mut put_curr_version_block = put_block_meta(
@@ -235,9 +318,15 @@ async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
chunker.next(),
)?;
if let Some(block) = next_block {
- md5hasher.update(&block[..]);
- sha256hasher.update(&block[..]);
- let block_hash = blake2sum(&block[..]);
+ let (_, _, block_hash) = futures::future::join3(
+ md5hasher.update(block.clone()),
+ sha256hasher.update(block.clone()),
+ async_blake2sum(block.clone()),
+ )
+ .with_context(Context::current_with_span(
+ tracer.start("Hash block (md5, sha256, blake2)"),
+ ))
+ .await;
let block_len = block.len();
put_curr_version_block = put_block_meta(
garage,
@@ -255,9 +344,9 @@ async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
}
let total_size = next_offset as u64;
- let data_md5sum = md5hasher.finalize();
+ let data_md5sum = md5hasher.finalize().await;
- let data_sha256sum = sha256hasher.finalize();
+ let data_sha256sum = sha256hasher.finalize().await;
let data_sha256sum = Hash::try_from(&data_sha256sum[..]).unwrap();
Ok((total_size, data_md5sum, data_sha256sum))
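
The AsyncHasher used above offloads CPU-heavy digest computation so large blocks don't stall the async executor; the real implementation lives in garage_util::async_hash. A stripped-down sketch of the same idea using spawn_blocking directly, assuming the md-5, hex, bytes, and tokio crates:

    use md5::{Digest, Md5};

    // Hash on the blocking thread pool so the reactor stays responsive.
    async fn async_md5(data: bytes::Bytes) -> Vec<u8> {
        tokio::task::spawn_blocking(move || {
            let mut h = Md5::new();
            h.update(&data[..]);
            h.finalize().to_vec()
        })
        .await
        .unwrap()
    }

    #[tokio::main]
    async fn main() {
        let digest = async_md5(bytes::Bytes::from_static(b"hello")).await;
        assert_eq!(hex::encode(digest), "5d41402abc4b2a76b9719d911017c592");
    }
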
@@ -297,7 +386,7 @@ struct StreamChunker<S: Stream<Item = Result<Bytes, Error>>> {
stream: S,
read_all: bool,
block_size: usize,
- buf: VecDeque<u8>,
+ buf: BytesBuf,
}
impl<S: Stream<Item = Result<Bytes, Error>> + Unpin> StreamChunker<S> {
@@ -306,11 +395,11 @@ impl<S: Stream<Item = Result<Bytes, Error>> + Unpin> StreamChunker<S> {
stream,
read_all: false,
block_size,
- buf: VecDeque::with_capacity(2 * block_size),
+ buf: BytesBuf::new(),
}
}
- async fn next(&mut self) -> Result<Option<Vec<u8>>, Error> {
+ async fn next(&mut self) -> Result<Option<Bytes>, Error> {
while !self.read_all && self.buf.len() < self.block_size {
if let Some(block) = self.stream.next().await {
let bytes = block?;
@@ -323,12 +412,8 @@ impl<S: Stream<Item = Result<Bytes, Error>> + Unpin> StreamChunker<S> {
if self.buf.is_empty() {
Ok(None)
- } else if self.buf.len() <= self.block_size {
- let block = self.buf.drain(..).collect::<Vec<u8>>();
- Ok(Some(block))
} else {
- let block = self.buf.drain(..self.block_size).collect::<Vec<u8>>();
- Ok(Some(block))
+ Ok(Some(self.buf.take_max(self.block_size)))
}
}
}
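
The only contract the chunker relies on from BytesBuf is the one visible above: take_max removes and returns at most block_size bytes from the front of the buffer. A toy model of that contract (ToyBuf is hypothetical, not the netapp type):

    use bytes::Bytes;

    struct ToyBuf(Vec<u8>);

    impl ToyBuf {
        fn len(&self) -> usize {
            self.0.len()
        }
        // Remove and return at most `max` bytes from the front of the buffer.
        fn take_max(&mut self, max: usize) -> Bytes {
            let n = self.0.len().min(max);
            Bytes::from(self.0.drain(..n).collect::<Vec<u8>>())
        }
    }

    fn main() {
        let mut buf = ToyBuf(b"hello world".to_vec());
        let first = buf.take_max(5);
        assert_eq!(&first[..], b"hello");
        let rest = buf.take_max(100); // fewer than 100 bytes left: return them all
        assert_eq!(&rest[..], b" world");
        assert_eq!(buf.len(), 0);
    }
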
@@ -428,7 +513,7 @@ pub async fn handle_put_part(
// Check part hasn't already been uploaded
if let Some(v) = version {
if v.has_part_number(part_number) {
- return Err(Error::BadRequest(format!(
+ return Err(Error::bad_request(format!(
"Part number {} has already been uploaded",
part_number
)));
@@ -437,7 +522,9 @@ pub async fn handle_put_part(
// Copy block to store
let version = Version::new(version_uuid, bucket_id, key, false);
- let first_block_hash = blake2sum(&first_block[..]);
+
+ let first_block_hash = async_blake2sum(first_block.clone()).await;
+
let (_, data_md5sum, data_sha256sum) = read_and_put_blocks(
&garage,
&version,
@@ -475,7 +562,7 @@ pub async fn handle_complete_multipart_upload(
garage: Arc<Garage>,
req: Request<Body>,
bucket_name: &str,
- bucket_id: Uuid,
+ bucket: &Bucket,
key: &str,
upload_id: &str,
content_sha256: Option<Hash>,
@@ -499,7 +586,7 @@ pub async fn handle_complete_multipart_upload(
// Get object and version
let key = key.to_string();
let (object, version) = futures::try_join!(
- garage.object_table.get(&bucket_id, &key),
+ garage.object_table.get(&bucket.id, &key),
garage.version_table.get(&version_uuid, &EmptyKey),
)?;
@@ -513,7 +600,7 @@ pub async fn handle_complete_multipart_upload(
let version = version.ok_or(Error::NoSuchKey)?;
if version.blocks.is_empty() {
- return Err(Error::BadRequest("No data was uploaded".to_string()));
+ return Err(Error::bad_request("No data was uploaded"));
}
let headers = match object_version.state {
@@ -574,8 +661,8 @@ pub async fn handle_complete_multipart_upload(
.map(|x| x.part_number)
.eq(block_parts.into_iter());
if !same_parts {
- return Err(Error::BadRequest(
- "Part numbers in block list and part list do not match. This can happen if a part was partially uploaded. Please abort the multipart upload and try again.".into(),
+ return Err(Error::bad_request(
+ "Part numbers in block list and part list do not match. This can happen if a part was partially uploaded. Please abort the multipart upload and try again."
));
}
@@ -592,6 +679,14 @@ pub async fn handle_complete_multipart_upload(
// Calculate total size of final object
let total_size = version.blocks.items().iter().map(|x| x.1.size).sum();
+ if let Err(e) = check_quotas(&garage, bucket, &key, total_size).await {
+ object_version.state = ObjectVersionState::Aborted;
+ let final_object = Object::new(bucket.id, key.clone(), vec![object_version]);
+ garage.object_table.insert(&final_object).await?;
+
+ return Err(e);
+ }
+
// Write final object version
object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock(
ObjectVersionMeta {
@@ -602,7 +697,7 @@ pub async fn handle_complete_multipart_upload(
version.blocks.items()[0].1.hash,
));
- let final_object = Object::new(bucket_id, key.clone(), vec![object_version]);
+ let final_object = Object::new(bucket.id, key.clone(), vec![object_version]);
garage.object_table.insert(&final_object).await?;
// Send response saying ok we're done
diff --git a/src/api/s3_router.rs b/src/api/s3/router.rs
index 95a7eceb..44f581ff 100644
--- a/src/api/s3_router.rs
+++ b/src/api/s3/router.rs
@@ -1,131 +1,13 @@
-use crate::error::{Error, OkOrBadRequest};
-
use std::borrow::Cow;
use hyper::header::HeaderValue;
use hyper::{HeaderMap, Method, Request};
-/// This macro is used to generate very repetitive match {} blocks in this module
-/// It is _not_ made to be used anywhere else
-macro_rules! s3_match {
- (@match $enum:expr , [ $($endpoint:ident,)* ]) => {{
- // usage: s3_match {@match my_enum, [ VariantWithField1, VariantWithField2 ..] }
- // returns true if the variant was one of the listed variants, false otherwise.
- use Endpoint::*;
- match $enum {
- $(
- $endpoint { .. } => true,
- )*
- _ => false
- }
- }};
- (@extract $enum:expr , $param:ident, [ $($endpoint:ident,)* ]) => {{
- // usage: s3_match {@extract my_enum, field_name, [ VariantWithField1, VariantWithField2 ..] }
- // returns Some(field_value), or None if the variant was not one of the listed variants.
- use Endpoint::*;
- match $enum {
- $(
- $endpoint {$param, ..} => Some($param),
- )*
- _ => None
- }
- }};
- (@gen_parser ($keyword:expr, $key:expr, $query:expr, $header:expr),
- key: [$($kw_k:ident $(if $required_k:ident)? $(header $header_k:expr)? => $api_k:ident $(($($conv_k:ident :: $param_k:ident),*))?,)*],
- no_key: [$($kw_nk:ident $(if $required_nk:ident)? $(if_header $header_nk:expr)? => $api_nk:ident $(($($conv_nk:ident :: $param_nk:ident),*))?,)*]) => {{
- // usage: s3_match {@gen_parser (keyword, key, query, header),
- // key: [
- // SOME_KEYWORD => VariantWithKey,
- // ...
- // ],
- // no_key: [
- // SOME_KEYWORD => VariantWithoutKey,
- // ...
- // ]
- // }
- // See in from_{method} for more detailed usage.
- use Endpoint::*;
- use keywords::*;
- match ($keyword, !$key.is_empty()){
- $(
- ($kw_k, true) if true $(&& $query.$required_k.is_some())? $(&& $header.contains_key($header_k))? => Ok($api_k {
- key: $key,
- $($(
- $param_k: s3_match!(@@parse_param $query, $conv_k, $param_k),
- )*)?
- }),
- )*
- $(
- ($kw_nk, false) $(if $query.$required_nk.is_some())? $(if $header.contains($header_nk))? => Ok($api_nk {
- $($(
- $param_nk: s3_match!(@@parse_param $query, $conv_nk, $param_nk),
- )*)?
- }),
- )*
- (kw, _) => Err(Error::BadRequest(format!("Invalid endpoint: {}", kw)))
- }
- }};
+use crate::helpers::Authorization;
+use crate::router_macros::{generateQueryParameters, router_match};
+use crate::s3::error::*;
- (@@parse_param $query:expr, query_opt, $param:ident) => {{
- // extract optional query parameter
- $query.$param.take().map(|param| param.into_owned())
- }};
- (@@parse_param $query:expr, query, $param:ident) => {{
- // extract mendatory query parameter
- $query.$param.take().ok_or_bad_request("Missing argument for endpoint")?.into_owned()
- }};
- (@@parse_param $query:expr, opt_parse, $param:ident) => {{
- // extract and parse optional query parameter
- // missing parameter is file, however parse error is reported as an error
- $query.$param
- .take()
- .map(|param| param.parse())
- .transpose()
- .map_err(|_| Error::BadRequest("Failed to parse query parameter".to_owned()))?
- }};
- (@@parse_param $query:expr, parse, $param:ident) => {{
- // extract and parse mandatory query parameter
- // both missing and un-parseable parameters are reported as errors
- $query.$param.take().ok_or_bad_request("Missing argument for endpoint")?
- .parse()
- .map_err(|_| Error::BadRequest("Failed to parse query parameter".to_owned()))?
- }};
- (@func
- $(#[$doc:meta])*
- pub enum Endpoint {
- $(
- $(#[$outer:meta])*
- $variant:ident $({
- $($name:ident: $ty:ty,)*
- })?,
- )*
- }) => {
- $(#[$doc])*
- pub enum Endpoint {
- $(
- $(#[$outer])*
- $variant $({
- $($name: $ty, )*
- })?,
- )*
- }
- impl Endpoint {
- pub fn name(&self) -> &'static str {
- match self {
- $(Endpoint::$variant $({ $($name: _,)* .. })? => stringify!($variant),)*
- }
- }
- }
- };
- (@if ($($cond:tt)+) then ($($then:tt)*) else ($($else:tt)*)) => {
- $($then)*
- };
- (@if () then ($($then:tt)*) else ($($else:tt)*)) => {
- $($else)*
- };
-}
-
-s3_match! {@func
+router_match! {@func
/// List of all S3 API endpoints.
///
@@ -460,7 +342,7 @@ impl Endpoint {
Method::POST => Self::from_post(key, &mut query)?,
Method::PUT => Self::from_put(key, &mut query, req.headers())?,
Method::DELETE => Self::from_delete(key, &mut query)?,
- _ => return Err(Error::BadRequest("Unknown method".to_owned())),
+ _ => return Err(Error::bad_request("Unknown method")),
};
if let Some(message) = query.nonempty_message() {
@@ -471,7 +353,7 @@ impl Endpoint {
/// Determine which endpoint a request is for, knowing it is a GET.
fn from_get(key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
- s3_match! {
+ router_match! {
@gen_parser
(query.keyword.take().unwrap_or_default().as_ref(), key, query, None),
key: [
@@ -528,7 +410,7 @@ impl Endpoint {
/// Determine which endpoint a request is for, knowing it is a HEAD.
fn from_head(key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
- s3_match! {
+ router_match! {
@gen_parser
(query.keyword.take().unwrap_or_default().as_ref(), key, query, None),
key: [
@@ -542,7 +424,7 @@ impl Endpoint {
/// Determine which endpoint a request is for, knowing it is a POST.
fn from_post(key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
- s3_match! {
+ router_match! {
@gen_parser
(query.keyword.take().unwrap_or_default().as_ref(), key, query, None),
key: [
@@ -564,7 +446,7 @@ impl Endpoint {
query: &mut QueryParameters<'_>,
headers: &HeaderMap<HeaderValue>,
) -> Result<Self, Error> {
- s3_match! {
+ router_match! {
@gen_parser
(query.keyword.take().unwrap_or_default().as_ref(), key, query, headers),
key: [
@@ -606,7 +488,7 @@ impl Endpoint {
/// Determine which endpoint a request is for, knowing it is a DELETE.
fn from_delete(key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
- s3_match! {
+ router_match! {
@gen_parser
(query.keyword.take().unwrap_or_default().as_ref(), key, query, None),
key: [
@@ -636,7 +518,7 @@ impl Endpoint {
/// Get the key this request targets. Returns None for requests that don't use a key.
#[allow(dead_code)]
pub fn get_key(&self) -> Option<&str> {
- s3_match! {
+ router_match! {
@extract
self,
key,
@@ -673,7 +555,7 @@ impl Endpoint {
if let Endpoint::ListBuckets = self {
return Authorization::None;
};
- let readonly = s3_match! {
+ let readonly = router_match! {
@match
self,
[
@@ -717,7 +599,7 @@ impl Endpoint {
SelectObjectContent,
]
};
- let owner = s3_match! {
+ let owner = router_match! {
@match
self,
[
@@ -740,87 +622,6 @@ impl Endpoint {
}
}
-/// What kind of authorization is required to perform a given action
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum Authorization {
- /// No authorization is required
- None,
- /// Having Read permission on bucket
- Read,
- /// Having Write permission on bucket
- Write,
- /// Having Owner permission on bucket
- Owner,
-}
-
-/// This macro is used to generate part of the code in this module. It must be called only one, and
-/// is useless outside of this module.
-macro_rules! generateQueryParameters {
- ( $($rest:expr => $name:ident),* ) => {
- /// Struct containing all query parameters used in endpoints. Think of it as an HashMap,
- /// but with keys statically known.
- #[derive(Debug, Default)]
- struct QueryParameters<'a> {
- keyword: Option<Cow<'a, str>>,
- $(
- $name: Option<Cow<'a, str>>,
- )*
- }
-
- impl<'a> QueryParameters<'a> {
- /// Build this struct from the query part of an URI.
- fn from_query(query: &'a str) -> Result<Self, Error> {
- let mut res: Self = Default::default();
- for (k, v) in url::form_urlencoded::parse(query.as_bytes()) {
- let repeated = match k.as_ref() {
- $(
- $rest => if !v.is_empty() {
- res.$name.replace(v).is_some()
- } else {
- false
- },
- )*
- _ => {
- if k.starts_with("response-") || k.starts_with("X-Amz-") {
- false
- } else if v.as_ref().is_empty() {
- if res.keyword.replace(k).is_some() {
- return Err(Error::BadRequest("Multiple keywords".to_owned()));
- }
- continue;
- } else {
- debug!("Received an unknown query parameter: '{}'", k);
- false
- }
- }
- };
- if repeated {
- return Err(Error::BadRequest(format!(
- "Query parameter repeated: '{}'",
- k
- )));
- }
- }
- Ok(res)
- }
-
- /// Get an error message in case not all parameters where used when extracting them to
- /// build an Enpoint variant
- fn nonempty_message(&self) -> Option<&str> {
- if self.keyword.is_some() {
- Some("Keyword not used")
- } $(
- else if self.$name.is_some() {
- Some(concat!("'", $rest, "'"))
- }
- )* else {
- None
- }
- }
- }
- }
-}
-
// parameter name => struct field
generateQueryParameters! {
"continuation-token" => continuation_token,
diff --git a/src/api/s3_website.rs b/src/api/s3/website.rs
index b464dd45..77738971 100644
--- a/src/api/s3_website.rs
+++ b/src/api/s3/website.rs
@@ -4,13 +4,12 @@ use std::sync::Arc;
use hyper::{Body, Request, Response, StatusCode};
use serde::{Deserialize, Serialize};
-use crate::error::*;
-use crate::s3_xml::{to_xml_with_header, xmlns_tag, IntValue, Value};
+use crate::s3::error::*;
+use crate::s3::xml::{to_xml_with_header, xmlns_tag, IntValue, Value};
use crate::signature::verify_signed_content;
use garage_model::bucket_table::*;
use garage_model::garage::Garage;
-use garage_table::*;
use garage_util::data::*;
pub async fn handle_get_website(bucket: &Bucket) -> Result<Response<Body>, Error> {
@@ -47,14 +46,11 @@ pub async fn handle_delete_website(
bucket_id: Uuid,
) -> Result<Response<Body>, Error> {
let mut bucket = garage
- .bucket_table
- .get(&EmptyKey, &bucket_id)
- .await?
- .ok_or(Error::NoSuchBucket)?;
+ .bucket_helper()
+ .get_existing_bucket(bucket_id)
+ .await?;
- let param = bucket
- .params_mut()
- .ok_or_internal_error("Bucket should not be deleted at this point")?;
+ let param = bucket.params_mut().unwrap();
param.website_config.update(None);
garage.bucket_table.insert(&bucket).await?;
@@ -77,14 +73,11 @@ pub async fn handle_put_website(
}
let mut bucket = garage
- .bucket_table
- .get(&EmptyKey, &bucket_id)
- .await?
- .ok_or(Error::NoSuchBucket)?;
+ .bucket_helper()
+ .get_existing_bucket(bucket_id)
+ .await?;
- let param = bucket
- .params_mut()
- .ok_or_internal_error("Bucket should not be deleted at this point")?;
+ let param = bucket.params_mut().unwrap();
let conf: WebsiteConfiguration = from_reader(&body as &[u8])?;
conf.validate()?;
@@ -176,8 +169,8 @@ impl WebsiteConfiguration {
|| self.index_document.is_some()
|| self.routing_rules.is_some())
{
- return Err(Error::BadRequest(
- "Bad XML: can't have RedirectAllRequestsTo and other fields".to_owned(),
+ return Err(Error::bad_request(
+ "Bad XML: can't have RedirectAllRequestsTo and other fields",
));
}
if let Some(ref ed) = self.error_document {
@@ -222,8 +215,8 @@ impl WebsiteConfiguration {
impl Key {
pub fn validate(&self) -> Result<(), Error> {
if self.key.0.is_empty() {
- Err(Error::BadRequest(
- "Bad XML: error document specified but empty".to_owned(),
+ Err(Error::bad_request(
+ "Bad XML: error document specified but empty",
))
} else {
Ok(())
@@ -234,8 +227,8 @@ impl Key {
impl Suffix {
pub fn validate(&self) -> Result<(), Error> {
if self.suffix.0.is_empty() | self.suffix.0.contains('/') {
- Err(Error::BadRequest(
- "Bad XML: index document is empty or contains /".to_owned(),
+ Err(Error::bad_request(
+ "Bad XML: index document is empty or contains /",
))
} else {
Ok(())
@@ -247,7 +240,7 @@ impl Target {
pub fn validate(&self) -> Result<(), Error> {
if let Some(ref protocol) = self.protocol {
if protocol.0 != "http" && protocol.0 != "https" {
- return Err(Error::BadRequest("Bad XML: invalid protocol".to_owned()));
+ return Err(Error::bad_request("Bad XML: invalid protocol"));
}
}
Ok(())
@@ -269,19 +262,19 @@ impl Redirect {
pub fn validate(&self, has_prefix: bool) -> Result<(), Error> {
if self.replace_prefix.is_some() {
if self.replace_full.is_some() {
- return Err(Error::BadRequest(
- "Bad XML: both ReplaceKeyPrefixWith and ReplaceKeyWith are set".to_owned(),
+ return Err(Error::bad_request(
+ "Bad XML: both ReplaceKeyPrefixWith and ReplaceKeyWith are set",
));
}
if !has_prefix {
- return Err(Error::BadRequest(
- "Bad XML: ReplaceKeyPrefixWith is set, but KeyPrefixEquals isn't".to_owned(),
+ return Err(Error::bad_request(
+ "Bad XML: ReplaceKeyPrefixWith is set, but KeyPrefixEquals isn't",
));
}
}
if let Some(ref protocol) = self.protocol {
if protocol.0 != "http" && protocol.0 != "https" {
- return Err(Error::BadRequest("Bad XML: invalid protocol".to_owned()));
+ return Err(Error::bad_request("Bad XML: invalid protocol"));
}
}
// TODO there are probably more invalid cases, but which ones?
diff --git a/src/api/s3_xml.rs b/src/api/s3/xml.rs
index 75ec4559..06f11288 100644
--- a/src/api/s3_xml.rs
+++ b/src/api/s3/xml.rs
@@ -1,7 +1,7 @@
use quick_xml::se::to_string;
use serde::{Deserialize, Serialize, Serializer};
-use crate::Error as ApiError;
+use crate::s3::error::Error as ApiError;
pub fn to_xml_with_header<T: Serialize>(x: &T) -> Result<String, ApiError> {
let mut xml = r#"<?xml version="1.0" encoding="UTF-8"?>"#.to_string();
@@ -25,7 +25,7 @@ impl From<&str> for Value {
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct IntValue(#[serde(rename = "$value")] pub i64);
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct Bucket {
#[serde(rename = "CreationDate")]
pub creation_date: Value,
@@ -33,7 +33,7 @@ pub struct Bucket {
pub name: Value,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct Owner {
#[serde(rename = "DisplayName")]
pub display_name: Value,
@@ -41,13 +41,13 @@ pub struct Owner {
pub id: Value,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct BucketList {
#[serde(rename = "Bucket")]
pub entries: Vec<Bucket>,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct ListAllMyBucketsResult {
#[serde(rename = "Buckets")]
pub buckets: BucketList,
@@ -55,7 +55,7 @@ pub struct ListAllMyBucketsResult {
pub owner: Owner,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct LocationConstraint {
#[serde(serialize_with = "xmlns_tag")]
pub xmlns: (),
@@ -63,7 +63,7 @@ pub struct LocationConstraint {
pub region: String,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct Deleted {
#[serde(rename = "Key")]
pub key: Value,
@@ -73,7 +73,7 @@ pub struct Deleted {
pub delete_marker_version_id: Value,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct Error {
#[serde(rename = "Code")]
pub code: Value,
@@ -85,7 +85,7 @@ pub struct Error {
pub region: Option<Value>,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct DeleteError {
#[serde(rename = "Code")]
pub code: Value,
@@ -97,7 +97,7 @@ pub struct DeleteError {
pub version_id: Option<Value>,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct DeleteResult {
#[serde(serialize_with = "xmlns_tag")]
pub xmlns: (),
@@ -107,7 +107,7 @@ pub struct DeleteResult {
pub errors: Vec<DeleteError>,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct InitiateMultipartUploadResult {
#[serde(serialize_with = "xmlns_tag")]
pub xmlns: (),
@@ -119,7 +119,7 @@ pub struct InitiateMultipartUploadResult {
pub upload_id: Value,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct CompleteMultipartUploadResult {
#[serde(serialize_with = "xmlns_tag")]
pub xmlns: (),
@@ -133,7 +133,7 @@ pub struct CompleteMultipartUploadResult {
pub etag: Value,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct Initiator {
#[serde(rename = "DisplayName")]
pub display_name: Value,
@@ -141,7 +141,7 @@ pub struct Initiator {
pub id: Value,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct ListMultipartItem {
#[serde(rename = "Initiated")]
pub initiated: Value,
@@ -157,7 +157,7 @@ pub struct ListMultipartItem {
pub storage_class: Value,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct ListMultipartUploadsResult {
#[serde(serialize_with = "xmlns_tag")]
pub xmlns: (),
@@ -187,7 +187,7 @@ pub struct ListMultipartUploadsResult {
pub encoding_type: Option<Value>,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct PartItem {
#[serde(rename = "ETag")]
pub etag: Value,
@@ -199,7 +199,7 @@ pub struct PartItem {
pub size: IntValue,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct ListPartsResult {
#[serde(serialize_with = "xmlns_tag")]
pub xmlns: (),
@@ -227,7 +227,7 @@ pub struct ListPartsResult {
pub storage_class: Value,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct ListBucketItem {
#[serde(rename = "Key")]
pub key: Value,
@@ -241,13 +241,13 @@ pub struct ListBucketItem {
pub storage_class: Value,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct CommonPrefix {
#[serde(rename = "Prefix")]
pub prefix: Value,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct ListBucketResult {
#[serde(serialize_with = "xmlns_tag")]
pub xmlns: (),
@@ -281,7 +281,7 @@ pub struct ListBucketResult {
pub common_prefixes: Vec<CommonPrefix>,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct VersioningConfiguration {
#[serde(serialize_with = "xmlns_tag")]
pub xmlns: (),
@@ -289,7 +289,7 @@ pub struct VersioningConfiguration {
pub status: Option<Value>,
}
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct PostObject {
#[serde(serialize_with = "xmlns_tag")]
pub xmlns: (),
diff --git a/src/api/signature/error.rs b/src/api/signature/error.rs
new file mode 100644
index 00000000..f5a067bd
--- /dev/null
+++ b/src/api/signature/error.rs
@@ -0,0 +1,36 @@
+use err_derive::Error;
+
+use crate::common_error::CommonError;
+pub use crate::common_error::{CommonErrorDerivative, OkOrBadRequest, OkOrInternalError};
+
+/// Errors of this crate
+#[derive(Debug, Error)]
+pub enum Error {
+ #[error(display = "{}", _0)]
+ /// Error from common error
+ Common(CommonError),
+
+ /// Authorization Header Malformed
+ #[error(display = "Authorization header malformed, expected scope: {}", _0)]
+ AuthorizationHeaderMalformed(String),
+
+ // Category: bad request
+ /// The request contained an invalid UTF-8 sequence in its path or in other parameters
+ #[error(display = "Invalid UTF-8: {}", _0)]
+ InvalidUtf8Str(#[error(source)] std::str::Utf8Error),
+
+ /// The client sent a header with invalid value
+ #[error(display = "Invalid header value: {}", _0)]
+ InvalidHeader(#[error(source)] hyper::header::ToStrError),
+}
+
+impl<T> From<T> for Error
+where
+ CommonError: From<T>,
+{
+ fn from(err: T) -> Self {
+ Error::Common(CommonError::from(err))
+ }
+}
+
+impl CommonErrorDerivative for Error {}
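
The blanket impl above means any error type already convertible to CommonError is automatically convertible to this Error, so the ? operator works uniformly across the whole family. A self-contained sketch of the pattern with toy types (not the real CommonError):

    use std::fmt;

    #[derive(Debug)]
    struct CommonError(String);

    impl fmt::Display for CommonError {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            write!(f, "common error: {}", self.0)
        }
    }

    #[derive(Debug)]
    enum Error {
        Common(CommonError),
    }

    // Lift anything convertible to CommonError into Error.
    impl<T> From<T> for Error
    where
        CommonError: From<T>,
    {
        fn from(err: T) -> Self {
            Error::Common(CommonError::from(err))
        }
    }

    impl From<std::io::Error> for CommonError {
        fn from(e: std::io::Error) -> Self {
            CommonError(e.to_string())
        }
    }

    fn read_config() -> Result<String, Error> {
        // io::Error -> CommonError -> Error, all through the single `?`
        Ok(std::fs::read_to_string("/nonexistent")?)
    }

    fn main() {
        assert!(read_config().is_err());
    }
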
diff --git a/src/api/signature/mod.rs b/src/api/signature/mod.rs
index ebdee6da..4b8b990f 100644
--- a/src/api/signature/mod.rs
+++ b/src/api/signature/mod.rs
@@ -1,14 +1,15 @@
use chrono::{DateTime, Utc};
-use hmac::{Hmac, Mac, NewMac};
+use hmac::{Hmac, Mac};
use sha2::Sha256;
use garage_util::data::{sha256sum, Hash};
-use crate::error::*;
-
+pub mod error;
pub mod payload;
pub mod streaming;
+use error::*;
+
pub const SHORT_DATE: &str = "%Y%m%d";
pub const LONG_DATETIME: &str = "%Y%m%dT%H%M%SZ";
@@ -16,7 +17,7 @@ type HmacSha256 = Hmac<Sha256>;
pub fn verify_signed_content(expected_sha256: Hash, body: &[u8]) -> Result<(), Error> {
if expected_sha256 != sha256sum(body) {
- return Err(Error::BadRequest(
+ return Err(Error::bad_request(
"Request content hash does not match signed hash".to_string(),
));
}
@@ -28,20 +29,25 @@ pub fn signing_hmac(
secret_key: &str,
region: &str,
service: &str,
-) -> Result<HmacSha256, crypto_mac::InvalidKeyLength> {
+) -> Result<HmacSha256, crypto_common::InvalidLength> {
let secret = String::from("AWS4") + secret_key;
- let mut date_hmac = HmacSha256::new_varkey(secret.as_bytes())?;
+ let mut date_hmac = HmacSha256::new_from_slice(secret.as_bytes())?;
date_hmac.update(datetime.format(SHORT_DATE).to_string().as_bytes());
- let mut region_hmac = HmacSha256::new_varkey(&date_hmac.finalize().into_bytes())?;
+ let mut region_hmac = HmacSha256::new_from_slice(&date_hmac.finalize().into_bytes())?;
region_hmac.update(region.as_bytes());
- let mut service_hmac = HmacSha256::new_varkey(&region_hmac.finalize().into_bytes())?;
+ let mut service_hmac = HmacSha256::new_from_slice(&region_hmac.finalize().into_bytes())?;
service_hmac.update(service.as_bytes());
- let mut signing_hmac = HmacSha256::new_varkey(&service_hmac.finalize().into_bytes())?;
+ let mut signing_hmac = HmacSha256::new_from_slice(&service_hmac.finalize().into_bytes())?;
signing_hmac.update(b"aws4_request");
- let hmac = HmacSha256::new_varkey(&signing_hmac.finalize().into_bytes())?;
+ let hmac = HmacSha256::new_from_slice(&signing_hmac.finalize().into_bytes())?;
Ok(hmac)
}
-pub fn compute_scope(datetime: &DateTime<Utc>, region: &str) -> String {
- format!("{}/{}/s3/aws4_request", datetime.format(SHORT_DATE), region,)
+pub fn compute_scope(datetime: &DateTime<Utc>, region: &str, service: &str) -> String {
+ format!(
+ "{}/{}/{}/aws4_request",
+ datetime.format(SHORT_DATE),
+ region,
+ service
+ )
}
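
With the service made a parameter, the same helper derives scopes for every API instead of hard-coding "s3". A quick check of the resulting format (the region name is illustrative, and service strings other than "s3" are shown only as the motivating second consumer):

    use chrono::{TimeZone, Utc};

    fn compute_scope(dt: &chrono::DateTime<Utc>, region: &str, service: &str) -> String {
        // "%Y%m%d" is the SHORT_DATE format defined in this module
        format!("{}/{}/{}/aws4_request", dt.format("%Y%m%d"), region, service)
    }

    fn main() {
        let dt = Utc.ymd(2022, 10, 4).and_hms(18, 14, 49);
        assert_eq!(compute_scope(&dt, "garage", "s3"), "20221004/garage/s3/aws4_request");
        assert_eq!(compute_scope(&dt, "garage", "k2v"), "20221004/garage/k2v/aws4_request");
    }
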
diff --git a/src/api/signature/payload.rs b/src/api/signature/payload.rs
index 2a41b307..4c7934e5 100644
--- a/src/api/signature/payload.rs
+++ b/src/api/signature/payload.rs
@@ -11,14 +11,15 @@ use garage_util::data::Hash;
use garage_model::garage::Garage;
use garage_model::key_table::*;
-use super::signing_hmac;
-use super::{LONG_DATETIME, SHORT_DATE};
+use super::LONG_DATETIME;
+use super::{compute_scope, signing_hmac};
use crate::encoding::uri_encode;
-use crate::error::*;
+use crate::signature::error::*;
pub async fn check_payload_signature(
garage: &Garage,
+ service: &str,
request: &Request<Body>,
) -> Result<(Option<Key>, Option<Hash>), Error> {
let mut headers = HashMap::new();
@@ -64,6 +65,7 @@ pub async fn check_payload_signature(
let key = verify_v4(
garage,
+ service,
&authorization.credential,
&authorization.date,
&authorization.signature,
@@ -103,7 +105,7 @@ fn parse_authorization(
let (auth_kind, rest) = authorization.split_at(first_space);
if auth_kind != "AWS4-HMAC-SHA256" {
- return Err(Error::BadRequest("Unsupported authorization method".into()));
+ return Err(Error::bad_request("Unsupported authorization method"));
}
let mut auth_params = HashMap::new();
@@ -127,10 +129,11 @@ fn parse_authorization(
let date = headers
.get("x-amz-date")
.ok_or_bad_request("Missing X-Amz-Date field")
+ .map_err(Error::from)
.and_then(|d| parse_date(d))?;
if Utc::now() - date > Duration::hours(24) {
- return Err(Error::BadRequest("Date is too old".to_string()));
+ return Err(Error::bad_request("Date is too old".to_string()));
}
let auth = Authorization {
@@ -154,7 +157,7 @@ fn parse_query_authorization(
headers: &HashMap<String, String>,
) -> Result<Authorization, Error> {
if algorithm != "AWS4-HMAC-SHA256" {
- return Err(Error::BadRequest(
+ return Err(Error::bad_request(
"Unsupported authorization method".to_string(),
));
}
@@ -177,10 +180,10 @@ fn parse_query_authorization(
.get("x-amz-expires")
.ok_or_bad_request("X-Amz-Expires not found in query parameters")?
.parse()
- .map_err(|_| Error::BadRequest("X-Amz-Expires is not a number".to_string()))?;
+ .map_err(|_| Error::bad_request("X-Amz-Expires is not a number".to_string()))?;
if duration > 7 * 24 * 3600 {
- return Err(Error::BadRequest(
+ return Err(Error::bad_request(
"X-Amz-Exprires may not exceed a week".to_string(),
));
}
@@ -188,10 +191,11 @@ fn parse_query_authorization(
let date = headers
.get("x-amz-date")
.ok_or_bad_request("Missing X-Amz-Date field")
+ .map_err(Error::from)
.and_then(|d| parse_date(d))?;
if Utc::now() - date > Duration::seconds(duration) {
- return Err(Error::BadRequest("Date is too old".to_string()));
+ return Err(Error::bad_request("Date is too old".to_string()));
}
Ok(Authorization {
@@ -281,6 +285,7 @@ pub fn parse_date(date: &str) -> Result<DateTime<Utc>, Error> {
pub async fn verify_v4(
garage: &Garage,
+ service: &str,
credential: &str,
date: &DateTime<Utc>,
signature: &str,
@@ -288,11 +293,7 @@ pub async fn verify_v4(
) -> Result<Key, Error> {
let (key_id, scope) = parse_credential(credential)?;
- let scope_expected = format!(
- "{}/{}/s3/aws4_request",
- date.format(SHORT_DATE),
- garage.config.s3_api.s3_region
- );
+ let scope_expected = compute_scope(date, &garage.config.s3_api.s3_region, service);
if scope != scope_expected {
return Err(Error::AuthorizationHeaderMalformed(scope.to_string()));
}
@@ -302,20 +303,20 @@ pub async fn verify_v4(
.get(&EmptyKey, &key_id)
.await?
.filter(|k| !k.state.is_deleted())
- .ok_or_else(|| Error::Forbidden(format!("No such key: {}", &key_id)))?;
+ .ok_or_else(|| Error::forbidden(format!("No such key: {}", &key_id)))?;
let key_p = key.params().unwrap();
let mut hmac = signing_hmac(
date,
&key_p.secret_key,
&garage.config.s3_api.s3_region,
- "s3",
+ service,
)
.ok_or_internal_error("Unable to build signing HMAC")?;
hmac.update(payload);
let our_signature = hex::encode(hmac.finalize().into_bytes());
if signature != our_signature {
- return Err(Error::Forbidden("Invalid signature".to_string()));
+ return Err(Error::forbidden("Invalid signature".to_string()));
}
Ok(key)
diff --git a/src/api/signature/streaming.rs b/src/api/signature/streaming.rs
index 969a45d6..c8358c4f 100644
--- a/src/api/signature/streaming.rs
+++ b/src/api/signature/streaming.rs
@@ -1,18 +1,67 @@
use std::pin::Pin;
-use chrono::{DateTime, Utc};
+use chrono::{DateTime, NaiveDateTime, Utc};
use futures::prelude::*;
use futures::task;
+use garage_model::key_table::Key;
+use hmac::Mac;
use hyper::body::Bytes;
+use hyper::{Body, Request};
use garage_util::data::Hash;
-use hmac::Mac;
-
-use super::sha256sum;
-use super::HmacSha256;
-use super::LONG_DATETIME;
-use crate::error::*;
+use super::{compute_scope, sha256sum, HmacSha256, LONG_DATETIME};
+
+use crate::signature::error::*;
+
+pub fn parse_streaming_body(
+ api_key: &Key,
+ req: Request<Body>,
+ content_sha256: &mut Option<Hash>,
+ region: &str,
+ service: &str,
+) -> Result<Request<Body>, Error> {
+ match req.headers().get("x-amz-content-sha256") {
+ Some(header) if header == "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" => {
+ let signature = content_sha256
+ .take()
+ .ok_or_bad_request("No signature provided")?;
+
+ let secret_key = &api_key
+ .state
+ .as_option()
+ .ok_or_internal_error("Deleted key state")?
+ .secret_key;
+
+ let date = req
+ .headers()
+ .get("x-amz-date")
+ .ok_or_bad_request("Missing X-Amz-Date field")?
+ .to_str()?;
+ let date: NaiveDateTime = NaiveDateTime::parse_from_str(date, LONG_DATETIME)
+ .ok_or_bad_request("Invalid date")?;
+ let date: DateTime<Utc> = DateTime::from_utc(date, Utc);
+
+ let scope = compute_scope(&date, region, service);
+ let signing_hmac = crate::signature::signing_hmac(&date, secret_key, region, service)
+ .ok_or_internal_error("Unable to build signing HMAC")?;
+
+ Ok(req.map(move |body| {
+ Body::wrap_stream(
+ SignedPayloadStream::new(
+ body.map_err(Error::from),
+ signing_hmac,
+ date,
+ &scope,
+ signature,
+ )
+ .map_err(Error::from),
+ )
+ }))
+ }
+ _ => Ok(req),
+ }
+}
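
parse_streaming_body only installs the decoder; the payload itself arrives in the aws-chunked framing defined by the SigV4 streaming spec, where each chunk is prefixed by a header line of the form "<hex size>;chunk-signature=<64 hex chars>". A minimal parse of that header line, a sketch of the wire format rather than Garage's actual parser:

    fn parse_chunk_header(line: &str) -> Option<(usize, &str)> {
        let (size_hex, rest) = line.split_once(';')?;
        let size = usize::from_str_radix(size_hex, 16).ok()?;
        let signature = rest.strip_prefix("chunk-signature=")?;
        Some((size, signature))
    }

    fn main() {
        let sig = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";
        let header = format!("400;chunk-signature={}", sig);
        let (size, parsed_sig) = parse_chunk_header(&header).unwrap();
        assert_eq!(size, 0x400); // 1024 bytes of chunk data follow
        assert_eq!(parsed_sig, sig);
    }
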
/// Result of `sha256("")`
const EMPTY_STRING_HEX_DIGEST: &str =
@@ -38,7 +87,7 @@ fn compute_streaming_payload_signature(
let mut hmac = signing_hmac.clone();
hmac.update(string_to_sign.as_bytes());
- Hash::try_from(&hmac.finalize().into_bytes()).ok_or_internal_error("Invalid signature")
+ Ok(Hash::try_from(&hmac.finalize().into_bytes()).ok_or_internal_error("Invalid signature")?)
}
mod payload {
@@ -114,10 +163,10 @@ impl From<SignedPayloadStreamError> for Error {
match err {
SignedPayloadStreamError::Stream(e) => e,
SignedPayloadStreamError::InvalidSignature => {
- Error::BadRequest("Invalid payload signature".into())
+ Error::bad_request("Invalid payload signature")
}
SignedPayloadStreamError::Message(e) => {
- Error::BadRequest(format!("Chunk format error: {}", e))
+ Error::bad_request(format!("Chunk format error: {}", e))
}
}
}
@@ -295,7 +344,7 @@ mod tests {
.with_timezone(&Utc);
let secret_key = "test";
let region = "test";
- let scope = crate::signature::compute_scope(&datetime, region);
+ let scope = crate::signature::compute_scope(&datetime, region, "s3");
let signing_hmac =
crate::signature::signing_hmac(&datetime, secret_key, region, "s3").unwrap();
diff --git a/src/block/Cargo.toml b/src/block/Cargo.toml
index 9cba69ee..cd409001 100644
--- a/src/block/Cargo.toml
+++ b/src/block/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "garage_block"
-version = "0.7.0"
+version = "0.8.0"
authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018"
license = "AGPL-3.0"
@@ -14,20 +14,22 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
-garage_rpc = { version = "0.7.0", path = "../rpc" }
-garage_util = { version = "0.7.0", path = "../util" }
-garage_table = { version = "0.7.0", path = "../table" }
+garage_db = { version = "0.8.0", path = "../db" }
+garage_rpc = { version = "0.8.0", path = "../rpc" }
+garage_util = { version = "0.8.0", path = "../util" }
+garage_table = { version = "0.8.0", path = "../table" }
opentelemetry = "0.17"
+arc-swap = "1.5"
async-trait = "0.1.7"
bytes = "1.0"
hex = "0.4"
tracing = "0.1.30"
rand = "0.8"
-zstd = { version = "0.9", default-features = false }
-sled = "0.34"
+async-compression = { version = "0.3", features = ["tokio", "zstd"] }
+zstd = { version = "0.9", default-features = false }
rmp-serde = "0.15"
serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
@@ -36,3 +38,7 @@ serde_bytes = "0.11"
futures = "0.3"
futures-util = "0.3"
tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
+tokio-util = { version = "0.6", features = ["io"] }
+
+[features]
+system-libs = [ "zstd/pkg-config" ]
diff --git a/src/block/block.rs b/src/block/block.rs
index 4d3fbcb8..935aa900 100644
--- a/src/block/block.rs
+++ b/src/block/block.rs
@@ -1,16 +1,22 @@
+use bytes::Bytes;
use serde::{Deserialize, Serialize};
use zstd::stream::{decode_all as zstd_decode, Encoder};
use garage_util::data::*;
use garage_util::error::*;
+#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
+pub enum DataBlockHeader {
+ Plain,
+ Compressed,
+}
+
/// A possibly compressed block of data
-#[derive(Debug, Serialize, Deserialize)]
pub enum DataBlock {
/// Uncompressed data
- Plain(#[serde(with = "serde_bytes")] Vec<u8>),
+ Plain(Bytes),
/// Data compressed with zstd
- Compressed(#[serde(with = "serde_bytes")] Vec<u8>),
+ Compressed(Bytes),
}
impl DataBlock {
@@ -30,7 +36,7 @@ impl DataBlock {
/// Get the buffer, possibly decompressing it, and verify its integrity.
/// For a Plain block, the data is compared to the hash; for a Compressed block,
/// the zstd checksumming system is used instead.
- pub fn verify_get(self, hash: Hash) -> Result<Vec<u8>, Error> {
+ pub fn verify_get(self, hash: Hash) -> Result<Bytes, Error> {
match self {
DataBlock::Plain(data) => {
if blake2sum(&data) == hash {
@@ -39,9 +45,9 @@ impl DataBlock {
Err(Error::CorruptData(hash))
}
}
- DataBlock::Compressed(data) => {
- zstd_decode(&data[..]).map_err(|_| Error::CorruptData(hash))
- }
+ DataBlock::Compressed(data) => zstd_decode(&data[..])
+ .map_err(|_| Error::CorruptData(hash))
+ .map(Bytes::from),
}
}
@@ -61,13 +67,31 @@ impl DataBlock {
}
}
- pub fn from_buffer(data: Vec<u8>, level: Option<i32>) -> DataBlock {
- if let Some(level) = level {
- if let Ok(data) = zstd_encode(&data[..], level) {
- return DataBlock::Compressed(data);
+ pub async fn from_buffer(data: Bytes, level: Option<i32>) -> DataBlock {
+ tokio::task::spawn_blocking(move || {
+ if let Some(level) = level {
+ if let Ok(data) = zstd_encode(&data[..], level) {
+ return DataBlock::Compressed(data.into());
+ }
}
+ DataBlock::Plain(data)
+ })
+ .await
+ .unwrap()
+ }
+
+ pub fn into_parts(self) -> (DataBlockHeader, Bytes) {
+ match self {
+ DataBlock::Plain(data) => (DataBlockHeader::Plain, data),
+ DataBlock::Compressed(data) => (DataBlockHeader::Compressed, data),
+ }
+ }
+
+ pub fn from_parts(h: DataBlockHeader, bytes: Bytes) -> Self {
+ match h {
+ DataBlockHeader::Plain => DataBlock::Plain(bytes),
+ DataBlockHeader::Compressed => DataBlock::Compressed(bytes),
}
- DataBlock::Plain(data)
}
}
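For orientation, here is a hedged sketch (not part of this diff) of how the new split between DataBlockHeader and the payload bytes is meant to be used: the small header travels inside the RPC message while the payload travels as a stream, and the receiving side reassembles the two.

// Hypothetical round-trip, assuming only the DataBlock API shown above;
// the compression level Some(3) is an arbitrary example value.
async fn roundtrip_example(data: bytes::Bytes) {
    // Compression now happens on a blocking thread inside from_buffer.
    let block = DataBlock::from_buffer(data, Some(3)).await;
    // Split into a serializable header and the raw payload bytes...
    let (header, bytes) = block.into_parts();
    // ...the header goes into BlockRpc::PutBlock, the bytes into the stream,
    // and the receiving side reassembles the two halves:
    let _block = DataBlock::from_parts(header, bytes);
}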
diff --git a/src/block/lib.rs b/src/block/lib.rs
index dc685657..d2814f77 100644
--- a/src/block/lib.rs
+++ b/src/block/lib.rs
@@ -2,6 +2,8 @@
extern crate tracing;
pub mod manager;
+pub mod repair;
+pub mod resync;
mod block;
mod metrics;
diff --git a/src/block/manager.rs b/src/block/manager.rs
index 1c04a335..7f439b96 100644
--- a/src/block/manager.rs
+++ b/src/block/manager.rs
@@ -1,29 +1,32 @@
-use std::convert::TryInto;
-use std::path::{Path, PathBuf};
+use std::path::PathBuf;
+use std::pin::Pin;
use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
+use bytes::Bytes;
use serde::{Deserialize, Serialize};
-use futures::future::*;
-use futures::select;
+use futures::Stream;
+use futures_util::stream::StreamExt;
use tokio::fs;
-use tokio::io::{AsyncReadExt, AsyncWriteExt};
-use tokio::sync::{watch, Mutex, Notify};
+use tokio::io::{AsyncReadExt, AsyncWriteExt, BufReader};
+use tokio::sync::{mpsc, Mutex, MutexGuard};
use opentelemetry::{
trace::{FutureExt as OtelFutureExt, TraceContextExt, Tracer},
- Context, KeyValue,
+ Context,
};
+use garage_rpc::rpc_helper::netapp::stream::{stream_asyncread, ByteStream};
+
+use garage_db as db;
+
use garage_util::data::*;
use garage_util::error::*;
use garage_util::metrics::RecordDuration;
-use garage_util::sled_counter::SledCountedTree;
-use garage_util::time::*;
-use garage_util::tranquilizer::Tranquilizer;
+use garage_rpc::rpc_helper::OrderTag;
use garage_rpc::system::System;
use garage_rpc::*;
@@ -32,24 +35,12 @@ use garage_table::replication::{TableReplication, TableShardedReplication};
use crate::block::*;
use crate::metrics::*;
use crate::rc::*;
+use crate::repair::*;
+use crate::resync::*;
/// Size under which data will be stored inline in the database instead of as files
pub const INLINE_THRESHOLD: usize = 3072;
-// Timeout for RPCs that read and write blocks to remote nodes
-const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(30);
-// Timeout for RPCs that ask other nodes whether they need a copy
-// of a given block before we delete it locally
-const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(5);
-
-// The delay between the time where a resync operation fails
-// and the time when it is retried, with exponential backoff
-// (multiplied by 2, 4, 8, 16, etc. for every consecutive failure).
-const RESYNC_RETRY_DELAY: Duration = Duration::from_secs(60);
-// The minimum retry delay is 60 seconds = 1 minute
-// The maximum retry delay is 60 seconds * 2^6 = 60 seconds << 6 = 64 minutes (~1 hour)
-const RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER: u64 = 6;
-
// The delay between the moment when the reference counter
// drops to zero, and the moment where we allow ourselves
// to delete the block locally.
@@ -60,12 +51,12 @@ pub(crate) const BLOCK_GC_DELAY: Duration = Duration::from_secs(600);
pub enum BlockRpc {
Ok,
/// Message to ask for a block of data, by hash
- GetBlock(Hash),
+ GetBlock(Hash, Option<OrderTag>),
/// Message to send a block of data, either because it was requested, or for first
/// delivery of a new block
PutBlock {
hash: Hash,
- data: DataBlock,
+ header: DataBlockHeader,
},
/// Ask another node if it should have this block, but doesn't actually have it
NeedBlockQuery(Hash),
@@ -85,20 +76,18 @@ pub struct BlockManager {
pub data_dir: PathBuf,
compression_level: Option<i32>,
- background_tranquility: u32,
- mutation_lock: Mutex<BlockManagerLocked>,
+ mutation_lock: [Mutex<BlockManagerLocked>; 256],
- rc: BlockRc,
+ pub(crate) rc: BlockRc,
+ pub resync: BlockResyncManager,
- resync_queue: SledCountedTree,
- resync_notify: Notify,
- resync_errors: SledCountedTree,
+ pub(crate) system: Arc<System>,
+ pub(crate) endpoint: Arc<Endpoint<BlockRpc, Self>>,
- system: Arc<System>,
- endpoint: Arc<Endpoint<BlockRpc, Self>>,
+ pub(crate) metrics: BlockManagerMetrics,
- metrics: BlockManagerMetrics,
+ tx_scrub_command: mpsc::Sender<ScrubWorkerCommand>,
}
// This custom struct contains functions that must only be run
@@ -108,10 +97,9 @@ struct BlockManagerLocked();
impl BlockManager {
pub fn new(
- db: &sled::Db,
+ db: &db::Db,
data_dir: PathBuf,
compression_level: Option<i32>,
- background_tranquility: u32,
replication: TableShardedReplication,
system: Arc<System>,
) -> Arc<Self> {
@@ -120,215 +108,323 @@ impl BlockManager {
.expect("Unable to open block_local_rc tree");
let rc = BlockRc::new(rc);
- let resync_queue = db
- .open_tree("block_local_resync_queue")
- .expect("Unable to open block_local_resync_queue tree");
- let resync_queue = SledCountedTree::new(resync_queue);
-
- let resync_errors = db
- .open_tree("block_local_resync_errors")
- .expect("Unable to open block_local_resync_errors tree");
- let resync_errors = SledCountedTree::new(resync_errors);
+ let resync = BlockResyncManager::new(db, &system);
let endpoint = system
.netapp
- .endpoint("garage_model/block.rs/Rpc".to_string());
+ .endpoint("garage_block/manager.rs/Rpc".to_string());
- let manager_locked = BlockManagerLocked();
+ let metrics = BlockManagerMetrics::new(resync.queue.clone(), resync.errors.clone());
- let metrics = BlockManagerMetrics::new(resync_queue.clone(), resync_errors.clone());
+ let (scrub_tx, scrub_rx) = mpsc::channel(1);
let block_manager = Arc::new(Self {
replication,
data_dir,
compression_level,
- background_tranquility,
- mutation_lock: Mutex::new(manager_locked),
+ mutation_lock: [(); 256].map(|_| Mutex::new(BlockManagerLocked())),
rc,
- resync_queue,
- resync_notify: Notify::new(),
- resync_errors,
+ resync,
system,
endpoint,
metrics,
+ tx_scrub_command: scrub_tx,
});
block_manager.endpoint.set_handler(block_manager.clone());
- block_manager.clone().spawn_background_worker();
+ // Spawn a bunch of resync workers
+ for index in 0..MAX_RESYNC_WORKERS {
+ let worker = ResyncWorker::new(index, block_manager.clone());
+ block_manager.system.background.spawn_worker(worker);
+ }
+
+ // Spawn scrub worker
+ let scrub_worker = ScrubWorker::new(block_manager.clone(), scrub_rx);
+ block_manager.system.background.spawn_worker(scrub_worker);
block_manager
}
/// Ask nodes that might have a (possibly compressed) block for it
- async fn rpc_get_raw_block(&self, hash: &Hash) -> Result<DataBlock, Error> {
+ /// Return it as a stream with a header
+ async fn rpc_get_raw_block_streaming(
+ &self,
+ hash: &Hash,
+ order_tag: Option<OrderTag>,
+ ) -> Result<(DataBlockHeader, ByteStream), Error> {
let who = self.replication.read_nodes(hash);
- let resps = self
- .system
- .rpc
- .try_call_many(
- &self.endpoint,
- &who[..],
- BlockRpc::GetBlock(*hash),
- RequestStrategy::with_priority(PRIO_NORMAL)
- .with_quorum(1)
- .with_timeout(BLOCK_RW_TIMEOUT)
- .interrupt_after_quorum(true),
- )
- .await?;
+ let who = self.system.rpc.request_order(&who);
+
+ for node in who.iter() {
+ let node_id = NodeID::from(*node);
+ let rpc = self.endpoint.call_streaming(
+ &node_id,
+ BlockRpc::GetBlock(*hash, order_tag),
+ PRIO_NORMAL | PRIO_SECONDARY,
+ );
+ tokio::select! {
+ res = rpc => {
+ let res = match res {
+ Ok(res) => res,
+ Err(e) => {
+ debug!("Node {:?} returned error: {}", node, e);
+ continue;
+ }
+ };
+ let (header, stream) = match res.into_parts() {
+ (Ok(BlockRpc::PutBlock { hash: _, header }), Some(stream)) => (header, stream),
+ _ => {
+ debug!("Node {:?} returned a malformed response", node);
+ continue;
+ }
+ };
+ return Ok((header, stream));
+ }
+ _ = tokio::time::sleep(self.system.rpc.rpc_timeout()) => {
+ debug!("Node {:?} didn't return block in time, trying next.", node);
+ }
+ };
+ }
- for resp in resps {
- if let BlockRpc::PutBlock { data, .. } = resp {
- return Ok(data);
- }
+ Err(Error::Message(format!(
+ "Unable to read block {:?}: no node returned a valid block",
+ hash
+ )))
+ }
+
+ /// Ask nodes that might have a (possibly compressed) block for it
+ /// Return its entire body
+ pub(crate) async fn rpc_get_raw_block(
+ &self,
+ hash: &Hash,
+ order_tag: Option<OrderTag>,
+ ) -> Result<DataBlock, Error> {
+ let who = self.replication.read_nodes(hash);
+ let who = self.system.rpc.request_order(&who);
+
+ for node in who.iter() {
+ let node_id = NodeID::from(*node);
+ let rpc = self.endpoint.call_streaming(
+ &node_id,
+ BlockRpc::GetBlock(*hash, order_tag),
+ PRIO_NORMAL | PRIO_SECONDARY,
+ );
+ tokio::select! {
+ res = rpc => {
+ let res = match res {
+ Ok(res) => res,
+ Err(e) => {
+ debug!("Node {:?} returned error: {}", node, e);
+ continue;
+ }
+ };
+ let (header, stream) = match res.into_parts() {
+ (Ok(BlockRpc::PutBlock { hash: _, header }), Some(stream)) => (header, stream),
+ _ => {
+ debug!("Node {:?} returned a malformed response", node);
+ continue;
+ }
+ };
+ match read_stream_to_end(stream).await {
+ Ok(bytes) => return Ok(DataBlock::from_parts(header, bytes)),
+ Err(e) => {
+ debug!("Error reading stream from node {:?}: {}", node, e);
+ }
+ }
+ }
+ _ = tokio::time::sleep(self.system.rpc.rpc_timeout()) => {
+ debug!("Node {:?} didn't return block in time, trying next.", node);
+ }
+ };
}
+
Err(Error::Message(format!(
- "Unable to read block {:?}: no valid blocks returned",
+ "Unable to read block {:?}: no node returned a valid block",
hash
)))
}
// ---- Public interface ----
+ /// Ask nodes that might have a block for it,
+ /// return it as a stream
+ pub async fn rpc_get_block_streaming(
+ &self,
+ hash: &Hash,
+ order_tag: Option<OrderTag>,
+ ) -> Result<
+ Pin<Box<dyn Stream<Item = Result<Bytes, std::io::Error>> + Send + Sync + 'static>>,
+ Error,
+ > {
+ let (header, stream) = self.rpc_get_raw_block_streaming(hash, order_tag).await?;
+ match header {
+ DataBlockHeader::Plain => Ok(stream),
+ DataBlockHeader::Compressed => {
+ // Too many things, I hate it.
+ let reader = stream_asyncread(stream);
+ let reader = BufReader::new(reader);
+ let reader = async_compression::tokio::bufread::ZstdDecoder::new(reader);
+ Ok(Box::pin(tokio_util::io::ReaderStream::new(reader)))
+ }
+ }
+ }
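// A hedged reading of the adapter stack above as a standalone sketch, using
// generic tokio types instead of Garage's ByteStream (assumption: any
// AsyncRead source works the same way). Not part of this diff.
fn decompress_stream<R: tokio::io::AsyncRead>(
    reader: R,
) -> tokio_util::io::ReaderStream<
    async_compression::tokio::bufread::ZstdDecoder<tokio::io::BufReader<R>>,
> {
    // ZstdDecoder requires an AsyncBufRead, hence the BufReader layer.
    let buffered = tokio::io::BufReader::new(reader);
    let decoder = async_compression::tokio::bufread::ZstdDecoder::new(buffered);
    // ReaderStream turns the decompressed AsyncRead back into Bytes chunks.
    tokio_util::io::ReaderStream::new(decoder)
}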
+
/// Ask nodes that might have a block for it
- pub async fn rpc_get_block(&self, hash: &Hash) -> Result<Vec<u8>, Error> {
- self.rpc_get_raw_block(hash).await?.verify_get(*hash)
+ pub async fn rpc_get_block(
+ &self,
+ hash: &Hash,
+ order_tag: Option<OrderTag>,
+ ) -> Result<Bytes, Error> {
+ self.rpc_get_raw_block(hash, order_tag)
+ .await?
+ .verify_get(*hash)
}
/// Send block to nodes that should have it
- pub async fn rpc_put_block(&self, hash: Hash, data: Vec<u8>) -> Result<(), Error> {
+ pub async fn rpc_put_block(&self, hash: Hash, data: Bytes) -> Result<(), Error> {
let who = self.replication.write_nodes(&hash);
- let data = DataBlock::from_buffer(data, self.compression_level);
+
+ let (header, bytes) = DataBlock::from_buffer(data, self.compression_level)
+ .await
+ .into_parts();
+ let put_block_rpc =
+ Req::new(BlockRpc::PutBlock { hash, header })?.with_stream_from_buffer(bytes);
+
self.system
.rpc
.try_call_many(
&self.endpoint,
&who[..],
- BlockRpc::PutBlock { hash, data },
- RequestStrategy::with_priority(PRIO_NORMAL)
- .with_quorum(self.replication.write_quorum())
- .with_timeout(BLOCK_RW_TIMEOUT),
+ put_block_rpc,
+ RequestStrategy::with_priority(PRIO_NORMAL | PRIO_SECONDARY)
+ .with_quorum(self.replication.write_quorum()),
)
.await?;
- Ok(())
- }
- /// Launch the repair procedure on the data store
- ///
- /// This will list all blocks locally present, as well as those
- /// that are required because of refcount > 0, and will try
- /// to fix any mismatch between the two.
- pub async fn repair_data_store(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
- // 1. Repair blocks from RC table.
- for (i, entry) in self.rc.rc.iter().enumerate() {
- let (hash, _) = entry?;
- let hash = Hash::try_from(&hash[..]).unwrap();
- self.put_to_resync(&hash, Duration::from_secs(0))?;
- if i & 0xFF == 0 && *must_exit.borrow() {
- return Ok(());
- }
- }
-
- // 2. Repair blocks actually on disk
- // Lists all blocks on disk and adds them to the resync queue.
- // This allows us to find blocks we are storing but don't actually need,
- // so that we can offload them if necessary and then delete them locally.
- self.for_each_file(
- (),
- move |_, hash| async move {
- self.put_to_resync(&hash, Duration::from_secs(0))
- .map_err(Into::into)
- },
- must_exit,
- )
- .await
- }
-
- /// Verify integrity of each block on disk. Use `speed_limit` to limit the load generated by
- /// this function.
- pub async fn scrub_data_store(
- &self,
- must_exit: &watch::Receiver<bool>,
- tranquility: u32,
- ) -> Result<(), Error> {
- let tranquilizer = Tranquilizer::new(30);
- self.for_each_file(
- tranquilizer,
- move |mut tranquilizer, hash| async move {
- let _ = self.read_block(&hash).await;
- tranquilizer.tranquilize(tranquility).await;
- Ok(tranquilizer)
- },
- must_exit,
- )
- .await
- }
-
- /// Get lenght of resync queue
- pub fn resync_queue_len(&self) -> usize {
- self.resync_queue.len()
+ Ok(())
}
- /// Get number of blocks that have an error
- pub fn resync_errors_len(&self) -> usize {
- self.resync_errors.len()
+ /// Get number of items in the refcount table
+ pub fn rc_len(&self) -> Result<usize, Error> {
+ Ok(self.rc.rc.len()?)
}
- /// Get number of items in the refcount table
- pub fn rc_len(&self) -> usize {
- self.rc.rc.len()
+ /// Send a command to start/stop/manage the scrub worker
+ pub async fn send_scrub_command(&self, cmd: ScrubWorkerCommand) {
+ let _ = self.tx_scrub_command.send(cmd).await;
}
//// ----- Managing the reference counter ----
/// Increment the number of times a block is used, putting it to resynchronization if it is
/// required, but not known
- pub fn block_incref(&self, hash: &Hash) -> Result<(), Error> {
- if self.rc.block_incref(hash)? {
+ pub fn block_incref(
+ self: &Arc<Self>,
+ tx: &mut db::Transaction,
+ hash: Hash,
+ ) -> db::TxOpResult<()> {
+ if self.rc.block_incref(tx, &hash)? {
// When the reference counter is incremented, there is
// normally a node that is responsible for sending us the
// data of the block. However that operation may fail,
// so in all cases we add the block here to the todo list
// to check later that it arrived correctly, and if not
// we will fetch it from someone.
- self.put_to_resync(hash, 2 * BLOCK_RW_TIMEOUT)?;
+ let this = self.clone();
+ tokio::spawn(async move {
+ if let Err(e) = this
+ .resync
+ .put_to_resync(&hash, 2 * this.system.rpc.rpc_timeout())
+ {
+ error!("Block {:?} could not be put in resync queue: {}.", hash, e);
+ }
+ });
}
Ok(())
}
/// Decrement the number of times a block is used
- pub fn block_decref(&self, hash: &Hash) -> Result<(), Error> {
- if self.rc.block_decref(hash)? {
+ pub fn block_decref(
+ self: &Arc<Self>,
+ tx: &mut db::Transaction,
+ hash: Hash,
+ ) -> db::TxOpResult<()> {
+ if self.rc.block_decref(tx, &hash)? {
// When the RC is decremented, it might drop to zero,
// indicating that we don't need the block.
// There is a delay before we garbage collect it;
// make sure that it is handled in the resync loop
// after that delay has passed.
- self.put_to_resync(hash, BLOCK_GC_DELAY + Duration::from_secs(10))?;
+ let this = self.clone();
+ tokio::spawn(async move {
+ if let Err(e) = this
+ .resync
+ .put_to_resync(&hash, BLOCK_GC_DELAY + Duration::from_secs(10))
+ {
+ error!("Block {:?} could not be put in resync queue: {}.", hash, e);
+ }
+ });
}
Ok(())
}
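// Hedged usage sketch (not part of this diff), mirroring the transaction
// pattern used in rc.rs below and assuming the imports at the top of this
// file: the refcount update commits atomically with the caller's metadata
// writes, while block_incref spawns the resync check asynchronously, so the
// transaction itself never touches the resync queue.
fn example_incref(mgr: &Arc<BlockManager>, database: &db::Db, hash: Hash) -> Result<(), Error> {
    database.transaction(|mut tx| {
        // ... other metadata writes belonging to the same transaction ...
        mgr.block_incref(&mut tx, hash)?;
        tx.commit(())
    })?;
    Ok(())
}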
// ---- Reading and writing blocks locally ----
+ async fn handle_put_block(
+ &self,
+ hash: Hash,
+ header: DataBlockHeader,
+ stream: Option<ByteStream>,
+ ) -> Result<(), Error> {
+ let stream = stream.ok_or_message("missing stream")?;
+ let bytes = read_stream_to_end(stream).await?;
+ let data = DataBlock::from_parts(header, bytes);
+ self.write_block(&hash, &data).await
+ }
+
/// Write a block to disk
- async fn write_block(&self, hash: &Hash, data: &DataBlock) -> Result<BlockRpc, Error> {
+ pub(crate) async fn write_block(&self, hash: &Hash, data: &DataBlock) -> Result<(), Error> {
+ let tracer = opentelemetry::global::tracer("garage");
+
let write_size = data.inner_buffer().len() as u64;
- let res = self
- .mutation_lock
- .lock()
+ self.lock_mutate(hash)
.await
.write_block(hash, data, self)
.bound_record_duration(&self.metrics.block_write_duration)
+ .with_context(Context::current_with_span(
+ tracer.start("BlockManagerLocked::write_block"),
+ ))
.await?;
self.metrics.bytes_written.add(write_size);
- Ok(res)
+ Ok(())
+ }
+
+ async fn handle_get_block(&self, hash: &Hash, order_tag: Option<OrderTag>) -> Resp<BlockRpc> {
+ let block = match self.read_block(hash).await {
+ Ok(data) => data,
+ Err(e) => return Resp::new(Err(e)),
+ };
+
+ let (header, data) = block.into_parts();
+
+ let resp = Resp::new(Ok(BlockRpc::PutBlock {
+ hash: *hash,
+ header,
+ }))
+ .with_stream_from_buffer(data);
+
+ if let Some(order_tag) = order_tag {
+ resp.with_order_tag(order_tag)
+ } else {
+ resp
+ }
}
/// Read block from disk, verifying its integrity
- async fn read_block(&self, hash: &Hash) -> Result<BlockRpc, Error> {
+ pub(crate) async fn read_block(&self, hash: &Hash) -> Result<DataBlock, Error> {
let data = self
.read_block_internal(hash)
.bound_record_duration(&self.metrics.block_read_duration)
@@ -338,7 +434,7 @@ impl BlockManager {
.bytes_read
.add(data.inner_buffer().len() as u64);
- Ok(BlockRpc::PutBlock { hash: *hash, data })
+ Ok(data)
}
async fn read_block_internal(&self, hash: &Hash) -> Result<DataBlock, Error> {
@@ -347,7 +443,8 @@ impl BlockManager {
Ok(c) => c,
Err(e) => {
// Not found but maybe we should have had it ??
- self.put_to_resync(hash, 2 * BLOCK_RW_TIMEOUT)?;
+ self.resync
+ .put_to_resync(hash, 2 * self.system.rpc.rpc_timeout())?;
return Err(Into::into(e));
}
};
@@ -361,37 +458,47 @@ impl BlockManager {
drop(f);
let data = if compressed {
- DataBlock::Compressed(data)
+ DataBlock::Compressed(data.into())
} else {
- DataBlock::Plain(data)
+ DataBlock::Plain(data.into())
};
if data.verify(*hash).is_err() {
self.metrics.corruption_counter.add(1);
- self.mutation_lock
- .lock()
+ self.lock_mutate(hash)
.await
.move_block_to_corrupted(hash, self)
.await?;
- self.put_to_resync(hash, Duration::from_millis(0))?;
+ self.resync.put_to_resync(hash, Duration::from_millis(0))?;
return Err(Error::CorruptData(*hash));
}
Ok(data)
}
- /// Check if this node should have a block, but don't actually have it
- async fn need_block(&self, hash: &Hash) -> Result<bool, Error> {
- let BlockStatus { exists, needed } = self
- .mutation_lock
- .lock()
+ /// Check if this node has a block and whether it needs it
+ pub(crate) async fn check_block_status(&self, hash: &Hash) -> Result<BlockStatus, Error> {
+ self.lock_mutate(hash)
.await
.check_block_status(hash, self)
- .await?;
+ .await
+ }
+
+ /// Check if this node should have a block, but doesn't actually have it
+ async fn need_block(&self, hash: &Hash) -> Result<bool, Error> {
+ let BlockStatus { exists, needed } = self.check_block_status(hash).await?;
Ok(needed.is_nonzero() && !exists)
}
+ /// Delete block if it is not needed anymore
+ pub(crate) async fn delete_if_unneeded(&self, hash: &Hash) -> Result<(), Error> {
+ self.lock_mutate(hash)
+ .await
+ .delete_if_unneeded(hash, self)
+ .await
+ }
+
/// Utility: gives the path of the directory in which a block should be found
fn block_dir(&self, hash: &Hash) -> PathBuf {
let mut path = self.data_dir.clone();
@@ -419,431 +526,38 @@ impl BlockManager {
fs::metadata(&path).await.map(|_| false).map_err(Into::into)
}
- // ---- Resync loop ----
-
- // This part manages a queue of blocks that need to be
- // "resynchronized", i.e. that need to have a check that
- // they are at present if we need them, or that they are
- // deleted once the garbage collection delay has passed.
- //
- // Here are some explanations on how the resync queue works.
- // There are two Sled trees that are used to have information
- // about the status of blocks that need to be resynchronized:
- //
- // - resync_queue: a tree that is ordered first by a timestamp
- // (in milliseconds since Unix epoch) that is the time at which
- // the resync must be done, and second by block hash.
- // The key in this tree is just:
- // concat(timestamp (8 bytes), hash (32 bytes))
- // The value is the same 32-byte hash.
- //
- // - resync_errors: a tree that indicates for each block
- // if the last resync resulted in an error, and if so,
- // the following two informations (see the ErrorCounter struct):
- // - how many consecutive resync errors for this block?
- // - when was the last try?
- // These two informations are used to implement an
- // exponential backoff retry strategy.
- // The key in this tree is the 32-byte hash of the block,
- // and the value is the encoded ErrorCounter value.
- //
- // We need to have these two trees, because the resync queue
- // is not just a queue of items to process, but a set of items
- // that are waiting a specific delay until we can process them
- // (the delay being necessary both internally for the exponential
- // backoff strategy, and exposed as a parameter when adding items
- // to the queue, e.g. to wait until the GC delay has passed).
- // This is why we need one tree ordered by time, and one
- // ordered by identifier of item to be processed (block hash).
- //
- // When the worker wants to process an item it takes from
- // resync_queue, it checks in resync_errors that if there is an
- // exponential back-off delay to await, it has passed before we
- // process the item. If not, the item in the queue is skipped
- // (but added back for later processing after the time of the
- // delay).
- //
- // An alternative that would have seemed natural is to
- // only add items to resync_queue with a processing time that is
- // after the delay, but there are several issues with this:
- // - This requires to synchronize updates to resync_queue and
- // resync_errors (with the current model, there is only one thread,
- // the worker thread, that accesses resync_errors,
- // so no need to synchronize) by putting them both in a lock.
- // This would mean that block_incref might need to take a lock
- // before doing its thing, meaning it has much more chances of
- // not completing successfully if something bad happens to Garage.
- // Currently Garage is not able to recover from block_incref that
- // doesn't complete successfully, because it is necessary to ensure
- // the consistency between the state of the block manager and
- // information in the BlockRef table.
- // - If a resync fails, we put that block in the resync_errors table,
- // and also add it back to resync_queue to be processed after
- // the exponential back-off delay,
- // but maybe the block is already scheduled to be resynced again
- // at another time that is before the exponential back-off delay,
- // and we have no way to check that easily. This means that
- // in all cases, we need to check the resync_errors table
- // in the resync loop at the time when a block is popped from
- // the resync_queue.
- // Overall, the current design is therefore simpler and more robust
- // because it tolerates inconsistencies between the resync_queue
- // and resync_errors table (items being scheduled in resync_queue
- // for times that are earlier than the exponential back-off delay
- // is a natural condition that is handled properly).
-
- fn spawn_background_worker(self: Arc<Self>) {
- // Launch a background workers for background resync loop processing
- let background = self.system.background.clone();
- tokio::spawn(async move {
- tokio::time::sleep(Duration::from_secs(10)).await;
- background.spawn_worker("block resync worker".into(), move |must_exit| {
- self.resync_loop(must_exit)
- });
- });
- }
-
- fn put_to_resync(&self, hash: &Hash, delay: Duration) -> Result<(), sled::Error> {
- let when = now_msec() + delay.as_millis() as u64;
- self.put_to_resync_at(hash, when)
- }
-
- fn put_to_resync_at(&self, hash: &Hash, when: u64) -> Result<(), sled::Error> {
- trace!("Put resync_queue: {} {:?}", when, hash);
- let mut key = u64::to_be_bytes(when).to_vec();
- key.extend(hash.as_ref());
- self.resync_queue.insert(key, hash.as_ref())?;
- self.resync_notify.notify_waiters();
- Ok(())
- }
-
- async fn resync_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
- let mut tranquilizer = Tranquilizer::new(30);
-
- while !*must_exit.borrow() {
- match self.resync_iter(&mut must_exit).await {
- Ok(true) => {
- tranquilizer.tranquilize(self.background_tranquility).await;
- }
- Ok(false) => {
- tranquilizer.reset();
- }
- Err(e) => {
- // The errors that we have here are only Sled errors
- // We don't really know how to handle them so just ¯\_(ツ)_/¯
- // (there is kind of an assumption that Sled won't error on us,
- // if it does there is not much we can do -- TODO should we just panic?)
- error!(
- "Could not do a resync iteration: {} (this is a very bad error)",
- e
- );
- tranquilizer.reset();
- }
- }
- }
- }
-
- // The result of resync_iter is:
- // - Ok(true) -> a block was processed (successfully or not)
- // - Ok(false) -> no block was processed, but we are ready for the next iteration
- // - Err(_) -> a Sled error occurred when reading/writing from resync_queue/resync_errors
- async fn resync_iter(
- &self,
- must_exit: &mut watch::Receiver<bool>,
- ) -> Result<bool, sled::Error> {
- if let Some(first_pair_res) = self.resync_queue.iter().next() {
- let (time_bytes, hash_bytes) = first_pair_res?;
-
- let time_msec = u64::from_be_bytes(time_bytes[0..8].try_into().unwrap());
- let now = now_msec();
-
- if now >= time_msec {
- let hash = Hash::try_from(&hash_bytes[..]).unwrap();
-
- if let Some(ec) = self.resync_errors.get(hash.as_slice())? {
- let ec = ErrorCounter::decode(ec);
- if now < ec.next_try() {
- // if next retry after an error is not yet,
- // don't do resync and return early, but still
- // make sure the item is still in queue at expected time
- self.put_to_resync_at(&hash, ec.next_try())?;
- // ec.next_try() > now >= time_msec, so this remove
- // is not removing the one we added just above
- // (we want to do the remove after the insert to ensure
- // that the item is not lost if we crash in-between)
- self.resync_queue.remove(time_bytes)?;
- return Ok(false);
- }
- }
-
- let tracer = opentelemetry::global::tracer("garage");
- let trace_id = gen_uuid();
- let span = tracer
- .span_builder("Resync block")
- .with_trace_id(
- opentelemetry::trace::TraceId::from_hex(&hex::encode(
- &trace_id.as_slice()[..16],
- ))
- .unwrap(),
- )
- .with_attributes(vec![KeyValue::new("block", format!("{:?}", hash))])
- .start(&tracer);
-
- let res = self
- .resync_block(&hash)
- .with_context(Context::current_with_span(span))
- .bound_record_duration(&self.metrics.resync_duration)
- .await;
-
- self.metrics.resync_counter.add(1);
-
- if let Err(e) = &res {
- self.metrics.resync_error_counter.add(1);
- warn!("Error when resyncing {:?}: {}", hash, e);
-
- let err_counter = match self.resync_errors.get(hash.as_slice())? {
- Some(ec) => ErrorCounter::decode(ec).add1(now + 1),
- None => ErrorCounter::new(now + 1),
- };
-
- self.resync_errors
- .insert(hash.as_slice(), err_counter.encode())?;
-
- self.put_to_resync_at(&hash, err_counter.next_try())?;
- // err_counter.next_try() >= now + 1 > now,
- // the entry we remove from the queue is not
- // the entry we inserted with put_to_resync_at
- self.resync_queue.remove(time_bytes)?;
- } else {
- self.resync_errors.remove(hash.as_slice())?;
- self.resync_queue.remove(time_bytes)?;
- }
-
- Ok(true)
- } else {
- let delay = tokio::time::sleep(Duration::from_millis(time_msec - now));
- select! {
- _ = delay.fuse() => {},
- _ = self.resync_notify.notified().fuse() => {},
- _ = must_exit.changed().fuse() => {},
- }
- Ok(false)
- }
- } else {
- // Here we wait either for a notification that an item has been
- // added to the queue, or for a constant delay of 10 secs to expire.
- // The delay avoids a race condition where the notification happens
- // between the time we checked the queue and the first poll
- // to resync_notify.notified(): if that happens, we'll just loop
- // back 10 seconds later, which is fine.
- let delay = tokio::time::sleep(Duration::from_secs(10));
- select! {
- _ = delay.fuse() => {},
- _ = self.resync_notify.notified().fuse() => {},
- _ = must_exit.changed().fuse() => {},
- }
- Ok(false)
- }
- }
-
- async fn resync_block(&self, hash: &Hash) -> Result<(), Error> {
- let BlockStatus { exists, needed } = self
- .mutation_lock
+ async fn lock_mutate(&self, hash: &Hash) -> MutexGuard<'_, BlockManagerLocked> {
+ let tracer = opentelemetry::global::tracer("garage");
+ self.mutation_lock[hash.as_slice()[0] as usize]
.lock()
+ .with_context(Context::current_with_span(
+ tracer.start("Acquire mutation_lock"),
+ ))
.await
- .check_block_status(hash, self)
- .await?;
-
- if exists != needed.is_needed() || exists != needed.is_nonzero() {
- debug!(
- "Resync block {:?}: exists {}, nonzero rc {}, deletable {}",
- hash,
- exists,
- needed.is_nonzero(),
- needed.is_deletable(),
- );
- }
-
- if exists && needed.is_deletable() {
- info!("Resync block {:?}: offloading and deleting", hash);
-
- let mut who = self.replication.write_nodes(hash);
- if who.len() < self.replication.write_quorum() {
- return Err(Error::Message("Not trying to offload block because we don't have a quorum of nodes to write to".to_string()));
- }
- who.retain(|id| *id != self.system.id);
-
- let msg = Arc::new(BlockRpc::NeedBlockQuery(*hash));
- let who_needs_fut = who.iter().map(|to| {
- self.system.rpc.call_arc(
- &self.endpoint,
- *to,
- msg.clone(),
- RequestStrategy::with_priority(PRIO_BACKGROUND)
- .with_timeout(NEED_BLOCK_QUERY_TIMEOUT),
- )
- });
- let who_needs_resps = join_all(who_needs_fut).await;
-
- let mut need_nodes = vec![];
- for (node, needed) in who.iter().zip(who_needs_resps.into_iter()) {
- match needed.err_context("NeedBlockQuery RPC")? {
- BlockRpc::NeedBlockReply(needed) => {
- if needed {
- need_nodes.push(*node);
- }
- }
- m => {
- return Err(Error::unexpected_rpc_message(m));
- }
- }
- }
-
- if !need_nodes.is_empty() {
- trace!(
- "Block {:?} needed by {} nodes, sending",
- hash,
- need_nodes.len()
- );
-
- for node in need_nodes.iter() {
- self.metrics
- .resync_send_counter
- .add(1, &[KeyValue::new("to", format!("{:?}", node))]);
- }
-
- let put_block_message = self.read_block(hash).await?;
- self.system
- .rpc
- .try_call_many(
- &self.endpoint,
- &need_nodes[..],
- put_block_message,
- RequestStrategy::with_priority(PRIO_BACKGROUND)
- .with_quorum(need_nodes.len())
- .with_timeout(BLOCK_RW_TIMEOUT),
- )
- .await
- .err_context("PutBlock RPC")?;
- }
- info!(
- "Deleting unneeded block {:?}, offload finished ({} / {})",
- hash,
- need_nodes.len(),
- who.len()
- );
-
- self.mutation_lock
- .lock()
- .await
- .delete_if_unneeded(hash, self)
- .await?;
-
- self.rc.clear_deleted_block_rc(hash)?;
- }
-
- if needed.is_nonzero() && !exists {
- info!(
- "Resync block {:?}: fetching absent but needed block (refcount > 0)",
- hash
- );
-
- let block_data = self.rpc_get_raw_block(hash).await?;
-
- self.metrics.resync_recv_counter.add(1);
-
- self.write_block(hash, &block_data).await?;
- }
-
- Ok(())
- }
-
- // ---- Utility: iteration on files in the data directory ----
-
- async fn for_each_file<F, Fut, State>(
- &self,
- state: State,
- mut f: F,
- must_exit: &watch::Receiver<bool>,
- ) -> Result<(), Error>
- where
- F: FnMut(State, Hash) -> Fut + Send,
- Fut: Future<Output = Result<State, Error>> + Send,
- State: Send,
- {
- self.for_each_file_rec(&self.data_dir, state, &mut f, must_exit)
- .await
- .map(|_| ())
- }
-
- fn for_each_file_rec<'a, F, Fut, State>(
- &'a self,
- path: &'a Path,
- mut state: State,
- f: &'a mut F,
- must_exit: &'a watch::Receiver<bool>,
- ) -> BoxFuture<'a, Result<State, Error>>
- where
- F: FnMut(State, Hash) -> Fut + Send,
- Fut: Future<Output = Result<State, Error>> + Send,
- State: Send + 'a,
- {
- async move {
- let mut ls_data_dir = fs::read_dir(path).await?;
- while let Some(data_dir_ent) = ls_data_dir.next_entry().await? {
- if *must_exit.borrow() {
- break;
- }
-
- let name = data_dir_ent.file_name();
- let name = if let Ok(n) = name.into_string() {
- n
- } else {
- continue;
- };
- let ent_type = data_dir_ent.file_type().await?;
-
- let name = name.strip_suffix(".zst").unwrap_or(&name);
- if name.len() == 2 && hex::decode(&name).is_ok() && ent_type.is_dir() {
- state = self
- .for_each_file_rec(&data_dir_ent.path(), state, f, must_exit)
- .await?;
- } else if name.len() == 64 {
- let hash_bytes = if let Ok(h) = hex::decode(&name) {
- h
- } else {
- continue;
- };
- let mut hash = [0u8; 32];
- hash.copy_from_slice(&hash_bytes[..]);
- state = f(state, hash.into()).await?;
- }
- }
- Ok(state)
- }
- .boxed()
}
}
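// The mutation_lock striping above deserves a note: with 256 mutexes indexed
// by the first byte of the block hash, writes to distinct blocks almost never
// contend. A minimal self-contained sketch of the same pattern (hypothetical,
// not part of this diff):
struct StripedLocks {
    locks: [tokio::sync::Mutex<()>; 256],
}

impl StripedLocks {
    fn new() -> Self {
        Self {
            locks: [(); 256].map(|_| tokio::sync::Mutex::new(())),
        }
    }

    /// Serialize operations only against blocks sharing the same first hash byte.
    async fn with_lock<R>(&self, hash: &[u8; 32], f: impl FnOnce() -> R) -> R {
        let _guard = self.locks[hash[0] as usize].lock().await;
        f()
    }
}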
#[async_trait]
-impl EndpointHandler<BlockRpc> for BlockManager {
- async fn handle(
- self: &Arc<Self>,
- message: &BlockRpc,
- _from: NodeID,
- ) -> Result<BlockRpc, Error> {
- match message {
- BlockRpc::PutBlock { hash, data } => self.write_block(hash, data).await,
- BlockRpc::GetBlock(h) => self.read_block(h).await,
- BlockRpc::NeedBlockQuery(h) => self.need_block(h).await.map(BlockRpc::NeedBlockReply),
- m => Err(Error::unexpected_rpc_message(m)),
+impl StreamingEndpointHandler<BlockRpc> for BlockManager {
+ async fn handle(self: &Arc<Self>, mut message: Req<BlockRpc>, _from: NodeID) -> Resp<BlockRpc> {
+ match message.msg() {
+ BlockRpc::PutBlock { hash, header } => Resp::new(
+ self.handle_put_block(*hash, *header, message.take_stream())
+ .await
+ .map(|_| BlockRpc::Ok),
+ ),
+ BlockRpc::GetBlock(h, order_tag) => self.handle_get_block(h, *order_tag).await,
+ BlockRpc::NeedBlockQuery(h) => {
+ Resp::new(self.need_block(h).await.map(BlockRpc::NeedBlockReply))
+ }
+ m => Resp::new(Err(Error::unexpected_rpc_message(m))),
}
}
}
-struct BlockStatus {
- exists: bool,
- needed: RcEntry,
+pub(crate) struct BlockStatus {
+ pub(crate) exists: bool,
+ pub(crate) needed: RcEntry,
}
impl BlockManagerLocked {
@@ -863,7 +577,7 @@ impl BlockManagerLocked {
hash: &Hash,
data: &DataBlock,
mgr: &BlockManager,
- ) -> Result<BlockRpc, Error> {
+ ) -> Result<(), Error> {
let compressed = data.is_compressed();
let data = data.inner_buffer();
@@ -874,8 +588,8 @@ impl BlockManagerLocked {
fs::create_dir_all(&directory).await?;
let to_delete = match (mgr.is_block_compressed(hash).await, compressed) {
- (Ok(true), _) => return Ok(BlockRpc::Ok),
- (Ok(false), false) => return Ok(BlockRpc::Ok),
+ (Ok(true), _) => return Ok(()),
+ (Ok(false), false) => return Ok(()),
(Ok(false), true) => {
let path_to_delete = path.clone();
path.set_extension("zst");
@@ -914,7 +628,7 @@ impl BlockManagerLocked {
dir.sync_all().await?;
drop(dir);
- Ok(BlockRpc::Ok)
+ Ok(())
}
async fn move_block_to_corrupted(&self, hash: &Hash, mgr: &BlockManager) -> Result<(), Error> {
@@ -949,49 +663,16 @@ impl BlockManagerLocked {
}
}
-/// Counts the number of errors when resyncing a block,
-/// and the time of the last try.
-/// Used to implement exponential backoff.
-#[derive(Clone, Copy, Debug)]
-struct ErrorCounter {
- errors: u64,
- last_try: u64,
-}
-
-impl ErrorCounter {
- fn new(now: u64) -> Self {
- Self {
- errors: 1,
- last_try: now,
- }
+async fn read_stream_to_end(mut stream: ByteStream) -> Result<Bytes, Error> {
+ let mut parts: Vec<Bytes> = vec![];
+ while let Some(part) = stream.next().await {
+ parts.push(part.ok_or_message("error in stream")?);
}
- fn decode(data: sled::IVec) -> Self {
- Self {
- errors: u64::from_be_bytes(data[0..8].try_into().unwrap()),
- last_try: u64::from_be_bytes(data[8..16].try_into().unwrap()),
- }
- }
- fn encode(&self) -> Vec<u8> {
- [
- u64::to_be_bytes(self.errors),
- u64::to_be_bytes(self.last_try),
- ]
+ Ok(parts
+ .iter()
+ .map(|x| &x[..])
+ .collect::<Vec<_>>()
.concat()
- }
-
- fn add1(self, now: u64) -> Self {
- Self {
- errors: self.errors + 1,
- last_try: now,
- }
- }
-
- fn delay_msec(&self) -> u64 {
- (RESYNC_RETRY_DELAY.as_millis() as u64)
- << std::cmp::min(self.errors - 1, RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER)
- }
- fn next_try(&self) -> u64 {
- self.last_try + self.delay_msec()
- }
+ .into())
}
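As a side note on read_stream_to_end: collecting the chunks in a Vec and concatenating them at the end works, but an equivalent formulation with BytesMut (a hedged, untested sketch assuming the same ByteStream and error types as above) accumulates into a single buffer as chunks arrive:

async fn read_stream_to_end_alt(mut stream: ByteStream) -> Result<Bytes, Error> {
    let mut buf = bytes::BytesMut::new();
    while let Some(part) = stream.next().await {
        // Copy each chunk into the growing buffer as it is received.
        buf.extend_from_slice(&part.ok_or_message("error in stream")?);
    }
    Ok(buf.freeze()) // zero-copy conversion of the accumulated buffer to Bytes
}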
diff --git a/src/block/metrics.rs b/src/block/metrics.rs
index f0f541a3..477add66 100644
--- a/src/block/metrics.rs
+++ b/src/block/metrics.rs
@@ -1,6 +1,6 @@
use opentelemetry::{global, metrics::*};
-use garage_util::sled_counter::SledCountedTree;
+use garage_db::counted_tree_hack::CountedTree;
/// TableMetrics reference all counter used for metrics
pub struct BlockManagerMetrics {
@@ -23,7 +23,7 @@ pub struct BlockManagerMetrics {
}
impl BlockManagerMetrics {
- pub fn new(resync_queue: SledCountedTree, resync_errors: SledCountedTree) -> Self {
+ pub fn new(resync_queue: CountedTree, resync_errors: CountedTree) -> Self {
let meter = global::meter("garage_model/block");
Self {
_resync_queue_len: meter
diff --git a/src/block/rc.rs b/src/block/rc.rs
index ec3ea44e..ce6defad 100644
--- a/src/block/rc.rs
+++ b/src/block/rc.rs
@@ -1,5 +1,7 @@
use std::convert::TryInto;
+use garage_db as db;
+
use garage_util::data::*;
use garage_util::error::*;
use garage_util::time::*;
@@ -7,31 +9,41 @@ use garage_util::time::*;
use crate::manager::BLOCK_GC_DELAY;
pub struct BlockRc {
- pub(crate) rc: sled::Tree,
+ pub(crate) rc: db::Tree,
}
impl BlockRc {
- pub(crate) fn new(rc: sled::Tree) -> Self {
+ pub(crate) fn new(rc: db::Tree) -> Self {
Self { rc }
}
/// Increment the reference counter associated to a hash.
/// Returns true if the RC goes from zero to nonzero.
- pub(crate) fn block_incref(&self, hash: &Hash) -> Result<bool, Error> {
- let old_rc = self
- .rc
- .fetch_and_update(&hash, |old| RcEntry::parse_opt(old).increment().serialize())?;
- let old_rc = RcEntry::parse_opt(old_rc);
+ pub(crate) fn block_incref(
+ &self,
+ tx: &mut db::Transaction,
+ hash: &Hash,
+ ) -> db::TxOpResult<bool> {
+ let old_rc = RcEntry::parse_opt(tx.get(&self.rc, &hash)?);
+ match old_rc.increment().serialize() {
+ Some(x) => tx.insert(&self.rc, &hash, x)?,
+ None => unreachable!(),
+ };
Ok(old_rc.is_zero())
}
/// Decrement the reference counter associated to a hash.
/// Returns true if the RC is now zero.
- pub(crate) fn block_decref(&self, hash: &Hash) -> Result<bool, Error> {
- let new_rc = self
- .rc
- .update_and_fetch(&hash, |old| RcEntry::parse_opt(old).decrement().serialize())?;
- let new_rc = RcEntry::parse_opt(new_rc);
+ pub(crate) fn block_decref(
+ &self,
+ tx: &mut db::Transaction,
+ hash: &Hash,
+ ) -> db::TxOpResult<bool> {
+ let new_rc = RcEntry::parse_opt(tx.get(&self.rc, &hash)?).decrement();
+ match new_rc.serialize() {
+ Some(x) => tx.insert(&self.rc, &hash, x)?,
+ None => tx.remove(&self.rc, &hash)?,
+ };
Ok(matches!(new_rc, RcEntry::Deletable { .. }))
}
@@ -44,12 +56,15 @@ impl BlockRc {
/// deletion time has passed
pub(crate) fn clear_deleted_block_rc(&self, hash: &Hash) -> Result<(), Error> {
let now = now_msec();
- self.rc.update_and_fetch(&hash, |rcval| {
- let updated = match RcEntry::parse_opt(rcval) {
- RcEntry::Deletable { at_time } if now > at_time => RcEntry::Absent,
- v => v,
+ self.rc.db().transaction(|mut tx| {
+ let rcval = RcEntry::parse_opt(tx.get(&self.rc, &hash)?);
+ match rcval {
+ RcEntry::Deletable { at_time } if now > at_time => {
+ tx.remove(&self.rc, &hash)?;
+ }
+ _ => (),
};
- updated.serialize()
+ tx.commit(())
})?;
Ok(())
}
diff --git a/src/block/repair.rs b/src/block/repair.rs
new file mode 100644
index 00000000..e2884b69
--- /dev/null
+++ b/src/block/repair.rs
@@ -0,0 +1,466 @@
+use core::ops::Bound;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+use tokio::fs;
+use tokio::select;
+use tokio::sync::mpsc;
+use tokio::sync::watch;
+
+use garage_util::background::*;
+use garage_util::data::*;
+use garage_util::error::*;
+use garage_util::persister::Persister;
+use garage_util::time::*;
+use garage_util::tranquilizer::Tranquilizer;
+
+use crate::manager::*;
+
+// Full scrub every 30 days
+const SCRUB_INTERVAL: Duration = Duration::from_secs(3600 * 24 * 30);
+// Scrub tranquility is initially set to 4, but can be changed in the CLI
+// and the updated version is persisted over Garage restarts
+const INITIAL_SCRUB_TRANQUILITY: u32 = 4;
+
+// ---- ---- ----
+// FIRST KIND OF REPAIR: FINDING MISSING BLOCKS/USELESS BLOCKS
+// This is a one-shot repair operation that can be launched,
+// checks everything, and then exits.
+// ---- ---- ----
+
+pub struct RepairWorker {
+ manager: Arc<BlockManager>,
+ next_start: Option<Hash>,
+ block_iter: Option<BlockStoreIterator>,
+}
+
+impl RepairWorker {
+ pub fn new(manager: Arc<BlockManager>) -> Self {
+ Self {
+ manager,
+ next_start: None,
+ block_iter: None,
+ }
+ }
+}
+
+#[async_trait]
+impl Worker for RepairWorker {
+ fn name(&self) -> String {
+ "Block repair worker".into()
+ }
+
+ fn info(&self) -> Option<String> {
+ match self.block_iter.as_ref() {
+ None => {
+ let idx_bytes = self
+ .next_start
+ .as_ref()
+ .map(|x| x.as_slice())
+ .unwrap_or(&[]);
+ let idx_bytes = if idx_bytes.len() > 4 {
+ &idx_bytes[..4]
+ } else {
+ idx_bytes
+ };
+ Some(format!("Phase 1: {}", hex::encode(idx_bytes)))
+ }
+ Some(bi) => Some(format!("Phase 2: {:.2}% done", bi.progress() * 100.)),
+ }
+ }
+
+ async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+ match self.block_iter.as_mut() {
+ None => {
+ // Phase 1: Repair blocks from RC table.
+
+ // We have to do this complicated two-step process where we first read a bunch
+ // of hashes from the RC table, and then insert them in the to-resync queue,
+ // because of SQLite. Basically, as long as we have an iterator on a DB table,
+ // we can't do anything else on the DB. The naive approach (which we had previously)
+ // of just iterating on the RC table and inserting items one to one in the resync
+ // queue can't work here, it would just provoke a deadlock in the SQLite adapter code.
+ // This is mostly because the Rust bindings for SQLite assume a worst-case scenario
+ // where SQLite is not compiled in thread-safe mode, so we have to wrap everything
+ // in a mutex (see db/sqlite_adapter.rs and discussion in PR #322).
+ // TODO: maybe do this with tokio::task::spawn_blocking ?
+ let mut batch_of_hashes = vec![];
+ let start_bound = match self.next_start.as_ref() {
+ None => Bound::Unbounded,
+ Some(x) => Bound::Excluded(x.as_slice()),
+ };
+ for entry in self
+ .manager
+ .rc
+ .rc
+ .range::<&[u8], _>((start_bound, Bound::Unbounded))?
+ {
+ let (hash, _) = entry?;
+ let hash = Hash::try_from(&hash[..]).unwrap();
+ batch_of_hashes.push(hash);
+ if batch_of_hashes.len() >= 1000 {
+ break;
+ }
+ }
+ if batch_of_hashes.is_empty() {
+ // move on to phase 2
+ self.block_iter = Some(BlockStoreIterator::new(&self.manager));
+ return Ok(WorkerState::Busy);
+ }
+
+ for hash in batch_of_hashes.into_iter() {
+ self.manager
+ .resync
+ .put_to_resync(&hash, Duration::from_secs(0))?;
+ self.next_start = Some(hash)
+ }
+
+ Ok(WorkerState::Busy)
+ }
+ Some(bi) => {
+ // Phase 2: Repair blocks actually on disk
+ // Lists all blocks on disk and adds them to the resync queue.
+ // This allows us to find blocks we are storing but don't actually need,
+ // so that we can offload them if necessary and then delete them locally.
+ if let Some(hash) = bi.next().await? {
+ self.manager
+ .resync
+ .put_to_resync(&hash, Duration::from_secs(0))?;
+ Ok(WorkerState::Busy)
+ } else {
+ Ok(WorkerState::Done)
+ }
+ }
+ }
+ }
+
+ async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
+ unreachable!()
+ }
+}
+
+// ---- ---- ----
+// SECOND KIND OF REPAIR: SCRUBBING THE DATASTORE
+// This is significantly more complex than the process above,
+// as it is a continuously-running task that triggers automatically
+// every SCRUB_INTERVAL, but can also be triggered manually
+// and whose parameter (esp. speed) can be controlled at runtime.
+// ---- ---- ----
+
+pub struct ScrubWorker {
+ manager: Arc<BlockManager>,
+ rx_cmd: mpsc::Receiver<ScrubWorkerCommand>,
+
+ work: ScrubWorkerState,
+ tranquilizer: Tranquilizer,
+
+ persister: Persister<ScrubWorkerPersisted>,
+ persisted: ScrubWorkerPersisted,
+}
+
+#[derive(Serialize, Deserialize)]
+struct ScrubWorkerPersisted {
+ tranquility: u32,
+ time_last_complete_scrub: u64,
+ corruptions_detected: u64,
+}
+
+enum ScrubWorkerState {
+ Running(BlockStoreIterator),
+ Paused(BlockStoreIterator, u64), // u64 = time at which to resume the scrub
+ Finished,
+}
+
+impl Default for ScrubWorkerState {
+ fn default() -> Self {
+ ScrubWorkerState::Finished
+ }
+}
+
+#[derive(Debug)]
+pub enum ScrubWorkerCommand {
+ Start,
+ Pause(Duration),
+ Resume,
+ Cancel,
+ SetTranquility(u32),
+}
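// Hedged usage sketch (not part of this diff): commands reach the scrub
// worker through the bounded channel held by BlockManager, via the
// send_scrub_command method added in manager.rs above.
async fn slow_down_scrub(manager: &BlockManager) {
    // Raise tranquility so the scrub yields more time to other work.
    manager
        .send_scrub_command(ScrubWorkerCommand::SetTranquility(10))
        .await;
    // Or pause it entirely for an hour:
    manager
        .send_scrub_command(ScrubWorkerCommand::Pause(std::time::Duration::from_secs(
            3600,
        )))
        .await;
}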
+
+impl ScrubWorker {
+ pub fn new(manager: Arc<BlockManager>, rx_cmd: mpsc::Receiver<ScrubWorkerCommand>) -> Self {
+ let persister = Persister::new(&manager.system.metadata_dir, "scrub_info");
+ let persisted = match persister.load() {
+ Ok(v) => v,
+ Err(_) => ScrubWorkerPersisted {
+ time_last_complete_scrub: 0,
+ tranquility: INITIAL_SCRUB_TRANQUILITY,
+ corruptions_detected: 0,
+ },
+ };
+ Self {
+ manager,
+ rx_cmd,
+ work: ScrubWorkerState::Finished,
+ tranquilizer: Tranquilizer::new(30),
+ persister,
+ persisted,
+ }
+ }
+
+ async fn handle_cmd(&mut self, cmd: ScrubWorkerCommand) {
+ match cmd {
+ ScrubWorkerCommand::Start => {
+ self.work = match std::mem::take(&mut self.work) {
+ ScrubWorkerState::Finished => {
+ let iterator = BlockStoreIterator::new(&self.manager);
+ ScrubWorkerState::Running(iterator)
+ }
+ work => {
+ error!("Cannot start scrub worker: already running!");
+ work
+ }
+ };
+ }
+ ScrubWorkerCommand::Pause(dur) => {
+ self.work = match std::mem::take(&mut self.work) {
+ ScrubWorkerState::Running(it) | ScrubWorkerState::Paused(it, _) => {
+ ScrubWorkerState::Paused(it, now_msec() + dur.as_millis() as u64)
+ }
+ work => {
+ error!("Cannot pause scrub worker: not running!");
+ work
+ }
+ };
+ }
+ ScrubWorkerCommand::Resume => {
+ self.work = match std::mem::take(&mut self.work) {
+ ScrubWorkerState::Paused(it, _) => ScrubWorkerState::Running(it),
+ work => {
+ error!("Cannot resume scrub worker: not paused!");
+ work
+ }
+ };
+ }
+ ScrubWorkerCommand::Cancel => {
+ self.work = match std::mem::take(&mut self.work) {
+ ScrubWorkerState::Running(_) | ScrubWorkerState::Paused(_, _) => {
+ ScrubWorkerState::Finished
+ }
+ work => {
+ error!("Cannot cancel scrub worker: not running!");
+ work
+ }
+ }
+ }
+ ScrubWorkerCommand::SetTranquility(t) => {
+ self.persisted.tranquility = t;
+ if let Err(e) = self.persister.save_async(&self.persisted).await {
+ error!("Could not save new tranquilitiy value: {}", e);
+ }
+ }
+ }
+ }
+}
+
+#[async_trait]
+impl Worker for ScrubWorker {
+ fn name(&self) -> String {
+ "Block scrub worker".into()
+ }
+
+ fn info(&self) -> Option<String> {
+ let s = match &self.work {
+ ScrubWorkerState::Running(bsi) => format!(
+ "{:.2}% done (tranquility = {})",
+ bsi.progress() * 100.,
+ self.persisted.tranquility
+ ),
+ ScrubWorkerState::Paused(bsi, rt) => {
+ format!(
+ "Paused, {:.2}% done, resumes at {}",
+ bsi.progress() * 100.,
+ msec_to_rfc3339(*rt)
+ )
+ }
+ ScrubWorkerState::Finished => format!(
+ "Last completed scrub: {}",
+ msec_to_rfc3339(self.persisted.time_last_complete_scrub)
+ ),
+ };
+ Some(format!(
+ "{} ; corruptions detected: {}",
+ s, self.persisted.corruptions_detected
+ ))
+ }
+
+ async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+ match self.rx_cmd.try_recv() {
+ Ok(cmd) => self.handle_cmd(cmd).await,
+ Err(mpsc::error::TryRecvError::Disconnected) => return Ok(WorkerState::Done),
+ Err(mpsc::error::TryRecvError::Empty) => (),
+ };
+
+ match &mut self.work {
+ ScrubWorkerState::Running(bsi) => {
+ self.tranquilizer.reset();
+ if let Some(hash) = bsi.next().await? {
+ match self.manager.read_block(&hash).await {
+ Err(Error::CorruptData(_)) => {
+ error!("Found corrupt data block during scrub: {:?}", hash);
+ self.persisted.corruptions_detected += 1;
+ self.persister.save_async(&self.persisted).await?;
+ }
+ Err(e) => return Err(e),
+ _ => (),
+ };
+ Ok(self
+ .tranquilizer
+ .tranquilize_worker(self.persisted.tranquility))
+ } else {
+ self.persisted.time_last_complete_scrub = now_msec();
+ self.persister.save_async(&self.persisted).await?;
+ self.work = ScrubWorkerState::Finished;
+ self.tranquilizer.clear();
+ Ok(WorkerState::Idle)
+ }
+ }
+ _ => Ok(WorkerState::Idle),
+ }
+ }
+
+ async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
+ let (wait_until, command) = match &self.work {
+ ScrubWorkerState::Running(_) => return WorkerState::Busy,
+ ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume),
+ ScrubWorkerState::Finished => (
+ self.persisted.time_last_complete_scrub + SCRUB_INTERVAL.as_millis() as u64,
+ ScrubWorkerCommand::Start,
+ ),
+ };
+
+ let now = now_msec();
+ if now >= wait_until {
+ self.handle_cmd(command).await;
+ return WorkerState::Busy;
+ }
+ let delay = Duration::from_millis(wait_until - now);
+ select! {
+ _ = tokio::time::sleep(delay) => self.handle_cmd(command).await,
+ cmd = self.rx_cmd.recv() => if let Some(cmd) = cmd {
+ self.handle_cmd(cmd).await;
+ } else {
+ return WorkerState::Done;
+ }
+ }
+
+ match &self.work {
+ ScrubWorkerState::Running(_) => WorkerState::Busy,
+ _ => WorkerState::Idle,
+ }
+ }
+}
+
+// ---- ---- ----
+// UTILITY FOR ENUMERATING THE BLOCK STORE
+// ---- ---- ----
+
+struct BlockStoreIterator {
+ path: Vec<ReadingDir>,
+}
+
+enum ReadingDir {
+ Pending(PathBuf),
+ Read {
+ subpaths: Vec<fs::DirEntry>,
+ pos: usize,
+ },
+}
+
+impl BlockStoreIterator {
+ fn new(manager: &BlockManager) -> Self {
+ let root_dir = manager.data_dir.clone();
+ Self {
+ path: vec![ReadingDir::Pending(root_dir)],
+ }
+ }
+
+ /// Returns progress done, between 0 and 1
+ fn progress(&self) -> f32 {
+ if self.path.is_empty() {
+ 1.0
+ } else {
+ let mut ret = 0.0;
+ let mut next_div = 1;
+ for p in self.path.iter() {
+ match p {
+ ReadingDir::Pending(_) => break,
+ ReadingDir::Read { subpaths, pos } => {
+ next_div *= subpaths.len();
+ ret += ((*pos - 1) as f32) / (next_div as f32);
+ }
+ }
+ }
+ ret
+ }
+ }
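// A worked example of the progress formula above (standalone, with
// hypothetical values): each directory level contributes its fraction of
// the remaining range, scaled by the product of the sizes of the
// enclosing levels.
fn progress_example() -> f32 {
    // (pos, subpaths.len()) per level: entry 2 of 4 top-level dirs,
    // then entry 5 of 10 entries within the current dir.
    let levels = [(2usize, 4usize), (5, 10)];
    let mut ret = 0.0;
    let mut next_div = 1;
    for (pos, len) in levels {
        next_div *= len;
        ret += ((pos - 1) as f32) / (next_div as f32);
    }
    ret // 1/4 + 4/40 = 0.35, i.e. 35% done
}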
+
+ async fn next(&mut self) -> Result<Option<Hash>, Error> {
+ loop {
+ let last_path = match self.path.last_mut() {
+ None => return Ok(None),
+ Some(lp) => lp,
+ };
+
+ if let ReadingDir::Pending(path) = last_path {
+ let mut reader = fs::read_dir(&path).await?;
+ let mut subpaths = vec![];
+ while let Some(ent) = reader.next_entry().await? {
+ subpaths.push(ent);
+ }
+ *last_path = ReadingDir::Read { subpaths, pos: 0 };
+ }
+
+ let (subpaths, pos) = match *last_path {
+ ReadingDir::Read {
+ ref subpaths,
+ ref mut pos,
+ } => (subpaths, pos),
+ ReadingDir::Pending(_) => unreachable!(),
+ };
+
+ let data_dir_ent = match subpaths.get(*pos) {
+ None => {
+ self.path.pop();
+ continue;
+ }
+ Some(ent) => {
+ *pos += 1;
+ ent
+ }
+ };
+
+ let name = data_dir_ent.file_name();
+ let name = if let Ok(n) = name.into_string() {
+ n
+ } else {
+ continue;
+ };
+ let ent_type = data_dir_ent.file_type().await?;
+
+ let name = name.strip_suffix(".zst").unwrap_or(&name);
+ if name.len() == 2 && hex::decode(&name).is_ok() && ent_type.is_dir() {
+ let path = data_dir_ent.path();
+ self.path.push(ReadingDir::Pending(path));
+ } else if name.len() == 64 {
+ if let Ok(h) = hex::decode(&name) {
+ let mut hash = [0u8; 32];
+ hash.copy_from_slice(&h);
+ return Ok(Some(hash.into()));
+ }
+ }
+ }
+ }
+}
diff --git a/src/block/resync.rs b/src/block/resync.rs
new file mode 100644
index 00000000..ada3ac54
--- /dev/null
+++ b/src/block/resync.rs
@@ -0,0 +1,589 @@
+use std::collections::HashSet;
+use std::convert::TryInto;
+use std::sync::{Arc, Mutex};
+use std::time::Duration;
+
+use arc_swap::ArcSwap;
+use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+
+use tokio::select;
+use tokio::sync::{watch, Notify};
+
+use opentelemetry::{
+ trace::{FutureExt as OtelFutureExt, TraceContextExt, Tracer},
+ Context, KeyValue,
+};
+
+use garage_db as db;
+use garage_db::counted_tree_hack::CountedTree;
+
+use garage_util::background::*;
+use garage_util::data::*;
+use garage_util::error::*;
+use garage_util::metrics::RecordDuration;
+use garage_util::persister::Persister;
+use garage_util::time::*;
+use garage_util::tranquilizer::Tranquilizer;
+
+use garage_rpc::system::System;
+use garage_rpc::*;
+
+use garage_table::replication::TableReplication;
+
+use crate::manager::*;
+
+// The delay between the time where a resync operation fails
+// and the time when it is retried, with exponential backoff
+// (multiplied by 2, 4, 8, 16, etc. for every consecutive failure).
+pub(crate) const RESYNC_RETRY_DELAY: Duration = Duration::from_secs(60);
+// The minimum retry delay is 60 seconds = 1 minute
+// The maximum retry delay is 60 seconds * 2^6 = 60 seconds << 6 = 64 minutes (~1 hour)
+pub(crate) const RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER: u64 = 6;
+
+// No more than 4 resync workers can be running in the system
+pub(crate) const MAX_RESYNC_WORKERS: usize = 4;
+// Resync tranquility is initially set to 2, but can be changed in the CLI
+// and the updated version is persisted over Garage restarts
+const INITIAL_RESYNC_TRANQUILITY: u32 = 2;
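// Worked example (not part of this diff) of the retry schedule these
// constants imply; the actual computation lives in the ErrorCounter struct
// referenced in the comments below, which this patch moves out of manager.rs:
fn retry_delay_example(consecutive_errors: u64) -> Duration {
    let power = consecutive_errors
        .saturating_sub(1)
        .min(RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER);
    // 1 error -> 1 min, 2 -> 2 min, 3 -> 4 min, ..., 7 or more -> 64 min (capped)
    RESYNC_RETRY_DELAY * (1 << power) as u32
}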
+
+pub struct BlockResyncManager {
+ pub(crate) queue: CountedTree,
+ pub(crate) notify: Notify,
+ pub(crate) errors: CountedTree,
+
+ busy_set: BusySet,
+
+ persister: Persister<ResyncPersistedConfig>,
+ persisted: ArcSwap<ResyncPersistedConfig>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Copy)]
+struct ResyncPersistedConfig {
+ n_workers: usize,
+ tranquility: u32,
+}
+
+enum ResyncIterResult {
+ BusyDidSomething,
+ BusyDidNothing,
+ IdleFor(Duration),
+}
+
+type BusySet = Arc<Mutex<HashSet<Vec<u8>>>>;
+
+struct BusyBlock {
+ time_bytes: Vec<u8>,
+ hash_bytes: Vec<u8>,
+ busy_set: BusySet,
+}
+
+impl BlockResyncManager {
+ pub(crate) fn new(db: &db::Db, system: &System) -> Self {
+ let queue = db
+ .open_tree("block_local_resync_queue")
+ .expect("Unable to open block_local_resync_queue tree");
+ let queue = CountedTree::new(queue).expect("Could not count block_local_resync_queue");
+
+ let errors = db
+ .open_tree("block_local_resync_errors")
+ .expect("Unable to open block_local_resync_errors tree");
+ let errors = CountedTree::new(errors).expect("Could not count block_local_resync_errors");
+
+ let persister = Persister::new(&system.metadata_dir, "resync_cfg");
+ let persisted = match persister.load() {
+ Ok(v) => v,
+ Err(_) => ResyncPersistedConfig {
+ n_workers: 1,
+ tranquility: INITIAL_RESYNC_TRANQUILITY,
+ },
+ };
+
+ Self {
+ queue,
+ notify: Notify::new(),
+ errors,
+ busy_set: Arc::new(Mutex::new(HashSet::new())),
+ persister,
+ persisted: ArcSwap::new(Arc::new(persisted)),
+ }
+ }
+
+ /// Get length of the resync queue
+ pub fn queue_len(&self) -> Result<usize, Error> {
+ // This currently can't return an error because the CountedTree hack
+ // doesn't error on .len(), but this will change when we remove the hack
+ // (hopefully someday!)
+ Ok(self.queue.len())
+ }
+
+ /// Get number of blocks that have an error
+ pub fn errors_len(&self) -> Result<usize, Error> {
+ // (see queue_len comment)
+ Ok(self.errors.len())
+ }
+
+ // ---- Resync loop ----
+
+ // This part manages a queue of blocks that need to be
+ // "resynchronized", i.e. checked to be present if we
+ // need them, or deleted once the garbage collection
+ // delay has passed.
+ //
+ // Here is an explanation of how the resync queue works.
+ // Two database trees are used to hold information
+ // about the status of blocks that need to be resynchronized:
+ //
+ // - resync.queue: a tree that is ordered first by a timestamp
+ // (in milliseconds since Unix epoch) that is the time at which
+ // the resync must be done, and second by block hash.
+ // The key in this tree is just:
+ // concat(timestamp (8 bytes), hash (32 bytes))
+ // The value is the same 32-byte hash.
+ //
+	// - resync.errors: a tree that indicates, for each block,
+	//   whether the last resync resulted in an error, and if so,
+	//   the following two pieces of information (see the ErrorCounter
+	//   struct):
+	//   - how many consecutive resync errors occurred for this block?
+	//   - when was the last try?
+	//   These two values are used to implement an
+	//   exponential backoff retry strategy.
+ // The key in this tree is the 32-byte hash of the block,
+ // and the value is the encoded ErrorCounter value.
+ //
+ // We need to have these two trees, because the resync queue
+ // is not just a queue of items to process, but a set of items
+	// that are waiting for a specific delay until we can process them
+ // (the delay being necessary both internally for the exponential
+ // backoff strategy, and exposed as a parameter when adding items
+ // to the queue, e.g. to wait until the GC delay has passed).
+ // This is why we need one tree ordered by time, and one
+ // ordered by identifier of item to be processed (block hash).
+ //
+	// When the worker wants to process an item taken from
+	// resync.queue, it checks in resync.errors whether an
+	// exponential back-off delay still has to elapse before
+	// the item may be processed. If so, the item in the queue
+	// is skipped (but added back to the queue, to be processed
+	// later once the delay has passed).
+ //
+	// An alternative that might have seemed natural is to
+	// only add items to resync.queue with a processing time that is
+	// after the delay, but there are several issues with this:
+	// - This would require synchronizing updates to resync.queue and
+	//   resync.errors by putting them both behind a lock (with the
+	//   current model, only one thread, the worker thread, accesses
+	//   resync.errors, so no synchronization is needed).
+	//   This would mean that block_incref might need to take a lock
+	//   before doing its work, meaning it would have a much higher
+	//   chance of not completing successfully if something bad
+	//   happens to Garage.
+ // Currently Garage is not able to recover from block_incref that
+ // doesn't complete successfully, because it is necessary to ensure
+ // the consistency between the state of the block manager and
+ // information in the BlockRef table.
+ // - If a resync fails, we put that block in the resync.errors table,
+ // and also add it back to resync.queue to be processed after
+ // the exponential back-off delay,
+ // but maybe the block is already scheduled to be resynced again
+ // at another time that is before the exponential back-off delay,
+ // and we have no way to check that easily. This means that
+ // in all cases, we need to check the resync.errors table
+ // in the resync loop at the time when a block is popped from
+ // the resync.queue.
+ // Overall, the current design is therefore simpler and more robust
+ // because it tolerates inconsistencies between the resync.queue
+ // and resync.errors table (items being scheduled in resync.queue
+ // for times that are earlier than the exponential back-off delay
+ // is a natural condition that is handled properly).
+
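+	// Note on the key layout used below: the timestamp is encoded with
+	// u64::to_be_bytes, so the lexicographic order of the queue keys
+	// matches the chronological order of the scheduled resync times,
+	// and iterating over the queue yields the earliest-due items first.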
+ pub(crate) fn put_to_resync(&self, hash: &Hash, delay: Duration) -> db::Result<()> {
+ let when = now_msec() + delay.as_millis() as u64;
+ self.put_to_resync_at(hash, when)
+ }
+
+ pub(crate) fn put_to_resync_at(&self, hash: &Hash, when: u64) -> db::Result<()> {
+ trace!("Put resync_queue: {} {:?}", when, hash);
+ let mut key = u64::to_be_bytes(when).to_vec();
+ key.extend(hash.as_ref());
+ self.queue.insert(key, hash.as_ref())?;
+ self.notify.notify_waiters();
+ Ok(())
+ }
+
+ async fn resync_iter(&self, manager: &BlockManager) -> Result<ResyncIterResult, db::Error> {
+ if let Some(block) = self.get_block_to_resync()? {
+ let time_msec = u64::from_be_bytes(block.time_bytes[0..8].try_into().unwrap());
+ let now = now_msec();
+
+ if now >= time_msec {
+ let hash = Hash::try_from(&block.hash_bytes[..]).unwrap();
+
+ if let Some(ec) = self.errors.get(hash.as_slice())? {
+ let ec = ErrorCounter::decode(&ec);
+ if now < ec.next_try() {
+					// if the time for the next retry after an error
+					// has not yet come, don't resync now and return
+					// early, but make sure the item stays in the
+					// queue, scheduled at the expected time
+ self.put_to_resync_at(&hash, ec.next_try())?;
+ // ec.next_try() > now >= time_msec, so this remove
+ // is not removing the one we added just above
+ // (we want to do the remove after the insert to ensure
+ // that the item is not lost if we crash in-between)
+ self.queue.remove(&block.time_bytes)?;
+ return Ok(ResyncIterResult::BusyDidNothing);
+ }
+ }
+
+ let tracer = opentelemetry::global::tracer("garage");
+ let trace_id = gen_uuid();
+ let span = tracer
+ .span_builder("Resync block")
+ .with_trace_id(
+ opentelemetry::trace::TraceId::from_hex(&hex::encode(
+ &trace_id.as_slice()[..16],
+ ))
+ .unwrap(),
+ )
+ .with_attributes(vec![KeyValue::new("block", format!("{:?}", hash))])
+ .start(&tracer);
+
+ let res = self
+ .resync_block(manager, &hash)
+ .with_context(Context::current_with_span(span))
+ .bound_record_duration(&manager.metrics.resync_duration)
+ .await;
+
+ manager.metrics.resync_counter.add(1);
+
+ if let Err(e) = &res {
+ manager.metrics.resync_error_counter.add(1);
+ warn!("Error when resyncing {:?}: {}", hash, e);
+
+ let err_counter = match self.errors.get(hash.as_slice())? {
+ Some(ec) => ErrorCounter::decode(&ec).add1(now + 1),
+ None => ErrorCounter::new(now + 1),
+ };
+
+ self.errors.insert(hash.as_slice(), err_counter.encode())?;
+
+ self.put_to_resync_at(&hash, err_counter.next_try())?;
+ // err_counter.next_try() >= now + 1 > now,
+ // the entry we remove from the queue is not
+ // the entry we inserted with put_to_resync_at
+ self.queue.remove(&block.time_bytes)?;
+ } else {
+ self.errors.remove(hash.as_slice())?;
+ self.queue.remove(&block.time_bytes)?;
+ }
+
+ Ok(ResyncIterResult::BusyDidSomething)
+ } else {
+ Ok(ResyncIterResult::IdleFor(Duration::from_millis(
+ time_msec - now,
+ )))
+ }
+ } else {
+ // Here we wait either for a notification that an item has been
+ // added to the queue, or for a constant delay of 10 secs to expire.
+ // The delay avoids a race condition where the notification happens
+ // between the time we checked the queue and the first poll
+ // to resync_notify.notified(): if that happens, we'll just loop
+ // back 10 seconds later, which is fine.
+ Ok(ResyncIterResult::IdleFor(Duration::from_secs(10)))
+ }
+ }
+
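+	// Returns the first entry of the resync queue that is not already
+	// being handled by another worker. The entry is recorded in busy_set,
+	// and removed from it again when the returned BusyBlock is dropped
+	// (see the Drop impl of BusyBlock below).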
+ fn get_block_to_resync(&self) -> Result<Option<BusyBlock>, db::Error> {
+ let mut busy = self.busy_set.lock().unwrap();
+ for it in self.queue.iter()? {
+ let (time_bytes, hash_bytes) = it?;
+ if !busy.contains(&time_bytes) {
+ busy.insert(time_bytes.clone());
+ return Ok(Some(BusyBlock {
+ time_bytes,
+ hash_bytes,
+ busy_set: self.busy_set.clone(),
+ }));
+ }
+ }
+ Ok(None)
+ }
+
+ async fn resync_block(&self, manager: &BlockManager, hash: &Hash) -> Result<(), Error> {
+ let BlockStatus { exists, needed } = manager.check_block_status(hash).await?;
+
+ if exists != needed.is_needed() || exists != needed.is_nonzero() {
+ debug!(
+ "Resync block {:?}: exists {}, nonzero rc {}, deletable {}",
+ hash,
+ exists,
+ needed.is_nonzero(),
+ needed.is_deletable(),
+ );
+ }
+
+ if exists && needed.is_deletable() {
+ info!("Resync block {:?}: offloading and deleting", hash);
+
+ let mut who = manager.replication.write_nodes(hash);
+ if who.len() < manager.replication.write_quorum() {
+ return Err(Error::Message("Not trying to offload block because we don't have a quorum of nodes to write to".to_string()));
+ }
+ who.retain(|id| *id != manager.system.id);
+
+ let who_needs_resps = manager
+ .system
+ .rpc
+ .call_many(
+ &manager.endpoint,
+ &who,
+ BlockRpc::NeedBlockQuery(*hash),
+ RequestStrategy::with_priority(PRIO_BACKGROUND),
+ )
+ .await?;
+
+ let mut need_nodes = vec![];
+ for (node, needed) in who_needs_resps {
+ match needed.err_context("NeedBlockQuery RPC")? {
+ BlockRpc::NeedBlockReply(needed) => {
+ if needed {
+ need_nodes.push(node);
+ }
+ }
+ m => {
+ return Err(Error::unexpected_rpc_message(m));
+ }
+ }
+ }
+
+ if !need_nodes.is_empty() {
+ trace!(
+ "Block {:?} needed by {} nodes, sending",
+ hash,
+ need_nodes.len()
+ );
+
+ for node in need_nodes.iter() {
+ manager
+ .metrics
+ .resync_send_counter
+ .add(1, &[KeyValue::new("to", format!("{:?}", node))]);
+ }
+
+ let block = manager.read_block(hash).await?;
+ let (header, bytes) = block.into_parts();
+ let put_block_message = Req::new(BlockRpc::PutBlock {
+ hash: *hash,
+ header,
+ })?
+ .with_stream_from_buffer(bytes);
+ manager
+ .system
+ .rpc
+ .try_call_many(
+ &manager.endpoint,
+ &need_nodes[..],
+ put_block_message,
+ RequestStrategy::with_priority(PRIO_BACKGROUND)
+ .with_quorum(need_nodes.len()),
+ )
+ .await
+ .err_context("PutBlock RPC")?;
+ }
+ info!(
+ "Deleting unneeded block {:?}, offload finished ({} / {})",
+ hash,
+ need_nodes.len(),
+ who.len()
+ );
+
+ manager.delete_if_unneeded(hash).await?;
+
+ manager.rc.clear_deleted_block_rc(hash)?;
+ }
+
+ if needed.is_nonzero() && !exists {
+ info!(
+ "Resync block {:?}: fetching absent but needed block (refcount > 0)",
+ hash
+ );
+
+ let block_data = manager.rpc_get_raw_block(hash, None).await?;
+
+ manager.metrics.resync_recv_counter.add(1);
+
+ manager.write_block(hash, &block_data).await?;
+ }
+
+ Ok(())
+ }
+
+ async fn update_persisted(
+ &self,
+ update: impl Fn(&mut ResyncPersistedConfig),
+ ) -> Result<(), Error> {
+ let mut cfg: ResyncPersistedConfig = *self.persisted.load().as_ref();
+ update(&mut cfg);
+ self.persister.save_async(&cfg).await?;
+ self.persisted.store(Arc::new(cfg));
+ self.notify.notify_waiters();
+ Ok(())
+ }
+
+ pub async fn set_n_workers(&self, n_workers: usize) -> Result<(), Error> {
+ if !(1..=MAX_RESYNC_WORKERS).contains(&n_workers) {
+ return Err(Error::Message(format!(
+ "Invalid number of resync workers, must be between 1 and {}",
+ MAX_RESYNC_WORKERS
+ )));
+ }
+ self.update_persisted(|cfg| cfg.n_workers = n_workers).await
+ }
+
+ pub async fn set_tranquility(&self, tranquility: u32) -> Result<(), Error> {
+ self.update_persisted(|cfg| cfg.tranquility = tranquility)
+ .await
+ }
+}
+
+impl Drop for BusyBlock {
+ fn drop(&mut self) {
+ let mut busy = self.busy_set.lock().unwrap();
+ busy.remove(&self.time_bytes);
+ }
+}
+
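+// One ResyncWorker is spawned for each possible worker slot (up to
+// MAX_RESYNC_WORKERS). Workers whose index is at least the currently
+// configured n_workers report themselves as unused and stay idle until
+// the configuration changes.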
+pub(crate) struct ResyncWorker {
+ index: usize,
+ manager: Arc<BlockManager>,
+ tranquilizer: Tranquilizer,
+ next_delay: Duration,
+}
+
+impl ResyncWorker {
+ pub(crate) fn new(index: usize, manager: Arc<BlockManager>) -> Self {
+ Self {
+ index,
+ manager,
+ tranquilizer: Tranquilizer::new(30),
+ next_delay: Duration::from_secs(10),
+ }
+ }
+}
+
+#[async_trait]
+impl Worker for ResyncWorker {
+ fn name(&self) -> String {
+ format!("Block resync worker #{}", self.index + 1)
+ }
+
+ fn info(&self) -> Option<String> {
+ let persisted = self.manager.resync.persisted.load();
+
+ if self.index >= persisted.n_workers {
+ return Some("(unused)".into());
+ }
+
+ let mut ret = vec![];
+ ret.push(format!("tranquility = {}", persisted.tranquility));
+
+ let qlen = self.manager.resync.queue_len().unwrap_or(0);
+ if qlen > 0 {
+ ret.push(format!("{} blocks in queue", qlen));
+ }
+
+ let elen = self.manager.resync.errors_len().unwrap_or(0);
+ if elen > 0 {
+ ret.push(format!("{} blocks in error state", elen));
+ }
+
+ Some(ret.join(", "))
+ }
+
+ async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+ if self.index >= self.manager.resync.persisted.load().n_workers {
+ return Ok(WorkerState::Idle);
+ }
+
+ self.tranquilizer.reset();
+ match self.manager.resync.resync_iter(&self.manager).await {
+ Ok(ResyncIterResult::BusyDidSomething) => Ok(self
+ .tranquilizer
+ .tranquilize_worker(self.manager.resync.persisted.load().tranquility)),
+ Ok(ResyncIterResult::BusyDidNothing) => Ok(WorkerState::Busy),
+ Ok(ResyncIterResult::IdleFor(delay)) => {
+ self.next_delay = delay;
+ Ok(WorkerState::Idle)
+ }
+ Err(e) => {
+				// The errors that we have here are only db errors.
+				// We don't really know how to handle them, so just ¯\_(ツ)_/¯
+				// (there is kind of an assumption that the db engine won't
+				// error on us; if it does, there is not much we can do --
+				// TODO should we just panic?)
+ // Here we just give the error to the worker manager,
+ // it will print it to the logs and increment a counter
+ Err(e.into())
+ }
+ }
+ }
+
+ async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
+ while self.index >= self.manager.resync.persisted.load().n_workers {
+ self.manager.resync.notify.notified().await
+ }
+
+ select! {
+ _ = tokio::time::sleep(self.next_delay) => (),
+ _ = self.manager.resync.notify.notified() => (),
+ };
+
+ WorkerState::Busy
+ }
+}
+
+/// Counts the number of errors when resyncing a block,
+/// and the time of the last try.
+/// Used to implement exponential backoff.
+#[derive(Clone, Copy, Debug)]
+struct ErrorCounter {
+ errors: u64,
+ last_try: u64,
+}
+
+impl ErrorCounter {
+ fn new(now: u64) -> Self {
+ Self {
+ errors: 1,
+ last_try: now,
+ }
+ }
+
+ fn decode(data: &[u8]) -> Self {
+ Self {
+ errors: u64::from_be_bytes(data[0..8].try_into().unwrap()),
+ last_try: u64::from_be_bytes(data[8..16].try_into().unwrap()),
+ }
+ }
+ fn encode(&self) -> Vec<u8> {
+ [
+ u64::to_be_bytes(self.errors),
+ u64::to_be_bytes(self.last_try),
+ ]
+ .concat()
+ }
+
+ fn add1(self, now: u64) -> Self {
+ Self {
+ errors: self.errors + 1,
+ last_try: now,
+ }
+ }
+
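+	// Worked example of the resulting backoff schedule, following from
+	// the constants defined at the top of this file:
+	//   errors = 1  ->  delay = 60s << 0 = 1 minute
+	//   errors = 3  ->  delay = 60s << 2 = 4 minutes
+	//   errors >= 7 ->  delay capped at 60s << 6 = 64 minutes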
+ fn delay_msec(&self) -> u64 {
+ (RESYNC_RETRY_DELAY.as_millis() as u64)
+ << std::cmp::min(self.errors - 1, RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER)
+ }
+ fn next_try(&self) -> u64 {
+ self.last_try + self.delay_msec()
+ }
+}
diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml
new file mode 100644
index 00000000..62dda2ca
--- /dev/null
+++ b/src/db/Cargo.toml
@@ -0,0 +1,39 @@
+[package]
+name = "garage_db"
+version = "0.8.0"
+authors = ["Alex Auvolat <alex@adnab.me>"]
+edition = "2018"
+license = "AGPL-3.0"
+description = "Abstraction over multiple key/value storage engines that supports transactions"
+repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
+readme = "../../README.md"
+
+[lib]
+path = "lib.rs"
+
+[[bin]]
+name = "convert"
+path = "bin/convert.rs"
+required-features = ["cli"]
+
+[dependencies]
+err-derive = "0.3"
+hexdump = "0.1"
+tracing = "0.1.30"
+
+heed = { version = "0.11", default-features = false, features = ["lmdb"], optional = true }
+rusqlite = { version = "0.27", optional = true }
+sled = { version = "0.34", optional = true }
+
+# cli deps
+clap = { version = "3.1.18", optional = true, features = ["derive", "env"] }
+pretty_env_logger = { version = "0.4", optional = true }
+
+[dev-dependencies]
+mktemp = "0.4"
+
+[features]
+bundled-libs = [ "rusqlite/bundled" ]
+cli = ["clap", "pretty_env_logger"]
+lmdb = [ "heed" ]
+sqlite = [ "rusqlite" ]
diff --git a/src/db/bin/convert.rs b/src/db/bin/convert.rs
new file mode 100644
index 00000000..bbde2048
--- /dev/null
+++ b/src/db/bin/convert.rs
@@ -0,0 +1,69 @@
+use std::path::PathBuf;
+
+use garage_db::*;
+
+use clap::Parser;
+
+/// Garage DB conversion tool: copies all trees from one DB engine to another
+#[derive(Parser, Debug)]
+#[clap(author, version, about, long_about = None)]
+struct Args {
+ /// Input DB path
+ #[clap(short = 'i')]
+ input_path: PathBuf,
+ /// Input DB engine
+ #[clap(short = 'a')]
+ input_engine: String,
+
+ /// Output DB path
+ #[clap(short = 'o')]
+ output_path: PathBuf,
+ /// Output DB engine
+ #[clap(short = 'b')]
+ output_engine: String,
+}
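+
+// Example invocation (paths are illustrative; engine names must be among
+// those recognized by open_db below):
+//
+//   convert -i /tmp/old-db -a sled -o /tmp/new-db -b lmdb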
+
+fn main() {
+ let args = Args::parse();
+ pretty_env_logger::init();
+
+ match do_conversion(args) {
+ Ok(()) => println!("Success!"),
+ Err(e) => eprintln!("Error: {}", e),
+ }
+}
+
+fn do_conversion(args: Args) -> Result<()> {
+ let input = open_db(args.input_path, args.input_engine)?;
+ let output = open_db(args.output_path, args.output_engine)?;
+ output.import(&input)?;
+ Ok(())
+}
+
+fn open_db(path: PathBuf, engine: String) -> Result<Db> {
+ match engine.as_str() {
+ "sled" => {
+ let db = sled_adapter::sled::Config::default().path(&path).open()?;
+ Ok(sled_adapter::SledDb::init(db))
+ }
+ "sqlite" | "sqlite3" | "rusqlite" => {
+ let db = sqlite_adapter::rusqlite::Connection::open(&path)?;
+ Ok(sqlite_adapter::SqliteDb::init(db))
+ }
+ "lmdb" | "heed" => {
+ std::fs::create_dir_all(&path).map_err(|e| {
+ Error(format!("Unable to create LMDB data directory: {}", e).into())
+ })?;
+
+ let map_size = lmdb_adapter::recommended_map_size();
+
+ let db = lmdb_adapter::heed::EnvOpenOptions::new()
+ .max_dbs(100)
+ .map_size(map_size)
+ .open(&path)
+ .unwrap();
+ Ok(lmdb_adapter::LmdbDb::init(db))
+ }
+ e => Err(Error(format!("Invalid DB engine: {}", e).into())),
+ }
+}
diff --git a/src/db/counted_tree_hack.rs b/src/db/counted_tree_hack.rs
new file mode 100644
index 00000000..bbe943a2
--- /dev/null
+++ b/src/db/counted_tree_hack.rs
@@ -0,0 +1,127 @@
+//! This hack allows a db tree to keep in RAM a counter of the number of entries
+//! it contains, which is used to answer calls to .len() on it. This is useful only
+//! for the sled backend, where .len() would otherwise have to traverse the whole
+//! tree to count items. For sqlite and lmdb, this is mostly useless (but
+//! hopefully not harmful!). Note that a CountedTree cannot be part of a
+//! transaction.
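+//!
+//! A minimal usage sketch (assuming an already-opened `db: Db`, with
+//! error handling elided):
+//!
+//! ```ignore
+//! let tree = db.open_tree("example_tree")?;
+//! let counted = CountedTree::new(tree)?;
+//! counted.insert(b"key", b"value")?;
+//! assert_eq!(counted.len(), 1); // O(1): answered from the in-RAM counter
+//! ```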
+
+use std::sync::{
+ atomic::{AtomicUsize, Ordering},
+ Arc,
+};
+
+use crate::{Result, Tree, TxError, Value, ValueIter};
+
+#[derive(Clone)]
+pub struct CountedTree(Arc<CountedTreeInternal>);
+
+struct CountedTreeInternal {
+ tree: Tree,
+ len: AtomicUsize,
+}
+
+impl CountedTree {
+ pub fn new(tree: Tree) -> Result<Self> {
+ let len = tree.len()?;
+ Ok(Self(Arc::new(CountedTreeInternal {
+ tree,
+ len: AtomicUsize::new(len),
+ })))
+ }
+
+ pub fn len(&self) -> usize {
+ self.0.len.load(Ordering::SeqCst)
+ }
+
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<Value>> {
+ self.0.tree.get(key)
+ }
+
+ pub fn first(&self) -> Result<Option<(Value, Value)>> {
+ self.0.tree.first()
+ }
+
+ pub fn iter(&self) -> Result<ValueIter<'_>> {
+ self.0.tree.iter()
+ }
+
+ // ---- writing functions ----
+
+ pub fn insert<K, V>(&self, key: K, value: V) -> Result<Option<Value>>
+ where
+ K: AsRef<[u8]>,
+ V: AsRef<[u8]>,
+ {
+ let old_val = self.0.tree.insert(key, value)?;
+ if old_val.is_none() {
+ self.0.len.fetch_add(1, Ordering::SeqCst);
+ }
+ Ok(old_val)
+ }
+
+ pub fn remove<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<Value>> {
+ let old_val = self.0.tree.remove(key)?;
+ if old_val.is_some() {
+ self.0.len.fetch_sub(1, Ordering::SeqCst);
+ }
+ Ok(old_val)
+ }
+
+ pub fn compare_and_swap<K, OV, NV>(
+ &self,
+ key: K,
+ expected_old: Option<OV>,
+ new: Option<NV>,
+ ) -> Result<bool>
+ where
+ K: AsRef<[u8]>,
+ OV: AsRef<[u8]>,
+ NV: AsRef<[u8]>,
+ {
+ let old_some = expected_old.is_some();
+ let new_some = new.is_some();
+
+ let tx_res = self.0.tree.db().transaction(|mut tx| {
+ let old_val = tx.get(&self.0.tree, &key)?;
+ let is_same = match (&old_val, &expected_old) {
+ (None, None) => true,
+ (Some(x), Some(y)) if x == y.as_ref() => true,
+ _ => false,
+ };
+ if is_same {
+ match &new {
+ Some(v) => {
+ tx.insert(&self.0.tree, &key, v)?;
+ }
+ None => {
+ tx.remove(&self.0.tree, &key)?;
+ }
+ }
+ tx.commit(())
+ } else {
+ tx.abort(())
+ }
+ });
+
+ match tx_res {
+ Ok(()) => {
+ match (old_some, new_some) {
+ (false, true) => {
+ self.0.len.fetch_add(1, Ordering::SeqCst);
+ }
+ (true, false) => {
+ self.0.len.fetch_sub(1, Ordering::SeqCst);
+ }
+ _ => (),
+ }
+ Ok(true)
+ }
+ Err(TxError::Abort(())) => Ok(false),
+ Err(TxError::Db(e)) => Err(e),
+ }
+ }
+}
diff --git a/src/db/lib.rs b/src/db/lib.rs
new file mode 100644
index 00000000..d96586be
--- /dev/null
+++ b/src/db/lib.rs
@@ -0,0 +1,416 @@
+#[macro_use]
+#[cfg(feature = "sqlite")]
+extern crate tracing;
+
+#[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))]
+compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite.");
+
+#[cfg(feature = "lmdb")]
+pub mod lmdb_adapter;
+#[cfg(feature = "sled")]
+pub mod sled_adapter;
+#[cfg(feature = "sqlite")]
+pub mod sqlite_adapter;
+
+pub mod counted_tree_hack;
+
+#[cfg(test)]
+pub mod test;
+
+use core::ops::{Bound, RangeBounds};
+
+use std::borrow::Cow;
+use std::cell::Cell;
+use std::sync::Arc;
+
+use err_derive::Error;
+
+#[derive(Clone)]
+pub struct Db(pub(crate) Arc<dyn IDb>);
+
+pub struct Transaction<'a>(&'a mut dyn ITx);
+
+#[derive(Clone)]
+pub struct Tree(Arc<dyn IDb>, usize);
+
+pub type Value = Vec<u8>;
+pub type ValueIter<'a> = Box<dyn std::iter::Iterator<Item = Result<(Value, Value)>> + 'a>;
+pub type TxValueIter<'a> = Box<dyn std::iter::Iterator<Item = TxOpResult<(Value, Value)>> + 'a>;
+
+// ----
+
+#[derive(Debug, Error)]
+#[error(display = "{}", _0)]
+pub struct Error(pub Cow<'static, str>);
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+#[derive(Debug, Error)]
+#[error(display = "{}", _0)]
+pub struct TxOpError(pub(crate) Error);
+pub type TxOpResult<T> = std::result::Result<T, TxOpError>;
+
+pub enum TxError<E> {
+ Abort(E),
+ Db(Error),
+}
+pub type TxResult<R, E> = std::result::Result<R, TxError<E>>;
+
+impl<E> From<TxOpError> for TxError<E> {
+ fn from(e: TxOpError) -> TxError<E> {
+ TxError::Db(e.0)
+ }
+}
+
+pub fn unabort<R, E>(res: TxResult<R, E>) -> TxOpResult<std::result::Result<R, E>> {
+ match res {
+ Ok(v) => Ok(Ok(v)),
+ Err(TxError::Abort(e)) => Ok(Err(e)),
+ Err(TxError::Db(e)) => Err(TxOpError(e)),
+ }
+}
+
+// ----
+
+impl Db {
+ pub fn engine(&self) -> String {
+ self.0.engine()
+ }
+
+ pub fn open_tree<S: AsRef<str>>(&self, name: S) -> Result<Tree> {
+ let tree_id = self.0.open_tree(name.as_ref())?;
+ Ok(Tree(self.0.clone(), tree_id))
+ }
+
+ pub fn list_trees(&self) -> Result<Vec<String>> {
+ self.0.list_trees()
+ }
+
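+	/// Runs `fun` inside a database transaction. The value passed to
+	/// `tx.commit(...)` is returned on success, and the value passed to
+	/// `tx.abort(...)` comes back as `TxError::Abort`. Since `fun` is a
+	/// `Fn`, engines that retry on conflict (e.g. Sled) may call it more
+	/// than once. A minimal usage sketch (assuming an already-opened
+	/// `tree: Tree`):
+	///
+	/// ```ignore
+	/// let res = db.transaction::<_, (), _>(|mut tx| {
+	///     tx.insert(&tree, b"k", b"v")?;
+	///     tx.commit(42)
+	/// });
+	/// assert!(matches!(res, Ok(42)));
+	/// ```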
+ pub fn transaction<R, E, F>(&self, fun: F) -> TxResult<R, E>
+ where
+ F: Fn(Transaction<'_>) -> TxResult<R, E>,
+ {
+ let f = TxFn {
+ function: fun,
+ result: Cell::new(None),
+ };
+ let tx_res = self.0.transaction(&f);
+ let ret = f
+ .result
+ .into_inner()
+ .expect("Transaction did not store result");
+
+ match tx_res {
+ Ok(()) => {
+ assert!(matches!(ret, Ok(_)));
+ ret
+ }
+ Err(TxError::Abort(())) => {
+ assert!(matches!(ret, Err(TxError::Abort(_))));
+ ret
+ }
+ Err(TxError::Db(e2)) => match ret {
+				// Ok was stored -> the error occurred when finalizing
+ // transaction
+ Ok(_) => Err(TxError::Db(e2)),
+ // An error was already stored: that's the one we want to
+ // return
+ Err(TxError::Db(e)) => Err(TxError::Db(e)),
+ _ => unreachable!(),
+ },
+ }
+ }
+
+ pub fn import(&self, other: &Db) -> Result<()> {
+ let existing_trees = self.list_trees()?;
+ if !existing_trees.is_empty() {
+ return Err(Error(
+ format!(
+ "destination database already contains data: {:?}",
+ existing_trees
+ )
+ .into(),
+ ));
+ }
+
+ let tree_names = other.list_trees()?;
+ for name in tree_names {
+ let tree = self.open_tree(&name)?;
+ if tree.len()? > 0 {
+ return Err(Error(format!("tree {} already contains data", name).into()));
+ }
+
+ let ex_tree = other.open_tree(&name)?;
+
+ let tx_res = self.transaction(|mut tx| {
+ let mut i = 0;
+ for item in ex_tree.iter().map_err(TxError::Abort)? {
+ let (k, v) = item.map_err(TxError::Abort)?;
+ tx.insert(&tree, k, v)?;
+ i += 1;
+ if i % 1000 == 0 {
+ println!("{}: imported {}", name, i);
+ }
+ }
+ tx.commit(i)
+ });
+ let total = match tx_res {
+ Err(TxError::Db(e)) => return Err(e),
+ Err(TxError::Abort(e)) => return Err(e),
+ Ok(x) => x,
+ };
+
+ println!("{}: finished importing, {} items", name, total);
+ }
+ Ok(())
+ }
+}
+
+#[allow(clippy::len_without_is_empty)]
+impl Tree {
+ #[inline]
+ pub fn db(&self) -> Db {
+ Db(self.0.clone())
+ }
+
+ #[inline]
+ pub fn get<T: AsRef<[u8]>>(&self, key: T) -> Result<Option<Value>> {
+ self.0.get(self.1, key.as_ref())
+ }
+ #[inline]
+ pub fn len(&self) -> Result<usize> {
+ self.0.len(self.1)
+ }
+
+ #[inline]
+ pub fn first(&self) -> Result<Option<(Value, Value)>> {
+ self.iter()?.next().transpose()
+ }
+ #[inline]
+ pub fn get_gt<T: AsRef<[u8]>>(&self, from: T) -> Result<Option<(Value, Value)>> {
+ self.range((Bound::Excluded(from), Bound::Unbounded))?
+ .next()
+ .transpose()
+ }
+
+ /// Returns the old value if there was one
+ #[inline]
+ pub fn insert<T: AsRef<[u8]>, U: AsRef<[u8]>>(
+ &self,
+ key: T,
+ value: U,
+ ) -> Result<Option<Value>> {
+ self.0.insert(self.1, key.as_ref(), value.as_ref())
+ }
+ /// Returns the old value if there was one
+ #[inline]
+ pub fn remove<T: AsRef<[u8]>>(&self, key: T) -> Result<Option<Value>> {
+ self.0.remove(self.1, key.as_ref())
+ }
+ /// Clears all values from the tree
+ #[inline]
+ pub fn clear(&self) -> Result<()> {
+ self.0.clear(self.1)
+ }
+
+ #[inline]
+ pub fn iter(&self) -> Result<ValueIter<'_>> {
+ self.0.iter(self.1)
+ }
+ #[inline]
+ pub fn iter_rev(&self) -> Result<ValueIter<'_>> {
+ self.0.iter_rev(self.1)
+ }
+
+ #[inline]
+ pub fn range<K, R>(&self, range: R) -> Result<ValueIter<'_>>
+ where
+ K: AsRef<[u8]>,
+ R: RangeBounds<K>,
+ {
+ let sb = range.start_bound();
+ let eb = range.end_bound();
+ self.0.range(self.1, get_bound(sb), get_bound(eb))
+ }
+ #[inline]
+ pub fn range_rev<K, R>(&self, range: R) -> Result<ValueIter<'_>>
+ where
+ K: AsRef<[u8]>,
+ R: RangeBounds<K>,
+ {
+ let sb = range.start_bound();
+ let eb = range.end_bound();
+ self.0.range_rev(self.1, get_bound(sb), get_bound(eb))
+ }
+}
+
+#[allow(clippy::len_without_is_empty)]
+impl<'a> Transaction<'a> {
+ #[inline]
+ pub fn get<T: AsRef<[u8]>>(&self, tree: &Tree, key: T) -> TxOpResult<Option<Value>> {
+ self.0.get(tree.1, key.as_ref())
+ }
+ #[inline]
+ pub fn len(&self, tree: &Tree) -> TxOpResult<usize> {
+ self.0.len(tree.1)
+ }
+
+ /// Returns the old value if there was one
+ #[inline]
+ pub fn insert<T: AsRef<[u8]>, U: AsRef<[u8]>>(
+ &mut self,
+ tree: &Tree,
+ key: T,
+ value: U,
+ ) -> TxOpResult<Option<Value>> {
+ self.0.insert(tree.1, key.as_ref(), value.as_ref())
+ }
+ /// Returns the old value if there was one
+ #[inline]
+ pub fn remove<T: AsRef<[u8]>>(&mut self, tree: &Tree, key: T) -> TxOpResult<Option<Value>> {
+ self.0.remove(tree.1, key.as_ref())
+ }
+
+ #[inline]
+ pub fn iter(&self, tree: &Tree) -> TxOpResult<TxValueIter<'_>> {
+ self.0.iter(tree.1)
+ }
+ #[inline]
+ pub fn iter_rev(&self, tree: &Tree) -> TxOpResult<TxValueIter<'_>> {
+ self.0.iter_rev(tree.1)
+ }
+
+ #[inline]
+ pub fn range<K, R>(&self, tree: &Tree, range: R) -> TxOpResult<TxValueIter<'_>>
+ where
+ K: AsRef<[u8]>,
+ R: RangeBounds<K>,
+ {
+ let sb = range.start_bound();
+ let eb = range.end_bound();
+ self.0.range(tree.1, get_bound(sb), get_bound(eb))
+ }
+ #[inline]
+ pub fn range_rev<K, R>(&self, tree: &Tree, range: R) -> TxOpResult<TxValueIter<'_>>
+ where
+ K: AsRef<[u8]>,
+ R: RangeBounds<K>,
+ {
+ let sb = range.start_bound();
+ let eb = range.end_bound();
+ self.0.range_rev(tree.1, get_bound(sb), get_bound(eb))
+ }
+
+ // ----
+
+ #[inline]
+ pub fn abort<R, E>(self, e: E) -> TxResult<R, E> {
+ Err(TxError::Abort(e))
+ }
+
+ #[inline]
+ pub fn commit<R, E>(self, r: R) -> TxResult<R, E> {
+ Ok(r)
+ }
+}
+
+// ---- Internal interfaces
+
+pub(crate) trait IDb: Send + Sync {
+ fn engine(&self) -> String;
+ fn open_tree(&self, name: &str) -> Result<usize>;
+ fn list_trees(&self) -> Result<Vec<String>>;
+
+ fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>>;
+ fn len(&self, tree: usize) -> Result<usize>;
+
+ fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>>;
+ fn remove(&self, tree: usize, key: &[u8]) -> Result<Option<Value>>;
+ fn clear(&self, tree: usize) -> Result<()>;
+
+ fn iter(&self, tree: usize) -> Result<ValueIter<'_>>;
+ fn iter_rev(&self, tree: usize) -> Result<ValueIter<'_>>;
+
+ fn range<'r>(
+ &self,
+ tree: usize,
+ low: Bound<&'r [u8]>,
+ high: Bound<&'r [u8]>,
+ ) -> Result<ValueIter<'_>>;
+ fn range_rev<'r>(
+ &self,
+ tree: usize,
+ low: Bound<&'r [u8]>,
+ high: Bound<&'r [u8]>,
+ ) -> Result<ValueIter<'_>>;
+
+ fn transaction(&self, f: &dyn ITxFn) -> TxResult<(), ()>;
+}
+
+pub(crate) trait ITx {
+ fn get(&self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>>;
+ fn len(&self, tree: usize) -> TxOpResult<usize>;
+
+ fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>>;
+ fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>>;
+
+ fn iter(&self, tree: usize) -> TxOpResult<TxValueIter<'_>>;
+ fn iter_rev(&self, tree: usize) -> TxOpResult<TxValueIter<'_>>;
+
+ fn range<'r>(
+ &self,
+ tree: usize,
+ low: Bound<&'r [u8]>,
+ high: Bound<&'r [u8]>,
+ ) -> TxOpResult<TxValueIter<'_>>;
+ fn range_rev<'r>(
+ &self,
+ tree: usize,
+ low: Bound<&'r [u8]>,
+ high: Bound<&'r [u8]>,
+ ) -> TxOpResult<TxValueIter<'_>>;
+}
+
+pub(crate) trait ITxFn {
+ fn try_on(&self, tx: &mut dyn ITx) -> TxFnResult;
+}
+
+pub(crate) enum TxFnResult {
+ Ok,
+ Abort,
+ DbErr,
+}
+
+struct TxFn<F, R, E>
+where
+ F: Fn(Transaction<'_>) -> TxResult<R, E>,
+{
+ function: F,
+ result: Cell<Option<TxResult<R, E>>>,
+}
+
+impl<F, R, E> ITxFn for TxFn<F, R, E>
+where
+ F: Fn(Transaction<'_>) -> TxResult<R, E>,
+{
+ fn try_on(&self, tx: &mut dyn ITx) -> TxFnResult {
+ let res = (self.function)(Transaction(tx));
+ let res2 = match &res {
+ Ok(_) => TxFnResult::Ok,
+ Err(TxError::Abort(_)) => TxFnResult::Abort,
+ Err(TxError::Db(_)) => TxFnResult::DbErr,
+ };
+ self.result.set(Some(res));
+ res2
+ }
+}
+
+// ----
+
+fn get_bound<K: AsRef<[u8]>>(b: Bound<&K>) -> Bound<&[u8]> {
+ match b {
+ Bound::Included(v) => Bound::Included(v.as_ref()),
+ Bound::Excluded(v) => Bound::Excluded(v.as_ref()),
+ Bound::Unbounded => Bound::Unbounded,
+ }
+}
diff --git a/src/db/lmdb_adapter.rs b/src/db/lmdb_adapter.rs
new file mode 100644
index 00000000..c036c990
--- /dev/null
+++ b/src/db/lmdb_adapter.rs
@@ -0,0 +1,350 @@
+use core::ops::Bound;
+use core::ptr::NonNull;
+
+use std::collections::HashMap;
+use std::convert::TryInto;
+use std::sync::{Arc, RwLock};
+
+use heed::types::ByteSlice;
+use heed::{BytesDecode, Env, RoTxn, RwTxn, UntypedDatabase as Database};
+
+use crate::{
+ Db, Error, IDb, ITx, ITxFn, Result, TxError, TxFnResult, TxOpError, TxOpResult, TxResult,
+ TxValueIter, Value, ValueIter,
+};
+
+pub use heed;
+
+// -- err
+
+impl From<heed::Error> for Error {
+ fn from(e: heed::Error) -> Error {
+ Error(format!("LMDB: {}", e).into())
+ }
+}
+
+impl From<heed::Error> for TxOpError {
+ fn from(e: heed::Error) -> TxOpError {
+ TxOpError(e.into())
+ }
+}
+
+// -- db
+
+pub struct LmdbDb {
+ db: heed::Env,
+ trees: RwLock<(Vec<Database>, HashMap<String, usize>)>,
+}
+
+impl LmdbDb {
+ pub fn init(db: Env) -> Db {
+ let s = Self {
+ db,
+ trees: RwLock::new((Vec::new(), HashMap::new())),
+ };
+ Db(Arc::new(s))
+ }
+
+ fn get_tree(&self, i: usize) -> Result<Database> {
+ self.trees
+ .read()
+ .unwrap()
+ .0
+ .get(i)
+ .cloned()
+ .ok_or_else(|| Error("invalid tree id".into()))
+ }
+}
+
+impl IDb for LmdbDb {
+ fn engine(&self) -> String {
+ "LMDB (using Heed crate)".into()
+ }
+
+ fn open_tree(&self, name: &str) -> Result<usize> {
+ let mut trees = self.trees.write().unwrap();
+ if let Some(i) = trees.1.get(name) {
+ Ok(*i)
+ } else {
+ let tree = self.db.create_database(Some(name))?;
+ let i = trees.0.len();
+ trees.0.push(tree);
+ trees.1.insert(name.to_string(), i);
+ Ok(i)
+ }
+ }
+
+ fn list_trees(&self) -> Result<Vec<String>> {
+ let tree0 = match self.db.open_database::<heed::types::Str, ByteSlice>(None)? {
+ Some(x) => x,
+ None => return Ok(vec![]),
+ };
+
+ let mut ret = vec![];
+ let tx = self.db.read_txn()?;
+ for item in tree0.iter(&tx)? {
+ let (tree_name, _) = item?;
+ ret.push(tree_name.to_string());
+ }
+ drop(tx);
+
+ let mut ret2 = vec![];
+ for tree_name in ret {
+ if self
+ .db
+ .open_database::<ByteSlice, ByteSlice>(Some(&tree_name))?
+ .is_some()
+ {
+ ret2.push(tree_name);
+ }
+ }
+
+ Ok(ret2)
+ }
+
+ // ----
+
+ fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
+ let tree = self.get_tree(tree)?;
+
+ let tx = self.db.read_txn()?;
+ let val = tree.get(&tx, key)?;
+ match val {
+ None => Ok(None),
+ Some(v) => Ok(Some(v.to_vec())),
+ }
+ }
+
+ fn len(&self, tree: usize) -> Result<usize> {
+ let tree = self.get_tree(tree)?;
+ let tx = self.db.read_txn()?;
+ Ok(tree.len(&tx)?.try_into().unwrap())
+ }
+
+ fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>> {
+ let tree = self.get_tree(tree)?;
+ let mut tx = self.db.write_txn()?;
+ let old_val = tree.get(&tx, key)?.map(Vec::from);
+ tree.put(&mut tx, key, value)?;
+ tx.commit()?;
+ Ok(old_val)
+ }
+
+ fn remove(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
+ let tree = self.get_tree(tree)?;
+ let mut tx = self.db.write_txn()?;
+ let old_val = tree.get(&tx, key)?.map(Vec::from);
+ tree.delete(&mut tx, key)?;
+ tx.commit()?;
+ Ok(old_val)
+ }
+
+ fn clear(&self, tree: usize) -> Result<()> {
+ let tree = self.get_tree(tree)?;
+ let mut tx = self.db.write_txn()?;
+ tree.clear(&mut tx)?;
+ tx.commit()?;
+ Ok(())
+ }
+
+ fn iter(&self, tree: usize) -> Result<ValueIter<'_>> {
+ let tree = self.get_tree(tree)?;
+ let tx = self.db.read_txn()?;
+ TxAndIterator::make(tx, |tx| Ok(tree.iter(tx)?))
+ }
+
+ fn iter_rev(&self, tree: usize) -> Result<ValueIter<'_>> {
+ let tree = self.get_tree(tree)?;
+ let tx = self.db.read_txn()?;
+ TxAndIterator::make(tx, |tx| Ok(tree.rev_iter(tx)?))
+ }
+
+ fn range<'r>(
+ &self,
+ tree: usize,
+ low: Bound<&'r [u8]>,
+ high: Bound<&'r [u8]>,
+ ) -> Result<ValueIter<'_>> {
+ let tree = self.get_tree(tree)?;
+ let tx = self.db.read_txn()?;
+ TxAndIterator::make(tx, |tx| Ok(tree.range(tx, &(low, high))?))
+ }
+ fn range_rev<'r>(
+ &self,
+ tree: usize,
+ low: Bound<&'r [u8]>,
+ high: Bound<&'r [u8]>,
+ ) -> Result<ValueIter<'_>> {
+ let tree = self.get_tree(tree)?;
+ let tx = self.db.read_txn()?;
+ TxAndIterator::make(tx, |tx| Ok(tree.rev_range(tx, &(low, high))?))
+ }
+
+ // ----
+
+ fn transaction(&self, f: &dyn ITxFn) -> TxResult<(), ()> {
+ let trees = self.trees.read().unwrap();
+ let mut tx = LmdbTx {
+ trees: &trees.0[..],
+ tx: self
+ .db
+ .write_txn()
+ .map_err(Error::from)
+ .map_err(TxError::Db)?,
+ };
+
+ let res = f.try_on(&mut tx);
+ match res {
+ TxFnResult::Ok => {
+ tx.tx.commit().map_err(Error::from).map_err(TxError::Db)?;
+ Ok(())
+ }
+ TxFnResult::Abort => {
+ tx.tx.abort().map_err(Error::from).map_err(TxError::Db)?;
+ Err(TxError::Abort(()))
+ }
+ TxFnResult::DbErr => {
+ tx.tx.abort().map_err(Error::from).map_err(TxError::Db)?;
+ Err(TxError::Db(Error(
+ "(this message will be discarded)".into(),
+ )))
+ }
+ }
+ }
+}
+
+// ----
+
+struct LmdbTx<'a> {
+ trees: &'a [Database],
+ tx: RwTxn<'a, 'a>,
+}
+
+impl<'a> LmdbTx<'a> {
+ fn get_tree(&self, i: usize) -> TxOpResult<&Database> {
+ self.trees.get(i).ok_or_else(|| {
+ TxOpError(Error(
+				"invalid tree id (it might have been opened after the transaction started)".into(),
+ ))
+ })
+ }
+}
+
+impl<'a> ITx for LmdbTx<'a> {
+ fn get(&self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
+ let tree = self.get_tree(tree)?;
+ match tree.get(&self.tx, key)? {
+ Some(v) => Ok(Some(v.to_vec())),
+ None => Ok(None),
+ }
+ }
+ fn len(&self, _tree: usize) -> TxOpResult<usize> {
+ unimplemented!(".len() in transaction not supported with LMDB backend")
+ }
+
+ fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>> {
+ let tree = *self.get_tree(tree)?;
+ let old_val = tree.get(&self.tx, key)?.map(Vec::from);
+ tree.put(&mut self.tx, key, value)?;
+ Ok(old_val)
+ }
+ fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
+ let tree = *self.get_tree(tree)?;
+ let old_val = tree.get(&self.tx, key)?.map(Vec::from);
+ tree.delete(&mut self.tx, key)?;
+ Ok(old_val)
+ }
+
+ fn iter(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
+ unimplemented!("Iterators in transactions not supported with LMDB backend");
+ }
+ fn iter_rev(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
+ unimplemented!("Iterators in transactions not supported with LMDB backend");
+ }
+
+ fn range<'r>(
+ &self,
+ _tree: usize,
+ _low: Bound<&'r [u8]>,
+ _high: Bound<&'r [u8]>,
+ ) -> TxOpResult<TxValueIter<'_>> {
+ unimplemented!("Iterators in transactions not supported with LMDB backend");
+ }
+ fn range_rev<'r>(
+ &self,
+ _tree: usize,
+ _low: Bound<&'r [u8]>,
+ _high: Bound<&'r [u8]>,
+ ) -> TxOpResult<TxValueIter<'_>> {
+ unimplemented!("Iterators in transactions not supported with LMDB backend");
+ }
+}
+
+// ----
+
+type IteratorItem<'a> = heed::Result<(
+ <ByteSlice as BytesDecode<'a>>::DItem,
+ <ByteSlice as BytesDecode<'a>>::DItem,
+)>;
+
+struct TxAndIterator<'a, I>
+where
+ I: Iterator<Item = IteratorItem<'a>> + 'a,
+{
+ tx: RoTxn<'a>,
+ iter: Option<I>,
+}
+
+impl<'a, I> TxAndIterator<'a, I>
+where
+ I: Iterator<Item = IteratorItem<'a>> + 'a,
+{
+ fn make<F>(tx: RoTxn<'a>, iterfun: F) -> Result<ValueIter<'a>>
+ where
+ F: FnOnce(&'a RoTxn<'a>) -> Result<I>,
+ {
+ let mut res = TxAndIterator { tx, iter: None };
+
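+		// This deliberately builds a self-referential value: `iter` borrows
+		// `res.tx`, which safe Rust cannot express, hence the raw-pointer
+		// escape hatch below. The Drop impl further down ensures that
+		// `iter` is dropped before `tx`.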
+ let tx = unsafe { NonNull::from(&res.tx).as_ref() };
+ res.iter = Some(iterfun(tx)?);
+
+ Ok(Box::new(res))
+ }
+}
+
+impl<'a, I> Drop for TxAndIterator<'a, I>
+where
+ I: Iterator<Item = IteratorItem<'a>> + 'a,
+{
+ fn drop(&mut self) {
+ drop(self.iter.take());
+ }
+}
+
+impl<'a, I> Iterator for TxAndIterator<'a, I>
+where
+ I: Iterator<Item = IteratorItem<'a>> + 'a,
+{
+ type Item = Result<(Value, Value)>;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ match self.iter.as_mut().unwrap().next() {
+ None => None,
+ Some(Err(e)) => Some(Err(e.into())),
+ Some(Ok((k, v))) => Some(Ok((k.to_vec(), v.to_vec()))),
+ }
+ }
+}
+
+// ----
+
+#[cfg(target_pointer_width = "64")]
+pub fn recommended_map_size() -> usize {
+ 1usize << 40
+}
+
+#[cfg(target_pointer_width = "32")]
+pub fn recommended_map_size() -> usize {
+ warn!("LMDB is not recommended on 32-bit systems, database size will be limited");
+ 1usize << 30
+}
diff --git a/src/db/sled_adapter.rs b/src/db/sled_adapter.rs
new file mode 100644
index 00000000..cf61867d
--- /dev/null
+++ b/src/db/sled_adapter.rs
@@ -0,0 +1,266 @@
+use core::ops::Bound;
+
+use std::cell::Cell;
+use std::collections::HashMap;
+use std::sync::{Arc, RwLock};
+
+use sled::transaction::{
+ ConflictableTransactionError, TransactionError, Transactional, TransactionalTree,
+ UnabortableTransactionError,
+};
+
+use crate::{
+ Db, Error, IDb, ITx, ITxFn, Result, TxError, TxFnResult, TxOpError, TxOpResult, TxResult,
+ TxValueIter, Value, ValueIter,
+};
+
+pub use sled;
+
+// -- err
+
+impl From<sled::Error> for Error {
+ fn from(e: sled::Error) -> Error {
+ Error(format!("Sled: {}", e).into())
+ }
+}
+
+impl From<sled::Error> for TxOpError {
+ fn from(e: sled::Error) -> TxOpError {
+ TxOpError(e.into())
+ }
+}
+
+// -- db
+
+pub struct SledDb {
+ db: sled::Db,
+ trees: RwLock<(Vec<sled::Tree>, HashMap<String, usize>)>,
+}
+
+impl SledDb {
+ pub fn init(db: sled::Db) -> Db {
+ let s = Self {
+ db,
+ trees: RwLock::new((Vec::new(), HashMap::new())),
+ };
+ Db(Arc::new(s))
+ }
+
+ fn get_tree(&self, i: usize) -> Result<sled::Tree> {
+ self.trees
+ .read()
+ .unwrap()
+ .0
+ .get(i)
+ .cloned()
+ .ok_or_else(|| Error("invalid tree id".into()))
+ }
+}
+
+impl IDb for SledDb {
+ fn engine(&self) -> String {
+ "Sled".into()
+ }
+
+ fn open_tree(&self, name: &str) -> Result<usize> {
+ let mut trees = self.trees.write().unwrap();
+ if let Some(i) = trees.1.get(name) {
+ Ok(*i)
+ } else {
+ let tree = self.db.open_tree(name)?;
+ let i = trees.0.len();
+ trees.0.push(tree);
+ trees.1.insert(name.to_string(), i);
+ Ok(i)
+ }
+ }
+
+ fn list_trees(&self) -> Result<Vec<String>> {
+ let mut trees = vec![];
+ for name in self.db.tree_names() {
+ let name = std::str::from_utf8(&name)
+ .map_err(|e| Error(format!("{}", e).into()))?
+ .to_string();
+ if name != "__sled__default" {
+ trees.push(name);
+ }
+ }
+ Ok(trees)
+ }
+
+ // ----
+
+ fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
+ let tree = self.get_tree(tree)?;
+ let val = tree.get(key)?;
+ Ok(val.map(|x| x.to_vec()))
+ }
+
+ fn len(&self, tree: usize) -> Result<usize> {
+ let tree = self.get_tree(tree)?;
+ Ok(tree.len())
+ }
+
+ fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>> {
+ let tree = self.get_tree(tree)?;
+ let old_val = tree.insert(key, value)?;
+ Ok(old_val.map(|x| x.to_vec()))
+ }
+
+ fn remove(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
+ let tree = self.get_tree(tree)?;
+ let old_val = tree.remove(key)?;
+ Ok(old_val.map(|x| x.to_vec()))
+ }
+
+ fn clear(&self, tree: usize) -> Result<()> {
+ let tree = self.get_tree(tree)?;
+ tree.clear()?;
+ Ok(())
+ }
+
+ fn iter(&self, tree: usize) -> Result<ValueIter<'_>> {
+ let tree = self.get_tree(tree)?;
+ Ok(Box::new(tree.iter().map(|v| {
+ v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into)
+ })))
+ }
+
+ fn iter_rev(&self, tree: usize) -> Result<ValueIter<'_>> {
+ let tree = self.get_tree(tree)?;
+ Ok(Box::new(tree.iter().rev().map(|v| {
+ v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into)
+ })))
+ }
+
+ fn range<'r>(
+ &self,
+ tree: usize,
+ low: Bound<&'r [u8]>,
+ high: Bound<&'r [u8]>,
+ ) -> Result<ValueIter<'_>> {
+ let tree = self.get_tree(tree)?;
+ Ok(Box::new(tree.range::<&'r [u8], _>((low, high)).map(|v| {
+ v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into)
+ })))
+ }
+ fn range_rev<'r>(
+ &self,
+ tree: usize,
+ low: Bound<&'r [u8]>,
+ high: Bound<&'r [u8]>,
+ ) -> Result<ValueIter<'_>> {
+ let tree = self.get_tree(tree)?;
+ Ok(Box::new(tree.range::<&'r [u8], _>((low, high)).rev().map(
+ |v| v.map(|(x, y)| (x.to_vec(), y.to_vec())).map_err(Into::into),
+ )))
+ }
+
+ // ----
+
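+	// Sled transactions must declare up front every tree that may be
+	// accessed inside them, so we conservatively open the transaction
+	// over all currently known trees.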
+ fn transaction(&self, f: &dyn ITxFn) -> TxResult<(), ()> {
+ let trees = self.trees.read().unwrap();
+ let res = trees.0.transaction(|txtrees| {
+ let mut tx = SledTx {
+ trees: txtrees,
+ err: Cell::new(None),
+ };
+ match f.try_on(&mut tx) {
+ TxFnResult::Ok => {
+ assert!(tx.err.into_inner().is_none());
+ Ok(())
+ }
+ TxFnResult::Abort => {
+ assert!(tx.err.into_inner().is_none());
+ Err(ConflictableTransactionError::Abort(()))
+ }
+ TxFnResult::DbErr => {
+ let e = tx.err.into_inner().expect("No DB error");
+ Err(e.into())
+ }
+ }
+ });
+ match res {
+ Ok(()) => Ok(()),
+ Err(TransactionError::Abort(())) => Err(TxError::Abort(())),
+ Err(TransactionError::Storage(s)) => Err(TxError::Db(s.into())),
+ }
+ }
+}
+
+// ----
+
+struct SledTx<'a> {
+ trees: &'a [TransactionalTree],
+ err: Cell<Option<UnabortableTransactionError>>,
+}
+
+impl<'a> SledTx<'a> {
+ fn get_tree(&self, i: usize) -> TxOpResult<&TransactionalTree> {
+ self.trees.get(i).ok_or_else(|| {
+ TxOpError(Error(
+				"invalid tree id (it might have been opened after the transaction started)".into(),
+ ))
+ })
+ }
+
+ fn save_error<R>(
+ &self,
+ v: std::result::Result<R, UnabortableTransactionError>,
+ ) -> TxOpResult<R> {
+ match v {
+ Ok(x) => Ok(x),
+ Err(e) => {
+ let txt = format!("{}", e);
+ self.err.set(Some(e));
+ Err(TxOpError(Error(txt.into())))
+ }
+ }
+ }
+}
+
+impl<'a> ITx for SledTx<'a> {
+ fn get(&self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
+ let tree = self.get_tree(tree)?;
+ let tmp = self.save_error(tree.get(key))?;
+ Ok(tmp.map(|x| x.to_vec()))
+ }
+ fn len(&self, _tree: usize) -> TxOpResult<usize> {
+ unimplemented!(".len() in transaction not supported with Sled backend")
+ }
+
+ fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>> {
+ let tree = self.get_tree(tree)?;
+ let old_val = self.save_error(tree.insert(key, value))?;
+ Ok(old_val.map(|x| x.to_vec()))
+ }
+ fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
+ let tree = self.get_tree(tree)?;
+ let old_val = self.save_error(tree.remove(key))?;
+ Ok(old_val.map(|x| x.to_vec()))
+ }
+
+ fn iter(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
+ unimplemented!("Iterators in transactions not supported with Sled backend");
+ }
+ fn iter_rev(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
+ unimplemented!("Iterators in transactions not supported with Sled backend");
+ }
+
+ fn range<'r>(
+ &self,
+ _tree: usize,
+ _low: Bound<&'r [u8]>,
+ _high: Bound<&'r [u8]>,
+ ) -> TxOpResult<TxValueIter<'_>> {
+ unimplemented!("Iterators in transactions not supported with Sled backend");
+ }
+ fn range_rev<'r>(
+ &self,
+ _tree: usize,
+ _low: Bound<&'r [u8]>,
+ _high: Bound<&'r [u8]>,
+ ) -> TxOpResult<TxValueIter<'_>> {
+ unimplemented!("Iterators in transactions not supported with Sled backend");
+ }
+}
diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs
new file mode 100644
index 00000000..886fda6e
--- /dev/null
+++ b/src/db/sqlite_adapter.rs
@@ -0,0 +1,508 @@
+use core::ops::Bound;
+
+use std::borrow::BorrowMut;
+use std::marker::PhantomPinned;
+use std::pin::Pin;
+use std::ptr::NonNull;
+use std::sync::{Arc, Mutex, MutexGuard};
+
+use rusqlite::{params, Connection, Rows, Statement, Transaction};
+
+use crate::{
+ Db, Error, IDb, ITx, ITxFn, Result, TxError, TxFnResult, TxOpError, TxOpResult, TxResult,
+ TxValueIter, Value, ValueIter,
+};
+
+pub use rusqlite;
+
+// --- err
+
+impl From<rusqlite::Error> for Error {
+ fn from(e: rusqlite::Error) -> Error {
+ Error(format!("Sqlite: {}", e).into())
+ }
+}
+
+impl From<rusqlite::Error> for TxOpError {
+ fn from(e: rusqlite::Error) -> TxOpError {
+ TxOpError(e.into())
+ }
+}
+
+// -- db
+
+pub struct SqliteDb(Mutex<SqliteDbInner>);
+
+struct SqliteDbInner {
+ db: Connection,
+ trees: Vec<String>,
+}
+
+impl SqliteDb {
+ pub fn init(db: rusqlite::Connection) -> Db {
+ let s = Self(Mutex::new(SqliteDbInner {
+ db,
+ trees: Vec::new(),
+ }));
+ Db(Arc::new(s))
+ }
+}
+
+impl SqliteDbInner {
+ fn get_tree(&self, i: usize) -> Result<&'_ str> {
+ self.trees
+ .get(i)
+ .map(String::as_str)
+ .ok_or_else(|| Error("invalid tree id".into()))
+ }
+
+ fn internal_get(&self, tree: &str, key: &[u8]) -> Result<Option<Value>> {
+ let mut stmt = self
+ .db
+ .prepare(&format!("SELECT v FROM {} WHERE k = ?1", tree))?;
+ let mut res_iter = stmt.query([key])?;
+ match res_iter.next()? {
+ None => Ok(None),
+ Some(v) => Ok(Some(v.get::<_, Vec<u8>>(0)?)),
+ }
+ }
+}
+
+impl IDb for SqliteDb {
+ fn engine(&self) -> String {
+ format!("sqlite3 v{} (using rusqlite crate)", rusqlite::version())
+ }
+
+ fn open_tree(&self, name: &str) -> Result<usize> {
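+		// ':' is not valid in an unquoted SQL identifier, but may appear in
+		// Garage tree names, so it is escaped; the "tree_" prefix lets
+		// list_trees() below recognize which tables belong to us.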
+ let name = format!("tree_{}", name.replace(':', "_COLON_"));
+ let mut this = self.0.lock().unwrap();
+
+ if let Some(i) = this.trees.iter().position(|x| x == &name) {
+ Ok(i)
+ } else {
+ trace!("create table {}", name);
+ this.db.execute(
+ &format!(
+ "CREATE TABLE IF NOT EXISTS {} (
+ k BLOB PRIMARY KEY,
+ v BLOB
+ )",
+ name
+ ),
+ [],
+ )?;
+ trace!("table created: {}, unlocking", name);
+
+ let i = this.trees.len();
+ this.trees.push(name.to_string());
+ Ok(i)
+ }
+ }
+
+ fn list_trees(&self) -> Result<Vec<String>> {
+ let mut trees = vec![];
+
+ trace!("list_trees: lock db");
+ let this = self.0.lock().unwrap();
+ trace!("list_trees: lock acquired");
+
+ let mut stmt = this.db.prepare(
+ "SELECT name FROM sqlite_schema WHERE type = 'table' AND name LIKE 'tree_%'",
+ )?;
+ let mut rows = stmt.query([])?;
+ while let Some(row) = rows.next()? {
+ let name = row.get::<_, String>(0)?;
+ let name = name.replace("_COLON_", ":");
+ let name = name.strip_prefix("tree_").unwrap().to_string();
+ trees.push(name);
+ }
+ Ok(trees)
+ }
+
+ // ----
+
+ fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
+ trace!("get {}: lock db", tree);
+ let this = self.0.lock().unwrap();
+ trace!("get {}: lock acquired", tree);
+
+ let tree = this.get_tree(tree)?;
+ this.internal_get(tree, key)
+ }
+
+ fn len(&self, tree: usize) -> Result<usize> {
+ trace!("len {}: lock db", tree);
+ let this = self.0.lock().unwrap();
+ trace!("len {}: lock acquired", tree);
+
+ let tree = this.get_tree(tree)?;
+ let mut stmt = this.db.prepare(&format!("SELECT COUNT(*) FROM {}", tree))?;
+ let mut res_iter = stmt.query([])?;
+ match res_iter.next()? {
+ None => Ok(0),
+ Some(v) => Ok(v.get::<_, usize>(0)?),
+ }
+ }
+
+ fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>> {
+ trace!("insert {}: lock db", tree);
+ let this = self.0.lock().unwrap();
+ trace!("insert {}: lock acquired", tree);
+
+ let tree = this.get_tree(tree)?;
+ let old_val = this.internal_get(tree, key)?;
+
+ let sql = match &old_val {
+ Some(_) => format!("UPDATE {} SET v = ?2 WHERE k = ?1", tree),
+ None => format!("INSERT INTO {} (k, v) VALUES (?1, ?2)", tree),
+ };
+ let n = this.db.execute(&sql, params![key, value])?;
+ assert_eq!(n, 1);
+
+ Ok(old_val)
+ }
+
+ fn remove(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
+ trace!("remove {}: lock db", tree);
+ let this = self.0.lock().unwrap();
+ trace!("remove {}: lock acquired", tree);
+
+ let tree = this.get_tree(tree)?;
+ let old_val = this.internal_get(tree, key)?;
+
+ if old_val.is_some() {
+ let n = this
+ .db
+ .execute(&format!("DELETE FROM {} WHERE k = ?1", tree), params![key])?;
+ assert_eq!(n, 1);
+ }
+
+ Ok(old_val)
+ }
+
+ fn clear(&self, tree: usize) -> Result<()> {
+ trace!("clear {}: lock db", tree);
+ let this = self.0.lock().unwrap();
+ trace!("clear {}: lock acquired", tree);
+
+ let tree = this.get_tree(tree)?;
+ this.db.execute(&format!("DELETE FROM {}", tree), [])?;
+ Ok(())
+ }
+
+ fn iter(&self, tree: usize) -> Result<ValueIter<'_>> {
+ trace!("iter {}: lock db", tree);
+ let this = self.0.lock().unwrap();
+ trace!("iter {}: lock acquired", tree);
+
+ let tree = this.get_tree(tree)?;
+ let sql = format!("SELECT k, v FROM {} ORDER BY k ASC", tree);
+ DbValueIterator::make(this, &sql, [])
+ }
+
+ fn iter_rev(&self, tree: usize) -> Result<ValueIter<'_>> {
+ trace!("iter_rev {}: lock db", tree);
+ let this = self.0.lock().unwrap();
+ trace!("iter_rev {}: lock acquired", tree);
+
+ let tree = this.get_tree(tree)?;
+ let sql = format!("SELECT k, v FROM {} ORDER BY k DESC", tree);
+ DbValueIterator::make(this, &sql, [])
+ }
+
+ fn range<'r>(
+ &self,
+ tree: usize,
+ low: Bound<&'r [u8]>,
+ high: Bound<&'r [u8]>,
+ ) -> Result<ValueIter<'_>> {
+ trace!("range {}: lock db", tree);
+ let this = self.0.lock().unwrap();
+ trace!("range {}: lock acquired", tree);
+
+ let tree = this.get_tree(tree)?;
+
+ let (bounds_sql, params) = bounds_sql(low, high);
+ let sql = format!("SELECT k, v FROM {} {} ORDER BY k ASC", tree, bounds_sql);
+
+ let params = params
+ .iter()
+ .map(|x| x as &dyn rusqlite::ToSql)
+ .collect::<Vec<_>>();
+
+ DbValueIterator::make::<&[&dyn rusqlite::ToSql]>(this, &sql, params.as_ref())
+ }
+ fn range_rev<'r>(
+ &self,
+ tree: usize,
+ low: Bound<&'r [u8]>,
+ high: Bound<&'r [u8]>,
+ ) -> Result<ValueIter<'_>> {
+ trace!("range_rev {}: lock db", tree);
+ let this = self.0.lock().unwrap();
+ trace!("range_rev {}: lock acquired", tree);
+
+ let tree = this.get_tree(tree)?;
+
+ let (bounds_sql, params) = bounds_sql(low, high);
+ let sql = format!("SELECT k, v FROM {} {} ORDER BY k DESC", tree, bounds_sql);
+
+ let params = params
+ .iter()
+ .map(|x| x as &dyn rusqlite::ToSql)
+ .collect::<Vec<_>>();
+
+ DbValueIterator::make::<&[&dyn rusqlite::ToSql]>(this, &sql, params.as_ref())
+ }
+
+ // ----
+
+ fn transaction(&self, f: &dyn ITxFn) -> TxResult<(), ()> {
+ trace!("transaction: lock db");
+ let mut this = self.0.lock().unwrap();
+ trace!("transaction: lock acquired");
+
+ let this_mut_ref: &mut SqliteDbInner = this.borrow_mut();
+
+ let mut tx = SqliteTx {
+ tx: this_mut_ref
+ .db
+ .transaction()
+ .map_err(Error::from)
+ .map_err(TxError::Db)?,
+ trees: &this_mut_ref.trees,
+ };
+ let res = match f.try_on(&mut tx) {
+ TxFnResult::Ok => {
+ tx.tx.commit().map_err(Error::from).map_err(TxError::Db)?;
+ Ok(())
+ }
+ TxFnResult::Abort => {
+ tx.tx.rollback().map_err(Error::from).map_err(TxError::Db)?;
+ Err(TxError::Abort(()))
+ }
+ TxFnResult::DbErr => {
+ tx.tx.rollback().map_err(Error::from).map_err(TxError::Db)?;
+ Err(TxError::Db(Error(
+ "(this message will be discarded)".into(),
+ )))
+ }
+ };
+
+ trace!("transaction done");
+ res
+ }
+}
+
+// ----
+
+struct SqliteTx<'a> {
+ tx: Transaction<'a>,
+ trees: &'a [String],
+}
+
+impl<'a> SqliteTx<'a> {
+ fn get_tree(&self, i: usize) -> TxOpResult<&'_ str> {
+ self.trees.get(i).map(String::as_ref).ok_or_else(|| {
+ TxOpError(Error(
+				"invalid tree id (it might have been opened after the transaction started)".into(),
+ ))
+ })
+ }
+
+ fn internal_get(&self, tree: &str, key: &[u8]) -> TxOpResult<Option<Value>> {
+ let mut stmt = self
+ .tx
+ .prepare(&format!("SELECT v FROM {} WHERE k = ?1", tree))?;
+ let mut res_iter = stmt.query([key])?;
+ match res_iter.next()? {
+ None => Ok(None),
+ Some(v) => Ok(Some(v.get::<_, Vec<u8>>(0)?)),
+ }
+ }
+}
+
+impl<'a> ITx for SqliteTx<'a> {
+ fn get(&self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
+ let tree = self.get_tree(tree)?;
+ self.internal_get(tree, key)
+ }
+ fn len(&self, tree: usize) -> TxOpResult<usize> {
+ let tree = self.get_tree(tree)?;
+ let mut stmt = self.tx.prepare(&format!("SELECT COUNT(*) FROM {}", tree))?;
+ let mut res_iter = stmt.query([])?;
+ match res_iter.next()? {
+ None => Ok(0),
+ Some(v) => Ok(v.get::<_, usize>(0)?),
+ }
+ }
+
+ fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>> {
+ let tree = self.get_tree(tree)?;
+ let old_val = self.internal_get(tree, key)?;
+
+ let sql = match &old_val {
+ Some(_) => format!("UPDATE {} SET v = ?2 WHERE k = ?1", tree),
+ None => format!("INSERT INTO {} (k, v) VALUES (?1, ?2)", tree),
+ };
+ let n = self.tx.execute(&sql, params![key, value])?;
+ assert_eq!(n, 1);
+
+ Ok(old_val)
+ }
+ fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
+ let tree = self.get_tree(tree)?;
+ let old_val = self.internal_get(tree, key)?;
+
+ if old_val.is_some() {
+ let n = self
+ .tx
+ .execute(&format!("DELETE FROM {} WHERE k = ?1", tree), params![key])?;
+ assert_eq!(n, 1);
+ }
+
+ Ok(old_val)
+ }
+
+ fn iter(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
+ unimplemented!();
+ }
+ fn iter_rev(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
+ unimplemented!();
+ }
+
+ fn range<'r>(
+ &self,
+ _tree: usize,
+ _low: Bound<&'r [u8]>,
+ _high: Bound<&'r [u8]>,
+ ) -> TxOpResult<TxValueIter<'_>> {
+ unimplemented!();
+ }
+ fn range_rev<'r>(
+ &self,
+ _tree: usize,
+ _low: Bound<&'r [u8]>,
+ _high: Bound<&'r [u8]>,
+ ) -> TxOpResult<TxValueIter<'_>> {
+ unimplemented!();
+ }
+}
+
+// ----
+
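+// Self-referential helper struct: `stmt` borrows `db`, and `iter` borrows
+// `stmt`. It is therefore pinned (PhantomPinned + Box::pin in make()) so
+// that the internal pointers remain valid, and its Drop impl tears the
+// fields down in reverse order of construction.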
+struct DbValueIterator<'a> {
+ db: MutexGuard<'a, SqliteDbInner>,
+ stmt: Option<Statement<'a>>,
+ iter: Option<Rows<'a>>,
+ _pin: PhantomPinned,
+}
+
+impl<'a> DbValueIterator<'a> {
+ fn make<P: rusqlite::Params>(
+ db: MutexGuard<'a, SqliteDbInner>,
+ sql: &str,
+ args: P,
+ ) -> Result<ValueIter<'a>> {
+ let res = DbValueIterator {
+ db,
+ stmt: None,
+ iter: None,
+ _pin: PhantomPinned,
+ };
+ let mut boxed = Box::pin(res);
+ trace!("make iterator with sql: {}", sql);
+
+ unsafe {
+ let db = NonNull::from(&boxed.db);
+ let stmt = db.as_ref().db.prepare(sql)?;
+
+ let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut boxed);
+ Pin::get_unchecked_mut(mut_ref).stmt = Some(stmt);
+
+ let mut stmt = NonNull::from(&boxed.stmt);
+ let iter = stmt.as_mut().as_mut().unwrap().query(args)?;
+
+ let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut boxed);
+ Pin::get_unchecked_mut(mut_ref).iter = Some(iter);
+ }
+
+ Ok(Box::new(DbValueIteratorPin(boxed)))
+ }
+}
+
+impl<'a> Drop for DbValueIterator<'a> {
+ fn drop(&mut self) {
+ trace!("drop iter");
+ drop(self.iter.take());
+ drop(self.stmt.take());
+ }
+}
+
+struct DbValueIteratorPin<'a>(Pin<Box<DbValueIterator<'a>>>);
+
+impl<'a> Iterator for DbValueIteratorPin<'a> {
+ type Item = Result<(Value, Value)>;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ let next = unsafe {
+ let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut self.0);
+ Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next()
+ };
+ let row = match next {
+ Err(e) => return Some(Err(e.into())),
+ Ok(None) => return None,
+ Ok(Some(r)) => r,
+ };
+ let k = match row.get::<_, Vec<u8>>(0) {
+ Err(e) => return Some(Err(e.into())),
+ Ok(x) => x,
+ };
+ let v = match row.get::<_, Vec<u8>>(1) {
+ Err(e) => return Some(Err(e.into())),
+ Ok(y) => y,
+ };
+ Some(Ok((k, v)))
+ }
+}
+
+// ----
+
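+// Builds the WHERE clause for a range query. Example (illustrative):
+// bounds_sql(Bound::Included(b"a"), Bound::Excluded(b"z")) returns
+// (" WHERE k >= ?1 AND k < ?2", vec![b"a".to_vec(), b"z".to_vec()]),
+// which the range()/range_rev() callers splice into their SELECT statements.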
+fn bounds_sql<'r>(low: Bound<&'r [u8]>, high: Bound<&'r [u8]>) -> (String, Vec<Vec<u8>>) {
+ let mut sql = String::new();
+ let mut params: Vec<Vec<u8>> = vec![];
+
+ match low {
+ Bound::Included(b) => {
+ sql.push_str(" WHERE k >= ?1");
+ params.push(b.to_vec());
+ }
+ Bound::Excluded(b) => {
+ sql.push_str(" WHERE k > ?1");
+ params.push(b.to_vec());
+ }
+ Bound::Unbounded => (),
+ };
+
+ match high {
+ Bound::Included(b) => {
+ if !params.is_empty() {
+ sql.push_str(" AND k <= ?2");
+ } else {
+ sql.push_str(" WHERE k <= ?1");
+ }
+ params.push(b.to_vec());
+ }
+ Bound::Excluded(b) => {
+ if !params.is_empty() {
+ sql.push_str(" AND k < ?2");
+ } else {
+ sql.push_str(" WHERE k < ?1");
+ }
+ params.push(b.to_vec());
+ }
+ Bound::Unbounded => (),
+ }
+
+ (sql, params)
+}
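For reference, `bounds_sql` builds only the `WHERE` clause and the parameter vector; its callers prepend the `SELECT ... FROM <tree>` part. A small sketch of what it returns for a half-open range, using the function exactly as defined above:

    use std::ops::Bound;

    let (clause, params) = bounds_sql(Bound::Included(&b"a"[..]), Bound::Excluded(&b"z"[..]));
    assert_eq!(clause, " WHERE k >= ?1 AND k < ?2");
    assert_eq!(params, vec![b"a".to_vec(), b"z".to_vec()]);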
diff --git a/src/db/test.rs b/src/db/test.rs
new file mode 100644
index 00000000..cfcee643
--- /dev/null
+++ b/src/db/test.rs
@@ -0,0 +1,106 @@
+use crate::*;
+
+use crate::lmdb_adapter::LmdbDb;
+use crate::sled_adapter::SledDb;
+use crate::sqlite_adapter::SqliteDb;
+
+fn test_suite(db: Db) {
+ let tree = db.open_tree("tree").unwrap();
+
+ let ka: &[u8] = &b"test"[..];
+ let kb: &[u8] = &b"zwello"[..];
+ let kint: &[u8] = &b"tz"[..];
+ let va: &[u8] = &b"plop"[..];
+ let vb: &[u8] = &b"plip"[..];
+ let vc: &[u8] = &b"plup"[..];
+
+ assert!(tree.insert(ka, va).unwrap().is_none());
+ assert_eq!(tree.get(ka).unwrap().unwrap(), va);
+
+ let res = db.transaction::<_, (), _>(|mut tx| {
+ assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), va);
+
+ assert_eq!(tx.insert(&tree, ka, vb).unwrap().unwrap(), va);
+
+ assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), vb);
+
+ tx.commit(12)
+ });
+ assert!(matches!(res, Ok(12)));
+ assert_eq!(tree.get(ka).unwrap().unwrap(), vb);
+
+ let res = db.transaction::<(), _, _>(|mut tx| {
+ assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), vb);
+
+ assert_eq!(tx.insert(&tree, ka, vc).unwrap().unwrap(), vb);
+
+ assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), vc);
+
+ tx.abort(42)
+ });
+ assert!(matches!(res, Err(TxError::Abort(42))));
+ assert_eq!(tree.get(ka).unwrap().unwrap(), vb);
+
+ let mut iter = tree.iter().unwrap();
+ let next = iter.next().unwrap().unwrap();
+ assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
+ assert!(iter.next().is_none());
+ drop(iter);
+
+ assert!(tree.insert(kb, vc).unwrap().is_none());
+ assert_eq!(tree.get(kb).unwrap().unwrap(), vc);
+
+ let mut iter = tree.iter().unwrap();
+ let next = iter.next().unwrap().unwrap();
+ assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
+ let next = iter.next().unwrap().unwrap();
+ assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc));
+ assert!(iter.next().is_none());
+ drop(iter);
+
+ let mut iter = tree.range(kint..).unwrap();
+ let next = iter.next().unwrap().unwrap();
+ assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc));
+ assert!(iter.next().is_none());
+ drop(iter);
+
+ let mut iter = tree.range_rev(..kint).unwrap();
+ let next = iter.next().unwrap().unwrap();
+ assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
+ assert!(iter.next().is_none());
+ drop(iter);
+
+ let mut iter = tree.iter_rev().unwrap();
+ let next = iter.next().unwrap().unwrap();
+ assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc));
+ let next = iter.next().unwrap().unwrap();
+ assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
+ assert!(iter.next().is_none());
+ drop(iter);
+}
+
+#[test]
+fn test_lmdb_db() {
+ let path = mktemp::Temp::new_dir().unwrap();
+ let db = heed::EnvOpenOptions::new()
+ .max_dbs(100)
+ .open(&path)
+ .unwrap();
+ let db = LmdbDb::init(db);
+ test_suite(db);
+ drop(path);
+}
+
+#[test]
+fn test_sled_db() {
+ let path = mktemp::Temp::new_dir().unwrap();
+ let db = SledDb::init(sled::open(path.to_path_buf()).unwrap());
+ test_suite(db);
+ drop(path);
+}
+
+#[test]
+fn test_sqlite_db() {
+ let db = SqliteDb::init(rusqlite::Connection::open_in_memory().unwrap());
+ test_suite(db);
+}
diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml
index 59f402ff..5ce40ff2 100644
--- a/src/garage/Cargo.toml
+++ b/src/garage/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "garage"
-version = "0.7.0"
+version = "0.8.0"
authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018"
license = "AGPL-3.0"
@@ -21,25 +21,25 @@ path = "tests/lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
-garage_api = { version = "0.7.0", path = "../api" }
-garage_model = { version = "0.7.0", path = "../model" }
-garage_rpc = { version = "0.7.0", path = "../rpc" }
-garage_table = { version = "0.7.0", path = "../table" }
-garage_util = { version = "0.7.0", path = "../util" }
-garage_web = { version = "0.7.0", path = "../web" }
-garage_admin = { version = "0.7.0", path = "../admin" }
+garage_db = { version = "0.8.0", path = "../db" }
+garage_api = { version = "0.8.0", path = "../api" }
+garage_block = { version = "0.8.0", path = "../block" }
+garage_model = { version = "0.8.0", path = "../model" }
+garage_rpc = { version = "0.8.0", path = "../rpc" }
+garage_table = { version = "0.8.0", path = "../table" }
+garage_util = { version = "0.8.0", path = "../util" }
+garage_web = { version = "0.8.0", path = "../web" }
bytes = "1.0"
-git-version = "0.3.4"
+bytesize = "1.1"
+timeago = "0.3"
hex = "0.4"
tracing = { version = "0.1.30", features = ["log-always"] }
-pretty_env_logger = "0.4"
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
rand = "0.8"
async-trait = "0.1.7"
sodiumoxide = { version = "0.2.5-0", package = "kuska-sodiumoxide" }
-sled = "0.34"
-
rmp-serde = "0.15"
serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
serde_bytes = "0.11"
@@ -50,16 +50,48 @@ futures = "0.3"
futures-util = "0.3"
tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
-#netapp = { version = "0.3.2", git = "https://git.deuxfleurs.fr/lx/netapp" }
-#netapp = { version = "0.4", path = "../../../netapp" }
-netapp = "0.4"
+netapp = "0.5"
+
+opentelemetry = { version = "0.17", features = [ "rt-tokio" ] }
+opentelemetry-prometheus = { version = "0.10", optional = true }
+opentelemetry-otlp = { version = "0.10", optional = true }
+prometheus = { version = "0.13", optional = true }
[dev-dependencies]
aws-sdk-s3 = "0.8"
chrono = "0.4"
http = "0.2"
-hmac = "0.10"
+hmac = "0.12"
hyper = { version = "0.14", features = ["client", "http1", "runtime"] }
-sha2 = "0.9"
+sha2 = "0.10"
static_init = "1.0"
+assert-json-diff = "2.0"
+serde_json = "1.0"
+base64 = "0.13"
+
+
+[features]
+default = [ "bundled-libs", "metrics", "sled" ]
+
+k2v = [ "garage_util/k2v", "garage_api/k2v" ]
+
+# Database engines; Sled is still our default even though we don't like it
+sled = [ "garage_model/sled" ]
+lmdb = [ "garage_model/lmdb" ]
+sqlite = [ "garage_model/sqlite" ]
+
+# Automatic registration and discovery via Kubernetes API
+kubernetes-discovery = [ "garage_rpc/kubernetes-discovery" ]
+# Prometheus exporter (/metrics endpoint).
+metrics = [ "garage_api/metrics", "opentelemetry-prometheus", "prometheus" ]
+# Exporter for the OpenTelemetry Collector.
+telemetry-otlp = [ "opentelemetry-otlp" ]
+
+# NOTE: bundled-libs and system-libs should be treated as mutually exclusive;
+# exactly one of them should be enabled.
+
+# Use bundled libsqlite instead of linking against the system-provided one.
+bundled-libs = [ "garage_db/bundled-libs" ]
+# Link against system-provided libsodium and libzstd.
+system-libs = [ "garage_block/system-libs", "garage_rpc/system-libs", "sodiumoxide/use-pkg-config" ]
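Taken together, the new `[features]` section makes the database engine a compile-time choice: the default set is `bundled-libs`, `metrics` and `sled`, so building with LMDB or SQLite support means opting in, e.g. `cargo build --features lmdb,sqlite`, and linking against system libraries means dropping the default, e.g. `cargo build --no-default-features --features system-libs,metrics,sled`. These invocations are illustrative; the compile-time checks added to main.rs further down in this diff enforce that exactly one of `bundled-libs`/`system-libs` is selected.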
diff --git a/src/garage/admin.rs b/src/garage/admin.rs
index 0b20bb20..802a8261 100644
--- a/src/garage/admin.rs
+++ b/src/garage/admin.rs
@@ -15,34 +15,45 @@ use garage_table::*;
use garage_rpc::*;
+use garage_block::repair::ScrubWorkerCommand;
+
use garage_model::bucket_alias_table::*;
use garage_model::bucket_table::*;
use garage_model::garage::Garage;
use garage_model::helper::error::{Error, OkOrBadRequest};
use garage_model::key_table::*;
use garage_model::migrate::Migrate;
-use garage_model::object_table::ObjectFilter;
use garage_model::permission::*;
use crate::cli::*;
-use crate::repair::Repair;
+use crate::repair::online::launch_online_repair;
pub const ADMIN_RPC_PATH: &str = "garage/admin_rpc.rs/Rpc";
#[derive(Debug, Serialize, Deserialize)]
+#[allow(clippy::large_enum_variant)]
pub enum AdminRpc {
BucketOperation(BucketOperation),
KeyOperation(KeyOperation),
LaunchRepair(RepairOpt),
Migrate(MigrateOpt),
Stats(StatsOpt),
+ Worker(WorkerOpt),
// Replies
Ok(String),
BucketList(Vec<Bucket>),
- BucketInfo(Bucket, HashMap<String, Key>),
+ BucketInfo {
+ bucket: Bucket,
+ relevant_keys: HashMap<String, Key>,
+ counters: HashMap<String, i64>,
+ },
KeyList(Vec<(String, String)>),
KeyInfo(Key, HashMap<Uuid, Bucket>),
+ WorkerList(
+ HashMap<usize, garage_util::background::WorkerInfo>,
+ WorkerListOpt,
+ ),
}
impl Rpc for AdminRpc {
@@ -73,6 +84,7 @@ impl AdminRpcHandler {
BucketOperation::Allow(query) => self.handle_bucket_allow(query).await,
BucketOperation::Deny(query) => self.handle_bucket_deny(query).await,
BucketOperation::Website(query) => self.handle_bucket_website(query).await,
+ BucketOperation::SetQuotas(query) => self.handle_bucket_set_quotas(query).await,
}
}
@@ -80,8 +92,15 @@ impl AdminRpcHandler {
let buckets = self
.garage
.bucket_table
- .get_range(&EmptyKey, None, Some(DeletedFilter::NotDeleted), 10000)
+ .get_range(
+ &EmptyKey,
+ None,
+ Some(DeletedFilter::NotDeleted),
+ 10000,
+ EnumerationOrder::Forward,
+ )
.await?;
+
Ok(AdminRpc::BucketList(buckets))
}
@@ -99,6 +118,15 @@ impl AdminRpcHandler {
.get_existing_bucket(bucket_id)
.await?;
+ let counters = self
+ .garage
+ .object_counter_table
+ .table
+ .get(&bucket_id, &EmptyKey)
+ .await?
+ .map(|x| x.filtered_values(&self.garage.system.ring.borrow()))
+ .unwrap_or_default();
+
let mut relevant_keys = HashMap::new();
for (k, _) in bucket
.state
@@ -134,7 +162,11 @@ impl AdminRpcHandler {
}
}
- Ok(AdminRpc::BucketInfo(bucket, relevant_keys))
+ Ok(AdminRpc::BucketInfo {
+ bucket,
+ relevant_keys,
+ counters,
+ })
}
#[allow(clippy::ptr_arg)]
@@ -207,12 +239,7 @@ impl AdminRpcHandler {
}
// Check bucket is empty
- let objects = self
- .garage
- .object_table
- .get_range(&bucket_id, None, Some(ObjectFilter::IsData), 10)
- .await?;
- if !objects.is_empty() {
+ if !helper.is_bucket_empty(bucket_id).await? {
return Err(Error::BadRequest(format!(
"Bucket {} is not empty",
query.name
@@ -249,6 +276,7 @@ impl AdminRpcHandler {
async fn handle_alias_bucket(&self, query: &AliasBucketOpt) -> Result<AdminRpc, Error> {
let helper = self.garage.bucket_helper();
+ let key_helper = self.garage.key_helper();
let bucket_id = helper
.resolve_global_bucket_name(&query.existing_bucket)
@@ -256,7 +284,7 @@ impl AdminRpcHandler {
.ok_or_bad_request("Bucket not found")?;
if let Some(key_pattern) = &query.local {
- let key = helper.get_existing_matching_key(key_pattern).await?;
+ let key = key_helper.get_existing_matching_key(key_pattern).await?;
helper
.set_local_bucket_alias(bucket_id, &key.key_id, &query.new_name)
@@ -278,9 +306,10 @@ impl AdminRpcHandler {
async fn handle_unalias_bucket(&self, query: &UnaliasBucketOpt) -> Result<AdminRpc, Error> {
let helper = self.garage.bucket_helper();
+ let key_helper = self.garage.key_helper();
if let Some(key_pattern) = &query.local {
- let key = helper.get_existing_matching_key(key_pattern).await?;
+ let key = key_helper.get_existing_matching_key(key_pattern).await?;
let bucket_id = key
.state
@@ -319,12 +348,15 @@ impl AdminRpcHandler {
async fn handle_bucket_allow(&self, query: &PermBucketOpt) -> Result<AdminRpc, Error> {
let helper = self.garage.bucket_helper();
+ let key_helper = self.garage.key_helper();
let bucket_id = helper
.resolve_global_bucket_name(&query.bucket)
.await?
.ok_or_bad_request("Bucket not found")?;
- let key = helper.get_existing_matching_key(&query.key_pattern).await?;
+ let key = key_helper
+ .get_existing_matching_key(&query.key_pattern)
+ .await?;
let allow_read = query.read || key.allow_read(&bucket_id);
let allow_write = query.write || key.allow_write(&bucket_id);
@@ -351,12 +383,15 @@ impl AdminRpcHandler {
async fn handle_bucket_deny(&self, query: &PermBucketOpt) -> Result<AdminRpc, Error> {
let helper = self.garage.bucket_helper();
+ let key_helper = self.garage.key_helper();
let bucket_id = helper
.resolve_global_bucket_name(&query.bucket)
.await?
.ok_or_bad_request("Bucket not found")?;
- let key = helper.get_existing_matching_key(&query.key_pattern).await?;
+ let key = key_helper
+ .get_existing_matching_key(&query.key_pattern)
+ .await?;
let allow_read = !query.read && key.allow_read(&bucket_id);
let allow_write = !query.write && key.allow_write(&bucket_id);
@@ -423,6 +458,60 @@ impl AdminRpcHandler {
Ok(AdminRpc::Ok(msg))
}
+ async fn handle_bucket_set_quotas(&self, query: &SetQuotasOpt) -> Result<AdminRpc, Error> {
+ let bucket_id = self
+ .garage
+ .bucket_helper()
+ .resolve_global_bucket_name(&query.bucket)
+ .await?
+ .ok_or_bad_request("Bucket not found")?;
+
+ let mut bucket = self
+ .garage
+ .bucket_helper()
+ .get_existing_bucket(bucket_id)
+ .await?;
+ let bucket_state = bucket.state.as_option_mut().unwrap();
+
+ if query.max_size.is_none() && query.max_objects.is_none() {
+ return Err(Error::BadRequest(
+				"You must specify either --max-size or --max-objects (or both) for this command to have any effect.".to_string(),
+ ));
+ }
+
+ let mut quotas = bucket_state.quotas.get().clone();
+
+ match query.max_size.as_ref().map(String::as_ref) {
+ Some("none") => quotas.max_size = None,
+ Some(v) => {
+ let bs = v
+ .parse::<bytesize::ByteSize>()
+ .ok_or_bad_request(format!("Invalid size specified: {}", v))?;
+ quotas.max_size = Some(bs.as_u64());
+ }
+ _ => (),
+ }
+
+ match query.max_objects.as_ref().map(String::as_ref) {
+ Some("none") => quotas.max_objects = None,
+ Some(v) => {
+ let mo = v
+ .parse::<u64>()
+ .ok_or_bad_request(format!("Invalid number specified: {}", v))?;
+ quotas.max_objects = Some(mo);
+ }
+ _ => (),
+ }
+
+ bucket_state.quotas.update(quotas);
+ self.garage.bucket_table.insert(&bucket).await?;
+
+ Ok(AdminRpc::Ok(format!(
+ "Quotas updated for {}",
+ &query.bucket
+ )))
+ }
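This handler is reached through the new `bucket set-quotas` subcommand (see `SetQuotasOpt` in `cli/structs.rs` further down in this diff). An illustrative invocation, with placeholder bucket name and values: `garage bucket set-quotas my-bucket --max-size 50GiB --max-objects none`. Sizes are parsed with `bytesize`, so suffixes such as `MiB` and `GiB` are accepted, and the literal `none` clears the corresponding quota.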
+
async fn handle_key_cmd(&self, cmd: &KeyOperation) -> Result<AdminRpc, Error> {
match cmd {
KeyOperation::List => self.handle_list_keys().await,
@@ -445,6 +534,7 @@ impl AdminRpcHandler {
None,
Some(KeyFilter::Deleted(DeletedFilter::NotDeleted)),
10000,
+ EnumerationOrder::Forward,
)
.await?
.iter()
@@ -456,7 +546,7 @@ impl AdminRpcHandler {
async fn handle_key_info(&self, query: &KeyOpt) -> Result<AdminRpc, Error> {
let key = self
.garage
- .bucket_helper()
+ .key_helper()
.get_existing_matching_key(&query.key_pattern)
.await?;
self.key_info_result(key).await
@@ -471,7 +561,7 @@ impl AdminRpcHandler {
async fn handle_rename_key(&self, query: &KeyRenameOpt) -> Result<AdminRpc, Error> {
let mut key = self
.garage
- .bucket_helper()
+ .key_helper()
.get_existing_matching_key(&query.key_pattern)
.await?;
key.params_mut()
@@ -483,9 +573,11 @@ impl AdminRpcHandler {
}
async fn handle_delete_key(&self, query: &KeyDeleteOpt) -> Result<AdminRpc, Error> {
- let helper = self.garage.bucket_helper();
+ let key_helper = self.garage.key_helper();
- let mut key = helper.get_existing_matching_key(&query.key_pattern).await?;
+ let mut key = key_helper
+ .get_existing_matching_key(&query.key_pattern)
+ .await?;
if !query.yes {
return Err(Error::BadRequest(
@@ -493,32 +585,7 @@ impl AdminRpcHandler {
));
}
- let state = key.state.as_option_mut().unwrap();
-
- // --- done checking, now commit ---
- // (the step at unset_local_bucket_alias will fail if a bucket
- // does not have another alias, the deletion will be
- // interrupted in the middle if that happens)
-
- // 1. Delete local aliases
- for (alias, _, to) in state.local_aliases.items().iter() {
- if let Some(bucket_id) = to {
- helper
- .unset_local_bucket_alias(*bucket_id, &key.key_id, alias)
- .await?;
- }
- }
-
- // 2. Remove permissions on all authorized buckets
- for (ab_id, _auth) in state.authorized_buckets.items().iter() {
- helper
- .set_bucket_key_permissions(*ab_id, &key.key_id, BucketKeyPerm::NO_PERMISSIONS)
- .await?;
- }
-
- // 3. Actually delete key
- key.state = Deletable::delete();
- self.garage.key_table.insert(&key).await?;
+ key_helper.delete_key(&mut key).await?;
Ok(AdminRpc::Ok(format!(
"Key {} was deleted successfully.",
@@ -529,7 +596,7 @@ impl AdminRpcHandler {
async fn handle_allow_key(&self, query: &KeyPermOpt) -> Result<AdminRpc, Error> {
let mut key = self
.garage
- .bucket_helper()
+ .key_helper()
.get_existing_matching_key(&query.key_pattern)
.await?;
if query.create_bucket {
@@ -542,7 +609,7 @@ impl AdminRpcHandler {
async fn handle_deny_key(&self, query: &KeyPermOpt) -> Result<AdminRpc, Error> {
let mut key = self
.garage
- .bucket_helper()
+ .key_helper()
.get_existing_matching_key(&query.key_pattern)
.await?;
if query.create_bucket {
@@ -616,7 +683,7 @@ impl AdminRpcHandler {
.endpoint
.call(
&node,
- &AdminRpc::LaunchRepair(opt_to_send.clone()),
+ AdminRpc::LaunchRepair(opt_to_send.clone()),
PRIO_NORMAL,
)
.await;
@@ -633,15 +700,7 @@ impl AdminRpcHandler {
)))
}
} else {
- let repair = Repair {
- garage: self.garage.clone(),
- };
- self.garage
- .system
- .background
- .spawn_worker("Repair worker".into(), move |must_exit| async move {
- repair.repair_worker(opt, must_exit).await
- });
+ launch_online_repair(self.garage.clone(), opt).await;
Ok(AdminRpc::Ok(format!(
"Repair launched on {:?}",
self.garage.system.id
@@ -664,7 +723,7 @@ impl AdminRpcHandler {
let node_id = (*node).into();
match self
.endpoint
- .call(&node_id, &AdminRpc::Stats(opt), PRIO_NORMAL)
+ .call(&node_id, AdminRpc::Stats(opt), PRIO_NORMAL)
.await?
{
Ok(AdminRpc::Ok(s)) => writeln!(&mut ret, "{}", s).unwrap(),
@@ -674,22 +733,22 @@ impl AdminRpcHandler {
}
Ok(AdminRpc::Ok(ret))
} else {
- Ok(AdminRpc::Ok(self.gather_stats_local(opt)))
+ Ok(AdminRpc::Ok(self.gather_stats_local(opt)?))
}
}
- fn gather_stats_local(&self, opt: StatsOpt) -> String {
+ fn gather_stats_local(&self, opt: StatsOpt) -> Result<String, Error> {
let mut ret = String::new();
writeln!(
&mut ret,
- "\nGarage version: {}",
- option_env!("GIT_VERSION").unwrap_or(git_version::git_version!(
- prefix = "git:",
- cargo_prefix = "cargo:",
- fallback = "unknown"
- ))
+ "\nGarage version: {} [features: {}]",
+ garage_util::version::garage_version(),
+ garage_util::version::garage_features()
+ .map(|list| list.join(", "))
+ .unwrap_or_else(|| "(unknown)".into()),
)
.unwrap();
+ writeln!(&mut ret, "\nDatabase engine: {}", self.garage.db.engine()).unwrap();
// Gather ring statistics
let ring = self.garage.system.ring.borrow().clone();
@@ -707,59 +766,108 @@ impl AdminRpcHandler {
writeln!(&mut ret, " {:?} {}", n, c).unwrap();
}
- self.gather_table_stats(&mut ret, &self.garage.bucket_table, &opt);
- self.gather_table_stats(&mut ret, &self.garage.key_table, &opt);
- self.gather_table_stats(&mut ret, &self.garage.object_table, &opt);
- self.gather_table_stats(&mut ret, &self.garage.version_table, &opt);
- self.gather_table_stats(&mut ret, &self.garage.block_ref_table, &opt);
+ self.gather_table_stats(&mut ret, &self.garage.bucket_table, &opt)?;
+ self.gather_table_stats(&mut ret, &self.garage.key_table, &opt)?;
+ self.gather_table_stats(&mut ret, &self.garage.object_table, &opt)?;
+ self.gather_table_stats(&mut ret, &self.garage.version_table, &opt)?;
+ self.gather_table_stats(&mut ret, &self.garage.block_ref_table, &opt)?;
writeln!(&mut ret, "\nBlock manager stats:").unwrap();
if opt.detailed {
writeln!(
&mut ret,
" number of RC entries (~= number of blocks): {}",
- self.garage.block_manager.rc_len()
+ self.garage.block_manager.rc_len()?
)
.unwrap();
}
writeln!(
&mut ret,
" resync queue length: {}",
- self.garage.block_manager.resync_queue_len()
+ self.garage.block_manager.resync.queue_len()?
)
.unwrap();
writeln!(
&mut ret,
" blocks with resync errors: {}",
- self.garage.block_manager.resync_errors_len()
+ self.garage.block_manager.resync.errors_len()?
)
.unwrap();
- ret
+ Ok(ret)
}
- fn gather_table_stats<F, R>(&self, to: &mut String, t: &Arc<Table<F, R>>, opt: &StatsOpt)
+ fn gather_table_stats<F, R>(
+ &self,
+ to: &mut String,
+ t: &Arc<Table<F, R>>,
+ opt: &StatsOpt,
+ ) -> Result<(), Error>
where
F: TableSchema + 'static,
R: TableReplication + 'static,
{
writeln!(to, "\nTable stats for {}", F::TABLE_NAME).unwrap();
if opt.detailed {
- writeln!(to, " number of items: {}", t.data.store.len()).unwrap();
+ writeln!(
+ to,
+ " number of items: {}",
+ t.data.store.len().map_err(GarageError::from)?
+ )
+ .unwrap();
writeln!(
to,
" Merkle tree size: {}",
- t.merkle_updater.merkle_tree_len()
+ t.merkle_updater.merkle_tree_len()?
)
.unwrap();
}
writeln!(
to,
" Merkle updater todo queue length: {}",
- t.merkle_updater.todo_len()
+ t.merkle_updater.todo_len()?
)
.unwrap();
- writeln!(to, " GC todo queue length: {}", t.data.gc_todo_len()).unwrap();
+ writeln!(to, " GC todo queue length: {}", t.data.gc_todo_len()?).unwrap();
+
+ Ok(())
+ }
+
+ // ----
+
+ async fn handle_worker_cmd(&self, opt: WorkerOpt) -> Result<AdminRpc, Error> {
+ match opt.cmd {
+ WorkerCmd::List { opt } => {
+ let workers = self.garage.background.get_worker_info();
+ Ok(AdminRpc::WorkerList(workers, opt))
+ }
+ WorkerCmd::Set { opt } => match opt {
+ WorkerSetCmd::ScrubTranquility { tranquility } => {
+ let scrub_command = ScrubWorkerCommand::SetTranquility(tranquility);
+ self.garage
+ .block_manager
+ .send_scrub_command(scrub_command)
+ .await;
+ Ok(AdminRpc::Ok("Scrub tranquility updated".into()))
+ }
+ WorkerSetCmd::ResyncNWorkers { n_workers } => {
+ self.garage
+ .block_manager
+ .resync
+ .set_n_workers(n_workers)
+ .await?;
+ Ok(AdminRpc::Ok("Number of resync workers updated".into()))
+ }
+ WorkerSetCmd::ResyncTranquility { tranquility } => {
+ self.garage
+ .block_manager
+ .resync
+ .set_tranquility(tranquility)
+ .await?;
+ Ok(AdminRpc::Ok("Resync tranquility updated".into()))
+ }
+ },
+ }
}
}
@@ -776,6 +884,7 @@ impl EndpointHandler<AdminRpc> for AdminRpcHandler {
AdminRpc::Migrate(opt) => self.handle_migrate(opt.clone()).await,
AdminRpc::LaunchRepair(opt) => self.handle_launch_repair(opt.clone()).await,
AdminRpc::Stats(opt) => self.handle_stats(opt.clone()).await,
+ AdminRpc::Worker(opt) => self.handle_worker_cmd(opt.clone()).await,
m => Err(GarageError::unexpected_rpc_message(m).into()),
}
}
diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs
index a90277a0..c8b96489 100644
--- a/src/garage/cli/cmd.rs
+++ b/src/garage/cli/cmd.rs
@@ -1,6 +1,8 @@
use std::collections::HashSet;
+use std::time::Duration;
use garage_util::error::*;
+use garage_util::formater::format_table;
use garage_rpc::layout::*;
use garage_rpc::system::*;
@@ -38,13 +40,14 @@ pub async fn cli_command_dispatch(
cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::LaunchRepair(ro)).await
}
Command::Stats(so) => cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::Stats(so)).await,
+ Command::Worker(wo) => cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::Worker(wo)).await,
_ => unreachable!(),
}
}
pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) -> Result<(), Error> {
let status = match rpc_cli
- .call(&rpc_host, &SystemRpc::GetKnownNodes, PRIO_NORMAL)
+ .call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL)
.await??
{
SystemRpc::ReturnKnownNodes(nodes) => nodes,
@@ -85,19 +88,21 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
format_table(healthy_nodes);
let status_keys = status.iter().map(|adv| adv.id).collect::<HashSet<_>>();
- let failure_case_1 = status.iter().any(|adv| !adv.is_up);
+ let failure_case_1 = status
+ .iter()
+ .any(|adv| !adv.is_up && matches!(layout.roles.get(&adv.id), Some(NodeRoleV(Some(_)))));
let failure_case_2 = layout
.roles
.items()
.iter()
- .filter(|(_, _, v)| v.0.is_some())
- .any(|(id, _, _)| !status_keys.contains(id));
+ .any(|(id, _, v)| !status_keys.contains(id) && v.0.is_some());
if failure_case_1 || failure_case_2 {
println!("\n==== FAILED NODES ====");
let mut failed_nodes =
vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()];
for adv in status.iter().filter(|adv| !adv.is_up) {
if let Some(NodeRoleV(Some(cfg))) = layout.roles.get(&adv.id) {
+ let tf = timeago::Formatter::new();
failed_nodes.push(format!(
"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}",
id = adv.id,
@@ -108,7 +113,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
capacity = cfg.capacity_string(),
last_seen = adv
.last_seen_secs_ago
- .map(|s| format!("{}s ago", s))
+ .map(|s| tf.convert(Duration::from_secs(s)))
.unwrap_or_else(|| "never seen".into()),
));
}
@@ -144,7 +149,7 @@ pub async fn cmd_connect(
args: ConnectNodeOpt,
) -> Result<(), Error> {
match rpc_cli
- .call(&rpc_host, &SystemRpc::Connect(args.node), PRIO_NORMAL)
+ .call(&rpc_host, SystemRpc::Connect(args.node), PRIO_NORMAL)
.await??
{
SystemRpc::Ok => {
@@ -160,15 +165,19 @@ pub async fn cmd_admin(
rpc_host: NodeID,
args: AdminRpc,
) -> Result<(), HelperError> {
- match rpc_cli.call(&rpc_host, &args, PRIO_NORMAL).await?? {
+ match rpc_cli.call(&rpc_host, args, PRIO_NORMAL).await?? {
AdminRpc::Ok(msg) => {
println!("{}", msg);
}
AdminRpc::BucketList(bl) => {
print_bucket_list(bl);
}
- AdminRpc::BucketInfo(bucket, rk) => {
- print_bucket_info(&bucket, &rk);
+ AdminRpc::BucketInfo {
+ bucket,
+ relevant_keys,
+ counters,
+ } => {
+ print_bucket_info(&bucket, &relevant_keys, &counters);
}
AdminRpc::KeyList(kl) => {
print_key_list(kl);
@@ -176,6 +185,9 @@ pub async fn cmd_admin(
AdminRpc::KeyInfo(key, rb) => {
print_key_info(&key, &rb);
}
+ AdminRpc::WorkerList(wi, wlo) => {
+ print_worker_info(wi, wlo);
+ }
r => {
error!("Unexpected response: {:?}", r);
}
diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs
index e76f7737..3884bb92 100644
--- a/src/garage/cli/layout.rs
+++ b/src/garage/cli/layout.rs
@@ -1,6 +1,6 @@
use garage_util::crdt::Crdt;
-use garage_util::data::*;
use garage_util::error::*;
+use garage_util::formater::format_table;
use garage_rpc::layout::*;
use garage_rpc::system::*;
@@ -36,21 +36,29 @@ pub async fn cmd_assign_role(
args: AssignRoleOpt,
) -> Result<(), Error> {
let status = match rpc_cli
- .call(&rpc_host, &SystemRpc::GetKnownNodes, PRIO_NORMAL)
+ .call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL)
.await??
{
SystemRpc::ReturnKnownNodes(nodes) => nodes,
resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
};
+ let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
+
let added_nodes = args
.node_ids
.iter()
- .map(|node_id| find_matching_node(status.iter().map(|adv| adv.id), node_id))
+ .map(|node_id| {
+ find_matching_node(
+ status
+ .iter()
+ .map(|adv| adv.id)
+ .chain(layout.node_ids().iter().cloned()),
+ node_id,
+ )
+ })
.collect::<Result<Vec<_>, _>>()?;
- let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
-
let mut roles = layout.roles.clone();
roles.merge(&layout.staging);
@@ -203,31 +211,9 @@ pub async fn cmd_apply_layout(
rpc_host: NodeID,
apply_opt: ApplyLayoutOpt,
) -> Result<(), Error> {
- let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
-
- match apply_opt.version {
- None => {
- println!("Please pass the --version flag to ensure that you are writing the correct version of the cluster layout.");
- println!("To know the correct value of the --version flag, invoke `garage layout show` and review the proposed changes.");
- return Err(Error::Message("--version flag is missing".into()));
- }
- Some(v) => {
- if v != layout.version + 1 {
- return Err(Error::Message("Invalid value of --version flag".into()));
- }
- }
- }
-
- layout.roles.merge(&layout.staging);
-
- if !layout.calculate_partition_assignation() {
- return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into()));
- }
+ let layout = fetch_layout(rpc_cli, rpc_host).await?;
- layout.staging.clear();
- layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]);
-
- layout.version += 1;
+ let layout = layout.apply_staged_changes(apply_opt.version)?;
send_layout(rpc_cli, rpc_host, layout).await?;
@@ -242,25 +228,9 @@ pub async fn cmd_revert_layout(
rpc_host: NodeID,
revert_opt: RevertLayoutOpt,
) -> Result<(), Error> {
- let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
-
- match revert_opt.version {
- None => {
- println!("Please pass the --version flag to ensure that you are writing the correct version of the cluster layout.");
- println!("To know the correct value of the --version flag, invoke `garage layout show` and review the proposed changes.");
- return Err(Error::Message("--version flag is missing".into()));
- }
- Some(v) => {
- if v != layout.version + 1 {
- return Err(Error::Message("Invalid value of --version flag".into()));
- }
- }
- }
-
- layout.staging.clear();
- layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]);
+ let layout = fetch_layout(rpc_cli, rpc_host).await?;
- layout.version += 1;
+ let layout = layout.revert_staged_changes(revert_opt.version)?;
send_layout(rpc_cli, rpc_host, layout).await?;
@@ -275,7 +245,7 @@ pub async fn fetch_layout(
rpc_host: NodeID,
) -> Result<ClusterLayout, Error> {
match rpc_cli
- .call(&rpc_host, &SystemRpc::PullClusterLayout, PRIO_NORMAL)
+ .call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL)
.await??
{
SystemRpc::AdvertiseClusterLayout(t) => Ok(t),
@@ -291,7 +261,7 @@ pub async fn send_layout(
rpc_cli
.call(
&rpc_host,
- &SystemRpc::AdvertiseClusterLayout(layout),
+ SystemRpc::AdvertiseClusterLayout(layout),
PRIO_NORMAL,
)
.await??;
@@ -323,11 +293,20 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool {
}
pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool {
- if !layout.staging.items().is_empty() {
+ let has_changes = layout
+ .staging
+ .items()
+ .iter()
+ .any(|(k, _, v)| layout.roles.get(k) != Some(v));
+
+ if has_changes {
println!();
println!("==== STAGED ROLE CHANGES ====");
let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()];
for (id, _, role) in layout.staging.items().iter() {
+ if layout.roles.get(id) == Some(role) {
+ continue;
+ }
if let Some(role) = &role.0 {
let tags = role.tags.join(",");
table.push(format!(
diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs
index a0c49aeb..06548e89 100644
--- a/src/garage/cli/structs.rs
+++ b/src/garage/cli/structs.rs
@@ -1,55 +1,65 @@
use serde::{Deserialize, Serialize};
-
use structopt::StructOpt;
+use garage_util::version::garage_version;
+
#[derive(StructOpt, Debug)]
pub enum Command {
/// Run Garage server
- #[structopt(name = "server")]
+ #[structopt(name = "server", version = garage_version())]
Server,
/// Get network status
- #[structopt(name = "status")]
+ #[structopt(name = "status", version = garage_version())]
Status,
/// Operations on individual Garage nodes
- #[structopt(name = "node")]
+ #[structopt(name = "node", version = garage_version())]
Node(NodeOperation),
/// Operations on the assignation of node roles in the cluster layout
- #[structopt(name = "layout")]
+ #[structopt(name = "layout", version = garage_version())]
Layout(LayoutOperation),
/// Operations on buckets
- #[structopt(name = "bucket")]
+ #[structopt(name = "bucket", version = garage_version())]
Bucket(BucketOperation),
/// Operations on S3 access keys
- #[structopt(name = "key")]
+ #[structopt(name = "key", version = garage_version())]
Key(KeyOperation),
/// Run migrations from previous Garage version
/// (DO NOT USE WITHOUT READING FULL DOCUMENTATION)
- #[structopt(name = "migrate")]
+ #[structopt(name = "migrate", version = garage_version())]
Migrate(MigrateOpt),
- /// Start repair of node data
- #[structopt(name = "repair")]
+ /// Start repair of node data on remote node
+ #[structopt(name = "repair", version = garage_version())]
Repair(RepairOpt),
+	/// Offline repair of node data (these repairs must be run offline,
+	/// directly on the server node)
+ #[structopt(name = "offline-repair", version = garage_version())]
+ OfflineRepair(OfflineRepairOpt),
+
/// Gather node statistics
- #[structopt(name = "stats")]
+ #[structopt(name = "stats", version = garage_version())]
Stats(StatsOpt),
+
+ /// Manage background workers
+ #[structopt(name = "worker", version = garage_version())]
+ Worker(WorkerOpt),
}
#[derive(StructOpt, Debug)]
pub enum NodeOperation {
/// Print identifier (public key) of this Garage node
- #[structopt(name = "id")]
+ #[structopt(name = "id", version = garage_version())]
NodeId(NodeIdOpt),
/// Connect to Garage node that is currently isolated from the system
- #[structopt(name = "connect")]
+ #[structopt(name = "connect", version = garage_version())]
Connect(ConnectNodeOpt),
}
@@ -70,23 +80,23 @@ pub struct ConnectNodeOpt {
#[derive(StructOpt, Debug)]
pub enum LayoutOperation {
/// Assign role to Garage node
- #[structopt(name = "assign")]
+ #[structopt(name = "assign", version = garage_version())]
Assign(AssignRoleOpt),
/// Remove role from Garage cluster node
- #[structopt(name = "remove")]
+ #[structopt(name = "remove", version = garage_version())]
Remove(RemoveRoleOpt),
/// Show roles currently assigned to nodes and changes staged for commit
- #[structopt(name = "show")]
+ #[structopt(name = "show", version = garage_version())]
Show,
/// Apply staged changes to cluster layout
- #[structopt(name = "apply")]
+ #[structopt(name = "apply", version = garage_version())]
Apply(ApplyLayoutOpt),
/// Revert staged changes to cluster layout
- #[structopt(name = "revert")]
+ #[structopt(name = "revert", version = garage_version())]
Revert(RevertLayoutOpt),
}
@@ -141,40 +151,44 @@ pub struct RevertLayoutOpt {
#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub enum BucketOperation {
/// List buckets
- #[structopt(name = "list")]
+ #[structopt(name = "list", version = garage_version())]
List,
/// Get bucket info
- #[structopt(name = "info")]
+ #[structopt(name = "info", version = garage_version())]
Info(BucketOpt),
/// Create bucket
- #[structopt(name = "create")]
+ #[structopt(name = "create", version = garage_version())]
Create(BucketOpt),
/// Delete bucket
- #[structopt(name = "delete")]
+ #[structopt(name = "delete", version = garage_version())]
Delete(DeleteBucketOpt),
/// Alias bucket under new name
- #[structopt(name = "alias")]
+ #[structopt(name = "alias", version = garage_version())]
Alias(AliasBucketOpt),
/// Remove bucket alias
- #[structopt(name = "unalias")]
+ #[structopt(name = "unalias", version = garage_version())]
Unalias(UnaliasBucketOpt),
/// Allow key to read or write to bucket
- #[structopt(name = "allow")]
+ #[structopt(name = "allow", version = garage_version())]
Allow(PermBucketOpt),
/// Deny key from reading or writing to bucket
- #[structopt(name = "deny")]
+ #[structopt(name = "deny", version = garage_version())]
Deny(PermBucketOpt),
/// Expose as website or not
- #[structopt(name = "website")]
+ #[structopt(name = "website", version = garage_version())]
Website(WebsiteOpt),
+
+ /// Set the quotas for this bucket
+ #[structopt(name = "set-quotas", version = garage_version())]
+ SetQuotas(SetQuotasOpt),
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
@@ -262,37 +276,52 @@ pub struct PermBucketOpt {
}
#[derive(Serialize, Deserialize, StructOpt, Debug)]
+pub struct SetQuotasOpt {
+ /// Bucket name
+ pub bucket: String,
+
+ /// Set a maximum size for the bucket (specify a size e.g. in MiB or GiB,
+ /// or `none` for no size restriction)
+ #[structopt(long = "max-size")]
+ pub max_size: Option<String>,
+
+ /// Set a maximum number of objects for the bucket (or `none` for no restriction)
+ #[structopt(long = "max-objects")]
+ pub max_objects: Option<String>,
+}
+
+#[derive(Serialize, Deserialize, StructOpt, Debug)]
pub enum KeyOperation {
/// List keys
- #[structopt(name = "list")]
+ #[structopt(name = "list", version = garage_version())]
List,
/// Get key info
- #[structopt(name = "info")]
+ #[structopt(name = "info", version = garage_version())]
Info(KeyOpt),
/// Create new key
- #[structopt(name = "new")]
+ #[structopt(name = "new", version = garage_version())]
New(KeyNewOpt),
/// Rename key
- #[structopt(name = "rename")]
+ #[structopt(name = "rename", version = garage_version())]
Rename(KeyRenameOpt),
/// Delete key
- #[structopt(name = "delete")]
+ #[structopt(name = "delete", version = garage_version())]
Delete(KeyDeleteOpt),
/// Set permission flags for key
- #[structopt(name = "allow")]
+ #[structopt(name = "allow", version = garage_version())]
Allow(KeyPermOpt),
/// Unset permission flags for key
- #[structopt(name = "deny")]
+ #[structopt(name = "deny", version = garage_version())]
Deny(KeyPermOpt),
/// Import key
- #[structopt(name = "import")]
+ #[structopt(name = "import", version = garage_version())]
Import(KeyImportOpt),
}
@@ -364,7 +393,7 @@ pub struct MigrateOpt {
#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
pub enum MigrateWhat {
/// Migrate buckets and permissions from v0.5.0
- #[structopt(name = "buckets050")]
+ #[structopt(name = "buckets050", version = garage_version())]
Buckets050,
}
@@ -385,27 +414,69 @@ pub struct RepairOpt {
#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
pub enum RepairWhat {
/// Only do a full sync of metadata tables
- #[structopt(name = "tables")]
+ #[structopt(name = "tables", version = garage_version())]
Tables,
/// Only repair (resync/rebalance) the set of stored blocks
- #[structopt(name = "blocks")]
+ #[structopt(name = "blocks", version = garage_version())]
Blocks,
/// Only redo the propagation of object deletions to the version table (slow)
- #[structopt(name = "versions")]
+ #[structopt(name = "versions", version = garage_version())]
Versions,
/// Only redo the propagation of version deletions to the block ref table (extremely slow)
- #[structopt(name = "block_refs")]
+ #[structopt(name = "block_refs", version = garage_version())]
BlockRefs,
/// Verify integrity of all blocks on disc (extremely slow, i/o intensive)
- #[structopt(name = "scrub")]
+ #[structopt(name = "scrub", version = garage_version())]
Scrub {
- /// Tranquility factor (see tranquilizer documentation)
- #[structopt(name = "tranquility", default_value = "2")]
+ #[structopt(subcommand)]
+ cmd: ScrubCmd,
+ },
+}
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
+pub enum ScrubCmd {
+ /// Start scrub
+ #[structopt(name = "start", version = garage_version())]
+ Start,
+ /// Pause scrub (it will resume automatically after 24 hours)
+ #[structopt(name = "pause", version = garage_version())]
+ Pause,
+ /// Resume paused scrub
+ #[structopt(name = "resume", version = garage_version())]
+ Resume,
+ /// Cancel scrub in progress
+ #[structopt(name = "cancel", version = garage_version())]
+ Cancel,
+ /// Set tranquility level for in-progress and future scrubs
+ #[structopt(name = "set-tranquility", version = garage_version())]
+ SetTranquility {
+ #[structopt()]
tranquility: u32,
},
}
#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
+pub struct OfflineRepairOpt {
+ /// Confirm the launch of the repair operation
+ #[structopt(long = "yes")]
+ pub yes: bool,
+
+ #[structopt(subcommand)]
+ pub what: OfflineRepairWhat,
+}
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
+pub enum OfflineRepairWhat {
+ /// Repair K2V item counters
+ #[cfg(feature = "k2v")]
+ #[structopt(name = "k2v_item_counters", version = garage_version())]
+ K2VItemCounters,
+ /// Repair object counters
+ #[structopt(name = "object_counters", version = garage_version())]
+ ObjectCounters,
+}
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
pub struct StatsOpt {
/// Gather statistics from all nodes
#[structopt(short = "a", long = "all-nodes")]
@@ -415,3 +486,48 @@ pub struct StatsOpt {
#[structopt(short = "d", long = "detailed")]
pub detailed: bool,
}
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
+pub struct WorkerOpt {
+ #[structopt(subcommand)]
+ pub cmd: WorkerCmd,
+}
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
+pub enum WorkerCmd {
+ /// List all workers on Garage node
+ #[structopt(name = "list", version = garage_version())]
+ List {
+ #[structopt(flatten)]
+ opt: WorkerListOpt,
+ },
+ /// Set worker parameter
+ #[structopt(name = "set", version = garage_version())]
+ Set {
+ #[structopt(subcommand)]
+ opt: WorkerSetCmd,
+ },
+}
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone, Copy)]
+pub struct WorkerListOpt {
+ /// Show only busy workers
+ #[structopt(short = "b", long = "busy")]
+ pub busy: bool,
+ /// Show only workers with errors
+ #[structopt(short = "e", long = "errors")]
+ pub errors: bool,
+}
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
+pub enum WorkerSetCmd {
+ /// Set tranquility of scrub operations
+ #[structopt(name = "scrub-tranquility", version = garage_version())]
+ ScrubTranquility { tranquility: u32 },
+ /// Set number of concurrent block resync workers
+ #[structopt(name = "resync-n-workers", version = garage_version())]
+ ResyncNWorkers { n_workers: usize },
+ /// Set tranquility of block resync operations
+ #[structopt(name = "resync-tranquility", version = garage_version())]
+ ResyncTranquility { tranquility: u32 },
+}
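Together these types define the new `garage worker` subcommand. Given the names declared above, plausible invocations are `garage worker list --busy` (show only active workers), `garage worker set resync-n-workers 4`, and `garage worker set scrub-tranquility 2`; the numeric values here are illustrative.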
diff --git a/src/garage/cli/util.rs b/src/garage/cli/util.rs
index 7d496507..396938ae 100644
--- a/src/garage/cli/util.rs
+++ b/src/garage/cli/util.rs
@@ -1,11 +1,18 @@
use std::collections::HashMap;
+use std::time::Duration;
+use garage_util::background::*;
use garage_util::crdt::*;
use garage_util::data::Uuid;
use garage_util::error::*;
+use garage_util::formater::format_table;
+use garage_util::time::*;
use garage_model::bucket_table::*;
use garage_model::key_table::*;
+use garage_model::s3::object_table::{BYTES, OBJECTS, UNFINISHED_UPLOADS};
+
+use crate::cli::structs::WorkerListOpt;
pub fn print_bucket_list(bl: Vec<Bucket>) {
println!("List of buckets:");
@@ -28,11 +35,12 @@ pub fn print_bucket_list(bl: Vec<Bucket>) {
[((k, n), _, _)] => format!("{}:{}", k, n),
s => format!("[{} local aliases]", s.len()),
};
+
table.push(format!(
"\t{}\t{}\t{}",
aliases.join(","),
local_aliases_n,
- hex::encode(bucket.id)
+ hex::encode(bucket.id),
));
}
format_table(table);
@@ -120,7 +128,11 @@ pub fn print_key_info(key: &Key, relevant_buckets: &HashMap<Uuid, Bucket>) {
}
}
-pub fn print_bucket_info(bucket: &Bucket, relevant_keys: &HashMap<String, Key>) {
+pub fn print_bucket_info(
+ bucket: &Bucket,
+ relevant_keys: &HashMap<String, Key>,
+ counters: &HashMap<String, i64>,
+) {
let key_name = |k| {
relevant_keys
.get(k)
@@ -132,7 +144,42 @@ pub fn print_bucket_info(bucket: &Bucket, relevant_keys: &HashMap<String, Key>)
match &bucket.state {
Deletable::Deleted => println!("Bucket is deleted."),
Deletable::Present(p) => {
- println!("Website access: {}", p.website_config.get().is_some());
+ let size =
+ bytesize::ByteSize::b(counters.get(BYTES).cloned().unwrap_or_default() as u64);
+ println!(
+ "\nSize: {} ({})",
+ size.to_string_as(true),
+ size.to_string_as(false)
+ );
+ println!(
+ "Objects: {}",
+ counters.get(OBJECTS).cloned().unwrap_or_default()
+ );
+ println!(
+ "Unfinished multipart uploads: {}",
+ counters
+ .get(UNFINISHED_UPLOADS)
+ .cloned()
+ .unwrap_or_default()
+ );
+
+ println!("\nWebsite access: {}", p.website_config.get().is_some());
+
+ let quotas = p.quotas.get();
+ if quotas.max_size.is_some() || quotas.max_objects.is_some() {
+ println!("\nQuotas:");
+ if let Some(ms) = quotas.max_size {
+ let ms = bytesize::ByteSize::b(ms);
+ println!(
+ " maximum size: {} ({})",
+ ms.to_string_as(true),
+ ms.to_string_as(false)
+ );
+ }
+ if let Some(mo) = quotas.max_objects {
+ println!(" maximum number of objects: {}", mo);
+ }
+ }
println!("\nGlobal aliases:");
for (alias, _, active) in p.aliases.items().iter() {
@@ -173,42 +220,13 @@ pub fn print_bucket_info(bucket: &Bucket, relevant_keys: &HashMap<String, Key>)
};
}
-pub fn format_table(data: Vec<String>) {
- let data = data
- .iter()
- .map(|s| s.split('\t').collect::<Vec<_>>())
- .collect::<Vec<_>>();
-
- let columns = data.iter().map(|row| row.len()).fold(0, std::cmp::max);
- let mut column_size = vec![0; columns];
-
- let mut out = String::new();
-
- for row in data.iter() {
- for (i, col) in row.iter().enumerate() {
- column_size[i] = std::cmp::max(column_size[i], col.chars().count());
- }
- }
-
- for row in data.iter() {
- for (col, col_len) in row[..row.len() - 1].iter().zip(column_size.iter()) {
- out.push_str(col);
- (0..col_len - col.chars().count() + 2).for_each(|_| out.push(' '));
- }
- out.push_str(row[row.len() - 1]);
- out.push('\n');
- }
-
- print!("{}", out);
-}
-
pub fn find_matching_node(
cand: impl std::iter::Iterator<Item = Uuid>,
pattern: &str,
) -> Result<Uuid, Error> {
let mut candidates = vec![];
for c in cand {
- if hex::encode(&c).starts_with(&pattern) {
+ if hex::encode(&c).starts_with(&pattern) && !candidates.contains(&c) {
candidates.push(c);
}
}
@@ -222,3 +240,56 @@ pub fn find_matching_node(
Ok(candidates[0])
}
}
+
+pub fn print_worker_info(wi: HashMap<usize, WorkerInfo>, wlo: WorkerListOpt) {
+ let mut wi = wi.into_iter().collect::<Vec<_>>();
+ wi.sort_by_key(|(tid, info)| {
+ (
+ match info.state {
+ WorkerState::Busy | WorkerState::Throttled(_) => 0,
+ WorkerState::Idle => 1,
+ WorkerState::Done => 2,
+ },
+ *tid,
+ )
+ });
+
+ let mut table = vec![];
+ for (tid, info) in wi.iter() {
+ if wlo.busy && !matches!(info.state, WorkerState::Busy | WorkerState::Throttled(_)) {
+ continue;
+ }
+ if wlo.errors && info.errors == 0 {
+ continue;
+ }
+
+ table.push(format!("{}\t{}\t{}", tid, info.state, info.name));
+ if let Some(i) = &info.info {
+ table.push(format!("\t\t {}", i));
+ }
+ let tf = timeago::Formatter::new();
+ let (err_ago, err_msg) = info
+ .last_error
+ .as_ref()
+ .map(|(m, t)| {
+ (
+ tf.convert(Duration::from_millis(now_msec() - t)),
+ m.as_str(),
+ )
+ })
+ .unwrap_or(("(?) ago".into(), "(?)"));
+ if info.consecutive_errors > 0 {
+ table.push(format!(
+ "\t\t {} consecutive errors ({} total), last {}",
+ info.consecutive_errors, info.errors, err_ago,
+ ));
+ table.push(format!("\t\t {}", err_msg));
+ } else if info.errors > 0 {
+ table.push(format!("\t\t ({} errors, last {})", info.errors, err_ago,));
+ if wlo.errors {
+ table.push(format!("\t\t {}", err_msg));
+ }
+ }
+ }
+ format_table(table);
+}
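`print_worker_info` relies on the `format_table` helper that this commit moves from `cli/util.rs` to `garage_util::formater` (the deleted definition is visible above): every `String` is one row, columns are separated by `\t`, and column widths are computed from character counts. A minimal sketch of the calling convention, with made-up rows:

    format_table(vec![
        "TID\tState\tName".to_string(),
        "0\tBusy\tBlock resync worker".to_string(),
        "1\tIdle\tMerkle tree updater".to_string(),
    ]);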
diff --git a/src/garage/main.rs b/src/garage/main.rs
index e898e680..e5cba553 100644
--- a/src/garage/main.rs
+++ b/src/garage/main.rs
@@ -8,6 +8,14 @@ mod admin;
mod cli;
mod repair;
mod server;
+#[cfg(feature = "telemetry-otlp")]
+mod tracing_setup;
+
+#[cfg(not(any(feature = "bundled-libs", feature = "system-libs")))]
+compile_error!("Either bundled-libs or system-libs Cargo feature must be enabled");
+
+#[cfg(all(feature = "bundled-libs", feature = "system-libs"))]
+compile_error!("Only one of bundled-libs and system-libs Cargo features must be enabled");
use std::net::SocketAddr;
use std::path::PathBuf;
@@ -28,7 +36,10 @@ use admin::*;
use cli::*;
#[derive(StructOpt, Debug)]
-#[structopt(name = "garage")]
+#[structopt(
+ name = "garage",
+ about = "S3-compatible object store for self-hosted geo-distributed deployments"
+)]
struct Opt {
/// Host to connect to for admin operations, in the format:
/// <public-key>@<ip>:<port>
@@ -57,20 +68,56 @@ async fn main() {
if std::env::var("RUST_LOG").is_err() {
std::env::set_var("RUST_LOG", "netapp=info,garage=info")
}
- pretty_env_logger::init();
+ tracing_subscriber::fmt()
+ .with_writer(std::io::stderr)
+ .with_env_filter(tracing_subscriber::filter::EnvFilter::from_default_env())
+ .init();
sodiumoxide::init().expect("Unable to init sodiumoxide");
- let opt = Opt::from_args();
+ // Abort on panic (same behavior as in Go)
+ std::panic::set_hook(Box::new(|panic_info| {
+ error!("{}", panic_info.to_string());
+ std::process::abort();
+ }));
+
+ // Initialize version and features info
+ let features = &[
+ #[cfg(feature = "k2v")]
+ "k2v",
+ #[cfg(feature = "sled")]
+ "sled",
+ #[cfg(feature = "lmdb")]
+ "lmdb",
+ #[cfg(feature = "sqlite")]
+ "sqlite",
+ #[cfg(feature = "kubernetes-discovery")]
+ "kubernetes-discovery",
+ #[cfg(feature = "metrics")]
+ "metrics",
+ #[cfg(feature = "telemetry-otlp")]
+ "telemetry-otlp",
+ #[cfg(feature = "bundled-libs")]
+ "bundled-libs",
+ #[cfg(feature = "system-libs")]
+ "system-libs",
+ ][..];
+ if let Some(git_version) = option_env!("GIT_VERSION") {
+ garage_util::version::init_version(git_version);
+ }
+ garage_util::version::init_features(features);
+
+ // Parse arguments
+ let version = format!(
+ "{} [features: {}]",
+ garage_util::version::garage_version(),
+ features.join(", ")
+ );
+ let opt = Opt::from_clap(&Opt::clap().version(version.as_str()).get_matches());
let res = match opt.cmd {
- Command::Server => {
- // Abort on panic (same behavior as in Go)
- std::panic::set_hook(Box::new(|panic_info| {
- error!("{}", panic_info.to_string());
- std::process::abort();
- }));
-
- server::run_server(opt.config_file).await
+ Command::Server => server::run_server(opt.config_file).await,
+ Command::OfflineRepair(repair_opt) => {
+ repair::offline::offline_repair(opt.config_file, repair_opt).await
}
Command::Node(NodeOperation::NodeId(node_id_opt)) => {
node_id_command(opt.config_file, node_id_opt.quiet)
@@ -115,7 +162,13 @@ async fn cli_command(opt: Opt) -> Result<(), Error> {
} else {
let node_id = garage_rpc::system::read_node_id(&config.as_ref().unwrap().metadata_dir)
.err_context(READ_KEY_ERROR)?;
- if let Some(a) = config.as_ref().and_then(|c| c.rpc_public_addr) {
+ if let Some(a) = config.as_ref().and_then(|c| c.rpc_public_addr.as_ref()) {
+ use std::net::ToSocketAddrs;
+ let a = a
+ .to_socket_addrs()
+ .ok_or_message("unable to resolve rpc_public_addr specified in config file")?
+ .next()
+ .ok_or_message("unable to resolve rpc_public_addr specified in config file")?;
(node_id, a)
} else {
let default_addr = SocketAddr::new(
@@ -141,6 +194,7 @@ async fn cli_command(opt: Opt) -> Result<(), Error> {
match cli_command_dispatch(opt.cmd, &system_rpc_endpoint, &admin_rpc_endpoint, id).await {
Err(HelperError::Internal(i)) => Err(Error::Message(format!("Internal error: {}", i))),
Err(HelperError::BadRequest(b)) => Err(Error::Message(b)),
+ Err(e) => Err(Error::Message(format!("{}", e))),
Ok(x) => Ok(x),
}
}
diff --git a/src/garage/repair.rs b/src/garage/repair.rs
deleted file mode 100644
index 3666ca8f..00000000
--- a/src/garage/repair.rs
+++ /dev/null
@@ -1,149 +0,0 @@
-use std::sync::Arc;
-
-use tokio::sync::watch;
-
-use garage_model::block_ref_table::*;
-use garage_model::garage::Garage;
-use garage_model::object_table::*;
-use garage_model::version_table::*;
-use garage_table::*;
-use garage_util::error::Error;
-
-use crate::*;
-
-pub struct Repair {
- pub garage: Arc<Garage>,
-}
-
-impl Repair {
- pub async fn repair_worker(&self, opt: RepairOpt, must_exit: watch::Receiver<bool>) {
- if let Err(e) = self.repair_worker_aux(opt, must_exit).await {
- warn!("Repair worker failed with error: {}", e);
- }
- }
-
- async fn repair_worker_aux(
- &self,
- opt: RepairOpt,
- must_exit: watch::Receiver<bool>,
- ) -> Result<(), Error> {
- match opt.what {
- RepairWhat::Tables => {
- info!("Launching a full sync of tables");
- self.garage.bucket_table.syncer.add_full_sync();
- self.garage.object_table.syncer.add_full_sync();
- self.garage.version_table.syncer.add_full_sync();
- self.garage.block_ref_table.syncer.add_full_sync();
- self.garage.key_table.syncer.add_full_sync();
- }
- RepairWhat::Versions => {
- info!("Repairing the versions table");
- self.repair_versions(&must_exit).await?;
- }
- RepairWhat::BlockRefs => {
- info!("Repairing the block refs table");
- self.repair_block_ref(&must_exit).await?;
- }
- RepairWhat::Blocks => {
- info!("Repairing the stored blocks");
- self.garage
- .block_manager
- .repair_data_store(&must_exit)
- .await?;
- }
- RepairWhat::Scrub { tranquility } => {
- info!("Verifying integrity of stored blocks");
- self.garage
- .block_manager
- .scrub_data_store(&must_exit, tranquility)
- .await?;
- }
- }
- Ok(())
- }
-
- async fn repair_versions(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
- let mut pos = vec![];
-
- while let Some((item_key, item_bytes)) =
- self.garage.version_table.data.store.get_gt(&pos)?
- {
- pos = item_key.to_vec();
-
- let version = rmp_serde::decode::from_read_ref::<_, Version>(item_bytes.as_ref())?;
- if version.deleted.get() {
- continue;
- }
- let object = self
- .garage
- .object_table
- .get(&version.bucket_id, &version.key)
- .await?;
- let version_exists = match object {
- Some(o) => o
- .versions()
- .iter()
- .any(|x| x.uuid == version.uuid && x.state != ObjectVersionState::Aborted),
- None => false,
- };
- if !version_exists {
- info!("Repair versions: marking version as deleted: {:?}", version);
- self.garage
- .version_table
- .insert(&Version::new(
- version.uuid,
- version.bucket_id,
- version.key,
- true,
- ))
- .await?;
- }
-
- if *must_exit.borrow() {
- break;
- }
- }
- Ok(())
- }
-
- async fn repair_block_ref(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
- let mut pos = vec![];
-
- while let Some((item_key, item_bytes)) =
- self.garage.block_ref_table.data.store.get_gt(&pos)?
- {
- pos = item_key.to_vec();
-
- let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(item_bytes.as_ref())?;
- if block_ref.deleted.get() {
- continue;
- }
- let version = self
- .garage
- .version_table
- .get(&block_ref.version, &EmptyKey)
- .await?;
- // The version might not exist if it has been GC'ed
- let ref_exists = version.map(|v| !v.deleted.get()).unwrap_or(false);
- if !ref_exists {
- info!(
- "Repair block ref: marking block_ref as deleted: {:?}",
- block_ref
- );
- self.garage
- .block_ref_table
- .insert(&BlockRef {
- block: block_ref.block,
- version: block_ref.version,
- deleted: true.into(),
- })
- .await?;
- }
-
- if *must_exit.borrow() {
- break;
- }
- }
- Ok(())
- }
-}
diff --git a/src/garage/repair/mod.rs b/src/garage/repair/mod.rs
new file mode 100644
index 00000000..4699ace5
--- /dev/null
+++ b/src/garage/repair/mod.rs
@@ -0,0 +1,2 @@
+pub mod offline;
+pub mod online;
diff --git a/src/garage/repair/offline.rs b/src/garage/repair/offline.rs
new file mode 100644
index 00000000..7760a8bd
--- /dev/null
+++ b/src/garage/repair/offline.rs
@@ -0,0 +1,55 @@
+use std::path::PathBuf;
+
+use tokio::sync::watch;
+
+use garage_util::background::*;
+use garage_util::config::*;
+use garage_util::error::*;
+
+use garage_model::garage::Garage;
+
+use crate::cli::structs::*;
+
+pub async fn offline_repair(config_file: PathBuf, opt: OfflineRepairOpt) -> Result<(), Error> {
+ if !opt.yes {
+ return Err(Error::Message(
+			"Please add the --yes flag to launch the repair operation".into(),
+ ));
+ }
+
+ info!("Loading configuration...");
+ let config = read_config(config_file)?;
+
+ info!("Initializing background runner...");
+ let (done_tx, done_rx) = watch::channel(false);
+ let (background, await_background_done) = BackgroundRunner::new(16, done_rx);
+
+ info!("Initializing Garage main data store...");
+ let garage = Garage::new(config.clone(), background)?;
+
+ info!("Launching repair operation...");
+ match opt.what {
+ #[cfg(feature = "k2v")]
+ OfflineRepairWhat::K2VItemCounters => {
+ garage
+ .k2v
+ .counter_table
+ .offline_recount_all(&garage.k2v.item_table)?;
+ }
+ OfflineRepairWhat::ObjectCounters => {
+ garage
+ .object_counter_table
+ .offline_recount_all(&garage.object_table)?;
+ }
+ }
+
+ info!("Repair operation finished, shutting down Garage internals...");
+ done_tx.send(true).unwrap();
+ drop(garage);
+
+ await_background_done.await?;
+
+ info!("Cleaning up...");
+
+ Ok(())
+}
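As the `OfflineRepairWhat` doc comments state, these repairs must run directly on the server node while the daemon is stopped, since this code path opens the data store itself from the configuration file. Based on the definitions above, an invocation would look like `garage offline-repair --yes object_counters`, or `garage offline-repair --yes k2v_item_counters` on a build with the `k2v` feature.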
diff --git a/src/garage/repair/online.rs b/src/garage/repair/online.rs
new file mode 100644
index 00000000..e33cf097
--- /dev/null
+++ b/src/garage/repair/online.rs
@@ -0,0 +1,215 @@
+use std::sync::Arc;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use tokio::sync::watch;
+
+use garage_block::repair::ScrubWorkerCommand;
+use garage_model::garage::Garage;
+use garage_model::s3::block_ref_table::*;
+use garage_model::s3::object_table::*;
+use garage_model::s3::version_table::*;
+use garage_table::*;
+use garage_util::background::*;
+use garage_util::error::Error;
+
+use crate::*;
+
+pub async fn launch_online_repair(garage: Arc<Garage>, opt: RepairOpt) {
+ match opt.what {
+ RepairWhat::Tables => {
+ info!("Launching a full sync of tables");
+ garage.bucket_table.syncer.add_full_sync();
+ garage.object_table.syncer.add_full_sync();
+ garage.version_table.syncer.add_full_sync();
+ garage.block_ref_table.syncer.add_full_sync();
+ garage.key_table.syncer.add_full_sync();
+ }
+ RepairWhat::Versions => {
+ info!("Repairing the versions table");
+ garage
+ .background
+ .spawn_worker(RepairVersionsWorker::new(garage.clone()));
+ }
+ RepairWhat::BlockRefs => {
+ info!("Repairing the block refs table");
+ garage
+ .background
+ .spawn_worker(RepairBlockrefsWorker::new(garage.clone()));
+ }
+ RepairWhat::Blocks => {
+ info!("Repairing the stored blocks");
+ garage
+ .background
+ .spawn_worker(garage_block::repair::RepairWorker::new(
+ garage.block_manager.clone(),
+ ));
+ }
+ RepairWhat::Scrub { cmd } => {
+ let cmd = match cmd {
+ ScrubCmd::Start => ScrubWorkerCommand::Start,
+ ScrubCmd::Pause => ScrubWorkerCommand::Pause(Duration::from_secs(3600 * 24)),
+ ScrubCmd::Resume => ScrubWorkerCommand::Resume,
+ ScrubCmd::Cancel => ScrubWorkerCommand::Cancel,
+ ScrubCmd::SetTranquility { tranquility } => {
+ ScrubWorkerCommand::SetTranquility(tranquility)
+ }
+ };
+ info!("Sending command to scrub worker: {:?}", cmd);
+ garage.block_manager.send_scrub_command(cmd).await;
+ }
+ }
+}
+
+// ----
+
+struct RepairVersionsWorker {
+ garage: Arc<Garage>,
+ pos: Vec<u8>,
+ counter: usize,
+}
+
+impl RepairVersionsWorker {
+ fn new(garage: Arc<Garage>) -> Self {
+ Self {
+ garage,
+ pos: vec![],
+ counter: 0,
+ }
+ }
+}
+
+#[async_trait]
+impl Worker for RepairVersionsWorker {
+ fn name(&self) -> String {
+ "Version repair worker".into()
+ }
+
+ fn info(&self) -> Option<String> {
+ Some(format!("{} items done", self.counter))
+ }
+
+ async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+ let item_bytes = match self.garage.version_table.data.store.get_gt(&self.pos)? {
+ Some((k, v)) => {
+ self.pos = k;
+ v
+ }
+ None => {
+ info!("repair_versions: finished, done {}", self.counter);
+ return Ok(WorkerState::Done);
+ }
+ };
+
+ self.counter += 1;
+
+ let version = rmp_serde::decode::from_read_ref::<_, Version>(&item_bytes)?;
+ if !version.deleted.get() {
+ let object = self
+ .garage
+ .object_table
+ .get(&version.bucket_id, &version.key)
+ .await?;
+ let version_exists = match object {
+ Some(o) => o
+ .versions()
+ .iter()
+ .any(|x| x.uuid == version.uuid && x.state != ObjectVersionState::Aborted),
+ None => false,
+ };
+ if !version_exists {
+ info!("Repair versions: marking version as deleted: {:?}", version);
+ self.garage
+ .version_table
+ .insert(&Version::new(
+ version.uuid,
+ version.bucket_id,
+ version.key,
+ true,
+ ))
+ .await?;
+ }
+ }
+
+ Ok(WorkerState::Busy)
+ }
+
+ async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
+ unreachable!()
+ }
+}
+
+// ----
+
+struct RepairBlockrefsWorker {
+ garage: Arc<Garage>,
+ pos: Vec<u8>,
+ counter: usize,
+}
+
+impl RepairBlockrefsWorker {
+ fn new(garage: Arc<Garage>) -> Self {
+ Self {
+ garage,
+ pos: vec![],
+ counter: 0,
+ }
+ }
+}
+
+#[async_trait]
+impl Worker for RepairBlockrefsWorker {
+ fn name(&self) -> String {
+ "Block refs repair worker".into()
+ }
+
+ fn info(&self) -> Option<String> {
+ Some(format!("{} items done", self.counter))
+ }
+
+ async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+ let item_bytes = match self.garage.block_ref_table.data.store.get_gt(&self.pos)? {
+ Some((k, v)) => {
+ self.pos = k;
+ v
+ }
+ None => {
+ info!("repair_block_ref: finished, done {}", self.counter);
+ return Ok(WorkerState::Done);
+ }
+ };
+
+ self.counter += 1;
+
+ let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(&item_bytes)?;
+ if !block_ref.deleted.get() {
+ let version = self
+ .garage
+ .version_table
+ .get(&block_ref.version, &EmptyKey)
+ .await?;
+ // The version might not exist if it has been GC'ed
+ let ref_exists = version.map(|v| !v.deleted.get()).unwrap_or(false);
+ if !ref_exists {
+ info!(
+ "Repair block ref: marking block_ref as deleted: {:?}",
+ block_ref
+ );
+ self.garage
+ .block_ref_table
+ .insert(&BlockRef {
+ block: block_ref.block,
+ version: block_ref.version,
+ deleted: true.into(),
+ })
+ .await?;
+ }
+ }
+
+ Ok(WorkerState::Busy)
+ }
+
+ async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
+ unreachable!()
+ }
+}
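
Both workers above make their scans resumable with the same cursor trick: remember the last key seen in self.pos and ask the store for the first entry strictly greater than it, one item per work() call. A self-contained sketch of that pattern over a BTreeMap, which stands in here for the real db store's get_gt:

use std::collections::BTreeMap;
use std::ops::Bound;

// Sketch of the get_gt cursor pattern: fetch the first entry with key > pos,
// then advance pos to that key. Restarting with a saved pos resumes the scan.
fn get_gt<'a>(
    store: &'a BTreeMap<Vec<u8>, Vec<u8>>,
    pos: &[u8],
) -> Option<(&'a Vec<u8>, &'a Vec<u8>)> {
    store
        .range((Bound::Excluded(pos.to_vec()), Bound::Unbounded))
        .next()
}

fn scan_all(store: &BTreeMap<Vec<u8>, Vec<u8>>) {
    let mut pos: Vec<u8> = vec![];
    while let Some((k, v)) = get_gt(store, &pos) {
        pos = k.clone();
        // ... process (k, v), as the repair workers do one item per work() ...
        let _ = v;
    }
}

Because the cursor is just the last key, the scan tolerates concurrent inserts and deletes: it simply continues from wherever it left off in key order.
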
diff --git a/src/garage/server.rs b/src/garage/server.rs
index 58c9e782..d4099a97 100644
--- a/src/garage/server.rs
+++ b/src/garage/server.rs
@@ -6,13 +6,17 @@ use garage_util::background::*;
use garage_util::config::*;
use garage_util::error::Error;
-use garage_admin::metrics::*;
-use garage_admin::tracing_setup::*;
-use garage_api::run_api_server;
+use garage_api::admin::api_server::AdminApiServer;
+use garage_api::s3::api_server::S3ApiServer;
use garage_model::garage::Garage;
-use garage_web::run_web_server;
+use garage_web::WebServer;
+
+#[cfg(feature = "k2v")]
+use garage_api::k2v::api_server::K2VApiServer;
use crate::admin::*;
+#[cfg(feature = "telemetry-otlp")]
+use crate::tracing_setup::*;
async fn wait_from(mut chan: watch::Receiver<bool>) {
while !*chan.borrow() {
@@ -24,79 +28,124 @@ async fn wait_from(mut chan: watch::Receiver<bool>) {
pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
info!("Loading configuration...");
- let config = read_config(config_file).expect("Unable to read config file");
+ let config = read_config(config_file)?;
- info!("Opening database...");
- let mut db_path = config.metadata_dir.clone();
- db_path.push("db");
- let db = sled::Config::default()
- .path(&db_path)
- .cache_capacity(config.sled_cache_capacity)
- .flush_every_ms(Some(config.sled_flush_every_ms))
- .open()
- .expect("Unable to open sled DB");
+ // ---- Initialize Garage internals ----
- info!("Initialize admin web server and metric backend...");
- let admin_server_init = AdminServer::init();
+ #[cfg(feature = "metrics")]
+ let metrics_exporter = opentelemetry_prometheus::exporter().init();
info!("Initializing background runner...");
- let watch_cancel = netapp::util::watch_ctrl_c();
+ let watch_cancel = watch_shutdown_signal();
let (background, await_background_done) = BackgroundRunner::new(16, watch_cancel.clone());
info!("Initializing Garage main data store...");
- let garage = Garage::new(config.clone(), db, background);
+ let garage = Garage::new(config.clone(), background)?;
+
+ if config.admin.trace_sink.is_some() {
+ info!("Initialize tracing...");
- info!("Initialize tracing...");
- if let Some(export_to) = config.admin.trace_sink {
- init_tracing(&export_to, garage.system.id)?;
+ #[cfg(feature = "telemetry-otlp")]
+ init_tracing(config.admin.trace_sink.as_ref().unwrap(), garage.system.id)?;
+
+ #[cfg(not(feature = "telemetry-otlp"))]
+ error!("Garage was built without OTLP exporter, admin.trace_sink is ignored.");
}
+ info!("Initialize Admin API server and metrics collector...");
+ let admin_server = AdminApiServer::new(
+ garage.clone(),
+ #[cfg(feature = "metrics")]
+ metrics_exporter,
+ );
+
+ info!("Launching internal Garage cluster communications...");
let run_system = tokio::spawn(garage.system.clone().run(watch_cancel.clone()));
info!("Create admin RPC handler...");
AdminRpcHandler::new(garage.clone());
- info!("Initializing API server...");
- let api_server = tokio::spawn(run_api_server(
- garage.clone(),
- wait_from(watch_cancel.clone()),
- ));
+ // ---- Launch public-facing API servers ----
+
+ let mut servers = vec![];
+
+ if let Some(s3_bind_addr) = &config.s3_api.api_bind_addr {
+ info!("Initializing S3 API server...");
+ servers.push((
+ "S3 API",
+ tokio::spawn(S3ApiServer::run(
+ garage.clone(),
+ *s3_bind_addr,
+ config.s3_api.s3_region.clone(),
+ wait_from(watch_cancel.clone()),
+ )),
+ ));
+ }
- info!("Initializing web server...");
- let web_server = tokio::spawn(run_web_server(
- garage.clone(),
- wait_from(watch_cancel.clone()),
- ));
-
- let admin_server = if let Some(admin_bind_addr) = config.admin.api_bind_addr {
- info!("Configure and run admin web server...");
- Some(tokio::spawn(
- admin_server_init.run(admin_bind_addr, wait_from(watch_cancel.clone())),
- ))
- } else {
- None
- };
+ if config.k2v_api.is_some() {
+ #[cfg(feature = "k2v")]
+ {
+ info!("Initializing K2V API server...");
+ servers.push((
+ "K2V API",
+ tokio::spawn(K2VApiServer::run(
+ garage.clone(),
+ config.k2v_api.as_ref().unwrap().api_bind_addr,
+ config.s3_api.s3_region.clone(),
+ wait_from(watch_cancel.clone()),
+ )),
+ ));
+ }
+ #[cfg(not(feature = "k2v"))]
+ error!("K2V is not enabled in this build, cannot start K2V API server");
+ }
- // Stuff runs
+ if let Some(web_config) = &config.s3_web {
+ info!("Initializing web server...");
+ servers.push((
+ "Web",
+ tokio::spawn(WebServer::run(
+ garage.clone(),
+ web_config.bind_addr,
+ web_config.root_domain.clone(),
+ wait_from(watch_cancel.clone()),
+ )),
+ ));
+ }
- // When a cancel signal is sent, stuff stops
- if let Err(e) = api_server.await? {
- warn!("API server exited with error: {}", e);
+ if let Some(admin_bind_addr) = &config.admin.api_bind_addr {
+ info!("Launching Admin API server...");
+ servers.push((
+ "Admin",
+ tokio::spawn(admin_server.run(*admin_bind_addr, wait_from(watch_cancel.clone()))),
+ ));
}
- if let Err(e) = web_server.await? {
- warn!("Web server exited with error: {}", e);
+
+ #[cfg(not(feature = "metrics"))]
+ if config.admin.metrics_token.is_some() {
+ warn!("This Garage version is built without the metrics feature");
}
- if let Some(a) = admin_server {
- if let Err(e) = a.await? {
- warn!("Admin web server exited with error: {}", e);
+
+ // Everything is now up and running
+
+ // When the cancel signal fires, each server future resolves
+
+ // Wait for each server to exit and report how it terminated
+ for (desc, join_handle) in servers {
+ if let Err(e) = join_handle.await? {
+ error!("{} server exited with error: {}", desc, e);
+ } else {
+ info!("{} server exited without error.", desc);
}
}
// Remove RPC handlers for system to break reference cycles
garage.system.netapp.drop_all_handlers();
+ opentelemetry::global::shutdown_tracer_provider();
// Await for netapp RPC system to end
run_system.await?;
+ info!("Netapp exited");
// Drop all references so that stuff can terminate properly
drop(garage);
@@ -108,3 +157,44 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
Ok(())
}
+
+#[cfg(unix)]
+fn watch_shutdown_signal() -> watch::Receiver<bool> {
+ use tokio::signal::unix::*;
+
+ let (send_cancel, watch_cancel) = watch::channel(false);
+ tokio::spawn(async move {
+ let mut sigint = signal(SignalKind::interrupt()).expect("Failed to install SIGINT handler");
+ let mut sigterm =
+ signal(SignalKind::terminate()).expect("Failed to install SIGTERM handler");
+ let mut sighup = signal(SignalKind::hangup()).expect("Failed to install SIGHUP handler");
+ tokio::select! {
+ _ = sigint.recv() => info!("Received SIGINT, shutting down."),
+ _ = sigterm.recv() => info!("Received SIGTERM, shutting down."),
+ _ = sighup.recv() => info!("Received SIGHUP, shutting down."),
+ }
+ send_cancel.send(true).unwrap();
+ });
+ watch_cancel
+}
+
+#[cfg(windows)]
+fn watch_shutdown_signal() -> watch::Receiver<bool> {
+ use tokio::signal::windows::*;
+
+ let (send_cancel, watch_cancel) = watch::channel(false);
+ tokio::spawn(async move {
+ let mut sigint = ctrl_c().expect("Failed to install Ctrl-C handler");
+ let mut sigclose = ctrl_close().expect("Failed to install Ctrl-Close handler");
+ let mut siglogoff = ctrl_logoff().expect("Failed to install Ctrl-Logoff handler");
+ let mut sigsdown = ctrl_shutdown().expect("Failed to install Ctrl-Shutdown handler");
+ tokio::select! {
+ _ = sigint.recv() => info!("Received Ctrl-C, shutting down."),
+ _ = sigclose.recv() => info!("Received Ctrl-Close, shutting down."),
+ _ = siglogoff.recv() => info!("Received Ctrl-Logoff, shutting down."),
+ _ = sigsdown.recv() => info!("Received Ctrl-Shutdown, shutting down."),
+ }
+ send_cancel.send(true).unwrap();
+ });
+ watch_cancel
+}
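
Each spawned server pairs watch_shutdown_signal with wait_from (visible as context at the top of this file's diff): the signal handler flips the watch channel to true, and every clone of the receiver then resolves its wait_from future. A hedged sketch of how a server future consumes it; fake_server is illustrative, not a Garage API.

use std::future::Future;

use tokio::sync::watch;

// wait_from as defined in server.rs: resolves once the channel reads true.
async fn wait_from(mut chan: watch::Receiver<bool>) {
    while !*chan.borrow() {
        if chan.changed().await.is_err() {
            return;
        }
    }
}

// Illustrative server: runs until the shutdown future resolves.
async fn fake_server(shutdown: impl Future<Output = ()> + Send) {
    tokio::select! {
        _ = shutdown => { /* stop accepting, return cleanly */ }
    }
}

async fn demo(watch_cancel: watch::Receiver<bool>) {
    let handle = tokio::spawn(fake_server(wait_from(watch_cancel.clone())));
    handle.await.unwrap();
}

One watch channel thus fans out to the S3, K2V, web and admin servers alike, which is why run_server can collect them all in a single loop.
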
diff --git a/src/garage/tests/bucket.rs b/src/garage/tests/bucket.rs
index ff5cc8da..b32af068 100644
--- a/src/garage/tests/bucket.rs
+++ b/src/garage/tests/bucket.rs
@@ -29,8 +29,7 @@ async fn test_bucket_all() {
.unwrap()
.iter()
.filter(|x| x.name.as_ref().is_some())
- .find(|x| x.name.as_ref().unwrap() == "hello")
- .is_some());
+ .any(|x| x.name.as_ref().unwrap() == "hello"));
}
{
// Get its location
@@ -75,13 +74,12 @@ async fn test_bucket_all() {
{
// Check bucket is deleted with List buckets
let r = ctx.client.list_buckets().send().await.unwrap();
- assert!(r
+ assert!(!r
.buckets
.as_ref()
.unwrap()
.iter()
.filter(|x| x.name.as_ref().is_some())
- .find(|x| x.name.as_ref().unwrap() == "hello")
- .is_none());
+ .any(|x| x.name.as_ref().unwrap() == "hello"));
}
}
diff --git a/src/garage/tests/common/client.rs b/src/garage/tests/common/client.rs
index c5ddc6e5..212588b5 100644
--- a/src/garage/tests/common/client.rs
+++ b/src/garage/tests/common/client.rs
@@ -10,7 +10,7 @@ pub fn build_client(instance: &Instance) -> Client {
None,
"garage-integ-test",
);
- let endpoint = Endpoint::immutable(instance.uri());
+ let endpoint = Endpoint::immutable(instance.s3_uri());
let config = Config::builder()
.region(super::REGION)
diff --git a/src/garage/tests/common/custom_requester.rs b/src/garage/tests/common/custom_requester.rs
index 580691a1..1700cc90 100644
--- a/src/garage/tests/common/custom_requester.rs
+++ b/src/garage/tests/common/custom_requester.rs
@@ -17,14 +17,25 @@ use garage_api::signature;
pub struct CustomRequester {
key: Key,
uri: Uri,
+ service: &'static str,
client: Client<HttpConnector>,
}
impl CustomRequester {
- pub fn new(instance: &Instance) -> Self {
+ pub fn new_s3(instance: &Instance) -> Self {
CustomRequester {
key: instance.key.clone(),
- uri: instance.uri(),
+ uri: instance.s3_uri(),
+ service: "s3",
+ client: Client::new(),
+ }
+ }
+
+ pub fn new_k2v(instance: &Instance) -> Self {
+ CustomRequester {
+ key: instance.key.clone(),
+ uri: instance.k2v_uri(),
+ service: "k2v",
client: Client::new(),
}
}
@@ -32,6 +43,7 @@ impl CustomRequester {
pub fn builder(&self, bucket: String) -> RequestBuilder<'_> {
RequestBuilder {
requester: self,
+ service: self.service,
bucket,
method: Method::GET,
path: String::new(),
@@ -47,6 +59,7 @@ impl CustomRequester {
pub struct RequestBuilder<'a> {
requester: &'a CustomRequester,
+ service: &'static str,
bucket: String,
method: Method,
path: String,
@@ -59,13 +72,17 @@ pub struct RequestBuilder<'a> {
}
impl<'a> RequestBuilder<'a> {
+ pub fn service(&mut self, service: &'static str) -> &mut Self {
+ self.service = service;
+ self
+ }
pub fn method(&mut self, method: Method) -> &mut Self {
self.method = method;
self
}
- pub fn path(&mut self, path: String) -> &mut Self {
- self.path = path;
+ pub fn path(&mut self, path: impl ToString) -> &mut Self {
+ self.path = path.to_string();
self
}
@@ -74,16 +91,38 @@ impl<'a> RequestBuilder<'a> {
self
}
+ pub fn query_param<T, U>(&mut self, param: T, value: Option<U>) -> &mut Self
+ where
+ T: ToString,
+ U: ToString,
+ {
+ self.query_params
+ .insert(param.to_string(), value.as_ref().map(ToString::to_string));
+ self
+ }
+
pub fn signed_headers(&mut self, signed_headers: HashMap<String, String>) -> &mut Self {
self.signed_headers = signed_headers;
self
}
+ pub fn signed_header(&mut self, name: impl ToString, value: impl ToString) -> &mut Self {
+ self.signed_headers
+ .insert(name.to_string(), value.to_string());
+ self
+ }
+
pub fn unsigned_headers(&mut self, unsigned_headers: HashMap<String, String>) -> &mut Self {
self.unsigned_headers = unsigned_headers;
self
}
+ pub fn unsigned_header(&mut self, name: impl ToString, value: impl ToString) -> &mut Self {
+ self.unsigned_headers
+ .insert(name.to_string(), value.to_string());
+ self
+ }
+
pub fn body(&mut self, body: Vec<u8>) -> &mut Self {
self.body = body;
self
@@ -106,24 +145,24 @@ impl<'a> RequestBuilder<'a> {
let query = query_param_to_string(&self.query_params);
let (host, path) = if self.vhost_style {
(
- format!("{}.s3.garage", self.bucket),
+ format!("{}.{}.garage", self.bucket, self.service),
format!("{}{}", self.path, query),
)
} else {
(
- "s3.garage".to_owned(),
+ format!("{}.garage", self.service),
format!("{}/{}{}", self.bucket, self.path, query),
)
};
let uri = format!("{}{}", self.requester.uri, path);
let now = Utc::now();
- let scope = signature::compute_scope(&now, super::REGION.as_ref());
+ let scope = signature::compute_scope(&now, super::REGION.as_ref(), self.service);
let mut signer = signature::signing_hmac(
&now,
&self.requester.key.secret,
super::REGION.as_ref(),
- "s3",
+ self.service,
)
.unwrap();
let streaming_signer = signer.clone();
diff --git a/src/garage/tests/common/garage.rs b/src/garage/tests/common/garage.rs
index 88c51501..44d727f9 100644
--- a/src/garage/tests/common/garage.rs
+++ b/src/garage/tests/common/garage.rs
@@ -22,7 +22,9 @@ pub struct Instance {
process: process::Child,
pub path: PathBuf,
pub key: Key,
- pub api_port: u16,
+ pub s3_port: u16,
+ pub k2v_port: u16,
+ pub web_port: u16,
}
impl Instance {
@@ -58,9 +60,12 @@ rpc_secret = "{secret}"
[s3_api]
s3_region = "{region}"
-api_bind_addr = "127.0.0.1:{api_port}"
+api_bind_addr = "127.0.0.1:{s3_port}"
root_domain = ".s3.garage"
+[k2v_api]
+api_bind_addr = "127.0.0.1:{k2v_port}"
+
[s3_web]
bind_addr = "127.0.0.1:{web_port}"
root_domain = ".web.garage"
@@ -72,10 +77,11 @@ api_bind_addr = "127.0.0.1:{admin_port}"
path = path.display(),
secret = GARAGE_TEST_SECRET,
region = super::REGION,
- api_port = port,
- rpc_port = port + 1,
- web_port = port + 2,
- admin_port = port + 3,
+ s3_port = port,
+ k2v_port = port + 1,
+ rpc_port = port + 2,
+ web_port = port + 3,
+ admin_port = port + 4,
);
fs::write(path.join("config.toml"), config).expect("Could not write garage config file");
@@ -88,7 +94,7 @@ api_bind_addr = "127.0.0.1:{admin_port}"
.arg("server")
.stdout(stdout)
.stderr(stderr)
- .env("RUST_LOG", "garage=info,garage_api=debug")
+ .env("RUST_LOG", "garage=info,garage_api=trace")
.spawn()
.expect("Could not start garage");
@@ -96,7 +102,9 @@ api_bind_addr = "127.0.0.1:{admin_port}"
process: child,
path,
key: Key::default(),
- api_port: port,
+ s3_port: port,
+ k2v_port: port + 1,
+ web_port: port + 3,
}
}
@@ -147,8 +155,14 @@ api_bind_addr = "127.0.0.1:{admin_port}"
String::from_utf8(output.stdout).unwrap()
}
- pub fn uri(&self) -> http::Uri {
- format!("http://127.0.0.1:{api_port}", api_port = self.api_port)
+ pub fn s3_uri(&self) -> http::Uri {
+ format!("http://127.0.0.1:{s3_port}", s3_port = self.s3_port)
+ .parse()
+ .expect("Could not build garage endpoint URI")
+ }
+
+ pub fn k2v_uri(&self) -> http::Uri {
+ format!("http://127.0.0.1:{k2v_port}", k2v_port = self.k2v_port)
.parse()
.expect("Could not build garage endpoint URI")
}
diff --git a/src/garage/tests/common/mod.rs b/src/garage/tests/common/mod.rs
index 8f88c731..28874b02 100644
--- a/src/garage/tests/common/mod.rs
+++ b/src/garage/tests/common/mod.rs
@@ -17,18 +17,27 @@ pub struct Context {
pub garage: &'static garage::Instance,
pub client: Client,
pub custom_request: CustomRequester,
+ pub k2v: K2VContext,
+}
+
+pub struct K2VContext {
+ pub request: CustomRequester,
}
impl Context {
fn new() -> Self {
let garage = garage::instance();
let client = client::build_client(garage);
- let custom_request = CustomRequester::new(garage);
+ let custom_request = CustomRequester::new_s3(garage);
+ let k2v_request = CustomRequester::new_k2v(garage);
Context {
garage,
client,
custom_request,
+ k2v: K2VContext {
+ request: k2v_request,
+ },
}
}
diff --git a/src/garage/tests/k2v/batch.rs b/src/garage/tests/k2v/batch.rs
new file mode 100644
index 00000000..acae1910
--- /dev/null
+++ b/src/garage/tests/k2v/batch.rs
@@ -0,0 +1,612 @@
+use std::collections::HashMap;
+
+use crate::common;
+
+use assert_json_diff::assert_json_eq;
+use serde_json::json;
+
+use super::json_body;
+use hyper::Method;
+
+#[tokio::test]
+async fn test_batch() {
+ let ctx = common::context();
+ let bucket = ctx.create_bucket("test-k2v-batch");
+
+ let mut values = HashMap::new();
+ values.insert("a", "initial test 1");
+ values.insert("b", "initial test 2");
+ values.insert("c", "initial test 3");
+ values.insert("d.1", "initial test 4");
+ values.insert("d.2", "initial test 5");
+ values.insert("e", "initial test 6");
+ let mut ct = HashMap::new();
+
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .body(
+ format!(
+ r#"[
+ {{"pk": "root", "sk": "a", "ct": null, "v": "{}"}},
+ {{"pk": "root", "sk": "b", "ct": null, "v": "{}"}},
+ {{"pk": "root", "sk": "c", "ct": null, "v": "{}"}},
+ {{"pk": "root", "sk": "d.1", "ct": null, "v": "{}"}},
+ {{"pk": "root", "sk": "d.2", "ct": null, "v": "{}"}},
+ {{"pk": "root", "sk": "e", "ct": null, "v": "{}"}}
+ ]"#,
+ base64::encode(values.get(&"a").unwrap()),
+ base64::encode(values.get(&"b").unwrap()),
+ base64::encode(values.get(&"c").unwrap()),
+ base64::encode(values.get(&"d.1").unwrap()),
+ base64::encode(values.get(&"d.2").unwrap()),
+ base64::encode(values.get(&"e").unwrap()),
+ )
+ .into_bytes(),
+ )
+ .method(Method::POST)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+
+ for sk in ["a", "b", "c", "d.1", "d.2", "e"] {
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some(sk))
+ .signed_header("accept", "*/*")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/octet-stream"
+ );
+ ct.insert(
+ sk,
+ res.headers()
+ .get("x-garage-causality-token")
+ .unwrap()
+ .to_str()
+ .unwrap()
+ .to_string(),
+ );
+ let res_body = hyper::body::to_bytes(res.into_body())
+ .await
+ .unwrap()
+ .to_vec();
+ assert_eq!(res_body, values.get(sk).unwrap().as_bytes());
+ }
+
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .query_param("search", Option::<&str>::None)
+ .body(
+ br#"[
+ {"partitionKey": "root"},
+ {"partitionKey": "root", "start": "c"},
+ {"partitionKey": "root", "start": "c", "end": "dynamite"},
+ {"partitionKey": "root", "start": "c", "reverse": true, "end": "a"},
+ {"partitionKey": "root", "start": "c", "reverse": true, "end": "azerty"},
+ {"partitionKey": "root", "limit": 1},
+ {"partitionKey": "root", "prefix": "d"}
+ ]"#
+ .to_vec(),
+ )
+ .method(Method::POST)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ let json_res = json_body(res).await;
+ assert_json_eq!(
+ json_res,
+ json!([
+ {
+ "partitionKey": "root",
+ "prefix": null,
+ "start": null,
+ "end": null,
+ "limit": null,
+ "reverse": false,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "a", "ct": ct.get("a").unwrap(), "v": [base64::encode(values.get("a").unwrap())]},
+ {"sk": "b", "ct": ct.get("b").unwrap(), "v": [base64::encode(values.get("b").unwrap())]},
+ {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap())]},
+ {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1").unwrap())]},
+ {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap())]},
+ {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]}
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ {
+ "partitionKey": "root",
+ "prefix": null,
+ "start": "c",
+ "end": null,
+ "limit": null,
+ "reverse": false,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap())]},
+ {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1").unwrap())]},
+ {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap())]},
+ {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]}
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ {
+ "partitionKey": "root",
+ "prefix": null,
+ "start": "c",
+ "end": "dynamite",
+ "limit": null,
+ "reverse": false,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap())]},
+ {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1").unwrap())]},
+ {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap())]},
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ {
+ "partitionKey": "root",
+ "prefix": null,
+ "start": "c",
+ "end": "a",
+ "limit": null,
+ "reverse": true,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap())]},
+ {"sk": "b", "ct": ct.get("b").unwrap(), "v": [base64::encode(values.get("b").unwrap())]},
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ {
+ "partitionKey": "root",
+ "prefix": null,
+ "start": "c",
+ "end": "azerty",
+ "limit": null,
+ "reverse": true,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap())]},
+ {"sk": "b", "ct": ct.get("b").unwrap(), "v": [base64::encode(values.get("b").unwrap())]},
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ {
+ "partitionKey": "root",
+ "prefix": null,
+ "start": null,
+ "end": null,
+ "limit": 1,
+ "reverse": false,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "a", "ct": ct.get("a").unwrap(), "v": [base64::encode(values.get("a").unwrap())]}
+ ],
+ "more": true,
+ "nextStart": "b",
+ },
+ {
+ "partitionKey": "root",
+ "prefix": "d",
+ "start": null,
+ "end": null,
+ "limit": null,
+ "reverse": false,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1").unwrap())]},
+ {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap())]}
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ ])
+ );
+
+ // Insert some new values
+ values.insert("c'", "new test 3");
+ values.insert("d.1'", "new test 4");
+ values.insert("d.2'", "new test 5");
+
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .body(
+ format!(
+ r#"[
+ {{"pk": "root", "sk": "b", "ct": "{}", "v": null}},
+ {{"pk": "root", "sk": "c", "ct": null, "v": "{}"}},
+ {{"pk": "root", "sk": "d.1", "ct": "{}", "v": "{}"}},
+ {{"pk": "root", "sk": "d.2", "ct": null, "v": "{}"}}
+ ]"#,
+ ct.get(&"b").unwrap(),
+ base64::encode(values.get(&"c'").unwrap()),
+ ct.get(&"d.1").unwrap(),
+ base64::encode(values.get(&"d.1'").unwrap()),
+ base64::encode(values.get(&"d.2'").unwrap()),
+ )
+ .into_bytes(),
+ )
+ .method(Method::POST)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+
+ for sk in ["b", "c", "d.1", "d.2"] {
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some(sk))
+ .signed_header("accept", "*/*")
+ .send()
+ .await
+ .unwrap();
+ if sk == "b" {
+ assert_eq!(res.status(), 204);
+ } else {
+ assert_eq!(res.status(), 200);
+ }
+ ct.insert(
+ sk,
+ res.headers()
+ .get("x-garage-causality-token")
+ .unwrap()
+ .to_str()
+ .unwrap()
+ .to_string(),
+ );
+ }
+
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .query_param("search", Option::<&str>::None)
+ .body(
+ br#"[
+ {"partitionKey": "root"},
+ {"partitionKey": "root", "prefix": "d"},
+ {"partitionKey": "root", "prefix": "d.", "end": "d.2"},
+ {"partitionKey": "root", "prefix": "d.", "limit": 1},
+ {"partitionKey": "root", "prefix": "d.", "start": "d.2", "limit": 1},
+ {"partitionKey": "root", "prefix": "d.", "reverse": true},
+ {"partitionKey": "root", "prefix": "d.", "start": "d.2", "reverse": true},
+ {"partitionKey": "root", "prefix": "d.", "limit": 2}
+ ]"#
+ .to_vec(),
+ )
+ .method(Method::POST)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ let json_res = json_body(res).await;
+ assert_json_eq!(
+ json_res,
+ json!([
+ {
+ "partitionKey": "root",
+ "prefix": null,
+ "start": null,
+ "end": null,
+ "limit": null,
+ "reverse": false,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "a", "ct": ct.get("a").unwrap(), "v": [base64::encode(values.get("a").unwrap())]},
+ {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap()), base64::encode(values.get("c'").unwrap())]},
+ {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]},
+ {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]},
+ {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]}
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ {
+ "partitionKey": "root",
+ "prefix": "d",
+ "start": null,
+ "end": null,
+ "limit": null,
+ "reverse": false,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]},
+ {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]},
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ {
+ "partitionKey": "root",
+ "prefix": "d.",
+ "start": null,
+ "end": "d.2",
+ "limit": null,
+ "reverse": false,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]},
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ {
+ "partitionKey": "root",
+ "prefix": "d.",
+ "start": null,
+ "end": null,
+ "limit": 1,
+ "reverse": false,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]},
+ ],
+ "more": true,
+ "nextStart": "d.2",
+ },
+ {
+ "partitionKey": "root",
+ "prefix": "d.",
+ "start": "d.2",
+ "end": null,
+ "limit": 1,
+ "reverse": false,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]},
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ {
+ "partitionKey": "root",
+ "prefix": "d.",
+ "start": null,
+ "end": null,
+ "limit": null,
+ "reverse": true,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]},
+ {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]},
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ {
+ "partitionKey": "root",
+ "prefix": "d.",
+ "start": "d.2",
+ "end": null,
+ "limit": null,
+ "reverse": true,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]},
+ {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]},
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ {
+ "partitionKey": "root",
+ "prefix": "d.",
+ "start": null,
+ "end": null,
+ "limit": 2,
+ "reverse": false,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]},
+ {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]},
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ ])
+ );
+
+ // Test DeleteBatch
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .query_param("delete", Option::<&str>::None)
+ .body(
+ br#"[
+ {"partitionKey": "root", "start": "a", "end": "c"},
+ {"partitionKey": "root", "prefix": "d"}
+ ]"#
+ .to_vec(),
+ )
+ .method(Method::POST)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ let json_res = json_body(res).await;
+ assert_json_eq!(
+ json_res,
+ json!([
+ {
+ "partitionKey": "root",
+ "prefix": null,
+ "start": "a",
+ "end": "c",
+ "singleItem": false,
+ "deletedItems": 1,
+ },
+ {
+ "partitionKey": "root",
+ "prefix": "d",
+ "start": null,
+ "end": null,
+ "singleItem": false,
+ "deletedItems": 2,
+ },
+ ])
+ );
+
+ // Refresh our causality tokens, which now point at tombstones
+ for sk in ["a", "b", "d.1", "d.2"] {
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some(sk))
+ .signed_header("accept", "application/octet-stream")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 204);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/octet-stream"
+ );
+ ct.insert(
+ sk,
+ res.headers()
+ .get("x-garage-causality-token")
+ .unwrap()
+ .to_str()
+ .unwrap()
+ .to_string(),
+ );
+ }
+
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .query_param("search", Option::<&str>::None)
+ .body(
+ br#"[
+ {"partitionKey": "root"},
+ {"partitionKey": "root", "reverse": true},
+ {"partitionKey": "root", "tombstones": true}
+ ]"#
+ .to_vec(),
+ )
+ .method(Method::POST)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ let json_res = json_body(res).await;
+ assert_json_eq!(
+ json_res,
+ json!([
+ {
+ "partitionKey": "root",
+ "prefix": null,
+ "start": null,
+ "end": null,
+ "limit": null,
+ "reverse": false,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap()), base64::encode(values.get("c'").unwrap())]},
+ {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]}
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ {
+ "partitionKey": "root",
+ "prefix": null,
+ "start": null,
+ "end": null,
+ "limit": null,
+ "reverse": true,
+ "conflictsOnly": false,
+ "tombstones": false,
+ "singleItem": false,
+ "items": [
+ {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]},
+ {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap()), base64::encode(values.get("c'").unwrap())]},
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ {
+ "partitionKey": "root",
+ "prefix": null,
+ "start": null,
+ "end": null,
+ "limit": null,
+ "reverse": false,
+ "conflictsOnly": false,
+ "tombstones": true,
+ "singleItem": false,
+ "items": [
+ {"sk": "a", "ct": ct.get("a").unwrap(), "v": [null]},
+ {"sk": "b", "ct": ct.get("b").unwrap(), "v": [null]},
+ {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap()), base64::encode(values.get("c'").unwrap())]},
+ {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [null]},
+ {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [null]},
+ {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]},
+ ],
+ "more": false,
+ "nextStart": null,
+ },
+ ])
+ );
+}
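
The raw request bodies above spell out the K2V batch wire format: each InsertBatch item carries pk, sk, an optional causality token ct, and a base64-encoded value v, where a null v encodes a deletion. A hypothetical helper for building such bodies, matching the strings used in this test (not part of the test suite itself):

use serde_json::{json, Value};

// Hypothetical builder for one InsertBatch item. ct: None means an
// unconditional insert; value: None encodes a deletion, serialized as null.
fn insert_batch_item(pk: &str, sk: &str, ct: Option<&str>, value: Option<&[u8]>) -> Value {
    json!({
        "pk": pk,
        "sk": sk,
        "ct": ct,
        "v": value.map(base64::encode),
    })
}

fn insert_batch_body(items: &[Value]) -> Vec<u8> {
    serde_json::to_vec(items).unwrap()
}

With this, the first item of the test's initial batch would be insert_batch_item("root", "a", None, Some(b"initial test 1")).
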
diff --git a/src/garage/tests/k2v/errorcodes.rs b/src/garage/tests/k2v/errorcodes.rs
new file mode 100644
index 00000000..2fcc45bc
--- /dev/null
+++ b/src/garage/tests/k2v/errorcodes.rs
@@ -0,0 +1,141 @@
+use crate::common;
+
+use hyper::Method;
+
+#[tokio::test]
+async fn test_error_codes() {
+ let ctx = common::context();
+ let bucket = ctx.create_bucket("test-k2v-error-codes");
+
+ // Regular insert should work (code 200)
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .method(Method::PUT)
+ .path("root")
+ .query_param("sort_key", Some("test1"))
+ .body(b"Hello, world!".to_vec())
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+
+ // Insert with trash causality token: invalid request
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .method(Method::PUT)
+ .path("root")
+ .query_param("sort_key", Some("test1"))
+ .signed_header("x-garage-causality-token", "tra$sh")
+ .body(b"Hello, world!".to_vec())
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 400);
+
+ // Search without partition key: invalid request
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .query_param("search", Option::<&str>::None)
+ .body(
+ br#"[
+ {},
+ ]"#
+ .to_vec(),
+ )
+ .method(Method::POST)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 400);
+
+ // Search with start that is not in prefix: invalid request
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .query_param("search", Option::<&str>::None)
+ .body(
+ br#"[
+ {"partition_key": "root", "prefix": "a", "start": "bx"},
+ ]"#
+ .to_vec(),
+ )
+ .method(Method::POST)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 400);
+
+ // Search with invalid json: 400
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .query_param("search", Option::<&str>::None)
+ .body(
+ br#"[
+ {"partition_key": "root"
+ ]"#
+ .to_vec(),
+ )
+ .method(Method::POST)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 400);
+
+ // Batch insert with invalid causality token: 400
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .body(
+ br#"[
+ {"pk": "root", "sk": "a", "ct": "tra$h", "v": "aGVsbG8sIHdvcmxkCg=="}
+ ]"#
+ .to_vec(),
+ )
+ .method(Method::POST)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 400);
+
+ // Batch insert with invalid data: 400
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .body(
+ br#"[
+ {"pk": "root", "sk": "a", "ct": null, "v": "aGVsbG8sIHdvcmx$Cg=="}
+ ]"#
+ .to_vec(),
+ )
+ .method(Method::POST)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 400);
+
+ // Poll with invalid causality token: 400
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("test1"))
+ .query_param("causality_token", Some("tra$h"))
+ .query_param("timeout", Some("10"))
+ .signed_header("accept", "application/octet-stream")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 400);
+}
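
The "start that is not in prefix" case above pins down one validation rule: when both parameters are supplied, start must itself begin with prefix, since a start bound outside the prefix can never match anything. A sketch of that check; the function name is hypothetical, not the server's actual code.

// Hypothetical sketch of the prefix/start consistency check exercised above:
// a start bound outside the requested prefix can never match, so reject it.
fn check_start_in_prefix(prefix: Option<&str>, start: Option<&str>) -> Result<(), &'static str> {
    match (prefix, start) {
        (Some(p), Some(s)) if !s.starts_with(p) => Err("start must fall within prefix"),
        _ => Ok(()),
    }
}
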
diff --git a/src/garage/tests/k2v/item.rs b/src/garage/tests/k2v/item.rs
new file mode 100644
index 00000000..32537336
--- /dev/null
+++ b/src/garage/tests/k2v/item.rs
@@ -0,0 +1,725 @@
+use std::time::Duration;
+
+use crate::common;
+
+use assert_json_diff::assert_json_eq;
+use serde_json::json;
+
+use super::json_body;
+use hyper::Method;
+
+#[tokio::test]
+async fn test_items_and_indices() {
+ let ctx = common::context();
+ let bucket = ctx.create_bucket("test-k2v-item-and-index");
+
+ // ReadIndex -- there should be nothing
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .send()
+ .await
+ .unwrap();
+ let res_body = json_body(res).await;
+ assert_json_eq!(
+ res_body,
+ json!({
+ "prefix": null,
+ "start": null,
+ "end": null,
+ "limit": null,
+ "reverse": false,
+ "partitionKeys": [],
+ "more": false,
+ "nextStart": null
+ })
+ );
+
+ let content2_len = "_: hello universe".len();
+ let content3_len = "_: concurrent value".len();
+
+ for (i, sk) in ["a", "b", "c", "d"].iter().enumerate() {
+ let content = format!("{}: hello world", sk).into_bytes();
+ let content2 = format!("{}: hello universe", sk).into_bytes();
+ let content3 = format!("{}: concurrent value", sk).into_bytes();
+
+ // Put initially, no causality token
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some(sk))
+ .body(content.clone())
+ .method(Method::PUT)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+
+ // Get value back
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some(sk))
+ .signed_header("accept", "*/*")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/octet-stream"
+ );
+ let ct = res
+ .headers()
+ .get("x-garage-causality-token")
+ .unwrap()
+ .to_str()
+ .unwrap()
+ .to_string();
+ let res_body = hyper::body::to_bytes(res.into_body())
+ .await
+ .unwrap()
+ .to_vec();
+ assert_eq!(res_body, content);
+
+ // ReadIndex -- the "root" partition should now appear with up-to-date counters
+ tokio::time::sleep(Duration::from_secs(1)).await;
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .send()
+ .await
+ .unwrap();
+ let res_body = json_body(res).await;
+ assert_json_eq!(
+ res_body,
+ json!({
+ "prefix": null,
+ "start": null,
+ "end": null,
+ "limit": null,
+ "reverse": false,
+ "partitionKeys": [
+ {
+ "pk": "root",
+ "entries": i+1,
+ "conflicts": i,
+ "values": i+i+1,
+ "bytes": i*(content2.len() + content3.len()) + content.len(),
+ }
+ ],
+ "more": false,
+ "nextStart": null
+ })
+ );
+
+ // Put again, this time with causality token
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some(sk))
+ .signed_header("x-garage-causality-token", ct.clone())
+ .body(content2.clone())
+ .method(Method::PUT)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+
+ // Get value back
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some(sk))
+ .signed_header("accept", "*/*")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/octet-stream"
+ );
+ let res_body = hyper::body::to_bytes(res.into_body())
+ .await
+ .unwrap()
+ .to_vec();
+ assert_eq!(res_body, content2);
+
+ // ReadIndex -- the superseded value should be reflected in the byte count
+ tokio::time::sleep(Duration::from_secs(1)).await;
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .send()
+ .await
+ .unwrap();
+ let res_body = json_body(res).await;
+ assert_json_eq!(
+ res_body,
+ json!({
+ "prefix": null,
+ "start": null,
+ "end": null,
+ "limit": null,
+ "reverse": false,
+ "partitionKeys": [
+ {
+ "pk": "root",
+ "entries": i+1,
+ "conflicts": i,
+ "values": i+i+1,
+ "bytes": i*content3.len() + (i+1)*content2.len(),
+ }
+ ],
+ "more": false,
+ "nextStart": null
+ })
+ );
+
+ // Put again with same CT, now we have concurrent values
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some(sk))
+ .signed_header("x-garage-causality-token", ct.clone())
+ .body(content3.clone())
+ .method(Method::PUT)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+
+ // Get value back
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some(sk))
+ .signed_header("accept", "*/*")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/json"
+ );
+ let res_json = json_body(res).await;
+ assert_json_eq!(
+ res_json,
+ [base64::encode(&content2), base64::encode(&content3)]
+ );
+
+ // ReadIndex -- the new conflicting value should be reflected in the counters
+ tokio::time::sleep(Duration::from_secs(1)).await;
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .send()
+ .await
+ .unwrap();
+ let res_body = json_body(res).await;
+ assert_json_eq!(
+ res_body,
+ json!({
+ "prefix": null,
+ "start": null,
+ "end": null,
+ "limit": null,
+ "reverse": false,
+ "partitionKeys": [
+ {
+ "pk": "root",
+ "entries": i+1,
+ "conflicts": i+1,
+ "values": 2*(i+1),
+ "bytes": (i+1)*(content2.len() + content3.len()),
+ }
+ ],
+ "more": false,
+ "nextStart": null
+ })
+ );
+ }
+
+ // Now delete things
+ for (i, sk) in ["a", "b", "c", "d"].iter().enumerate() {
+ // Get value back (we just need the CT)
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some(sk))
+ .signed_header("accept", "*/*")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ let ct = res
+ .headers()
+ .get("x-garage-causality-token")
+ .unwrap()
+ .to_str()
+ .unwrap()
+ .to_string();
+
+ // Delete it
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .method(Method::DELETE)
+ .path("root")
+ .query_param("sort_key", Some(sk))
+ .signed_header("x-garage-causality-token", ct)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 204);
+
+ // ReadIndex -- counters should shrink as entries are deleted
+ tokio::time::sleep(Duration::from_secs(1)).await;
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .send()
+ .await
+ .unwrap();
+ let res_body = json_body(res).await;
+ if i < 3 {
+ assert_json_eq!(
+ res_body,
+ json!({
+ "prefix": null,
+ "start": null,
+ "end": null,
+ "limit": null,
+ "reverse": false,
+ "partitionKeys": [
+ {
+ "pk": "root",
+ "entries": 3-i,
+ "conflicts": 3-i,
+ "values": 2*(3-i),
+ "bytes": (3-i)*(content2_len + content3_len),
+ }
+ ],
+ "more": false,
+ "nextStart": null
+ })
+ );
+ } else {
+ assert_json_eq!(
+ res_body,
+ json!({
+ "prefix": null,
+ "start": null,
+ "end": null,
+ "limit": null,
+ "reverse": false,
+ "partitionKeys": [],
+ "more": false,
+ "nextStart": null
+ })
+ );
+ }
+ }
+}
+
+#[tokio::test]
+async fn test_item_return_format() {
+ let ctx = common::context();
+ let bucket = ctx.create_bucket("test-k2v-item-return-format");
+
+ let single_value = b"A single value".to_vec();
+ let concurrent_value = b"A concurrent value".to_vec();
+
+ // -- Test with a single value --
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .body(single_value.clone())
+ .method(Method::PUT)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+
+ // f0: either
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .signed_header("accept", "*/*")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/octet-stream"
+ );
+ let ct = res
+ .headers()
+ .get("x-garage-causality-token")
+ .unwrap()
+ .to_str()
+ .unwrap()
+ .to_string();
+ let res_body = hyper::body::to_bytes(res.into_body())
+ .await
+ .unwrap()
+ .to_vec();
+ assert_eq!(res_body, single_value);
+
+ // f1: not specified
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/json"
+ );
+ let res_body = json_body(res).await;
+ assert_json_eq!(res_body, json!([base64::encode(&single_value)]));
+
+ // f2: binary
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .signed_header("accept", "application/octet-stream")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/octet-stream"
+ );
+ let res_body = hyper::body::to_bytes(res.into_body())
+ .await
+ .unwrap()
+ .to_vec();
+ assert_eq!(res_body, single_value);
+
+ // f3: json
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .signed_header("accept", "application/json")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/json"
+ );
+ let res_body = json_body(res).await;
+ assert_json_eq!(res_body, json!([base64::encode(&single_value)]));
+
+ // -- Test with a second, concurrent value --
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .body(concurrent_value.clone())
+ .method(Method::PUT)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+
+ // f0: either
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .signed_header("accept", "*/*")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/json"
+ );
+ let res_body = json_body(res).await;
+ assert_json_eq!(
+ res_body,
+ json!([
+ base64::encode(&single_value),
+ base64::encode(&concurrent_value)
+ ])
+ );
+
+ // f1: not specified
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/json"
+ );
+ let res_body = json_body(res).await;
+ assert_json_eq!(
+ res_body,
+ json!([
+ base64::encode(&single_value),
+ base64::encode(&concurrent_value)
+ ])
+ );
+
+ // f2: binary
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .signed_header("accept", "application/octet-stream")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 409); // CONFLICT
+
+ // f3: json
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .signed_header("accept", "application/json")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/json"
+ );
+ let res_body = json_body(res).await;
+ assert_json_eq!(
+ res_body,
+ json!([
+ base64::encode(&single_value),
+ base64::encode(&concurrent_value)
+ ])
+ );
+
+ // -- Delete first value, concurrently with second insert --
+ // -- (we now have a concurrent value and a deletion) --
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .method(Method::DELETE)
+ .signed_header("x-garage-causality-token", ct)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 204);
+
+ // f0: either
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .signed_header("accept", "*/*")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/json"
+ );
+ let res_body = json_body(res).await;
+ assert_json_eq!(res_body, json!([base64::encode(&concurrent_value), null]));
+
+ // f1: not specified
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/json"
+ );
+ let ct = res
+ .headers()
+ .get("x-garage-causality-token")
+ .unwrap()
+ .to_str()
+ .unwrap()
+ .to_string();
+ let res_body = json_body(res).await;
+ assert_json_eq!(res_body, json!([base64::encode(&concurrent_value), null]));
+
+ // f2: binary
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .signed_header("accept", "application/octet-stream")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 409); // CONFLICT
+
+ // f3: json
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .signed_header("accept", "application/json")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/json"
+ );
+ let res_body = json_body(res).await;
+ assert_json_eq!(res_body, json!([base64::encode(&concurrent_value), null]));
+
+ // -- Delete everything --
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .method(Method::DELETE)
+ .signed_header("x-garage-causality-token", ct)
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 204);
+
+ // f0: either
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .signed_header("accept", "*/*")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 204); // NO CONTENT
+
+ // f1: not specified
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/json"
+ );
+ let res_body = json_body(res).await;
+ assert_json_eq!(res_body, json!([null]));
+
+ // f2: binary
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .signed_header("accept", "application/octet-stream")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 204); // NO CONTENT
+
+ // f3: json
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("v1"))
+ .signed_header("accept", "application/json")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+ assert_eq!(
+ res.headers().get("content-type").unwrap().to_str().unwrap(),
+ "application/json"
+ );
+ let res_body = json_body(res).await;
+ assert_json_eq!(res_body, json!([null]));
+}
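
Taken together, the accept variants above define the ReadItem return-format rules: application/json always succeeds and lists every causal version, base64-encoded with null for tombstones; application/octet-stream returns a lone value raw, 204 when only tombstones remain, and 409 Conflict when several versions coexist; */* behaves like octet-stream when that is unambiguous and falls back to JSON otherwise. A hedged sketch of that decision table, reconstructed from the assertions rather than taken from the server code:

// Sketch of ReadItem content negotiation as pinned down by the assertions
// above. `values` holds the current causal versions; None is a tombstone.
enum ItemReply {
    NoContent,                  // 204: only tombstones remain
    Binary(Vec<u8>),            // 200 application/octet-stream, raw value
    Json(Vec<Option<Vec<u8>>>), // 200 application/json (base64 + nulls on the wire)
    Conflict,                   // 409: several versions, raw format requested
}

fn read_item_reply(accept: Option<&str>, values: Vec<Option<Vec<u8>>>) -> ItemReply {
    let live: Vec<Vec<u8>> = values.iter().flatten().cloned().collect();
    match accept {
        Some("application/octet-stream") => {
            if live.is_empty() {
                ItemReply::NoContent
            } else if values.len() == 1 {
                ItemReply::Binary(live[0].clone())
            } else {
                ItemReply::Conflict
            }
        }
        Some("*/*") => {
            if live.is_empty() {
                ItemReply::NoContent
            } else if values.len() == 1 {
                ItemReply::Binary(live[0].clone())
            } else {
                ItemReply::Json(values)
            }
        }
        // "application/json" or no Accept header: always JSON
        _ => ItemReply::Json(values),
    }
}
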
diff --git a/src/garage/tests/k2v/mod.rs b/src/garage/tests/k2v/mod.rs
new file mode 100644
index 00000000..a009460e
--- /dev/null
+++ b/src/garage/tests/k2v/mod.rs
@@ -0,0 +1,18 @@
+pub mod batch;
+pub mod errorcodes;
+pub mod item;
+pub mod poll;
+pub mod simple;
+
+use hyper::{Body, Response};
+
+pub async fn json_body(res: Response<Body>) -> serde_json::Value {
+ let res_body: serde_json::Value = serde_json::from_slice(
+ &hyper::body::to_bytes(res.into_body())
+ .await
+ .unwrap()
+ .to_vec()[..],
+ )
+ .unwrap();
+ res_body
+}
diff --git a/src/garage/tests/k2v/poll.rs b/src/garage/tests/k2v/poll.rs
new file mode 100644
index 00000000..70dc0410
--- /dev/null
+++ b/src/garage/tests/k2v/poll.rs
@@ -0,0 +1,98 @@
+use hyper::Method;
+use std::time::Duration;
+
+use crate::common;
+
+#[tokio::test]
+async fn test_poll() {
+ let ctx = common::context();
+ let bucket = ctx.create_bucket("test-k2v-poll");
+
+ // Write initial value
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .method(Method::PUT)
+ .path("root")
+ .query_param("sort_key", Some("test1"))
+ .body(b"Initial value".to_vec())
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+
+ // Retrieve initial value to get its causality token
+ let res2 = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("test1"))
+ .signed_header("accept", "application/octet-stream")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res2.status(), 200);
+ let ct = res2
+ .headers()
+ .get("x-garage-causality-token")
+ .unwrap()
+ .to_str()
+ .unwrap()
+ .to_string();
+
+ let res2_body = hyper::body::to_bytes(res2.into_body())
+ .await
+ .unwrap()
+ .to_vec();
+ assert_eq!(res2_body, b"Initial value");
+
+ // Start poll operation
+ let poll = {
+ let bucket = bucket.clone();
+ let ct = ct.clone();
+ tokio::spawn(async move {
+ let ctx = common::context();
+ ctx.k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("test1"))
+ .query_param("causality_token", Some(ct))
+ .query_param("timeout", Some("10"))
+ .signed_header("accept", "application/octet-stream")
+ .send()
+ .await
+ })
+ };
+
+ // Write new value that supersedes initial one
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .method(Method::PUT)
+ .path("root")
+ .query_param("sort_key", Some("test1"))
+ .signed_header("x-garage-causality-token", ct)
+ .body(b"New value".to_vec())
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+
+ // Check poll finishes with correct value
+ let poll_res = tokio::select! {
+ _ = tokio::time::sleep(Duration::from_secs(10)) => panic!("poll did not terminate in time"),
+ res = poll => res.unwrap().unwrap(),
+ };
+
+ assert_eq!(poll_res.status(), 200);
+
+ let poll_res_body = hyper::body::to_bytes(poll_res.into_body())
+ .await
+ .unwrap()
+ .to_vec();
+ assert_eq!(poll_res_body, b"New value");
+}
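
The poll request above blocks until a value newer than the supplied causality token exists, then returns it together with a fresh token. A hypothetical client-side watch loop built on that primitive; poll_item stands in for the signed GET with the sort_key, causality_token and timeout query parameters used in this test, and the 304-on-timeout behavior is assumed from the K2V spec rather than exercised here.

// Hypothetical client-side watch loop over PollItem (not part of this suite).
enum PollResult {
    NewValue { value: Vec<u8>, causality_token: String },
    Timeout, // assumed 304 Not Modified on timeout, per the K2V spec
}

// Transport stub so the sketch is self-contained.
async fn poll_item(_ct: &str) -> PollResult {
    PollResult::Timeout
}

async fn watch_item(mut ct: String) {
    loop {
        match poll_item(&ct).await {
            PollResult::NewValue { value, causality_token } => {
                println!("observed new value of {} bytes", value.len());
                // The fresh token becomes the baseline for the next poll.
                ct = causality_token;
            }
            PollResult::Timeout => continue, // nothing newer; poll again
        }
    }
}
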
diff --git a/src/garage/tests/k2v/simple.rs b/src/garage/tests/k2v/simple.rs
new file mode 100644
index 00000000..ae9a8674
--- /dev/null
+++ b/src/garage/tests/k2v/simple.rs
@@ -0,0 +1,40 @@
+use crate::common;
+
+use hyper::Method;
+
+#[tokio::test]
+async fn test_simple() {
+ let ctx = common::context();
+ let bucket = ctx.create_bucket("test-k2v-simple");
+
+ let res = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .method(Method::PUT)
+ .path("root")
+ .query_param("sort_key", Some("test1"))
+ .body(b"Hello, world!".to_vec())
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res.status(), 200);
+
+ let res2 = ctx
+ .k2v
+ .request
+ .builder(bucket.clone())
+ .path("root")
+ .query_param("sort_key", Some("test1"))
+ .signed_header("accept", "application/octet-stream")
+ .send()
+ .await
+ .unwrap();
+ assert_eq!(res2.status(), 200);
+
+ let res2_body = hyper::body::to_bytes(res2.into_body())
+ .await
+ .unwrap()
+ .to_vec();
+ assert_eq!(res2_body, b"Hello, world!");
+}
diff --git a/src/garage/tests/lib.rs b/src/garage/tests/lib.rs
index 8799c395..87be1327 100644
--- a/src/garage/tests/lib.rs
+++ b/src/garage/tests/lib.rs
@@ -3,9 +3,8 @@ mod common;
mod admin;
mod bucket;
-mod list;
-mod multipart;
-mod objects;
-mod simple;
-mod streaming_signature;
-mod website;
+
+mod s3;
+
+#[cfg(feature = "k2v")]
+mod k2v;
diff --git a/src/garage/tests/list.rs b/src/garage/tests/s3/list.rs
index bb03f250..bb03f250 100644
--- a/src/garage/tests/list.rs
+++ b/src/garage/tests/s3/list.rs
diff --git a/src/garage/tests/s3/mod.rs b/src/garage/tests/s3/mod.rs
new file mode 100644
index 00000000..623eb665
--- /dev/null
+++ b/src/garage/tests/s3/mod.rs
@@ -0,0 +1,6 @@
+mod list;
+mod multipart;
+mod objects;
+mod simple;
+mod streaming_signature;
+mod website;
diff --git a/src/garage/tests/multipart.rs b/src/garage/tests/s3/multipart.rs
index 895a2993..895a2993 100644
--- a/src/garage/tests/multipart.rs
+++ b/src/garage/tests/s3/multipart.rs
diff --git a/src/garage/tests/objects.rs b/src/garage/tests/s3/objects.rs
index e1175b81..65f9e867 100644
--- a/src/garage/tests/objects.rs
+++ b/src/garage/tests/s3/objects.rs
@@ -263,4 +263,13 @@ async fn test_deleteobject() {
.unwrap();
assert!(l.contents.is_none());
+
+ // Deleting a non-existing object shouldn't be a problem
+ ctx.client
+ .delete_object()
+ .bucket(&bucket)
+ .key("l-0")
+ .send()
+ .await
+ .unwrap();
}
diff --git a/src/garage/tests/simple.rs b/src/garage/tests/s3/simple.rs
index f54ae9ac..f54ae9ac 100644
--- a/src/garage/tests/simple.rs
+++ b/src/garage/tests/s3/simple.rs
diff --git a/src/garage/tests/streaming_signature.rs b/src/garage/tests/s3/streaming_signature.rs
index c68f7dfc..c68f7dfc 100644
--- a/src/garage/tests/streaming_signature.rs
+++ b/src/garage/tests/s3/streaming_signature.rs
diff --git a/src/garage/tests/website.rs b/src/garage/tests/s3/website.rs
index 963d11ea..0570ac6a 100644
--- a/src/garage/tests/website.rs
+++ b/src/garage/tests/s3/website.rs
@@ -35,10 +35,7 @@ async fn test_website() {
let req = || {
Request::builder()
.method("GET")
- .uri(format!(
- "http://127.0.0.1:{}/",
- common::garage::DEFAULT_PORT + 2
- ))
+ .uri(format!("http://127.0.0.1:{}/", ctx.garage.web_port))
.header("Host", format!("{}.web.garage", BCKT_NAME))
.body(Body::empty())
.unwrap()
@@ -170,10 +167,7 @@ async fn test_website_s3_api() {
{
let req = Request::builder()
.method("GET")
- .uri(format!(
- "http://127.0.0.1:{}/site/",
- common::garage::DEFAULT_PORT + 2
- ))
+ .uri(format!("http://127.0.0.1:{}/site/", ctx.garage.web_port))
.header("Host", format!("{}.web.garage", BCKT_NAME))
.header("Origin", "https://example.com")
.body(Body::empty())
@@ -198,7 +192,7 @@ async fn test_website_s3_api() {
.method("GET")
.uri(format!(
"http://127.0.0.1:{}/wrong.html",
- common::garage::DEFAULT_PORT + 2
+ ctx.garage.web_port
))
.header("Host", format!("{}.web.garage", BCKT_NAME))
.body(Body::empty())
@@ -217,10 +211,7 @@ async fn test_website_s3_api() {
{
let req = Request::builder()
.method("OPTIONS")
- .uri(format!(
- "http://127.0.0.1:{}/site/",
- common::garage::DEFAULT_PORT + 2
- ))
+ .uri(format!("http://127.0.0.1:{}/site/", ctx.garage.web_port))
.header("Host", format!("{}.web.garage", BCKT_NAME))
.header("Origin", "https://example.com")
.header("Access-Control-Request-Method", "PUT")
@@ -244,10 +235,7 @@ async fn test_website_s3_api() {
{
let req = Request::builder()
.method("OPTIONS")
- .uri(format!(
- "http://127.0.0.1:{}/site/",
- common::garage::DEFAULT_PORT + 2
- ))
+ .uri(format!("http://127.0.0.1:{}/site/", ctx.garage.web_port))
.header("Host", format!("{}.web.garage", BCKT_NAME))
.header("Origin", "https://example.com")
.header("Access-Control-Request-Method", "DELETE")
@@ -288,10 +276,7 @@ async fn test_website_s3_api() {
{
let req = Request::builder()
.method("OPTIONS")
- .uri(format!(
- "http://127.0.0.1:{}/site/",
- common::garage::DEFAULT_PORT + 2
- ))
+ .uri(format!("http://127.0.0.1:{}/site/", ctx.garage.web_port))
.header("Host", format!("{}.web.garage", BCKT_NAME))
.header("Origin", "https://example.com")
.header("Access-Control-Request-Method", "PUT")
@@ -319,10 +304,7 @@ async fn test_website_s3_api() {
{
let req = Request::builder()
.method("GET")
- .uri(format!(
- "http://127.0.0.1:{}/site/",
- common::garage::DEFAULT_PORT + 2
- ))
+ .uri(format!("http://127.0.0.1:{}/site/", ctx.garage.web_port))
.header("Host", format!("{}.web.garage", BCKT_NAME))
.body(Body::empty())
.unwrap();
diff --git a/src/admin/tracing_setup.rs b/src/garage/tracing_setup.rs
index 55fc4094..55fc4094 100644
--- a/src/admin/tracing_setup.rs
+++ b/src/garage/tracing_setup.rs
diff --git a/src/k2v-client/Cargo.toml b/src/k2v-client/Cargo.toml
new file mode 100644
index 00000000..0f0b76ae
--- /dev/null
+++ b/src/k2v-client/Cargo.toml
@@ -0,0 +1,37 @@
+[package]
+name = "k2v-client"
+version = "0.0.1"
+authors = ["Trinity Pointard <trinity.pointard@gmail.com>", "Alex Auvolat <alex@adnab.me>"]
+edition = "2018"
+license = "AGPL-3.0"
+description = "Client library for the Garage K2V protocol"
+repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
+readme = "../../README.md"
+
+[dependencies]
+base64 = "0.13.0"
+http = "0.2.6"
+log = "0.4"
+rusoto_core = "0.48.0"
+rusoto_credential = "0.48.0"
+rusoto_signature = "0.48.0"
+serde = "1.0.137"
+serde_json = "1.0.81"
+thiserror = "1.0.31"
+tokio = "1.17.0"
+
+# cli deps
+clap = { version = "3.1.18", optional = true, features = ["derive", "env"] }
+garage_util = { version = "0.8.0", path = "../util", optional = true }
+
+
+[features]
+cli = ["clap", "tokio/fs", "tokio/io-std", "garage_util"]
+
+[lib]
+path = "lib.rs"
+
+[[bin]]
+name = "k2v-cli"
+path = "bin/k2v-cli.rs"
+required-features = ["cli"]
diff --git a/src/k2v-client/README.md b/src/k2v-client/README.md
new file mode 100644
index 00000000..db454805
--- /dev/null
+++ b/src/k2v-client/README.md
@@ -0,0 +1,25 @@
+Example usage:
+```sh
+# all these values can be provided on the cli instead
+export AWS_ACCESS_KEY_ID=GK123456
+export AWS_SECRET_ACCESS_KEY=0123..789
+export AWS_REGION=garage
+export K2V_ENDPOINT=http://172.30.2.1:3903
+export K2V_BUCKET=my-bucket
+
+cargo run --features=cli -- read-range my-partition-key --all
+
+cargo run --features=cli -- insert my-partition-key my-sort-key --text "my string1"
+cargo run --features=cli -- insert my-partition-key my-sort-key --text "my string2"
+cargo run --features=cli -- insert my-partition-key my-sort-key2 --text "my string"
+
+cargo run --features=cli -- read-range my-partition-key --all
+
+causality=$(cargo run --features=cli -- read my-partition-key my-sort-key2 -b | head -n1)
+cargo run --features=cli -- delete my-partition-key my-sort-key2 -c $causality
+
+causality=$(cargo run --features=cli -- read my-partition-key my-sort-key -b | head -n1)
+cargo run --features=cli -- insert my-partition-key my-sort-key --text "my string3" -c $causality
+
+cargo run --features=cli -- read-range my-partition-key --all
+```
diff --git a/src/k2v-client/bin/k2v-cli.rs b/src/k2v-client/bin/k2v-cli.rs
new file mode 100644
index 00000000..925ebeb8
--- /dev/null
+++ b/src/k2v-client/bin/k2v-cli.rs
@@ -0,0 +1,501 @@
+use std::time::Duration;
+
+use k2v_client::*;
+
+use garage_util::formater::format_table;
+
+use rusoto_core::credential::AwsCredentials;
+use rusoto_core::Region;
+
+use clap::{Parser, Subcommand};
+
+/// K2V command line interface
+#[derive(Parser, Debug)]
+#[clap(author, version, about, long_about = None)]
+struct Args {
+ /// Name of the region to use
+ #[clap(short, long, env = "AWS_REGION", default_value = "garage")]
+ region: String,
+ /// URL of the endpoint to connect to
+ #[clap(short, long, env = "K2V_ENDPOINT")]
+ endpoint: String,
+ /// Access key ID
+ #[clap(short, long, env = "AWS_ACCESS_KEY_ID")]
+ key_id: String,
+ /// Secret access key
+ #[clap(short, long, env = "AWS_SECRET_ACCESS_KEY")]
+ secret: String,
+ /// Bucket name
+ #[clap(short, long, env = "K2V_BUCKET")]
+ bucket: String,
+ #[clap(subcommand)]
+ command: Command,
+}
+
+#[derive(Subcommand, Debug)]
+enum Command {
+ /// Insert a single value
+ Insert {
+ /// Partition key to insert to
+ partition_key: String,
+ /// Sort key to insert to
+ sort_key: String,
+ /// Causality of the insertion
+ #[clap(short, long)]
+ causality: Option<String>,
+ /// Value to insert
+ #[clap(flatten)]
+ value: Value,
+ },
+ /// Read a single value
+ Read {
+ /// Partition key to read from
+ partition_key: String,
+ /// Sort key to read from
+ sort_key: String,
+ /// Output formatting
+ #[clap(flatten)]
+ output_kind: ReadOutputKind,
+ },
+ /// Watch changes on a single value
+ Poll {
+ /// Partition key to delete from
+ partition_key: String,
+ /// Sort key to delete from
+ sort_key: String,
+ /// Causality information
+ #[clap(short, long)]
+ causality: String,
+ /// Timeout, in seconds
+ #[clap(short, long)]
+ timeout: Option<u64>,
+ /// Output formatting
+ #[clap(flatten)]
+ output_kind: ReadOutputKind,
+ },
+ /// Delete a single value
+ Delete {
+ /// Partition key to delete from
+ partition_key: String,
+ /// Sort key to delete from
+ sort_key: String,
+ /// Causality information
+ #[clap(short, long)]
+ causality: String,
+ },
+ /// List partition keys
+ ReadIndex {
+ /// Output formatting
+ #[clap(flatten)]
+ output_kind: BatchOutputKind,
+ /// Output only partition keys matching this filter
+ #[clap(flatten)]
+ filter: Filter,
+ },
+ /// Read a range of sort keys
+ ReadRange {
+ /// Partition key to read from
+ partition_key: String,
+ /// Output formatting
+ #[clap(flatten)]
+ output_kind: BatchOutputKind,
+ /// Output only sort keys matching this filter
+ #[clap(flatten)]
+ filter: Filter,
+ },
+ /// Delete a range of sort keys
+ DeleteRange {
+ /// Partition key to delete from
+ partition_key: String,
+ /// Output formatting
+ #[clap(flatten)]
+ output_kind: BatchOutputKind,
+ /// Delete only sort keys matching this filter
+ #[clap(flatten)]
+ filter: Filter,
+ },
+}
+
+/// Where to read a value from
+#[derive(Parser, Debug)]
+#[clap(group = clap::ArgGroup::new("value").multiple(false).required(true))]
+struct Value {
+ /// Read the value from a file; use - to read from stdin
+ #[clap(short, long, group = "value")]
+ file: Option<String>,
+ /// Read a base64 value from the command line
+ #[clap(short, long, group = "value")]
+ b64: Option<String>,
+ /// Read a raw (UTF-8) value from the command line
+ #[clap(short, long, group = "value")]
+ text: Option<String>,
+}
+
+impl Value {
+ async fn to_data(&self) -> Result<Vec<u8>, Error> {
+ if let Some(ref text) = self.text {
+ Ok(text.as_bytes().to_vec())
+ } else if let Some(ref b64) = self.b64 {
+ base64::decode(b64).map_err(|_| Error::Message("invalid base64 input".into()))
+ } else if let Some(ref path) = self.file {
+ use tokio::io::AsyncReadExt;
+ if path == "-" {
+ let mut file = tokio::io::stdin();
+ let mut vec = Vec::new();
+ file.read_to_end(&mut vec).await?;
+ Ok(vec)
+ } else {
+ let mut file = tokio::fs::File::open(path).await?;
+ let mut vec = Vec::new();
+ file.read_to_end(&mut vec).await?;
+ Ok(vec)
+ }
+ } else {
+ unreachable!("Value must have one option set")
+ }
+ }
+}
+
+#[derive(Parser, Debug)]
+#[clap(group = clap::ArgGroup::new("output-kind").multiple(false).required(false))]
+struct ReadOutputKind {
+ /// Base64 output. Conflicts are line-separated; the first line is the causality token
+ #[clap(short, long, group = "output-kind")]
+ b64: bool,
+ /// Raw output. Conflicts generate an error; the causality token is not returned
+ #[clap(short, long, group = "output-kind")]
+ raw: bool,
+ /// Human-formatted output
+ #[clap(short = 'H', long, group = "output-kind")]
+ human: bool,
+ /// JSON-formatted output
+ #[clap(short, long, group = "output-kind")]
+ json: bool,
+}
+
+impl ReadOutputKind {
+ fn display_output(&self, val: CausalValue) -> ! {
+ use std::io::Write;
+ use std::process::exit;
+
+ if self.json {
+ let stdout = std::io::stdout();
+ serde_json::to_writer_pretty(stdout, &val).unwrap();
+ exit(0);
+ }
+
+ if self.raw {
+ let mut val = val.value;
+ if val.len() != 1 {
+ eprintln!(
+ "Raw mode can only read non-concurent values, found {} values, expected 1",
+ val.len()
+ );
+ exit(1);
+ }
+ let val = val.pop().unwrap();
+ match val {
+ K2vValue::Value(v) => {
+ std::io::stdout().write_all(&v).unwrap();
+ exit(0);
+ }
+ K2vValue::Tombstone => {
+ eprintln!("Expected value, found tombstone");
+ exit(2);
+ }
+ }
+ }
+
+ let causality: String = val.causality.into();
+ if self.b64 {
+ println!("{}", causality);
+ for val in val.value {
+ match val {
+ K2vValue::Value(v) => {
+ println!("{}", base64::encode(&v))
+ }
+ K2vValue::Tombstone => {
+ println!();
+ }
+ }
+ }
+ exit(0);
+ }
+
+ // human
+ println!("causality: {}", causality);
+ println!("values:");
+ for val in val.value {
+ match val {
+ K2vValue::Value(v) => {
+ if let Ok(string) = std::str::from_utf8(&v) {
+ println!(" utf-8: {}", string);
+ } else {
+ println!(" base64: {}", base64::encode(&v));
+ }
+ }
+ K2vValue::Tombstone => {
+ println!(" tombstone");
+ }
+ }
+ }
+ exit(0);
+ }
+}
+
+#[derive(Parser, Debug)]
+#[clap(group = clap::ArgGroup::new("output-kind").multiple(false).required(false))]
+struct BatchOutputKind {
+ /// Human-formatted output
+ #[clap(short = 'H', long, group = "output-kind")]
+ human: bool,
+ /// JSON-formatted output
+ #[clap(short, long, group = "output-kind")]
+ json: bool,
+}
+
+/// Filter for batch operations
+#[derive(Parser, Debug)]
+#[clap(group = clap::ArgGroup::new("filter").multiple(true).required(true))]
+struct Filter {
+ /// Match only keys starting with this prefix
+ #[clap(short, long, group = "filter")]
+ prefix: Option<String>,
+ /// Match only keys lexicographically after this key (including this key itself)
+ #[clap(short, long, group = "filter")]
+ start: Option<String>,
+ /// Match only keys lexicographically before this key (excluding this key)
+ #[clap(short, long, group = "filter")]
+ end: Option<String>,
+ /// Only match the first X keys
+ #[clap(short, long)]
+ limit: Option<u64>,
+ /// Return keys in reverse order
+ #[clap(short, long)]
+ reverse: bool,
+ /// Return only keys where conflict happened
+ #[clap(short, long)]
+ conflicts_only: bool,
+ /// Also include keys storing only tombstones
+ #[clap(short, long)]
+ tombstones: bool,
+ /// Return any key
+ #[clap(short, long, group = "filter")]
+ all: bool,
+}
+
+impl Filter {
+ fn k2v_filter(&self) -> k2v_client::Filter<'_> {
+ k2v_client::Filter {
+ start: self.start.as_deref(),
+ end: self.end.as_deref(),
+ prefix: self.prefix.as_deref(),
+ limit: self.limit,
+ reverse: self.reverse,
+ }
+ }
+}
+
+#[tokio::main]
+async fn main() -> Result<(), Error> {
+ let args = Args::parse();
+
+ let region = Region::Custom {
+ name: args.region,
+ endpoint: args.endpoint,
+ };
+
+ let creds = AwsCredentials::new(args.key_id, args.secret, None, None);
+
+ let client = K2vClient::new(region, args.bucket, creds, None)?;
+
+ match args.command {
+ Command::Insert {
+ partition_key,
+ sort_key,
+ causality,
+ value,
+ } => {
+ client
+ .insert_item(
+ &partition_key,
+ &sort_key,
+ value.to_data().await?,
+ causality.map(Into::into),
+ )
+ .await?;
+ }
+ Command::Delete {
+ partition_key,
+ sort_key,
+ causality,
+ } => {
+ client
+ .delete_item(&partition_key, &sort_key, causality.into())
+ .await?;
+ }
+ Command::Read {
+ partition_key,
+ sort_key,
+ output_kind,
+ } => {
+ let res = client.read_item(&partition_key, &sort_key).await?;
+ output_kind.display_output(res);
+ }
+ Command::Poll {
+ partition_key,
+ sort_key,
+ causality,
+ timeout,
+ output_kind,
+ } => {
+ let timeout = timeout.map(Duration::from_secs);
+ let res_opt = client
+ .poll_item(&partition_key, &sort_key, causality.into(), timeout)
+ .await?;
+ if let Some(res) = res_opt {
+ output_kind.display_output(res);
+ } else {
+ println!("Delay expired and value didn't change.");
+ }
+ }
+ Command::ReadIndex {
+ output_kind,
+ filter,
+ } => {
+ if filter.conflicts_only || filter.tombstones {
+ return Err(Error::Message(
+ "conlicts-only and tombstones are invalid for read-index".into(),
+ ));
+ }
+ let res = client.read_index(filter.k2v_filter()).await?;
+ if output_kind.json {
+ let values = res
+ .items
+ .into_iter()
+ .map(|(k, v)| {
+ let mut value = serde_json::to_value(v).unwrap();
+ value
+ .as_object_mut()
+ .unwrap()
+ .insert("sort_key".to_owned(), k.into());
+ value
+ })
+ .collect::<Vec<_>>();
+ let json = serde_json::json!({
+ "next_key": res.next_start,
+ "values": values,
+ });
+
+ let stdout = std::io::stdout();
+ serde_json::to_writer_pretty(stdout, &json).unwrap();
+ } else {
+ if let Some(next) = res.next_start {
+ println!("next key: {}", next);
+ }
+
+ let mut to_print = Vec::new();
+ to_print.push(format!("key:\tentries\tconflicts\tvalues\tbytes"));
+ for (k, v) in res.items {
+ to_print.push(format!(
+ "{}\t{}\t{}\t{}\t{}",
+ k, v.entries, v.conflicts, v.values, v.bytes
+ ));
+ }
+ format_table(to_print);
+ }
+ }
+ Command::ReadRange {
+ partition_key,
+ output_kind,
+ filter,
+ } => {
+ let op = BatchReadOp {
+ partition_key: &partition_key,
+ filter: filter.k2v_filter(),
+ conflicts_only: filter.conflicts_only,
+ tombstones: filter.tombstones,
+ single_item: false,
+ };
+ let mut res = client.read_batch(&[op]).await?;
+ let res = res.pop().unwrap();
+ if output_kind.json {
+ let values = res
+ .items
+ .into_iter()
+ .map(|(k, v)| {
+ let mut value = serde_json::to_value(v).unwrap();
+ value
+ .as_object_mut()
+ .unwrap()
+ .insert("sort_key".to_owned(), k.into());
+ value
+ })
+ .collect::<Vec<_>>();
+ let json = serde_json::json!({
+ "next_key": res.next_start,
+ "values": values,
+ });
+
+ let stdout = std::io::stdout();
+ serde_json::to_writer_pretty(stdout, &json).unwrap();
+ } else {
+ if let Some(next) = res.next_start {
+ println!("next key: {}", next);
+ }
+ for (key, values) in res.items {
+ println!("key: {}", key);
+ let causality: String = values.causality.into();
+ println!("causality: {}", causality);
+ for value in values.value {
+ match value {
+ K2vValue::Value(v) => {
+ if let Ok(string) = std::str::from_utf8(&v) {
+ println!(" value(utf-8): {}", string);
+ } else {
+ println!(" value(base64): {}", base64::encode(&v));
+ }
+ }
+ K2vValue::Tombstone => {
+ println!(" tombstone");
+ }
+ }
+ }
+ }
+ }
+ }
+ Command::DeleteRange {
+ partition_key,
+ output_kind,
+ filter,
+ } => {
+ let op = BatchDeleteOp {
+ partition_key: &partition_key,
+ prefix: filter.prefix.as_deref(),
+ start: filter.start.as_deref(),
+ end: filter.end.as_deref(),
+ single_item: false,
+ };
+ if filter.reverse
+ || filter.conflicts_only
+ || filter.tombstones
+ || filter.limit.is_some()
+ {
+ return Err(Error::Message(
+ "limit, conlicts-only, reverse and tombstones are invalid for delete-range"
+ .into(),
+ ));
+ }
+
+ let res = client.delete_batch(&[op]).await?;
+
+ if output_kind.json {
+ println!("{}", res[0]);
+ } else {
+ println!("deleted {} keys", res[0]);
+ }
+ }
+ }
+
+ Ok(())
+}
diff --git a/src/k2v-client/error.rs b/src/k2v-client/error.rs
new file mode 100644
index 00000000..37c221f2
--- /dev/null
+++ b/src/k2v-client/error.rs
@@ -0,0 +1,29 @@
+use std::borrow::Cow;
+
+use thiserror::Error;
+
+/// Errors returned by this crate
+#[derive(Error, Debug)]
+pub enum Error {
+ #[error("{0}, {1}: {2} (path = {3})")]
+ Remote(
+ http::StatusCode,
+ Cow<'static, str>,
+ Cow<'static, str>,
+ Cow<'static, str>,
+ ),
+ #[error("received invalid response: {0}")]
+ InvalidResponse(Cow<'static, str>),
+ #[error("not found")]
+ NotFound,
+ #[error("io error: {0}")]
+ IoError(#[from] std::io::Error),
+ #[error("rusoto tls error: {0}")]
+ RusotoTls(#[from] rusoto_core::request::TlsError),
+ #[error("rusoto http error: {0}")]
+ RusotoHttp(#[from] rusoto_core::HttpDispatchError),
+ #[error("deserialization error: {0}")]
+ Deserialization(#[from] serde_json::Error),
+ #[error("{0}")]
+ Message(Cow<'static, str>),
+}
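
The error type distinguishes a plain 404 (`Error::NotFound`, returned before any response body is parsed) from structured API errors (`Error::Remote`, carrying status, code, message and path). A hedged sketch of how a caller might use that split; the `config`/`main` keys and the empty-value fallback are illustrative assumptions, not part of the crate:

```rust
use k2v_client::{Error, K2vClient, K2vValue};

// Sketch: treat "no value stored yet" as an empty default, propagate
// everything else. Keys and fallback behaviour are assumptions.
async fn read_or_default(client: &K2vClient) -> Result<Vec<u8>, Error> {
    match client.read_item("config", "main").await {
        Ok(cv) => {
            // Take the first non-tombstone value among possible conflicts.
            for v in cv.value {
                if let K2vValue::Value(bytes) = v {
                    return Ok(bytes);
                }
            }
            Ok(Vec::new())
        }
        Err(Error::NotFound) => Ok(Vec::new()),
        Err(e) => Err(e),
    }
}
```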
diff --git a/src/k2v-client/lib.rs b/src/k2v-client/lib.rs
new file mode 100644
index 00000000..c2606af4
--- /dev/null
+++ b/src/k2v-client/lib.rs
@@ -0,0 +1,611 @@
+use std::collections::BTreeMap;
+use std::time::Duration;
+
+use http::header::{ACCEPT, CONTENT_LENGTH, CONTENT_TYPE};
+use http::status::StatusCode;
+use http::HeaderMap;
+use log::{debug, error};
+
+use rusoto_core::{ByteStream, DispatchSignedRequest, HttpClient};
+use rusoto_credential::AwsCredentials;
+use rusoto_signature::region::Region;
+use rusoto_signature::signature::SignedRequest;
+use serde::de::Error as DeError;
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
+
+use tokio::io::AsyncReadExt;
+
+mod error;
+
+pub use error::Error;
+
+const DEFAULT_TIMEOUT: Duration = Duration::from_secs(5);
+const DEFAULT_POLL_TIMEOUT: Duration = Duration::from_secs(300);
+const SERVICE: &str = "k2v";
+const GARAGE_CAUSALITY_TOKEN: &str = "X-Garage-Causality-Token";
+
+/// Client used to query a K2V server.
+pub struct K2vClient {
+ region: Region,
+ bucket: String,
+ creds: AwsCredentials,
+ client: HttpClient,
+}
+
+impl K2vClient {
+ /// Create a new K2V client.
+ pub fn new(
+ region: Region,
+ bucket: String,
+ creds: AwsCredentials,
+ user_agent: Option<String>,
+ ) -> Result<Self, Error> {
+ let mut client = HttpClient::new()?;
+ if let Some(ua) = user_agent {
+ client.local_agent_prepend(ua);
+ } else {
+ client.local_agent_prepend(format!("k2v/{}", env!("CARGO_PKG_VERSION")));
+ }
+ Ok(K2vClient {
+ region,
+ bucket,
+ creds,
+ client,
+ })
+ }
+
+ /// Perform a ReadItem request, reading the value(s) stored for a single pk+sk.
+ pub async fn read_item(
+ &self,
+ partition_key: &str,
+ sort_key: &str,
+ ) -> Result<CausalValue, Error> {
+ let mut req = SignedRequest::new(
+ "GET",
+ SERVICE,
+ &self.region,
+ &format!("/{}/{}", self.bucket, partition_key),
+ );
+ req.add_param("sort_key", sort_key);
+ req.add_header(ACCEPT, "application/octet-stream, application/json");
+
+ let res = self.dispatch(req, None).await?;
+
+ let causality = res
+ .causality_token
+ .ok_or_else(|| Error::InvalidResponse("missing causality token".into()))?;
+
+ if res.status == StatusCode::NO_CONTENT {
+ return Ok(CausalValue {
+ causality,
+ value: vec![K2vValue::Tombstone],
+ });
+ }
+
+ match res.content_type.as_deref() {
+ Some("application/octet-stream") => Ok(CausalValue {
+ causality,
+ value: vec![K2vValue::Value(res.body)],
+ }),
+ Some("application/json") => {
+ let value = serde_json::from_slice(&res.body)?;
+ Ok(CausalValue { causality, value })
+ }
+ Some(ct) => Err(Error::InvalidResponse(
+ format!("invalid content type: {}", ct).into(),
+ )),
+ None => Err(Error::InvalidResponse("missing content type".into())),
+ }
+ }
+
+ /// Perform a PollItem request, waiting for the value(s) stored for a single pk+sk to be
+ /// updated.
+ pub async fn poll_item(
+ &self,
+ partition_key: &str,
+ sort_key: &str,
+ causality: CausalityToken,
+ timeout: Option<Duration>,
+ ) -> Result<Option<CausalValue>, Error> {
+ let timeout = timeout.unwrap_or(DEFAULT_POLL_TIMEOUT);
+
+ let mut req = SignedRequest::new(
+ "GET",
+ SERVICE,
+ &self.region,
+ &format!("/{}/{}", self.bucket, partition_key),
+ );
+ req.add_param("sort_key", sort_key);
+ req.add_param("causality_token", &causality.0);
+ req.add_param("timeout", &timeout.as_secs().to_string());
+ req.add_header(ACCEPT, "application/octet-stream, application/json");
+
+ let res = self.dispatch(req, Some(timeout + DEFAULT_TIMEOUT)).await?;
+
+ if res.status == StatusCode::NOT_MODIFIED {
+ return Ok(None);
+ }
+
+ let causality = res
+ .causality_token
+ .ok_or_else(|| Error::InvalidResponse("missing causality token".into()))?;
+
+ if res.status == StatusCode::NO_CONTENT {
+ return Ok(Some(CausalValue {
+ causality,
+ value: vec![K2vValue::Tombstone],
+ }));
+ }
+
+ match res.content_type.as_deref() {
+ Some("application/octet-stream") => Ok(Some(CausalValue {
+ causality,
+ value: vec![K2vValue::Value(res.body)],
+ })),
+ Some("application/json") => {
+ let value = serde_json::from_slice(&res.body)?;
+ Ok(Some(CausalValue { causality, value }))
+ }
+ Some(ct) => Err(Error::InvalidResponse(
+ format!("invalid content type: {}", ct).into(),
+ )),
+ None => Err(Error::InvalidResponse("missing content type".into())),
+ }
+ }
+
+ /// Perform an InsertItem request, inserting a value for a single pk+sk.
+ pub async fn insert_item(
+ &self,
+ partition_key: &str,
+ sort_key: &str,
+ value: Vec<u8>,
+ causality: Option<CausalityToken>,
+ ) -> Result<(), Error> {
+ let mut req = SignedRequest::new(
+ "PUT",
+ SERVICE,
+ &self.region,
+ &format!("/{}/{}", self.bucket, partition_key),
+ );
+ req.add_param("sort_key", sort_key);
+ req.set_payload(Some(value));
+
+ if let Some(causality) = causality {
+ req.add_header(GARAGE_CAUSALITY_TOKEN, &causality.0);
+ }
+
+ self.dispatch(req, None).await?;
+ Ok(())
+ }
+
+ /// Perform a DeleteItem request, deleting the value(s) stored for a single pk+sk.
+ pub async fn delete_item(
+ &self,
+ partition_key: &str,
+ sort_key: &str,
+ causality: CausalityToken,
+ ) -> Result<(), Error> {
+ let mut req = SignedRequest::new(
+ "DELETE",
+ SERVICE,
+ &self.region,
+ &format!("/{}/{}", self.bucket, partition_key),
+ );
+ req.add_param("sort_key", sort_key);
+ req.add_header(GARAGE_CAUSALITY_TOKEN, &causality.0);
+
+ self.dispatch(req, None).await?;
+ Ok(())
+ }
+
+ /// Perform a ReadIndex request, listing partition keys which have at least one associated
+ /// sort key, and which match the filter.
+ pub async fn read_index(
+ &self,
+ filter: Filter<'_>,
+ ) -> Result<PaginatedRange<PartitionInfo>, Error> {
+ let mut req =
+ SignedRequest::new("GET", SERVICE, &self.region, &format!("/{}", self.bucket));
+ filter.insert_params(&mut req);
+
+ let res = self.dispatch(req, None).await?;
+
+ let resp: ReadIndexResponse = serde_json::from_slice(&res.body)?;
+
+ let items = resp
+ .partition_keys
+ .into_iter()
+ .map(|ReadIndexItem { pk, info }| (pk, info))
+ .collect();
+
+ Ok(PaginatedRange {
+ items,
+ next_start: resp.next_start,
+ })
+ }
+
+ /// Perform an InsertBatch request, inserting multiple values at once. Note: this operation is
+ /// *not* atomic: it is possible for some sub-operations to fail and others to succeed. In
+ /// that case, the failure is reported.
+ pub async fn insert_batch(&self, operations: &[BatchInsertOp<'_>]) -> Result<(), Error> {
+ let mut req =
+ SignedRequest::new("POST", SERVICE, &self.region, &format!("/{}", self.bucket));
+
+ let payload = serde_json::to_vec(operations)?;
+ req.set_payload(Some(payload));
+ self.dispatch(req, None).await?;
+ Ok(())
+ }
+
+ /// Perform a ReadBatch request, reading multiple values or ranges of values at once.
+ pub async fn read_batch(
+ &self,
+ operations: &[BatchReadOp<'_>],
+ ) -> Result<Vec<PaginatedRange<CausalValue>>, Error> {
+ let mut req =
+ SignedRequest::new("POST", SERVICE, &self.region, &format!("/{}", self.bucket));
+ req.add_param("search", "");
+
+ let payload = serde_json::to_vec(operations)?;
+ req.set_payload(Some(payload));
+ let res = self.dispatch(req, None).await?;
+
+ let resp: Vec<BatchReadResponse> = serde_json::from_slice(&res.body)?;
+
+ Ok(resp
+ .into_iter()
+ .map(|e| PaginatedRange {
+ items: e
+ .items
+ .into_iter()
+ .map(|BatchReadItem { sk, ct, v }| {
+ (
+ sk,
+ CausalValue {
+ causality: ct,
+ value: v,
+ },
+ )
+ })
+ .collect(),
+ next_start: e.next_start,
+ })
+ .collect())
+ }
+
+ /// Perform a DeleteBatch request, deleting multiple values or ranges of values at once, without
+ /// providing causality information.
+ pub async fn delete_batch(&self, operations: &[BatchDeleteOp<'_>]) -> Result<Vec<u64>, Error> {
+ let mut req =
+ SignedRequest::new("POST", SERVICE, &self.region, &format!("/{}", self.bucket));
+ req.add_param("delete", "");
+
+ let payload = serde_json::to_vec(operations)?;
+ req.set_payload(Some(payload));
+ let res = self.dispatch(req, None).await?;
+
+ let resp: Vec<BatchDeleteResponse> = serde_json::from_slice(&res.body)?;
+
+ Ok(resp.into_iter().map(|r| r.deleted_items).collect())
+ }
+
+ async fn dispatch(
+ &self,
+ mut req: SignedRequest,
+ timeout: Option<Duration>,
+ ) -> Result<Response, Error> {
+ req.sign(&self.creds);
+ let mut res = self
+ .client
+ .dispatch(req, Some(timeout.unwrap_or(DEFAULT_TIMEOUT)))
+ .await?;
+
+ let causality_token = res
+ .headers
+ .remove(GARAGE_CAUSALITY_TOKEN)
+ .map(CausalityToken);
+ let content_type = res.headers.remove(CONTENT_TYPE);
+
+ let body = match res.status {
+ StatusCode::OK => read_body(&mut res.headers, res.body).await?,
+ StatusCode::NO_CONTENT => Vec::new(),
+ StatusCode::NOT_FOUND => return Err(Error::NotFound),
+ StatusCode::NOT_MODIFIED => Vec::new(),
+ s => {
+ let err_body = read_body(&mut res.headers, res.body)
+ .await
+ .unwrap_or_default();
+ let err_body_str = std::str::from_utf8(&err_body)
+ .map(String::from)
+ .unwrap_or_else(|_| base64::encode(&err_body));
+
+ if s.is_client_error() || s.is_server_error() {
+ error!("Error response {}: {}", res.status, err_body_str);
+ let err = match serde_json::from_slice::<ErrorResponse>(&err_body) {
+ Ok(err) => Error::Remote(
+ res.status,
+ err.code.into(),
+ err.message.into(),
+ err.path.into(),
+ ),
+ Err(_) => Error::Remote(
+ res.status,
+ "unknown".into(),
+ err_body_str.into(),
+ "?".into(),
+ ),
+ };
+ return Err(err);
+ } else {
+ let msg = format!(
+ "Unexpected response code {}. Response body: {}",
+ res.status, err_body_str
+ );
+ error!("{}", msg);
+ return Err(Error::InvalidResponse(msg.into()));
+ }
+ }
+ };
+ debug!(
+ "Response body: {}",
+ std::str::from_utf8(&body)
+ .map(String::from)
+ .unwrap_or_else(|_| base64::encode(&body))
+ );
+
+ Ok(Response {
+ body,
+ status: res.status,
+ causality_token,
+ content_type,
+ })
+ }
+}
+
+async fn read_body(headers: &mut HeaderMap<String>, body: ByteStream) -> Result<Vec<u8>, Error> {
+ let body_len = headers
+ .get(CONTENT_LENGTH)
+ .and_then(|h| h.parse().ok())
+ .unwrap_or(0);
+ let mut res = Vec::with_capacity(body_len);
+ body.into_async_read().read_to_end(&mut res).await?;
+ Ok(res)
+}
+
+/// An opaque token used to convey causality between operations.
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
+#[serde(transparent)]
+pub struct CausalityToken(String);
+
+impl From<String> for CausalityToken {
+ fn from(v: String) -> Self {
+ CausalityToken(v)
+ }
+}
+
+impl From<CausalityToken> for String {
+ fn from(v: CausalityToken) -> Self {
+ v.0
+ }
+}
+
+/// A value in K2V: either a binary value or a tombstone.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum K2vValue {
+ Tombstone,
+ Value(Vec<u8>),
+}
+
+impl From<Vec<u8>> for K2vValue {
+ fn from(v: Vec<u8>) -> Self {
+ K2vValue::Value(v)
+ }
+}
+
+impl From<Option<Vec<u8>>> for K2vValue {
+ fn from(v: Option<Vec<u8>>) -> Self {
+ match v {
+ Some(v) => K2vValue::Value(v),
+ None => K2vValue::Tombstone,
+ }
+ }
+}
+
+impl<'de> Deserialize<'de> for K2vValue {
+ fn deserialize<D>(d: D) -> Result<Self, D::Error>
+ where
+ D: Deserializer<'de>,
+ {
+ let val: Option<&str> = Option::deserialize(d)?;
+ Ok(match val {
+ Some(s) => {
+ K2vValue::Value(base64::decode(s).map_err(|_| DeError::custom("invalid base64"))?)
+ }
+ None => K2vValue::Tombstone,
+ })
+ }
+}
+
+impl Serialize for K2vValue {
+ fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+ where
+ S: Serializer,
+ {
+ match self {
+ K2vValue::Tombstone => serializer.serialize_none(),
+ K2vValue::Value(v) => {
+ let b64 = base64::encode(v);
+ serializer.serialize_str(&b64)
+ }
+ }
+ }
+}
+
+/// A set of K2vValue and associated causality information.
+#[derive(Debug, Clone, Serialize)]
+pub struct CausalValue {
+ pub causality: CausalityToken,
+ pub value: Vec<K2vValue>,
+}
+
+/// Result of paginated requests.
+#[derive(Debug, Clone)]
+pub struct PaginatedRange<V> {
+ pub items: BTreeMap<String, V>,
+ pub next_start: Option<String>,
+}
+
+/// Filter for batch operations.
+#[derive(Debug, Default, Clone, Deserialize, Serialize)]
+pub struct Filter<'a> {
+ pub start: Option<&'a str>,
+ pub end: Option<&'a str>,
+ pub prefix: Option<&'a str>,
+ pub limit: Option<u64>,
+ #[serde(default)]
+ pub reverse: bool,
+}
+
+impl<'a> Filter<'a> {
+ fn insert_params(&self, req: &mut SignedRequest) {
+ if let Some(start) = &self.start {
+ req.add_param("start", start);
+ }
+ if let Some(end) = &self.end {
+ req.add_param("end", end);
+ }
+ if let Some(prefix) = &self.prefix {
+ req.add_param("prefix", prefix);
+ }
+ if let Some(limit) = &self.limit {
+ req.add_param("limit", &limit.to_string());
+ }
+ if self.reverse {
+ req.add_param("reverse", "true");
+ }
+ }
+}
+
+#[derive(Debug, Clone, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct ReadIndexResponse<'a> {
+ #[serde(flatten, borrow)]
+ #[allow(dead_code)]
+ filter: Filter<'a>,
+ partition_keys: Vec<ReadIndexItem>,
+ #[allow(dead_code)]
+ more: bool,
+ next_start: Option<String>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+struct ReadIndexItem {
+ pk: String,
+ #[serde(flatten)]
+ info: PartitionInfo,
+}
+
+/// Information about data stored with a given partition key.
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct PartitionInfo {
+ pub entries: u64,
+ pub conflicts: u64,
+ pub values: u64,
+ pub bytes: u64,
+}
+
+/// Single sub-operation of an InsertBatch.
+#[derive(Debug, Clone, Serialize)]
+pub struct BatchInsertOp<'a> {
+ #[serde(rename = "pk")]
+ pub partition_key: &'a str,
+ #[serde(rename = "sk")]
+ pub sort_key: &'a str,
+ #[serde(rename = "ct")]
+ pub causality: Option<CausalityToken>,
+ #[serde(rename = "v")]
+ pub value: K2vValue,
+}
+
+/// Single sub-operation of a ReadBatch.
+#[derive(Debug, Default, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub struct BatchReadOp<'a> {
+ pub partition_key: &'a str,
+ #[serde(flatten, borrow)]
+ pub filter: Filter<'a>,
+ #[serde(default)]
+ pub single_item: bool,
+ #[serde(default)]
+ pub conflicts_only: bool,
+ #[serde(default)]
+ pub tombstones: bool,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct BatchReadResponse<'a> {
+ #[serde(flatten, borrow)]
+ #[allow(dead_code)]
+ op: BatchReadOp<'a>,
+ items: Vec<BatchReadItem>,
+ #[allow(dead_code)]
+ more: bool,
+ next_start: Option<String>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+struct BatchReadItem {
+ sk: String,
+ ct: CausalityToken,
+ v: Vec<K2vValue>,
+}
+
+/// Single sub-operation of a DeleteBatch.
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub struct BatchDeleteOp<'a> {
+ pub partition_key: &'a str,
+ pub prefix: Option<&'a str>,
+ pub start: Option<&'a str>,
+ pub end: Option<&'a str>,
+ #[serde(default)]
+ pub single_item: bool,
+}
+
+impl<'a> BatchDeleteOp<'a> {
+ pub fn new(partition_key: &'a str) -> Self {
+ BatchDeleteOp {
+ partition_key,
+ prefix: None,
+ start: None,
+ end: None,
+ single_item: false,
+ }
+ }
+}
+
+#[derive(Debug, Clone, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct BatchDeleteResponse<'a> {
+ #[serde(flatten, borrow)]
+ #[allow(dead_code)]
+ filter: BatchDeleteOp<'a>,
+ deleted_items: u64,
+}
+
+#[derive(Deserialize)]
+struct ErrorResponse {
+ code: String,
+ message: String,
+ #[allow(dead_code)]
+ region: String,
+ path: String,
+}
+
+struct Response {
+ body: Vec<u8>,
+ status: StatusCode,
+ causality_token: Option<CausalityToken>,
+ content_type: Option<String>,
+}
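
Taken together, the new crate's surface is small: construct a `K2vClient` from a rusoto `Region` and `AwsCredentials`, then use the item and batch methods. A minimal sketch of the causality round-trip, using placeholder endpoint, bucket, credentials and keys:

```rust
use k2v_client::{CausalityToken, Error, K2vClient};
use rusoto_core::credential::AwsCredentials;
use rusoto_core::Region;

// Sketch: endpoint, bucket and credentials below are placeholders.
async fn demo() -> Result<(), Error> {
    let region = Region::Custom {
        name: "garage".to_owned(),
        endpoint: "http://127.0.0.1:3903".to_owned(),
    };
    let creds = AwsCredentials::new("GK123456", "secret", None, None);
    let client = K2vClient::new(region, "my-bucket".to_owned(), creds, None)?;

    // First write: no causality token needed.
    client.insert_item("pk", "sk", b"v1".to_vec(), None).await?;

    // Read back; the CausalValue carries the token required to supersede v1.
    let read = client.read_item("pk", "sk").await?;
    let ct: CausalityToken = read.causality;
    client.insert_item("pk", "sk", b"v2".to_vec(), Some(ct)).await?;

    Ok(())
}
```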
diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml
index 007cec89..2c2e2bfe 100644
--- a/src/model/Cargo.toml
+++ b/src/model/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "garage_model"
-version = "0.7.0"
+version = "0.8.0"
authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018"
license = "AGPL-3.0"
@@ -14,22 +14,22 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
-garage_rpc = { version = "0.7.0", path = "../rpc" }
-garage_table = { version = "0.7.0", path = "../table" }
-garage_block = { version = "0.7.0", path = "../block" }
-garage_util = { version = "0.7.0", path = "../util" }
-garage_model_050 = { package = "garage_model", version = "0.5.1" }
+garage_db = { version = "0.8.0", path = "../db" }
+garage_rpc = { version = "0.8.0", path = "../rpc" }
+garage_table = { version = "0.8.0", path = "../table" }
+garage_block = { version = "0.8.0", path = "../block" }
+garage_util = { version = "0.8.0", path = "../util" }
async-trait = "0.1.7"
arc-swap = "1.0"
+blake2 = "0.9"
err-derive = "0.3"
hex = "0.4"
+base64 = "0.13"
tracing = "0.1.30"
rand = "0.8"
zstd = { version = "0.9", default-features = false }
-sled = "0.34"
-
rmp-serde = "0.15"
serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
serde_bytes = "0.11"
@@ -39,6 +39,10 @@ futures-util = "0.3"
tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
opentelemetry = "0.17"
-#netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" }
-#netapp = { version = "0.4", path = "../../../netapp" }
-netapp = "0.4"
+netapp = "0.5"
+
+[features]
+k2v = [ "garage_util/k2v" ]
+lmdb = [ "garage_db/lmdb" ]
+sled = [ "garage_db/sled" ]
+sqlite = [ "garage_db/sqlite" ]
diff --git a/src/model/bucket_alias_table.rs b/src/model/bucket_alias_table.rs
index fce03d04..fcd1536e 100644
--- a/src/model/bucket_alias_table.rs
+++ b/src/model/bucket_alias_table.rs
@@ -7,7 +7,7 @@ use garage_table::*;
/// The bucket alias table holds the names given to buckets
/// in the global namespace.
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct BucketAlias {
name: String,
pub state: crdt::Lww<Option<Uuid>>,
diff --git a/src/model/bucket_table.rs b/src/model/bucket_table.rs
index 7c7b9f30..7be42702 100644
--- a/src/model/bucket_table.rs
+++ b/src/model/bucket_table.rs
@@ -1,6 +1,6 @@
use serde::{Deserialize, Serialize};
-use garage_table::crdt::Crdt;
+use garage_table::crdt::*;
use garage_table::*;
use garage_util::data::*;
use garage_util::time::*;
@@ -12,7 +12,7 @@ use crate::permission::BucketKeyPerm;
/// Its parameters are not directly accessible as:
/// - It must be possible to merge parameters, hence the use of a LWW CRDT.
/// - A bucket has 2 states, Present or Deleted and parameters make sense only if present.
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct Bucket {
/// ID of the bucket
pub id: Uuid,
@@ -21,7 +21,7 @@ pub struct Bucket {
}
/// Configuration for a bucket
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct BucketParams {
/// Bucket's creation date
pub creation_date: u64,
@@ -44,6 +44,9 @@ pub struct BucketParams {
pub website_config: crdt::Lww<Option<WebsiteConfig>>,
/// CORS rules
pub cors_config: crdt::Lww<Option<Vec<CorsRule>>>,
+ /// Bucket quotas
+ #[serde(default)]
+ pub quotas: crdt::Lww<BucketQuotas>,
}
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
@@ -62,6 +65,18 @@ pub struct CorsRule {
pub expose_headers: Vec<String>,
}
+#[derive(Default, PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+pub struct BucketQuotas {
+ /// Maximum size in bytes (bucket size = sum of sizes of objects in the bucket)
+ pub max_size: Option<u64>,
+ /// Maximum number of non-deleted objects in the bucket
+ pub max_objects: Option<u64>,
+}
+
+impl AutoCrdt for BucketQuotas {
+ const WARN_IF_DIFFERENT: bool = true;
+}
+
impl BucketParams {
/// Create an empty BucketParams with no authorized keys and no website access
pub fn new() -> Self {
@@ -72,6 +87,7 @@ impl BucketParams {
local_aliases: crdt::LwwMap::new(),
website_config: crdt::Lww::new(None),
cors_config: crdt::Lww::new(None),
+ quotas: crdt::Lww::new(BucketQuotas::default()),
}
}
}
@@ -86,6 +102,7 @@ impl Crdt for BucketParams {
self.website_config.merge(&o.website_config);
self.cors_config.merge(&o.cors_config);
+ self.quotas.merge(&o.quotas);
}
}
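
The new `quotas` field rides the same `Lww` machinery as `website_config` and `cors_config`: `AutoCrdt` with `WARN_IF_DIFFERENT = true` means two divergent quota settings are reconciled wholesale by timestamp (with a logged warning), not merged field by field. A sketch of that behaviour, assuming `Lww::get()` returns a reference to the current value as it does elsewhere in the codebase:

```rust
use garage_model::bucket_table::BucketQuotas;
use garage_table::crdt::{Crdt, Lww};

// Sketch: last-writer-wins on the whole BucketQuotas struct.
fn quota_merge_example() {
    let older = Lww::raw(1000, BucketQuotas { max_size: Some(1 << 30), max_objects: None });
    let newer = Lww::raw(2000, BucketQuotas { max_size: None, max_objects: Some(10_000) });

    let mut merged = older;
    merged.merge(&newer);
    // The newer timestamp wins wholesale: max_size from `older` is lost.
    assert_eq!(merged.get().max_objects, Some(10_000));
    assert_eq!(merged.get().max_size, None);
}
```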
diff --git a/src/model/garage.rs b/src/model/garage.rs
index abdb920a..75012952 100644
--- a/src/model/garage.rs
+++ b/src/model/garage.rs
@@ -2,8 +2,11 @@ use std::sync::Arc;
use netapp::NetworkKey;
+use garage_db as db;
+
use garage_util::background::*;
use garage_util::config::*;
+use garage_util::error::*;
use garage_rpc::system::System;
@@ -13,13 +16,18 @@ use garage_table::replication::TableFullReplication;
use garage_table::replication::TableShardedReplication;
use garage_table::*;
-use crate::block_ref_table::*;
+use crate::s3::block_ref_table::*;
+use crate::s3::object_table::*;
+use crate::s3::version_table::*;
+
use crate::bucket_alias_table::*;
use crate::bucket_table::*;
use crate::helper;
+use crate::index_counter::*;
use crate::key_table::*;
-use crate::object_table::*;
-use crate::version_table::*;
+
+#[cfg(feature = "k2v")]
+use crate::k2v::{item_table::*, poll::*, rpc::*};
/// An entire Garage full of data
pub struct Garage {
@@ -27,7 +35,7 @@ pub struct Garage {
pub config: Config,
/// The local database
- pub db: sled::Db,
+ pub db: db::Db,
/// A background job runner
pub background: Arc<BackgroundRunner>,
/// The membership manager
@@ -35,21 +43,118 @@ pub struct Garage {
/// The block manager
pub block_manager: Arc<BlockManager>,
- /// Table containing informations about buckets
+ /// Table containing buckets
pub bucket_table: Arc<Table<BucketTable, TableFullReplication>>,
- /// Table containing informations about bucket aliases
+ /// Table containing bucket aliases
pub bucket_alias_table: Arc<Table<BucketAliasTable, TableFullReplication>>,
- /// Table containing informations about api keys
+ /// Table containing api keys
pub key_table: Arc<Table<KeyTable, TableFullReplication>>,
+ /// Table containing S3 objects
pub object_table: Arc<Table<ObjectTable, TableShardedReplication>>,
+ /// Counting table containing object counters
+ pub object_counter_table: Arc<IndexCounter<Object>>,
+ /// Table containing S3 object versions
pub version_table: Arc<Table<VersionTable, TableShardedReplication>>,
+ /// Table containing S3 block references (not blocks themselves)
pub block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>,
+
+ #[cfg(feature = "k2v")]
+ pub k2v: GarageK2V,
+}
+
+#[cfg(feature = "k2v")]
+pub struct GarageK2V {
+ /// Table containing K2V items
+ pub item_table: Arc<Table<K2VItemTable, TableShardedReplication>>,
+ /// Indexing table containing K2V item counters
+ pub counter_table: Arc<IndexCounter<K2VItem>>,
+ /// K2V RPC handler
+ pub rpc: Arc<K2VRpcHandler>,
}
impl Garage {
/// Create and run garage
- pub fn new(config: Config, db: sled::Db, background: Arc<BackgroundRunner>) -> Arc<Self> {
+ pub fn new(config: Config, background: Arc<BackgroundRunner>) -> Result<Arc<Self>, Error> {
+ // Create meta dir and data dir if they don't exist already
+ std::fs::create_dir_all(&config.metadata_dir)
+ .ok_or_message("Unable to create Garage metadata directory")?;
+ std::fs::create_dir_all(&config.data_dir)
+ .ok_or_message("Unable to create Garage data directory")?;
+
+ info!("Opening database...");
+ let mut db_path = config.metadata_dir.clone();
+ let db = match config.db_engine.as_str() {
+ // ---- Sled DB ----
+ #[cfg(feature = "sled")]
+ "sled" => {
+ db_path.push("db");
+ info!("Opening Sled database at: {}", db_path.display());
+ let db = db::sled_adapter::sled::Config::default()
+ .path(&db_path)
+ .cache_capacity(config.sled_cache_capacity)
+ .flush_every_ms(Some(config.sled_flush_every_ms))
+ .open()
+ .expect("Unable to open sled DB");
+ db::sled_adapter::SledDb::init(db)
+ }
+ #[cfg(not(feature = "sled"))]
+ "sled" => return Err(Error::Message("sled db not available in this build".into())),
+ // ---- Sqlite DB ----
+ #[cfg(feature = "sqlite")]
+ "sqlite" | "sqlite3" | "rusqlite" => {
+ db_path.push("db.sqlite");
+ info!("Opening Sqlite database at: {}", db_path.display());
+ let db = db::sqlite_adapter::rusqlite::Connection::open(db_path)
+ .expect("Unable to open sqlite DB");
+ db::sqlite_adapter::SqliteDb::init(db)
+ }
+ #[cfg(not(feature = "sqlite"))]
+ "sqlite" | "sqlite3" | "rusqlite" => {
+ return Err(Error::Message(
+ "sqlite db not available in this build".into(),
+ ))
+ }
+ // ---- LMDB DB ----
+ #[cfg(feature = "lmdb")]
+ "lmdb" | "heed" => {
+ db_path.push("db.lmdb");
+ info!("Opening LMDB database at: {}", db_path.display());
+ std::fs::create_dir_all(&db_path).expect("Unable to create LMDB data directory");
+ let map_size = garage_db::lmdb_adapter::recommended_map_size();
+
+ use db::lmdb_adapter::heed;
+ let mut env_builder = heed::EnvOpenOptions::new();
+ env_builder.max_dbs(100);
+ env_builder.max_readers(500);
+ env_builder.map_size(map_size);
+ unsafe {
+ env_builder.flag(heed::flags::Flags::MdbNoSync);
+ env_builder.flag(heed::flags::Flags::MdbNoMetaSync);
+ }
+ let db = env_builder.open(&db_path).expect("Unable to open LMDB DB");
+ db::lmdb_adapter::LmdbDb::init(db)
+ }
+ #[cfg(not(feature = "lmdb"))]
+ "lmdb" | "heed" => return Err(Error::Message("lmdb db not available in this build".into())),
+ // ---- Unavailable DB engine ----
+ e => {
+ return Err(Error::Message(format!(
+ "Unsupported DB engine: {} (options: {})",
+ e,
+ vec![
+ #[cfg(feature = "sled")]
+ "sled",
+ #[cfg(feature = "sqlite")]
+ "sqlite",
+ #[cfg(feature = "lmdb")]
+ "lmdb",
+ ]
+ .join(", ")
+ )));
+ }
+ };
+
let network_key = NetworkKey::from_slice(
&hex::decode(&config.rpc_secret).expect("Invalid RPC secret key")[..],
)
@@ -64,7 +169,7 @@ impl Garage {
background.clone(),
replication_mode.replication_factor(),
&config,
- );
+ )?;
let data_rep_param = TableShardedReplication {
system: system.clone(),
@@ -90,11 +195,25 @@ impl Garage {
&db,
config.data_dir.clone(),
config.compression_level,
- config.block_manager_background_tranquility,
data_rep_param,
system.clone(),
);
+ // ---- admin tables ----
+ info!("Initialize bucket_table...");
+ let bucket_table = Table::new(BucketTable, control_rep_param.clone(), system.clone(), &db);
+
+ info!("Initialize bucket_alias_table...");
+ let bucket_alias_table = Table::new(
+ BucketAliasTable,
+ control_rep_param.clone(),
+ system.clone(),
+ &db,
+ );
+ info!("Initialize key_table_table...");
+ let key_table = Table::new(KeyTable, control_rep_param, system.clone(), &db);
+
+ // ---- S3 tables ----
info!("Initialize block_ref_table...");
let block_ref_table = Table::new(
BlockRefTable {
@@ -116,34 +235,28 @@ impl Garage {
&db,
);
+ info!("Initialize object counter table...");
+ let object_counter_table = IndexCounter::new(system.clone(), meta_rep_param.clone(), &db);
+
info!("Initialize object_table...");
+ #[allow(clippy::redundant_clone)]
let object_table = Table::new(
ObjectTable {
background: background.clone(),
version_table: version_table.clone(),
+ object_counter_table: object_counter_table.clone(),
},
- meta_rep_param,
- system.clone(),
- &db,
- );
-
- info!("Initialize bucket_table...");
- let bucket_table = Table::new(BucketTable, control_rep_param.clone(), system.clone(), &db);
-
- info!("Initialize bucket_alias_table...");
- let bucket_alias_table = Table::new(
- BucketAliasTable,
- control_rep_param.clone(),
+ meta_rep_param.clone(),
system.clone(),
&db,
);
- info!("Initialize key_table_table...");
- let key_table = Table::new(KeyTable, control_rep_param, system.clone(), &db);
-
- info!("Initialize Garage...");
+ // ---- K2V ----
+ #[cfg(feature = "k2v")]
+ let k2v = GarageK2V::new(system.clone(), &db, meta_rep_param);
- Arc::new(Self {
+ // -- done --
+ Ok(Arc::new(Self {
config,
db,
background,
@@ -153,12 +266,46 @@ impl Garage {
bucket_alias_table,
key_table,
object_table,
+ object_counter_table,
version_table,
block_ref_table,
- })
+ #[cfg(feature = "k2v")]
+ k2v,
+ }))
}
pub fn bucket_helper(&self) -> helper::bucket::BucketHelper {
helper::bucket::BucketHelper(self)
}
+
+ pub fn key_helper(&self) -> helper::key::KeyHelper {
+ helper::key::KeyHelper(self)
+ }
+}
+
+#[cfg(feature = "k2v")]
+impl GarageK2V {
+ fn new(system: Arc<System>, db: &db::Db, meta_rep_param: TableShardedReplication) -> Self {
+ info!("Initialize K2V counter table...");
+ let counter_table = IndexCounter::new(system.clone(), meta_rep_param.clone(), db);
+ info!("Initialize K2V subscription manager...");
+ let subscriptions = Arc::new(SubscriptionManager::new());
+ info!("Initialize K2V item table...");
+ let item_table = Table::new(
+ K2VItemTable {
+ counter_table: counter_table.clone(),
+ subscriptions: subscriptions.clone(),
+ },
+ meta_rep_param,
+ system.clone(),
+ db,
+ );
+ let rpc = K2VRpcHandler::new(system, item_table.clone(), subscriptions);
+
+ Self {
+ item_table,
+ counter_table,
+ rpc,
+ }
+ }
}
diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs
index 706faf26..130ba5be 100644
--- a/src/model/helper/bucket.rs
+++ b/src/model/helper/bucket.rs
@@ -1,15 +1,18 @@
-use garage_table::util::EmptyKey;
use garage_util::crdt::*;
use garage_util::data::*;
use garage_util::error::{Error as GarageError, OkOrMessage};
use garage_util::time::*;
+use garage_table::util::*;
+
use crate::bucket_alias_table::*;
use crate::bucket_table::*;
use crate::garage::Garage;
use crate::helper::error::*;
-use crate::key_table::{Key, KeyFilter};
+use crate::helper::key::KeyHelper;
+use crate::key_table::*;
use crate::permission::BucketKeyPerm;
+use crate::s3::object_table::ObjectFilter;
pub struct BucketHelper<'a>(pub(crate) &'a Garage);
@@ -49,6 +52,23 @@ impl<'a> BucketHelper<'a> {
}
}
+ #[allow(clippy::ptr_arg)]
+ pub async fn resolve_bucket(&self, bucket_name: &String, api_key: &Key) -> Result<Uuid, Error> {
+ let api_key_params = api_key
+ .state
+ .as_option()
+ .ok_or_message("Key should not be deleted at this point")?;
+
+ if let Some(Some(bucket_id)) = api_key_params.local_aliases.get(bucket_name) {
+ Ok(*bucket_id)
+ } else {
+ Ok(self
+ .resolve_global_bucket_name(bucket_name)
+ .await?
+ .ok_or_else(|| Error::NoSuchBucket(bucket_name.to_string()))?)
+ }
+ }
+
/// Returns a Bucket if it is present in bucket table,
/// even if it is in deleted state. Querying a non-existing
/// bucket ID returns an internal error.
@@ -71,63 +91,7 @@ impl<'a> BucketHelper<'a> {
.get(&EmptyKey, &bucket_id)
.await?
.filter(|b| !b.is_deleted())
- .ok_or_bad_request(format!(
- "Bucket {:?} does not exist or has been deleted",
- bucket_id
- ))
- }
-
- /// Returns a Key if it is present in key table,
- /// even if it is in deleted state. Querying a non-existing
- /// key ID returns an internal error.
- pub async fn get_internal_key(&self, key_id: &String) -> Result<Key, Error> {
- Ok(self
- .0
- .key_table
- .get(&EmptyKey, key_id)
- .await?
- .ok_or_message(format!("Key {} does not exist", key_id))?)
- }
-
- /// Returns a Key if it is present in key table,
- /// only if it is in non-deleted state.
- /// Querying a non-existing key ID or a deleted key
- /// returns a bad request error.
- pub async fn get_existing_key(&self, key_id: &String) -> Result<Key, Error> {
- self.0
- .key_table
- .get(&EmptyKey, key_id)
- .await?
- .filter(|b| !b.state.is_deleted())
- .ok_or_bad_request(format!("Key {} does not exist or has been deleted", key_id))
- }
-
- /// Returns a Key if it is present in key table,
- /// looking it up by key ID or by a match on its name,
- /// only if it is in non-deleted state.
- /// Querying a non-existing key ID or a deleted key
- /// returns a bad request error.
- pub async fn get_existing_matching_key(&self, pattern: &str) -> Result<Key, Error> {
- let candidates = self
- .0
- .key_table
- .get_range(
- &EmptyKey,
- None,
- Some(KeyFilter::MatchesAndNotDeleted(pattern.to_string())),
- 10,
- )
- .await?
- .into_iter()
- .collect::<Vec<_>>();
- if candidates.len() != 1 {
- Err(Error::BadRequest(format!(
- "{} matching keys",
- candidates.len()
- )))
- } else {
- Ok(candidates.into_iter().next().unwrap())
- }
+ .ok_or_else(|| Error::NoSuchBucket(hex::encode(bucket_id)))
}
/// Sets a new alias for a bucket in global namespace.
@@ -141,10 +105,7 @@ impl<'a> BucketHelper<'a> {
alias_name: &String,
) -> Result<(), Error> {
if !is_valid_bucket_name(alias_name) {
- return Err(Error::BadRequest(format!(
- "{}: {}",
- alias_name, INVALID_BUCKET_NAME_MESSAGE
- )));
+ return Err(Error::InvalidBucketName(alias_name.to_string()));
}
let mut bucket = self.get_existing_bucket(bucket_id).await?;
@@ -175,7 +136,7 @@ impl<'a> BucketHelper<'a> {
let alias = match alias {
None => BucketAlias::new(alias_name.clone(), alias_ts, Some(bucket_id))
- .ok_or_bad_request(format!("{}: {}", alias_name, INVALID_BUCKET_NAME_MESSAGE))?,
+ .ok_or_else(|| Error::InvalidBucketName(alias_name.clone()))?,
Some(mut a) => {
a.state = Lww::raw(alias_ts, Some(bucket_id));
a
@@ -263,7 +224,7 @@ impl<'a> BucketHelper<'a> {
.bucket_alias_table
.get(&EmptyKey, alias_name)
.await?
- .ok_or_message(format!("Alias {} not found", alias_name))?;
+ .ok_or_else(|| Error::NoSuchBucket(alias_name.to_string()))?;
// Checks ok, remove alias
let alias_ts = match bucket.state.as_option() {
@@ -302,15 +263,14 @@ impl<'a> BucketHelper<'a> {
key_id: &String,
alias_name: &String,
) -> Result<(), Error> {
+ let key_helper = KeyHelper(self.0);
+
if !is_valid_bucket_name(alias_name) {
- return Err(Error::BadRequest(format!(
- "{}: {}",
- alias_name, INVALID_BUCKET_NAME_MESSAGE
- )));
+ return Err(Error::InvalidBucketName(alias_name.to_string()));
}
let mut bucket = self.get_existing_bucket(bucket_id).await?;
- let mut key = self.get_existing_key(key_id).await?;
+ let mut key = key_helper.get_existing_key(key_id).await?;
let mut key_param = key.state.as_option_mut().unwrap();
@@ -359,8 +319,10 @@ impl<'a> BucketHelper<'a> {
key_id: &String,
alias_name: &String,
) -> Result<(), Error> {
+ let key_helper = KeyHelper(self.0);
+
let mut bucket = self.get_existing_bucket(bucket_id).await?;
- let mut key = self.get_existing_key(key_id).await?;
+ let mut key = key_helper.get_existing_key(key_id).await?;
let mut bucket_p = bucket.state.as_option_mut().unwrap();
@@ -428,8 +390,10 @@ impl<'a> BucketHelper<'a> {
key_id: &String,
mut perm: BucketKeyPerm,
) -> Result<(), Error> {
+ let key_helper = KeyHelper(self.0);
+
let mut bucket = self.get_internal_bucket(bucket_id).await?;
- let mut key = self.get_internal_key(key_id).await?;
+ let mut key = key_helper.get_internal_key(key_id).await?;
if let Some(bstate) = bucket.state.as_option() {
if let Some(kp) = bstate.authorized_keys.get(key_id) {
@@ -465,4 +429,47 @@ impl<'a> BucketHelper<'a> {
Ok(())
}
+
+ pub async fn is_bucket_empty(&self, bucket_id: Uuid) -> Result<bool, Error> {
+ let objects = self
+ .0
+ .object_table
+ .get_range(
+ &bucket_id,
+ None,
+ Some(ObjectFilter::IsData),
+ 10,
+ EnumerationOrder::Forward,
+ )
+ .await?;
+ if !objects.is_empty() {
+ return Ok(false);
+ }
+
+ #[cfg(feature = "k2v")]
+ {
+ use garage_rpc::ring::Ring;
+ use std::sync::Arc;
+
+ let ring: Arc<Ring> = self.0.system.ring.borrow().clone();
+ let k2vindexes = self
+ .0
+ .k2v
+ .counter_table
+ .table
+ .get_range(
+ &bucket_id,
+ None,
+ Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())),
+ 10,
+ EnumerationOrder::Forward,
+ )
+ .await?;
+ if !k2vindexes.is_empty() {
+ return Ok(false);
+ }
+ }
+
+ Ok(true)
+ }
}
diff --git a/src/model/helper/error.rs b/src/model/helper/error.rs
index 30b2ba32..3ca8f55c 100644
--- a/src/model/helper/error.rs
+++ b/src/model/helper/error.rs
@@ -10,6 +10,16 @@ pub enum Error {
#[error(display = "Bad request: {}", _0)]
BadRequest(String),
+
+ /// Bucket name is not valid according to AWS S3 specs
+ #[error(display = "Invalid bucket name: {}", _0)]
+ InvalidBucketName(String),
+
+ #[error(display = "Access key not found: {}", _0)]
+ NoSuchAccessKey(String),
+
+ #[error(display = "Bucket not found: {}", _0)]
+ NoSuchBucket(String),
}
impl From<netapp::error::Error> for Error {
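
These new variants exist so that callers can match on the failure kind instead of parsing `BadRequest` strings. A hedged sketch (an assumption, not the actual mapping, which lives in the api crate) of how an API layer might translate them to HTTP status codes:

```rust
use garage_model::helper::error::Error;

// Sketch only: hypothetical status mapping for the new variants.
fn status_code(e: &Error) -> u16 {
    match e {
        Error::NoSuchBucket(_) | Error::NoSuchAccessKey(_) => 404,
        Error::InvalidBucketName(_) | Error::BadRequest(_) => 400,
        _ => 500, // internal and RPC errors
    }
}
```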
diff --git a/src/model/helper/key.rs b/src/model/helper/key.rs
new file mode 100644
index 00000000..c1a8e974
--- /dev/null
+++ b/src/model/helper/key.rs
@@ -0,0 +1,102 @@
+use garage_table::util::*;
+use garage_util::crdt::*;
+use garage_util::error::OkOrMessage;
+
+use crate::garage::Garage;
+use crate::helper::bucket::BucketHelper;
+use crate::helper::error::*;
+use crate::key_table::{Key, KeyFilter};
+use crate::permission::BucketKeyPerm;
+
+pub struct KeyHelper<'a>(pub(crate) &'a Garage);
+
+#[allow(clippy::ptr_arg)]
+impl<'a> KeyHelper<'a> {
+ /// Returns a Key if it is present in key table,
+ /// even if it is in deleted state. Querying a non-existing
+ /// key ID returns an internal error.
+ pub async fn get_internal_key(&self, key_id: &String) -> Result<Key, Error> {
+ Ok(self
+ .0
+ .key_table
+ .get(&EmptyKey, key_id)
+ .await?
+ .ok_or_message(format!("Key {} does not exist", key_id))?)
+ }
+
+ /// Returns a Key if it is present in key table,
+ /// only if it is in non-deleted state.
+ /// Querying a non-existing key ID or a deleted key
+ /// returns a bad request error.
+ pub async fn get_existing_key(&self, key_id: &String) -> Result<Key, Error> {
+ self.0
+ .key_table
+ .get(&EmptyKey, key_id)
+ .await?
+ .filter(|b| !b.state.is_deleted())
+ .ok_or_else(|| Error::NoSuchAccessKey(key_id.to_string()))
+ }
+
+	/// Returns a Key only if it is present in the key table
+	/// and in non-deleted state, looking it up either by key ID
+	/// or by a match on its name.
+	/// If the pattern does not match exactly one existing key,
+	/// a bad request error is returned.
+ pub async fn get_existing_matching_key(&self, pattern: &str) -> Result<Key, Error> {
+ let candidates = self
+ .0
+ .key_table
+ .get_range(
+ &EmptyKey,
+ None,
+ Some(KeyFilter::MatchesAndNotDeleted(pattern.to_string())),
+ 10,
+ EnumerationOrder::Forward,
+ )
+ .await?
+ .into_iter()
+ .collect::<Vec<_>>();
+ if candidates.len() != 1 {
+ Err(Error::BadRequest(format!(
+ "{} matching keys",
+ candidates.len()
+ )))
+ } else {
+ Ok(candidates.into_iter().next().unwrap())
+ }
+ }
+
+ /// Deletes an API access key
+ pub async fn delete_key(&self, key: &mut Key) -> Result<(), Error> {
+ let bucket_helper = BucketHelper(self.0);
+
+ let state = key.state.as_option_mut().unwrap();
+
+ // --- done checking, now commit ---
+	// (the unset_local_bucket_alias step below will fail if a bucket
+	// has no other alias; in that case, the deletion is
+	// interrupted partway through)
+
+ // 1. Delete local aliases
+ for (alias, _, to) in state.local_aliases.items().iter() {
+ if let Some(bucket_id) = to {
+ bucket_helper
+ .unset_local_bucket_alias(*bucket_id, &key.key_id, alias)
+ .await?;
+ }
+ }
+
+ // 2. Remove permissions on all authorized buckets
+ for (ab_id, _auth) in state.authorized_buckets.items().iter() {
+ bucket_helper
+ .set_bucket_key_permissions(*ab_id, &key.key_id, BucketKeyPerm::NO_PERMISSIONS)
+ .await?;
+ }
+
+ // 3. Actually delete key
+ key.state = Deletable::delete();
+ self.0.key_table.insert(key).await?;
+
+ Ok(())
+ }
+}
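
Both helpers above are thin newtype wrappers around the same `&Garage` reference, so one helper can construct the other at no cost, as `delete_key` does with `BucketHelper(self.0)`. Below is a minimal self-contained sketch of this pattern; the `Garage` type here is a stand-in, not the real crate API.

struct Garage; // stand-in for the shared state (tables, system handle, ...)

// each helper borrows the same Garage, so building one from another is free
struct BucketHelper<'a>(&'a Garage);
struct KeyHelper<'a>(&'a Garage);

impl<'a> KeyHelper<'a> {
    fn delete_key(&self) {
        // cross-helper call over the same borrowed Garage
        let _bucket_helper = BucketHelper(self.0);
        // ... unset aliases, revoke permissions, then mark the key deleted ...
    }
}

fn main() {
    let garage = Garage;
    KeyHelper(&garage).delete_key();
}
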
diff --git a/src/model/helper/mod.rs b/src/model/helper/mod.rs
index 2f4e8898..dd947c86 100644
--- a/src/model/helper/mod.rs
+++ b/src/model/helper/mod.rs
@@ -1,2 +1,3 @@
pub mod bucket;
pub mod error;
+pub mod key;
diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs
new file mode 100644
index 00000000..e6394f0c
--- /dev/null
+++ b/src/model/index_counter.rs
@@ -0,0 +1,496 @@
+use core::ops::Bound;
+use std::collections::{hash_map, BTreeMap, HashMap};
+use std::marker::PhantomData;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+use tokio::sync::{mpsc, watch};
+
+use garage_db as db;
+
+use garage_rpc::ring::Ring;
+use garage_rpc::system::System;
+use garage_util::background::*;
+use garage_util::data::*;
+use garage_util::error::*;
+use garage_util::time::*;
+
+use garage_table::crdt::*;
+use garage_table::replication::*;
+use garage_table::*;
+
+pub trait CountedItem: Clone + PartialEq + Send + Sync + 'static {
+ const COUNTER_TABLE_NAME: &'static str;
+
+ type CP: PartitionKey + Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync;
+ type CS: SortKey + Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync;
+
+ fn counter_partition_key(&self) -> &Self::CP;
+ fn counter_sort_key(&self) -> &Self::CS;
+ fn counts(&self) -> Vec<(&'static str, i64)>;
+}
+
+/// A counter entry in the global table
+#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
+pub struct CounterEntry<T: CountedItem> {
+ pub pk: T::CP,
+ pub sk: T::CS,
+ pub values: BTreeMap<String, CounterValue>,
+}
+
+impl<T: CountedItem> Entry<T::CP, T::CS> for CounterEntry<T> {
+ fn partition_key(&self) -> &T::CP {
+ &self.pk
+ }
+ fn sort_key(&self) -> &T::CS {
+ &self.sk
+ }
+ fn is_tombstone(&self) -> bool {
+ self.values
+ .iter()
+ .all(|(_, v)| v.node_values.iter().all(|(_, (_, v))| *v == 0))
+ }
+}
+
+impl<T: CountedItem> CounterEntry<T> {
+ pub fn filtered_values(&self, ring: &Ring) -> HashMap<String, i64> {
+ let nodes = &ring.layout.node_id_vec[..];
+ self.filtered_values_with_nodes(nodes)
+ }
+
+ pub fn filtered_values_with_nodes(&self, nodes: &[Uuid]) -> HashMap<String, i64> {
+ let mut ret = HashMap::new();
+ for (name, vals) in self.values.iter() {
+ let new_vals = vals
+ .node_values
+ .iter()
+ .filter(|(n, _)| nodes.contains(n))
+ .map(|(_, (_, v))| *v)
+ .collect::<Vec<_>>();
+ if !new_vals.is_empty() {
+ ret.insert(
+ name.clone(),
+ new_vals.iter().fold(i64::MIN, |a, b| std::cmp::max(a, *b)),
+ );
+ }
+ }
+
+ ret
+ }
+}
+
+/// The value of one named counter, as a set of timestamped contributions from each node
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct CounterValue {
+ pub node_values: BTreeMap<Uuid, (u64, i64)>,
+}
+
+impl<T: CountedItem> Crdt for CounterEntry<T> {
+ fn merge(&mut self, other: &Self) {
+ for (name, e2) in other.values.iter() {
+ if let Some(e) = self.values.get_mut(name) {
+ e.merge(e2);
+ } else {
+ self.values.insert(name.clone(), e2.clone());
+ }
+ }
+ }
+}
+
+impl Crdt for CounterValue {
+ fn merge(&mut self, other: &Self) {
+ for (node, (t2, e2)) in other.node_values.iter() {
+ if let Some((t, e)) = self.node_values.get_mut(node) {
+				if t2 > t {
+					// also bump the stored timestamp, otherwise a later merge with an
+					// intermediate timestamp would overwrite this newer value
+					*t = *t2;
+					*e = *e2;
+ }
+ } else {
+ self.node_values.insert(*node, (*t2, *e2));
+ }
+ }
+ }
+}
+
+pub struct CounterTable<T: CountedItem> {
+ _phantom_t: PhantomData<T>,
+}
+
+impl<T: CountedItem> TableSchema for CounterTable<T> {
+ const TABLE_NAME: &'static str = T::COUNTER_TABLE_NAME;
+
+ type P = T::CP;
+ type S = T::CS;
+ type E = CounterEntry<T>;
+ type Filter = (DeletedFilter, Vec<Uuid>);
+
+ fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
+ if filter.0 == DeletedFilter::Any {
+ return true;
+ }
+
+ let is_tombstone = entry
+ .filtered_values_with_nodes(&filter.1[..])
+ .iter()
+ .all(|(_, v)| *v == 0);
+ filter.0.apply(is_tombstone)
+ }
+}
+
+// ----
+
+pub struct IndexCounter<T: CountedItem> {
+ this_node: Uuid,
+ local_counter: db::Tree,
+ propagate_tx: mpsc::UnboundedSender<(T::CP, T::CS, LocalCounterEntry<T>)>,
+ pub table: Arc<Table<CounterTable<T>, TableShardedReplication>>,
+}
+
+impl<T: CountedItem> IndexCounter<T> {
+ pub fn new(
+ system: Arc<System>,
+ replication: TableShardedReplication,
+ db: &db::Db,
+ ) -> Arc<Self> {
+ let background = system.background.clone();
+
+ let (propagate_tx, propagate_rx) = mpsc::unbounded_channel();
+
+ let this = Arc::new(Self {
+ this_node: system.id,
+ local_counter: db
+ .open_tree(format!("local_counter_v2:{}", T::COUNTER_TABLE_NAME))
+ .expect("Unable to open local counter tree"),
+ propagate_tx,
+ table: Table::new(
+ CounterTable {
+ _phantom_t: Default::default(),
+ },
+ replication,
+ system,
+ db,
+ ),
+ });
+
+ background.spawn_worker(IndexPropagatorWorker {
+ index_counter: this.clone(),
+ propagate_rx,
+ buf: HashMap::new(),
+ errors: 0,
+ });
+
+ this
+ }
+
+ pub fn count(
+ &self,
+ tx: &mut db::Transaction,
+ old: Option<&T>,
+ new: Option<&T>,
+ ) -> db::TxResult<(), Error> {
+ let pk = old
+ .map(|e| e.counter_partition_key())
+ .unwrap_or_else(|| new.unwrap().counter_partition_key());
+ let sk = old
+ .map(|e| e.counter_sort_key())
+ .unwrap_or_else(|| new.unwrap().counter_sort_key());
+
+ // calculate counter differences
+ let mut counts = HashMap::new();
+ for (k, v) in old.map(|x| x.counts()).unwrap_or_default() {
+ *counts.entry(k).or_insert(0) -= v;
+ }
+ for (k, v) in new.map(|x| x.counts()).unwrap_or_default() {
+ *counts.entry(k).or_insert(0) += v;
+ }
+
+ // update local counter table
+ let tree_key = self.table.data.tree_key(pk, sk);
+
+ let mut entry = match tx.get(&self.local_counter, &tree_key[..])? {
+ Some(old_bytes) => {
+ rmp_serde::decode::from_read_ref::<_, LocalCounterEntry<T>>(&old_bytes)
+ .map_err(Error::RmpDecode)
+ .map_err(db::TxError::Abort)?
+ }
+ None => LocalCounterEntry {
+ pk: pk.clone(),
+ sk: sk.clone(),
+ values: BTreeMap::new(),
+ },
+ };
+
+ let now = now_msec();
+ for (s, inc) in counts.iter() {
+ let mut ent = entry.values.entry(s.to_string()).or_insert((0, 0));
+ ent.0 = std::cmp::max(ent.0 + 1, now);
+ ent.1 += *inc;
+ }
+
+ let new_entry_bytes = rmp_to_vec_all_named(&entry)
+ .map_err(Error::RmpEncode)
+ .map_err(db::TxError::Abort)?;
+ tx.insert(&self.local_counter, &tree_key[..], new_entry_bytes)?;
+
+ if let Err(e) = self.propagate_tx.send((pk.clone(), sk.clone(), entry)) {
+ error!(
+ "Could not propagate updated counter values, failed to send to channel: {}",
+ e
+ );
+ }
+
+ Ok(())
+ }
+
+ pub fn offline_recount_all<TS, TR>(
+ &self,
+ counted_table: &Arc<Table<TS, TR>>,
+ ) -> Result<(), Error>
+ where
+ TS: TableSchema<E = T>,
+ TR: TableReplication,
+ {
+ let save_counter_entry = |entry: CounterEntry<T>| -> Result<(), Error> {
+ let entry_k = self
+ .table
+ .data
+ .tree_key(entry.partition_key(), entry.sort_key());
+ self.table
+ .data
+ .update_entry_with(&entry_k, |ent| match ent {
+ Some(mut ent) => {
+ ent.merge(&entry);
+ ent
+ }
+ None => entry.clone(),
+ })?;
+ Ok(())
+ };
+
+ // 1. Set all old local counters to zero
+ let now = now_msec();
+ let mut next_start: Option<Vec<u8>> = None;
+ loop {
+ let low_bound = match next_start.take() {
+ Some(v) => Bound::Excluded(v),
+ None => Bound::Unbounded,
+ };
+ let mut batch = vec![];
+ for item in self.local_counter.range((low_bound, Bound::Unbounded))? {
+ batch.push(item?);
+ if batch.len() > 1000 {
+ break;
+ }
+ }
+
+ if batch.is_empty() {
+ break;
+ }
+
+ info!("zeroing old counters... ({})", hex::encode(&batch[0].0));
+ for (local_counter_k, local_counter) in batch {
+ let mut local_counter =
+ rmp_serde::decode::from_read_ref::<_, LocalCounterEntry<T>>(&local_counter)?;
+
+ for (_, tv) in local_counter.values.iter_mut() {
+ tv.0 = std::cmp::max(tv.0 + 1, now);
+ tv.1 = 0;
+ }
+
+ let local_counter_bytes = rmp_to_vec_all_named(&local_counter)?;
+ self.local_counter
+ .insert(&local_counter_k, &local_counter_bytes)?;
+
+ let counter_entry = local_counter.into_counter_entry(self.this_node);
+ save_counter_entry(counter_entry)?;
+
+ next_start = Some(local_counter_k);
+ }
+ }
+
+ // 2. Recount all table entries
+ let now = now_msec();
+ let mut next_start: Option<Vec<u8>> = None;
+ loop {
+ let low_bound = match next_start.take() {
+ Some(v) => Bound::Excluded(v),
+ None => Bound::Unbounded,
+ };
+ let mut batch = vec![];
+ for item in counted_table
+ .data
+ .store
+ .range((low_bound, Bound::Unbounded))?
+ {
+ batch.push(item?);
+ if batch.len() > 1000 {
+ break;
+ }
+ }
+
+ if batch.is_empty() {
+ break;
+ }
+
+ info!("counting entries... ({})", hex::encode(&batch[0].0));
+ for (counted_entry_k, counted_entry) in batch {
+ let counted_entry = counted_table.data.decode_entry(&counted_entry)?;
+
+ let pk = counted_entry.counter_partition_key();
+ let sk = counted_entry.counter_sort_key();
+ let counts = counted_entry.counts();
+
+ let local_counter_key = self.table.data.tree_key(pk, sk);
+ let mut local_counter = match self.local_counter.get(&local_counter_key)? {
+ Some(old_bytes) => {
+ let ent = rmp_serde::decode::from_read_ref::<_, LocalCounterEntry<T>>(
+ &old_bytes,
+ )?;
+ assert!(ent.pk == *pk);
+ assert!(ent.sk == *sk);
+ ent
+ }
+ None => LocalCounterEntry {
+ pk: pk.clone(),
+ sk: sk.clone(),
+ values: BTreeMap::new(),
+ },
+ };
+ for (s, v) in counts.iter() {
+ let mut tv = local_counter.values.entry(s.to_string()).or_insert((0, 0));
+ tv.0 = std::cmp::max(tv.0 + 1, now);
+ tv.1 += v;
+ }
+
+ let local_counter_bytes = rmp_to_vec_all_named(&local_counter)?;
+ self.local_counter
+ .insert(&local_counter_key, local_counter_bytes)?;
+
+ let counter_entry = local_counter.into_counter_entry(self.this_node);
+ save_counter_entry(counter_entry)?;
+
+ next_start = Some(counted_entry_k);
+ }
+ }
+
+ // Done
+ Ok(())
+ }
+}
+
+struct IndexPropagatorWorker<T: CountedItem> {
+ index_counter: Arc<IndexCounter<T>>,
+ propagate_rx: mpsc::UnboundedReceiver<(T::CP, T::CS, LocalCounterEntry<T>)>,
+
+ buf: HashMap<Vec<u8>, CounterEntry<T>>,
+ errors: usize,
+}
+
+impl<T: CountedItem> IndexPropagatorWorker<T> {
+ fn add_ent(&mut self, pk: T::CP, sk: T::CS, counters: LocalCounterEntry<T>) {
+ let tree_key = self.index_counter.table.data.tree_key(&pk, &sk);
+ let dist_entry = counters.into_counter_entry(self.index_counter.this_node);
+ match self.buf.entry(tree_key) {
+ hash_map::Entry::Vacant(e) => {
+ e.insert(dist_entry);
+ }
+ hash_map::Entry::Occupied(mut e) => {
+ e.get_mut().merge(&dist_entry);
+ }
+ }
+ }
+}
+
+#[async_trait]
+impl<T: CountedItem> Worker for IndexPropagatorWorker<T> {
+ fn name(&self) -> String {
+ format!("{} index counter propagator", T::COUNTER_TABLE_NAME)
+ }
+
+ fn info(&self) -> Option<String> {
+ if !self.buf.is_empty() {
+ Some(format!("{} items in queue", self.buf.len()))
+ } else {
+ None
+ }
+ }
+
+ async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+ // This loop batches updates to counters to be sent all at once.
+ // They are sent once the propagate_rx channel has been emptied (or is closed).
+ let closed = loop {
+ match self.propagate_rx.try_recv() {
+ Ok((pk, sk, counters)) => {
+ self.add_ent(pk, sk, counters);
+ }
+ Err(mpsc::error::TryRecvError::Empty) => break false,
+ Err(mpsc::error::TryRecvError::Disconnected) => break true,
+ }
+ };
+
+ if !self.buf.is_empty() {
+ let entries_k = self.buf.keys().take(100).cloned().collect::<Vec<_>>();
+ let entries = entries_k.iter().map(|k| self.buf.get(k).unwrap());
+ if let Err(e) = self.index_counter.table.insert_many(entries).await {
+ self.errors += 1;
+ if self.errors >= 2 && *must_exit.borrow() {
+ error!("({}) Could not propagate {} counter values: {}, these counters will not be updated correctly.", T::COUNTER_TABLE_NAME, self.buf.len(), e);
+ return Ok(WorkerState::Done);
+ }
+			// Propagate the error up to the worker manager; it will log it, increment a counter,
+ // and sleep for a certain delay (with exponential backoff), waiting for
+ // things to go back to normal
+ return Err(e);
+ } else {
+ for k in entries_k {
+ self.buf.remove(&k);
+ }
+ self.errors = 0;
+ }
+
+ return Ok(WorkerState::Busy);
+ } else if closed {
+ return Ok(WorkerState::Done);
+ } else {
+ return Ok(WorkerState::Idle);
+ }
+ }
+
+ async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
+ match self.propagate_rx.recv().await {
+ Some((pk, sk, counters)) => {
+ self.add_ent(pk, sk, counters);
+ WorkerState::Busy
+ }
+ None => match self.buf.is_empty() {
+ false => WorkerState::Busy,
+ true => WorkerState::Done,
+ },
+ }
+ }
+}
+
+#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+struct LocalCounterEntry<T: CountedItem> {
+ pk: T::CP,
+ sk: T::CS,
+ values: BTreeMap<String, (u64, i64)>,
+}
+
+impl<T: CountedItem> LocalCounterEntry<T> {
+ fn into_counter_entry(self, this_node: Uuid) -> CounterEntry<T> {
+ CounterEntry {
+ pk: self.pk,
+ sk: self.sk,
+ values: self
+ .values
+ .into_iter()
+ .map(|(name, (ts, v))| {
+ let mut node_values = BTreeMap::new();
+ node_values.insert(this_node, (ts, v));
+ (name, CounterValue { node_values })
+ })
+ .collect(),
+ }
+ }
+}
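
The CRDT above is keyed twice: by counter name, then by node. Each node contributes a timestamped value per name, merges keep the newest value per node, and reads aggregate across the nodes of the current layout. A self-contained sketch of the per-node merge-and-read rule, with plain u64 node IDs standing in for the crate's Uuid type:

use std::collections::BTreeMap;

// state of one named counter: node id -> (timestamp, value),
// mirroring CounterValue::node_values
type NodeValues = BTreeMap<u64, (u64, i64)>;

// last-writer-wins per node, as in `impl Crdt for CounterValue`
fn merge(a: &mut NodeValues, b: &NodeValues) {
    for (node, (t2, v2)) in b {
        match a.get_mut(node) {
            Some((t, v)) if *t2 > *t => {
                *t = *t2;
                *v = *v2;
            }
            Some(_) => {}
            None => {
                a.insert(*node, (*t2, *v2));
            }
        }
    }
}

// read: keep only nodes of the layout, then take the maximum value,
// mirroring filtered_values_with_nodes
fn read(vals: &NodeValues, layout_nodes: &[u64]) -> Option<i64> {
    vals.iter()
        .filter(|(n, _)| layout_nodes.contains(n))
        .map(|(_, (_, v))| *v)
        .max()
}

fn main() {
    let mut a = NodeValues::from([(1, (10, 4))]);
    let b = NodeValues::from([(1, (12, 5)), (2, (11, 5))]);
    merge(&mut a, &b);
    assert_eq!(read(&a, &[1, 2]), Some(5));
}
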
diff --git a/src/model/k2v/causality.rs b/src/model/k2v/causality.rs
new file mode 100644
index 00000000..9a692870
--- /dev/null
+++ b/src/model/k2v/causality.rs
@@ -0,0 +1,96 @@
+use std::collections::BTreeMap;
+use std::convert::TryInto;
+
+use serde::{Deserialize, Serialize};
+
+use garage_util::data::*;
+
+/// Node IDs used in K2V are u64 integers that are abbreviations
+/// of full Garage node IDs, which are 256-bit UUIDs.
+pub type K2VNodeId = u64;
+
+pub fn make_node_id(node_id: Uuid) -> K2VNodeId {
+ let mut tmp = [0u8; 8];
+ tmp.copy_from_slice(&node_id.as_slice()[..8]);
+ u64::from_be_bytes(tmp)
+}
+
+#[derive(PartialEq, Eq, Debug, Serialize, Deserialize)]
+pub struct CausalContext {
+ pub vector_clock: BTreeMap<K2VNodeId, u64>,
+}
+
+impl CausalContext {
+ /// Empty causality context
+ pub fn new_empty() -> Self {
+ Self {
+ vector_clock: BTreeMap::new(),
+ }
+ }
+	/// Build the binary representation and encode it in base64
+ pub fn serialize(&self) -> String {
+ let mut ints = Vec::with_capacity(2 * self.vector_clock.len());
+ for (node, time) in self.vector_clock.iter() {
+ ints.push(*node);
+ ints.push(*time);
+ }
+ let checksum = ints.iter().fold(0, |acc, v| acc ^ *v);
+
+ let mut bytes = u64::to_be_bytes(checksum).to_vec();
+ for i in ints {
+ bytes.extend(u64::to_be_bytes(i));
+ }
+
+ base64::encode_config(bytes, base64::URL_SAFE_NO_PAD)
+ }
+ /// Parse from base64-encoded binary representation
+ pub fn parse(s: &str) -> Result<Self, String> {
+ let bytes = base64::decode_config(s, base64::URL_SAFE_NO_PAD)
+ .map_err(|e| format!("bad causality token base64: {}", e))?;
+ if bytes.len() % 16 != 8 || bytes.len() < 8 {
+ return Err("bad causality token length".into());
+ }
+
+ let checksum = u64::from_be_bytes(bytes[..8].try_into().unwrap());
+ let mut ret = CausalContext {
+ vector_clock: BTreeMap::new(),
+ };
+
+ for i in 0..(bytes.len() / 16) {
+ let node_id = u64::from_be_bytes(bytes[8 + i * 16..16 + i * 16].try_into().unwrap());
+ let time = u64::from_be_bytes(bytes[16 + i * 16..24 + i * 16].try_into().unwrap());
+ ret.vector_clock.insert(node_id, time);
+ }
+
+ let check = ret.vector_clock.iter().fold(0, |acc, (n, t)| acc ^ *n ^ *t);
+
+ if check != checksum {
+ return Err("bad causality token checksum".into());
+ }
+
+ Ok(ret)
+ }
+ /// Check if this causal context contains newer items than another one
+ pub fn is_newer_than(&self, other: &Self) -> bool {
+ self.vector_clock
+ .iter()
+ .any(|(k, v)| v > other.vector_clock.get(k).unwrap_or(&0))
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_causality_token_serialization() {
+ let ct = CausalContext {
+ vector_clock: [(4, 42), (1928131023, 76), (0xefc0c1c47f9de433, 2)]
+ .iter()
+ .cloned()
+ .collect(),
+ };
+
+ assert_eq!(CausalContext::parse(&ct.serialize()).unwrap(), ct);
+ }
+}
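
The token layout above is fixed: an 8-byte XOR checksum of all following words, then one (node, time) pair of big-endian u64s per vector-clock entry, base64url-encoded without padding. The dominance test `is_newer_than` treats absent nodes as time 0, so two contexts can each be newer than the other, which is exactly the concurrent-write case. A self-contained sketch of that comparison:

use std::collections::BTreeMap;

// vector clock: abbreviated node id -> last seen timestamp
type VectorClock = BTreeMap<u64, u64>;

// true if `a` has seen at least one event that `b` has not,
// mirroring CausalContext::is_newer_than (missing entries count as 0)
fn is_newer_than(a: &VectorClock, b: &VectorClock) -> bool {
    a.iter().any(|(k, v)| v > b.get(k).unwrap_or(&0))
}

fn main() {
    let ct1 = VectorClock::from([(4, 42), (7, 1)]);
    let ct2 = VectorClock::from([(4, 43)]);
    assert!(is_newer_than(&ct2, &ct1)); // node 4 advanced in ct2
    assert!(is_newer_than(&ct1, &ct2)); // ct1 alone saw node 7: concurrent
}
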
diff --git a/src/model/k2v/item_table.rs b/src/model/k2v/item_table.rs
new file mode 100644
index 00000000..7860cb17
--- /dev/null
+++ b/src/model/k2v/item_table.rs
@@ -0,0 +1,305 @@
+use serde::{Deserialize, Serialize};
+use std::collections::BTreeMap;
+use std::sync::Arc;
+
+use garage_db as db;
+use garage_util::data::*;
+
+use garage_table::crdt::*;
+use garage_table::*;
+
+use crate::index_counter::*;
+use crate::k2v::causality::*;
+use crate::k2v::poll::*;
+
+pub const ENTRIES: &str = "entries";
+pub const CONFLICTS: &str = "conflicts";
+pub const VALUES: &str = "values";
+pub const BYTES: &str = "bytes";
+
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct K2VItem {
+ pub partition: K2VItemPartition,
+ pub sort_key: String,
+
+ items: BTreeMap<K2VNodeId, DvvsEntry>,
+}
+
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize, Hash)]
+pub struct K2VItemPartition {
+ pub bucket_id: Uuid,
+ pub partition_key: String,
+}
+
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+struct DvvsEntry {
+ t_discard: u64,
+ values: Vec<(u64, DvvsValue)>,
+}
+
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub enum DvvsValue {
+ Value(#[serde(with = "serde_bytes")] Vec<u8>),
+ Deleted,
+}
+
+impl K2VItem {
+ /// Creates a new K2VItem when no previous entry existed in the db
+ pub fn new(bucket_id: Uuid, partition_key: String, sort_key: String) -> Self {
+ Self {
+ partition: K2VItemPartition {
+ bucket_id,
+ partition_key,
+ },
+ sort_key,
+ items: BTreeMap::new(),
+ }
+ }
+ /// Updates a K2VItem with a new value or a deletion event
+ pub fn update(
+ &mut self,
+ this_node: Uuid,
+ context: &Option<CausalContext>,
+ new_value: DvvsValue,
+ ) {
+ if let Some(context) = context {
+ for (node, t_discard) in context.vector_clock.iter() {
+ if let Some(e) = self.items.get_mut(node) {
+ e.t_discard = std::cmp::max(e.t_discard, *t_discard);
+ } else {
+ self.items.insert(
+ *node,
+ DvvsEntry {
+ t_discard: *t_discard,
+ values: vec![],
+ },
+ );
+ }
+ }
+ }
+
+ self.discard();
+
+ let node_id = make_node_id(this_node);
+ let e = self.items.entry(node_id).or_insert(DvvsEntry {
+ t_discard: 0,
+ values: vec![],
+ });
+ let t_prev = e.max_time();
+ e.values.push((t_prev + 1, new_value));
+ }
+
+ /// Extract the causality context of a K2V Item
+ pub fn causal_context(&self) -> CausalContext {
+ let mut cc = CausalContext::new_empty();
+ for (node, ent) in self.items.iter() {
+ cc.vector_clock.insert(*node, ent.max_time());
+ }
+ cc
+ }
+
+ /// Extract the list of values
+ pub fn values(&'_ self) -> Vec<&'_ DvvsValue> {
+ let mut ret = vec![];
+ for (_, ent) in self.items.iter() {
+ for (_, v) in ent.values.iter() {
+ if !ret.contains(&v) {
+ ret.push(v);
+ }
+ }
+ }
+ ret
+ }
+
+ fn discard(&mut self) {
+ for (_, ent) in self.items.iter_mut() {
+ ent.discard();
+ }
+ }
+}
+
+impl DvvsEntry {
+ fn max_time(&self) -> u64 {
+ self.values
+ .iter()
+ .fold(self.t_discard, |acc, (vts, _)| std::cmp::max(acc, *vts))
+ }
+
+ fn discard(&mut self) {
+ self.values = std::mem::take(&mut self.values)
+ .into_iter()
+ .filter(|(t, _)| *t > self.t_discard)
+ .collect::<Vec<_>>();
+ }
+}
+
+impl Crdt for K2VItem {
+ fn merge(&mut self, other: &Self) {
+ for (node, e2) in other.items.iter() {
+ if let Some(e) = self.items.get_mut(node) {
+ e.merge(e2);
+ } else {
+ self.items.insert(*node, e2.clone());
+ }
+ }
+ }
+}
+
+impl Crdt for DvvsEntry {
+ fn merge(&mut self, other: &Self) {
+ self.t_discard = std::cmp::max(self.t_discard, other.t_discard);
+ self.discard();
+
+ let t_max = self.max_time();
+ for (vt, vv) in other.values.iter() {
+ if *vt > t_max {
+ self.values.push((*vt, vv.clone()));
+ }
+ }
+ }
+}
+
+impl PartitionKey for K2VItemPartition {
+ fn hash(&self) -> Hash {
+ use blake2::{Blake2b, Digest};
+
+ let mut hasher = Blake2b::new();
+ hasher.update(self.bucket_id.as_slice());
+ hasher.update(self.partition_key.as_bytes());
+ let mut hash = [0u8; 32];
+ hash.copy_from_slice(&hasher.finalize()[..32]);
+ hash.into()
+ }
+}
+
+impl Entry<K2VItemPartition, String> for K2VItem {
+ fn partition_key(&self) -> &K2VItemPartition {
+ &self.partition
+ }
+ fn sort_key(&self) -> &String {
+ &self.sort_key
+ }
+ fn is_tombstone(&self) -> bool {
+ self.values()
+ .iter()
+ .all(|v| matches!(v, DvvsValue::Deleted))
+ }
+}
+
+pub struct K2VItemTable {
+ pub(crate) counter_table: Arc<IndexCounter<K2VItem>>,
+ pub(crate) subscriptions: Arc<SubscriptionManager>,
+}
+
+#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
+pub struct ItemFilter {
+ pub exclude_only_tombstones: bool,
+ pub conflicts_only: bool,
+}
+
+impl TableSchema for K2VItemTable {
+ const TABLE_NAME: &'static str = "k2v_item";
+
+ type P = K2VItemPartition;
+ type S = String;
+ type E = K2VItem;
+ type Filter = ItemFilter;
+
+ fn updated(
+ &self,
+ tx: &mut db::Transaction,
+ old: Option<&Self::E>,
+ new: Option<&Self::E>,
+ ) -> db::TxOpResult<()> {
+ // 1. Count
+ let counter_res = self.counter_table.count(tx, old, new);
+ if let Err(e) = db::unabort(counter_res)? {
+		// This error can be returned by `counter_table.count()`, for instance
+		// if MessagePack serialization or deserialization fails at some step.
+		// Warn the admin, but otherwise ignore the error: that's all we can do.
+ error!(
+ "Unable to update K2V item counter: {}. Index values will be wrong!",
+ e
+ );
+ }
+
+ // 2. Notify
+ if let Some(new_ent) = new {
+ self.subscriptions.notify(new_ent);
+ }
+
+ Ok(())
+ }
+
+ #[allow(clippy::nonminimal_bool)]
+ fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
+ let v = entry.values();
+ !(filter.conflicts_only && v.len() < 2)
+ && !(filter.exclude_only_tombstones && entry.is_tombstone())
+ }
+}
+
+impl CountedItem for K2VItem {
+ const COUNTER_TABLE_NAME: &'static str = "k2v_index_counter_v2";
+
+ // Partition key = bucket id
+ type CP = Uuid;
+ // Sort key = K2V item's partition key
+ type CS = String;
+
+ fn counter_partition_key(&self) -> &Uuid {
+ &self.partition.bucket_id
+ }
+ fn counter_sort_key(&self) -> &String {
+ &self.partition.partition_key
+ }
+
+ fn counts(&self) -> Vec<(&'static str, i64)> {
+ let values = self.values();
+
+ let n_entries = if self.is_tombstone() { 0 } else { 1 };
+ let n_conflicts = if values.len() > 1 { 1 } else { 0 };
+ let n_values = values
+ .iter()
+ .filter(|v| matches!(v, DvvsValue::Value(_)))
+ .count() as i64;
+ let n_bytes = values
+ .iter()
+ .map(|v| match v {
+ DvvsValue::Deleted => 0,
+ DvvsValue::Value(v) => v.len() as i64,
+ })
+ .sum();
+
+ vec![
+ (ENTRIES, n_entries),
+ (CONFLICTS, n_conflicts),
+ (VALUES, n_values),
+ (BYTES, n_bytes),
+ ]
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_dvvsentry_merge_simple() {
+ let e1 = DvvsEntry {
+ t_discard: 4,
+ values: vec![
+ (5, DvvsValue::Value(vec![15])),
+ (6, DvvsValue::Value(vec![16])),
+ ],
+ };
+ let e2 = DvvsEntry {
+ t_discard: 5,
+ values: vec![(6, DvvsValue::Value(vec![16])), (7, DvvsValue::Deleted)],
+ };
+
+ let mut e3 = e1.clone();
+ e3.merge(&e2);
+ assert_eq!(e2, e3);
+ }
+}
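
A `DvvsEntry` keeps, per node, only the values written after `t_discard`; an update that carries a causal context raises `t_discard` to what the writer has already seen and prunes those superseded values. A self-contained sketch of that pruning rule (the types here are simplified stand-ins for the private ones above):

// one node's slot in a dotted-version-vector set:
// values tagged with per-node logical timestamps
struct DvvsEntry {
    t_discard: u64,
    values: Vec<(u64, Vec<u8>)>,
}

impl DvvsEntry {
    // drop every value the causal context says was already seen,
    // mirroring DvvsEntry::discard
    fn discard(&mut self) {
        self.values.retain(|(t, _)| *t > self.t_discard);
    }

    // a write whose causal context covers timestamps up to `seen_up_to`
    // supersedes the older values, as in K2VItem::update
    fn update(&mut self, seen_up_to: u64, new_value: Vec<u8>) {
        self.t_discard = self.t_discard.max(seen_up_to);
        self.discard();
        let t_max = self
            .values
            .iter()
            .map(|(t, _)| *t)
            .max()
            .unwrap_or(self.t_discard);
        self.values.push((t_max + 1, new_value));
    }
}

fn main() {
    let mut e = DvvsEntry {
        t_discard: 0,
        values: vec![(1, b"a".to_vec()), (2, b"b".to_vec())],
    };
    e.update(2, b"c".to_vec()); // the writer read up to t=2: "a" and "b" go away
    assert_eq!(e.values, vec![(3, b"c".to_vec())]);
}
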
diff --git a/src/model/k2v/mod.rs b/src/model/k2v/mod.rs
new file mode 100644
index 00000000..f6a96151
--- /dev/null
+++ b/src/model/k2v/mod.rs
@@ -0,0 +1,6 @@
+pub mod causality;
+
+pub mod item_table;
+
+pub mod poll;
+pub mod rpc;
diff --git a/src/model/k2v/poll.rs b/src/model/k2v/poll.rs
new file mode 100644
index 00000000..93105207
--- /dev/null
+++ b/src/model/k2v/poll.rs
@@ -0,0 +1,50 @@
+use std::collections::HashMap;
+use std::sync::Mutex;
+
+use serde::{Deserialize, Serialize};
+use tokio::sync::broadcast;
+
+use crate::k2v::item_table::*;
+
+#[derive(Debug, Hash, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct PollKey {
+ pub partition: K2VItemPartition,
+ pub sort_key: String,
+}
+
+#[derive(Default)]
+pub struct SubscriptionManager {
+ subscriptions: Mutex<HashMap<PollKey, broadcast::Sender<K2VItem>>>,
+}
+
+impl SubscriptionManager {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ pub fn subscribe(&self, key: &PollKey) -> broadcast::Receiver<K2VItem> {
+ let mut subs = self.subscriptions.lock().unwrap();
+ if let Some(s) = subs.get(key) {
+ s.subscribe()
+ } else {
+ let (tx, rx) = broadcast::channel(8);
+ subs.insert(key.clone(), tx);
+ rx
+ }
+ }
+
+ pub fn notify(&self, item: &K2VItem) {
+ let key = PollKey {
+ partition: item.partition.clone(),
+ sort_key: item.sort_key.clone(),
+ };
+ let mut subs = self.subscriptions.lock().unwrap();
+ if let Some(s) = subs.get(&key) {
+ if s.send(item.clone()).is_err() {
+ // no more subscribers, remove channel from here
+ // (we will re-create it later if we need to subscribe again)
+ subs.remove(&key);
+ }
+ }
+ }
+}
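
The subscription map above creates one broadcast channel per polled key on demand and drops it again as soon as `send` reports that nobody is listening. A self-contained sketch of that lazily-cleaned channel map, simplified to String keys and values (assumes a tokio runtime):

use std::collections::HashMap;
use std::sync::Mutex;
use tokio::sync::broadcast;

#[derive(Default)]
struct Subscriptions(Mutex<HashMap<String, broadcast::Sender<String>>>);

impl Subscriptions {
    fn subscribe(&self, key: &str) -> broadcast::Receiver<String> {
        let mut subs = self.0.lock().unwrap();
        subs.entry(key.to_string())
            .or_insert_with(|| broadcast::channel(8).0)
            .subscribe()
    }

    fn notify(&self, key: &str, item: String) {
        let mut subs = self.0.lock().unwrap();
        if let Some(s) = subs.get(key) {
            if s.send(item).is_err() {
                // no live receiver: drop the channel, it will be
                // re-created if someone subscribes to this key again
                subs.remove(key);
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let subs = Subscriptions::default();
    let mut rx = subs.subscribe("k");
    subs.notify("k", "v1".to_string());
    assert_eq!(rx.recv().await.unwrap(), "v1");
}
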
diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs
new file mode 100644
index 00000000..a74df277
--- /dev/null
+++ b/src/model/k2v/rpc.rs
@@ -0,0 +1,341 @@
+//! Module that implements RPCs specific to K2V.
+//! This is necessary for insertions into the K2V store,
+//! as they must be forwarded to one of the nodes responsible
+//! for storing the entry (the API entry node does not process
+//! the entry itself, as this would make the vector clock grow
+//! much larger than needed).
+
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use futures::stream::FuturesUnordered;
+use futures::StreamExt;
+use serde::{Deserialize, Serialize};
+use tokio::select;
+
+use garage_util::crdt::*;
+use garage_util::data::*;
+use garage_util::error::*;
+
+use garage_rpc::system::System;
+use garage_rpc::*;
+
+use garage_table::replication::{TableReplication, TableShardedReplication};
+use garage_table::{PartitionKey, Table};
+
+use crate::k2v::causality::*;
+use crate::k2v::item_table::*;
+use crate::k2v::poll::*;
+
+/// RPC messages for K2V
+#[derive(Debug, Serialize, Deserialize)]
+enum K2VRpc {
+ Ok,
+ InsertItem(InsertedItem),
+ InsertManyItems(Vec<InsertedItem>),
+ PollItem {
+ key: PollKey,
+ causal_context: CausalContext,
+ timeout_msec: u64,
+ },
+ PollItemResponse(Option<K2VItem>),
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct InsertedItem {
+ partition: K2VItemPartition,
+ sort_key: String,
+ causal_context: Option<CausalContext>,
+ value: DvvsValue,
+}
+
+impl Rpc for K2VRpc {
+ type Response = Result<K2VRpc, Error>;
+}
+
+/// The K2V RPC handler, which dispatches insertions and polls to the nodes responsible for each item
+pub struct K2VRpcHandler {
+ system: Arc<System>,
+ item_table: Arc<Table<K2VItemTable, TableShardedReplication>>,
+ endpoint: Arc<Endpoint<K2VRpc, Self>>,
+ subscriptions: Arc<SubscriptionManager>,
+}
+
+impl K2VRpcHandler {
+ pub fn new(
+ system: Arc<System>,
+ item_table: Arc<Table<K2VItemTable, TableShardedReplication>>,
+ subscriptions: Arc<SubscriptionManager>,
+ ) -> Arc<Self> {
+ let endpoint = system.netapp.endpoint("garage_model/k2v/Rpc".to_string());
+
+ let rpc_handler = Arc::new(Self {
+ system,
+ item_table,
+ endpoint,
+ subscriptions,
+ });
+ rpc_handler.endpoint.set_handler(rpc_handler.clone());
+
+ rpc_handler
+ }
+
+ // ---- public interface ----
+
+ pub async fn insert(
+ &self,
+ bucket_id: Uuid,
+ partition_key: String,
+ sort_key: String,
+ causal_context: Option<CausalContext>,
+ value: DvvsValue,
+ ) -> Result<(), Error> {
+ let partition = K2VItemPartition {
+ bucket_id,
+ partition_key,
+ };
+ let mut who = self
+ .item_table
+ .data
+ .replication
+ .write_nodes(&partition.hash());
+ who.sort();
+
+ self.system
+ .rpc
+ .try_call_many(
+ &self.endpoint,
+ &who[..],
+ K2VRpc::InsertItem(InsertedItem {
+ partition,
+ sort_key,
+ causal_context,
+ value,
+ }),
+ RequestStrategy::with_priority(PRIO_NORMAL)
+ .with_quorum(1)
+ .interrupt_after_quorum(true),
+ )
+ .await?;
+
+ Ok(())
+ }
+
+ pub async fn insert_batch(
+ &self,
+ bucket_id: Uuid,
+ items: Vec<(String, String, Option<CausalContext>, DvvsValue)>,
+ ) -> Result<(), Error> {
+ let n_items = items.len();
+
+ let mut call_list: HashMap<_, Vec<_>> = HashMap::new();
+
+ for (partition_key, sort_key, causal_context, value) in items {
+ let partition = K2VItemPartition {
+ bucket_id,
+ partition_key,
+ };
+ let mut who = self
+ .item_table
+ .data
+ .replication
+ .write_nodes(&partition.hash());
+ who.sort();
+
+ call_list.entry(who).or_default().push(InsertedItem {
+ partition,
+ sort_key,
+ causal_context,
+ value,
+ });
+ }
+
+ debug!(
+ "K2V insert_batch: {} requests to insert {} items",
+ call_list.len(),
+ n_items
+ );
+ let call_futures = call_list.into_iter().map(|(nodes, items)| async move {
+ let resp = self
+ .system
+ .rpc
+ .try_call_many(
+ &self.endpoint,
+ &nodes[..],
+ K2VRpc::InsertManyItems(items),
+ RequestStrategy::with_priority(PRIO_NORMAL)
+ .with_quorum(1)
+ .interrupt_after_quorum(true),
+ )
+ .await?;
+ Ok::<_, Error>((nodes, resp))
+ });
+
+ let mut resps = call_futures.collect::<FuturesUnordered<_>>();
+ while let Some(resp) = resps.next().await {
+ resp?;
+ }
+
+ Ok(())
+ }
+
+ pub async fn poll(
+ &self,
+ bucket_id: Uuid,
+ partition_key: String,
+ sort_key: String,
+ causal_context: CausalContext,
+ timeout_msec: u64,
+ ) -> Result<Option<K2VItem>, Error> {
+ let poll_key = PollKey {
+ partition: K2VItemPartition {
+ bucket_id,
+ partition_key,
+ },
+ sort_key,
+ };
+ let nodes = self
+ .item_table
+ .data
+ .replication
+ .write_nodes(&poll_key.partition.hash());
+
+ let rpc = self.system.rpc.try_call_many(
+ &self.endpoint,
+ &nodes[..],
+ K2VRpc::PollItem {
+ key: poll_key,
+ causal_context,
+ timeout_msec,
+ },
+ RequestStrategy::with_priority(PRIO_NORMAL)
+ .with_quorum(self.item_table.data.replication.read_quorum())
+ .without_timeout(),
+ );
+ let timeout_duration = Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout();
+ let resps = select! {
+ r = rpc => r?,
+ _ = tokio::time::sleep(timeout_duration) => return Ok(None),
+ };
+
+ let mut resp: Option<K2VItem> = None;
+ for v in resps {
+ match v {
+ K2VRpc::PollItemResponse(Some(x)) => {
+ if let Some(y) = &mut resp {
+ y.merge(&x);
+ } else {
+ resp = Some(x);
+ }
+ }
+ K2VRpc::PollItemResponse(None) => {
+ return Ok(None);
+ }
+ v => return Err(Error::unexpected_rpc_message(v)),
+ }
+ }
+
+ Ok(resp)
+ }
+
+ // ---- internal handlers ----
+
+ async fn handle_insert(&self, item: &InsertedItem) -> Result<K2VRpc, Error> {
+ let new = self.local_insert(item)?;
+
+ // Propagate to rest of network
+ if let Some(updated) = new {
+ self.item_table.insert(&updated).await?;
+ }
+
+ Ok(K2VRpc::Ok)
+ }
+
+ async fn handle_insert_many(&self, items: &[InsertedItem]) -> Result<K2VRpc, Error> {
+ let mut updated_vec = vec![];
+
+ for item in items {
+ let new = self.local_insert(item)?;
+
+ if let Some(updated) = new {
+ updated_vec.push(updated);
+ }
+ }
+
+ // Propagate to rest of network
+ if !updated_vec.is_empty() {
+ self.item_table.insert_many(&updated_vec).await?;
+ }
+
+ Ok(K2VRpc::Ok)
+ }
+
+ fn local_insert(&self, item: &InsertedItem) -> Result<Option<K2VItem>, Error> {
+ let tree_key = self
+ .item_table
+ .data
+ .tree_key(&item.partition, &item.sort_key);
+
+ self.item_table
+ .data
+ .update_entry_with(&tree_key[..], |ent| {
+ let mut ent = ent.unwrap_or_else(|| {
+ K2VItem::new(
+ item.partition.bucket_id,
+ item.partition.partition_key.clone(),
+ item.sort_key.clone(),
+ )
+ });
+ ent.update(self.system.id, &item.causal_context, item.value.clone());
+ ent
+ })
+ }
+
+ async fn handle_poll(&self, key: &PollKey, ct: &CausalContext) -> Result<K2VItem, Error> {
+ let mut chan = self.subscriptions.subscribe(key);
+
+ let mut value = self
+ .item_table
+ .data
+ .read_entry(&key.partition, &key.sort_key)?
+ .map(|bytes| self.item_table.data.decode_entry(&bytes[..]))
+ .transpose()?
+ .unwrap_or_else(|| {
+ K2VItem::new(
+ key.partition.bucket_id,
+ key.partition.partition_key.clone(),
+ key.sort_key.clone(),
+ )
+ });
+
+ while !value.causal_context().is_newer_than(ct) {
+ value = chan.recv().await?;
+ }
+
+ Ok(value)
+ }
+}
+
+#[async_trait]
+impl EndpointHandler<K2VRpc> for K2VRpcHandler {
+ async fn handle(self: &Arc<Self>, message: &K2VRpc, _from: NodeID) -> Result<K2VRpc, Error> {
+ match message {
+ K2VRpc::InsertItem(item) => self.handle_insert(item).await,
+ K2VRpc::InsertManyItems(items) => self.handle_insert_many(&items[..]).await,
+ K2VRpc::PollItem {
+ key,
+ causal_context,
+ timeout_msec,
+ } => {
+ let delay = tokio::time::sleep(Duration::from_millis(*timeout_msec));
+ select! {
+ ret = self.handle_poll(key, causal_context) => ret.map(Some).map(K2VRpc::PollItemResponse),
+ _ = delay => Ok(K2VRpc::PollItemResponse(None)),
+ }
+ }
+ m => Err(Error::unexpected_rpc_message(m)),
+ }
+ }
+}
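
On the receiving side, `PollItem` races `handle_poll` against a timer and answers `None` on expiry, while the caller budgets the poll timeout plus the RPC timeout before giving up itself. A self-contained sketch of that select-with-deadline shape, with a tokio watch channel standing in for the item table and its subscriptions:

use std::time::Duration;
use tokio::select;
use tokio::sync::watch;
use tokio::time::sleep;

// wait until the observed value changes, or give up after `timeout`,
// mirroring how the K2VRpc::PollItem handler is raced against a sleep
async fn poll(mut rx: watch::Receiver<u64>, timeout: Duration) -> Option<u64> {
    select! {
        res = rx.changed() => res.ok().map(|_| *rx.borrow()),
        _ = sleep(timeout) => None, // timed out: reply "no new value"
    }
}

#[tokio::main]
async fn main() {
    let (tx, rx) = watch::channel(0u64);
    tokio::spawn(async move {
        sleep(Duration::from_millis(10)).await;
        tx.send(1).unwrap();
    });
    assert_eq!(poll(rx, Duration::from_millis(100)).await, Some(1));
}
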
diff --git a/src/model/key_table.rs b/src/model/key_table.rs
index 330e83f0..9d2fc783 100644
--- a/src/model/key_table.rs
+++ b/src/model/key_table.rs
@@ -6,10 +6,10 @@ use garage_util::data::*;
use crate::permission::BucketKeyPerm;
-use garage_model_050::key_table as old;
+use crate::prev::v051::key_table as old;
/// An api key
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct Key {
/// The id of the key (immutable), used as partition key
pub key_id: String,
@@ -19,7 +19,7 @@ pub struct Key {
}
/// Configuration for a key
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct KeyParams {
/// The secret_key associated (immutable)
pub secret_key: String,
diff --git a/src/model/lib.rs b/src/model/lib.rs
index 05a4cdc7..4f20ea46 100644
--- a/src/model/lib.rs
+++ b/src/model/lib.rs
@@ -1,14 +1,20 @@
#[macro_use]
extern crate tracing;
+// For migration from previous versions
+pub(crate) mod prev;
+
pub mod permission;
-pub mod block_ref_table;
+pub mod index_counter;
+
pub mod bucket_alias_table;
pub mod bucket_table;
pub mod key_table;
-pub mod object_table;
-pub mod version_table;
+
+#[cfg(feature = "k2v")]
+pub mod k2v;
+pub mod s3;
pub mod garage;
pub mod helper;
diff --git a/src/model/migrate.rs b/src/model/migrate.rs
index 7e61957a..cd6ad26a 100644
--- a/src/model/migrate.rs
+++ b/src/model/migrate.rs
@@ -5,7 +5,7 @@ use garage_util::data::*;
use garage_util::error::Error as GarageError;
use garage_util::time::*;
-use garage_model_050::bucket_table as old_bucket;
+use crate::prev::v051::bucket_table as old_bucket;
use crate::bucket_alias_table::*;
use crate::bucket_table::*;
@@ -25,11 +25,15 @@ impl Migrate {
.open_tree("bucket:table")
.map_err(GarageError::from)?;
- for res in tree.iter() {
+ let mut old_buckets = vec![];
+ for res in tree.iter().map_err(GarageError::from)? {
let (_k, v) = res.map_err(GarageError::from)?;
let bucket = rmp_serde::decode::from_read_ref::<_, old_bucket::Bucket>(&v[..])
.map_err(GarageError::from)?;
+ old_buckets.push(bucket);
+ }
+ for bucket in old_buckets {
if let old_bucket::BucketState::Present(p) = bucket.state.get() {
self.migrate_buckets050_do_bucket(&bucket, p).await?;
}
@@ -73,6 +77,7 @@ impl Migrate {
local_aliases: LwwMap::new(),
website_config: Lww::new(website),
cors_config: Lww::new(None),
+ quotas: Lww::new(Default::default()),
}),
})
.await?;
diff --git a/src/model/prev/mod.rs b/src/model/prev/mod.rs
new file mode 100644
index 00000000..68bb1502
--- /dev/null
+++ b/src/model/prev/mod.rs
@@ -0,0 +1 @@
+pub(crate) mod v051;
diff --git a/src/model/prev/v051/bucket_table.rs b/src/model/prev/v051/bucket_table.rs
new file mode 100644
index 00000000..628a49dd
--- /dev/null
+++ b/src/model/prev/v051/bucket_table.rs
@@ -0,0 +1,63 @@
+use serde::{Deserialize, Serialize};
+
+use garage_table::crdt::Crdt;
+use garage_table::*;
+
+use super::key_table::PermissionSet;
+
+/// A bucket is a collection of objects
+///
+/// Its parameters are not directly accessible as:
+/// - It must be possible to merge parameters, hence the use of a LWW CRDT.
+/// - A bucket has 2 states, Present or Deleted, and parameters only make sense if it is present.
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct Bucket {
+ /// Name of the bucket
+ pub name: String,
+ /// State, and configuration if not deleted, of the bucket
+ pub state: crdt::Lww<BucketState>,
+}
+
+/// State of a bucket
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub enum BucketState {
+ /// The bucket is deleted
+ Deleted,
+ /// The bucket exists
+ Present(BucketParams),
+}
+
+impl Crdt for BucketState {
+ fn merge(&mut self, o: &Self) {
+ match o {
+ BucketState::Deleted => *self = BucketState::Deleted,
+ BucketState::Present(other_params) => {
+ if let BucketState::Present(params) = self {
+ params.merge(other_params);
+ }
+ }
+ }
+ }
+}
+
+/// Configuration for a bucket
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct BucketParams {
+ /// Map of key with access to the bucket, and what kind of access they give
+ pub authorized_keys: crdt::LwwMap<String, PermissionSet>,
+ /// Is the bucket served as http
+ pub website: crdt::Lww<bool>,
+}
+
+impl Crdt for BucketParams {
+ fn merge(&mut self, o: &Self) {
+ self.authorized_keys.merge(&o.authorized_keys);
+ self.website.merge(&o.website);
+ }
+}
+
+impl Crdt for Bucket {
+ fn merge(&mut self, other: &Self) {
+ self.state.merge(&other.state);
+ }
+}
diff --git a/src/model/prev/v051/key_table.rs b/src/model/prev/v051/key_table.rs
new file mode 100644
index 00000000..37516b1c
--- /dev/null
+++ b/src/model/prev/v051/key_table.rs
@@ -0,0 +1,50 @@
+use serde::{Deserialize, Serialize};
+
+use garage_table::crdt::*;
+use garage_table::*;
+
+/// An api key
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct Key {
+ /// The id of the key (immutable), used as partition key
+ pub key_id: String,
+
+ /// The secret_key associated
+ pub secret_key: String,
+
+ /// Name for the key
+ pub name: crdt::Lww<String>,
+
+ /// Is the key deleted
+ pub deleted: crdt::Bool,
+
+ /// Buckets in which the key is authorized. Empty if `Key` is deleted
+ // CRDT interaction: deleted implies authorized_buckets is empty
+ pub authorized_buckets: crdt::LwwMap<String, PermissionSet>,
+}
+
+/// Permission given to a key in a bucket
+#[derive(PartialOrd, Ord, PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct PermissionSet {
+ /// The key can be used to read the bucket
+ pub allow_read: bool,
+ /// The key can be used to write in the bucket
+ pub allow_write: bool,
+}
+
+impl AutoCrdt for PermissionSet {
+ const WARN_IF_DIFFERENT: bool = true;
+}
+
+impl Crdt for Key {
+ fn merge(&mut self, other: &Self) {
+ self.name.merge(&other.name);
+ self.deleted.merge(&other.deleted);
+
+ if self.deleted.get() {
+ self.authorized_buckets.clear();
+ } else {
+ self.authorized_buckets.merge(&other.authorized_buckets);
+ }
+ }
+}
diff --git a/src/model/prev/v051/mod.rs b/src/model/prev/v051/mod.rs
new file mode 100644
index 00000000..7a954752
--- /dev/null
+++ b/src/model/prev/v051/mod.rs
@@ -0,0 +1,4 @@
+pub(crate) mod bucket_table;
+pub(crate) mod key_table;
+pub(crate) mod object_table;
+pub(crate) mod version_table;
diff --git a/src/model/prev/v051/object_table.rs b/src/model/prev/v051/object_table.rs
new file mode 100644
index 00000000..e79e5787
--- /dev/null
+++ b/src/model/prev/v051/object_table.rs
@@ -0,0 +1,149 @@
+use serde::{Deserialize, Serialize};
+use std::collections::BTreeMap;
+
+use garage_util::data::*;
+
+use garage_table::crdt::*;
+
+/// An object
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct Object {
+ /// The bucket in which the object is stored, used as partition key
+ pub bucket: String,
+
+ /// The key at which the object is stored in its bucket, used as sorting key
+ pub key: String,
+
+	/// The list of currently stored versions of the object
+ versions: Vec<ObjectVersion>,
+}
+
+impl Object {
+ /// Get a list of currently stored versions of `Object`
+ pub fn versions(&self) -> &[ObjectVersion] {
+ &self.versions[..]
+ }
+}
+
+/// Information about a version of an object
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct ObjectVersion {
+ /// Id of the version
+ pub uuid: Uuid,
+ /// Timestamp of when the object was created
+ pub timestamp: u64,
+ /// State of the version
+ pub state: ObjectVersionState,
+}
+
+/// State of an object version
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub enum ObjectVersionState {
+ /// The version is being received
+ Uploading(ObjectVersionHeaders),
+ /// The version is fully received
+ Complete(ObjectVersionData),
+	/// The uploaded version contained errors, or the upload was explicitly aborted
+ Aborted,
+}
+
+impl Crdt for ObjectVersionState {
+ fn merge(&mut self, other: &Self) {
+ use ObjectVersionState::*;
+ match other {
+ Aborted => {
+ *self = Aborted;
+ }
+ Complete(b) => match self {
+ Aborted => {}
+ Complete(a) => {
+ a.merge(b);
+ }
+ Uploading(_) => {
+ *self = Complete(b.clone());
+ }
+ },
+ Uploading(_) => {}
+ }
+ }
+}
+
+/// Data stored in object version
+#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+pub enum ObjectVersionData {
+	/// The object was deleted; this version is a tombstone marking it as such
+	DeleteMarker,
+	/// The object is small; it is stored inline
+	Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec<u8>),
+	/// The object is large; the hash of its first block is stored here, and the hashes
+	/// of the next blocks are stored in the version table
+ FirstBlock(ObjectVersionMeta, Hash),
+}
+
+impl AutoCrdt for ObjectVersionData {
+ const WARN_IF_DIFFERENT: bool = true;
+}
+
+/// Metadata about the object version
+#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+pub struct ObjectVersionMeta {
+ /// Headers to send to the client
+ pub headers: ObjectVersionHeaders,
+ /// Size of the object
+ pub size: u64,
+ /// etag of the object
+ pub etag: String,
+}
+
+/// Additional headers for an object
+#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+pub struct ObjectVersionHeaders {
+ /// Content type of the object
+ pub content_type: String,
+ /// Any other http headers to send
+ pub other: BTreeMap<String, String>,
+}
+
+impl ObjectVersion {
+ fn cmp_key(&self) -> (u64, Uuid) {
+ (self.timestamp, self.uuid)
+ }
+
+ /// Is the object version completely received
+ pub fn is_complete(&self) -> bool {
+ matches!(self.state, ObjectVersionState::Complete(_))
+ }
+}
+
+impl Crdt for Object {
+ fn merge(&mut self, other: &Self) {
+ // Merge versions from other into here
+ for other_v in other.versions.iter() {
+ match self
+ .versions
+ .binary_search_by(|v| v.cmp_key().cmp(&other_v.cmp_key()))
+ {
+ Ok(i) => {
+ self.versions[i].state.merge(&other_v.state);
+ }
+ Err(i) => {
+ self.versions.insert(i, other_v.clone());
+ }
+ }
+ }
+
+ // Remove versions which are obsolete, i.e. those that come
+ // before the last version which .is_complete().
+ let last_complete = self
+ .versions
+ .iter()
+ .enumerate()
+ .rev()
+ .find(|(_, v)| v.is_complete())
+ .map(|(vi, _)| vi);
+
+ if let Some(last_vi) = last_complete {
+ self.versions = self.versions.drain(last_vi..).collect::<Vec<_>>();
+ }
+ }
+}
diff --git a/src/model/prev/v051/version_table.rs b/src/model/prev/v051/version_table.rs
new file mode 100644
index 00000000..c11c62d5
--- /dev/null
+++ b/src/model/prev/v051/version_table.rs
@@ -0,0 +1,79 @@
+use serde::{Deserialize, Serialize};
+
+use garage_util::data::*;
+
+use garage_table::crdt::*;
+use garage_table::*;
+
+/// A version of an object
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct Version {
+ /// UUID of the version, used as partition key
+ pub uuid: Uuid,
+
+ // Actual data: the blocks for this version
+ // In the case of a multipart upload, also store the etags
+ // of individual parts and check them when doing CompleteMultipartUpload
+ /// Is this version deleted
+ pub deleted: crdt::Bool,
+ /// list of blocks of data composing the version
+ pub blocks: crdt::Map<VersionBlockKey, VersionBlock>,
+ /// Etag of each part in case of a multipart upload, empty otherwise
+ pub parts_etags: crdt::Map<u64, String>,
+
+ // Back link to bucket+key so that we can figure if
+ // this was deleted later on
+ /// Bucket in which the related object is stored
+ pub bucket: String,
+ /// Key in which the related object is stored
+ pub key: String,
+}
+
+#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
+pub struct VersionBlockKey {
+ /// Number of the part
+ pub part_number: u64,
+ /// Offset of this sub-segment in its part
+ pub offset: u64,
+}
+
+impl Ord for VersionBlockKey {
+ fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+ self.part_number
+ .cmp(&other.part_number)
+ .then(self.offset.cmp(&other.offset))
+ }
+}
+
+impl PartialOrd for VersionBlockKey {
+ fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+ Some(self.cmp(other))
+ }
+}
+
+/// Information about a single block
+#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize, Deserialize)]
+pub struct VersionBlock {
+ /// Blake2 sum of the block
+ pub hash: Hash,
+ /// Size of the block
+ pub size: u64,
+}
+
+impl AutoCrdt for VersionBlock {
+ const WARN_IF_DIFFERENT: bool = true;
+}
+
+impl Crdt for Version {
+ fn merge(&mut self, other: &Self) {
+ self.deleted.merge(&other.deleted);
+
+ if self.deleted.get() {
+ self.blocks.clear();
+ self.parts_etags.clear();
+ } else {
+ self.blocks.merge(&other.blocks);
+ self.parts_etags.merge(&other.parts_etags);
+ }
+ }
+}
diff --git a/src/model/block_ref_table.rs b/src/model/s3/block_ref_table.rs
index b6945403..c7017409 100644
--- a/src/model/block_ref_table.rs
+++ b/src/model/s3/block_ref_table.rs
@@ -1,6 +1,8 @@
use serde::{Deserialize, Serialize};
use std::sync::Arc;
+use garage_db as db;
+
use garage_util::data::*;
use garage_table::crdt::Crdt;
@@ -8,7 +10,7 @@ use garage_table::*;
use garage_block::manager::*;
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct BlockRef {
/// Hash (blake2 sum) of the block, used as partition key
pub block: Hash,
@@ -51,21 +53,22 @@ impl TableSchema for BlockRefTable {
type E = BlockRef;
type Filter = DeletedFilter;
- fn updated(&self, old: Option<Self::E>, new: Option<Self::E>) {
- #[allow(clippy::or_fun_call)]
- let block = &old.as_ref().or(new.as_ref()).unwrap().block;
- let was_before = old.as_ref().map(|x| !x.deleted.get()).unwrap_or(false);
- let is_after = new.as_ref().map(|x| !x.deleted.get()).unwrap_or(false);
+ fn updated(
+ &self,
+ tx: &mut db::Transaction,
+ old: Option<&Self::E>,
+ new: Option<&Self::E>,
+ ) -> db::TxOpResult<()> {
+ let block = old.or(new).unwrap().block;
+ let was_before = old.map(|x| !x.deleted.get()).unwrap_or(false);
+ let is_after = new.map(|x| !x.deleted.get()).unwrap_or(false);
if is_after && !was_before {
- if let Err(e) = self.block_manager.block_incref(block) {
- warn!("block_incref failed for block {:?}: {}", block, e);
- }
+ self.block_manager.block_incref(tx, block)?;
}
if was_before && !is_after {
- if let Err(e) = self.block_manager.block_decref(block) {
- warn!("block_decref failed for block {:?}: {}", block, e);
- }
+ self.block_manager.block_decref(tx, block)?;
}
+ Ok(())
}
fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
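
The reworked `updated` hook turns the before/after deleted flags into a refcount action executed in the same database transaction as the table write, instead of the old best-effort calls. A self-contained sketch of the transition rule:

#[derive(Debug, PartialEq)]
enum RefAction {
    Incref,
    Decref,
    None,
}

// was_before / is_after: whether a live (non-deleted) BlockRef exists
// before and after the update, as computed in BlockRefTable::updated
fn ref_action(was_before: bool, is_after: bool) -> RefAction {
    match (was_before, is_after) {
        (false, true) => RefAction::Incref, // a reference appears
        (true, false) => RefAction::Decref, // a reference disappears
        _ => RefAction::None,               // no transition either way
    }
}

fn main() {
    assert_eq!(ref_action(false, true), RefAction::Incref);
    assert_eq!(ref_action(true, true), RefAction::None);
    assert_eq!(ref_action(true, false), RefAction::Decref);
}
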
diff --git a/src/model/s3/mod.rs b/src/model/s3/mod.rs
new file mode 100644
index 00000000..4e94337d
--- /dev/null
+++ b/src/model/s3/mod.rs
@@ -0,0 +1,3 @@
+pub mod block_ref_table;
+pub mod object_table;
+pub mod version_table;
diff --git a/src/model/object_table.rs b/src/model/s3/object_table.rs
index da53878e..26ff57f6 100644
--- a/src/model/object_table.rs
+++ b/src/model/s3/object_table.rs
@@ -2,6 +2,8 @@ use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
use std::sync::Arc;
+use garage_db as db;
+
use garage_util::background::BackgroundRunner;
use garage_util::data::*;
@@ -9,12 +11,17 @@ use garage_table::crdt::*;
use garage_table::replication::TableShardedReplication;
use garage_table::*;
-use crate::version_table::*;
+use crate::index_counter::*;
+use crate::s3::version_table::*;
+
+use crate::prev::v051::object_table as old;
-use garage_model_050::object_table as old;
+pub const OBJECTS: &str = "objects";
+pub const UNFINISHED_UPLOADS: &str = "unfinished_uploads";
+pub const BYTES: &str = "bytes";
/// An object
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct Object {
/// The bucket in which the object is stored, used as partition key
pub bucket_id: Uuid,
@@ -63,7 +70,7 @@ impl Object {
}
/// Informations about a version of an object
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct ObjectVersion {
/// Id of the version
pub uuid: Uuid,
@@ -74,7 +81,7 @@ pub struct ObjectVersion {
}
/// State of an object version
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub enum ObjectVersionState {
/// The version is being received
Uploading(ObjectVersionHeaders),
@@ -216,6 +223,7 @@ impl Crdt for Object {
pub struct ObjectTable {
pub background: Arc<BackgroundRunner>,
pub version_table: Arc<Table<VersionTable, TableShardedReplication>>,
+ pub object_counter_table: Arc<IndexCounter<Object>>,
}
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
@@ -232,8 +240,26 @@ impl TableSchema for ObjectTable {
type E = Object;
type Filter = ObjectFilter;
- fn updated(&self, old: Option<Self::E>, new: Option<Self::E>) {
+ fn updated(
+ &self,
+ tx: &mut db::Transaction,
+ old: Option<&Self::E>,
+ new: Option<&Self::E>,
+ ) -> db::TxOpResult<()> {
+ // 1. Count
+ let counter_res = self.object_counter_table.count(tx, old, new);
+ if let Err(e) = db::unabort(counter_res)? {
+ error!(
+ "Unable to update object counter: {}. Index values will be wrong!",
+ e
+ );
+ }
+
+ // 2. Spawn threads that propagates deletions to version table
let version_table = self.version_table.clone();
+ let old = old.cloned();
+ let new = new.cloned();
+
self.background.spawn(async move {
if let (Some(old_v), Some(new_v)) = (old, new) {
// Propagate deletion of old versions
@@ -256,7 +282,8 @@ impl TableSchema for ObjectTable {
}
}
Ok(())
- })
+ });
+ Ok(())
}
fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
@@ -272,6 +299,49 @@ impl TableSchema for ObjectTable {
}
}
+impl CountedItem for Object {
+ const COUNTER_TABLE_NAME: &'static str = "bucket_object_counter";
+
+ // Partition key = bucket id
+ type CP = Uuid;
+ // Sort key = nothing
+ type CS = EmptyKey;
+
+ fn counter_partition_key(&self) -> &Uuid {
+ &self.bucket_id
+ }
+ fn counter_sort_key(&self) -> &EmptyKey {
+ &EmptyKey
+ }
+
+ fn counts(&self) -> Vec<(&'static str, i64)> {
+ let versions = self.versions();
+ let n_objects = if versions.iter().any(|v| v.is_data()) {
+ 1
+ } else {
+ 0
+ };
+ let n_unfinished_uploads = versions
+ .iter()
+ .filter(|v| matches!(v.state, ObjectVersionState::Uploading(_)))
+ .count();
+ let n_bytes = versions
+ .iter()
+ .map(|v| match &v.state {
+ ObjectVersionState::Complete(ObjectVersionData::Inline(meta, _))
+ | ObjectVersionState::Complete(ObjectVersionData::FirstBlock(meta, _)) => meta.size,
+ _ => 0,
+ })
+ .sum::<u64>();
+
+ vec![
+ (OBJECTS, n_objects),
+ (UNFINISHED_UPLOADS, n_unfinished_uploads as i64),
+ (BYTES, n_bytes as i64),
+ ]
+ }
+}
+
// vvvvvvvv migration code, stupid stuff vvvvvvvvvvvv
// (we just want to change bucket into bucket_id by hashing it)
diff --git a/src/model/version_table.rs b/src/model/s3/version_table.rs
index 839b1f4f..6bc2ecd1 100644
--- a/src/model/version_table.rs
+++ b/src/model/s3/version_table.rs
@@ -1,6 +1,8 @@
use serde::{Deserialize, Serialize};
use std::sync::Arc;
+use garage_db as db;
+
use garage_util::background::BackgroundRunner;
use garage_util::data::*;
@@ -8,12 +10,12 @@ use garage_table::crdt::*;
use garage_table::replication::TableShardedReplication;
use garage_table::*;
-use crate::block_ref_table::*;
+use crate::s3::block_ref_table::*;
-use garage_model_050::version_table as old;
+use crate::prev::v051::version_table as old;
/// A version of an object
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub struct Version {
/// UUID of the version, used as partition key
pub uuid: Uuid,
@@ -137,8 +139,16 @@ impl TableSchema for VersionTable {
type E = Version;
type Filter = DeletedFilter;
- fn updated(&self, old: Option<Self::E>, new: Option<Self::E>) {
+ fn updated(
+ &self,
+ _tx: &mut db::Transaction,
+ old: Option<&Self::E>,
+ new: Option<&Self::E>,
+ ) -> db::TxOpResult<()> {
let block_ref_table = self.block_ref_table.clone();
+ let old = old.cloned();
+ let new = new.cloned();
+
self.background.spawn(async move {
if let (Some(old_v), Some(new_v)) = (old, new) {
// Propagate deletion of version blocks
@@ -157,7 +167,9 @@ impl TableSchema for VersionTable {
}
}
Ok(())
- })
+ });
+
+ Ok(())
}
fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml
index 654c1dc6..5bb6aae0 100644
--- a/src/rpc/Cargo.toml
+++ b/src/rpc/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "garage_rpc"
-version = "0.7.0"
+version = "0.8.0"
authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018"
license = "AGPL-3.0"
@@ -14,8 +14,7 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
-garage_util = { version = "0.7.0", path = "../util" }
-garage_admin = { version = "0.7.0", path = "../admin" }
+garage_util = { version = "0.8.0", path = "../util" }
arc-swap = "1.0"
bytes = "1.0"
@@ -47,11 +46,11 @@ tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi
tokio-stream = { version = "0.1", features = ["net"] }
opentelemetry = "0.17"
-#netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" }
-#netapp = { version = "0.4", path = "../../../netapp", features = ["telemetry"] }
-netapp = { version = "0.4.2", features = ["telemetry"] }
+netapp = { version = "0.5.2", features = ["telemetry"] }
hyper = { version = "0.14", features = ["client", "http1", "runtime", "tcp"] }
+
[features]
kubernetes-discovery = [ "kube", "k8s-openapi", "openssl", "schemars" ]
+system-libs = [ "sodiumoxide/use-pkg-config" ]
diff --git a/src/rpc/kubernetes.rs b/src/rpc/kubernetes.rs
index 939a0eed..197245aa 100644
--- a/src/rpc/kubernetes.rs
+++ b/src/rpc/kubernetes.rs
@@ -56,7 +56,7 @@ pub async fn get_kubernetes_nodes(
let mut ret = Vec::with_capacity(nodes.items.len());
for node in nodes {
- println!("Found Pod: {:?}", node.metadata.name);
+ info!("Found Pod: {:?}", node.metadata.name);
let pubkey = &node
.metadata
diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs
index a878f19c..16d573c7 100644
--- a/src/rpc/layout.rs
+++ b/src/rpc/layout.rs
@@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize};
use garage_util::crdt::{AutoCrdt, Crdt, LwwMap};
use garage_util::data::*;
+use garage_util::error::*;
use crate::graph_algo::*;
@@ -144,6 +145,61 @@ impl ClusterLayout {
}
}
+ pub fn apply_staged_changes(mut self, version: Option<u64>) -> Result<Self, Error> {
+ match version {
+ None => {
+ let error = r#"
+Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout.
+To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes.
+ "#;
+ return Err(Error::Message(error.into()));
+ }
+ Some(v) => {
+ if v != self.version + 1 {
+ return Err(Error::Message("Invalid new layout version".into()));
+ }
+ }
+ }
+
+ self.roles.merge(&self.staging);
+ self.roles.retain(|(_, _, v)| v.0.is_some());
+
+ if !self.calculate_partition_assignation() {
+ return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into()));
+ }
+
+ self.staging.clear();
+ self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]);
+
+ self.version += 1;
+
+ Ok(self)
+ }
+
+ pub fn revert_staged_changes(mut self, version: Option<u64>) -> Result<Self, Error> {
+ match version {
+ None => {
+ let error = r#"
+Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout.
+To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes.
+ "#;
+ return Err(Error::Message(error.into()));
+ }
+ Some(v) => {
+ if v != self.version + 1 {
+ return Err(Error::Message("Invalid new layout version".into()));
+ }
+ }
+ }
+
+ self.staging.clear();
+ self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]);
+
+ self.version += 1;
+
+ Ok(self)
+ }
+
/// Returns a list of IDs of nodes that currently have
/// a role in the cluster
pub fn node_ids(&self) -> &[Uuid] {
diff --git a/src/rpc/metrics.rs b/src/rpc/metrics.rs
index c900518c..61f8fa79 100644
--- a/src/rpc/metrics.rs
+++ b/src/rpc/metrics.rs
@@ -1,31 +1,18 @@
-use std::sync::Arc;
-
use opentelemetry::{global, metrics::*};
-use tokio::sync::Semaphore;
/// RpcMetrics references all counters used for metrics
pub struct RpcMetrics {
- pub(crate) _rpc_available_permits: ValueObserver<u64>,
-
pub(crate) rpc_counter: Counter<u64>,
pub(crate) rpc_timeout_counter: Counter<u64>,
pub(crate) rpc_netapp_error_counter: Counter<u64>,
pub(crate) rpc_garage_error_counter: Counter<u64>,
pub(crate) rpc_duration: ValueRecorder<f64>,
- pub(crate) rpc_queueing_time: ValueRecorder<f64>,
}
impl RpcMetrics {
- pub fn new(sem: Arc<Semaphore>) -> Self {
+ pub fn new() -> Self {
let meter = global::meter("garage_rpc");
RpcMetrics {
- _rpc_available_permits: meter
- .u64_value_observer("rpc.available_permits", move |observer| {
- observer.observe(sem.available_permits() as u64, &[])
- })
- .with_description("Number of available RPC permits")
- .init(),
-
rpc_counter: meter
.u64_counter("rpc.request_counter")
.with_description("Number of RPC requests emitted")
@@ -46,10 +33,6 @@ impl RpcMetrics {
.f64_value_recorder("rpc.duration")
.with_description("Duration of RPCs")
.init(),
- rpc_queueing_time: meter
- .f64_value_recorder("rpc.queueing_time")
- .with_description("Time RPC requests were queued for before being sent")
- .init(),
}
}
}
diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs
index 34717d3b..949aced6 100644
--- a/src/rpc/rpc_helper.rs
+++ b/src/rpc/rpc_helper.rs
@@ -7,7 +7,7 @@ use futures::stream::futures_unordered::FuturesUnordered;
use futures::stream::StreamExt;
use futures_util::future::FutureExt;
use tokio::select;
-use tokio::sync::{watch, Semaphore};
+use tokio::sync::watch;
use opentelemetry::KeyValue;
use opentelemetry::{
@@ -15,10 +15,14 @@ use opentelemetry::{
Context,
};
-pub use netapp::endpoint::{Endpoint, EndpointHandler, Message as Rpc};
+pub use netapp::endpoint::{Endpoint, EndpointHandler, StreamingEndpointHandler};
+use netapp::message::IntoReq;
+pub use netapp::message::{
+ Message as Rpc, OrderTag, Req, RequestPriority, Resp, PRIO_BACKGROUND, PRIO_HIGH, PRIO_NORMAL,
+ PRIO_SECONDARY,
+};
use netapp::peering::fullmesh::FullMeshPeeringStrategy;
-pub use netapp::proto::*;
-pub use netapp::{NetApp, NodeID};
+pub use netapp::{self, NetApp, NodeID};
use garage_util::background::BackgroundRunner;
use garage_util::data::*;
@@ -28,34 +32,37 @@ use garage_util::metrics::RecordDuration;
use crate::metrics::RpcMetrics;
use crate::ring::Ring;
-const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
-
-// Try to never have more than 200MB of outgoing requests
-// buffered at the same time. Other requests are queued until
-// space is freed.
-const REQUEST_BUFFER_SIZE: usize = 200 * 1024 * 1024;
+// Default RPC timeout = 5 minutes
+const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300);
/// Strategy to apply when making an RPC
#[derive(Copy, Clone)]
pub struct RequestStrategy {
- /// Max time to wait for reponse
- pub rs_timeout: Duration,
/// Min number of responses to consider the request successful
pub rs_quorum: Option<usize>,
/// Should requests be dropped after enough responses are received
pub rs_interrupt_after_quorum: bool,
/// Request priority
pub rs_priority: RequestPriority,
+ /// Custom timeout for this request
+ rs_timeout: Timeout,
+}
+
+#[derive(Copy, Clone)]
+enum Timeout {
+ None,
+ Default,
+ Custom(Duration),
}
impl RequestStrategy {
/// Create a RequestStrategy with default timeout and not interrupting when quorum reached
pub fn with_priority(prio: RequestPriority) -> Self {
RequestStrategy {
- rs_timeout: DEFAULT_TIMEOUT,
rs_quorum: None,
rs_interrupt_after_quorum: false,
rs_priority: prio,
+ rs_timeout: Timeout::Default,
}
}
/// Set quorum to be reached for request
@@ -63,17 +70,22 @@ impl RequestStrategy {
self.rs_quorum = Some(quorum);
self
}
- /// Set timeout of the strategy
- pub fn with_timeout(mut self, timeout: Duration) -> Self {
- self.rs_timeout = timeout;
- self
- }
/// Set if requests can be dropped after quorum has been reached
/// In general true for read requests, and false for write
pub fn interrupt_after_quorum(mut self, interrupt: bool) -> Self {
self.rs_interrupt_after_quorum = interrupt;
self
}
+ /// Deactivate timeout for this request
+ pub fn without_timeout(mut self) -> Self {
+ self.rs_timeout = Timeout::None;
+ self
+ }
+ /// Set custom timeout for this request
+ pub fn with_custom_timeout(mut self, timeout: Duration) -> Self {
+ self.rs_timeout = Timeout::Custom(timeout);
+ self
+ }
}
#[derive(Clone)]
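Since the per-request `with_timeout` builder is gone, timeouts are now expressed through the two new methods. A short sketch of the resulting API surface, using only the names defined above:

// Read path: stop as soon as 2 responses are in, within the global timeout.
let read = RequestStrategy::with_priority(PRIO_NORMAL)
	.with_quorum(2)
	.interrupt_after_quorum(true);

// Background transfer that may legitimately run for a long time.
let bg = RequestStrategy::with_priority(PRIO_BACKGROUND).without_timeout();

// Latency-sensitive probe that should fail fast, overriding the default.
let probe = RequestStrategy::with_priority(PRIO_HIGH)
	.with_custom_timeout(Duration::from_secs(2));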
@@ -84,8 +96,8 @@ struct RpcHelperInner {
fullmesh: Arc<FullMeshPeeringStrategy>,
background: Arc<BackgroundRunner>,
ring: watch::Receiver<Arc<Ring>>,
- request_buffer_semaphore: Arc<Semaphore>,
metrics: RpcMetrics,
+ rpc_timeout: Duration,
}
impl RpcHelper {
@@ -94,45 +106,35 @@ impl RpcHelper {
fullmesh: Arc<FullMeshPeeringStrategy>,
background: Arc<BackgroundRunner>,
ring: watch::Receiver<Arc<Ring>>,
+ rpc_timeout: Option<Duration>,
) -> Self {
- let sem = Arc::new(Semaphore::new(REQUEST_BUFFER_SIZE));
-
- let metrics = RpcMetrics::new(sem.clone());
+ let metrics = RpcMetrics::new();
Self(Arc::new(RpcHelperInner {
our_node_id,
fullmesh,
background,
ring,
- request_buffer_semaphore: sem,
metrics,
+ rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT),
}))
}
- pub async fn call<M, H, S>(
- &self,
- endpoint: &Endpoint<M, H>,
- to: Uuid,
- msg: M,
- strat: RequestStrategy,
- ) -> Result<S, Error>
- where
- M: Rpc<Response = Result<S, Error>>,
- H: EndpointHandler<M>,
- {
- self.call_arc(endpoint, to, Arc::new(msg), strat).await
+ pub fn rpc_timeout(&self) -> Duration {
+ self.0.rpc_timeout
}
- pub async fn call_arc<M, H, S>(
+ pub async fn call<M, N, H, S>(
&self,
endpoint: &Endpoint<M, H>,
to: Uuid,
- msg: Arc<M>,
+ msg: N,
strat: RequestStrategy,
) -> Result<S, Error>
where
M: Rpc<Response = Result<S, Error>>,
- H: EndpointHandler<M>,
+ N: IntoReq<M> + Send,
+ H: StreamingEndpointHandler<M>,
{
let metric_tags = [
KeyValue::new("rpc_endpoint", endpoint.path().to_string()),
@@ -140,29 +142,27 @@ impl RpcHelper {
KeyValue::new("to", format!("{:?}", to)),
];
- let msg_size = rmp_to_vec_all_named(&msg)?.len() as u32;
- let permit = self
- .0
- .request_buffer_semaphore
- .acquire_many(msg_size)
- .record_duration(&self.0.metrics.rpc_queueing_time, &metric_tags)
- .await?;
-
self.0.metrics.rpc_counter.add(1, &metric_tags);
let node_id = to.into();
let rpc_call = endpoint
- .call(&node_id, msg, strat.rs_priority)
+ .call_streaming(&node_id, msg, strat.rs_priority)
.record_duration(&self.0.metrics.rpc_duration, &metric_tags);
+ let timeout = async {
+ match strat.rs_timeout {
+ Timeout::None => futures::future::pending().await,
+ Timeout::Default => tokio::time::sleep(self.0.rpc_timeout).await,
+ Timeout::Custom(t) => tokio::time::sleep(t).await,
+ }
+ };
+
select! {
res = rpc_call => {
- drop(permit);
-
if res.is_err() {
self.0.metrics.rpc_netapp_error_counter.add(1, &metric_tags);
}
- let res = res?;
+ let res = res?.into_msg();
if res.is_err() {
self.0.metrics.rpc_garage_error_counter.add(1, &metric_tags);
@@ -170,46 +170,49 @@ impl RpcHelper {
Ok(res?)
}
- _ = tokio::time::sleep(strat.rs_timeout) => {
- drop(permit);
+ () = timeout => {
self.0.metrics.rpc_timeout_counter.add(1, &metric_tags);
Err(Error::Timeout)
}
}
}
- pub async fn call_many<M, H, S>(
+ pub async fn call_many<M, N, H, S>(
&self,
endpoint: &Endpoint<M, H>,
to: &[Uuid],
- msg: M,
+ msg: N,
strat: RequestStrategy,
- ) -> Vec<(Uuid, Result<S, Error>)>
+ ) -> Result<Vec<(Uuid, Result<S, Error>)>, Error>
where
M: Rpc<Response = Result<S, Error>>,
- H: EndpointHandler<M>,
+ N: IntoReq<M>,
+ H: StreamingEndpointHandler<M>,
{
- let msg = Arc::new(msg);
+ let msg = msg.into_req().map_err(netapp::error::Error::from)?;
+
let resps = join_all(
to.iter()
- .map(|to| self.call_arc(endpoint, *to, msg.clone(), strat)),
+ .map(|to| self.call(endpoint, *to, msg.clone(), strat)),
)
.await;
- to.iter()
+ Ok(to
+ .iter()
.cloned()
.zip(resps.into_iter())
- .collect::<Vec<_>>()
+ .collect::<Vec<_>>())
}
- pub async fn broadcast<M, H, S>(
+ pub async fn broadcast<M, N, H, S>(
&self,
endpoint: &Endpoint<M, H>,
- msg: M,
+ msg: N,
strat: RequestStrategy,
- ) -> Vec<(Uuid, Result<S, Error>)>
+ ) -> Result<Vec<(Uuid, Result<S, Error>)>, Error>
where
M: Rpc<Response = Result<S, Error>>,
- H: EndpointHandler<M>,
+ N: IntoReq<M>,
+ H: StreamingEndpointHandler<M>,
{
let to = self
.0
@@ -223,16 +226,17 @@ impl RpcHelper {
/// Make an RPC call to multiple servers, returning either a Vec of responses,
/// or an error if quorum could not be reached due to too many errors
- pub async fn try_call_many<M, H, S>(
+ pub async fn try_call_many<M, N, H, S>(
&self,
endpoint: &Arc<Endpoint<M, H>>,
to: &[Uuid],
- msg: M,
+ msg: N,
strategy: RequestStrategy,
) -> Result<Vec<S>, Error>
where
M: Rpc<Response = Result<S, Error>> + 'static,
- H: EndpointHandler<M> + 'static,
+ N: IntoReq<M>,
+ H: StreamingEndpointHandler<M> + 'static,
S: Send + 'static,
{
let quorum = strategy.rs_quorum.unwrap_or(to.len());
@@ -262,20 +266,21 @@ impl RpcHelper {
.await
}
- async fn try_call_many_internal<M, H, S>(
+ async fn try_call_many_internal<M, N, H, S>(
&self,
endpoint: &Arc<Endpoint<M, H>>,
to: &[Uuid],
- msg: M,
+ msg: N,
strategy: RequestStrategy,
quorum: usize,
) -> Result<Vec<S>, Error>
where
M: Rpc<Response = Result<S, Error>> + 'static,
- H: EndpointHandler<M> + 'static,
+ N: IntoReq<M>,
+ H: StreamingEndpointHandler<M> + 'static,
S: Send + 'static,
{
- let msg = Arc::new(msg);
+ let msg = msg.into_req().map_err(netapp::error::Error::from)?;
// Build future for each request
// They are not started now: they are added below in a FuturesUnordered
@@ -285,7 +290,7 @@ impl RpcHelper {
let msg = msg.clone();
let endpoint2 = endpoint.clone();
(to, async move {
- self2.call_arc(&endpoint2, to, msg, strategy).await
+ self2.call(&endpoint2, to, msg, strategy).await
})
});
@@ -299,47 +304,19 @@ impl RpcHelper {
// to reach a quorum, prioritizing nodes with the lowest latency.
// When there are errors, we start new requests to compensate.
- // Retrieve some status variables that we will use to sort requests
- let peer_list = self.0.fullmesh.get_peer_list();
- let ring: Arc<Ring> = self.0.ring.borrow().clone();
- let our_zone = match ring.layout.node_role(&self.0.our_node_id) {
- Some(pc) => &pc.zone,
- None => "",
- };
-
- // Augment requests with some information used to sort them.
- // The tuples are as follows:
- // (is another node?, is another zone?, latency, node ID, request future)
- // We store all of these tuples in a vec that we can sort.
- // By sorting this vec, we priorize ourself, then nodes in the same zone,
- // and within a same zone we priorize nodes with the lowest latency.
- let mut requests = requests
- .map(|(to, fut)| {
- let peer_zone = match ring.layout.node_role(&to) {
- Some(pc) => &pc.zone,
- None => "",
- };
- let peer_avg_ping = peer_list
- .iter()
- .find(|x| x.id.as_ref() == to.as_slice())
- .and_then(|pi| pi.avg_ping)
- .unwrap_or_else(|| Duration::from_secs(1));
- (
- to != self.0.our_node_id,
- peer_zone != our_zone,
- peer_avg_ping,
- to,
- fut,
- )
- })
+ // Reorder requests to prioritize closeness / low latency
+ let request_order = self.request_order(to);
+ let mut ord_requests = vec![(); request_order.len()]
+ .into_iter()
+ .map(|_| None)
.collect::<Vec<_>>();
-
- // Sort requests by (priorize ourself, priorize same zone, priorize low latency)
- requests
- .sort_by_key(|(diffnode, diffzone, ping, _to, _fut)| (*diffnode, *diffzone, *ping));
+ for (to, fut) in requests {
+ let i = request_order.iter().position(|x| *x == to).unwrap();
+ ord_requests[i] = Some((to, fut));
+ }
// Make an iterator to take requests in their sorted order
- let mut requests = requests.into_iter();
+ let mut requests = ord_requests.into_iter().map(Option::unwrap);
// resp_stream will contain all of the requests that are currently in flight.
// (for the moment none, they will be added in the loop below)
@@ -350,7 +327,7 @@ impl RpcHelper {
// If the current set of requests that are running is not enough to possibly
// reach quorum, start some new requests.
while successes.len() + resp_stream.len() < quorum {
- if let Some((_, _, _, req_to, fut)) = requests.next() {
+ if let Some((req_to, fut)) = requests.next() {
let tracer = opentelemetry::global::tracer("garage");
let span = tracer.start(format!("RPC to {:?}", req_to));
resp_stream.push(tokio::spawn(
@@ -420,4 +397,49 @@ impl RpcHelper {
Err(Error::Quorum(quorum, successes.len(), to.len(), errors))
}
}
+
+ pub fn request_order(&self, nodes: &[Uuid]) -> Vec<Uuid> {
+ // Retrieve some status variables that we will use to sort requests
+ let peer_list = self.0.fullmesh.get_peer_list();
+ let ring: Arc<Ring> = self.0.ring.borrow().clone();
+ let our_zone = match ring.layout.node_role(&self.0.our_node_id) {
+ Some(pc) => &pc.zone,
+ None => "",
+ };
+
+ // Augment the node list with some information used to sort it.
+ // The tuples are as follows:
+ // (is another node?, is another zone?, latency, node ID)
+ // We store all of these tuples in a vec that we can sort.
+ // By sorting this vec, we prioritize ourselves, then nodes in the same zone,
+ // and within the same zone we prioritize nodes with the lowest latency.
+ let mut nodes = nodes
+ .iter()
+ .map(|to| {
+ let peer_zone = match ring.layout.node_role(to) {
+ Some(pc) => &pc.zone,
+ None => "",
+ };
+ let peer_avg_ping = peer_list
+ .iter()
+ .find(|x| x.id.as_ref() == to.as_slice())
+ .and_then(|pi| pi.avg_ping)
+ .unwrap_or_else(|| Duration::from_secs(10));
+ (
+ *to != self.0.our_node_id,
+ peer_zone != our_zone,
+ peer_avg_ping,
+ *to,
+ )
+ })
+ .collect::<Vec<_>>();
+
+ // Sort nodes by (prioritize ourselves, prioritize same zone, prioritize low latency)
+ nodes.sort_by_key(|(diffnode, diffzone, ping, _to)| (*diffnode, *diffzone, *ping));
+
+ nodes
+ .into_iter()
+ .map(|(_, _, _, to)| to)
+ .collect::<Vec<_>>()
+ }
}
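The correctness of `request_order` rests on lexicographic tuple comparison: `false < true`, so the local node sorts first, then same-zone peers, then everything else by average ping (peers with no measured ping are penalized with the 10-second default above). A self-contained check of that ordering:

use std::time::Duration;

// (is another node?, is another zone?, avg ping)
let ourself   = (false, false, Duration::ZERO);
let same_zone = (true,  false, Duration::from_millis(80));
let far_zone  = (true,  true,  Duration::from_millis(10));
assert!(ourself < same_zone);
assert!(same_zone < far_zone); // zone dominates latency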
diff --git a/src/rpc/system.rs b/src/rpc/system.rs
index 34031b10..7eb25195 100644
--- a/src/rpc/system.rs
+++ b/src/rpc/system.rs
@@ -2,7 +2,7 @@
use std::collections::HashMap;
use std::io::{Read, Write};
use std::net::{IpAddr, SocketAddr};
-use std::path::Path;
+use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock};
use std::time::{Duration, Instant};
@@ -16,9 +16,9 @@ use tokio::sync::watch;
use tokio::sync::Mutex;
use netapp::endpoint::{Endpoint, EndpointHandler};
+use netapp::message::*;
use netapp::peering::fullmesh::FullMeshPeeringStrategy;
-use netapp::proto::*;
-use netapp::util::parse_and_resolve_peer_addr;
+use netapp::util::parse_and_resolve_peer_addr_async;
use netapp::{NetApp, NetworkKey, NodeID, NodeKey};
use garage_util::background::BackgroundRunner;
@@ -37,10 +37,11 @@ use crate::rpc_helper::*;
const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60);
const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10);
-const PING_TIMEOUT: Duration = Duration::from_secs(2);
-/// Version tag used for version check upon Netapp connection
-pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650007; // garage 0x0007
+/// Version tag used for version check upon Netapp connection.
+/// Cluster nodes with different version tags are deemed
+/// incompatible and will refuse to connect.
+pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650008; // garage 0x0008
/// RPC endpoint used for calls related to membership
pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc";
@@ -90,7 +91,7 @@ pub struct System {
rpc_listen_addr: SocketAddr,
rpc_public_addr: Option<SocketAddr>,
- bootstrap_peers: Vec<(NodeID, SocketAddr)>,
+ bootstrap_peers: Vec<String>,
consul_discovery: Option<ConsulDiscoveryParam>,
#[cfg(feature = "kubernetes-discovery")]
@@ -104,6 +105,9 @@ pub struct System {
/// The job runner of this node
pub background: Arc<BackgroundRunner>,
+
+ /// Path to metadata directory
+ pub metadata_dir: PathBuf,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -194,7 +198,7 @@ impl System {
replication_factor: usize,
zone_redundancy: usize,
config: &Config,
- ) -> Arc<Self> {
+ ) -> Result<Arc<Self>, Error> {
let node_key =
gen_node_key(&config.metadata_dir).expect("Unable to read or generate node ID");
info!(
@@ -202,11 +206,21 @@ impl System {
hex::encode(&node_key.public_key()[..8])
);
- let persist_cluster_layout = Persister::new(&config.metadata_dir, "cluster_layout");
+ let persist_cluster_layout: Persister<ClusterLayout> =
+ Persister::new(&config.metadata_dir, "cluster_layout");
let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list");
let cluster_layout = match persist_cluster_layout.load() {
- Ok(x) => x,
+ Ok(x) => {
+ if x.replication_factor != replication_factor {
+ return Err(Error::Message(format!(
+ "Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.",
+ x.replication_factor,
+ replication_factor
+ )));
+ }
+ x
+ }
Err(e) => {
info!(
"No valid previous cluster layout stored ({}), starting fresh.",
@@ -228,8 +242,29 @@ impl System {
let ring = Ring::new(cluster_layout, replication_factor);
let (update_ring, ring) = watch::channel(Arc::new(ring));
- let rpc_public_addr = match config.rpc_public_addr {
- Some(a) => Some(a),
+ let rpc_public_addr = match &config.rpc_public_addr {
+ Some(a_str) => {
+ use std::net::ToSocketAddrs;
+ match a_str.to_socket_addrs() {
+ Err(e) => {
+ error!(
+ "Cannot resolve rpc_public_addr {} from config file: {}.",
+ a_str, e
+ );
+ None
+ }
+ Ok(a) => {
+ let a = a.collect::<Vec<_>>();
+ if a.is_empty() {
+ error!("rpc_public_addr {} resolve to no known IP address", a_str);
+ }
+ if a.len() > 1 {
+ warn!("Multiple possible resolutions for rpc_public_addr: {:?}. Taking the first one.", a);
+ }
+ a.into_iter().next()
+ }
+ }
+ }
None => {
let addr =
get_default_ip().map(|ip| SocketAddr::new(ip, config.rpc_bind_addr.port()));
@@ -239,13 +274,15 @@ impl System {
addr
}
};
+ if rpc_public_addr.is_none() {
+ warn!("This Garage node does not know its publicly reachable RPC address, this might hamper intra-cluster communication.");
+ }
let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key);
- let fullmesh = FullMeshPeeringStrategy::new(
- netapp.clone(),
- config.bootstrap_peers.clone(),
- rpc_public_addr,
- );
+ let fullmesh = FullMeshPeeringStrategy::new(netapp.clone(), vec![], rpc_public_addr);
+ if let Some(ping_timeout) = config.rpc_ping_timeout_msec {
+ fullmesh.set_ping_timeout_millis(ping_timeout);
+ }
let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into());
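Since `rpc_public_addr` may now be a DNS name, the resolution above is just `ToSocketAddrs` plus first-result selection. A standalone sketch, using a placeholder hostname rather than any real deployment value:

use std::net::{SocketAddr, ToSocketAddrs};

// "garage.example.com:3901" is a placeholder host:port string.
let rpc_public_addr: Option<SocketAddr> = "garage.example.com:3901"
	.to_socket_addrs()
	.map(|addrs| addrs.collect::<Vec<_>>())
	.ok()
	.and_then(|a| a.into_iter().next());
// As in the code above, ambiguity is resolved by taking the first address.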
@@ -283,7 +320,13 @@ impl System {
node_status: RwLock::new(HashMap::new()),
netapp: netapp.clone(),
fullmesh: fullmesh.clone(),
- rpc: RpcHelper::new(netapp.id.into(), fullmesh, background.clone(), ring.clone()),
+ rpc: RpcHelper::new(
+ netapp.id.into(),
+ fullmesh,
+ background.clone(),
+ ring.clone(),
+ config.rpc_timeout_msec.map(Duration::from_millis),
+ ),
system_endpoint,
replication_factor,
rpc_listen_addr: config.rpc_bind_addr,
@@ -296,9 +339,10 @@ impl System {
ring,
update_ring: Mutex::new(update_ring),
background,
+ metadata_dir: config.metadata_dir.clone(),
});
sys.system_endpoint.set_handler(sys.clone());
- sys
+ Ok(sys)
}
/// Perform bootstraping, starting the ping loop
@@ -313,6 +357,80 @@ impl System {
);
}
+ // ---- Administrative operations (directly available and
+ // also available through RPC) ----
+
+ pub fn get_known_nodes(&self) -> Vec<KnownNodeInfo> {
+ let node_status = self.node_status.read().unwrap();
+ let known_nodes = self
+ .fullmesh
+ .get_peer_list()
+ .iter()
+ .map(|n| KnownNodeInfo {
+ id: n.id.into(),
+ addr: n.addr,
+ is_up: n.is_up(),
+ last_seen_secs_ago: n
+ .last_seen
+ .map(|t| (Instant::now().saturating_duration_since(t)).as_secs()),
+ status: node_status
+ .get(&n.id.into())
+ .cloned()
+ .map(|(_, st)| st)
+ .unwrap_or(NodeStatus {
+ hostname: "?".to_string(),
+ replication_factor: 0,
+ cluster_layout_version: 0,
+ cluster_layout_staging_hash: Hash::from([0u8; 32]),
+ }),
+ })
+ .collect::<Vec<_>>();
+ known_nodes
+ }
+
+ pub fn get_cluster_layout(&self) -> ClusterLayout {
+ self.ring.borrow().layout.clone()
+ }
+
+ pub async fn update_cluster_layout(
+ self: &Arc<Self>,
+ layout: &ClusterLayout,
+ ) -> Result<(), Error> {
+ self.handle_advertise_cluster_layout(layout).await?;
+ Ok(())
+ }
+
+ pub async fn connect(&self, node: &str) -> Result<(), Error> {
+ let (pubkey, addrs) = parse_and_resolve_peer_addr_async(node)
+ .await
+ .ok_or_else(|| {
+ Error::Message(format!(
+ "Unable to parse or resolve node specification: {}",
+ node
+ ))
+ })?;
+ let mut errors = vec![];
+ for ip in addrs.iter() {
+ match self
+ .netapp
+ .clone()
+ .try_connect(*ip, pubkey)
+ .await
+ .err_context(CONNECT_ERROR_MESSAGE)
+ {
+ Ok(()) => return Ok(()),
+ Err(e) => {
+ errors.push((*ip, e));
+ }
+ }
+ }
+ if errors.len() == 1 {
+ Err(Error::Message(errors[0].1.to_string()))
+ } else {
+ Err(Error::Message(format!("{:?}", errors)))
+ }
+ }
+
// ---- INTERNALS ----
async fn advertise_to_consul(self: Arc<Self>) -> Result<(), Error> {
@@ -385,32 +503,11 @@ impl System {
self.local_status.swap(Arc::new(new_si));
}
+ // --- RPC HANDLERS ---
+
async fn handle_connect(&self, node: &str) -> Result<SystemRpc, Error> {
- let (pubkey, addrs) = parse_and_resolve_peer_addr(node).ok_or_else(|| {
- Error::Message(format!(
- "Unable to parse or resolve node specification: {}",
- node
- ))
- })?;
- let mut errors = vec![];
- for ip in addrs.iter() {
- match self
- .netapp
- .clone()
- .try_connect(*ip, pubkey)
- .await
- .err_context(CONNECT_ERROR_MESSAGE)
- {
- Ok(()) => return Ok(SystemRpc::Ok),
- Err(e) => {
- errors.push((*ip, e));
- }
- }
- }
- return Err(Error::Message(format!(
- "Could not connect to specified peers. Errors: {:?}",
- errors
- )));
+ self.connect(node).await?;
+ Ok(SystemRpc::Ok)
}
fn handle_pull_cluster_layout(&self) -> SystemRpc {
@@ -419,28 +516,7 @@ impl System {
}
fn handle_get_known_nodes(&self) -> SystemRpc {
- let node_status = self.node_status.read().unwrap();
- let known_nodes = self
- .fullmesh
- .get_peer_list()
- .iter()
- .map(|n| KnownNodeInfo {
- id: n.id.into(),
- addr: n.addr,
- is_up: n.is_up(),
- last_seen_secs_ago: n.last_seen.map(|t| (Instant::now() - t).as_secs()),
- status: node_status
- .get(&n.id.into())
- .cloned()
- .map(|(_, st)| st)
- .unwrap_or(NodeStatus {
- hostname: "?".to_string(),
- replication_factor: 0,
- cluster_layout_version: 0,
- cluster_layout_staging_hash: Hash::from([0u8; 32]),
- }),
- })
- .collect::<Vec<_>>();
+ let known_nodes = self.get_known_nodes();
SystemRpc::ReturnKnownNodes(known_nodes)
}
@@ -452,7 +528,7 @@ impl System {
let local_info = self.local_status.load();
if local_info.replication_factor < info.replication_factor {
- error!("Some node have a higher replication factor ({}) than this one ({}). This is not supported and might lead to bugs",
+ error!("Some node have a higher replication factor ({}) than this one ({}). This is not supported and will lead to data corruption. Shutting down for safety.",
info.replication_factor,
local_info.replication_factor);
std::process::exit(1);
@@ -477,9 +553,19 @@ impl System {
}
async fn handle_advertise_cluster_layout(
- self: Arc<Self>,
+ self: &Arc<Self>,
adv: &ClusterLayout,
) -> Result<SystemRpc, Error> {
+ if adv.replication_factor != self.replication_factor {
+ let msg = format!(
+ "Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.",
+ adv.replication_factor,
+ self.replication_factor
+ );
+ error!("{}", msg);
+ return Err(Error::Message(msg));
+ }
+
let update_ring = self.update_ring.lock().await;
let mut layout: ClusterLayout = self.ring.borrow().layout.clone();
@@ -505,7 +591,7 @@ impl System {
SystemRpc::AdvertiseClusterLayout(layout),
RequestStrategy::with_priority(PRIO_HIGH),
)
- .await;
+ .await?;
Ok(())
});
self.background.spawn(self.clone().save_cluster_layout());
@@ -520,11 +606,12 @@ impl System {
self.update_local_status();
let local_status: NodeStatus = self.local_status.load().as_ref().clone();
- self.rpc
+ let _ = self
+ .rpc
.broadcast(
&self.system_endpoint,
SystemRpc::AdvertiseStatus(local_status),
- RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT),
+ RequestStrategy::with_priority(PRIO_HIGH),
)
.await;
@@ -550,7 +637,7 @@ impl System {
if not_configured || no_peers || bad_peers {
info!("Doing a bootstrap/discovery step (not_configured: {}, no_peers: {}, bad_peers: {})", not_configured, no_peers, bad_peers);
- let mut ping_list = self.bootstrap_peers.clone();
+ let mut ping_list = resolve_peers(&self.bootstrap_peers).await;
// Add peer list from list stored on disk
if let Ok(peers) = self.persist_peer_list.load_async().await {
@@ -648,7 +735,7 @@ impl System {
&self.system_endpoint,
peer,
SystemRpc::PullClusterLayout,
- RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT),
+ RequestStrategy::with_priority(PRIO_HIGH),
)
.await;
if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp {
@@ -681,6 +768,25 @@ fn get_default_ip() -> Option<IpAddr> {
.map(|a| a.ip())
}
+async fn resolve_peers(peers: &[String]) -> Vec<(NodeID, SocketAddr)> {
+ let mut ret = vec![];
+
+ for peer in peers.iter() {
+ match parse_and_resolve_peer_addr_async(peer).await {
+ Some((pubkey, addrs)) => {
+ for ip in addrs {
+ ret.push((pubkey, ip));
+ }
+ }
+ None => {
+ warn!("Unable to parse and/or resolve peer hostname {}", peer);
+ }
+ }
+ }
+
+ ret
+}
+
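Keeping `bootstrap_peers` as plain strings means DNS is consulted again on every discovery round, so a peer whose record changed is found without a restart. A usage sketch, with a hypothetical peer entry in the usual `<hex node key>@<host>:<port>` form:

// Hypothetical peer string; the hex prefix is the peer's public node key.
let peers = vec!["5a1b…@garage-1.example.com:3901".to_string()];
let resolved: Vec<(NodeID, SocketAddr)> = resolve_peers(&peers).await;
// Unresolvable entries are skipped with a warning, as shown above.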
struct ConsulDiscoveryParam {
consul_host: String,
service_name: String,
diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml
index ed1a213f..38c6b41c 100644
--- a/src/table/Cargo.toml
+++ b/src/table/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "garage_table"
-version = "0.7.0"
+version = "0.8.0"
authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018"
license = "AGPL-3.0"
@@ -14,19 +14,19 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
-garage_rpc = { version = "0.7.0", path = "../rpc" }
-garage_util = { version = "0.7.0", path = "../util" }
+garage_db = { version = "0.8.0", path = "../db" }
+garage_rpc = { version = "0.8.0", path = "../rpc" }
+garage_util = { version = "0.8.0", path = "../util" }
opentelemetry = "0.17"
async-trait = "0.1.7"
bytes = "1.0"
+hex = "0.4"
hexdump = "0.1"
tracing = "0.1.30"
rand = "0.8"
-sled = "0.34"
-
rmp-serde = "0.15"
serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
serde_bytes = "0.11"
diff --git a/src/table/data.rs b/src/table/data.rs
index ff7965f5..3212e82b 100644
--- a/src/table/data.rs
+++ b/src/table/data.rs
@@ -1,13 +1,15 @@
use core::borrow::Borrow;
+use std::convert::TryInto;
use std::sync::Arc;
use serde_bytes::ByteBuf;
-use sled::Transactional;
use tokio::sync::Notify;
+use garage_db as db;
+use garage_db::counted_tree_hack::CountedTree;
+
use garage_util::data::*;
use garage_util::error::*;
-use garage_util::sled_counter::SledCountedTree;
use garage_rpc::system::System;
@@ -16,19 +18,20 @@ use crate::gc::GcTodoEntry;
use crate::metrics::*;
use crate::replication::*;
use crate::schema::*;
+use crate::util::*;
pub struct TableData<F: TableSchema, R: TableReplication> {
system: Arc<System>,
- pub(crate) instance: F,
- pub(crate) replication: R,
+ pub instance: F,
+ pub replication: R,
- pub store: sled::Tree,
+ pub store: db::Tree,
- pub(crate) merkle_tree: sled::Tree,
- pub(crate) merkle_todo: sled::Tree,
+ pub(crate) merkle_tree: db::Tree,
+ pub(crate) merkle_todo: db::Tree,
pub(crate) merkle_todo_notify: Notify,
- pub(crate) gc_todo: SledCountedTree,
+ pub(crate) gc_todo: CountedTree,
pub(crate) metrics: TableMetrics,
}
@@ -38,7 +41,7 @@ where
F: TableSchema,
R: TableReplication,
{
- pub fn new(system: Arc<System>, instance: F, replication: R, db: &sled::Db) -> Arc<Self> {
+ pub fn new(system: Arc<System>, instance: F, replication: R, db: &db::Db) -> Arc<Self> {
let store = db
.open_tree(&format!("{}:table", F::TABLE_NAME))
.expect("Unable to open DB tree");
@@ -53,7 +56,7 @@ where
let gc_todo = db
.open_tree(&format!("{}:gc_todo_v2", F::TABLE_NAME))
.expect("Unable to open DB tree");
- let gc_todo = SledCountedTree::new(gc_todo);
+ let gc_todo = CountedTree::new(gc_todo).expect("Cannot count gc_todo_v2");
let metrics = TableMetrics::new(F::TABLE_NAME, merkle_todo.clone(), gc_todo.clone());
@@ -83,18 +86,48 @@ where
pub fn read_range(
&self,
- p: &F::P,
- s: &Option<F::S>,
+ partition_key: &F::P,
+ start: &Option<F::S>,
+ filter: &Option<F::Filter>,
+ limit: usize,
+ enumeration_order: EnumerationOrder,
+ ) -> Result<Vec<Arc<ByteBuf>>, Error> {
+ let partition_hash = partition_key.hash();
+ match enumeration_order {
+ EnumerationOrder::Forward => {
+ let first_key = match start {
+ None => partition_hash.to_vec(),
+ Some(sk) => self.tree_key(partition_key, sk),
+ };
+ let range = self.store.range(first_key..)?;
+ self.read_range_aux(partition_hash, range, filter, limit)
+ }
+ EnumerationOrder::Reverse => match start {
+ Some(sk) => {
+ let last_key = self.tree_key(partition_key, sk);
+ let range = self.store.range_rev(..=last_key)?;
+ self.read_range_aux(partition_hash, range, filter, limit)
+ }
+ None => {
+ let mut last_key = partition_hash.to_vec();
+ let lower = u128::from_be_bytes(last_key[16..32].try_into().unwrap());
+ last_key[16..32].copy_from_slice(&u128::to_be_bytes(lower + 1));
+ let range = self.store.range_rev(..last_key)?;
+ self.read_range_aux(partition_hash, range, filter, limit)
+ }
+ },
+ }
+ }
+
+ fn read_range_aux<'a>(
+ &self,
+ partition_hash: Hash,
+ range: db::ValueIter<'a>,
filter: &Option<F::Filter>,
limit: usize,
) -> Result<Vec<Arc<ByteBuf>>, Error> {
- let partition_hash = p.hash();
- let first_key = match s {
- None => partition_hash.to_vec(),
- Some(sk) => self.tree_key(p, sk),
- };
let mut ret = vec![];
- for item in self.store.range(first_key..) {
+ for item in range {
let (key, value) = item?;
if &key[..32] != partition_hash.as_slice() {
break;
@@ -107,7 +140,7 @@ where
}
};
if keep {
- ret.push(Arc::new(ByteBuf::from(value.as_ref())));
+ ret.push(Arc::new(ByteBuf::from(value)));
}
if ret.len() >= limit {
break;
@@ -136,17 +169,29 @@ where
let update = self.decode_entry(update_bytes)?;
let tree_key = self.tree_key(update.partition_key(), update.sort_key());
- let changed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| {
- let (old_entry, old_bytes, new_entry) = match store.get(&tree_key)? {
+ self.update_entry_with(&tree_key[..], |ent| match ent {
+ Some(mut ent) => {
+ ent.merge(&update);
+ ent
+ }
+ None => update.clone(),
+ })?;
+ Ok(())
+ }
+
+ pub fn update_entry_with(
+ &self,
+ tree_key: &[u8],
+ f: impl Fn(Option<F::E>) -> F::E,
+ ) -> Result<Option<F::E>, Error> {
+ let changed = self.store.db().transaction(|mut tx| {
+ let (old_entry, old_bytes, new_entry) = match tx.get(&self.store, tree_key)? {
Some(old_bytes) => {
- let old_entry = self
- .decode_entry(&old_bytes)
- .map_err(sled::transaction::ConflictableTransactionError::Abort)?;
- let mut new_entry = old_entry.clone();
- new_entry.merge(&update);
+ let old_entry = self.decode_entry(&old_bytes).map_err(db::TxError::Abort)?;
+ let new_entry = f(Some(old_entry.clone()));
(Some(old_entry), Some(old_bytes), new_entry)
}
- None => (None, None, update.clone()),
+ None => (None, None, f(None)),
};
// Scenario 1: the value changed, so of course there is a change
@@ -158,24 +203,28 @@ where
// the associated Merkle tree entry.
let new_bytes = rmp_to_vec_all_named(&new_entry)
.map_err(Error::RmpEncode)
- .map_err(sled::transaction::ConflictableTransactionError::Abort)?;
+ .map_err(db::TxError::Abort)?;
let encoding_changed = Some(&new_bytes[..]) != old_bytes.as_ref().map(|x| &x[..]);
+ drop(old_bytes);
if value_changed || encoding_changed {
let new_bytes_hash = blake2sum(&new_bytes[..]);
- mkl_todo.insert(tree_key.clone(), new_bytes_hash.as_slice())?;
- store.insert(tree_key.clone(), new_bytes)?;
- Ok(Some((old_entry, new_entry, new_bytes_hash)))
+ tx.insert(&self.merkle_todo, tree_key, new_bytes_hash.as_slice())?;
+ tx.insert(&self.store, tree_key, new_bytes)?;
+
+ self.instance
+ .updated(&mut tx, old_entry.as_ref(), Some(&new_entry))?;
+
+ Ok(Some((new_entry, new_bytes_hash)))
} else {
Ok(None)
}
})?;
- if let Some((old_entry, new_entry, new_bytes_hash)) = changed {
+ if let Some((new_entry, new_bytes_hash)) = changed {
self.metrics.internal_update_counter.add(1);
let is_tombstone = new_entry.is_tombstone();
- self.instance.updated(old_entry, Some(new_entry));
self.merkle_todo_notify.notify_one();
if is_tombstone {
// We are only responsible for GC'ing this item if we are the
@@ -187,31 +236,34 @@ where
let pk_hash = Hash::try_from(&tree_key[..32]).unwrap();
let nodes = self.replication.write_nodes(&pk_hash);
if nodes.first() == Some(&self.system.id) {
- GcTodoEntry::new(tree_key, new_bytes_hash).save(&self.gc_todo)?;
+ GcTodoEntry::new(tree_key.to_vec(), new_bytes_hash).save(&self.gc_todo)?;
}
}
- }
- Ok(())
+ Ok(Some(new_entry))
+ } else {
+ Ok(None)
+ }
}
pub(crate) fn delete_if_equal(self: &Arc<Self>, k: &[u8], v: &[u8]) -> Result<bool, Error> {
- let removed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| {
- if let Some(cur_v) = store.get(k)? {
- if cur_v == v {
- store.remove(k)?;
- mkl_todo.insert(k, vec![])?;
- return Ok(true);
+ let removed = self
+ .store
+ .db()
+ .transaction(|mut tx| match tx.get(&self.store, k)? {
+ Some(cur_v) if cur_v == v => {
+ tx.remove(&self.store, k)?;
+ tx.insert(&self.merkle_todo, k, vec![])?;
+
+ let old_entry = self.decode_entry(v).map_err(db::TxError::Abort)?;
+ self.instance.updated(&mut tx, Some(&old_entry), None)?;
+ Ok(true)
}
- }
- Ok(false)
- })?;
+ _ => Ok(false),
+ })?;
if removed {
self.metrics.internal_delete_counter.add(1);
-
- let old_entry = self.decode_entry(v)?;
- self.instance.updated(Some(old_entry), None);
self.merkle_todo_notify.notify_one();
}
Ok(removed)
@@ -222,36 +274,37 @@ where
k: &[u8],
vhash: Hash,
) -> Result<bool, Error> {
- let removed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| {
- if let Some(cur_v) = store.get(k)? {
- if blake2sum(&cur_v[..]) == vhash {
- store.remove(k)?;
- mkl_todo.insert(k, vec![])?;
- return Ok(Some(cur_v));
+ let removed = self
+ .store
+ .db()
+ .transaction(|mut tx| match tx.get(&self.store, k)? {
+ Some(cur_v) if blake2sum(&cur_v[..]) == vhash => {
+ tx.remove(&self.store, k)?;
+ tx.insert(&self.merkle_todo, k, vec![])?;
+
+ let old_entry = self.decode_entry(&cur_v[..]).map_err(db::TxError::Abort)?;
+ self.instance.updated(&mut tx, Some(&old_entry), None)?;
+ Ok(true)
}
- }
- Ok(None)
- })?;
+ _ => Ok(false),
+ })?;
- if let Some(old_v) = removed {
- let old_entry = self.decode_entry(&old_v[..])?;
- self.instance.updated(Some(old_entry), None);
+ if removed {
+ self.metrics.internal_delete_counter.add(1);
self.merkle_todo_notify.notify_one();
- Ok(true)
- } else {
- Ok(false)
}
+ Ok(removed)
}
// ---- Utility functions ----
- pub(crate) fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
+ pub fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
let mut ret = p.hash().to_vec();
ret.extend(s.sort_key());
ret
}
- pub(crate) fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
+ pub fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
match rmp_serde::decode::from_read_ref::<_, F::E>(bytes) {
Ok(x) => Ok(x),
Err(e) => match F::try_migrate(bytes) {
@@ -267,7 +320,7 @@ where
}
}
- pub fn gc_todo_len(&self) -> usize {
- self.gc_todo.len()
+ pub fn gc_todo_len(&self) -> Result<usize, Error> {
+ Ok(self.gc_todo.len())
}
}
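`update_entry_with` is now the single transactional read-modify-write primitive, with `update` reduced to a merge-flavoured wrapper over it. A sketch of a direct caller, assuming `data: &TableData<F, R>` and an `update: F::E` whose CRDT merge we want to apply:

let tree_key = data.tree_key(update.partition_key(), update.sort_key());
let stored = data.update_entry_with(&tree_key, |prev| match prev {
	// Existing entry: fold the update in via the CRDT merge.
	Some(mut prev) => {
		prev.merge(&update);
		prev
	}
	// Absent: the update itself becomes the stored entry.
	None => update.clone(),
})?;
// `stored` is Some(new_entry) if the encoded value changed, None otherwise.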
diff --git a/src/table/gc.rs b/src/table/gc.rs
index 2a05b6ae..83e7eeff 100644
--- a/src/table/gc.rs
+++ b/src/table/gc.rs
@@ -8,13 +8,13 @@ use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf;
use futures::future::join_all;
-use futures::select;
-use futures_util::future::*;
use tokio::sync::watch;
+use garage_db::counted_tree_hack::CountedTree;
+
+use garage_util::background::*;
use garage_util::data::*;
use garage_util::error::*;
-use garage_util::sled_counter::SledCountedTree;
use garage_util::time::*;
use garage_rpc::system::System;
@@ -25,7 +25,6 @@ use crate::replication::*;
use crate::schema::*;
const TABLE_GC_BATCH_SIZE: usize = 1024;
-const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
// GC delay for table entries: 1 day (24 hours)
// (the delay before the entry is added in the GC todo list
@@ -68,50 +67,24 @@ where
gc.endpoint.set_handler(gc.clone());
- let gc1 = gc.clone();
- system.background.spawn_worker(
- format!("GC loop for {}", F::TABLE_NAME),
- move |must_exit: watch::Receiver<bool>| gc1.gc_loop(must_exit),
- );
+ system.background.spawn_worker(GcWorker::new(gc.clone()));
gc
}
- async fn gc_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
- while !*must_exit.borrow() {
- match self.gc_loop_iter().await {
- Ok(None) => {
- // Stuff was done, loop immediately
- }
- Ok(Some(wait_delay)) => {
- // Nothing was done, wait specified delay.
- select! {
- _ = tokio::time::sleep(wait_delay).fuse() => {},
- _ = must_exit.changed().fuse() => {},
- }
- }
- Err(e) => {
- warn!("({}) Error doing GC: {}", F::TABLE_NAME, e);
- }
- }
- }
- }
-
async fn gc_loop_iter(&self) -> Result<Option<Duration>, Error> {
let now = now_msec();
- let mut entries = vec![];
- let mut excluded = vec![];
-
// List entries in the GC todo list
// These entries are put there when a tombstone is inserted in the table
// (see update_entry in data.rs)
- for entry_kv in self.data.gc_todo.iter() {
+ let mut candidates = vec![];
+ for entry_kv in self.data.gc_todo.iter()? {
let (k, vhash) = entry_kv?;
- let mut todo_entry = GcTodoEntry::parse(&k, &vhash);
+ let todo_entry = GcTodoEntry::parse(&k, &vhash);
if todo_entry.deletion_time() > now {
- if entries.is_empty() && excluded.is_empty() {
+ if candidates.is_empty() {
// If the earliest entry in the todo list shouldn't yet be processed,
// return a duration to wait in the loop
return Ok(Some(Duration::from_millis(
@@ -123,15 +96,23 @@ where
}
}
- let vhash = Hash::try_from(&vhash[..]).unwrap();
+ candidates.push(todo_entry);
+ if candidates.len() >= 2 * TABLE_GC_BATCH_SIZE {
+ break;
+ }
+ }
+ let mut entries = vec![];
+ let mut excluded = vec![];
+ for mut todo_entry in candidates {
// Check if the tombstone is still the current value of the entry.
// If not, we don't actually want to GC it, and we will remove it
// from the gc_todo table later (below).
+ let vhash = todo_entry.value_hash;
todo_entry.value = self
.data
.store
- .get(&k[..])?
+ .get(&todo_entry.key[..])?
.filter(|v| blake2sum(&v[..]) == vhash)
.map(|v| v.to_vec());
@@ -254,9 +235,7 @@ where
&self.endpoint,
&nodes[..],
GcRpc::Update(updates),
- RequestStrategy::with_priority(PRIO_BACKGROUND)
- .with_quorum(nodes.len())
- .with_timeout(TABLE_GC_RPC_TIMEOUT),
+ RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()),
)
.await
.err_context("GC: send tombstones")?;
@@ -277,9 +256,7 @@ where
&self.endpoint,
&nodes[..],
GcRpc::DeleteIfEqualHash(deletes),
- RequestStrategy::with_priority(PRIO_BACKGROUND)
- .with_quorum(nodes.len())
- .with_timeout(TABLE_GC_RPC_TIMEOUT),
+ RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()),
)
.await
.err_context("GC: remote delete tombstones")?;
@@ -321,6 +298,66 @@ where
}
}
+struct GcWorker<F, R>
+where
+ F: TableSchema + 'static,
+ R: TableReplication + 'static,
+{
+ gc: Arc<TableGc<F, R>>,
+ wait_delay: Duration,
+}
+
+impl<F, R> GcWorker<F, R>
+where
+ F: TableSchema + 'static,
+ R: TableReplication + 'static,
+{
+ fn new(gc: Arc<TableGc<F, R>>) -> Self {
+ Self {
+ gc,
+ wait_delay: Duration::from_secs(0),
+ }
+ }
+}
+
+#[async_trait]
+impl<F, R> Worker for GcWorker<F, R>
+where
+ F: TableSchema + 'static,
+ R: TableReplication + 'static,
+{
+ fn name(&self) -> String {
+ format!("{} GC", F::TABLE_NAME)
+ }
+
+ fn info(&self) -> Option<String> {
+ let l = self.gc.data.gc_todo_len().unwrap_or(0);
+ if l > 0 {
+ Some(format!("{} items in queue", l))
+ } else {
+ None
+ }
+ }
+
+ async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+ match self.gc.gc_loop_iter().await? {
+ None => Ok(WorkerState::Busy),
+ Some(delay) => {
+ self.wait_delay = delay;
+ Ok(WorkerState::Idle)
+ }
+ }
+ }
+
+ async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
+ if *must_exit.borrow() {
+ return WorkerState::Done;
+ }
+ tokio::time::sleep(self.wait_delay).await;
+ WorkerState::Busy
+ }
+}
+
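`GcWorker` is one instance of the background `Worker` contract introduced in garage_util: `work` performs one bounded step, `wait_for_work` decides when to poll again, and either can signal shutdown. A minimal sketch of another worker under the same trait (assuming, as suggested by this file, that `info` has a default implementation returning None):

struct TickWorker;

#[async_trait]
impl Worker for TickWorker {
	fn name(&self) -> String {
		"tick".to_string()
	}

	async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
		// One unit of work; return Busy to be called again immediately.
		Ok(WorkerState::Idle)
	}

	async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
		if *must_exit.borrow() {
			return WorkerState::Done;
		}
		tokio::time::sleep(Duration::from_secs(1)).await;
		WorkerState::Busy
	}
}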
/// An entry stored in the gc_todo Sled tree associated with the table
/// Contains helper functions for parsing, saving, and removing
/// such entries in Sled
@@ -353,17 +390,17 @@ impl GcTodoEntry {
}
/// Parses a GcTodoEntry from a (k, v) pair stored in the gc_todo tree
- pub(crate) fn parse(sled_k: &[u8], sled_v: &[u8]) -> Self {
+ pub(crate) fn parse(db_k: &[u8], db_v: &[u8]) -> Self {
Self {
- tombstone_timestamp: u64::from_be_bytes(sled_k[0..8].try_into().unwrap()),
- key: sled_k[8..].to_vec(),
- value_hash: Hash::try_from(sled_v).unwrap(),
+ tombstone_timestamp: u64::from_be_bytes(db_k[0..8].try_into().unwrap()),
+ key: db_k[8..].to_vec(),
+ value_hash: Hash::try_from(db_v).unwrap(),
value: None,
}
}
/// Saves the GcTodoEntry in the gc_todo tree
- pub(crate) fn save(&self, gc_todo_tree: &SledCountedTree) -> Result<(), Error> {
+ pub(crate) fn save(&self, gc_todo_tree: &CountedTree) -> Result<(), Error> {
gc_todo_tree.insert(self.todo_table_key(), self.value_hash.as_slice())?;
Ok(())
}
@@ -373,9 +410,9 @@ impl GcTodoEntry {
/// This is useful to remove a todo entry only under the condition
/// that it has not changed since the time it was read, i.e.
/// what we have to do is still the same
- pub(crate) fn remove_if_equal(&self, gc_todo_tree: &SledCountedTree) -> Result<(), Error> {
- let _ = gc_todo_tree.compare_and_swap::<_, _, Vec<u8>>(
- &self.todo_table_key()[..],
+ pub(crate) fn remove_if_equal(&self, gc_todo_tree: &CountedTree) -> Result<(), Error> {
+ gc_todo_tree.compare_and_swap::<_, _, &[u8]>(
+ &self.todo_table_key(),
Some(self.value_hash),
None,
)?;
diff --git a/src/table/merkle.rs b/src/table/merkle.rs
index 93bf7e47..a5c29723 100644
--- a/src/table/merkle.rs
+++ b/src/table/merkle.rs
@@ -1,15 +1,13 @@
use std::sync::Arc;
use std::time::Duration;
-use futures::select;
-use futures_util::future::*;
+use async_trait::async_trait;
use serde::{Deserialize, Serialize};
-use sled::transaction::{
- ConflictableTransactionError, ConflictableTransactionResult, TransactionalTree,
-};
use tokio::sync::watch;
-use garage_util::background::BackgroundRunner;
+use garage_db as db;
+
+use garage_util::background::*;
use garage_util::data::*;
use garage_util::error::Error;
@@ -79,43 +77,17 @@ where
empty_node_hash,
});
- let ret2 = ret.clone();
- background.spawn_worker(
- format!("Merkle tree updater for {}", F::TABLE_NAME),
- |must_exit: watch::Receiver<bool>| ret2.updater_loop(must_exit),
- );
+ background.spawn_worker(MerkleWorker(ret.clone()));
ret
}
- async fn updater_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
- while !*must_exit.borrow() {
- if let Some(x) = self.data.merkle_todo.iter().next() {
- match x {
- Ok((key, valhash)) => {
- if let Err(e) = self.update_item(&key[..], &valhash[..]) {
- warn!(
- "({}) Error while updating Merkle tree item: {}",
- F::TABLE_NAME,
- e
- );
- }
- }
- Err(e) => {
- warn!(
- "({}) Error while iterating on Merkle todo tree: {}",
- F::TABLE_NAME,
- e
- );
- tokio::time::sleep(Duration::from_secs(10)).await;
- }
- }
- } else {
- select! {
- _ = self.data.merkle_todo_notify.notified().fuse() => {},
- _ = must_exit.changed().fuse() => {},
- }
- }
+ fn updater_loop_iter(&self) -> Result<WorkerState, Error> {
+ if let Some((key, valhash)) = self.data.merkle_todo.first()? {
+ self.update_item(&key, &valhash)?;
+ Ok(WorkerState::Busy)
+ } else {
+ Ok(WorkerState::Idle)
}
}
@@ -137,13 +109,16 @@ where
};
self.data
.merkle_tree
- .transaction(|tx| self.update_item_rec(tx, k, &khash, &key, new_vhash))?;
+ .db()
+ .transaction(|mut tx| self.update_item_rec(&mut tx, k, &khash, &key, new_vhash))?;
- let deleted = self
- .data
- .merkle_todo
- .compare_and_swap::<_, _, Vec<u8>>(k, Some(vhash_by), None)?
- .is_ok();
+ let deleted = self.data.merkle_todo.db().transaction(|mut tx| {
+ let remove = matches!(tx.get(&self.data.merkle_todo, k)?, Some(ov) if ov == vhash_by);
+ if remove {
+ tx.remove(&self.data.merkle_todo, k)?;
+ }
+ Ok(remove)
+ })?;
if !deleted {
debug!(
@@ -157,12 +132,12 @@ where
fn update_item_rec(
&self,
- tx: &TransactionalTree,
+ tx: &mut db::Transaction<'_>,
k: &[u8],
khash: &Hash,
key: &MerkleNodeKey,
new_vhash: Option<Hash>,
- ) -> ConflictableTransactionResult<Option<Hash>, Error> {
+ ) -> db::TxResult<Option<Hash>, Error> {
let i = key.prefix.len();
// Read node at current position (defined by the prefix stored in key)
@@ -203,7 +178,7 @@ where
}
MerkleNode::Intermediate(_) => Some(MerkleNode::Intermediate(children)),
x @ MerkleNode::Leaf(_, _) => {
- tx.remove(key_sub.encode())?;
+ tx.remove(&self.data.merkle_tree, key_sub.encode())?;
Some(x)
}
}
@@ -283,28 +258,27 @@ where
fn read_node_txn(
&self,
- tx: &TransactionalTree,
+ tx: &mut db::Transaction<'_>,
k: &MerkleNodeKey,
- ) -> ConflictableTransactionResult<MerkleNode, Error> {
- let ent = tx.get(k.encode())?;
- MerkleNode::decode_opt(ent).map_err(ConflictableTransactionError::Abort)
+ ) -> db::TxResult<MerkleNode, Error> {
+ let ent = tx.get(&self.data.merkle_tree, k.encode())?;
+ MerkleNode::decode_opt(&ent).map_err(db::TxError::Abort)
}
fn put_node_txn(
&self,
- tx: &TransactionalTree,
+ tx: &mut db::Transaction<'_>,
k: &MerkleNodeKey,
v: &MerkleNode,
- ) -> ConflictableTransactionResult<Hash, Error> {
+ ) -> db::TxResult<Hash, Error> {
trace!("Put Merkle node: {:?} => {:?}", k, v);
if *v == MerkleNode::Empty {
- tx.remove(k.encode())?;
+ tx.remove(&self.data.merkle_tree, k.encode())?;
Ok(self.empty_node_hash)
} else {
- let vby = rmp_to_vec_all_named(v)
- .map_err(|e| ConflictableTransactionError::Abort(e.into()))?;
+ let vby = rmp_to_vec_all_named(v).map_err(|e| db::TxError::Abort(e.into()))?;
let rethash = blake2sum(&vby[..]);
- tx.insert(k.encode(), vby)?;
+ tx.insert(&self.data.merkle_tree, k.encode(), vby)?;
Ok(rethash)
}
}
@@ -312,15 +286,63 @@ where
// Access a node in the Merkle tree, used by the sync protocol
pub(crate) fn read_node(&self, k: &MerkleNodeKey) -> Result<MerkleNode, Error> {
let ent = self.data.merkle_tree.get(k.encode())?;
- MerkleNode::decode_opt(ent)
+ MerkleNode::decode_opt(&ent)
+ }
+
+ pub fn merkle_tree_len(&self) -> Result<usize, Error> {
+ Ok(self.data.merkle_tree.len()?)
}
- pub fn merkle_tree_len(&self) -> usize {
- self.data.merkle_tree.len()
+ pub fn todo_len(&self) -> Result<usize, Error> {
+ Ok(self.data.merkle_todo.len()?)
}
+}
+
+struct MerkleWorker<F, R>(Arc<MerkleUpdater<F, R>>)
+where
+ F: TableSchema + 'static,
+ R: TableReplication + 'static;
- pub fn todo_len(&self) -> usize {
- self.data.merkle_todo.len()
+#[async_trait]
+impl<F, R> Worker for MerkleWorker<F, R>
+where
+ F: TableSchema + 'static,
+ R: TableReplication + 'static,
+{
+ fn name(&self) -> String {
+ format!("{} Merkle tree updater", F::TABLE_NAME)
+ }
+
+ fn info(&self) -> Option<String> {
+ let l = self.0.todo_len().unwrap_or(0);
+ if l > 0 {
+ Some(format!("{} items in queue", l))
+ } else {
+ None
+ }
+ }
+
+ async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+ let updater = self.0.clone();
+ tokio::task::spawn_blocking(move || {
+ for _i in 0..100 {
+ let s = updater.updater_loop_iter();
+ if !matches!(s, Ok(WorkerState::Busy)) {
+ return s;
+ }
+ }
+ Ok(WorkerState::Busy)
+ })
+ .await
+ .unwrap()
+ }
+
+ async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
+ if *must_exit.borrow() {
+ return WorkerState::Done;
+ }
+ tokio::time::sleep(Duration::from_secs(10)).await;
+ WorkerState::Busy
}
}
@@ -347,7 +369,7 @@ impl MerkleNodeKey {
}
impl MerkleNode {
- fn decode_opt(ent: Option<sled::IVec>) -> Result<Self, Error> {
+ fn decode_opt(ent: &Option<db::Value>) -> Result<Self, Error> {
match ent {
None => Ok(MerkleNode::Empty),
Some(v) => Ok(rmp_serde::decode::from_read_ref::<_, MerkleNode>(&v[..])?),
diff --git a/src/table/metrics.rs b/src/table/metrics.rs
index 752a2a6d..3a1783e0 100644
--- a/src/table/metrics.rs
+++ b/src/table/metrics.rs
@@ -1,6 +1,7 @@
use opentelemetry::{global, metrics::*, KeyValue};
-use garage_util::sled_counter::SledCountedTree;
+use garage_db as db;
+use garage_db::counted_tree_hack::CountedTree;
/// TableMetrics references all counters used for metrics
pub struct TableMetrics {
@@ -19,21 +20,19 @@ pub struct TableMetrics {
pub(crate) sync_items_received: Counter<u64>,
}
impl TableMetrics {
- pub fn new(
- table_name: &'static str,
- merkle_todo: sled::Tree,
- gc_todo: SledCountedTree,
- ) -> Self {
+ pub fn new(table_name: &'static str, merkle_todo: db::Tree, gc_todo: CountedTree) -> Self {
let meter = global::meter(table_name);
TableMetrics {
_merkle_todo_len: meter
.u64_value_observer(
"table.merkle_updater_todo_queue_length",
move |observer| {
- observer.observe(
- merkle_todo.len() as u64,
- &[KeyValue::new("table_name", table_name)],
- )
+ if let Ok(v) = merkle_todo.len() {
+ observer.observe(
+ v as u64,
+ &[KeyValue::new("table_name", table_name)],
+ );
+ }
},
)
.with_description("Merkle tree updater TODO queue length")
@@ -45,7 +44,7 @@ impl TableMetrics {
observer.observe(
gc_todo.len() as u64,
&[KeyValue::new("table_name", table_name)],
- )
+ );
},
)
.with_description("Table garbage collector TODO queue length")
diff --git a/src/table/schema.rs b/src/table/schema.rs
index eba918a2..f37e98d8 100644
--- a/src/table/schema.rs
+++ b/src/table/schema.rs
@@ -1,5 +1,6 @@
use serde::{Deserialize, Serialize};
+use garage_db as db;
use garage_util::data::*;
use crate::crdt::Crdt;
@@ -59,7 +60,7 @@ pub trait Entry<P: PartitionKey, S: SortKey>:
}
/// Trait for the schema used in a table
-pub trait TableSchema: Send + Sync {
+pub trait TableSchema: Send + Sync + 'static {
/// The name of the table in the database
const TABLE_NAME: &'static str;
@@ -82,11 +83,19 @@ pub trait TableSchema: Send + Sync {
None
}
- // Updated triggers some stuff downstream, but it is not supposed to block or fail,
- // as the update itself is an unchangeable fact that will never go back
- // due to CRDT logic. Typically errors in propagation of info should be logged
- // to stderr.
- fn updated(&self, _old: Option<Self::E>, _new: Option<Self::E>) {}
+ /// Actions triggered by data changing in a table. If such actions
+ /// include updates to the local database that should be applied
+ /// atomically with the item update itself, a db transaction is
+ /// provided on which these changes should be done.
+ /// This function can fail only with a DB error.
+ fn updated(
+ &self,
+ _tx: &mut db::Transaction,
+ _old: Option<&Self::E>,
+ _new: Option<&Self::E>,
+ ) -> db::TxOpResult<()> {
+ Ok(())
+ }
fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool;
}
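A sketch of a schema implementing the new hook to keep a secondary-index tree consistent with the main table; `self.index_tree` and `index_key()` are illustrative inventions, not part of this patch. Because both writes share the transaction, the index can never drift from the items:

fn updated(
	&self,
	tx: &mut db::Transaction,
	old: Option<&Self::E>,
	new: Option<&Self::E>,
) -> db::TxOpResult<()> {
	// Hypothetical index_tree / index_key(): shown for illustration only.
	if let Some(old_v) = old {
		tx.remove(&self.index_tree, old_v.index_key())?;
	}
	if let Some(new_v) = new {
		tx.insert(&self.index_tree, new_v.index_key(), b"")?;
	}
	// Any error returned here aborts the item write as well.
	Ok(())
}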
diff --git a/src/table/sync.rs b/src/table/sync.rs
index 08069ad0..9d79d856 100644
--- a/src/table/sync.rs
+++ b/src/table/sync.rs
@@ -1,17 +1,17 @@
use std::collections::VecDeque;
-use std::sync::{Arc, Mutex};
+use std::sync::Arc;
use std::time::{Duration, Instant};
use async_trait::async_trait;
-use futures::select;
-use futures_util::future::*;
use futures_util::stream::*;
use opentelemetry::KeyValue;
use rand::Rng;
use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf;
+use tokio::select;
use tokio::sync::{mpsc, watch};
+use garage_util::background::*;
use garage_util::data::*;
use garage_util::error::Error;
@@ -24,8 +24,6 @@ use crate::merkle::*;
use crate::replication::*;
use crate::*;
-const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
-
// Do anti-entropy every 10 minutes
const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60);
@@ -34,7 +32,7 @@ pub struct TableSyncer<F: TableSchema + 'static, R: TableReplication + 'static>
data: Arc<TableData<F, R>>,
merkle: Arc<MerkleUpdater<F, R>>,
- todo: Mutex<SyncTodo>,
+ add_full_sync_tx: mpsc::UnboundedSender<()>,
endpoint: Arc<Endpoint<SyncRpc, Self>>,
}
@@ -52,10 +50,6 @@ impl Rpc for SyncRpc {
type Response = Result<SyncRpc, Error>;
}
-struct SyncTodo {
- todo: Vec<TodoPartition>,
-}
-
#[derive(Debug, Clone)]
struct TodoPartition {
partition: Partition,
@@ -80,118 +74,40 @@ where
.netapp
.endpoint(format!("garage_table/sync.rs/Rpc:{}", F::TABLE_NAME));
- let todo = SyncTodo { todo: vec![] };
+ let (add_full_sync_tx, add_full_sync_rx) = mpsc::unbounded_channel();
let syncer = Arc::new(Self {
system: system.clone(),
data,
merkle,
- todo: Mutex::new(todo),
+ add_full_sync_tx,
endpoint,
});
syncer.endpoint.set_handler(syncer.clone());
- let (busy_tx, busy_rx) = mpsc::unbounded_channel();
-
- let s1 = syncer.clone();
- system.background.spawn_worker(
- format!("table sync watcher for {}", F::TABLE_NAME),
- move |must_exit: watch::Receiver<bool>| s1.watcher_task(must_exit, busy_rx),
- );
-
- let s2 = syncer.clone();
- system.background.spawn_worker(
- format!("table syncer for {}", F::TABLE_NAME),
- move |must_exit: watch::Receiver<bool>| s2.syncer_task(must_exit, busy_tx),
- );
-
- let s3 = syncer.clone();
- tokio::spawn(async move {
- tokio::time::sleep(Duration::from_secs(20)).await;
- s3.add_full_sync();
+ system.background.spawn_worker(SyncWorker {
+ syncer: syncer.clone(),
+ ring_recv: system.ring.clone(),
+ ring: system.ring.borrow().clone(),
+ add_full_sync_rx,
+ todo: vec![],
+ next_full_sync: Instant::now() + Duration::from_secs(20),
});
syncer
}
- async fn watcher_task(
- self: Arc<Self>,
- mut must_exit: watch::Receiver<bool>,
- mut busy_rx: mpsc::UnboundedReceiver<bool>,
- ) {
- let mut prev_ring: Arc<Ring> = self.system.ring.borrow().clone();
- let mut ring_recv: watch::Receiver<Arc<Ring>> = self.system.ring.clone();
- let mut nothing_to_do_since = Some(Instant::now());
-
- while !*must_exit.borrow() {
- select! {
- _ = ring_recv.changed().fuse() => {
- let new_ring = ring_recv.borrow();
- if !Arc::ptr_eq(&new_ring, &prev_ring) {
- debug!("({}) Ring changed, adding full sync to syncer todo list", F::TABLE_NAME);
- self.add_full_sync();
- prev_ring = new_ring.clone();
- }
- }
- busy_opt = busy_rx.recv().fuse() => {
- if let Some(busy) = busy_opt {
- if busy {
- nothing_to_do_since = None;
- } else if nothing_to_do_since.is_none() {
- nothing_to_do_since = Some(Instant::now());
- }
- }
- }
- _ = must_exit.changed().fuse() => {},
- _ = tokio::time::sleep(Duration::from_secs(1)).fuse() => {
- if nothing_to_do_since.map(|t| Instant::now() - t >= ANTI_ENTROPY_INTERVAL).unwrap_or(false) {
- nothing_to_do_since = None;
- debug!("({}) Interval passed, adding full sync to syncer todo list", F::TABLE_NAME);
- self.add_full_sync();
- }
- }
- }
- }
- }
-
pub fn add_full_sync(&self) {
- self.todo
- .lock()
- .unwrap()
- .add_full_sync(&self.data, &self.system);
- }
-
- async fn syncer_task(
- self: Arc<Self>,
- mut must_exit: watch::Receiver<bool>,
- busy_tx: mpsc::UnboundedSender<bool>,
- ) {
- while !*must_exit.borrow() {
- let task = self.todo.lock().unwrap().pop_task();
- if let Some(partition) = task {
- busy_tx.send(true).unwrap();
- let res = self
- .clone()
- .sync_partition(&partition, &mut must_exit)
- .await;
- if let Err(e) = res {
- warn!(
- "({}) Error while syncing {:?}: {}",
- F::TABLE_NAME,
- partition,
- e
- );
- }
- } else {
- busy_tx.send(false).unwrap();
- tokio::time::sleep(Duration::from_secs(1)).await;
- }
+ if self.add_full_sync_tx.send(()).is_err() {
+ error!("({}) Could not add full sync", F::TABLE_NAME);
}
}
+ // ----
+
async fn sync_partition(
- self: Arc<Self>,
+ self: &Arc<Self>,
partition: &TodoPartition,
must_exit: &mut watch::Receiver<bool>,
) -> Result<(), Error> {
@@ -258,9 +174,9 @@ where
while !*must_exit.borrow() {
let mut items = Vec::new();
- for item in self.data.store.range(begin.to_vec()..end.to_vec()) {
+ for item in self.data.store.range(begin.to_vec()..end.to_vec())? {
let (key, value) = item?;
- items.push((key.to_vec(), Arc::new(ByteBuf::from(value.as_ref()))));
+ items.push((key.to_vec(), Arc::new(ByteBuf::from(value))));
if items.len() >= 1024 {
break;
@@ -329,9 +245,7 @@ where
&self.endpoint,
nodes,
SyncRpc::Items(values),
- RequestStrategy::with_priority(PRIO_BACKGROUND)
- .with_quorum(nodes.len())
- .with_timeout(TABLE_SYNC_RPC_TIMEOUT),
+ RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()),
)
.await?;
@@ -392,8 +306,7 @@ where
&self.endpoint,
who,
SyncRpc::RootCkHash(partition.partition, root_ck_hash),
- RequestStrategy::with_priority(PRIO_BACKGROUND)
- .with_timeout(TABLE_SYNC_RPC_TIMEOUT),
+ RequestStrategy::with_priority(PRIO_BACKGROUND),
)
.await?;
@@ -432,11 +345,11 @@ where
// Just send that item directly
if let Some(val) = self.data.store.get(&ik[..])? {
if blake2sum(&val[..]) != ivhash {
- warn!("({}) Hashes differ between stored value and Merkle tree, key: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", F::TABLE_NAME, ik);
+ debug!("({}) Hashes differ between stored value and Merkle tree, key: {} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", F::TABLE_NAME, hex::encode(ik));
}
todo_items.push(val.to_vec());
} else {
- warn!("({}) Item from Merkle tree not found in store: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", F::TABLE_NAME, ik);
+ debug!("({}) Item from Merkle tree not found in store: {} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", F::TABLE_NAME, hex::encode(ik));
}
}
MerkleNode::Intermediate(l) => {
@@ -449,8 +362,7 @@ where
&self.endpoint,
who,
SyncRpc::GetNode(key.clone()),
- RequestStrategy::with_priority(PRIO_BACKGROUND)
- .with_timeout(TABLE_SYNC_RPC_TIMEOUT),
+ RequestStrategy::with_priority(PRIO_BACKGROUND),
)
.await?
{
@@ -526,8 +438,7 @@ where
&self.endpoint,
who,
SyncRpc::Items(values),
- RequestStrategy::with_priority(PRIO_BACKGROUND)
- .with_timeout(TABLE_SYNC_RPC_TIMEOUT),
+ RequestStrategy::with_priority(PRIO_BACKGROUND),
)
.await?;
if let SyncRpc::Ok = rpc_resp {
@@ -577,12 +488,22 @@ where
}
}
-impl SyncTodo {
- fn add_full_sync<F: TableSchema, R: TableReplication>(
- &mut self,
- data: &TableData<F, R>,
- system: &System,
- ) {
+// -------- Sync Worker ---------
+
+struct SyncWorker<F: TableSchema + 'static, R: TableReplication + 'static> {
+ syncer: Arc<TableSyncer<F, R>>,
+ ring_recv: watch::Receiver<Arc<Ring>>,
+ ring: Arc<Ring>,
+ add_full_sync_rx: mpsc::UnboundedReceiver<()>,
+ todo: Vec<TodoPartition>,
+ next_full_sync: Instant,
+}
+
+impl<F: TableSchema + 'static, R: TableReplication + 'static> SyncWorker<F, R> {
+ fn add_full_sync(&mut self) {
+ let system = &self.syncer.system;
+ let data = &self.syncer.data;
+
let my_id = system.id;
self.todo.clear();
@@ -603,8 +524,16 @@ impl SyncTodo {
let retain = nodes.contains(&my_id);
if !retain {
// Check if we have some data to send, otherwise skip
- if data.store.range(begin..end).next().is_none() {
- continue;
+ match data.store.range(begin..end) {
+ Ok(mut iter) => {
+ if iter.next().is_none() {
+ continue;
+ }
+ }
+ Err(e) => {
+ warn!("DB error in add_full_sync: {}", e);
+ continue;
+ }
}
}
@@ -615,6 +544,8 @@ impl SyncTodo {
retain,
});
}
+
+ self.next_full_sync = Instant::now() + ANTI_ENTROPY_INTERVAL;
}
fn pop_task(&mut self) -> Option<TodoPartition> {
@@ -633,6 +564,62 @@ impl SyncTodo {
}
}
+#[async_trait]
+impl<F: TableSchema + 'static, R: TableReplication + 'static> Worker for SyncWorker<F, R> {
+ fn name(&self) -> String {
+ format!("{} sync", F::TABLE_NAME)
+ }
+
+ fn info(&self) -> Option<String> {
+ let l = self.todo.len();
+ if l > 0 {
+ Some(format!("{} partitions remaining", l))
+ } else {
+ None
+ }
+ }
+
+ async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+ if let Some(partition) = self.pop_task() {
+ self.syncer.sync_partition(&partition, must_exit).await?;
+ Ok(WorkerState::Busy)
+ } else {
+ Ok(WorkerState::Idle)
+ }
+ }
+
+ async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
+ if *must_exit.borrow() {
+ return WorkerState::Done;
+ }
+ select! {
+ s = self.add_full_sync_rx.recv() => {
+ if let Some(()) = s {
+ self.add_full_sync();
+ }
+ },
+ _ = self.ring_recv.changed() => {
+ let new_ring = self.ring_recv.borrow();
+ if !Arc::ptr_eq(&new_ring, &self.ring) {
+ self.ring = new_ring.clone();
+ drop(new_ring);
+ debug!("({}) Ring changed, adding full sync to syncer todo list", F::TABLE_NAME);
+ self.add_full_sync();
+ }
+ },
+ _ = tokio::time::sleep_until(self.next_full_sync.into()) => {
+ self.add_full_sync();
+ }
+ }
+ match self.todo.is_empty() {
+ false => WorkerState::Busy,
+ true => WorkerState::Idle,
+ }
+ }
+}
+
+// ---- UTIL ----
+
fn hash_of<T: Serialize>(x: &T) -> Result<Hash, Error> {
Ok(blake2sum(&rmp_to_vec_all_named(x)?[..]))
}
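
Editor's note: the core of this refactor is that the mutex-guarded SyncTodo list is gone; a single SyncWorker now owns its todo list and receives full-sync requests over a channel. A minimal, self-contained sketch of that pattern (illustrative only, not Garage code):

use tokio::sync::mpsc;

struct OwnedQueueWorker {
	rx: mpsc::UnboundedReceiver<()>,
	todo: Vec<u32>,
}

impl OwnedQueueWorker {
	async fn run(mut self) {
		// No Mutex needed: the worker is the sole owner of `todo`.
		while let Some(()) = self.rx.recv().await {
			self.todo.extend(0..4); // rebuild the todo list, like add_full_sync()
			while let Some(partition) = self.todo.pop() {
				println!("syncing partition {}", partition);
			}
		}
	}
}

#[tokio::main]
async fn main() {
	let (tx, rx) = mpsc::unbounded_channel();
	let handle = tokio::spawn(OwnedQueueWorker { rx, todo: vec![] }.run());
	tx.send(()).unwrap(); // what TableSyncer::add_full_sync() now amounts to
	drop(tx); // close the channel so the worker's loop terminates
	handle.await.unwrap();
}
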
diff --git a/src/table/table.rs b/src/table/table.rs
index 7f87a449..8a66c420 100644
--- a/src/table/table.rs
+++ b/src/table/table.rs
@@ -1,6 +1,6 @@
-use std::collections::{BTreeMap, HashMap};
+use std::borrow::Borrow;
+use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::sync::Arc;
-use std::time::Duration;
use async_trait::async_trait;
use futures::stream::*;
@@ -12,6 +12,8 @@ use opentelemetry::{
Context,
};
+use garage_db as db;
+
use garage_util::data::*;
use garage_util::error::Error;
use garage_util::metrics::RecordDuration;
@@ -26,8 +28,7 @@ use crate::merkle::*;
use crate::replication::*;
use crate::schema::*;
use crate::sync::*;
-
-const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10);
+use crate::util::*;
pub struct Table<F: TableSchema + 'static, R: TableReplication + 'static> {
pub system: Arc<System>,
@@ -45,7 +46,13 @@ pub(crate) enum TableRpc<F: TableSchema> {
ReadEntryResponse(Option<ByteBuf>),
// Read range: read all keys in partition P, possibly starting at a certain sort key offset
- ReadRange(F::P, Option<F::S>, Option<F::Filter>, usize),
+ ReadRange {
+ partition: F::P,
+ begin_sort_key: Option<F::S>,
+ filter: Option<F::Filter>,
+ limit: usize,
+ enumeration_order: EnumerationOrder,
+ },
Update(Vec<Arc<ByteBuf>>),
}
@@ -61,7 +68,7 @@ where
{
// =============== PUBLIC INTERFACE FUNCTIONS (new, insert, get, etc) ===============
- pub fn new(instance: F, replication: R, system: Arc<System>, db: &sled::Db) -> Arc<Self> {
+ pub fn new(instance: F, replication: R, system: Arc<System>, db: &db::Db) -> Arc<Self> {
let endpoint = system
.netapp
.endpoint(format!("garage_table/table.rs/Rpc:{}", F::TABLE_NAME));
@@ -103,7 +110,6 @@ where
async fn insert_internal(&self, e: &F::E) -> Result<(), Error> {
let hash = e.partition_key().hash();
let who = self.data.replication.write_nodes(&hash);
- //eprintln!("insert who: {:?}", who);
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(e)?));
let rpc = TableRpc::<F>::Update(vec![e_enc]);
@@ -115,17 +121,20 @@ where
&who[..],
rpc,
RequestStrategy::with_priority(PRIO_NORMAL)
- .with_quorum(self.data.replication.write_quorum())
- .with_timeout(TABLE_RPC_TIMEOUT),
+ .with_quorum(self.data.replication.write_quorum()),
)
.await?;
Ok(())
}
- pub async fn insert_many(&self, entries: &[F::E]) -> Result<(), Error> {
+ pub async fn insert_many<I, IE>(&self, entries: I) -> Result<(), Error>
+ where
+ I: IntoIterator<Item = IE> + Send + Sync,
+ IE: Borrow<F::E> + Send + Sync,
+ {
let tracer = opentelemetry::global::tracer("garage_table");
- let span = tracer.start(format!("{} insert_many {}", F::TABLE_NAME, entries.len()));
+ let span = tracer.start(format!("{} insert_many", F::TABLE_NAME));
self.insert_many_internal(entries)
.bound_record_duration(&self.data.metrics.put_request_duration)
@@ -137,10 +146,15 @@ where
Ok(())
}
- async fn insert_many_internal(&self, entries: &[F::E]) -> Result<(), Error> {
+ async fn insert_many_internal<I, IE>(&self, entries: I) -> Result<(), Error>
+ where
+ I: IntoIterator<Item = IE> + Send + Sync,
+ IE: Borrow<F::E> + Send + Sync,
+ {
let mut call_list: HashMap<_, Vec<_>> = HashMap::new();
- for entry in entries.iter() {
+ for entry in entries.into_iter() {
+ let entry = entry.borrow();
let hash = entry.partition_key().hash();
let who = self.data.replication.write_nodes(&hash);
let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(entry)?));
@@ -159,7 +173,7 @@ where
&self.endpoint,
node,
rpc,
- RequestStrategy::with_priority(PRIO_NORMAL).with_timeout(TABLE_RPC_TIMEOUT),
+ RequestStrategy::with_priority(PRIO_NORMAL),
)
.await?;
Ok::<_, Error>((node, resp))
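
Editor's note: the Borrow-based signature in this hunk accepts owned entries or references alike. A hedged usage sketch (the function and its arguments are hypothetical):

async fn insert_examples<F, R>(table: &Table<F, R>, entries: Vec<F::E>) -> Result<(), Error>
where
	F: TableSchema + 'static,
	R: TableReplication + 'static,
{
	table.insert_many(&entries).await?; // &Vec<F::E>: yields &F::E, no clones
	table.insert_many(entries).await?; // Vec<F::E>: owned entries, Borrow is identity
	Ok(())
}
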
@@ -216,7 +230,6 @@ where
rpc,
RequestStrategy::with_priority(PRIO_NORMAL)
.with_quorum(self.data.replication.read_quorum())
- .with_timeout(TABLE_RPC_TIMEOUT)
.interrupt_after_quorum(true),
)
.await?;
@@ -261,12 +274,19 @@ where
begin_sort_key: Option<F::S>,
filter: Option<F::Filter>,
limit: usize,
+ enumeration_order: EnumerationOrder,
) -> Result<Vec<F::E>, Error> {
let tracer = opentelemetry::global::tracer("garage_table");
let span = tracer.start(format!("{} get_range", F::TABLE_NAME));
let res = self
- .get_range_internal(partition_key, begin_sort_key, filter, limit)
+ .get_range_internal(
+ partition_key,
+ begin_sort_key,
+ filter,
+ limit,
+ enumeration_order,
+ )
.bound_record_duration(&self.data.metrics.get_request_duration)
.with_context(Context::current_with_span(span))
.await?;
@@ -282,11 +302,18 @@ where
begin_sort_key: Option<F::S>,
filter: Option<F::Filter>,
limit: usize,
+ enumeration_order: EnumerationOrder,
) -> Result<Vec<F::E>, Error> {
let hash = partition_key.hash();
let who = self.data.replication.read_nodes(&hash);
- let rpc = TableRpc::<F>::ReadRange(partition_key.clone(), begin_sort_key, filter, limit);
+ let rpc = TableRpc::<F>::ReadRange {
+ partition: partition_key.clone(),
+ begin_sort_key,
+ filter,
+ limit,
+ enumeration_order,
+ };
let resps = self
.system
@@ -297,49 +324,69 @@ where
rpc,
RequestStrategy::with_priority(PRIO_NORMAL)
.with_quorum(self.data.replication.read_quorum())
- .with_timeout(TABLE_RPC_TIMEOUT)
.interrupt_after_quorum(true),
)
.await?;
- let mut ret = BTreeMap::new();
- let mut to_repair = BTreeMap::new();
+ let mut ret: BTreeMap<Vec<u8>, F::E> = BTreeMap::new();
+ let mut to_repair = BTreeSet::new();
for resp in resps {
if let TableRpc::Update(entries) = resp {
for entry_bytes in entries.iter() {
let entry = self.data.decode_entry(entry_bytes.as_slice())?;
let entry_key = self.data.tree_key(entry.partition_key(), entry.sort_key());
- match ret.remove(&entry_key) {
- None => {
- ret.insert(entry_key, Some(entry));
- }
- Some(Some(mut prev)) => {
- let must_repair = prev != entry;
- prev.merge(&entry);
- if must_repair {
- to_repair.insert(entry_key.clone(), Some(prev.clone()));
+ match ret.get_mut(&entry_key) {
+ Some(e) => {
+ if *e != entry {
+ e.merge(&entry);
+ to_repair.insert(entry_key.clone());
}
- ret.insert(entry_key, Some(prev));
}
- Some(None) => unreachable!(),
+ None => {
+ ret.insert(entry_key, entry);
+ }
}
}
+ } else {
+ return Err(Error::unexpected_rpc_message(resp));
}
}
+
if !to_repair.is_empty() {
let self2 = self.clone();
+ let to_repair = to_repair
+ .into_iter()
+ .map(|k| ret.get(&k).unwrap().clone())
+ .collect::<Vec<_>>();
self.system.background.spawn_cancellable(async move {
- for (_, v) in to_repair.iter_mut() {
- self2.repair_on_read(&who[..], v.take().unwrap()).await?;
+ for v in to_repair {
+ self2.repair_on_read(&who[..], v).await?;
}
Ok(())
});
}
- let ret_vec = ret
- .iter_mut()
- .take(limit)
- .map(|(_k, v)| v.take().unwrap())
- .collect::<Vec<_>>();
+
+ // At this point, the `ret` btreemap might contain more than `limit`
+ // items, because nodes might have returned us each `limit` items
+ // but for different keys. We have to take only the first `limit` items
+ // in this map, in the specified enumeration order, for two reasons:
+ // 1. To return to the user no more than the number of items that they requested
+ // 2. To return only items for which we have a read quorum: we do not know
+ // that we have a read quorum for the items after the first `limit`
+ // of them
+ let ret_vec = match enumeration_order {
+ EnumerationOrder::Forward => ret
+ .into_iter()
+ .take(limit)
+ .map(|(_k, v)| v)
+ .collect::<Vec<_>>(),
+ EnumerationOrder::Reverse => ret
+ .into_iter()
+ .rev()
+ .take(limit)
+ .map(|(_k, v)| v)
+ .collect::<Vec<_>>(),
+ };
Ok(ret_vec)
}
@@ -353,9 +400,7 @@ where
&self.endpoint,
who,
TableRpc::<F>::Update(vec![what_enc]),
- RequestStrategy::with_priority(PRIO_NORMAL)
- .with_quorum(who.len())
- .with_timeout(TABLE_RPC_TIMEOUT),
+ RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(who.len()),
)
.await?;
Ok(())
@@ -378,8 +423,20 @@ where
let value = self.data.read_entry(key, sort_key)?;
Ok(TableRpc::ReadEntryResponse(value))
}
- TableRpc::ReadRange(key, begin_sort_key, filter, limit) => {
- let values = self.data.read_range(key, begin_sort_key, filter, *limit)?;
+ TableRpc::ReadRange {
+ partition,
+ begin_sort_key,
+ filter,
+ limit,
+ enumeration_order,
+ } => {
+ let values = self.data.read_range(
+ partition,
+ begin_sort_key,
+ filter,
+ *limit,
+ *enumeration_order,
+ )?;
Ok(TableRpc::Update(values))
}
TableRpc::Update(pairs) => {
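
Editor's note: the long comment in get_range_internal above states the key invariant: after merging per-node responses, only the first `limit` keys in the requested direction are known to be covered by a read quorum. A standalone sketch of that truncation step:

use std::collections::BTreeMap;

fn truncate_to_quorum(merged: BTreeMap<Vec<u8>, String>, limit: usize, reverse: bool) -> Vec<String> {
	// Keys past `limit` may have been returned by only a minority of nodes,
	// so they are dropped rather than returned to the client.
	if reverse {
		merged.into_iter().rev().take(limit).map(|(_k, v)| v).collect()
	} else {
		merged.into_iter().take(limit).map(|(_k, v)| v).collect()
	}
}
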
diff --git a/src/table/util.rs b/src/table/util.rs
index 2a5c3afe..20595a94 100644
--- a/src/table/util.rs
+++ b/src/table/util.rs
@@ -17,7 +17,7 @@ impl PartitionKey for EmptyKey {
}
}
-#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
+#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub enum DeletedFilter {
Any,
Deleted,
@@ -33,3 +33,19 @@ impl DeletedFilter {
}
}
}
+
+#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq)]
+pub enum EnumerationOrder {
+ Forward,
+ Reverse,
+}
+
+impl EnumerationOrder {
+ pub fn from_reverse(reverse: bool) -> Self {
+ if reverse {
+ Self::Reverse
+ } else {
+ Self::Forward
+ }
+ }
+}
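
Editor's note: EnumerationOrder::from_reverse is a small convenience for callers that expose a boolean reverse flag (e.g. a reverse option in a K2V range read):

let order = EnumerationOrder::from_reverse(true);
assert_eq!(order, EnumerationOrder::Reverse);
assert_eq!(EnumerationOrder::from_reverse(false), EnumerationOrder::Forward);
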
diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml
index f13c1589..8e978fc2 100644
--- a/src/util/Cargo.toml
+++ b/src/util/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "garage_util"
-version = "0.7.0"
+version = "0.8.0"
authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018"
license = "AGPL-3.0"
@@ -14,15 +14,21 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
+garage_db = { version = "0.8.0", path = "../db" }
+
+arc-swap = "1.0"
+async-trait = "0.1"
blake2 = "0.9"
+bytes = "1.0"
+digest = "0.10"
err-derive = "0.3"
+git-version = "0.3.4"
xxhash-rust = { version = "0.8", default-features = false, features = ["xxh3"] }
hex = "0.4"
+lazy_static = "1.4"
tracing = "0.1.30"
rand = "0.8"
-sha2 = "0.9"
-
-sled = "0.34"
+sha2 = "0.10"
chrono = "0.4"
rmp-serde = "0.15"
@@ -33,11 +39,13 @@ toml = "0.5"
futures = "0.3"
tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
-#netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" }
-#netapp = { version = "0.4", path = "../../../netapp" }
-netapp = "0.4"
+netapp = "0.5"
http = "0.2"
hyper = "0.14"
opentelemetry = { version = "0.17", features = [ "rt-tokio", "metrics", "trace" ] }
+
+
+[features]
+k2v = []
diff --git a/src/util/async_hash.rs b/src/util/async_hash.rs
new file mode 100644
index 00000000..5631ea6b
--- /dev/null
+++ b/src/util/async_hash.rs
@@ -0,0 +1,61 @@
+use bytes::Bytes;
+use digest::Digest;
+
+use tokio::sync::mpsc;
+use tokio::task::JoinHandle;
+
+use crate::data::*;
+
+/// Compute the sha256 of a slice,
+/// spawning on a tokio thread for CPU-intensive processing.
+/// The argument has to be an owned Bytes, as it is moved out to a new thread.
+pub async fn async_sha256sum(data: Bytes) -> Hash {
+ tokio::task::spawn_blocking(move || sha256sum(&data))
+ .await
+ .unwrap()
+}
+
+/// Compute the blake2sum of a slice,
+/// spawning on a tokio thread for CPU-intensive processing.
+/// The argument has to be an owned Bytes, as it is moved out to a new thread.
+pub async fn async_blake2sum(data: Bytes) -> Hash {
+ tokio::task::spawn_blocking(move || blake2sum(&data))
+ .await
+ .unwrap()
+}
+
+// ----
+
+pub struct AsyncHasher<D: Digest> {
+ sendblk: mpsc::Sender<Bytes>,
+ task: JoinHandle<digest::Output<D>>,
+}
+
+impl<D: Digest> AsyncHasher<D> {
+ pub fn new() -> Self {
+ let (sendblk, mut recvblk) = mpsc::channel::<Bytes>(1);
+ let task = tokio::task::spawn_blocking(move || {
+ let mut digest = D::new();
+ while let Some(blk) = recvblk.blocking_recv() {
+ digest.update(&blk[..]);
+ }
+ digest.finalize()
+ });
+ Self { sendblk, task }
+ }
+
+ pub async fn update(&self, b: Bytes) {
+ self.sendblk.send(b).await.unwrap();
+ }
+
+ pub async fn finalize(self) -> digest::Output<D> {
+ drop(self.sendblk);
+ self.task.await.unwrap()
+ }
+}
+
+impl<D: Digest> Default for AsyncHasher<D> {
+ fn default() -> Self {
+ Self::new()
+ }
+}
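
Editor's note: a usage sketch for AsyncHasher, assuming the sha2 crate (already a dependency, and implementing digest 0.10's traits) for the Digest implementation:

use bytes::Bytes;
use sha2::{Digest, Sha256};

async fn streamed_digest() -> sha2::digest::Output<Sha256> {
	let hasher = AsyncHasher::<Sha256>::new();
	hasher.update(Bytes::from_static(b"hello ")).await;
	hasher.update(Bytes::from_static(b"world")).await;
	let out = hasher.finalize().await;
	// Same result as hashing the concatenation in one go:
	assert_eq!(out[..], Sha256::digest(b"hello world")[..]);
	out
}
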
diff --git a/src/util/background.rs b/src/util/background.rs
deleted file mode 100644
index bfdaaf1e..00000000
--- a/src/util/background.rs
+++ /dev/null
@@ -1,153 +0,0 @@
-//! Job runner for futures and async functions
-use core::future::Future;
-use std::pin::Pin;
-use std::sync::Arc;
-use std::time::Duration;
-
-use futures::future::*;
-use futures::select;
-use tokio::sync::{mpsc, watch, Mutex};
-
-use crate::error::Error;
-
-type JobOutput = Result<(), Error>;
-type Job = Pin<Box<dyn Future<Output = JobOutput> + Send>>;
-
-/// Job runner for futures and async functions
-pub struct BackgroundRunner {
- stop_signal: watch::Receiver<bool>,
- queue_in: mpsc::UnboundedSender<(Job, bool)>,
- worker_in: mpsc::UnboundedSender<tokio::task::JoinHandle<()>>,
-}
-
-impl BackgroundRunner {
- /// Create a new BackgroundRunner
- pub fn new(
- n_runners: usize,
- stop_signal: watch::Receiver<bool>,
- ) -> (Arc<Self>, tokio::task::JoinHandle<()>) {
- let (worker_in, mut worker_out) = mpsc::unbounded_channel();
-
- let stop_signal_2 = stop_signal.clone();
- let await_all_done = tokio::spawn(async move {
- loop {
- let wkr = {
- select! {
- item = worker_out.recv().fuse() => {
- match item {
- Some(x) => x,
- None => break,
- }
- }
- _ = tokio::time::sleep(Duration::from_secs(5)).fuse() => {
- if *stop_signal_2.borrow() {
- break;
- } else {
- continue;
- }
- }
- }
- };
- if let Err(e) = wkr.await {
- error!("Error while awaiting for worker: {}", e);
- }
- }
- });
-
- let (queue_in, queue_out) = mpsc::unbounded_channel();
- let queue_out = Arc::new(Mutex::new(queue_out));
-
- for i in 0..n_runners {
- let queue_out = queue_out.clone();
- let stop_signal = stop_signal.clone();
-
- worker_in
- .send(tokio::spawn(async move {
- loop {
- let (job, cancellable) = {
- select! {
- item = wait_job(&queue_out).fuse() => match item {
- // We received a task, process it
- Some(x) => x,
- // We received a signal that no more tasks will ever be sent
- // because the sending side was dropped. Exit now.
- None => break,
- },
- _ = tokio::time::sleep(Duration::from_secs(5)).fuse() => {
- if *stop_signal.borrow() {
- // Nothing has been going on for 5 secs, and we are shutting
- // down. Exit now.
- break;
- } else {
- // Nothing is going on but we don't want to exit.
- continue;
- }
- }
- }
- };
- if cancellable && *stop_signal.borrow() {
- continue;
- }
- if let Err(e) = job.await {
- error!("Job failed: {}", e)
- }
- }
- info!("Background worker {} exiting", i);
- }))
- .unwrap();
- }
-
- let bgrunner = Arc::new(Self {
- stop_signal,
- queue_in,
- worker_in,
- });
- (bgrunner, await_all_done)
- }
-
- /// Spawn a task to be run in background
- pub fn spawn<T>(&self, job: T)
- where
- T: Future<Output = JobOutput> + Send + 'static,
- {
- let boxed: Job = Box::pin(job);
- self.queue_in
- .send((boxed, false))
- .map_err(|_| "could not put job in queue")
- .unwrap();
- }
-
- /// Spawn a task to be run in background. It may get discarded before running if spawned while
- /// the runner is stopping
- pub fn spawn_cancellable<T>(&self, job: T)
- where
- T: Future<Output = JobOutput> + Send + 'static,
- {
- let boxed: Job = Box::pin(job);
- self.queue_in
- .send((boxed, true))
- .map_err(|_| "could not put job in queue")
- .unwrap();
- }
-
- pub fn spawn_worker<F, T>(&self, name: String, worker: F)
- where
- F: FnOnce(watch::Receiver<bool>) -> T + Send + 'static,
- T: Future<Output = ()> + Send + 'static,
- {
- let stop_signal = self.stop_signal.clone();
- let task = tokio::spawn(async move {
- info!("Worker started: {}", name);
- worker(stop_signal).await;
- info!("Worker exited: {}", name);
- });
- self.worker_in
- .send(task)
- .map_err(|_| "could not put job in queue")
- .unwrap();
- }
-}
-
-async fn wait_job(q: &Mutex<mpsc::UnboundedReceiver<(Job, bool)>>) -> Option<(Job, bool)> {
- q.lock().await.recv().await
-}
diff --git a/src/util/background/job_worker.rs b/src/util/background/job_worker.rs
new file mode 100644
index 00000000..2568ea11
--- /dev/null
+++ b/src/util/background/job_worker.rs
@@ -0,0 +1,48 @@
+//! Job worker: a generic worker that just processes incoming
+//! jobs one by one
+
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use tokio::sync::{mpsc, Mutex};
+
+use crate::background::worker::*;
+use crate::background::*;
+
+pub(crate) struct JobWorker {
+ pub(crate) index: usize,
+ pub(crate) job_chan: Arc<Mutex<mpsc::UnboundedReceiver<(Job, bool)>>>,
+ pub(crate) next_job: Option<Job>,
+}
+
+#[async_trait]
+impl Worker for JobWorker {
+ fn name(&self) -> String {
+ format!("Job worker #{}", self.index)
+ }
+
+ async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+ match self.next_job.take() {
+ None => return Ok(WorkerState::Idle),
+ Some(job) => {
+ job.await?;
+ Ok(WorkerState::Busy)
+ }
+ }
+ }
+
+ async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
+ loop {
+ match self.job_chan.lock().await.recv().await {
+ Some((job, cancellable)) => {
+ if cancellable && *must_exit.borrow() {
+ continue;
+ }
+ self.next_job = Some(job);
+ return WorkerState::Busy;
+ }
+ None => return WorkerState::Done,
+ }
+ }
+ }
+}
diff --git a/src/util/background/mod.rs b/src/util/background/mod.rs
new file mode 100644
index 00000000..619f5068
--- /dev/null
+++ b/src/util/background/mod.rs
@@ -0,0 +1,117 @@
+//! Job runner for futures and async functions
+
+pub mod job_worker;
+pub mod worker;
+
+use core::future::Future;
+
+use std::collections::HashMap;
+use std::pin::Pin;
+use std::sync::Arc;
+
+use serde::{Deserialize, Serialize};
+use tokio::sync::{mpsc, watch, Mutex};
+
+use crate::error::Error;
+use worker::WorkerProcessor;
+pub use worker::{Worker, WorkerState};
+
+pub(crate) type JobOutput = Result<(), Error>;
+pub(crate) type Job = Pin<Box<dyn Future<Output = JobOutput> + Send>>;
+
+/// Job runner for futures and async functions
+pub struct BackgroundRunner {
+ send_job: mpsc::UnboundedSender<(Job, bool)>,
+ send_worker: mpsc::UnboundedSender<Box<dyn Worker>>,
+ worker_info: Arc<std::sync::Mutex<HashMap<usize, WorkerInfo>>>,
+}
+
+#[derive(Clone, Serialize, Deserialize, Debug)]
+pub struct WorkerInfo {
+ pub name: String,
+ pub info: Option<String>,
+ pub state: WorkerState,
+ pub errors: usize,
+ pub consecutive_errors: usize,
+ pub last_error: Option<(String, u64)>,
+}
+
+impl BackgroundRunner {
+ /// Create a new BackgroundRunner
+ pub fn new(
+ n_runners: usize,
+ stop_signal: watch::Receiver<bool>,
+ ) -> (Arc<Self>, tokio::task::JoinHandle<()>) {
+ let (send_worker, worker_out) = mpsc::unbounded_channel::<Box<dyn Worker>>();
+
+ let worker_info = Arc::new(std::sync::Mutex::new(HashMap::new()));
+ let mut worker_processor =
+ WorkerProcessor::new(worker_out, stop_signal, worker_info.clone());
+
+ let await_all_done = tokio::spawn(async move {
+ worker_processor.run().await;
+ });
+
+ let (send_job, queue_out) = mpsc::unbounded_channel();
+ let queue_out = Arc::new(Mutex::new(queue_out));
+
+ for i in 0..n_runners {
+ let queue_out = queue_out.clone();
+
+ send_worker
+ .send(Box::new(job_worker::JobWorker {
+ index: i,
+ job_chan: queue_out.clone(),
+ next_job: None,
+ }))
+ .ok()
+ .unwrap();
+ }
+
+ let bgrunner = Arc::new(Self {
+ send_job,
+ send_worker,
+ worker_info,
+ });
+ (bgrunner, await_all_done)
+ }
+
+ pub fn get_worker_info(&self) -> HashMap<usize, WorkerInfo> {
+ self.worker_info.lock().unwrap().clone()
+ }
+
+ /// Spawn a task to be run in background
+ pub fn spawn<T>(&self, job: T)
+ where
+ T: Future<Output = JobOutput> + Send + 'static,
+ {
+ let boxed: Job = Box::pin(job);
+ self.send_job
+ .send((boxed, false))
+ .ok()
+ .expect("Could not put job in queue");
+ }
+
+ /// Spawn a task to be run in background. It may get discarded before running if spawned while
+ /// the runner is stopping
+ pub fn spawn_cancellable<T>(&self, job: T)
+ where
+ T: Future<Output = JobOutput> + Send + 'static,
+ {
+ let boxed: Job = Box::pin(job);
+ self.send_job
+ .send((boxed, true))
+ .ok()
+ .expect("Could not put job in queue");
+ }
+
+ pub fn spawn_worker<W>(&self, worker: W)
+ where
+ W: Worker + 'static,
+ {
+ self.send_worker
+ .send(Box::new(worker))
+ .ok()
+ .expect("Could not put worker in queue");
+ }
+}
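
Editor's note: wiring this up at startup looks roughly like the following sketch (error handling elided; MyWorker stands for any hypothetical type implementing the Worker trait from worker.rs):

async fn startup() {
	let (stop_tx, stop_rx) = tokio::sync::watch::channel(false);
	let (background, await_done) = BackgroundRunner::new(4, stop_rx);

	background.spawn(async { Ok::<(), Error>(()) }); // must-run job
	background.spawn_cancellable(async { Ok::<(), Error>(()) }); // skippable on shutdown
	background.spawn_worker(MyWorker::default()); // hypothetical Worker impl

	// On shutdown: flip the signal, then wait for the WorkerProcessor to drain.
	let _ = stop_tx.send(true);
	await_done.await.unwrap();
}
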
diff --git a/src/util/background/worker.rs b/src/util/background/worker.rs
new file mode 100644
index 00000000..f5e3addb
--- /dev/null
+++ b/src/util/background/worker.rs
@@ -0,0 +1,260 @@
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use async_trait::async_trait;
+use futures::future::*;
+use futures::stream::FuturesUnordered;
+use futures::StreamExt;
+use serde::{Deserialize, Serialize};
+use tokio::select;
+use tokio::sync::{mpsc, watch};
+
+use crate::background::WorkerInfo;
+use crate::error::Error;
+use crate::time::now_msec;
+
+#[derive(PartialEq, Copy, Clone, Serialize, Deserialize, Debug)]
+pub enum WorkerState {
+ Busy,
+ Throttled(f32),
+ Idle,
+ Done,
+}
+
+impl std::fmt::Display for WorkerState {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ WorkerState::Busy => write!(f, "Busy"),
+ WorkerState::Throttled(t) => write!(f, "Thr:{:.3}", t),
+ WorkerState::Idle => write!(f, "Idle"),
+ WorkerState::Done => write!(f, "Done"),
+ }
+ }
+}
+
+#[async_trait]
+pub trait Worker: Send {
+ fn name(&self) -> String;
+
+ fn info(&self) -> Option<String> {
+ None
+ }
+
+	/// Work: do a basic unit of work, if one is available (otherwise, should return
+	/// WorkerState::Idle immediately). We will do our best not to interrupt this future in the
+	/// middle of processing; it will only be interrupted at the last minute when Garage is trying
+	/// to exit and this hasn't returned yet. This function may return an error if its unit of
+	/// work could not be processed: the error will be logged and .work() will be called again
+	/// after a short delay.
+ async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error>;
+
+	/// Wait for work: wait for some task to become available. This future can be interrupted in
+	/// the middle for any reason. This future doesn't have to await on must_exit.changed(); that
+	/// is done for you. Therefore it only receives a read reference to must_exit, which allows
+	/// it to check whether we are exiting.
+ async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState;
+}
+
+pub(crate) struct WorkerProcessor {
+ stop_signal: watch::Receiver<bool>,
+ worker_chan: mpsc::UnboundedReceiver<Box<dyn Worker>>,
+ worker_info: Arc<std::sync::Mutex<HashMap<usize, WorkerInfo>>>,
+}
+
+impl WorkerProcessor {
+ pub(crate) fn new(
+ worker_chan: mpsc::UnboundedReceiver<Box<dyn Worker>>,
+ stop_signal: watch::Receiver<bool>,
+ worker_info: Arc<std::sync::Mutex<HashMap<usize, WorkerInfo>>>,
+ ) -> Self {
+ Self {
+ stop_signal,
+ worker_chan,
+ worker_info,
+ }
+ }
+
+ pub(crate) async fn run(&mut self) {
+ let mut workers = FuturesUnordered::new();
+ let mut next_task_id = 1;
+
+ while !*self.stop_signal.borrow() {
+ let await_next_worker = async {
+ if workers.is_empty() {
+ futures::future::pending().await
+ } else {
+ workers.next().await
+ }
+ };
+ select! {
+ new_worker_opt = self.worker_chan.recv() => {
+ if let Some(new_worker) = new_worker_opt {
+ let task_id = next_task_id;
+ next_task_id += 1;
+ let stop_signal = self.stop_signal.clone();
+ let stop_signal_worker = self.stop_signal.clone();
+ let mut worker = WorkerHandler {
+ task_id,
+ stop_signal,
+ stop_signal_worker,
+ worker: new_worker,
+ state: WorkerState::Busy,
+ errors: 0,
+ consecutive_errors: 0,
+ last_error: None,
+ };
+ workers.push(async move {
+ worker.step().await;
+ worker
+ }.boxed());
+ }
+ }
+ worker = await_next_worker => {
+ if let Some(mut worker) = worker {
+ trace!("{} (TID {}): {:?}", worker.worker.name(), worker.task_id, worker.state);
+
+ // Save worker info
+ let mut wi = self.worker_info.lock().unwrap();
+ match wi.get_mut(&worker.task_id) {
+ Some(i) => {
+ i.state = worker.state;
+ i.info = worker.worker.info();
+ i.errors = worker.errors;
+ i.consecutive_errors = worker.consecutive_errors;
+ if worker.last_error.is_some() {
+ i.last_error = worker.last_error.take();
+ }
+ }
+ None => {
+ wi.insert(worker.task_id, WorkerInfo {
+ name: worker.worker.name(),
+ state: worker.state,
+ info: worker.worker.info(),
+ errors: worker.errors,
+ consecutive_errors: worker.consecutive_errors,
+ last_error: worker.last_error.take(),
+ });
+ }
+ }
+
+ if worker.state == WorkerState::Done {
+ info!("Worker {} (TID {}) exited", worker.worker.name(), worker.task_id);
+ } else {
+ workers.push(async move {
+ worker.step().await;
+ worker
+ }.boxed());
+ }
+ }
+ }
+ _ = self.stop_signal.changed() => (),
+ }
+ }
+
+ // We are exiting, drain everything
+ let drain_half_time = Instant::now() + Duration::from_secs(5);
+ let drain_everything = async move {
+ while let Some(mut worker) = workers.next().await {
+ if worker.state == WorkerState::Done {
+ info!(
+ "Worker {} (TID {}) exited",
+ worker.worker.name(),
+ worker.task_id
+ );
+ } else if Instant::now() > drain_half_time {
+ warn!("Worker {} (TID {}) interrupted between two iterations in state {:?} (this should be fine)", worker.worker.name(), worker.task_id, worker.state);
+ } else {
+ workers.push(
+ async move {
+ worker.step().await;
+ worker
+ }
+ .boxed(),
+ );
+ }
+ }
+ };
+
+ select! {
+ _ = drain_everything => {
+ info!("All workers exited peacefully \\o/");
+ }
+ _ = tokio::time::sleep(Duration::from_secs(9)) => {
+ error!("Some workers could not exit in time, we are cancelling some things in the middle");
+ }
+ }
+ }
+}
+
+struct WorkerHandler {
+ task_id: usize,
+ stop_signal: watch::Receiver<bool>,
+ stop_signal_worker: watch::Receiver<bool>,
+ worker: Box<dyn Worker>,
+ state: WorkerState,
+ errors: usize,
+ consecutive_errors: usize,
+ last_error: Option<(String, u64)>,
+}
+
+impl WorkerHandler {
+ async fn step(&mut self) {
+ match self.state {
+ WorkerState::Busy => match self.worker.work(&mut self.stop_signal).await {
+ Ok(s) => {
+ self.state = s;
+ self.consecutive_errors = 0;
+ }
+ Err(e) => {
+ error!(
+ "Error in worker {} (TID {}): {}",
+ self.worker.name(),
+ self.task_id,
+ e
+ );
+ self.errors += 1;
+ self.consecutive_errors += 1;
+ self.last_error = Some((format!("{}", e), now_msec()));
+					// Sleep a bit so that the error won't repeat immediately; exponential
+					// backoff strategy (min 1 sec, max ~60 sec)
+ self.state = WorkerState::Throttled(
+ (1.5f32).powf(std::cmp::min(10, self.consecutive_errors - 1) as f32),
+ );
+ }
+ },
+ WorkerState::Throttled(delay) => {
+ // Sleep for given delay and go back to busy state
+ if !*self.stop_signal.borrow() {
+ select! {
+ _ = tokio::time::sleep(Duration::from_secs_f32(delay)) => (),
+ _ = self.stop_signal.changed() => (),
+ }
+ }
+ self.state = WorkerState::Busy;
+ }
+ WorkerState::Idle => {
+ if *self.stop_signal.borrow() {
+ select! {
+ new_st = self.worker.wait_for_work(&self.stop_signal_worker) => {
+ self.state = new_st;
+ }
+ _ = tokio::time::sleep(Duration::from_secs(1)) => {
+ // stay in Idle state
+ }
+ }
+ } else {
+ select! {
+ new_st = self.worker.wait_for_work(&self.stop_signal_worker) => {
+ self.state = new_st;
+ }
+ _ = self.stop_signal.changed() => {
+ // stay in Idle state
+ }
+ }
+ }
+ }
+ WorkerState::Done => unreachable!(),
+ }
+ }
+}
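
Editor's note: to make the Busy/Idle/Done protocol concrete, here is a hedged sketch of a trivial Worker driven by the WorkerHandler state machine above (illustrative, not part of this patch):

use async_trait::async_trait;
use tokio::sync::{mpsc, watch};

use garage_util::background::{Worker, WorkerState};
use garage_util::error::Error;

struct TickWorker {
	rx: mpsc::UnboundedReceiver<u64>,
	pending: Option<u64>,
}

#[async_trait]
impl Worker for TickWorker {
	fn name(&self) -> String {
		"tick worker".into()
	}

	async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
		match self.pending.take() {
			Some(n) => {
				println!("tick {}", n); // one small unit of work per call
				Ok(WorkerState::Busy) // come back soon, there may be more
			}
			None => Ok(WorkerState::Idle), // processor will call wait_for_work
		}
	}

	async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
		match self.rx.recv().await {
			Some(n) => {
				self.pending = Some(n);
				WorkerState::Busy
			}
			None => WorkerState::Done, // channel closed: worker exits cleanly
		}
	}
}
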
diff --git a/src/util/config.rs b/src/util/config.rs
index e4d96476..2d4b4f57 100644
--- a/src/util/config.rs
+++ b/src/util/config.rs
@@ -3,12 +3,8 @@ use std::io::Read;
use std::net::SocketAddr;
use std::path::PathBuf;
-use serde::de::Error as SerdeError;
use serde::{de, Deserialize};
-use netapp::util::parse_and_resolve_peer_addr;
-use netapp::NodeID;
-
use crate::error::Error;
/// Represent the whole configuration
@@ -23,10 +19,6 @@ pub struct Config {
#[serde(default = "default_block_size")]
pub block_size: usize,
- /// Size of data blocks to save to disk
- #[serde(default = "default_block_manager_background_tranquility")]
- pub block_manager_background_tranquility: u32,
-
/// Replication mode. Supported values:
/// - none, 1 -> no replication
/// - 2 -> 2-way replication
@@ -47,11 +39,16 @@ pub struct Config {
/// Address to bind for RPC
pub rpc_bind_addr: SocketAddr,
/// Public IP address of this node
- pub rpc_public_addr: Option<SocketAddr>,
+ pub rpc_public_addr: Option<String>,
+
+	/// Timeout for Netapp's ping messages
+ pub rpc_ping_timeout_msec: Option<u64>,
+ /// Timeout for Netapp RPC calls
+ pub rpc_timeout_msec: Option<u64>,
/// Bootstrap peers RPC address
- #[serde(deserialize_with = "deserialize_vec_addr", default)]
- pub bootstrap_peers: Vec<(NodeID, SocketAddr)>,
+ #[serde(default)]
+ pub bootstrap_peers: Vec<String>,
/// Consul host to connect to to discover more peers
pub consul_host: Option<String>,
/// Consul service name to use
@@ -64,19 +61,27 @@ pub struct Config {
#[serde(default)]
pub kubernetes_skip_crd: bool,
+ // -- DB
+ /// Database engine to use for metadata (options: sled, sqlite, lmdb)
+ #[serde(default = "default_db_engine")]
+ pub db_engine: String,
+
/// Sled cache size, in bytes
#[serde(default = "default_sled_cache_capacity")]
pub sled_cache_capacity: u64,
-
/// Sled flush interval in milliseconds
#[serde(default = "default_sled_flush_every_ms")]
pub sled_flush_every_ms: u64,
+ // -- APIs
/// Configuration for S3 api
- pub s3_api: ApiConfig,
+ pub s3_api: S3ApiConfig,
+
+ /// Configuration for K2V api
+ pub k2v_api: Option<K2VApiConfig>,
/// Configuration for serving files as normal web server
- pub s3_web: WebConfig,
+ pub s3_web: Option<WebConfig>,
/// Configuration for the admin API endpoint
#[serde(default = "Default::default")]
@@ -85,9 +90,9 @@ pub struct Config {
/// Configuration for S3 api
#[derive(Deserialize, Debug, Clone)]
-pub struct ApiConfig {
+pub struct S3ApiConfig {
/// Address and port to bind for api serving
- pub api_bind_addr: SocketAddr,
+ pub api_bind_addr: Option<SocketAddr>,
/// S3 region to use
pub s3_region: String,
/// Suffix to remove from domain name to find bucket. If None,
@@ -95,6 +100,13 @@ pub struct ApiConfig {
pub root_domain: Option<String>,
}
+/// Configuration for K2V api
+#[derive(Deserialize, Debug, Clone)]
+pub struct K2VApiConfig {
+ /// Address and port to bind for api serving
+ pub api_bind_addr: SocketAddr,
+}
+
/// Configuration for serving files as normal web server
#[derive(Deserialize, Debug, Clone)]
pub struct WebConfig {
@@ -109,10 +121,18 @@ pub struct WebConfig {
pub struct AdminConfig {
/// Address and port to bind for admin API serving
pub api_bind_addr: Option<SocketAddr>,
+ /// Bearer token to use to scrape metrics
+ pub metrics_token: Option<String>,
+ /// Bearer token to use to access Admin API endpoints
+ pub admin_token: Option<String>,
/// OTLP server to where to export traces
pub trace_sink: Option<String>,
}
+fn default_db_engine() -> String {
+ "sled".into()
+}
+
fn default_sled_cache_capacity() -> u64 {
128 * 1024 * 1024
}
@@ -122,9 +142,6 @@ fn default_sled_flush_every_ms() -> u64 {
fn default_block_size() -> usize {
1048576
}
-fn default_block_manager_background_tranquility() -> u32 {
- 2
-}
/// Read and parse configuration
pub fn read_config(config_file: PathBuf) -> Result<Config, Error> {
@@ -138,24 +155,6 @@ pub fn read_config(config_file: PathBuf) -> Result<Config, Error> {
Ok(toml::from_str(&config)?)
}
-fn deserialize_vec_addr<'de, D>(deserializer: D) -> Result<Vec<(NodeID, SocketAddr)>, D::Error>
-where
- D: de::Deserializer<'de>,
-{
- let mut ret = vec![];
-
- for peer in <Vec<&str>>::deserialize(deserializer)? {
- let (pubkey, addrs) = parse_and_resolve_peer_addr(peer).ok_or_else(|| {
- D::Error::custom(format!("Unable to parse or resolve peer: {}", peer))
- })?;
- for ip in addrs {
- ret.push((pubkey, ip));
- }
- }
-
- Ok(ret)
-}
-
fn default_compression() -> Option<i32> {
Some(1)
}
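
Editor's note: in TOML terms, the new and changed knobs above correspond to a fragment like the following sketch (values are illustrative, and a real config still needs the unchanged required fields such as metadata_dir and data_dir):

fn parse_fragment() {
	let fragment = r#"
db_engine = "lmdb"             # new: sled (default), sqlite or lmdb
rpc_ping_timeout_msec = 10000  # new, optional
rpc_timeout_msec = 30000       # new, optional: replaces per-call constants
bootstrap_peers = [ "abcdef...@10.0.0.1:3901" ]  # now plain strings

[s3_api]
s3_region = "garage"
api_bind_addr = "[::]:3900"    # now optional

[k2v_api]
api_bind_addr = "[::]:3904"    # new, optional section
"#;
	let parsed: toml::Value = toml::from_str(fragment).unwrap();
	assert_eq!(parsed["db_engine"].as_str(), Some("lmdb"));
}
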
diff --git a/src/util/crdt/bool.rs b/src/util/crdt/bool.rs
index 53af8f82..111eb5f1 100644
--- a/src/util/crdt/bool.rs
+++ b/src/util/crdt/bool.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
use crate::crdt::crdt::*;
/// Boolean, where `true` is an absorbing state
-#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct Bool(bool);
impl Bool {
diff --git a/src/util/crdt/deletable.rs b/src/util/crdt/deletable.rs
index c76f5cbb..e771aceb 100644
--- a/src/util/crdt/deletable.rs
+++ b/src/util/crdt/deletable.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
use crate::crdt::crdt::*;
/// Deletable object (once deleted, cannot go back)
-#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub enum Deletable<T> {
Present(T),
Deleted,
diff --git a/src/util/crdt/lww.rs b/src/util/crdt/lww.rs
index 254abe8e..958844c9 100644
--- a/src/util/crdt/lww.rs
+++ b/src/util/crdt/lww.rs
@@ -37,7 +37,7 @@ use crate::crdt::crdt::*;
///
/// This scheme is used by AWS S3 or Soundcloud, and often unknowingly in
/// enterprise software when reconciling databases with ad-hoc scripts.
-#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct Lww<T> {
ts: u64,
v: T,
diff --git a/src/util/crdt/lww_map.rs b/src/util/crdt/lww_map.rs
index c155c3a8..88113856 100644
--- a/src/util/crdt/lww_map.rs
+++ b/src/util/crdt/lww_map.rs
@@ -23,7 +23,7 @@ use crate::crdt::crdt::*;
/// However, note that even if we were using a more efficient data structure such as a `BTreeMap`,
/// the serialization cost `O(n)` would still have to be paid at each modification, so we are
/// actually not losing anything here.
-#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct LwwMap<K, V> {
vals: Vec<(K, u64, V)>,
}
@@ -140,6 +140,11 @@ where
self.vals.clear();
}
+ /// Retain only values that match a certain predicate
+ pub fn retain(&mut self, pred: impl FnMut(&(K, u64, V)) -> bool) {
+ self.vals.retain(pred);
+ }
+
/// Get a reference to the value assigned to a key
pub fn get(&self, k: &K) -> Option<&V> {
match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(k)) {
diff --git a/src/util/crdt/map.rs b/src/util/crdt/map.rs
index f9ed19b6..5d1e1520 100644
--- a/src/util/crdt/map.rs
+++ b/src/util/crdt/map.rs
@@ -16,7 +16,7 @@ use crate::crdt::crdt::*;
/// However, note that even if we were using a more efficient data structure such as a `BTreeMap`,
/// the serialization cost `O(n)` would still have to be paid at each modification, so we are
/// actually not losing anything here.
-#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct Map<K, V> {
vals: Vec<(K, V)>,
}
diff --git a/src/util/error.rs b/src/util/error.rs
index bdb3a69b..9995c746 100644
--- a/src/util/error.rs
+++ b/src/util/error.rs
@@ -26,8 +26,8 @@ pub enum Error {
#[error(display = "Netapp error: {}", _0)]
Netapp(#[error(source)] netapp::error::Error),
- #[error(display = "Sled error: {}", _0)]
- Sled(#[error(source)] sled::Error),
+ #[error(display = "DB error: {}", _0)]
+ Db(#[error(source)] garage_db::Error),
#[error(display = "Messagepack encode error: {}", _0)]
RmpEncode(#[error(source)] rmp_serde::encode::Error),
@@ -44,6 +44,9 @@ pub enum Error {
#[error(display = "Tokio semaphore acquire error: {}", _0)]
TokioSemAcquire(#[error(source)] tokio::sync::AcquireError),
+ #[error(display = "Tokio broadcast receive error: {}", _0)]
+ TokioBcastRecv(#[error(source)] tokio::sync::broadcast::error::RecvError),
+
#[error(display = "Remote error: {}", _0)]
RemoteError(String),
@@ -75,11 +78,11 @@ impl Error {
}
}
-impl From<sled::transaction::TransactionError<Error>> for Error {
- fn from(e: sled::transaction::TransactionError<Error>) -> Error {
+impl From<garage_db::TxError<Error>> for Error {
+ fn from(e: garage_db::TxError<Error>) -> Error {
match e {
- sled::transaction::TransactionError::Abort(x) => x,
- sled::transaction::TransactionError::Storage(x) => Error::Sled(x),
+ garage_db::TxError::Abort(x) => x,
+ garage_db::TxError::Db(x) => Error::Db(x),
}
}
}
diff --git a/src/util/formater.rs b/src/util/formater.rs
new file mode 100644
index 00000000..95324f9a
--- /dev/null
+++ b/src/util/formater.rs
@@ -0,0 +1,28 @@
+pub fn format_table(data: Vec<String>) {
+ let data = data
+ .iter()
+ .map(|s| s.split('\t').collect::<Vec<_>>())
+ .collect::<Vec<_>>();
+
+ let columns = data.iter().map(|row| row.len()).fold(0, std::cmp::max);
+ let mut column_size = vec![0; columns];
+
+ let mut out = String::new();
+
+ for row in data.iter() {
+ for (i, col) in row.iter().enumerate() {
+ column_size[i] = std::cmp::max(column_size[i], col.chars().count());
+ }
+ }
+
+ for row in data.iter() {
+ for (col, col_len) in row[..row.len() - 1].iter().zip(column_size.iter()) {
+ out.push_str(col);
+ (0..col_len - col.chars().count() + 2).for_each(|_| out.push(' '));
+ }
+ out.push_str(row[row.len() - 1]);
+ out.push('\n');
+ }
+
+ print!("{}", out);
+}
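
Editor's note: format_table pads every column (except the last) to its widest cell plus two spaces, using character counts rather than byte lengths so multi-byte UTF-8 aligns correctly. For example:

fn main() {
	format_table(vec![
		"ID\tName\tStatus".to_string(),
		"1\talpha\tok".to_string(),
		"2\tbeta\tdegraded".to_string(),
	]);
	// Prints:
	// ID  Name   Status
	// 1   alpha  ok
	// 2   beta   degraded
}
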
diff --git a/src/util/lib.rs b/src/util/lib.rs
index e83fc2e6..264cc192 100644
--- a/src/util/lib.rs
+++ b/src/util/lib.rs
@@ -3,14 +3,16 @@
#[macro_use]
extern crate tracing;
+pub mod async_hash;
pub mod background;
pub mod config;
pub mod crdt;
pub mod data;
pub mod error;
+pub mod formater;
pub mod metrics;
pub mod persister;
-pub mod sled_counter;
pub mod time;
pub mod token_bucket;
pub mod tranquilizer;
+pub mod version;
diff --git a/src/util/metrics.rs b/src/util/metrics.rs
index 1b05eabe..b882a886 100644
--- a/src/util/metrics.rs
+++ b/src/util/metrics.rs
@@ -1,4 +1,4 @@
-use std::time::SystemTime;
+use std::time::Instant;
use futures::{future::BoxFuture, Future, FutureExt};
use rand::Rng;
@@ -28,10 +28,12 @@ where
attributes: &'a [KeyValue],
) -> BoxFuture<'a, Self::Output> {
async move {
- let request_start = SystemTime::now();
+ let request_start = Instant::now();
let res = self.await;
r.record(
- request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()),
+ Instant::now()
+ .saturating_duration_since(request_start)
+ .as_secs_f64(),
attributes,
);
res
@@ -41,9 +43,13 @@ where
fn bound_record_duration(self, r: &'a BoundValueRecorder<f64>) -> BoxFuture<'a, Self::Output> {
async move {
- let request_start = SystemTime::now();
+ let request_start = Instant::now();
let res = self.await;
- r.record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()));
+ r.record(
+ Instant::now()
+ .saturating_duration_since(request_start)
+ .as_secs_f64(),
+ );
res
}
.boxed()
diff --git a/src/util/sled_counter.rs b/src/util/sled_counter.rs
deleted file mode 100644
index bc54cea0..00000000
--- a/src/util/sled_counter.rs
+++ /dev/null
@@ -1,100 +0,0 @@
-use std::sync::{
- atomic::{AtomicUsize, Ordering},
- Arc,
-};
-
-use sled::{CompareAndSwapError, IVec, Iter, Result, Tree};
-
-#[derive(Clone)]
-pub struct SledCountedTree(Arc<SledCountedTreeInternal>);
-
-struct SledCountedTreeInternal {
- tree: Tree,
- len: AtomicUsize,
-}
-
-impl SledCountedTree {
- pub fn new(tree: Tree) -> Self {
- let len = tree.len();
- Self(Arc::new(SledCountedTreeInternal {
- tree,
- len: AtomicUsize::new(len),
- }))
- }
-
- pub fn len(&self) -> usize {
- self.0.len.load(Ordering::Relaxed)
- }
-
- pub fn is_empty(&self) -> bool {
- self.0.tree.is_empty()
- }
-
- pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<IVec>> {
- self.0.tree.get(key)
- }
-
- pub fn iter(&self) -> Iter {
- self.0.tree.iter()
- }
-
- // ---- writing functions ----
-
- pub fn insert<K, V>(&self, key: K, value: V) -> Result<Option<IVec>>
- where
- K: AsRef<[u8]>,
- V: Into<IVec>,
- {
- let res = self.0.tree.insert(key, value);
- if res == Ok(None) {
- self.0.len.fetch_add(1, Ordering::Relaxed);
- }
- res
- }
-
- pub fn remove<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<IVec>> {
- let res = self.0.tree.remove(key);
- if matches!(res, Ok(Some(_))) {
- self.0.len.fetch_sub(1, Ordering::Relaxed);
- }
- res
- }
-
- pub fn pop_min(&self) -> Result<Option<(IVec, IVec)>> {
- let res = self.0.tree.pop_min();
- if let Ok(Some(_)) = &res {
- self.0.len.fetch_sub(1, Ordering::Relaxed);
- };
- res
- }
-
- pub fn compare_and_swap<K, OV, NV>(
- &self,
- key: K,
- old: Option<OV>,
- new: Option<NV>,
- ) -> Result<std::result::Result<(), CompareAndSwapError>>
- where
- K: AsRef<[u8]>,
- OV: AsRef<[u8]>,
- NV: Into<IVec>,
- {
- let old_some = old.is_some();
- let new_some = new.is_some();
-
- let res = self.0.tree.compare_and_swap(key, old, new);
-
- if res == Ok(Ok(())) {
- match (old_some, new_some) {
- (false, true) => {
- self.0.len.fetch_add(1, Ordering::Relaxed);
- }
- (true, false) => {
- self.0.len.fetch_sub(1, Ordering::Relaxed);
- }
- _ => (),
- }
- }
- res
- }
-}
diff --git a/src/util/tranquilizer.rs b/src/util/tranquilizer.rs
index 28711387..8a96cbb3 100644
--- a/src/util/tranquilizer.rs
+++ b/src/util/tranquilizer.rs
@@ -3,6 +3,8 @@ use std::time::{Duration, Instant};
use tokio::time::sleep;
+use crate::background::WorkerState;
+
/// A tranquilizer is a helper object that is used to make
/// background operations not take up too much time.
///
@@ -33,8 +35,8 @@ impl Tranquilizer {
}
}
- pub async fn tranquilize(&mut self, tranquility: u32) {
- let observation = Instant::now() - self.last_step_begin;
+ fn tranquilize_internal(&mut self, tranquility: u32) -> Option<Duration> {
+ let observation = Instant::now().saturating_duration_since(self.last_step_begin);
self.observations.push_back(observation);
self.sum_observations += observation;
@@ -45,13 +47,32 @@ impl Tranquilizer {
if !self.observations.is_empty() {
let delay = (tranquility * self.sum_observations) / (self.observations.len() as u32);
+ Some(delay)
+ } else {
+ None
+ }
+ }
+
+ pub async fn tranquilize(&mut self, tranquility: u32) {
+ if let Some(delay) = self.tranquilize_internal(tranquility) {
sleep(delay).await;
+ self.reset();
}
+ }
- self.reset();
+ #[must_use]
+ pub fn tranquilize_worker(&mut self, tranquility: u32) -> WorkerState {
+ match self.tranquilize_internal(tranquility) {
+ Some(delay) => WorkerState::Throttled(delay.as_secs_f32()),
+ None => WorkerState::Busy,
+ }
}
pub fn reset(&mut self) {
self.last_step_begin = Instant::now();
}
+
+ pub fn clear(&mut self) {
+ self.observations.clear();
+ }
}
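
Editor's note: tranquilize_worker is the Worker-friendly variant: instead of sleeping itself, it returns a Throttled state so the WorkerProcessor performs the delay. A hedged sketch of its use from a worker's work step (the struct and process_one_item are hypothetical):

struct ThrottledWorker {
	tranquilizer: Tranquilizer,
	tranquility: u32,
}

impl ThrottledWorker {
	fn process_one_item(&mut self) -> Result<(), Error> {
		Ok(()) // hypothetical unit of work
	}

	async fn work_once(&mut self) -> Result<WorkerState, Error> {
		self.tranquilizer.reset(); // start timing this unit of work
		self.process_one_item()?;
		// Throttled(delay) sized to tranquility × the mean observed work
		// duration, or Busy when there are no observations yet.
		Ok(self.tranquilizer.tranquilize_worker(self.tranquility))
	}
}
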
diff --git a/src/util/version.rs b/src/util/version.rs
new file mode 100644
index 00000000..b515dccc
--- /dev/null
+++ b/src/util/version.rs
@@ -0,0 +1,28 @@
+use std::sync::Arc;
+
+use arc_swap::{ArcSwap, ArcSwapOption};
+
+lazy_static::lazy_static! {
+ static ref VERSION: ArcSwap<&'static str> = ArcSwap::new(Arc::new(git_version::git_version!(
+ prefix = "git:",
+ cargo_prefix = "cargo:",
+ fallback = "unknown"
+ )));
+ static ref FEATURES: ArcSwapOption<&'static [&'static str]> = ArcSwapOption::new(None);
+}
+
+pub fn garage_version() -> &'static str {
+ &VERSION.load()
+}
+
+pub fn garage_features() -> Option<&'static [&'static str]> {
+ FEATURES.load().as_ref().map(|f| &f[..])
+}
+
+pub fn init_version(version: &'static str) {
+ VERSION.store(Arc::new(version));
+}
+
+pub fn init_features(features: &'static [&'static str]) {
+ FEATURES.store(Some(Arc::new(features)));
+}
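
Editor's note: these functions are not called anywhere in this diff; the apparent intent (an assumption from the API shape) is that the top-level binary overrides the fallback version compiled into garage_util, e.g.:

fn main() {
	garage_util::version::init_version(git_version::git_version!(
		prefix = "git:",
		cargo_prefix = "cargo:",
		fallback = "unknown"
	));
	garage_util::version::init_features(&["k2v"]); // hypothetical feature list
	println!("garage {}", garage_util::version::garage_version());
}
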
diff --git a/src/web/Cargo.toml b/src/web/Cargo.toml
index 59a1231d..7bf70c55 100644
--- a/src/web/Cargo.toml
+++ b/src/web/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "garage_web"
-version = "0.7.0"
+version = "0.8.0"
authors = ["Alex Auvolat <alex@adnab.me>", "Quentin Dufour <quentin@dufour.io>"]
edition = "2018"
license = "AGPL-3.0"
@@ -14,10 +14,10 @@ path = "lib.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
-garage_api = { version = "0.7.0", path = "../api" }
-garage_model = { version = "0.7.0", path = "../model" }
-garage_util = { version = "0.7.0", path = "../util" }
-garage_table = { version = "0.7.0", path = "../table" }
+garage_api = { version = "0.8.0", path = "../api" }
+garage_model = { version = "0.8.0", path = "../model" }
+garage_util = { version = "0.8.0", path = "../util" }
+garage_table = { version = "0.8.0", path = "../table" }
err-derive = "0.3"
tracing = "0.1.30"
diff --git a/src/web/error.rs b/src/web/error.rs
index 55990e9d..bd8f17b5 100644
--- a/src/web/error.rs
+++ b/src/web/error.rs
@@ -2,57 +2,47 @@ use err_derive::Error;
use hyper::header::HeaderValue;
use hyper::{HeaderMap, StatusCode};
-use garage_util::error::Error as GarageError;
+use garage_api::generic_server::ApiError;
/// Errors of this crate
#[derive(Debug, Error)]
pub enum Error {
/// An error received from the API crate
#[error(display = "API error: {}", _0)]
- ApiError(#[error(source)] garage_api::Error),
-
- // Category: internal error
- /// Error internal to garage
- #[error(display = "Internal error: {}", _0)]
- InternalError(#[error(source)] GarageError),
+ ApiError(garage_api::s3::error::Error),
/// The file does not exist
#[error(display = "Not found")]
NotFound,
- /// The request contained an invalid UTF-8 sequence in its path or in other parameters
- #[error(display = "Invalid UTF-8: {}", _0)]
- InvalidUtf8(#[error(source)] std::str::Utf8Error),
-
- /// The client send a header with invalid value
- #[error(display = "Invalid header value: {}", _0)]
- InvalidHeader(#[error(source)] hyper::header::ToStrError),
-
/// The client sent a request without host, or with unsupported method
#[error(display = "Bad request: {}", _0)]
BadRequest(String),
}
+impl<T> From<T> for Error
+where
+ garage_api::s3::error::Error: From<T>,
+{
+ fn from(err: T) -> Self {
+ Error::ApiError(garage_api::s3::error::Error::from(err))
+ }
+}
+
impl Error {
/// Transform errors into http status code
pub fn http_status_code(&self) -> StatusCode {
match self {
Error::NotFound => StatusCode::NOT_FOUND,
Error::ApiError(e) => e.http_status_code(),
- Error::InternalError(
- GarageError::Timeout
- | GarageError::RemoteError(_)
- | GarageError::Quorum(_, _, _, _),
- ) => StatusCode::SERVICE_UNAVAILABLE,
- Error::InternalError(_) => StatusCode::INTERNAL_SERVER_ERROR,
- _ => StatusCode::BAD_REQUEST,
+ Error::BadRequest(_) => StatusCode::BAD_REQUEST,
}
}
pub fn add_headers(&self, header_map: &mut HeaderMap<HeaderValue>) {
#[allow(clippy::single_match)]
match self {
- Error::ApiError(e) => e.add_headers(header_map),
+ Error::ApiError(e) => e.add_http_headers(header_map),
_ => (),
}
}
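
Editor's note: the blanket impl above lets any error that already converts into the S3 API error funnel into this crate's Error with a single `?`. A sketch of the resulting conversion chain (the helper function is hypothetical):

use hyper::header::HOST;
use hyper::{Body, Request};

fn authority_of(req: &Request<Body>) -> Result<&str, Error> {
	let host = req
		.headers()
		.get(HOST)
		.ok_or_else(|| Error::BadRequest("HOST header required".into()))?;
	// hyper's ToStrError converts into garage_api::s3::error::Error, which the
	// blanket impl above converts into the web crate's Error.
	Ok(host.to_str()?)
}
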
diff --git a/src/web/lib.rs b/src/web/lib.rs
index 9b7c8573..7207c365 100644
--- a/src/web/lib.rs
+++ b/src/web/lib.rs
@@ -6,4 +6,4 @@ mod error;
pub use error::Error;
mod web_server;
-pub use web_server::run_web_server;
+pub use web_server::WebServer;
diff --git a/src/web/web_server.rs b/src/web/web_server.rs
index c3d691d0..c2322073 100644
--- a/src/web/web_server.rs
+++ b/src/web/web_server.rs
@@ -18,10 +18,12 @@ use opentelemetry::{
use crate::error::*;
-use garage_api::error::{Error as ApiError, OkOrBadRequest, OkOrInternalError};
use garage_api::helpers::{authority_to_host, host_to_bucket};
-use garage_api::s3_cors::{add_cors_headers, find_matching_cors_rule, handle_options_for_bucket};
-use garage_api::s3_get::{handle_get, handle_head};
+use garage_api::s3::cors::{add_cors_headers, find_matching_cors_rule, handle_options_for_bucket};
+use garage_api::s3::error::{
+ CommonErrorDerivative, Error as ApiError, OkOrBadRequest, OkOrInternalError,
+};
+use garage_api::s3::get::{handle_get, handle_head};
use garage_model::garage::Garage;
@@ -55,90 +57,226 @@ impl WebMetrics {
}
}
-/// Run a web server
-pub async fn run_web_server(
+pub struct WebServer {
garage: Arc<Garage>,
- shutdown_signal: impl Future<Output = ()>,
-) -> Result<(), GarageError> {
- let addr = &garage.config.s3_web.bind_addr;
+ metrics: Arc<WebMetrics>,
+ root_domain: String,
+}
- let metrics = Arc::new(WebMetrics::new());
+impl WebServer {
+ /// Run a web server
+ pub async fn run(
+ garage: Arc<Garage>,
+ addr: SocketAddr,
+ root_domain: String,
+ shutdown_signal: impl Future<Output = ()>,
+ ) -> Result<(), GarageError> {
+ let metrics = Arc::new(WebMetrics::new());
+ let web_server = Arc::new(WebServer {
+ garage,
+ metrics,
+ root_domain,
+ });
+
+ let service = make_service_fn(|conn: &AddrStream| {
+ let web_server = web_server.clone();
+
+ let client_addr = conn.remote_addr();
+ async move {
+ Ok::<_, Error>(service_fn(move |req: Request<Body>| {
+ let web_server = web_server.clone();
+
+ web_server.handle_request(req, client_addr)
+ }))
+ }
+ });
- let service = make_service_fn(|conn: &AddrStream| {
- let garage = garage.clone();
- let metrics = metrics.clone();
+ let server = Server::bind(&addr).serve(service);
+ let graceful = server.with_graceful_shutdown(shutdown_signal);
+ info!("Web server listening on http://{}", addr);
- let client_addr = conn.remote_addr();
- async move {
- Ok::<_, Error>(service_fn(move |req: Request<Body>| {
- let garage = garage.clone();
- let metrics = metrics.clone();
+ graceful.await?;
+ Ok(())
+ }
- handle_request(garage, metrics, req, client_addr)
- }))
+ async fn handle_request(
+ self: Arc<Self>,
+ req: Request<Body>,
+ addr: SocketAddr,
+ ) -> Result<Response<Body>, Infallible> {
+ info!("{} {} {}", addr, req.method(), req.uri());
+
+ // Lots of instrumentation
+ let tracer = opentelemetry::global::tracer("garage");
+ let span = tracer
+ .span_builder(format!("Web {} request", req.method()))
+ .with_trace_id(gen_trace_id())
+ .with_attributes(vec![
+ KeyValue::new("method", format!("{}", req.method())),
+ KeyValue::new("uri", req.uri().to_string()),
+ ])
+ .start(&tracer);
+
+ let metrics_tags = &[KeyValue::new("method", req.method().to_string())];
+
+ // The actual handler
+ let res = self
+ .serve_file(&req)
+ .with_context(Context::current_with_span(span))
+ .record_duration(&self.metrics.request_duration, &metrics_tags[..])
+ .await;
+
+ // More instrumentation
+ self.metrics.request_counter.add(1, &metrics_tags[..]);
+
+ // Returning the result
+ match res {
+ Ok(res) => {
+ debug!("{} {} {}", req.method(), res.status(), req.uri());
+ Ok(res)
+ }
+ Err(error) => {
+ info!(
+ "{} {} {} {}",
+ req.method(),
+ error.http_status_code(),
+ req.uri(),
+ error
+ );
+ self.metrics.error_counter.add(
+ 1,
+ &[
+ metrics_tags[0].clone(),
+ KeyValue::new("status_code", error.http_status_code().to_string()),
+ ],
+ );
+ Ok(error_to_res(error))
+ }
}
- });
+ }
- let server = Server::bind(addr).serve(service);
- let graceful = server.with_graceful_shutdown(shutdown_signal);
- info!("Web server listening on http://{}", addr);
+ async fn serve_file(self: &Arc<Self>, req: &Request<Body>) -> Result<Response<Body>, Error> {
+		// Get HTTP authority string (e.g. [::1]:3902 or garage.tld:80)
+ let authority = req
+ .headers()
+ .get(HOST)
+ .ok_or_bad_request("HOST header required")?
+ .to_str()?;
+
+ // Get bucket
+ let host = authority_to_host(authority)?;
+
+ let bucket_name = host_to_bucket(&host, &self.root_domain).unwrap_or(&host);
+ let bucket_id = self
+ .garage
+ .bucket_alias_table
+ .get(&EmptyKey, &bucket_name.to_string())
+ .await?
+ .and_then(|x| x.state.take())
+ .ok_or(Error::NotFound)?;
+
+ // Check bucket isn't deleted and has website access enabled
+ let bucket = self
+ .garage
+ .bucket_table
+ .get(&EmptyKey, &bucket_id)
+ .await?
+ .ok_or(Error::NotFound)?;
+
+ let website_config = bucket
+ .params()
+ .ok_or(Error::NotFound)?
+ .website_config
+ .get()
+ .as_ref()
+ .ok_or(Error::NotFound)?;
+
+ // Get path
+ let path = req.uri().path().to_string();
+ let index = &website_config.index_document;
+ let key = path_to_key(&path, index)?;
+
+ debug!(
+ "Selected bucket: \"{}\" {:?}, selected key: \"{}\"",
+ bucket_name, bucket_id, key
+ );
+
+ let ret_doc = match *req.method() {
+ Method::OPTIONS => handle_options_for_bucket(req, &bucket),
+ Method::HEAD => handle_head(self.garage.clone(), req, bucket_id, &key, None).await,
+ Method::GET => handle_get(self.garage.clone(), req, bucket_id, &key, None).await,
+ _ => Err(ApiError::bad_request("HTTP method not supported")),
+ }
+ .map_err(Error::from);
+
+ match ret_doc {
+ Err(error) => {
+ // For a HEAD or OPTIONS request, and for non-4xx errors,
+ // we don't return the error document as content:
+ // we return early with the error itself, relying on
+ // error_to_res, which is called whenever we return an Err.
+ if *req.method() == Method::HEAD
+ || *req.method() == Method::OPTIONS
+ || !error.http_status_code().is_client_error()
+ {
+ return Err(error);
+ }
- graceful.await?;
- Ok(())
-}
+ // If no error document is set: just return the error directly
+ let error_document = match &website_config.error_document {
+ Some(ed) => ed.trim_start_matches('/').to_owned(),
+ None => return Err(error),
+ };
+
+ // We want to return the error document instead:
+ // craft a synthetic HTTP request whose path is the error document
+ let req2 = Request::builder()
+ .uri(format!("http://{}/{}", host, &error_document))
+ .body(Body::empty())
+ .unwrap();
+
+ match handle_get(self.garage.clone(), &req2, bucket_id, &error_document, None).await
+ {
+ Ok(mut error_doc) => {
+ // The error won't be logged back in handle_request,
+ // so log it here
+ info!(
+ "{} {} {} {}",
+ req.method(),
+ req.uri(),
+ error.http_status_code(),
+ error
+ );
+
+ *error_doc.status_mut() = error.http_status_code();
+ error.add_headers(error_doc.headers_mut());
+
+ // Preserve error message in a special header
+ for error_line in error.to_string().split('\n') {
+ if let Ok(v) = HeaderValue::from_bytes(error_line.as_bytes()) {
+ error_doc.headers_mut().append("X-Garage-Error", v);
+ }
+ }
-async fn handle_request(
- garage: Arc<Garage>,
- metrics: Arc<WebMetrics>,
- req: Request<Body>,
- addr: SocketAddr,
-) -> Result<Response<Body>, Infallible> {
- info!("{} {} {}", addr, req.method(), req.uri());
-
- // Lots of instrumentation
- let tracer = opentelemetry::global::tracer("garage");
- let span = tracer
- .span_builder(format!("Web {} request", req.method()))
- .with_trace_id(gen_trace_id())
- .with_attributes(vec![
- KeyValue::new("method", format!("{}", req.method())),
- KeyValue::new("uri", req.uri().to_string()),
- ])
- .start(&tracer);
-
- let metrics_tags = &[KeyValue::new("method", req.method().to_string())];
-
- // The actual handler
- let res = serve_file(garage, &req)
- .with_context(Context::current_with_span(span))
- .record_duration(&metrics.request_duration, &metrics_tags[..])
- .await;
-
- // More instrumentation
- metrics.request_counter.add(1, &metrics_tags[..]);
-
- // Returning the result
- match res {
- Ok(res) => {
- debug!("{} {} {}", req.method(), res.status(), req.uri());
- Ok(res)
- }
- Err(error) => {
- info!(
- "{} {} {} {}",
- req.method(),
- error.http_status_code(),
- req.uri(),
- error
- );
- metrics.error_counter.add(
- 1,
- &[
- metrics_tags[0].clone(),
- KeyValue::new("status_code", error.http_status_code().to_string()),
- ],
- );
- Ok(error_to_res(error))
+ Ok(error_doc)
+ }
+ Err(error_doc_error) => {
+ warn!(
+ "Couldn't get error document {} for bucket {:?}: {}",
+ error_document, bucket_id, error_doc_error
+ );
+ Err(error)
+ }
+ }
+ }
+ Ok(mut resp) => {
+ // Maybe add CORS headers
+ if let Some(rule) = find_matching_cors_rule(&bucket, req)? {
+ add_cors_headers(&mut resp, rule)
+ .ok_or_internal_error("Invalid bucket CORS configuration")?;
+ }
+ Ok(resp)
+ }
}
}
}
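
Note on the bucket resolution in serve_file: the Host header is reduced to a bare hostname, which is matched against the configured root domain; if the host is not a subdomain of the root domain, the whole hostname is tried as a bucket alias instead. Expected behavior, sketched with hypothetical values:

    // Assuming root_domain = ".web.garage.tld" (hypothetical):
    //
    //   Host: bucket.web.garage.tld -> host_to_bucket yields "bucket"
    //   Host: www.example.com       -> host_to_bucket yields nothing, so the
    //                                  full host "www.example.com" is used
    //                                  as the bucket alias (unwrap_or(&host))
    //
    // In both cases the alias must resolve in bucket_alias_table and the
    // bucket must have a website configuration, otherwise Error::NotFound.
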
@@ -158,129 +296,6 @@ fn error_to_res(e: Error) -> Response<Body> {
http_error
}
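
Note on the error-document path in the new serve_file: when a GET fails with a 4xx error and the bucket has an error_document configured, the response body comes from the error document while the status code and a diagnostic header come from the original error. Sketch of the resulting response, with hypothetical values:

    // status:  404 Not Found                 (from error.http_status_code())
    // header:  X-Garage-Error: <error text>  (one header value per error line)
    // body:    the bucket's error_document object, served as-is
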
-async fn serve_file(garage: Arc<Garage>, req: &Request<Body>) -> Result<Response<Body>, Error> {
- // Get http authority string (eg. [::1]:3902 or garage.tld:80)
- let authority = req
- .headers()
- .get(HOST)
- .ok_or_bad_request("HOST header required")?
- .to_str()?;
-
- // Get bucket
- let host = authority_to_host(authority)?;
- let root = &garage.config.s3_web.root_domain;
-
- let bucket_name = host_to_bucket(&host, root).unwrap_or(&host);
- let bucket_id = garage
- .bucket_alias_table
- .get(&EmptyKey, &bucket_name.to_string())
- .await?
- .and_then(|x| x.state.take())
- .ok_or(Error::NotFound)?;
-
- // Check bucket isn't deleted and has website access enabled
- let bucket = garage
- .bucket_table
- .get(&EmptyKey, &bucket_id)
- .await?
- .ok_or(Error::NotFound)?;
-
- let website_config = bucket
- .params()
- .ok_or(Error::NotFound)?
- .website_config
- .get()
- .as_ref()
- .ok_or(Error::NotFound)?;
-
- // Get path
- let path = req.uri().path().to_string();
- let index = &website_config.index_document;
- let key = path_to_key(&path, index)?;
-
- debug!(
- "Selected bucket: \"{}\" {:?}, selected key: \"{}\"",
- bucket_name, bucket_id, key
- );
-
- let ret_doc = match *req.method() {
- Method::OPTIONS => handle_options_for_bucket(req, &bucket),
- Method::HEAD => handle_head(garage.clone(), req, bucket_id, &key, None).await,
- Method::GET => handle_get(garage.clone(), req, bucket_id, &key, None).await,
- _ => Err(ApiError::BadRequest("HTTP method not supported".into())),
- }
- .map_err(Error::from);
-
- match ret_doc {
- Err(error) => {
- // For a HEAD or OPTIONS method, and for non-4xx errors,
- // we don't return the error document as content,
- // we return above and just return the error message
- // by relying on err_to_res that is called when we return an Err.
- if *req.method() == Method::HEAD
- || *req.method() == Method::OPTIONS
- || !error.http_status_code().is_client_error()
- {
- return Err(error);
- }
-
- // If no error document is set: just return the error directly
- let error_document = match &website_config.error_document {
- Some(ed) => ed.trim_start_matches('/').to_owned(),
- None => return Err(error),
- };
-
- // We want to return the error document
- // Create a fake HTTP request with path = the error document
- let req2 = Request::builder()
- .uri(format!("http://{}/{}", host, &error_document))
- .body(Body::empty())
- .unwrap();
-
- match handle_get(garage, &req2, bucket_id, &error_document, None).await {
- Ok(mut error_doc) => {
- // The error won't be logged back in handle_request,
- // so log it here
- info!(
- "{} {} {} {}",
- req.method(),
- req.uri(),
- error.http_status_code(),
- error
- );
-
- *error_doc.status_mut() = error.http_status_code();
- error.add_headers(error_doc.headers_mut());
-
- // Preserve error message in a special header
- for error_line in error.to_string().split('\n') {
- if let Ok(v) = HeaderValue::from_bytes(error_line.as_bytes()) {
- error_doc.headers_mut().append("X-Garage-Error", v);
- }
- }
-
- Ok(error_doc)
- }
- Err(error_doc_error) => {
- warn!(
- "Couldn't get error document {} for bucket {:?}: {}",
- error_document, bucket_id, error_doc_error
- );
- Err(error)
- }
- }
- }
- Ok(mut resp) => {
- // Maybe add CORS headers
- if let Some(rule) = find_matching_cors_rule(&bucket, req)? {
- add_cors_headers(&mut resp, rule)
- .ok_or_internal_error("Invalid bucket CORS configuration")?;
- }
- Ok(resp)
- }
- }
-}
-
/// Path to key
///
/// Convert the provided path to the internal key
@@ -290,9 +305,7 @@ fn path_to_key<'a>(path: &'a str, index: &str) -> Result<Cow<'a, str>, Error> {
let path_utf8 = percent_encoding::percent_decode_str(path).decode_utf8()?;
if !path_utf8.starts_with('/') {
- return Err(Error::BadRequest(
- "Path must start with a / (slash)".to_string(),
- ));
+ return Err(Error::BadRequest("Path must start with a / (slash)".into()));
}
match path_utf8.chars().last() {
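
A worked illustration of the path_to_key contract (inferred from its doc comment, the leading-slash check above, and the usual S3 website-hosting convention; a sketch, not the exact implementation):

    // With index = "index.html" (hypothetical values):
    //
    //   path_to_key("/", "index.html")            => Ok("index.html")
    //   path_to_key("/foo/bar.png", "index.html") => Ok("foo/bar.png")
    //   path_to_key("/foo/", "index.html")        => Ok("foo/index.html")
    //   path_to_key("foo", "index.html")          => Err(BadRequest) // no leading slash
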