Merge remote-tracking branch 'origin/main' into optimal-layout

author: Mendes <mendes.oulamara@pm.me> 2022-10-04 18:14:49 +0200
committer: Mendes <mendes.oulamara@pm.me> 2022-10-04 18:14:49 +0200
commit: 829f815a897b04986559910bbcbf53625adcdf20 (patch)
tree: 6db3c27cff2aded754a641d1f2b05c83be701267 /src
parent: 99f96b9564c9c841dc6c56f1255a6e70ff884d46 (diff)
parent: a096ced35562bd0a8877a1ee2f755be1edafe343 (diff)
download: garage-829f815a897b04986559910bbcbf53625adcdf20.tar.gz
garage-829f815a897b04986559910bbcbf53625adcdf20.zip
161 files changed, 16081 insertions, 4068 deletions
diff --git a/src/admin/Cargo.toml b/src/admin/Cargo.toml
deleted file mode 100644
index 2db4bb08..00000000
--- a/src/admin/Cargo.toml
+++ /dev/null
@@ -1,29 +0,0 @@
-[package]
-name = "garage_admin"
-version = "0.7.0"
-authors = ["Maximilien Richer <code@mricher.fr>"]
-edition = "2018"
-license = "AGPL-3.0"
-description = "Administration and metrics REST HTTP server for Garage"
-repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
-
-[lib]
-path = "lib.rs"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
-[dependencies]
-garage_util = { version = "0.7.0", path = "../util" }
-
-hex = "0.4"
-
-futures = "0.3"
-futures-util = "0.3"
-http = "0.2"
-hyper = "0.14"
-tracing = "0.1.30"
-
-opentelemetry = { version = "0.17", features = [ "rt-tokio" ] }
-opentelemetry-prometheus = "0.10"
-opentelemetry-otlp = "0.10"
-prometheus = "0.13"
diff --git a/src/admin/lib.rs b/src/admin/lib.rs
deleted file mode 100644
index b5b0775b..00000000
--- a/src/admin/lib.rs
+++ /dev/null
@@ -1,6 +0,0 @@
-//! Crate for handling the admin and metric HTTP APIs
-#[macro_use]
-extern crate tracing;
-
-pub mod metrics;
-pub mod tracing_setup;
diff --git a/src/admin/metrics.rs b/src/admin/metrics.rs
deleted file mode 100644
index 7edc36c6..00000000
--- a/src/admin/metrics.rs
+++ /dev/null
@@ -1,146 +0,0 @@
-use std::convert::Infallible;
-use std::net::SocketAddr;
-use std::sync::Arc;
-use std::time::SystemTime;
-
-use futures::future::*;
-use hyper::{
-	header::CONTENT_TYPE,
-	service::{make_service_fn, service_fn},
-	Body, Method, Request, Response, Server,
-};
-
-use opentelemetry::{
-	global,
-	metrics::{BoundCounter, BoundValueRecorder},
-	trace::{FutureExt, TraceContextExt, Tracer},
-	Context,
-};
-use opentelemetry_prometheus::PrometheusExporter;
-
-use prometheus::{Encoder, TextEncoder};
-
-use garage_util::error::Error as GarageError;
-use garage_util::metrics::*;
-
-// serve_req on metric endpoint
-async fn serve_req(
-	req: Request<Body>,
-	admin_server: Arc<AdminServer>,
-) -> Result<Response<Body>, hyper::Error> {
-	debug!("Receiving request at path {}", req.uri());
-	let request_start = SystemTime::now();
-
-	admin_server.metrics.http_counter.add(1);
-
-	let response = match (req.method(), req.uri().path()) {
-		(&Method::GET, "/metrics") => {
-			let mut buffer = vec![];
-			let encoder = TextEncoder::new();
-
-			let tracer = opentelemetry::global::tracer("garage");
-			let metric_families = tracer.in_span("admin/gather_metrics", |_| {
-				admin_server.exporter.registry().gather()
-			});
-
-			encoder.encode(&metric_families, &mut buffer).unwrap();
-			admin_server
-				.metrics
-				.http_body_gauge
-				.record(buffer.len() as u64);
-
-			Response::builder()
-				.status(200)
-				.header(CONTENT_TYPE, encoder.format_type())
-				.body(Body::from(buffer))
-				.unwrap()
-		}
-		_ => Response::builder()
-			.status(404)
-			.body(Body::from("Not implemented"))
-			.unwrap(),
-	};
-
-	admin_server
-		.metrics
-		.http_req_histogram
-		.record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()));
-	Ok(response)
-}
-
-// AdminServer hold the admin server internal admin_server and the metric exporter
-pub struct AdminServer {
-	exporter: PrometheusExporter,
-	metrics: AdminServerMetrics,
-}
-
-// GarageMetricadmin_server holds the metrics counter definition for Garage
-// FIXME: we would rather have that split up among the different libraries?
-struct AdminServerMetrics {
-	http_counter: BoundCounter<u64>,
-	http_body_gauge: BoundValueRecorder<u64>,
-	http_req_histogram: BoundValueRecorder<f64>,
-}
-
-impl AdminServer {
-	/// init initilialize the AdminServer and background metric server
-	pub fn init() -> AdminServer {
-		let exporter = opentelemetry_prometheus::exporter().init();
-		let meter = global::meter("garage/admin_server");
-		AdminServer {
-			exporter,
-			metrics: AdminServerMetrics {
-				http_counter: meter
-					.u64_counter("admin.http_requests_total")
-					.with_description("Total number of HTTP requests made.")
-					.init()
-					.bind(&[]),
-				http_body_gauge: meter
-					.u64_value_recorder("admin.http_response_size_bytes")
-					.with_description("The metrics HTTP response sizes in bytes.")
-					.init()
-					.bind(&[]),
-				http_req_histogram: meter
-					.f64_value_recorder("admin.http_request_duration_seconds")
-					.with_description("The HTTP request latencies in seconds.")
-					.init()
-					.bind(&[]),
-			},
-		}
-	}
-	/// run execute the admin server on the designated HTTP port and listen for requests
-	pub async fn run(
-		self,
-		bind_addr: SocketAddr,
-		shutdown_signal: impl Future<Output = ()>,
-	) -> Result<(), GarageError> {
-		let admin_server = Arc::new(self);
-		// For every connection, we must make a `Service` to handle all
-		// incoming HTTP requests on said connection.
-		let make_svc = make_service_fn(move |_conn| {
-			let admin_server = admin_server.clone();
-			// This is the `Service` that will handle the connection.
-			// `service_fn` is a helper to convert a function that
-			// returns a Response into a `Service`.
-			async move {
-				Ok::<_, Infallible>(service_fn(move |req| {
-					let tracer = opentelemetry::global::tracer("garage");
-					let span = tracer
-						.span_builder("admin/request")
-						.with_trace_id(gen_trace_id())
-						.start(&tracer);
-
-					serve_req(req, admin_server.clone())
-						.with_context(Context::current_with_span(span))
-				}))
-			}
-		});
-
-		let server = Server::bind(&bind_addr).serve(make_svc);
-		let graceful = server.with_graceful_shutdown(shutdown_signal);
-		info!("Admin server listening on http://{}", bind_addr);
-
-		graceful.await?;
-		Ok(())
-	}
-}
diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml
index 5e96b081..7c3ed43b 100644
--- a/src/api/Cargo.toml
+++ b/src/api/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "garage_api"
-version = "0.7.0"
+version = "0.8.0"
 authors = ["Alex Auvolat <alex@adnab.me>"]
 edition = "2018"
 license = "AGPL-3.0"
@@ -14,28 +14,31 @@ path = "lib.rs"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-garage_model = { version = "0.7.0", path = "../model" }
-garage_table = { version = "0.7.0", path = "../table" }
-garage_block = { version = "0.7.0", path = "../block" }
-garage_util = { version = "0.7.0", path = "../util" }
+garage_model = { version = "0.8.0", path = "../model" }
+garage_table = { version = "0.8.0", path = "../table" }
+garage_block = { version = "0.8.0", path = "../block" }
+garage_util = { version = "0.8.0", path = "../util" }
+garage_rpc = { version = "0.8.0", path = "../rpc" }
 
+async-trait = "0.1.7"
 base64 = "0.13"
 bytes = "1.0"
 chrono = "0.4"
-crypto-mac = "0.10"
+crypto-common = "0.1"
 err-derive = "0.3"
 hex = "0.4"
-hmac = "0.10"
+hmac = "0.12"
 idna = "0.2"
 tracing = "0.1.30"
-md-5 = "0.9"
+md-5 = "0.10"
 nom = "7.1"
-sha2 = "0.9"
+sha2 = "0.10"
 
 futures = "0.3"
 futures-util = "0.3"
 pin-project = "1.0"
 tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
+tokio-stream = "0.1"
 
 form_urlencoded = "1.0.0"
 http = "0.2"
@@ -52,3 +55,9 @@ quick-xml = { version = "0.21", features = [ "serialize" ] }
 url = "2.1"
 
 opentelemetry = "0.17"
+opentelemetry-prometheus = { version = "0.10", optional = true }
+prometheus = { version = "0.13", optional = true }
+
+[features]
+k2v = [ "garage_util/k2v", "garage_model/k2v" ]
+metrics = [ "opentelemetry-prometheus", "prometheus" ]
diff --git a/src/api/admin/api_server.rs b/src/api/admin/api_server.rs
new file mode 100644
index 00000000..0816bda1
--- /dev/null
+++ b/src/api/admin/api_server.rs
@@ -0,0 +1,209 @@
+use std::net::SocketAddr;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+
+use futures::future::Future;
+use http::header::{ACCESS_CONTROL_ALLOW_METHODS, ACCESS_CONTROL_ALLOW_ORIGIN, ALLOW};
+use hyper::{Body, Request, Response};
+
+use opentelemetry::trace::SpanRef;
+
+#[cfg(feature = "metrics")]
+use opentelemetry_prometheus::PrometheusExporter;
+#[cfg(feature = "metrics")]
+use prometheus::{Encoder, TextEncoder};
+
+use garage_model::garage::Garage;
+use garage_util::error::Error as GarageError;
+
+use crate::generic_server::*;
+
+use crate::admin::bucket::*;
+use crate::admin::cluster::*;
+use crate::admin::error::*;
+use crate::admin::key::*;
+use crate::admin::router::{Authorization, Endpoint};
+
+pub struct AdminApiServer { // State of the admin HTTP API server
+	garage: Arc<Garage>, // shared Garage handle passed to every endpoint handler
+	#[cfg(feature = "metrics")]
+	exporter: PrometheusExporter, // only compiled in with the `metrics` feature
+	metrics_token: Option<String>, // expected `Authorization` value ("Bearer <token>") for metrics, if configured
+	admin_token: Option<String>, // expected `Authorization` value ("Bearer <token>") for admin endpoints, if configured
+}
+
+impl AdminApiServer {
+	pub fn new( // Builds the server from the `admin` section of the Garage config
+		garage: Arc<Garage>,
+		#[cfg(feature = "metrics")] exporter: PrometheusExporter,
+	) -> Self {
+		let cfg = &garage.config.admin;
+		let metrics_token = cfg // pre-format the full header value once so requests only need a string compare
+			.metrics_token
+			.as_ref()
+			.map(|tok| format!("Bearer {}", tok));
+		let admin_token = cfg // same pre-formatting for the admin token
+			.admin_token
+			.as_ref()
+			.map(|tok| format!("Bearer {}", tok));
+		Self {
+			garage,
+			#[cfg(feature = "metrics")]
+			exporter,
+			metrics_token,
+			admin_token,
+		}
+	}
+
+	pub async fn run( // Consumes the server and serves it on `bind_addr` through the generic `ApiServer`
+		self,
+		bind_addr: SocketAddr,
+		shutdown_signal: impl Future<Output = ()>, // forwarded unchanged to `run_server`
+	) -> Result<(), GarageError> {
+		let region = self.garage.config.s3_api.s3_region.clone(); // the S3 region from config is reused for the admin API server
+		ApiServer::new(region, self)
+			.run_server(bind_addr, shutdown_signal)
+			.await
+	}
+
+	fn handle_options(&self, _req: &Request<Body>) -> Result<Response<Body>, Error> { // Answers OPTIONS with an empty 204 and fixed Allow/CORS headers; the request is not inspected
+		Ok(Response::builder()
+			.status(204)
+			.header(ALLOW, "OPTIONS, GET, POST")
+			.header(ACCESS_CONTROL_ALLOW_METHODS, "OPTIONS, GET, POST")
+			.header(ACCESS_CONTROL_ALLOW_ORIGIN, "*")
+			.body(Body::empty())?)
+	}
+
+	fn handle_metrics(&self) -> Result<Response<Body>, Error> { // Serves the text-encoded metrics, or an error when built without the `metrics` feature
+		#[cfg(feature = "metrics")]
+		{
+			use opentelemetry::trace::Tracer;
+
+			let mut buffer = vec![];
+			let encoder = TextEncoder::new();
+
+			let tracer = opentelemetry::global::tracer("garage");
+			let metric_families = tracer.in_span("admin/gather_metrics", |_| { // gathering is wrapped in its own tracing span
+				self.exporter.registry().gather()
+			});
+
+			encoder
+				.encode(&metric_families, &mut buffer)
+				.ok_or_internal_error("Could not serialize metrics")?; // an encoding failure becomes an internal error instead of a panic
+
+			Ok(Response::builder()
+				.status(200)
+				.header(http::header::CONTENT_TYPE, encoder.format_type())
+				.body(Body::from(buffer))?)
+		}
+		#[cfg(not(feature = "metrics"))]
+		Err(Error::bad_request( // without the feature the endpoint still exists but always fails with a bad-request error
+			"Garage was built without the metrics feature".to_string(),
+		))
+	}
+}
+
+#[async_trait]
+impl ApiHandler for AdminApiServer { // Plugs the admin API into the generic API server: endpoint parsing, auth check, dispatch
+	const API_NAME: &'static str = "admin";
+	const API_NAME_DISPLAY: &'static str = "Admin";
+
+	type Endpoint = Endpoint;
+	type Error = Error;
+
+	fn parse_endpoint(&self, req: &Request<Body>) -> Result<Endpoint, Error> { // delegates to the admin router
+		Endpoint::from_request(req)
+	}
+
+	async fn handle( // Checks the bearer token required by the endpoint, then dispatches to its handler
+		&self,
+		req: Request<Body>,
+		endpoint: Endpoint,
+	) -> Result<Response<Body>, Error> {
+		let expected_auth_header = // `None` here means "no authentication required for this request"
+			match endpoint.authorization_type() {
+				Authorization::MetricsToken => self.metrics_token.as_ref(), // metrics are open when no metrics token is configured
+				Authorization::AdminToken => match &self.admin_token {
+					None => return Err(Error::forbidden( // admin endpoints are refused outright when no admin token is configured
+						"Admin token isn't configured, admin API access is disabled for security.",
+					)),
+					Some(t) => Some(t),
+				},
+			};
+
+		if let Some(h) = expected_auth_header {
+			match req.headers().get("Authorization") {
+				None => return Err(Error::forbidden("Authorization token must be provided")),
+				Some(v) => {
+					let authorized = v.to_str().map(|hv| hv.trim() == h).unwrap_or(false); // header that is not valid text counts as unauthorized. NOTE(review): `==` is not a constant-time comparison; consider one for secret tokens
+					if !authorized {
+						return Err(Error::forbidden("Invalid authorization token provided"));
+					}
+				}
+			}
+		}
+
+		match endpoint { // one arm per router endpoint; `req` is passed only to handlers that take it
+			Endpoint::Options => self.handle_options(&req),
+			Endpoint::Metrics => self.handle_metrics(),
+			Endpoint::GetClusterStatus => handle_get_cluster_status(&self.garage).await,
+			Endpoint::ConnectClusterNodes => handle_connect_cluster_nodes(&self.garage, req).await,
+			// Layout
+			Endpoint::GetClusterLayout => handle_get_cluster_layout(&self.garage).await,
+			Endpoint::UpdateClusterLayout => handle_update_cluster_layout(&self.garage, req).await,
+			Endpoint::ApplyClusterLayout => handle_apply_cluster_layout(&self.garage, req).await,
+			Endpoint::RevertClusterLayout => handle_revert_cluster_layout(&self.garage, req).await,
+			// Keys
+			Endpoint::ListKeys => handle_list_keys(&self.garage).await,
+			Endpoint::GetKeyInfo { id, search } => {
+				handle_get_key_info(&self.garage, id, search).await
+			}
+			Endpoint::CreateKey => handle_create_key(&self.garage, req).await,
+			Endpoint::ImportKey => handle_import_key(&self.garage, req).await,
+			Endpoint::UpdateKey { id } => handle_update_key(&self.garage, id, req).await,
+			Endpoint::DeleteKey { id } => handle_delete_key(&self.garage, id).await,
+			// Buckets
+			Endpoint::ListBuckets => handle_list_buckets(&self.garage).await,
+			Endpoint::GetBucketInfo { id, global_alias } => {
+				handle_get_bucket_info(&self.garage, id, global_alias).await
+			}
+			Endpoint::CreateBucket => handle_create_bucket(&self.garage, req).await,
+			Endpoint::DeleteBucket { id } => handle_delete_bucket(&self.garage, id).await,
+			Endpoint::UpdateBucket { id } => handle_update_bucket(&self.garage, id, req).await,
+			// Bucket-key permissions
+			Endpoint::BucketAllowKey => {
+				handle_bucket_change_key_perm(&self.garage, req, true).await // allow and deny share one handler, differing only in the flag
+			}
+			Endpoint::BucketDenyKey => {
+				handle_bucket_change_key_perm(&self.garage, req, false).await
+			}
+			// Bucket aliasing
+			Endpoint::GlobalAliasBucket { id, alias } => {
+				handle_global_alias_bucket(&self.garage, id, alias).await
+			}
+			Endpoint::GlobalUnaliasBucket { id, alias } => {
+				handle_global_unalias_bucket(&self.garage, id, alias).await
+			}
+			Endpoint::LocalAliasBucket {
+				id,
+				access_key_id,
+				alias,
+			} => handle_local_alias_bucket(&self.garage, id, access_key_id, alias).await,
+			Endpoint::LocalUnaliasBucket {
+				id,
+				access_key_id,
+				alias,
+			} => handle_local_unalias_bucket(&self.garage, id, access_key_id, alias).await,
+		}
+	}
+}
+
+impl ApiEndpoint for Endpoint { // Adapter exposing the admin router's `Endpoint` to the generic server
+	fn name(&self) -> &'static str {
+		Endpoint::name(self) // forwards to the inherent `Endpoint::name`
+	}
+
+	fn add_span_attributes(&self, _span: SpanRef<'_>) {} // intentionally empty: no attributes are added to the span
+}
diff --git a/src/api/admin/bucket.rs b/src/api/admin/bucket.rs
new file mode 100644
index 00000000..ac8a8a40
--- /dev/null
+++ b/src/api/admin/bucket.rs
@@ -0,0 +1,580 @@
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use hyper::{Body, Request, Response, StatusCode};
+use serde::{Deserialize, Serialize};
+
+use garage_util::crdt::*;
+use garage_util::data::*;
+use garage_util::time::*;
+
+use garage_table::*;
+
+use garage_model::bucket_alias_table::*;
+use garage_model::bucket_table::*;
+use garage_model::garage::Garage;
+use garage_model::permission::*;
+use garage_model::s3::object_table::*;
+
+use crate::admin::error::*;
+use crate::admin::key::ApiBucketKeyPerm;
+use crate::common_error::CommonError;
+use crate::helpers::{json_ok_response, parse_json_body};
+
+pub async fn handle_list_buckets(garage: &Arc<Garage>) -> Result<Response<Body>, Error> { // Returns a JSON array with the id and active aliases of each bucket
+	let buckets = garage
+		.bucket_table
+		.get_range(
+			&EmptyKey,
+			None,
+			Some(DeletedFilter::NotDeleted), // only non-deleted buckets are requested
+			10000, // NOTE(review): hard limit; presumably buckets beyond it are silently omitted — confirm
+			EnumerationOrder::Forward,
+		)
+		.await?;
+
+	let res = buckets
+		.into_iter()
+		.map(|b| {
+			let state = b.state.as_option().unwrap(); // assumes the NotDeleted filter guarantees a present state — TODO confirm
+			ListBucketResultItem {
+				id: hex::encode(b.id),
+				global_aliases: state
+					.aliases
+					.items()
+					.iter()
+					.filter(|(_, _, a)| *a) // keep entries whose flag is true (the flag is called `active` in handle_delete_bucket)
+					.map(|(n, _, _)| n.to_string())
+					.collect::<Vec<_>>(),
+				local_aliases: state
+					.local_aliases
+					.items()
+					.iter()
+					.filter(|(_, _, a)| *a) // same flag filter for per-key aliases
+					.map(|((k, n), _, _)| BucketLocalAlias {
+						access_key_id: k.to_string(),
+						alias: n.to_string(),
+					})
+					.collect::<Vec<_>>(),
+			}
+		})
+		.collect::<Vec<_>>();
+
+	Ok(json_ok_response(&res)?)
+}
+
+#[derive(Serialize)] // One element of the list-buckets JSON response
+#[serde(rename_all = "camelCase")]
+struct ListBucketResultItem {
+	id: String, // hex-encoded bucket id
+	global_aliases: Vec<String>,
+	local_aliases: Vec<BucketLocalAlias>,
+}
+
+#[derive(Serialize)] // A bucket alias attached to one access key, as emitted in list-buckets
+#[serde(rename_all = "camelCase")]
+struct BucketLocalAlias {
+	access_key_id: String,
+	alias: String,
+}
+
+#[derive(Serialize, Deserialize)] // Quotas as exchanged over the API: used in bucket info output and update-bucket input
+#[serde(rename_all = "camelCase")]
+struct ApiBucketQuotas {
+	max_size: Option<u64>, // presumably bytes, with None meaning no limit — TODO confirm against BucketQuotas
+	max_objects: Option<u64>,
+}
+
+pub async fn handle_get_bucket_info( // Returns bucket info for a bucket given by exactly one of `id` or `global_alias`
+	garage: &Arc<Garage>,
+	id: Option<String>,
+	global_alias: Option<String>,
+) -> Result<Response<Body>, Error> {
+	let bucket_id = match (id, global_alias) {
+		(Some(id), None) => parse_bucket_id(&id)?, // hex id supplied directly
+		(None, Some(ga)) => garage
+			.bucket_helper()
+			.resolve_global_bucket_name(&ga)
+			.await?
+			.ok_or_else(|| HelperError::NoSuchBucket(ga.to_string()))?, // an alias that does not resolve is reported as NoSuchBucket
+		_ => { // both or neither supplied
+			return Err(Error::bad_request(
+				"Either id or globalAlias must be provided (but not both)",
+			));
+		}
+	};
+
+	bucket_info_results(garage, bucket_id).await
+}
+
+async fn bucket_info_results( // Builds the detailed JSON description of one bucket; also the response body of most mutating handlers
+	garage: &Arc<Garage>,
+	bucket_id: Uuid,
+) -> Result<Response<Body>, Error> {
+	let bucket = garage
+		.bucket_helper()
+		.get_existing_bucket(bucket_id)
+		.await?;
+
+	let counters = garage
+		.object_counter_table
+		.table
+		.get(&bucket_id, &EmptyKey)
+		.await?
+		.map(|x| x.filtered_values(&garage.system.ring.borrow()))
+		.unwrap_or_default(); // no counter entry yields empty counters, so the totals below default to 0
+
+	let mut relevant_keys = HashMap::new(); // keys to report: those listed in authorized_keys plus those owning a local alias
+	for (k, _) in bucket
+		.state
+		.as_option()
+		.unwrap() // assumes get_existing_bucket only returns buckets with a present state — TODO confirm
+		.authorized_keys
+		.items()
+		.iter()
+	{
+		if let Some(key) = garage
+			.key_table
+			.get(&EmptyKey, k)
+			.await?
+			.filter(|k| !k.is_deleted()) // NOTE(review): looks redundant with the `key.state.is_deleted()` check just below; the second loop only uses the latter
+		{
+			if !key.state.is_deleted() {
+				relevant_keys.insert(k.clone(), key);
+			}
+		}
+	}
+	for ((k, _), _, _) in bucket
+		.state
+		.as_option()
+		.unwrap()
+		.local_aliases
+		.items()
+		.iter()
+	{
+		if relevant_keys.contains_key(k) { // already fetched in the first loop, skip the table lookup
+			continue;
+		}
+		if let Some(key) = garage.key_table.get(&EmptyKey, k).await? {
+			if !key.state.is_deleted() {
+				relevant_keys.insert(k.clone(), key);
+			}
+		}
+	}
+
+	let state = bucket.state.as_option().unwrap();
+
+	let quotas = state.quotas.get();
+	let res =
+		GetBucketInfoResult {
+			id: hex::encode(&bucket.id),
+			global_aliases: state
+				.aliases
+				.items()
+				.iter()
+				.filter(|(_, _, a)| *a) // keep only aliases whose flag is true
+				.map(|(n, _, _)| n.to_string())
+				.collect::<Vec<_>>(),
+			website_access: state.website_config.get().is_some(), // website access is reported as enabled iff a website config is set
+			website_config: state.website_config.get().clone().map(|wsc| {
+				GetBucketInfoWebsiteResult {
+					index_document: wsc.index_document,
+					error_document: wsc.error_document,
+				}
+			}),
+			keys: relevant_keys
+				.into_iter()
+				.map(|(_, key)| {
+					let p = key.state.as_option().unwrap(); // deleted keys were excluded when relevant_keys was filled
+					GetBucketInfoKey {
+						access_key_id: key.key_id,
+						name: p.name.get().to_string(),
+						permissions: p
+							.authorized_buckets
+							.get(&bucket.id)
+							.map(|p| ApiBucketKeyPerm {
+								read: p.allow_read,
+								write: p.allow_write,
+								owner: p.allow_owner,
+							})
+							.unwrap_or_default(), // a key with no permission entry for this bucket gets the default ApiBucketKeyPerm
+						bucket_local_aliases: p
+							.local_aliases
+							.items()
+							.iter()
+							.filter(|(_, _, b)| *b == Some(bucket.id)) // only this key's aliases that point at this bucket
+							.map(|(n, _, _)| n.to_string())
+							.collect::<Vec<_>>(),
+					}
+				})
+				.collect::<Vec<_>>(),
+			objects: counters.get(OBJECTS).cloned().unwrap_or_default(),
+			bytes: counters.get(BYTES).cloned().unwrap_or_default(),
+			unfinshed_uploads: counters // NOTE(review): field name is misspelt ("unfinshed"); it is part of the JSON output, so renaming changes the API
+				.get(UNFINISHED_UPLOADS)
+				.cloned()
+				.unwrap_or_default(),
+			quotas: ApiBucketQuotas {
+				max_size: quotas.max_size,
+				max_objects: quotas.max_objects,
+			},
+		};
+
+	Ok(json_ok_response(&res)?)
+}
+
+#[derive(Serialize)] // JSON body returned by bucket_info_results
+#[serde(rename_all = "camelCase")]
+struct GetBucketInfoResult {
+	id: String, // hex-encoded bucket id
+	global_aliases: Vec<String>,
+	website_access: bool, // true iff website_config is Some
+	#[serde(default)] // NOTE(review): `default` only affects deserialization, and this struct derives only Serialize — likely a no-op
+	website_config: Option<GetBucketInfoWebsiteResult>,
+	keys: Vec<GetBucketInfoKey>,
+	objects: i64,
+	bytes: i64,
+	unfinshed_uploads: i64, // NOTE(review): misspelt; serialized as `unfinshedUploads`, so a rename is an API change
+	quotas: ApiBucketQuotas,
+}
+
+#[derive(Serialize)] // Website configuration as reported in bucket info
+#[serde(rename_all = "camelCase")]
+struct GetBucketInfoWebsiteResult {
+	index_document: String,
+	error_document: Option<String>,
+}
+
+#[derive(Serialize)] // One access key related to a bucket, as reported in bucket info
+#[serde(rename_all = "camelCase")]
+struct GetBucketInfoKey {
+	access_key_id: String,
+	name: String,
+	permissions: ApiBucketKeyPerm, // this key's permissions on the bucket
+	bucket_local_aliases: Vec<String>, // this key's local aliases that point at the bucket
+}
+
+pub async fn handle_create_bucket( // Creates a bucket, optionally with a global alias and/or a local alias plus permissions; returns bucket info
+	garage: &Arc<Garage>,
+	req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+	let req = parse_json_body::<CreateBucketRequest>(req).await?;
+
+	if let Some(ga) = &req.global_alias { // validation pass: nothing is written until both aliases have been checked
+		if !is_valid_bucket_name(ga) {
+			return Err(Error::bad_request(format!(
+				"{}: {}",
+				ga, INVALID_BUCKET_NAME_MESSAGE
+			)));
+		}
+
+		if let Some(alias) = garage.bucket_alias_table.get(&EmptyKey, ga).await? {
+			if alias.state.get().is_some() { // an alias entry with a set state means the name is already taken
+				return Err(CommonError::BucketAlreadyExists.into());
+			}
+		}
+	}
+
+	if let Some(la) = &req.local_alias {
+		if !is_valid_bucket_name(&la.alias) {
+			return Err(Error::bad_request(format!(
+				"{}: {}",
+				la.alias, INVALID_BUCKET_NAME_MESSAGE
+			)));
+		}
+
+		let key = garage
+			.key_helper()
+			.get_existing_key(&la.access_key_id)
+			.await?;
+		let state = key.state.as_option().unwrap(); // assumes get_existing_key never returns a deleted key — TODO confirm
+		if matches!(state.local_aliases.get(&la.alias), Some(_)) { // any existing entry under that name blocks creation
+			return Err(Error::bad_request("Local alias already exists"));
+		}
+	}
+
+	let bucket = Bucket::new(); // NOTE(review): checks above and writes below are separate steps; an error below leaves the bucket row created
+	garage.bucket_table.insert(&bucket).await?;
+
+	if let Some(ga) = &req.global_alias {
+		garage
+			.bucket_helper()
+			.set_global_bucket_alias(bucket.id, ga)
+			.await?;
+	}
+
+	if let Some(la) = &req.local_alias {
+		garage
+			.bucket_helper()
+			.set_local_bucket_alias(bucket.id, &la.access_key_id, &la.alias)
+			.await?;
+
+		if la.allow.read || la.allow.write || la.allow.owner { // permissions are written only if at least one flag was requested
+			garage
+				.bucket_helper()
+				.set_bucket_key_permissions(
+					bucket.id,
+					&la.access_key_id,
+					BucketKeyPerm {
+						timestamp: now_msec(),
+						allow_read: la.allow.read,
+						allow_write: la.allow.write,
+						allow_owner: la.allow.owner,
+					},
+				)
+				.await?;
+		}
+	}
+
+	bucket_info_results(garage, bucket.id).await
+}
+
+#[derive(Deserialize)] // JSON body of the create-bucket request; both fields are optional
+#[serde(rename_all = "camelCase")]
+struct CreateBucketRequest {
+	global_alias: Option<String>,
+	local_alias: Option<CreateBucketLocalAlias>,
+}
+
+#[derive(Deserialize)] // Local alias to set on a newly created bucket
+#[serde(rename_all = "camelCase")]
+struct CreateBucketLocalAlias {
+	access_key_id: String, // key that will own the alias
+	alias: String,
+	#[serde(default)] // `allow` may be omitted from the request; it then takes ApiBucketKeyPerm's default
+	allow: ApiBucketKeyPerm,
+}
+
+pub async fn handle_delete_bucket( // Deletes an empty bucket with its permissions and aliases; answers 204 No Content
+	garage: &Arc<Garage>,
+	id: String,
+) -> Result<Response<Body>, Error> {
+	let helper = garage.bucket_helper();
+
+	let bucket_id = parse_bucket_id(&id)?;
+
+	let mut bucket = helper.get_existing_bucket(bucket_id).await?;
+	let state = bucket.state.as_option().unwrap(); // assumes get_existing_bucket only returns non-deleted buckets — TODO confirm
+
+	// Check bucket is empty
+	if !helper.is_bucket_empty(bucket_id).await? {
+		return Err(CommonError::BucketNotEmpty.into());
+	}
+
+	// --- done checking, now commit ---
+	// 1. delete authorization from keys that had access
+	for (key_id, perm) in bucket.authorized_keys() {
+		if perm.is_any() { // entries for which is_any() is false are left untouched
+			helper
+				.set_bucket_key_permissions(bucket.id, key_id, BucketKeyPerm::NO_PERMISSIONS)
+				.await?;
+		}
+	}
+	// 2. delete all local aliases
+	for ((key_id, alias), _, active) in state.local_aliases.items().iter() {
+		if *active {
+			helper
+				.unset_local_bucket_alias(bucket.id, key_id, alias)
+				.await?;
+		}
+	}
+	// 3. delete all global aliases
+	for (alias, _, active) in state.aliases.items().iter() {
+		if *active {
+			helper.purge_global_bucket_alias(bucket.id, alias).await?; // uses `purge_…`, not `unset_global_bucket_alias` as the unalias endpoint does
+		}
+	}
+
+	// 4. delete bucket
+	bucket.state = Deletable::delete(); // the row is overwritten with a deleted state and re-inserted, not removed from the table
+	garage.bucket_table.insert(&bucket).await?;
+
+	Ok(Response::builder()
+		.status(StatusCode::NO_CONTENT)
+		.body(Body::empty())?)
+}
+
+pub async fn handle_update_bucket( // Updates website access and/or quotas of a bucket; returns the refreshed bucket info
+	garage: &Arc<Garage>,
+	id: String,
+	req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+	let req = parse_json_body::<UpdateBucketRequest>(req).await?; // the body is parsed before the id is validated
+	let bucket_id = parse_bucket_id(&id)?;
+
+	let mut bucket = garage
+		.bucket_helper()
+		.get_existing_bucket(bucket_id)
+		.await?;
+
+	let state = bucket.state.as_option_mut().unwrap(); // assumes get_existing_bucket only returns non-deleted buckets — TODO confirm
+
+	if let Some(wa) = req.website_access { // an omitted section leaves the website config unchanged
+		if wa.enabled {
+			state.website_config.update(Some(WebsiteConfig {
+				index_document: wa.index_document.ok_or_bad_request( // indexDocument is mandatory when enabling
+					"Please specify indexDocument when enabling website access.",
+				)?,
+				error_document: wa.error_document,
+			}));
+		} else {
+			if wa.index_document.is_some() || wa.error_document.is_some() { // reject contradictory input instead of ignoring it
+				return Err(Error::bad_request(
+					"Cannot specify indexDocument or errorDocument when disabling website access.",
+				));
+			}
+			state.website_config.update(None);
+		}
+	}
+
+	if let Some(q) = req.quotas { // when given, both quota fields are replaced together (a missing field becomes None)
+		state.quotas.update(BucketQuotas {
+			max_size: q.max_size,
+			max_objects: q.max_objects,
+		});
+	}
+
+	garage.bucket_table.insert(&bucket).await?;
+
+	bucket_info_results(garage, bucket_id).await
+}
+
+#[derive(Deserialize)] // JSON body of the update-bucket request; an omitted section is left unchanged
+#[serde(rename_all = "camelCase")]
+struct UpdateBucketRequest {
+	website_access: Option<UpdateBucketWebsiteAccess>,
+	quotas: Option<ApiBucketQuotas>,
+}
+
+#[derive(Deserialize)] // Website-access section of the update-bucket request
+#[serde(rename_all = "camelCase")]
+struct UpdateBucketWebsiteAccess {
+	enabled: bool,
+	index_document: Option<String>, // required when `enabled` is true, rejected when false
+	error_document: Option<String>, // optional when enabling, rejected when disabling
+}
+
+// ---- BUCKET/KEY PERMISSIONS ----
+
+pub async fn handle_bucket_change_key_perm( // Sets the selected permissions of a key on a bucket to `new_perm_flag` (true = allow, false = deny)
+	garage: &Arc<Garage>,
+	req: Request<Body>,
+	new_perm_flag: bool,
+) -> Result<Response<Body>, Error> {
+	let req = parse_json_body::<BucketKeyPermChangeRequest>(req).await?;
+
+	let bucket_id = parse_bucket_id(&req.bucket_id)?;
+
+	let bucket = garage
+		.bucket_helper()
+		.get_existing_bucket(bucket_id)
+		.await?;
+	let state = bucket.state.as_option().unwrap(); // assumes get_existing_bucket only returns non-deleted buckets — TODO confirm
+
+	let key = garage
+		.key_helper()
+		.get_existing_key(&req.access_key_id)
+		.await?;
+
+	let mut perm = state // start from the key's current permissions on this bucket
+		.authorized_keys
+		.get(&key.key_id)
+		.cloned()
+		.unwrap_or(BucketKeyPerm::NO_PERMISSIONS); // a key with no entry starts from NO_PERMISSIONS
+
+	if req.permissions.read { // request flags select which permissions to change; unselected ones keep their current value
+		perm.allow_read = new_perm_flag;
+	}
+	if req.permissions.write {
+		perm.allow_write = new_perm_flag;
+	}
+	if req.permissions.owner {
+		perm.allow_owner = new_perm_flag;
+	}
+
+	garage
+		.bucket_helper()
+		.set_bucket_key_permissions(bucket.id, &key.key_id, perm)
+		.await?;
+
+	bucket_info_results(garage, bucket.id).await
+}
+
+#[derive(Deserialize)] // JSON body shared by the allow-key and deny-key endpoints
+#[serde(rename_all = "camelCase")]
+struct BucketKeyPermChangeRequest {
+	bucket_id: String, // hex-encoded bucket id
+	access_key_id: String,
+	permissions: ApiBucketKeyPerm, // flags selecting which permissions to change, not the resulting values
+}
+
+// ---- BUCKET ALIASES ----
+
+pub async fn handle_global_alias_bucket( // Sets global alias `alias` on the bucket, then returns the refreshed bucket info
+	garage: &Arc<Garage>,
+	bucket_id: String,
+	alias: String,
+) -> Result<Response<Body>, Error> {
+	let bucket_id = parse_bucket_id(&bucket_id)?; // shadow the hex string with the parsed Uuid
+
+	garage
+		.bucket_helper()
+		.set_global_bucket_alias(bucket_id, &alias)
+		.await?;
+
+	bucket_info_results(garage, bucket_id).await
+}
+
+pub async fn handle_global_unalias_bucket( // Unsets global alias `alias` on the bucket, then returns the refreshed bucket info
+	garage: &Arc<Garage>,
+	bucket_id: String,
+	alias: String,
+) -> Result<Response<Body>, Error> {
+	let bucket_id = parse_bucket_id(&bucket_id)?; // shadow the hex string with the parsed Uuid
+
+	garage
+		.bucket_helper()
+		.unset_global_bucket_alias(bucket_id, &alias)
+		.await?;
+
+	bucket_info_results(garage, bucket_id).await
+}
+
+pub async fn handle_local_alias_bucket( // Sets local alias `alias` for key `access_key_id` on the bucket, then returns the refreshed bucket info
+	garage: &Arc<Garage>,
+	bucket_id: String,
+	access_key_id: String,
+	alias: String,
+) -> Result<Response<Body>, Error> {
+	let bucket_id = parse_bucket_id(&bucket_id)?; // shadow the hex string with the parsed Uuid
+
+	garage
+		.bucket_helper()
+		.set_local_bucket_alias(bucket_id, &access_key_id, &alias)
+		.await?;
+
+	bucket_info_results(garage, bucket_id).await
+}
+
+pub async fn handle_local_unalias_bucket( // Unsets local alias `alias` for key `access_key_id` on the bucket, then returns the refreshed bucket info
+	garage: &Arc<Garage>,
+	bucket_id: String,
+	access_key_id: String,
+	alias: String,
+) -> Result<Response<Body>, Error> {
+	let bucket_id = parse_bucket_id(&bucket_id)?; // shadow the hex string with the parsed Uuid
+
+	garage
+		.bucket_helper()
+		.unset_local_bucket_alias(bucket_id, &access_key_id, &alias)
+		.await?;
+
+	bucket_info_results(garage, bucket_id).await
+}
+
+// ---- HELPER ----
+
+fn parse_bucket_id(id: &str) -> Result<Uuid, Error> { // Parses a hex-encoded bucket id; any failure is a bad-request error "Invalid bucket id"
+	let id_hex = hex::decode(&id).ok_or_bad_request("Invalid bucket id")?; // step 1: hex string -> raw bytes
+	Ok(Uuid::try_from(&id_hex).ok_or_bad_request("Invalid bucket id")?) // step 2: bytes -> Uuid; presumably fails on wrong length — TODO confirm
+}
diff --git a/src/api/admin/cluster.rs b/src/api/admin/cluster.rs
new file mode 100644
index 00000000..99c6e332
--- /dev/null
+++ b/src/api/admin/cluster.rs
@@ -0,0 +1,193 @@
+use std::collections::HashMap;
+use std::net::SocketAddr;
+use std::sync::Arc;
+
+use hyper::{Body, Request, Response, StatusCode};
+use serde::{Deserialize, Serialize};
+
+use garage_util::crdt::*;
+use garage_util::data::*;
+
+use garage_rpc::layout::*;
+
+use garage_model::garage::Garage;
+
+use crate::admin::error::*;
+use crate::helpers::{json_ok_response, parse_json_body};
+
+pub async fn handle_get_cluster_status(garage: &Arc<Garage>) -> Result<Response<Body>, Error> { // Returns JSON with this node's id, version info, known nodes and the cluster layout
+	let res = GetClusterStatusResponse {
+		node: hex::encode(garage.system.id), // id of the node answering the request
+		garage_version: garage_util::version::garage_version(),
+		garage_features: garage_util::version::garage_features(),
+		db_engine: garage.db.engine(),
+		known_nodes: garage
+			.system
+			.get_known_nodes()
+			.into_iter()
+			.map(|i| { // build (hex node id, node info) pairs, collected into the response map
+				(
+					hex::encode(i.id),
+					KnownNodeResp {
+						addr: i.addr,
+						is_up: i.is_up,
+						last_seen_secs_ago: i.last_seen_secs_ago,
+						hostname: i.status.hostname,
+					},
+				)
+			})
+			.collect(),
+		layout: get_cluster_layout(garage), // same payload as the get-cluster-layout endpoint
+	};
+
+	Ok(json_ok_response(&res)?)
+}
+
+pub async fn handle_connect_cluster_nodes( // Tries to connect to each node in the request's JSON string array; returns one result per node
+	garage: &Arc<Garage>,
+	req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+	let req = parse_json_body::<Vec<String>>(req).await?;
+
+	let res = futures::future::join_all(req.iter().map(|node| garage.system.connect(node))) // all connection attempts are awaited together
+		.await
+		.into_iter()
+		.map(|r| match r { // a failed connection is reported in the body, it does not fail the request
+			Ok(()) => ConnectClusterNodesResponse {
+				success: true,
+				error: None,
+			},
+			Err(e) => ConnectClusterNodesResponse {
+				success: false,
+				error: Some(format!("{}", e)),
+			},
+		})
+		.collect::<Vec<_>>();
+
+	Ok(json_ok_response(&res)?)
+}
+
+pub async fn handle_get_cluster_layout(garage: &Arc<Garage>) -> Result<Response<Body>, Error> { // Returns the cluster layout (version, roles, staged role changes) as JSON
+	let res = get_cluster_layout(garage);
+
+	Ok(json_ok_response(&res)?)
+}
+
+fn get_cluster_layout(garage: &Arc<Garage>) -> GetClusterLayoutResponse { // Builds the layout payload shared by the status and layout endpoints
+	let layout = garage.system.get_cluster_layout();
+
+	GetClusterLayoutResponse {
+		version: layout.version,
+		roles: layout
+			.roles
+			.items()
+			.iter()
+			.filter(|(_, _, v)| v.0.is_some()) // skip entries whose role is None
+			.map(|(k, _, v)| (hex::encode(k), v.0.clone()))
+			.collect(),
+		staged_role_changes: layout
+			.staging
+			.items()
+			.iter()
+			.filter(|(k, _, v)| layout.roles.get(k) != Some(v)) // report only staged entries that differ from the current role
+			.map(|(k, _, v)| (hex::encode(k), v.0.clone())) // a staged None is kept here, unlike in `roles`
+			.collect(),
+	}
+}
+
+#[derive(Serialize)] // JSON body of the get-cluster-status response
+#[serde(rename_all = "camelCase")]
+struct GetClusterStatusResponse {
+	node: String, // hex id of the responding node
+	garage_version: &'static str,
+	garage_features: Option<&'static [&'static str]>,
+	db_engine: String,
+	known_nodes: HashMap<String, KnownNodeResp>, // keyed by hex node id
+	layout: GetClusterLayoutResponse,
+}
+
+#[derive(Serialize)] // Outcome of one connection attempt in connect-cluster-nodes
+struct ConnectClusterNodesResponse {
+	success: bool,
+	error: Option<String>, // formatted error message when `success` is false, otherwise None
+}
+
+#[derive(Serialize)] // Cluster layout as exposed by the API
+#[serde(rename_all = "camelCase")]
+struct GetClusterLayoutResponse {
+	version: u64,
+	roles: HashMap<String, Option<NodeRole>>, // keyed by hex node id; only entries with a role are emitted
+	staged_role_changes: HashMap<String, Option<NodeRole>>, // keyed by hex node id; may contain None values
+}
+
+#[derive(Serialize)] // Per-node info in the cluster status; no rename_all here, so field names stay snake_case in JSON
+struct KnownNodeResp {
+	addr: SocketAddr,
+	is_up: bool,
+	last_seen_secs_ago: Option<u64>,
+	hostname: String,
+}
+
+pub async fn handle_update_cluster_layout( // Stages the role changes from the request body (hex node id -> role or null); empty 200 on success
+	garage: &Arc<Garage>,
+	req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+	let updates = parse_json_body::<UpdateClusterLayoutRequest>(req).await?;
+
+	let mut layout = garage.system.get_cluster_layout();
+
+	let mut roles = layout.roles.clone(); // working copy: current roles with staged changes merged in
+	roles.merge(&layout.staging);
+
+	for (node, role) in updates {
+		let node = hex::decode(node).ok_or_bad_request("Invalid node identifier")?;
+		let node = Uuid::try_from(&node).ok_or_bad_request("Invalid node identifier")?;
+
+		layout
+			.staging
+			.merge(&roles.update_mutator(node, NodeRoleV(role))); // the mutator comes from the merged view and is applied to staging only
+	}
+
+	garage.system.update_cluster_layout(&layout).await?;
+
+	Ok(Response::builder()
+		.status(StatusCode::OK)
+		.body(Body::empty())?)
+}
+
+pub async fn handle_apply_cluster_layout( // Applies the staged layout changes using the version from the request; empty 200 on success
+	garage: &Arc<Garage>,
+	req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+	let param = parse_json_body::<ApplyRevertLayoutRequest>(req).await?;
+
+	let layout = garage.system.get_cluster_layout();
+	let layout = layout.apply_staged_changes(Some(param.version))?; // version is always passed; presumably checked against the expected one — TODO confirm
+	garage.system.update_cluster_layout(&layout).await?;
+
+	Ok(Response::builder()
+		.status(StatusCode::OK)
+		.body(Body::empty())?)
+}
+
+pub async fn handle_revert_cluster_layout( // Reverts the staged layout changes using the version from the request; empty 200 on success
+	garage: &Arc<Garage>,
+	req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+	let param = parse_json_body::<ApplyRevertLayoutRequest>(req).await?;
+
+	let layout = garage.system.get_cluster_layout();
+	let layout = layout.revert_staged_changes(Some(param.version))?; // version is always passed; presumably checked against the expected one — TODO confirm
+	garage.system.update_cluster_layout(&layout).await?;
+
+	Ok(Response::builder()
+		.status(StatusCode::OK)
+		.body(Body::empty())?)
+}
+
+type UpdateClusterLayoutRequest = HashMap<String, Option<NodeRole>>; // update-layout body: hex node id -> new role (None/null is staged as a role of None)
+
+#[derive(Deserialize)] // JSON body shared by the apply-layout and revert-layout endpoints
+struct ApplyRevertLayoutRequest {
+	version: u64, // passed as Some(version) to apply_staged_changes / revert_staged_changes
+}
diff --git a/src/api/admin/error.rs b/src/api/admin/error.rs
new file mode 100644
index 00000000..ed1a07bd
--- /dev/null
+++ b/src/api/admin/error.rs
@@ -0,0 +1,97 @@
+use err_derive::Error;
+use hyper::header::HeaderValue;
+use hyper::{Body, HeaderMap, StatusCode};
+
+pub use garage_model::helper::error::Error as HelperError;
+
+use crate::common_error::CommonError;
+pub use crate::common_error::{CommonErrorDerivative, OkOrBadRequest, OkOrInternalError};
+use crate::generic_server::ApiError;
+use crate::helpers::CustomApiErrorBody;
+
+/// Errors returned by the admin API handlers
+#[derive(Debug, Error)]
+pub enum Error {
+	/// A wrapped `CommonError`, displayed using its own message
+	#[error(display = "{}", _0)]
+	Common(CommonError),
+
+	// Category: cannot process
+	/// The API access key does not exist
+	#[error(display = "Access key not found: {}", _0)]
+	NoSuchAccessKey(String),
+
+	/// ImportKey was called with a key ID that already exists in the data store
+	#[error(
+		display = "Key {} already exists in data store. Even if it is deleted, we can't let you create a new key with the same ID. Sorry.",
+		_0
+	)]
+	KeyAlreadyExists(String),
+}
+
+impl<T> From<T> for Error
+where
+	CommonError: From<T>,
+{
+	fn from(err: T) -> Self {
+		Error::Common(CommonError::from(err))
+	}
+}
+
+impl CommonErrorDerivative for Error {}
+
+impl From<HelperError> for Error {
+	fn from(err: HelperError) -> Self {
+		match err {
+			HelperError::Internal(i) => Self::Common(CommonError::InternalError(i)),
+			HelperError::BadRequest(b) => Self::Common(CommonError::BadRequest(b)),
+			HelperError::InvalidBucketName(n) => Self::Common(CommonError::InvalidBucketName(n)),
+			HelperError::NoSuchBucket(n) => Self::Common(CommonError::NoSuchBucket(n)),
+			HelperError::NoSuchAccessKey(n) => Self::NoSuchAccessKey(n),
+		}
+	}
+}
+
+impl Error {
+	fn code(&self) -> &'static str {
+		match self {
+			Error::Common(c) => c.aws_code(),
+			Error::NoSuchAccessKey(_) => "NoSuchAccessKey",
+			Error::KeyAlreadyExists(_) => "KeyAlreadyExists",
+		}
+	}
+}
+
+impl ApiError for Error {
+	/// Get the HTTP status code that best represents the meaning of the error for the client
+	fn http_status_code(&self) -> StatusCode {
+		match self {
+			Error::Common(c) => c.http_status_code(),
+			Error::NoSuchAccessKey(_) => StatusCode::NOT_FOUND,
+			Error::KeyAlreadyExists(_) => StatusCode::CONFLICT,
+		}
+	}
+
+	fn add_http_headers(&self, header_map: &mut HeaderMap<HeaderValue>) {
+		use hyper::header;
+		header_map.append(header::CONTENT_TYPE, HeaderValue::from_static("application/json"));
+	}
+
+	fn http_body(&self, garage_region: &str, path: &str) -> Body {
+		let error = CustomApiErrorBody {
+			code: self.code().to_string(),
+			message: self.to_string(),
+			path: path.to_string(),
+			region: garage_region.to_string(),
+		};
+		Body::from(serde_json::to_string_pretty(&error).unwrap_or_else(|_| {
+			r#"
+{
+	"code": "InternalError",
+	"message": "JSON encoding of error failed"
+}
+			"#
+			.into()
+		}))
+	}
+}
diff --git a/src/api/admin/key.rs b/src/api/admin/key.rs
new file mode 100644
index 00000000..2bbabb7b
--- /dev/null
+++ b/src/api/admin/key.rs
@@ -0,0 +1,256 @@
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use hyper::{Body, Request, Response, StatusCode};
+use serde::{Deserialize, Serialize};
+
+use garage_table::*;
+
+use garage_model::garage::Garage;
+use garage_model::key_table::*;
+
+use crate::admin::error::*;
+use crate::helpers::{json_ok_response, parse_json_body};
+
+/// List the keys that pass the "not deleted" filter, as `{id, name}` items
+/// returned through `json_ok_response`. At most 10000 keys are requested.
+pub async fn handle_list_keys(garage: &Arc<Garage>) -> Result<Response<Body>, Error> {
+	let not_deleted = Some(KeyFilter::Deleted(DeletedFilter::NotDeleted));
+	let keys = garage
+		.key_table
+		.get_range(&EmptyKey, None, not_deleted, 10000, EnumerationOrder::Forward)
+		.await?;
+
+	// `params()` is unwrapped: the filter above is expected to exclude deleted keys.
+	let mut res = Vec::new();
+	for key in keys.iter() {
+		res.push(ListKeyResultItem {
+			id: key.key_id.to_string(),
+			name: key.params().unwrap().name.get().clone(),
+		});
+	}
+
+	Ok(json_ok_response(&res)?)
+}
+
+#[derive(Serialize)]
+struct ListKeyResultItem {
+	id: String,
+	name: String,
+}
+
+/// Look up a single key, either by its exact `id` or by a `search` pattern,
+/// and return its detailed information. When both parameters are given,
+/// `id` takes precedence and `search` is ignored.
+pub async fn handle_get_key_info(
+	garage: &Arc<Garage>,
+	id: Option<String>,
+	search: Option<String>,
+) -> Result<Response<Body>, Error> {
+	let key = match (id, search) {
+		(Some(id), _) => garage.key_helper().get_existing_key(&id).await?,
+		(None, Some(search)) => garage.key_helper().get_existing_matching_key(&search).await?,
+		// The router only builds `GetKeyInfo` when `id` or `search` is present,
+		// so reaching this arm means a caller passed neither parameter.
+		(None, None) => unreachable!(),
+	};
+
+	key_info_results(garage, key).await
+}
+
+pub async fn handle_create_key(
+	garage: &Arc<Garage>,
+	req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+	// The request body only carries the name of the new key.
+	let body: CreateKeyRequest = parse_json_body(req).await?;
+	let new_key = Key::new(&body.name);
+	garage.key_table.insert(&new_key).await?;
+
+	key_info_results(garage, new_key).await
+}
+
+#[derive(Deserialize)]
+struct CreateKeyRequest {
+	name: String,
+}
+
+pub async fn handle_import_key(
+	garage: &Arc<Garage>,
+	req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+	let params: ImportKeyRequest = parse_json_body(req).await?;
+
+	// Refuse the import if the table already holds any entry for this key ID.
+	let existing = garage.key_table.get(&EmptyKey, &params.access_key_id).await?;
+	if existing.is_some() {
+		return Err(Error::KeyAlreadyExists(params.access_key_id.to_string()));
+	}
+
+	let key = Key::import(&params.access_key_id, &params.secret_access_key, &params.name);
+	garage.key_table.insert(&key).await?;
+	key_info_results(garage, key).await
+}
+
+#[derive(Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct ImportKeyRequest {
+	access_key_id: String,
+	secret_access_key: String,
+	name: String,
+}
+
+pub async fn handle_update_key(
+	garage: &Arc<Garage>,
+	id: String,
+	req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+	let changes: UpdateKeyRequest = parse_json_body(req).await?;
+
+	let mut key = garage.key_helper().get_existing_key(&id).await?;
+	// NOTE(review): panics if the key has no live state; presumably
+	// `get_existing_key` never returns such a key -- confirm.
+	let params = key.state.as_option_mut().unwrap();
+
+	if let Some(name) = changes.name {
+		params.name.update(name);
+	}
+	// `allow` is handled before `deny`: when both set `createBucket`, the
+	// `update(false)` call is the one made last.
+	if matches!(changes.allow, Some(KeyPerm { create_bucket: true })) {
+		params.allow_create_bucket.update(true);
+	}
+	if matches!(changes.deny, Some(KeyPerm { create_bucket: true })) {
+		params.allow_create_bucket.update(false);
+	}
+
+	// Write the modified key back before reporting its new state.
+	garage.key_table.insert(&key).await?;
+
+	key_info_results(garage, key).await
+}
+
+#[derive(Deserialize)]
+struct UpdateKeyRequest {
+	name: Option<String>,
+	allow: Option<KeyPerm>,
+	deny: Option<KeyPerm>,
+}
+
+pub async fn handle_delete_key(garage: &Arc<Garage>, id: String) -> Result<Response<Body>, Error> {
+	let mut key = garage.key_helper().get_existing_key(&id).await?;
+
+	// Sanity check only: panics if `as_option()` returns `None` for this key.
+	key.state.as_option().unwrap();
+
+	garage.key_helper().delete_key(&mut key).await?;
+
+	let no_content = Response::builder().status(StatusCode::NO_CONTENT);
+	Ok(no_content.body(Body::empty())?)
+}
+
+async fn key_info_results(garage: &Arc<Garage>, key: Key) -> Result<Response<Body>, Error> {
+	let mut relevant_buckets = HashMap::new();
+	// NOTE(review): panics if `as_option()` is `None`; callers presumably pass live keys.
+	let key_state = key.state.as_option().unwrap();
+	// Visit bucket ids from `authorized_buckets` and from set `local_aliases` entries.
+	for id in key_state
+		.authorized_buckets
+		.items()
+		.iter()
+		.map(|(id, _)| id)
+		.chain(
+			key_state
+				.local_aliases
+				.items()
+				.iter()
+				.filter_map(|(_, _, v)| v.as_ref()),
+		) {
+		if !relevant_buckets.contains_key(id) {
+			if let Some(b) = garage.bucket_table.get(&EmptyKey, id).await? {
+				if b.state.as_option().is_some() {
+					relevant_buckets.insert(*id, b);
+				}
+			}
+		}
+	}
+	// Build the serializable summary of the key and of the buckets kept above.
+	let res = GetKeyInfoResult {
+		name: key_state.name.get().clone(),
+		access_key_id: key.key_id.clone(),
+		secret_access_key: key_state.secret_key.clone(),
+		permissions: KeyPerm {
+			create_bucket: *key_state.allow_create_bucket.get(),
+		},
+		buckets: relevant_buckets
+			.into_iter()
+			.map(|(_, bucket)| {
+				let state = bucket.state.as_option().unwrap();
+				KeyInfoBucketResult {
+					id: hex::encode(bucket.id),
+					global_aliases: state
+						.aliases
+						.items()
+						.iter()
+						.filter(|(_, _, a)| *a)
+						.map(|(n, _, _)| n.to_string())
+						.collect::<Vec<_>>(),
+					local_aliases: state
+						.local_aliases
+						.items()
+						.iter()
+						.filter(|((k, _), _, a)| *a && *k == key.key_id)
+						.map(|((_, n), _, _)| n.to_string())
+						.collect::<Vec<_>>(),
+					permissions: key_state
+						.authorized_buckets
+						.get(&bucket.id)
+						.map(|p| ApiBucketKeyPerm {
+							read: p.allow_read,
+							write: p.allow_write,
+							owner: p.allow_owner,
+						})
+						.unwrap_or_default(),
+				}
+			})
+			.collect::<Vec<_>>(),
+	};
+
+	Ok(json_ok_response(&res)?)
+}
+
+#[derive(Serialize)]
+#[serde(rename_all = "camelCase")]
+struct GetKeyInfoResult {
+	name: String,
+	access_key_id: String,
+	secret_access_key: String,
+	permissions: KeyPerm,
+	buckets: Vec<KeyInfoBucketResult>,
+}
+
+#[derive(Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct KeyPerm {
+	#[serde(default)]
+	create_bucket: bool,
+}
+
+#[derive(Serialize)]
+#[serde(rename_all = "camelCase")]
+struct KeyInfoBucketResult {
+	id: String,
+	global_aliases: Vec<String>,
+	local_aliases: Vec<String>,
+	permissions: ApiBucketKeyPerm,
+}
+
+#[derive(Serialize, Deserialize, Default)]
+pub(crate) struct ApiBucketKeyPerm {
+	#[serde(default)]
+	pub(crate) read: bool,
+	#[serde(default)]
+	pub(crate) write: bool,
+	#[serde(default)]
+	pub(crate) owner: bool,
+}
diff --git a/src/api/admin/mod.rs b/src/api/admin/mod.rs
new file mode 100644
index 00000000..c4857c10
--- /dev/null
+++ b/src/api/admin/mod.rs
@@ -0,0 +1,7 @@
+pub mod api_server;
+mod error;
+mod router;
+
+mod bucket;
+mod cluster;
+mod key;
diff --git a/src/api/admin/router.rs b/src/api/admin/router.rs
new file mode 100644
index 00000000..3eee8b67
--- /dev/null
+++ b/src/api/admin/router.rs
@@ -0,0 +1,145 @@
+use std::borrow::Cow;
+
+use hyper::{Method, Request};
+
+use crate::admin::error::*;
+use crate::router_macros::*;
+
+pub enum Authorization {
+	MetricsToken,
+	AdminToken,
+}
+
+router_match! {@func
+
+/// List of all Admin API endpoints.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Endpoint {
+	Options,
+	Metrics,
+	GetClusterStatus,
+	ConnectClusterNodes,
+	// Layout
+	GetClusterLayout,
+	UpdateClusterLayout,
+	ApplyClusterLayout,
+	RevertClusterLayout,
+	// Keys
+	ListKeys,
+	CreateKey,
+	ImportKey,
+	GetKeyInfo {
+		id: Option<String>,
+		search: Option<String>,
+	},
+	DeleteKey {
+		id: String,
+	},
+	UpdateKey {
+		id: String,
+	},
+	// Buckets
+	ListBuckets,
+	CreateBucket,
+	GetBucketInfo {
+		id: Option<String>,
+		global_alias: Option<String>,
+	},
+	DeleteBucket {
+		id: String,
+	},
+	UpdateBucket {
+		id: String,
+	},
+	// Bucket-Key Permissions
+	BucketAllowKey,
+	BucketDenyKey,
+	// Bucket aliases
+	GlobalAliasBucket {
+		id: String,
+		alias: String,
+	},
+	GlobalUnaliasBucket {
+		id: String,
+		alias: String,
+	},
+	LocalAliasBucket {
+		id: String,
+		access_key_id: String,
+		alias: String,
+	},
+	LocalUnaliasBucket {
+		id: String,
+		access_key_id: String,
+		alias: String,
+	},
+}}
+
+impl Endpoint {
+	/// Determine which admin API endpoint a request is for, using the request's
+	/// method, path and query string. Query parameters left over after routing
+	/// are not an error: they are only reported in a debug log message.
+	pub fn from_request<T>(req: &Request<T>) -> Result<Self, Error> {
+		let uri = req.uri();
+		let path = uri.path();
+		let query = uri.query();
+
+		let mut query = QueryParameters::from_query(query.unwrap_or_default())?;
+
+		let res = router_match!(@gen_path_parser (req.method(), path, query) [
+			OPTIONS _ => Options,
+			GET "/metrics" => Metrics,
+			GET "/v0/status" => GetClusterStatus,
+			POST "/v0/connect" => ConnectClusterNodes,
+			// Layout endpoints
+			GET "/v0/layout" => GetClusterLayout,
+			POST "/v0/layout" => UpdateClusterLayout,
+			POST "/v0/layout/apply" => ApplyClusterLayout,
+			POST "/v0/layout/revert" => RevertClusterLayout,
+			// API key endpoints
+			GET "/v0/key" if id => GetKeyInfo (query_opt::id, query_opt::search),
+			GET "/v0/key" if search => GetKeyInfo (query_opt::id, query_opt::search),
+			POST "/v0/key" if id => UpdateKey (query::id),
+			POST "/v0/key" => CreateKey,
+			POST "/v0/key/import" => ImportKey,
+			DELETE "/v0/key" if id => DeleteKey (query::id),
+			GET "/v0/key" => ListKeys,
+			// Bucket endpoints
+			GET "/v0/bucket" if id => GetBucketInfo (query_opt::id, query_opt::global_alias),
+			GET "/v0/bucket" if global_alias => GetBucketInfo (query_opt::id, query_opt::global_alias),
+			GET "/v0/bucket" => ListBuckets,
+			POST "/v0/bucket" => CreateBucket,
+			DELETE "/v0/bucket" if id => DeleteBucket (query::id),
+			PUT "/v0/bucket" if id => UpdateBucket (query::id),
+			// Bucket-key permissions
+			POST "/v0/bucket/allow" => BucketAllowKey,
+			POST "/v0/bucket/deny" => BucketDenyKey,
+			// Bucket aliases
+			PUT "/v0/bucket/alias/global" => GlobalAliasBucket (query::id, query::alias),
+			DELETE "/v0/bucket/alias/global" => GlobalUnaliasBucket (query::id, query::alias),
+			PUT "/v0/bucket/alias/local" => LocalAliasBucket (query::id, query::access_key_id, query::alias),
+			DELETE "/v0/bucket/alias/local" => LocalUnaliasBucket (query::id, query::access_key_id, query::alias),
+		]);
+
+		if let Some(message) = query.nonempty_message() {
+			debug!("Unused query parameter: {}", message)
+		}
+
+		Ok(res)
+	}
+	/// Get the kind of authorization which is required to perform the operation.
+	pub fn authorization_type(&self) -> Authorization {
+		match self {
+			Self::Metrics => Authorization::MetricsToken,
+			_ => Authorization::AdminToken,
+		}
+	}
+}
+
+generateQueryParameters! {
+	"id" => id,
+	"search" => search,
+	"globalAlias" => global_alias,
+	"alias" => alias,
+	"accessKeyId" => access_key_id
+}
diff --git a/src/api/api_server.rs b/src/api/api_server.rs
deleted file mode 100644
index e7b86d9e..00000000
--- a/src/api/api_server.rs
+++ /dev/null
@@ -1,645 +0,0 @@
-use std::net::SocketAddr;
-use std::sync::Arc;
-
-use chrono::{DateTime, NaiveDateTime, Utc};
-use futures::future::Future;
-use futures::prelude::*;
-use hyper::header;
-use hyper::server::conn::AddrStream;
-use hyper::service::{make_service_fn, service_fn};
-use hyper::{Body, Method, Request, Response, Server};
-
-use opentelemetry::{
-	global,
-	metrics::{Counter, ValueRecorder},
-	trace::{FutureExt, TraceContextExt, Tracer},
-	Context, KeyValue,
-};
-
-use garage_util::data::*;
-use garage_util::error::Error as GarageError;
-use garage_util::metrics::{gen_trace_id, RecordDuration};
-
-use garage_model::garage::Garage;
-use garage_model::key_table::Key;
-
-use garage_table::util::*;
-
-use crate::error::*;
-use crate::signature::compute_scope;
-use crate::signature::payload::check_payload_signature;
-use crate::signature::streaming::SignedPayloadStream;
-use crate::signature::LONG_DATETIME;
-
-use crate::helpers::*;
-use crate::s3_bucket::*;
-use crate::s3_copy::*;
-use crate::s3_cors::*;
-use crate::s3_delete::*;
-use crate::s3_get::*;
-use crate::s3_list::*;
-use crate::s3_post_object::handle_post_object;
-use crate::s3_put::*;
-use crate::s3_router::{Authorization, Endpoint};
-use crate::s3_website::*;
-
-struct ApiMetrics {
-	request_counter: Counter<u64>,
-	error_counter: Counter<u64>,
-	request_duration: ValueRecorder<f64>,
-}
-
-impl ApiMetrics {
-	fn new() -> Self {
-		let meter = global::meter("garage/api");
-		Self {
-			request_counter: meter
-				.u64_counter("api.request_counter")
-				.with_description("Number of API calls to the various S3 API endpoints")
-				.init(),
-			error_counter: meter
-				.u64_counter("api.error_counter")
-				.with_description(
-					"Number of API calls to the various S3 API endpoints that resulted in errors",
-				)
-				.init(),
-			request_duration: meter
-				.f64_value_recorder("api.request_duration")
-				.with_description("Duration of API calls to the various S3 API endpoints")
-				.init(),
-		}
-	}
-}
-
-/// Run the S3 API server
-pub async fn run_api_server(
-	garage: Arc<Garage>,
-	shutdown_signal: impl Future<Output = ()>,
-) -> Result<(), GarageError> {
-	let addr = &garage.config.s3_api.api_bind_addr;
-
-	let metrics = Arc::new(ApiMetrics::new());
-
-	let service = make_service_fn(|conn: &AddrStream| {
-		let garage = garage.clone();
-		let metrics = metrics.clone();
-
-		let client_addr = conn.remote_addr();
-		async move {
-			Ok::<_, GarageError>(service_fn(move |req: Request<Body>| {
-				let garage = garage.clone();
-				let metrics = metrics.clone();
-
-				handler(garage, metrics, req, client_addr)
-			}))
-		}
-	});
-
-	let server = Server::bind(addr).serve(service);
-
-	let graceful = server.with_graceful_shutdown(shutdown_signal);
-	info!("API server listening on http://{}", addr);
-
-	graceful.await?;
-	Ok(())
-}
-
-async fn handler(
-	garage: Arc<Garage>,
-	metrics: Arc<ApiMetrics>,
-	req: Request<Body>,
-	addr: SocketAddr,
-) -> Result<Response<Body>, GarageError> {
-	let uri = req.uri().clone();
-	info!("{} {} {}", addr, req.method(), uri);
-	debug!("{:?}", req);
-
-	let tracer = opentelemetry::global::tracer("garage");
-	let span = tracer
-		.span_builder("S3 API call (unknown)")
-		.with_trace_id(gen_trace_id())
-		.with_attributes(vec![
-			KeyValue::new("method", format!("{}", req.method())),
-			KeyValue::new("uri", req.uri().to_string()),
-		])
-		.start(&tracer);
-
-	let res = handler_stage2(garage.clone(), metrics, req)
-		.with_context(Context::current_with_span(span))
-		.await;
-
-	match res {
-		Ok(x) => {
-			debug!("{} {:?}", x.status(), x.headers());
-			Ok(x)
-		}
-		Err(e) => {
-			let body: Body = Body::from(e.aws_xml(&garage.config.s3_api.s3_region, uri.path()));
-			let mut http_error_builder = Response::builder()
-				.status(e.http_status_code())
-				.header("Content-Type", "application/xml");
-
-			if let Some(header_map) = http_error_builder.headers_mut() {
-				e.add_headers(header_map)
-			}
-
-			let http_error = http_error_builder.body(body)?;
-
-			if e.http_status_code().is_server_error() {
-				warn!("Response: error {}, {}", e.http_status_code(), e);
-			} else {
-				info!("Response: error {}, {}", e.http_status_code(), e);
-			}
-			Ok(http_error)
-		}
-	}
-}
-
-async fn handler_stage2(
-	garage: Arc<Garage>,
-	metrics: Arc<ApiMetrics>,
-	req: Request<Body>,
-) -> Result<Response<Body>, Error> {
-	let authority = req
-		.headers()
-		.get(header::HOST)
-		.ok_or_bad_request("Host header required")?
-		.to_str()?;
-
-	let host = authority_to_host(authority)?;
-
-	let bucket_name = garage
-		.config
-		.s3_api
-		.root_domain
-		.as_ref()
-		.and_then(|root_domain| host_to_bucket(&host, root_domain));
-
-	let (endpoint, bucket_name) = Endpoint::from_request(&req, bucket_name.map(ToOwned::to_owned))?;
-	debug!("Endpoint: {:?}", endpoint);
-
-	let current_context = Context::current();
-	let current_span = current_context.span();
-	current_span.update_name::<String>(format!("S3 API {}", endpoint.name()));
-	current_span.set_attribute(KeyValue::new("endpoint", endpoint.name()));
-	current_span.set_attribute(KeyValue::new(
-		"bucket",
-		bucket_name.clone().unwrap_or_default(),
-	));
-
-	let metrics_tags = &[KeyValue::new("api_endpoint", endpoint.name())];
-
-	let res = handler_stage3(garage, req, endpoint, bucket_name)
-		.record_duration(&metrics.request_duration, &metrics_tags[..])
-		.await;
-
-	metrics.request_counter.add(1, &metrics_tags[..]);
-
-	let status_code = match &res {
-		Ok(r) => r.status(),
-		Err(e) => e.http_status_code(),
-	};
-	if status_code.is_client_error() || status_code.is_server_error() {
-		metrics.error_counter.add(
-			1,
-			&[
-				metrics_tags[0].clone(),
-				KeyValue::new("status_code", status_code.as_str().to_string()),
-			],
-		);
-	}
-
-	res
-}
-
-async fn handler_stage3(
-	garage: Arc<Garage>,
-	req: Request<Body>,
-	endpoint: Endpoint,
-	bucket_name: Option<String>,
-) -> Result<Response<Body>, Error> {
-	// Some endpoints are processed early, before we even check for an API key
-	if let Endpoint::PostObject = endpoint {
-		return handle_post_object(garage, req, bucket_name.unwrap()).await;
-	}
-	if let Endpoint::Options = endpoint {
-		return handle_options_s3api(garage, &req, bucket_name).await;
-	}
-
-	let (api_key, mut content_sha256) = check_payload_signature(&garage, &req).await?;
-	let api_key = api_key.ok_or_else(|| {
-		Error::Forbidden("Garage does not support anonymous access yet".to_string())
-	})?;
-
-	let req = match req.headers().get("x-amz-content-sha256") {
-		Some(header) if header == "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" => {
-			let signature = content_sha256
-				.take()
-				.ok_or_bad_request("No signature provided")?;
-
-			let secret_key = &api_key
-				.state
-				.as_option()
-				.ok_or_internal_error("Deleted key state")?
-				.secret_key;
-
-			let date = req
-				.headers()
-				.get("x-amz-date")
-				.ok_or_bad_request("Missing X-Amz-Date field")?
-				.to_str()?;
-			let date: NaiveDateTime = NaiveDateTime::parse_from_str(date, LONG_DATETIME)
-				.ok_or_bad_request("Invalid date")?;
-			let date: DateTime<Utc> = DateTime::from_utc(date, Utc);
-
-			let scope = compute_scope(&date, &garage.config.s3_api.s3_region);
-			let signing_hmac = crate::signature::signing_hmac(
-				&date,
-				secret_key,
-				&garage.config.s3_api.s3_region,
-				"s3",
-			)
-			.ok_or_internal_error("Unable to build signing HMAC")?;
-
-			req.map(move |body| {
-				Body::wrap_stream(
-					SignedPayloadStream::new(
-						body.map_err(Error::from),
-						signing_hmac,
-						date,
-						&scope,
-						signature,
-					)
-					.map_err(Error::from),
-				)
-			})
-		}
-		_ => req,
-	};
-
-	let bucket_name = match bucket_name {
-		None => return handle_request_without_bucket(garage, req, api_key, endpoint).await,
-		Some(bucket) => bucket.to_string(),
-	};
-
-	// Special code path for CreateBucket API endpoint
-	if let Endpoint::CreateBucket {} = endpoint {
-		return handle_create_bucket(&garage, req, content_sha256, api_key, bucket_name).await;
-	}
-
-	let bucket_id = resolve_bucket(&garage, &bucket_name, &api_key).await?;
-	let bucket = garage
-		.bucket_table
-		.get(&EmptyKey, &bucket_id)
-		.await?
-		.filter(|b| !b.state.is_deleted())
-		.ok_or(Error::NoSuchBucket)?;
-
-	let allowed = match endpoint.authorization_type() {
-		Authorization::Read => api_key.allow_read(&bucket_id),
-		Authorization::Write => api_key.allow_write(&bucket_id),
-		Authorization::Owner => api_key.allow_owner(&bucket_id),
-		_ => unreachable!(),
-	};
-
-	if !allowed {
-		return Err(Error::Forbidden(
-			"Operation is not allowed for this key.".to_string(),
-		));
-	}
-
-	// Look up what CORS rule might apply to response.
-	// Requests for methods different than GET, HEAD or POST
-	// are always preflighted, i.e. the browser should make
-	// an OPTIONS call before to check it is allowed
-	let matching_cors_rule = match *req.method() {
-		Method::GET | Method::HEAD | Method::POST => find_matching_cors_rule(&bucket, &req)?,
-		_ => None,
-	};
-
-	let resp = match endpoint {
-		Endpoint::HeadObject {
-			key, part_number, ..
-		} => handle_head(garage, &req, bucket_id, &key, part_number).await,
-		Endpoint::GetObject {
-			key, part_number, ..
-		} => handle_get(garage, &req, bucket_id, &key, part_number).await,
-		Endpoint::UploadPart {
-			key,
-			part_number,
-			upload_id,
-		} => {
-			handle_put_part(
-				garage,
-				req,
-				bucket_id,
-				&key,
-				part_number,
-				&upload_id,
-				content_sha256,
-			)
-			.await
-		}
-		Endpoint::CopyObject { key } => handle_copy(garage, &api_key, &req, bucket_id, &key).await,
-		Endpoint::UploadPartCopy {
-			key,
-			part_number,
-			upload_id,
-		} => {
-			handle_upload_part_copy(
-				garage,
-				&api_key,
-				&req,
-				bucket_id,
-				&key,
-				part_number,
-				&upload_id,
-			)
-			.await
-		}
-		Endpoint::PutObject { key } => {
-			handle_put(garage, req, bucket_id, &key, content_sha256).await
-		}
-		Endpoint::AbortMultipartUpload { key, upload_id } => {
-			handle_abort_multipart_upload(garage, bucket_id, &key, &upload_id).await
-		}
-		Endpoint::DeleteObject { key, .. } => handle_delete(garage, bucket_id, &key).await,
-		Endpoint::CreateMultipartUpload { key } => {
-			handle_create_multipart_upload(garage, &req, &bucket_name, bucket_id, &key).await
-		}
-		Endpoint::CompleteMultipartUpload { key, upload_id } => {
-			handle_complete_multipart_upload(
-				garage,
-				req,
-				&bucket_name,
-				bucket_id,
-				&key,
-				&upload_id,
-				content_sha256,
-			)
-			.await
-		}
-		Endpoint::CreateBucket {} => unreachable!(),
-		Endpoint::HeadBucket {} => {
-			let empty_body: Body = Body::from(vec![]);
-			let response = Response::builder().body(empty_body).unwrap();
-			Ok(response)
-		}
-		Endpoint::DeleteBucket {} => {
-			handle_delete_bucket(&garage, bucket_id, bucket_name, api_key).await
-		}
-		Endpoint::GetBucketLocation {} => handle_get_bucket_location(garage),
-		Endpoint::GetBucketVersioning {} => handle_get_bucket_versioning(),
-		Endpoint::ListObjects {
-			delimiter,
-			encoding_type,
-			marker,
-			max_keys,
-			prefix,
-		} => {
-			handle_list(
-				garage,
-				&ListObjectsQuery {
-					common: ListQueryCommon {
-						bucket_name,
-						bucket_id,
-						delimiter: delimiter.map(|d| d.to_string()),
-						page_size: max_keys.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
-						prefix: prefix.unwrap_or_default(),
-						urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false),
-					},
-					is_v2: false,
-					marker,
-					continuation_token: None,
-					start_after: None,
-				},
-			)
-			.await
-		}
-		Endpoint::ListObjectsV2 {
-			delimiter,
-			encoding_type,
-			max_keys,
-			prefix,
-			continuation_token,
-			start_after,
-			list_type,
-			..
-		} => {
-			if list_type == "2" {
-				handle_list(
-					garage,
-					&ListObjectsQuery {
-						common: ListQueryCommon {
-							bucket_name,
-							bucket_id,
-							delimiter: delimiter.map(|d| d.to_string()),
-							page_size: max_keys.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
-							urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false),
-							prefix: prefix.unwrap_or_default(),
-						},
-						is_v2: true,
-						marker: None,
-						continuation_token,
-						start_after,
-					},
-				)
-				.await
-			} else {
-				Err(Error::BadRequest(format!(
-					"Invalid endpoint: list-type={}",
-					list_type
-				)))
-			}
-		}
-		Endpoint::ListMultipartUploads {
-			delimiter,
-			encoding_type,
-			key_marker,
-			max_uploads,
-			prefix,
-			upload_id_marker,
-		} => {
-			handle_list_multipart_upload(
-				garage,
-				&ListMultipartUploadsQuery {
-					common: ListQueryCommon {
-						bucket_name,
-						bucket_id,
-						delimiter: delimiter.map(|d| d.to_string()),
-						page_size: max_uploads.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
-						prefix: prefix.unwrap_or_default(),
-						urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false),
-					},
-					key_marker,
-					upload_id_marker,
-				},
-			)
-			.await
-		}
-		Endpoint::ListParts {
-			key,
-			max_parts,
-			part_number_marker,
-			upload_id,
-		} => {
-			handle_list_parts(
-				garage,
-				&ListPartsQuery {
-					bucket_name,
-					bucket_id,
-					key,
-					upload_id,
-					part_number_marker: part_number_marker.map(|p| p.clamp(1, 10000)),
-					max_parts: max_parts.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
-				},
-			)
-			.await
-		}
-		Endpoint::DeleteObjects {} => {
-			handle_delete_objects(garage, bucket_id, req, content_sha256).await
-		}
-		Endpoint::GetBucketWebsite {} => handle_get_website(&bucket).await,
-		Endpoint::PutBucketWebsite {} => {
-			handle_put_website(garage, bucket_id, req, content_sha256).await
-		}
-		Endpoint::DeleteBucketWebsite {} => handle_delete_website(garage, bucket_id).await,
-		Endpoint::GetBucketCors {} => handle_get_cors(&bucket).await,
-		Endpoint::PutBucketCors {} => handle_put_cors(garage, bucket_id, req, content_sha256).await,
-		Endpoint::DeleteBucketCors {} => handle_delete_cors(garage, bucket_id).await,
-		endpoint => Err(Error::NotImplemented(endpoint.name().to_owned())),
-	};
-
-	// If request was a success and we have a CORS rule that applies to it,
-	// add the corresponding CORS headers to the response
-	let mut resp_ok = resp?;
-	if let Some(rule) = matching_cors_rule {
-		add_cors_headers(&mut resp_ok, rule)
-			.ok_or_internal_error("Invalid bucket CORS configuration")?;
-	}
-
-	Ok(resp_ok)
-}
-
-async fn handle_request_without_bucket(
-	garage: Arc<Garage>,
-	_req: Request<Body>,
-	api_key: Key,
-	endpoint: Endpoint,
-) -> Result<Response<Body>, Error> {
-	match endpoint {
-		Endpoint::ListBuckets => handle_list_buckets(&garage, &api_key).await,
-		endpoint => Err(Error::NotImplemented(endpoint.name().to_owned())),
-	}
-}
-
-#[allow(clippy::ptr_arg)]
-pub async fn resolve_bucket(
-	garage: &Garage,
-	bucket_name: &String,
-	api_key: &Key,
-) -> Result<Uuid, Error> {
-	let api_key_params = api_key
-		.state
-		.as_option()
-		.ok_or_internal_error("Key should not be deleted at this point")?;
-
-	if let Some(Some(bucket_id)) = api_key_params.local_aliases.get(bucket_name) {
-		Ok(*bucket_id)
-	} else {
-		Ok(garage
-			.bucket_helper()
-			.resolve_global_bucket_name(bucket_name)
-			.await?
-			.ok_or(Error::NoSuchBucket)?)
-	}
-}
-
-/// Extract the bucket name and the key name from an HTTP path and possibly a bucket provided in
-/// the host header of the request
-///
-/// S3 internally manages only buckets and keys. This function splits
-/// an HTTP path to get the corresponding bucket name and key.
-pub fn parse_bucket_key<'a>(
-	path: &'a str,
-	host_bucket: Option<&'a str>,
-) -> Result<(&'a str, Option<&'a str>), Error> {
-	let path = path.trim_start_matches('/');
-
-	if let Some(bucket) = host_bucket {
-		if !path.is_empty() {
-			return Ok((bucket, Some(path)));
-		} else {
-			return Ok((bucket, None));
-		}
-	}
-
-	let (bucket, key) = match path.find('/') {
-		Some(i) => {
-			let key = &path[i + 1..];
-			if !key.is_empty() {
-				(&path[..i], Some(key))
-			} else {
-				(&path[..i], None)
-			}
-		}
-		None => (path, None),
-	};
-	if bucket.is_empty() {
-		return Err(Error::BadRequest("No bucket specified".to_string()));
-	}
-	Ok((bucket, key))
-}
-
-#[cfg(test)]
-mod tests {
-	use super::*;
-
-	#[test]
-	fn parse_bucket_containing_a_key() -> Result<(), Error> {
-		let (bucket, key) = parse_bucket_key("/my_bucket/a/super/file.jpg", None)?;
-		assert_eq!(bucket, "my_bucket");
-		assert_eq!(key.expect("key must be set"), "a/super/file.jpg");
-		Ok(())
-	}
-
-	#[test]
-	fn parse_bucket_containing_no_key() -> Result<(), Error> {
-		let (bucket, key) = parse_bucket_key("/my_bucket/", None)?;
-		assert_eq!(bucket, "my_bucket");
-		assert!(key.is_none());
-		let (bucket, key) = parse_bucket_key("/my_bucket", None)?;
-		assert_eq!(bucket, "my_bucket");
-		assert!(key.is_none());
-		Ok(())
-	}
-
-	#[test]
-	fn parse_bucket_containing_no_bucket() {
-		let parsed = parse_bucket_key("", None);
-		assert!(parsed.is_err());
-		let parsed = parse_bucket_key("/", None);
-		assert!(parsed.is_err());
-		let parsed = parse_bucket_key("////", None);
-		assert!(parsed.is_err());
-	}
-
-	#[test]
-	fn parse_bucket_with_vhost_and_key() -> Result<(), Error> {
-		let (bucket, key) = parse_bucket_key("/a/super/file.jpg", Some("my-bucket"))?;
-		assert_eq!(bucket, "my-bucket");
-		assert_eq!(key.expect("key must be set"), "a/super/file.jpg");
-		Ok(())
-	}
-
-	#[test]
-	fn parse_bucket_with_vhost_no_key() -> Result<(), Error> {
-		let (bucket, key) = parse_bucket_key("", Some("my-bucket"))?;
-		assert_eq!(bucket, "my-bucket");
-		assert!(key.is_none());
-		let (bucket, key) = parse_bucket_key("/", Some("my-bucket"))?;
-		assert_eq!(bucket, "my-bucket");
-		assert!(key.is_none());
-		Ok(())
-	}
-}
diff --git a/src/api/common_error.rs b/src/api/common_error.rs
new file mode 100644
index 00000000..20f9f266
--- /dev/null
+++ b/src/api/common_error.rs
@@ -0,0 +1,177 @@
+use err_derive::Error;
+use hyper::StatusCode;
+
+use garage_util::error::Error as GarageError;
+
+/// Errors of this crate
+#[derive(Debug, Error)]
+pub enum CommonError {
+	// ---- INTERNAL ERRORS ----
+	/// Error related to deeper parts of Garage
+	#[error(display = "Internal error: {}", _0)]
+	InternalError(#[error(source)] GarageError),
+
+	/// Error related to Hyper
+	#[error(display = "Internal error (Hyper error): {}", _0)]
+	Hyper(#[error(source)] hyper::Error),
+
+	/// Error related to HTTP
+	#[error(display = "Internal error (HTTP error): {}", _0)]
+	Http(#[error(source)] http::Error),
+
+	// ---- GENERIC CLIENT ERRORS ----
+	/// Proper authentication was not provided
+	#[error(display = "Forbidden: {}", _0)]
+	Forbidden(String),
+
+	/// Generic bad request response with custom message
+	#[error(display = "Bad request: {}", _0)]
+	BadRequest(String),
+
+	// ---- SPECIFIC ERROR CONDITIONS ----
+	// These have to be error codes referenced in the S3 spec here:
+	// https://docs.aws.amazon.com/AmazonS3/latest/API/ErrorResponses.html#ErrorCodeList
+	/// The requested bucket does not exist
+	#[error(display = "Bucket not found: {}", _0)]
+	NoSuchBucket(String),
+
+	/// Tried to create a bucket that already exists
+	#[error(display = "Bucket already exists")]
+	BucketAlreadyExists,
+
+	/// Tried to delete a non-empty bucket
+	#[error(display = "Tried to delete a non-empty bucket")]
+	BucketNotEmpty,
+
+	// Category: bad request
+	/// Bucket name is not valid according to AWS S3 specs
+	#[error(display = "Invalid bucket name: {}", _0)]
+	InvalidBucketName(String),
+}
+
+impl CommonError {
+	pub fn http_status_code(&self) -> StatusCode {
+		match self {
+			CommonError::InternalError(
+				GarageError::Timeout
+				| GarageError::RemoteError(_)
+				| GarageError::Quorum(_, _, _, _),
+			) => StatusCode::SERVICE_UNAVAILABLE,
+			CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => {
+				StatusCode::INTERNAL_SERVER_ERROR
+			}
+			CommonError::BadRequest(_) => StatusCode::BAD_REQUEST,
+			CommonError::Forbidden(_) => StatusCode::FORBIDDEN,
+			CommonError::NoSuchBucket(_) => StatusCode::NOT_FOUND,
+			CommonError::BucketNotEmpty | CommonError::BucketAlreadyExists => StatusCode::CONFLICT,
+			CommonError::InvalidBucketName(_) => StatusCode::BAD_REQUEST,
+		}
+	}
+
+	pub fn aws_code(&self) -> &'static str {
+		match self {
+			CommonError::Forbidden(_) => "AccessDenied",
+			CommonError::InternalError(
+				GarageError::Timeout
+				| GarageError::RemoteError(_)
+				| GarageError::Quorum(_, _, _, _),
+			) => "ServiceUnavailable",
+			CommonError::InternalError(_) | CommonError::Hyper(_) | CommonError::Http(_) => {
+				"InternalError"
+			}
+			CommonError::BadRequest(_) => "InvalidRequest",
+			CommonError::NoSuchBucket(_) => "NoSuchBucket",
+			CommonError::BucketAlreadyExists => "BucketAlreadyExists",
+			CommonError::BucketNotEmpty => "BucketNotEmpty",
+			CommonError::InvalidBucketName(_) => "InvalidBucketName",
+		}
+	}
+
+	pub fn bad_request<M: ToString>(msg: M) -> Self {
+		CommonError::BadRequest(msg.to_string())
+	}
+}
+
+pub trait CommonErrorDerivative: From<CommonError> {
+	fn internal_error<M: ToString>(msg: M) -> Self {
+		Self::from(CommonError::InternalError(GarageError::Message(
+			msg.to_string(),
+		)))
+	}
+
+	fn bad_request<M: ToString>(msg: M) -> Self {
+		Self::from(CommonError::BadRequest(msg.to_string()))
+	}
+
+	fn forbidden<M: ToString>(msg: M) -> Self {
+		Self::from(CommonError::Forbidden(msg.to_string()))
+	}
+}
+
+/// Trait to map error to the Bad Request error code
+pub trait OkOrBadRequest {
+	type S;
+	fn ok_or_bad_request<M: AsRef<str>>(self, reason: M) -> Result<Self::S, CommonError>;
+}
+
+impl<T, E> OkOrBadRequest for Result<T, E>
+where
+	E: std::fmt::Display,
+{
+	type S = T;
+	fn ok_or_bad_request<M: AsRef<str>>(self, reason: M) -> Result<T, CommonError> {
+		match self {
+			Ok(x) => Ok(x),
+			Err(e) => Err(CommonError::BadRequest(format!(
+				"{}: {}",
+				reason.as_ref(),
+				e
+			))),
+		}
+	}
+}
+
+impl<T> OkOrBadRequest for Option<T> {
+	type S = T;
+	fn ok_or_bad_request<M: AsRef<str>>(self, reason: M) -> Result<T, CommonError> {
+		match self {
+			Some(x) => Ok(x),
+			None => Err(CommonError::BadRequest(reason.as_ref().to_string())),
+		}
+	}
+}
+
+/// Trait to map an error to an Internal Error code
+pub trait OkOrInternalError {
+	type S;
+	fn ok_or_internal_error<M: AsRef<str>>(self, reason: M) -> Result<Self::S, CommonError>;
+}
+
+impl<T, E> OkOrInternalError for Result<T, E>
+where
+	E: std::fmt::Display,
+{
+	type S = T;
+	fn ok_or_internal_error<M: AsRef<str>>(self, reason: M) -> Result<T, CommonError> {
+		match self {
+			Ok(x) => Ok(x),
+			Err(e) => Err(CommonError::InternalError(GarageError::Message(format!(
+				"{}: {}",
+				reason.as_ref(),
+				e
+			)))),
+		}
+	}
+}
+
+impl<T> OkOrInternalError for Option<T> {
+	type S = T;
+	fn ok_or_internal_error<M: AsRef<str>>(self, reason: M) -> Result<T, CommonError> {
+		match self {
+			Some(x) => Ok(x),
+			None => Err(CommonError::InternalError(GarageError::Message(
+				reason.as_ref().to_string(),
+			))),
+		}
+	}
+}
diff --git a/src/api/generic_server.rs b/src/api/generic_server.rs
new file mode 100644
index 00000000..62fe4e5a
--- /dev/null
+++ b/src/api/generic_server.rs
@@ -0,0 +1,211 @@
+use std::net::SocketAddr;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+
+use futures::future::Future;
+
+use hyper::header::HeaderValue;
+use hyper::server::conn::AddrStream;
+use hyper::service::{make_service_fn, service_fn};
+use hyper::{Body, Request, Response, Server};
+use hyper::{HeaderMap, StatusCode};
+
+use opentelemetry::{
+	global,
+	metrics::{Counter, ValueRecorder},
+	trace::{FutureExt, SpanRef, TraceContextExt, Tracer},
+	Context, KeyValue,
+};
+
+use garage_util::error::Error as GarageError;
+use garage_util::metrics::{gen_trace_id, RecordDuration};
+
+pub(crate) trait ApiEndpoint: Send + Sync + 'static {
+	fn name(&self) -> &'static str;
+	fn add_span_attributes(&self, span: SpanRef<'_>);
+}
+
+pub trait ApiError: std::error::Error + Send + Sync + 'static {
+	fn http_status_code(&self) -> StatusCode;
+	fn add_http_headers(&self, header_map: &mut HeaderMap<HeaderValue>);
+	fn http_body(&self, garage_region: &str, path: &str) -> Body;
+}
+
+#[async_trait]
+pub(crate) trait ApiHandler: Send + Sync + 'static {
+	const API_NAME: &'static str;
+	const API_NAME_DISPLAY: &'static str;
+
+	type Endpoint: ApiEndpoint;
+	type Error: ApiError;
+
+	fn parse_endpoint(&self, r: &Request<Body>) -> Result<Self::Endpoint, Self::Error>;
+	async fn handle(
+		&self,
+		req: Request<Body>,
+		endpoint: Self::Endpoint,
+	) -> Result<Response<Body>, Self::Error>;
+}
+
+pub(crate) struct ApiServer<A: ApiHandler> {
+	region: String,
+	api_handler: A,
+
+	// Metrics
+	request_counter: Counter<u64>,
+	error_counter: Counter<u64>,
+	request_duration: ValueRecorder<f64>,
+}
+
+impl<A: ApiHandler> ApiServer<A> {
+	pub fn new(region: String, api_handler: A) -> Arc<Self> {
+		let meter = global::meter("garage/api");
+		Arc::new(Self {
+			region,
+			api_handler,
+			request_counter: meter
+				.u64_counter(format!("api.{}.request_counter", A::API_NAME))
+				.with_description(format!(
+					"Number of API calls to the various {} API endpoints",
+					A::API_NAME_DISPLAY
+				))
+				.init(),
+			error_counter: meter
+				.u64_counter(format!("api.{}.error_counter", A::API_NAME))
+				.with_description(format!(
+					"Number of API calls to the various {} API endpoints that resulted in errors",
+					A::API_NAME_DISPLAY
+				))
+				.init(),
+			request_duration: meter
+				.f64_value_recorder(format!("api.{}.request_duration", A::API_NAME))
+				.with_description(format!(
+					"Duration of API calls to the various {} API endpoints",
+					A::API_NAME_DISPLAY
+				))
+				.init(),
+		})
+	}
+
+	pub async fn run_server(
+		self: Arc<Self>,
+		bind_addr: SocketAddr,
+		shutdown_signal: impl Future<Output = ()>,
+	) -> Result<(), GarageError> {
+		let service = make_service_fn(|conn: &AddrStream| {
+			let this = self.clone();
+
+			let client_addr = conn.remote_addr();
+			async move {
+				Ok::<_, GarageError>(service_fn(move |req: Request<Body>| {
+					let this = this.clone();
+
+					this.handler(req, client_addr)
+				}))
+			}
+		});
+
+		let server = Server::bind(&bind_addr).serve(service);
+
+		let graceful = server.with_graceful_shutdown(shutdown_signal);
+		info!(
+			"{} API server listening on http://{}",
+			A::API_NAME_DISPLAY,
+			bind_addr
+		);
+
+		graceful.await?;
+		Ok(())
+	}
+
+	async fn handler(
+		self: Arc<Self>,
+		req: Request<Body>,
+		addr: SocketAddr,
+	) -> Result<Response<Body>, GarageError> {
+		let uri = req.uri().clone();
+		info!("{} {} {}", addr, req.method(), uri);
+		debug!("{:?}", req);
+
+		let tracer = opentelemetry::global::tracer("garage");
+		let span = tracer
+			.span_builder(format!("{} API call (unknown)", A::API_NAME_DISPLAY))
+			.with_trace_id(gen_trace_id())
+			.with_attributes(vec![
+				KeyValue::new("method", format!("{}", req.method())),
+				KeyValue::new("uri", req.uri().to_string()),
+			])
+			.start(&tracer);
+
+		let res = self
+			.handler_stage2(req)
+			.with_context(Context::current_with_span(span))
+			.await;
+
+		match res {
+			Ok(x) => {
+				debug!("{} {:?}", x.status(), x.headers());
+				Ok(x)
+			}
+			Err(e) => {
+				let body: Body = e.http_body(&self.region, uri.path());
+				let mut http_error_builder = Response::builder().status(e.http_status_code());
+
+				if let Some(header_map) = http_error_builder.headers_mut() {
+					e.add_http_headers(header_map)
+				}
+
+				let http_error = http_error_builder.body(body)?;
+
+				if e.http_status_code().is_server_error() {
+					warn!("Response: error {}, {}", e.http_status_code(), e);
+				} else {
+					info!("Response: error {}, {}", e.http_status_code(), e);
+				}
+				Ok(http_error)
+			}
+		}
+	}
+
+	async fn handler_stage2(&self, req: Request<Body>) -> Result<Response<Body>, A::Error> {
+		let endpoint = self.api_handler.parse_endpoint(&req)?;
+		debug!("Endpoint: {}", endpoint.name());
+
+		let current_context = Context::current();
+		let current_span = current_context.span();
+		current_span.update_name::<String>(format!(
+			"{} API {}",
+			A::API_NAME_DISPLAY,
+			endpoint.name()
+		));
+		current_span.set_attribute(KeyValue::new("endpoint", endpoint.name()));
+		endpoint.add_span_attributes(current_span);
+
+		let metrics_tags = &[KeyValue::new("api_endpoint", endpoint.name())];
+
+		let res = self
+			.api_handler
+			.handle(req, endpoint)
+			.record_duration(&self.request_duration, &metrics_tags[..])
+			.await;
+
+		self.request_counter.add(1, &metrics_tags[..]);
+
+		let status_code = match &res {
+			Ok(r) => r.status(),
+			Err(e) => e.http_status_code(),
+		};
+		if status_code.is_client_error() || status_code.is_server_error() {
+			self.error_counter.add(
+				1,
+				&[
+					metrics_tags[0].clone(),
+					KeyValue::new("status_code", status_code.as_str().to_string()),
+				],
+			);
+		}
+
+		res
+	}
+}
diff --git a/src/api/helpers.rs b/src/api/helpers.rs
index c2709bb3..642dbc42 100644
--- a/src/api/helpers.rs
+++ b/src/api/helpers.rs
@@ -1,5 +1,21 @@
-use crate::Error;
+use hyper::{Body, Request, Response};
 use idna::domain_to_unicode;
+use serde::{Deserialize, Serialize};
+
+use crate::common_error::{CommonError as Error, *};
+
+/// What kind of authorization is required to perform a given action
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Authorization {
+	/// No authorization is required
+	None,
+	/// Having Read permission on bucket
+	Read,
+	/// Having Write permission on bucket
+	Write,
+	/// Having Owner permission on bucket
+	Owner,
+}
 
 /// Host to bucket
 ///
@@ -31,7 +47,7 @@ pub fn authority_to_host(authority: &str) -> Result<String, Error> {
 	let mut iter = authority.chars().enumerate();
 	let (_, first_char) = iter
 		.next()
-		.ok_or_else(|| Error::BadRequest("Authority is empty".to_string()))?;
+		.ok_or_else(|| Error::bad_request("Authority is empty".to_string()))?;
 
 	let split = match first_char {
 		'[' => {
@@ -39,7 +55,7 @@ pub fn authority_to_host(authority: &str) -> Result<String, Error> {
 			match iter.next() {
 				Some((_, ']')) => iter.next(),
 				_ => {
-					return Err(Error::BadRequest(format!(
+					return Err(Error::bad_request(format!(
 						"Authority {} has an illegal format",
 						authority
 					)))
@@ -52,7 +68,7 @@ pub fn authority_to_host(authority: &str) -> Result<String, Error> {
 	let authority = match split {
 		Some((i, ':')) => Ok(&authority[..i]),
 		None => Ok(authority),
-		Some((_, _)) => Err(Error::BadRequest(format!(
+		Some((_, _)) => Err(Error::bad_request(format!(
 			"Authority {} has an illegal format",
 			authority
 		))),
@@ -60,11 +76,135 @@ pub fn authority_to_host(authority: &str) -> Result<String, Error> {
 	authority.map(|h| domain_to_unicode(h).0)
 }
 
+/// Extract the bucket name and the key name from an HTTP path and possibly a bucket provided in
+/// the host header of the request
+///
+/// S3 internally manages only buckets and keys. This function splits
+/// an HTTP path to get the corresponding bucket name and key.
+pub fn parse_bucket_key<'a>(
+	path: &'a str,
+	host_bucket: Option<&'a str>,
+) -> Result<(&'a str, Option<&'a str>), Error> {
+	let path = path.trim_start_matches('/');
+
+	if let Some(bucket) = host_bucket {
+		if !path.is_empty() {
+			return Ok((bucket, Some(path)));
+		} else {
+			return Ok((bucket, None));
+		}
+	}
+
+	let (bucket, key) = match path.find('/') {
+		Some(i) => {
+			let key = &path[i + 1..];
+			if !key.is_empty() {
+				(&path[..i], Some(key))
+			} else {
+				(&path[..i], None)
+			}
+		}
+		None => (path, None),
+	};
+	if bucket.is_empty() {
+		return Err(Error::bad_request("No bucket specified"));
+	}
+	Ok((bucket, key))
+}
+
+const UTF8_BEFORE_LAST_CHAR: char = '\u{10FFFE}';
+
+/// Compute the key after the prefix
+pub fn key_after_prefix(pfx: &str) -> Option<String> {
+	let mut next = pfx.to_string();
+	while !next.is_empty() {
+		let tail = next.pop().unwrap();
+		if tail >= char::MAX {
+			continue;
+		}
+
+		// Circumvent a limitation of RangeFrom, which overflows earlier than needed
+		// See: https://doc.rust-lang.org/core/ops/struct.RangeFrom.html
+		let new_tail = if tail == UTF8_BEFORE_LAST_CHAR {
+			char::MAX
+		} else {
+			(tail..).nth(1).unwrap()
+		};
+
+		next.push(new_tail);
+		return Some(next);
+	}
+
+	None
+}
+
+pub async fn parse_json_body<T: for<'de> Deserialize<'de>>(req: Request<Body>) -> Result<T, Error> {
+	let body = hyper::body::to_bytes(req.into_body()).await?;
+	let resp: T = serde_json::from_slice(&body).ok_or_bad_request("Invalid JSON")?;
+	Ok(resp)
+}
+
+pub fn json_ok_response<T: Serialize>(res: &T) -> Result<Response<Body>, Error> {
+	let resp_json = serde_json::to_string_pretty(res).map_err(garage_util::error::Error::from)?;
+	Ok(Response::builder()
+		.status(hyper::StatusCode::OK)
+		.header(http::header::CONTENT_TYPE, "application/json")
+		.body(Body::from(resp_json))?)
+}
+
 #[cfg(test)]
 mod tests {
 	use super::*;
 
 	#[test]
+	fn parse_bucket_containing_a_key() -> Result<(), Error> {
+		let (bucket, key) = parse_bucket_key("/my_bucket/a/super/file.jpg", None)?;
+		assert_eq!(bucket, "my_bucket");
+		assert_eq!(key.expect("key must be set"), "a/super/file.jpg");
+		Ok(())
+	}
+
+	#[test]
+	fn parse_bucket_containing_no_key() -> Result<(), Error> {
+		let (bucket, key) = parse_bucket_key("/my_bucket/", None)?;
+		assert_eq!(bucket, "my_bucket");
+		assert!(key.is_none());
+		let (bucket, key) = parse_bucket_key("/my_bucket", None)?;
+		assert_eq!(bucket, "my_bucket");
+		assert!(key.is_none());
+		Ok(())
+	}
+
+	#[test]
+	fn parse_bucket_containing_no_bucket() {
+		let parsed = parse_bucket_key("", None);
+		assert!(parsed.is_err());
+		let parsed = parse_bucket_key("/", None);
+		assert!(parsed.is_err());
+		let parsed = parse_bucket_key("////", None);
+		assert!(parsed.is_err());
+	}
+
+	#[test]
+	fn parse_bucket_with_vhost_and_key() -> Result<(), Error> {
+		let (bucket, key) = parse_bucket_key("/a/super/file.jpg", Some("my-bucket"))?;
+		assert_eq!(bucket, "my-bucket");
+		assert_eq!(key.expect("key must be set"), "a/super/file.jpg");
+		Ok(())
+	}
+
+	#[test]
+	fn parse_bucket_with_vhost_no_key() -> Result<(), Error> {
+		let (bucket, key) = parse_bucket_key("", Some("my-bucket"))?;
+		assert_eq!(bucket, "my-bucket");
+		assert!(key.is_none());
+		let (bucket, key) = parse_bucket_key("/", Some("my-bucket"))?;
+		assert_eq!(bucket, "my-bucket");
+		assert!(key.is_none());
+		Ok(())
+	}
+
+	#[test]
 	fn authority_to_host_with_port() -> Result<(), Error> {
 		let domain = authority_to_host("[::1]:3902")?;
 		assert_eq!(domain, "[::1]");
@@ -111,4 +251,47 @@ mod tests {
 		assert_eq!(host_to_bucket("not-garage.tld", "garage.tld"), None);
 		assert_eq!(host_to_bucket("not-garage.tld", ".garage.tld"), None);
 	}
+
+	#[test]
+	fn test_key_after_prefix() {
+		use std::iter::FromIterator;
+
+		assert_eq!(UTF8_BEFORE_LAST_CHAR as u32, (char::MAX as u32) - 1);
+		assert_eq!(key_after_prefix("a/b/").unwrap().as_str(), "a/b0");
+		assert_eq!(key_after_prefix("€").unwrap().as_str(), "₭");
+		assert_eq!(
+			key_after_prefix("􏿽").unwrap().as_str(),
+			String::from(char::from_u32(0x10FFFE).unwrap())
+		);
+
+		// When the last character is the biggest UTF8 char
+		let a = String::from_iter(['a', char::MAX].iter());
+		assert_eq!(key_after_prefix(a.as_str()).unwrap().as_str(), "b");
+
+		// When all characters are the biggest UTF8 char
+		let b = String::from_iter([char::MAX; 3].iter());
+		assert!(key_after_prefix(b.as_str()).is_none());
+
+		// Check utf8 surrogates
+		let c = String::from('\u{D7FF}');
+		assert_eq!(
+			key_after_prefix(c.as_str()).unwrap().as_str(),
+			String::from('\u{E000}')
+		);
+
+		// Check the character before the biggest one
+		let d = String::from('\u{10FFFE}');
+		assert_eq!(
+			key_after_prefix(d.as_str()).unwrap().as_str(),
+			String::from(char::MAX)
+		);
+	}
+}
+
+#[derive(Serialize)]
+pub(crate) struct CustomApiErrorBody {
+	pub(crate) code: String,
+	pub(crate) message: String,
+	pub(crate) region: String,
+	pub(crate) path: String,
 }
diff --git a/src/api/k2v/api_server.rs b/src/api/k2v/api_server.rs
new file mode 100644
index 00000000..084867b5
--- /dev/null
+++ b/src/api/k2v/api_server.rs
@@ -0,0 +1,190 @@
+use std::net::SocketAddr;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+
+use futures::future::Future;
+use hyper::{Body, Method, Request, Response};
+
+use opentelemetry::{trace::SpanRef, KeyValue};
+
+use garage_util::error::Error as GarageError;
+
+use garage_model::garage::Garage;
+
+use crate::generic_server::*;
+use crate::k2v::error::*;
+
+use crate::signature::payload::check_payload_signature;
+use crate::signature::streaming::*;
+
+use crate::helpers::*;
+use crate::k2v::batch::*;
+use crate::k2v::index::*;
+use crate::k2v::item::*;
+use crate::k2v::router::Endpoint;
+use crate::s3::cors::*;
+
+pub struct K2VApiServer {
+	garage: Arc<Garage>,
+}
+
+pub(crate) struct K2VApiEndpoint {
+	bucket_name: String,
+	endpoint: Endpoint,
+}
+
+impl K2VApiServer {
+	pub async fn run(
+		garage: Arc<Garage>,
+		bind_addr: SocketAddr,
+		s3_region: String,
+		shutdown_signal: impl Future<Output = ()>,
+	) -> Result<(), GarageError> {
+		ApiServer::new(s3_region, K2VApiServer { garage })
+			.run_server(bind_addr, shutdown_signal)
+			.await
+	}
+}
+
+#[async_trait]
+impl ApiHandler for K2VApiServer {
+	const API_NAME: &'static str = "k2v";
+	const API_NAME_DISPLAY: &'static str = "K2V";
+
+	type Endpoint = K2VApiEndpoint;
+	type Error = Error;
+
+	fn parse_endpoint(&self, req: &Request<Body>) -> Result<K2VApiEndpoint, Error> {
+		let (endpoint, bucket_name) = Endpoint::from_request(req)?;
+
+		Ok(K2VApiEndpoint {
+			bucket_name,
+			endpoint,
+		})
+	}
+
+	async fn handle(
+		&self,
+		req: Request<Body>,
+		endpoint: K2VApiEndpoint,
+	) -> Result<Response<Body>, Error> {
+		let K2VApiEndpoint {
+			bucket_name,
+			endpoint,
+		} = endpoint;
+		let garage = self.garage.clone();
+
+		// The OPTIONS method is processed early, before we even check for an API key
+		if let Endpoint::Options = endpoint {
+			return Ok(handle_options_s3api(garage, &req, Some(bucket_name))
+				.await
+				.ok_or_bad_request("Error handling OPTIONS")?);
+		}
+
+		let (api_key, mut content_sha256) = check_payload_signature(&garage, "k2v", &req).await?;
+		let api_key = api_key
+			.ok_or_else(|| Error::forbidden("Garage does not support anonymous access yet"))?;
+
+		let req = parse_streaming_body(
+			&api_key,
+			req,
+			&mut content_sha256,
+			&garage.config.s3_api.s3_region,
+			"k2v",
+		)?;
+
+		let bucket_id = garage
+			.bucket_helper()
+			.resolve_bucket(&bucket_name, &api_key)
+			.await?;
+		let bucket = garage
+			.bucket_helper()
+			.get_existing_bucket(bucket_id)
+			.await?;
+
+		let allowed = match endpoint.authorization_type() {
+			Authorization::Read => api_key.allow_read(&bucket_id),
+			Authorization::Write => api_key.allow_write(&bucket_id),
+			Authorization::Owner => api_key.allow_owner(&bucket_id),
+			_ => unreachable!(),
+		};
+
+		if !allowed {
+			return Err(Error::forbidden("Operation is not allowed for this key."));
+		}
+
+		// Look up what CORS rule might apply to response.
+		// Requests for methods other than GET, HEAD or POST
+		// are always preflighted, i.e. the browser should first
+		// make an OPTIONS call to check that the request is allowed
+		let matching_cors_rule = match *req.method() {
+			Method::GET | Method::HEAD | Method::POST => find_matching_cors_rule(&bucket, &req)
+				.ok_or_internal_error("Error looking up CORS rule")?,
+			_ => None,
+		};
+
+		let resp = match endpoint {
+			Endpoint::DeleteItem {
+				partition_key,
+				sort_key,
+			} => handle_delete_item(garage, req, bucket_id, &partition_key, &sort_key).await,
+			Endpoint::InsertItem {
+				partition_key,
+				sort_key,
+			} => handle_insert_item(garage, req, bucket_id, &partition_key, &sort_key).await,
+			Endpoint::ReadItem {
+				partition_key,
+				sort_key,
+			} => handle_read_item(garage, &req, bucket_id, &partition_key, &sort_key).await,
+			Endpoint::PollItem {
+				partition_key,
+				sort_key,
+				causality_token,
+				timeout,
+			} => {
+				handle_poll_item(
+					garage,
+					&req,
+					bucket_id,
+					partition_key,
+					sort_key,
+					causality_token,
+					timeout,
+				)
+				.await
+			}
+			Endpoint::ReadIndex {
+				prefix,
+				start,
+				end,
+				limit,
+				reverse,
+			} => handle_read_index(garage, bucket_id, prefix, start, end, limit, reverse).await,
+			Endpoint::InsertBatch {} => handle_insert_batch(garage, bucket_id, req).await,
+			Endpoint::ReadBatch {} => handle_read_batch(garage, bucket_id, req).await,
+			Endpoint::DeleteBatch {} => handle_delete_batch(garage, bucket_id, req).await,
+			Endpoint::Options => unreachable!(),
+		};
+
+		// If request was a success and we have a CORS rule that applies to it,
+		// add the corresponding CORS headers to the response
+		let mut resp_ok = resp?;
+		if let Some(rule) = matching_cors_rule {
+			add_cors_headers(&mut resp_ok, rule)
+				.ok_or_internal_error("Invalid bucket CORS configuration")?;
+		}
+
+		Ok(resp_ok)
+	}
+}
+
+impl ApiEndpoint for K2VApiEndpoint {
+	fn name(&self) -> &'static str {
+		self.endpoint.name()
+	}
+
+	fn add_span_attributes(&self, span: SpanRef<'_>) {
+		span.set_attribute(KeyValue::new("bucket", self.bucket_name.clone()));
+	}
+}
diff --git a/src/api/k2v/batch.rs b/src/api/k2v/batch.rs
new file mode 100644
index 00000000..db9901cf
--- /dev/null
+++ b/src/api/k2v/batch.rs
@@ -0,0 +1,363 @@
+use std::sync::Arc;
+
+use hyper::{Body, Request, Response, StatusCode};
+use serde::{Deserialize, Serialize};
+
+use garage_util::data::*;
+use garage_util::error::Error as GarageError;
+
+use garage_table::{EnumerationOrder, TableSchema};
+
+use garage_model::garage::Garage;
+use garage_model::k2v::causality::*;
+use garage_model::k2v::item_table::*;
+
+use crate::helpers::*;
+use crate::k2v::error::*;
+use crate::k2v::range::read_range;
+
+pub async fn handle_insert_batch(
+	garage: Arc<Garage>,
+	bucket_id: Uuid,
+	req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+	let items = parse_json_body::<Vec<InsertBatchItem>>(req).await?;
+
+	let mut items2 = vec![];
+	for it in items {
+		let ct = it
+			.ct
+			.map(|s| CausalContext::parse(&s))
+			.transpose()
+			.ok_or_bad_request("Invalid causality token")?;
+		let v = match it.v {
+			Some(vs) => {
+				DvvsValue::Value(base64::decode(vs).ok_or_bad_request("Invalid base64 value")?)
+			}
+			None => DvvsValue::Deleted,
+		};
+		items2.push((it.pk, it.sk, ct, v));
+	}
+
+	garage.k2v.rpc.insert_batch(bucket_id, items2).await?;
+
+	Ok(Response::builder()
+		.status(StatusCode::OK)
+		.body(Body::empty())?)
+}
+
+pub async fn handle_read_batch(
+	garage: Arc<Garage>,
+	bucket_id: Uuid,
+	req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+	let queries = parse_json_body::<Vec<ReadBatchQuery>>(req).await?;
+
+	let resp_results = futures::future::join_all(
+		queries
+			.into_iter()
+			.map(|q| handle_read_batch_query(&garage, bucket_id, q)),
+	)
+	.await;
+
+	let mut resps: Vec<ReadBatchResponse> = vec![];
+	for resp in resp_results {
+		resps.push(resp?);
+	}
+
+	let resp_json = serde_json::to_string_pretty(&resps).map_err(GarageError::from)?;
+	Ok(Response::builder()
+		.status(StatusCode::OK)
+		.body(Body::from(resp_json))?)
+}
+
+async fn handle_read_batch_query(
+	garage: &Arc<Garage>,
+	bucket_id: Uuid,
+	query: ReadBatchQuery,
+) -> Result<ReadBatchResponse, Error> {
+	let partition = K2VItemPartition {
+		bucket_id,
+		partition_key: query.partition_key.clone(),
+	};
+
+	let filter = ItemFilter {
+		exclude_only_tombstones: !query.tombstones,
+		conflicts_only: query.conflicts_only,
+	};
+
+	let (items, more, next_start) = if query.single_item {
+		if query.prefix.is_some() || query.end.is_some() || query.limit.is_some() || query.reverse {
+			return Err(Error::bad_request("Batch query parameters 'prefix', 'end', 'limit' and 'reverse' must not be set when singleItem is true."));
+		}
+		let sk = query
+			.start
+			.as_ref()
+			.ok_or_bad_request("start should be specified if single_item is set")?;
+		let item = garage
+			.k2v
+			.item_table
+			.get(&partition, sk)
+			.await?
+			.filter(|e| K2VItemTable::matches_filter(e, &filter));
+		match item {
+			Some(i) => (vec![ReadBatchResponseItem::from(i)], false, None),
+			None => (vec![], false, None),
+		}
+	} else {
+		let (items, more, next_start) = read_range(
+			&garage.k2v.item_table,
+			&partition,
+			&query.prefix,
+			&query.start,
+			&query.end,
+			query.limit,
+			Some(filter),
+			EnumerationOrder::from_reverse(query.reverse),
+		)
+		.await?;
+
+		let items = items
+			.into_iter()
+			.map(ReadBatchResponseItem::from)
+			.collect::<Vec<_>>();
+
+		(items, more, next_start)
+	};
+
+	Ok(ReadBatchResponse {
+		partition_key: query.partition_key,
+		prefix: query.prefix,
+		start: query.start,
+		end: query.end,
+		limit: query.limit,
+		reverse: query.reverse,
+		single_item: query.single_item,
+		conflicts_only: query.conflicts_only,
+		tombstones: query.tombstones,
+		items,
+		more,
+		next_start,
+	})
+}
+
+pub async fn handle_delete_batch(
+	garage: Arc<Garage>,
+	bucket_id: Uuid,
+	req: Request<Body>,
+) -> Result<Response<Body>, Error> {
+	let queries = parse_json_body::<Vec<DeleteBatchQuery>>(req).await?;
+
+	let resp_results = futures::future::join_all(
+		queries
+			.into_iter()
+			.map(|q| handle_delete_batch_query(&garage, bucket_id, q)),
+	)
+	.await;
+
+	let mut resps: Vec<DeleteBatchResponse> = vec![];
+	for resp in resp_results {
+		resps.push(resp?);
+	}
+
+	let resp_json = serde_json::to_string_pretty(&resps).map_err(GarageError::from)?;
+	Ok(Response::builder()
+		.status(StatusCode::OK)
+		.body(Body::from(resp_json))?)
+}
+
+async fn handle_delete_batch_query(
+	garage: &Arc<Garage>,
+	bucket_id: Uuid,
+	query: DeleteBatchQuery,
+) -> Result<DeleteBatchResponse, Error> {
+	let partition = K2VItemPartition {
+		bucket_id,
+		partition_key: query.partition_key.clone(),
+	};
+
+	let filter = ItemFilter {
+		exclude_only_tombstones: true,
+		conflicts_only: false,
+	};
+
+	let deleted_items = if query.single_item {
+		if query.prefix.is_some() || query.end.is_some() {
+			return Err(Error::bad_request("Batch query parameters 'prefix' and 'end' must not be set when singleItem is true."));
+		}
+		let sk = query
+			.start
+			.as_ref()
+			.ok_or_bad_request("start should be specified if single_item is set")?;
+		let item = garage
+			.k2v
+			.item_table
+			.get(&partition, sk)
+			.await?
+			.filter(|e| K2VItemTable::matches_filter(e, &filter));
+		match item {
+			Some(i) => {
+				let cc = i.causal_context();
+				garage
+					.k2v
+					.rpc
+					.insert(
+						bucket_id,
+						i.partition.partition_key,
+						i.sort_key,
+						Some(cc),
+						DvvsValue::Deleted,
+					)
+					.await?;
+				1
+			}
+			None => 0,
+		}
+	} else {
+		let (items, more, _next_start) = read_range(
+			&garage.k2v.item_table,
+			&partition,
+			&query.prefix,
+			&query.start,
+			&query.end,
+			None,
+			Some(filter),
+			EnumerationOrder::Forward,
+		)
+		.await?;
+		assert!(!more);
+
+		// Delete the items by inserting a tombstone (with its causal context) for each
+		let items = items
+			.into_iter()
+			.map(|i| {
+				let cc = i.causal_context();
+				(
+					i.partition.partition_key,
+					i.sort_key,
+					Some(cc),
+					DvvsValue::Deleted,
+				)
+			})
+			.collect::<Vec<_>>();
+		let n = items.len();
+
+		garage.k2v.rpc.insert_batch(bucket_id, items).await?;
+
+		n
+	};
+
+	Ok(DeleteBatchResponse {
+		partition_key: query.partition_key,
+		prefix: query.prefix,
+		start: query.start,
+		end: query.end,
+		single_item: query.single_item,
+		deleted_items,
+	})
+}
+
+#[derive(Deserialize)]
+struct InsertBatchItem {
+	pk: String,
+	sk: String,
+	ct: Option<String>,
+	v: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct ReadBatchQuery {
+	#[serde(rename = "partitionKey")]
+	partition_key: String,
+	#[serde(default)]
+	prefix: Option<String>,
+	#[serde(default)]
+	start: Option<String>,
+	#[serde(default)]
+	end: Option<String>,
+	#[serde(default)]
+	limit: Option<u64>,
+	#[serde(default)]
+	reverse: bool,
+	#[serde(default, rename = "singleItem")]
+	single_item: bool,
+	#[serde(default, rename = "conflictsOnly")]
+	conflicts_only: bool,
+	#[serde(default)]
+	tombstones: bool,
+}
+
+#[derive(Serialize)]
+struct ReadBatchResponse {
+	#[serde(rename = "partitionKey")]
+	partition_key: String,
+	prefix: Option<String>,
+	start: Option<String>,
+	end: Option<String>,
+	limit: Option<u64>,
+	reverse: bool,
+	#[serde(rename = "singleItem")]
+	single_item: bool,
+	#[serde(rename = "conflictsOnly")]
+	conflicts_only: bool,
+	tombstones: bool,
+
+	items: Vec<ReadBatchResponseItem>,
+	more: bool,
+	#[serde(rename = "nextStart")]
+	next_start: Option<String>,
+}
+
+#[derive(Serialize)]
+struct ReadBatchResponseItem {
+	sk: String,
+	ct: String,
+	v: Vec<Option<String>>,
+}
+
+impl ReadBatchResponseItem {
+	fn from(i: K2VItem) -> Self {
+		let ct = i.causal_context().serialize();
+		let v = i
+			.values()
+			.iter()
+			.map(|v| match v {
+				DvvsValue::Value(x) => Some(base64::encode(x)),
+				DvvsValue::Deleted => None,
+			})
+			.collect::<Vec<_>>();
+		Self {
+			sk: i.sort_key,
+			ct,
+			v,
+		}
+	}
+}
+
+#[derive(Deserialize)]
+struct DeleteBatchQuery {
+	#[serde(rename = "partitionKey")]
+	partition_key: String,
+	#[serde(default)]
+	prefix: Option<String>,
+	#[serde(default)]
+	start: Option<String>,
+	#[serde(default)]
+	end: Option<String>,
+	#[serde(default, rename = "singleItem")]
+	single_item: bool,
+}
+
+#[derive(Serialize)]
+struct DeleteBatchResponse {
+	#[serde(rename = "partitionKey")]
+	partition_key: String,
+	prefix: Option<String>,
+	start: Option<String>,
+	end: Option<String>,
+	#[serde(rename = "singleItem")]
+	single_item: bool,
+
+	#[serde(rename = "deletedItems")]
+	deleted_items: usize,
+}
diff --git a/src/api/k2v/error.rs b/src/api/k2v/error.rs
new file mode 100644
index 00000000..42491466
--- /dev/null
+++ b/src/api/k2v/error.rs
@@ -0,0 +1,135 @@
+use err_derive::Error;
+use hyper::header::HeaderValue;
+use hyper::{Body, HeaderMap, StatusCode};
+
+use garage_model::helper::error::Error as HelperError;
+
+use crate::common_error::CommonError;
+pub use crate::common_error::{CommonErrorDerivative, OkOrBadRequest, OkOrInternalError};
+use crate::generic_server::ApiError;
+use crate::helpers::CustomApiErrorBody;
+use crate::signature::error::Error as SignatureError;
+
+/// Errors of this crate
+#[derive(Debug, Error)]
+pub enum Error {
+	#[error(display = "{}", _0)]
+	/// Error from common error
+	Common(CommonError),
+
+	// Category: cannot process
+	/// Authorization Header Malformed
+	#[error(display = "Authorization header malformed, expected scope: {}", _0)]
+	AuthorizationHeaderMalformed(String),
+
+	/// The requested object does not exist
+	#[error(display = "Key not found")]
+	NoSuchKey,
+
+	/// Some base64 encoded data was badly encoded
+	#[error(display = "Invalid base64: {}", _0)]
+	InvalidBase64(#[error(source)] base64::DecodeError),
+
+	/// The client sent a header with invalid value
+	#[error(display = "Invalid header value: {}", _0)]
+	InvalidHeader(#[error(source)] hyper::header::ToStrError),
+
+	/// The client asked for an invalid return format (invalid Accept header)
+	#[error(display = "Not acceptable: {}", _0)]
+	NotAcceptable(String),
+
+	/// The request contained an invalid UTF-8 sequence in its path or in other parameters
+	#[error(display = "Invalid UTF-8: {}", _0)]
+	InvalidUtf8Str(#[error(source)] std::str::Utf8Error),
+}
+
+impl<T> From<T> for Error
+where
+	CommonError: From<T>,
+{
+	fn from(err: T) -> Self {
+		Error::Common(CommonError::from(err))
+	}
+}
+
+impl CommonErrorDerivative for Error {}
+
+impl From<HelperError> for Error {
+	fn from(err: HelperError) -> Self {
+		match err {
+			HelperError::Internal(i) => Self::Common(CommonError::InternalError(i)),
+			HelperError::BadRequest(b) => Self::Common(CommonError::BadRequest(b)),
+			HelperError::InvalidBucketName(n) => Self::Common(CommonError::InvalidBucketName(n)),
+			HelperError::NoSuchBucket(n) => Self::Common(CommonError::NoSuchBucket(n)),
+			e => Self::Common(CommonError::BadRequest(format!("{}", e))),
+		}
+	}
+}
+
+impl From<SignatureError> for Error {
+	fn from(err: SignatureError) -> Self {
+		match err {
+			SignatureError::Common(c) => Self::Common(c),
+			SignatureError::AuthorizationHeaderMalformed(c) => {
+				Self::AuthorizationHeaderMalformed(c)
+			}
+			SignatureError::InvalidUtf8Str(i) => Self::InvalidUtf8Str(i),
+			SignatureError::InvalidHeader(h) => Self::InvalidHeader(h),
+		}
+	}
+}
+
+impl Error {
+	/// This returns a keyword for the corresponding error.
+	/// Here, these keywords are not necessarily those from AWS S3,
+	/// as we are building a custom API
+	fn code(&self) -> &'static str {
+		match self {
+			Error::Common(c) => c.aws_code(),
+			Error::NoSuchKey => "NoSuchKey",
+			Error::NotAcceptable(_) => "NotAcceptable",
+			Error::AuthorizationHeaderMalformed(_) => "AuthorizationHeaderMalformed",
+			Error::InvalidBase64(_) => "InvalidBase64",
+			Error::InvalidHeader(_) => "InvalidHeaderValue",
+			Error::InvalidUtf8Str(_) => "InvalidUtf8String",
+		}
+	}
+}
+
+impl ApiError for Error {
+	/// Get the HTTP status code that best represents the meaning of the error for the client
+	fn http_status_code(&self) -> StatusCode {
+		match self {
+			Error::Common(c) => c.http_status_code(),
+			Error::NoSuchKey => StatusCode::NOT_FOUND,
+			Error::NotAcceptable(_) => StatusCode::NOT_ACCEPTABLE,
+			Error::AuthorizationHeaderMalformed(_)
+			| Error::InvalidBase64(_)
+			| Error::InvalidHeader(_)
+			| Error::InvalidUtf8Str(_) => StatusCode::BAD_REQUEST,
+		}
+	}
+
+	fn add_http_headers(&self, header_map: &mut HeaderMap<HeaderValue>) {
+		use hyper::header;
+		header_map.append(header::CONTENT_TYPE, "application/json".parse().unwrap());
+	}
+
+	fn http_body(&self, garage_region: &str, path: &str) -> Body {
+		let error = CustomApiErrorBody {
+			code: self.code().to_string(),
+			message: format!("{}", self),
+			path: path.to_string(),
+			region: garage_region.to_string(),
+		};
+		Body::from(serde_json::to_string_pretty(&error).unwrap_or_else(|_| {
+			r#"
+{
+	"code": "InternalError",
+	"message": "JSON encoding of error failed"
+}
+			"#
+			.into()
+		}))
+	}
+}
diff --git a/src/api/k2v/index.rs b/src/api/k2v/index.rs
new file mode 100644
index 00000000..210950bf
--- /dev/null
+++ b/src/api/k2v/index.rs
@@ -0,0 +1,100 @@
+use std::sync::Arc;
+
+use hyper::{Body, Response, StatusCode};
+use serde::Serialize;
+
+use garage_util::data::*;
+use garage_util::error::Error as GarageError;
+
+use garage_rpc::ring::Ring;
+use garage_table::util::*;
+
+use garage_model::garage::Garage;
+use garage_model::k2v::item_table::{BYTES, CONFLICTS, ENTRIES, VALUES};
+
+use crate::k2v::error::*;
+use crate::k2v::range::read_range;
+
+pub async fn handle_read_index(
+	garage: Arc<Garage>,
+	bucket_id: Uuid,
+	prefix: Option<String>,
+	start: Option<String>,
+	end: Option<String>,
+	limit: Option<u64>,
+	reverse: Option<bool>,
+) -> Result<Response<Body>, Error> {
+	let reverse = reverse.unwrap_or(false);
+
+	let ring: Arc<Ring> = garage.system.ring.borrow().clone();
+
+	let (partition_keys, more, next_start) = read_range(
+		&garage.k2v.counter_table.table,
+		&bucket_id,
+		&prefix,
+		&start,
+		&end,
+		limit,
+		Some((DeletedFilter::NotDeleted, ring.layout.node_id_vec.clone())),
+		EnumerationOrder::from_reverse(reverse),
+	)
+	.await?;
+
+	let s_entries = ENTRIES.to_string();
+	let s_conflicts = CONFLICTS.to_string();
+	let s_values = VALUES.to_string();
+	let s_bytes = BYTES.to_string();
+
+	let resp = ReadIndexResponse {
+		prefix,
+		start,
+		end,
+		limit,
+		reverse,
+		partition_keys: partition_keys
+			.into_iter()
+			.map(|part| {
+				let vals = part.filtered_values(&ring);
+				ReadIndexResponseEntry {
+					pk: part.sk,
+					entries: *vals.get(&s_entries).unwrap_or(&0),
+					conflicts: *vals.get(&s_conflicts).unwrap_or(&0),
+					values: *vals.get(&s_values).unwrap_or(&0),
+					bytes: *vals.get(&s_bytes).unwrap_or(&0),
+				}
+			})
+			.collect::<Vec<_>>(),
+		more,
+		next_start,
+	};
+
+	let resp_json = serde_json::to_string_pretty(&resp).map_err(GarageError::from)?;
+	Ok(Response::builder()
+		.status(StatusCode::OK)
+		.body(Body::from(resp_json))?)
+}
+
+#[derive(Serialize)]
+struct ReadIndexResponse {
+	prefix: Option<String>,
+	start: Option<String>,
+	end: Option<String>,
+	limit: Option<u64>,
+	reverse: bool,
+
+	#[serde(rename = "partitionKeys")]
+	partition_keys: Vec<ReadIndexResponseEntry>,
+
+	more: bool,
+	#[serde(rename = "nextStart")]
+	next_start: Option<String>,
+}
+
+#[derive(Serialize)]
+struct ReadIndexResponseEntry {
+	pk: String,
+	entries: i64,
+	conflicts: i64,
+	values: i64,
+	bytes: i64,
+}
diff --git a/src/api/k2v/item.rs b/src/api/k2v/item.rs
new file mode 100644
index 00000000..836d386f
--- /dev/null
+++ b/src/api/k2v/item.rs
@@ -0,0 +1,230 @@
+use std::sync::Arc;
+
+use http::header;
+
+use hyper::{Body, Request, Response, StatusCode};
+
+use garage_util::data::*;
+
+use garage_model::garage::Garage;
+use garage_model::k2v::causality::*;
+use garage_model::k2v::item_table::*;
+
+use crate::k2v::error::*;
+
+pub const X_GARAGE_CAUSALITY_TOKEN: &str = "X-Garage-Causality-Token";
+
+pub enum ReturnFormat {
+	Json,
+	Binary,
+	Either,
+}
+
+impl ReturnFormat {
+	pub fn from(req: &Request<Body>) -> Result<Self, Error> {
+		let accept = match req.headers().get(header::ACCEPT) {
+			Some(a) => a.to_str()?,
+			None => return Ok(Self::Json),
+		};
+
+		let accept = accept.split(',').map(|s| s.trim()).collect::<Vec<_>>();
+		let accept_json = accept.contains(&"application/json") || accept.contains(&"*/*");
+		let accept_binary = accept.contains(&"application/octet-stream") || accept.contains(&"*/*");
+
+		match (accept_json, accept_binary) {
+			(true, true) => Ok(Self::Either),
+			(true, false) => Ok(Self::Json),
+			(false, true) => Ok(Self::Binary),
+			(false, false) => Err(Error::NotAcceptable("Invalid Accept: header value, must contain either application/json or application/octet-stream (or both)".into())),
+		}
+	}
+
+	pub fn make_response(&self, item: &K2VItem) -> Result<Response<Body>, Error> {
+		let vals = item.values();
+
+		if vals.is_empty() {
+			return Err(Error::NoSuchKey);
+		}
+
+		let ct = item.causal_context().serialize();
+		match self {
+			Self::Binary if vals.len() > 1 => Ok(Response::builder()
+				.header(X_GARAGE_CAUSALITY_TOKEN, ct)
+				.status(StatusCode::CONFLICT)
+				.body(Body::empty())?),
+			Self::Binary => {
+				assert!(vals.len() == 1);
+				Self::make_binary_response(ct, vals[0])
+			}
+			Self::Either if vals.len() == 1 => Self::make_binary_response(ct, vals[0]),
+			_ => Self::make_json_response(ct, &vals[..]),
+		}
+	}
+
+	fn make_binary_response(ct: String, v: &DvvsValue) -> Result<Response<Body>, Error> {
+		match v {
+			DvvsValue::Deleted => Ok(Response::builder()
+				.header(X_GARAGE_CAUSALITY_TOKEN, ct)
+				.header(header::CONTENT_TYPE, "application/octet-stream")
+				.status(StatusCode::NO_CONTENT)
+				.body(Body::empty())?),
+			DvvsValue::Value(v) => Ok(Response::builder()
+				.header(X_GARAGE_CAUSALITY_TOKEN, ct)
+				.header(header::CONTENT_TYPE, "application/octet-stream")
+				.status(StatusCode::OK)
+				.body(Body::from(v.to_vec()))?),
+		}
+	}
+
+	fn make_json_response(ct: String, v: &[&DvvsValue]) -> Result<Response<Body>, Error> {
+		let items = v
+			.iter()
+			.map(|v| match v {
+				DvvsValue::Deleted => serde_json::Value::Null,
+				DvvsValue::Value(v) => serde_json::Value::String(base64::encode(v)),
+			})
+			.collect::<Vec<_>>();
+		let json_body =
+			serde_json::to_string_pretty(&items).ok_or_internal_error("JSON encoding error")?;
+		Ok(Response::builder()
+			.header(X_GARAGE_CAUSALITY_TOKEN, ct)
+			.header(header::CONTENT_TYPE, "application/json")
+			.status(StatusCode::OK)
+			.body(Body::from(json_body))?)
+	}
+}
+
+/// Handle ReadItem request
+#[allow(clippy::ptr_arg)]
+pub async fn handle_read_item(
+	garage: Arc<Garage>,
+	req: &Request<Body>,
+	bucket_id: Uuid,
+	partition_key: &str,
+	sort_key: &String,
+) -> Result<Response<Body>, Error> {
+	let format = ReturnFormat::from(req)?;
+
+	let item = garage
+		.k2v
+		.item_table
+		.get(
+			&K2VItemPartition {
+				bucket_id,
+				partition_key: partition_key.to_string(),
+			},
+			sort_key,
+		)
+		.await?
+		.ok_or(Error::NoSuchKey)?;
+
+	format.make_response(&item)
+}
+
+pub async fn handle_insert_item(
+	garage: Arc<Garage>,
+	req: Request<Body>,
+	bucket_id: Uuid,
+	partition_key: &str,
+	sort_key: &str,
+) -> Result<Response<Body>, Error> {
+	let causal_context = req
+		.headers()
+		.get(X_GARAGE_CAUSALITY_TOKEN)
+		.map(|s| s.to_str())
+		.transpose()?
+		.map(CausalContext::parse)
+		.transpose()
+		.ok_or_bad_request("Invalid causality token")?;
+
+	let body = hyper::body::to_bytes(req.into_body()).await?;
+	let value = DvvsValue::Value(body.to_vec());
+
+	garage
+		.k2v
+		.rpc
+		.insert(
+			bucket_id,
+			partition_key.to_string(),
+			sort_key.to_string(),
+			causal_context,
+			value,
+		)
+		.await?;
+
+	Ok(Response::builder()
+		.status(StatusCode::OK)
+		.body(Body::empty())?)
+}
+
+pub async fn handle_delete_item(
+	garage: Arc<Garage>,
+	req: Request<Body>,
+	bucket_id: Uuid,
+	partition_key: &str,
+	sort_key: &str,
+) -> Result<Response<Body>, Error> {
+	let causal_context = req
+		.headers()
+		.get(X_GARAGE_CAUSALITY_TOKEN)
+		.map(|s| s.to_str())
+		.transpose()?
+		.map(CausalContext::parse)
+		.transpose()
+		.ok_or_bad_request("Invalid causality token")?;
+
+	let value = DvvsValue::Deleted;
+
+	garage
+		.k2v
+		.rpc
+		.insert(
+			bucket_id,
+			partition_key.to_string(),
+			sort_key.to_string(),
+			causal_context,
+			value,
+		)
+		.await?;
+
+	Ok(Response::builder()
+		.status(StatusCode::NO_CONTENT)
+		.body(Body::empty())?)
+}
+
+/// Handle PollItem request (replies 304 Not Modified when the poll yields no item)
+#[allow(clippy::ptr_arg)]
+pub async fn handle_poll_item(
+	garage: Arc<Garage>,
+	req: &Request<Body>,
+	bucket_id: Uuid,
+	partition_key: String,
+	sort_key: String,
+	causality_token: String,
+	timeout_secs: Option<u64>,
+) -> Result<Response<Body>, Error> {
+	let format = ReturnFormat::from(req)?;
+
+	let causal_context =
+		CausalContext::parse(&causality_token).ok_or_bad_request("Invalid causality token")?;
+	// The timeout is client-supplied: saturate rather than overflow when scaling it by 1000
+	let item = garage
+		.k2v
+		.rpc
+		.poll(
+			bucket_id,
+			partition_key,
+			sort_key,
+			causal_context,
+			timeout_secs.unwrap_or(300).saturating_mul(1000),
+		)
+		.await?;
+
+	if let Some(item) = item {
+		format.make_response(&item)
+	} else {
+		Ok(Response::builder()
+			.status(StatusCode::NOT_MODIFIED)
+			.body(Body::empty())?)
+	}
+}
diff --git a/src/api/k2v/mod.rs b/src/api/k2v/mod.rs
new file mode 100644
index 00000000..b6a8c5cf
--- /dev/null
+++ b/src/api/k2v/mod.rs
@@ -0,0 +1,9 @@
+pub mod api_server;
+mod error;
+mod router;
+
+mod batch;
+mod index;
+mod item;
+
+mod range;
diff --git a/src/api/k2v/range.rs b/src/api/k2v/range.rs
new file mode 100644
index 00000000..bb9d3be5
--- /dev/null
+++ b/src/api/k2v/range.rs
@@ -0,0 +1,100 @@
+//! Utility module for retrieving ranges of items in Garage tables
+//! Implements parameters (prefix, start, end, limit) as specified
+//! for endpoints ReadIndex, ReadBatch and DeleteBatch
+
+use std::sync::Arc;
+
+use garage_table::replication::TableShardedReplication;
+use garage_table::*;
+
+use crate::helpers::key_after_prefix;
+use crate::k2v::error::*;
+
+/// Read range in a Garage table.
+/// Returns (entries, more?, nextStart); nextStart is only set when `limit` cut the listing short
+#[allow(clippy::too_many_arguments)]
+pub(crate) async fn read_range<F>(
+	table: &Arc<Table<F, TableShardedReplication>>,
+	partition_key: &F::P,
+	prefix: &Option<String>,
+	start: &Option<String>,
+	end: &Option<String>,
+	limit: Option<u64>,
+	filter: Option<F::Filter>,
+	enumeration_order: EnumerationOrder,
+) -> Result<(Vec<F::E>, bool, Option<String>), Error>
+where
+	F: TableSchema<S = String> + 'static,
+{
+	let (mut start, mut start_ignore) = match (prefix, start) {
+		(None, None) => (None, false),
+		(None, Some(s)) => (Some(s.clone()), false),
+		(Some(p), Some(s)) => {
+			if !s.starts_with(p) {
+				return Err(Error::bad_request(format!(
+					"Start key '{}' does not start with prefix '{}'",
+					s, p
+				)));
+			}
+			(Some(s.clone()), false)
+		}
+		(Some(p), None) if enumeration_order == EnumerationOrder::Reverse => {
+			let start = key_after_prefix(p)
+				.ok_or_internal_error("Sorry, can't list this prefix in reverse order")?;
+			(Some(start), true)
+		}
+		(Some(p), None) => (Some(p.clone()), false),
+	};
+
+	let mut entries = vec![];
+	loop {
+		// At most 1000 entries per round; the 2 extra presumably cover the skipped start key and
+		// the lookahead entry. Saturating math: `limit` is client-supplied and may be u64::MAX.
+		let remaining = limit.map_or(usize::MAX, |l| l as usize).saturating_sub(entries.len());
+		let n_get = remaining.saturating_add(2).min(1000);
+		let get_ret = table
+			.get_range(
+				partition_key,
+				start.clone(),
+				filter.clone(),
+				n_get,
+				enumeration_order,
+			)
+			.await?;
+
+		let get_ret_len = get_ret.len();
+
+		for entry in get_ret {
+			if start_ignore && Some(entry.sort_key()) == start.as_ref() {
+				continue;
+			}
+			if let Some(p) = prefix {
+				if !entry.sort_key().starts_with(p) {
+					return Ok((entries, false, None));
+				}
+			}
+			if let Some(e) = end {
+				let is_finished = match enumeration_order {
+					EnumerationOrder::Forward => entry.sort_key() >= e,
+					EnumerationOrder::Reverse => entry.sort_key() <= e,
+				};
+				if is_finished {
+					return Ok((entries, false, None));
+				}
+			}
+			if let Some(l) = limit {
+				if entries.len() >= l as usize {
+					return Ok((entries, true, Some(entry.sort_key().clone())));
+				}
+			}
+			entries.push(entry);
+		}
+
+		if get_ret_len < n_get {
+			return Ok((entries, false, None));
+		}
+
+		start = Some(entries.last().unwrap().sort_key().clone());
+		start_ignore = true;
+	}
+}
diff --git a/src/api/k2v/router.rs b/src/api/k2v/router.rs
new file mode 100644
index 00000000..50e6965b
--- /dev/null
+++ b/src/api/k2v/router.rs
@@ -0,0 +1,252 @@
+use crate::k2v::error::*;
+
+use std::borrow::Cow;
+
+use hyper::{Method, Request};
+
+use crate::helpers::Authorization;
+use crate::router_macros::{generateQueryParameters, router_match};
+
+router_match! {@func
+
+
+/// List of all K2V API endpoints.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Endpoint {
+	DeleteBatch {
+	},
+	DeleteItem {
+		partition_key: String,
+		sort_key: String,
+	},
+	InsertBatch {
+	},
+	InsertItem {
+		partition_key: String,
+		sort_key: String,
+	},
+	Options,
+	PollItem {
+		partition_key: String,
+		sort_key: String,
+		causality_token: String,
+		timeout: Option<u64>,
+	},
+	ReadBatch {
+	},
+	ReadIndex {
+		prefix: Option<String>,
+		start: Option<String>,
+		end: Option<String>,
+		limit: Option<u64>,
+		reverse: Option<bool>,
+	},
+	ReadItem {
+		partition_key: String,
+		sort_key: String,
+	},
+}}
+
+impl Endpoint {
+	/// Determine which K2V endpoint a request is for, using the request method, path and query.
+	/// The bucket name is the first path segment; the rest of the path is the partition key.
+	/// Returns Self plus the bucket name; fails with a bad request if the bucket name is empty.
+	pub fn from_request<T>(req: &Request<T>) -> Result<(Self, String), Error> {
+		let uri = req.uri();
+		let path = uri.path().trim_start_matches('/');
+		let query = uri.query();
+
+		let (bucket, partition_key) = path
+			.split_once('/')
+			.map(|(b, p)| (b.to_owned(), p.trim_start_matches('/')))
+			.unwrap_or((path.to_owned(), ""));
+
+		if bucket.is_empty() {
+			return Err(Error::bad_request("Missing bucket name"));
+		}
+
+		if *req.method() == Method::OPTIONS {
+			return Ok((Self::Options, bucket));
+		}
+
+		let partition_key = percent_encoding::percent_decode_str(partition_key)
+			.decode_utf8()?
+			.into_owned();
+
+		let mut query = QueryParameters::from_query(query.unwrap_or_default())?;
+
+		let method_search = Method::from_bytes(b"SEARCH").unwrap();
+		let res = match *req.method() {
+			Method::GET => Self::from_get(partition_key, &mut query)?,
+			//&Method::HEAD => Self::from_head(partition_key, &mut query)?,
+			Method::POST => Self::from_post(partition_key, &mut query)?,
+			Method::PUT => Self::from_put(partition_key, &mut query)?,
+			Method::DELETE => Self::from_delete(partition_key, &mut query)?,
+			_ if req.method() == method_search => Self::from_search(partition_key, &mut query)?,
+			_ => return Err(Error::bad_request("Unknown method")),
+		};
+
+		if let Some(message) = query.nonempty_message() {
+			debug!("Unused query parameter: {}", message)
+		}
+		Ok((res, bucket))
+	}
+
+	/// Determine which endpoint a request is for, knowing it is a GET.
+	fn from_get(partition_key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
+		router_match! {
+			@gen_parser
+			(query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None),
+			key: [
+				EMPTY if causality_token => PollItem (query::sort_key, query::causality_token, opt_parse::timeout),
+				EMPTY => ReadItem (query::sort_key),
+			],
+			no_key: [
+				EMPTY => ReadIndex (query_opt::prefix, query_opt::start, query_opt::end, opt_parse::limit, opt_parse::reverse),
+			]
+		}
+	}
+
+	/// Determine which endpoint a request is for, knowing it is a SEARCH.
+	fn from_search(partition_key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
+		router_match! {
+			@gen_parser
+			(query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None),
+			key: [
+			],
+			no_key: [
+				EMPTY => ReadBatch,
+			]
+		}
+	}
+
+	/*
+	/// Determine which endpoint a request is for, knowing it is a HEAD.
+	fn from_head(partition_key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
+		router_match! {
+			@gen_parser
+			(query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None),
+			key: [
+				EMPTY => HeadObject(opt_parse::part_number, query_opt::version_id),
+			],
+			no_key: [
+				EMPTY => HeadBucket,
+			]
+		}
+	}
+	*/
+
+	/// Determine which endpoint a request is for, knowing it is a POST.
+	fn from_post(partition_key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
+		router_match! {
+			@gen_parser
+			(query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None),
+			key: [
+			],
+			no_key: [
+				EMPTY => InsertBatch,
+				DELETE => DeleteBatch,
+				SEARCH => ReadBatch,
+			]
+		}
+	}
+
+	/// Determine which endpoint a request is for, knowing it is a PUT.
+	fn from_put(partition_key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
+		router_match! {
+			@gen_parser
+			(query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None),
+			key: [
+				EMPTY => InsertItem (query::sort_key),
+
+			],
+			no_key: [
+			]
+		}
+	}
+
+	/// Determine which endpoint a request is for, knowing it is a DELETE.
+	fn from_delete(partition_key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
+		router_match! {
+			@gen_parser
+			(query.keyword.take().unwrap_or_default().as_ref(), partition_key, query, None),
+			key: [
+				EMPTY => DeleteItem (query::sort_key),
+			],
+			no_key: [
+			]
+		}
+	}
+
+	/// Get the partition key the request targets. Returns None for requests which don't use a partition key.
+	#[allow(dead_code)]
+	pub fn get_partition_key(&self) -> Option<&str> {
+		router_match! {
+			@extract
+			self,
+			partition_key,
+			[
+				DeleteItem,
+				InsertItem,
+				PollItem,
+				ReadItem,
+			]
+		}
+	}
+
+	/// Get the sort key the request targets. Returns None for requests which don't use a sort key.
+	#[allow(dead_code)]
+	pub fn get_sort_key(&self) -> Option<&str> {
+		router_match! {
+			@extract
+			self,
+			sort_key,
+			[
+				DeleteItem,
+				InsertItem,
+				PollItem,
+				ReadItem,
+			]
+		}
+	}
+
+	/// Get the kind of authorization which is required to perform the operation.
+	pub fn authorization_type(&self) -> Authorization {
+		let readonly = router_match! {
+			@match
+			self,
+			[
+				PollItem,
+				ReadBatch,
+				ReadIndex,
+				ReadItem,
+			]
+		};
+		if readonly {
+			Authorization::Read
+		} else {
+			Authorization::Write
+		}
+	}
+}
+
+// parameter name => struct field
+generateQueryParameters! {
+	"prefix" => prefix,
+	"start" => start,
+	"causality_token" => causality_token,
+	"end" => end,
+	"limit" => limit,
+	"reverse" => reverse,
+	"sort_key" => sort_key,
+	"timeout" => timeout
+}
+
+mod keywords {
+	//! This module contains all query parameters with no associated value
+	//! used to differentiate endpoints.
+	pub const EMPTY: &str = "";
+
+	pub const DELETE: &str = "delete";
+	pub const SEARCH: &str = "search";
+}
diff --git a/src/api/lib.rs b/src/api/lib.rs
index de60ec53..370dfd7a 100644
--- a/src/api/lib.rs
+++ b/src/api/lib.rs
@@ -2,26 +2,16 @@
 #[macro_use]
 extern crate tracing;
 
-pub mod error;
-pub use error::Error;
+pub mod common_error;
 
 mod encoding;
-
-mod api_server;
-pub use api_server::run_api_server;
-
+pub mod generic_server;
+pub mod helpers;
+mod router_macros;
 /// This mode is public only to help testing. Don't expect stability here
 pub mod signature;
 
-pub mod helpers;
-mod s3_bucket;
-mod s3_copy;
-pub mod s3_cors;
-mod s3_delete;
-pub mod s3_get;
-mod s3_list;
-mod s3_post_object;
-mod s3_put;
-mod s3_router;
-mod s3_website;
-mod s3_xml;
+pub mod admin;
+#[cfg(feature = "k2v")]
+pub mod k2v;
+pub mod s3;
diff --git a/src/api/router_macros.rs b/src/api/router_macros.rs
new file mode 100644
index 00000000..4c593300
--- /dev/null
+++ b/src/api/router_macros.rs
@@ -0,0 +1,213 @@
+/// This macro is used to generate very repetitive match {} blocks in this module
+/// It is _not_ made to be used anywhere else
+macro_rules! router_match {
+    (@match $enum:expr , [ $($endpoint:ident,)* ]) => {{
+        // usage: router_match {@match my_enum, [ VariantWithField1, VariantWithField2 ..] }
+        // returns true if the variant was one of the listed variants, false otherwise.
+        use Endpoint::*;
+        match $enum {
+            $(
+            $endpoint { .. } => true,
+            )*
+            _ => false
+        }
+    }};
+    (@extract $enum:expr , $param:ident, [ $($endpoint:ident,)* ]) => {{
+        // usage: router_match {@extract my_enum, field_name, [ VariantWithField1, VariantWithField2 ..] }
+        // returns Some(field_value), or None if the variant was not one of the listed variants.
+        use Endpoint::*;
+        match $enum {
+            $(
+            $endpoint {$param, ..} => Some($param),
+            )*
+            _ => None
+        }
+    }};
+	(@gen_path_parser ($method:expr, $reqpath:expr, $query:expr)
+	 [
+	 $($meth:ident $path:pat $(if $required:ident)? => $api:ident $(($($conv:ident :: $param:ident),*))?,)*
+	 ]) => {{
+		{
+			use Endpoint::*;
+			match ($method, $reqpath) {
+				$(
+					(&Method::$meth, $path) if true $(&& $query.$required.is_some())? => $api {
+						$($(
+							$param: router_match!(@@parse_param $query, $conv, $param),
+						)*)?
+					},
+				)*
+				(m, p) => {
+					return Err(Error::bad_request(format!(
+						"Unknown API endpoint: {} {}",
+						m, p
+					)))
+				}
+			}
+		}
+	}};
+    (@gen_parser ($keyword:expr, $key:ident, $query:expr, $header:expr),
+        key: [$($kw_k:ident $(if $required_k:ident)? $(header $header_k:expr)? => $api_k:ident $(($($conv_k:ident :: $param_k:ident),*))?,)*],
+        no_key: [$($kw_nk:ident $(if $required_nk:ident)? $(if_header $header_nk:expr)? => $api_nk:ident $(($($conv_nk:ident :: $param_nk:ident),*))?,)*]) => {{
+        // usage: router_match {@gen_parser (keyword, key, query, header),
+        //   key: [
+        //      SOME_KEYWORD => VariantWithKey,
+        //      ...
+        //   ],
+        //   no_key: [
+        //      SOME_KEYWORD => VariantWithoutKey,
+        //      ...
+        //   ]
+        // }
+        // See in from_{method} for more detailed usage. Optional guards are chained after `if true`.
+        use Endpoint::*;
+        use keywords::*;
+        match ($keyword, !$key.is_empty()){
+            $(
+            ($kw_k, true) if true $(&& $query.$required_k.is_some())? $(&& $header.contains_key($header_k))? => Ok($api_k {
+                $key,
+                $($(
+                    $param_k: router_match!(@@parse_param $query, $conv_k, $param_k),
+                )*)?
+            }),
+            )*
+            $(
+            ($kw_nk, false) if true $(&& $query.$required_nk.is_some())? $(&& $header.contains_key($header_nk))? => Ok($api_nk {
+                $($(
+                    $param_nk: router_match!(@@parse_param $query, $conv_nk, $param_nk),
+                )*)?
+            }),
+            )*
+            (kw, _) => Err(Error::bad_request(format!("Invalid endpoint: {}", kw)))
+        }
+    }};
+
+    (@@parse_param $query:expr, query_opt, $param:ident) => {{
+        // extract optional query parameter
+		$query.$param.take().map(|param| param.into_owned())
+    }};
+    (@@parse_param $query:expr, query, $param:ident) => {{
+        // extract mandatory query parameter
+        $query.$param.take().ok_or_bad_request("Missing argument for endpoint")?.into_owned()
+    }};
+    (@@parse_param $query:expr, opt_parse, $param:ident) => {{
+        // extract and parse optional query parameter
+        // missing parameter is fine, however parse error is reported as an error
+		$query.$param
+            .take()
+            .map(|param| param.parse())
+            .transpose()
+            .map_err(|_| Error::bad_request("Failed to parse query parameter"))?
+    }};
+    (@@parse_param $query:expr, parse, $param:ident) => {{
+        // extract and parse mandatory query parameter
+        // both missing and un-parseable parameters are reported as errors
+        $query.$param.take().ok_or_bad_request("Missing argument for endpoint")?
+            .parse()
+            .map_err(|_| Error::bad_request("Failed to parse query parameter"))?
+    }};
+    (@func
+    $(#[$doc:meta])*
+     pub enum Endpoint {
+        $(
+            $(#[$outer:meta])*
+            $variant:ident $({
+                $($name:ident: $ty:ty,)*
+            })?,
+        )*
+    }) => {
+    $(#[$doc])*
+        pub enum Endpoint {
+            $(
+                $(#[$outer])*
+                $variant $({
+                    $($name: $ty, )*
+                })?,
+            )*
+        }
+        impl Endpoint {
+            pub fn name(&self) -> &'static str {
+                match self {
+                    $(Endpoint::$variant $({ $($name: _,)* .. })? => stringify!($variant),)*
+                }
+            }
+        }
+    };
+    (@if ($($cond:tt)+) then ($($then:tt)*) else ($($else:tt)*)) => {
+        $($then)*
+    };
+    (@if () then ($($then:tt)*) else ($($else:tt)*)) => {
+        $($else)*
+    };
+}
+
+/// This macro is used to generate part of the code in this module. It must be called only once, and
+/// is useless outside of this module.
+macro_rules! generateQueryParameters {
+    ( $($rest:expr => $name:ident),* ) => {
+        /// Struct containing all query parameters used in endpoints. Think of it as a HashMap,
+        /// but with keys statically known.
+        #[derive(Debug, Default)]
+        struct QueryParameters<'a> {
+            keyword: Option<Cow<'a, str>>,
+            $(
+            $name: Option<Cow<'a, str>>,
+            )*
+        }
+
+        impl<'a> QueryParameters<'a> {
+            /// Build this struct from the query part of an URI.
+            fn from_query(query: &'a str) -> Result<Self, Error> {
+                let mut res: Self = Default::default();
+                for (k, v) in url::form_urlencoded::parse(query.as_bytes()) {
+                    let repeated = match k.as_ref() {
+                        $(
+                            $rest => if !v.is_empty() {
+                                res.$name.replace(v).is_some()
+                            } else {
+                                false
+                            },
+                        )*
+                        _ => {
+                            if k.starts_with("response-") || k.starts_with("X-Amz-") {
+                                false
+                            } else if v.as_ref().is_empty() {
+                                if res.keyword.replace(k).is_some() {
+                                    return Err(Error::bad_request("Multiple keywords"));
+                                }
+                                continue;
+                            } else {
+                                debug!("Received an unknown query parameter: '{}'", k);
+                                false
+                            }
+                        }
+                    };
+                    if repeated {
+                        return Err(Error::bad_request(format!(
+                            "Query parameter repeated: '{}'",
+                            k
+                        )));
+                    }
+                }
+                Ok(res)
+            }
+
+            /// Get an error message in case not all parameters were used when extracting them to
+            /// build an Endpoint variant
+            fn nonempty_message(&self) -> Option<&str> {
+                if self.keyword.is_some() {
+                    Some("Keyword not used")
+                } $(
+                    else if self.$name.is_some() {
+                        Some(concat!("'", $rest, "'"))
+                    }
+                )* else {
+                    None
+                }
+            }
+        }
+    }
+}
+
+pub(crate) use generateQueryParameters;
+pub(crate) use router_match;
diff --git a/src/api/s3/api_server.rs b/src/api/s3/api_server.rs
new file mode 100644
index 00000000..27837297
--- /dev/null
+++ b/src/api/s3/api_server.rs
@@ -0,0 +1,390 @@
+use std::net::SocketAddr;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+
+use futures::future::Future;
+use hyper::header;
+use hyper::{Body, Request, Response};
+
+use opentelemetry::{trace::SpanRef, KeyValue};
+
+use garage_util::error::Error as GarageError;
+
+use garage_model::garage::Garage;
+use garage_model::key_table::Key;
+
+use crate::generic_server::*;
+use crate::s3::error::*;
+
+use crate::signature::payload::check_payload_signature;
+use crate::signature::streaming::*;
+
+use crate::helpers::*;
+use crate::s3::bucket::*;
+use crate::s3::copy::*;
+use crate::s3::cors::*;
+use crate::s3::delete::*;
+use crate::s3::get::*;
+use crate::s3::list::*;
+use crate::s3::post_object::handle_post_object;
+use crate::s3::put::*;
+use crate::s3::router::Endpoint;
+use crate::s3::website::*;
+
+/// Handler for the S3 API; it implements `ApiHandler` so that it can be run
+/// by the generic `ApiServer`.
+pub struct S3ApiServer {
+	/// Shared handle to the Garage instance that requests are executed against.
+	garage: Arc<Garage>,
+}
+
+/// A parsed S3 request: the endpoint to invoke plus the bucket it targets.
+pub(crate) struct S3ApiEndpoint {
+	/// Bucket name as returned by `Endpoint::from_request`; `None` when the
+	/// request is not bound to a bucket.
+	bucket_name: Option<String>,
+	/// The S3 operation selected for this request.
+	endpoint: Endpoint,
+}
+
+impl S3ApiServer {
+	/// Wraps a new `S3ApiServer` handler in a generic `ApiServer` for region
+	/// `s3_region` and runs it on `addr`, handing `shutdown_signal` over to
+	/// `run_server`.
+	pub async fn run(
+		garage: Arc<Garage>,
+		addr: SocketAddr,
+		s3_region: String,
+		shutdown_signal: impl Future<Output = ()>,
+	) -> Result<(), GarageError> {
+		let handler = S3ApiServer { garage };
+		let server = ApiServer::new(s3_region, handler);
+		server.run_server(addr, shutdown_signal).await
+	}
+
+	/// Dispatches endpoints that do not target a bucket. Only `ListBuckets`
+	/// is handled; every other endpoint yields `Error::NotImplemented`
+	/// carrying the endpoint name.
+	async fn handle_request_without_bucket(
+		&self,
+		_req: Request<Body>,
+		api_key: Key,
+		endpoint: Endpoint,
+	) -> Result<Response<Body>, Error> {
+		if let Endpoint::ListBuckets = endpoint {
+			return handle_list_buckets(&self.garage, &api_key).await;
+		}
+		Err(Error::NotImplemented(endpoint.name().to_owned()))
+	}
+}
+
+/// S3 request parsing and dispatch, plugged into the generic API server
+/// through the `ApiHandler` trait.
+#[async_trait]
+impl ApiHandler for S3ApiServer {
+	const API_NAME: &'static str = "s3";
+	const API_NAME_DISPLAY: &'static str = "S3";
+
+	type Endpoint = S3ApiEndpoint;
+	type Error = Error;
+
+	/// Determines which endpoint and bucket a request addresses.
+	///
+	/// The `Host` header is mandatory (a bad-request error is returned when
+	/// it is missing). When `s3_api.root_domain` is configured,
+	/// `host_to_bucket` is used to try to derive a bucket name from the host;
+	/// the result is passed to `Endpoint::from_request`, which returns the
+	/// endpoint together with the final bucket name.
+	fn parse_endpoint(&self, req: &Request<Body>) -> Result<S3ApiEndpoint, Error> {
+		let authority = req
+			.headers()
+			.get(header::HOST)
+			.ok_or_bad_request("Host header required")?
+			.to_str()?;
+
+		let host = authority_to_host(authority)?;
+
+		let bucket_name = self
+			.garage
+			.config
+			.s3_api
+			.root_domain
+			.as_ref()
+			.and_then(|root_domain| host_to_bucket(&host, root_domain));
+
+		let (endpoint, bucket_name) =
+			Endpoint::from_request(req, bucket_name.map(ToOwned::to_owned))?;
+
+		Ok(S3ApiEndpoint {
+			bucket_name,
+			endpoint,
+		})
+	}
+
+	/// Executes a parsed S3 request and builds its response.
+	///
+	/// Steps, in order:
+	/// 1. `PostObject` and `Options` are handled immediately, before any API
+	///    key check.
+	/// 2. The payload signature is checked; a request without an API key is
+	///    rejected with a forbidden error.
+	/// 3. The body is passed through `parse_streaming_body`.
+	/// 4. Requests without a bucket go to `handle_request_without_bucket`.
+	/// 5. `CreateBucket` takes its own code path, before bucket resolution.
+	/// 6. The bucket is resolved and the key's read/write/owner permission
+	///    required by the endpoint is verified.
+	/// 7. The endpoint-specific handler is invoked.
+	/// 8. On success, headers for the matching CORS rule (if any) are added.
+	async fn handle(
+		&self,
+		req: Request<Body>,
+		endpoint: S3ApiEndpoint,
+	) -> Result<Response<Body>, Error> {
+		let S3ApiEndpoint {
+			bucket_name,
+			endpoint,
+		} = endpoint;
+		let garage = self.garage.clone();
+
+		// Some endpoints are processed early, before we even check for an API key
+		if let Endpoint::PostObject = endpoint {
+			// NOTE(review): this unwrap panics if `PostObject` is ever produced
+			// without a bucket name; presumably `Endpoint::from_request`
+			// guarantees one here. TODO confirm.
+			return handle_post_object(garage, req, bucket_name.unwrap()).await;
+		}
+		if let Endpoint::Options = endpoint {
+			return handle_options_s3api(garage, &req, bucket_name).await;
+		}
+
+		let (api_key, mut content_sha256) = check_payload_signature(&garage, "s3", &req).await?;
+		let api_key = api_key
+			.ok_or_else(|| Error::forbidden("Garage does not support anonymous access yet"))?;
+
+		let req = parse_streaming_body(
+			&api_key,
+			req,
+			&mut content_sha256,
+			&garage.config.s3_api.s3_region,
+			"s3",
+		)?;
+
+		let bucket_name = match bucket_name {
+			None => {
+				return self
+					.handle_request_without_bucket(req, api_key, endpoint)
+					.await
+			}
+			Some(bucket) => bucket.to_string(),
+		};
+
+		// Special code path for CreateBucket API endpoint
+		if let Endpoint::CreateBucket {} = endpoint {
+			return handle_create_bucket(&garage, req, content_sha256, api_key, bucket_name).await;
+		}
+
+		let bucket_id = garage
+			.bucket_helper()
+			.resolve_bucket(&bucket_name, &api_key)
+			.await?;
+		let bucket = garage
+			.bucket_helper()
+			.get_existing_bucket(bucket_id)
+			.await?;
+
+		// Check that the key holds the permission this endpoint requires.
+		let allowed = match endpoint.authorization_type() {
+			Authorization::Read => api_key.allow_read(&bucket_id),
+			Authorization::Write => api_key.allow_write(&bucket_id),
+			Authorization::Owner => api_key.allow_owner(&bucket_id),
+			// NOTE(review): assumes endpoints with any other authorization
+			// type never reach this point; confirm against the router.
+			_ => unreachable!(),
+		};
+
+		if !allowed {
+			return Err(Error::forbidden("Operation is not allowed for this key."));
+		}
+
+		// Looked up before dispatching: several handlers below take `req` by value.
+		let matching_cors_rule = find_matching_cors_rule(&bucket, &req)?;
+
+		let resp = match endpoint {
+			Endpoint::HeadObject {
+				key, part_number, ..
+			} => handle_head(garage, &req, bucket_id, &key, part_number).await,
+			Endpoint::GetObject {
+				key, part_number, ..
+			} => handle_get(garage, &req, bucket_id, &key, part_number).await,
+			Endpoint::UploadPart {
+				key,
+				part_number,
+				upload_id,
+			} => {
+				handle_put_part(
+					garage,
+					req,
+					bucket_id,
+					&key,
+					part_number,
+					&upload_id,
+					content_sha256,
+				)
+				.await
+			}
+			Endpoint::CopyObject { key } => {
+				handle_copy(garage, &api_key, &req, bucket_id, &key).await
+			}
+			Endpoint::UploadPartCopy {
+				key,
+				part_number,
+				upload_id,
+			} => {
+				handle_upload_part_copy(
+					garage,
+					&api_key,
+					&req,
+					bucket_id,
+					&key,
+					part_number,
+					&upload_id,
+				)
+				.await
+			}
+			Endpoint::PutObject { key } => {
+				handle_put(garage, req, &bucket, &key, content_sha256).await
+			}
+			Endpoint::AbortMultipartUpload { key, upload_id } => {
+				handle_abort_multipart_upload(garage, bucket_id, &key, &upload_id).await
+			}
+			Endpoint::DeleteObject { key, .. } => handle_delete(garage, bucket_id, &key).await,
+			Endpoint::CreateMultipartUpload { key } => {
+				handle_create_multipart_upload(garage, &req, &bucket_name, bucket_id, &key).await
+			}
+			Endpoint::CompleteMultipartUpload { key, upload_id } => {
+				handle_complete_multipart_upload(
+					garage,
+					req,
+					&bucket_name,
+					&bucket,
+					&key,
+					&upload_id,
+					content_sha256,
+				)
+				.await
+			}
+			// CreateBucket already returned through the special code path above.
+			Endpoint::CreateBucket {} => unreachable!(),
+			Endpoint::HeadBucket {} => {
+				// The bucket was resolved above, so reply with an empty body.
+				let empty_body: Body = Body::from(vec![]);
+				let response = Response::builder().body(empty_body).unwrap();
+				Ok(response)
+			}
+			Endpoint::DeleteBucket {} => {
+				handle_delete_bucket(&garage, bucket_id, bucket_name, api_key).await
+			}
+			Endpoint::GetBucketLocation {} => handle_get_bucket_location(garage),
+			Endpoint::GetBucketVersioning {} => handle_get_bucket_versioning(),
+			Endpoint::ListObjects {
+				delimiter,
+				encoding_type,
+				marker,
+				max_keys,
+				prefix,
+			} => {
+				handle_list(
+					garage,
+					&ListObjectsQuery {
+						common: ListQueryCommon {
+							bucket_name,
+							bucket_id,
+							delimiter: delimiter.map(|d| d.to_string()),
+							// Page size is clamped to 1..=1000 and defaults to 1000.
+							page_size: max_keys.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
+							prefix: prefix.unwrap_or_default(),
+							urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false),
+						},
+						is_v2: false,
+						marker,
+						continuation_token: None,
+						start_after: None,
+					},
+				)
+				.await
+			}
+			Endpoint::ListObjectsV2 {
+				delimiter,
+				encoding_type,
+				max_keys,
+				prefix,
+				continuation_token,
+				start_after,
+				list_type,
+				..
+			} => {
+				// Only list-type=2 is accepted; any other value is a bad request.
+				if list_type == "2" {
+					handle_list(
+						garage,
+						&ListObjectsQuery {
+							common: ListQueryCommon {
+								bucket_name,
+								bucket_id,
+								delimiter: delimiter.map(|d| d.to_string()),
+								page_size: max_keys.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
+								urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false),
+								prefix: prefix.unwrap_or_default(),
+							},
+							is_v2: true,
+							marker: None,
+							continuation_token,
+							start_after,
+						},
+					)
+					.await
+				} else {
+					Err(Error::bad_request(format!(
+						"Invalid endpoint: list-type={}",
+						list_type
+					)))
+				}
+			}
+			Endpoint::ListMultipartUploads {
+				delimiter,
+				encoding_type,
+				key_marker,
+				max_uploads,
+				prefix,
+				upload_id_marker,
+			} => {
+				handle_list_multipart_upload(
+					garage,
+					&ListMultipartUploadsQuery {
+						common: ListQueryCommon {
+							bucket_name,
+							bucket_id,
+							delimiter: delimiter.map(|d| d.to_string()),
+							page_size: max_uploads.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
+							prefix: prefix.unwrap_or_default(),
+							urlencode_resp: encoding_type.map(|e| e == "url").unwrap_or(false),
+						},
+						key_marker,
+						upload_id_marker,
+					},
+				)
+				.await
+			}
+			Endpoint::ListParts {
+				key,
+				max_parts,
+				part_number_marker,
+				upload_id,
+			} => {
+				handle_list_parts(
+					garage,
+					&ListPartsQuery {
+						bucket_name,
+						bucket_id,
+						key,
+						upload_id,
+						// Marker is clamped to 1..=10000; page size to 1..=1000 (default 1000).
+						part_number_marker: part_number_marker.map(|p| p.clamp(1, 10000)),
+						max_parts: max_parts.map(|p| p.clamp(1, 1000)).unwrap_or(1000),
+					},
+				)
+				.await
+			}
+			Endpoint::DeleteObjects {} => {
+				handle_delete_objects(garage, bucket_id, req, content_sha256).await
+			}
+			Endpoint::GetBucketWebsite {} => handle_get_website(&bucket).await,
+			Endpoint::PutBucketWebsite {} => {
+				handle_put_website(garage, bucket_id, req, content_sha256).await
+			}
+			Endpoint::DeleteBucketWebsite {} => handle_delete_website(garage, bucket_id).await,
+			Endpoint::GetBucketCors {} => handle_get_cors(&bucket).await,
+			Endpoint::PutBucketCors {} => {
+				handle_put_cors(garage, bucket_id, req, content_sha256).await
+			}
+			Endpoint::DeleteBucketCors {} => handle_delete_cors(garage, bucket_id).await,
+			// Every endpoint without a handler above is reported as not implemented.
+			endpoint => Err(Error::NotImplemented(endpoint.name().to_owned())),
+		};
+
+		// If request was a success and we have a CORS rule that applies to it,
+		// add the corresponding CORS headers to the response
+		let mut resp_ok = resp?;
+		if let Some(rule) = matching_cors_rule {
+			add_cors_headers(&mut resp_ok, rule)
+				.ok_or_internal_error("Invalid bucket CORS configuration")?;
+		}
+
+		Ok(resp_ok)
+	}
+}
+
+impl ApiEndpoint for S3ApiEndpoint {
+	/// Name of the wrapped endpoint, as reported by `Endpoint::name`.
+	fn name(&self) -> &'static str {
+		let inner = &self.endpoint;
+		inner.name()
+	}
+
+	/// Sets the `bucket` attribute on the span to the targeted bucket name;
+	/// requests without a bucket record the default (empty) string.
+	fn add_span_attributes(&self, span: SpanRef<'_>) {
+		let bucket = match &self.bucket_name {
+			Some(name) => name.clone(),
+			None => String::default(),
+		};
+		span.set_attribute(KeyValue::new("bucket", bucket));
+	}
+}
diff --git a/src/api/s3_bucket.rs b/src/api/s3/bucket.rs
index 8a5407d3..3ac6a6ec 100644
--- a/src/api/s3_bucket.rs
+++ b/src/api/s3/bucket.rs
@@ -7,15 +7,15 @@ use garage_model::bucket_alias_table::*;
 use garage_model::bucket_table::Bucket;
 use garage_model::garage::Garage;
 use garage_model::key_table::Key;
-use garage_model::object_table::ObjectFilter;
 use garage_model::permission::BucketKeyPerm;
 use garage_table::util::*;
 use garage_util::crdt::*;
 use garage_util::data::*;
 use garage_util::time::*;
 
-use crate::error::*;
-use crate::s3_xml;
+use crate::common_error::CommonError;
+use crate::s3::error::*;
+use crate::s3::xml as s3_xml;
 use crate::signature::verify_signed_content;
 
 pub fn handle_get_bucket_location(garage: Arc<Garage>) -> Result<Response<Body>, Error> {
@@ -130,7 +130,7 @@ pub async fn handle_create_bucket(
 
 	if let Some(location_constraint) = cmd {
 		if location_constraint != garage.config.s3_api.s3_region {
-			return Err(Error::BadRequest(format!(
+			return Err(Error::bad_request(format!(
 				"Cannot satisfy location constraint `{}`: buckets can only be created in region `{}`",
 				location_constraint,
 				garage.config.s3_api.s3_region
@@ -158,12 +158,12 @@ pub async fn handle_create_bucket(
 		// otherwise return a forbidden error.
 		let kp = api_key.bucket_permissions(&bucket_id);
 		if !(kp.allow_write || kp.allow_owner) {
-			return Err(Error::BucketAlreadyExists);
+			return Err(CommonError::BucketAlreadyExists.into());
 		}
 	} else {
 		// Create the bucket!
 		if !is_valid_bucket_name(&bucket_name) {
-			return Err(Error::BadRequest(format!(
+			return Err(Error::bad_request(format!(
 				"{}: {}",
 				bucket_name, INVALID_BUCKET_NAME_MESSAGE
 			)));
@@ -228,12 +228,8 @@ pub async fn handle_delete_bucket(
 		// Delete bucket
 
 		// Check bucket is empty
-		let objects = garage
-			.object_table
-			.get_range(&bucket_id, None, Some(ObjectFilter::IsData), 10)
-			.await?;
-		if !objects.is_empty() {
-			return Err(Error::BucketNotEmpty);
+		if !garage.bucket_helper().is_bucket_empty(bucket_id).await? {
+			return Err(CommonError::BucketNotEmpty.into());
 		}
 
 		// --- done checking, now commit ---
@@ -299,7 +295,6 @@ fn parse_create_bucket_xml(xml_bytes: &[u8]) -> Option<Option<String>> {
 
 	let mut ret = None;
 	for item in cbc.children() {
-		println!("{:?}", item);
 		if item.has_tag_name("LocationConstraint") {
 			if ret != None {
 				return None;
diff --git a/src/api/s3_copy.rs b/src/api/s3/copy.rs
index fc4707e2..7eb6459d 100644
--- a/src/api/s3_copy.rs
+++ b/src/api/s3/copy.rs
@@ -5,23 +5,26 @@ use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use futures::{stream, stream::Stream, StreamExt, TryFutureExt};
 use md5::{Digest as Md5Digest, Md5};
 
+use bytes::Bytes;
 use hyper::{Body, Request, Response};
 use serde::Serialize;
 
+use garage_rpc::netapp::bytes_buf::BytesBuf;
+use garage_rpc::rpc_helper::OrderTag;
 use garage_table::*;
 use garage_util::data::*;
 use garage_util::time::*;
 
-use garage_model::block_ref_table::*;
 use garage_model::garage::Garage;
 use garage_model::key_table::Key;
-use garage_model::object_table::*;
-use garage_model::version_table::*;
+use garage_model::s3::block_ref_table::*;
+use garage_model::s3::object_table::*;
+use garage_model::s3::version_table::*;
 
-use crate::api_server::{parse_bucket_key, resolve_bucket};
-use crate::error::*;
-use crate::s3_put::{decode_upload_id, get_headers};
-use crate::s3_xml::{self, xmlns_tag};
+use crate::helpers::parse_bucket_key;
+use crate::s3::error::*;
+use crate::s3::put::{decode_upload_id, get_headers};
+use crate::s3::xml::{self as s3_xml, xmlns_tag};
 
 pub async fn handle_copy(
 	garage: Arc<Garage>,
@@ -201,8 +204,8 @@ pub async fn handle_upload_part_copy(
 			let mut ranges = http_range::HttpRange::parse(range_str, source_version_meta.size)
 				.map_err(|e| (e, source_version_meta.size))?;
 			if ranges.len() != 1 {
-				return Err(Error::BadRequest(
-					"Invalid x-amz-copy-source-range header: exactly 1 range must be given".into(),
+				return Err(Error::bad_request(
+					"Invalid x-amz-copy-source-range header: exactly 1 range must be given",
 				));
 			} else {
 				ranges.pop().unwrap()
@@ -230,8 +233,8 @@ pub async fn handle_upload_part_copy(
 			// This is only for small files, we don't bother handling this.
 			// (in AWS UploadPartCopy works for parts at least 5MB which
 			// is never the case of an inline object)
-			return Err(Error::BadRequest(
-				"Source object is too small (minimum part size is 5Mb)".into(),
+			return Err(Error::bad_request(
+				"Source object is too small (minimum part size is 5Mb)",
 			));
 		}
 		ObjectVersionData::FirstBlock(_meta, _first_block_hash) => (),
@@ -250,7 +253,7 @@ pub async fn handle_upload_part_copy(
 	// Check this part number hasn't yet been uploaded
 	if let Some(dv) = dest_version {
 		if dv.has_part_number(part_number) {
-			return Err(Error::BadRequest(format!(
+			return Err(Error::bad_request(format!(
 				"Part number {} has already been uploaded",
 				part_number
 			)));
@@ -305,13 +308,18 @@ pub async fn handle_upload_part_copy(
 	// if and only if the block returned is a block that already existed
 	// in the Garage data store (thus we don't need to save it again).
 	let garage2 = garage.clone();
+	let order_stream = OrderTag::stream();
 	let source_blocks = stream::iter(blocks_to_copy)
-		.flat_map(|(block_hash, range_to_copy)| {
+		.enumerate()
+		.flat_map(|(i, (block_hash, range_to_copy))| {
 			let garage3 = garage2.clone();
 			stream::once(async move {
-				let data = garage3.block_manager.rpc_get_block(&block_hash).await?;
+				let data = garage3
+					.block_manager
+					.rpc_get_block(&block_hash, Some(order_stream.order(i as u64)))
+					.await?;
 				match range_to_copy {
-					Some(r) => Ok((data[r].to_vec(), None)),
+					Some(r) => Ok((data.slice(r), None)),
 					None => Ok((data, Some(block_hash))),
 				}
 			})
@@ -413,10 +421,13 @@ async fn get_copy_source(
 	let copy_source = percent_encoding::percent_decode_str(copy_source).decode_utf8()?;
 
 	let (source_bucket, source_key) = parse_bucket_key(&copy_source, None)?;
-	let source_bucket_id = resolve_bucket(garage, &source_bucket.to_string(), api_key).await?;
+	let source_bucket_id = garage
+		.bucket_helper()
+		.resolve_bucket(&source_bucket.to_string(), api_key)
+		.await?;
 
 	if !api_key.allow_read(&source_bucket_id) {
-		return Err(Error::Forbidden(format!(
+		return Err(Error::forbidden(format!(
 			"Reading from bucket {} not allowed for this key",
 			source_bucket
 		)));
@@ -536,8 +547,8 @@ impl CopyPreconditionHeaders {
 			(None, None, None, Some(ims)) => v_date > *ims,
 			(None, None, None, None) => true,
 			_ => {
-				return Err(Error::BadRequest(
-					"Invalid combination of x-amz-copy-source-if-xxxxx headers".into(),
+				return Err(Error::bad_request(
+					"Invalid combination of x-amz-copy-source-if-xxxxx headers",
 				))
 			}
 		};
@@ -550,13 +561,13 @@ impl CopyPreconditionHeaders {
 	}
 }
 
-type BlockStreamItemOk = (Vec<u8>, Option<Hash>);
+type BlockStreamItemOk = (Bytes, Option<Hash>);
 type BlockStreamItem = Result<BlockStreamItemOk, garage_util::error::Error>;
 
 struct Defragmenter<S: Stream<Item = BlockStreamItem>> {
 	block_size: usize,
 	block_stream: Pin<Box<stream::Peekable<S>>>,
-	buffer: Vec<u8>,
+	buffer: BytesBuf,
 	hash: Option<Hash>,
 }
 
@@ -565,7 +576,7 @@ impl<S: Stream<Item = BlockStreamItem>> Defragmenter<S> {
 		Self {
 			block_size,
 			block_stream,
-			buffer: vec![],
+			buffer: BytesBuf::new(),
 			hash: None,
 		}
 	}
@@ -583,7 +594,7 @@ impl<S: Stream<Item = BlockStreamItem>> Defragmenter<S> {
 
 			if self.buffer.is_empty() {
 				let (next_block, next_block_hash) = self.block_stream.next().await.unwrap()?;
-				self.buffer = next_block;
+				self.buffer.extend(next_block);
 				self.hash = next_block_hash;
 			} else if self.buffer.len() + peeked_next_block.len() > self.block_size {
 				break;
@@ -594,11 +605,11 @@ impl<S: Stream<Item = BlockStreamItem>> Defragmenter<S> {
 			}
 		}
 
-		Ok((std::mem::take(&mut self.buffer), self.hash.take()))
+		Ok((self.buffer.take_all(), self.hash.take()))
 	}
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct CopyObjectResult {
 	#[serde(rename = "LastModified")]
 	pub last_modified: s3_xml::Value,
@@ -606,7 +617,7 @@ pub struct CopyObjectResult {
 	pub etag: s3_xml::Value,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct CopyPartResult {
 	#[serde(serialize_with = "xmlns_tag")]
 	pub xmlns: (),
@@ -619,7 +630,7 @@ pub struct CopyPartResult {
 #[cfg(test)]
 mod tests {
 	use super::*;
-	use crate::s3_xml::to_xml_with_header;
+	use crate::s3::xml::to_xml_with_header;
 
 	#[test]
 	fn copy_object_result() -> Result<(), Error> {
@@ -651,7 +662,6 @@ mod tests {
 			last_modified: s3_xml::Value("2011-04-11T20:34:56.000Z".into()),
 			etag: s3_xml::Value("\"9b2cf535f27731c974343645a3985328\"".into()),
 		};
-		println!("{}", to_xml_with_header(&v)?);
 
 		assert_eq!(to_xml_with_header(&v)?, expected_retval);
 
diff --git a/src/api/s3_cors.rs b/src/api/s3/cors.rs
index ab77e23a..c7273464 100644
--- a/src/api/s3_cors.rs
+++ b/src/api/s3/cors.rs
@@ -9,13 +9,12 @@ use hyper::{header::HeaderName, Body, Method, Request, Response, StatusCode};
 
 use serde::{Deserialize, Serialize};
 
-use crate::error::*;
-use crate::s3_xml::{to_xml_with_header, xmlns_tag, IntValue, Value};
+use crate::s3::error::*;
+use crate::s3::xml::{to_xml_with_header, xmlns_tag, IntValue, Value};
 use crate::signature::verify_signed_content;
 
 use garage_model::bucket_table::{Bucket, CorsRule as GarageCorsRule};
 use garage_model::garage::Garage;
-use garage_table::*;
 use garage_util::data::*;
 
 pub async fn handle_get_cors(bucket: &Bucket) -> Result<Response<Body>, Error> {
@@ -48,14 +47,11 @@ pub async fn handle_delete_cors(
 	bucket_id: Uuid,
 ) -> Result<Response<Body>, Error> {
 	let mut bucket = garage
-		.bucket_table
-		.get(&EmptyKey, &bucket_id)
-		.await?
-		.ok_or(Error::NoSuchBucket)?;
+		.bucket_helper()
+		.get_existing_bucket(bucket_id)
+		.await?;
 
-	let param = bucket
-		.params_mut()
-		.ok_or_internal_error("Bucket should not be deleted at this point")?;
+	let param = bucket.params_mut().unwrap();
 
 	param.cors_config.update(None);
 	garage.bucket_table.insert(&bucket).await?;
@@ -78,14 +74,11 @@ pub async fn handle_put_cors(
 	}
 
 	let mut bucket = garage
-		.bucket_table
-		.get(&EmptyKey, &bucket_id)
-		.await?
-		.ok_or(Error::NoSuchBucket)?;
+		.bucket_helper()
+		.get_existing_bucket(bucket_id)
+		.await?;
 
-	let param = bucket
-		.params_mut()
-		.ok_or_internal_error("Bucket should not be deleted at this point")?;
+	let param = bucket.params_mut().unwrap();
 
 	let conf: CorsConfiguration = from_reader(&body as &[u8])?;
 	conf.validate()?;
@@ -119,12 +112,7 @@ pub async fn handle_options_s3api(
 		let helper = garage.bucket_helper();
 		let bucket_id = helper.resolve_global_bucket_name(&bn).await?;
 		if let Some(id) = bucket_id {
-			let bucket = garage
-				.bucket_table
-				.get(&EmptyKey, &id)
-				.await?
-				.filter(|b| !b.state.is_deleted())
-				.ok_or(Error::NoSuchBucket)?;
+			let bucket = garage.bucket_helper().get_existing_bucket(id).await?;
 			handle_options_for_bucket(req, &bucket)
 		} else {
 			// If there is a bucket name in the request, but that name
@@ -185,7 +173,7 @@ pub fn handle_options_for_bucket(
 		}
 	}
 
-	Err(Error::Forbidden("This CORS request is not allowed.".into()))
+	Err(Error::forbidden("This CORS request is not allowed."))
 }
 
 pub fn find_matching_cors_rule<'a>(
diff --git a/src/api/s3_delete.rs b/src/api/s3/delete.rs
index b243d982..b337155f 100644
--- a/src/api/s3_delete.rs
+++ b/src/api/s3/delete.rs
@@ -6,10 +6,10 @@ use garage_util::data::*;
 use garage_util::time::*;
 
 use garage_model::garage::Garage;
-use garage_model::object_table::*;
+use garage_model::s3::object_table::*;
 
-use crate::error::*;
-use crate::s3_xml;
+use crate::s3::error::*;
+use crate::s3::xml as s3_xml;
 use crate::signature::verify_signed_content;
 
 async fn handle_delete_internal(
@@ -64,14 +64,13 @@ pub async fn handle_delete(
 	bucket_id: Uuid,
 	key: &str,
 ) -> Result<Response<Body>, Error> {
-	let (_deleted_version, delete_marker_version) =
-		handle_delete_internal(&garage, bucket_id, key).await?;
-
-	Ok(Response::builder()
-		.header("x-amz-version-id", hex::encode(delete_marker_version))
-		.status(StatusCode::NO_CONTENT)
-		.body(Body::from(vec![]))
-		.unwrap())
+	match handle_delete_internal(&garage, bucket_id, key).await {
+		Ok(_) | Err(Error::NoSuchKey) => Ok(Response::builder()
+			.status(StatusCode::NO_CONTENT)
+			.body(Body::from(vec![]))
+			.unwrap()),
+		Err(e) => Err(e),
+	}
 }
 
 pub async fn handle_delete_objects(
diff --git a/src/api/error.rs b/src/api/s3/error.rs
index f53ed1fd..67009d63 100644
--- a/src/api/error.rs
+++ b/src/api/s3/error.rs
@@ -2,34 +2,24 @@ use std::convert::TryInto;
 
 use err_derive::Error;
 use hyper::header::HeaderValue;
-use hyper::{HeaderMap, StatusCode};
+use hyper::{Body, HeaderMap, StatusCode};
 
 use garage_model::helper::error::Error as HelperError;
-use garage_util::error::Error as GarageError;
 
-use crate::s3_xml;
+use crate::common_error::CommonError;
+pub use crate::common_error::{CommonErrorDerivative, OkOrBadRequest, OkOrInternalError};
+use crate::generic_server::ApiError;
+use crate::s3::xml as s3_xml;
+use crate::signature::error::Error as SignatureError;
 
 /// Errors of this crate
 #[derive(Debug, Error)]
 pub enum Error {
-	// Category: internal error
-	/// Error related to deeper parts of Garage
-	#[error(display = "Internal error: {}", _0)]
-	InternalError(#[error(source)] GarageError),
-
-	/// Error related to Hyper
-	#[error(display = "Internal error (Hyper error): {}", _0)]
-	Hyper(#[error(source)] hyper::Error),
-
-	/// Error related to HTTP
-	#[error(display = "Internal error (HTTP error): {}", _0)]
-	Http(#[error(source)] http::Error),
+	#[error(display = "{}", _0)]
+	/// Error from common error
+	Common(CommonError),
 
 	// Category: cannot process
-	/// No proper api key was used, or the signature was invalid
-	#[error(display = "Forbidden: {}", _0)]
-	Forbidden(String),
-
 	/// Authorization Header Malformed
 	#[error(display = "Authorization header malformed, expected scope: {}", _0)]
 	AuthorizationHeaderMalformed(String),
@@ -38,22 +28,10 @@ pub enum Error {
 	#[error(display = "Key not found")]
 	NoSuchKey,
 
-	/// The bucket requested don't exists
-	#[error(display = "Bucket not found")]
-	NoSuchBucket,
-
 	/// The multipart upload requested don't exists
 	#[error(display = "Upload not found")]
 	NoSuchUpload,
 
-	/// Tried to create a bucket that already exist
-	#[error(display = "Bucket already exists")]
-	BucketAlreadyExists,
-
-	/// Tried to delete a non-empty bucket
-	#[error(display = "Tried to delete a non-empty bucket")]
-	BucketNotEmpty,
-
 	/// Precondition failed (e.g. x-amz-copy-source-if-match)
 	#[error(display = "At least one of the preconditions you specified did not hold")]
 	PreconditionFailed,
@@ -80,10 +58,6 @@ pub enum Error {
 	#[error(display = "Invalid UTF-8: {}", _0)]
 	InvalidUtf8String(#[error(source)] std::string::FromUtf8Error),
 
-	/// Some base64 encoded data was badly encoded
-	#[error(display = "Invalid base64: {}", _0)]
-	InvalidBase64(#[error(source)] base64::DecodeError),
-
 	/// The client sent invalid XML data
 	#[error(display = "Invalid XML: {}", _0)]
 	InvalidXml(String),
@@ -96,15 +70,34 @@ pub enum Error {
 	#[error(display = "Invalid HTTP range: {:?}", _0)]
 	InvalidRange(#[error(from)] (http_range::HttpRangeParseError, u64)),
 
-	/// The client sent an invalid request
-	#[error(display = "Bad request: {}", _0)]
-	BadRequest(String),
-
 	/// The client sent a request for an action not supported by garage
 	#[error(display = "Unimplemented action: {}", _0)]
 	NotImplemented(String),
 }
 
+/// Any value convertible into a `CommonError` converts into `Error` by being
+/// wrapped in the `Common` variant.
+impl<T> From<T> for Error
+where
+	CommonError: From<T>,
+{
+	fn from(err: T) -> Self {
+		let common: CommonError = err.into();
+		Self::Common(common)
+	}
+}
+
+impl CommonErrorDerivative for Error {}
+
+/// Maps helper-layer errors onto API errors: variants that have a
+/// `CommonError` counterpart keep their payload, anything else becomes a bad
+/// request carrying the error's display text.
+impl From<HelperError> for Error {
+	fn from(err: HelperError) -> Self {
+		let common = match err {
+			HelperError::Internal(inner) => CommonError::InternalError(inner),
+			HelperError::BadRequest(msg) => CommonError::BadRequest(msg),
+			HelperError::InvalidBucketName(name) => CommonError::InvalidBucketName(name),
+			HelperError::NoSuchBucket(name) => CommonError::NoSuchBucket(name),
+			other => return Self::bad_request(other.to_string()),
+		};
+		Self::Common(common)
+	}
+}
+
 impl From<roxmltree::Error> for Error {
 	fn from(err: roxmltree::Error) -> Self {
 		Self::InvalidXml(format!("{}", err))
@@ -117,88 +110,71 @@ impl From<quick_xml::de::DeError> for Error {
 	}
 }
 
-impl From<HelperError> for Error {
-	fn from(err: HelperError) -> Self {
+impl From<SignatureError> for Error {
+	fn from(err: SignatureError) -> Self {
 		match err {
-			HelperError::Internal(i) => Self::InternalError(i),
-			HelperError::BadRequest(b) => Self::BadRequest(b),
+			SignatureError::Common(c) => Self::Common(c),
+			SignatureError::AuthorizationHeaderMalformed(c) => {
+				Self::AuthorizationHeaderMalformed(c)
+			}
+			SignatureError::InvalidUtf8Str(i) => Self::InvalidUtf8Str(i),
+			SignatureError::InvalidHeader(h) => Self::InvalidHeader(h),
 		}
 	}
 }
 
 impl From<multer::Error> for Error {
 	fn from(err: multer::Error) -> Self {
-		Self::BadRequest(err.to_string())
+		Self::bad_request(err)
 	}
 }
 
 impl Error {
-	/// Get the HTTP status code that best represents the meaning of the error for the client
-	pub fn http_status_code(&self) -> StatusCode {
-		match self {
-			Error::NoSuchKey | Error::NoSuchBucket | Error::NoSuchUpload => StatusCode::NOT_FOUND,
-			Error::BucketNotEmpty | Error::BucketAlreadyExists => StatusCode::CONFLICT,
-			Error::PreconditionFailed => StatusCode::PRECONDITION_FAILED,
-			Error::Forbidden(_) => StatusCode::FORBIDDEN,
-			Error::InternalError(
-				GarageError::Timeout
-				| GarageError::RemoteError(_)
-				| GarageError::Quorum(_, _, _, _),
-			) => StatusCode::SERVICE_UNAVAILABLE,
-			Error::InternalError(_) | Error::Hyper(_) | Error::Http(_) => {
-				StatusCode::INTERNAL_SERVER_ERROR
-			}
-			Error::InvalidRange(_) => StatusCode::RANGE_NOT_SATISFIABLE,
-			Error::NotImplemented(_) => StatusCode::NOT_IMPLEMENTED,
-			_ => StatusCode::BAD_REQUEST,
-		}
-	}
-
 	pub fn aws_code(&self) -> &'static str {
 		match self {
+			Error::Common(c) => c.aws_code(),
 			Error::NoSuchKey => "NoSuchKey",
-			Error::NoSuchBucket => "NoSuchBucket",
 			Error::NoSuchUpload => "NoSuchUpload",
-			Error::BucketAlreadyExists => "BucketAlreadyExists",
-			Error::BucketNotEmpty => "BucketNotEmpty",
 			Error::PreconditionFailed => "PreconditionFailed",
 			Error::InvalidPart => "InvalidPart",
 			Error::InvalidPartOrder => "InvalidPartOrder",
 			Error::EntityTooSmall => "EntityTooSmall",
-			Error::Forbidden(_) => "AccessDenied",
 			Error::AuthorizationHeaderMalformed(_) => "AuthorizationHeaderMalformed",
 			Error::NotImplemented(_) => "NotImplemented",
-			Error::InternalError(
-				GarageError::Timeout
-				| GarageError::RemoteError(_)
-				| GarageError::Quorum(_, _, _, _),
-			) => "ServiceUnavailable",
-			Error::InternalError(_) | Error::Hyper(_) | Error::Http(_) => "InternalError",
-			_ => "InvalidRequest",
+			Error::InvalidXml(_) => "MalformedXML",
+			Error::InvalidRange(_) => "InvalidRange",
+			Error::InvalidUtf8Str(_) | Error::InvalidUtf8String(_) | Error::InvalidHeader(_) => {
+				"InvalidRequest"
+			}
 		}
 	}
+}
 
-	pub fn aws_xml(&self, garage_region: &str, path: &str) -> String {
-		let error = s3_xml::Error {
-			code: s3_xml::Value(self.aws_code().to_string()),
-			message: s3_xml::Value(format!("{}", self)),
-			resource: Some(s3_xml::Value(path.to_string())),
-			region: Some(s3_xml::Value(garage_region.to_string())),
-		};
-		s3_xml::to_xml_with_header(&error).unwrap_or_else(|_| {
-			r#"
-<?xml version="1.0" encoding="UTF-8"?>
-<Error>
-	<Code>InternalError</Code>
-	<Message>XML encoding of error failed</Message>
-</Error>
-			"#
-			.into()
-		})
+impl ApiError for Error {
+	/// Get the HTTP status code that best represents the meaning of the error for the client
+	fn http_status_code(&self) -> StatusCode {
+		match self {
+			Error::Common(c) => c.http_status_code(),
+			Error::NoSuchKey | Error::NoSuchUpload => StatusCode::NOT_FOUND,
+			Error::PreconditionFailed => StatusCode::PRECONDITION_FAILED,
+			Error::InvalidRange(_) => StatusCode::RANGE_NOT_SATISFIABLE,
+			Error::NotImplemented(_) => StatusCode::NOT_IMPLEMENTED,
+			Error::AuthorizationHeaderMalformed(_)
+			| Error::InvalidPart
+			| Error::InvalidPartOrder
+			| Error::EntityTooSmall
+			| Error::InvalidXml(_)
+			| Error::InvalidUtf8Str(_)
+			| Error::InvalidUtf8String(_)
+			| Error::InvalidHeader(_) => StatusCode::BAD_REQUEST,
+		}
 	}
 
-	pub fn add_headers(&self, header_map: &mut HeaderMap<HeaderValue>) {
+	fn add_http_headers(&self, header_map: &mut HeaderMap<HeaderValue>) {
 		use hyper::header;
+
+		header_map.append(header::CONTENT_TYPE, "application/xml".parse().unwrap());
+
 		#[allow(clippy::single_match)]
 		match self {
 			Error::InvalidRange((_, len)) => {
@@ -212,68 +188,23 @@ impl Error {
 			_ => (),
 		}
 	}
-}
-
-/// Trait to map error to the Bad Request error code
-pub trait OkOrBadRequest {
-	type S;
-	fn ok_or_bad_request<M: AsRef<str>>(self, reason: M) -> Result<Self::S, Error>;
-}
-
-impl<T, E> OkOrBadRequest for Result<T, E>
-where
-	E: std::fmt::Display,
-{
-	type S = T;
-	fn ok_or_bad_request<M: AsRef<str>>(self, reason: M) -> Result<T, Error> {
-		match self {
-			Ok(x) => Ok(x),
-			Err(e) => Err(Error::BadRequest(format!("{}: {}", reason.as_ref(), e))),
-		}
-	}
-}
-
-impl<T> OkOrBadRequest for Option<T> {
-	type S = T;
-	fn ok_or_bad_request<M: AsRef<str>>(self, reason: M) -> Result<T, Error> {
-		match self {
-			Some(x) => Ok(x),
-			None => Err(Error::BadRequest(reason.as_ref().to_string())),
-		}
-	}
-}
-
-/// Trait to map an error to an Internal Error code
-pub trait OkOrInternalError {
-	type S;
-	fn ok_or_internal_error<M: AsRef<str>>(self, reason: M) -> Result<Self::S, Error>;
-}
-
-impl<T, E> OkOrInternalError for Result<T, E>
-where
-	E: std::fmt::Display,
-{
-	type S = T;
-	fn ok_or_internal_error<M: AsRef<str>>(self, reason: M) -> Result<T, Error> {
-		match self {
-			Ok(x) => Ok(x),
-			Err(e) => Err(Error::InternalError(GarageError::Message(format!(
-				"{}: {}",
-				reason.as_ref(),
-				e
-			)))),
-		}
-	}
-}
 
-impl<T> OkOrInternalError for Option<T> {
-	type S = T;
-	fn ok_or_internal_error<M: AsRef<str>>(self, reason: M) -> Result<T, Error> {
-		match self {
-			Some(x) => Ok(x),
-			None => Err(Error::InternalError(GarageError::Message(
-				reason.as_ref().to_string(),
-			))),
-		}
+	fn http_body(&self, garage_region: &str, path: &str) -> Body {
+		let error = s3_xml::Error {
+			code: s3_xml::Value(self.aws_code().to_string()),
+			message: s3_xml::Value(format!("{}", self)),
+			resource: Some(s3_xml::Value(path.to_string())),
+			region: Some(s3_xml::Value(garage_region.to_string())),
+		};
+		Body::from(s3_xml::to_xml_with_header(&error).unwrap_or_else(|_| {
+			r#"
+<?xml version="1.0" encoding="UTF-8"?>
+<Error>
+	<Code>InternalError</Code>
+	<Message>XML encoding of error failed</Message>
+</Error>
+			"#
+			.into()
+		}))
 	}
 }
diff --git a/src/api/s3_get.rs b/src/api/s3/get.rs
index 7f647e15..2a99551a 100644
--- a/src/api/s3_get.rs
+++ b/src/api/s3/get.rs
@@ -2,22 +2,25 @@
 use std::sync::Arc;
 use std::time::{Duration, UNIX_EPOCH};
 
-use futures::stream::*;
+use futures::future;
+use futures::stream::{self, StreamExt};
 use http::header::{
 	ACCEPT_RANGES, CONTENT_LENGTH, CONTENT_RANGE, CONTENT_TYPE, ETAG, IF_MODIFIED_SINCE,
 	IF_NONE_MATCH, LAST_MODIFIED, RANGE,
 };
-use hyper::body::Bytes;
 use hyper::{Body, Request, Response, StatusCode};
+use tokio::sync::mpsc;
 
+use garage_rpc::rpc_helper::{netapp::stream::ByteStream, OrderTag};
 use garage_table::EmptyKey;
 use garage_util::data::*;
+use garage_util::error::OkOrMessage;
 
 use garage_model::garage::Garage;
-use garage_model::object_table::*;
-use garage_model::version_table::*;
+use garage_model::s3::object_table::*;
+use garage_model::s3::version_table::*;
 
-use crate::error::*;
+use crate::s3::error::*;
 
 const X_AMZ_MP_PARTS_COUNT: &str = "x-amz-mp-parts-count";
 
@@ -210,8 +213,8 @@ pub async fn handle_get(
 
 	match (part_number, parse_range_header(req, last_v_meta.size)?) {
 		(Some(_), Some(_)) => {
-			return Err(Error::BadRequest(
-				"Cannot specify both partNumber and Range header".into(),
+			return Err(Error::bad_request(
+				"Cannot specify both partNumber and Range header",
 			));
 		}
 		(Some(pn), None) => {
@@ -242,36 +245,56 @@ pub async fn handle_get(
 			Ok(resp_builder.body(body)?)
 		}
 		ObjectVersionData::FirstBlock(_, first_block_hash) => {
-			let read_first_block = garage.block_manager.rpc_get_block(first_block_hash);
-			let get_next_blocks = garage.version_table.get(&last_v.uuid, &EmptyKey);
-
-			let (first_block, version) = futures::try_join!(read_first_block, get_next_blocks)?;
-			let version = version.ok_or(Error::NoSuchKey)?;
+			let (tx, rx) = mpsc::channel(2);
+
+			let order_stream = OrderTag::stream();
+			let first_block_hash = *first_block_hash;
+			let version_uuid = last_v.uuid;
+
+			tokio::spawn(async move {
+				match async {
+					let garage2 = garage.clone();
+					let version_fut = tokio::spawn(async move {
+						garage2.version_table.get(&version_uuid, &EmptyKey).await
+					});
+
+					let stream_block_0 = garage
+						.block_manager
+						.rpc_get_block_streaming(&first_block_hash, Some(order_stream.order(0)))
+						.await?;
+					tx.send(stream_block_0)
+						.await
+						.ok_or_message("channel closed")?;
+
+					let version = version_fut.await.unwrap()?.ok_or(Error::NoSuchKey)?;
+					for (i, (_, vb)) in version.blocks.items().iter().enumerate().skip(1) {
+						let stream_block_i = garage
+							.block_manager
+							.rpc_get_block_streaming(&vb.hash, Some(order_stream.order(i as u64)))
+							.await?;
+						tx.send(stream_block_i)
+							.await
+							.ok_or_message("channel closed")?;
+					}
 
-			let mut blocks = version
-				.blocks
-				.items()
-				.iter()
-				.map(|(_, vb)| (vb.hash, None))
-				.collect::<Vec<_>>();
-			blocks[0].1 = Some(first_block);
-
-			let body_stream = futures::stream::iter(blocks)
-				.map(move |(hash, data_opt)| {
-					let garage = garage.clone();
-					async move {
-						if let Some(data) = data_opt {
-							Ok(Bytes::from(data))
-						} else {
-							garage
-								.block_manager
-								.rpc_get_block(&hash)
-								.await
-								.map(Bytes::from)
-						}
+					Ok::<(), Error>(())
+				}
+				.await
+				{
+					Ok(()) => (),
+					Err(e) => {
+						let err = std::io::Error::new(
+							std::io::ErrorKind::Other,
+							format!("Error while getting object data: {}", e),
+						);
+						let _ = tx
+							.send(Box::pin(stream::once(future::ready(Err(err)))))
+							.await;
 					}
-				})
-				.buffered(2);
+				}
+			});
+
+			let body_stream = tokio_stream::wrappers::ReceiverStream::new(rx).flatten();
 
 			let body = hyper::body::Body::wrap_stream(body_stream);
 			Ok(resp_builder.body(body)?)
@@ -302,9 +325,9 @@ async fn handle_get_range(
 				let body: Body = Body::from(bytes[begin as usize..end as usize].to_vec());
 				Ok(resp_builder.body(body)?)
 			} else {
-				None.ok_or_internal_error(
+				Err(Error::internal_error(
 					"Requested range not present in inline bytes when it should have been",
-				)
+				))
 			}
 		}
 		ObjectVersionData::FirstBlock(_meta, _first_block_hash) => {
@@ -422,40 +445,79 @@ fn body_from_blocks_range(
 		all_blocks.len(),
 		4 + ((end - begin) / std::cmp::max(all_blocks[0].1.size as u64, 1024)) as usize,
 	));
-	let mut true_offset = 0;
+	let mut block_offset: u64 = 0;
 	for (_, b) in all_blocks.iter() {
-		if true_offset >= end {
+		if block_offset >= end {
 			break;
 		}
 		// Keep only blocks that have an intersection with the requested range
-		if true_offset < end && true_offset + b.size > begin {
-			blocks.push((*b, true_offset));
+		if block_offset < end && block_offset + b.size > begin {
+			blocks.push((*b, block_offset));
 		}
-		true_offset += b.size;
+		block_offset += b.size as u64;
 	}
 
+	let order_stream = OrderTag::stream();
 	let body_stream = futures::stream::iter(blocks)
-		.map(move |(block, true_offset)| {
+		.enumerate()
+		.map(move |(i, (block, block_offset))| {
 			let garage = garage.clone();
 			async move {
-				let data = garage.block_manager.rpc_get_block(&block.hash).await?;
-				let data = Bytes::from(data);
-				let start_in_block = if true_offset > begin {
-					0
-				} else {
-					begin - true_offset
-				};
-				let end_in_block = if true_offset + block.size < end {
-					block.size
-				} else {
-					end - true_offset
-				};
-				Result::<Bytes, Error>::Ok(
-					data.slice(start_in_block as usize..end_in_block as usize),
-				)
+				garage
+					.block_manager
+					.rpc_get_block_streaming(&block.hash, Some(order_stream.order(i as u64)))
+					.await
+					.unwrap_or_else(|e| error_stream(i, e))
+					.scan(block_offset, move |chunk_offset, chunk| {
+						let r = match chunk {
+							Ok(chunk_bytes) => {
+								let chunk_len = chunk_bytes.len() as u64;
+								let r = if *chunk_offset >= end {
+									// The current chunk is after the part we want to read.
+									// Returning None here will stop the scan, the rest of the
+									// stream will be ignored
+									None
+								} else if *chunk_offset + chunk_len <= begin {
+									// The current chunk is before the part we want to read.
+									// We return a None that will be removed by the filter_map
+									// below.
+									Some(None)
+								} else {
+									// The chunk has an intersection with the requested range
+									let start_in_chunk = if *chunk_offset > begin {
+										0
+									} else {
+										begin - *chunk_offset
+									};
+									let end_in_chunk = if *chunk_offset + chunk_len < end {
+										chunk_len
+									} else {
+										end - *chunk_offset
+									};
+									Some(Some(Ok(chunk_bytes
+										.slice(start_in_chunk as usize..end_in_chunk as usize))))
+								};
+								*chunk_offset += chunk_bytes.len() as u64;
+								r
+							}
+							Err(e) => Some(Some(Err(e))),
+						};
+						futures::future::ready(r)
+					})
+					.filter_map(futures::future::ready)
 			}
 		})
-		.buffered(2);
+		.buffered(2)
+		.flatten();
 
 	hyper::body::Body::wrap_stream(body_stream)
 }
+
+fn error_stream(i: usize, e: garage_util::error::Error) -> ByteStream {
+	Box::pin(futures::stream::once(async move {
+		Err(std::io::Error::new(
+			std::io::ErrorKind::Other,
+			format!("Could not get block {}: {}", i, e),
+		))
+	}))
+}
diff --git a/src/api/s3_list.rs b/src/api/s3/list.rs
index 5852fc1b..e5f486c8 100644
--- a/src/api/s3_list.rs
+++ b/src/api/s3/list.rs
@@ -10,15 +10,16 @@ use garage_util::error::Error as GarageError;
 use garage_util::time::*;
 
 use garage_model::garage::Garage;
-use garage_model::object_table::*;
-use garage_model::version_table::Version;
+use garage_model::s3::object_table::*;
+use garage_model::s3::version_table::Version;
 
-use garage_table::EmptyKey;
+use garage_table::{EmptyKey, EnumerationOrder};
 
 use crate::encoding::*;
-use crate::error::*;
-use crate::s3_put;
-use crate::s3_xml;
+use crate::helpers::key_after_prefix;
+use crate::s3::error::*;
+use crate::s3::put as s3_put;
+use crate::s3::xml as s3_xml;
 
 const DUMMY_NAME: &str = "Dummy Key";
 const DUMMY_KEY: &str = "GKDummyKey";
@@ -66,8 +67,14 @@ pub async fn handle_list(
 	let io = |bucket, key, count| {
 		let t = &garage.object_table;
 		async move {
-			t.get_range(&bucket, key, Some(ObjectFilter::IsData), count)
-				.await
+			t.get_range(
+				&bucket,
+				key,
+				Some(ObjectFilter::IsData),
+				count,
+				EnumerationOrder::Forward,
+			)
+			.await
 		}
 	};
 
@@ -165,8 +172,14 @@ pub async fn handle_list_multipart_upload(
 	let io = |bucket, key, count| {
 		let t = &garage.object_table;
 		async move {
-			t.get_range(&bucket, key, Some(ObjectFilter::IsUploading), count)
-				.await
+			t.get_range(
+				&bucket,
+				key,
+				Some(ObjectFilter::IsUploading),
+				count,
+				EnumerationOrder::Forward,
+			)
+			.await
 		}
 	};
 
@@ -569,13 +582,19 @@ impl ListObjectsQuery {
 				// representing the key to start with.
 				(Some(token), _) => match &token[..1] {
 					"[" => Ok(RangeBegin::IncludingKey {
-						key: String::from_utf8(base64::decode(token[1..].as_bytes())?)?,
+						key: String::from_utf8(
+							base64::decode(token[1..].as_bytes())
+								.ok_or_bad_request("Invalid continuation token")?,
+						)?,
 						fallback_key: None,
 					}),
 					"]" => Ok(RangeBegin::AfterKey {
-						key: String::from_utf8(base64::decode(token[1..].as_bytes())?)?,
+						key: String::from_utf8(
+							base64::decode(token[1..].as_bytes())
+								.ok_or_bad_request("Invalid continuation token")?,
+						)?,
 					}),
-					_ => Err(Error::BadRequest("Invalid continuation token".to_string())),
+					_ => Err(Error::bad_request("Invalid continuation token")),
 				},
 
 				// StartAfter has defined semantics in the spec:
@@ -923,39 +942,13 @@ fn uriencode_maybe(s: &str, yes: bool) -> s3_xml::Value {
 	}
 }
 
-const UTF8_BEFORE_LAST_CHAR: char = '\u{10FFFE}';
-
-/// Compute the key after the prefix
-fn key_after_prefix(pfx: &str) -> Option<String> {
-	let mut next = pfx.to_string();
-	while !next.is_empty() {
-		let tail = next.pop().unwrap();
-		if tail >= char::MAX {
-			continue;
-		}
-
-		// Circumvent a limitation of RangeFrom that overflow earlier than needed
-		// See: https://doc.rust-lang.org/core/ops/struct.RangeFrom.html
-		let new_tail = if tail == UTF8_BEFORE_LAST_CHAR {
-			char::MAX
-		} else {
-			(tail..).nth(1).unwrap()
-		};
-
-		next.push(new_tail);
-		return Some(next);
-	}
-
-	None
-}
-
 /*
  * Unit tests of this module
  */
 #[cfg(test)]
 mod tests {
 	use super::*;
-	use garage_model::version_table::*;
+	use garage_model::s3::version_table::*;
 	use garage_util::*;
 	use std::iter::FromIterator;
 
@@ -1003,39 +996,6 @@ mod tests {
 	}
 
 	#[test]
-	fn test_key_after_prefix() {
-		assert_eq!(UTF8_BEFORE_LAST_CHAR as u32, (char::MAX as u32) - 1);
-		assert_eq!(key_after_prefix("a/b/").unwrap().as_str(), "a/b0");
-		assert_eq!(key_after_prefix("€").unwrap().as_str(), "₭");
-		assert_eq!(
-			key_after_prefix("􏿽").unwrap().as_str(),
-			String::from(char::from_u32(0x10FFFE).unwrap())
-		);
-
-		// When the last character is the biggest UTF8 char
-		let a = String::from_iter(['a', char::MAX].iter());
-		assert_eq!(key_after_prefix(a.as_str()).unwrap().as_str(), "b");
-
-		// When all characters are the biggest UTF8 char
-		let b = String::from_iter([char::MAX; 3].iter());
-		assert!(key_after_prefix(b.as_str()).is_none());
-
-		// Check utf8 surrogates
-		let c = String::from('\u{D7FF}');
-		assert_eq!(
-			key_after_prefix(c.as_str()).unwrap().as_str(),
-			String::from('\u{E000}')
-		);
-
-		// Check the character before the biggest one
-		let d = String::from('\u{10FFFE}');
-		assert_eq!(
-			key_after_prefix(d.as_str()).unwrap().as_str(),
-			String::from(char::MAX)
-		);
-	}
-
-	#[test]
 	fn test_common_prefixes() {
 		let mut query = query();
 		let objs = objs();
diff --git a/src/api/s3/mod.rs b/src/api/s3/mod.rs
new file mode 100644
index 00000000..7b56d4d8
--- /dev/null
+++ b/src/api/s3/mod.rs
@@ -0,0 +1,15 @@
+pub mod api_server;
+pub mod error;
+
+mod bucket;
+mod copy;
+pub mod cors;
+mod delete;
+pub mod get;
+mod list;
+mod post_object;
+mod put;
+mod website;
+
+mod router;
+pub mod xml;
diff --git a/src/api/s3_post_object.rs b/src/api/s3/post_object.rs
index 585e0304..d063faa4 100644
--- a/src/api/s3_post_object.rs
+++ b/src/api/s3/post_object.rs
@@ -14,16 +14,15 @@ use serde::Deserialize;
 
 use garage_model::garage::Garage;
 
-use crate::api_server::resolve_bucket;
-use crate::error::*;
-use crate::s3_put::{get_headers, save_stream};
-use crate::s3_xml;
+use crate::s3::error::*;
+use crate::s3::put::{get_headers, save_stream};
+use crate::s3::xml as s3_xml;
 use crate::signature::payload::{parse_date, verify_v4};
 
 pub async fn handle_post_object(
 	garage: Arc<Garage>,
 	req: Request<Body>,
-	bucket: String,
+	bucket_name: String,
 ) -> Result<Response<Body>, Error> {
 	let boundary = req
 		.headers()
@@ -48,9 +47,7 @@ pub async fn handle_post_object(
 		let field = if let Some(field) = multipart.next_field().await? {
 			field
 		} else {
-			return Err(Error::BadRequest(
-				"Request did not contain a file".to_owned(),
-			));
+			return Err(Error::bad_request("Request did not contain a file"));
 		};
 		let name: HeaderName = if let Some(Ok(name)) = field.name().map(TryInto::try_into) {
 			name
@@ -66,14 +63,14 @@ pub async fn handle_post_object(
 				"tag" => (/* tag need to be reencoded, but we don't support them yet anyway */),
 				"acl" => {
 					if params.insert("x-amz-acl", content).is_some() {
-						return Err(Error::BadRequest(
-							"Field 'acl' provided more than one time".to_string(),
+						return Err(Error::bad_request(
+							"Field 'acl' provided more than one time",
 						));
 					}
 				}
 				_ => {
 					if params.insert(&name, content).is_some() {
-						return Err(Error::BadRequest(format!(
+						return Err(Error::bad_request(format!(
 							"Field '{}' provided more than one time",
 							name
 						)));
@@ -90,9 +87,7 @@ pub async fn handle_post_object(
 		.to_str()?;
 	let credential = params
 		.get("x-amz-credential")
-		.ok_or_else(|| {
-			Error::Forbidden("Garage does not support anonymous access yet".to_string())
-		})?
+		.ok_or_else(|| Error::forbidden("Garage does not support anonymous access yet"))?
 		.to_str()?;
 	let policy = params
 		.get("policy")
@@ -119,17 +114,31 @@ pub async fn handle_post_object(
 	};
 
 	let date = parse_date(date)?;
-	let api_key = verify_v4(&garage, credential, &date, signature, policy.as_bytes()).await?;
+	let api_key = verify_v4(
+		&garage,
+		"s3",
+		credential,
+		&date,
+		signature,
+		policy.as_bytes(),
+	)
+	.await?;
 
-	let bucket_id = resolve_bucket(&garage, &bucket, &api_key).await?;
+	let bucket_id = garage
+		.bucket_helper()
+		.resolve_bucket(&bucket_name, &api_key)
+		.await?;
 
 	if !api_key.allow_write(&bucket_id) {
-		return Err(Error::Forbidden(
-			"Operation is not allowed for this key.".to_string(),
-		));
+		return Err(Error::forbidden("Operation is not allowed for this key."));
 	}
 
-	let decoded_policy = base64::decode(&policy)?;
+	let bucket = garage
+		.bucket_helper()
+		.get_existing_bucket(bucket_id)
+		.await?;
+
+	let decoded_policy = base64::decode(&policy).ok_or_bad_request("Invalid policy")?;
 	let decoded_policy: Policy =
 		serde_json::from_slice(&decoded_policy).ok_or_bad_request("Invalid policy")?;
 
@@ -137,9 +146,7 @@ pub async fn handle_post_object(
 		.ok_or_bad_request("Invalid expiration date")?
 		.into();
 	if Utc::now() - expiration > Duration::zero() {
-		return Err(Error::BadRequest(
-			"Expiration date is in the paste".to_string(),
-		));
+		return Err(Error::bad_request("Expiration date is in the paste"));
 	}
 
 	let mut conditions = decoded_policy.into_conditions()?;
@@ -151,7 +158,7 @@ pub async fn handle_post_object(
 			"policy" | "x-amz-signature" => (), // this is always accepted, as it's required to validate other fields
 			"content-type" => {
 				let conds = conditions.params.remove("content-type").ok_or_else(|| {
-					Error::BadRequest(format!("Key '{}' is not allowed in policy", param_key))
+					Error::bad_request(format!("Key '{}' is not allowed in policy", param_key))
 				})?;
 				for cond in conds {
 					let ok = match cond {
@@ -161,7 +168,7 @@ pub async fn handle_post_object(
 						}
 					};
 					if !ok {
-						return Err(Error::BadRequest(format!(
+						return Err(Error::bad_request(format!(
 							"Key '{}' has value not allowed in policy",
 							param_key
 						)));
@@ -170,7 +177,7 @@ pub async fn handle_post_object(
 			}
 			"key" => {
 				let conds = conditions.params.remove("key").ok_or_else(|| {
-					Error::BadRequest(format!("Key '{}' is not allowed in policy", param_key))
+					Error::bad_request(format!("Key '{}' is not allowed in policy", param_key))
 				})?;
 				for cond in conds {
 					let ok = match cond {
@@ -178,7 +185,7 @@ pub async fn handle_post_object(
 						Operation::StartsWith(s) => key.starts_with(&s),
 					};
 					if !ok {
-						return Err(Error::BadRequest(format!(
+						return Err(Error::bad_request(format!(
 							"Key '{}' has value not allowed in policy",
 							param_key
 						)));
@@ -193,7 +200,7 @@ pub async fn handle_post_object(
 					continue;
 				}
 				let conds = conditions.params.remove(&param_key).ok_or_else(|| {
-					Error::BadRequest(format!("Key '{}' is not allowed in policy", param_key))
+					Error::bad_request(format!("Key '{}' is not allowed in policy", param_key))
 				})?;
 				for cond in conds {
 					let ok = match cond {
@@ -201,7 +208,7 @@ pub async fn handle_post_object(
 						Operation::StartsWith(s) => value.to_str()?.starts_with(s.as_str()),
 					};
 					if !ok {
-						return Err(Error::BadRequest(format!(
+						return Err(Error::bad_request(format!(
 							"Key '{}' has value not allowed in policy",
 							param_key
 						)));
@@ -212,7 +219,7 @@ pub async fn handle_post_object(
 	}
 
 	if let Some((param_key, _)) = conditions.params.iter().next() {
-		return Err(Error::BadRequest(format!(
+		return Err(Error::bad_request(format!(
 			"Key '{}' is required in policy, but no value was provided",
 			param_key
 		)));
@@ -225,7 +232,7 @@ pub async fn handle_post_object(
 		garage,
 		headers,
 		StreamLimiter::new(stream, conditions.content_length),
-		bucket_id,
+		&bucket,
 		&key,
 		None,
 		None,
@@ -242,7 +249,7 @@ pub async fn handle_post_object(
 	{
 		target
 			.query_pairs_mut()
-			.append_pair("bucket", &bucket)
+			.append_pair("bucket", &bucket_name)
 			.append_pair("key", &key)
 			.append_pair("etag", &etag);
 		let target = target.to_string();
@@ -287,7 +294,7 @@ pub async fn handle_post_object(
 				let xml = s3_xml::PostObject {
 					xmlns: (),
 					location: s3_xml::Value(location),
-					bucket: s3_xml::Value(bucket),
+					bucket: s3_xml::Value(bucket_name),
 					key: s3_xml::Value(key),
 					etag: s3_xml::Value(etag),
 				};
@@ -318,7 +325,7 @@ impl Policy {
 			match condition {
 				PolicyCondition::Equal(map) => {
 					if map.len() != 1 {
-						return Err(Error::BadRequest("Invalid policy item".to_owned()));
+						return Err(Error::bad_request("Invalid policy item"));
 					}
 					let (mut k, v) = map.into_iter().next().expect("size was verified");
 					k.make_ascii_lowercase();
@@ -326,7 +333,7 @@ impl Policy {
 				}
 				PolicyCondition::OtherOp([cond, mut key, value]) => {
 					if key.remove(0) != '$' {
-						return Err(Error::BadRequest("Invalid policy item".to_owned()));
+						return Err(Error::bad_request("Invalid policy item"));
 					}
 					key.make_ascii_lowercase();
 					match cond.as_str() {
@@ -339,7 +346,7 @@ impl Policy {
 								.or_default()
 								.push(Operation::StartsWith(value));
 						}
-						_ => return Err(Error::BadRequest("Invalid policy item".to_owned())),
+						_ => return Err(Error::bad_request("Invalid policy item")),
 					}
 				}
 				PolicyCondition::SizeRange(key, min, max) => {
@@ -347,7 +354,7 @@ impl Policy {
 						length.0 = length.0.max(min);
 						length.1 = length.1.min(max);
 					} else {
-						return Err(Error::BadRequest("Invalid policy item".to_owned()));
+						return Err(Error::bad_request("Invalid policy item"));
 					}
 				}
 			}
@@ -412,15 +419,15 @@ where
 				self.read += bytes.len() as u64;
 				// optimization to fail early when we know before the end it's too long
 				if self.length.end() < &self.read {
-					return Poll::Ready(Some(Err(Error::BadRequest(
-						"File size does not match policy".to_owned(),
+					return Poll::Ready(Some(Err(Error::bad_request(
+						"File size does not match policy",
 					))));
 				}
 			}
 			Poll::Ready(None) => {
 				if !self.length.contains(&self.read) {
-					return Poll::Ready(Some(Err(Error::BadRequest(
-						"File size does not match policy".to_owned(),
+					return Poll::Ready(Some(Err(Error::bad_request(
+						"File size does not match policy",
 					))));
 				}
 			}
diff --git a/src/api/s3_put.rs b/src/api/s3/put.rs
index ed0bf00b..97b8e4e3 100644
--- a/src/api/s3_put.rs
+++ b/src/api/s3/put.rs
@@ -1,4 +1,4 @@
-use std::collections::{BTreeMap, BTreeSet, VecDeque};
+use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::sync::Arc;
 
 use futures::prelude::*;
@@ -8,25 +8,34 @@ use hyper::{Request, Response};
 use md5::{digest::generic_array::*, Digest as Md5Digest, Md5};
 use sha2::Sha256;
 
+use opentelemetry::{
+	trace::{FutureExt as OtelFutureExt, TraceContextExt, Tracer},
+	Context,
+};
+
+use garage_rpc::netapp::bytes_buf::BytesBuf;
 use garage_table::*;
+use garage_util::async_hash::*;
 use garage_util::data::*;
 use garage_util::error::Error as GarageError;
 use garage_util::time::*;
 
 use garage_block::manager::INLINE_THRESHOLD;
-use garage_model::block_ref_table::*;
+use garage_model::bucket_table::Bucket;
 use garage_model::garage::Garage;
-use garage_model::object_table::*;
-use garage_model::version_table::*;
+use garage_model::index_counter::CountedItem;
+use garage_model::s3::block_ref_table::*;
+use garage_model::s3::object_table::*;
+use garage_model::s3::version_table::*;
 
-use crate::error::*;
-use crate::s3_xml;
+use crate::s3::error::*;
+use crate::s3::xml as s3_xml;
 use crate::signature::verify_signed_content;
 
 pub async fn handle_put(
 	garage: Arc<Garage>,
 	req: Request<Body>,
-	bucket_id: Uuid,
+	bucket: &Bucket,
 	key: &str,
 	content_sha256: Option<Hash>,
 ) -> Result<Response<Body>, Error> {
@@ -46,7 +55,7 @@ pub async fn handle_put(
 		garage,
 		headers,
 		body,
-		bucket_id,
+		bucket,
 		key,
 		content_md5,
 		content_sha256,
@@ -59,7 +68,7 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
 	garage: Arc<Garage>,
 	headers: ObjectVersionHeaders,
 	body: S,
-	bucket_id: Uuid,
+	bucket: &Bucket,
 	key: &str,
 	content_md5: Option<String>,
 	content_sha256: Option<FixedBytes32>,
@@ -80,6 +89,7 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
 		let data_md5sum_hex = hex::encode(data_md5sum);
 
 		let data_sha256sum = sha256sum(&first_block[..]);
+		let size = first_block.len() as u64;
 
 		ensure_checksum_matches(
 			data_md5sum.as_slice(),
@@ -88,20 +98,22 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
 			content_sha256,
 		)?;
 
+		check_quotas(&garage, bucket, key, size).await?;
+
 		let object_version = ObjectVersion {
 			uuid: version_uuid,
 			timestamp: version_timestamp,
 			state: ObjectVersionState::Complete(ObjectVersionData::Inline(
 				ObjectVersionMeta {
 					headers,
-					size: first_block.len() as u64,
+					size,
 					etag: data_md5sum_hex.clone(),
 				},
-				first_block,
+				first_block.to_vec(),
 			)),
 		};
 
-		let object = Object::new(bucket_id, key.into(), vec![object_version]);
+		let object = Object::new(bucket.id, key.into(), vec![object_version]);
 		garage.object_table.insert(&object).await?;
 
 		return Ok((version_uuid, data_md5sum_hex));
@@ -114,36 +126,42 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
 		timestamp: version_timestamp,
 		state: ObjectVersionState::Uploading(headers.clone()),
 	};
-	let object = Object::new(bucket_id, key.into(), vec![object_version.clone()]);
+	let object = Object::new(bucket.id, key.into(), vec![object_version.clone()]);
 	garage.object_table.insert(&object).await?;
 
 	// Initialize corresponding entry in version table
 	// Write this entry now, even with empty block list,
 	// to prevent block_ref entries from being deleted (they can be deleted
 	// if the reference a version that isn't found in the version table)
-	let version = Version::new(version_uuid, bucket_id, key.into(), false);
+	let version = Version::new(version_uuid, bucket.id, key.into(), false);
 	garage.version_table.insert(&version).await?;
 
 	// Transfer data and verify checksum
-	let first_block_hash = blake2sum(&first_block[..]);
-	let tx_result = read_and_put_blocks(
-		&garage,
-		&version,
-		1,
-		first_block,
-		first_block_hash,
-		&mut chunker,
-	)
-	.await
-	.and_then(|(total_size, data_md5sum, data_sha256sum)| {
+	let first_block_hash = async_blake2sum(first_block.clone()).await;
+
+	let tx_result = (|| async {
+		let (total_size, data_md5sum, data_sha256sum) = read_and_put_blocks(
+			&garage,
+			&version,
+			1,
+			first_block,
+			first_block_hash,
+			&mut chunker,
+		)
+		.await?;
+
 		ensure_checksum_matches(
 			data_md5sum.as_slice(),
 			data_sha256sum,
 			content_md5.as_deref(),
 			content_sha256,
-		)
-		.map(|()| (total_size, data_md5sum))
-	});
+		)?;
+
+		check_quotas(&garage, bucket, key, total_size).await?;
+
+		Ok((total_size, data_md5sum))
+	})()
+	.await;
 
 	// If something went wrong, clean up
 	let (total_size, md5sum_arr) = match tx_result {
@@ -151,7 +169,7 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
 		Err(e) => {
 			// Mark object as aborted, this will free the blocks further down
 			object_version.state = ObjectVersionState::Aborted;
-			let object = Object::new(bucket_id, key.into(), vec![object_version.clone()]);
+			let object = Object::new(bucket.id, key.into(), vec![object_version.clone()]);
 			garage.object_table.insert(&object).await?;
 			return Err(e);
 		}
@@ -167,7 +185,7 @@ pub(crate) async fn save_stream<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
 		},
 		first_block_hash,
 	));
-	let object = Object::new(bucket_id, key.into(), vec![object_version]);
+	let object = Object::new(bucket.id, key.into(), vec![object_version]);
 	garage.object_table.insert(&object).await?;
 
 	Ok((version_uuid, md5sum_hex))
@@ -183,8 +201,8 @@ fn ensure_checksum_matches(
 ) -> Result<(), Error> {
 	if let Some(expected_sha256) = content_sha256 {
 		if expected_sha256 != data_sha256sum {
-			return Err(Error::BadRequest(
-				"Unable to validate x-amz-content-sha256".to_string(),
+			return Err(Error::bad_request(
+				"Unable to validate x-amz-content-sha256",
 			));
 		} else {
 			trace!("Successfully validated x-amz-content-sha256");
@@ -192,9 +210,7 @@ fn ensure_checksum_matches(
 	}
 	if let Some(expected_md5) = content_md5 {
 		if expected_md5.trim_matches('"') != base64::encode(data_md5sum) {
-			return Err(Error::BadRequest(
-				"Unable to validate content-md5".to_string(),
-			));
+			return Err(Error::bad_request("Unable to validate content-md5"));
 		} else {
 			trace!("Successfully validated content-md5");
 		}
@@ -202,18 +218,85 @@ fn ensure_checksum_matches(
 	Ok(())
 }
 
+/// Check that inserting this object with this size doesn't exceed bucket quotas
+async fn check_quotas(
+	garage: &Arc<Garage>,
+	bucket: &Bucket,
+	key: &str,
+	size: u64,
+) -> Result<(), Error> {
+	let quotas = bucket.state.as_option().unwrap().quotas.get();
+	if quotas.max_objects.is_none() && quotas.max_size.is_none() {
+		return Ok(());
+	};
+
+	let key = key.to_string();
+	let (prev_object, counters) = futures::try_join!(
+		garage.object_table.get(&bucket.id, &key),
+		garage.object_counter_table.table.get(&bucket.id, &EmptyKey),
+	)?;
+
+	let counters = counters
+		.map(|x| x.filtered_values(&garage.system.ring.borrow()))
+		.unwrap_or_default();
+
+	let (prev_cnt_obj, prev_cnt_size) = match prev_object {
+		Some(o) => {
+			let prev_cnt = o.counts().into_iter().collect::<HashMap<_, _>>();
+			(
+				prev_cnt.get(OBJECTS).cloned().unwrap_or_default(),
+				prev_cnt.get(BYTES).cloned().unwrap_or_default(),
+			)
+		}
+		None => (0, 0),
+	};
+	let cnt_obj_diff = 1 - prev_cnt_obj;
+	let cnt_size_diff = size as i64 - prev_cnt_size;
+
+	if let Some(mo) = quotas.max_objects {
+		let current_objects = counters.get(OBJECTS).cloned().unwrap_or_default();
+		if cnt_obj_diff > 0 && current_objects + cnt_obj_diff > mo as i64 {
+			return Err(Error::forbidden(format!(
+				"Object quota is reached, maximum objects for this bucket: {}",
+				mo
+			)));
+		}
+	}
+
+	if let Some(ms) = quotas.max_size {
+		let current_size = counters.get(BYTES).cloned().unwrap_or_default();
+		if cnt_size_diff > 0 && current_size + cnt_size_diff > ms as i64 {
+			return Err(Error::forbidden(format!(
+				"Bucket size quota is reached, maximum total size of objects for this bucket: {}. The bucket is already {} bytes, and this object would add {} bytes.",
+				ms, current_size, size
+			)));
+		}
+	}
+
+	Ok(())
+}
+
 async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
 	garage: &Garage,
 	version: &Version,
 	part_number: u64,
-	first_block: Vec<u8>,
+	first_block: Bytes,
 	first_block_hash: Hash,
 	chunker: &mut StreamChunker<S>,
 ) -> Result<(u64, GenericArray<u8, typenum::U16>, Hash), Error> {
-	let mut md5hasher = Md5::new();
-	let mut sha256hasher = Sha256::new();
-	md5hasher.update(&first_block[..]);
-	sha256hasher.update(&first_block[..]);
+	let tracer = opentelemetry::global::tracer("garage");
+
+	let md5hasher = AsyncHasher::<Md5>::new();
+	let sha256hasher = AsyncHasher::<Sha256>::new();
+
+	futures::future::join(
+		md5hasher.update(first_block.clone()),
+		sha256hasher.update(first_block.clone()),
+	)
+	.with_context(Context::current_with_span(
+		tracer.start("Hash first block (md5, sha256)"),
+	))
+	.await;
 
 	let mut next_offset = first_block.len();
 	let mut put_curr_version_block = put_block_meta(
@@ -235,9 +318,15 @@ async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
 			chunker.next(),
 		)?;
 		if let Some(block) = next_block {
-			md5hasher.update(&block[..]);
-			sha256hasher.update(&block[..]);
-			let block_hash = blake2sum(&block[..]);
+			let (_, _, block_hash) = futures::future::join3(
+				md5hasher.update(block.clone()),
+				sha256hasher.update(block.clone()),
+				async_blake2sum(block.clone()),
+			)
+			.with_context(Context::current_with_span(
+				tracer.start("Hash block (md5, sha256, blake2)"),
+			))
+			.await;
 			let block_len = block.len();
 			put_curr_version_block = put_block_meta(
 				garage,
@@ -255,9 +344,9 @@ async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> + Unpin>(
 	}
 
 	let total_size = next_offset as u64;
-	let data_md5sum = md5hasher.finalize();
+	let data_md5sum = md5hasher.finalize().await;
 
-	let data_sha256sum = sha256hasher.finalize();
+	let data_sha256sum = sha256hasher.finalize().await;
 	let data_sha256sum = Hash::try_from(&data_sha256sum[..]).unwrap();
 
 	Ok((total_size, data_md5sum, data_sha256sum))
@@ -297,7 +386,7 @@ struct StreamChunker<S: Stream<Item = Result<Bytes, Error>>> {
 	stream: S,
 	read_all: bool,
 	block_size: usize,
-	buf: VecDeque<u8>,
+	buf: BytesBuf,
 }
 
 impl<S: Stream<Item = Result<Bytes, Error>> + Unpin> StreamChunker<S> {
@@ -306,11 +395,11 @@ impl<S: Stream<Item = Result<Bytes, Error>> + Unpin> StreamChunker<S> {
 			stream,
 			read_all: false,
 			block_size,
-			buf: VecDeque::with_capacity(2 * block_size),
+			buf: BytesBuf::new(),
 		}
 	}
 
-	async fn next(&mut self) -> Result<Option<Vec<u8>>, Error> {
+	async fn next(&mut self) -> Result<Option<Bytes>, Error> {
 		while !self.read_all && self.buf.len() < self.block_size {
 			if let Some(block) = self.stream.next().await {
 				let bytes = block?;
@@ -323,12 +412,8 @@ impl<S: Stream<Item = Result<Bytes, Error>> + Unpin> StreamChunker<S> {
 
 		if self.buf.is_empty() {
 			Ok(None)
-		} else if self.buf.len() <= self.block_size {
-			let block = self.buf.drain(..).collect::<Vec<u8>>();
-			Ok(Some(block))
 		} else {
-			let block = self.buf.drain(..self.block_size).collect::<Vec<u8>>();
-			Ok(Some(block))
+			Ok(Some(self.buf.take_max(self.block_size)))
 		}
 	}
 }
@@ -428,7 +513,7 @@ pub async fn handle_put_part(
 	// Check part hasn't already been uploaded
 	if let Some(v) = version {
 		if v.has_part_number(part_number) {
-			return Err(Error::BadRequest(format!(
+			return Err(Error::bad_request(format!(
 				"Part number {} has already been uploaded",
 				part_number
 			)));
@@ -437,7 +522,9 @@ pub async fn handle_put_part(
 
 	// Copy block to store
 	let version = Version::new(version_uuid, bucket_id, key, false);
-	let first_block_hash = blake2sum(&first_block[..]);
+
+	let first_block_hash = async_blake2sum(first_block.clone()).await;
+
 	let (_, data_md5sum, data_sha256sum) = read_and_put_blocks(
 		&garage,
 		&version,
@@ -475,7 +562,7 @@ pub async fn handle_complete_multipart_upload(
 	garage: Arc<Garage>,
 	req: Request<Body>,
 	bucket_name: &str,
-	bucket_id: Uuid,
+	bucket: &Bucket,
 	key: &str,
 	upload_id: &str,
 	content_sha256: Option<Hash>,
@@ -499,7 +586,7 @@ pub async fn handle_complete_multipart_upload(
 	// Get object and version
 	let key = key.to_string();
 	let (object, version) = futures::try_join!(
-		garage.object_table.get(&bucket_id, &key),
+		garage.object_table.get(&bucket.id, &key),
 		garage.version_table.get(&version_uuid, &EmptyKey),
 	)?;
 
@@ -513,7 +600,7 @@ pub async fn handle_complete_multipart_upload(
 
 	let version = version.ok_or(Error::NoSuchKey)?;
 	if version.blocks.is_empty() {
-		return Err(Error::BadRequest("No data was uploaded".to_string()));
+		return Err(Error::bad_request("No data was uploaded"));
 	}
 
 	let headers = match object_version.state {
@@ -574,8 +661,8 @@ pub async fn handle_complete_multipart_upload(
 		.map(|x| x.part_number)
 		.eq(block_parts.into_iter());
 	if !same_parts {
-		return Err(Error::BadRequest(
-			"Part numbers in block list and part list do not match. This can happen if a part was partially uploaded. Please abort the multipart upload and try again.".into(),
+		return Err(Error::bad_request(
+			"Part numbers in block list and part list do not match. This can happen if a part was partially uploaded. Please abort the multipart upload and try again."
 		));
 	}
 
@@ -592,6 +679,14 @@ pub async fn handle_complete_multipart_upload(
 	// Calculate total size of final object
 	let total_size = version.blocks.items().iter().map(|x| x.1.size).sum();
 
+	if let Err(e) = check_quotas(&garage, bucket, &key, total_size).await {
+		object_version.state = ObjectVersionState::Aborted;
+		let final_object = Object::new(bucket.id, key.clone(), vec![object_version]);
+		garage.object_table.insert(&final_object).await?;
+
+		return Err(e);
+	}
+
 	// Write final object version
 	object_version.state = ObjectVersionState::Complete(ObjectVersionData::FirstBlock(
 		ObjectVersionMeta {
@@ -602,7 +697,7 @@ pub async fn handle_complete_multipart_upload(
 		version.blocks.items()[0].1.hash,
 	));
 
-	let final_object = Object::new(bucket_id, key.clone(), vec![object_version]);
+	let final_object = Object::new(bucket.id, key.clone(), vec![object_version]);
 	garage.object_table.insert(&final_object).await?;
 
 	// Send response saying ok we're done
diff --git a/src/api/s3_router.rs b/src/api/s3/router.rs
index 95a7eceb..44f581ff 100644
--- a/src/api/s3_router.rs
+++ b/src/api/s3/router.rs
@@ -1,131 +1,13 @@
-use crate::error::{Error, OkOrBadRequest};
-
 use std::borrow::Cow;
 
 use hyper::header::HeaderValue;
 use hyper::{HeaderMap, Method, Request};
 
-/// This macro is used to generate very repetitive match {} blocks in this module
-/// It is _not_ made to be used anywhere else
-macro_rules! s3_match {
-    (@match $enum:expr , [ $($endpoint:ident,)* ]) => {{
-        // usage: s3_match {@match my_enum, [ VariantWithField1, VariantWithField2 ..] }
-        // returns true if the variant was one of the listed variants, false otherwise.
-        use Endpoint::*;
-        match $enum {
-            $(
-            $endpoint { .. } => true,
-            )*
-            _ => false
-        }
-    }};
-    (@extract $enum:expr , $param:ident, [ $($endpoint:ident,)* ]) => {{
-        // usage: s3_match {@extract my_enum, field_name, [ VariantWithField1, VariantWithField2 ..] }
-        // returns Some(field_value), or None if the variant was not one of the listed variants.
-        use Endpoint::*;
-        match $enum {
-            $(
-            $endpoint {$param, ..} => Some($param),
-            )*
-            _ => None
-        }
-    }};
-    (@gen_parser ($keyword:expr, $key:expr, $query:expr, $header:expr),
-        key: [$($kw_k:ident $(if $required_k:ident)? $(header $header_k:expr)? => $api_k:ident $(($($conv_k:ident :: $param_k:ident),*))?,)*],
-        no_key: [$($kw_nk:ident $(if $required_nk:ident)? $(if_header $header_nk:expr)? => $api_nk:ident $(($($conv_nk:ident :: $param_nk:ident),*))?,)*]) => {{
-        // usage: s3_match {@gen_parser (keyword, key, query, header),
-        //   key: [
-        //      SOME_KEYWORD => VariantWithKey,
-        //      ...
-        //   ],
-        //   no_key: [
-        //      SOME_KEYWORD => VariantWithoutKey,
-        //      ...
-        //   ]
-        // }
-        // See in from_{method} for more detailed usage.
-        use Endpoint::*;
-        use keywords::*;
-        match ($keyword, !$key.is_empty()){
-            $(
-            ($kw_k, true) if true $(&& $query.$required_k.is_some())? $(&& $header.contains_key($header_k))? => Ok($api_k {
-                key: $key,
-                $($(
-                    $param_k: s3_match!(@@parse_param $query, $conv_k, $param_k),
-                )*)?
-            }),
-            )*
-            $(
-            ($kw_nk, false) $(if $query.$required_nk.is_some())? $(if $header.contains($header_nk))? => Ok($api_nk {
-                $($(
-                    $param_nk: s3_match!(@@parse_param $query, $conv_nk, $param_nk),
-                )*)?
-            }),
-            )*
-            (kw, _) => Err(Error::BadRequest(format!("Invalid endpoint: {}", kw)))
-        }
-    }};
+use crate::helpers::Authorization;
+use crate::router_macros::{generateQueryParameters, router_match};
+use crate::s3::error::*;
 
-    (@@parse_param $query:expr, query_opt, $param:ident) => {{
-        // extract optional query parameter
-		$query.$param.take().map(|param| param.into_owned())
-    }};
-    (@@parse_param $query:expr, query, $param:ident) => {{
-        // extract mendatory query parameter
-        $query.$param.take().ok_or_bad_request("Missing argument for endpoint")?.into_owned()
-    }};
-    (@@parse_param $query:expr, opt_parse, $param:ident) => {{
-        // extract and parse optional query parameter
-        // missing parameter is file, however parse error is reported as an error
-		$query.$param
-            .take()
-            .map(|param| param.parse())
-            .transpose()
-            .map_err(|_| Error::BadRequest("Failed to parse query parameter".to_owned()))?
-    }};
-    (@@parse_param $query:expr, parse, $param:ident) => {{
-        // extract and parse mandatory query parameter
-        // both missing and un-parseable parameters are reported as errors
-        $query.$param.take().ok_or_bad_request("Missing argument for endpoint")?
-            .parse()
-            .map_err(|_| Error::BadRequest("Failed to parse query parameter".to_owned()))?
-    }};
-    (@func
-    $(#[$doc:meta])*
-     pub enum Endpoint {
-        $(
-            $(#[$outer:meta])*
-            $variant:ident $({
-                $($name:ident: $ty:ty,)*
-            })?,
-        )*
-    }) => {
-    $(#[$doc])*
-        pub enum Endpoint {
-            $(
-                $(#[$outer])*
-                $variant $({
-                    $($name: $ty, )*
-                })?,
-            )*
-        }
-        impl Endpoint {
-            pub fn name(&self) -> &'static str {
-                match self {
-                    $(Endpoint::$variant $({ $($name: _,)* .. })? => stringify!($variant),)*
-                }
-            }
-        }
-    };
-    (@if ($($cond:tt)+) then ($($then:tt)*) else ($($else:tt)*)) => {
-        $($then)*
-    };
-    (@if () then ($($then:tt)*) else ($($else:tt)*)) => {
-        $($else)*
-    };
-}
-
-s3_match! {@func
+router_match! {@func
 
 /// List of all S3 API endpoints.
 ///
@@ -460,7 +342,7 @@ impl Endpoint {
 			Method::POST => Self::from_post(key, &mut query)?,
 			Method::PUT => Self::from_put(key, &mut query, req.headers())?,
 			Method::DELETE => Self::from_delete(key, &mut query)?,
-			_ => return Err(Error::BadRequest("Unknown method".to_owned())),
+			_ => return Err(Error::bad_request("Unknown method")),
 		};
 
 		if let Some(message) = query.nonempty_message() {
@@ -471,7 +353,7 @@ impl Endpoint {
 
 	/// Determine which endpoint a request is for, knowing it is a GET.
 	fn from_get(key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
-		s3_match! {
+		router_match! {
 			@gen_parser
 			(query.keyword.take().unwrap_or_default().as_ref(), key, query, None),
 			key: [
@@ -528,7 +410,7 @@ impl Endpoint {
 
 	/// Determine which endpoint a request is for, knowing it is a HEAD.
 	fn from_head(key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
-		s3_match! {
+		router_match! {
 			@gen_parser
 			(query.keyword.take().unwrap_or_default().as_ref(), key, query, None),
 			key: [
@@ -542,7 +424,7 @@ impl Endpoint {
 
 	/// Determine which endpoint a request is for, knowing it is a POST.
 	fn from_post(key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
-		s3_match! {
+		router_match! {
 			@gen_parser
 			(query.keyword.take().unwrap_or_default().as_ref(), key, query, None),
 			key: [
@@ -564,7 +446,7 @@ impl Endpoint {
 		query: &mut QueryParameters<'_>,
 		headers: &HeaderMap<HeaderValue>,
 	) -> Result<Self, Error> {
-		s3_match! {
+		router_match! {
 			@gen_parser
 			(query.keyword.take().unwrap_or_default().as_ref(), key, query, headers),
 			key: [
@@ -606,7 +488,7 @@ impl Endpoint {
 
 	/// Determine which endpoint a request is for, knowing it is a DELETE.
 	fn from_delete(key: String, query: &mut QueryParameters<'_>) -> Result<Self, Error> {
-		s3_match! {
+		router_match! {
 			@gen_parser
 			(query.keyword.take().unwrap_or_default().as_ref(), key, query, None),
 			key: [
@@ -636,7 +518,7 @@ impl Endpoint {
 	/// Get the key the request target. Returns None for requests which don't use a key.
 	#[allow(dead_code)]
 	pub fn get_key(&self) -> Option<&str> {
-		s3_match! {
+		router_match! {
 			@extract
 			self,
 			key,
@@ -673,7 +555,7 @@ impl Endpoint {
 		if let Endpoint::ListBuckets = self {
 			return Authorization::None;
 		};
-		let readonly = s3_match! {
+		let readonly = router_match! {
 			@match
 			self,
 			[
@@ -717,7 +599,7 @@ impl Endpoint {
 				SelectObjectContent,
 			]
 		};
-		let owner = s3_match! {
+		let owner = router_match! {
 			@match
 			self,
 			[
@@ -740,87 +622,6 @@ impl Endpoint {
 	}
 }
 
-/// What kind of authorization is required to perform a given action
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum Authorization {
-	/// No authorization is required
-	None,
-	/// Having Read permission on bucket
-	Read,
-	/// Having Write permission on bucket
-	Write,
-	/// Having Owner permission on bucket
-	Owner,
-}
-
-/// This macro is used to generate part of the code in this module. It must be called only one, and
-/// is useless outside of this module.
-macro_rules! generateQueryParameters {
-    ( $($rest:expr => $name:ident),* ) => {
-        /// Struct containing all query parameters used in endpoints. Think of it as an HashMap,
-        /// but with keys statically known.
-        #[derive(Debug, Default)]
-        struct QueryParameters<'a> {
-            keyword: Option<Cow<'a, str>>,
-            $(
-            $name: Option<Cow<'a, str>>,
-            )*
-        }
-
-        impl<'a> QueryParameters<'a> {
-            /// Build this struct from the query part of an URI.
-            fn from_query(query: &'a str) -> Result<Self, Error> {
-                let mut res: Self = Default::default();
-                for (k, v) in url::form_urlencoded::parse(query.as_bytes()) {
-                    let repeated = match k.as_ref() {
-                        $(
-                            $rest => if !v.is_empty() {
-                                res.$name.replace(v).is_some()
-                            } else {
-                                false
-                            },
-                        )*
-                        _ => {
-                            if k.starts_with("response-") || k.starts_with("X-Amz-") {
-                                false
-                            } else if v.as_ref().is_empty() {
-                                if res.keyword.replace(k).is_some() {
-                                    return Err(Error::BadRequest("Multiple keywords".to_owned()));
-                                }
-                                continue;
-                            } else {
-                                debug!("Received an unknown query parameter: '{}'", k);
-                                false
-                            }
-                        }
-                    };
-                    if repeated {
-                        return Err(Error::BadRequest(format!(
-                            "Query parameter repeated: '{}'",
-                            k
-                        )));
-                    }
-                }
-                Ok(res)
-            }
-
-            /// Get an error message in case not all parameters where used when extracting them to
-            /// build an Enpoint variant
-            fn nonempty_message(&self) -> Option<&str> {
-                if self.keyword.is_some() {
-                    Some("Keyword not used")
-                } $(
-                    else if self.$name.is_some() {
-                        Some(concat!("'", $rest, "'"))
-                    }
-                )* else {
-                    None
-                }
-            }
-        }
-    }
-}
-
 // parameter name => struct field
 generateQueryParameters! {
 	"continuation-token" => continuation_token,
diff --git a/src/api/s3_website.rs b/src/api/s3/website.rs
index b464dd45..77738971 100644
--- a/src/api/s3_website.rs
+++ b/src/api/s3/website.rs
@@ -4,13 +4,12 @@ use std::sync::Arc;
 use hyper::{Body, Request, Response, StatusCode};
 use serde::{Deserialize, Serialize};
 
-use crate::error::*;
-use crate::s3_xml::{to_xml_with_header, xmlns_tag, IntValue, Value};
+use crate::s3::error::*;
+use crate::s3::xml::{to_xml_with_header, xmlns_tag, IntValue, Value};
 use crate::signature::verify_signed_content;
 
 use garage_model::bucket_table::*;
 use garage_model::garage::Garage;
-use garage_table::*;
 use garage_util::data::*;
 
 pub async fn handle_get_website(bucket: &Bucket) -> Result<Response<Body>, Error> {
@@ -47,14 +46,11 @@ pub async fn handle_delete_website(
 	bucket_id: Uuid,
 ) -> Result<Response<Body>, Error> {
 	let mut bucket = garage
-		.bucket_table
-		.get(&EmptyKey, &bucket_id)
-		.await?
-		.ok_or(Error::NoSuchBucket)?;
+		.bucket_helper()
+		.get_existing_bucket(bucket_id)
+		.await?;
 
-	let param = bucket
-		.params_mut()
-		.ok_or_internal_error("Bucket should not be deleted at this point")?;
+	let param = bucket.params_mut().unwrap();
 
 	param.website_config.update(None);
 	garage.bucket_table.insert(&bucket).await?;
@@ -77,14 +73,11 @@ pub async fn handle_put_website(
 	}
 
 	let mut bucket = garage
-		.bucket_table
-		.get(&EmptyKey, &bucket_id)
-		.await?
-		.ok_or(Error::NoSuchBucket)?;
+		.bucket_helper()
+		.get_existing_bucket(bucket_id)
+		.await?;
 
-	let param = bucket
-		.params_mut()
-		.ok_or_internal_error("Bucket should not be deleted at this point")?;
+	let param = bucket.params_mut().unwrap();
 
 	let conf: WebsiteConfiguration = from_reader(&body as &[u8])?;
 	conf.validate()?;
@@ -176,8 +169,8 @@ impl WebsiteConfiguration {
 				|| self.index_document.is_some()
 				|| self.routing_rules.is_some())
 		{
-			return Err(Error::BadRequest(
-				"Bad XML: can't have RedirectAllRequestsTo and other fields".to_owned(),
+			return Err(Error::bad_request(
+				"Bad XML: can't have RedirectAllRequestsTo and other fields",
 			));
 		}
 		if let Some(ref ed) = self.error_document {
@@ -222,8 +215,8 @@ impl WebsiteConfiguration {
 impl Key {
 	pub fn validate(&self) -> Result<(), Error> {
 		if self.key.0.is_empty() {
-			Err(Error::BadRequest(
-				"Bad XML: error document specified but empty".to_owned(),
+			Err(Error::bad_request(
+				"Bad XML: error document specified but empty",
 			))
 		} else {
 			Ok(())
@@ -234,8 +227,8 @@ impl Key {
 impl Suffix {
 	pub fn validate(&self) -> Result<(), Error> {
 		if self.suffix.0.is_empty() | self.suffix.0.contains('/') {
-			Err(Error::BadRequest(
-				"Bad XML: index document is empty or contains /".to_owned(),
+			Err(Error::bad_request(
+				"Bad XML: index document is empty or contains /",
 			))
 		} else {
 			Ok(())
@@ -247,7 +240,7 @@ impl Target {
 	pub fn validate(&self) -> Result<(), Error> {
 		if let Some(ref protocol) = self.protocol {
 			if protocol.0 != "http" && protocol.0 != "https" {
-				return Err(Error::BadRequest("Bad XML: invalid protocol".to_owned()));
+				return Err(Error::bad_request("Bad XML: invalid protocol"));
 			}
 		}
 		Ok(())
@@ -269,19 +262,19 @@ impl Redirect {
 	pub fn validate(&self, has_prefix: bool) -> Result<(), Error> {
 		if self.replace_prefix.is_some() {
 			if self.replace_full.is_some() {
-				return Err(Error::BadRequest(
-					"Bad XML: both ReplaceKeyPrefixWith and ReplaceKeyWith are set".to_owned(),
+				return Err(Error::bad_request(
+					"Bad XML: both ReplaceKeyPrefixWith and ReplaceKeyWith are set",
 				));
 			}
 			if !has_prefix {
-				return Err(Error::BadRequest(
-					"Bad XML: ReplaceKeyPrefixWith is set, but  KeyPrefixEquals isn't".to_owned(),
+				return Err(Error::bad_request(
+					"Bad XML: ReplaceKeyPrefixWith is set, but  KeyPrefixEquals isn't",
 				));
 			}
 		}
 		if let Some(ref protocol) = self.protocol {
 			if protocol.0 != "http" && protocol.0 != "https" {
-				return Err(Error::BadRequest("Bad XML: invalid protocol".to_owned()));
+				return Err(Error::bad_request("Bad XML: invalid protocol"));
 			}
 		}
 		// TODO there are probably more invalide cases, but which ones?
diff --git a/src/api/s3_xml.rs b/src/api/s3/xml.rs
index 75ec4559..06f11288 100644
--- a/src/api/s3_xml.rs
+++ b/src/api/s3/xml.rs
@@ -1,7 +1,7 @@
 use quick_xml::se::to_string;
 use serde::{Deserialize, Serialize, Serializer};
 
-use crate::Error as ApiError;
+use crate::s3::error::Error as ApiError;
 
 pub fn to_xml_with_header<T: Serialize>(x: &T) -> Result<String, ApiError> {
 	let mut xml = r#"<?xml version="1.0" encoding="UTF-8"?>"#.to_string();
@@ -25,7 +25,7 @@ impl From<&str> for Value {
 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
 pub struct IntValue(#[serde(rename = "$value")] pub i64);
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct Bucket {
 	#[serde(rename = "CreationDate")]
 	pub creation_date: Value,
@@ -33,7 +33,7 @@ pub struct Bucket {
 	pub name: Value,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct Owner {
 	#[serde(rename = "DisplayName")]
 	pub display_name: Value,
@@ -41,13 +41,13 @@ pub struct Owner {
 	pub id: Value,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct BucketList {
 	#[serde(rename = "Bucket")]
 	pub entries: Vec<Bucket>,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct ListAllMyBucketsResult {
 	#[serde(rename = "Buckets")]
 	pub buckets: BucketList,
@@ -55,7 +55,7 @@ pub struct ListAllMyBucketsResult {
 	pub owner: Owner,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct LocationConstraint {
 	#[serde(serialize_with = "xmlns_tag")]
 	pub xmlns: (),
@@ -63,7 +63,7 @@ pub struct LocationConstraint {
 	pub region: String,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct Deleted {
 	#[serde(rename = "Key")]
 	pub key: Value,
@@ -73,7 +73,7 @@ pub struct Deleted {
 	pub delete_marker_version_id: Value,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct Error {
 	#[serde(rename = "Code")]
 	pub code: Value,
@@ -85,7 +85,7 @@ pub struct Error {
 	pub region: Option<Value>,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct DeleteError {
 	#[serde(rename = "Code")]
 	pub code: Value,
@@ -97,7 +97,7 @@ pub struct DeleteError {
 	pub version_id: Option<Value>,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct DeleteResult {
 	#[serde(serialize_with = "xmlns_tag")]
 	pub xmlns: (),
@@ -107,7 +107,7 @@ pub struct DeleteResult {
 	pub errors: Vec<DeleteError>,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct InitiateMultipartUploadResult {
 	#[serde(serialize_with = "xmlns_tag")]
 	pub xmlns: (),
@@ -119,7 +119,7 @@ pub struct InitiateMultipartUploadResult {
 	pub upload_id: Value,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct CompleteMultipartUploadResult {
 	#[serde(serialize_with = "xmlns_tag")]
 	pub xmlns: (),
@@ -133,7 +133,7 @@ pub struct CompleteMultipartUploadResult {
 	pub etag: Value,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct Initiator {
 	#[serde(rename = "DisplayName")]
 	pub display_name: Value,
@@ -141,7 +141,7 @@ pub struct Initiator {
 	pub id: Value,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct ListMultipartItem {
 	#[serde(rename = "Initiated")]
 	pub initiated: Value,
@@ -157,7 +157,7 @@ pub struct ListMultipartItem {
 	pub storage_class: Value,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct ListMultipartUploadsResult {
 	#[serde(serialize_with = "xmlns_tag")]
 	pub xmlns: (),
@@ -187,7 +187,7 @@ pub struct ListMultipartUploadsResult {
 	pub encoding_type: Option<Value>,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct PartItem {
 	#[serde(rename = "ETag")]
 	pub etag: Value,
@@ -199,7 +199,7 @@ pub struct PartItem {
 	pub size: IntValue,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct ListPartsResult {
 	#[serde(serialize_with = "xmlns_tag")]
 	pub xmlns: (),
@@ -227,7 +227,7 @@ pub struct ListPartsResult {
 	pub storage_class: Value,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct ListBucketItem {
 	#[serde(rename = "Key")]
 	pub key: Value,
@@ -241,13 +241,13 @@ pub struct ListBucketItem {
 	pub storage_class: Value,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct CommonPrefix {
 	#[serde(rename = "Prefix")]
 	pub prefix: Value,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct ListBucketResult {
 	#[serde(serialize_with = "xmlns_tag")]
 	pub xmlns: (),
@@ -281,7 +281,7 @@ pub struct ListBucketResult {
 	pub common_prefixes: Vec<CommonPrefix>,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct VersioningConfiguration {
 	#[serde(serialize_with = "xmlns_tag")]
 	pub xmlns: (),
@@ -289,7 +289,7 @@ pub struct VersioningConfiguration {
 	pub status: Option<Value>,
 }
 
-#[derive(Debug, Serialize, PartialEq)]
+#[derive(Debug, Serialize, PartialEq, Eq)]
 pub struct PostObject {
 	#[serde(serialize_with = "xmlns_tag")]
 	pub xmlns: (),
diff --git a/src/api/signature/error.rs b/src/api/signature/error.rs
new file mode 100644
index 00000000..f5a067bd
--- /dev/null
+++ b/src/api/signature/error.rs
@@ -0,0 +1,36 @@
+use err_derive::Error;
+
+use crate::common_error::CommonError;
+pub use crate::common_error::{CommonErrorDerivative, OkOrBadRequest, OkOrInternalError};
+
+/// Errors that can occur while checking the signature of a request
+#[derive(Debug, Error)]
+pub enum Error {
+	#[error(display = "{}", _0)]
+	/// A generic error, wrapped from `CommonError`
+	Common(CommonError),
+
+	/// The credential scope is not the expected one; `verify_v4` passes the scope it received
+	#[error(display = "Authorization header malformed, unexpected scope: {}", _0)]
+	AuthorizationHeaderMalformed(String),
+
+	/// Bad request: the request contained an invalid UTF-8 sequence in its path or in other parameters
+	#[error(display = "Invalid UTF-8: {}", _0)]
+	InvalidUtf8Str(#[error(source)] std::str::Utf8Error),
+
+	/// Bad request: the client sent a header with an invalid value
+	#[error(display = "Invalid header value: {}", _0)]
+	InvalidHeader(#[error(source)] hyper::header::ToStrError),
+}
+
+/// Anything that converts into a `CommonError` is wrapped in `Error::Common` (lets `?` be used)
+impl<T> From<T> for Error
+where
+	CommonError: From<T>,
+{
+	fn from(err: T) -> Self {
+		Error::Common(CommonError::from(err))
+	}
+}
+
+impl CommonErrorDerivative for Error {}
diff --git a/src/api/signature/mod.rs b/src/api/signature/mod.rs
index ebdee6da..4b8b990f 100644
--- a/src/api/signature/mod.rs
+++ b/src/api/signature/mod.rs
@@ -1,14 +1,15 @@
 use chrono::{DateTime, Utc};
-use hmac::{Hmac, Mac, NewMac};
+use hmac::{Hmac, Mac};
 use sha2::Sha256;
 
 use garage_util::data::{sha256sum, Hash};
 
-use crate::error::*;
-
+pub mod error;
 pub mod payload;
 pub mod streaming;
 
+use error::*;
+
 pub const SHORT_DATE: &str = "%Y%m%d";
 pub const LONG_DATETIME: &str = "%Y%m%dT%H%M%SZ";
 
@@ -16,7 +17,7 @@ type HmacSha256 = Hmac<Sha256>;
 
 pub fn verify_signed_content(expected_sha256: Hash, body: &[u8]) -> Result<(), Error> {
 	if expected_sha256 != sha256sum(body) {
-		return Err(Error::BadRequest(
+		return Err(Error::bad_request(
 			"Request content hash does not match signed hash".to_string(),
 		));
 	}
@@ -28,20 +29,25 @@ pub fn signing_hmac(
 	secret_key: &str,
 	region: &str,
 	service: &str,
-) -> Result<HmacSha256, crypto_mac::InvalidKeyLength> {
+) -> Result<HmacSha256, crypto_common::InvalidLength> {
 	let secret = String::from("AWS4") + secret_key;
-	let mut date_hmac = HmacSha256::new_varkey(secret.as_bytes())?;
+	let mut date_hmac = HmacSha256::new_from_slice(secret.as_bytes())?;
 	date_hmac.update(datetime.format(SHORT_DATE).to_string().as_bytes());
-	let mut region_hmac = HmacSha256::new_varkey(&date_hmac.finalize().into_bytes())?;
+	let mut region_hmac = HmacSha256::new_from_slice(&date_hmac.finalize().into_bytes())?;
 	region_hmac.update(region.as_bytes());
-	let mut service_hmac = HmacSha256::new_varkey(&region_hmac.finalize().into_bytes())?;
+	let mut service_hmac = HmacSha256::new_from_slice(&region_hmac.finalize().into_bytes())?;
 	service_hmac.update(service.as_bytes());
-	let mut signing_hmac = HmacSha256::new_varkey(&service_hmac.finalize().into_bytes())?;
+	let mut signing_hmac = HmacSha256::new_from_slice(&service_hmac.finalize().into_bytes())?;
 	signing_hmac.update(b"aws4_request");
-	let hmac = HmacSha256::new_varkey(&signing_hmac.finalize().into_bytes())?;
+	let hmac = HmacSha256::new_from_slice(&signing_hmac.finalize().into_bytes())?;
 	Ok(hmac)
 }
 
-pub fn compute_scope(datetime: &DateTime<Utc>, region: &str) -> String {
-	format!("{}/{}/s3/aws4_request", datetime.format(SHORT_DATE), region,)
+/// Build the credential scope `<date>/<region>/<service>/aws4_request`,
+/// where `<date>` is `datetime` rendered with the `SHORT_DATE` format.
+pub fn compute_scope(datetime: &DateTime<Utc>, region: &str, service: &str) -> String {
+	// Render the day first so the scope template below reads left to right.
+	let day = datetime.format(SHORT_DATE);
+
+	format!("{}/{}/{}/aws4_request", day, region, service)
 }
diff --git a/src/api/signature/payload.rs b/src/api/signature/payload.rs
index 2a41b307..4c7934e5 100644
--- a/src/api/signature/payload.rs
+++ b/src/api/signature/payload.rs
@@ -11,14 +11,15 @@ use garage_util::data::Hash;
 use garage_model::garage::Garage;
 use garage_model::key_table::*;
 
-use super::signing_hmac;
-use super::{LONG_DATETIME, SHORT_DATE};
+use super::LONG_DATETIME;
+use super::{compute_scope, signing_hmac};
 
 use crate::encoding::uri_encode;
-use crate::error::*;
+use crate::signature::error::*;
 
 pub async fn check_payload_signature(
 	garage: &Garage,
+	service: &str,
 	request: &Request<Body>,
 ) -> Result<(Option<Key>, Option<Hash>), Error> {
 	let mut headers = HashMap::new();
@@ -64,6 +65,7 @@ pub async fn check_payload_signature(
 
 	let key = verify_v4(
 		garage,
+		service,
 		&authorization.credential,
 		&authorization.date,
 		&authorization.signature,
@@ -103,7 +105,7 @@ fn parse_authorization(
 	let (auth_kind, rest) = authorization.split_at(first_space);
 
 	if auth_kind != "AWS4-HMAC-SHA256" {
-		return Err(Error::BadRequest("Unsupported authorization method".into()));
+		return Err(Error::bad_request("Unsupported authorization method"));
 	}
 
 	let mut auth_params = HashMap::new();
@@ -127,10 +129,11 @@ fn parse_authorization(
 	let date = headers
 		.get("x-amz-date")
 		.ok_or_bad_request("Missing X-Amz-Date field")
+		.map_err(Error::from)
 		.and_then(|d| parse_date(d))?;
 
 	if Utc::now() - date > Duration::hours(24) {
-		return Err(Error::BadRequest("Date is too old".to_string()));
+		return Err(Error::bad_request("Date is too old".to_string()));
 	}
 
 	let auth = Authorization {
@@ -154,7 +157,7 @@ fn parse_query_authorization(
 	headers: &HashMap<String, String>,
 ) -> Result<Authorization, Error> {
 	if algorithm != "AWS4-HMAC-SHA256" {
-		return Err(Error::BadRequest(
+		return Err(Error::bad_request(
 			"Unsupported authorization method".to_string(),
 		));
 	}
@@ -177,10 +180,10 @@ fn parse_query_authorization(
 		.get("x-amz-expires")
 		.ok_or_bad_request("X-Amz-Expires not found in query parameters")?
 		.parse()
-		.map_err(|_| Error::BadRequest("X-Amz-Expires is not a number".to_string()))?;
+		.map_err(|_| Error::bad_request("X-Amz-Expires is not a number".to_string()))?;
 
 	if duration > 7 * 24 * 3600 {
-		return Err(Error::BadRequest(
+		return Err(Error::bad_request(
 			"X-Amz-Exprires may not exceed a week".to_string(),
 		));
 	}
@@ -188,10 +191,11 @@ fn parse_query_authorization(
 	let date = headers
 		.get("x-amz-date")
 		.ok_or_bad_request("Missing X-Amz-Date field")
+		.map_err(Error::from)
 		.and_then(|d| parse_date(d))?;
 
 	if Utc::now() - date > Duration::seconds(duration) {
-		return Err(Error::BadRequest("Date is too old".to_string()));
+		return Err(Error::bad_request("Date is too old".to_string()));
 	}
 
 	Ok(Authorization {
@@ -281,6 +285,7 @@ pub fn parse_date(date: &str) -> Result<DateTime<Utc>, Error> {
 
 pub async fn verify_v4(
 	garage: &Garage,
+	service: &str,
 	credential: &str,
 	date: &DateTime<Utc>,
 	signature: &str,
@@ -288,11 +293,7 @@ pub async fn verify_v4(
 ) -> Result<Key, Error> {
 	let (key_id, scope) = parse_credential(credential)?;
 
-	let scope_expected = format!(
-		"{}/{}/s3/aws4_request",
-		date.format(SHORT_DATE),
-		garage.config.s3_api.s3_region
-	);
+	let scope_expected = compute_scope(date, &garage.config.s3_api.s3_region, service);
 	if scope != scope_expected {
 		return Err(Error::AuthorizationHeaderMalformed(scope.to_string()));
 	}
@@ -302,20 +303,20 @@ pub async fn verify_v4(
 		.get(&EmptyKey, &key_id)
 		.await?
 		.filter(|k| !k.state.is_deleted())
-		.ok_or_else(|| Error::Forbidden(format!("No such key: {}", &key_id)))?;
+		.ok_or_else(|| Error::forbidden(format!("No such key: {}", &key_id)))?;
 	let key_p = key.params().unwrap();
 
 	let mut hmac = signing_hmac(
 		date,
 		&key_p.secret_key,
 		&garage.config.s3_api.s3_region,
-		"s3",
+		service,
 	)
 	.ok_or_internal_error("Unable to build signing HMAC")?;
 	hmac.update(payload);
 	let our_signature = hex::encode(hmac.finalize().into_bytes());
 	if signature != our_signature {
-		return Err(Error::Forbidden("Invalid signature".to_string()));
+		return Err(Error::forbidden("Invalid signature".to_string()));
 	}
 
 	Ok(key)
diff --git a/src/api/signature/streaming.rs b/src/api/signature/streaming.rs
index 969a45d6..c8358c4f 100644
--- a/src/api/signature/streaming.rs
+++ b/src/api/signature/streaming.rs
@@ -1,18 +1,67 @@
 use std::pin::Pin;
 
-use chrono::{DateTime, Utc};
+use chrono::{DateTime, NaiveDateTime, Utc};
 use futures::prelude::*;
 use futures::task;
+use garage_model::key_table::Key;
+use hmac::Mac;
 use hyper::body::Bytes;
+use hyper::{Body, Request};
 
 use garage_util::data::Hash;
-use hmac::Mac;
-
-use super::sha256sum;
-use super::HmacSha256;
-use super::LONG_DATETIME;
 
-use crate::error::*;
+use super::{compute_scope, sha256sum, HmacSha256, LONG_DATETIME};
+
+use crate::signature::error::*;
+
+pub fn parse_streaming_body(
+	api_key: &Key,
+	req: Request<Body>,
+	content_sha256: &mut Option<Hash>,
+	region: &str,
+	service: &str,
+) -> Result<Request<Body>, Error> {
+	match req.headers().get("x-amz-content-sha256") {
+		Some(header) if header == "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" => {
+			let signature = content_sha256
+				.take()
+				.ok_or_bad_request("No signature provided")?;
+
+			let secret_key = &api_key
+				.state
+				.as_option()
+				.ok_or_internal_error("Deleted key state")?
+				.secret_key;
+
+			let date = req
+				.headers()
+				.get("x-amz-date")
+				.ok_or_bad_request("Missing X-Amz-Date field")?
+				.to_str()?;
+			let date: NaiveDateTime = NaiveDateTime::parse_from_str(date, LONG_DATETIME)
+				.ok_or_bad_request("Invalid date")?;
+			let date: DateTime<Utc> = DateTime::from_utc(date, Utc);
+
+			let scope = compute_scope(&date, region, service);
+			let signing_hmac = crate::signature::signing_hmac(&date, secret_key, region, service)
+				.ok_or_internal_error("Unable to build signing HMAC")?;
+
+			Ok(req.map(move |body| {
+				Body::wrap_stream(
+					SignedPayloadStream::new(
+						body.map_err(Error::from),
+						signing_hmac,
+						date,
+						&scope,
+						signature,
+					)
+					.map_err(Error::from),
+				)
+			}))
+		}
+		_ => Ok(req),
+	}
+}
 
 /// Result of `sha256("")`
 const EMPTY_STRING_HEX_DIGEST: &str =
@@ -38,7 +87,7 @@ fn compute_streaming_payload_signature(
 	let mut hmac = signing_hmac.clone();
 	hmac.update(string_to_sign.as_bytes());
 
-	Hash::try_from(&hmac.finalize().into_bytes()).ok_or_internal_error("Invalid signature")
+	Ok(Hash::try_from(&hmac.finalize().into_bytes()).ok_or_internal_error("Invalid signature")?)
 }
 
 mod payload {
@@ -114,10 +163,10 @@ impl From<SignedPayloadStreamError> for Error {
 		match err {
 			SignedPayloadStreamError::Stream(e) => e,
 			SignedPayloadStreamError::InvalidSignature => {
-				Error::BadRequest("Invalid payload signature".into())
+				Error::bad_request("Invalid payload signature")
 			}
 			SignedPayloadStreamError::Message(e) => {
-				Error::BadRequest(format!("Chunk format error: {}", e))
+				Error::bad_request(format!("Chunk format error: {}", e))
 			}
 		}
 	}
@@ -295,7 +344,7 @@ mod tests {
 			.with_timezone(&Utc);
 		let secret_key = "test";
 		let region = "test";
-		let scope = crate::signature::compute_scope(&datetime, region);
+		let scope = crate::signature::compute_scope(&datetime, region, "s3");
 		let signing_hmac =
 			crate::signature::signing_hmac(&datetime, secret_key, region, "s3").unwrap();
 
diff --git a/src/block/Cargo.toml b/src/block/Cargo.toml
index 9cba69ee..cd409001 100644
--- a/src/block/Cargo.toml
+++ b/src/block/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "garage_block"
-version = "0.7.0"
+version = "0.8.0"
 authors = ["Alex Auvolat <alex@adnab.me>"]
 edition = "2018"
 license = "AGPL-3.0"
@@ -14,20 +14,22 @@ path = "lib.rs"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-garage_rpc = { version = "0.7.0", path = "../rpc" }
-garage_util = { version = "0.7.0", path = "../util" }
-garage_table = { version = "0.7.0", path = "../table" }
+garage_db = { version = "0.8.0", path = "../db" }
+garage_rpc = { version = "0.8.0", path = "../rpc" }
+garage_util = { version = "0.8.0", path = "../util" }
+garage_table = { version = "0.8.0", path = "../table" }
 
 opentelemetry = "0.17"
 
+arc-swap = "1.5"
 async-trait = "0.1.7"
 bytes = "1.0"
 hex = "0.4"
 tracing = "0.1.30"
 rand = "0.8"
-zstd = { version = "0.9", default-features = false }
 
-sled = "0.34"
+async-compression = { version = "0.3", features = ["tokio", "zstd"] }
+zstd = { version = "0.9", default-features = false }
 
 rmp-serde = "0.15"
 serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
@@ -36,3 +38,7 @@ serde_bytes = "0.11"
 futures = "0.3"
 futures-util = "0.3"
 tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
+tokio-util = { version = "0.6", features = ["io"] }
+
+[features]
+system-libs = [ "zstd/pkg-config" ]
diff --git a/src/block/block.rs b/src/block/block.rs
index 4d3fbcb8..935aa900 100644
--- a/src/block/block.rs
+++ b/src/block/block.rs
@@ -1,16 +1,22 @@
+use bytes::Bytes;
 use serde::{Deserialize, Serialize};
 use zstd::stream::{decode_all as zstd_decode, Encoder};
 
 use garage_util::data::*;
 use garage_util::error::*;
 
+#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
+pub enum DataBlockHeader {
+	Plain,
+	Compressed,
+}
+
 /// A possibly compressed block of data
-#[derive(Debug, Serialize, Deserialize)]
 pub enum DataBlock {
 	/// Uncompressed data
-	Plain(#[serde(with = "serde_bytes")] Vec<u8>),
+	Plain(Bytes),
 	/// Data compressed with zstd
-	Compressed(#[serde(with = "serde_bytes")] Vec<u8>),
+	Compressed(Bytes),
 }
 
 impl DataBlock {
@@ -30,7 +36,7 @@ impl DataBlock {
 	/// Get the buffer, possibly decompressing it, and verify it's integrity.
 	/// For Plain block, data is compared to hash, for Compressed block, zstd checksumming system
 	/// is used instead.
-	pub fn verify_get(self, hash: Hash) -> Result<Vec<u8>, Error> {
+	pub fn verify_get(self, hash: Hash) -> Result<Bytes, Error> {
 		match self {
 			DataBlock::Plain(data) => {
 				if blake2sum(&data) == hash {
@@ -39,9 +45,9 @@ impl DataBlock {
 					Err(Error::CorruptData(hash))
 				}
 			}
-			DataBlock::Compressed(data) => {
-				zstd_decode(&data[..]).map_err(|_| Error::CorruptData(hash))
-			}
+			DataBlock::Compressed(data) => zstd_decode(&data[..])
+				.map_err(|_| Error::CorruptData(hash))
+				.map(Bytes::from),
 		}
 	}
 
@@ -61,13 +67,31 @@ impl DataBlock {
 		}
 	}
 
-	pub fn from_buffer(data: Vec<u8>, level: Option<i32>) -> DataBlock {
-		if let Some(level) = level {
-			if let Ok(data) = zstd_encode(&data[..], level) {
-				return DataBlock::Compressed(data);
+	pub async fn from_buffer(data: Bytes, level: Option<i32>) -> DataBlock {
+		tokio::task::spawn_blocking(move || {
+			if let Some(level) = level {
+				if let Ok(data) = zstd_encode(&data[..], level) {
+					return DataBlock::Compressed(data.into());
+				}
 			}
+			DataBlock::Plain(data)
+		})
+		.await
+		.unwrap()
+	}
+
+	pub fn into_parts(self) -> (DataBlockHeader, Bytes) {
+		match self {
+			DataBlock::Plain(data) => (DataBlockHeader::Plain, data),
+			DataBlock::Compressed(data) => (DataBlockHeader::Compressed, data),
+		}
+	}
+
+	pub fn from_parts(h: DataBlockHeader, bytes: Bytes) -> Self {
+		match h {
+			DataBlockHeader::Plain => DataBlock::Plain(bytes),
+			DataBlockHeader::Compressed => DataBlock::Compressed(bytes),
 		}
-		DataBlock::Plain(data)
 	}
 }
 
diff --git a/src/block/lib.rs b/src/block/lib.rs
index dc685657..d2814f77 100644
--- a/src/block/lib.rs
+++ b/src/block/lib.rs
@@ -2,6 +2,8 @@
 extern crate tracing;
 
 pub mod manager;
+pub mod repair;
+pub mod resync;
 
 mod block;
 mod metrics;
diff --git a/src/block/manager.rs b/src/block/manager.rs
index 1c04a335..7f439b96 100644
--- a/src/block/manager.rs
+++ b/src/block/manager.rs
@@ -1,29 +1,32 @@
-use std::convert::TryInto;
-use std::path::{Path, PathBuf};
+use std::path::PathBuf;
+use std::pin::Pin;
 use std::sync::Arc;
 use std::time::Duration;
 
 use async_trait::async_trait;
+use bytes::Bytes;
 use serde::{Deserialize, Serialize};
 
-use futures::future::*;
-use futures::select;
+use futures::Stream;
+use futures_util::stream::StreamExt;
 use tokio::fs;
-use tokio::io::{AsyncReadExt, AsyncWriteExt};
-use tokio::sync::{watch, Mutex, Notify};
+use tokio::io::{AsyncReadExt, AsyncWriteExt, BufReader};
+use tokio::sync::{mpsc, Mutex, MutexGuard};
 
 use opentelemetry::{
 	trace::{FutureExt as OtelFutureExt, TraceContextExt, Tracer},
-	Context, KeyValue,
+	Context,
 };
 
+use garage_rpc::rpc_helper::netapp::stream::{stream_asyncread, ByteStream};
+
+use garage_db as db;
+
 use garage_util::data::*;
 use garage_util::error::*;
 use garage_util::metrics::RecordDuration;
-use garage_util::sled_counter::SledCountedTree;
-use garage_util::time::*;
-use garage_util::tranquilizer::Tranquilizer;
 
+use garage_rpc::rpc_helper::OrderTag;
 use garage_rpc::system::System;
 use garage_rpc::*;
 
@@ -32,24 +35,12 @@ use garage_table::replication::{TableReplication, TableShardedReplication};
 use crate::block::*;
 use crate::metrics::*;
 use crate::rc::*;
+use crate::repair::*;
+use crate::resync::*;
 
 /// Size under which data will be stored inlined in database instead of as files
 pub const INLINE_THRESHOLD: usize = 3072;
 
-// Timeout for RPCs that read and write blocks to remote nodes
-const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(30);
-// Timeout for RPCs that ask other nodes whether they need a copy
-// of a given block before we delete it locally
-const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(5);
-
-// The delay between the time where a resync operation fails
-// and the time when it is retried, with exponential backoff
-// (multiplied by 2, 4, 8, 16, etc. for every consecutive failure).
-const RESYNC_RETRY_DELAY: Duration = Duration::from_secs(60);
-// The minimum retry delay is 60 seconds = 1 minute
-// The maximum retry delay is 60 seconds * 2^6 = 60 seconds << 6 = 64 minutes (~1 hour)
-const RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER: u64 = 6;
-
 // The delay between the moment when the reference counter
 // drops to zero, and the moment where we allow ourselves
 // to delete the block locally.
@@ -60,12 +51,12 @@ pub(crate) const BLOCK_GC_DELAY: Duration = Duration::from_secs(600);
 pub enum BlockRpc {
 	Ok,
 	/// Message to ask for a block of data, by hash
-	GetBlock(Hash),
+	GetBlock(Hash, Option<OrderTag>),
 	/// Message to send a block of data, either because requested, of for first delivery of new
 	/// block
 	PutBlock {
 		hash: Hash,
-		data: DataBlock,
+		header: DataBlockHeader,
 	},
 	/// Ask other node if they should have this block, but don't actually have it
 	NeedBlockQuery(Hash),
@@ -85,20 +76,18 @@ pub struct BlockManager {
 	pub data_dir: PathBuf,
 
 	compression_level: Option<i32>,
-	background_tranquility: u32,
 
-	mutation_lock: Mutex<BlockManagerLocked>,
+	mutation_lock: [Mutex<BlockManagerLocked>; 256],
 
-	rc: BlockRc,
+	pub(crate) rc: BlockRc,
+	pub resync: BlockResyncManager,
 
-	resync_queue: SledCountedTree,
-	resync_notify: Notify,
-	resync_errors: SledCountedTree,
+	pub(crate) system: Arc<System>,
+	pub(crate) endpoint: Arc<Endpoint<BlockRpc, Self>>,
 
-	system: Arc<System>,
-	endpoint: Arc<Endpoint<BlockRpc, Self>>,
+	pub(crate) metrics: BlockManagerMetrics,
 
-	metrics: BlockManagerMetrics,
+	tx_scrub_command: mpsc::Sender<ScrubWorkerCommand>,
 }
 
 // This custom struct contains functions that must only be ran
@@ -108,10 +97,9 @@ struct BlockManagerLocked();
 
 impl BlockManager {
 	pub fn new(
-		db: &sled::Db,
+		db: &db::Db,
 		data_dir: PathBuf,
 		compression_level: Option<i32>,
-		background_tranquility: u32,
 		replication: TableShardedReplication,
 		system: Arc<System>,
 	) -> Arc<Self> {
@@ -120,215 +108,323 @@ impl BlockManager {
 			.expect("Unable to open block_local_rc tree");
 		let rc = BlockRc::new(rc);
 
-		let resync_queue = db
-			.open_tree("block_local_resync_queue")
-			.expect("Unable to open block_local_resync_queue tree");
-		let resync_queue = SledCountedTree::new(resync_queue);
-
-		let resync_errors = db
-			.open_tree("block_local_resync_errors")
-			.expect("Unable to open block_local_resync_errors tree");
-		let resync_errors = SledCountedTree::new(resync_errors);
+		let resync = BlockResyncManager::new(db, &system);
 
 		let endpoint = system
 			.netapp
-			.endpoint("garage_model/block.rs/Rpc".to_string());
+			.endpoint("garage_block/manager.rs/Rpc".to_string());
 
-		let manager_locked = BlockManagerLocked();
+		let metrics = BlockManagerMetrics::new(resync.queue.clone(), resync.errors.clone());
 
-		let metrics = BlockManagerMetrics::new(resync_queue.clone(), resync_errors.clone());
+		let (scrub_tx, scrub_rx) = mpsc::channel(1);
 
 		let block_manager = Arc::new(Self {
 			replication,
 			data_dir,
 			compression_level,
-			background_tranquility,
-			mutation_lock: Mutex::new(manager_locked),
+			mutation_lock: [(); 256].map(|_| Mutex::new(BlockManagerLocked())),
 			rc,
-			resync_queue,
-			resync_notify: Notify::new(),
-			resync_errors,
+			resync,
 			system,
 			endpoint,
 			metrics,
+			tx_scrub_command: scrub_tx,
 		});
 		block_manager.endpoint.set_handler(block_manager.clone());
 
-		block_manager.clone().spawn_background_worker();
+		// Spawn a bunch of resync workers
+		for index in 0..MAX_RESYNC_WORKERS {
+			let worker = ResyncWorker::new(index, block_manager.clone());
+			block_manager.system.background.spawn_worker(worker);
+		}
+
+		// Spawn scrub worker
+		let scrub_worker = ScrubWorker::new(block_manager.clone(), scrub_rx);
+		block_manager.system.background.spawn_worker(scrub_worker);
 
 		block_manager
 	}
 
 	/// Ask nodes that might have a (possibly compressed) block for it
-	async fn rpc_get_raw_block(&self, hash: &Hash) -> Result<DataBlock, Error> {
+	/// Return it as a stream with a header
+	async fn rpc_get_raw_block_streaming(
+		&self,
+		hash: &Hash,
+		order_tag: Option<OrderTag>,
+	) -> Result<(DataBlockHeader, ByteStream), Error> {
 		let who = self.replication.read_nodes(hash);
-		let resps = self
-			.system
-			.rpc
-			.try_call_many(
-				&self.endpoint,
-				&who[..],
-				BlockRpc::GetBlock(*hash),
-				RequestStrategy::with_priority(PRIO_NORMAL)
-					.with_quorum(1)
-					.with_timeout(BLOCK_RW_TIMEOUT)
-					.interrupt_after_quorum(true),
-			)
-			.await?;
+		let who = self.system.rpc.request_order(&who);
+
+		for node in who.iter() {
+			let node_id = NodeID::from(*node);
+			let rpc = self.endpoint.call_streaming(
+				&node_id,
+				BlockRpc::GetBlock(*hash, order_tag),
+				PRIO_NORMAL | PRIO_SECONDARY,
+			);
+			tokio::select! {
+				res = rpc => {
+					let res = match res {
+						Ok(res) => res,
+						Err(e) => {
+							debug!("Node {:?} returned error: {}", node, e);
+							continue;
+						}
+					};
+					let (header, stream) = match res.into_parts() {
+						(Ok(BlockRpc::PutBlock { hash: _, header }), Some(stream)) => (header, stream),
+						_ => {
+							debug!("Node {:?} returned a malformed response", node);
+							continue;
+						}
+					};
+					return Ok((header, stream));
+				}
+				_ = tokio::time::sleep(self.system.rpc.rpc_timeout()) => {
+					debug!("Node {:?} didn't return block in time, trying next.", node);
+				}
+			};
+		}
 
-		for resp in resps {
-			if let BlockRpc::PutBlock { data, .. } = resp {
-				return Ok(data);
-			}
+		Err(Error::Message(format!(
+			"Unable to read block {:?}: no node returned a valid block",
+			hash
+		)))
+	}
+
+	/// Ask nodes that might have a (possibly compressed) block for it
+	/// Return its entire body
+	pub(crate) async fn rpc_get_raw_block(
+		&self,
+		hash: &Hash,
+		order_tag: Option<OrderTag>,
+	) -> Result<DataBlock, Error> {
+		let who = self.replication.read_nodes(hash);
+		let who = self.system.rpc.request_order(&who);
+
+		for node in who.iter() {
+			let node_id = NodeID::from(*node);
+			let rpc = self.endpoint.call_streaming(
+				&node_id,
+				BlockRpc::GetBlock(*hash, order_tag),
+				PRIO_NORMAL | PRIO_SECONDARY,
+			);
+			tokio::select! {
+				res = rpc => {
+					let res = match res {
+						Ok(res) => res,
+						Err(e) => {
+							debug!("Node {:?} returned error: {}", node, e);
+							continue;
+						}
+					};
+					let (header, stream) = match res.into_parts() {
+						(Ok(BlockRpc::PutBlock { hash: _, header }), Some(stream)) => (header, stream),
+						_ => {
+							debug!("Node {:?} returned a malformed response", node);
+							continue;
+						}
+					};
+					match read_stream_to_end(stream).await {
+						Ok(bytes) => return Ok(DataBlock::from_parts(header, bytes)),
+						Err(e) => {
+							debug!("Error reading stream from node {:?}: {}", node, e);
+						}
+					}
+				}
+				_ = tokio::time::sleep(self.system.rpc.rpc_timeout()) => {
+					debug!("Node {:?} didn't return block in time, trying next.", node);
+				}
+			};
 		}
+
 		Err(Error::Message(format!(
-			"Unable to read block {:?}: no valid blocks returned",
+			"Unable to read block {:?}: no node returned a valid block",
 			hash
 		)))
 	}
 
 	// ---- Public interface ----
 
+	/// Ask nodes that might have a block for it,
+	/// return it as a stream
+	pub async fn rpc_get_block_streaming(
+		&self,
+		hash: &Hash,
+		order_tag: Option<OrderTag>,
+	) -> Result<
+		Pin<Box<dyn Stream<Item = Result<Bytes, std::io::Error>> + Send + Sync + 'static>>,
+		Error,
+	> {
+		let (header, stream) = self.rpc_get_raw_block_streaming(hash, order_tag).await?;
+		match header {
+			DataBlockHeader::Plain => Ok(stream),
+			DataBlockHeader::Compressed => {
+				// Too many things, I hate it.
+				let reader = stream_asyncread(stream);
+				let reader = BufReader::new(reader);
+				let reader = async_compression::tokio::bufread::ZstdDecoder::new(reader);
+				Ok(Box::pin(tokio_util::io::ReaderStream::new(reader)))
+			}
+		}
+	}
+
 	/// Ask nodes that might have a block for it
-	pub async fn rpc_get_block(&self, hash: &Hash) -> Result<Vec<u8>, Error> {
-		self.rpc_get_raw_block(hash).await?.verify_get(*hash)
+	pub async fn rpc_get_block(
+		&self,
+		hash: &Hash,
+		order_tag: Option<OrderTag>,
+	) -> Result<Bytes, Error> {
+		self.rpc_get_raw_block(hash, order_tag)
+			.await?
+			.verify_get(*hash)
 	}
 
 	/// Send block to nodes that should have it
-	pub async fn rpc_put_block(&self, hash: Hash, data: Vec<u8>) -> Result<(), Error> {
+	pub async fn rpc_put_block(&self, hash: Hash, data: Bytes) -> Result<(), Error> {
 		let who = self.replication.write_nodes(&hash);
-		let data = DataBlock::from_buffer(data, self.compression_level);
+
+		let (header, bytes) = DataBlock::from_buffer(data, self.compression_level)
+			.await
+			.into_parts();
+		let put_block_rpc =
+			Req::new(BlockRpc::PutBlock { hash, header })?.with_stream_from_buffer(bytes);
+
 		self.system
 			.rpc
 			.try_call_many(
 				&self.endpoint,
 				&who[..],
-				BlockRpc::PutBlock { hash, data },
-				RequestStrategy::with_priority(PRIO_NORMAL)
-					.with_quorum(self.replication.write_quorum())
-					.with_timeout(BLOCK_RW_TIMEOUT),
+				put_block_rpc,
+				RequestStrategy::with_priority(PRIO_NORMAL | PRIO_SECONDARY)
+					.with_quorum(self.replication.write_quorum()),
 			)
 			.await?;
-		Ok(())
-	}
 
-	/// Launch the repair procedure on the data store
-	///
-	/// This will list all blocks locally present, as well as those
-	/// that are required because of refcount > 0, and will try
-	/// to fix any mismatch between the two.
-	pub async fn repair_data_store(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
-		// 1. Repair blocks from RC table.
-		for (i, entry) in self.rc.rc.iter().enumerate() {
-			let (hash, _) = entry?;
-			let hash = Hash::try_from(&hash[..]).unwrap();
-			self.put_to_resync(&hash, Duration::from_secs(0))?;
-			if i & 0xFF == 0 && *must_exit.borrow() {
-				return Ok(());
-			}
-		}
-
-		// 2. Repair blocks actually on disk
-		// Lists all blocks on disk and adds them to the resync queue.
-		// This allows us to find blocks we are storing but don't actually need,
-		// so that we can offload them if necessary and then delete them locally.
-		self.for_each_file(
-			(),
-			move |_, hash| async move {
-				self.put_to_resync(&hash, Duration::from_secs(0))
-					.map_err(Into::into)
-			},
-			must_exit,
-		)
-		.await
-	}
-
-	/// Verify integrity of each block on disk. Use `speed_limit` to limit the load generated by
-	/// this function.
-	pub async fn scrub_data_store(
-		&self,
-		must_exit: &watch::Receiver<bool>,
-		tranquility: u32,
-	) -> Result<(), Error> {
-		let tranquilizer = Tranquilizer::new(30);
-		self.for_each_file(
-			tranquilizer,
-			move |mut tranquilizer, hash| async move {
-				let _ = self.read_block(&hash).await;
-				tranquilizer.tranquilize(tranquility).await;
-				Ok(tranquilizer)
-			},
-			must_exit,
-		)
-		.await
-	}
-
-	/// Get lenght of resync queue
-	pub fn resync_queue_len(&self) -> usize {
-		self.resync_queue.len()
+		Ok(())
 	}
 
-	/// Get number of blocks that have an error
-	pub fn resync_errors_len(&self) -> usize {
-		self.resync_errors.len()
+	/// Get number of items in the refcount table
+	pub fn rc_len(&self) -> Result<usize, Error> {
+		Ok(self.rc.rc.len()?)
 	}
 
-	/// Get number of items in the refcount table
-	pub fn rc_len(&self) -> usize {
-		self.rc.rc.len()
+	/// Send command to start/stop/manager scrub worker
+	pub async fn send_scrub_command(&self, cmd: ScrubWorkerCommand) {
+		let _ = self.tx_scrub_command.send(cmd).await;
 	}
 
 	//// ----- Managing the reference counter ----
 
 	/// Increment the number of time a block is used, putting it to resynchronization if it is
 	/// required, but not known
-	pub fn block_incref(&self, hash: &Hash) -> Result<(), Error> {
-		if self.rc.block_incref(hash)? {
+	pub fn block_incref(
+		self: &Arc<Self>,
+		tx: &mut db::Transaction,
+		hash: Hash,
+	) -> db::TxOpResult<()> {
+		if self.rc.block_incref(tx, &hash)? {
 			// When the reference counter is incremented, there is
 			// normally a node that is responsible for sending us the
 			// data of the block. However that operation may fail,
 			// so in all cases we add the block here to the todo list
 			// to check later that it arrived correctly, and if not
 			// we will fecth it from someone.
-			self.put_to_resync(hash, 2 * BLOCK_RW_TIMEOUT)?;
+			let this = self.clone();
+			tokio::spawn(async move {
+				if let Err(e) = this
+					.resync
+					.put_to_resync(&hash, 2 * this.system.rpc.rpc_timeout())
+				{
+					error!("Block {:?} could not be put in resync queue: {}.", hash, e);
+				}
+			});
 		}
 		Ok(())
 	}
 
 	/// Decrement the number of time a block is used
-	pub fn block_decref(&self, hash: &Hash) -> Result<(), Error> {
-		if self.rc.block_decref(hash)? {
+	pub fn block_decref(
+		self: &Arc<Self>,
+		tx: &mut db::Transaction,
+		hash: Hash,
+	) -> db::TxOpResult<()> {
+		if self.rc.block_decref(tx, &hash)? {
 			// When the RC is decremented, it might drop to zero,
 			// indicating that we don't need the block.
 			// There is a delay before we garbage collect it;
 			// make sure that it is handled in the resync loop
 			// after that delay has passed.
-			self.put_to_resync(hash, BLOCK_GC_DELAY + Duration::from_secs(10))?;
+			let this = self.clone();
+			tokio::spawn(async move {
+				if let Err(e) = this
+					.resync
+					.put_to_resync(&hash, BLOCK_GC_DELAY + Duration::from_secs(10))
+				{
+					error!("Block {:?} could not be put in resync queue: {}.", hash, e);
+				}
+			});
 		}
 		Ok(())
 	}
 
 	// ---- Reading and writing blocks locally ----
 
+	async fn handle_put_block(
+		&self,
+		hash: Hash,
+		header: DataBlockHeader,
+		stream: Option<ByteStream>,
+	) -> Result<(), Error> {
+		let stream = stream.ok_or_message("missing stream")?;
+		let bytes = read_stream_to_end(stream).await?;
+		let data = DataBlock::from_parts(header, bytes);
+		self.write_block(&hash, &data).await
+	}
+
 	/// Write a block to disk
-	async fn write_block(&self, hash: &Hash, data: &DataBlock) -> Result<BlockRpc, Error> {
+	pub(crate) async fn write_block(&self, hash: &Hash, data: &DataBlock) -> Result<(), Error> {
+		let tracer = opentelemetry::global::tracer("garage");
+
 		let write_size = data.inner_buffer().len() as u64;
 
-		let res = self
-			.mutation_lock
-			.lock()
+		self.lock_mutate(hash)
 			.await
 			.write_block(hash, data, self)
 			.bound_record_duration(&self.metrics.block_write_duration)
+			.with_context(Context::current_with_span(
+				tracer.start("BlockManagerLocked::write_block"),
+			))
 			.await?;
 
 		self.metrics.bytes_written.add(write_size);
 
-		Ok(res)
+		Ok(())
+	}
+
+	async fn handle_get_block(&self, hash: &Hash, order_tag: Option<OrderTag>) -> Resp<BlockRpc> {
+		let block = match self.read_block(hash).await {
+			Ok(data) => data,
+			Err(e) => return Resp::new(Err(e)),
+		};
+
+		let (header, data) = block.into_parts();
+
+		let resp = Resp::new(Ok(BlockRpc::PutBlock {
+			hash: *hash,
+			header,
+		}))
+		.with_stream_from_buffer(data);
+
+		if let Some(order_tag) = order_tag {
+			resp.with_order_tag(order_tag)
+		} else {
+			resp
+		}
 	}
 
 	/// Read block from disk, verifying it's integrity
-	async fn read_block(&self, hash: &Hash) -> Result<BlockRpc, Error> {
+	pub(crate) async fn read_block(&self, hash: &Hash) -> Result<DataBlock, Error> {
 		let data = self
 			.read_block_internal(hash)
 			.bound_record_duration(&self.metrics.block_read_duration)
@@ -338,7 +434,7 @@ impl BlockManager {
 			.bytes_read
 			.add(data.inner_buffer().len() as u64);
 
-		Ok(BlockRpc::PutBlock { hash: *hash, data })
+		Ok(data)
 	}
 
 	async fn read_block_internal(&self, hash: &Hash) -> Result<DataBlock, Error> {
@@ -347,7 +443,8 @@ impl BlockManager {
 			Ok(c) => c,
 			Err(e) => {
 				// Not found but maybe we should have had it ??
-				self.put_to_resync(hash, 2 * BLOCK_RW_TIMEOUT)?;
+				self.resync
+					.put_to_resync(hash, 2 * self.system.rpc.rpc_timeout())?;
 				return Err(Into::into(e));
 			}
 		};
@@ -361,37 +458,47 @@ impl BlockManager {
 		drop(f);
 
 		let data = if compressed {
-			DataBlock::Compressed(data)
+			DataBlock::Compressed(data.into())
 		} else {
-			DataBlock::Plain(data)
+			DataBlock::Plain(data.into())
 		};
 
 		if data.verify(*hash).is_err() {
 			self.metrics.corruption_counter.add(1);
 
-			self.mutation_lock
-				.lock()
+			self.lock_mutate(hash)
 				.await
 				.move_block_to_corrupted(hash, self)
 				.await?;
-			self.put_to_resync(hash, Duration::from_millis(0))?;
+			self.resync.put_to_resync(hash, Duration::from_millis(0))?;
 			return Err(Error::CorruptData(*hash));
 		}
 
 		Ok(data)
 	}
 
-	/// Check if this node should have a block, but don't actually have it
-	async fn need_block(&self, hash: &Hash) -> Result<bool, Error> {
-		let BlockStatus { exists, needed } = self
-			.mutation_lock
-			.lock()
+	/// Check if this node has a block and whether it needs it
+	pub(crate) async fn check_block_status(&self, hash: &Hash) -> Result<BlockStatus, Error> {
+		self.lock_mutate(hash)
 			.await
 			.check_block_status(hash, self)
-			.await?;
+			.await
+	}
+
+	/// Check if this node should have a block, but don't actually have it
+	async fn need_block(&self, hash: &Hash) -> Result<bool, Error> {
+		let BlockStatus { exists, needed } = self.check_block_status(hash).await?;
 		Ok(needed.is_nonzero() && !exists)
 	}
 
+	/// Delete block if it is not needed anymore
+	pub(crate) async fn delete_if_unneeded(&self, hash: &Hash) -> Result<(), Error> {
+		self.lock_mutate(hash)
+			.await
+			.delete_if_unneeded(hash, self)
+			.await
+	}
+
 	/// Utility: gives the path of the directory in which a block should be found
 	fn block_dir(&self, hash: &Hash) -> PathBuf {
 		let mut path = self.data_dir.clone();
@@ -419,431 +526,38 @@ impl BlockManager {
 		fs::metadata(&path).await.map(|_| false).map_err(Into::into)
 	}
 
-	// ---- Resync loop ----
-
-	// This part manages a queue of blocks that need to be
-	// "resynchronized", i.e. that need to have a check that
-	// they are at present if we need them, or that they are
-	// deleted once the garbage collection delay has passed.
-	//
-	// Here are some explanations on how the resync queue works.
-	// There are two Sled trees that are used to have information
-	// about the status of blocks that need to be resynchronized:
-	//
-	// - resync_queue: a tree that is ordered first by a timestamp
-	//   (in milliseconds since Unix epoch) that is the time at which
-	//   the resync must be done, and second by block hash.
-	//   The key in this tree is just:
-	//       concat(timestamp (8 bytes), hash (32 bytes))
-	//   The value is the same 32-byte hash.
-	//
-	// - resync_errors: a tree that indicates for each block
-	//   if the last resync resulted in an error, and if so,
-	//   the following two informations (see the ErrorCounter struct):
-	//   - how many consecutive resync errors for this block?
-	//   - when was the last try?
-	//   These two informations are used to implement an
-	//   exponential backoff retry strategy.
-	//   The key in this tree is the 32-byte hash of the block,
-	//   and the value is the encoded ErrorCounter value.
-	//
-	// We need to have these two trees, because the resync queue
-	// is not just a queue of items to process, but a set of items
-	// that are waiting a specific delay until we can process them
-	// (the delay being necessary both internally for the exponential
-	// backoff strategy, and exposed as a parameter when adding items
-	// to the queue, e.g. to wait until the GC delay has passed).
-	// This is why we need one tree ordered by time, and one
-	// ordered by identifier of item to be processed (block hash).
-	//
-	// When the worker wants to process an item it takes from
-	// resync_queue, it checks in resync_errors that if there is an
-	// exponential back-off delay to await, it has passed before we
-	// process the item. If not, the item in the queue is skipped
-	// (but added back for later processing after the time of the
-	// delay).
-	//
-	// An alternative that would have seemed natural is to
-	// only add items to resync_queue with a processing time that is
-	// after the delay, but there are several issues with this:
-	// - This requires to synchronize updates to resync_queue and
-	//   resync_errors (with the current model, there is only one thread,
-	//   the worker thread, that accesses resync_errors,
-	//   so no need to synchronize) by putting them both in a lock.
-	//   This would mean that block_incref might need to take a lock
-	//   before doing its thing, meaning it has much more chances of
-	//   not completing successfully if something bad happens to Garage.
-	//   Currently Garage is not able to recover from block_incref that
-	//   doesn't complete successfully, because it is necessary to ensure
-	//   the consistency between the state of the block manager and
-	//   information in the BlockRef table.
-	// - If a resync fails, we put that block in the resync_errors table,
-	//   and also add it back to resync_queue to be processed after
-	//   the exponential back-off delay,
-	//   but maybe the block is already scheduled to be resynced again
-	//   at another time that is before the exponential back-off delay,
-	//   and we have no way to check that easily. This means that
-	//   in all cases, we need to check the resync_errors table
-	//   in the resync loop at the time when a block is popped from
-	//   the resync_queue.
-	// Overall, the current design is therefore simpler and more robust
-	// because it tolerates inconsistencies between the resync_queue
-	// and resync_errors table (items being scheduled in resync_queue
-	// for times that are earlier than the exponential back-off delay
-	// is a natural condition that is handled properly).
-
-	fn spawn_background_worker(self: Arc<Self>) {
-		// Launch a background workers for background resync loop processing
-		let background = self.system.background.clone();
-		tokio::spawn(async move {
-			tokio::time::sleep(Duration::from_secs(10)).await;
-			background.spawn_worker("block resync worker".into(), move |must_exit| {
-				self.resync_loop(must_exit)
-			});
-		});
-	}
-
-	fn put_to_resync(&self, hash: &Hash, delay: Duration) -> Result<(), sled::Error> {
-		let when = now_msec() + delay.as_millis() as u64;
-		self.put_to_resync_at(hash, when)
-	}
-
-	fn put_to_resync_at(&self, hash: &Hash, when: u64) -> Result<(), sled::Error> {
-		trace!("Put resync_queue: {} {:?}", when, hash);
-		let mut key = u64::to_be_bytes(when).to_vec();
-		key.extend(hash.as_ref());
-		self.resync_queue.insert(key, hash.as_ref())?;
-		self.resync_notify.notify_waiters();
-		Ok(())
-	}
-
-	async fn resync_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
-		let mut tranquilizer = Tranquilizer::new(30);
-
-		while !*must_exit.borrow() {
-			match self.resync_iter(&mut must_exit).await {
-				Ok(true) => {
-					tranquilizer.tranquilize(self.background_tranquility).await;
-				}
-				Ok(false) => {
-					tranquilizer.reset();
-				}
-				Err(e) => {
-					// The errors that we have here are only Sled errors
-					// We don't really know how to handle them so just ¯\_(ツ)_/¯
-					// (there is kind of an assumption that Sled won't error on us,
-					// if it does there is not much we can do -- TODO should we just panic?)
-					error!(
-						"Could not do a resync iteration: {} (this is a very bad error)",
-						e
-					);
-					tranquilizer.reset();
-				}
-			}
-		}
-	}
-
-	// The result of resync_iter is:
-	// - Ok(true) -> a block was processed (successfully or not)
-	// - Ok(false) -> no block was processed, but we are ready for the next iteration
-	// - Err(_) -> a Sled error occurred when reading/writing from resync_queue/resync_errors
-	async fn resync_iter(
-		&self,
-		must_exit: &mut watch::Receiver<bool>,
-	) -> Result<bool, sled::Error> {
-		if let Some(first_pair_res) = self.resync_queue.iter().next() {
-			let (time_bytes, hash_bytes) = first_pair_res?;
-
-			let time_msec = u64::from_be_bytes(time_bytes[0..8].try_into().unwrap());
-			let now = now_msec();
-
-			if now >= time_msec {
-				let hash = Hash::try_from(&hash_bytes[..]).unwrap();
-
-				if let Some(ec) = self.resync_errors.get(hash.as_slice())? {
-					let ec = ErrorCounter::decode(ec);
-					if now < ec.next_try() {
-						// if next retry after an error is not yet,
-						// don't do resync and return early, but still
-						// make sure the item is still in queue at expected time
-						self.put_to_resync_at(&hash, ec.next_try())?;
-						// ec.next_try() > now >= time_msec, so this remove
-						// is not removing the one we added just above
-						// (we want to do the remove after the insert to ensure
-						// that the item is not lost if we crash in-between)
-						self.resync_queue.remove(time_bytes)?;
-						return Ok(false);
-					}
-				}
-
-				let tracer = opentelemetry::global::tracer("garage");
-				let trace_id = gen_uuid();
-				let span = tracer
-					.span_builder("Resync block")
-					.with_trace_id(
-						opentelemetry::trace::TraceId::from_hex(&hex::encode(
-							&trace_id.as_slice()[..16],
-						))
-						.unwrap(),
-					)
-					.with_attributes(vec![KeyValue::new("block", format!("{:?}", hash))])
-					.start(&tracer);
-
-				let res = self
-					.resync_block(&hash)
-					.with_context(Context::current_with_span(span))
-					.bound_record_duration(&self.metrics.resync_duration)
-					.await;
-
-				self.metrics.resync_counter.add(1);
-
-				if let Err(e) = &res {
-					self.metrics.resync_error_counter.add(1);
-					warn!("Error when resyncing {:?}: {}", hash, e);
-
-					let err_counter = match self.resync_errors.get(hash.as_slice())? {
-						Some(ec) => ErrorCounter::decode(ec).add1(now + 1),
-						None => ErrorCounter::new(now + 1),
-					};
-
-					self.resync_errors
-						.insert(hash.as_slice(), err_counter.encode())?;
-
-					self.put_to_resync_at(&hash, err_counter.next_try())?;
-					// err_counter.next_try() >= now + 1 > now,
-					// the entry we remove from the queue is not
-					// the entry we inserted with put_to_resync_at
-					self.resync_queue.remove(time_bytes)?;
-				} else {
-					self.resync_errors.remove(hash.as_slice())?;
-					self.resync_queue.remove(time_bytes)?;
-				}
-
-				Ok(true)
-			} else {
-				let delay = tokio::time::sleep(Duration::from_millis(time_msec - now));
-				select! {
-					_ = delay.fuse() => {},
-					_ = self.resync_notify.notified().fuse() => {},
-					_ = must_exit.changed().fuse() => {},
-				}
-				Ok(false)
-			}
-		} else {
-			// Here we wait either for a notification that an item has been
-			// added to the queue, or for a constant delay of 10 secs to expire.
-			// The delay avoids a race condition where the notification happens
-			// between the time we checked the queue and the first poll
-			// to resync_notify.notified(): if that happens, we'll just loop
-			// back 10 seconds later, which is fine.
-			let delay = tokio::time::sleep(Duration::from_secs(10));
-			select! {
-				_ = delay.fuse() => {},
-				_ = self.resync_notify.notified().fuse() => {},
-				_ = must_exit.changed().fuse() => {},
-			}
-			Ok(false)
-		}
-	}
-
-	async fn resync_block(&self, hash: &Hash) -> Result<(), Error> {
-		let BlockStatus { exists, needed } = self
-			.mutation_lock
+	async fn lock_mutate(&self, hash: &Hash) -> MutexGuard<'_, BlockManagerLocked> {
+		let tracer = opentelemetry::global::tracer("garage");
+		self.mutation_lock[hash.as_slice()[0] as usize]
 			.lock()
+			.with_context(Context::current_with_span(
+				tracer.start("Acquire mutation_lock"),
+			))
 			.await
-			.check_block_status(hash, self)
-			.await?;
-
-		if exists != needed.is_needed() || exists != needed.is_nonzero() {
-			debug!(
-				"Resync block {:?}: exists {}, nonzero rc {}, deletable {}",
-				hash,
-				exists,
-				needed.is_nonzero(),
-				needed.is_deletable(),
-			);
-		}
-
-		if exists && needed.is_deletable() {
-			info!("Resync block {:?}: offloading and deleting", hash);
-
-			let mut who = self.replication.write_nodes(hash);
-			if who.len() < self.replication.write_quorum() {
-				return Err(Error::Message("Not trying to offload block because we don't have a quorum of nodes to write to".to_string()));
-			}
-			who.retain(|id| *id != self.system.id);
-
-			let msg = Arc::new(BlockRpc::NeedBlockQuery(*hash));
-			let who_needs_fut = who.iter().map(|to| {
-				self.system.rpc.call_arc(
-					&self.endpoint,
-					*to,
-					msg.clone(),
-					RequestStrategy::with_priority(PRIO_BACKGROUND)
-						.with_timeout(NEED_BLOCK_QUERY_TIMEOUT),
-				)
-			});
-			let who_needs_resps = join_all(who_needs_fut).await;
-
-			let mut need_nodes = vec![];
-			for (node, needed) in who.iter().zip(who_needs_resps.into_iter()) {
-				match needed.err_context("NeedBlockQuery RPC")? {
-					BlockRpc::NeedBlockReply(needed) => {
-						if needed {
-							need_nodes.push(*node);
-						}
-					}
-					m => {
-						return Err(Error::unexpected_rpc_message(m));
-					}
-				}
-			}
-
-			if !need_nodes.is_empty() {
-				trace!(
-					"Block {:?} needed by {} nodes, sending",
-					hash,
-					need_nodes.len()
-				);
-
-				for node in need_nodes.iter() {
-					self.metrics
-						.resync_send_counter
-						.add(1, &[KeyValue::new("to", format!("{:?}", node))]);
-				}
-
-				let put_block_message = self.read_block(hash).await?;
-				self.system
-					.rpc
-					.try_call_many(
-						&self.endpoint,
-						&need_nodes[..],
-						put_block_message,
-						RequestStrategy::with_priority(PRIO_BACKGROUND)
-							.with_quorum(need_nodes.len())
-							.with_timeout(BLOCK_RW_TIMEOUT),
-					)
-					.await
-					.err_context("PutBlock RPC")?;
-			}
-			info!(
-				"Deleting unneeded block {:?}, offload finished ({} / {})",
-				hash,
-				need_nodes.len(),
-				who.len()
-			);
-
-			self.mutation_lock
-				.lock()
-				.await
-				.delete_if_unneeded(hash, self)
-				.await?;
-
-			self.rc.clear_deleted_block_rc(hash)?;
-		}
-
-		if needed.is_nonzero() && !exists {
-			info!(
-				"Resync block {:?}: fetching absent but needed block (refcount > 0)",
-				hash
-			);
-
-			let block_data = self.rpc_get_raw_block(hash).await?;
-
-			self.metrics.resync_recv_counter.add(1);
-
-			self.write_block(hash, &block_data).await?;
-		}
-
-		Ok(())
-	}
-
-	// ---- Utility: iteration on files in the data directory ----
-
-	async fn for_each_file<F, Fut, State>(
-		&self,
-		state: State,
-		mut f: F,
-		must_exit: &watch::Receiver<bool>,
-	) -> Result<(), Error>
-	where
-		F: FnMut(State, Hash) -> Fut + Send,
-		Fut: Future<Output = Result<State, Error>> + Send,
-		State: Send,
-	{
-		self.for_each_file_rec(&self.data_dir, state, &mut f, must_exit)
-			.await
-			.map(|_| ())
-	}
-
-	fn for_each_file_rec<'a, F, Fut, State>(
-		&'a self,
-		path: &'a Path,
-		mut state: State,
-		f: &'a mut F,
-		must_exit: &'a watch::Receiver<bool>,
-	) -> BoxFuture<'a, Result<State, Error>>
-	where
-		F: FnMut(State, Hash) -> Fut + Send,
-		Fut: Future<Output = Result<State, Error>> + Send,
-		State: Send + 'a,
-	{
-		async move {
-			let mut ls_data_dir = fs::read_dir(path).await?;
-			while let Some(data_dir_ent) = ls_data_dir.next_entry().await? {
-				if *must_exit.borrow() {
-					break;
-				}
-
-				let name = data_dir_ent.file_name();
-				let name = if let Ok(n) = name.into_string() {
-					n
-				} else {
-					continue;
-				};
-				let ent_type = data_dir_ent.file_type().await?;
-
-				let name = name.strip_suffix(".zst").unwrap_or(&name);
-				if name.len() == 2 && hex::decode(&name).is_ok() && ent_type.is_dir() {
-					state = self
-						.for_each_file_rec(&data_dir_ent.path(), state, f, must_exit)
-						.await?;
-				} else if name.len() == 64 {
-					let hash_bytes = if let Ok(h) = hex::decode(&name) {
-						h
-					} else {
-						continue;
-					};
-					let mut hash = [0u8; 32];
-					hash.copy_from_slice(&hash_bytes[..]);
-					state = f(state, hash.into()).await?;
-				}
-			}
-			Ok(state)
-		}
-		.boxed()
 	}
 }
 
 #[async_trait]
-impl EndpointHandler<BlockRpc> for BlockManager {
-	async fn handle(
-		self: &Arc<Self>,
-		message: &BlockRpc,
-		_from: NodeID,
-	) -> Result<BlockRpc, Error> {
-		match message {
-			BlockRpc::PutBlock { hash, data } => self.write_block(hash, data).await,
-			BlockRpc::GetBlock(h) => self.read_block(h).await,
-			BlockRpc::NeedBlockQuery(h) => self.need_block(h).await.map(BlockRpc::NeedBlockReply),
-			m => Err(Error::unexpected_rpc_message(m)),
+impl StreamingEndpointHandler<BlockRpc> for BlockManager {
+	async fn handle(self: &Arc<Self>, mut message: Req<BlockRpc>, _from: NodeID) -> Resp<BlockRpc> {
+		match message.msg() {
+			BlockRpc::PutBlock { hash, header } => Resp::new(
+				self.handle_put_block(*hash, *header, message.take_stream())
+					.await
+					.map(|_| BlockRpc::Ok),
+			),
+			BlockRpc::GetBlock(h, order_tag) => self.handle_get_block(h, *order_tag).await,
+			BlockRpc::NeedBlockQuery(h) => {
+				Resp::new(self.need_block(h).await.map(BlockRpc::NeedBlockReply))
+			}
+			m => Resp::new(Err(Error::unexpected_rpc_message(m))),
 		}
 	}
 }
 
-struct BlockStatus {
-	exists: bool,
-	needed: RcEntry,
+pub(crate) struct BlockStatus {
+	pub(crate) exists: bool,
+	pub(crate) needed: RcEntry,
 }
 
 impl BlockManagerLocked {
@@ -863,7 +577,7 @@ impl BlockManagerLocked {
 		hash: &Hash,
 		data: &DataBlock,
 		mgr: &BlockManager,
-	) -> Result<BlockRpc, Error> {
+	) -> Result<(), Error> {
 		let compressed = data.is_compressed();
 		let data = data.inner_buffer();
 
@@ -874,8 +588,8 @@ impl BlockManagerLocked {
 		fs::create_dir_all(&directory).await?;
 
 		let to_delete = match (mgr.is_block_compressed(hash).await, compressed) {
-			(Ok(true), _) => return Ok(BlockRpc::Ok),
-			(Ok(false), false) => return Ok(BlockRpc::Ok),
+			(Ok(true), _) => return Ok(()),
+			(Ok(false), false) => return Ok(()),
 			(Ok(false), true) => {
 				let path_to_delete = path.clone();
 				path.set_extension("zst");
@@ -914,7 +628,7 @@ impl BlockManagerLocked {
 		dir.sync_all().await?;
 		drop(dir);
 
-		Ok(BlockRpc::Ok)
+		Ok(())
 	}
 
 	async fn move_block_to_corrupted(&self, hash: &Hash, mgr: &BlockManager) -> Result<(), Error> {
@@ -949,49 +663,16 @@ impl BlockManagerLocked {
 	}
 }
 
-/// Counts the number of errors when resyncing a block,
-/// and the time of the last try.
-/// Used to implement exponential backoff.
-#[derive(Clone, Copy, Debug)]
-struct ErrorCounter {
-	errors: u64,
-	last_try: u64,
-}
-
-impl ErrorCounter {
-	fn new(now: u64) -> Self {
-		Self {
-			errors: 1,
-			last_try: now,
-		}
+async fn read_stream_to_end(mut stream: ByteStream) -> Result<Bytes, Error> {
+	let mut parts: Vec<Bytes> = vec![];
+	while let Some(part) = stream.next().await {
+		parts.push(part.ok_or_message("error in stream")?);
 	}
 
-	fn decode(data: sled::IVec) -> Self {
-		Self {
-			errors: u64::from_be_bytes(data[0..8].try_into().unwrap()),
-			last_try: u64::from_be_bytes(data[8..16].try_into().unwrap()),
-		}
-	}
-	fn encode(&self) -> Vec<u8> {
-		[
-			u64::to_be_bytes(self.errors),
-			u64::to_be_bytes(self.last_try),
-		]
+	Ok(parts
+		.iter()
+		.map(|x| &x[..])
+		.collect::<Vec<_>>()
 		.concat()
-	}
-
-	fn add1(self, now: u64) -> Self {
-		Self {
-			errors: self.errors + 1,
-			last_try: now,
-		}
-	}
-
-	fn delay_msec(&self) -> u64 {
-		(RESYNC_RETRY_DELAY.as_millis() as u64)
-			<< std::cmp::min(self.errors - 1, RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER)
-	}
-	fn next_try(&self) -> u64 {
-		self.last_try + self.delay_msec()
-	}
+		.into())
 }
diff --git a/src/block/metrics.rs b/src/block/metrics.rs
index f0f541a3..477add66 100644
--- a/src/block/metrics.rs
+++ b/src/block/metrics.rs
@@ -1,6 +1,6 @@
 use opentelemetry::{global, metrics::*};
 
-use garage_util::sled_counter::SledCountedTree;
+use garage_db::counted_tree_hack::CountedTree;
 
 /// TableMetrics reference all counter used for metrics
 pub struct BlockManagerMetrics {
@@ -23,7 +23,7 @@ pub struct BlockManagerMetrics {
 }
 
 impl BlockManagerMetrics {
-	pub fn new(resync_queue: SledCountedTree, resync_errors: SledCountedTree) -> Self {
+	pub fn new(resync_queue: CountedTree, resync_errors: CountedTree) -> Self {
 		let meter = global::meter("garage_model/block");
 		Self {
 			_resync_queue_len: meter
diff --git a/src/block/rc.rs b/src/block/rc.rs
index ec3ea44e..ce6defad 100644
--- a/src/block/rc.rs
+++ b/src/block/rc.rs
@@ -1,5 +1,7 @@
 use std::convert::TryInto;
 
+use garage_db as db;
+
 use garage_util::data::*;
 use garage_util::error::*;
 use garage_util::time::*;
@@ -7,31 +9,41 @@ use garage_util::time::*;
 use crate::manager::BLOCK_GC_DELAY;
 
 pub struct BlockRc {
-	pub(crate) rc: sled::Tree,
+	pub(crate) rc: db::Tree,
 }
 
 impl BlockRc {
-	pub(crate) fn new(rc: sled::Tree) -> Self {
+	pub(crate) fn new(rc: db::Tree) -> Self {
 		Self { rc }
 	}
 
 	/// Increment the reference counter associated to a hash.
 	/// Returns true if the RC goes from zero to nonzero.
-	pub(crate) fn block_incref(&self, hash: &Hash) -> Result<bool, Error> {
-		let old_rc = self
-			.rc
-			.fetch_and_update(&hash, |old| RcEntry::parse_opt(old).increment().serialize())?;
-		let old_rc = RcEntry::parse_opt(old_rc);
+	pub(crate) fn block_incref(
+		&self,
+		tx: &mut db::Transaction,
+		hash: &Hash,
+	) -> db::TxOpResult<bool> {
+		let old_rc = RcEntry::parse_opt(tx.get(&self.rc, &hash)?);
+		match old_rc.increment().serialize() {
+			Some(x) => tx.insert(&self.rc, &hash, x)?,
+			None => unreachable!(),
+		};
 		Ok(old_rc.is_zero())
 	}
 
 	/// Decrement the reference counter associated to a hash.
 	/// Returns true if the RC is now zero.
-	pub(crate) fn block_decref(&self, hash: &Hash) -> Result<bool, Error> {
-		let new_rc = self
-			.rc
-			.update_and_fetch(&hash, |old| RcEntry::parse_opt(old).decrement().serialize())?;
-		let new_rc = RcEntry::parse_opt(new_rc);
+	pub(crate) fn block_decref(
+		&self,
+		tx: &mut db::Transaction,
+		hash: &Hash,
+	) -> db::TxOpResult<bool> {
+		let new_rc = RcEntry::parse_opt(tx.get(&self.rc, &hash)?).decrement();
+		match new_rc.serialize() {
+			Some(x) => tx.insert(&self.rc, &hash, x)?,
+			None => tx.remove(&self.rc, &hash)?,
+		};
 		Ok(matches!(new_rc, RcEntry::Deletable { .. }))
 	}
 
@@ -44,12 +56,15 @@ impl BlockRc {
 	/// deletion time has passed
 	pub(crate) fn clear_deleted_block_rc(&self, hash: &Hash) -> Result<(), Error> {
 		let now = now_msec();
-		self.rc.update_and_fetch(&hash, |rcval| {
-			let updated = match RcEntry::parse_opt(rcval) {
-				RcEntry::Deletable { at_time } if now > at_time => RcEntry::Absent,
-				v => v,
+		self.rc.db().transaction(|mut tx| {
+			let rcval = RcEntry::parse_opt(tx.get(&self.rc, &hash)?);
+			match rcval {
+				RcEntry::Deletable { at_time } if now > at_time => {
+					tx.remove(&self.rc, &hash)?;
+				}
+				_ => (),
 			};
-			updated.serialize()
+			tx.commit(())
 		})?;
 		Ok(())
 	}
diff --git a/src/block/repair.rs b/src/block/repair.rs
new file mode 100644
index 00000000..e2884b69
--- /dev/null
+++ b/src/block/repair.rs
@@ -0,0 +1,466 @@
+use core::ops::Bound;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+use tokio::fs;
+use tokio::select;
+use tokio::sync::mpsc;
+use tokio::sync::watch;
+
+use garage_util::background::*;
+use garage_util::data::*;
+use garage_util::error::*;
+use garage_util::persister::Persister;
+use garage_util::time::*;
+use garage_util::tranquilizer::Tranquilizer;
+
+use crate::manager::*;
+
+// Full scrub every 30 days
+const SCRUB_INTERVAL: Duration = Duration::from_secs(3600 * 24 * 30);
+// Scrub tranquility is initially set to 4, but can be changed in the CLI
+// and the updated version is persisted over Garage restarts
+const INITIAL_SCRUB_TRANQUILITY: u32 = 4;
+
+// ---- ---- ----
+// FIRST KIND OF REPAIR: FINDING MISSING BLOCKS/USELESS BLOCKS
+// This is a one-shot repair operation that can be launched,
+// checks everything, and then exits.
+// ---- ---- ----
+
+pub struct RepairWorker {
+	manager: Arc<BlockManager>,
+	next_start: Option<Hash>,
+	block_iter: Option<BlockStoreIterator>,
+}
+
+impl RepairWorker {
+	pub fn new(manager: Arc<BlockManager>) -> Self {
+		Self {
+			manager,
+			next_start: None,
+			block_iter: None,
+		}
+	}
+}
+
+#[async_trait]
+impl Worker for RepairWorker {
+	fn name(&self) -> String {
+		"Block repair worker".into()
+	}
+
+	fn info(&self) -> Option<String> {
+		match self.block_iter.as_ref() {
+			None => {
+				let idx_bytes = self
+					.next_start
+					.as_ref()
+					.map(|x| x.as_slice())
+					.unwrap_or(&[]);
+				let idx_bytes = if idx_bytes.len() > 4 {
+					&idx_bytes[..4]
+				} else {
+					idx_bytes
+				};
+				Some(format!("Phase 1: {}", hex::encode(idx_bytes)))
+			}
+			Some(bi) => Some(format!("Phase 2: {:.2}% done", bi.progress() * 100.)),
+		}
+	}
+
+	async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+		match self.block_iter.as_mut() {
+			None => {
+				// Phase 1: Repair blocks from RC table.
+
+				// We have to do this complicated two-step process where we first read a bunch
+				// of hashes from the RC table, and then insert them in the to-resync queue,
+				// because of SQLite. Basically, as long as we have an iterator on a DB table,
+				// we can't do anything else on the DB. The naive approach (which we had previously)
+				// of just iterating on the RC table and inserting items one to one in the resync
+				// queue can't work here, it would just provoke a deadlock in the SQLite adapter code.
+				// This is mostly because the Rust bindings for SQLite assume a worst-case scenario
+				// where SQLite is not compiled in thread-safe mode, so we have to wrap everything
+				// in a mutex (see db/sqlite_adapter.rs and discussion in PR #322).
+				// TODO: maybe do this with tokio::task::spawn_blocking ?
+				let mut batch_of_hashes = vec![];
+				let start_bound = match self.next_start.as_ref() {
+					None => Bound::Unbounded,
+					Some(x) => Bound::Excluded(x.as_slice()),
+				};
+				for entry in self
+					.manager
+					.rc
+					.rc
+					.range::<&[u8], _>((start_bound, Bound::Unbounded))?
+				{
+					let (hash, _) = entry?;
+					let hash = Hash::try_from(&hash[..]).unwrap();
+					batch_of_hashes.push(hash);
+					if batch_of_hashes.len() >= 1000 {
+						break;
+					}
+				}
+				if batch_of_hashes.is_empty() {
+					// move on to phase 2
+					self.block_iter = Some(BlockStoreIterator::new(&self.manager));
+					return Ok(WorkerState::Busy);
+				}
+
+				for hash in batch_of_hashes.into_iter() {
+					self.manager
+						.resync
+						.put_to_resync(&hash, Duration::from_secs(0))?;
+					self.next_start = Some(hash)
+				}
+
+				Ok(WorkerState::Busy)
+			}
+			Some(bi) => {
+				// Phase 2: Repair blocks actually on disk
+				// Lists all blocks on disk and adds them to the resync queue.
+				// This allows us to find blocks we are storing but don't actually need,
+				// so that we can offload them if necessary and then delete them locally.
+				if let Some(hash) = bi.next().await? {
+					self.manager
+						.resync
+						.put_to_resync(&hash, Duration::from_secs(0))?;
+					Ok(WorkerState::Busy)
+				} else {
+					Ok(WorkerState::Done)
+				}
+			}
+		}
+	}
+
+	async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
+		unreachable!()
+	}
+}
+
+// ---- ---- ----
+// SECOND KIND OF REPAIR: SCRUBBING THE DATASTORE
+// This is significantly more complex than the process above,
+// as it is a continuously-running task that triggers automatically
+// every SCRUB_INTERVAL, but can also be triggered manually
+// and whose parameter (esp. speed) can be controlled at runtime.
+// ---- ---- ----
+
+pub struct ScrubWorker {
+	manager: Arc<BlockManager>,
+	rx_cmd: mpsc::Receiver<ScrubWorkerCommand>,
+
+	work: ScrubWorkerState,
+	tranquilizer: Tranquilizer,
+
+	persister: Persister<ScrubWorkerPersisted>,
+	persisted: ScrubWorkerPersisted,
+}
+
+#[derive(Serialize, Deserialize)]
+struct ScrubWorkerPersisted {
+	tranquility: u32,
+	time_last_complete_scrub: u64,
+	corruptions_detected: u64,
+}
+
+enum ScrubWorkerState {
+	Running(BlockStoreIterator),
+	Paused(BlockStoreIterator, u64), // u64 = time when to resume scrub
+	Finished,
+}
+
+impl Default for ScrubWorkerState {
+	fn default() -> Self {
+		ScrubWorkerState::Finished
+	}
+}
+
+#[derive(Debug)]
+pub enum ScrubWorkerCommand {
+	Start,
+	Pause(Duration),
+	Resume,
+	Cancel,
+	SetTranquility(u32),
+}
+
+impl ScrubWorker {
+	pub fn new(manager: Arc<BlockManager>, rx_cmd: mpsc::Receiver<ScrubWorkerCommand>) -> Self {
+		let persister = Persister::new(&manager.system.metadata_dir, "scrub_info");
+		let persisted = match persister.load() {
+			Ok(v) => v,
+			Err(_) => ScrubWorkerPersisted {
+				time_last_complete_scrub: 0,
+				tranquility: INITIAL_SCRUB_TRANQUILITY,
+				corruptions_detected: 0,
+			},
+		};
+		Self {
+			manager,
+			rx_cmd,
+			work: ScrubWorkerState::Finished,
+			tranquilizer: Tranquilizer::new(30),
+			persister,
+			persisted,
+		}
+	}
+
+	async fn handle_cmd(&mut self, cmd: ScrubWorkerCommand) {
+		match cmd {
+			ScrubWorkerCommand::Start => {
+				self.work = match std::mem::take(&mut self.work) {
+					ScrubWorkerState::Finished => {
+						let iterator = BlockStoreIterator::new(&self.manager);
+						ScrubWorkerState::Running(iterator)
+					}
+					work => {
+						error!("Cannot start scrub worker: already running!");
+						work
+					}
+				};
+			}
+			ScrubWorkerCommand::Pause(dur) => {
+				self.work = match std::mem::take(&mut self.work) {
+					ScrubWorkerState::Running(it) | ScrubWorkerState::Paused(it, _) => {
+						ScrubWorkerState::Paused(it, now_msec() + dur.as_millis() as u64)
+					}
+					work => {
+						error!("Cannot pause scrub worker: not running!");
+						work
+					}
+				};
+			}
+			ScrubWorkerCommand::Resume => {
+				self.work = match std::mem::take(&mut self.work) {
+					ScrubWorkerState::Paused(it, _) => ScrubWorkerState::Running(it),
+					work => {
+						error!("Cannot resume scrub worker: not paused!");
+						work
+					}
+				};
+			}
+			ScrubWorkerCommand::Cancel => {
+				self.work = match std::mem::take(&mut self.work) {
+					ScrubWorkerState::Running(_) | ScrubWorkerState::Paused(_, _) => {
+						ScrubWorkerState::Finished
+					}
+					work => {
+						error!("Cannot cancel scrub worker: not running!");
+						work
+					}
+				}
+			}
+			ScrubWorkerCommand::SetTranquility(t) => {
+				self.persisted.tranquility = t;
+				if let Err(e) = self.persister.save_async(&self.persisted).await {
+					error!("Could not save new tranquilitiy value: {}", e);
+				}
+			}
+		}
+	}
+}
+
+#[async_trait]
+impl Worker for ScrubWorker {
+	fn name(&self) -> String {
+		"Block scrub worker".into()
+	}
+
+	fn info(&self) -> Option<String> {
+		let s = match &self.work {
+			ScrubWorkerState::Running(bsi) => format!(
+				"{:.2}% done (tranquility = {})",
+				bsi.progress() * 100.,
+				self.persisted.tranquility
+			),
+			ScrubWorkerState::Paused(bsi, rt) => {
+				format!(
+					"Paused, {:.2}% done, resumes at {}",
+					bsi.progress() * 100.,
+					msec_to_rfc3339(*rt)
+				)
+			}
+			ScrubWorkerState::Finished => format!(
+				"Last completed scrub: {}",
+				msec_to_rfc3339(self.persisted.time_last_complete_scrub)
+			),
+		};
+		Some(format!(
+			"{} ; corruptions detected: {}",
+			s, self.persisted.corruptions_detected
+		))
+	}
+
+	async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+		match self.rx_cmd.try_recv() {
+			Ok(cmd) => self.handle_cmd(cmd).await,
+			Err(mpsc::error::TryRecvError::Disconnected) => return Ok(WorkerState::Done),
+			Err(mpsc::error::TryRecvError::Empty) => (),
+		};
+
+		match &mut self.work {
+			ScrubWorkerState::Running(bsi) => {
+				self.tranquilizer.reset();
+				if let Some(hash) = bsi.next().await? {
+					match self.manager.read_block(&hash).await {
+						Err(Error::CorruptData(_)) => {
+							error!("Found corrupt data block during scrub: {:?}", hash);
+							self.persisted.corruptions_detected += 1;
+							self.persister.save_async(&self.persisted).await?;
+						}
+						Err(e) => return Err(e),
+						_ => (),
+					};
+					Ok(self
+						.tranquilizer
+						.tranquilize_worker(self.persisted.tranquility))
+				} else {
+					self.persisted.time_last_complete_scrub = now_msec();
+					self.persister.save_async(&self.persisted).await?;
+					self.work = ScrubWorkerState::Finished;
+					self.tranquilizer.clear();
+					Ok(WorkerState::Idle)
+				}
+			}
+			_ => Ok(WorkerState::Idle),
+		}
+	}
+
+	async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
+		let (wait_until, command) = match &self.work {
+			ScrubWorkerState::Running(_) => return WorkerState::Busy,
+			ScrubWorkerState::Paused(_, resume_time) => (*resume_time, ScrubWorkerCommand::Resume),
+			ScrubWorkerState::Finished => (
+				self.persisted.time_last_complete_scrub + SCRUB_INTERVAL.as_millis() as u64,
+				ScrubWorkerCommand::Start,
+			),
+		};
+
+		let now = now_msec();
+		if now >= wait_until {
+			self.handle_cmd(command).await;
+			return WorkerState::Busy;
+		}
+		let delay = Duration::from_millis(wait_until - now);
+		select! {
+			_ = tokio::time::sleep(delay) => self.handle_cmd(command).await,
+			cmd = self.rx_cmd.recv() => if let Some(cmd) = cmd {
+				self.handle_cmd(cmd).await;
+			} else {
+				return WorkerState::Done;
+			}
+		}
+
+		match &self.work {
+			ScrubWorkerState::Running(_) => WorkerState::Busy,
+			_ => WorkerState::Idle,
+		}
+	}
+}
+
+// ---- ---- ----
+// UTILITY FOR ENUMERATING THE BLOCK STORE
+// ---- ---- ----
+
+struct BlockStoreIterator {
+	path: Vec<ReadingDir>,
+}
+
+enum ReadingDir {
+	Pending(PathBuf),
+	Read {
+		subpaths: Vec<fs::DirEntry>,
+		pos: usize,
+	},
+}
+
+impl BlockStoreIterator {
+	fn new(manager: &BlockManager) -> Self {
+		let root_dir = manager.data_dir.clone();
+		Self {
+			path: vec![ReadingDir::Pending(root_dir)],
+		}
+	}
+
+	/// Returns progress done, between 0 and 1
+	fn progress(&self) -> f32 {
+		if self.path.is_empty() {
+			1.0
+		} else {
+			let mut ret = 0.0;
+			let mut next_div = 1;
+			for p in self.path.iter() {
+				match p {
+					ReadingDir::Pending(_) => break,
+					ReadingDir::Read { subpaths, pos } => {
+						next_div *= subpaths.len();
+						ret += ((*pos - 1) as f32) / (next_div as f32);
+					}
+				}
+			}
+			ret
+		}
+	}
+
+	async fn next(&mut self) -> Result<Option<Hash>, Error> {
+		loop {
+			let last_path = match self.path.last_mut() {
+				None => return Ok(None),
+				Some(lp) => lp,
+			};
+
+			if let ReadingDir::Pending(path) = last_path {
+				let mut reader = fs::read_dir(&path).await?;
+				let mut subpaths = vec![];
+				while let Some(ent) = reader.next_entry().await? {
+					subpaths.push(ent);
+				}
+				*last_path = ReadingDir::Read { subpaths, pos: 0 };
+			}
+
+			let (subpaths, pos) = match *last_path {
+				ReadingDir::Read {
+					ref subpaths,
+					ref mut pos,
+				} => (subpaths, pos),
+				ReadingDir::Pending(_) => unreachable!(),
+			};
+
+			let data_dir_ent = match subpaths.get(*pos) {
+				None => {
+					self.path.pop();
+					continue;
+				}
+				Some(ent) => {
+					*pos += 1;
+					ent
+				}
+			};
+
+			let name = data_dir_ent.file_name();
+			let name = if let Ok(n) = name.into_string() {
+				n
+			} else {
+				continue;
+			};
+			let ent_type = data_dir_ent.file_type().await?;
+
+			let name = name.strip_suffix(".zst").unwrap_or(&name);
+			if name.len() == 2 && hex::decode(&name).is_ok() && ent_type.is_dir() {
+				let path = data_dir_ent.path();
+				self.path.push(ReadingDir::Pending(path));
+			} else if name.len() == 64 {
+				if let Ok(h) = hex::decode(&name) {
+					let mut hash = [0u8; 32];
+					hash.copy_from_slice(&h);
+					return Ok(Some(hash.into()));
+				}
+			}
+		}
+	}
+}
diff --git a/src/block/resync.rs b/src/block/resync.rs
new file mode 100644
index 00000000..ada3ac54
--- /dev/null
+++ b/src/block/resync.rs
@@ -0,0 +1,589 @@
+use std::collections::HashSet;
+use std::convert::TryInto;
+use std::sync::{Arc, Mutex};
+use std::time::Duration;
+
+use arc_swap::ArcSwap;
+use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+
+use tokio::select;
+use tokio::sync::{watch, Notify};
+
+use opentelemetry::{
+	trace::{FutureExt as OtelFutureExt, TraceContextExt, Tracer},
+	Context, KeyValue,
+};
+
+use garage_db as db;
+use garage_db::counted_tree_hack::CountedTree;
+
+use garage_util::background::*;
+use garage_util::data::*;
+use garage_util::error::*;
+use garage_util::metrics::RecordDuration;
+use garage_util::persister::Persister;
+use garage_util::time::*;
+use garage_util::tranquilizer::Tranquilizer;
+
+use garage_rpc::system::System;
+use garage_rpc::*;
+
+use garage_table::replication::TableReplication;
+
+use crate::manager::*;
+
+// The delay between the time where a resync operation fails
+// and the time when it is retried, with exponential backoff
+// (multiplied by 2, 4, 8, 16, etc. for every consecutive failure).
+pub(crate) const RESYNC_RETRY_DELAY: Duration = Duration::from_secs(60);
+// The minimum retry delay is 60 seconds = 1 minute
+// The maximum retry delay is 60 seconds * 2^6 = 60 seconds << 6 = 64 minutes (~1 hour)
+pub(crate) const RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER: u64 = 6;
+
+// No more than 4 resync workers can be running in the system
+pub(crate) const MAX_RESYNC_WORKERS: usize = 4;
+// Resync tranquility is initially set to 2, but can be changed in the CLI
+// and the updated version is persisted over Garage restarts
+const INITIAL_RESYNC_TRANQUILITY: u32 = 2;
+/// Holds the resync queue, the per-block error counters and the persisted worker settings
+pub struct BlockResyncManager {
+	pub(crate) queue: CountedTree,
+	pub(crate) notify: Notify,
+	pub(crate) errors: CountedTree,
+	// Queue keys that a worker is currently processing (filled by get_block_to_resync)
+	busy_set: BusySet,
+	// Number of workers and tranquility: saved through `persister`, cached in `persisted`
+	persister: Persister<ResyncPersistedConfig>,
+	persisted: ArcSwap<ResyncPersistedConfig>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Copy)]
+struct ResyncPersistedConfig {
+	n_workers: usize,
+	tranquility: u32,
+}
+
+enum ResyncIterResult {
+	BusyDidSomething,
+	BusyDidNothing,
+	IdleFor(Duration),
+}
+
+type BusySet = Arc<Mutex<HashSet<Vec<u8>>>>;
+
+struct BusyBlock {
+	time_bytes: Vec<u8>,
+	hash_bytes: Vec<u8>,
+	busy_set: BusySet,
+}
+
+impl BlockResyncManager {
+	pub(crate) fn new(db: &db::Db, system: &System) -> Self {
+		let queue = db
+			.open_tree("block_local_resync_queue")
+			.expect("Unable to open block_local_resync_queue tree");
+		let queue = CountedTree::new(queue).expect("Could not count block_local_resync_queue");
+
+		let errors = db
+			.open_tree("block_local_resync_errors")
+			.expect("Unable to open block_local_resync_errors tree");
+		let errors = CountedTree::new(errors).expect("Could not count block_local_resync_errors");
+
+		let persister = Persister::new(&system.metadata_dir, "resync_cfg");
+		let persisted = match persister.load() {
+			Ok(v) => v,
+			Err(_) => ResyncPersistedConfig {
+				n_workers: 1,
+				tranquility: INITIAL_RESYNC_TRANQUILITY,
+			},
+		};
+
+		Self {
+			queue,
+			notify: Notify::new(),
+			errors,
+			busy_set: Arc::new(Mutex::new(HashSet::new())),
+			persister,
+			persisted: ArcSwap::new(Arc::new(persisted)),
+		}
+	}
+
+	/// Get the length of the resync queue (number of entries waiting to be processed)
+	pub fn queue_len(&self) -> Result<usize, Error> {
+		// This currently can't return an error because the CountedTree hack
+		// doesn't error on .len(), but this will change when we remove the hack
+		// (hopefully someday!)
+		Ok(self.queue.len())
+	}
+
+	/// Get number of blocks that have an error
+	pub fn errors_len(&self) -> Result<usize, Error> {
+		// (see queue_len comment)
+		Ok(self.errors.len())
+	}
+
+	// ---- Resync loop ----
+
+	// This part manages a queue of blocks that need to be
+	// "resynchronized", i.e. that need to have a check that
+	// they are present if we need them, or that they are
+	// deleted once the garbage collection delay has passed.
+	//
+	// Here are some explanations on how the resync queue works.
+	// There are two database trees that are used to keep information
+	// about the status of blocks that need to be resynchronized:
+	//
+	// - resync.queue: a tree that is ordered first by a timestamp
+	//   (in milliseconds since Unix epoch) that is the time at which
+	//   the resync must be done, and second by block hash.
+	//   The key in this tree is just:
+	//       concat(timestamp (8 bytes), hash (32 bytes))
+	//   The value is the same 32-byte hash.
+	//
+	// - resync.errors: a tree that indicates for each block
+	//   if the last resync resulted in an error, and if so,
+	//   the following two pieces of information (see the ErrorCounter struct):
+	//   - how many consecutive resync errors for this block?
+	//   - when was the last try?
+	//   These two pieces of information are used to implement an
+	//   exponential backoff retry strategy.
+	//   The key in this tree is the 32-byte hash of the block,
+	//   and the value is the encoded ErrorCounter value.
+	//
+	// We need to have these two trees, because the resync queue
+	// is not just a queue of items to process, but a set of items
+	// that are waiting a specific delay until we can process them
+	// (the delay being necessary both internally for the exponential
+	// backoff strategy, and exposed as a parameter when adding items
+	// to the queue, e.g. to wait until the GC delay has passed).
+	// This is why we need one tree ordered by time, and one
+	// ordered by identifier of item to be processed (block hash).
+	//
+	// When the worker wants to process an item it takes from
+	// resync.queue, it checks in resync.errors that if there is an
+	// exponential back-off delay to await, it has passed before we
+	// process the item. If not, the item in the queue is skipped
+	// (but added back for later processing after the time of the
+	// delay).
+	//
+	// An alternative that would have seemed natural is to
+	// only add items to resync.queue with a processing time that is
+	// after the delay, but there are several issues with this:
+	// - This requires to synchronize updates to resync.queue and
+	//   resync.errors (with the current model, there is only one thread,
+	//   the worker thread, that accesses resync.errors,
+	//   so no need to synchronize) by putting them both in a lock.
+	//   This would mean that block_incref might need to take a lock
+	//   before doing its thing, meaning it has much more chances of
+	//   not completing successfully if something bad happens to Garage.
+	//   Currently Garage is not able to recover from block_incref that
+	//   doesn't complete successfully, because it is necessary to ensure
+	//   the consistency between the state of the block manager and
+	//   information in the BlockRef table.
+	// - If a resync fails, we put that block in the resync.errors table,
+	//   and also add it back to resync.queue to be processed after
+	//   the exponential back-off delay,
+	//   but maybe the block is already scheduled to be resynced again
+	//   at another time that is before the exponential back-off delay,
+	//   and we have no way to check that easily. This means that
+	//   in all cases, we need to check the resync.errors table
+	//   in the resync loop at the time when a block is popped from
+	//   the resync.queue.
+	// Overall, the current design is therefore simpler and more robust
+	// because it tolerates inconsistencies between the resync.queue
+	// and resync.errors table (items being scheduled in resync.queue
+	// for times that are earlier than the exponential back-off delay
+	// is a natural condition that is handled properly).
+
+	pub(crate) fn put_to_resync(&self, hash: &Hash, delay: Duration) -> db::Result<()> {
+		let when = now_msec() + delay.as_millis() as u64;
+		self.put_to_resync_at(hash, when)
+	}
+	/// Queues `hash` under the key `concat(when as 8 big-endian bytes, hash)` and wakes waiting workers
+	pub(crate) fn put_to_resync_at(&self, hash: &Hash, when: u64) -> db::Result<()> {
+		trace!("Put resync_queue: {} {:?}", when, hash);
+		let mut key = u64::to_be_bytes(when).to_vec();
+		key.extend(hash.as_ref());
+		self.queue.insert(key, hash.as_ref())?;
+		self.notify.notify_waiters();
+		Ok(())
+	}
+
+	async fn resync_iter(&self, manager: &BlockManager) -> Result<ResyncIterResult, db::Error> {
+		if let Some(block) = self.get_block_to_resync()? {
+			let time_msec = u64::from_be_bytes(block.time_bytes[0..8].try_into().unwrap());
+			let now = now_msec();
+
+			if now >= time_msec {
+				let hash = Hash::try_from(&block.hash_bytes[..]).unwrap();
+
+				if let Some(ec) = self.errors.get(hash.as_slice())? {
+					let ec = ErrorCounter::decode(&ec);
+					if now < ec.next_try() {
+						// if next retry after an error is not yet,
+						// don't do resync and return early, but still
+						// make sure the item is still in queue at expected time
+						self.put_to_resync_at(&hash, ec.next_try())?;
+						// ec.next_try() > now >= time_msec, so this remove
+						// is not removing the one we added just above
+						// (we want to do the remove after the insert to ensure
+						// that the item is not lost if we crash in-between)
+						self.queue.remove(&block.time_bytes)?;
+						return Ok(ResyncIterResult::BusyDidNothing);
+					}
+				}
+
+				let tracer = opentelemetry::global::tracer("garage");
+				let trace_id = gen_uuid();
+				let span = tracer
+					.span_builder("Resync block")
+					.with_trace_id(
+						opentelemetry::trace::TraceId::from_hex(&hex::encode(
+							&trace_id.as_slice()[..16],
+						))
+						.unwrap(),
+					)
+					.with_attributes(vec![KeyValue::new("block", format!("{:?}", hash))])
+					.start(&tracer);
+
+				let res = self
+					.resync_block(manager, &hash)
+					.with_context(Context::current_with_span(span))
+					.bound_record_duration(&manager.metrics.resync_duration)
+					.await;
+
+				manager.metrics.resync_counter.add(1);
+
+				if let Err(e) = &res {
+					manager.metrics.resync_error_counter.add(1);
+					warn!("Error when resyncing {:?}: {}", hash, e);
+
+					let err_counter = match self.errors.get(hash.as_slice())? {
+						Some(ec) => ErrorCounter::decode(&ec).add1(now + 1),
+						None => ErrorCounter::new(now + 1),
+					};
+
+					self.errors.insert(hash.as_slice(), err_counter.encode())?;
+
+					self.put_to_resync_at(&hash, err_counter.next_try())?;
+					// err_counter.next_try() >= now + 1 > now,
+					// the entry we remove from the queue is not
+					// the entry we inserted with put_to_resync_at
+					self.queue.remove(&block.time_bytes)?;
+				} else {
+					self.errors.remove(hash.as_slice())?;
+					self.queue.remove(&block.time_bytes)?;
+				}
+
+				Ok(ResyncIterResult::BusyDidSomething)
+			} else {
+				Ok(ResyncIterResult::IdleFor(Duration::from_millis(
+					time_msec - now,
+				)))
+			}
+		} else {
+			// Here we wait either for a notification that an item has been
+			// added to the queue, or for a constant delay of 10 secs to expire.
+			// The delay avoids a race condition where the notification happens
+			// between the time we checked the queue and the first poll
+			// to notify.notified() (in wait_for_work): if that happens, we'll
+			// just loop back 10 seconds later, which is fine.
+			Ok(ResyncIterResult::IdleFor(Duration::from_secs(10)))
+		}
+	}
+	/// Returns the first queue entry that no worker is processing, registering its key in the busy set
+	fn get_block_to_resync(&self) -> Result<Option<BusyBlock>, db::Error> {
+		let mut busy = self.busy_set.lock().unwrap();
+		for it in self.queue.iter()? {
+			let (time_bytes, hash_bytes) = it?;
+			if !busy.contains(&time_bytes) {
+				busy.insert(time_bytes.clone());
+				return Ok(Some(BusyBlock {
+					time_bytes,
+					hash_bytes,
+					busy_set: self.busy_set.clone(),
+				}));
+			}
+		}
+		Ok(None)
+	}
+
+	async fn resync_block(&self, manager: &BlockManager, hash: &Hash) -> Result<(), Error> {
+		let BlockStatus { exists, needed } = manager.check_block_status(hash).await?;
+
+		if exists != needed.is_needed() || exists != needed.is_nonzero() {
+			debug!(
+				"Resync block {:?}: exists {}, nonzero rc {}, deletable {}",
+				hash,
+				exists,
+				needed.is_nonzero(),
+				needed.is_deletable(),
+			);
+		}
+
+		if exists && needed.is_deletable() {
+			info!("Resync block {:?}: offloading and deleting", hash);
+
+			let mut who = manager.replication.write_nodes(hash);
+			if who.len() < manager.replication.write_quorum() {
+				return Err(Error::Message("Not trying to offload block because we don't have a quorum of nodes to write to".to_string()));
+			}
+			who.retain(|id| *id != manager.system.id);
+
+			let who_needs_resps = manager
+				.system
+				.rpc
+				.call_many(
+					&manager.endpoint,
+					&who,
+					BlockRpc::NeedBlockQuery(*hash),
+					RequestStrategy::with_priority(PRIO_BACKGROUND),
+				)
+				.await?;
+
+			let mut need_nodes = vec![];
+			for (node, needed) in who_needs_resps {
+				match needed.err_context("NeedBlockQuery RPC")? {
+					BlockRpc::NeedBlockReply(needed) => {
+						if needed {
+							need_nodes.push(node);
+						}
+					}
+					m => {
+						return Err(Error::unexpected_rpc_message(m));
+					}
+				}
+			}
+
+			if !need_nodes.is_empty() {
+				trace!(
+					"Block {:?} needed by {} nodes, sending",
+					hash,
+					need_nodes.len()
+				);
+
+				for node in need_nodes.iter() {
+					manager
+						.metrics
+						.resync_send_counter
+						.add(1, &[KeyValue::new("to", format!("{:?}", node))]);
+				}
+
+				let block = manager.read_block(hash).await?;
+				let (header, bytes) = block.into_parts();
+				let put_block_message = Req::new(BlockRpc::PutBlock {
+					hash: *hash,
+					header,
+				})?
+				.with_stream_from_buffer(bytes);
+				manager
+					.system
+					.rpc
+					.try_call_many(
+						&manager.endpoint,
+						&need_nodes[..],
+						put_block_message,
+						RequestStrategy::with_priority(PRIO_BACKGROUND)
+							.with_quorum(need_nodes.len()),
+					)
+					.await
+					.err_context("PutBlock RPC")?;
+			}
+			info!(
+				"Deleting unneeded block {:?}, offload finished ({} / {})",
+				hash,
+				need_nodes.len(),
+				who.len()
+			);
+
+			manager.delete_if_unneeded(hash).await?;
+
+			manager.rc.clear_deleted_block_rc(hash)?;
+		}
+
+		if needed.is_nonzero() && !exists {
+			info!(
+				"Resync block {:?}: fetching absent but needed block (refcount > 0)",
+				hash
+			);
+
+			let block_data = manager.rpc_get_raw_block(hash, None).await?;
+
+			manager.metrics.resync_recv_counter.add(1);
+
+			manager.write_block(hash, &block_data).await?;
+		}
+
+		Ok(())
+	}
+
+	async fn update_persisted(
+		&self,
+		update: impl Fn(&mut ResyncPersistedConfig),
+	) -> Result<(), Error> {
+		let mut cfg: ResyncPersistedConfig = *self.persisted.load().as_ref();
+		update(&mut cfg);
+		self.persister.save_async(&cfg).await?;
+		self.persisted.store(Arc::new(cfg));
+		self.notify.notify_waiters();
+		Ok(())
+	}
+
+	pub async fn set_n_workers(&self, n_workers: usize) -> Result<(), Error> {
+		if !(1..=MAX_RESYNC_WORKERS).contains(&n_workers) {
+			return Err(Error::Message(format!(
+				"Invalid number of resync workers, must be between 1 and {}",
+				MAX_RESYNC_WORKERS
+			)));
+		}
+		self.update_persisted(|cfg| cfg.n_workers = n_workers).await
+	}
+
+	pub async fn set_tranquility(&self, tranquility: u32) -> Result<(), Error> {
+		self.update_persisted(|cfg| cfg.tranquility = tranquility)
+			.await
+	}
+}
+// Dropping a BusyBlock removes its queue key from the shared busy set
+impl Drop for BusyBlock {
+	fn drop(&mut self) {
+		let mut busy = self.busy_set.lock().unwrap();
+		busy.remove(&self.time_bytes);
+	}
+}
+
+pub(crate) struct ResyncWorker {
+	index: usize,
+	manager: Arc<BlockManager>,
+	tranquilizer: Tranquilizer,
+	next_delay: Duration,
+}
+
+impl ResyncWorker {
+	pub(crate) fn new(index: usize, manager: Arc<BlockManager>) -> Self {
+		Self {
+			index,
+			manager,
+			tranquilizer: Tranquilizer::new(30),
+			next_delay: Duration::from_secs(10),
+		}
+	}
+}
+
+#[async_trait]
+impl Worker for ResyncWorker {
+	fn name(&self) -> String {
+		format!("Block resync worker #{}", self.index + 1)
+	}
+
+	fn info(&self) -> Option<String> {
+		let persisted = self.manager.resync.persisted.load();
+
+		if self.index >= persisted.n_workers {
+			return Some("(unused)".into());
+		}
+
+		let mut ret = vec![];
+		ret.push(format!("tranquility = {}", persisted.tranquility));
+
+		let qlen = self.manager.resync.queue_len().unwrap_or(0);
+		if qlen > 0 {
+			ret.push(format!("{} blocks in queue", qlen));
+		}
+
+		let elen = self.manager.resync.errors_len().unwrap_or(0);
+		if elen > 0 {
+			ret.push(format!("{} blocks in error state", elen));
+		}
+
+		Some(ret.join(", "))
+	}
+
+	async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+		if self.index >= self.manager.resync.persisted.load().n_workers {
+			return Ok(WorkerState::Idle);
+		}
+
+		self.tranquilizer.reset();
+		match self.manager.resync.resync_iter(&self.manager).await {
+			Ok(ResyncIterResult::BusyDidSomething) => Ok(self
+				.tranquilizer
+				.tranquilize_worker(self.manager.resync.persisted.load().tranquility)),
+			Ok(ResyncIterResult::BusyDidNothing) => Ok(WorkerState::Busy),
+			Ok(ResyncIterResult::IdleFor(delay)) => {
+				self.next_delay = delay;
+				Ok(WorkerState::Idle)
+			}
+			Err(e) => {
+				// The errors that we have here are only Sled errors
+				// We don't really know how to handle them so just ¯\_(ツ)_/¯
+				// (there is kind of an assumption that Sled won't error on us,
+				// if it does there is not much we can do -- TODO should we just panic?)
+				// Here we just give the error to the worker manager,
+				// it will print it to the logs and increment a counter
+				Err(e.into())
+			}
+		}
+	}
+
+	async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
+		while self.index >= self.manager.resync.persisted.load().n_workers {
+			self.manager.resync.notify.notified().await
+		}
+
+		select! {
+			_ = tokio::time::sleep(self.next_delay) => (),
+			_ = self.manager.resync.notify.notified() => (),
+		};
+
+		WorkerState::Busy
+	}
+}
+
+/// Counts the number of errors when resyncing a block,
+/// and the time of the last try.
+/// Used to implement exponential backoff.
+#[derive(Clone, Copy, Debug)]
+struct ErrorCounter {
+	errors: u64,
+	last_try: u64,
+}
+
+impl ErrorCounter {
+	fn new(now: u64) -> Self {
+		Self {
+			errors: 1,
+			last_try: now,
+		}
+	}
+	/// Parses the 16-byte big-endian encoding produced by `encode` (panics on shorter input)
+	fn decode(data: &[u8]) -> Self {
+		Self {
+			errors: u64::from_be_bytes(data[0..8].try_into().unwrap()),
+			last_try: u64::from_be_bytes(data[8..16].try_into().unwrap()),
+		}
+	}
+	fn encode(&self) -> Vec<u8> {
+		[
+			u64::to_be_bytes(self.errors),
+			u64::to_be_bytes(self.last_try),
+		]
+		.concat()
+	}
+	/// Records one more consecutive failure, whose attempt happened at `now`
+	fn add1(self, now: u64) -> Self {
+		Self {
+			errors: self.errors + 1,
+			last_try: now,
+		}
+	}
+	/// Back-off delay in msec; saturating_sub keeps a decoded count of 0 from underflowing the shift
+	fn delay_msec(&self) -> u64 {
+		(RESYNC_RETRY_DELAY.as_millis() as u64)
+			<< std::cmp::min(self.errors.saturating_sub(1), RESYNC_RETRY_DELAY_MAX_BACKOFF_POWER)
+	}
+	fn next_try(&self) -> u64 {
+		self.last_try + self.delay_msec()
+	}
+}
diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml
new file mode 100644
index 00000000..62dda2ca
--- /dev/null
+++ b/src/db/Cargo.toml
@@ -0,0 +1,39 @@
+[package]
+name = "garage_db"
+version = "0.8.0"
+authors = ["Alex Auvolat <alex@adnab.me>"]
+edition = "2018"
+license = "AGPL-3.0"
+description = "Abstraction over multiple key/value storage engines that supports transactions"
+repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
+readme = "../../README.md"
+
+[lib]
+path = "lib.rs"
+
+[[bin]]
+name = "convert"
+path = "bin/convert.rs"
+required-features = ["cli"]
+
+[dependencies]
+err-derive = "0.3"
+hexdump = "0.1"
+tracing = "0.1.30"
+
+heed = { version = "0.11", default-features = false, features = ["lmdb"], optional = true }
+rusqlite = { version = "0.27", optional = true }
+sled = { version = "0.34", optional = true }
+
+# cli deps
+clap = { version = "3.1.18", optional = true, features = ["derive", "env"] }
+pretty_env_logger = { version = "0.4", optional = true }
+
+[dev-dependencies]
+mktemp = "0.4"
+
+[features]
+bundled-libs = [ "rusqlite/bundled" ]
+cli = ["clap", "pretty_env_logger"]
+lmdb = [ "heed" ]
+sqlite = [ "rusqlite" ]
diff --git a/src/db/bin/convert.rs b/src/db/bin/convert.rs
new file mode 100644
index 00000000..bbde2048
--- /dev/null
+++ b/src/db/bin/convert.rs
@@ -0,0 +1,69 @@
+use std::path::PathBuf;
+
+use garage_db::*;
+
+use clap::Parser;
+
+/// Garage DB conversion tool: copies a database from one storage engine to another
+#[derive(Parser, Debug)]
+#[clap(author, version, about, long_about = None)]
+struct Args {
+	/// Input DB path
+	#[clap(short = 'i')]
+	input_path: PathBuf,
+	/// Input DB engine
+	#[clap(short = 'a')]
+	input_engine: String,
+
+	/// Output DB path
+	#[clap(short = 'o')]
+	output_path: PathBuf,
+	/// Output DB engine
+	#[clap(short = 'b')]
+	output_engine: String,
+}
+/// Runs the conversion; on failure the error goes to stderr and the process exits with status 1
+fn main() {
+	let args = Args::parse();
+	pretty_env_logger::init();
+	if let Err(e) = do_conversion(args) {
+		eprintln!("Error: {}", e);
+		std::process::exit(1);
+	}
+	println!("Success!");
+}
+
+/// Opens the input and output databases, then imports every tree of the input into the output
+fn do_conversion(args: Args) -> Result<()> {
+	let input = open_db(args.input_path, args.input_engine)?;
+	let output = open_db(args.output_path, args.output_engine)?;
+	output.import(&input)
+}
+
+/// Opens the database at `path` with the engine named `engine` (sled, sqlite or lmdb)
+fn open_db(path: PathBuf, engine: String) -> Result<Db> {
+	match engine.as_str() {
+		"sled" => {
+			let db = sled_adapter::sled::Config::default().path(&path).open()?;
+			Ok(sled_adapter::SledDb::init(db))
+		}
+		"sqlite" | "sqlite3" | "rusqlite" => {
+			let db = sqlite_adapter::rusqlite::Connection::open(&path)?;
+			Ok(sqlite_adapter::SqliteDb::init(db))
+		}
+		"lmdb" | "heed" => {
+			std::fs::create_dir_all(&path).map_err(|e| {
+				Error(format!("Unable to create LMDB data directory: {}", e).into())
+			})?;
+
+			let map_size = lmdb_adapter::recommended_map_size();
+			// Propagate open failures like the other engines do, instead of panicking
+			let db = lmdb_adapter::heed::EnvOpenOptions::new()
+				.max_dbs(100)
+				.map_size(map_size)
+				.open(&path)?;
+			Ok(lmdb_adapter::LmdbDb::init(db))
+		}
+		e => Err(Error(format!("Invalid DB engine: {}", e).into())),
+	}
+}
diff --git a/src/db/counted_tree_hack.rs b/src/db/counted_tree_hack.rs
new file mode 100644
index 00000000..bbe943a2
--- /dev/null
+++ b/src/db/counted_tree_hack.rs
@@ -0,0 +1,127 @@
+//! This hack allows a db tree to keep in RAM a counter of the number of entries
+//! it contains, which is used to call .len() on it.  This is useful only for
+//! the sled backend where .len() otherwise would have to traverse the whole
+//! tree to count items.  For sqlite and lmdb, this is mostly useless (but
+//! hopefully not harmful!). Note that a CountedTree cannot be part of a
+//! transaction.
+
+use std::sync::{
+	atomic::{AtomicUsize, Ordering},
+	Arc,
+};
+
+use crate::{Result, Tree, TxError, Value, ValueIter};
+
+#[derive(Clone)]
+pub struct CountedTree(Arc<CountedTreeInternal>);
+
+struct CountedTreeInternal {
+	tree: Tree,
+	len: AtomicUsize,
+}
+
+impl CountedTree {
+	pub fn new(tree: Tree) -> Result<Self> {
+		let len = tree.len()?;
+		Ok(Self(Arc::new(CountedTreeInternal {
+			tree,
+			len: AtomicUsize::new(len),
+		})))
+	}
+	/// Number of entries, as tracked by the in-memory counter (no tree traversal)
+	pub fn len(&self) -> usize {
+		self.0.len.load(Ordering::SeqCst)
+	}
+	/// True when the in-memory counter is zero
+	pub fn is_empty(&self) -> bool {
+		self.len() == 0
+	}
+	/// Reads a value straight from the underlying tree
+	pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<Value>> {
+		self.0.tree.get(key)
+	}
+	/// First key/value pair of the underlying tree, if any
+	pub fn first(&self) -> Result<Option<(Value, Value)>> {
+		self.0.tree.first()
+	}
+	/// Iterates over the key/value pairs of the underlying tree
+	pub fn iter(&self) -> Result<ValueIter<'_>> {
+		self.0.tree.iter()
+	}
+
+	// ---- writing functions ----
+	/// Inserts a value; the counter is incremented only if the key had no previous value
+	pub fn insert<K, V>(&self, key: K, value: V) -> Result<Option<Value>>
+	where
+		K: AsRef<[u8]>,
+		V: AsRef<[u8]>,
+	{
+		let old_val = self.0.tree.insert(key, value)?;
+		if old_val.is_none() {
+			self.0.len.fetch_add(1, Ordering::SeqCst);
+		}
+		Ok(old_val)
+	}
+	/// Removes a key; the counter is decremented only if a value was actually removed
+	pub fn remove<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<Value>> {
+		let old_val = self.0.tree.remove(key)?;
+		if old_val.is_some() {
+			self.0.len.fetch_sub(1, Ordering::SeqCst);
+		}
+		Ok(old_val)
+	}
+	/// Sets `key` to `new` only if its current value is `expected_old` (None = absent); Ok(true) if swapped
+	pub fn compare_and_swap<K, OV, NV>(
+		&self,
+		key: K,
+		expected_old: Option<OV>,
+		new: Option<NV>,
+	) -> Result<bool>
+	where
+		K: AsRef<[u8]>,
+		OV: AsRef<[u8]>,
+		NV: AsRef<[u8]>,
+	{
+		let old_some = expected_old.is_some();
+		let new_some = new.is_some();
+		// The comparison and the write run inside a single transaction on the tree's database
+		let tx_res = self.0.tree.db().transaction(|mut tx| {
+			let old_val = tx.get(&self.0.tree, &key)?;
+			let is_same = match (&old_val, &expected_old) {
+				(None, None) => true,
+				(Some(x), Some(y)) if x == y.as_ref() => true,
+				_ => false,
+			};
+			if is_same {
+				match &new {
+					Some(v) => {
+						tx.insert(&self.0.tree, &key, v)?;
+					}
+					None => {
+						tx.remove(&self.0.tree, &key)?;
+					}
+				}
+				tx.commit(())
+			} else {
+				tx.abort(())
+			}
+		});
+		// On commit the old value matched expected_old, so the counter follows (old_some, new_some)
+		match tx_res {
+			Ok(()) => {
+				match (old_some, new_some) {
+					(false, true) => {
+						self.0.len.fetch_add(1, Ordering::SeqCst);
+					}
+					(true, false) => {
+						self.0.len.fetch_sub(1, Ordering::SeqCst);
+					}
+					_ => (),
+				}
+				Ok(true)
+			}
+			Err(TxError::Abort(())) => Ok(false),
+			Err(TxError::Db(e)) => Err(e),
+		}
+	}
+}
diff --git a/src/db/lib.rs b/src/db/lib.rs
new file mode 100644
index 00000000..d96586be
--- /dev/null
+++ b/src/db/lib.rs
@@ -0,0 +1,416 @@
+#[macro_use]
+#[cfg(feature = "sqlite")]
+extern crate tracing;
+
+#[cfg(not(any(feature = "lmdb", feature = "sled", feature = "sqlite")))]
+compile_error!("Must activate the Cargo feature for at least one DB engine: lmdb, sled or sqlite.");
+
+#[cfg(feature = "lmdb")]
+pub mod lmdb_adapter;
+#[cfg(feature = "sled")]
+pub mod sled_adapter;
+#[cfg(feature = "sqlite")]
+pub mod sqlite_adapter;
+
+pub mod counted_tree_hack;
+
+#[cfg(test)]
+pub mod test;
+
+use core::ops::{Bound, RangeBounds};
+
+use std::borrow::Cow;
+use std::cell::Cell;
+use std::sync::Arc;
+
+use err_derive::Error;
+
+#[derive(Clone)]
+pub struct Db(pub(crate) Arc<dyn IDb>);
+
+pub struct Transaction<'a>(&'a mut dyn ITx);
+
+#[derive(Clone)]
+pub struct Tree(Arc<dyn IDb>, usize);
+
+pub type Value = Vec<u8>;
+pub type ValueIter<'a> = Box<dyn std::iter::Iterator<Item = Result<(Value, Value)>> + 'a>;
+pub type TxValueIter<'a> = Box<dyn std::iter::Iterator<Item = TxOpResult<(Value, Value)>> + 'a>;
+
+// ----
+
+#[derive(Debug, Error)]
+#[error(display = "{}", _0)]
+pub struct Error(pub Cow<'static, str>);
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+#[derive(Debug, Error)]
+#[error(display = "{}", _0)]
+pub struct TxOpError(pub(crate) Error);
+pub type TxOpResult<T> = std::result::Result<T, TxOpError>;
+
+pub enum TxError<E> {
+	Abort(E),
+	Db(Error),
+}
+pub type TxResult<R, E> = std::result::Result<R, TxError<E>>;
+
+impl<E> From<TxOpError> for TxError<E> {
+	fn from(e: TxOpError) -> TxError<E> {
+		TxError::Db(e.0)
+	}
+}
+/// Moves an abort into the success value as `Ok(Err(e))`, so only DB errors remain in the outer error
+pub fn unabort<R, E>(res: TxResult<R, E>) -> TxOpResult<std::result::Result<R, E>> {
+	match res {
+		Ok(v) => Ok(Ok(v)),
+		Err(TxError::Abort(e)) => Ok(Err(e)),
+		Err(TxError::Db(e)) => Err(TxOpError(e)),
+	}
+}
+
+// ----
+
+impl Db {
+	pub fn engine(&self) -> String {
+		self.0.engine()
+	}
+
+	pub fn open_tree<S: AsRef<str>>(&self, name: S) -> Result<Tree> {
+		let tree_id = self.0.open_tree(name.as_ref())?;
+		Ok(Tree(self.0.clone(), tree_id))
+	}
+
+	pub fn list_trees(&self) -> Result<Vec<String>> {
+		self.0.list_trees()
+	}
+
+	pub fn transaction<R, E, F>(&self, fun: F) -> TxResult<R, E>
+	where
+		F: Fn(Transaction<'_>) -> TxResult<R, E>,
+	{
+		let f = TxFn {
+			function: fun,
+			result: Cell::new(None),
+		};
+		let tx_res = self.0.transaction(&f);
+		let ret = f
+			.result
+			.into_inner()
+			.expect("Transaction did not store result");
+
+		match tx_res {
+			Ok(()) => {
+				assert!(matches!(ret, Ok(_)));
+				ret
+			}
+			Err(TxError::Abort(())) => {
+				assert!(matches!(ret, Err(TxError::Abort(_))));
+				ret
+			}
+			Err(TxError::Db(e2)) => match ret {
+				// Ok was stored -> the error occurred when finalizing
+				// the transaction
+				Ok(_) => Err(TxError::Db(e2)),
+				// An error was already stored: that's the one we want to
+				// return
+				Err(TxError::Db(e)) => Err(TxError::Db(e)),
+				_ => unreachable!(),
+			},
+		}
+	}
+
+	pub fn import(&self, other: &Db) -> Result<()> {
+		let existing_trees = self.list_trees()?;
+		if !existing_trees.is_empty() {
+			return Err(Error(
+				format!(
+					"destination database already contains data: {:?}",
+					existing_trees
+				)
+				.into(),
+			));
+		}
+
+		let tree_names = other.list_trees()?;
+		for name in tree_names {
+			let tree = self.open_tree(&name)?;
+			if tree.len()? > 0 {
+				return Err(Error(format!("tree {} already contains data", name).into()));
+			}
+
+			let ex_tree = other.open_tree(&name)?;
+
+			let tx_res = self.transaction(|mut tx| {
+				let mut i = 0;
+				for item in ex_tree.iter().map_err(TxError::Abort)? {
+					let (k, v) = item.map_err(TxError::Abort)?;
+					tx.insert(&tree, k, v)?;
+					i += 1;
+					if i % 1000 == 0 {
+						println!("{}: imported {}", name, i);
+					}
+				}
+				tx.commit(i)
+			});
+			let total = match tx_res {
+				Err(TxError::Db(e)) => return Err(e),
+				Err(TxError::Abort(e)) => return Err(e),
+				Ok(x) => x,
+			};
+
+			println!("{}: finished importing, {} items", name, total);
+		}
+		Ok(())
+	}
+}
+
+#[allow(clippy::len_without_is_empty)]
+impl Tree {
+	#[inline]
+	pub fn db(&self) -> Db {
+		Db(self.0.clone())
+	}
+
+	#[inline]
+	pub fn get<T: AsRef<[u8]>>(&self, key: T) -> Result<Option<Value>> {
+		self.0.get(self.1, key.as_ref())
+	}
+	#[inline]
+	pub fn len(&self) -> Result<usize> {
+		self.0.len(self.1)
+	}
+
+	#[inline]
+	pub fn first(&self) -> Result<Option<(Value, Value)>> {
+		self.iter()?.next().transpose()
+	}
+	#[inline]
+	pub fn get_gt<T: AsRef<[u8]>>(&self, from: T) -> Result<Option<(Value, Value)>> {
+		self.range((Bound::Excluded(from), Bound::Unbounded))?
+			.next()
+			.transpose()
+	}
+
+	/// Returns the old value if there was one
+	#[inline]
+	pub fn insert<T: AsRef<[u8]>, U: AsRef<[u8]>>(
+		&self,
+		key: T,
+		value: U,
+	) -> Result<Option<Value>> {
+		self.0.insert(self.1, key.as_ref(), value.as_ref())
+	}
+	/// Returns the old value if there was one
+	#[inline]
+	pub fn remove<T: AsRef<[u8]>>(&self, key: T) -> Result<Option<Value>> {
+		self.0.remove(self.1, key.as_ref())
+	}
+	/// Clears all values from the tree
+	#[inline]
+	pub fn clear(&self) -> Result<()> {
+		self.0.clear(self.1)
+	}
+
+	#[inline]
+	pub fn iter(&self) -> Result<ValueIter<'_>> {
+		self.0.iter(self.1)
+	}
+	#[inline]
+	pub fn iter_rev(&self) -> Result<ValueIter<'_>> {
+		self.0.iter_rev(self.1)
+	}
+
+	#[inline]
+	pub fn range<K, R>(&self, range: R) -> Result<ValueIter<'_>>
+	where
+		K: AsRef<[u8]>,
+		R: RangeBounds<K>,
+	{
+		let sb = range.start_bound();
+		let eb = range.end_bound();
+		self.0.range(self.1, get_bound(sb), get_bound(eb))
+	}
+	#[inline]
+	pub fn range_rev<K, R>(&self, range: R) -> Result<ValueIter<'_>>
+	where
+		K: AsRef<[u8]>,
+		R: RangeBounds<K>,
+	{
+		let sb = range.start_bound();
+		let eb = range.end_bound();
+		self.0.range_rev(self.1, get_bound(sb), get_bound(eb))
+	}
+}
+
+#[allow(clippy::len_without_is_empty)]
+impl<'a> Transaction<'a> {
+	#[inline]
+	pub fn get<T: AsRef<[u8]>>(&self, tree: &Tree, key: T) -> TxOpResult<Option<Value>> {
+		self.0.get(tree.1, key.as_ref())
+	}
+	#[inline]
+	pub fn len(&self, tree: &Tree) -> TxOpResult<usize> {
+		self.0.len(tree.1)
+	}
+
+	/// Returns the old value if there was one
+	#[inline]
+	pub fn insert<T: AsRef<[u8]>, U: AsRef<[u8]>>(
+		&mut self,
+		tree: &Tree,
+		key: T,
+		value: U,
+	) -> TxOpResult<Option<Value>> {
+		self.0.insert(tree.1, key.as_ref(), value.as_ref())
+	}
+	/// Returns the old value if there was one
+	#[inline]
+	pub fn remove<T: AsRef<[u8]>>(&mut self, tree: &Tree, key: T) -> TxOpResult<Option<Value>> {
+		self.0.remove(tree.1, key.as_ref())
+	}
+
+	#[inline]
+	pub fn iter(&self, tree: &Tree) -> TxOpResult<TxValueIter<'_>> {
+		self.0.iter(tree.1)
+	}
+	#[inline]
+	pub fn iter_rev(&self, tree: &Tree) -> TxOpResult<TxValueIter<'_>> {
+		self.0.iter_rev(tree.1)
+	}
+
+	#[inline]
+	pub fn range<K, R>(&self, tree: &Tree, range: R) -> TxOpResult<TxValueIter<'_>>
+	where
+		K: AsRef<[u8]>,
+		R: RangeBounds<K>,
+	{
+		let sb = range.start_bound();
+		let eb = range.end_bound();
+		self.0.range(tree.1, get_bound(sb), get_bound(eb))
+	}
+	#[inline]
+	pub fn range_rev<K, R>(&self, tree: &Tree, range: R) -> TxOpResult<TxValueIter<'_>>
+	where
+		K: AsRef<[u8]>,
+		R: RangeBounds<K>,
+	{
+		let sb = range.start_bound();
+		let eb = range.end_bound();
+		self.0.range_rev(tree.1, get_bound(sb), get_bound(eb))
+	}
+
+	// ----
+
+	#[inline]
+	pub fn abort<R, E>(self, e: E) -> TxResult<R, E> {
+		Err(TxError::Abort(e))
+	}
+
+	#[inline]
+	pub fn commit<R, E>(self, r: R) -> TxResult<R, E> {
+		Ok(r)
+	}
+}
+
+// ---- Internal interfaces
+
+pub(crate) trait IDb: Send + Sync {
+	fn engine(&self) -> String;
+	fn open_tree(&self, name: &str) -> Result<usize>;
+	fn list_trees(&self) -> Result<Vec<String>>;
+
+	fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>>;
+	fn len(&self, tree: usize) -> Result<usize>;
+
+	fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>>;
+	fn remove(&self, tree: usize, key: &[u8]) -> Result<Option<Value>>;
+	fn clear(&self, tree: usize) -> Result<()>;
+
+	fn iter(&self, tree: usize) -> Result<ValueIter<'_>>;
+	fn iter_rev(&self, tree: usize) -> Result<ValueIter<'_>>;
+
+	fn range<'r>(
+		&self,
+		tree: usize,
+		low: Bound<&'r [u8]>,
+		high: Bound<&'r [u8]>,
+	) -> Result<ValueIter<'_>>;
+	fn range_rev<'r>(
+		&self,
+		tree: usize,
+		low: Bound<&'r [u8]>,
+		high: Bound<&'r [u8]>,
+	) -> Result<ValueIter<'_>>;
+
+	fn transaction(&self, f: &dyn ITxFn) -> TxResult<(), ()>;
+}
+
+pub(crate) trait ITx {
+	fn get(&self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>>;
+	fn len(&self, tree: usize) -> TxOpResult<usize>;
+
+	fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>>;
+	fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>>;
+
+	fn iter(&self, tree: usize) -> TxOpResult<TxValueIter<'_>>;
+	fn iter_rev(&self, tree: usize) -> TxOpResult<TxValueIter<'_>>;
+
+	fn range<'r>(
+		&self,
+		tree: usize,
+		low: Bound<&'r [u8]>,
+		high: Bound<&'r [u8]>,
+	) -> TxOpResult<TxValueIter<'_>>;
+	fn range_rev<'r>(
+		&self,
+		tree: usize,
+		low: Bound<&'r [u8]>,
+		high: Bound<&'r [u8]>,
+	) -> TxOpResult<TxValueIter<'_>>;
+}
+
+pub(crate) trait ITxFn {
+	fn try_on(&self, tx: &mut dyn ITx) -> TxFnResult;
+}
+
+pub(crate) enum TxFnResult {
+	Ok,
+	Abort,
+	DbErr,
+}
+
+struct TxFn<F, R, E>
+where
+	F: Fn(Transaction<'_>) -> TxResult<R, E>,
+{
+	function: F,
+	result: Cell<Option<TxResult<R, E>>>,
+}
+
+impl<F, R, E> ITxFn for TxFn<F, R, E>
+where
+	F: Fn(Transaction<'_>) -> TxResult<R, E>,
+{
+	fn try_on(&self, tx: &mut dyn ITx) -> TxFnResult {
+		let res = (self.function)(Transaction(tx));
+		let res2 = match &res {
+			Ok(_) => TxFnResult::Ok,
+			Err(TxError::Abort(_)) => TxFnResult::Abort,
+			Err(TxError::Db(_)) => TxFnResult::DbErr,
+		};
+		self.result.set(Some(res));
+		res2
+	}
+}
+
+// ----
+/// Maps a bound on any `AsRef<[u8]>` key to the same kind of bound on its byte slice
+fn get_bound<K: AsRef<[u8]>>(b: Bound<&K>) -> Bound<&[u8]> {
+	match b {
+		Bound::Included(v) => Bound::Included(v.as_ref()),
+		Bound::Excluded(v) => Bound::Excluded(v.as_ref()),
+		Bound::Unbounded => Bound::Unbounded,
+	}
+}
diff --git a/src/db/lmdb_adapter.rs b/src/db/lmdb_adapter.rs
new file mode 100644
index 00000000..c036c990
--- /dev/null
+++ b/src/db/lmdb_adapter.rs
@@ -0,0 +1,350 @@
+use core::ops::Bound;
+use core::ptr::NonNull;
+
+use std::collections::HashMap;
+use std::convert::TryInto;
+use std::sync::{Arc, RwLock};
+
+use heed::types::ByteSlice;
+use heed::{BytesDecode, Env, RoTxn, RwTxn, UntypedDatabase as Database};
+
+use crate::{
+	Db, Error, IDb, ITx, ITxFn, Result, TxError, TxFnResult, TxOpError, TxOpResult, TxResult,
+	TxValueIter, Value, ValueIter,
+};
+
+pub use heed;
+
+// -- err
+
+impl From<heed::Error> for Error {
+	fn from(e: heed::Error) -> Error {
+		Error(format!("LMDB: {}", e).into())
+	}
+}
+
+impl From<heed::Error> for TxOpError {
+	fn from(e: heed::Error) -> TxOpError {
+		TxOpError(e.into())
+	}
+}
+
+// -- db
+
+/// LMDB-backed database, built on the heed crate.
+pub struct LmdbDb {
+	// The heed environment all trees live in.
+	db: heed::Env,
+	// Opened trees: `.0` maps a tree id (index) to its database handle,
+	// `.1` maps a tree name to its id.
+	trees: RwLock<(Vec<Database>, HashMap<String, usize>)>,
+}
+
+impl LmdbDb {
+	/// Wraps an already opened heed environment into a generic `Db` handle,
+	/// starting with no opened tree.
+	pub fn init(db: Env) -> Db {
+		let trees = RwLock::new((Vec::new(), HashMap::new()));
+		Db(Arc::new(Self { db, trees }))
+	}
+
+	/// Returns the database handle registered under tree id `i`,
+	/// or an "invalid tree id" error when no such tree was opened.
+	fn get_tree(&self, i: usize) -> Result<Database> {
+		let registry = self.trees.read().unwrap();
+		let found = registry.0.get(i).cloned();
+		found.ok_or_else(|| Error("invalid tree id".into()))
+	}
+}
+
+impl IDb for LmdbDb {
+	/// Human-readable name of the storage engine.
+	fn engine(&self) -> String {
+		String::from("LMDB (using Heed crate)")
+	}
+
+	/// Returns the id of the tree called `name`, creating the underlying
+	/// LMDB database the first time the name is seen.
+	fn open_tree(&self, name: &str) -> Result<usize> {
+		let mut trees = self.trees.write().unwrap();
+		if let Some(&id) = trees.1.get(name) {
+			return Ok(id);
+		}
+
+		let tree = self.db.create_database(Some(name))?;
+		let id = trees.0.len();
+		trees.0.push(tree);
+		trees.1.insert(name.to_string(), id);
+		Ok(id)
+	}
+
+	/// Lists the names found in the unnamed database that can actually be
+	/// opened as databases.
+	fn list_trees(&self) -> Result<Vec<String>> {
+		let unnamed = match self.db.open_database::<heed::types::Str, ByteSlice>(None)? {
+			Some(x) => x,
+			None => return Ok(vec![]),
+		};
+
+		// First collect the candidate names; the read transaction ends with this scope.
+		let candidates = {
+			let tx = self.db.read_txn()?;
+			let mut names = vec![];
+			for entry in unnamed.iter(&tx)? {
+				let (name, _) = entry?;
+				names.push(name.to_string());
+			}
+			names
+		};
+
+		// Then keep only the names that open as a database.
+		let mut trees = vec![];
+		for name in candidates {
+			let opened = self
+				.db
+				.open_database::<ByteSlice, ByteSlice>(Some(&name))?;
+			if opened.is_some() {
+				trees.push(name);
+			}
+		}
+
+		Ok(trees)
+	}
+
+	// ----
+
+	/// Reads the value stored under `key`, in a read transaction of its own.
+	fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
+		let db = self.get_tree(tree)?;
+
+		let tx = self.db.read_txn()?;
+		let found = db.get(&tx, key)?.map(Vec::from);
+		Ok(found)
+	}
+
+	/// Number of entries in the tree.
+	fn len(&self, tree: usize) -> Result<usize> {
+		let db = self.get_tree(tree)?;
+		let tx = self.db.read_txn()?;
+		let count = db.len(&tx)?;
+		Ok(count.try_into().unwrap())
+	}
+
+	/// Stores `value` under `key` and returns the value it replaces, if any.
+	fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>> {
+		let db = self.get_tree(tree)?;
+		let mut tx = self.db.write_txn()?;
+		let previous = db.get(&tx, key)?.map(Vec::from);
+		db.put(&mut tx, key, value)?;
+		tx.commit()?;
+		Ok(previous)
+	}
+
+	/// Deletes `key` and returns the value that was stored there, if any.
+	fn remove(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
+		let db = self.get_tree(tree)?;
+		let mut tx = self.db.write_txn()?;
+		let previous = db.get(&tx, key)?.map(Vec::from);
+		db.delete(&mut tx, key)?;
+		tx.commit()?;
+		Ok(previous)
+	}
+
+	/// Removes every entry of the tree.
+	fn clear(&self, tree: usize) -> Result<()> {
+		let db = self.get_tree(tree)?;
+		let mut tx = self.db.write_txn()?;
+		db.clear(&mut tx)?;
+		tx.commit()?;
+		Ok(())
+	}
+
+	/// Iterates over the whole tree; the iterator owns its read transaction.
+	fn iter(&self, tree: usize) -> Result<ValueIter<'_>> {
+		let db = self.get_tree(tree)?;
+		let tx = self.db.read_txn()?;
+		TxAndIterator::make(tx, |txn| db.iter(txn).map_err(Error::from))
+	}
+
+	/// Same as `iter`, in reverse order.
+	fn iter_rev(&self, tree: usize) -> Result<ValueIter<'_>> {
+		let db = self.get_tree(tree)?;
+		let tx = self.db.read_txn()?;
+		TxAndIterator::make(tx, |txn| db.rev_iter(txn).map_err(Error::from))
+	}
+
+	/// Iterates over the keys lying between `low` and `high`.
+	fn range<'r>(
+		&self,
+		tree: usize,
+		low: Bound<&'r [u8]>,
+		high: Bound<&'r [u8]>,
+	) -> Result<ValueIter<'_>> {
+		let db = self.get_tree(tree)?;
+		let tx = self.db.read_txn()?;
+		TxAndIterator::make(tx, |txn| db.range(txn, &(low, high)).map_err(Error::from))
+	}
+	/// Same as `range`, in reverse order.
+	fn range_rev<'r>(
+		&self,
+		tree: usize,
+		low: Bound<&'r [u8]>,
+		high: Bound<&'r [u8]>,
+	) -> Result<ValueIter<'_>> {
+		let db = self.get_tree(tree)?;
+		let tx = self.db.read_txn()?;
+		TxAndIterator::make(tx, |txn| {
+			db.rev_range(txn, &(low, high)).map_err(Error::from)
+		})
+	}
+
+	// ----
+
+	/// Runs `f` inside a write transaction: committed when `f` reports success,
+	/// aborted otherwise. Only the trees opened before the call are visible to `f`.
+	fn transaction(&self, f: &dyn ITxFn) -> TxResult<(), ()> {
+		// Maps a heed error to the transaction-level database error.
+		fn db_err(e: heed::Error) -> TxError<()> {
+			TxError::Db(Error::from(e))
+		}
+
+		let registry = self.trees.read().unwrap();
+		let rw_txn = self.db.write_txn().map_err(db_err)?;
+		let mut tx = LmdbTx {
+			trees: &registry.0[..],
+			tx: rw_txn,
+		};
+
+		match f.try_on(&mut tx) {
+			TxFnResult::Ok => {
+				tx.tx.commit().map_err(db_err)?;
+				Ok(())
+			}
+			TxFnResult::Abort => {
+				tx.tx.abort().map_err(db_err)?;
+				Err(TxError::Abort(()))
+			}
+			TxFnResult::DbErr => {
+				tx.tx.abort().map_err(db_err)?;
+				Err(TxError::Db(Error(
+					"(this message will be discarded)".into(),
+				)))
+			}
+		}
+	}
+}
+
+// ----
+
+/// State of a running LMDB write transaction, as handed to transaction bodies.
+struct LmdbTx<'a> {
+	// Trees that were opened when the transaction started, indexed by tree id.
+	trees: &'a [Database],
+	// The underlying heed write transaction.
+	tx: RwTxn<'a, 'a>,
+}
+
+impl<'a> LmdbTx<'a> {
+	/// Looks up the database handle for tree id `i` among the trees known
+	/// to this transaction.
+	fn get_tree(&self, i: usize) -> TxOpResult<&Database> {
+		match self.trees.get(i) {
+			Some(tree) => Ok(tree),
+			None => Err(TxOpError(Error(
+				"invalid tree id (it might have been openned after the transaction started)".into(),
+			))),
+		}
+	}
+}
+
+impl<'a> ITx for LmdbTx<'a> {
+	/// Reads the value stored under `key` as seen by the transaction.
+	fn get(&self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
+		let db = self.get_tree(tree)?;
+		let found = db.get(&self.tx, key)?.map(Vec::from);
+		Ok(found)
+	}
+	/// Not available with this backend: always panics.
+	fn len(&self, _tree: usize) -> TxOpResult<usize> {
+		unimplemented!(".len() in transaction not supported with LMDB backend")
+	}
+
+	/// Stores `value` under `key` and returns the value it replaces, if any.
+	fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>> {
+		let db = *self.get_tree(tree)?;
+		let previous = db.get(&self.tx, key)?.map(Vec::from);
+		db.put(&mut self.tx, key, value)?;
+		Ok(previous)
+	}
+	/// Deletes `key` and returns the value that was stored there, if any.
+	fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
+		let db = *self.get_tree(tree)?;
+		let previous = db.get(&self.tx, key)?.map(Vec::from);
+		db.delete(&mut self.tx, key)?;
+		Ok(previous)
+	}
+
+	/// Not available with this backend: always panics.
+	fn iter(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
+		unimplemented!("Iterators in transactions not supported with LMDB backend");
+	}
+	/// Not available with this backend: always panics.
+	fn iter_rev(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
+		unimplemented!("Iterators in transactions not supported with LMDB backend");
+	}
+
+	/// Not available with this backend: always panics.
+	fn range<'r>(
+		&self,
+		_tree: usize,
+		_low: Bound<&'r [u8]>,
+		_high: Bound<&'r [u8]>,
+	) -> TxOpResult<TxValueIter<'_>> {
+		unimplemented!("Iterators in transactions not supported with LMDB backend");
+	}
+	/// Not available with this backend: always panics.
+	fn range_rev<'r>(
+		&self,
+		_tree: usize,
+		_low: Bound<&'r [u8]>,
+		_high: Bound<&'r [u8]>,
+	) -> TxOpResult<TxValueIter<'_>> {
+		unimplemented!("Iterators in transactions not supported with LMDB backend");
+	}
+}
+
+// ----
+
+/// Item produced by heed iterators over a `ByteSlice`/`ByteSlice` database:
+/// a decoded (key, value) pair, or a heed error.
+type IteratorItem<'a> = heed::Result<(
+	<ByteSlice as BytesDecode<'a>>::DItem,
+	<ByteSlice as BytesDecode<'a>>::DItem,
+)>;
+
+/// Bundles a read transaction with an iterator created from it, so that both
+/// can be returned to the caller as a single value.
+struct TxAndIterator<'a, I>
+where
+	I: Iterator<Item = IteratorItem<'a>> + 'a,
+{
+	// The read transaction the iterator was created from.
+	tx: RoTxn<'a>,
+	// The iterator itself; `None` until `make` has built it, and again once dropped.
+	iter: Option<I>,
+}
+
+impl<'a, I> TxAndIterator<'a, I>
+where
+	I: Iterator<Item = IteratorItem<'a>> + 'a,
+{
+	/// Builds a boxed iterator that owns the read transaction `tx` together with
+	/// the iterator `iterfun` creates from a reference to that transaction.
+	///
+	/// Returns the error of `iterfun` if the iterator cannot be created; the
+	/// transaction is then dropped.
+	fn make<F>(tx: RoTxn<'a>, iterfun: F) -> Result<ValueIter<'a>>
+	where
+		F: FnOnce(&'a RoTxn<'a>) -> Result<I>,
+	{
+		// Put the transaction on the heap *before* borrowing it: the reference given
+		// to `iterfun` must keep pointing at the transaction once the value is
+		// returned, which is not the case if the struct is moved after the borrow.
+		let mut res = Box::new(TxAndIterator { tx, iter: None });
+
+		// SAFETY: `res.tx` lives in a heap allocation whose address does not change
+		// when the box is moved or coerced below. The reference is only handed to the
+		// iterator stored in the same allocation, and the `Drop` impl of
+		// `TxAndIterator` drops that iterator before the transaction field.
+		let tx = unsafe { NonNull::from(&res.tx).as_ref() };
+		res.iter = Some(iterfun(tx)?);
+
+		let boxed: ValueIter<'a> = res;
+		Ok(boxed)
+	}
+}
+
+impl<'a, I> Drop for TxAndIterator<'a, I>
+where
+	I: Iterator<Item = IteratorItem<'a>> + 'a,
+{
+	/// Gets rid of the iterator explicitly, so that it is gone before the
+	/// transaction field is dropped.
+	fn drop(&mut self) {
+		self.iter = None;
+	}
+}
+
+impl<'a, I> Iterator for TxAndIterator<'a, I>
+where
+	I: Iterator<Item = IteratorItem<'a>> + 'a,
+{
+	type Item = Result<(Value, Value)>;
+
+	/// Advances the wrapped heed iterator, copying the key and value into owned
+	/// buffers and converting heed errors into the crate's error type.
+	fn next(&mut self) -> Option<Self::Item> {
+		let item = self.iter.as_mut().unwrap().next()?;
+		Some(
+			item.map(|(k, v)| (k.to_vec(), v.to_vec()))
+				.map_err(Error::from),
+		)
+	}
+}
+
+// ----
+
+/// Recommended LMDB map size on 64-bit targets: 2^40 bytes (1 TiB).
+#[cfg(target_pointer_width = "64")]
+pub fn recommended_map_size() -> usize {
+	const ONE_TIB: usize = 1 << 40;
+	ONE_TIB
+}
+
+/// Recommended LMDB map size on 32-bit targets: 2^30 bytes (1 GiB).
+/// A warning is logged on every call.
+#[cfg(target_pointer_width = "32")]
+pub fn recommended_map_size() -> usize {
+	const ONE_GIB: usize = 1 << 30;
+	warn!("LMDB is not recommended on 32-bit systems, database size will be limited");
+	ONE_GIB
+}
diff --git a/src/db/sled_adapter.rs b/src/db/sled_adapter.rs
new file mode 100644
index 00000000..cf61867d
--- /dev/null
+++ b/src/db/sled_adapter.rs
@@ -0,0 +1,266 @@
+use core::ops::Bound;
+
+use std::cell::Cell;
+use std::collections::HashMap;
+use std::sync::{Arc, RwLock};
+
+use sled::transaction::{
+	ConflictableTransactionError, TransactionError, Transactional, TransactionalTree,
+	UnabortableTransactionError,
+};
+
+use crate::{
+	Db, Error, IDb, ITx, ITxFn, Result, TxError, TxFnResult, TxOpError, TxOpResult, TxResult,
+	TxValueIter, Value, ValueIter,
+};
+
+pub use sled;
+
+// -- err
+
+impl From<sled::Error> for Error {
+	fn from(e: sled::Error) -> Error {
+		Error(format!("Sled: {}", e).into())
+	}
+}
+
+impl From<sled::Error> for TxOpError {
+	fn from(e: sled::Error) -> TxOpError {
+		TxOpError(e.into())
+	}
+}
+
+// -- db
+
+/// Sled-backed database.
+pub struct SledDb {
+	// The Sled database all trees live in.
+	db: sled::Db,
+	// Opened trees: `.0` maps a tree id (index) to its Sled tree,
+	// `.1` maps a tree name to its id.
+	trees: RwLock<(Vec<sled::Tree>, HashMap<String, usize>)>,
+}
+
+impl SledDb {
+	/// Wraps an already opened Sled database into a generic `Db` handle,
+	/// starting with no opened tree.
+	pub fn init(db: sled::Db) -> Db {
+		let trees = RwLock::new((Vec::new(), HashMap::new()));
+		Db(Arc::new(Self { db, trees }))
+	}
+
+	/// Returns a handle on the tree registered under id `i`,
+	/// or an "invalid tree id" error when no such tree was opened.
+	fn get_tree(&self, i: usize) -> Result<sled::Tree> {
+		let registry = self.trees.read().unwrap();
+		let found = registry.0.get(i).cloned();
+		found.ok_or_else(|| Error("invalid tree id".into()))
+	}
+}
+
+impl IDb for SledDb {
+	/// Human-readable name of the storage engine.
+	fn engine(&self) -> String {
+		String::from("Sled")
+	}
+
+	/// Returns the id of the tree called `name`, opening it in Sled the first
+	/// time the name is seen.
+	fn open_tree(&self, name: &str) -> Result<usize> {
+		let mut trees = self.trees.write().unwrap();
+		if let Some(&id) = trees.1.get(name) {
+			return Ok(id);
+		}
+
+		let tree = self.db.open_tree(name)?;
+		let id = trees.0.len();
+		trees.0.push(tree);
+		trees.1.insert(name.to_string(), id);
+		Ok(id)
+	}
+
+	/// Lists the names of all Sled trees except `__sled__default`.
+	/// Fails if a tree name is not valid UTF-8.
+	fn list_trees(&self) -> Result<Vec<String>> {
+		let mut trees = vec![];
+		for raw in self.db.tree_names() {
+			let name = match std::str::from_utf8(&raw) {
+				Ok(s) => s,
+				Err(e) => return Err(Error(e.to_string().into())),
+			};
+			if name == "__sled__default" {
+				continue;
+			}
+			trees.push(name.to_string());
+		}
+		Ok(trees)
+	}
+
+	// ----
+
+	/// Reads the value stored under `key`, if any.
+	fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
+		let found = self.get_tree(tree)?.get(key)?;
+		Ok(found.map(|iv| iv.to_vec()))
+	}
+
+	/// Number of entries in the tree.
+	fn len(&self, tree: usize) -> Result<usize> {
+		let count = self.get_tree(tree)?.len();
+		Ok(count)
+	}
+
+	/// Stores `value` under `key` and returns the value it replaces, if any.
+	fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>> {
+		let previous = self.get_tree(tree)?.insert(key, value)?;
+		Ok(previous.map(|iv| iv.to_vec()))
+	}
+
+	/// Deletes `key` and returns the value that was stored there, if any.
+	fn remove(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
+		let previous = self.get_tree(tree)?.remove(key)?;
+		Ok(previous.map(|iv| iv.to_vec()))
+	}
+
+	/// Removes every entry of the tree.
+	fn clear(&self, tree: usize) -> Result<()> {
+		self.get_tree(tree)?.clear()?;
+		Ok(())
+	}
+
+	/// Iterates over the whole tree, yielding owned (key, value) pairs.
+	fn iter(&self, tree: usize) -> Result<ValueIter<'_>> {
+		let tree = self.get_tree(tree)?;
+		let entries = tree.iter().map(|entry| match entry {
+			Ok((k, v)) => Ok((k.to_vec(), v.to_vec())),
+			Err(e) => Err(Error::from(e)),
+		});
+		Ok(Box::new(entries))
+	}
+
+	/// Same as `iter`, in reverse order.
+	fn iter_rev(&self, tree: usize) -> Result<ValueIter<'_>> {
+		let tree = self.get_tree(tree)?;
+		let entries = tree.iter().rev().map(|entry| match entry {
+			Ok((k, v)) => Ok((k.to_vec(), v.to_vec())),
+			Err(e) => Err(Error::from(e)),
+		});
+		Ok(Box::new(entries))
+	}
+
+	/// Iterates over the keys lying between `low` and `high`.
+	fn range<'r>(
+		&self,
+		tree: usize,
+		low: Bound<&'r [u8]>,
+		high: Bound<&'r [u8]>,
+	) -> Result<ValueIter<'_>> {
+		let tree = self.get_tree(tree)?;
+		let entries = tree
+			.range::<&'r [u8], _>((low, high))
+			.map(|entry| match entry {
+				Ok((k, v)) => Ok((k.to_vec(), v.to_vec())),
+				Err(e) => Err(Error::from(e)),
+			});
+		Ok(Box::new(entries))
+	}
+	/// Same as `range`, in reverse order.
+	fn range_rev<'r>(
+		&self,
+		tree: usize,
+		low: Bound<&'r [u8]>,
+		high: Bound<&'r [u8]>,
+	) -> Result<ValueIter<'_>> {
+		let tree = self.get_tree(tree)?;
+		let entries = tree
+			.range::<&'r [u8], _>((low, high))
+			.rev()
+			.map(|entry| match entry {
+				Ok((k, v)) => Ok((k.to_vec(), v.to_vec())),
+				Err(e) => Err(Error::from(e)),
+			});
+		Ok(Box::new(entries))
+	}
+
+	// ----
+
+	/// Runs `f` as a Sled transaction spanning all the trees opened so far.
+	fn transaction(&self, f: &dyn ITxFn) -> TxResult<(), ()> {
+		let registry = self.trees.read().unwrap();
+		let outcome = registry.0.transaction(|txtrees| {
+			let mut tx = SledTx {
+				trees: txtrees,
+				err: Cell::new(None),
+			};
+			let status = f.try_on(&mut tx);
+			// Error recorded by `SledTx::save_error`, if any operation failed.
+			let saved = tx.err.into_inner();
+			match status {
+				TxFnResult::Ok => {
+					assert!(saved.is_none());
+					Ok(())
+				}
+				TxFnResult::Abort => {
+					assert!(saved.is_none());
+					Err(ConflictableTransactionError::Abort(()))
+				}
+				TxFnResult::DbErr => Err(saved.expect("No DB error").into()),
+			}
+		});
+		outcome.map_err(|e| match e {
+			TransactionError::Abort(()) => TxError::Abort(()),
+			TransactionError::Storage(s) => TxError::Db(s.into()),
+		})
+	}
+}
+
+// ----
+
+/// State of a running Sled transaction, as handed to transaction bodies.
+struct SledTx<'a> {
+	// Transactional views of the trees, indexed by tree id.
+	trees: &'a [TransactionalTree],
+	// Last Sled error hit by an operation; read back by `SledDb::transaction`.
+	err: Cell<Option<UnabortableTransactionError>>,
+}
+
+impl<'a> SledTx<'a> {
+	/// Looks up the transactional tree for id `i` among the trees known
+	/// to this transaction.
+	fn get_tree(&self, i: usize) -> TxOpResult<&TransactionalTree> {
+		match self.trees.get(i) {
+			Some(tree) => Ok(tree),
+			None => Err(TxOpError(Error(
+				"invalid tree id (it might have been openned after the transaction started)".into(),
+			))),
+		}
+	}
+
+	/// Passes successful results through. On failure, the original Sled error is
+	/// stored in `self.err` and an error carrying its message is returned.
+	fn save_error<R>(
+		&self,
+		v: std::result::Result<R, UnabortableTransactionError>,
+	) -> TxOpResult<R> {
+		v.map_err(|e| {
+			let txt = e.to_string();
+			self.err.set(Some(e));
+			TxOpError(Error(txt.into()))
+		})
+	}
+}
+
+impl<'a> ITx for SledTx<'a> {
+	/// Reads the value stored under `key` as seen by the transaction.
+	fn get(&self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
+		let txtree = self.get_tree(tree)?;
+		let found = self.save_error(txtree.get(key))?;
+		Ok(found.map(|iv| iv.to_vec()))
+	}
+	/// Not available with this backend: always panics.
+	fn len(&self, _tree: usize) -> TxOpResult<usize> {
+		unimplemented!(".len() in transaction not supported with Sled backend")
+	}
+
+	/// Stores `value` under `key` and returns the value it replaces, if any.
+	fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>> {
+		let txtree = self.get_tree(tree)?;
+		let previous = self.save_error(txtree.insert(key, value))?;
+		Ok(previous.map(|iv| iv.to_vec()))
+	}
+	/// Deletes `key` and returns the value that was stored there, if any.
+	fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
+		let txtree = self.get_tree(tree)?;
+		let previous = self.save_error(txtree.remove(key))?;
+		Ok(previous.map(|iv| iv.to_vec()))
+	}
+
+	/// Not available with this backend: always panics.
+	fn iter(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
+		unimplemented!("Iterators in transactions not supported with Sled backend");
+	}
+	/// Not available with this backend: always panics.
+	fn iter_rev(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
+		unimplemented!("Iterators in transactions not supported with Sled backend");
+	}
+
+	/// Not available with this backend: always panics.
+	fn range<'r>(
+		&self,
+		_tree: usize,
+		_low: Bound<&'r [u8]>,
+		_high: Bound<&'r [u8]>,
+	) -> TxOpResult<TxValueIter<'_>> {
+		unimplemented!("Iterators in transactions not supported with Sled backend");
+	}
+	/// Not available with this backend: always panics.
+	fn range_rev<'r>(
+		&self,
+		_tree: usize,
+		_low: Bound<&'r [u8]>,
+		_high: Bound<&'r [u8]>,
+	) -> TxOpResult<TxValueIter<'_>> {
+		unimplemented!("Iterators in transactions not supported with Sled backend");
+	}
+}
diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs
new file mode 100644
index 00000000..886fda6e
--- /dev/null
+++ b/src/db/sqlite_adapter.rs
@@ -0,0 +1,508 @@
+use core::ops::Bound;
+
+use std::borrow::BorrowMut;
+use std::marker::PhantomPinned;
+use std::pin::Pin;
+use std::ptr::NonNull;
+use std::sync::{Arc, Mutex, MutexGuard};
+
+use rusqlite::{params, Connection, Rows, Statement, Transaction};
+
+use crate::{
+	Db, Error, IDb, ITx, ITxFn, Result, TxError, TxFnResult, TxOpError, TxOpResult, TxResult,
+	TxValueIter, Value, ValueIter,
+};
+
+pub use rusqlite;
+
+// --- err
+
+impl From<rusqlite::Error> for Error {
+	fn from(e: rusqlite::Error) -> Error {
+		Error(format!("Sqlite: {}", e).into())
+	}
+}
+
+impl From<rusqlite::Error> for TxOpError {
+	fn from(e: rusqlite::Error) -> TxOpError {
+		TxOpError(e.into())
+	}
+}
+
+// -- db
+
+/// Sqlite-backed database; every operation goes through a single mutex.
+pub struct SqliteDb(Mutex<SqliteDbInner>);
+
+/// Mutex-protected state of a `SqliteDb`.
+struct SqliteDbInner {
+	// The rusqlite connection.
+	db: Connection,
+	// Table names of the opened trees, indexed by tree id.
+	trees: Vec<String>,
+}
+
+impl SqliteDb {
+	/// Wraps an already opened Sqlite connection into a generic `Db` handle,
+	/// starting with no opened tree.
+	pub fn init(db: rusqlite::Connection) -> Db {
+		let inner = SqliteDbInner {
+			db,
+			trees: Vec::new(),
+		};
+		Db(Arc::new(Self(Mutex::new(inner))))
+	}
+}
+
+impl SqliteDbInner {
+	/// Returns the table name registered under tree id `i`,
+	/// or an "invalid tree id" error when no such tree was opened.
+	fn get_tree(&self, i: usize) -> Result<&'_ str> {
+		match self.trees.get(i) {
+			Some(name) => Ok(name.as_str()),
+			None => Err(Error("invalid tree id".into())),
+		}
+	}
+
+	/// Fetches the value stored under `key` in table `tree`, if any.
+	fn internal_get(&self, tree: &str, key: &[u8]) -> Result<Option<Value>> {
+		let sql = format!("SELECT v FROM {} WHERE k = ?1", tree);
+		let mut stmt = self.db.prepare(&sql)?;
+		let mut rows = stmt.query([key])?;
+		let value = match rows.next()? {
+			Some(row) => Some(row.get::<_, Vec<u8>>(0)?),
+			None => None,
+		};
+		Ok(value)
+	}
+}
+
+impl IDb for SqliteDb {
+	/// Human-readable name of the storage engine, including the Sqlite version.
+	fn engine(&self) -> String {
+		format!("sqlite3 v{} (using rusqlite crate)", rusqlite::version())
+	}
+
+	/// Returns the id of the tree called `name`, creating its table if needed.
+	///
+	/// The table is named `tree_<name>`, with every `:` replaced by `_COLON_`.
+	fn open_tree(&self, name: &str) -> Result<usize> {
+		let name = format!("tree_{}", name.replace(':', "_COLON_"));
+		let mut this = self.0.lock().unwrap();
+
+		if let Some(i) = this.trees.iter().position(|x| x == &name) {
+			Ok(i)
+		} else {
+			trace!("create table {}", name);
+			this.db.execute(
+				&format!(
+					"CREATE TABLE IF NOT EXISTS {} (
+						k BLOB PRIMARY KEY,
+						v BLOB
+					)",
+					name
+				),
+				[],
+			)?;
+			trace!("table created: {}, unlocking", name);
+
+			let i = this.trees.len();
+			this.trees.push(name);
+			Ok(i)
+		}
+	}
+
+	/// Lists the trees stored in the database, by their original names
+	/// (the reverse of the mangling done in `open_tree`).
+	fn list_trees(&self) -> Result<Vec<String>> {
+		let mut trees = vec![];
+
+		trace!("list_trees: lock db");
+		let this = self.0.lock().unwrap();
+		trace!("list_trees: lock acquired");
+
+		let mut stmt = this.db.prepare(
+			"SELECT name FROM sqlite_schema WHERE type = 'table' AND name LIKE 'tree_%'",
+		)?;
+		let mut rows = stmt.query([])?;
+		while let Some(row) = rows.next()? {
+			let name = row.get::<_, String>(0)?;
+			// `_` is a single-character wildcard in LIKE, so the query may also return
+			// tables that do not literally start with "tree_": skip them instead of
+			// panicking. The prefix is stripped *before* restoring the colons, so that
+			// a table such as "tree_COLON_x" does not lose its prefix to the replacement.
+			if let Some(stripped) = name.strip_prefix("tree_") {
+				trees.push(stripped.replace("_COLON_", ":"));
+			}
+		}
+		Ok(trees)
+	}
+
+	// ----
+
+	/// Reads the value stored under `key`, if any.
+	fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
+		trace!("get {}: lock db", tree);
+		let this = self.0.lock().unwrap();
+		trace!("get {}: lock acquired", tree);
+
+		let tree = this.get_tree(tree)?;
+		this.internal_get(tree, key)
+	}
+
+	/// Number of entries in the tree.
+	fn len(&self, tree: usize) -> Result<usize> {
+		trace!("len {}: lock db", tree);
+		let this = self.0.lock().unwrap();
+		trace!("len {}: lock acquired", tree);
+
+		let tree = this.get_tree(tree)?;
+		let mut stmt = this.db.prepare(&format!("SELECT COUNT(*) FROM {}", tree))?;
+		let mut res_iter = stmt.query([])?;
+		match res_iter.next()? {
+			None => Ok(0),
+			Some(v) => Ok(v.get::<_, usize>(0)?),
+		}
+	}
+
+	/// Stores `value` under `key` and returns the value it replaces, if any.
+	fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<Option<Value>> {
+		trace!("insert {}: lock db", tree);
+		let this = self.0.lock().unwrap();
+		trace!("insert {}: lock acquired", tree);
+
+		let tree = this.get_tree(tree)?;
+		let old_val = this.internal_get(tree, key)?;
+
+		// The key's presence decides between updating and inserting the row.
+		let sql = match &old_val {
+			Some(_) => format!("UPDATE {} SET v = ?2 WHERE k = ?1", tree),
+			None => format!("INSERT INTO {} (k, v) VALUES (?1, ?2)", tree),
+		};
+		let n = this.db.execute(&sql, params![key, value])?;
+		assert_eq!(n, 1);
+
+		Ok(old_val)
+	}
+
+	/// Deletes `key` and returns the value that was stored there, if any.
+	fn remove(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
+		trace!("remove {}: lock db", tree);
+		let this = self.0.lock().unwrap();
+		trace!("remove {}: lock acquired", tree);
+
+		let tree = this.get_tree(tree)?;
+		let old_val = this.internal_get(tree, key)?;
+
+		if old_val.is_some() {
+			let n = this
+				.db
+				.execute(&format!("DELETE FROM {} WHERE k = ?1", tree), params![key])?;
+			assert_eq!(n, 1);
+		}
+
+		Ok(old_val)
+	}
+
+	/// Removes every entry of the tree.
+	fn clear(&self, tree: usize) -> Result<()> {
+		trace!("clear {}: lock db", tree);
+		let this = self.0.lock().unwrap();
+		trace!("clear {}: lock acquired", tree);
+
+		let tree = this.get_tree(tree)?;
+		this.db.execute(&format!("DELETE FROM {}", tree), [])?;
+		Ok(())
+	}
+
+	/// Iterates over the whole tree in ascending key order.
+	/// The mutex guard is handed over to the returned iterator.
+	fn iter(&self, tree: usize) -> Result<ValueIter<'_>> {
+		trace!("iter {}: lock db", tree);
+		let this = self.0.lock().unwrap();
+		trace!("iter {}: lock acquired", tree);
+
+		let tree = this.get_tree(tree)?;
+		let sql = format!("SELECT k, v FROM {} ORDER BY k ASC", tree);
+		DbValueIterator::make(this, &sql, [])
+	}
+
+	/// Same as `iter`, in descending key order.
+	fn iter_rev(&self, tree: usize) -> Result<ValueIter<'_>> {
+		trace!("iter_rev {}: lock db", tree);
+		let this = self.0.lock().unwrap();
+		trace!("iter_rev {}: lock acquired", tree);
+
+		let tree = this.get_tree(tree)?;
+		let sql = format!("SELECT k, v FROM {} ORDER BY k DESC", tree);
+		DbValueIterator::make(this, &sql, [])
+	}
+
+	/// Iterates in ascending key order over the keys lying between `low` and `high`.
+	fn range<'r>(
+		&self,
+		tree: usize,
+		low: Bound<&'r [u8]>,
+		high: Bound<&'r [u8]>,
+	) -> Result<ValueIter<'_>> {
+		trace!("range {}: lock db", tree);
+		let this = self.0.lock().unwrap();
+		trace!("range {}: lock acquired", tree);
+
+		let tree = this.get_tree(tree)?;
+
+		let (bounds_sql, params) = bounds_sql(low, high);
+		let sql = format!("SELECT k, v FROM {} {} ORDER BY k ASC", tree, bounds_sql);
+
+		let params = params
+			.iter()
+			.map(|x| x as &dyn rusqlite::ToSql)
+			.collect::<Vec<_>>();
+
+		DbValueIterator::make::<&[&dyn rusqlite::ToSql]>(this, &sql, params.as_ref())
+	}
+	/// Same as `range`, in descending key order.
+	fn range_rev<'r>(
+		&self,
+		tree: usize,
+		low: Bound<&'r [u8]>,
+		high: Bound<&'r [u8]>,
+	) -> Result<ValueIter<'_>> {
+		trace!("range_rev {}: lock db", tree);
+		let this = self.0.lock().unwrap();
+		trace!("range_rev {}: lock acquired", tree);
+
+		let tree = this.get_tree(tree)?;
+
+		let (bounds_sql, params) = bounds_sql(low, high);
+		let sql = format!("SELECT k, v FROM {} {} ORDER BY k DESC", tree, bounds_sql);
+
+		let params = params
+			.iter()
+			.map(|x| x as &dyn rusqlite::ToSql)
+			.collect::<Vec<_>>();
+
+		DbValueIterator::make::<&[&dyn rusqlite::ToSql]>(this, &sql, params.as_ref())
+	}
+
+	// ----
+
+	/// Runs `f` inside a Sqlite transaction: committed when `f` reports success,
+	/// rolled back otherwise. The database mutex is held for the whole call.
+	fn transaction(&self, f: &dyn ITxFn) -> TxResult<(), ()> {
+		trace!("transaction: lock db");
+		let mut this = self.0.lock().unwrap();
+		trace!("transaction: lock acquired");
+
+		let this_mut_ref: &mut SqliteDbInner = this.borrow_mut();
+
+		let mut tx = SqliteTx {
+			tx: this_mut_ref
+				.db
+				.transaction()
+				.map_err(Error::from)
+				.map_err(TxError::Db)?,
+			trees: &this_mut_ref.trees,
+		};
+		let res = match f.try_on(&mut tx) {
+			TxFnResult::Ok => {
+				tx.tx.commit().map_err(Error::from).map_err(TxError::Db)?;
+				Ok(())
+			}
+			TxFnResult::Abort => {
+				tx.tx.rollback().map_err(Error::from).map_err(TxError::Db)?;
+				Err(TxError::Abort(()))
+			}
+			TxFnResult::DbErr => {
+				tx.tx.rollback().map_err(Error::from).map_err(TxError::Db)?;
+				Err(TxError::Db(Error(
+					"(this message will be discarded)".into(),
+				)))
+			}
+		};
+
+		trace!("transaction done");
+		res
+	}
+}
+
+// ----
+
+/// State of a running Sqlite transaction, as handed to transaction bodies.
+struct SqliteTx<'a> {
+	// The underlying rusqlite transaction.
+	tx: Transaction<'a>,
+	// Table names of the opened trees, indexed by tree id.
+	trees: &'a [String],
+}
+
+impl<'a> SqliteTx<'a> {
+	/// Returns the table name for tree id `i` among the trees known
+	/// to this transaction.
+	fn get_tree(&self, i: usize) -> TxOpResult<&'_ str> {
+		match self.trees.get(i) {
+			Some(name) => Ok(name.as_str()),
+			None => Err(TxOpError(Error(
+				"invalid tree id (it might have been openned after the transaction started)".into(),
+			))),
+		}
+	}
+
+	/// Fetches the value stored under `key` in table `tree`, if any.
+	fn internal_get(&self, tree: &str, key: &[u8]) -> TxOpResult<Option<Value>> {
+		let sql = format!("SELECT v FROM {} WHERE k = ?1", tree);
+		let mut stmt = self.tx.prepare(&sql)?;
+		let mut rows = stmt.query([key])?;
+		let value = match rows.next()? {
+			Some(row) => Some(row.get::<_, Vec<u8>>(0)?),
+			None => None,
+		};
+		Ok(value)
+	}
+}
+
+impl<'a> ITx for SqliteTx<'a> {
+	/// Reads the value stored under `key` as seen by the transaction.
+	fn get(&self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
+		let table = self.get_tree(tree)?;
+		self.internal_get(table, key)
+	}
+	/// Number of entries in the tree as seen by the transaction.
+	fn len(&self, tree: usize) -> TxOpResult<usize> {
+		let table = self.get_tree(tree)?;
+		let sql = format!("SELECT COUNT(*) FROM {}", table);
+		let mut stmt = self.tx.prepare(&sql)?;
+		let mut rows = stmt.query([])?;
+		let count = match rows.next()? {
+			Some(row) => row.get::<_, usize>(0)?,
+			None => 0,
+		};
+		Ok(count)
+	}
+
+	/// Stores `value` under `key` and returns the value it replaces, if any.
+	fn insert(&mut self, tree: usize, key: &[u8], value: &[u8]) -> TxOpResult<Option<Value>> {
+		let table = self.get_tree(tree)?;
+		let previous = self.internal_get(table, key)?;
+
+		// The key's presence decides between updating and inserting the row.
+		let sql = if previous.is_some() {
+			format!("UPDATE {} SET v = ?2 WHERE k = ?1", table)
+		} else {
+			format!("INSERT INTO {} (k, v) VALUES (?1, ?2)", table)
+		};
+		let affected = self.tx.execute(&sql, params![key, value])?;
+		assert_eq!(affected, 1);
+
+		Ok(previous)
+	}
+	/// Deletes `key` and returns the value that was stored there, if any.
+	fn remove(&mut self, tree: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
+		let table = self.get_tree(tree)?;
+		let previous = self.internal_get(table, key)?;
+
+		if previous.is_some() {
+			let sql = format!("DELETE FROM {} WHERE k = ?1", table);
+			let affected = self.tx.execute(&sql, params![key])?;
+			assert_eq!(affected, 1);
+		}
+
+		Ok(previous)
+	}
+
+	/// Not available with this backend: always panics.
+	fn iter(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
+		unimplemented!();
+	}
+	/// Not available with this backend: always panics.
+	fn iter_rev(&self, _tree: usize) -> TxOpResult<TxValueIter<'_>> {
+		unimplemented!();
+	}
+
+	/// Not available with this backend: always panics.
+	fn range<'r>(
+		&self,
+		_tree: usize,
+		_low: Bound<&'r [u8]>,
+		_high: Bound<&'r [u8]>,
+	) -> TxOpResult<TxValueIter<'_>> {
+		unimplemented!();
+	}
+	/// Not available with this backend: always panics.
+	fn range_rev<'r>(
+		&self,
+		_tree: usize,
+		_low: Bound<&'r [u8]>,
+		_high: Bound<&'r [u8]>,
+	) -> TxOpResult<TxValueIter<'_>> {
+		unimplemented!();
+	}
+}
+
+// ----
+
+/// Row iterator bundled with the statement it runs and the database guard that
+/// statement was prepared on.
+struct DbValueIterator<'a> {
+	// Lock on the database, held for as long as the iterator exists.
+	db: MutexGuard<'a, SqliteDbInner>,
+	// Statement prepared on `db`; filled in by `make`.
+	stmt: Option<Statement<'a>>,
+	// Rows produced by `stmt`; filled in by `make`.
+	iter: Option<Rows<'a>>,
+	// Marks the type as `!Unpin`; `make` keeps the value in a `Pin<Box<_>>`.
+	_pin: PhantomPinned,
+}
+
+impl<'a> DbValueIterator<'a> {
+	/// Prepares `sql` on the locked database, runs it with `args` and returns an
+	/// iterator over the resulting rows that keeps the lock guard alive.
+	fn make<P: rusqlite::Params>(
+		db: MutexGuard<'a, SqliteDbInner>,
+		sql: &str,
+		args: P,
+	) -> Result<ValueIter<'a>> {
+		// Start with only the guard; the statement and the rows are added below,
+		// once the value sits in its pinned heap allocation.
+		let res = DbValueIterator {
+			db,
+			stmt: None,
+			iter: None,
+			_pin: PhantomPinned,
+		};
+		let mut boxed = Box::pin(res);
+		trace!("make iterator with sql: {}", sql);
+
+		unsafe {
+			// Prepare the statement through a raw pointer to the guard stored in the box,
+			// so that the statement's lifetime is not tied to a borrow of `boxed`.
+			let db = NonNull::from(&boxed.db);
+			let stmt = db.as_ref().db.prepare(sql)?;
+
+			let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut boxed);
+			Pin::get_unchecked_mut(mut_ref).stmt = Some(stmt);
+
+			// Same trick for the rows, which borrow the statement stored in the box.
+			let mut stmt = NonNull::from(&boxed.stmt);
+			let iter = stmt.as_mut().as_mut().unwrap().query(args)?;
+
+			let mut_ref: Pin<&mut DbValueIterator<'a>> = Pin::as_mut(&mut boxed);
+			Pin::get_unchecked_mut(mut_ref).iter = Some(iter);
+		}
+
+		Ok(Box::new(DbValueIteratorPin(boxed)))
+	}
+}
+
+impl<'a> Drop for DbValueIterator<'a> {
+	/// Drops the rows first, then the statement, so that both are gone before
+	/// the remaining fields (including the lock guard) are dropped.
+	fn drop(&mut self) {
+		trace!("drop iter");
+		self.iter = None;
+		self.stmt = None;
+	}
+}
+
+/// Pinned, boxed `DbValueIterator`; this wrapper is the type that implements `Iterator`.
+struct DbValueIteratorPin<'a>(Pin<Box<DbValueIterator<'a>>>);
+
+impl<'a> Iterator for DbValueIteratorPin<'a> {
+	type Item = Result<(Value, Value)>;
+
+	/// Fetches the next row and decodes its first two columns as the key and the
+	/// value. Yields `None` when the rows are exhausted (or were never set).
+	fn next(&mut self) -> Option<Self::Item> {
+		let fetched = unsafe {
+			let inner: &mut DbValueIterator<'a> = Pin::get_unchecked_mut(self.0.as_mut());
+			inner.iter.as_mut()?.next()
+		};
+		let row = match fetched {
+			Ok(Some(row)) => row,
+			Ok(None) => return None,
+			Err(e) => return Some(Err(e.into())),
+		};
+		// The value column is only read when the key column decoded successfully.
+		let pair = row
+			.get::<_, Vec<u8>>(0)
+			.and_then(|k| row.get::<_, Vec<u8>>(1).map(|v| (k, v)));
+		Some(pair.map_err(Error::from))
+	}
+}
+
+// ----
+
+/// Builds the SQL `WHERE` clause restricting column `k` to the given bounds,
+/// together with the values of its numbered parameters (`?1`, `?2`).
+///
+/// The clause starts with a space and is empty when both bounds are unbounded.
+fn bounds_sql<'r>(low: Bound<&'r [u8]>, high: Bound<&'r [u8]>) -> (String, Vec<Vec<u8>>) {
+	let mut sql = String::new();
+	let mut params: Vec<Vec<u8>> = vec![];
+
+	// Each bound comes with the operators for its inclusive and exclusive forms.
+	let constraints = [(low, ">=", ">"), (high, "<=", "<")];
+	for (bound, inclusive_op, exclusive_op) in constraints.iter() {
+		let (op, limit) = match bound {
+			Bound::Included(b) => (inclusive_op, b),
+			Bound::Excluded(b) => (exclusive_op, b),
+			Bound::Unbounded => continue,
+		};
+		// The first condition opens the clause, the second one extends it.
+		let keyword = if params.is_empty() { "WHERE" } else { "AND" };
+		params.push(limit.to_vec());
+		sql.push_str(&format!(" {} k {} ?{}", keyword, op, params.len()));
+	}
+
+	(sql, params)
+}
diff --git a/src/db/test.rs b/src/db/test.rs
new file mode 100644
index 00000000..cfcee643
--- /dev/null
+++ b/src/db/test.rs
@@ -0,0 +1,106 @@
+use crate::*;
+
+use crate::lmdb_adapter::LmdbDb;
+use crate::sled_adapter::SledDb;
+use crate::sqlite_adapter::SqliteDb;
+
+/// Backend-independent scenario run against every database engine: plain
+/// inserts and reads, a committed and an aborted transaction, then forward,
+/// reverse and range iteration.
+fn test_suite(db: Db) {
+	let tree = db.open_tree("tree").unwrap();
+
+	// Keys sort as ka < kint < kb; kint is never inserted and only serves as a range bound.
+	let ka: &[u8] = &b"test"[..];
+	let kb: &[u8] = &b"zwello"[..];
+	let kint: &[u8] = &b"tz"[..];
+	let va: &[u8] = &b"plop"[..];
+	let vb: &[u8] = &b"plip"[..];
+	let vc: &[u8] = &b"plup"[..];
+
+	// Plain insert followed by a read.
+	assert!(tree.insert(ka, va).unwrap().is_none());
+	assert_eq!(tree.get(ka).unwrap().unwrap(), va);
+
+	// Committed transaction: the write is visible inside and after it.
+	let res = db.transaction::<_, (), _>(|mut tx| {
+		assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), va);
+
+		assert_eq!(tx.insert(&tree, ka, vb).unwrap().unwrap(), va);
+
+		assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), vb);
+
+		tx.commit(12)
+	});
+	assert!(matches!(res, Ok(12)));
+	assert_eq!(tree.get(ka).unwrap().unwrap(), vb);
+
+	// Aborted transaction: the write is visible inside it, but discarded afterwards.
+	let res = db.transaction::<(), _, _>(|mut tx| {
+		assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), vb);
+
+		assert_eq!(tx.insert(&tree, ka, vc).unwrap().unwrap(), vb);
+
+		assert_eq!(tx.get(&tree, ka).unwrap().unwrap(), vc);
+
+		tx.abort(42)
+	});
+	assert!(matches!(res, Err(TxError::Abort(42))));
+	assert_eq!(tree.get(ka).unwrap().unwrap(), vb);
+
+	// Iteration over a single entry.
+	let mut iter = tree.iter().unwrap();
+	let next = iter.next().unwrap().unwrap();
+	assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
+	assert!(iter.next().is_none());
+	drop(iter);
+
+	// Add a second key.
+	assert!(tree.insert(kb, vc).unwrap().is_none());
+	assert_eq!(tree.get(kb).unwrap().unwrap(), vc);
+
+	// Forward iteration yields the keys in ascending order.
+	let mut iter = tree.iter().unwrap();
+	let next = iter.next().unwrap().unwrap();
+	assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
+	let next = iter.next().unwrap().unwrap();
+	assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc));
+	assert!(iter.next().is_none());
+	drop(iter);
+
+	// Range starting at kint: only kb qualifies.
+	let mut iter = tree.range(kint..).unwrap();
+	let next = iter.next().unwrap().unwrap();
+	assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc));
+	assert!(iter.next().is_none());
+	drop(iter);
+
+	// Reverse range ending before kint: only ka qualifies.
+	let mut iter = tree.range_rev(..kint).unwrap();
+	let next = iter.next().unwrap().unwrap();
+	assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
+	assert!(iter.next().is_none());
+	drop(iter);
+
+	// Reverse iteration yields the keys in descending order.
+	let mut iter = tree.iter_rev().unwrap();
+	let next = iter.next().unwrap().unwrap();
+	assert_eq!((next.0.as_ref(), next.1.as_ref()), (kb, vc));
+	let next = iter.next().unwrap().unwrap();
+	assert_eq!((next.0.as_ref(), next.1.as_ref()), (ka, vb));
+	assert!(iter.next().is_none());
+	drop(iter);
+}
+
+/// Runs the shared scenario on an LMDB environment opened in a temporary directory.
+#[test]
+fn test_lmdb_db() {
+	let tmp_dir = mktemp::Temp::new_dir().unwrap();
+
+	let mut options = heed::EnvOpenOptions::new();
+	options.max_dbs(100);
+	let env = options.open(&tmp_dir).unwrap();
+
+	test_suite(LmdbDb::init(env));
+	// The temporary directory must outlive the database.
+	drop(tmp_dir);
+}
+
+/// Runs the shared scenario on a Sled database opened in a temporary directory.
+#[test]
+fn test_sled_db() {
+	let tmp_dir = mktemp::Temp::new_dir().unwrap();
+	let sled_db = sled::open(tmp_dir.to_path_buf()).unwrap();
+	test_suite(SledDb::init(sled_db));
+	// The temporary directory must outlive the database.
+	drop(tmp_dir);
+}
+
+/// Runs the shared scenario on an in-memory Sqlite database.
+#[test]
+fn test_sqlite_db() {
+	let conn = rusqlite::Connection::open_in_memory().unwrap();
+	test_suite(SqliteDb::init(conn));
+}
diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml
index 59f402ff..5ce40ff2 100644
--- a/src/garage/Cargo.toml
+++ b/src/garage/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "garage"
-version = "0.7.0"
+version = "0.8.0"
 authors = ["Alex Auvolat <alex@adnab.me>"]
 edition = "2018"
 license = "AGPL-3.0"
@@ -21,25 +21,25 @@ path = "tests/lib.rs"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-garage_api = { version = "0.7.0", path = "../api" }
-garage_model = { version = "0.7.0", path = "../model" }
-garage_rpc = { version = "0.7.0", path = "../rpc" }
-garage_table = { version = "0.7.0", path = "../table" }
-garage_util = { version = "0.7.0", path = "../util" }
-garage_web = { version = "0.7.0", path = "../web" }
-garage_admin = { version = "0.7.0", path = "../admin" }
+garage_db = { version = "0.8.0", path = "../db" }
+garage_api = { version = "0.8.0", path = "../api" }
+garage_block = { version = "0.8.0", path = "../block" }
+garage_model = { version = "0.8.0", path = "../model" }
+garage_rpc = { version = "0.8.0", path = "../rpc" }
+garage_table = { version = "0.8.0", path = "../table" }
+garage_util = { version = "0.8.0", path = "../util" }
+garage_web = { version = "0.8.0", path = "../web" }
 
 bytes = "1.0"
-git-version = "0.3.4"
+bytesize = "1.1"
+timeago = "0.3"
 hex = "0.4"
 tracing = { version = "0.1.30", features = ["log-always"] }
-pretty_env_logger = "0.4"
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 rand = "0.8"
 async-trait = "0.1.7"
 sodiumoxide = { version = "0.2.5-0", package = "kuska-sodiumoxide" }
 
-sled = "0.34"
-
 rmp-serde = "0.15"
 serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
 serde_bytes = "0.11"
@@ -50,16 +50,48 @@ futures = "0.3"
 futures-util = "0.3"
 tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
 
-#netapp = { version = "0.3.2", git = "https://git.deuxfleurs.fr/lx/netapp" }
-#netapp = { version = "0.4", path = "../../../netapp" }
-netapp = "0.4"
+netapp = "0.5"
+
+opentelemetry = { version = "0.17", features = [ "rt-tokio" ] }
+opentelemetry-prometheus = { version = "0.10", optional = true }
+opentelemetry-otlp = { version = "0.10", optional = true }
+prometheus = { version = "0.13", optional = true }
 
 [dev-dependencies]
 aws-sdk-s3 = "0.8"
 chrono = "0.4"
 http = "0.2"
-hmac = "0.10"
+hmac = "0.12"
 hyper = { version = "0.14", features = ["client", "http1", "runtime"] }
-sha2 = "0.9"
+sha2 = "0.10"
 
 static_init = "1.0"
+assert-json-diff = "2.0"
+serde_json = "1.0"
+base64 = "0.13"
+
+
+[features]
+default = [ "bundled-libs", "metrics", "sled" ]
+
+k2v = [ "garage_util/k2v", "garage_api/k2v" ]
+
+# Database engines, Sled is still our default even though we don't like it
+sled = [ "garage_model/sled" ]
+lmdb = [ "garage_model/lmdb" ]
+sqlite = [ "garage_model/sqlite" ]
+
+# Automatic registration and discovery via Kubernetes API
+kubernetes-discovery = [ "garage_rpc/kubernetes-discovery" ]
+# Prometheus exporter (/metrics endpoint).
+metrics = [ "garage_api/metrics", "opentelemetry-prometheus", "prometheus" ]
+# Exporter for the OpenTelemetry Collector.
+telemetry-otlp = [ "opentelemetry-otlp" ]
+
+# NOTE: bundled-libs and system-libs should be treated as mutually exclusive;
+# exactly one of them should be enabled.
+
+# Use bundled libsqlite instead of linking against system-provided.
+bundled-libs = [ "garage_db/bundled-libs" ]
+# Link against system-provided libsodium and libzstd.
+system-libs = [ "garage_block/system-libs", "garage_rpc/system-libs", "sodiumoxide/use-pkg-config" ]
diff --git a/src/garage/admin.rs b/src/garage/admin.rs
index 0b20bb20..802a8261 100644
--- a/src/garage/admin.rs
+++ b/src/garage/admin.rs
@@ -15,34 +15,45 @@ use garage_table::*;
 
 use garage_rpc::*;
 
+use garage_block::repair::ScrubWorkerCommand;
+
 use garage_model::bucket_alias_table::*;
 use garage_model::bucket_table::*;
 use garage_model::garage::Garage;
 use garage_model::helper::error::{Error, OkOrBadRequest};
 use garage_model::key_table::*;
 use garage_model::migrate::Migrate;
-use garage_model::object_table::ObjectFilter;
 use garage_model::permission::*;
 
 use crate::cli::*;
-use crate::repair::Repair;
+use crate::repair::online::launch_online_repair;
 
 pub const ADMIN_RPC_PATH: &str = "garage/admin_rpc.rs/Rpc";
 
 #[derive(Debug, Serialize, Deserialize)]
+#[allow(clippy::large_enum_variant)]
 pub enum AdminRpc {
 	BucketOperation(BucketOperation),
 	KeyOperation(KeyOperation),
 	LaunchRepair(RepairOpt),
 	Migrate(MigrateOpt),
 	Stats(StatsOpt),
+	Worker(WorkerOpt),
 
 	// Replies
 	Ok(String),
 	BucketList(Vec<Bucket>),
-	BucketInfo(Bucket, HashMap<String, Key>),
+	BucketInfo {
+		bucket: Bucket,
+		relevant_keys: HashMap<String, Key>,
+		counters: HashMap<String, i64>,
+	},
 	KeyList(Vec<(String, String)>),
 	KeyInfo(Key, HashMap<Uuid, Bucket>),
+	WorkerList(
+		HashMap<usize, garage_util::background::WorkerInfo>,
+		WorkerListOpt,
+	),
 }
 
 impl Rpc for AdminRpc {
@@ -73,6 +84,7 @@ impl AdminRpcHandler {
 			BucketOperation::Allow(query) => self.handle_bucket_allow(query).await,
 			BucketOperation::Deny(query) => self.handle_bucket_deny(query).await,
 			BucketOperation::Website(query) => self.handle_bucket_website(query).await,
+			BucketOperation::SetQuotas(query) => self.handle_bucket_set_quotas(query).await,
 		}
 	}
 
@@ -80,8 +92,15 @@ impl AdminRpcHandler {
 		let buckets = self
 			.garage
 			.bucket_table
-			.get_range(&EmptyKey, None, Some(DeletedFilter::NotDeleted), 10000)
+			.get_range(
+				&EmptyKey,
+				None,
+				Some(DeletedFilter::NotDeleted),
+				10000,
+				EnumerationOrder::Forward,
+			)
 			.await?;
+
 		Ok(AdminRpc::BucketList(buckets))
 	}
 
@@ -99,6 +118,15 @@ impl AdminRpcHandler {
 			.get_existing_bucket(bucket_id)
 			.await?;
 
+		let counters = self
+			.garage
+			.object_counter_table
+			.table
+			.get(&bucket_id, &EmptyKey)
+			.await?
+			.map(|x| x.filtered_values(&self.garage.system.ring.borrow()))
+			.unwrap_or_default();
+
 		let mut relevant_keys = HashMap::new();
 		for (k, _) in bucket
 			.state
@@ -134,7 +162,11 @@ impl AdminRpcHandler {
 			}
 		}
 
-		Ok(AdminRpc::BucketInfo(bucket, relevant_keys))
+		Ok(AdminRpc::BucketInfo {
+			bucket,
+			relevant_keys,
+			counters,
+		})
 	}
 
 	#[allow(clippy::ptr_arg)]
@@ -207,12 +239,7 @@ impl AdminRpcHandler {
 		}
 
 		// Check bucket is empty
-		let objects = self
-			.garage
-			.object_table
-			.get_range(&bucket_id, None, Some(ObjectFilter::IsData), 10)
-			.await?;
-		if !objects.is_empty() {
+		if !helper.is_bucket_empty(bucket_id).await? {
 			return Err(Error::BadRequest(format!(
 				"Bucket {} is not empty",
 				query.name
@@ -249,6 +276,7 @@ impl AdminRpcHandler {
 
 	async fn handle_alias_bucket(&self, query: &AliasBucketOpt) -> Result<AdminRpc, Error> {
 		let helper = self.garage.bucket_helper();
+		let key_helper = self.garage.key_helper();
 
 		let bucket_id = helper
 			.resolve_global_bucket_name(&query.existing_bucket)
@@ -256,7 +284,7 @@ impl AdminRpcHandler {
 			.ok_or_bad_request("Bucket not found")?;
 
 		if let Some(key_pattern) = &query.local {
-			let key = helper.get_existing_matching_key(key_pattern).await?;
+			let key = key_helper.get_existing_matching_key(key_pattern).await?;
 
 			helper
 				.set_local_bucket_alias(bucket_id, &key.key_id, &query.new_name)
@@ -278,9 +306,10 @@ impl AdminRpcHandler {
 
 	async fn handle_unalias_bucket(&self, query: &UnaliasBucketOpt) -> Result<AdminRpc, Error> {
 		let helper = self.garage.bucket_helper();
+		let key_helper = self.garage.key_helper();
 
 		if let Some(key_pattern) = &query.local {
-			let key = helper.get_existing_matching_key(key_pattern).await?;
+			let key = key_helper.get_existing_matching_key(key_pattern).await?;
 
 			let bucket_id = key
 				.state
@@ -319,12 +348,15 @@ impl AdminRpcHandler {
 
 	async fn handle_bucket_allow(&self, query: &PermBucketOpt) -> Result<AdminRpc, Error> {
 		let helper = self.garage.bucket_helper();
+		let key_helper = self.garage.key_helper();
 
 		let bucket_id = helper
 			.resolve_global_bucket_name(&query.bucket)
 			.await?
 			.ok_or_bad_request("Bucket not found")?;
-		let key = helper.get_existing_matching_key(&query.key_pattern).await?;
+		let key = key_helper
+			.get_existing_matching_key(&query.key_pattern)
+			.await?;
 
 		let allow_read = query.read || key.allow_read(&bucket_id);
 		let allow_write = query.write || key.allow_write(&bucket_id);
@@ -351,12 +383,15 @@ impl AdminRpcHandler {
 
 	async fn handle_bucket_deny(&self, query: &PermBucketOpt) -> Result<AdminRpc, Error> {
 		let helper = self.garage.bucket_helper();
+		let key_helper = self.garage.key_helper();
 
 		let bucket_id = helper
 			.resolve_global_bucket_name(&query.bucket)
 			.await?
 			.ok_or_bad_request("Bucket not found")?;
-		let key = helper.get_existing_matching_key(&query.key_pattern).await?;
+		let key = key_helper
+			.get_existing_matching_key(&query.key_pattern)
+			.await?;
 
 		let allow_read = !query.read && key.allow_read(&bucket_id);
 		let allow_write = !query.write && key.allow_write(&bucket_id);
@@ -423,6 +458,60 @@ impl AdminRpcHandler {
 		Ok(AdminRpc::Ok(msg))
 	}
 
+	async fn handle_bucket_set_quotas(&self, query: &SetQuotasOpt) -> Result<AdminRpc, Error> { // updates the size / object-count quotas of the bucket named by `query.bucket`
+		let bucket_id = self
+			.garage
+			.bucket_helper()
+			.resolve_global_bucket_name(&query.bucket)
+			.await?
+			.ok_or_bad_request("Bucket not found")?; // an unknown global name is reported as a bad request
+
+		let mut bucket = self
+			.garage
+			.bucket_helper()
+			.get_existing_bucket(bucket_id)
+			.await?;
+		let bucket_state = bucket.state.as_option_mut().unwrap(); // NOTE(review): panics if the state is absent; presumably get_existing_bucket only returns live buckets (confirm)
+
+		if query.max_size.is_none() && query.max_objects.is_none() { // reject a no-op call; note this check runs only after the bucket lookups above
+			return Err(Error::BadRequest(
+				"You must specify either --max-size or --max-objects (or both) for this command to do something.".to_string(),
+			));
+		}
+
+		let mut quotas = bucket_state.quotas.get().clone(); // edit a copy of the current quotas; it is written back below
+
+		match query.max_size.as_ref().map(String::as_ref) {
+			Some("none") => quotas.max_size = None, // the literal "none" clears the size limit
+			Some(v) => {
+				let bs = v
+					.parse::<bytesize::ByteSize>()
+					.ok_or_bad_request(format!("Invalid size specified: {}", v))?;
+				quotas.max_size = Some(bs.as_u64()); // limit is stored as the u64 given by ByteSize::as_u64
+			}
+			_ => (), // --max-size not given: leave the size quota unchanged
+		}
+
+		match query.max_objects.as_ref().map(String::as_ref) {
+			Some("none") => quotas.max_objects = None, // the literal "none" clears the object-count limit
+			Some(v) => {
+				let mo = v
+					.parse::<u64>()
+					.ok_or_bad_request(format!("Invalid number specified: {}", v))?;
+				quotas.max_objects = Some(mo);
+			}
+			_ => (), // --max-objects not given: leave the object quota unchanged
+		}
+
+		bucket_state.quotas.update(quotas); // put the edited quotas back into the bucket state
+		self.garage.bucket_table.insert(&bucket).await?; // write the modified bucket entry to the bucket table
+
+		Ok(AdminRpc::Ok(format!(
+			"Quotas updated for {}",
+			&query.bucket
+		)))
+	}
+
 	async fn handle_key_cmd(&self, cmd: &KeyOperation) -> Result<AdminRpc, Error> {
 		match cmd {
 			KeyOperation::List => self.handle_list_keys().await,
@@ -445,6 +534,7 @@ impl AdminRpcHandler {
 				None,
 				Some(KeyFilter::Deleted(DeletedFilter::NotDeleted)),
 				10000,
+				EnumerationOrder::Forward,
 			)
 			.await?
 			.iter()
@@ -456,7 +546,7 @@ impl AdminRpcHandler {
 	async fn handle_key_info(&self, query: &KeyOpt) -> Result<AdminRpc, Error> {
 		let key = self
 			.garage
-			.bucket_helper()
+			.key_helper()
 			.get_existing_matching_key(&query.key_pattern)
 			.await?;
 		self.key_info_result(key).await
@@ -471,7 +561,7 @@ impl AdminRpcHandler {
 	async fn handle_rename_key(&self, query: &KeyRenameOpt) -> Result<AdminRpc, Error> {
 		let mut key = self
 			.garage
-			.bucket_helper()
+			.key_helper()
 			.get_existing_matching_key(&query.key_pattern)
 			.await?;
 		key.params_mut()
@@ -483,9 +573,11 @@ impl AdminRpcHandler {
 	}
 
 	async fn handle_delete_key(&self, query: &KeyDeleteOpt) -> Result<AdminRpc, Error> {
-		let helper = self.garage.bucket_helper();
+		let key_helper = self.garage.key_helper();
 
-		let mut key = helper.get_existing_matching_key(&query.key_pattern).await?;
+		let mut key = key_helper
+			.get_existing_matching_key(&query.key_pattern)
+			.await?;
 
 		if !query.yes {
 			return Err(Error::BadRequest(
@@ -493,32 +585,7 @@ impl AdminRpcHandler {
 			));
 		}
 
-		let state = key.state.as_option_mut().unwrap();
-
-		// --- done checking, now commit ---
-		// (the step at unset_local_bucket_alias will fail if a bucket
-		// does not have another alias, the deletion will be
-		// interrupted in the middle if that happens)
-
-		// 1. Delete local aliases
-		for (alias, _, to) in state.local_aliases.items().iter() {
-			if let Some(bucket_id) = to {
-				helper
-					.unset_local_bucket_alias(*bucket_id, &key.key_id, alias)
-					.await?;
-			}
-		}
-
-		// 2. Remove permissions on all authorized buckets
-		for (ab_id, _auth) in state.authorized_buckets.items().iter() {
-			helper
-				.set_bucket_key_permissions(*ab_id, &key.key_id, BucketKeyPerm::NO_PERMISSIONS)
-				.await?;
-		}
-
-		// 3. Actually delete key
-		key.state = Deletable::delete();
-		self.garage.key_table.insert(&key).await?;
+		key_helper.delete_key(&mut key).await?;
 
 		Ok(AdminRpc::Ok(format!(
 			"Key {} was deleted successfully.",
@@ -529,7 +596,7 @@ impl AdminRpcHandler {
 	async fn handle_allow_key(&self, query: &KeyPermOpt) -> Result<AdminRpc, Error> {
 		let mut key = self
 			.garage
-			.bucket_helper()
+			.key_helper()
 			.get_existing_matching_key(&query.key_pattern)
 			.await?;
 		if query.create_bucket {
@@ -542,7 +609,7 @@ impl AdminRpcHandler {
 	async fn handle_deny_key(&self, query: &KeyPermOpt) -> Result<AdminRpc, Error> {
 		let mut key = self
 			.garage
-			.bucket_helper()
+			.key_helper()
 			.get_existing_matching_key(&query.key_pattern)
 			.await?;
 		if query.create_bucket {
@@ -616,7 +683,7 @@ impl AdminRpcHandler {
 					.endpoint
 					.call(
 						&node,
-						&AdminRpc::LaunchRepair(opt_to_send.clone()),
+						AdminRpc::LaunchRepair(opt_to_send.clone()),
 						PRIO_NORMAL,
 					)
 					.await;
@@ -633,15 +700,7 @@ impl AdminRpcHandler {
 				)))
 			}
 		} else {
-			let repair = Repair {
-				garage: self.garage.clone(),
-			};
-			self.garage
-				.system
-				.background
-				.spawn_worker("Repair worker".into(), move |must_exit| async move {
-					repair.repair_worker(opt, must_exit).await
-				});
+			launch_online_repair(self.garage.clone(), opt).await;
 			Ok(AdminRpc::Ok(format!(
 				"Repair launched on {:?}",
 				self.garage.system.id
@@ -664,7 +723,7 @@ impl AdminRpcHandler {
 				let node_id = (*node).into();
 				match self
 					.endpoint
-					.call(&node_id, &AdminRpc::Stats(opt), PRIO_NORMAL)
+					.call(&node_id, AdminRpc::Stats(opt), PRIO_NORMAL)
 					.await?
 				{
 					Ok(AdminRpc::Ok(s)) => writeln!(&mut ret, "{}", s).unwrap(),
@@ -674,22 +733,22 @@ impl AdminRpcHandler {
 			}
 			Ok(AdminRpc::Ok(ret))
 		} else {
-			Ok(AdminRpc::Ok(self.gather_stats_local(opt)))
+			Ok(AdminRpc::Ok(self.gather_stats_local(opt)?))
 		}
 	}
 
-	fn gather_stats_local(&self, opt: StatsOpt) -> String {
+	fn gather_stats_local(&self, opt: StatsOpt) -> Result<String, Error> {
 		let mut ret = String::new();
 		writeln!(
 			&mut ret,
-			"\nGarage version: {}",
-			option_env!("GIT_VERSION").unwrap_or(git_version::git_version!(
-				prefix = "git:",
-				cargo_prefix = "cargo:",
-				fallback = "unknown"
-			))
+			"\nGarage version: {} [features: {}]",
+			garage_util::version::garage_version(),
+			garage_util::version::garage_features()
+				.map(|list| list.join(", "))
+				.unwrap_or_else(|| "(unknown)".into()),
 		)
 		.unwrap();
+		writeln!(&mut ret, "\nDatabase engine: {}", self.garage.db.engine()).unwrap();
 
 		// Gather ring statistics
 		let ring = self.garage.system.ring.borrow().clone();
@@ -707,59 +766,108 @@ impl AdminRpcHandler {
 			writeln!(&mut ret, "  {:?} {}", n, c).unwrap();
 		}
 
-		self.gather_table_stats(&mut ret, &self.garage.bucket_table, &opt);
-		self.gather_table_stats(&mut ret, &self.garage.key_table, &opt);
-		self.gather_table_stats(&mut ret, &self.garage.object_table, &opt);
-		self.gather_table_stats(&mut ret, &self.garage.version_table, &opt);
-		self.gather_table_stats(&mut ret, &self.garage.block_ref_table, &opt);
+		self.gather_table_stats(&mut ret, &self.garage.bucket_table, &opt)?;
+		self.gather_table_stats(&mut ret, &self.garage.key_table, &opt)?;
+		self.gather_table_stats(&mut ret, &self.garage.object_table, &opt)?;
+		self.gather_table_stats(&mut ret, &self.garage.version_table, &opt)?;
+		self.gather_table_stats(&mut ret, &self.garage.block_ref_table, &opt)?;
 
 		writeln!(&mut ret, "\nBlock manager stats:").unwrap();
 		if opt.detailed {
 			writeln!(
 				&mut ret,
 				"  number of RC entries (~= number of blocks): {}",
-				self.garage.block_manager.rc_len()
+				self.garage.block_manager.rc_len()?
 			)
 			.unwrap();
 		}
 		writeln!(
 			&mut ret,
 			"  resync queue length: {}",
-			self.garage.block_manager.resync_queue_len()
+			self.garage.block_manager.resync.queue_len()?
 		)
 		.unwrap();
 		writeln!(
 			&mut ret,
 			"  blocks with resync errors: {}",
-			self.garage.block_manager.resync_errors_len()
+			self.garage.block_manager.resync.errors_len()?
 		)
 		.unwrap();
 
-		ret
+		Ok(ret)
 	}
 
-	fn gather_table_stats<F, R>(&self, to: &mut String, t: &Arc<Table<F, R>>, opt: &StatsOpt)
+	fn gather_table_stats<F, R>(
+		&self,
+		to: &mut String,
+		t: &Arc<Table<F, R>>,
+		opt: &StatsOpt,
+	) -> Result<(), Error>
 	where
 		F: TableSchema + 'static,
 		R: TableReplication + 'static,
 	{
 		writeln!(to, "\nTable stats for {}", F::TABLE_NAME).unwrap();
 		if opt.detailed {
-			writeln!(to, "  number of items: {}", t.data.store.len()).unwrap();
+			writeln!(
+				to,
+				"  number of items: {}",
+				t.data.store.len().map_err(GarageError::from)?
+			)
+			.unwrap();
 			writeln!(
 				to,
 				"  Merkle tree size: {}",
-				t.merkle_updater.merkle_tree_len()
+				t.merkle_updater.merkle_tree_len()?
 			)
 			.unwrap();
 		}
 		writeln!(
 			to,
 			"  Merkle updater todo queue length: {}",
-			t.merkle_updater.todo_len()
+			t.merkle_updater.todo_len()?
 		)
 		.unwrap();
-		writeln!(to, "  GC todo queue length: {}", t.data.gc_todo_len()).unwrap();
+		writeln!(to, "  GC todo queue length: {}", t.data.gc_todo_len()?).unwrap();
+
+		Ok(())
+	}
+
+	// ----
+
+	async fn handle_worker_cmd(&self, opt: WorkerOpt) -> Result<AdminRpc, Error> { // dispatches a worker RPC: list the workers or change one worker parameter
+		match opt.cmd {
+			WorkerCmd::List { opt } => {
+				let workers = self.garage.background.get_worker_info();
+				Ok(AdminRpc::WorkerList(workers, opt)) // the list options are sent back together with the worker info
+			}
+			WorkerCmd::Set { opt } => match opt {
+				WorkerSetCmd::ScrubTranquility { tranquility } => {
+					let scrub_command = ScrubWorkerCommand::SetTranquility(tranquility);
+					self.garage
+						.block_manager
+						.send_scrub_command(scrub_command)
+						.await; // unlike the resync setters below, nothing is propagated with `?` here
+					Ok(AdminRpc::Ok("Scrub tranquility updated".into()))
+				}
+				WorkerSetCmd::ResyncNWorkers { n_workers } => {
+					self.garage
+						.block_manager
+						.resync
+						.set_n_workers(n_workers)
+						.await?; // a failure from the setter is returned to the caller
+					Ok(AdminRpc::Ok("Number of resync workers updated".into()))
+				}
+				WorkerSetCmd::ResyncTranquility { tranquility } => {
+					self.garage
+						.block_manager
+						.resync
+						.set_tranquility(tranquility)
+						.await?; // a failure from the setter is returned to the caller
+					Ok(AdminRpc::Ok("Resync tranquility updated".into()))
+				}
+			},
+		}
+	}
 }
 
@@ -776,6 +884,7 @@ impl EndpointHandler<AdminRpc> for AdminRpcHandler {
 			AdminRpc::Migrate(opt) => self.handle_migrate(opt.clone()).await,
 			AdminRpc::LaunchRepair(opt) => self.handle_launch_repair(opt.clone()).await,
 			AdminRpc::Stats(opt) => self.handle_stats(opt.clone()).await,
+			AdminRpc::Worker(opt) => self.handle_worker_cmd(opt.clone()).await,
 			m => Err(GarageError::unexpected_rpc_message(m).into()),
 		}
 	}
diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs
index a90277a0..c8b96489 100644
--- a/src/garage/cli/cmd.rs
+++ b/src/garage/cli/cmd.rs
@@ -1,6 +1,8 @@
 use std::collections::HashSet;
+use std::time::Duration;
 
 use garage_util::error::*;
+use garage_util::formater::format_table;
 
 use garage_rpc::layout::*;
 use garage_rpc::system::*;
@@ -38,13 +40,14 @@ pub async fn cli_command_dispatch(
 			cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::LaunchRepair(ro)).await
 		}
 		Command::Stats(so) => cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::Stats(so)).await,
+		Command::Worker(wo) => cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::Worker(wo)).await,
 		_ => unreachable!(),
 	}
 }
 
 pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) -> Result<(), Error> {
 	let status = match rpc_cli
-		.call(&rpc_host, &SystemRpc::GetKnownNodes, PRIO_NORMAL)
+		.call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL)
 		.await??
 	{
 		SystemRpc::ReturnKnownNodes(nodes) => nodes,
@@ -85,19 +88,21 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
 	format_table(healthy_nodes);
 
 	let status_keys = status.iter().map(|adv| adv.id).collect::<HashSet<_>>();
-	let failure_case_1 = status.iter().any(|adv| !adv.is_up);
+	let failure_case_1 = status
+		.iter()
+		.any(|adv| !adv.is_up && matches!(layout.roles.get(&adv.id), Some(NodeRoleV(Some(_)))));
 	let failure_case_2 = layout
 		.roles
 		.items()
 		.iter()
-		.filter(|(_, _, v)| v.0.is_some())
-		.any(|(id, _, _)| !status_keys.contains(id));
+		.any(|(id, _, v)| !status_keys.contains(id) && v.0.is_some());
 	if failure_case_1 || failure_case_2 {
 		println!("\n==== FAILED NODES ====");
 		let mut failed_nodes =
 			vec!["ID\tHostname\tAddress\tTags\tZone\tCapacity\tLast seen".to_string()];
 		for adv in status.iter().filter(|adv| !adv.is_up) {
 			if let Some(NodeRoleV(Some(cfg))) = layout.roles.get(&adv.id) {
+				let tf = timeago::Formatter::new();
 				failed_nodes.push(format!(
 					"{id:?}\t{host}\t{addr}\t[{tags}]\t{zone}\t{capacity}\t{last_seen}",
 					id = adv.id,
@@ -108,7 +113,7 @@ pub async fn cmd_status(rpc_cli: &Endpoint<SystemRpc, ()>, rpc_host: NodeID) ->
 					capacity = cfg.capacity_string(),
 					last_seen = adv
 						.last_seen_secs_ago
-						.map(|s| format!("{}s ago", s))
+						.map(|s| tf.convert(Duration::from_secs(s)))
 						.unwrap_or_else(|| "never seen".into()),
 				));
 			}
@@ -144,7 +149,7 @@ pub async fn cmd_connect(
 	args: ConnectNodeOpt,
 ) -> Result<(), Error> {
 	match rpc_cli
-		.call(&rpc_host, &SystemRpc::Connect(args.node), PRIO_NORMAL)
+		.call(&rpc_host, SystemRpc::Connect(args.node), PRIO_NORMAL)
 		.await??
 	{
 		SystemRpc::Ok => {
@@ -160,15 +165,19 @@ pub async fn cmd_admin(
 	rpc_host: NodeID,
 	args: AdminRpc,
 ) -> Result<(), HelperError> {
-	match rpc_cli.call(&rpc_host, &args, PRIO_NORMAL).await?? {
+	match rpc_cli.call(&rpc_host, args, PRIO_NORMAL).await?? {
 		AdminRpc::Ok(msg) => {
 			println!("{}", msg);
 		}
 		AdminRpc::BucketList(bl) => {
 			print_bucket_list(bl);
 		}
-		AdminRpc::BucketInfo(bucket, rk) => {
-			print_bucket_info(&bucket, &rk);
+		AdminRpc::BucketInfo {
+			bucket,
+			relevant_keys,
+			counters,
+		} => {
+			print_bucket_info(&bucket, &relevant_keys, &counters);
 		}
 		AdminRpc::KeyList(kl) => {
 			print_key_list(kl);
@@ -176,6 +185,9 @@ pub async fn cmd_admin(
 		AdminRpc::KeyInfo(key, rb) => {
 			print_key_info(&key, &rb);
 		}
+		AdminRpc::WorkerList(wi, wlo) => {
+			print_worker_info(wi, wlo);
+		}
 		r => {
 			error!("Unexpected response: {:?}", r);
 		}
diff --git a/src/garage/cli/layout.rs b/src/garage/cli/layout.rs
index e76f7737..3884bb92 100644
--- a/src/garage/cli/layout.rs
+++ b/src/garage/cli/layout.rs
@@ -1,6 +1,6 @@
 use garage_util::crdt::Crdt;
-use garage_util::data::*;
 use garage_util::error::*;
+use garage_util::formater::format_table;
 
 use garage_rpc::layout::*;
 use garage_rpc::system::*;
@@ -36,21 +36,29 @@ pub async fn cmd_assign_role(
 	args: AssignRoleOpt,
 ) -> Result<(), Error> {
 	let status = match rpc_cli
-		.call(&rpc_host, &SystemRpc::GetKnownNodes, PRIO_NORMAL)
+		.call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL)
 		.await??
 	{
 		SystemRpc::ReturnKnownNodes(nodes) => nodes,
 		resp => return Err(Error::Message(format!("Invalid RPC response: {:?}", resp))),
 	};
 
+	let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
+
 	let added_nodes = args
 		.node_ids
 		.iter()
-		.map(|node_id| find_matching_node(status.iter().map(|adv| adv.id), node_id))
+		.map(|node_id| {
+			find_matching_node(
+				status
+					.iter()
+					.map(|adv| adv.id)
+					.chain(layout.node_ids().iter().cloned()),
+				node_id,
+			)
+		})
 		.collect::<Result<Vec<_>, _>>()?;
 
-	let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
-
 	let mut roles = layout.roles.clone();
 	roles.merge(&layout.staging);
 
@@ -203,31 +211,9 @@ pub async fn cmd_apply_layout(
 	rpc_host: NodeID,
 	apply_opt: ApplyLayoutOpt,
 ) -> Result<(), Error> {
-	let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
-
-	match apply_opt.version {
-		None => {
-			println!("Please pass the --version flag to ensure that you are writing the correct version of the cluster layout.");
-			println!("To know the correct value of the --version flag, invoke `garage layout show` and review the proposed changes.");
-			return Err(Error::Message("--version flag is missing".into()));
-		}
-		Some(v) => {
-			if v != layout.version + 1 {
-				return Err(Error::Message("Invalid value of --version flag".into()));
-			}
-		}
-	}
-
-	layout.roles.merge(&layout.staging);
-
-	if !layout.calculate_partition_assignation() {
-		return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into()));
-	}
+	let layout = fetch_layout(rpc_cli, rpc_host).await?;
 
-	layout.staging.clear();
-	layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]);
-
-	layout.version += 1;
+	let layout = layout.apply_staged_changes(apply_opt.version)?;
 
 	send_layout(rpc_cli, rpc_host, layout).await?;
 
@@ -242,25 +228,9 @@ pub async fn cmd_revert_layout(
 	rpc_host: NodeID,
 	revert_opt: RevertLayoutOpt,
 ) -> Result<(), Error> {
-	let mut layout = fetch_layout(rpc_cli, rpc_host).await?;
-
-	match revert_opt.version {
-		None => {
-			println!("Please pass the --version flag to ensure that you are writing the correct version of the cluster layout.");
-			println!("To know the correct value of the --version flag, invoke `garage layout show` and review the proposed changes.");
-			return Err(Error::Message("--version flag is missing".into()));
-		}
-		Some(v) => {
-			if v != layout.version + 1 {
-				return Err(Error::Message("Invalid value of --version flag".into()));
-			}
-		}
-	}
-
-	layout.staging.clear();
-	layout.staging_hash = blake2sum(&rmp_to_vec_all_named(&layout.staging).unwrap()[..]);
+	let layout = fetch_layout(rpc_cli, rpc_host).await?;
 
-	layout.version += 1;
+	let layout = layout.revert_staged_changes(revert_opt.version)?;
 
 	send_layout(rpc_cli, rpc_host, layout).await?;
 
@@ -275,7 +245,7 @@ pub async fn fetch_layout(
 	rpc_host: NodeID,
 ) -> Result<ClusterLayout, Error> {
 	match rpc_cli
-		.call(&rpc_host, &SystemRpc::PullClusterLayout, PRIO_NORMAL)
+		.call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL)
 		.await??
 	{
 		SystemRpc::AdvertiseClusterLayout(t) => Ok(t),
@@ -291,7 +261,7 @@ pub async fn send_layout(
 	rpc_cli
 		.call(
 			&rpc_host,
-			&SystemRpc::AdvertiseClusterLayout(layout),
+			SystemRpc::AdvertiseClusterLayout(layout),
 			PRIO_NORMAL,
 		)
 		.await??;
@@ -323,11 +293,20 @@ pub fn print_cluster_layout(layout: &ClusterLayout) -> bool {
 }
 
 pub fn print_staging_role_changes(layout: &ClusterLayout) -> bool {
-	if !layout.staging.items().is_empty() {
+	let has_changes = layout
+		.staging
+		.items()
+		.iter()
+		.any(|(k, _, v)| layout.roles.get(k) != Some(v));
+
+	if has_changes {
 		println!();
 		println!("==== STAGED ROLE CHANGES ====");
 		let mut table = vec!["ID\tTags\tZone\tCapacity".to_string()];
 		for (id, _, role) in layout.staging.items().iter() {
+			if layout.roles.get(id) == Some(role) {
+				continue;
+			}
 			if let Some(role) = &role.0 {
 				let tags = role.tags.join(",");
 				table.push(format!(
diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs
index a0c49aeb..06548e89 100644
--- a/src/garage/cli/structs.rs
+++ b/src/garage/cli/structs.rs
@@ -1,55 +1,65 @@
 use serde::{Deserialize, Serialize};
-
 use structopt::StructOpt;
 
+use garage_util::version::garage_version;
+
 #[derive(StructOpt, Debug)]
 pub enum Command {
 	/// Run Garage server
-	#[structopt(name = "server")]
+	#[structopt(name = "server", version = garage_version())]
 	Server,
 
 	/// Get network status
-	#[structopt(name = "status")]
+	#[structopt(name = "status", version = garage_version())]
 	Status,
 
 	/// Operations on individual Garage nodes
-	#[structopt(name = "node")]
+	#[structopt(name = "node", version = garage_version())]
 	Node(NodeOperation),
 
 	/// Operations on the assignation of node roles in the cluster layout
-	#[structopt(name = "layout")]
+	#[structopt(name = "layout", version = garage_version())]
 	Layout(LayoutOperation),
 
 	/// Operations on buckets
-	#[structopt(name = "bucket")]
+	#[structopt(name = "bucket", version = garage_version())]
 	Bucket(BucketOperation),
 
 	/// Operations on S3 access keys
-	#[structopt(name = "key")]
+	#[structopt(name = "key", version = garage_version())]
 	Key(KeyOperation),
 
 	/// Run migrations from previous Garage version
 	/// (DO NOT USE WITHOUT READING FULL DOCUMENTATION)
-	#[structopt(name = "migrate")]
+	#[structopt(name = "migrate", version = garage_version())]
 	Migrate(MigrateOpt),
 
-	/// Start repair of node data
-	#[structopt(name = "repair")]
+	/// Start repair of node data on remote node
+	#[structopt(name = "repair", version = garage_version())]
 	Repair(RepairOpt),
 
+	/// Offline reparation of node data (these repairs must be run offline
+	/// directly on the server node)
+	#[structopt(name = "offline-repair", version = garage_version())]
+	OfflineRepair(OfflineRepairOpt),
+
 	/// Gather node statistics
-	#[structopt(name = "stats")]
+	#[structopt(name = "stats", version = garage_version())]
 	Stats(StatsOpt),
+
+	/// Manage background workers
+	#[structopt(name = "worker", version = garage_version())]
+	Worker(WorkerOpt),
 }
 
 #[derive(StructOpt, Debug)]
 pub enum NodeOperation {
 	/// Print identifier (public key) of this Garage node
-	#[structopt(name = "id")]
+	#[structopt(name = "id", version = garage_version())]
 	NodeId(NodeIdOpt),
 
 	/// Connect to Garage node that is currently isolated from the system
-	#[structopt(name = "connect")]
+	#[structopt(name = "connect", version = garage_version())]
 	Connect(ConnectNodeOpt),
 }
 
@@ -70,23 +80,23 @@ pub struct ConnectNodeOpt {
 #[derive(StructOpt, Debug)]
 pub enum LayoutOperation {
 	/// Assign role to Garage node
-	#[structopt(name = "assign")]
+	#[structopt(name = "assign", version = garage_version())]
 	Assign(AssignRoleOpt),
 
 	/// Remove role from Garage cluster node
-	#[structopt(name = "remove")]
+	#[structopt(name = "remove", version = garage_version())]
 	Remove(RemoveRoleOpt),
 
 	/// Show roles currently assigned to nodes and changes staged for commit
-	#[structopt(name = "show")]
+	#[structopt(name = "show", version = garage_version())]
 	Show,
 
 	/// Apply staged changes to cluster layout
-	#[structopt(name = "apply")]
+	#[structopt(name = "apply", version = garage_version())]
 	Apply(ApplyLayoutOpt),
 
 	/// Revert staged changes to cluster layout
-	#[structopt(name = "revert")]
+	#[structopt(name = "revert", version = garage_version())]
 	Revert(RevertLayoutOpt),
 }
 
@@ -141,40 +151,44 @@ pub struct RevertLayoutOpt {
 #[derive(Serialize, Deserialize, StructOpt, Debug)]
 pub enum BucketOperation {
 	/// List buckets
-	#[structopt(name = "list")]
+	#[structopt(name = "list", version = garage_version())]
 	List,
 
 	/// Get bucket info
-	#[structopt(name = "info")]
+	#[structopt(name = "info", version = garage_version())]
 	Info(BucketOpt),
 
 	/// Create bucket
-	#[structopt(name = "create")]
+	#[structopt(name = "create", version = garage_version())]
 	Create(BucketOpt),
 
 	/// Delete bucket
-	#[structopt(name = "delete")]
+	#[structopt(name = "delete", version = garage_version())]
 	Delete(DeleteBucketOpt),
 
 	/// Alias bucket under new name
-	#[structopt(name = "alias")]
+	#[structopt(name = "alias", version = garage_version())]
 	Alias(AliasBucketOpt),
 
 	/// Remove bucket alias
-	#[structopt(name = "unalias")]
+	#[structopt(name = "unalias", version = garage_version())]
 	Unalias(UnaliasBucketOpt),
 
 	/// Allow key to read or write to bucket
-	#[structopt(name = "allow")]
+	#[structopt(name = "allow", version = garage_version())]
 	Allow(PermBucketOpt),
 
 	/// Deny key from reading or writing to bucket
-	#[structopt(name = "deny")]
+	#[structopt(name = "deny", version = garage_version())]
 	Deny(PermBucketOpt),
 
 	/// Expose as website or not
-	#[structopt(name = "website")]
+	#[structopt(name = "website", version = garage_version())]
 	Website(WebsiteOpt),
+
+	/// Set the quotas for this bucket
+	#[structopt(name = "set-quotas", version = garage_version())]
+	SetQuotas(SetQuotasOpt),
 }
 
 #[derive(Serialize, Deserialize, StructOpt, Debug)]
@@ -262,37 +276,52 @@ pub struct PermBucketOpt {
 }
 
 #[derive(Serialize, Deserialize, StructOpt, Debug)]
+pub struct SetQuotasOpt {
+	/// Bucket name
+	pub bucket: String,
+
+	/// Set a maximum size for the bucket (specify a size e.g. in MiB or GiB,
+	/// or `none` for no size restriction)
+	#[structopt(long = "max-size")]
+	pub max_size: Option<String>,
+
+	/// Set a maximum number of objects for the bucket (or `none` for no restriction)
+	#[structopt(long = "max-objects")]
+	pub max_objects: Option<String>,
+}
+
+#[derive(Serialize, Deserialize, StructOpt, Debug)]
 pub enum KeyOperation {
 	/// List keys
-	#[structopt(name = "list")]
+	#[structopt(name = "list", version = garage_version())]
 	List,
 
 	/// Get key info
-	#[structopt(name = "info")]
+	#[structopt(name = "info", version = garage_version())]
 	Info(KeyOpt),
 
 	/// Create new key
-	#[structopt(name = "new")]
+	#[structopt(name = "new", version = garage_version())]
 	New(KeyNewOpt),
 
 	/// Rename key
-	#[structopt(name = "rename")]
+	#[structopt(name = "rename", version = garage_version())]
 	Rename(KeyRenameOpt),
 
 	/// Delete key
-	#[structopt(name = "delete")]
+	#[structopt(name = "delete", version = garage_version())]
 	Delete(KeyDeleteOpt),
 
 	/// Set permission flags for key
-	#[structopt(name = "allow")]
+	#[structopt(name = "allow", version = garage_version())]
 	Allow(KeyPermOpt),
 
 	/// Unset permission flags for key
-	#[structopt(name = "deny")]
+	#[structopt(name = "deny", version = garage_version())]
 	Deny(KeyPermOpt),
 
 	/// Import key
-	#[structopt(name = "import")]
+	#[structopt(name = "import", version = garage_version())]
 	Import(KeyImportOpt),
 }
 
@@ -364,7 +393,7 @@ pub struct MigrateOpt {
 #[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
 pub enum MigrateWhat {
 	/// Migrate buckets and permissions from v0.5.0
-	#[structopt(name = "buckets050")]
+	#[structopt(name = "buckets050", version = garage_version())]
 	Buckets050,
 }
 
@@ -385,27 +414,69 @@ pub struct RepairOpt {
 #[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
 pub enum RepairWhat {
 	/// Only do a full sync of metadata tables
-	#[structopt(name = "tables")]
+	#[structopt(name = "tables", version = garage_version())]
 	Tables,
 	/// Only repair (resync/rebalance) the set of stored blocks
-	#[structopt(name = "blocks")]
+	#[structopt(name = "blocks", version = garage_version())]
 	Blocks,
 	/// Only redo the propagation of object deletions to the version table (slow)
-	#[structopt(name = "versions")]
+	#[structopt(name = "versions", version = garage_version())]
 	Versions,
 	/// Only redo the propagation of version deletions to the block ref table (extremely slow)
-	#[structopt(name = "block_refs")]
+	#[structopt(name = "block_refs", version = garage_version())]
 	BlockRefs,
 	/// Verify integrity of all blocks on disc (extremely slow, i/o intensive)
-	#[structopt(name = "scrub")]
+	#[structopt(name = "scrub", version = garage_version())]
 	Scrub {
-		/// Tranquility factor (see tranquilizer documentation)
-		#[structopt(name = "tranquility", default_value = "2")]
+		#[structopt(subcommand)]
+		cmd: ScrubCmd,
+	},
+}
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
+pub enum ScrubCmd {
+	/// Start scrub
+	#[structopt(name = "start", version = garage_version())]
+	Start,
+	/// Pause scrub (it will resume automatically after 24 hours)
+	#[structopt(name = "pause", version = garage_version())]
+	Pause,
+	/// Resume paused scrub
+	#[structopt(name = "resume", version = garage_version())]
+	Resume,
+	/// Cancel scrub in progress
+	#[structopt(name = "cancel", version = garage_version())]
+	Cancel,
+	/// Set tranquility level for in-progress and future scrubs
+	#[structopt(name = "set-tranquility", version = garage_version())]
+	SetTranquility {
+		#[structopt()]
 		tranquility: u32,
 	},
 }
 
 #[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
+pub struct OfflineRepairOpt {
+	/// Confirm the launch of the repair operation
+	#[structopt(long = "yes")]
+	pub yes: bool,
+
+	#[structopt(subcommand)]
+	pub what: OfflineRepairWhat,
+}
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
+pub enum OfflineRepairWhat {
+	/// Repair K2V item counters
+	#[cfg(feature = "k2v")]
+	#[structopt(name = "k2v_item_counters", version = garage_version())]
+	K2VItemCounters,
+	/// Repair object counters
+	#[structopt(name = "object_counters", version = garage_version())]
+	ObjectCounters,
+}
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
 pub struct StatsOpt {
 	/// Gather statistics from all nodes
 	#[structopt(short = "a", long = "all-nodes")]
@@ -415,3 +486,48 @@ pub struct StatsOpt {
 	#[structopt(short = "d", long = "detailed")]
 	pub detailed: bool,
 }
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Clone)]
+pub struct WorkerOpt {
+	#[structopt(subcommand)]
+	pub cmd: WorkerCmd,
+}
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
+pub enum WorkerCmd {
+	/// List all workers on Garage node
+	#[structopt(name = "list", version = garage_version())]
+	List {
+		#[structopt(flatten)]
+		opt: WorkerListOpt,
+	},
+	/// Set worker parameter
+	#[structopt(name = "set", version = garage_version())]
+	Set {
+		#[structopt(subcommand)]
+		opt: WorkerSetCmd,
+	},
+}
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone, Copy)]
+pub struct WorkerListOpt {
+	/// Show only busy workers
+	#[structopt(short = "b", long = "busy")]
+	pub busy: bool,
+	/// Show only workers with errors
+	#[structopt(short = "e", long = "errors")]
+	pub errors: bool,
+}
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone)]
+pub enum WorkerSetCmd {
+	/// Set tranquility of scrub operations
+	#[structopt(name = "scrub-tranquility", version = garage_version())]
+	ScrubTranquility { tranquility: u32 },
+	/// Set number of concurrent block resync workers
+	#[structopt(name = "resync-n-workers", version = garage_version())]
+	ResyncNWorkers { n_workers: usize },
+	/// Set tranquility of block resync operations
+	#[structopt(name = "resync-tranquility", version = garage_version())]
+	ResyncTranquility { tranquility: u32 },
+}
diff --git a/src/garage/cli/util.rs b/src/garage/cli/util.rs
index 7d496507..396938ae 100644
--- a/src/garage/cli/util.rs
+++ b/src/garage/cli/util.rs
@@ -1,11 +1,18 @@
 use std::collections::HashMap;
+use std::time::Duration;
 
+use garage_util::background::*;
 use garage_util::crdt::*;
 use garage_util::data::Uuid;
 use garage_util::error::*;
+use garage_util::formater::format_table;
+use garage_util::time::*;
 
 use garage_model::bucket_table::*;
 use garage_model::key_table::*;
+use garage_model::s3::object_table::{BYTES, OBJECTS, UNFINISHED_UPLOADS};
+
+use crate::cli::structs::WorkerListOpt;
 
 pub fn print_bucket_list(bl: Vec<Bucket>) {
 	println!("List of buckets:");
@@ -28,11 +35,12 @@ pub fn print_bucket_list(bl: Vec<Bucket>) {
 			[((k, n), _, _)] => format!("{}:{}", k, n),
 			s => format!("[{} local aliases]", s.len()),
 		};
+
 		table.push(format!(
 			"\t{}\t{}\t{}",
 			aliases.join(","),
 			local_aliases_n,
-			hex::encode(bucket.id)
+			hex::encode(bucket.id),
 		));
 	}
 	format_table(table);
@@ -120,7 +128,11 @@ pub fn print_key_info(key: &Key, relevant_buckets: &HashMap<Uuid, Bucket>) {
 	}
 }
 
-pub fn print_bucket_info(bucket: &Bucket, relevant_keys: &HashMap<String, Key>) {
+pub fn print_bucket_info(
+	bucket: &Bucket,
+	relevant_keys: &HashMap<String, Key>,
+	counters: &HashMap<String, i64>,
+) {
 	let key_name = |k| {
 		relevant_keys
 			.get(k)
@@ -132,7 +144,42 @@ pub fn print_bucket_info(bucket: &Bucket, relevant_keys: &HashMap<String, Key>)
 	match &bucket.state {
 		Deletable::Deleted => println!("Bucket is deleted."),
 		Deletable::Present(p) => {
-			println!("Website access: {}", p.website_config.get().is_some());
+			let size =
+				bytesize::ByteSize::b(counters.get(BYTES).cloned().unwrap_or_default() as u64);
+			println!(
+				"\nSize: {} ({})",
+				size.to_string_as(true),
+				size.to_string_as(false)
+			);
+			println!(
+				"Objects: {}",
+				counters.get(OBJECTS).cloned().unwrap_or_default()
+			);
+			println!(
+				"Unfinished multipart uploads: {}",
+				counters
+					.get(UNFINISHED_UPLOADS)
+					.cloned()
+					.unwrap_or_default()
+			);
+
+			println!("\nWebsite access: {}", p.website_config.get().is_some());
+
+			let quotas = p.quotas.get();
+			if quotas.max_size.is_some() || quotas.max_objects.is_some() {
+				println!("\nQuotas:");
+				if let Some(ms) = quotas.max_size {
+					let ms = bytesize::ByteSize::b(ms);
+					println!(
+						" maximum size: {} ({})",
+						ms.to_string_as(true),
+						ms.to_string_as(false)
+					);
+				}
+				if let Some(mo) = quotas.max_objects {
+					println!(" maximum number of objects: {}", mo);
+				}
+			}
 
 			println!("\nGlobal aliases:");
 			for (alias, _, active) in p.aliases.items().iter() {
@@ -173,42 +220,13 @@ pub fn print_bucket_info(bucket: &Bucket, relevant_keys: &HashMap<String, Key>)
 	};
 }
 
-pub fn format_table(data: Vec<String>) {
-	let data = data
-		.iter()
-		.map(|s| s.split('\t').collect::<Vec<_>>())
-		.collect::<Vec<_>>();
-
-	let columns = data.iter().map(|row| row.len()).fold(0, std::cmp::max);
-	let mut column_size = vec![0; columns];
-
-	let mut out = String::new();
-
-	for row in data.iter() {
-		for (i, col) in row.iter().enumerate() {
-			column_size[i] = std::cmp::max(column_size[i], col.chars().count());
-		}
-	}
-
-	for row in data.iter() {
-		for (col, col_len) in row[..row.len() - 1].iter().zip(column_size.iter()) {
-			out.push_str(col);
-			(0..col_len - col.chars().count() + 2).for_each(|_| out.push(' '));
-		}
-		out.push_str(row[row.len() - 1]);
-		out.push('\n');
-	}
-
-	print!("{}", out);
-}
-
 pub fn find_matching_node(
 	cand: impl std::iter::Iterator<Item = Uuid>,
 	pattern: &str,
 ) -> Result<Uuid, Error> {
 	let mut candidates = vec![];
 	for c in cand {
-		if hex::encode(&c).starts_with(&pattern) {
+		if hex::encode(&c).starts_with(&pattern) && !candidates.contains(&c) {
 			candidates.push(c);
 		}
 	}
@@ -222,3 +240,68 @@ pub fn find_matching_node(
 		Ok(candidates[0])
 	}
 }
+
+/// Print a table of background workers to stdout.
+///
+/// Rows are ordered with busy/throttled workers first, then idle ones, then
+/// finished ones, and by worker id within each group. `wlo.busy` keeps only
+/// busy/throttled workers; `wlo.errors` keeps only workers whose error count
+/// is non-zero.
+pub fn print_worker_info(wi: HashMap<usize, WorkerInfo>, wlo: WorkerListOpt) {
+	let mut wi = wi.into_iter().collect::<Vec<_>>();
+	wi.sort_by_key(|(tid, info)| {
+		(
+			match info.state {
+				WorkerState::Busy | WorkerState::Throttled(_) => 0,
+				WorkerState::Idle => 1,
+				WorkerState::Done => 2,
+			},
+			*tid,
+		)
+	});
+
+	// The formatter does not depend on the row, so build it once.
+	let tf = timeago::Formatter::new();
+
+	let mut table = vec![];
+	for (tid, info) in wi.iter() {
+		if wlo.busy && !matches!(info.state, WorkerState::Busy | WorkerState::Throttled(_)) {
+			continue;
+		}
+		if wlo.errors && info.errors == 0 {
+			continue;
+		}
+
+		table.push(format!("{}\t{}\t{}", tid, info.state, info.name));
+		if let Some(i) = &info.info {
+			table.push(format!("\t\t  {}", i));
+		}
+		let (err_ago, err_msg) = info
+			.last_error
+			.as_ref()
+			.map(|(m, t)| {
+				(
+					// Saturate instead of using `-`: if `t` is ahead of the local
+					// clock, a plain subtraction would underflow (panic in debug
+					// builds, an absurdly large duration in release builds).
+					tf.convert(Duration::from_millis(now_msec().saturating_sub(*t))),
+					m.as_str(),
+				)
+			})
+			.unwrap_or(("(?) ago".into(), "(?)"));
+		if info.consecutive_errors > 0 {
+			table.push(format!(
+				"\t\t  {} consecutive errors ({} total), last {}",
+				info.consecutive_errors, info.errors, err_ago,
+			));
+			table.push(format!("\t\t  {}", err_msg));
+		} else if info.errors > 0 {
+			table.push(format!("\t\t  ({} errors, last {})", info.errors, err_ago,));
+			// The message itself is only shown when listing errors explicitly.
+			if wlo.errors {
+				table.push(format!("\t\t  {}", err_msg));
+			}
+		}
+	}
+	format_table(table);
+}
diff --git a/src/garage/main.rs b/src/garage/main.rs
index e898e680..e5cba553 100644
--- a/src/garage/main.rs
+++ b/src/garage/main.rs
@@ -8,6 +8,14 @@ mod admin;
 mod cli;
 mod repair;
 mod server;
+#[cfg(feature = "telemetry-otlp")]
+mod tracing_setup;
+
+#[cfg(not(any(feature = "bundled-libs", feature = "system-libs")))]
+compile_error!("Either bundled-libs or system-libs Cargo feature must be enabled");
+
+#[cfg(all(feature = "bundled-libs", feature = "system-libs"))]
+compile_error!("Only one of bundled-libs and system-libs Cargo features must be enabled");
 
 use std::net::SocketAddr;
 use std::path::PathBuf;
@@ -28,7 +36,10 @@ use admin::*;
 use cli::*;
 
 #[derive(StructOpt, Debug)]
-#[structopt(name = "garage")]
+#[structopt(
+	name = "garage",
+	about = "S3-compatible object store for self-hosted geo-distributed deployments"
+)]
 struct Opt {
 	/// Host to connect to for admin operations, in the format:
 	/// <public-key>@<ip>:<port>
@@ -57,20 +68,56 @@ async fn main() {
 	if std::env::var("RUST_LOG").is_err() {
 		std::env::set_var("RUST_LOG", "netapp=info,garage=info")
 	}
-	pretty_env_logger::init();
+	tracing_subscriber::fmt()
+		.with_writer(std::io::stderr)
+		.with_env_filter(tracing_subscriber::filter::EnvFilter::from_default_env())
+		.init();
 	sodiumoxide::init().expect("Unable to init sodiumoxide");
 
-	let opt = Opt::from_args();
+	// Abort on panic (same behavior as in Go)
+	std::panic::set_hook(Box::new(|panic_info| {
+		error!("{}", panic_info.to_string());
+		std::process::abort();
+	}));
+
+	// Initialize version and features info
+	let features = &[
+		#[cfg(feature = "k2v")]
+		"k2v",
+		#[cfg(feature = "sled")]
+		"sled",
+		#[cfg(feature = "lmdb")]
+		"lmdb",
+		#[cfg(feature = "sqlite")]
+		"sqlite",
+		#[cfg(feature = "kubernetes-discovery")]
+		"kubernetes-discovery",
+		#[cfg(feature = "metrics")]
+		"metrics",
+		#[cfg(feature = "telemetry-otlp")]
+		"telemetry-otlp",
+		#[cfg(feature = "bundled-libs")]
+		"bundled-libs",
+		#[cfg(feature = "system-libs")]
+		"system-libs",
+	][..];
+	if let Some(git_version) = option_env!("GIT_VERSION") {
+		garage_util::version::init_version(git_version);
+	}
+	garage_util::version::init_features(features);
+
+	// Parse arguments
+	let version = format!(
+		"{} [features: {}]",
+		garage_util::version::garage_version(),
+		features.join(", ")
+	);
+	let opt = Opt::from_clap(&Opt::clap().version(version.as_str()).get_matches());
 
 	let res = match opt.cmd {
-		Command::Server => {
-			// Abort on panic (same behavior as in Go)
-			std::panic::set_hook(Box::new(|panic_info| {
-				error!("{}", panic_info.to_string());
-				std::process::abort();
-			}));
-
-			server::run_server(opt.config_file).await
+		Command::Server => server::run_server(opt.config_file).await,
+		Command::OfflineRepair(repair_opt) => {
+			repair::offline::offline_repair(opt.config_file, repair_opt).await
 		}
 		Command::Node(NodeOperation::NodeId(node_id_opt)) => {
 			node_id_command(opt.config_file, node_id_opt.quiet)
@@ -115,7 +162,13 @@ async fn cli_command(opt: Opt) -> Result<(), Error> {
 	} else {
 		let node_id = garage_rpc::system::read_node_id(&config.as_ref().unwrap().metadata_dir)
 			.err_context(READ_KEY_ERROR)?;
-		if let Some(a) = config.as_ref().and_then(|c| c.rpc_public_addr) {
+		if let Some(a) = config.as_ref().and_then(|c| c.rpc_public_addr.as_ref()) {
+			use std::net::ToSocketAddrs;
+			let a = a
+				.to_socket_addrs()
+				.ok_or_message("unable to resolve rpc_public_addr specified in config file")?
+				.next()
+				.ok_or_message("unable to resolve rpc_public_addr specified in config file")?;
 			(node_id, a)
 		} else {
 			let default_addr = SocketAddr::new(
@@ -141,6 +194,7 @@ async fn cli_command(opt: Opt) -> Result<(), Error> {
 	match cli_command_dispatch(opt.cmd, &system_rpc_endpoint, &admin_rpc_endpoint, id).await {
 		Err(HelperError::Internal(i)) => Err(Error::Message(format!("Internal error: {}", i))),
 		Err(HelperError::BadRequest(b)) => Err(Error::Message(b)),
+		Err(e) => Err(Error::Message(format!("{}", e))),
 		Ok(x) => Ok(x),
 	}
 }
diff --git a/src/garage/repair.rs b/src/garage/repair.rs
deleted file mode 100644
index 3666ca8f..00000000
--- a/src/garage/repair.rs
+++ /dev/null
@@ -1,149 +0,0 @@
-use std::sync::Arc;
-
-use tokio::sync::watch;
-
-use garage_model::block_ref_table::*;
-use garage_model::garage::Garage;
-use garage_model::object_table::*;
-use garage_model::version_table::*;
-use garage_table::*;
-use garage_util::error::Error;
-
-use crate::*;
-
-pub struct Repair {
-	pub garage: Arc<Garage>,
-}
-
-impl Repair {
-	pub async fn repair_worker(&self, opt: RepairOpt, must_exit: watch::Receiver<bool>) {
-		if let Err(e) = self.repair_worker_aux(opt, must_exit).await {
-			warn!("Repair worker failed with error: {}", e);
-		}
-	}
-
-	async fn repair_worker_aux(
-		&self,
-		opt: RepairOpt,
-		must_exit: watch::Receiver<bool>,
-	) -> Result<(), Error> {
-		match opt.what {
-			RepairWhat::Tables => {
-				info!("Launching a full sync of tables");
-				self.garage.bucket_table.syncer.add_full_sync();
-				self.garage.object_table.syncer.add_full_sync();
-				self.garage.version_table.syncer.add_full_sync();
-				self.garage.block_ref_table.syncer.add_full_sync();
-				self.garage.key_table.syncer.add_full_sync();
-			}
-			RepairWhat::Versions => {
-				info!("Repairing the versions table");
-				self.repair_versions(&must_exit).await?;
-			}
-			RepairWhat::BlockRefs => {
-				info!("Repairing the block refs table");
-				self.repair_block_ref(&must_exit).await?;
-			}
-			RepairWhat::Blocks => {
-				info!("Repairing the stored blocks");
-				self.garage
-					.block_manager
-					.repair_data_store(&must_exit)
-					.await?;
-			}
-			RepairWhat::Scrub { tranquility } => {
-				info!("Verifying integrity of stored blocks");
-				self.garage
-					.block_manager
-					.scrub_data_store(&must_exit, tranquility)
-					.await?;
-			}
-		}
-		Ok(())
-	}
-
-	async fn repair_versions(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
-		let mut pos = vec![];
-
-		while let Some((item_key, item_bytes)) =
-			self.garage.version_table.data.store.get_gt(&pos)?
-		{
-			pos = item_key.to_vec();
-
-			let version = rmp_serde::decode::from_read_ref::<_, Version>(item_bytes.as_ref())?;
-			if version.deleted.get() {
-				continue;
-			}
-			let object = self
-				.garage
-				.object_table
-				.get(&version.bucket_id, &version.key)
-				.await?;
-			let version_exists = match object {
-				Some(o) => o
-					.versions()
-					.iter()
-					.any(|x| x.uuid == version.uuid && x.state != ObjectVersionState::Aborted),
-				None => false,
-			};
-			if !version_exists {
-				info!("Repair versions: marking version as deleted: {:?}", version);
-				self.garage
-					.version_table
-					.insert(&Version::new(
-						version.uuid,
-						version.bucket_id,
-						version.key,
-						true,
-					))
-					.await?;
-			}
-
-			if *must_exit.borrow() {
-				break;
-			}
-		}
-		Ok(())
-	}
-
-	async fn repair_block_ref(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
-		let mut pos = vec![];
-
-		while let Some((item_key, item_bytes)) =
-			self.garage.block_ref_table.data.store.get_gt(&pos)?
-		{
-			pos = item_key.to_vec();
-
-			let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(item_bytes.as_ref())?;
-			if block_ref.deleted.get() {
-				continue;
-			}
-			let version = self
-				.garage
-				.version_table
-				.get(&block_ref.version, &EmptyKey)
-				.await?;
-			// The version might not exist if it has been GC'ed
-			let ref_exists = version.map(|v| !v.deleted.get()).unwrap_or(false);
-			if !ref_exists {
-				info!(
-					"Repair block ref: marking block_ref as deleted: {:?}",
-					block_ref
-				);
-				self.garage
-					.block_ref_table
-					.insert(&BlockRef {
-						block: block_ref.block,
-						version: block_ref.version,
-						deleted: true.into(),
-					})
-					.await?;
-			}
-
-			if *must_exit.borrow() {
-				break;
-			}
-		}
-		Ok(())
-	}
-}
diff --git a/src/garage/repair/mod.rs b/src/garage/repair/mod.rs
new file mode 100644
index 00000000..4699ace5
--- /dev/null
+++ b/src/garage/repair/mod.rs
@@ -0,0 +1,2 @@
+pub mod offline;
+pub mod online;
diff --git a/src/garage/repair/offline.rs b/src/garage/repair/offline.rs
new file mode 100644
index 00000000..7760a8bd
--- /dev/null
+++ b/src/garage/repair/offline.rs
@@ -0,0 +1,69 @@
+use std::path::PathBuf;
+
+use tokio::sync::watch;
+
+use garage_util::background::*;
+use garage_util::config::*;
+use garage_util::error::*;
+
+use garage_model::garage::Garage;
+
+use crate::cli::structs::*;
+
+/// Run the offline repair operation selected by `opt.what`.
+///
+/// Nothing is done unless `opt.yes` is set. Otherwise the configuration is
+/// read from `config_file`, a background runner and a `Garage` instance are
+/// created, the selected counter table is recounted, and the background
+/// runner is signalled to stop and awaited before returning.
+///
+/// # Errors
+///
+/// Returns an error if `opt.yes` is not set, or if reading the configuration,
+/// creating the `Garage` instance, the recount, or awaiting the background
+/// runner fails.
+pub async fn offline_repair(config_file: PathBuf, opt: OfflineRepairOpt) -> Result<(), Error> {
+	if !opt.yes {
+		return Err(Error::Message(
+			"Please add the --yes flag to launch repair operation".into(),
+		));
+	}
+
+	info!("Loading configuration...");
+	let config = read_config(config_file)?;
+
+	info!("Initializing background runner...");
+	let (done_tx, done_rx) = watch::channel(false);
+	let (background, await_background_done) = BackgroundRunner::new(16, done_rx);
+
+	info!("Initializing Garage main data store...");
+	// `config` is not used again below, so it is moved instead of cloned.
+	let garage = Garage::new(config, background)?;
+
+	info!("Launching repair operation...");
+	match opt.what {
+		#[cfg(feature = "k2v")]
+		OfflineRepairWhat::K2VItemCounters => {
+			garage
+				.k2v
+				.counter_table
+				.offline_recount_all(&garage.k2v.item_table)?;
+		}
+		OfflineRepairWhat::ObjectCounters => {
+			garage
+				.object_counter_table
+				.offline_recount_all(&garage.object_table)?;
+		}
+	}
+
+	info!("Repair operation finished, shutting down Garage internals...");
+	// Flip the watch channel given to the background runner above.
+	done_tx.send(true).unwrap();
+	drop(garage);
+
+	await_background_done.await?;
+
+	info!("Cleaning up...");
+
+	Ok(())
+}
diff --git a/src/garage/repair/online.rs b/src/garage/repair/online.rs
new file mode 100644
index 00000000..e33cf097
--- /dev/null
+++ b/src/garage/repair/online.rs
@@ -0,0 +1,227 @@
+use std::sync::Arc;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use tokio::sync::watch;
+
+use garage_block::repair::ScrubWorkerCommand;
+use garage_model::garage::Garage;
+use garage_model::s3::block_ref_table::*;
+use garage_model::s3::object_table::*;
+use garage_model::s3::version_table::*;
+use garage_table::*;
+use garage_util::background::*;
+use garage_util::error::Error;
+
+use crate::*;
+
+/// Launch the online repair operation selected by `opt.what`.
+///
+/// Table repair queues a full sync on every table syncer; version, block ref
+/// and block repairs are handed to the background runner as workers; scrub
+/// subcommands are translated and sent to the block manager.
+pub async fn launch_online_repair(garage: Arc<Garage>, opt: RepairOpt) {
+	match opt.what {
+		RepairWhat::Tables => {
+			info!("Launching a full sync of tables");
+			garage.bucket_table.syncer.add_full_sync();
+			garage.object_table.syncer.add_full_sync();
+			garage.version_table.syncer.add_full_sync();
+			garage.block_ref_table.syncer.add_full_sync();
+			garage.key_table.syncer.add_full_sync();
+		}
+		RepairWhat::Versions => {
+			info!("Repairing the versions table");
+			let worker = RepairVersionsWorker::new(garage.clone());
+			garage.background.spawn_worker(worker);
+		}
+		RepairWhat::BlockRefs => {
+			info!("Repairing the block refs table");
+			let worker = RepairBlockrefsWorker::new(garage.clone());
+			garage.background.spawn_worker(worker);
+		}
+		RepairWhat::Blocks => {
+			info!("Repairing the stored blocks");
+			let worker = garage_block::repair::RepairWorker::new(garage.block_manager.clone());
+			garage.background.spawn_worker(worker);
+		}
+		RepairWhat::Scrub { cmd } => {
+			// "Pause" is sent with a 24 hour duration.
+			let pause_duration = Duration::from_secs(3600 * 24);
+			let scrub_cmd = match cmd {
+				ScrubCmd::Start => ScrubWorkerCommand::Start,
+				ScrubCmd::Pause => ScrubWorkerCommand::Pause(pause_duration),
+				ScrubCmd::Resume => ScrubWorkerCommand::Resume,
+				ScrubCmd::Cancel => ScrubWorkerCommand::Cancel,
+				ScrubCmd::SetTranquility { tranquility } => {
+					ScrubWorkerCommand::SetTranquility(tranquility)
+				}
+			};
+			info!("Sending command to scrub worker: {:?}", scrub_cmd);
+			garage.block_manager.send_scrub_command(scrub_cmd).await;
+		}
+	}
+}
+
+// ----
+
+/// Worker that scans the version table and marks as deleted the versions
+/// whose object is missing or no longer lists them in a non-aborted state.
+struct RepairVersionsWorker {
+	garage: Arc<Garage>,
+	/// Key of the last entry read from the table; the scan resumes after it.
+	pos: Vec<u8>,
+	/// Number of entries read so far.
+	counter: usize,
+}
+
+impl RepairVersionsWorker {
+	fn new(garage: Arc<Garage>) -> Self {
+		Self {
+			garage,
+			pos: Vec::new(),
+			counter: 0,
+		}
+	}
+}
+
+#[async_trait]
+impl Worker for RepairVersionsWorker {
+	fn name(&self) -> String {
+		String::from("Version repair worker")
+	}
+
+	fn info(&self) -> Option<String> {
+		let progress = format!("{} items done", self.counter);
+		Some(progress)
+	}
+
+	async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+		let next_entry = self.garage.version_table.data.store.get_gt(&self.pos)?;
+		let (key, item_bytes) = match next_entry {
+			Some(entry) => entry,
+			None => {
+				info!("repair_versions: finished, done {}", self.counter);
+				return Ok(WorkerState::Done);
+			}
+		};
+		self.pos = key;
+		self.counter += 1;
+
+		let version = rmp_serde::decode::from_read_ref::<_, Version>(&item_bytes)?;
+		if version.deleted.get() {
+			// Already marked as deleted: nothing to do for this entry.
+			return Ok(WorkerState::Busy);
+		}
+
+		let object = self
+			.garage
+			.object_table
+			.get(&version.bucket_id, &version.key)
+			.await?;
+		// The version is kept only if its object lists it in a non-aborted state.
+		let version_exists = object.map_or(false, |o| {
+			o.versions()
+				.iter()
+				.any(|x| x.uuid == version.uuid && x.state != ObjectVersionState::Aborted)
+		});
+		if version_exists {
+			return Ok(WorkerState::Busy);
+		}
+
+		info!("Repair versions: marking version as deleted: {:?}", version);
+		let tombstone = Version::new(version.uuid, version.bucket_id, version.key, true);
+		self.garage.version_table.insert(&tombstone).await?;
+
+		Ok(WorkerState::Busy)
+	}
+
+	async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
+		// `work` only ever returns `Busy` or `Done`; presumably the runner calls
+		// this only for idle workers, so it should never run (TODO confirm).
+		unreachable!()
+	}
+}
+
+// ----
+
+/// Worker that scans the block ref table and marks as deleted the block refs
+/// whose version is missing or deleted.
+struct RepairBlockrefsWorker {
+	garage: Arc<Garage>,
+	/// Key of the last entry read from the table; the scan resumes after it.
+	pos: Vec<u8>,
+	/// Number of entries read so far.
+	counter: usize,
+}
+
+impl RepairBlockrefsWorker {
+	fn new(garage: Arc<Garage>) -> Self {
+		Self {
+			garage,
+			pos: Vec::new(),
+			counter: 0,
+		}
+	}
+}
+
+#[async_trait]
+impl Worker for RepairBlockrefsWorker {
+	fn name(&self) -> String {
+		String::from("Block refs repair worker")
+	}
+
+	fn info(&self) -> Option<String> {
+		let progress = format!("{} items done", self.counter);
+		Some(progress)
+	}
+
+	async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+		let next_entry = self.garage.block_ref_table.data.store.get_gt(&self.pos)?;
+		let (key, item_bytes) = match next_entry {
+			Some(entry) => entry,
+			None => {
+				info!("repair_block_ref: finished, done {}", self.counter);
+				return Ok(WorkerState::Done);
+			}
+		};
+		self.pos = key;
+		self.counter += 1;
+
+		let block_ref = rmp_serde::decode::from_read_ref::<_, BlockRef>(&item_bytes)?;
+		if block_ref.deleted.get() {
+			// Already marked as deleted: nothing to do for this entry.
+			return Ok(WorkerState::Busy);
+		}
+
+		let version = self
+			.garage
+			.version_table
+			.get(&block_ref.version, &EmptyKey)
+			.await?;
+		// The version might not exist if it has been GC'ed
+		let ref_exists = version.map_or(false, |v| !v.deleted.get());
+		if ref_exists {
+			return Ok(WorkerState::Busy);
+		}
+
+		info!(
+			"Repair block ref: marking block_ref as deleted: {:?}",
+			block_ref
+		);
+		let tombstone = BlockRef {
+			block: block_ref.block,
+			version: block_ref.version,
+			deleted: true.into(),
+		};
+		self.garage.block_ref_table.insert(&tombstone).await?;
+
+		Ok(WorkerState::Busy)
+	}
+
+	async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
+		// `work` only ever returns `Busy` or `Done`; presumably the runner calls
+		// this only for idle workers, so it should never run (TODO confirm).
+		unreachable!()
+	}
+}
diff --git a/src/garage/server.rs b/src/garage/server.rs
index 58c9e782..d4099a97 100644
--- a/src/garage/server.rs
+++ b/src/garage/server.rs
@@ -6,13 +6,17 @@ use garage_util::background::*;
 use garage_util::config::*;
 use garage_util::error::Error;
 
-use garage_admin::metrics::*;
-use garage_admin::tracing_setup::*;
-use garage_api::run_api_server;
+use garage_api::admin::api_server::AdminApiServer;
+use garage_api::s3::api_server::S3ApiServer;
 use garage_model::garage::Garage;
-use garage_web::run_web_server;
+use garage_web::WebServer;
+
+#[cfg(feature = "k2v")]
+use garage_api::k2v::api_server::K2VApiServer;
 
 use crate::admin::*;
+#[cfg(feature = "telemetry-otlp")]
+use crate::tracing_setup::*;
 
 async fn wait_from(mut chan: watch::Receiver<bool>) {
 	while !*chan.borrow() {
@@ -24,79 +28,124 @@ async fn wait_from(mut chan: watch::Receiver<bool>) {
 
 pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
 	info!("Loading configuration...");
-	let config = read_config(config_file).expect("Unable to read config file");
+	let config = read_config(config_file)?;
 
-	info!("Opening database...");
-	let mut db_path = config.metadata_dir.clone();
-	db_path.push("db");
-	let db = sled::Config::default()
-		.path(&db_path)
-		.cache_capacity(config.sled_cache_capacity)
-		.flush_every_ms(Some(config.sled_flush_every_ms))
-		.open()
-		.expect("Unable to open sled DB");
+	// ---- Initialize Garage internals ----
 
-	info!("Initialize admin web server and metric backend...");
-	let admin_server_init = AdminServer::init();
+	#[cfg(feature = "metrics")]
+	let metrics_exporter = opentelemetry_prometheus::exporter().init();
 
 	info!("Initializing background runner...");
-	let watch_cancel = netapp::util::watch_ctrl_c();
+	let watch_cancel = watch_shutdown_signal();
 	let (background, await_background_done) = BackgroundRunner::new(16, watch_cancel.clone());
 
 	info!("Initializing Garage main data store...");
-	let garage = Garage::new(config.clone(), db, background);
+	let garage = Garage::new(config.clone(), background)?;
+
+	if config.admin.trace_sink.is_some() {
+		info!("Initialize tracing...");
 
-	info!("Initialize tracing...");
-	if let Some(export_to) = config.admin.trace_sink {
-		init_tracing(&export_to, garage.system.id)?;
+		#[cfg(feature = "telemetry-otlp")]
+		init_tracing(config.admin.trace_sink.as_ref().unwrap(), garage.system.id)?;
+
+		#[cfg(not(feature = "telemetry-otlp"))]
+		error!("Garage was built without OTLP exporter, admin.trace_sink is ignored.");
 	}
 
+	info!("Initialize Admin API server and metrics collector...");
+	let admin_server = AdminApiServer::new(
+		garage.clone(),
+		#[cfg(feature = "metrics")]
+		metrics_exporter,
+	);
+
+	info!("Launching internal Garage cluster communications...");
 	let run_system = tokio::spawn(garage.system.clone().run(watch_cancel.clone()));
 
 	info!("Create admin RPC handler...");
 	AdminRpcHandler::new(garage.clone());
 
-	info!("Initializing API server...");
-	let api_server = tokio::spawn(run_api_server(
-		garage.clone(),
-		wait_from(watch_cancel.clone()),
-	));
+	// ---- Launch public-facing API servers ----
+
+	let mut servers = vec![];
+
+	if let Some(s3_bind_addr) = &config.s3_api.api_bind_addr {
+		info!("Initializing S3 API server...");
+		servers.push((
+			"S3 API",
+			tokio::spawn(S3ApiServer::run(
+				garage.clone(),
+				*s3_bind_addr,
+				config.s3_api.s3_region.clone(),
+				wait_from(watch_cancel.clone()),
+			)),
+		));
+	}
 
-	info!("Initializing web server...");
-	let web_server = tokio::spawn(run_web_server(
-		garage.clone(),
-		wait_from(watch_cancel.clone()),
-	));
-
-	let admin_server = if let Some(admin_bind_addr) = config.admin.api_bind_addr {
-		info!("Configure and run admin web server...");
-		Some(tokio::spawn(
-			admin_server_init.run(admin_bind_addr, wait_from(watch_cancel.clone())),
-		))
-	} else {
-		None
-	};
+	if config.k2v_api.is_some() {
+		#[cfg(feature = "k2v")]
+		{
+			info!("Initializing K2V API server...");
+			servers.push((
+				"K2V API",
+				tokio::spawn(K2VApiServer::run(
+					garage.clone(),
+					config.k2v_api.as_ref().unwrap().api_bind_addr,
+					config.s3_api.s3_region.clone(),
+					wait_from(watch_cancel.clone()),
+				)),
+			));
+		}
+		#[cfg(not(feature = "k2v"))]
+		error!("K2V is not enabled in this build, cannot start K2V API server");
+	}
 
-	// Stuff runs
+	if let Some(web_config) = &config.s3_web {
+		info!("Initializing web server...");
+		servers.push((
+			"Web",
+			tokio::spawn(WebServer::run(
+				garage.clone(),
+				web_config.bind_addr,
+				web_config.root_domain.clone(),
+				wait_from(watch_cancel.clone()),
+			)),
+		));
+	}
 
-	// When a cancel signal is sent, stuff stops
-	if let Err(e) = api_server.await? {
-		warn!("API server exited with error: {}", e);
+	if let Some(admin_bind_addr) = &config.admin.api_bind_addr {
+		info!("Launching Admin API server...");
+		servers.push((
+			"Admin",
+			tokio::spawn(admin_server.run(*admin_bind_addr, wait_from(watch_cancel.clone()))),
+		));
 	}
-	if let Err(e) = web_server.await? {
-		warn!("Web server exited with error: {}", e);
+
+	#[cfg(not(feature = "metrics"))]
+	if config.admin.metrics_token.is_some() {
+		warn!("This Garage version is built without the metrics feature");
 	}
-	if let Some(a) = admin_server {
-		if let Err(e) = a.await? {
-			warn!("Admin web server exited with error: {}", e);
+
+	// Stuff runs
+
+	// When a cancel signal is sent, stuff stops
+
+	// Collect stuff
+	for (desc, join_handle) in servers {
+		if let Err(e) = join_handle.await? {
+			error!("{} server exited with error: {}", desc, e);
+		} else {
+			info!("{} server exited without error.", desc);
 		}
 	}
 
 	// Remove RPC handlers for system to break reference cycles
 	garage.system.netapp.drop_all_handlers();
+	opentelemetry::global::shutdown_tracer_provider();
 
 	// Await for netapp RPC system to end
 	run_system.await?;
+	info!("Netapp exited");
 
 	// Drop all references so that stuff can terminate properly
 	drop(garage);
@@ -108,3 +157,44 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> {
 
 	Ok(())
 }
+
+/// Spawns a background task that waits for the first SIGINT, SIGTERM or SIGHUP and
+/// then publishes `true` on a watch channel. The returned receiver starts at `false`.
+///
+/// # Panics
+///
+/// The spawned task panics if one of the signal handlers cannot be installed, or if
+/// sending on the watch channel fails.
+#[cfg(unix)]
+fn watch_shutdown_signal() -> watch::Receiver<bool> {
+	use tokio::signal::unix::{signal, SignalKind};
+
+	let (shutdown_tx, shutdown_rx) = watch::channel(false);
+
+	let wait_for_signal = async move {
+		let mut on_interrupt =
+			signal(SignalKind::interrupt()).expect("Failed to install SIGINT handler");
+		let mut on_terminate =
+			signal(SignalKind::terminate()).expect("Failed to install SIGTERM handler");
+		let mut on_hangup =
+			signal(SignalKind::hangup()).expect("Failed to install SIGHUP handler");
+
+		// Resumes as soon as any one of the three signals is received.
+		tokio::select! {
+			_ = on_interrupt.recv() => info!("Received SIGINT, shutting down."),
+			_ = on_terminate.recv() => info!("Received SIGTERM, shutting down."),
+			_ = on_hangup.recv() => info!("Received SIGHUP, shutting down."),
+		}
+		shutdown_tx.send(true).unwrap();
+	};
+	tokio::spawn(wait_for_signal);
+
+	shutdown_rx
+}
+
+/// Spawns a background task that waits for the first Ctrl-C, Ctrl-Close, Ctrl-Logoff
+/// or Ctrl-Shutdown event and then publishes `true` on a watch channel. The returned
+/// receiver starts at `false`.
+///
+/// # Panics
+///
+/// The spawned task panics if one of the handlers cannot be installed, or if sending
+/// on the watch channel fails.
+#[cfg(windows)]
+fn watch_shutdown_signal() -> watch::Receiver<bool> {
+	use tokio::signal::windows::{ctrl_c, ctrl_close, ctrl_logoff, ctrl_shutdown};
+
+	let (shutdown_tx, shutdown_rx) = watch::channel(false);
+
+	let wait_for_event = async move {
+		let mut on_ctrl_c = ctrl_c().expect("Failed to install Ctrl-C handler");
+		let mut on_close = ctrl_close().expect("Failed to install Ctrl-Close handler");
+		let mut on_logoff = ctrl_logoff().expect("Failed to install Ctrl-Logoff handler");
+		let mut on_shutdown = ctrl_shutdown().expect("Failed to install Ctrl-Shutdown handler");
+
+		// Resumes as soon as any one of the four events is received.
+		tokio::select! {
+			_ = on_ctrl_c.recv() => info!("Received Ctrl-C, shutting down."),
+			_ = on_close.recv() => info!("Received Ctrl-Close, shutting down."),
+			_ = on_logoff.recv() => info!("Received Ctrl-Logoff, shutting down."),
+			_ = on_shutdown.recv() => info!("Received Ctrl-Shutdown, shutting down."),
+		}
+		shutdown_tx.send(true).unwrap();
+	};
+	tokio::spawn(wait_for_event);
+
+	shutdown_rx
+}
diff --git a/src/garage/tests/bucket.rs b/src/garage/tests/bucket.rs
index ff5cc8da..b32af068 100644
--- a/src/garage/tests/bucket.rs
+++ b/src/garage/tests/bucket.rs
@@ -29,8 +29,7 @@ async fn test_bucket_all() {
 			.unwrap()
 			.iter()
 			.filter(|x| x.name.as_ref().is_some())
-			.find(|x| x.name.as_ref().unwrap() == "hello")
-			.is_some());
+			.any(|x| x.name.as_ref().unwrap() == "hello"));
 	}
 	{
 		// Get its location
@@ -75,13 +74,12 @@ async fn test_bucket_all() {
 	{
 		// Check bucket is deleted with List buckets
 		let r = ctx.client.list_buckets().send().await.unwrap();
-		assert!(r
+		assert!(!r
 			.buckets
 			.as_ref()
 			.unwrap()
 			.iter()
 			.filter(|x| x.name.as_ref().is_some())
-			.find(|x| x.name.as_ref().unwrap() == "hello")
-			.is_none());
+			.any(|x| x.name.as_ref().unwrap() == "hello"));
 	}
 }
diff --git a/src/garage/tests/common/client.rs b/src/garage/tests/common/client.rs
index c5ddc6e5..212588b5 100644
--- a/src/garage/tests/common/client.rs
+++ b/src/garage/tests/common/client.rs
@@ -10,7 +10,7 @@ pub fn build_client(instance: &Instance) -> Client {
 		None,
 		"garage-integ-test",
 	);
-	let endpoint = Endpoint::immutable(instance.uri());
+	let endpoint = Endpoint::immutable(instance.s3_uri());
 
 	let config = Config::builder()
 		.region(super::REGION)
diff --git a/src/garage/tests/common/custom_requester.rs b/src/garage/tests/common/custom_requester.rs
index 580691a1..1700cc90 100644
--- a/src/garage/tests/common/custom_requester.rs
+++ b/src/garage/tests/common/custom_requester.rs
@@ -17,14 +17,25 @@ use garage_api::signature;
 pub struct CustomRequester {
 	key: Key,
 	uri: Uri,
+	service: &'static str,
 	client: Client<HttpConnector>,
 }
 
 impl CustomRequester {
-	pub fn new(instance: &Instance) -> Self {
+	pub fn new_s3(instance: &Instance) -> Self {
 		CustomRequester {
 			key: instance.key.clone(),
-			uri: instance.uri(),
+			uri: instance.s3_uri(),
+			service: "s3",
+			client: Client::new(),
+		}
+	}
+
+	pub fn new_k2v(instance: &Instance) -> Self {
+		CustomRequester {
+			key: instance.key.clone(),
+			uri: instance.k2v_uri(),
+			service: "k2v",
 			client: Client::new(),
 		}
 	}
@@ -32,6 +43,7 @@ impl CustomRequester {
 	pub fn builder(&self, bucket: String) -> RequestBuilder<'_> {
 		RequestBuilder {
 			requester: self,
+			service: self.service,
 			bucket,
 			method: Method::GET,
 			path: String::new(),
@@ -47,6 +59,7 @@ impl CustomRequester {
 
 pub struct RequestBuilder<'a> {
 	requester: &'a CustomRequester,
+	service: &'static str,
 	bucket: String,
 	method: Method,
 	path: String,
@@ -59,13 +72,17 @@ pub struct RequestBuilder<'a> {
 }
 
 impl<'a> RequestBuilder<'a> {
+	pub fn service(&mut self, service: &'static str) -> &mut Self {
+		self.service = service; // replaces the service name that `builder()` copied from the requester
+		self
+	}
 	pub fn method(&mut self, method: Method) -> &mut Self {
 		self.method = method;
 		self
 	}
 
-	pub fn path(&mut self, path: String) -> &mut Self {
-		self.path = path;
+	pub fn path(&mut self, path: impl ToString) -> &mut Self {
+		self.path = path.to_string();
 		self
 	}
 
@@ -74,16 +91,38 @@ impl<'a> RequestBuilder<'a> {
 		self
 	}
 
+	/// Records one query parameter by calling `insert` on `query_params`.
+	///
+	/// Both `param` and the optional `value` are converted with `to_string`; a `None`
+	/// value is stored as `None`. Returns `self` so calls can be chained.
+	pub fn query_param<T, U>(&mut self, param: T, value: Option<U>) -> &mut Self
+	where
+		T: ToString,
+		U: ToString,
+	{
+		let name = param.to_string();
+		let rendered = value.map(|v| v.to_string());
+		self.query_params.insert(name, rendered);
+		self
+	}
+
 	pub fn signed_headers(&mut self, signed_headers: HashMap<String, String>) -> &mut Self {
 		self.signed_headers = signed_headers;
 		self
 	}
 
+	/// Inserts a single entry into the `signed_headers` map, converting both the name
+	/// and the value with `to_string`. Returns `self` so calls can be chained.
+	pub fn signed_header(&mut self, name: impl ToString, value: impl ToString) -> &mut Self {
+		let (key, val) = (name.to_string(), value.to_string());
+		self.signed_headers.insert(key, val);
+		self
+	}
+
 	pub fn unsigned_headers(&mut self, unsigned_headers: HashMap<String, String>) -> &mut Self {
 		self.unsigned_headers = unsigned_headers;
 		self
 	}
 
+	/// Inserts a single entry into the `unsigned_headers` map, converting both the name
+	/// and the value with `to_string`. Returns `self` so calls can be chained.
+	pub fn unsigned_header(&mut self, name: impl ToString, value: impl ToString) -> &mut Self {
+		let (key, val) = (name.to_string(), value.to_string());
+		self.unsigned_headers.insert(key, val);
+		self
+	}
+
 	pub fn body(&mut self, body: Vec<u8>) -> &mut Self {
 		self.body = body;
 		self
@@ -106,24 +145,24 @@ impl<'a> RequestBuilder<'a> {
 		let query = query_param_to_string(&self.query_params);
 		let (host, path) = if self.vhost_style {
 			(
-				format!("{}.s3.garage", self.bucket),
+				format!("{}.{}.garage", self.bucket, self.service),
 				format!("{}{}", self.path, query),
 			)
 		} else {
 			(
-				"s3.garage".to_owned(),
+				format!("{}.garage", self.service),
 				format!("{}/{}{}", self.bucket, self.path, query),
 			)
 		};
 		let uri = format!("{}{}", self.requester.uri, path);
 
 		let now = Utc::now();
-		let scope = signature::compute_scope(&now, super::REGION.as_ref());
+		let scope = signature::compute_scope(&now, super::REGION.as_ref(), self.service);
 		let mut signer = signature::signing_hmac(
 			&now,
 			&self.requester.key.secret,
 			super::REGION.as_ref(),
-			"s3",
+			self.service,
 		)
 		.unwrap();
 		let streaming_signer = signer.clone();
diff --git a/src/garage/tests/common/garage.rs b/src/garage/tests/common/garage.rs
index 88c51501..44d727f9 100644
--- a/src/garage/tests/common/garage.rs
+++ b/src/garage/tests/common/garage.rs
@@ -22,7 +22,9 @@ pub struct Instance {
 	process: process::Child,
 	pub path: PathBuf,
 	pub key: Key,
-	pub api_port: u16,
+	pub s3_port: u16,
+	pub k2v_port: u16,
+	pub web_port: u16,
 }
 
 impl Instance {
@@ -58,9 +60,12 @@ rpc_secret = "{secret}"
 
 [s3_api]
 s3_region = "{region}"
-api_bind_addr = "127.0.0.1:{api_port}"
+api_bind_addr = "127.0.0.1:{s3_port}"
 root_domain = ".s3.garage"
 
+[k2v_api]
+api_bind_addr = "127.0.0.1:{k2v_port}"
+
 [s3_web]
 bind_addr = "127.0.0.1:{web_port}"
 root_domain = ".web.garage"
@@ -72,10 +77,11 @@ api_bind_addr = "127.0.0.1:{admin_port}"
 			path = path.display(),
 			secret = GARAGE_TEST_SECRET,
 			region = super::REGION,
-			api_port = port,
-			rpc_port = port + 1,
-			web_port = port + 2,
-			admin_port = port + 3,
+			s3_port = port,
+			k2v_port = port + 1,
+			rpc_port = port + 2,
+			web_port = port + 3,
+			admin_port = port + 4,
 		);
 		fs::write(path.join("config.toml"), config).expect("Could not write garage config file");
 
@@ -88,7 +94,7 @@ api_bind_addr = "127.0.0.1:{admin_port}"
 			.arg("server")
 			.stdout(stdout)
 			.stderr(stderr)
-			.env("RUST_LOG", "garage=info,garage_api=debug")
+			.env("RUST_LOG", "garage=info,garage_api=trace")
 			.spawn()
 			.expect("Could not start garage");
 
@@ -96,7 +102,9 @@ api_bind_addr = "127.0.0.1:{admin_port}"
 			process: child,
 			path,
 			key: Key::default(),
-			api_port: port,
+			s3_port: port,
+			k2v_port: port + 1,
+			web_port: port + 3,
 		}
 	}
 
@@ -147,8 +155,14 @@ api_bind_addr = "127.0.0.1:{admin_port}"
 		String::from_utf8(output.stdout).unwrap()
 	}
 
-	pub fn uri(&self) -> http::Uri {
-		format!("http://127.0.0.1:{api_port}", api_port = self.api_port)
+	pub fn s3_uri(&self) -> http::Uri {
+		format!("http://127.0.0.1:{s3_port}", s3_port = self.s3_port)
+			.parse()
+			.expect("Could not build garage endpoint URI")
+	}
+
+	pub fn k2v_uri(&self) -> http::Uri {
+		format!("http://127.0.0.1:{k2v_port}", k2v_port = self.k2v_port)
 			.parse()
 			.expect("Could not build garage endpoint URI")
 	}
diff --git a/src/garage/tests/common/mod.rs b/src/garage/tests/common/mod.rs
index 8f88c731..28874b02 100644
--- a/src/garage/tests/common/mod.rs
+++ b/src/garage/tests/common/mod.rs
@@ -17,18 +17,27 @@ pub struct Context {
 	pub garage: &'static garage::Instance,
 	pub client: Client,
 	pub custom_request: CustomRequester,
+	pub k2v: K2VContext,
+}
+
+pub struct K2VContext {
+	pub request: CustomRequester,
 }
 
 impl Context {
 	fn new() -> Self {
 		let garage = garage::instance();
 		let client = client::build_client(garage);
-		let custom_request = CustomRequester::new(garage);
+		let custom_request = CustomRequester::new_s3(garage);
+		let k2v_request = CustomRequester::new_k2v(garage);
 
 		Context {
 			garage,
 			client,
 			custom_request,
+			k2v: K2VContext {
+				request: k2v_request,
+			},
 		}
 	}
 
diff --git a/src/garage/tests/k2v/batch.rs b/src/garage/tests/k2v/batch.rs
new file mode 100644
index 00000000..acae1910
--- /dev/null
+++ b/src/garage/tests/k2v/batch.rs
@@ -0,0 +1,612 @@
+use std::collections::HashMap;
+
+use crate::common;
+
+use assert_json_diff::assert_json_eq;
+use serde_json::json;
+
+use super::json_body;
+use hyper::Method;
+
+#[tokio::test]
+async fn test_batch() { // end-to-end check of K2V batch insert, batch search (?search) and batch delete (?delete)
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("test-k2v-batch");
+
+	let mut values = HashMap::new(); // sort key -> plaintext value; keys ending in ' hold the second-round values
+	values.insert("a", "initial test 1");
+	values.insert("b", "initial test 2");
+	values.insert("c", "initial test 3");
+	values.insert("d.1", "initial test 4");
+	values.insert("d.2", "initial test 5");
+	values.insert("e", "initial test 6");
+	let mut ct = HashMap::new(); // sort key -> latest x-garage-causality-token header value seen
+
+	let res = ctx // batch insert of the six initial items, all sent without a causality token
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.body(
+			format!(
+				r#"[
+	{{"pk": "root", "sk": "a", "ct": null, "v": "{}"}},
+	{{"pk": "root", "sk": "b", "ct": null, "v": "{}"}},
+	{{"pk": "root", "sk": "c", "ct": null, "v": "{}"}},
+	{{"pk": "root", "sk": "d.1", "ct": null, "v": "{}"}},
+	{{"pk": "root", "sk": "d.2", "ct": null, "v": "{}"}},
+	{{"pk": "root", "sk": "e", "ct": null, "v": "{}"}}
+		]"#,
+				base64::encode(values.get(&"a").unwrap()),
+				base64::encode(values.get(&"b").unwrap()),
+				base64::encode(values.get(&"c").unwrap()),
+				base64::encode(values.get(&"d.1").unwrap()),
+				base64::encode(values.get(&"d.2").unwrap()),
+				base64::encode(values.get(&"e").unwrap()),
+			)
+			.into_bytes(),
+		)
+		.method(Method::POST)
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+
+	for sk in ["a", "b", "c", "d.1", "d.2", "e"] { // read each item back, check its body and remember its causality token
+		let res = ctx
+			.k2v
+			.request
+			.builder(bucket.clone())
+			.path("root")
+			.query_param("sort_key", Some(sk))
+			.signed_header("accept", "*/*")
+			.send()
+			.await
+			.unwrap();
+		assert_eq!(res.status(), 200);
+		assert_eq!(
+			res.headers().get("content-type").unwrap().to_str().unwrap(),
+			"application/octet-stream"
+		);
+		ct.insert(
+			sk,
+			res.headers()
+				.get("x-garage-causality-token")
+				.unwrap()
+				.to_str()
+				.unwrap()
+				.to_string(),
+		);
+		let res_body = hyper::body::to_bytes(res.into_body())
+			.await
+			.unwrap()
+			.to_vec();
+		assert_eq!(res_body, values.get(sk).unwrap().as_bytes());
+	}
+
+	let res = ctx // first batch search: seven queries over the initial data
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.query_param("search", Option::<&str>::None)
+		.body(
+			br#"[
+	{"partitionKey": "root"},
+	{"partitionKey": "root", "start": "c"},
+	{"partitionKey": "root", "start": "c", "end": "dynamite"},
+	{"partitionKey": "root", "start": "c", "reverse": true, "end": "a"},
+	{"partitionKey": "root", "start": "c", "reverse": true, "end": "azerty"},
+	{"partitionKey": "root", "limit": 1},
+	{"partitionKey": "root", "prefix": "d"}
+		]"#
+			.to_vec(),
+		)
+		.method(Method::POST)
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	let json_res = json_body(res).await;
+	assert_json_eq!(
+		json_res,
+		json!([
+			{
+				"partitionKey": "root",
+				"prefix": null,
+				"start": null,
+				"end": null,
+				"limit": null,
+				"reverse": false,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "a", "ct": ct.get("a").unwrap(), "v": [base64::encode(values.get("a").unwrap())]},
+				  {"sk": "b", "ct": ct.get("b").unwrap(), "v": [base64::encode(values.get("b").unwrap())]},
+				  {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap())]},
+				  {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1").unwrap())]},
+				  {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap())]},
+				  {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]}
+				],
+				"more": false,
+				"nextStart": null,
+			},
+			{
+				"partitionKey": "root",
+				"prefix": null,
+				"start": "c",
+				"end": null,
+				"limit": null,
+				"reverse": false,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap())]},
+				  {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1").unwrap())]},
+				  {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap())]},
+				  {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]}
+				],
+				"more": false,
+				"nextStart": null,
+			},
+			{
+				"partitionKey": "root",
+				"prefix": null,
+				"start": "c",
+				"end": "dynamite",
+				"limit": null,
+				"reverse": false,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap())]},
+				  {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1").unwrap())]},
+				  {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap())]},
+				],
+				"more": false,
+				"nextStart": null,
+			},
+			{
+				"partitionKey": "root",
+				"prefix": null,
+				"start": "c",
+				"end": "a",
+				"limit": null,
+				"reverse": true,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap())]},
+				  {"sk": "b", "ct": ct.get("b").unwrap(), "v": [base64::encode(values.get("b").unwrap())]},
+				],
+				"more": false,
+				"nextStart": null,
+			},
+			{
+				"partitionKey": "root",
+				"prefix": null,
+				"start": "c",
+				"end": "azerty",
+				"limit": null,
+				"reverse": true,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap())]},
+				  {"sk": "b", "ct": ct.get("b").unwrap(), "v": [base64::encode(values.get("b").unwrap())]},
+				],
+				"more": false,
+				"nextStart": null,
+			},
+			{
+				"partitionKey": "root",
+				"prefix": null,
+				"start": null,
+				"end": null,
+				"limit": 1,
+				"reverse": false,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "a", "ct": ct.get("a").unwrap(), "v": [base64::encode(values.get("a").unwrap())]}
+				],
+				"more": true,
+				"nextStart": "b",
+			},
+			{
+				"partitionKey": "root",
+				"prefix": "d",
+				"start": null,
+				"end": null,
+				"limit": null,
+				"reverse": false,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1").unwrap())]},
+				  {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap())]}
+				],
+				"more": false,
+				"nextStart": null,
+			},
+		])
+	);
+
+	// Insert some new values (kept under primed keys so the initial values stay available for the expectations)
+	values.insert("c'", "new test 3");
+	values.insert("d.1'", "new test 4");
+	values.insert("d.2'", "new test 5");
+
+	let res = ctx // second batch insert: b gets v=null with its token, d.1 is rewritten with its token, c and d.2 are written without one
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.body(
+			format!(
+				r#"[
+	{{"pk": "root", "sk": "b", "ct": "{}", "v": null}},
+	{{"pk": "root", "sk": "c", "ct": null, "v": "{}"}},
+	{{"pk": "root", "sk": "d.1", "ct": "{}", "v": "{}"}},
+	{{"pk": "root", "sk": "d.2", "ct": null, "v": "{}"}}
+		]"#,
+				ct.get(&"b").unwrap(),
+				base64::encode(values.get(&"c'").unwrap()),
+				ct.get(&"d.1").unwrap(),
+				base64::encode(values.get(&"d.1'").unwrap()),
+				base64::encode(values.get(&"d.2'").unwrap()),
+			)
+			.into_bytes(),
+		)
+		.method(Method::POST)
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+
+	for sk in ["b", "c", "d.1", "d.2"] { // refresh the tokens of the touched items; b is now expected to answer 204
+		let res = ctx
+			.k2v
+			.request
+			.builder(bucket.clone())
+			.path("root")
+			.query_param("sort_key", Some(sk))
+			.signed_header("accept", "*/*")
+			.send()
+			.await
+			.unwrap();
+		if sk == "b" {
+			assert_eq!(res.status(), 204);
+		} else {
+			assert_eq!(res.status(), 200);
+		}
+		ct.insert(
+			sk,
+			res.headers()
+				.get("x-garage-causality-token")
+				.unwrap()
+				.to_str()
+				.unwrap()
+				.to_string(),
+		);
+	}
+
+	let res = ctx // second batch search: prefix / start / end / limit / reverse combinations; c and d.2 are expected to hold two values each
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.query_param("search", Option::<&str>::None)
+		.body(
+			br#"[
+	{"partitionKey": "root"},
+	{"partitionKey": "root", "prefix": "d"},
+	{"partitionKey": "root", "prefix": "d.", "end": "d.2"},
+	{"partitionKey": "root", "prefix": "d.", "limit": 1},
+	{"partitionKey": "root", "prefix": "d.", "start": "d.2", "limit": 1},
+	{"partitionKey": "root", "prefix": "d.", "reverse": true},
+	{"partitionKey": "root", "prefix": "d.", "start": "d.2", "reverse": true},
+	{"partitionKey": "root", "prefix": "d.", "limit": 2}
+		]"#
+			.to_vec(),
+		)
+		.method(Method::POST)
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	let json_res = json_body(res).await;
+	assert_json_eq!(
+		json_res,
+		json!([
+			{
+				"partitionKey": "root",
+				"prefix": null,
+				"start": null,
+				"end": null,
+				"limit": null,
+				"reverse": false,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "a", "ct": ct.get("a").unwrap(), "v": [base64::encode(values.get("a").unwrap())]},
+				  {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap()), base64::encode(values.get("c'").unwrap())]},
+				  {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]},
+				  {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]},
+				  {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]}
+				],
+				"more": false,
+				"nextStart": null,
+			},
+			{
+				"partitionKey": "root",
+				"prefix": "d",
+				"start": null,
+				"end": null,
+				"limit": null,
+				"reverse": false,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]},
+				  {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]},
+				],
+				"more": false,
+				"nextStart": null,
+			},
+			{
+				"partitionKey": "root",
+				"prefix": "d.",
+				"start": null,
+				"end": "d.2",
+				"limit": null,
+				"reverse": false,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]},
+				],
+				"more": false,
+				"nextStart": null,
+			},
+			{
+				"partitionKey": "root",
+				"prefix": "d.",
+				"start": null,
+				"end": null,
+				"limit": 1,
+				"reverse": false,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]},
+				],
+				"more": true,
+				"nextStart": "d.2",
+			},
+			{
+				"partitionKey": "root",
+				"prefix": "d.",
+				"start": "d.2",
+				"end": null,
+				"limit": 1,
+				"reverse": false,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]},
+				],
+				"more": false,
+				"nextStart": null,
+			},
+			{
+				"partitionKey": "root",
+				"prefix": "d.",
+				"start": null,
+				"end": null,
+				"limit": null,
+				"reverse": true,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]},
+				  {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]},
+				],
+				"more": false,
+				"nextStart": null,
+			},
+			{
+				"partitionKey": "root",
+				"prefix": "d.",
+				"start": "d.2",
+				"end": null,
+				"limit": null,
+				"reverse": true,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]},
+				  {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]},
+				],
+				"more": false,
+				"nextStart": null,
+			},
+			{
+				"partitionKey": "root",
+				"prefix": "d.",
+				"start": null,
+				"end": null,
+				"limit": 2,
+				"reverse": false,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [base64::encode(values.get("d.1'").unwrap())]},
+				  {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [base64::encode(values.get("d.2").unwrap()), base64::encode(values.get("d.2'").unwrap())]},
+				],
+				"more": false,
+				"nextStart": null,
+			},
+		])
+	);
+
+	// Test DeleteBatch
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.query_param("delete", Option::<&str>::None)
+		.body(
+			br#"[
+	{"partitionKey": "root", "start": "a", "end": "c"},
+	{"partitionKey": "root", "prefix": "d"}
+		]"#
+			.to_vec(),
+		)
+		.method(Method::POST)
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	let json_res = json_body(res).await;
+	assert_json_eq!(
+		json_res,
+		json!([
+			{
+				"partitionKey": "root",
+				"prefix": null,
+				"start": "a",
+				"end": "c",
+				"singleItem": false,
+				"deletedItems": 1, // presumably just "a": b was already deleted above and c still shows up in the listing below
+			},
+			{
+				"partitionKey": "root",
+				"prefix": "d",
+				"start": null,
+				"end": null,
+				"singleItem": false,
+				"deletedItems": 2,
+			},
+		])
+	);
+
+	// update our known tombstones: each deleted key must answer 204; keep the token it returns
+	for sk in ["a", "b", "d.1", "d.2"] {
+		let res = ctx
+			.k2v
+			.request
+			.builder(bucket.clone())
+			.path("root")
+			.query_param("sort_key", Some(sk))
+			.signed_header("accept", "application/octet-stream")
+			.send()
+			.await
+			.unwrap();
+		assert_eq!(res.status(), 204);
+		assert_eq!(
+			res.headers().get("content-type").unwrap().to_str().unwrap(),
+			"application/octet-stream"
+		);
+		ct.insert(
+			sk,
+			res.headers()
+				.get("x-garage-causality-token")
+				.unwrap()
+				.to_str()
+				.unwrap()
+				.to_string(),
+		);
+	}
+
+	let res = ctx // final batch search: default order, reversed, and with tombstones included
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.query_param("search", Option::<&str>::None)
+		.body(
+			br#"[
+	{"partitionKey": "root"},
+	{"partitionKey": "root", "reverse": true},
+	{"partitionKey": "root", "tombstones": true}
+		]"#
+			.to_vec(),
+		)
+		.method(Method::POST)
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	let json_res = json_body(res).await;
+	assert_json_eq!(
+		json_res,
+		json!([
+			{
+				"partitionKey": "root",
+				"prefix": null,
+				"start": null,
+				"end": null,
+				"limit": null,
+				"reverse": false,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap()), base64::encode(values.get("c'").unwrap())]},
+				  {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]}
+				],
+				"more": false,
+				"nextStart": null,
+			},
+			{
+				"partitionKey": "root",
+				"prefix": null,
+				"start": null,
+				"end": null,
+				"limit": null,
+				"reverse": true,
+				"conflictsOnly": false,
+				"tombstones": false,
+				"singleItem": false,
+				"items": [
+				  {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]},
+				  {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap()), base64::encode(values.get("c'").unwrap())]},
+				],
+				"more": false,
+				"nextStart": null,
+			},
+			{
+				"partitionKey": "root",
+				"prefix": null,
+				"start": null,
+				"end": null,
+				"limit": null,
+				"reverse": false,
+				"conflictsOnly": false,
+				"tombstones": true,
+				"singleItem": false,
+				"items": [
+				  {"sk": "a", "ct": ct.get("a").unwrap(), "v": [null]},
+				  {"sk": "b", "ct": ct.get("b").unwrap(), "v": [null]},
+				  {"sk": "c", "ct": ct.get("c").unwrap(), "v": [base64::encode(values.get("c").unwrap()), base64::encode(values.get("c'").unwrap())]},
+				  {"sk": "d.1", "ct": ct.get("d.1").unwrap(), "v": [null]},
+				  {"sk": "d.2", "ct": ct.get("d.2").unwrap(), "v": [null]},
+				  {"sk": "e", "ct": ct.get("e").unwrap(), "v": [base64::encode(values.get("e").unwrap())]},
+				],
+				"more": false,
+				"nextStart": null,
+			},
+		])
+	);
+}
diff --git a/src/garage/tests/k2v/errorcodes.rs b/src/garage/tests/k2v/errorcodes.rs
new file mode 100644
index 00000000..2fcc45bc
--- /dev/null
+++ b/src/garage/tests/k2v/errorcodes.rs
@@ -0,0 +1,141 @@
+use crate::common;
+
+use hyper::Method;
+
+/// Checks that malformed K2V requests are rejected with HTTP 400, after
+/// first confirming that a well-formed insert succeeds.
+///
+/// Apart from the deliberately truncated JSON case, every request body is
+/// syntactically valid JSON, so that each 400 is produced by the validation
+/// step named in the comment and not by the JSON parser.
+#[tokio::test]
+async fn test_error_codes() {
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("test-k2v-error-codes");
+
+	// Regular insert should work (code 200)
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.method(Method::PUT)
+		.path("root")
+		.query_param("sort_key", Some("test1"))
+		.body(b"Hello, world!".to_vec())
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+
+	// Insert with trash causality token: invalid request
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.method(Method::PUT)
+		.path("root")
+		.query_param("sort_key", Some("test1"))
+		.signed_header("x-garage-causality-token", "tra$sh")
+		.body(b"Hello, world!".to_vec())
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 400);
+
+	// Search without partition key: invalid request
+	// (no trailing comma: the body must parse as JSON for the missing
+	// `partitionKey` field to be what triggers the error)
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.query_param("search", Option::<&str>::None)
+		.body(
+			br#"[
+	{}
+		]"#
+			.to_vec(),
+		)
+		.method(Method::POST)
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 400);
+
+	// Search with start that is not in prefix: invalid request
+	// (valid JSON with the API's `partitionKey` field name, so that only the
+	// prefix/start mismatch can be the cause of the 400)
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.query_param("search", Option::<&str>::None)
+		.body(
+			br#"[
+	{"partitionKey": "root", "prefix": "a", "start": "bx"}
+		]"#
+			.to_vec(),
+		)
+		.method(Method::POST)
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 400);
+
+	// Search with invalid json (object is never closed): 400
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.query_param("search", Option::<&str>::None)
+		.body(
+			br#"[
+	{"partitionKey": "root"
+		]"#
+			.to_vec(),
+		)
+		.method(Method::POST)
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 400);
+
+	// Batch insert with invalid causality token: 400
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.body(
+			br#"[
+	{"pk": "root", "sk": "a", "ct": "tra$h", "v": "aGVsbG8sIHdvcmxkCg=="}
+		]"#
+			.to_vec(),
+		)
+		.method(Method::POST)
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 400);
+
+	// Batch insert with invalid data (value is not valid base64): 400
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.body(
+			br#"[
+	{"pk": "root", "sk": "a", "ct": null, "v": "aGVsbG8sIHdvcmx$Cg=="}
+		]"#
+			.to_vec(),
+		)
+		.method(Method::POST)
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 400);
+
+	// Poll with invalid causality token: 400
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("test1"))
+		.query_param("causality_token", Some("tra$h"))
+		.query_param("timeout", Some("10"))
+		.signed_header("accept", "application/octet-stream")
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 400);
+}
diff --git a/src/garage/tests/k2v/item.rs b/src/garage/tests/k2v/item.rs
new file mode 100644
index 00000000..32537336
--- /dev/null
+++ b/src/garage/tests/k2v/item.rs
@@ -0,0 +1,725 @@
+use std::time::Duration;
+
+use crate::common;
+
+use assert_json_diff::assert_json_eq;
+use serde_json::json;
+
+use super::json_body;
+use hyper::Method;
+
+/// Inserts, overwrites and deletes items under partition key "root" and
+/// checks after every step that the bucket's ReadIndex response reports the
+/// expected number of entries, conflicts, values and bytes.
+#[tokio::test]
+async fn test_items_and_indices() {
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("test-k2v-item-and-index");
+
+	// ReadIndex -- there should be nothing
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.send()
+		.await
+		.unwrap();
+	let res_body = json_body(res).await;
+	assert_json_eq!(
+		res_body,
+		json!({
+			"prefix": null,
+			"start": null,
+			"end": null,
+			"limit": null,
+			"reverse": false,
+			"partitionKeys": [],
+			"more": false,
+			"nextStart": null
+		})
+	);
+
+	// Byte lengths of the two values each key ends up holding; needed in the
+	// deletion loop below. "_" stands in for the one-character sort key.
+	let content2_len = "_: hello universe".len();
+	let content3_len = "_: concurrent value".len();
+
+	// When iteration `i` starts, the `i` keys handled before it each hold two
+	// conflicting values (content2 and content3).
+	for (i, sk) in ["a", "b", "c", "d"].iter().enumerate() {
+		let content = format!("{}: hello world", sk).into_bytes();
+		let content2 = format!("{}: hello universe", sk).into_bytes();
+		let content3 = format!("{}: concurrent value", sk).into_bytes();
+
+		// Put initially, no causality token
+		let res = ctx
+			.k2v
+			.request
+			.builder(bucket.clone())
+			.path("root")
+			.query_param("sort_key", Some(sk))
+			.body(content.clone())
+			.method(Method::PUT)
+			.send()
+			.await
+			.unwrap();
+		assert_eq!(res.status(), 200);
+
+		// Get value back; a single value is returned as raw bytes
+		let res = ctx
+			.k2v
+			.request
+			.builder(bucket.clone())
+			.path("root")
+			.query_param("sort_key", Some(sk))
+			.signed_header("accept", "*/*")
+			.send()
+			.await
+			.unwrap();
+		assert_eq!(res.status(), 200);
+		assert_eq!(
+			res.headers().get("content-type").unwrap().to_str().unwrap(),
+			"application/octet-stream"
+		);
+		// Causality token of the first value; reused for BOTH following puts.
+		let ct = res
+			.headers()
+			.get("x-garage-causality-token")
+			.unwrap()
+			.to_str()
+			.unwrap()
+			.to_string();
+		let res_body = hyper::body::to_bytes(res.into_body())
+			.await
+			.unwrap()
+			.to_vec();
+		assert_eq!(res_body, content);
+
+		// ReadIndex -- the new key adds one entry holding a single value
+		// NOTE(review): the one-second pause presumably gives the server time
+		// to update its index counters before they are read -- confirm.
+		tokio::time::sleep(Duration::from_secs(1)).await;
+		let res = ctx
+			.k2v
+			.request
+			.builder(bucket.clone())
+			.send()
+			.await
+			.unwrap();
+		let res_body = json_body(res).await;
+		assert_json_eq!(
+			res_body,
+			json!({
+				"prefix": null,
+				"start": null,
+				"end": null,
+				"limit": null,
+				"reverse": false,
+				"partitionKeys": [
+				{
+					"pk": "root",
+					"entries": i+1,
+					"conflicts": i,
+					"values": i+i+1,
+					"bytes": i*(content2.len() + content3.len()) + content.len(),
+				}
+				],
+				"more": false,
+				"nextStart": null
+			})
+		);
+
+		// Put again, this time with causality token (replaces the first value)
+		let res = ctx
+			.k2v
+			.request
+			.builder(bucket.clone())
+			.path("root")
+			.query_param("sort_key", Some(sk))
+			.signed_header("x-garage-causality-token", ct.clone())
+			.body(content2.clone())
+			.method(Method::PUT)
+			.send()
+			.await
+			.unwrap();
+		assert_eq!(res.status(), 200);
+
+		// Get value back; still a single value, so still raw bytes
+		let res = ctx
+			.k2v
+			.request
+			.builder(bucket.clone())
+			.path("root")
+			.query_param("sort_key", Some(sk))
+			.signed_header("accept", "*/*")
+			.send()
+			.await
+			.unwrap();
+		assert_eq!(res.status(), 200);
+		assert_eq!(
+			res.headers().get("content-type").unwrap().to_str().unwrap(),
+			"application/octet-stream"
+		);
+		let res_body = hyper::body::to_bytes(res.into_body())
+			.await
+			.unwrap()
+			.to_vec();
+		assert_eq!(res_body, content2);
+
+		// ReadIndex -- same counts as before, only the byte total changes
+		// because content2 replaced content
+		tokio::time::sleep(Duration::from_secs(1)).await;
+		let res = ctx
+			.k2v
+			.request
+			.builder(bucket.clone())
+			.send()
+			.await
+			.unwrap();
+		let res_body = json_body(res).await;
+		assert_json_eq!(
+			res_body,
+			json!({
+				"prefix": null,
+				"start": null,
+				"end": null,
+				"limit": null,
+				"reverse": false,
+				"partitionKeys": [
+				{
+					"pk": "root",
+					"entries": i+1,
+					"conflicts": i,
+					"values": i+i+1,
+					"bytes": i*content3.len() + (i+1)*content2.len(),
+				}
+				],
+				"more": false,
+				"nextStart": null
+			})
+		);
+
+		// Put again with same CT, now we have concurrent values
+		let res = ctx
+			.k2v
+			.request
+			.builder(bucket.clone())
+			.path("root")
+			.query_param("sort_key", Some(sk))
+			.signed_header("x-garage-causality-token", ct.clone())
+			.body(content3.clone())
+			.method(Method::PUT)
+			.send()
+			.await
+			.unwrap();
+		assert_eq!(res.status(), 200);
+
+		// Get value back; two concurrent values come back as a JSON array of
+		// base64 strings
+		let res = ctx
+			.k2v
+			.request
+			.builder(bucket.clone())
+			.path("root")
+			.query_param("sort_key", Some(sk))
+			.signed_header("accept", "*/*")
+			.send()
+			.await
+			.unwrap();
+		assert_eq!(res.status(), 200);
+		assert_eq!(
+			res.headers().get("content-type").unwrap().to_str().unwrap(),
+			"application/json"
+		);
+		let res_json = json_body(res).await;
+		assert_json_eq!(
+			res_json,
+			[base64::encode(&content2), base64::encode(&content3)]
+		);
+
+		// ReadIndex -- every entry so far is now in conflict with two values
+		tokio::time::sleep(Duration::from_secs(1)).await;
+		let res = ctx
+			.k2v
+			.request
+			.builder(bucket.clone())
+			.send()
+			.await
+			.unwrap();
+		let res_body = json_body(res).await;
+		assert_json_eq!(
+			res_body,
+			json!({
+				"prefix": null,
+				"start": null,
+				"end": null,
+				"limit": null,
+				"reverse": false,
+				"partitionKeys": [
+				{
+					"pk": "root",
+					"entries": i+1,
+					"conflicts": i+1,
+					"values": 2*(i+1),
+					"bytes": (i+1)*(content2.len() + content3.len()),
+				}
+				],
+				"more": false,
+				"nextStart": null
+			})
+		);
+	}
+
+	// Now delete things
+	for (i, sk) in ["a", "b", "c", "d"].iter().enumerate() {
+		// Get value back (we just need the CT)
+		let res = ctx
+			.k2v
+			.request
+			.builder(bucket.clone())
+			.path("root")
+			.query_param("sort_key", Some(sk))
+			.signed_header("accept", "*/*")
+			.send()
+			.await
+			.unwrap();
+		assert_eq!(res.status(), 200);
+		let ct = res
+			.headers()
+			.get("x-garage-causality-token")
+			.unwrap()
+			.to_str()
+			.unwrap()
+			.to_string();
+
+		// Delete it
+		let res = ctx
+			.k2v
+			.request
+			.builder(bucket.clone())
+			.method(Method::DELETE)
+			.path("root")
+			.query_param("sort_key", Some(sk))
+			.signed_header("x-garage-causality-token", ct)
+			.send()
+			.await
+			.unwrap();
+		assert_eq!(res.status(), 204);
+
+		// ReadIndex -- one entry fewer after each delete; once the last key
+		// is gone the partition is no longer listed at all
+		tokio::time::sleep(Duration::from_secs(1)).await;
+		let res = ctx
+			.k2v
+			.request
+			.builder(bucket.clone())
+			.send()
+			.await
+			.unwrap();
+		let res_body = json_body(res).await;
+		if i < 3 {
+			// 3-i keys remain, each still holding its two conflicting values
+			assert_json_eq!(
+				res_body,
+				json!({
+					"prefix": null,
+					"start": null,
+					"end": null,
+					"limit": null,
+					"reverse": false,
+					"partitionKeys": [
+					{
+						"pk": "root",
+						"entries": 3-i,
+						"conflicts": 3-i,
+						"values": 2*(3-i),
+						"bytes": (3-i)*(content2_len + content3_len),
+					}
+					],
+					"more": false,
+					"nextStart": null
+				})
+			);
+		} else {
+			assert_json_eq!(
+				res_body,
+				json!({
+					"prefix": null,
+					"start": null,
+					"end": null,
+					"limit": null,
+					"reverse": false,
+					"partitionKeys": [],
+					"more": false,
+					"nextStart": null
+				})
+			);
+		}
+	}
+}
+
+/// Checks which status code and content type a read of one item returns,
+/// depending on the `accept` header and on what the item currently holds.
+///
+/// Four request variants are repeated for each item state:
+/// - f0: `accept: */*`
+/// - f1: no `accept` header
+/// - f2: `accept: application/octet-stream`
+/// - f3: `accept: application/json`
+///
+/// The item states are: a single value, two concurrent values, one value
+/// concurrent with a deletion, and fully deleted.
+#[tokio::test]
+async fn test_item_return_format() {
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("test-k2v-item-return-format");
+
+	let single_value = b"A single value".to_vec();
+	let concurrent_value = b"A concurrent value".to_vec();
+
+	// -- Test with a single value --
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.body(single_value.clone())
+		.method(Method::PUT)
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+
+	// f0: either -- a single value comes back as raw bytes
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.signed_header("accept", "*/*")
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	assert_eq!(
+		res.headers().get("content-type").unwrap().to_str().unwrap(),
+		"application/octet-stream"
+	);
+	// Token describing the state with only `single_value`; used further down
+	// to delete that value after a second one has been written.
+	let ct = res
+		.headers()
+		.get("x-garage-causality-token")
+		.unwrap()
+		.to_str()
+		.unwrap()
+		.to_string();
+	let res_body = hyper::body::to_bytes(res.into_body())
+		.await
+		.unwrap()
+		.to_vec();
+	assert_eq!(res_body, single_value);
+
+	// f1: not specified -- JSON array of base64 values
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	assert_eq!(
+		res.headers().get("content-type").unwrap().to_str().unwrap(),
+		"application/json"
+	);
+	let res_body = json_body(res).await;
+	assert_json_eq!(res_body, json!([base64::encode(&single_value)]));
+
+	// f2: binary
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.signed_header("accept", "application/octet-stream")
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	assert_eq!(
+		res.headers().get("content-type").unwrap().to_str().unwrap(),
+		"application/octet-stream"
+	);
+	let res_body = hyper::body::to_bytes(res.into_body())
+		.await
+		.unwrap()
+		.to_vec();
+	assert_eq!(res_body, single_value);
+
+	// f3: json
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.signed_header("accept", "application/json")
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	assert_eq!(
+		res.headers().get("content-type").unwrap().to_str().unwrap(),
+		"application/json"
+	);
+	let res_body = json_body(res).await;
+	assert_json_eq!(res_body, json!([base64::encode(&single_value)]));
+
+	// -- Test with a second, concurrent value --
+	// No causality token is sent with this put, so the new value is stored
+	// next to the first one (both are returned by the reads below).
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.body(concurrent_value.clone())
+		.method(Method::PUT)
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+
+	// f0: either -- with two values the answer switches to JSON
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.signed_header("accept", "*/*")
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	assert_eq!(
+		res.headers().get("content-type").unwrap().to_str().unwrap(),
+		"application/json"
+	);
+	let res_body = json_body(res).await;
+	assert_json_eq!(
+		res_body,
+		json!([
+			base64::encode(&single_value),
+			base64::encode(&concurrent_value)
+		])
+	);
+
+	// f1: not specified
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	assert_eq!(
+		res.headers().get("content-type").unwrap().to_str().unwrap(),
+		"application/json"
+	);
+	let res_body = json_body(res).await;
+	assert_json_eq!(
+		res_body,
+		json!([
+			base64::encode(&single_value),
+			base64::encode(&concurrent_value)
+		])
+	);
+
+	// f2: binary -- two values cannot be returned as one byte string
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.signed_header("accept", "application/octet-stream")
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 409); // CONFLICT
+
+	// f3: json
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.signed_header("accept", "application/json")
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	assert_eq!(
+		res.headers().get("content-type").unwrap().to_str().unwrap(),
+		"application/json"
+	);
+	let res_body = json_body(res).await;
+	assert_json_eq!(
+		res_body,
+		json!([
+			base64::encode(&single_value),
+			base64::encode(&concurrent_value)
+		])
+	);
+
+	// -- Delete first value, concurrently with second insert --
+	// -- (we now have a concurrent value and a deletion) --
+	// `ct` is the token read before the second put, so only `single_value`
+	// is replaced by the deletion marker.
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.method(Method::DELETE)
+		.signed_header("x-garage-causality-token", ct)
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 204);
+
+	// f0: either -- the deletion shows up as `null` in the JSON array
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.signed_header("accept", "*/*")
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	assert_eq!(
+		res.headers().get("content-type").unwrap().to_str().unwrap(),
+		"application/json"
+	);
+	let res_body = json_body(res).await;
+	assert_json_eq!(res_body, json!([base64::encode(&concurrent_value), null]));
+
+	// f1: not specified
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	assert_eq!(
+		res.headers().get("content-type").unwrap().to_str().unwrap(),
+		"application/json"
+	);
+	// Fresh token taken from the current state (value + deletion); used by
+	// the "Delete everything" request below.
+	let ct = res
+		.headers()
+		.get("x-garage-causality-token")
+		.unwrap()
+		.to_str()
+		.unwrap()
+		.to_string();
+	let res_body = json_body(res).await;
+	assert_json_eq!(res_body, json!([base64::encode(&concurrent_value), null]));
+
+	// f2: binary
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.signed_header("accept", "application/octet-stream")
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 409); // CONFLICT
+
+	// f3: json
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.signed_header("accept", "application/json")
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	assert_eq!(
+		res.headers().get("content-type").unwrap().to_str().unwrap(),
+		"application/json"
+	);
+	let res_body = json_body(res).await;
+	assert_json_eq!(res_body, json!([base64::encode(&concurrent_value), null]));
+
+	// -- Delete everything --
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.method(Method::DELETE)
+		.signed_header("x-garage-causality-token", ct)
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 204);
+
+	// f0: either
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.signed_header("accept", "*/*")
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 204); // NO CONTENT
+
+	// f1: not specified -- still 200, with the deletion reported as [null]
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	assert_eq!(
+		res.headers().get("content-type").unwrap().to_str().unwrap(),
+		"application/json"
+	);
+	let res_body = json_body(res).await;
+	assert_json_eq!(res_body, json!([null]));
+
+	// f2: binary
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.signed_header("accept", "application/octet-stream")
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 204); // NO CONTENT
+
+	// f3: json
+	let res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("v1"))
+		.signed_header("accept", "application/json")
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(res.status(), 200);
+	assert_eq!(
+		res.headers().get("content-type").unwrap().to_str().unwrap(),
+		"application/json"
+	);
+	let res_body = json_body(res).await;
+	assert_json_eq!(res_body, json!([null]));
+}
diff --git a/src/garage/tests/k2v/mod.rs b/src/garage/tests/k2v/mod.rs
new file mode 100644
index 00000000..a009460e
--- /dev/null
+++ b/src/garage/tests/k2v/mod.rs
@@ -0,0 +1,18 @@
+pub mod batch;
+pub mod errorcodes;
+pub mod item;
+pub mod poll;
+pub mod simple;
+
+use hyper::{Body, Response};
+
+/// Reads the whole body of `res` and parses it as JSON.
+///
+/// # Panics
+///
+/// Panics if the body cannot be read or is not valid JSON; inside a test
+/// this simply fails the test.
+pub async fn json_body(res: Response<Body>) -> serde_json::Value {
+	let body = hyper::body::to_bytes(res.into_body()).await.unwrap();
+	// `Bytes` dereferences to `[u8]`, so the buffer is parsed in place
+	// without first being copied into a `Vec`.
+	serde_json::from_slice(&body).unwrap()
+}
diff --git a/src/garage/tests/k2v/poll.rs b/src/garage/tests/k2v/poll.rs
new file mode 100644
index 00000000..70dc0410
--- /dev/null
+++ b/src/garage/tests/k2v/poll.rs
@@ -0,0 +1,98 @@
+use hyper::Method;
+use std::time::Duration;
+
+use crate::common;
+
+/// Starts a poll on an item in a background task, overwrites the item, and
+/// checks that the poll returns the new value.
+#[tokio::test]
+async fn test_poll() {
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("test-k2v-poll");
+
+	let initial_value = b"Initial value".to_vec();
+	let new_value = b"New value".to_vec();
+
+	// Store the value the poll will start from.
+	let put_initial = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.method(Method::PUT)
+		.path("root")
+		.query_param("sort_key", Some("test1"))
+		.body(initial_value.clone())
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(put_initial.status(), 200);
+
+	// Read it back: the response header carries the causality token.
+	let get_initial = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("test1"))
+		.signed_header("accept", "application/octet-stream")
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(get_initial.status(), 200);
+	let token_header = get_initial
+		.headers()
+		.get("x-garage-causality-token")
+		.unwrap();
+	let ct = token_header.to_str().unwrap().to_string();
+
+	let get_initial_body = hyper::body::to_bytes(get_initial.into_body())
+		.await
+		.unwrap()
+		.to_vec();
+	assert_eq!(get_initial_body, initial_value);
+
+	// Launch the poll in its own task, giving it its own copies of the
+	// bucket name and token.
+	let poll_bucket = bucket.clone();
+	let poll_ct = ct.clone();
+	let poll = tokio::spawn(async move {
+		let ctx = common::context();
+		ctx.k2v
+			.request
+			.builder(poll_bucket.clone())
+			.path("root")
+			.query_param("sort_key", Some("test1"))
+			.query_param("causality_token", Some(poll_ct))
+			.query_param("timeout", Some("10"))
+			.signed_header("accept", "application/octet-stream")
+			.send()
+			.await
+	});
+
+	// Overwrite the item, passing the token so the new value replaces the
+	// initial one.
+	let put_new = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.method(Method::PUT)
+		.path("root")
+		.query_param("sort_key", Some("test1"))
+		.signed_header("x-garage-causality-token", ct)
+		.body(new_value.clone())
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(put_new.status(), 200);
+
+	// The poll must complete within ten seconds and return the new value.
+	let poll_res = tokio::select! {
+		_ = tokio::time::sleep(Duration::from_secs(10)) => panic!("poll did not terminate in time"),
+		joined = poll => joined.unwrap().unwrap(),
+	};
+
+	assert_eq!(poll_res.status(), 200);
+
+	let poll_res_body = hyper::body::to_bytes(poll_res.into_body())
+		.await
+		.unwrap()
+		.to_vec();
+	assert_eq!(poll_res_body, new_value);
+}
diff --git a/src/garage/tests/k2v/simple.rs b/src/garage/tests/k2v/simple.rs
new file mode 100644
index 00000000..ae9a8674
--- /dev/null
+++ b/src/garage/tests/k2v/simple.rs
@@ -0,0 +1,40 @@
+use crate::common;
+
+use hyper::Method;
+
+/// Smoke test: stores one value with PUT, then reads it back as raw bytes.
+#[tokio::test]
+async fn test_simple() {
+	let ctx = common::context();
+	let bucket = ctx.create_bucket("test-k2v-simple");
+
+	let payload = b"Hello, world!".to_vec();
+
+	// Insert the value under partition key "root", sort key "test1".
+	let put_res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.method(Method::PUT)
+		.path("root")
+		.query_param("sort_key", Some("test1"))
+		.body(payload.clone())
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(put_res.status(), 200);
+
+	// Fetch the same item, asking for the binary representation.
+	let get_res = ctx
+		.k2v
+		.request
+		.builder(bucket.clone())
+		.path("root")
+		.query_param("sort_key", Some("test1"))
+		.signed_header("accept", "application/octet-stream")
+		.send()
+		.await
+		.unwrap();
+	assert_eq!(get_res.status(), 200);
+
+	let fetched = hyper::body::to_bytes(get_res.into_body())
+		.await
+		.unwrap()
+		.to_vec();
+	assert_eq!(fetched, payload);
+}
diff --git a/src/garage/tests/lib.rs b/src/garage/tests/lib.rs
index 8799c395..87be1327 100644
--- a/src/garage/tests/lib.rs
+++ b/src/garage/tests/lib.rs
@@ -3,9 +3,8 @@ mod common;
 
 mod admin;
 mod bucket;
-mod list;
-mod multipart;
-mod objects;
-mod simple;
-mod streaming_signature;
-mod website;
+
+mod s3;
+
+#[cfg(feature = "k2v")]
+mod k2v;
diff --git a/src/garage/tests/list.rs b/src/garage/tests/s3/list.rs
index bb03f250..bb03f250 100644
--- a/src/garage/tests/list.rs
+++ b/src/garage/tests/s3/list.rs
diff --git a/src/garage/tests/s3/mod.rs b/src/garage/tests/s3/mod.rs
new file mode 100644
index 00000000..623eb665
--- /dev/null
+++ b/src/garage/tests/s3/mod.rs
@@ -0,0 +1,6 @@
+mod list;
+mod multipart;
+mod objects;
+mod simple;
+mod streaming_signature;
+mod website;
diff --git a/src/garage/tests/multipart.rs b/src/garage/tests/s3/multipart.rs
index 895a2993..895a2993 100644
--- a/src/garage/tests/multipart.rs
+++ b/src/garage/tests/s3/multipart.rs
diff --git a/src/garage/tests/objects.rs b/src/garage/tests/s3/objects.rs
index e1175b81..65f9e867 100644
--- a/src/garage/tests/objects.rs
+++ b/src/garage/tests/s3/objects.rs
@@ -263,4 +263,13 @@ async fn test_deleteobject() {
 		.unwrap();
 
 	assert!(l.contents.is_none());
+
+	// Deleting a non-existing object shouldn't be a problem
+	ctx.client
+		.delete_object()
+		.bucket(&bucket)
+		.key("l-0")
+		.send()
+		.await
+		.unwrap();
 }
diff --git a/src/garage/tests/simple.rs b/src/garage/tests/s3/simple.rs
index f54ae9ac..f54ae9ac 100644
--- a/src/garage/tests/simple.rs
+++ b/src/garage/tests/s3/simple.rs
diff --git a/src/garage/tests/streaming_signature.rs b/src/garage/tests/s3/streaming_signature.rs
index c68f7dfc..c68f7dfc 100644
--- a/src/garage/tests/streaming_signature.rs
+++ b/src/garage/tests/s3/streaming_signature.rs
diff --git a/src/garage/tests/website.rs b/src/garage/tests/s3/website.rs
index 963d11ea..0570ac6a 100644
--- a/src/garage/tests/website.rs
+++ b/src/garage/tests/s3/website.rs
@@ -35,10 +35,7 @@ async fn test_website() {
 	let req = || {
 		Request::builder()
 			.method("GET")
-			.uri(format!(
-				"http://127.0.0.1:{}/",
-				common::garage::DEFAULT_PORT + 2
-			))
+			.uri(format!("http://127.0.0.1:{}/", ctx.garage.web_port))
 			.header("Host", format!("{}.web.garage", BCKT_NAME))
 			.body(Body::empty())
 			.unwrap()
@@ -170,10 +167,7 @@ async fn test_website_s3_api() {
 	{
 		let req = Request::builder()
 			.method("GET")
-			.uri(format!(
-				"http://127.0.0.1:{}/site/",
-				common::garage::DEFAULT_PORT + 2
-			))
+			.uri(format!("http://127.0.0.1:{}/site/", ctx.garage.web_port))
 			.header("Host", format!("{}.web.garage", BCKT_NAME))
 			.header("Origin", "https://example.com")
 			.body(Body::empty())
@@ -198,7 +192,7 @@ async fn test_website_s3_api() {
 			.method("GET")
 			.uri(format!(
 				"http://127.0.0.1:{}/wrong.html",
-				common::garage::DEFAULT_PORT + 2
+				ctx.garage.web_port
 			))
 			.header("Host", format!("{}.web.garage", BCKT_NAME))
 			.body(Body::empty())
@@ -217,10 +211,7 @@ async fn test_website_s3_api() {
 	{
 		let req = Request::builder()
 			.method("OPTIONS")
-			.uri(format!(
-				"http://127.0.0.1:{}/site/",
-				common::garage::DEFAULT_PORT + 2
-			))
+			.uri(format!("http://127.0.0.1:{}/site/", ctx.garage.web_port))
 			.header("Host", format!("{}.web.garage", BCKT_NAME))
 			.header("Origin", "https://example.com")
 			.header("Access-Control-Request-Method", "PUT")
@@ -244,10 +235,7 @@ async fn test_website_s3_api() {
 	{
 		let req = Request::builder()
 			.method("OPTIONS")
-			.uri(format!(
-				"http://127.0.0.1:{}/site/",
-				common::garage::DEFAULT_PORT + 2
-			))
+			.uri(format!("http://127.0.0.1:{}/site/", ctx.garage.web_port))
 			.header("Host", format!("{}.web.garage", BCKT_NAME))
 			.header("Origin", "https://example.com")
 			.header("Access-Control-Request-Method", "DELETE")
@@ -288,10 +276,7 @@ async fn test_website_s3_api() {
 	{
 		let req = Request::builder()
 			.method("OPTIONS")
-			.uri(format!(
-				"http://127.0.0.1:{}/site/",
-				common::garage::DEFAULT_PORT + 2
-			))
+			.uri(format!("http://127.0.0.1:{}/site/", ctx.garage.web_port))
 			.header("Host", format!("{}.web.garage", BCKT_NAME))
 			.header("Origin", "https://example.com")
 			.header("Access-Control-Request-Method", "PUT")
@@ -319,10 +304,7 @@ async fn test_website_s3_api() {
 	{
 		let req = Request::builder()
 			.method("GET")
-			.uri(format!(
-				"http://127.0.0.1:{}/site/",
-				common::garage::DEFAULT_PORT + 2
-			))
+			.uri(format!("http://127.0.0.1:{}/site/", ctx.garage.web_port))
 			.header("Host", format!("{}.web.garage", BCKT_NAME))
 			.body(Body::empty())
 			.unwrap();
diff --git a/src/admin/tracing_setup.rs b/src/garage/tracing_setup.rs
index 55fc4094..55fc4094 100644
--- a/src/admin/tracing_setup.rs
+++ b/src/garage/tracing_setup.rs
diff --git a/src/k2v-client/Cargo.toml b/src/k2v-client/Cargo.toml
new file mode 100644
index 00000000..0f0b76ae
--- /dev/null
+++ b/src/k2v-client/Cargo.toml
@@ -0,0 +1,37 @@
+[package]
+name = "k2v-client"
+version = "0.0.1"
+authors = ["Trinity Pointard <trinity.pointard@gmail.com>", "Alex Auvolat <alex@adnab.me>"]
+edition = "2018"
+license = "AGPL-3.0"
+description = "Client library for the Garage K2V protocol"
+repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage"
+readme = "../../README.md"
+
+[dependencies]
+base64 = "0.13.0"
+http = "0.2.6"
+log = "0.4"
+rusoto_core = "0.48.0"
+rusoto_credential = "0.48.0"
+rusoto_signature = "0.48.0"
+serde = "1.0.137"
+serde_json = "1.0.81"
+thiserror = "1.0.31"
+tokio = "1.17.0"
+
+# cli deps
+clap = { version = "3.1.18", optional = true, features = ["derive", "env"] }
+garage_util = { version = "0.8.0", path = "../util", optional = true }
+
+
+[features]
+cli = ["clap", "tokio/fs", "tokio/io-std", "garage_util"]
+
+[lib]
+path = "lib.rs"
+
+[[bin]]
+name = "k2v-cli"
+path = "bin/k2v-cli.rs"
+required-features = ["cli"]
diff --git a/src/k2v-client/README.md b/src/k2v-client/README.md
new file mode 100644
index 00000000..db454805
--- /dev/null
+++ b/src/k2v-client/README.md
@@ -0,0 +1,25 @@
+Example usage:
+```sh
+# all these values can be provided on the cli instead
+export AWS_ACCESS_KEY_ID=GK123456
+export AWS_SECRET_ACCESS_KEY=0123..789
+export AWS_REGION=garage
+export K2V_ENDPOINT=http://172.30.2.1:3903
+export K2V_BUCKET=my-bucket
+
+cargo run --features=cli -- read-range my-partition-key --all
+
+cargo run --features=cli -- insert my-partition-key my-sort-key --text "my string1"
+cargo run --features=cli -- insert my-partition-key my-sort-key --text "my string2"
+cargo run --features=cli -- insert my-partition-key my-sort-key2 --text "my string"
+
+cargo run --features=cli -- read-range my-partition-key --all
+
+causality=$(cargo run --features=cli -- read my-partition-key my-sort-key2 -b | head -n1)
+cargo run --features=cli -- delete my-partition-key my-sort-key2 -c $causality
+
+causality=$(cargo run --features=cli -- read my-partition-key my-sort-key -b | head -n1)
+cargo run --features=cli -- insert my-partition-key my-sort-key --text "my string3" -c $causality
+
+cargo run --features=cli -- read-range my-partition-key --all
+```
diff --git a/src/k2v-client/bin/k2v-cli.rs b/src/k2v-client/bin/k2v-cli.rs
new file mode 100644
index 00000000..925ebeb8
--- /dev/null
+++ b/src/k2v-client/bin/k2v-cli.rs
@@ -0,0 +1,501 @@
+use std::time::Duration;
+
+use k2v_client::*;
+
+use garage_util::formater::format_table;
+
+use rusoto_core::credential::AwsCredentials;
+use rusoto_core::Region;
+
+use clap::{Parser, Subcommand};
+
+/// K2V command line interface
+#[derive(Parser, Debug)]
+#[clap(author, version, about, long_about = None)]
+struct Args {
+	/// Name of the region to use
+	#[clap(short, long, env = "AWS_REGION", default_value = "garage")]
+	region: String,
+	/// URL of the endpoint to connect to
+	#[clap(short, long, env = "K2V_ENDPOINT")]
+	endpoint: String,
+	/// Access key ID
+	#[clap(short, long, env = "AWS_ACCESS_KEY_ID")]
+	key_id: String,
+	/// Secret access key
+	#[clap(short, long, env = "AWS_SECRET_ACCESS_KEY")]
+	secret: String,
+	/// Bucket name
+	#[clap(short, long, env = "K2V_BUCKET")]
+	bucket: String,
+	#[clap(subcommand)]
+	command: Command,
+}
+
+#[derive(Subcommand, Debug)]
+enum Command {
+	/// Insert a single value
+	Insert {
+		/// Partition key to insert to
+		partition_key: String,
+		/// Sort key to insert to
+		sort_key: String,
+		/// Causality of the insertion
+		#[clap(short, long)]
+		causality: Option<String>,
+		/// Value to insert
+		#[clap(flatten)]
+		value: Value,
+	},
+	/// Read a single value
+	Read {
+		/// Partition key to read from
+		partition_key: String,
+		/// Sort key to read from
+		sort_key: String,
+		/// Output formatting
+		#[clap(flatten)]
+		output_kind: ReadOutputKind,
+	},
+	/// Watch changes on a single value
+	Poll {
+		/// Partition key to poll
+		partition_key: String,
+		/// Sort key to poll
+		sort_key: String,
+		/// Causality information
+		#[clap(short, long)]
+		causality: String,
+		/// Timeout, in seconds
+		#[clap(short, long)]
+		timeout: Option<u64>,
+		/// Output formatting
+		#[clap(flatten)]
+		output_kind: ReadOutputKind,
+	},
+	/// Delete a single value
+	Delete {
+		/// Partition key to delete from
+		partition_key: String,
+		/// Sort key to delete from
+		sort_key: String,
+		/// Causality information
+		#[clap(short, long)]
+		causality: String,
+	},
+	/// List partition keys
+	ReadIndex {
+		/// Output formatting
+		#[clap(flatten)]
+		output_kind: BatchOutputKind,
+		/// Output only partition keys matching this filter
+		#[clap(flatten)]
+		filter: Filter,
+	},
+	/// Read a range of sort keys
+	ReadRange {
+		/// Partition key to read from
+		partition_key: String,
+		/// Output formatting
+		#[clap(flatten)]
+		output_kind: BatchOutputKind,
+		/// Output only sort keys matching this filter
+		#[clap(flatten)]
+		filter: Filter,
+	},
+	/// Delete a range of sort keys
+	DeleteRange {
+		/// Partition key to delete from
+		partition_key: String,
+		/// Output formatting
+		#[clap(flatten)]
+		output_kind: BatchOutputKind,
+		/// Delete only sort keys matching this filter
+		#[clap(flatten)]
+		filter: Filter,
+	},
+}
+
+/// Where to read a value from
+#[derive(Parser, Debug)]
+#[clap(group = clap::ArgGroup::new("value").multiple(false).required(true))]
+struct Value {
+	/// Read value from a file. use - to read from stdin
+	#[clap(short, long, group = "value")]
+	file: Option<String>,
+	/// Read a base64 value from commandline
+	#[clap(short, long, group = "value")]
+	b64: Option<String>,
+	/// Read a raw (UTF-8) value from the commandline
+	#[clap(short, long, group = "value")]
+	text: Option<String>,
+}
+
+impl Value {
+	async fn to_data(&self) -> Result<Vec<u8>, Error> {
+		if let Some(ref text) = self.text {
+			Ok(text.as_bytes().to_vec())
+		} else if let Some(ref b64) = self.b64 {
+			base64::decode(b64).map_err(|_| Error::Message("invalid base64 input".into()))
+		} else if let Some(ref path) = self.file {
+			use tokio::io::AsyncReadExt;
+			if path == "-" {
+				let mut file = tokio::io::stdin();
+				let mut vec = Vec::new();
+				file.read_to_end(&mut vec).await?;
+				Ok(vec)
+			} else {
+				let mut file = tokio::fs::File::open(path).await?;
+				let mut vec = Vec::new();
+				file.read_to_end(&mut vec).await?;
+				Ok(vec)
+			}
+		} else {
+			unreachable!("Value must have one option set")
+		}
+	}
+}
+
+#[derive(Parser, Debug)]
+#[clap(group = clap::ArgGroup::new("output-kind").multiple(false).required(false))]
+struct ReadOutputKind {
+	/// Base64 output. Conflicts are line separated, first line is causality token
+	#[clap(short, long, group = "output-kind")]
+	b64: bool,
+	/// Raw output. Conflicts generate error, causality token is not returned
+	#[clap(short, long, group = "output-kind")]
+	raw: bool,
+	/// Human formatted output
+	#[clap(short = 'H', long, group = "output-kind")]
+	human: bool,
+	/// JSON formatted output
+	#[clap(short, long, group = "output-kind")]
+	json: bool,
+}
+
+impl ReadOutputKind {
+	fn display_output(&self, val: CausalValue) -> ! {
+		use std::io::Write;
+		use std::process::exit;
+
+		if self.json {
+			let stdout = std::io::stdout();
+			serde_json::to_writer_pretty(stdout, &val).unwrap();
+			exit(0);
+		}
+
+		if self.raw {
+			let mut val = val.value;
+			if val.len() != 1 {
+				eprintln!(
+					"Raw mode can only read non-concurrent values, found {} values, expected 1",
+					val.len()
+				);
+				exit(1);
+			}
+			let val = val.pop().unwrap();
+			match val {
+				K2vValue::Value(v) => {
+					std::io::stdout().write_all(&v).unwrap();
+					exit(0);
+				}
+				K2vValue::Tombstone => {
+					eprintln!("Expected value, found tombstone");
+					exit(2);
+				}
+			}
+		}
+
+		let causality: String = val.causality.into();
+		if self.b64 {
+			println!("{}", causality);
+			for val in val.value {
+				match val {
+					K2vValue::Value(v) => {
+						println!("{}", base64::encode(&v))
+					}
+					K2vValue::Tombstone => {
+						println!();
+					}
+				}
+			}
+			exit(0);
+		}
+
+		// human-readable output (default when no other format flag is set)
+		println!("causality: {}", causality);
+		println!("values:");
+		for val in val.value {
+			match val {
+				K2vValue::Value(v) => {
+					if let Ok(string) = std::str::from_utf8(&v) {
+						println!("  utf-8: {}", string);
+					} else {
+						println!("  base64: {}", base64::encode(&v));
+					}
+				}
+				K2vValue::Tombstone => {
+					println!("  tombstone");
+				}
+			}
+		}
+		exit(0);
+	}
+}
+
+#[derive(Parser, Debug)]
+#[clap(group = clap::ArgGroup::new("output-kind").multiple(false).required(false))]
+struct BatchOutputKind {
+	/// Human formatted output
+	#[clap(short = 'H', long, group = "output-kind")]
+	human: bool,
+	/// JSON formatted output
+	#[clap(short, long, group = "output-kind")]
+	json: bool,
+}
+
+/// Filter for batch operations
+#[derive(Parser, Debug)]
+#[clap(group = clap::ArgGroup::new("filter").multiple(true).required(true))]
+struct Filter {
+	/// Match only keys starting with this prefix
+	#[clap(short, long, group = "filter")]
+	prefix: Option<String>,
+	/// Match only keys lexicographically after this key (including this key itself)
+	#[clap(short, long, group = "filter")]
+	start: Option<String>,
+	/// Match only keys lexicographically before this key (excluding this key)
+	#[clap(short, long, group = "filter")]
+	end: Option<String>,
+	/// Only match the first X keys
+	#[clap(short, long)]
+	limit: Option<u64>,
+	/// Return keys in reverse order
+	#[clap(short, long)]
+	reverse: bool,
+	/// Return only keys where conflict happened
+	#[clap(short, long)]
+	conflicts_only: bool,
+	/// Also include keys storing only tombstones
+	#[clap(short, long)]
+	tombstones: bool,
+	/// Return any key
+	#[clap(short, long, group = "filter")]
+	all: bool,
+}
+
+impl Filter {
+	fn k2v_filter(&self) -> k2v_client::Filter<'_> {
+		k2v_client::Filter {
+			start: self.start.as_deref(),
+			end: self.end.as_deref(),
+			prefix: self.prefix.as_deref(),
+			limit: self.limit,
+			reverse: self.reverse,
+		}
+	}
+}
+
+#[tokio::main]
+async fn main() -> Result<(), Error> {
+	let args = Args::parse();
+
+	let region = Region::Custom {
+		name: args.region,
+		endpoint: args.endpoint,
+	};
+
+	let creds = AwsCredentials::new(args.key_id, args.secret, None, None);
+
+	let client = K2vClient::new(region, args.bucket, creds, None)?;
+
+	match args.command {
+		Command::Insert {
+			partition_key,
+			sort_key,
+			causality,
+			value,
+		} => {
+			client
+				.insert_item(
+					&partition_key,
+					&sort_key,
+					value.to_data().await?,
+					causality.map(Into::into),
+				)
+				.await?;
+		}
+		Command::Delete {
+			partition_key,
+			sort_key,
+			causality,
+		} => {
+			client
+				.delete_item(&partition_key, &sort_key, causality.into())
+				.await?;
+		}
+		Command::Read {
+			partition_key,
+			sort_key,
+			output_kind,
+		} => {
+			let res = client.read_item(&partition_key, &sort_key).await?;
+			output_kind.display_output(res);
+		}
+		Command::Poll {
+			partition_key,
+			sort_key,
+			causality,
+			timeout,
+			output_kind,
+		} => {
+			let timeout = timeout.map(Duration::from_secs);
+			let res_opt = client
+				.poll_item(&partition_key, &sort_key, causality.into(), timeout)
+				.await?;
+			if let Some(res) = res_opt {
+				output_kind.display_output(res);
+			} else {
+				println!("Delay expired and value didn't change.");
+			}
+		}
+		Command::ReadIndex {
+			output_kind,
+			filter,
+		} => {
+			if filter.conflicts_only || filter.tombstones {
+				return Err(Error::Message(
+					"conflicts-only and tombstones are invalid for read-index".into(),
+				));
+			}
+			let res = client.read_index(filter.k2v_filter()).await?;
+			if output_kind.json {
+				let values = res
+					.items
+					.into_iter()
+					.map(|(k, v)| {
+						let mut value = serde_json::to_value(v).unwrap();
+						value
+							.as_object_mut()
+							.unwrap()
+							.insert("sort_key".to_owned(), k.into()); // NOTE(review): `k` is a partition key here; the "sort_key" label looks copy-pasted from read-range
+						value
+					})
+					.collect::<Vec<_>>();
+				let json = serde_json::json!({
+					"next_key": res.next_start,
+					"values": values,
+				});
+
+				let stdout = std::io::stdout();
+				serde_json::to_writer_pretty(stdout, &json).unwrap();
+			} else {
+				if let Some(next) = res.next_start {
+					println!("next key: {}", next);
+				}
+
+				let mut to_print = Vec::new();
+				to_print.push("key:\tentries\tconflicts\tvalues\tbytes".to_string());
+				for (k, v) in res.items {
+					to_print.push(format!(
+						"{}\t{}\t{}\t{}\t{}",
+						k, v.entries, v.conflicts, v.values, v.bytes
+					));
+				}
+				format_table(to_print);
+			}
+		}
+		Command::ReadRange {
+			partition_key,
+			output_kind,
+			filter,
+		} => {
+			let op = BatchReadOp {
+				partition_key: &partition_key,
+				filter: filter.k2v_filter(),
+				conflicts_only: filter.conflicts_only,
+				tombstones: filter.tombstones,
+				single_item: false,
+			};
+			let mut res = client.read_batch(&[op]).await?;
+			let res = res.pop().unwrap();
+			if output_kind.json {
+				let values = res
+					.items
+					.into_iter()
+					.map(|(k, v)| {
+						let mut value = serde_json::to_value(v).unwrap();
+						value
+							.as_object_mut()
+							.unwrap()
+							.insert("sort_key".to_owned(), k.into());
+						value
+					})
+					.collect::<Vec<_>>();
+				let json = serde_json::json!({
+					"next_key": res.next_start,
+					"values": values,
+				});
+
+				let stdout = std::io::stdout();
+				serde_json::to_writer_pretty(stdout, &json).unwrap();
+			} else {
+				if let Some(next) = res.next_start {
+					println!("next key: {}", next);
+				}
+				for (key, values) in res.items {
+					println!("key: {}", key);
+					let causality: String = values.causality.into();
+					println!("causality: {}", causality);
+					for value in values.value {
+						match value {
+							K2vValue::Value(v) => {
+								if let Ok(string) = std::str::from_utf8(&v) {
+									println!("  value(utf-8): {}", string);
+								} else {
+									println!("  value(base64): {}", base64::encode(&v));
+								}
+							}
+							K2vValue::Tombstone => {
+								println!("  tombstone");
+							}
+						}
+					}
+				}
+			}
+		}
+		Command::DeleteRange {
+			partition_key,
+			output_kind,
+			filter,
+		} => {
+			let op = BatchDeleteOp {
+				partition_key: &partition_key,
+				prefix: filter.prefix.as_deref(),
+				start: filter.start.as_deref(),
+				end: filter.end.as_deref(),
+				single_item: false,
+			};
+			if filter.reverse
+				|| filter.conflicts_only
+				|| filter.tombstones
+				|| filter.limit.is_some()
+			{
+				return Err(Error::Message(
+					"limit, conflicts-only, reverse and tombstones are invalid for delete-range"
+						.into(),
+				));
+			}
+
+			let res = client.delete_batch(&[op]).await?;
+
+			if output_kind.json {
+				println!("{}", res[0]);
+			} else {
+				println!("deleted {} keys", res[0]);
+			}
+		}
+	}
+
+	Ok(())
+}
diff --git a/src/k2v-client/error.rs b/src/k2v-client/error.rs
new file mode 100644
index 00000000..37c221f2
--- /dev/null
+++ b/src/k2v-client/error.rs
@@ -0,0 +1,29 @@
+use std::borrow::Cow;
+
+use thiserror::Error;
+
+/// Errors returned by this crate
+#[derive(Error, Debug)]
+pub enum Error {
+	#[error("{0}, {1}: {2} (path = {3})")]
+	Remote(
+		http::StatusCode,
+		Cow<'static, str>,
+		Cow<'static, str>,
+		Cow<'static, str>,
+	),
+	#[error("received invalid response: {0}")]
+	InvalidResponse(Cow<'static, str>),
+	#[error("not found")]
+	NotFound,
+	#[error("io error: {0}")]
+	IoError(#[from] std::io::Error),
+	#[error("rusoto tls error: {0}")]
+	RusotoTls(#[from] rusoto_core::request::TlsError),
+	#[error("rusoto http error: {0}")]
+	RusotoHttp(#[from] rusoto_core::HttpDispatchError),
+	#[error("deserialization error: {0}")]
+	Deserialization(#[from] serde_json::Error),
+	#[error("{0}")]
+	Message(Cow<'static, str>),
+}
diff --git a/src/k2v-client/lib.rs b/src/k2v-client/lib.rs
new file mode 100644
index 00000000..c2606af4
--- /dev/null
+++ b/src/k2v-client/lib.rs
@@ -0,0 +1,611 @@
+use std::collections::BTreeMap;
+use std::time::Duration;
+
+use http::header::{ACCEPT, CONTENT_LENGTH, CONTENT_TYPE};
+use http::status::StatusCode;
+use http::HeaderMap;
+use log::{debug, error};
+
+use rusoto_core::{ByteStream, DispatchSignedRequest, HttpClient};
+use rusoto_credential::AwsCredentials;
+use rusoto_signature::region::Region;
+use rusoto_signature::signature::SignedRequest;
+use serde::de::Error as DeError;
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
+
+use tokio::io::AsyncReadExt;
+
+mod error;
+
+pub use error::Error;
+
+const DEFAULT_TIMEOUT: Duration = Duration::from_secs(5);
+const DEFAULT_POLL_TIMEOUT: Duration = Duration::from_secs(300);
+const SERVICE: &str = "k2v";
+const GARAGE_CAUSALITY_TOKEN: &str = "X-Garage-Causality-Token";
+
+/// Client used to query a K2V server.
+pub struct K2vClient {
+	region: Region,
+	bucket: String,
+	creds: AwsCredentials,
+	client: HttpClient,
+}
+
+impl K2vClient {
+	/// Create a new K2V client.
+	pub fn new(
+		region: Region,
+		bucket: String,
+		creds: AwsCredentials,
+		user_agent: Option<String>,
+	) -> Result<Self, Error> {
+		let mut client = HttpClient::new()?;
+		if let Some(ua) = user_agent {
+			client.local_agent_prepend(ua);
+		} else {
+			client.local_agent_prepend(format!("k2v/{}", env!("CARGO_PKG_VERSION")));
+		}
+		Ok(K2vClient {
+			region,
+			bucket,
+			creds,
+			client,
+		})
+	}
+
+	/// Perform a ReadItem request, reading the value(s) stored for a single pk+sk.
+	pub async fn read_item(
+		&self,
+		partition_key: &str,
+		sort_key: &str,
+	) -> Result<CausalValue, Error> {
+		let mut req = SignedRequest::new(
+			"GET",
+			SERVICE,
+			&self.region,
+			&format!("/{}/{}", self.bucket, partition_key),
+		);
+		req.add_param("sort_key", sort_key);
+		req.add_header(ACCEPT, "application/octet-stream, application/json");
+
+		let res = self.dispatch(req, None).await?;
+
+		let causality = res
+			.causality_token
+			.ok_or_else(|| Error::InvalidResponse("missing causality token".into()))?;
+
+		if res.status == StatusCode::NO_CONTENT {
+			return Ok(CausalValue {
+				causality,
+				value: vec![K2vValue::Tombstone],
+			});
+		}
+
+		match res.content_type.as_deref() {
+			Some("application/octet-stream") => Ok(CausalValue {
+				causality,
+				value: vec![K2vValue::Value(res.body)],
+			}),
+			Some("application/json") => {
+				let value = serde_json::from_slice(&res.body)?;
+				Ok(CausalValue { causality, value })
+			}
+			Some(ct) => Err(Error::InvalidResponse(
+				format!("invalid content type: {}", ct).into(),
+			)),
+			None => Err(Error::InvalidResponse("missing content type".into())),
+		}
+	}
+
+	/// Perform a PollItem request, waiting for the value(s) stored for a single pk+sk to be
+	/// updated.
+	pub async fn poll_item(
+		&self,
+		partition_key: &str,
+		sort_key: &str,
+		causality: CausalityToken,
+		timeout: Option<Duration>,
+	) -> Result<Option<CausalValue>, Error> {
+		let timeout = timeout.unwrap_or(DEFAULT_POLL_TIMEOUT);
+
+		let mut req = SignedRequest::new(
+			"GET",
+			SERVICE,
+			&self.region,
+			&format!("/{}/{}", self.bucket, partition_key),
+		);
+		req.add_param("sort_key", sort_key);
+		req.add_param("causality_token", &causality.0);
+		req.add_param("timeout", &timeout.as_secs().to_string());
+		req.add_header(ACCEPT, "application/octet-stream, application/json");
+
+		let res = self.dispatch(req, Some(timeout + DEFAULT_TIMEOUT)).await?;
+
+		if res.status == StatusCode::NOT_MODIFIED {
+			return Ok(None);
+		}
+
+		let causality = res
+			.causality_token
+			.ok_or_else(|| Error::InvalidResponse("missing causality token".into()))?;
+
+		if res.status == StatusCode::NO_CONTENT {
+			return Ok(Some(CausalValue {
+				causality,
+				value: vec![K2vValue::Tombstone],
+			}));
+		}
+
+		match res.content_type.as_deref() {
+			Some("application/octet-stream") => Ok(Some(CausalValue {
+				causality,
+				value: vec![K2vValue::Value(res.body)],
+			})),
+			Some("application/json") => {
+				let value = serde_json::from_slice(&res.body)?;
+				Ok(Some(CausalValue { causality, value }))
+			}
+			Some(ct) => Err(Error::InvalidResponse(
+				format!("invalid content type: {}", ct).into(),
+			)),
+			None => Err(Error::InvalidResponse("missing content type".into())),
+		}
+	}
+
+	/// Perform an InsertItem request, inserting a value for a single pk+sk.
+	pub async fn insert_item(
+		&self,
+		partition_key: &str,
+		sort_key: &str,
+		value: Vec<u8>,
+		causality: Option<CausalityToken>,
+	) -> Result<(), Error> {
+		let mut req = SignedRequest::new(
+			"PUT",
+			SERVICE,
+			&self.region,
+			&format!("/{}/{}", self.bucket, partition_key),
+		);
+		req.add_param("sort_key", sort_key);
+		req.set_payload(Some(value));
+
+		if let Some(causality) = causality {
+			req.add_header(GARAGE_CAUSALITY_TOKEN, &causality.0);
+		}
+
+		self.dispatch(req, None).await?;
+		Ok(())
+	}
+
+	/// Perform a DeleteItem request, deleting the value(s) stored for a single pk+sk.
+	pub async fn delete_item(
+		&self,
+		partition_key: &str,
+		sort_key: &str,
+		causality: CausalityToken,
+	) -> Result<(), Error> {
+		let mut req = SignedRequest::new(
+			"DELETE",
+			SERVICE,
+			&self.region,
+			&format!("/{}/{}", self.bucket, partition_key),
+		);
+		req.add_param("sort_key", sort_key);
+		req.add_header(GARAGE_CAUSALITY_TOKEN, &causality.0);
+
+		self.dispatch(req, None).await?;
+		Ok(())
+	}
+
+	/// Perform a ReadIndex request, listing the partition keys which have at least one
+	/// associated sort key and which match the filter.
+	pub async fn read_index(
+		&self,
+		filter: Filter<'_>,
+	) -> Result<PaginatedRange<PartitionInfo>, Error> {
+		let mut req =
+			SignedRequest::new("GET", SERVICE, &self.region, &format!("/{}", self.bucket));
+		filter.insert_params(&mut req);
+
+		let res = self.dispatch(req, None).await?;
+
+		let resp: ReadIndexResponse = serde_json::from_slice(&res.body)?;
+
+		let items = resp
+			.partition_keys
+			.into_iter()
+			.map(|ReadIndexItem { pk, info }| (pk, info))
+			.collect();
+
+		Ok(PaginatedRange {
+			items,
+			next_start: resp.next_start,
+		})
+	}
+
+	/// Perform an InsertBatch request, inserting multiple values at once. Note: this operation is
+	/// *not* atomic: it is possible for some sub-operations to fail and others to succeed. In
+	/// that case, failure is reported.
+	pub async fn insert_batch(&self, operations: &[BatchInsertOp<'_>]) -> Result<(), Error> {
+		let mut req =
+			SignedRequest::new("POST", SERVICE, &self.region, &format!("/{}", self.bucket));
+
+		let payload = serde_json::to_vec(operations)?;
+		req.set_payload(Some(payload));
+		self.dispatch(req, None).await?;
+		Ok(())
+	}
+
+	/// Perform a ReadBatch request, reading multiple values or range of values at once.
+	pub async fn read_batch(
+		&self,
+		operations: &[BatchReadOp<'_>],
+	) -> Result<Vec<PaginatedRange<CausalValue>>, Error> {
+		let mut req =
+			SignedRequest::new("POST", SERVICE, &self.region, &format!("/{}", self.bucket));
+		req.add_param("search", "");
+
+		let payload = serde_json::to_vec(operations)?;
+		req.set_payload(Some(payload));
+		let res = self.dispatch(req, None).await?;
+
+		let resp: Vec<BatchReadResponse> = serde_json::from_slice(&res.body)?;
+
+		Ok(resp
+			.into_iter()
+			.map(|e| PaginatedRange {
+				items: e
+					.items
+					.into_iter()
+					.map(|BatchReadItem { sk, ct, v }| {
+						(
+							sk,
+							CausalValue {
+								causality: ct,
+								value: v,
+							},
+						)
+					})
+					.collect(),
+				next_start: e.next_start,
+			})
+			.collect())
+	}
+
+	/// Perform a DeleteBatch request, deleting multiple values or ranges of values at once,
+	/// without providing causality information.
+	pub async fn delete_batch(&self, operations: &[BatchDeleteOp<'_>]) -> Result<Vec<u64>, Error> {
+		let mut req =
+			SignedRequest::new("POST", SERVICE, &self.region, &format!("/{}", self.bucket));
+		req.add_param("delete", "");
+
+		let payload = serde_json::to_vec(operations)?;
+		req.set_payload(Some(payload));
+		let res = self.dispatch(req, None).await?;
+
+		let resp: Vec<BatchDeleteResponse> = serde_json::from_slice(&res.body)?;
+
+		Ok(resp.into_iter().map(|r| r.deleted_items).collect())
+	}
+
+	async fn dispatch(
+		&self,
+		mut req: SignedRequest,
+		timeout: Option<Duration>,
+	) -> Result<Response, Error> {
+		req.sign(&self.creds);
+		let mut res = self
+			.client
+			.dispatch(req, Some(timeout.unwrap_or(DEFAULT_TIMEOUT)))
+			.await?;
+
+		let causality_token = res
+			.headers
+			.remove(GARAGE_CAUSALITY_TOKEN)
+			.map(CausalityToken);
+		let content_type = res.headers.remove(CONTENT_TYPE);
+
+		let body = match res.status {
+			StatusCode::OK => read_body(&mut res.headers, res.body).await?,
+			StatusCode::NO_CONTENT => Vec::new(),
+			StatusCode::NOT_FOUND => return Err(Error::NotFound),
+			StatusCode::NOT_MODIFIED => Vec::new(),
+			s => {
+				let err_body = read_body(&mut res.headers, res.body)
+					.await
+					.unwrap_or_default();
+				let err_body_str = std::str::from_utf8(&err_body)
+					.map(String::from)
+					.unwrap_or_else(|_| base64::encode(&err_body));
+
+				if s.is_client_error() || s.is_server_error() {
+					error!("Error response {}: {}", res.status, err_body_str);
+					let err = match serde_json::from_slice::<ErrorResponse>(&err_body) {
+						Ok(err) => Error::Remote(
+							res.status,
+							err.code.into(),
+							err.message.into(),
+							err.path.into(),
+						),
+						Err(_) => Error::Remote(
+							res.status,
+							"unknown".into(),
+							err_body_str.into(),
+							"?".into(),
+						),
+					};
+					return Err(err);
+				} else {
+					let msg = format!(
+						"Unexpected response code {}. Response body: {}",
+						res.status, err_body_str
+					);
+					error!("{}", msg);
+					return Err(Error::InvalidResponse(msg.into()));
+				}
+			}
+		};
+		debug!(
+			"Response body: {}",
+			std::str::from_utf8(&body)
+				.map(String::from)
+				.unwrap_or_else(|_| base64::encode(&body))
+		);
+
+		Ok(Response {
+			body,
+			status: res.status,
+			causality_token,
+			content_type,
+		})
+	}
+}
+
+async fn read_body(headers: &mut HeaderMap<String>, body: ByteStream) -> Result<Vec<u8>, Error> {
+	let body_len = headers
+		.get(CONTENT_LENGTH)
+		.and_then(|h| h.parse::<usize>().ok())
+		.map_or(0, |len| len.min(1 << 20)); // capacity hint only: never pre-allocate more than 1 MiB on the peer's word
+	let mut res = Vec::with_capacity(body_len);
+	body.into_async_read().read_to_end(&mut res).await?;
+	Ok(res)
+}
+
+/// An opaque token used to convey causality between operations.
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
+#[serde(transparent)]
+pub struct CausalityToken(String);
+
+impl From<String> for CausalityToken {
+	fn from(v: String) -> Self {
+		CausalityToken(v)
+	}
+}
+
+impl From<CausalityToken> for String {
+	fn from(v: CausalityToken) -> Self {
+		v.0
+	}
+}
+
+/// A value in K2V. Can be either a binary value, or a tombstone.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum K2vValue {
+	Tombstone,
+	Value(Vec<u8>),
+}
+
+impl From<Vec<u8>> for K2vValue {
+	fn from(v: Vec<u8>) -> Self {
+		K2vValue::Value(v)
+	}
+}
+
+impl From<Option<Vec<u8>>> for K2vValue {
+	fn from(v: Option<Vec<u8>>) -> Self {
+		match v {
+			Some(v) => K2vValue::Value(v),
+			None => K2vValue::Tombstone,
+		}
+	}
+}
+
+impl<'de> Deserialize<'de> for K2vValue {
+	fn deserialize<D>(d: D) -> Result<Self, D::Error>
+	where
+		D: Deserializer<'de>,
+	{
+		let val: Option<String> = Option::deserialize(d)?; // owned: a borrowed &str rejects escaped or non-borrowable input
+		Ok(match val {
+			Some(s) => {
+				K2vValue::Value(base64::decode(s).map_err(|_| DeError::custom("invalid base64"))?)
+			}
+			None => K2vValue::Tombstone,
+		})
+	}
+}
+
+impl Serialize for K2vValue {
+	fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+	where
+		S: Serializer,
+	{
+		match self {
+			K2vValue::Tombstone => serializer.serialize_none(),
+			K2vValue::Value(v) => {
+				let b64 = base64::encode(v);
+				serializer.serialize_str(&b64)
+			}
+		}
+	}
+}
+
+/// A set of K2vValue and associated causality information.
+#[derive(Debug, Clone, Serialize)]
+pub struct CausalValue {
+	pub causality: CausalityToken,
+	pub value: Vec<K2vValue>,
+}
+
+/// Result of paginated requests.
+#[derive(Debug, Clone)]
+pub struct PaginatedRange<V> {
+	pub items: BTreeMap<String, V>,
+	pub next_start: Option<String>,
+}
+
+/// Filter for batch operations.
+#[derive(Debug, Default, Clone, Deserialize, Serialize)]
+pub struct Filter<'a> {
+	pub start: Option<&'a str>,
+	pub end: Option<&'a str>,
+	pub prefix: Option<&'a str>,
+	pub limit: Option<u64>,
+	#[serde(default)]
+	pub reverse: bool,
+}
+
+impl<'a> Filter<'a> {
+	fn insert_params(&self, req: &mut SignedRequest) {
+		if let Some(start) = &self.start {
+			req.add_param("start", start);
+		}
+		if let Some(end) = &self.end {
+			req.add_param("end", end);
+		}
+		if let Some(prefix) = &self.prefix {
+			req.add_param("prefix", prefix);
+		}
+		if let Some(limit) = &self.limit {
+			req.add_param("limit", &limit.to_string());
+		}
+		if self.reverse {
+			req.add_param("reverse", "true");
+		}
+	}
+}
+
+#[derive(Debug, Clone, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct ReadIndexResponse<'a> {
+	#[serde(flatten, borrow)]
+	#[allow(dead_code)]
+	filter: Filter<'a>,
+	partition_keys: Vec<ReadIndexItem>,
+	#[allow(dead_code)]
+	more: bool,
+	next_start: Option<String>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+struct ReadIndexItem {
+	pk: String,
+	#[serde(flatten)]
+	info: PartitionInfo,
+}
+
+/// Information about data stored with a given partition key.
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct PartitionInfo {
+	pub entries: u64,
+	pub conflicts: u64,
+	pub values: u64,
+	pub bytes: u64,
+}
+
+/// Single sub-operation of an InsertBatch.
+#[derive(Debug, Clone, Serialize)]
+pub struct BatchInsertOp<'a> {
+	#[serde(rename = "pk")]
+	pub partition_key: &'a str,
+	#[serde(rename = "sk")]
+	pub sort_key: &'a str,
+	#[serde(rename = "ct")]
+	pub causality: Option<CausalityToken>,
+	#[serde(rename = "v")]
+	pub value: K2vValue,
+}
+
+/// Single sub-operation of a ReadBatch.
+#[derive(Debug, Default, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub struct BatchReadOp<'a> {
+	pub partition_key: &'a str,
+	#[serde(flatten, borrow)]
+	pub filter: Filter<'a>,
+	#[serde(default)]
+	pub single_item: bool,
+	#[serde(default)]
+	pub conflicts_only: bool,
+	#[serde(default)]
+	pub tombstones: bool,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct BatchReadResponse<'a> {
+	#[serde(flatten, borrow)]
+	#[allow(dead_code)]
+	op: BatchReadOp<'a>,
+	items: Vec<BatchReadItem>,
+	#[allow(dead_code)]
+	more: bool,
+	next_start: Option<String>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+struct BatchReadItem {
+	sk: String,
+	ct: CausalityToken,
+	v: Vec<K2vValue>,
+}
+
+/// Single sub-operation of a DeleteBatch
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub struct BatchDeleteOp<'a> {
+	pub partition_key: &'a str,
+	pub prefix: Option<&'a str>,
+	pub start: Option<&'a str>,
+	pub end: Option<&'a str>,
+	#[serde(default)]
+	pub single_item: bool,
+}
+
+impl<'a> BatchDeleteOp<'a> {
+	pub fn new(partition_key: &'a str) -> Self {
+		BatchDeleteOp {
+			partition_key,
+			prefix: None,
+			start: None,
+			end: None,
+			single_item: false,
+		}
+	}
+}
+
+#[derive(Debug, Clone, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct BatchDeleteResponse<'a> {
+	#[serde(flatten, borrow)]
+	#[allow(dead_code)]
+	filter: BatchDeleteOp<'a>,
+	deleted_items: u64,
+}
+
+#[derive(Deserialize)]
+struct ErrorResponse {
+	code: String,
+	message: String,
+	#[allow(dead_code)]
+	region: String,
+	path: String,
+}
+
+struct Response {
+	body: Vec<u8>,
+	status: StatusCode,
+	causality_token: Option<CausalityToken>,
+	content_type: Option<String>,
+}
diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml
index 007cec89..2c2e2bfe 100644
--- a/src/model/Cargo.toml
+++ b/src/model/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "garage_model"
-version = "0.7.0"
+version = "0.8.0"
 authors = ["Alex Auvolat <alex@adnab.me>"]
 edition = "2018"
 license = "AGPL-3.0"
@@ -14,22 +14,22 @@ path = "lib.rs"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-garage_rpc = { version = "0.7.0", path = "../rpc" }
-garage_table = { version = "0.7.0", path = "../table" }
-garage_block = { version = "0.7.0", path = "../block" }
-garage_util = { version = "0.7.0", path = "../util" }
-garage_model_050 = { package = "garage_model", version = "0.5.1" }
+garage_db = { version = "0.8.0", path = "../db" }
+garage_rpc = { version = "0.8.0", path = "../rpc" }
+garage_table = { version = "0.8.0", path = "../table" }
+garage_block = { version = "0.8.0", path = "../block" }
+garage_util = { version = "0.8.0", path = "../util" }
 
 async-trait = "0.1.7"
 arc-swap = "1.0"
+blake2 = "0.9"
 err-derive = "0.3"
 hex = "0.4"
+base64 = "0.13"
 tracing = "0.1.30"
 rand = "0.8"
 zstd = { version = "0.9", default-features = false }
 
-sled = "0.34"
-
 rmp-serde = "0.15"
 serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
 serde_bytes = "0.11"
@@ -39,6 +39,10 @@ futures-util = "0.3"
 tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
 opentelemetry = "0.17"
 
-#netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" }
-#netapp = { version = "0.4", path = "../../../netapp" }
-netapp = "0.4"
+netapp = "0.5"
+
+[features]
+k2v = [ "garage_util/k2v" ]
+lmdb = [ "garage_db/lmdb" ]
+sled = [ "garage_db/sled" ]
+sqlite = [ "garage_db/sqlite" ]
diff --git a/src/model/bucket_alias_table.rs b/src/model/bucket_alias_table.rs
index fce03d04..fcd1536e 100644
--- a/src/model/bucket_alias_table.rs
+++ b/src/model/bucket_alias_table.rs
@@ -7,7 +7,7 @@ use garage_table::*;
 
 /// The bucket alias table holds the names given to buckets
 /// in the global namespace.
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
 pub struct BucketAlias {
 	name: String,
 	pub state: crdt::Lww<Option<Uuid>>,
diff --git a/src/model/bucket_table.rs b/src/model/bucket_table.rs
index 7c7b9f30..7be42702 100644
--- a/src/model/bucket_table.rs
+++ b/src/model/bucket_table.rs
@@ -1,6 +1,6 @@
 use serde::{Deserialize, Serialize};
 
-use garage_table::crdt::Crdt;
+use garage_table::crdt::*;
 use garage_table::*;
 use garage_util::data::*;
 use garage_util::time::*;
@@ -12,7 +12,7 @@ use crate::permission::BucketKeyPerm;
 /// Its parameters are not directly accessible as:
 ///  - It must be possible to merge paramaters, hence the use of a LWW CRDT.
 ///  - A bucket has 2 states, Present or Deleted and parameters make sense only if present.
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
 pub struct Bucket {
 	/// ID of the bucket
 	pub id: Uuid,
@@ -21,7 +21,7 @@ pub struct Bucket {
 }
 
 /// Configuration for a bucket
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
 pub struct BucketParams {
 	/// Bucket's creation date
 	pub creation_date: u64,
@@ -44,6 +44,9 @@ pub struct BucketParams {
 	pub website_config: crdt::Lww<Option<WebsiteConfig>>,
 	/// CORS rules
 	pub cors_config: crdt::Lww<Option<Vec<CorsRule>>>,
+	/// Bucket quotas
+	#[serde(default)]
+	pub quotas: crdt::Lww<BucketQuotas>,
 }
 
 #[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
@@ -62,6 +65,18 @@ pub struct CorsRule {
 	pub expose_headers: Vec<String>,
 }
 
+/// Quotas that can be set on a bucket.
+///
+/// Both fields are `None` in the derived `Default` value.
+/// NOTE(review): `None` presumably means "no limit" — confirm against the code
+/// that enforces quotas.
+#[derive(Default, PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+pub struct BucketQuotas {
+	/// Maximum size in bytes (bucket size = sum of sizes of objects in the bucket)
+	pub max_size: Option<u64>,
+	/// Maximum number of non-deleted objects in the bucket
+	pub max_objects: Option<u64>,
+}
+
+// Lets `BucketQuotas` be used as a CRDT value through the `AutoCrdt` helper trait.
+// NOTE(review): `WARN_IF_DIFFERENT = true` presumably makes a merge of two
+// differing values log a warning — confirm in the `AutoCrdt` definition.
+impl AutoCrdt for BucketQuotas {
+	const WARN_IF_DIFFERENT: bool = true;
+}
+
 impl BucketParams {
 	/// Create an empty BucketParams with no authorized keys and no website accesss
 	pub fn new() -> Self {
@@ -72,6 +87,7 @@ impl BucketParams {
 			local_aliases: crdt::LwwMap::new(),
 			website_config: crdt::Lww::new(None),
 			cors_config: crdt::Lww::new(None),
+			quotas: crdt::Lww::new(BucketQuotas::default()),
 		}
 	}
 }
@@ -86,6 +102,7 @@ impl Crdt for BucketParams {
 
 		self.website_config.merge(&o.website_config);
 		self.cors_config.merge(&o.cors_config);
+		self.quotas.merge(&o.quotas);
 	}
 }
 
diff --git a/src/model/garage.rs b/src/model/garage.rs
index abdb920a..75012952 100644
--- a/src/model/garage.rs
+++ b/src/model/garage.rs
@@ -2,8 +2,11 @@ use std::sync::Arc;
 
 use netapp::NetworkKey;
 
+use garage_db as db;
+
 use garage_util::background::*;
 use garage_util::config::*;
+use garage_util::error::*;
 
 use garage_rpc::system::System;
 
@@ -13,13 +16,18 @@ use garage_table::replication::TableFullReplication;
 use garage_table::replication::TableShardedReplication;
 use garage_table::*;
 
-use crate::block_ref_table::*;
+use crate::s3::block_ref_table::*;
+use crate::s3::object_table::*;
+use crate::s3::version_table::*;
+
 use crate::bucket_alias_table::*;
 use crate::bucket_table::*;
 use crate::helper;
+use crate::index_counter::*;
 use crate::key_table::*;
-use crate::object_table::*;
-use crate::version_table::*;
+
+#[cfg(feature = "k2v")]
+use crate::k2v::{item_table::*, poll::*, rpc::*};
 
 /// An entire Garage full of data
 pub struct Garage {
@@ -27,7 +35,7 @@ pub struct Garage {
 	pub config: Config,
 
 	/// The local database
-	pub db: sled::Db,
+	pub db: db::Db,
 	/// A background job runner
 	pub background: Arc<BackgroundRunner>,
 	/// The membership manager
@@ -35,21 +43,118 @@ pub struct Garage {
 	/// The block manager
 	pub block_manager: Arc<BlockManager>,
 
-	/// Table containing informations about buckets
+	/// Table containing buckets
 	pub bucket_table: Arc<Table<BucketTable, TableFullReplication>>,
-	/// Table containing informations about bucket aliases
+	/// Table containing bucket aliases
 	pub bucket_alias_table: Arc<Table<BucketAliasTable, TableFullReplication>>,
-	/// Table containing informations about api keys
+	/// Table containing api keys
 	pub key_table: Arc<Table<KeyTable, TableFullReplication>>,
 
+	/// Table containing S3 objects
 	pub object_table: Arc<Table<ObjectTable, TableShardedReplication>>,
+	/// Counting table containing object counters
+	pub object_counter_table: Arc<IndexCounter<Object>>,
+	/// Table containing S3 object versions
 	pub version_table: Arc<Table<VersionTable, TableShardedReplication>>,
+	/// Table containing S3 block references (not blocks themselves)
 	pub block_ref_table: Arc<Table<BlockRefTable, TableShardedReplication>>,
+
+	#[cfg(feature = "k2v")]
+	pub k2v: GarageK2V,
+}
+
+/// K2V-specific tables and RPC handler of a Garage instance.
+/// Only compiled in when the `k2v` feature is enabled.
+#[cfg(feature = "k2v")]
+pub struct GarageK2V {
+	/// Table containing K2V items
+	pub item_table: Arc<Table<K2VItemTable, TableShardedReplication>>,
+	/// Indexing table containing K2V item counters
+	pub counter_table: Arc<IndexCounter<K2VItem>>,
+	/// K2V RPC handler
+	pub rpc: Arc<K2VRpcHandler>,
 }
 
 impl Garage {
 	/// Create and run garage
-	pub fn new(config: Config, db: sled::Db, background: Arc<BackgroundRunner>) -> Arc<Self> {
+	pub fn new(config: Config, background: Arc<BackgroundRunner>) -> Result<Arc<Self>, Error> {
+		// Create meta dir and data dir if they don't exist already
+		std::fs::create_dir_all(&config.metadata_dir)
+			.ok_or_message("Unable to create Garage metadata directory")?;
+		std::fs::create_dir_all(&config.data_dir)
+			.ok_or_message("Unable to create Garage data directory")?;
+
+		info!("Opening database...");
+		let mut db_path = config.metadata_dir.clone();
+		let db = match config.db_engine.as_str() {
+			// ---- Sled DB ----
+			#[cfg(feature = "sled")]
+			"sled" => {
+				db_path.push("db");
+				info!("Opening Sled database at: {}", db_path.display());
+				let db = db::sled_adapter::sled::Config::default()
+					.path(&db_path)
+					.cache_capacity(config.sled_cache_capacity)
+					.flush_every_ms(Some(config.sled_flush_every_ms))
+					.open()
+					.expect("Unable to open sled DB");
+				db::sled_adapter::SledDb::init(db)
+			}
+			#[cfg(not(feature = "sled"))]
+			"sled" => return Err(Error::Message("sled db not available in this build".into())),
+			// ---- Sqlite DB ----
+			#[cfg(feature = "sqlite")]
+			"sqlite" | "sqlite3" | "rusqlite" => {
+				db_path.push("db.sqlite");
+				info!("Opening Sqlite database at: {}", db_path.display());
+				let db = db::sqlite_adapter::rusqlite::Connection::open(db_path)
+					.expect("Unable to open sqlite DB");
+				db::sqlite_adapter::SqliteDb::init(db)
+			}
+			#[cfg(not(feature = "sqlite"))]
+			"sqlite" | "sqlite3" | "rusqlite" => {
+				return Err(Error::Message(
+					"sqlite db not available in this build".into(),
+				))
+			}
+			// ---- LMDB DB ----
+			#[cfg(feature = "lmdb")]
+			"lmdb" | "heed" => {
+				db_path.push("db.lmdb");
+				info!("Opening LMDB database at: {}", db_path.display());
+				std::fs::create_dir_all(&db_path).expect("Unable to create LMDB data directory");
+				let map_size = garage_db::lmdb_adapter::recommended_map_size();
+
+				use db::lmdb_adapter::heed;
+				let mut env_builder = heed::EnvOpenOptions::new();
+				env_builder.max_dbs(100);
+				env_builder.max_readers(500);
+				env_builder.map_size(map_size);
+				unsafe {
+					env_builder.flag(heed::flags::Flags::MdbNoSync);
+					env_builder.flag(heed::flags::Flags::MdbNoMetaSync);
+				}
+				let db = env_builder.open(&db_path).expect("Unable to open LMDB DB");
+				db::lmdb_adapter::LmdbDb::init(db)
+			}
+			#[cfg(not(feature = "lmdb"))]
+			"lmdb" | "heed" => return Err(Error::Message("lmdb db not available in this build".into())),
+			// ---- Unavailable DB engine ----
+			e => {
+				return Err(Error::Message(format!(
+					"Unsupported DB engine: {} (options: {})",
+					e,
+					vec![
+						#[cfg(feature = "sled")]
+						"sled",
+						#[cfg(feature = "sqlite")]
+						"sqlite",
+						#[cfg(feature = "lmdb")]
+						"lmdb",
+					]
+					.join(", ")
+				)));
+			}
+		};
+
 		let network_key = NetworkKey::from_slice(
 			&hex::decode(&config.rpc_secret).expect("Invalid RPC secret key")[..],
 		)
@@ -64,7 +169,7 @@ impl Garage {
 			background.clone(),
 			replication_mode.replication_factor(),
 			&config,
-		);
+		)?;
 
 		let data_rep_param = TableShardedReplication {
 			system: system.clone(),
@@ -90,11 +195,25 @@ impl Garage {
 			&db,
 			config.data_dir.clone(),
 			config.compression_level,
-			config.block_manager_background_tranquility,
 			data_rep_param,
 			system.clone(),
 		);
 
+		// ---- admin tables ----
+		info!("Initialize bucket_table...");
+		let bucket_table = Table::new(BucketTable, control_rep_param.clone(), system.clone(), &db);
+
+		info!("Initialize bucket_alias_table...");
+		let bucket_alias_table = Table::new(
+			BucketAliasTable,
+			control_rep_param.clone(),
+			system.clone(),
+			&db,
+		);
+		info!("Initialize key_table_table...");
+		let key_table = Table::new(KeyTable, control_rep_param, system.clone(), &db);
+
+		// ---- S3 tables ----
 		info!("Initialize block_ref_table...");
 		let block_ref_table = Table::new(
 			BlockRefTable {
@@ -116,34 +235,28 @@ impl Garage {
 			&db,
 		);
 
+		info!("Initialize object counter table...");
+		let object_counter_table = IndexCounter::new(system.clone(), meta_rep_param.clone(), &db);
+
 		info!("Initialize object_table...");
+		#[allow(clippy::redundant_clone)]
 		let object_table = Table::new(
 			ObjectTable {
 				background: background.clone(),
 				version_table: version_table.clone(),
+				object_counter_table: object_counter_table.clone(),
 			},
-			meta_rep_param,
-			system.clone(),
-			&db,
-		);
-
-		info!("Initialize bucket_table...");
-		let bucket_table = Table::new(BucketTable, control_rep_param.clone(), system.clone(), &db);
-
-		info!("Initialize bucket_alias_table...");
-		let bucket_alias_table = Table::new(
-			BucketAliasTable,
-			control_rep_param.clone(),
+			meta_rep_param.clone(),
 			system.clone(),
 			&db,
 		);
 
-		info!("Initialize key_table_table...");
-		let key_table = Table::new(KeyTable, control_rep_param, system.clone(), &db);
-
-		info!("Initialize Garage...");
+		// ---- K2V ----
+		#[cfg(feature = "k2v")]
+		let k2v = GarageK2V::new(system.clone(), &db, meta_rep_param);
 
-		Arc::new(Self {
+		// -- done --
+		Ok(Arc::new(Self {
 			config,
 			db,
 			background,
@@ -153,12 +266,46 @@ impl Garage {
 			bucket_alias_table,
 			key_table,
 			object_table,
+			object_counter_table,
 			version_table,
 			block_ref_table,
-		})
+			#[cfg(feature = "k2v")]
+			k2v,
+		}))
 	}
 
 	pub fn bucket_helper(&self) -> helper::bucket::BucketHelper {
 		helper::bucket::BucketHelper(self)
 	}
+
+	/// Returns a `KeyHelper` that borrows this `Garage` instance.
+	pub fn key_helper(&self) -> helper::key::KeyHelper {
+		helper::key::KeyHelper(self)
+	}
+}
+
+#[cfg(feature = "k2v")]
+impl GarageK2V {
+	/// Builds, in this order, the K2V counter table, the subscription manager,
+	/// the item table and the RPC handler. Tables are opened in `db`.
+	fn new(system: Arc<System>, db: &db::Db, meta_rep_param: TableShardedReplication) -> Self {
+		info!("Initialize K2V counter table...");
+		let counters = IndexCounter::new(system.clone(), meta_rep_param.clone(), db);
+
+		info!("Initialize K2V subscription manager...");
+		let subs = Arc::new(SubscriptionManager::new());
+
+		info!("Initialize K2V item table...");
+		let schema = K2VItemTable {
+			counter_table: counters.clone(),
+			subscriptions: subs.clone(),
+		};
+		let items = Table::new(schema, meta_rep_param, system.clone(), db);
+
+		// The RPC handler is created last: it needs the item table
+		// and takes ownership of the subscription manager handle.
+		Self {
+			rpc: K2VRpcHandler::new(system, items.clone(), subs),
+			item_table: items,
+			counter_table: counters,
+		}
+	}
 }
diff --git a/src/model/helper/bucket.rs b/src/model/helper/bucket.rs
index 706faf26..130ba5be 100644
--- a/src/model/helper/bucket.rs
+++ b/src/model/helper/bucket.rs
@@ -1,15 +1,18 @@
-use garage_table::util::EmptyKey;
 use garage_util::crdt::*;
 use garage_util::data::*;
 use garage_util::error::{Error as GarageError, OkOrMessage};
 use garage_util::time::*;
 
+use garage_table::util::*;
+
 use crate::bucket_alias_table::*;
 use crate::bucket_table::*;
 use crate::garage::Garage;
 use crate::helper::error::*;
-use crate::key_table::{Key, KeyFilter};
+use crate::helper::key::KeyHelper;
+use crate::key_table::*;
 use crate::permission::BucketKeyPerm;
+use crate::s3::object_table::ObjectFilter;
 
 pub struct BucketHelper<'a>(pub(crate) &'a Garage);
 
@@ -49,6 +52,23 @@ impl<'a> BucketHelper<'a> {
 		}
 	}
 
+	/// Resolves a bucket name to a bucket ID from the point of view of `api_key`.
+	///
+	/// A local alias of the key that points to a bucket takes precedence.
+	/// Otherwise the name is resolved through `resolve_global_bucket_name`.
+	/// Returns `Error::NoSuchBucket` if neither lookup yields a bucket.
+	#[allow(clippy::ptr_arg)]
+	pub async fn resolve_bucket(&self, bucket_name: &String, api_key: &Key) -> Result<Uuid, Error> {
+		let key_params = api_key
+			.state
+			.as_option()
+			.ok_or_message("Key should not be deleted at this point")?;
+
+		// Local alias found and still pointing to a bucket: use it
+		if let Some(Some(local_id)) = key_params.local_aliases.get(bucket_name) {
+			return Ok(*local_id);
+		}
+
+		self.resolve_global_bucket_name(bucket_name)
+			.await?
+			.ok_or_else(|| Error::NoSuchBucket(bucket_name.to_string()))
+	}
+
 	/// Returns a Bucket if it is present in bucket table,
 	/// even if it is in deleted state. Querying a non-existing
 	/// bucket ID returns an internal error.
@@ -71,63 +91,7 @@ impl<'a> BucketHelper<'a> {
 			.get(&EmptyKey, &bucket_id)
 			.await?
 			.filter(|b| !b.is_deleted())
-			.ok_or_bad_request(format!(
-				"Bucket {:?} does not exist or has been deleted",
-				bucket_id
-			))
-	}
-
-	/// Returns a Key if it is present in key table,
-	/// even if it is in deleted state. Querying a non-existing
-	/// key ID returns an internal error.
-	pub async fn get_internal_key(&self, key_id: &String) -> Result<Key, Error> {
-		Ok(self
-			.0
-			.key_table
-			.get(&EmptyKey, key_id)
-			.await?
-			.ok_or_message(format!("Key {} does not exist", key_id))?)
-	}
-
-	/// Returns a Key if it is present in key table,
-	/// only if it is in non-deleted state.
-	/// Querying a non-existing key ID or a deleted key
-	/// returns a bad request error.
-	pub async fn get_existing_key(&self, key_id: &String) -> Result<Key, Error> {
-		self.0
-			.key_table
-			.get(&EmptyKey, key_id)
-			.await?
-			.filter(|b| !b.state.is_deleted())
-			.ok_or_bad_request(format!("Key {} does not exist or has been deleted", key_id))
-	}
-
-	/// Returns a Key if it is present in key table,
-	/// looking it up by key ID or by a match on its name,
-	/// only if it is in non-deleted state.
-	/// Querying a non-existing key ID or a deleted key
-	/// returns a bad request error.
-	pub async fn get_existing_matching_key(&self, pattern: &str) -> Result<Key, Error> {
-		let candidates = self
-			.0
-			.key_table
-			.get_range(
-				&EmptyKey,
-				None,
-				Some(KeyFilter::MatchesAndNotDeleted(pattern.to_string())),
-				10,
-			)
-			.await?
-			.into_iter()
-			.collect::<Vec<_>>();
-		if candidates.len() != 1 {
-			Err(Error::BadRequest(format!(
-				"{} matching keys",
-				candidates.len()
-			)))
-		} else {
-			Ok(candidates.into_iter().next().unwrap())
-		}
+			.ok_or_else(|| Error::NoSuchBucket(hex::encode(bucket_id)))
 	}
 
 	/// Sets a new alias for a bucket in global namespace.
@@ -141,10 +105,7 @@ impl<'a> BucketHelper<'a> {
 		alias_name: &String,
 	) -> Result<(), Error> {
 		if !is_valid_bucket_name(alias_name) {
-			return Err(Error::BadRequest(format!(
-				"{}: {}",
-				alias_name, INVALID_BUCKET_NAME_MESSAGE
-			)));
+			return Err(Error::InvalidBucketName(alias_name.to_string()));
 		}
 
 		let mut bucket = self.get_existing_bucket(bucket_id).await?;
@@ -175,7 +136,7 @@ impl<'a> BucketHelper<'a> {
 
 		let alias = match alias {
 			None => BucketAlias::new(alias_name.clone(), alias_ts, Some(bucket_id))
-				.ok_or_bad_request(format!("{}: {}", alias_name, INVALID_BUCKET_NAME_MESSAGE))?,
+				.ok_or_else(|| Error::InvalidBucketName(alias_name.clone()))?,
 			Some(mut a) => {
 				a.state = Lww::raw(alias_ts, Some(bucket_id));
 				a
@@ -263,7 +224,7 @@ impl<'a> BucketHelper<'a> {
 			.bucket_alias_table
 			.get(&EmptyKey, alias_name)
 			.await?
-			.ok_or_message(format!("Alias {} not found", alias_name))?;
+			.ok_or_else(|| Error::NoSuchBucket(alias_name.to_string()))?;
 
 		// Checks ok, remove alias
 		let alias_ts = match bucket.state.as_option() {
@@ -302,15 +263,14 @@ impl<'a> BucketHelper<'a> {
 		key_id: &String,
 		alias_name: &String,
 	) -> Result<(), Error> {
+		let key_helper = KeyHelper(self.0);
+
 		if !is_valid_bucket_name(alias_name) {
-			return Err(Error::BadRequest(format!(
-				"{}: {}",
-				alias_name, INVALID_BUCKET_NAME_MESSAGE
-			)));
+			return Err(Error::InvalidBucketName(alias_name.to_string()));
 		}
 
 		let mut bucket = self.get_existing_bucket(bucket_id).await?;
-		let mut key = self.get_existing_key(key_id).await?;
+		let mut key = key_helper.get_existing_key(key_id).await?;
 
 		let mut key_param = key.state.as_option_mut().unwrap();
 
@@ -359,8 +319,10 @@ impl<'a> BucketHelper<'a> {
 		key_id: &String,
 		alias_name: &String,
 	) -> Result<(), Error> {
+		let key_helper = KeyHelper(self.0);
+
 		let mut bucket = self.get_existing_bucket(bucket_id).await?;
-		let mut key = self.get_existing_key(key_id).await?;
+		let mut key = key_helper.get_existing_key(key_id).await?;
 
 		let mut bucket_p = bucket.state.as_option_mut().unwrap();
 
@@ -428,8 +390,10 @@ impl<'a> BucketHelper<'a> {
 		key_id: &String,
 		mut perm: BucketKeyPerm,
 	) -> Result<(), Error> {
+		let key_helper = KeyHelper(self.0);
+
 		let mut bucket = self.get_internal_bucket(bucket_id).await?;
-		let mut key = self.get_internal_key(key_id).await?;
+		let mut key = key_helper.get_internal_key(key_id).await?;
 
 		if let Some(bstate) = bucket.state.as_option() {
 			if let Some(kp) = bstate.authorized_keys.get(key_id) {
@@ -465,4 +429,47 @@ impl<'a> BucketHelper<'a> {
 
 		Ok(())
 	}
+
+	/// Tells whether a bucket contains no data.
+	///
+	/// Returns `Ok(false)` as soon as the object table yields an entry matching
+	/// `ObjectFilter::IsData` for this bucket or, when the `k2v` feature is
+	/// enabled, the K2V counter table yields a non-deleted entry for it.
+	pub async fn is_bucket_empty(&self, bucket_id: Uuid) -> Result<bool, Error> {
+		let garage = self.0;
+
+		let s3_objects = garage
+			.object_table
+			.get_range(
+				&bucket_id,
+				None,
+				Some(ObjectFilter::IsData),
+				10,
+				EnumerationOrder::Forward,
+			)
+			.await?;
+		if !s3_objects.is_empty() {
+			return Ok(false);
+		}
+
+		#[cfg(feature = "k2v")]
+		{
+			use garage_rpc::ring::Ring;
+			use std::sync::Arc;
+
+			// The counter filter takes the node list of the current layout
+			let ring: Arc<Ring> = garage.system.ring.borrow().clone();
+			let layout_nodes = ring.layout.node_id_vec.clone();
+
+			let k2v_counters = garage
+				.k2v
+				.counter_table
+				.table
+				.get_range(
+					&bucket_id,
+					None,
+					Some((DeletedFilter::NotDeleted, layout_nodes)),
+					10,
+					EnumerationOrder::Forward,
+				)
+				.await?;
+			if !k2v_counters.is_empty() {
+				return Ok(false);
+			}
+		}
+
+		Ok(true)
+	}
 }
diff --git a/src/model/helper/error.rs b/src/model/helper/error.rs
index 30b2ba32..3ca8f55c 100644
--- a/src/model/helper/error.rs
+++ b/src/model/helper/error.rs
@@ -10,6 +10,16 @@ pub enum Error {
 
 	#[error(display = "Bad request: {}", _0)]
 	BadRequest(String),
+
+	/// Bucket name is not valid according to AWS S3 specs
+	#[error(display = "Invalid bucket name: {}", _0)]
+	InvalidBucketName(String),
+
+	#[error(display = "Access key not found: {}", _0)]
+	NoSuchAccessKey(String),
+
+	#[error(display = "Bucket not found: {}", _0)]
+	NoSuchBucket(String),
 }
 
 impl From<netapp::error::Error> for Error {
diff --git a/src/model/helper/key.rs b/src/model/helper/key.rs
new file mode 100644
index 00000000..c1a8e974
--- /dev/null
+++ b/src/model/helper/key.rs
@@ -0,0 +1,102 @@
+use garage_table::util::*;
+use garage_util::crdt::*;
+use garage_util::error::OkOrMessage;
+
+use crate::garage::Garage;
+use crate::helper::bucket::BucketHelper;
+use crate::helper::error::*;
+use crate::key_table::{Key, KeyFilter};
+use crate::permission::BucketKeyPerm;
+
+/// Helper giving access to API key operations on a borrowed `Garage` instance.
+pub struct KeyHelper<'a>(pub(crate) &'a Garage);
+
+#[allow(clippy::ptr_arg)]
+impl<'a> KeyHelper<'a> {
+	/// Fetches a Key from the key table whatever its state
+	/// (a key in deleted state is returned as well).
+	/// If the key table has no entry for `key_id`,
+	/// an internal error is returned.
+	pub async fn get_internal_key(&self, key_id: &String) -> Result<Key, Error> {
+		let maybe_key = self.0.key_table.get(&EmptyKey, key_id).await?;
+		let key = maybe_key.ok_or_message(format!("Key {} does not exist", key_id))?;
+		Ok(key)
+	}
+
+	/// Fetches a Key from the key table,
+	/// only if it is in non-deleted state.
+	/// Querying a non-existing key ID or a deleted key
+	/// returns `Error::NoSuchAccessKey`.
+	pub async fn get_existing_key(&self, key_id: &String) -> Result<Key, Error> {
+		let maybe_key = self.0.key_table.get(&EmptyKey, key_id).await?;
+		match maybe_key {
+			Some(key) if !key.state.is_deleted() => Ok(key),
+			_ => Err(Error::NoSuchAccessKey(key_id.to_string())),
+		}
+	}
+
+	/// Looks up a non-deleted Key using the
+	/// `KeyFilter::MatchesAndNotDeleted` filter built from `pattern`.
+	/// The lookup succeeds only if exactly one key matches;
+	/// otherwise a bad request error stating the number of
+	/// matching keys is returned.
+	pub async fn get_existing_matching_key(&self, pattern: &str) -> Result<Key, Error> {
+		let filter = KeyFilter::MatchesAndNotDeleted(pattern.to_string());
+		let mut matching = self
+			.0
+			.key_table
+			.get_range(
+				&EmptyKey,
+				None,
+				Some(filter),
+				10,
+				EnumerationOrder::Forward,
+			)
+			.await?
+			.into_iter()
+			.collect::<Vec<_>>();
+
+		let n_matching = matching.len();
+		match (n_matching, matching.pop()) {
+			(1, Some(key)) => Ok(key),
+			_ => Err(Error::BadRequest(format!(
+				"{} matching keys",
+				n_matching
+			))),
+		}
+	}
+
+	/// Deletes an API access key.
+	///
+	/// The following steps are executed in order and are not atomic: if one of
+	/// them fails, its error is returned and the previous steps are not undone.
+	///  1. all local bucket aliases defined by the key are unset,
+	///  2. the key's permissions on all its authorized buckets are cleared,
+	///  3. the key is marked as deleted in the key table.
+	///
+	/// Returns an error if `key` is already in deleted state.
+	pub async fn delete_key(&self, key: &mut Key) -> Result<(), Error> {
+		let bucket_helper = BucketHelper(self.0);
+
+		// A key in deleted state has no parameters left: report this as an
+		// error to the caller rather than panicking on `unwrap()`.
+		let state = key
+			.state
+			.as_option_mut()
+			.ok_or_message("Cannot delete key: key is already deleted")?;
+
+		// --- done checking, now commit ---
+		// (the step at unset_local_bucket_alias will fail if a bucket
+		// does not have another alias, the deletion will be
+		// interrupted in the middle if that happens)
+
+		// 1. Delete local aliases
+		for (alias, _, to) in state.local_aliases.items().iter() {
+			if let Some(bucket_id) = to {
+				bucket_helper
+					.unset_local_bucket_alias(*bucket_id, &key.key_id, alias)
+					.await?;
+			}
+		}
+
+		// 2. Remove permissions on all authorized buckets
+		for (ab_id, _auth) in state.authorized_buckets.items().iter() {
+			bucket_helper
+				.set_bucket_key_permissions(*ab_id, &key.key_id, BucketKeyPerm::NO_PERMISSIONS)
+				.await?;
+		}
+
+		// 3. Actually delete key
+		key.state = Deletable::delete();
+		self.0.key_table.insert(key).await?;
+
+		Ok(())
+	}
+}
diff --git a/src/model/helper/mod.rs b/src/model/helper/mod.rs
index 2f4e8898..dd947c86 100644
--- a/src/model/helper/mod.rs
+++ b/src/model/helper/mod.rs
@@ -1,2 +1,3 @@
 pub mod bucket;
 pub mod error;
+pub mod key;
diff --git a/src/model/index_counter.rs b/src/model/index_counter.rs
new file mode 100644
index 00000000..e6394f0c
--- /dev/null
+++ b/src/model/index_counter.rs
@@ -0,0 +1,496 @@
+use core::ops::Bound;
+use std::collections::{hash_map, BTreeMap, HashMap};
+use std::marker::PhantomData;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+use tokio::sync::{mpsc, watch};
+
+use garage_db as db;
+
+use garage_rpc::ring::Ring;
+use garage_rpc::system::System;
+use garage_util::background::*;
+use garage_util::data::*;
+use garage_util::error::*;
+use garage_util::time::*;
+
+use garage_table::crdt::*;
+use garage_table::replication::*;
+use garage_table::*;
+
+/// Trait for table entries whose contribution to a set of named counters
+/// is tracked by an `IndexCounter`.
+pub trait CountedItem: Clone + PartialEq + Send + Sync + 'static {
+	/// Name of the table in which counters for this item type are stored
+	/// (also used to name the local counter tree)
+	const COUNTER_TABLE_NAME: &'static str;
+
+	/// Type of the partition key of counter entries
+	type CP: PartitionKey + Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync;
+	/// Type of the sort key of counter entries
+	type CS: SortKey + Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync;
+
+	/// Partition key of the counter entry this item is counted in
+	fn counter_partition_key(&self) -> &Self::CP;
+	/// Sort key of the counter entry this item is counted in
+	fn counter_sort_key(&self) -> &Self::CS;
+	/// Named amounts this item contributes to its counter entry
+	fn counts(&self) -> Vec<(&'static str, i64)>;
+}
+
+/// A counter entry in the global table
+#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
+pub struct CounterEntry<T: CountedItem> {
+	/// Partition key of the entry
+	pub pk: T::CP,
+	/// Sort key of the entry
+	pub sk: T::CS,
+	/// For each counter name, the values reported by the different nodes
+	pub values: BTreeMap<String, CounterValue>,
+}
+
+impl<T: CountedItem> Entry<T::CP, T::CS> for CounterEntry<T> {
+	fn partition_key(&self) -> &T::CP {
+		&self.pk
+	}
+	fn sort_key(&self) -> &T::CS {
+		&self.sk
+	}
+	/// An entry is a tombstone when, for every counter, the value reported
+	/// by every node is zero (this holds trivially when there are no values).
+	fn is_tombstone(&self) -> bool {
+		self.values
+			.values()
+			.flat_map(|counter| counter.node_values.values())
+			.all(|(_ts, val)| *val == 0)
+	}
+}
+
+impl<T: CountedItem> CounterEntry<T> {
+	/// Same as `filtered_values_with_nodes`, using the node list
+	/// found in the layout of `ring`.
+	pub fn filtered_values(&self, ring: &Ring) -> HashMap<String, i64> {
+		self.filtered_values_with_nodes(&ring.layout.node_id_vec[..])
+	}
+
+	/// For each counter, returns the maximum of the values reported by the
+	/// nodes listed in `nodes`. A counter for which none of these nodes has
+	/// a value is absent from the returned map.
+	pub fn filtered_values_with_nodes(&self, nodes: &[Uuid]) -> HashMap<String, i64> {
+		self.values
+			.iter()
+			.filter_map(|(name, vals)| {
+				vals.node_values
+					.iter()
+					.filter(|(n, _)| nodes.contains(n))
+					.map(|(_, (_, v))| *v)
+					.max()
+					.map(|highest| (name.clone(), highest))
+			})
+			.collect()
+	}
+}
+
+/// The values of one named counter, as reported by each node
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct CounterValue {
+	/// For each node ID, a pair whose second element is the counter value
+	/// and whose first element is compared on merge to decide which value
+	/// is the most recent one (a timestamp)
+	pub node_values: BTreeMap<Uuid, (u64, i64)>,
+}
+
+impl<T: CountedItem> Crdt for CounterEntry<T> {
+	/// Merges counters name by name: a counter present on both sides is
+	/// merged as a `CounterValue`, a counter only present in `other` is copied.
+	fn merge(&mut self, other: &Self) {
+		for (name, theirs) in other.values.iter() {
+			match self.values.get_mut(name) {
+				Some(ours) => ours.merge(theirs),
+				None => {
+					self.values.insert(name.clone(), theirs.clone());
+				}
+			}
+		}
+	}
+}
+
+impl Crdt for CounterValue {
+	/// Merges per-node values: for each node, the (timestamp, value) pair
+	/// with the strictly greater timestamp is kept; nodes that are only
+	/// known to `other` are copied into `self`.
+	fn merge(&mut self, other: &Self) {
+		for (node, (t2, e2)) in other.node_values.iter() {
+			if let Some((t, e)) = self.node_values.get_mut(node) {
+				if *t2 > *t {
+					// Timestamp and value must move together: keeping the old
+					// timestamp next to the new value would let a later merge
+					// with an intermediate timestamp bring back a stale value.
+					*t = *t2;
+					*e = *e2;
+				}
+			} else {
+				self.node_values.insert(*node, (*t2, *e2));
+			}
+		}
+	}
+}
+
+/// Table schema for the counters of items of type `T`.
+/// Holds no data: `T` only appears as a type parameter.
+pub struct CounterTable<T: CountedItem> {
+	_phantom_t: PhantomData<T>,
+}
+
+impl<T: CountedItem> TableSchema for CounterTable<T> {
+	const TABLE_NAME: &'static str = T::COUNTER_TABLE_NAME;
+
+	type P = T::CP;
+	type S = T::CS;
+	type E = CounterEntry<T>;
+	/// (deletion filter, IDs of the nodes whose values are taken into account)
+	type Filter = (DeletedFilter, Vec<Uuid>);
+
+	/// With `DeletedFilter::Any` every entry matches. Otherwise the entry is
+	/// considered deleted when all its values, restricted to the nodes given
+	/// in the filter, are zero, and the deletion filter is applied to that.
+	fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
+		let (deleted_filter, nodes) = filter;
+		if *deleted_filter == DeletedFilter::Any {
+			return true;
+		}
+
+		let all_zero = entry
+			.filtered_values_with_nodes(&nodes[..])
+			.values()
+			.all(|v| *v == 0);
+		deleted_filter.apply(all_zero)
+	}
+}
+
+// ----
+
+/// Maintains counters for items of type `T`: this node's own counts are kept
+/// in a local tree, and counter entries are stored in a sharded table.
+pub struct IndexCounter<T: CountedItem> {
+	/// ID of the local node
+	this_node: Uuid,
+	/// Local tree holding the counter values computed by this node
+	local_counter: db::Tree,
+	/// Sending end of the channel on which updated local counters
+	/// are handed over to the propagator worker
+	propagate_tx: mpsc::UnboundedSender<(T::CP, T::CS, LocalCounterEntry<T>)>,
+	/// Table in which counter entries are stored
+	pub table: Arc<Table<CounterTable<T>, TableShardedReplication>>,
+}
+
+impl<T: CountedItem> IndexCounter<T> {
+	/// Opens the local counter tree and the counter table in `db`, and
+	/// spawns the background worker that consumes the propagation channel.
+	///
+	/// Panics if the local counter tree cannot be opened.
+	pub fn new(
+		system: Arc<System>,
+		replication: TableShardedReplication,
+		db: &db::Db,
+	) -> Arc<Self> {
+		// Taken before `system` is moved into the table below
+		let background = system.background.clone();
+		let this_node = system.id;
+
+		let (propagate_tx, propagate_rx) = mpsc::unbounded_channel();
+
+		let local_counter = db
+			.open_tree(format!("local_counter_v2:{}", T::COUNTER_TABLE_NAME))
+			.expect("Unable to open local counter tree");
+		let table = Table::new(
+			CounterTable {
+				_phantom_t: PhantomData,
+			},
+			replication,
+			system,
+			db,
+		);
+
+		let this = Arc::new(Self {
+			this_node,
+			local_counter,
+			propagate_tx,
+			table,
+		});
+
+		let worker = IndexPropagatorWorker {
+			index_counter: this.clone(),
+			propagate_rx,
+			buf: HashMap::new(),
+			errors: 0,
+		};
+		background.spawn_worker(worker);
+
+		this
+	}
+
+	/// Updates the local counters, within transaction `tx`, to reflect the
+	/// replacement of item `old` by item `new` (either can be `None`).
+	///
+	/// The counter entry keys are taken from `old` if present, otherwise
+	/// from `new`. The updated local entry is written to the local counter
+	/// tree and then sent on the propagation channel; a failure to send is
+	/// only logged.
+	///
+	/// # Panics
+	///
+	/// Panics if both `old` and `new` are `None`.
+	pub fn count(
+		&self,
+		tx: &mut db::Transaction,
+		old: Option<&T>,
+		new: Option<&T>,
+	) -> db::TxResult<(), Error> {
+		let pk = old
+			.map(|e| e.counter_partition_key())
+			.unwrap_or_else(|| new.unwrap().counter_partition_key());
+		let sk = old
+			.map(|e| e.counter_sort_key())
+			.unwrap_or_else(|| new.unwrap().counter_sort_key());
+
+		// calculate counter differences
+		// (counts of the old item are subtracted, counts of the new item are added)
+		let mut counts = HashMap::new();
+		for (k, v) in old.map(|x| x.counts()).unwrap_or_default() {
+			*counts.entry(k).or_insert(0) -= v;
+		}
+		for (k, v) in new.map(|x| x.counts()).unwrap_or_default() {
+			*counts.entry(k).or_insert(0) += v;
+		}
+
+		// update local counter table
+		let tree_key = self.table.data.tree_key(pk, sk);
+
+		// Read the current local entry through the transaction, or start
+		// from an empty one; a decoding error aborts the transaction
+		let mut entry = match tx.get(&self.local_counter, &tree_key[..])? {
+			Some(old_bytes) => {
+				rmp_serde::decode::from_read_ref::<_, LocalCounterEntry<T>>(&old_bytes)
+					.map_err(Error::RmpDecode)
+					.map_err(db::TxError::Abort)?
+			}
+			None => LocalCounterEntry {
+				pk: pk.clone(),
+				sk: sk.clone(),
+				values: BTreeMap::new(),
+			},
+		};
+
+		let now = now_msec();
+		for (s, inc) in counts.iter() {
+			let mut ent = entry.values.entry(s.to_string()).or_insert((0, 0));
+			// The timestamp always strictly increases, even if `now`
+			// is not greater than the previous timestamp
+			ent.0 = std::cmp::max(ent.0 + 1, now);
+			ent.1 += *inc;
+		}
+
+		let new_entry_bytes = rmp_to_vec_all_named(&entry)
+			.map_err(Error::RmpEncode)
+			.map_err(db::TxError::Abort)?;
+		tx.insert(&self.local_counter, &tree_key[..], new_entry_bytes)?;
+
+		// Hand the updated entry over to the propagator worker
+		if let Err(e) = self.propagate_tx.send((pk.clone(), sk.clone(), entry)) {
+			error!(
+				"Could not propagate updated counter values, failed to send to channel: {}",
+				e
+			);
+		}
+
+		Ok(())
+	}
+
+	/// Recomputes all local counters from the content of `counted_table`.
+	///
+	/// First, every value of every existing local counter is reset to zero.
+	/// Then all entries found in the local store of `counted_table` are
+	/// counted again. After each update of a local counter, the resulting
+	/// counter entry is merged into the local data of the counter table.
+	///
+	/// NOTE(review): as the name suggests, this presumably must not run
+	/// while counters are being updated through `count` — confirm with callers.
+	pub fn offline_recount_all<TS, TR>(
+		&self,
+		counted_table: &Arc<Table<TS, TR>>,
+	) -> Result<(), Error>
+	where
+		TS: TableSchema<E = T>,
+		TR: TableReplication,
+	{
+		// Merges a counter entry into the one stored locally in the counter
+		// table, or stores it as is if there was none
+		let save_counter_entry = |entry: CounterEntry<T>| -> Result<(), Error> {
+			let entry_k = self
+				.table
+				.data
+				.tree_key(entry.partition_key(), entry.sort_key());
+			self.table
+				.data
+				.update_entry_with(&entry_k, |ent| match ent {
+					Some(mut ent) => {
+						ent.merge(&entry);
+						ent
+					}
+					None => entry.clone(),
+				})?;
+			Ok(())
+		};
+
+		// 1. Set all old local counters to zero
+		let now = now_msec();
+		let mut next_start: Option<Vec<u8>> = None;
+		loop {
+			// Resume right after the last key handled in the previous batch
+			let low_bound = match next_start.take() {
+				Some(v) => Bound::Excluded(v),
+				None => Bound::Unbounded,
+			};
+			// Read a batch of at most 1001 items
+			let mut batch = vec![];
+			for item in self.local_counter.range((low_bound, Bound::Unbounded))? {
+				batch.push(item?);
+				if batch.len() > 1000 {
+					break;
+				}
+			}
+
+			if batch.is_empty() {
+				break;
+			}
+
+			info!("zeroing old counters... ({})", hex::encode(&batch[0].0));
+			for (local_counter_k, local_counter) in batch {
+				let mut local_counter =
+					rmp_serde::decode::from_read_ref::<_, LocalCounterEntry<T>>(&local_counter)?;
+
+				// Zero each value with a strictly greater timestamp
+				for (_, tv) in local_counter.values.iter_mut() {
+					tv.0 = std::cmp::max(tv.0 + 1, now);
+					tv.1 = 0;
+				}
+
+				let local_counter_bytes = rmp_to_vec_all_named(&local_counter)?;
+				self.local_counter
+					.insert(&local_counter_k, &local_counter_bytes)?;
+
+				let counter_entry = local_counter.into_counter_entry(self.this_node);
+				save_counter_entry(counter_entry)?;
+
+				next_start = Some(local_counter_k);
+			}
+		}
+
+		// 2. Recount all table entries
+		let now = now_msec();
+		let mut next_start: Option<Vec<u8>> = None;
+		loop {
+			// Same batched scan as above, this time over the counted table
+			let low_bound = match next_start.take() {
+				Some(v) => Bound::Excluded(v),
+				None => Bound::Unbounded,
+			};
+			let mut batch = vec![];
+			for item in counted_table
+				.data
+				.store
+				.range((low_bound, Bound::Unbounded))?
+			{
+				batch.push(item?);
+				if batch.len() > 1000 {
+					break;
+				}
+			}
+
+			if batch.is_empty() {
+				break;
+			}
+
+			info!("counting entries... ({})", hex::encode(&batch[0].0));
+			for (counted_entry_k, counted_entry) in batch {
+				let counted_entry = counted_table.data.decode_entry(&counted_entry)?;
+
+				let pk = counted_entry.counter_partition_key();
+				let sk = counted_entry.counter_sort_key();
+				let counts = counted_entry.counts();
+
+				// Load the local counter this entry is counted in, or start
+				// from an empty one
+				let local_counter_key = self.table.data.tree_key(pk, sk);
+				let mut local_counter = match self.local_counter.get(&local_counter_key)? {
+					Some(old_bytes) => {
+						let ent = rmp_serde::decode::from_read_ref::<_, LocalCounterEntry<T>>(
+							&old_bytes,
+						)?;
+						// The stored entry must carry the keys it is stored under
+						assert!(ent.pk == *pk);
+						assert!(ent.sk == *sk);
+						ent
+					}
+					None => LocalCounterEntry {
+						pk: pk.clone(),
+						sk: sk.clone(),
+						values: BTreeMap::new(),
+					},
+				};
+				// Add the counts of this entry, with a strictly greater timestamp
+				for (s, v) in counts.iter() {
+					let mut tv = local_counter.values.entry(s.to_string()).or_insert((0, 0));
+					tv.0 = std::cmp::max(tv.0 + 1, now);
+					tv.1 += v;
+				}
+
+				let local_counter_bytes = rmp_to_vec_all_named(&local_counter)?;
+				self.local_counter
+					.insert(&local_counter_key, local_counter_bytes)?;
+
+				let counter_entry = local_counter.into_counter_entry(self.this_node);
+				save_counter_entry(counter_entry)?;
+
+				next_start = Some(counted_entry_k);
+			}
+		}
+
+		// Done
+		Ok(())
+	}
+}
+
+struct IndexPropagatorWorker<T: CountedItem> {
+	index_counter: Arc<IndexCounter<T>>,
+	propagate_rx: mpsc::UnboundedReceiver<(T::CP, T::CS, LocalCounterEntry<T>)>,
+
+	buf: HashMap<Vec<u8>, CounterEntry<T>>,
+	errors: usize,
+}
+
+impl<T: CountedItem> IndexPropagatorWorker<T> {
+	/// Queue a local counter update for propagation, merging it into any
+	/// update already buffered under the same table key.
+	fn add_ent(&mut self, pk: T::CP, sk: T::CS, counters: LocalCounterEntry<T>) {
+		let counter = &self.index_counter;
+		let key = counter.table.data.tree_key(&pk, &sk);
+		let update = counters.into_counter_entry(counter.this_node);
+		if let Some(queued) = self.buf.get_mut(&key) {
+			queued.merge(&update);
+		} else {
+			self.buf.insert(key, update);
+		}
+	}
+}
+
+#[async_trait]
+impl<T: CountedItem> Worker for IndexPropagatorWorker<T> {
+	fn name(&self) -> String {
+		format!("{} index counter propagator", T::COUNTER_TABLE_NAME)
+	}
+
+	fn info(&self) -> Option<String> {
+		// Nothing is reported while the queue is empty
+		match self.buf.len() {
+			0 => None,
+			n => Some(format!("{} items in queue", n)),
+		}
+	}
+
+	async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+		// This loop batches updates to counters to be sent all at once.
+		// They are sent once the propagate_rx channel has been emptied (or is closed).
+		let closed = loop {
+			match self.propagate_rx.try_recv() {
+				Ok((pk, sk, counters)) => {
+					self.add_ent(pk, sk, counters);
+				}
+				Err(mpsc::error::TryRecvError::Empty) => break false,
+				Err(mpsc::error::TryRecvError::Disconnected) => break true,
+			}
+		};
+
+		if !self.buf.is_empty() {
+			let entries_k = self.buf.keys().take(100).cloned().collect::<Vec<_>>();
+			let entries = entries_k.iter().map(|k| self.buf.get(k).unwrap());
+			if let Err(e) = self.index_counter.table.insert_many(entries).await {
+				self.errors += 1;
+				if self.errors >= 2 && *must_exit.borrow() {
+					error!("({}) Could not propagate {} counter values: {}, these counters will not be updated correctly.", T::COUNTER_TABLE_NAME, self.buf.len(), e);
+					return Ok(WorkerState::Done);
+				}
+				// Propagate error up to worker manager, it will log it, increment a counter,
+				// and sleep for a certain delay (with exponential backoff), waiting for
+				// things to go back to normal
+				return Err(e);
+			} else {
+				for k in entries_k {
+					self.buf.remove(&k);
+				}
+				self.errors = 0;
+			}
+
+			return Ok(WorkerState::Busy);
+		} else if closed {
+			return Ok(WorkerState::Done);
+		} else {
+			return Ok(WorkerState::Idle);
+		}
+	}
+
+	async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState {
+		if let Some((pk, sk, counters)) = self.propagate_rx.recv().await {
+			self.add_ent(pk, sk, counters);
+			return WorkerState::Busy;
+		}
+		// Channel closed: stay busy until the buffered updates are flushed.
+		if self.buf.is_empty() {
+			WorkerState::Done
+		} else {
+			WorkerState::Busy
+		}
+	}
+}
+
+#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+struct LocalCounterEntry<T: CountedItem> {
+	pk: T::CP,
+	sk: T::CS,
+	values: BTreeMap<String, (u64, i64)>,
+}
+
+impl<T: CountedItem> LocalCounterEntry<T> {
+	/// Convert this node-local entry into a `CounterEntry`: every named
+	/// counter keeps its `(timestamp, value)` pair, now recorded as the
+	/// contribution of `this_node` alone.
+	fn into_counter_entry(self, this_node: Uuid) -> CounterEntry<T> {
+		let Self { pk, sk, values } = self;
+		let values = values
+			.into_iter()
+			.map(|(name, local)| {
+				// A map holding a single entry, keyed by this node
+				let node_values = std::iter::once((this_node, local)).collect();
+				(name, CounterValue { node_values })
+			})
+			.collect();
+		CounterEntry { pk, sk, values }
+	}
+}
diff --git a/src/model/k2v/causality.rs b/src/model/k2v/causality.rs
new file mode 100644
index 00000000..9a692870
--- /dev/null
+++ b/src/model/k2v/causality.rs
@@ -0,0 +1,96 @@
+use std::collections::BTreeMap;
+use std::convert::TryInto;
+
+use serde::{Deserialize, Serialize};
+
+use garage_util::data::*;
+
+/// Node IDs used in K2V are u64 integers that are the abbreviation
+/// of full Garage node IDs which are 256-bit UUIDs.
+pub type K2VNodeId = u64;
+
+pub fn make_node_id(node_id: Uuid) -> K2VNodeId {
+	// The K2V node ID is the first 8 bytes of the UUID, read as big-endian
+	let prefix: [u8; 8] = node_id.as_slice()[..8].try_into().unwrap();
+	u64::from_be_bytes(prefix)
+}
+
+#[derive(PartialEq, Eq, Debug, Serialize, Deserialize)]
+pub struct CausalContext {
+	pub vector_clock: BTreeMap<K2VNodeId, u64>,
+}
+
+impl CausalContext {
+	/// Causality context with an empty vector clock, i.e. one that is not
+	/// newer than any other context
+	pub fn new_empty() -> Self {
+		let vector_clock = BTreeMap::new();
+		Self { vector_clock }
+	}
+	/// Encode as URL-safe unpadded base64: an 8-byte XOR checksum followed by
+	/// each (node, time) pair of the vector clock as big-endian u64 integers
+	pub fn serialize(&self) -> String {
+		let clock = &self.vector_clock;
+		// XOR of all node IDs and times, used by `parse` to detect corruption
+		let checksum = clock.iter().fold(0u64, |acc, (n, t)| acc ^ *n ^ *t);
+
+		let mut bytes: Vec<u8> = Vec::with_capacity(8 + 16 * clock.len());
+		bytes.extend(checksum.to_be_bytes());
+		for (node, time) in clock.iter() {
+			bytes.extend(node.to_be_bytes());
+			bytes.extend(time.to_be_bytes());
+		}
+
+		base64::encode_config(bytes, base64::URL_SAFE_NO_PAD)
+	}
+	/// Decode a token produced by `serialize`, rejecting tokens whose base64,
+	/// length or checksum is invalid
+	pub fn parse(s: &str) -> Result<Self, String> {
+		let bytes = base64::decode_config(s, base64::URL_SAFE_NO_PAD)
+			.map_err(|e| format!("bad causality token base64: {}", e))?;
+		// Expected layout: 8 checksum bytes, then 16 bytes per clock entry
+		if bytes.len() % 16 != 8 {
+			return Err("bad causality token length".into());
+		}
+		let be_u64 = |b: &[u8]| u64::from_be_bytes(b.try_into().unwrap());
+
+		let (head, body) = bytes.split_at(8);
+		let checksum = be_u64(head);
+
+		let mut vector_clock = BTreeMap::new();
+		for pair in body.chunks_exact(16) {
+			vector_clock.insert(be_u64(&pair[..8]), be_u64(&pair[8..]));
+		}
+
+		// The checksum is the XOR of all node IDs and times of the clock
+		let check = vector_clock.iter().fold(0, |acc, (n, t)| acc ^ *n ^ *t);
+		if check != checksum {
+			return Err("bad causality token checksum".into());
+		}
+
+		Ok(CausalContext { vector_clock })
+	}
+	/// Whether at least one node's time here exceeds its time in `other`
+	/// (a node missing from `other` counts as time 0)
+	pub fn is_newer_than(&self, other: &Self) -> bool {
+		let seen = |node: &K2VNodeId| other.vector_clock.get(node).copied().unwrap_or(0);
+		self.vector_clock.iter().any(|(node, time)| *time > seen(node))
+	}
+}
+
+#[cfg(test)]
+mod tests {
+	use super::*;
+
+	#[test]
+	fn test_causality_token_serialization() {
+		let ct = CausalContext {
+			vector_clock: [(4, 42), (1928131023, 76), (0xefc0c1c47f9de433, 2)]
+				.iter()
+				.cloned()
+				.collect(),
+		};
+
+		assert_eq!(CausalContext::parse(&ct.serialize()).unwrap(), ct);
+	}
+}
diff --git a/src/model/k2v/item_table.rs b/src/model/k2v/item_table.rs
new file mode 100644
index 00000000..7860cb17
--- /dev/null
+++ b/src/model/k2v/item_table.rs
@@ -0,0 +1,305 @@
+use serde::{Deserialize, Serialize};
+use std::collections::BTreeMap;
+use std::sync::Arc;
+
+use garage_db as db;
+use garage_util::data::*;
+
+use garage_table::crdt::*;
+use garage_table::*;
+
+use crate::index_counter::*;
+use crate::k2v::causality::*;
+use crate::k2v::poll::*;
+
+pub const ENTRIES: &str = "entries";
+pub const CONFLICTS: &str = "conflicts";
+pub const VALUES: &str = "values";
+pub const BYTES: &str = "bytes";
+
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct K2VItem {
+	pub partition: K2VItemPartition,
+	pub sort_key: String,
+
+	items: BTreeMap<K2VNodeId, DvvsEntry>,
+}
+
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize, Hash)]
+pub struct K2VItemPartition {
+	pub bucket_id: Uuid,
+	pub partition_key: String,
+}
+
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+struct DvvsEntry {
+	t_discard: u64,
+	values: Vec<(u64, DvvsValue)>,
+}
+
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub enum DvvsValue {
+	Value(#[serde(with = "serde_bytes")] Vec<u8>),
+	Deleted,
+}
+
+impl K2VItem {
+	/// Creates a new K2VItem when no previous entry existed in the db
+	pub fn new(bucket_id: Uuid, partition_key: String, sort_key: String) -> Self {
+		Self {
+			partition: K2VItemPartition {
+				bucket_id,
+				partition_key,
+			},
+			sort_key,
+			items: BTreeMap::new(),
+		}
+	}
+	/// Updates a K2VItem with a new value or a deletion event
+	pub fn update(
+		&mut self,
+		this_node: Uuid,
+		context: &Option<CausalContext>,
+		new_value: DvvsValue,
+	) {
+		if let Some(context) = context {
+			for (node, t_discard) in context.vector_clock.iter() {
+				if let Some(e) = self.items.get_mut(node) {
+					e.t_discard = std::cmp::max(e.t_discard, *t_discard);
+				} else {
+					self.items.insert(
+						*node,
+						DvvsEntry {
+							t_discard: *t_discard,
+							values: vec![],
+						},
+					);
+				}
+			}
+		}
+
+		self.discard();
+
+		let node_id = make_node_id(this_node);
+		let e = self.items.entry(node_id).or_insert(DvvsEntry {
+			t_discard: 0,
+			values: vec![],
+		});
+		let t_prev = e.max_time();
+		e.values.push((t_prev + 1, new_value));
+	}
+
+	/// Build the causality context of this item: a vector clock mapping each
+	/// node to the largest timestamp recorded for it
+	pub fn causal_context(&self) -> CausalContext {
+		let clock_entries = self.items.iter().map(|(node, ent)| (*node, ent.max_time()));
+		CausalContext {
+			vector_clock: clock_entries.collect(),
+		}
+	}
+
+	/// List the distinct values stored across all nodes, in order of first
+	/// appearance (nodes are visited in increasing ID order)
+	pub fn values(&'_ self) -> Vec<&'_ DvvsValue> {
+		let stored = self.items.values().flat_map(|ent| ent.values.iter());
+		let mut distinct: Vec<&DvvsValue> = Vec::new();
+		for (_, value) in stored {
+			if distinct.iter().all(|seen| *seen != value) {
+				distinct.push(value);
+			}
+		}
+		distinct
+	}
+
+	fn discard(&mut self) {
+		for (_, ent) in self.items.iter_mut() {
+			ent.discard();
+		}
+	}
+}
+
+impl DvvsEntry {
+	/// Largest timestamp known to this entry: the discard time or the
+	/// timestamp of the most recent stored value, whichever is greater
+	fn max_time(&self) -> u64 {
+		let newest = self.values.iter().map(|(vts, _)| *vts).max();
+		newest.map_or(self.t_discard, |t| t.max(self.t_discard))
+	}
+
+	/// Drop every stored value whose timestamp is not after `t_discard`
+	fn discard(&mut self) {
+		let t_discard = self.t_discard;
+		self.values.retain(|(t, _)| *t > t_discard);
+	}
+}
+
+impl Crdt for K2VItem {
+	fn merge(&mut self, other: &Self) {
+		for (node, e2) in other.items.iter() {
+			if let Some(e) = self.items.get_mut(node) {
+				e.merge(e2);
+			} else {
+				self.items.insert(*node, e2.clone());
+			}
+		}
+	}
+}
+
+impl Crdt for DvvsEntry {
+	/// Adopt the larger discard time, drop values it makes obsolete, then
+	/// append the values of `other` that are newer than everything kept
+	fn merge(&mut self, other: &Self) {
+		self.t_discard = self.t_discard.max(other.t_discard);
+		self.discard();
+
+		// `t_max` is fixed before appending, so it is not raised by the new values
+		let t_max = self.max_time();
+		let newer = other.values.iter().filter(|(vt, _)| *vt > t_max);
+		self.values.extend(newer.cloned());
+	}
+}
+
+impl PartitionKey for K2VItemPartition {
+	fn hash(&self) -> Hash {
+		use blake2::{Blake2b, Digest};
+
+		let mut hasher = Blake2b::new();
+		hasher.update(self.bucket_id.as_slice());
+		hasher.update(self.partition_key.as_bytes());
+		let mut hash = [0u8; 32];
+		hash.copy_from_slice(&hasher.finalize()[..32]);
+		hash.into()
+	}
+}
+
+impl Entry<K2VItemPartition, String> for K2VItem {
+	fn partition_key(&self) -> &K2VItemPartition {
+		&self.partition
+	}
+	fn sort_key(&self) -> &String {
+		&self.sort_key
+	}
+	fn is_tombstone(&self) -> bool {
+		self.values()
+			.iter()
+			.all(|v| matches!(v, DvvsValue::Deleted))
+	}
+}
+
+pub struct K2VItemTable {
+	pub(crate) counter_table: Arc<IndexCounter<K2VItem>>,
+	pub(crate) subscriptions: Arc<SubscriptionManager>,
+}
+
+#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
+pub struct ItemFilter {
+	pub exclude_only_tombstones: bool,
+	pub conflicts_only: bool,
+}
+
+impl TableSchema for K2VItemTable {
+	const TABLE_NAME: &'static str = "k2v_item";
+
+	type P = K2VItemPartition;
+	type S = String;
+	type E = K2VItem;
+	type Filter = ItemFilter;
+
+	fn updated(
+		&self,
+		tx: &mut db::Transaction,
+		old: Option<&Self::E>,
+		new: Option<&Self::E>,
+	) -> db::TxOpResult<()> {
+		// 1. Count
+		let counter_res = self.counter_table.count(tx, old, new);
+		if let Err(e) = db::unabort(counter_res)? {
+			// This result can be returned by `counter_table.count()` for instance
+			// if messagepack serialization or deserialization fails at some step.
+			// Warn admin but ignore this error for now, that's all we can do.
+			error!(
+				"Unable to update K2V item counter: {}. Index values will be wrong!",
+				e
+			);
+		}
+
+		// 2. Notify
+		if let Some(new_ent) = new {
+			self.subscriptions.notify(new_ent);
+		}
+
+		Ok(())
+	}
+
+	#[allow(clippy::nonminimal_bool)]
+	fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
+		let v = entry.values();
+		!(filter.conflicts_only && v.len() < 2)
+			&& !(filter.exclude_only_tombstones && entry.is_tombstone())
+	}
+}
+
+impl CountedItem for K2VItem {
+	const COUNTER_TABLE_NAME: &'static str = "k2v_index_counter_v2";
+
+	// Partition key = bucket id
+	type CP = Uuid;
+	// Sort key = K2V item's partition key
+	type CS = String;
+
+	fn counter_partition_key(&self) -> &Uuid {
+		&self.partition.bucket_id
+	}
+	fn counter_sort_key(&self) -> &String {
+		&self.partition.partition_key
+	}
+
+	fn counts(&self) -> Vec<(&'static str, i64)> {
+		let values = self.values();
+
+		// One pass over the distinct values: count the non-deleted ones and
+		// add up their sizes in bytes
+		let mut n_values = 0;
+		let mut n_bytes = 0;
+		for value in values.iter() {
+			if let DvvsValue::Value(bytes) = value {
+				n_values += 1;
+				n_bytes += bytes.len() as i64;
+			}
+		}
+
+		let n_entries = i64::from(!self.is_tombstone());
+		let n_conflicts = i64::from(values.len() > 1);
+
+		vec![
+			(ENTRIES, n_entries),
+			(CONFLICTS, n_conflicts),
+			(VALUES, n_values),
+			(BYTES, n_bytes),
+		]
+	}
+}
+
+#[cfg(test)]
+mod tests {
+	use super::*;
+
+	#[test]
+	fn test_dvvsentry_merge_simple() {
+		let e1 = DvvsEntry {
+			t_discard: 4,
+			values: vec![
+				(5, DvvsValue::Value(vec![15])),
+				(6, DvvsValue::Value(vec![16])),
+			],
+		};
+		let e2 = DvvsEntry {
+			t_discard: 5,
+			values: vec![(6, DvvsValue::Value(vec![16])), (7, DvvsValue::Deleted)],
+		};
+
+		let mut e3 = e1.clone();
+		e3.merge(&e2);
+		assert_eq!(e2, e3);
+	}
+}
diff --git a/src/model/k2v/mod.rs b/src/model/k2v/mod.rs
new file mode 100644
index 00000000..f6a96151
--- /dev/null
+++ b/src/model/k2v/mod.rs
@@ -0,0 +1,6 @@
+pub mod causality;
+
+pub mod item_table;
+
+pub mod poll;
+pub mod rpc;
diff --git a/src/model/k2v/poll.rs b/src/model/k2v/poll.rs
new file mode 100644
index 00000000..93105207
--- /dev/null
+++ b/src/model/k2v/poll.rs
@@ -0,0 +1,50 @@
+use std::collections::HashMap;
+use std::sync::Mutex;
+
+use serde::{Deserialize, Serialize};
+use tokio::sync::broadcast;
+
+use crate::k2v::item_table::*;
+
+#[derive(Debug, Hash, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct PollKey {
+	pub partition: K2VItemPartition,
+	pub sort_key: String,
+}
+
+#[derive(Default)]
+pub struct SubscriptionManager {
+	subscriptions: Mutex<HashMap<PollKey, broadcast::Sender<K2VItem>>>,
+}
+
+impl SubscriptionManager {
+	pub fn new() -> Self {
+		Self::default()
+	}
+
+	pub fn subscribe(&self, key: &PollKey) -> broadcast::Receiver<K2VItem> {
+		let mut channels = self.subscriptions.lock().unwrap();
+		if let Some(sender) = channels.get(key) {
+			return sender.subscribe();
+		}
+		// No channel registered for this key yet: create one
+		let (sender, receiver) = broadcast::channel(8);
+		channels.insert(key.clone(), sender);
+		receiver
+	}
+
+	pub fn notify(&self, item: &K2VItem) {
+		let key = PollKey {
+			partition: item.partition.clone(),
+			sort_key: item.sort_key.clone(),
+		};
+		let mut subs = self.subscriptions.lock().unwrap();
+		if let Some(s) = subs.get(&key) {
+			if s.send(item.clone()).is_err() {
+				// no more subscribers, remove channel from here
+				// (we will re-create it later if we need to subscribe again)
+				subs.remove(&key);
+			}
+		}
+	}
+}
diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs
new file mode 100644
index 00000000..a74df277
--- /dev/null
+++ b/src/model/k2v/rpc.rs
@@ -0,0 +1,341 @@
+//! Module that implements RPCs specific to K2V.
+//! This is necessary for insertions into the K2V store,
+//! as they have to be transmitted to one of the nodes responsible
+//! for storing the entry to be processed (the API entry
+//! node does not process the entry directly, as this would
+//! mean the vector clock gets much larger than needed).
+
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use futures::stream::FuturesUnordered;
+use futures::StreamExt;
+use serde::{Deserialize, Serialize};
+use tokio::select;
+
+use garage_util::crdt::*;
+use garage_util::data::*;
+use garage_util::error::*;
+
+use garage_rpc::system::System;
+use garage_rpc::*;
+
+use garage_table::replication::{TableReplication, TableShardedReplication};
+use garage_table::{PartitionKey, Table};
+
+use crate::k2v::causality::*;
+use crate::k2v::item_table::*;
+use crate::k2v::poll::*;
+
+/// RPC messages for K2V
+#[derive(Debug, Serialize, Deserialize)]
+enum K2VRpc {
+	Ok,
+	InsertItem(InsertedItem),
+	InsertManyItems(Vec<InsertedItem>),
+	PollItem {
+		key: PollKey,
+		causal_context: CausalContext,
+		timeout_msec: u64,
+	},
+	PollItemResponse(Option<K2VItem>),
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct InsertedItem {
+	partition: K2VItemPartition,
+	sort_key: String,
+	causal_context: Option<CausalContext>,
+	value: DvvsValue,
+}
+
+impl Rpc for K2VRpc {
+	type Response = Result<K2VRpc, Error>;
+}
+
+/// RPC handler for K2V: sends item insertions and poll requests to the nodes storing the item, and processes those received locally
+pub struct K2VRpcHandler {
+	system: Arc<System>,
+	item_table: Arc<Table<K2VItemTable, TableShardedReplication>>,
+	endpoint: Arc<Endpoint<K2VRpc, Self>>,
+	subscriptions: Arc<SubscriptionManager>,
+}
+
+impl K2VRpcHandler {
+	pub fn new(
+		system: Arc<System>,
+		item_table: Arc<Table<K2VItemTable, TableShardedReplication>>,
+		subscriptions: Arc<SubscriptionManager>,
+	) -> Arc<Self> {
+		let endpoint = system.netapp.endpoint("garage_model/k2v/Rpc".to_string());
+
+		let rpc_handler = Arc::new(Self {
+			system,
+			item_table,
+			endpoint,
+			subscriptions,
+		});
+		rpc_handler.endpoint.set_handler(rpc_handler.clone());
+
+		rpc_handler
+	}
+
+	// ---- public interface ----
+
+	pub async fn insert(
+		&self,
+		bucket_id: Uuid,
+		partition_key: String,
+		sort_key: String,
+		causal_context: Option<CausalContext>,
+		value: DvvsValue,
+	) -> Result<(), Error> {
+		let partition = K2VItemPartition {
+			bucket_id,
+			partition_key,
+		};
+		let mut who = self
+			.item_table
+			.data
+			.replication
+			.write_nodes(&partition.hash());
+		who.sort();
+
+		self.system
+			.rpc
+			.try_call_many(
+				&self.endpoint,
+				&who[..],
+				K2VRpc::InsertItem(InsertedItem {
+					partition,
+					sort_key,
+					causal_context,
+					value,
+				}),
+				RequestStrategy::with_priority(PRIO_NORMAL)
+					.with_quorum(1)
+					.interrupt_after_quorum(true),
+			)
+			.await?;
+
+		Ok(())
+	}
+
+	pub async fn insert_batch(
+		&self,
+		bucket_id: Uuid,
+		items: Vec<(String, String, Option<CausalContext>, DvvsValue)>,
+	) -> Result<(), Error> {
+		let n_items = items.len();
+
+		let mut call_list: HashMap<_, Vec<_>> = HashMap::new();
+
+		for (partition_key, sort_key, causal_context, value) in items {
+			let partition = K2VItemPartition {
+				bucket_id,
+				partition_key,
+			};
+			let mut who = self
+				.item_table
+				.data
+				.replication
+				.write_nodes(&partition.hash());
+			who.sort();
+
+			call_list.entry(who).or_default().push(InsertedItem {
+				partition,
+				sort_key,
+				causal_context,
+				value,
+			});
+		}
+
+		debug!(
+			"K2V insert_batch: {} requests to insert {} items",
+			call_list.len(),
+			n_items
+		);
+		let call_futures = call_list.into_iter().map(|(nodes, items)| async move {
+			let resp = self
+				.system
+				.rpc
+				.try_call_many(
+					&self.endpoint,
+					&nodes[..],
+					K2VRpc::InsertManyItems(items),
+					RequestStrategy::with_priority(PRIO_NORMAL)
+						.with_quorum(1)
+						.interrupt_after_quorum(true),
+				)
+				.await?;
+			Ok::<_, Error>((nodes, resp))
+		});
+
+		let mut resps = call_futures.collect::<FuturesUnordered<_>>();
+		while let Some(resp) = resps.next().await {
+			resp?;
+		}
+
+		Ok(())
+	}
+
+	pub async fn poll(
+		&self,
+		bucket_id: Uuid,
+		partition_key: String,
+		sort_key: String,
+		causal_context: CausalContext,
+		timeout_msec: u64,
+	) -> Result<Option<K2VItem>, Error> {
+		let poll_key = PollKey {
+			partition: K2VItemPartition {
+				bucket_id,
+				partition_key,
+			},
+			sort_key,
+		};
+		let nodes = self
+			.item_table
+			.data
+			.replication
+			.write_nodes(&poll_key.partition.hash());
+
+		let rpc = self.system.rpc.try_call_many(
+			&self.endpoint,
+			&nodes[..],
+			K2VRpc::PollItem {
+				key: poll_key,
+				causal_context,
+				timeout_msec,
+			},
+			RequestStrategy::with_priority(PRIO_NORMAL)
+				.with_quorum(self.item_table.data.replication.read_quorum())
+				.without_timeout(),
+		);
+		let timeout_duration = Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout();
+		let resps = select! {
+			r = rpc => r?,
+			_ = tokio::time::sleep(timeout_duration) => return Ok(None),
+		};
+
+		let mut resp: Option<K2VItem> = None;
+		for v in resps {
+			match v {
+				K2VRpc::PollItemResponse(Some(x)) => {
+					if let Some(y) = &mut resp {
+						y.merge(&x);
+					} else {
+						resp = Some(x);
+					}
+				}
+				K2VRpc::PollItemResponse(None) => {
+					return Ok(None);
+				}
+				v => return Err(Error::unexpected_rpc_message(v)),
+			}
+		}
+
+		Ok(resp)
+	}
+
+	// ---- internal handlers ----
+
+	async fn handle_insert(&self, item: &InsertedItem) -> Result<K2VRpc, Error> {
+		let new = self.local_insert(item)?;
+
+		// Propagate to rest of network
+		if let Some(updated) = new {
+			self.item_table.insert(&updated).await?;
+		}
+
+		Ok(K2VRpc::Ok)
+	}
+
+	async fn handle_insert_many(&self, items: &[InsertedItem]) -> Result<K2VRpc, Error> {
+		let mut updated_vec = vec![];
+
+		for item in items {
+			let new = self.local_insert(item)?;
+
+			if let Some(updated) = new {
+				updated_vec.push(updated);
+			}
+		}
+
+		// Propagate to rest of network
+		if !updated_vec.is_empty() {
+			self.item_table.insert_many(&updated_vec).await?;
+		}
+
+		Ok(K2VRpc::Ok)
+	}
+
+	fn local_insert(&self, item: &InsertedItem) -> Result<Option<K2VItem>, Error> {
+		let tree_key = self
+			.item_table
+			.data
+			.tree_key(&item.partition, &item.sort_key);
+
+		self.item_table
+			.data
+			.update_entry_with(&tree_key[..], |ent| {
+				let mut ent = ent.unwrap_or_else(|| {
+					K2VItem::new(
+						item.partition.bucket_id,
+						item.partition.partition_key.clone(),
+						item.sort_key.clone(),
+					)
+				});
+				ent.update(self.system.id, &item.causal_context, item.value.clone());
+				ent
+			})
+	}
+
+	async fn handle_poll(&self, key: &PollKey, ct: &CausalContext) -> Result<K2VItem, Error> {
+		let mut chan = self.subscriptions.subscribe(key);
+
+		let mut value = self
+			.item_table
+			.data
+			.read_entry(&key.partition, &key.sort_key)?
+			.map(|bytes| self.item_table.data.decode_entry(&bytes[..]))
+			.transpose()?
+			.unwrap_or_else(|| {
+				K2VItem::new(
+					key.partition.bucket_id,
+					key.partition.partition_key.clone(),
+					key.sort_key.clone(),
+				)
+			});
+
+		while !value.causal_context().is_newer_than(ct) {
+			value = chan.recv().await?;
+		}
+
+		Ok(value)
+	}
+}
+
+#[async_trait]
+impl EndpointHandler<K2VRpc> for K2VRpcHandler {
+	async fn handle(self: &Arc<Self>, message: &K2VRpc, _from: NodeID) -> Result<K2VRpc, Error> {
+		match message {
+			K2VRpc::InsertItem(item) => self.handle_insert(item).await,
+			K2VRpc::InsertManyItems(items) => self.handle_insert_many(&items[..]).await,
+			K2VRpc::PollItem {
+				key,
+				causal_context,
+				timeout_msec,
+			} => {
+				let delay = tokio::time::sleep(Duration::from_millis(*timeout_msec));
+				select! {
+					ret = self.handle_poll(key, causal_context) => ret.map(Some).map(K2VRpc::PollItemResponse),
+					_ = delay => Ok(K2VRpc::PollItemResponse(None)),
+				}
+			}
+			m => Err(Error::unexpected_rpc_message(m)),
+		}
+	}
+}
diff --git a/src/model/key_table.rs b/src/model/key_table.rs
index 330e83f0..9d2fc783 100644
--- a/src/model/key_table.rs
+++ b/src/model/key_table.rs
@@ -6,10 +6,10 @@ use garage_util::data::*;
 
 use crate::permission::BucketKeyPerm;
 
-use garage_model_050::key_table as old;
+use crate::prev::v051::key_table as old;
 
 /// An api key
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
 pub struct Key {
 	/// The id of the key (immutable), used as partition key
 	pub key_id: String,
@@ -19,7 +19,7 @@ pub struct Key {
 }
 
 /// Configuration for a key
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
 pub struct KeyParams {
 	/// The secret_key associated (immutable)
 	pub secret_key: String,
diff --git a/src/model/lib.rs b/src/model/lib.rs
index 05a4cdc7..4f20ea46 100644
--- a/src/model/lib.rs
+++ b/src/model/lib.rs
@@ -1,14 +1,20 @@
 #[macro_use]
 extern crate tracing;
 
+// For migration from previous versions
+pub(crate) mod prev;
+
 pub mod permission;
 
-pub mod block_ref_table;
+pub mod index_counter;
+
 pub mod bucket_alias_table;
 pub mod bucket_table;
 pub mod key_table;
-pub mod object_table;
-pub mod version_table;
+
+#[cfg(feature = "k2v")]
+pub mod k2v;
+pub mod s3;
 
 pub mod garage;
 pub mod helper;
diff --git a/src/model/migrate.rs b/src/model/migrate.rs
index 7e61957a..cd6ad26a 100644
--- a/src/model/migrate.rs
+++ b/src/model/migrate.rs
@@ -5,7 +5,7 @@ use garage_util::data::*;
 use garage_util::error::Error as GarageError;
 use garage_util::time::*;
 
-use garage_model_050::bucket_table as old_bucket;
+use crate::prev::v051::bucket_table as old_bucket;
 
 use crate::bucket_alias_table::*;
 use crate::bucket_table::*;
@@ -25,11 +25,15 @@ impl Migrate {
 			.open_tree("bucket:table")
 			.map_err(GarageError::from)?;
 
-		for res in tree.iter() {
+		let mut old_buckets = vec![];
+		for res in tree.iter().map_err(GarageError::from)? {
 			let (_k, v) = res.map_err(GarageError::from)?;
 			let bucket = rmp_serde::decode::from_read_ref::<_, old_bucket::Bucket>(&v[..])
 				.map_err(GarageError::from)?;
+			old_buckets.push(bucket);
+		}
 
+		for bucket in old_buckets {
 			if let old_bucket::BucketState::Present(p) = bucket.state.get() {
 				self.migrate_buckets050_do_bucket(&bucket, p).await?;
 			}
@@ -73,6 +77,7 @@ impl Migrate {
 					local_aliases: LwwMap::new(),
 					website_config: Lww::new(website),
 					cors_config: Lww::new(None),
+					quotas: Lww::new(Default::default()),
 				}),
 			})
 			.await?;
diff --git a/src/model/prev/mod.rs b/src/model/prev/mod.rs
new file mode 100644
index 00000000..68bb1502
--- /dev/null
+++ b/src/model/prev/mod.rs
@@ -0,0 +1 @@
+pub(crate) mod v051;
diff --git a/src/model/prev/v051/bucket_table.rs b/src/model/prev/v051/bucket_table.rs
new file mode 100644
index 00000000..628a49dd
--- /dev/null
+++ b/src/model/prev/v051/bucket_table.rs
@@ -0,0 +1,63 @@
+use serde::{Deserialize, Serialize};
+
+use garage_table::crdt::Crdt;
+use garage_table::*;
+
+use super::key_table::PermissionSet;
+
+/// A bucket is a collection of objects
+///
+/// Its parameters are not directly accessible as:
+///  - It must be possible to merge paramaters, hence the use of a LWW CRDT.
+///  - A bucket has 2 states, Present or Deleted and parameters make sense only if present.
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct Bucket {
+	/// Name of the bucket
+	pub name: String,
+	/// State, and configuration if not deleted, of the bucket
+	pub state: crdt::Lww<BucketState>,
+}
+
+/// State of a bucket
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub enum BucketState {
+	/// The bucket is deleted
+	Deleted,
+	/// The bucket exists
+	Present(BucketParams),
+}
+
+impl Crdt for BucketState {
+	fn merge(&mut self, o: &Self) {
+		match o {
+			BucketState::Deleted => *self = BucketState::Deleted,
+			BucketState::Present(other_params) => {
+				if let BucketState::Present(params) = self {
+					params.merge(other_params);
+				}
+			}
+		}
+	}
+}
+
+/// Configuration for a bucket
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct BucketParams {
+	/// Map of key with access to the bucket, and what kind of access they give
+	pub authorized_keys: crdt::LwwMap<String, PermissionSet>,
+	/// Is the bucket served as http
+	pub website: crdt::Lww<bool>,
+}
+
+impl Crdt for BucketParams {
+	fn merge(&mut self, o: &Self) {
+		self.authorized_keys.merge(&o.authorized_keys);
+		self.website.merge(&o.website);
+	}
+}
+
+impl Crdt for Bucket {
+	fn merge(&mut self, other: &Self) {
+		self.state.merge(&other.state);
+	}
+}
diff --git a/src/model/prev/v051/key_table.rs b/src/model/prev/v051/key_table.rs
new file mode 100644
index 00000000..37516b1c
--- /dev/null
+++ b/src/model/prev/v051/key_table.rs
@@ -0,0 +1,50 @@
+use serde::{Deserialize, Serialize};
+
+use garage_table::crdt::*;
+use garage_table::*;
+
+/// An api key
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct Key {
+	/// The id of the key (immutable), used as partition key
+	pub key_id: String,
+
+	/// The secret_key associated
+	pub secret_key: String,
+
+	/// Name for the key
+	pub name: crdt::Lww<String>,
+
+	/// Is the key deleted
+	pub deleted: crdt::Bool,
+
+	/// Buckets in which the key is authorized. Empty if `Key` is deleted
+	// CRDT interaction: deleted implies authorized_buckets is empty
+	pub authorized_buckets: crdt::LwwMap<String, PermissionSet>,
+}
+
+/// Permission given to a key in a bucket
+#[derive(PartialOrd, Ord, PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct PermissionSet {
+	/// The key can be used to read the bucket
+	pub allow_read: bool,
+	/// The key can be used to write in the bucket
+	pub allow_write: bool,
+}
+
+impl AutoCrdt for PermissionSet {
+	const WARN_IF_DIFFERENT: bool = true;
+}
+
+impl Crdt for Key {
+	fn merge(&mut self, other: &Self) {
+		self.name.merge(&other.name);
+		self.deleted.merge(&other.deleted);
+
+		if self.deleted.get() {
+			self.authorized_buckets.clear();
+		} else {
+			self.authorized_buckets.merge(&other.authorized_buckets);
+		}
+	}
+}
diff --git a/src/model/prev/v051/mod.rs b/src/model/prev/v051/mod.rs
new file mode 100644
index 00000000..7a954752
--- /dev/null
+++ b/src/model/prev/v051/mod.rs
@@ -0,0 +1,4 @@
+pub(crate) mod bucket_table;
+pub(crate) mod key_table;
+pub(crate) mod object_table;
+pub(crate) mod version_table;
diff --git a/src/model/prev/v051/object_table.rs b/src/model/prev/v051/object_table.rs
new file mode 100644
index 00000000..e79e5787
--- /dev/null
+++ b/src/model/prev/v051/object_table.rs
@@ -0,0 +1,149 @@
+use serde::{Deserialize, Serialize};
+use std::collections::BTreeMap;
+
+use garage_util::data::*;
+
+use garage_table::crdt::*;
+
+/// An object
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct Object {
+	/// The bucket in which the object is stored, used as partition key
+	pub bucket: String,
+
+	/// The key at which the object is stored in its bucket, used as sorting key
+	pub key: String,
+
+	/// The list of currently stored versions of the object
+	versions: Vec<ObjectVersion>,
+}
+
+impl Object {
+	/// Get a list of currently stored versions of `Object`
+	pub fn versions(&self) -> &[ObjectVersion] {
+		&self.versions[..]
+	}
+}
+
+/// Information about a version of an object
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct ObjectVersion {
+	/// Id of the version
+	pub uuid: Uuid,
+	/// Timestamp of when the object was created
+	pub timestamp: u64,
+	/// State of the version
+	pub state: ObjectVersionState,
+}
+
+/// State of an object version
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub enum ObjectVersionState {
+	/// The version is being received
+	Uploading(ObjectVersionHeaders),
+	/// The version is fully received
+	Complete(ObjectVersionData),
+	/// The uploaded version contained errors or the upload was explicitly aborted
+	Aborted,
+}
+
+impl Crdt for ObjectVersionState {
+	fn merge(&mut self, other: &Self) {
+		use ObjectVersionState::*;
+		match other {
+			Aborted => {
+				*self = Aborted;
+			}
+			Complete(b) => match self {
+				Aborted => {}
+				Complete(a) => {
+					a.merge(b);
+				}
+				Uploading(_) => {
+					*self = Complete(b.clone());
+				}
+			},
+			Uploading(_) => {}
+		}
+	}
+}
+
+/// Data stored in object version
+#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+pub enum ObjectVersionData {
+	/// The object was deleted, this Version is a tombstone to mark it as such
+	DeleteMarker,
+	/// The object is short, it's stored inlined
+	Inline(ObjectVersionMeta, #[serde(with = "serde_bytes")] Vec<u8>),
+	/// The object is not short, Hash of first block is stored here, next segments hashes are
+	/// stored in the version table
+	FirstBlock(ObjectVersionMeta, Hash),
+}
+
+impl AutoCrdt for ObjectVersionData {
+	const WARN_IF_DIFFERENT: bool = true;
+}
+
+/// Metadata about the object version
+#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+pub struct ObjectVersionMeta {
+	/// Headers to send to the client
+	pub headers: ObjectVersionHeaders,
+	/// Size of the object
+	pub size: u64,
+	/// etag of the object
+	pub etag: String,
+}
+
+/// Additional headers for an object
+#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug, Serialize, Deserialize)]
+pub struct ObjectVersionHeaders {
+	/// Content type of the object
+	pub content_type: String,
+	/// Any other http headers to send
+	pub other: BTreeMap<String, String>,
+}
+
+impl ObjectVersion {
+	fn cmp_key(&self) -> (u64, Uuid) {
+		(self.timestamp, self.uuid)
+	}
+
+	/// Is the object version completely received
+	pub fn is_complete(&self) -> bool {
+		matches!(self.state, ObjectVersionState::Complete(_))
+	}
+}
+
+impl Crdt for Object {
+	fn merge(&mut self, other: &Self) {
+		// Merge versions from other into here
+		for other_v in other.versions.iter() {
+			match self
+				.versions
+				.binary_search_by(|v| v.cmp_key().cmp(&other_v.cmp_key()))
+			{
+				Ok(i) => {
+					self.versions[i].state.merge(&other_v.state);
+				}
+				Err(i) => {
+					self.versions.insert(i, other_v.clone());
+				}
+			}
+		}
+
+		// Remove versions which are obsolete, i.e. those that come
+		// before the last version which .is_complete().
+		let last_complete = self
+			.versions
+			.iter()
+			.enumerate()
+			.rev()
+			.find(|(_, v)| v.is_complete())
+			.map(|(vi, _)| vi);
+
+		if let Some(last_vi) = last_complete {
+			self.versions = self.versions.drain(last_vi..).collect::<Vec<_>>();
+		}
+	}
+}
diff --git a/src/model/prev/v051/version_table.rs b/src/model/prev/v051/version_table.rs
new file mode 100644
index 00000000..c11c62d5
--- /dev/null
+++ b/src/model/prev/v051/version_table.rs
@@ -0,0 +1,79 @@
+use serde::{Deserialize, Serialize};
+
+use garage_util::data::*;
+
+use garage_table::crdt::*;
+use garage_table::*;
+
+/// A version of an object
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
+pub struct Version {
+	/// UUID of the version, used as partition key
+	pub uuid: Uuid,
+
+	// Actual data: the blocks for this version
+	// In the case of a multipart upload, also store the etags
+	// of individual parts and check them when doing CompleteMultipartUpload
+	/// Is this version deleted
+	pub deleted: crdt::Bool,
+	/// list of blocks of data composing the version
+	pub blocks: crdt::Map<VersionBlockKey, VersionBlock>,
+	/// Etag of each part in case of a multipart upload, empty otherwise
+	pub parts_etags: crdt::Map<u64, String>,
+
+	// Back link to bucket+key so that we can figure if
+	// this was deleted later on
+	/// Bucket in which the related object is stored
+	pub bucket: String,
+	/// Key in which the related object is stored
+	pub key: String,
+}
+
+#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
+pub struct VersionBlockKey {
+	/// Number of the part
+	pub part_number: u64,
+	/// Offset of this sub-segment in its part
+	pub offset: u64,
+}
+
+impl Ord for VersionBlockKey {
+	fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+		self.part_number
+			.cmp(&other.part_number)
+			.then(self.offset.cmp(&other.offset))
+	}
+}
+
+impl PartialOrd for VersionBlockKey {
+	fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+		Some(self.cmp(other))
+	}
+}
+
+/// Information about a single block
+#[derive(PartialEq, Eq, Ord, PartialOrd, Clone, Copy, Debug, Serialize, Deserialize)]
+pub struct VersionBlock {
+	/// Blake2 sum of the block
+	pub hash: Hash,
+	/// Size of the block
+	pub size: u64,
+}
+
+impl AutoCrdt for VersionBlock {
+	const WARN_IF_DIFFERENT: bool = true;
+}
+
+impl Crdt for Version {
+	fn merge(&mut self, other: &Self) {
+		self.deleted.merge(&other.deleted);
+
+		if self.deleted.get() {
+			self.blocks.clear();
+			self.parts_etags.clear();
+		} else {
+			self.blocks.merge(&other.blocks);
+			self.parts_etags.merge(&other.parts_etags);
+		}
+	}
+}
diff --git a/src/model/block_ref_table.rs b/src/model/s3/block_ref_table.rs
index b6945403..c7017409 100644
--- a/src/model/block_ref_table.rs
+++ b/src/model/s3/block_ref_table.rs
@@ -1,6 +1,8 @@
 use serde::{Deserialize, Serialize};
 use std::sync::Arc;
 
+use garage_db as db;
+
 use garage_util::data::*;
 
 use garage_table::crdt::Crdt;
@@ -8,7 +10,7 @@ use garage_table::*;
 
 use garage_block::manager::*;
 
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
 pub struct BlockRef {
 	/// Hash (blake2 sum) of the block, used as partition key
 	pub block: Hash,
@@ -51,21 +53,22 @@ impl TableSchema for BlockRefTable {
 	type E = BlockRef;
 	type Filter = DeletedFilter;
 
-	fn updated(&self, old: Option<Self::E>, new: Option<Self::E>) {
-		#[allow(clippy::or_fun_call)]
-		let block = &old.as_ref().or(new.as_ref()).unwrap().block;
-		let was_before = old.as_ref().map(|x| !x.deleted.get()).unwrap_or(false);
-		let is_after = new.as_ref().map(|x| !x.deleted.get()).unwrap_or(false);
+	fn updated(
+		&self,
+		tx: &mut db::Transaction,
+		old: Option<&Self::E>,
+		new: Option<&Self::E>,
+	) -> db::TxOpResult<()> {
+		let block = old.or(new).unwrap().block;
+		let was_before = old.map(|x| !x.deleted.get()).unwrap_or(false);
+		let is_after = new.map(|x| !x.deleted.get()).unwrap_or(false);
 		if is_after && !was_before {
-			if let Err(e) = self.block_manager.block_incref(block) {
-				warn!("block_incref failed for block {:?}: {}", block, e);
-			}
+			self.block_manager.block_incref(tx, block)?;
 		}
 		if was_before && !is_after {
-			if let Err(e) = self.block_manager.block_decref(block) {
-				warn!("block_decref failed for block {:?}: {}", block, e);
-			}
+			self.block_manager.block_decref(tx, block)?;
 		}
+		Ok(())
 	}
 
 	fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
diff --git a/src/model/s3/mod.rs b/src/model/s3/mod.rs
new file mode 100644
index 00000000..4e94337d
--- /dev/null
+++ b/src/model/s3/mod.rs
@@ -0,0 +1,3 @@
+pub mod block_ref_table;
+pub mod object_table;
+pub mod version_table;
diff --git a/src/model/object_table.rs b/src/model/s3/object_table.rs
index da53878e..26ff57f6 100644
--- a/src/model/object_table.rs
+++ b/src/model/s3/object_table.rs
@@ -2,6 +2,8 @@ use serde::{Deserialize, Serialize};
 use std::collections::BTreeMap;
 use std::sync::Arc;
 
+use garage_db as db;
+
 use garage_util::background::BackgroundRunner;
 use garage_util::data::*;
 
@@ -9,12 +11,17 @@ use garage_table::crdt::*;
 use garage_table::replication::TableShardedReplication;
 use garage_table::*;
 
-use crate::version_table::*;
+use crate::index_counter::*;
+use crate::s3::version_table::*;
+
+use crate::prev::v051::object_table as old;
 
-use garage_model_050::object_table as old;
+pub const OBJECTS: &str = "objects";
+pub const UNFINISHED_UPLOADS: &str = "unfinished_uploads";
+pub const BYTES: &str = "bytes";
 
 /// An object
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
 pub struct Object {
 	/// The bucket in which the object is stored, used as partition key
 	pub bucket_id: Uuid,
@@ -63,7 +70,7 @@ impl Object {
 }
 
 /// Informations about a version of an object
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
 pub struct ObjectVersion {
 	/// Id of the version
 	pub uuid: Uuid,
@@ -74,7 +81,7 @@ pub struct ObjectVersion {
 }
 
 /// State of an object version
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
 pub enum ObjectVersionState {
 	/// The version is being received
 	Uploading(ObjectVersionHeaders),
@@ -216,6 +223,7 @@ impl Crdt for Object {
 pub struct ObjectTable {
 	pub background: Arc<BackgroundRunner>,
 	pub version_table: Arc<Table<VersionTable, TableShardedReplication>>,
+	pub object_counter_table: Arc<IndexCounter<Object>>,
 }
 
 #[derive(Clone, Copy, Debug, Serialize, Deserialize)]
@@ -232,8 +240,26 @@ impl TableSchema for ObjectTable {
 	type E = Object;
 	type Filter = ObjectFilter;
 
-	fn updated(&self, old: Option<Self::E>, new: Option<Self::E>) {
+	fn updated(
+		&self,
+		tx: &mut db::Transaction,
+		old: Option<&Self::E>,
+		new: Option<&Self::E>,
+	) -> db::TxOpResult<()> {
+		// 1. Count
+		let counter_res = self.object_counter_table.count(tx, old, new);
+		if let Err(e) = db::unabort(counter_res)? {
+			error!(
+				"Unable to update object counter: {}. Index values will be wrong!",
+				e
+			);
+		}
+
+		// 2. Spawn a background task that propagates deletions to the version table
 		let version_table = self.version_table.clone();
+		let old = old.cloned();
+		let new = new.cloned();
+
 		self.background.spawn(async move {
 			if let (Some(old_v), Some(new_v)) = (old, new) {
 				// Propagate deletion of old versions
@@ -256,7 +282,8 @@ impl TableSchema for ObjectTable {
 				}
 			}
 			Ok(())
-		})
+		});
+		Ok(())
 	}
 
 	fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
@@ -272,6 +299,49 @@ impl TableSchema for ObjectTable {
 	}
 }
 
+impl CountedItem for Object {
+	const COUNTER_TABLE_NAME: &'static str = "bucket_object_counter";
+
+	// Partition key = bucket id
+	type CP = Uuid;
+	// Sort key = nothing
+	type CS = EmptyKey;
+
+	fn counter_partition_key(&self) -> &Uuid {
+		&self.bucket_id
+	}
+	fn counter_sort_key(&self) -> &EmptyKey {
+		&EmptyKey
+	}
+
+	fn counts(&self) -> Vec<(&'static str, i64)> {
+		let versions = self.versions();
+		let n_objects = if versions.iter().any(|v| v.is_data()) {
+			1
+		} else {
+			0
+		};
+		let n_unfinished_uploads = versions
+			.iter()
+			.filter(|v| matches!(v.state, ObjectVersionState::Uploading(_)))
+			.count();
+		let n_bytes = versions
+			.iter()
+			.map(|v| match &v.state {
+				ObjectVersionState::Complete(ObjectVersionData::Inline(meta, _))
+				| ObjectVersionState::Complete(ObjectVersionData::FirstBlock(meta, _)) => meta.size,
+				_ => 0,
+			})
+			.sum::<u64>();
+
+		vec![
+			(OBJECTS, n_objects),
+			(UNFINISHED_UPLOADS, n_unfinished_uploads as i64),
+			(BYTES, n_bytes as i64),
+		]
+	}
+}
+
 // vvvvvvvv migration code, stupid stuff vvvvvvvvvvvv
 // (we just want to change bucket into bucket_id by hashing it)
 
diff --git a/src/model/version_table.rs b/src/model/s3/version_table.rs
index 839b1f4f..6bc2ecd1 100644
--- a/src/model/version_table.rs
+++ b/src/model/s3/version_table.rs
@@ -1,6 +1,8 @@
 use serde::{Deserialize, Serialize};
 use std::sync::Arc;
 
+use garage_db as db;
+
 use garage_util::background::BackgroundRunner;
 use garage_util::data::*;
 
@@ -8,12 +10,12 @@ use garage_table::crdt::*;
 use garage_table::replication::TableShardedReplication;
 use garage_table::*;
 
-use crate::block_ref_table::*;
+use crate::s3::block_ref_table::*;
 
-use garage_model_050::version_table as old;
+use crate::prev::v051::version_table as old;
 
 /// A version of an object
-#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
 pub struct Version {
 	/// UUID of the version, used as partition key
 	pub uuid: Uuid,
@@ -137,8 +139,16 @@ impl TableSchema for VersionTable {
 	type E = Version;
 	type Filter = DeletedFilter;
 
-	fn updated(&self, old: Option<Self::E>, new: Option<Self::E>) {
+	fn updated(
+		&self,
+		_tx: &mut db::Transaction,
+		old: Option<&Self::E>,
+		new: Option<&Self::E>,
+	) -> db::TxOpResult<()> {
 		let block_ref_table = self.block_ref_table.clone();
+		let old = old.cloned();
+		let new = new.cloned();
+
 		self.background.spawn(async move {
 			if let (Some(old_v), Some(new_v)) = (old, new) {
 				// Propagate deletion of version blocks
@@ -157,7 +167,9 @@ impl TableSchema for VersionTable {
 				}
 			}
 			Ok(())
-		})
+		});
+
+		Ok(())
 	}
 
 	fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
diff --git a/src/rpc/Cargo.toml b/src/rpc/Cargo.toml
index 654c1dc6..5bb6aae0 100644
--- a/src/rpc/Cargo.toml
+++ b/src/rpc/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "garage_rpc"
-version = "0.7.0"
+version = "0.8.0"
 authors = ["Alex Auvolat <alex@adnab.me>"]
 edition = "2018"
 license = "AGPL-3.0"
@@ -14,8 +14,7 @@ path = "lib.rs"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-garage_util = { version = "0.7.0", path = "../util" }
-garage_admin = { version = "0.7.0", path = "../admin" }
+garage_util = { version = "0.8.0", path = "../util" }
 
 arc-swap = "1.0"
 bytes = "1.0"
@@ -47,11 +46,11 @@ tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi
 tokio-stream = { version = "0.1", features = ["net"] }
 opentelemetry = "0.17"
 
-#netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" }
-#netapp = { version = "0.4", path = "../../../netapp", features = ["telemetry"] }
-netapp = { version = "0.4.2", features = ["telemetry"] }
+netapp = { version = "0.5.2", features = ["telemetry"] }
 
 hyper = { version = "0.14", features = ["client", "http1", "runtime", "tcp"] }
 
+
 [features]
 kubernetes-discovery = [ "kube", "k8s-openapi", "openssl", "schemars" ]
+system-libs = [ "sodiumoxide/use-pkg-config" ]
diff --git a/src/rpc/kubernetes.rs b/src/rpc/kubernetes.rs
index 939a0eed..197245aa 100644
--- a/src/rpc/kubernetes.rs
+++ b/src/rpc/kubernetes.rs
@@ -56,7 +56,7 @@ pub async fn get_kubernetes_nodes(
 	let mut ret = Vec::with_capacity(nodes.items.len());
 
 	for node in nodes {
-		println!("Found Pod: {:?}", node.metadata.name);
+		info!("Found Pod: {:?}", node.metadata.name);
 
 		let pubkey = &node
 			.metadata
diff --git a/src/rpc/layout.rs b/src/rpc/layout.rs
index a878f19c..16d573c7 100644
--- a/src/rpc/layout.rs
+++ b/src/rpc/layout.rs
@@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize};
 
 use garage_util::crdt::{AutoCrdt, Crdt, LwwMap};
 use garage_util::data::*;
+use garage_util::error::*;
 
 use crate::graph_algo::*;
 
@@ -144,6 +145,61 @@ impl ClusterLayout {
 		}
 	}
 
+	pub fn apply_staged_changes(mut self, version: Option<u64>) -> Result<Self, Error> {
+		match version {
+			None => {
+				let error = r#"
+Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout.
+To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes.
+				"#;
+				return Err(Error::Message(error.into()));
+			}
+			Some(v) => {
+				if v != self.version + 1 {
+					return Err(Error::Message("Invalid new layout version".into()));
+				}
+			}
+		}
+
+		self.roles.merge(&self.staging);
+		self.roles.retain(|(_, _, v)| v.0.is_some());
+
+		if !self.calculate_partition_assignation() {
+			return Err(Error::Message("Could not calculate new assignation of partitions to nodes. This can happen if there are less nodes than the desired number of copies of your data (see the replication_mode configuration parameter).".into()));
+		}
+
+		self.staging.clear();
+		self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]);
+
+		self.version += 1;
+
+		Ok(self)
+	}
+
+	pub fn revert_staged_changes(mut self, version: Option<u64>) -> Result<Self, Error> {
+		match version {
+			None => {
+				let error = r#"
+Please pass the new layout version number to ensure that you are writing the correct version of the cluster layout.
+To know the correct value of the new layout version, invoke `garage layout show` and review the proposed changes.
+				"#;
+				return Err(Error::Message(error.into()));
+			}
+			Some(v) => {
+				if v != self.version + 1 {
+					return Err(Error::Message("Invalid new layout version".into()));
+				}
+			}
+		}
+
+		self.staging.clear();
+		self.staging_hash = blake2sum(&rmp_to_vec_all_named(&self.staging).unwrap()[..]);
+
+		self.version += 1;
+
+		Ok(self)
+	}
+
 	/// Returns a list of IDs of nodes that currently have
 	/// a role in the cluster
 	pub fn node_ids(&self) -> &[Uuid] {
diff --git a/src/rpc/metrics.rs b/src/rpc/metrics.rs
index c900518c..61f8fa79 100644
--- a/src/rpc/metrics.rs
+++ b/src/rpc/metrics.rs
@@ -1,31 +1,18 @@
-use std::sync::Arc;
-
 use opentelemetry::{global, metrics::*};
-use tokio::sync::Semaphore;
 
 /// TableMetrics reference all counter used for metrics
 pub struct RpcMetrics {
-	pub(crate) _rpc_available_permits: ValueObserver<u64>,
-
 	pub(crate) rpc_counter: Counter<u64>,
 	pub(crate) rpc_timeout_counter: Counter<u64>,
 	pub(crate) rpc_netapp_error_counter: Counter<u64>,
 	pub(crate) rpc_garage_error_counter: Counter<u64>,
 
 	pub(crate) rpc_duration: ValueRecorder<f64>,
-	pub(crate) rpc_queueing_time: ValueRecorder<f64>,
 }
 impl RpcMetrics {
-	pub fn new(sem: Arc<Semaphore>) -> Self {
+	pub fn new() -> Self {
 		let meter = global::meter("garage_rpc");
 		RpcMetrics {
-			_rpc_available_permits: meter
-				.u64_value_observer("rpc.available_permits", move |observer| {
-					observer.observe(sem.available_permits() as u64, &[])
-				})
-				.with_description("Number of available RPC permits")
-				.init(),
-
 			rpc_counter: meter
 				.u64_counter("rpc.request_counter")
 				.with_description("Number of RPC requests emitted")
@@ -46,10 +33,6 @@ impl RpcMetrics {
 				.f64_value_recorder("rpc.duration")
 				.with_description("Duration of RPCs")
 				.init(),
-			rpc_queueing_time: meter
-				.f64_value_recorder("rpc.queueing_time")
-				.with_description("Time RPC requests were queued for before being sent")
-				.init(),
 		}
 	}
 }
diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs
index 34717d3b..949aced6 100644
--- a/src/rpc/rpc_helper.rs
+++ b/src/rpc/rpc_helper.rs
@@ -7,7 +7,7 @@ use futures::stream::futures_unordered::FuturesUnordered;
 use futures::stream::StreamExt;
 use futures_util::future::FutureExt;
 use tokio::select;
-use tokio::sync::{watch, Semaphore};
+use tokio::sync::watch;
 
 use opentelemetry::KeyValue;
 use opentelemetry::{
@@ -15,10 +15,14 @@ use opentelemetry::{
 	Context,
 };
 
-pub use netapp::endpoint::{Endpoint, EndpointHandler, Message as Rpc};
+pub use netapp::endpoint::{Endpoint, EndpointHandler, StreamingEndpointHandler};
+use netapp::message::IntoReq;
+pub use netapp::message::{
+	Message as Rpc, OrderTag, Req, RequestPriority, Resp, PRIO_BACKGROUND, PRIO_HIGH, PRIO_NORMAL,
+	PRIO_SECONDARY,
+};
 use netapp::peering::fullmesh::FullMeshPeeringStrategy;
-pub use netapp::proto::*;
-pub use netapp::{NetApp, NodeID};
+pub use netapp::{self, NetApp, NodeID};
 
 use garage_util::background::BackgroundRunner;
 use garage_util::data::*;
@@ -28,34 +32,37 @@ use garage_util::metrics::RecordDuration;
 use crate::metrics::RpcMetrics;
 use crate::ring::Ring;
 
-const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
-
-// Try to never have more than 200MB of outgoing requests
-// buffered at the same time. Other requests are queued until
-// space is freed.
-const REQUEST_BUFFER_SIZE: usize = 200 * 1024 * 1024;
+// Default RPC timeout = 5 minutes
+const DEFAULT_TIMEOUT: Duration = Duration::from_secs(300);
 
 /// Strategy to apply when making RPC
 #[derive(Copy, Clone)]
 pub struct RequestStrategy {
-	/// Max time to wait for reponse
-	pub rs_timeout: Duration,
 	/// Min number of response to consider the request successful
 	pub rs_quorum: Option<usize>,
 	/// Should requests be dropped after enough response are received
 	pub rs_interrupt_after_quorum: bool,
 	/// Request priority
 	pub rs_priority: RequestPriority,
+	/// Timeout policy for this request (none, default or custom)
+	rs_timeout: Timeout,
+}
+
+#[derive(Copy, Clone)]
+enum Timeout {
+	None,
+	Default,
+	Custom(Duration),
 }
 
 impl RequestStrategy {
 	/// Create a RequestStrategy with default timeout and not interrupting when quorum reached
 	pub fn with_priority(prio: RequestPriority) -> Self {
 		RequestStrategy {
-			rs_timeout: DEFAULT_TIMEOUT,
 			rs_quorum: None,
 			rs_interrupt_after_quorum: false,
 			rs_priority: prio,
+			rs_timeout: Timeout::Default,
 		}
 	}
 	/// Set quorum to be reached for request
@@ -63,17 +70,22 @@ impl RequestStrategy {
 		self.rs_quorum = Some(quorum);
 		self
 	}
-	/// Set timeout of the strategy
-	pub fn with_timeout(mut self, timeout: Duration) -> Self {
-		self.rs_timeout = timeout;
-		self
-	}
 	/// Set if requests can be dropped after quorum has been reached
 	/// In general true for read requests, and false for write
 	pub fn interrupt_after_quorum(mut self, interrupt: bool) -> Self {
 		self.rs_interrupt_after_quorum = interrupt;
 		self
 	}
+	/// Deactivate timeout for this request
+	pub fn without_timeout(mut self) -> Self {
+		self.rs_timeout = Timeout::None;
+		self
+	}
+	/// Set custom timeout for this request
+	pub fn with_custom_timeout(mut self, timeout: Duration) -> Self {
+		self.rs_timeout = Timeout::Custom(timeout);
+		self
+	}
 }
 
 #[derive(Clone)]
@@ -84,8 +96,8 @@ struct RpcHelperInner {
 	fullmesh: Arc<FullMeshPeeringStrategy>,
 	background: Arc<BackgroundRunner>,
 	ring: watch::Receiver<Arc<Ring>>,
-	request_buffer_semaphore: Arc<Semaphore>,
 	metrics: RpcMetrics,
+	rpc_timeout: Duration,
 }
 
 impl RpcHelper {
@@ -94,45 +106,35 @@ impl RpcHelper {
 		fullmesh: Arc<FullMeshPeeringStrategy>,
 		background: Arc<BackgroundRunner>,
 		ring: watch::Receiver<Arc<Ring>>,
+		rpc_timeout: Option<Duration>,
 	) -> Self {
-		let sem = Arc::new(Semaphore::new(REQUEST_BUFFER_SIZE));
-
-		let metrics = RpcMetrics::new(sem.clone());
+		let metrics = RpcMetrics::new();
 
 		Self(Arc::new(RpcHelperInner {
 			our_node_id,
 			fullmesh,
 			background,
 			ring,
-			request_buffer_semaphore: sem,
 			metrics,
+			rpc_timeout: rpc_timeout.unwrap_or(DEFAULT_TIMEOUT),
 		}))
 	}
 
-	pub async fn call<M, H, S>(
-		&self,
-		endpoint: &Endpoint<M, H>,
-		to: Uuid,
-		msg: M,
-		strat: RequestStrategy,
-	) -> Result<S, Error>
-	where
-		M: Rpc<Response = Result<S, Error>>,
-		H: EndpointHandler<M>,
-	{
-		self.call_arc(endpoint, to, Arc::new(msg), strat).await
+	pub fn rpc_timeout(&self) -> Duration {
+		self.0.rpc_timeout
 	}
 
-	pub async fn call_arc<M, H, S>(
+	pub async fn call<M, N, H, S>(
 		&self,
 		endpoint: &Endpoint<M, H>,
 		to: Uuid,
-		msg: Arc<M>,
+		msg: N,
 		strat: RequestStrategy,
 	) -> Result<S, Error>
 	where
 		M: Rpc<Response = Result<S, Error>>,
-		H: EndpointHandler<M>,
+		N: IntoReq<M> + Send,
+		H: StreamingEndpointHandler<M>,
 	{
 		let metric_tags = [
 			KeyValue::new("rpc_endpoint", endpoint.path().to_string()),
@@ -140,29 +142,27 @@ impl RpcHelper {
 			KeyValue::new("to", format!("{:?}", to)),
 		];
 
-		let msg_size = rmp_to_vec_all_named(&msg)?.len() as u32;
-		let permit = self
-			.0
-			.request_buffer_semaphore
-			.acquire_many(msg_size)
-			.record_duration(&self.0.metrics.rpc_queueing_time, &metric_tags)
-			.await?;
-
 		self.0.metrics.rpc_counter.add(1, &metric_tags);
 
 		let node_id = to.into();
 		let rpc_call = endpoint
-			.call(&node_id, msg, strat.rs_priority)
+			.call_streaming(&node_id, msg, strat.rs_priority)
 			.record_duration(&self.0.metrics.rpc_duration, &metric_tags);
 
+		let timeout = async {
+			match strat.rs_timeout {
+				Timeout::None => futures::future::pending().await,
+				Timeout::Default => tokio::time::sleep(self.0.rpc_timeout).await,
+				Timeout::Custom(t) => tokio::time::sleep(t).await,
+			}
+		};
+
 		select! {
 			res = rpc_call => {
-				drop(permit);
-
 				if res.is_err() {
 					self.0.metrics.rpc_netapp_error_counter.add(1, &metric_tags);
 				}
-				let res = res?;
+				let res = res?.into_msg();
 
 				if res.is_err() {
 					self.0.metrics.rpc_garage_error_counter.add(1, &metric_tags);
@@ -170,46 +170,49 @@ impl RpcHelper {
 
 				Ok(res?)
 			}
-			_ = tokio::time::sleep(strat.rs_timeout) => {
-				drop(permit);
+			() = timeout => {
 				self.0.metrics.rpc_timeout_counter.add(1, &metric_tags);
 				Err(Error::Timeout)
 			}
 		}
 	}
 
-	pub async fn call_many<M, H, S>(
+	pub async fn call_many<M, N, H, S>(
 		&self,
 		endpoint: &Endpoint<M, H>,
 		to: &[Uuid],
-		msg: M,
+		msg: N,
 		strat: RequestStrategy,
-	) -> Vec<(Uuid, Result<S, Error>)>
+	) -> Result<Vec<(Uuid, Result<S, Error>)>, Error>
 	where
 		M: Rpc<Response = Result<S, Error>>,
-		H: EndpointHandler<M>,
+		N: IntoReq<M>,
+		H: StreamingEndpointHandler<M>,
 	{
-		let msg = Arc::new(msg);
+		let msg = msg.into_req().map_err(netapp::error::Error::from)?;
+
 		let resps = join_all(
 			to.iter()
-				.map(|to| self.call_arc(endpoint, *to, msg.clone(), strat)),
+				.map(|to| self.call(endpoint, *to, msg.clone(), strat)),
 		)
 		.await;
-		to.iter()
+		Ok(to
+			.iter()
 			.cloned()
 			.zip(resps.into_iter())
-			.collect::<Vec<_>>()
+			.collect::<Vec<_>>())
 	}
 
-	pub async fn broadcast<M, H, S>(
+	pub async fn broadcast<M, N, H, S>(
 		&self,
 		endpoint: &Endpoint<M, H>,
-		msg: M,
+		msg: N,
 		strat: RequestStrategy,
-	) -> Vec<(Uuid, Result<S, Error>)>
+	) -> Result<Vec<(Uuid, Result<S, Error>)>, Error>
 	where
 		M: Rpc<Response = Result<S, Error>>,
-		H: EndpointHandler<M>,
+		N: IntoReq<M>,
+		H: StreamingEndpointHandler<M>,
 	{
 		let to = self
 			.0
@@ -223,16 +226,17 @@ impl RpcHelper {
 
 	/// Make a RPC call to multiple servers, returning either a Vec of responses,
 	/// or an error if quorum could not be reached due to too many errors
-	pub async fn try_call_many<M, H, S>(
+	pub async fn try_call_many<M, N, H, S>(
 		&self,
 		endpoint: &Arc<Endpoint<M, H>>,
 		to: &[Uuid],
-		msg: M,
+		msg: N,
 		strategy: RequestStrategy,
 	) -> Result<Vec<S>, Error>
 	where
 		M: Rpc<Response = Result<S, Error>> + 'static,
-		H: EndpointHandler<M> + 'static,
+		N: IntoReq<M>,
+		H: StreamingEndpointHandler<M> + 'static,
 		S: Send + 'static,
 	{
 		let quorum = strategy.rs_quorum.unwrap_or(to.len());
@@ -262,20 +266,21 @@ impl RpcHelper {
 			.await
 	}
 
-	async fn try_call_many_internal<M, H, S>(
+	async fn try_call_many_internal<M, N, H, S>(
 		&self,
 		endpoint: &Arc<Endpoint<M, H>>,
 		to: &[Uuid],
-		msg: M,
+		msg: N,
 		strategy: RequestStrategy,
 		quorum: usize,
 	) -> Result<Vec<S>, Error>
 	where
 		M: Rpc<Response = Result<S, Error>> + 'static,
-		H: EndpointHandler<M> + 'static,
+		N: IntoReq<M>,
+		H: StreamingEndpointHandler<M> + 'static,
 		S: Send + 'static,
 	{
-		let msg = Arc::new(msg);
+		let msg = msg.into_req().map_err(netapp::error::Error::from)?;
 
 		// Build future for each request
 		// They are not started now: they are added below in a FuturesUnordered
@@ -285,7 +290,7 @@ impl RpcHelper {
 			let msg = msg.clone();
 			let endpoint2 = endpoint.clone();
 			(to, async move {
-				self2.call_arc(&endpoint2, to, msg, strategy).await
+				self2.call(&endpoint2, to, msg, strategy).await
 			})
 		});
 
@@ -299,47 +304,19 @@ impl RpcHelper {
 			// to reach a quorum, priorizing nodes with the lowest latency.
 			// When there are errors, we start new requests to compensate.
 
-			// Retrieve some status variables that we will use to sort requests
-			let peer_list = self.0.fullmesh.get_peer_list();
-			let ring: Arc<Ring> = self.0.ring.borrow().clone();
-			let our_zone = match ring.layout.node_role(&self.0.our_node_id) {
-				Some(pc) => &pc.zone,
-				None => "",
-			};
-
-			// Augment requests with some information used to sort them.
-			// The tuples are as follows:
-			//         (is another node?, is another zone?, latency, node ID, request future)
-			// We store all of these tuples in a vec that we can sort.
-			// By sorting this vec, we priorize ourself, then nodes in the same zone,
-			// and within a same zone we priorize nodes with the lowest latency.
-			let mut requests = requests
-				.map(|(to, fut)| {
-					let peer_zone = match ring.layout.node_role(&to) {
-						Some(pc) => &pc.zone,
-						None => "",
-					};
-					let peer_avg_ping = peer_list
-						.iter()
-						.find(|x| x.id.as_ref() == to.as_slice())
-						.and_then(|pi| pi.avg_ping)
-						.unwrap_or_else(|| Duration::from_secs(1));
-					(
-						to != self.0.our_node_id,
-						peer_zone != our_zone,
-						peer_avg_ping,
-						to,
-						fut,
-					)
-				})
+			// Reorder requests to prioritize closeness / low latency
+			let request_order = self.request_order(to);
+			let mut ord_requests = vec![(); request_order.len()]
+				.into_iter()
+				.map(|_| None)
 				.collect::<Vec<_>>();
-
-			// Sort requests by (priorize ourself, priorize same zone, priorize low latency)
-			requests
-				.sort_by_key(|(diffnode, diffzone, ping, _to, _fut)| (*diffnode, *diffzone, *ping));
+			for (to, fut) in requests {
+				let i = request_order.iter().position(|x| *x == to).unwrap();
+				ord_requests[i] = Some((to, fut));
+			}
 
 			// Make an iterator to take requests in their sorted order
-			let mut requests = requests.into_iter();
+			let mut requests = ord_requests.into_iter().map(Option::unwrap);
 
 			// resp_stream will contain all of the requests that are currently in flight.
 			// (for the moment none, they will be added in the loop below)
@@ -350,7 +327,7 @@ impl RpcHelper {
 				// If the current set of requests that are running is not enough to possibly
 				// reach quorum, start some new requests.
 				while successes.len() + resp_stream.len() < quorum {
-					if let Some((_, _, _, req_to, fut)) = requests.next() {
+					if let Some((req_to, fut)) = requests.next() {
 						let tracer = opentelemetry::global::tracer("garage");
 						let span = tracer.start(format!("RPC to {:?}", req_to));
 						resp_stream.push(tokio::spawn(
@@ -420,4 +397,49 @@ impl RpcHelper {
 			Err(Error::Quorum(quorum, successes.len(), to.len(), errors))
 		}
 	}
+
+	pub fn request_order(&self, nodes: &[Uuid]) -> Vec<Uuid> {
+		// Returns `nodes` reordered by request priority. First, retrieve the status variables used to sort them.
+		let peer_list = self.0.fullmesh.get_peer_list();
+		let ring: Arc<Ring> = self.0.ring.borrow().clone();
+		let our_zone = match ring.layout.node_role(&self.0.our_node_id) {
+			Some(pc) => &pc.zone,
+			None => "",
+		};
+
+		// Augment node IDs with some information used to sort them.
+		// The tuples are as follows:
+		//         (is another node?, is another zone?, latency, node ID)
+		// We store all of these tuples in a vec that we can sort.
+		// By sorting this vec, we prioritize ourself, then nodes in the same zone,
+		// and within a same zone we prioritize nodes with the lowest latency.
+		let mut nodes = nodes
+			.iter()
+			.map(|to| {
+				let peer_zone = match ring.layout.node_role(to) {
+					Some(pc) => &pc.zone,
+					None => "",
+				};
+				let peer_avg_ping = peer_list
+					.iter()
+					.find(|x| x.id.as_ref() == to.as_slice())
+					.and_then(|pi| pi.avg_ping)
+					.unwrap_or_else(|| Duration::from_secs(10)); // peer not in the list or no average ping available: assume 10 s
+				(
+					*to != self.0.our_node_id,
+					peer_zone != our_zone,
+					peer_avg_ping,
+					*to,
+				)
+			})
+			.collect::<Vec<_>>();
+
+		// Sort nodes by (ourself first, then same zone, then low latency); `false` orders before `true`
+		nodes.sort_by_key(|(diffnode, diffzone, ping, _to)| (*diffnode, *diffzone, *ping));
+
+		nodes
+			.into_iter()
+			.map(|(_, _, _, to)| to)
+			.collect::<Vec<_>>()
+	}
 }
diff --git a/src/rpc/system.rs b/src/rpc/system.rs
index 34031b10..7eb25195 100644
--- a/src/rpc/system.rs
+++ b/src/rpc/system.rs
@@ -2,7 +2,7 @@
 use std::collections::HashMap;
 use std::io::{Read, Write};
 use std::net::{IpAddr, SocketAddr};
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::sync::{Arc, RwLock};
 use std::time::{Duration, Instant};
 
@@ -16,9 +16,9 @@ use tokio::sync::watch;
 use tokio::sync::Mutex;
 
 use netapp::endpoint::{Endpoint, EndpointHandler};
+use netapp::message::*;
 use netapp::peering::fullmesh::FullMeshPeeringStrategy;
-use netapp::proto::*;
-use netapp::util::parse_and_resolve_peer_addr;
+use netapp::util::parse_and_resolve_peer_addr_async;
 use netapp::{NetApp, NetworkKey, NodeID, NodeKey};
 
 use garage_util::background::BackgroundRunner;
@@ -37,10 +37,11 @@ use crate::rpc_helper::*;
 
 const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60);
 const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10);
-const PING_TIMEOUT: Duration = Duration::from_secs(2);
 
-/// Version tag used for version check upon Netapp connection
-pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650007; // garage 0x0007
+/// Version tag used for version check upon Netapp connection.
+/// Cluster nodes with different version tags are deemed
+/// incompatible and will refuse to connect.
+pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650008; // garage 0x0008
 
 /// RPC endpoint used for calls related to membership
 pub const SYSTEM_RPC_PATH: &str = "garage_rpc/membership.rs/SystemRpc";
@@ -90,7 +91,7 @@ pub struct System {
 
 	rpc_listen_addr: SocketAddr,
 	rpc_public_addr: Option<SocketAddr>,
-	bootstrap_peers: Vec<(NodeID, SocketAddr)>,
+	bootstrap_peers: Vec<String>,
 
 	consul_discovery: Option<ConsulDiscoveryParam>,
 	#[cfg(feature = "kubernetes-discovery")]
@@ -104,6 +105,9 @@ pub struct System {
 
 	/// The job runner of this node
 	pub background: Arc<BackgroundRunner>,
+
+	/// Path to metadata directory
+	pub metadata_dir: PathBuf,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -194,7 +198,7 @@ impl System {
 		replication_factor: usize,
         zone_redundancy: usize,
 		config: &Config,
-	) -> Arc<Self> {
+	) -> Result<Arc<Self>, Error> {
 		let node_key =
 			gen_node_key(&config.metadata_dir).expect("Unable to read or generate node ID");
 		info!(
@@ -202,11 +206,21 @@ impl System {
 			hex::encode(&node_key.public_key()[..8])
 		);
 
-		let persist_cluster_layout = Persister::new(&config.metadata_dir, "cluster_layout");
+		let persist_cluster_layout: Persister<ClusterLayout> =
+			Persister::new(&config.metadata_dir, "cluster_layout");
 		let persist_peer_list = Persister::new(&config.metadata_dir, "peer_list");
 
 		let cluster_layout = match persist_cluster_layout.load() {
-			Ok(x) => x,
+			Ok(x) => {
+				if x.replication_factor != replication_factor {
+					return Err(Error::Message(format!(
+						"Prevous cluster layout has replication factor {}, which is different than the one specified in the config file ({}). The previous cluster layout can be purged, if you know what you are doing, simply by deleting the `cluster_layout` file in your metadata directory.",
+						x.replication_factor,
+						replication_factor
+					)));
+				}
+				x
+			}
 			Err(e) => {
 				info!(
 					"No valid previous cluster layout stored ({}), starting fresh.",
@@ -228,8 +242,29 @@ impl System {
 		let ring = Ring::new(cluster_layout, replication_factor);
 		let (update_ring, ring) = watch::channel(Arc::new(ring));
 
-		let rpc_public_addr = match config.rpc_public_addr {
-			Some(a) => Some(a),
+		let rpc_public_addr = match &config.rpc_public_addr {
+			Some(a_str) => {
+				use std::net::ToSocketAddrs;
+				match a_str.to_socket_addrs() {
+					Err(e) => {
+						error!(
+							"Cannot resolve rpc_public_addr {} from config file: {}.",
+							a_str, e
+						);
+						None
+					}
+					Ok(a) => {
+						let a = a.collect::<Vec<_>>();
+						if a.is_empty() {
+							error!("rpc_public_addr {} resolve to no known IP address", a_str);
+						}
+						if a.len() > 1 {
+							warn!("Multiple possible resolutions for rpc_public_addr: {:?}. Taking the first one.", a);
+						}
+						a.into_iter().next()
+					}
+				}
+			}
 			None => {
 				let addr =
 					get_default_ip().map(|ip| SocketAddr::new(ip, config.rpc_bind_addr.port()));
@@ -239,13 +274,15 @@ impl System {
 				addr
 			}
 		};
+		if rpc_public_addr.is_none() {
+			warn!("This Garage node does not know its publicly reachable RPC address, this might hamper intra-cluster communication.");
+		}
 
 		let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key);
-		let fullmesh = FullMeshPeeringStrategy::new(
-			netapp.clone(),
-			config.bootstrap_peers.clone(),
-			rpc_public_addr,
-		);
+		let fullmesh = FullMeshPeeringStrategy::new(netapp.clone(), vec![], rpc_public_addr);
+		if let Some(ping_timeout) = config.rpc_ping_timeout_msec {
+			fullmesh.set_ping_timeout_millis(ping_timeout);
+		}
 
 		let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into());
 
@@ -283,7 +320,13 @@ impl System {
 			node_status: RwLock::new(HashMap::new()),
 			netapp: netapp.clone(),
 			fullmesh: fullmesh.clone(),
-			rpc: RpcHelper::new(netapp.id.into(), fullmesh, background.clone(), ring.clone()),
+			rpc: RpcHelper::new(
+				netapp.id.into(),
+				fullmesh,
+				background.clone(),
+				ring.clone(),
+				config.rpc_timeout_msec.map(Duration::from_millis),
+			),
 			system_endpoint,
 			replication_factor,
 			rpc_listen_addr: config.rpc_bind_addr,
@@ -296,9 +339,10 @@ impl System {
 			ring,
 			update_ring: Mutex::new(update_ring),
 			background,
+			metadata_dir: config.metadata_dir.clone(),
 		});
 		sys.system_endpoint.set_handler(sys.clone());
-		sys
+		Ok(sys)
 	}
 
 	/// Perform bootstraping, starting the ping loop
@@ -313,6 +357,80 @@ impl System {
 		);
 	}
 
+	// ---- Administrative operations (directly available and
+	//      also available through RPC) ----
+
+	pub fn get_known_nodes(&self) -> Vec<KnownNodeInfo> { // builds one KnownNodeInfo per entry of the full-mesh peer list
+		let node_status = self.node_status.read().unwrap(); // panics if the lock is poisoned
+		let known_nodes = self
+			.fullmesh
+			.get_peer_list()
+			.iter()
+			.map(|n| KnownNodeInfo {
+				id: n.id.into(),
+				addr: n.addr,
+				is_up: n.is_up(),
+				last_seen_secs_ago: n
+					.last_seen
+					.map(|t| (Instant::now().saturating_duration_since(t)).as_secs()), // yields zero instead of panicking if `t` is later than now
+				status: node_status
+					.get(&n.id.into())
+					.cloned()
+					.map(|(_, st)| st)
+					.unwrap_or(NodeStatus { // placeholder for peers with no entry in `node_status`
+						hostname: "?".to_string(),
+						replication_factor: 0,
+						cluster_layout_version: 0,
+						cluster_layout_staging_hash: Hash::from([0u8; 32]),
+					}),
+			})
+			.collect::<Vec<_>>();
+		known_nodes
+	}
+
+	pub fn get_cluster_layout(&self) -> ClusterLayout { // returns a clone of the layout held by the current ring
+		self.ring.borrow().layout.clone()
+	}
+
+	pub async fn update_cluster_layout(
+		self: &Arc<Self>,
+		layout: &ClusterLayout,
+	) -> Result<(), Error> {
+		self.handle_advertise_cluster_layout(layout).await?; // same handler as for a layout advertised over RPC; its SystemRpc reply is discarded
+		Ok(())
+	}
+
+	pub async fn connect(&self, node: &str) -> Result<(), Error> { // connects to a peer given as a node specification string
+		let (pubkey, addrs) = parse_and_resolve_peer_addr_async(node)
+			.await
+			.ok_or_else(|| {
+				Error::Message(format!(
+					"Unable to parse or resolve node specification: {}",
+					node
+				))
+			})?;
+		let mut errors = vec![];
+		for ip in addrs.iter() { // try each resolved address in turn
+			match self
+				.netapp
+				.clone()
+				.try_connect(*ip, pubkey)
+				.await
+				.err_context(CONNECT_ERROR_MESSAGE)
+			{
+				Ok(()) => return Ok(()), // stop at the first address that accepts the connection
+				Err(e) => {
+					errors.push((*ip, e));
+				}
+			}
+		}
+		if errors.len() == 1 { // a single failure is reported as its own message
+			Err(Error::Message(errors[0].1.to_string()))
+		} else { // otherwise the (address, error) list is debug-formatted; NOTE(review): this would be "[]" if `addrs` were empty
+			Err(Error::Message(format!("{:?}", errors)))
+		}
+	}
+
 	// ---- INTERNALS ----
 
 	async fn advertise_to_consul(self: Arc<Self>) -> Result<(), Error> {
@@ -385,32 +503,11 @@ impl System {
 		self.local_status.swap(Arc::new(new_si));
 	}
 
+	// --- RPC HANDLERS ---
+
 	async fn handle_connect(&self, node: &str) -> Result<SystemRpc, Error> {
-		let (pubkey, addrs) = parse_and_resolve_peer_addr(node).ok_or_else(|| {
-			Error::Message(format!(
-				"Unable to parse or resolve node specification: {}",
-				node
-			))
-		})?;
-		let mut errors = vec![];
-		for ip in addrs.iter() {
-			match self
-				.netapp
-				.clone()
-				.try_connect(*ip, pubkey)
-				.await
-				.err_context(CONNECT_ERROR_MESSAGE)
-			{
-				Ok(()) => return Ok(SystemRpc::Ok),
-				Err(e) => {
-					errors.push((*ip, e));
-				}
-			}
-		}
-		return Err(Error::Message(format!(
-			"Could not connect to specified peers. Errors: {:?}",
-			errors
-		)));
+		self.connect(node).await?;
+		Ok(SystemRpc::Ok)
 	}
 
 	fn handle_pull_cluster_layout(&self) -> SystemRpc {
@@ -419,28 +516,7 @@ impl System {
 	}
 
 	fn handle_get_known_nodes(&self) -> SystemRpc {
-		let node_status = self.node_status.read().unwrap();
-		let known_nodes = self
-			.fullmesh
-			.get_peer_list()
-			.iter()
-			.map(|n| KnownNodeInfo {
-				id: n.id.into(),
-				addr: n.addr,
-				is_up: n.is_up(),
-				last_seen_secs_ago: n.last_seen.map(|t| (Instant::now() - t).as_secs()),
-				status: node_status
-					.get(&n.id.into())
-					.cloned()
-					.map(|(_, st)| st)
-					.unwrap_or(NodeStatus {
-						hostname: "?".to_string(),
-						replication_factor: 0,
-						cluster_layout_version: 0,
-						cluster_layout_staging_hash: Hash::from([0u8; 32]),
-					}),
-			})
-			.collect::<Vec<_>>();
+		let known_nodes = self.get_known_nodes();
 		SystemRpc::ReturnKnownNodes(known_nodes)
 	}
 
@@ -452,7 +528,7 @@ impl System {
 		let local_info = self.local_status.load();
 
 		if local_info.replication_factor < info.replication_factor {
-			error!("Some node have a higher replication factor ({}) than this one ({}). This is not supported and might lead to bugs",
+			error!("Some node have a higher replication factor ({}) than this one ({}). This is not supported and will lead to data corruption. Shutting down for safety.",
 				info.replication_factor,
 				local_info.replication_factor);
 			std::process::exit(1);
@@ -477,9 +553,19 @@ impl System {
 	}
 
 	async fn handle_advertise_cluster_layout(
-		self: Arc<Self>,
+		self: &Arc<Self>,
 		adv: &ClusterLayout,
 	) -> Result<SystemRpc, Error> {
+		if adv.replication_factor != self.replication_factor {
+			let msg = format!(
+				"Received a cluster layout from another node with replication factor {}, which is different from what we have in our configuration ({}). Discarding the cluster layout we received.",
+				adv.replication_factor,
+				self.replication_factor
+			);
+			error!("{}", msg);
+			return Err(Error::Message(msg));
+		}
+
 		let update_ring = self.update_ring.lock().await;
 		let mut layout: ClusterLayout = self.ring.borrow().layout.clone();
 
@@ -505,7 +591,7 @@ impl System {
 						SystemRpc::AdvertiseClusterLayout(layout),
 						RequestStrategy::with_priority(PRIO_HIGH),
 					)
-					.await;
+					.await?;
 				Ok(())
 			});
 			self.background.spawn(self.clone().save_cluster_layout());
@@ -520,11 +606,12 @@ impl System {
 
 			self.update_local_status();
 			let local_status: NodeStatus = self.local_status.load().as_ref().clone();
-			self.rpc
+			let _ = self
+				.rpc
 				.broadcast(
 					&self.system_endpoint,
 					SystemRpc::AdvertiseStatus(local_status),
-					RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT),
+					RequestStrategy::with_priority(PRIO_HIGH),
 				)
 				.await;
 
@@ -550,7 +637,7 @@ impl System {
 			if not_configured || no_peers || bad_peers {
 				info!("Doing a bootstrap/discovery step (not_configured: {}, no_peers: {}, bad_peers: {})", not_configured, no_peers, bad_peers);
 
-				let mut ping_list = self.bootstrap_peers.clone();
+				let mut ping_list = resolve_peers(&self.bootstrap_peers).await;
 
 				// Add peer list from list stored on disk
 				if let Ok(peers) = self.persist_peer_list.load_async().await {
@@ -648,7 +735,7 @@ impl System {
 				&self.system_endpoint,
 				peer,
 				SystemRpc::PullClusterLayout,
-				RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT),
+				RequestStrategy::with_priority(PRIO_HIGH),
 			)
 			.await;
 		if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp {
@@ -681,6 +768,25 @@ fn get_default_ip() -> Option<IpAddr> {
 		.map(|a| a.ip())
 }
 
+async fn resolve_peers(peers: &[String]) -> Vec<(NodeID, SocketAddr)> { // resolves peer specification strings into (node ID, address) pairs
+	let mut ret = vec![];
+
+	for peer in peers.iter() { // peers are resolved one after the other, each awaited in turn
+		match parse_and_resolve_peer_addr_async(peer).await {
+			Some((pubkey, addrs)) => {
+				for ip in addrs {
+					ret.push((pubkey, ip)); // one entry per resolved address, all with the same public key
+				}
+			}
+			None => { // best effort: a peer that cannot be parsed or resolved is logged and skipped
+				warn!("Unable to parse and/or resolve peer hostname {}", peer);
+			}
+		}
+	}
+
+	ret
+}
+
 struct ConsulDiscoveryParam {
 	consul_host: String,
 	service_name: String,
diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml
index ed1a213f..38c6b41c 100644
--- a/src/table/Cargo.toml
+++ b/src/table/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "garage_table"
-version = "0.7.0"
+version = "0.8.0"
 authors = ["Alex Auvolat <alex@adnab.me>"]
 edition = "2018"
 license = "AGPL-3.0"
@@ -14,19 +14,19 @@ path = "lib.rs"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-garage_rpc = { version = "0.7.0", path = "../rpc" }
-garage_util = { version = "0.7.0", path = "../util" }
+garage_db = { version = "0.8.0", path = "../db" }
+garage_rpc = { version = "0.8.0", path = "../rpc" }
+garage_util = { version = "0.8.0", path = "../util" }
 
 opentelemetry = "0.17"
 
 async-trait = "0.1.7"
 bytes = "1.0"
+hex = "0.4"
 hexdump = "0.1"
 tracing = "0.1.30"
 rand = "0.8"
 
-sled = "0.34"
-
 rmp-serde = "0.15"
 serde = { version = "1.0", default-features = false, features = ["derive", "rc"] }
 serde_bytes = "0.11"
diff --git a/src/table/data.rs b/src/table/data.rs
index ff7965f5..3212e82b 100644
--- a/src/table/data.rs
+++ b/src/table/data.rs
@@ -1,13 +1,15 @@
 use core::borrow::Borrow;
+use std::convert::TryInto;
 use std::sync::Arc;
 
 use serde_bytes::ByteBuf;
-use sled::Transactional;
 use tokio::sync::Notify;
 
+use garage_db as db;
+use garage_db::counted_tree_hack::CountedTree;
+
 use garage_util::data::*;
 use garage_util::error::*;
-use garage_util::sled_counter::SledCountedTree;
 
 use garage_rpc::system::System;
 
@@ -16,19 +18,20 @@ use crate::gc::GcTodoEntry;
 use crate::metrics::*;
 use crate::replication::*;
 use crate::schema::*;
+use crate::util::*;
 
 pub struct TableData<F: TableSchema, R: TableReplication> {
 	system: Arc<System>,
 
-	pub(crate) instance: F,
-	pub(crate) replication: R,
+	pub instance: F,
+	pub replication: R,
 
-	pub store: sled::Tree,
+	pub store: db::Tree,
 
-	pub(crate) merkle_tree: sled::Tree,
-	pub(crate) merkle_todo: sled::Tree,
+	pub(crate) merkle_tree: db::Tree,
+	pub(crate) merkle_todo: db::Tree,
 	pub(crate) merkle_todo_notify: Notify,
-	pub(crate) gc_todo: SledCountedTree,
+	pub(crate) gc_todo: CountedTree,
 
 	pub(crate) metrics: TableMetrics,
 }
@@ -38,7 +41,7 @@ where
 	F: TableSchema,
 	R: TableReplication,
 {
-	pub fn new(system: Arc<System>, instance: F, replication: R, db: &sled::Db) -> Arc<Self> {
+	pub fn new(system: Arc<System>, instance: F, replication: R, db: &db::Db) -> Arc<Self> {
 		let store = db
 			.open_tree(&format!("{}:table", F::TABLE_NAME))
 			.expect("Unable to open DB tree");
@@ -53,7 +56,7 @@ where
 		let gc_todo = db
 			.open_tree(&format!("{}:gc_todo_v2", F::TABLE_NAME))
 			.expect("Unable to open DB tree");
-		let gc_todo = SledCountedTree::new(gc_todo);
+		let gc_todo = CountedTree::new(gc_todo).expect("Cannot count gc_todo_v2");
 
 		let metrics = TableMetrics::new(F::TABLE_NAME, merkle_todo.clone(), gc_todo.clone());
 
@@ -83,18 +86,48 @@ where
 
 	pub fn read_range(
 		&self,
-		p: &F::P,
-		s: &Option<F::S>,
+		partition_key: &F::P,
+		start: &Option<F::S>,
+		filter: &Option<F::Filter>,
+		limit: usize,
+		enumeration_order: EnumerationOrder,
+	) -> Result<Vec<Arc<ByteBuf>>, Error> {
+		let partition_hash = partition_key.hash();
+		match enumeration_order {
+			EnumerationOrder::Forward => {
+				let first_key = match start {
+					None => partition_hash.to_vec(),
+					Some(sk) => self.tree_key(partition_key, sk),
+				};
+				let range = self.store.range(first_key..)?;
+				self.read_range_aux(partition_hash, range, filter, limit)
+			}
+			EnumerationOrder::Reverse => match start {
+				Some(sk) => {
+					let last_key = self.tree_key(partition_key, sk);
+					let range = self.store.range_rev(..=last_key)?;
+					self.read_range_aux(partition_hash, range, filter, limit)
+				}
+				None => {
+					let mut last_key = partition_hash.to_vec();
+					let lower = u128::from_be_bytes(last_key[16..32].try_into().unwrap());
+					last_key[16..32].copy_from_slice(&u128::to_be_bytes(lower + 1));
+					let range = self.store.range_rev(..last_key)?;
+					self.read_range_aux(partition_hash, range, filter, limit)
+				}
+			},
+		}
+	}
+
+	fn read_range_aux<'a>(
+		&self,
+		partition_hash: Hash,
+		range: db::ValueIter<'a>,
 		filter: &Option<F::Filter>,
 		limit: usize,
 	) -> Result<Vec<Arc<ByteBuf>>, Error> {
-		let partition_hash = p.hash();
-		let first_key = match s {
-			None => partition_hash.to_vec(),
-			Some(sk) => self.tree_key(p, sk),
-		};
 		let mut ret = vec![];
-		for item in self.store.range(first_key..) {
+		for item in range {
 			let (key, value) = item?;
 			if &key[..32] != partition_hash.as_slice() {
 				break;
@@ -107,7 +140,7 @@ where
 				}
 			};
 			if keep {
-				ret.push(Arc::new(ByteBuf::from(value.as_ref())));
+				ret.push(Arc::new(ByteBuf::from(value)));
 			}
 			if ret.len() >= limit {
 				break;
@@ -136,17 +169,29 @@ where
 		let update = self.decode_entry(update_bytes)?;
 		let tree_key = self.tree_key(update.partition_key(), update.sort_key());
 
-		let changed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| {
-			let (old_entry, old_bytes, new_entry) = match store.get(&tree_key)? {
+		self.update_entry_with(&tree_key[..], |ent| match ent {
+			Some(mut ent) => {
+				ent.merge(&update);
+				ent
+			}
+			None => update.clone(),
+		})?;
+		Ok(())
+	}
+
+	pub fn update_entry_with(
+		&self,
+		tree_key: &[u8],
+		f: impl Fn(Option<F::E>) -> F::E,
+	) -> Result<Option<F::E>, Error> {
+		let changed = self.store.db().transaction(|mut tx| {
+			let (old_entry, old_bytes, new_entry) = match tx.get(&self.store, tree_key)? {
 				Some(old_bytes) => {
-					let old_entry = self
-						.decode_entry(&old_bytes)
-						.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
-					let mut new_entry = old_entry.clone();
-					new_entry.merge(&update);
+					let old_entry = self.decode_entry(&old_bytes).map_err(db::TxError::Abort)?;
+					let new_entry = f(Some(old_entry.clone()));
 					(Some(old_entry), Some(old_bytes), new_entry)
 				}
-				None => (None, None, update.clone()),
+				None => (None, None, f(None)),
 			};
 
 			// Scenario 1: the value changed, so of course there is a change
@@ -158,24 +203,28 @@ where
 			// the associated Merkle tree entry.
 			let new_bytes = rmp_to_vec_all_named(&new_entry)
 				.map_err(Error::RmpEncode)
-				.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
+				.map_err(db::TxError::Abort)?;
 			let encoding_changed = Some(&new_bytes[..]) != old_bytes.as_ref().map(|x| &x[..]);
+			drop(old_bytes);
 
 			if value_changed || encoding_changed {
 				let new_bytes_hash = blake2sum(&new_bytes[..]);
-				mkl_todo.insert(tree_key.clone(), new_bytes_hash.as_slice())?;
-				store.insert(tree_key.clone(), new_bytes)?;
-				Ok(Some((old_entry, new_entry, new_bytes_hash)))
+				tx.insert(&self.merkle_todo, tree_key, new_bytes_hash.as_slice())?;
+				tx.insert(&self.store, tree_key, new_bytes)?;
+
+				self.instance
+					.updated(&mut tx, old_entry.as_ref(), Some(&new_entry))?;
+
+				Ok(Some((new_entry, new_bytes_hash)))
 			} else {
 				Ok(None)
 			}
 		})?;
 
-		if let Some((old_entry, new_entry, new_bytes_hash)) = changed {
+		if let Some((new_entry, new_bytes_hash)) = changed {
 			self.metrics.internal_update_counter.add(1);
 
 			let is_tombstone = new_entry.is_tombstone();
-			self.instance.updated(old_entry, Some(new_entry));
 			self.merkle_todo_notify.notify_one();
 			if is_tombstone {
 				// We are only responsible for GC'ing this item if we are the
@@ -187,31 +236,34 @@ where
 				let pk_hash = Hash::try_from(&tree_key[..32]).unwrap();
 				let nodes = self.replication.write_nodes(&pk_hash);
 				if nodes.first() == Some(&self.system.id) {
-					GcTodoEntry::new(tree_key, new_bytes_hash).save(&self.gc_todo)?;
+					GcTodoEntry::new(tree_key.to_vec(), new_bytes_hash).save(&self.gc_todo)?;
 				}
 			}
-		}
 
-		Ok(())
+			Ok(Some(new_entry))
+		} else {
+			Ok(None)
+		}
 	}
 
 	pub(crate) fn delete_if_equal(self: &Arc<Self>, k: &[u8], v: &[u8]) -> Result<bool, Error> {
-		let removed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| {
-			if let Some(cur_v) = store.get(k)? {
-				if cur_v == v {
-					store.remove(k)?;
-					mkl_todo.insert(k, vec![])?;
-					return Ok(true);
+		let removed = self
+			.store
+			.db()
+			.transaction(|mut tx| match tx.get(&self.store, k)? {
+				Some(cur_v) if cur_v == v => {
+					tx.remove(&self.store, k)?;
+					tx.insert(&self.merkle_todo, k, vec![])?;
+
+					let old_entry = self.decode_entry(v).map_err(db::TxError::Abort)?;
+					self.instance.updated(&mut tx, Some(&old_entry), None)?;
+					Ok(true)
 				}
-			}
-			Ok(false)
-		})?;
+				_ => Ok(false),
+			})?;
 
 		if removed {
 			self.metrics.internal_delete_counter.add(1);
-
-			let old_entry = self.decode_entry(v)?;
-			self.instance.updated(Some(old_entry), None);
 			self.merkle_todo_notify.notify_one();
 		}
 		Ok(removed)
@@ -222,36 +274,37 @@ where
 		k: &[u8],
 		vhash: Hash,
 	) -> Result<bool, Error> {
-		let removed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| {
-			if let Some(cur_v) = store.get(k)? {
-				if blake2sum(&cur_v[..]) == vhash {
-					store.remove(k)?;
-					mkl_todo.insert(k, vec![])?;
-					return Ok(Some(cur_v));
+		let removed = self
+			.store
+			.db()
+			.transaction(|mut tx| match tx.get(&self.store, k)? {
+				Some(cur_v) if blake2sum(&cur_v[..]) == vhash => {
+					tx.remove(&self.store, k)?;
+					tx.insert(&self.merkle_todo, k, vec![])?;
+
+					let old_entry = self.decode_entry(&cur_v[..]).map_err(db::TxError::Abort)?;
+					self.instance.updated(&mut tx, Some(&old_entry), None)?;
+					Ok(true)
 				}
-			}
-			Ok(None)
-		})?;
+				_ => Ok(false),
+			})?;
 
-		if let Some(old_v) = removed {
-			let old_entry = self.decode_entry(&old_v[..])?;
-			self.instance.updated(Some(old_entry), None);
+		if removed {
+			self.metrics.internal_delete_counter.add(1);
 			self.merkle_todo_notify.notify_one();
-			Ok(true)
-		} else {
-			Ok(false)
 		}
+		Ok(removed)
 	}
 
 	// ---- Utility functions ----
 
-	pub(crate) fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
+	pub fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
 		let mut ret = p.hash().to_vec();
 		ret.extend(s.sort_key());
 		ret
 	}
 
-	pub(crate) fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
+	pub fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
 		match rmp_serde::decode::from_read_ref::<_, F::E>(bytes) {
 			Ok(x) => Ok(x),
 			Err(e) => match F::try_migrate(bytes) {
@@ -267,7 +320,7 @@ where
 		}
 	}
 
-	pub fn gc_todo_len(&self) -> usize {
-		self.gc_todo.len()
+	pub fn gc_todo_len(&self) -> Result<usize, Error> {
+		Ok(self.gc_todo.len())
 	}
 }
diff --git a/src/table/gc.rs b/src/table/gc.rs
index 2a05b6ae..83e7eeff 100644
--- a/src/table/gc.rs
+++ b/src/table/gc.rs
@@ -8,13 +8,13 @@ use serde::{Deserialize, Serialize};
 use serde_bytes::ByteBuf;
 
 use futures::future::join_all;
-use futures::select;
-use futures_util::future::*;
 use tokio::sync::watch;
 
+use garage_db::counted_tree_hack::CountedTree;
+
+use garage_util::background::*;
 use garage_util::data::*;
 use garage_util::error::*;
-use garage_util::sled_counter::SledCountedTree;
 use garage_util::time::*;
 
 use garage_rpc::system::System;
@@ -25,7 +25,6 @@ use crate::replication::*;
 use crate::schema::*;
 
 const TABLE_GC_BATCH_SIZE: usize = 1024;
-const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
 
 // GC delay for table entries: 1 day (24 hours)
 // (the delay before the entry is added in the GC todo list
@@ -68,50 +67,24 @@ where
 
 		gc.endpoint.set_handler(gc.clone());
 
-		let gc1 = gc.clone();
-		system.background.spawn_worker(
-			format!("GC loop for {}", F::TABLE_NAME),
-			move |must_exit: watch::Receiver<bool>| gc1.gc_loop(must_exit),
-		);
+		system.background.spawn_worker(GcWorker::new(gc.clone()));
 
 		gc
 	}
 
-	async fn gc_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
-		while !*must_exit.borrow() {
-			match self.gc_loop_iter().await {
-				Ok(None) => {
-					// Stuff was done, loop immediately
-				}
-				Ok(Some(wait_delay)) => {
-					// Nothing was done, wait specified delay.
-					select! {
-						_ = tokio::time::sleep(wait_delay).fuse() => {},
-						_ = must_exit.changed().fuse() => {},
-					}
-				}
-				Err(e) => {
-					warn!("({}) Error doing GC: {}", F::TABLE_NAME, e);
-				}
-			}
-		}
-	}
-
 	async fn gc_loop_iter(&self) -> Result<Option<Duration>, Error> {
 		let now = now_msec();
 
-		let mut entries = vec![];
-		let mut excluded = vec![];
-
 		// List entries in the GC todo list
 		// These entries are put there when a tombstone is inserted in the table
 		// (see update_entry in data.rs)
-		for entry_kv in self.data.gc_todo.iter() {
+		let mut candidates = vec![];
+		for entry_kv in self.data.gc_todo.iter()? {
 			let (k, vhash) = entry_kv?;
-			let mut todo_entry = GcTodoEntry::parse(&k, &vhash);
+			let todo_entry = GcTodoEntry::parse(&k, &vhash);
 
 			if todo_entry.deletion_time() > now {
-				if entries.is_empty() && excluded.is_empty() {
+				if candidates.is_empty() {
 					// If the earliest entry in the todo list shouldn't yet be processed,
 					// return a duration to wait in the loop
 					return Ok(Some(Duration::from_millis(
@@ -123,15 +96,23 @@ where
 				}
 			}
 
-			let vhash = Hash::try_from(&vhash[..]).unwrap();
+			candidates.push(todo_entry);
+			if candidates.len() >= 2 * TABLE_GC_BATCH_SIZE {
+				break;
+			}
+		}
 
+		let mut entries = vec![];
+		let mut excluded = vec![];
+		for mut todo_entry in candidates {
 			// Check if the tombstone is still the current value of the entry.
 			// If not, we don't actually want to GC it, and we will remove it
 			// from the gc_todo table later (below).
+			let vhash = todo_entry.value_hash;
 			todo_entry.value = self
 				.data
 				.store
-				.get(&k[..])?
+				.get(&todo_entry.key[..])?
 				.filter(|v| blake2sum(&v[..]) == vhash)
 				.map(|v| v.to_vec());
 
@@ -254,9 +235,7 @@ where
 				&self.endpoint,
 				&nodes[..],
 				GcRpc::Update(updates),
-				RequestStrategy::with_priority(PRIO_BACKGROUND)
-					.with_quorum(nodes.len())
-					.with_timeout(TABLE_GC_RPC_TIMEOUT),
+				RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()),
 			)
 			.await
 			.err_context("GC: send tombstones")?;
@@ -277,9 +256,7 @@ where
 				&self.endpoint,
 				&nodes[..],
 				GcRpc::DeleteIfEqualHash(deletes),
-				RequestStrategy::with_priority(PRIO_BACKGROUND)
-					.with_quorum(nodes.len())
-					.with_timeout(TABLE_GC_RPC_TIMEOUT),
+				RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()),
 			)
 			.await
 			.err_context("GC: remote delete tombstones")?;
@@ -321,6 +298,66 @@ where
 	}
 }
 
+struct GcWorker<F, R> // background worker that repeatedly runs `gc_loop_iter` on a table's GC
+where
+	F: TableSchema + 'static,
+	R: TableReplication + 'static,
+{
+	gc: Arc<TableGc<F, R>>,
+	wait_delay: Duration, // how long `wait_for_work` sleeps; set from the delay returned by `gc_loop_iter`
+}
+
+impl<F, R> GcWorker<F, R>
+where
+	F: TableSchema + 'static,
+	R: TableReplication + 'static,
+{
+	fn new(gc: Arc<TableGc<F, R>>) -> Self { // wraps `gc` in a worker with no initial wait
+		Self {
+			gc,
+			wait_delay: Duration::from_secs(0), // overwritten by `work` whenever `gc_loop_iter` returns a delay
+		}
+	}
+}
+
+#[async_trait]
+impl<F, R> Worker for GcWorker<F, R>
+where
+	F: TableSchema + 'static,
+	R: TableReplication + 'static,
+{
+	fn name(&self) -> String { // worker name, derived from the table name
+		format!("{} GC", F::TABLE_NAME)
+	}
+
+	fn info(&self) -> Option<String> { // queue length summary, or None when the queue is empty
+		let l = self.gc.data.gc_todo_len().unwrap_or(0); // an error reading the length is reported as an empty queue
+		if l > 0 {
+			Some(format!("{} items in queue", l))
+		} else {
+			None
+		}
+	}
+
+	async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+		match self.gc.gc_loop_iter().await? { // runs one GC iteration; errors propagate to the caller
+			None => Ok(WorkerState::Busy), // no delay requested: report Busy
+			Some(delay) => {
+				self.wait_delay = delay; // remember the requested delay for `wait_for_work`
+				Ok(WorkerState::Idle)
+			}
+		}
+	}
+
+	async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
+		if *must_exit.borrow() { // exit is only checked here, before sleeping
+			return WorkerState::Done;
+		}
+		tokio::time::sleep(self.wait_delay).await; // NOTE(review): this sleep is not interrupted when `must_exit` changes
+		WorkerState::Busy
+	}
+}
+
 /// An entry stored in the gc_todo Sled tree associated with the table
 /// Contains helper function for parsing, saving, and removing
 /// such entry in Sled
@@ -353,17 +390,17 @@ impl GcTodoEntry {
 	}
 
 	/// Parses a GcTodoEntry from a (k, v) pair stored in the gc_todo tree
-	pub(crate) fn parse(sled_k: &[u8], sled_v: &[u8]) -> Self {
+	pub(crate) fn parse(db_k: &[u8], db_v: &[u8]) -> Self {
 		Self {
-			tombstone_timestamp: u64::from_be_bytes(sled_k[0..8].try_into().unwrap()),
-			key: sled_k[8..].to_vec(),
-			value_hash: Hash::try_from(sled_v).unwrap(),
+			tombstone_timestamp: u64::from_be_bytes(db_k[0..8].try_into().unwrap()),
+			key: db_k[8..].to_vec(),
+			value_hash: Hash::try_from(db_v).unwrap(),
 			value: None,
 		}
 	}
 
 	/// Saves the GcTodoEntry in the gc_todo tree
-	pub(crate) fn save(&self, gc_todo_tree: &SledCountedTree) -> Result<(), Error> {
+	pub(crate) fn save(&self, gc_todo_tree: &CountedTree) -> Result<(), Error> {
 		gc_todo_tree.insert(self.todo_table_key(), self.value_hash.as_slice())?;
 		Ok(())
 	}
@@ -373,9 +410,9 @@ impl GcTodoEntry {
 	/// This is usefull to remove a todo entry only under the condition
 	/// that it has not changed since the time it was read, i.e.
 	/// what we have to do is still the same
-	pub(crate) fn remove_if_equal(&self, gc_todo_tree: &SledCountedTree) -> Result<(), Error> {
-		let _ = gc_todo_tree.compare_and_swap::<_, _, Vec<u8>>(
-			&self.todo_table_key()[..],
+	pub(crate) fn remove_if_equal(&self, gc_todo_tree: &CountedTree) -> Result<(), Error> {
+		gc_todo_tree.compare_and_swap::<_, _, &[u8]>(
+			&self.todo_table_key(),
 			Some(self.value_hash),
 			None,
 		)?;
diff --git a/src/table/merkle.rs b/src/table/merkle.rs
index 93bf7e47..a5c29723 100644
--- a/src/table/merkle.rs
+++ b/src/table/merkle.rs
@@ -1,15 +1,13 @@
 use std::sync::Arc;
 use std::time::Duration;
 
-use futures::select;
-use futures_util::future::*;
+use async_trait::async_trait;
 use serde::{Deserialize, Serialize};
-use sled::transaction::{
-	ConflictableTransactionError, ConflictableTransactionResult, TransactionalTree,
-};
 use tokio::sync::watch;
 
-use garage_util::background::BackgroundRunner;
+use garage_db as db;
+
+use garage_util::background::*;
 use garage_util::data::*;
 use garage_util::error::Error;
 
@@ -79,43 +77,17 @@ where
 			empty_node_hash,
 		});
 
-		let ret2 = ret.clone();
-		background.spawn_worker(
-			format!("Merkle tree updater for {}", F::TABLE_NAME),
-			|must_exit: watch::Receiver<bool>| ret2.updater_loop(must_exit),
-		);
+		background.spawn_worker(MerkleWorker(ret.clone()));
 
 		ret
 	}
 
-	async fn updater_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
-		while !*must_exit.borrow() {
-			if let Some(x) = self.data.merkle_todo.iter().next() {
-				match x {
-					Ok((key, valhash)) => {
-						if let Err(e) = self.update_item(&key[..], &valhash[..]) {
-							warn!(
-								"({}) Error while updating Merkle tree item: {}",
-								F::TABLE_NAME,
-								e
-							);
-						}
-					}
-					Err(e) => {
-						warn!(
-							"({}) Error while iterating on Merkle todo tree: {}",
-							F::TABLE_NAME,
-							e
-						);
-						tokio::time::sleep(Duration::from_secs(10)).await;
-					}
-				}
-			} else {
-				select! {
-					_ = self.data.merkle_todo_notify.notified().fuse() => {},
-					_ = must_exit.changed().fuse() => {},
-				}
-			}
+	fn updater_loop_iter(&self) -> Result<WorkerState, Error> {
+		if let Some((key, valhash)) = self.data.merkle_todo.first()? {
+			self.update_item(&key, &valhash)?;
+			Ok(WorkerState::Busy)
+		} else {
+			Ok(WorkerState::Idle)
 		}
 	}
 
@@ -137,13 +109,16 @@ where
 		};
 		self.data
 			.merkle_tree
-			.transaction(|tx| self.update_item_rec(tx, k, &khash, &key, new_vhash))?;
+			.db()
+			.transaction(|mut tx| self.update_item_rec(&mut tx, k, &khash, &key, new_vhash))?;
 
-		let deleted = self
-			.data
-			.merkle_todo
-			.compare_and_swap::<_, _, Vec<u8>>(k, Some(vhash_by), None)?
-			.is_ok();
+		let deleted = self.data.merkle_todo.db().transaction(|mut tx| {
+			let remove = matches!(tx.get(&self.data.merkle_todo, k)?, Some(ov) if ov == vhash_by);
+			if remove {
+				tx.remove(&self.data.merkle_todo, k)?;
+			}
+			Ok(remove)
+		})?;
 
 		if !deleted {
 			debug!(
@@ -157,12 +132,12 @@ where
 
 	fn update_item_rec(
 		&self,
-		tx: &TransactionalTree,
+		tx: &mut db::Transaction<'_>,
 		k: &[u8],
 		khash: &Hash,
 		key: &MerkleNodeKey,
 		new_vhash: Option<Hash>,
-	) -> ConflictableTransactionResult<Option<Hash>, Error> {
+	) -> db::TxResult<Option<Hash>, Error> {
 		let i = key.prefix.len();
 
 		// Read node at current position (defined by the prefix stored in key)
@@ -203,7 +178,7 @@ where
 							}
 							MerkleNode::Intermediate(_) => Some(MerkleNode::Intermediate(children)),
 							x @ MerkleNode::Leaf(_, _) => {
-								tx.remove(key_sub.encode())?;
+								tx.remove(&self.data.merkle_tree, key_sub.encode())?;
 								Some(x)
 							}
 						}
@@ -283,28 +258,27 @@ where
 
 	fn read_node_txn(
 		&self,
-		tx: &TransactionalTree,
+		tx: &mut db::Transaction<'_>,
 		k: &MerkleNodeKey,
-	) -> ConflictableTransactionResult<MerkleNode, Error> {
-		let ent = tx.get(k.encode())?;
-		MerkleNode::decode_opt(ent).map_err(ConflictableTransactionError::Abort)
+	) -> db::TxResult<MerkleNode, Error> {
+		let ent = tx.get(&self.data.merkle_tree, k.encode())?;
+		MerkleNode::decode_opt(&ent).map_err(db::TxError::Abort)
 	}
 
 	fn put_node_txn(
 		&self,
-		tx: &TransactionalTree,
+		tx: &mut db::Transaction<'_>,
 		k: &MerkleNodeKey,
 		v: &MerkleNode,
-	) -> ConflictableTransactionResult<Hash, Error> {
+	) -> db::TxResult<Hash, Error> {
 		trace!("Put Merkle node: {:?} => {:?}", k, v);
 		if *v == MerkleNode::Empty {
-			tx.remove(k.encode())?;
+			tx.remove(&self.data.merkle_tree, k.encode())?;
 			Ok(self.empty_node_hash)
 		} else {
-			let vby = rmp_to_vec_all_named(v)
-				.map_err(|e| ConflictableTransactionError::Abort(e.into()))?;
+			let vby = rmp_to_vec_all_named(v).map_err(|e| db::TxError::Abort(e.into()))?;
 			let rethash = blake2sum(&vby[..]);
-			tx.insert(k.encode(), vby)?;
+			tx.insert(&self.data.merkle_tree, k.encode(), vby)?;
 			Ok(rethash)
 		}
 	}
@@ -312,15 +286,63 @@ where
 	// Access a node in the Merkle tree, used by the sync protocol
 	pub(crate) fn read_node(&self, k: &MerkleNodeKey) -> Result<MerkleNode, Error> {
 		let ent = self.data.merkle_tree.get(k.encode())?;
-		MerkleNode::decode_opt(ent)
+		MerkleNode::decode_opt(&ent)
+	}
+
+	pub fn merkle_tree_len(&self) -> Result<usize, Error> {
+		Ok(self.data.merkle_tree.len()?)
 	}
 
-	pub fn merkle_tree_len(&self) -> usize {
-		self.data.merkle_tree.len()
+	pub fn todo_len(&self) -> Result<usize, Error> {
+		Ok(self.data.merkle_todo.len()?)
 	}
+}
+
+struct MerkleWorker<F, R>(Arc<MerkleUpdater<F, R>>)
+where
+	F: TableSchema + 'static,
+	R: TableReplication + 'static;
 
-	pub fn todo_len(&self) -> usize {
-		self.data.merkle_todo.len()
+#[async_trait]
+impl<F, R> Worker for MerkleWorker<F, R>
+where
+	F: TableSchema + 'static,
+	R: TableReplication + 'static,
+{
+	fn name(&self) -> String {
+		format!("{} Merkle tree updater", F::TABLE_NAME)
+	}
+
+	fn info(&self) -> Option<String> {
+		let l = self.0.todo_len().unwrap_or(0);
+		if l > 0 {
+			Some(format!("{} items in queue", l))
+		} else {
+			None
+		}
+	}
+
+	async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+		let updater = self.0.clone();
+		tokio::task::spawn_blocking(move || {
+			for _i in 0..100 {
+				let s = updater.updater_loop_iter();
+				if !matches!(s, Ok(WorkerState::Busy)) {
+					return s;
+				}
+			}
+			Ok(WorkerState::Busy)
+		})
+		.await
+		.unwrap()
+	}
+
+	async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
+		if *must_exit.borrow() {
+			return WorkerState::Done;
+		}
+		tokio::time::sleep(Duration::from_secs(10)).await;
+		WorkerState::Busy
 	}
 }
 
@@ -347,7 +369,7 @@ impl MerkleNodeKey {
 }
 
 impl MerkleNode {
-	fn decode_opt(ent: Option<sled::IVec>) -> Result<Self, Error> {
+	fn decode_opt(ent: &Option<db::Value>) -> Result<Self, Error> {
 		match ent {
 			None => Ok(MerkleNode::Empty),
 			Some(v) => Ok(rmp_serde::decode::from_read_ref::<_, MerkleNode>(&v[..])?),
diff --git a/src/table/metrics.rs b/src/table/metrics.rs
index 752a2a6d..3a1783e0 100644
--- a/src/table/metrics.rs
+++ b/src/table/metrics.rs
@@ -1,6 +1,7 @@
 use opentelemetry::{global, metrics::*, KeyValue};
 
-use garage_util::sled_counter::SledCountedTree;
+use garage_db as db;
+use garage_db::counted_tree_hack::CountedTree;
 
 /// TableMetrics reference all counter used for metrics
 pub struct TableMetrics {
@@ -19,21 +20,19 @@ pub struct TableMetrics {
 	pub(crate) sync_items_received: Counter<u64>,
 }
 impl TableMetrics {
-	pub fn new(
-		table_name: &'static str,
-		merkle_todo: sled::Tree,
-		gc_todo: SledCountedTree,
-	) -> Self {
+	pub fn new(table_name: &'static str, merkle_todo: db::Tree, gc_todo: CountedTree) -> Self {
 		let meter = global::meter(table_name);
 		TableMetrics {
 			_merkle_todo_len: meter
 				.u64_value_observer(
 					"table.merkle_updater_todo_queue_length",
 					move |observer| {
-						observer.observe(
-							merkle_todo.len() as u64,
-							&[KeyValue::new("table_name", table_name)],
-						)
+						if let Ok(v) = merkle_todo.len() {
+							observer.observe(
+								v as u64,
+								&[KeyValue::new("table_name", table_name)],
+							);
+						}
 					},
 				)
 				.with_description("Merkle tree updater TODO queue length")
@@ -45,7 +44,7 @@ impl TableMetrics {
 						observer.observe(
 							gc_todo.len() as u64,
 							&[KeyValue::new("table_name", table_name)],
-						)
+						);
 					},
 				)
 				.with_description("Table garbage collector TODO queue length")
diff --git a/src/table/schema.rs b/src/table/schema.rs
index eba918a2..f37e98d8 100644
--- a/src/table/schema.rs
+++ b/src/table/schema.rs
@@ -1,5 +1,6 @@
 use serde::{Deserialize, Serialize};
 
+use garage_db as db;
 use garage_util::data::*;
 
 use crate::crdt::Crdt;
@@ -59,7 +60,7 @@ pub trait Entry<P: PartitionKey, S: SortKey>:
 }
 
 /// Trait for the schema used in a table
-pub trait TableSchema: Send + Sync {
+pub trait TableSchema: Send + Sync + 'static {
 	/// The name of the table in the database
 	const TABLE_NAME: &'static str;
 
@@ -82,11 +83,19 @@ pub trait TableSchema: Send + Sync {
 		None
 	}
 
-	// Updated triggers some stuff downstream, but it is not supposed to block or fail,
-	// as the update itself is an unchangeable fact that will never go back
-	// due to CRDT logic. Typically errors in propagation of info should be logged
-	// to stderr.
-	fn updated(&self, _old: Option<Self::E>, _new: Option<Self::E>) {}
+	/// Actions triggered by data changing in a table. If such actions
+	/// include updates to the local database that should be applied
+	/// atomically with the item update itself, a db transaction is
+	/// provided on which these changes should be done.
+	/// This function can return a DB error but that's all.
+	fn updated(
+		&self,
+		_tx: &mut db::Transaction,
+		_old: Option<&Self::E>,
+		_new: Option<&Self::E>,
+	) -> db::TxOpResult<()> {
+		Ok(())
+	}
 
 	fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool;
 }
diff --git a/src/table/sync.rs b/src/table/sync.rs
index 08069ad0..9d79d856 100644
--- a/src/table/sync.rs
+++ b/src/table/sync.rs
@@ -1,17 +1,17 @@
 use std::collections::VecDeque;
-use std::sync::{Arc, Mutex};
+use std::sync::Arc;
 use std::time::{Duration, Instant};
 
 use async_trait::async_trait;
-use futures::select;
-use futures_util::future::*;
 use futures_util::stream::*;
 use opentelemetry::KeyValue;
 use rand::Rng;
 use serde::{Deserialize, Serialize};
 use serde_bytes::ByteBuf;
+use tokio::select;
 use tokio::sync::{mpsc, watch};
 
+use garage_util::background::*;
 use garage_util::data::*;
 use garage_util::error::Error;
 
@@ -24,8 +24,6 @@ use crate::merkle::*;
 use crate::replication::*;
 use crate::*;
 
-const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
-
 // Do anti-entropy every 10 minutes
 const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60);
 
@@ -34,7 +32,7 @@ pub struct TableSyncer<F: TableSchema + 'static, R: TableReplication + 'static>
 	data: Arc<TableData<F, R>>,
 	merkle: Arc<MerkleUpdater<F, R>>,
 
-	todo: Mutex<SyncTodo>,
+	add_full_sync_tx: mpsc::UnboundedSender<()>,
 	endpoint: Arc<Endpoint<SyncRpc, Self>>,
 }
 
@@ -52,10 +50,6 @@ impl Rpc for SyncRpc {
 	type Response = Result<SyncRpc, Error>;
 }
 
-struct SyncTodo {
-	todo: Vec<TodoPartition>,
-}
-
 #[derive(Debug, Clone)]
 struct TodoPartition {
 	partition: Partition,
@@ -80,118 +74,40 @@ where
 			.netapp
 			.endpoint(format!("garage_table/sync.rs/Rpc:{}", F::TABLE_NAME));
 
-		let todo = SyncTodo { todo: vec![] };
+		let (add_full_sync_tx, add_full_sync_rx) = mpsc::unbounded_channel();
 
 		let syncer = Arc::new(Self {
 			system: system.clone(),
 			data,
 			merkle,
-			todo: Mutex::new(todo),
+			add_full_sync_tx,
 			endpoint,
 		});
 
 		syncer.endpoint.set_handler(syncer.clone());
 
-		let (busy_tx, busy_rx) = mpsc::unbounded_channel();
-
-		let s1 = syncer.clone();
-		system.background.spawn_worker(
-			format!("table sync watcher for {}", F::TABLE_NAME),
-			move |must_exit: watch::Receiver<bool>| s1.watcher_task(must_exit, busy_rx),
-		);
-
-		let s2 = syncer.clone();
-		system.background.spawn_worker(
-			format!("table syncer for {}", F::TABLE_NAME),
-			move |must_exit: watch::Receiver<bool>| s2.syncer_task(must_exit, busy_tx),
-		);
-
-		let s3 = syncer.clone();
-		tokio::spawn(async move {
-			tokio::time::sleep(Duration::from_secs(20)).await;
-			s3.add_full_sync();
+		system.background.spawn_worker(SyncWorker {
+			syncer: syncer.clone(),
+			ring_recv: system.ring.clone(),
+			ring: system.ring.borrow().clone(),
+			add_full_sync_rx,
+			todo: vec![],
+			next_full_sync: Instant::now() + Duration::from_secs(20),
 		});
 
 		syncer
 	}
 
-	async fn watcher_task(
-		self: Arc<Self>,
-		mut must_exit: watch::Receiver<bool>,
-		mut busy_rx: mpsc::UnboundedReceiver<bool>,
-	) {
-		let mut prev_ring: Arc<Ring> = self.system.ring.borrow().clone();
-		let mut ring_recv: watch::Receiver<Arc<Ring>> = self.system.ring.clone();
-		let mut nothing_to_do_since = Some(Instant::now());
-
-		while !*must_exit.borrow() {
-			select! {
-				_ = ring_recv.changed().fuse() => {
-					let new_ring = ring_recv.borrow();
-					if !Arc::ptr_eq(&new_ring, &prev_ring) {
-						debug!("({}) Ring changed, adding full sync to syncer todo list", F::TABLE_NAME);
-						self.add_full_sync();
-						prev_ring = new_ring.clone();
-					}
-				}
-				busy_opt = busy_rx.recv().fuse() => {
-					if let Some(busy) = busy_opt {
-						if busy {
-							nothing_to_do_since = None;
-						} else if nothing_to_do_since.is_none() {
-							nothing_to_do_since = Some(Instant::now());
-						}
-					}
-				}
-				_ = must_exit.changed().fuse() => {},
-				_ = tokio::time::sleep(Duration::from_secs(1)).fuse() => {
-					if nothing_to_do_since.map(|t| Instant::now() - t >= ANTI_ENTROPY_INTERVAL).unwrap_or(false) {
-						nothing_to_do_since = None;
-						debug!("({}) Interval passed, adding full sync to syncer todo list", F::TABLE_NAME);
-						self.add_full_sync();
-					}
-				}
-			}
-		}
-	}
-
 	pub fn add_full_sync(&self) {
-		self.todo
-			.lock()
-			.unwrap()
-			.add_full_sync(&self.data, &self.system);
-	}
-
-	async fn syncer_task(
-		self: Arc<Self>,
-		mut must_exit: watch::Receiver<bool>,
-		busy_tx: mpsc::UnboundedSender<bool>,
-	) {
-		while !*must_exit.borrow() {
-			let task = self.todo.lock().unwrap().pop_task();
-			if let Some(partition) = task {
-				busy_tx.send(true).unwrap();
-				let res = self
-					.clone()
-					.sync_partition(&partition, &mut must_exit)
-					.await;
-				if let Err(e) = res {
-					warn!(
-						"({}) Error while syncing {:?}: {}",
-						F::TABLE_NAME,
-						partition,
-						e
-					);
-				}
-			} else {
-				busy_tx.send(false).unwrap();
-				tokio::time::sleep(Duration::from_secs(1)).await;
-			}
+		if self.add_full_sync_tx.send(()).is_err() {
+			error!("({}) Could not add full sync", F::TABLE_NAME);
 		}
 	}
 
+	// ----
+
 	async fn sync_partition(
-		self: Arc<Self>,
+		self: &Arc<Self>,
 		partition: &TodoPartition,
 		must_exit: &mut watch::Receiver<bool>,
 	) -> Result<(), Error> {
@@ -258,9 +174,9 @@ where
 		while !*must_exit.borrow() {
 			let mut items = Vec::new();
 
-			for item in self.data.store.range(begin.to_vec()..end.to_vec()) {
+			for item in self.data.store.range(begin.to_vec()..end.to_vec())? {
 				let (key, value) = item?;
-				items.push((key.to_vec(), Arc::new(ByteBuf::from(value.as_ref()))));
+				items.push((key.to_vec(), Arc::new(ByteBuf::from(value))));
 
 				if items.len() >= 1024 {
 					break;
@@ -329,9 +245,7 @@ where
 				&self.endpoint,
 				nodes,
 				SyncRpc::Items(values),
-				RequestStrategy::with_priority(PRIO_BACKGROUND)
-					.with_quorum(nodes.len())
-					.with_timeout(TABLE_SYNC_RPC_TIMEOUT),
+				RequestStrategy::with_priority(PRIO_BACKGROUND).with_quorum(nodes.len()),
 			)
 			.await?;
 
@@ -392,8 +306,7 @@ where
 				&self.endpoint,
 				who,
 				SyncRpc::RootCkHash(partition.partition, root_ck_hash),
-				RequestStrategy::with_priority(PRIO_BACKGROUND)
-					.with_timeout(TABLE_SYNC_RPC_TIMEOUT),
+				RequestStrategy::with_priority(PRIO_BACKGROUND),
 			)
 			.await?;
 
@@ -432,11 +345,11 @@ where
 					// Just send that item directly
 					if let Some(val) = self.data.store.get(&ik[..])? {
 						if blake2sum(&val[..]) != ivhash {
-							warn!("({}) Hashes differ between stored value and Merkle tree, key: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", F::TABLE_NAME, ik);
+							debug!("({}) Hashes differ between stored value and Merkle tree, key: {} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", F::TABLE_NAME, hex::encode(ik));
 						}
 						todo_items.push(val.to_vec());
 					} else {
-						warn!("({}) Item from Merkle tree not found in store: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", F::TABLE_NAME, ik);
+						debug!("({}) Item from Merkle tree not found in store: {} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", F::TABLE_NAME, hex::encode(ik));
 					}
 				}
 				MerkleNode::Intermediate(l) => {
@@ -449,8 +362,7 @@ where
 							&self.endpoint,
 							who,
 							SyncRpc::GetNode(key.clone()),
-							RequestStrategy::with_priority(PRIO_BACKGROUND)
-								.with_timeout(TABLE_SYNC_RPC_TIMEOUT),
+							RequestStrategy::with_priority(PRIO_BACKGROUND),
 						)
 						.await?
 					{
@@ -526,8 +438,7 @@ where
 				&self.endpoint,
 				who,
 				SyncRpc::Items(values),
-				RequestStrategy::with_priority(PRIO_BACKGROUND)
-					.with_timeout(TABLE_SYNC_RPC_TIMEOUT),
+				RequestStrategy::with_priority(PRIO_BACKGROUND),
 			)
 			.await?;
 		if let SyncRpc::Ok = rpc_resp {
@@ -577,12 +488,22 @@ where
 	}
 }
 
-impl SyncTodo {
-	fn add_full_sync<F: TableSchema, R: TableReplication>(
-		&mut self,
-		data: &TableData<F, R>,
-		system: &System,
-	) {
+// -------- Sync Worker ---------
+
+struct SyncWorker<F: TableSchema + 'static, R: TableReplication + 'static> {
+	syncer: Arc<TableSyncer<F, R>>,
+	ring_recv: watch::Receiver<Arc<Ring>>,
+	ring: Arc<Ring>,
+	add_full_sync_rx: mpsc::UnboundedReceiver<()>,
+	todo: Vec<TodoPartition>,
+	next_full_sync: Instant,
+}
+
+impl<F: TableSchema + 'static, R: TableReplication + 'static> SyncWorker<F, R> {
+	fn add_full_sync(&mut self) {
+		let system = &self.syncer.system;
+		let data = &self.syncer.data;
+
 		let my_id = system.id;
 
 		self.todo.clear();
@@ -603,8 +524,16 @@ impl SyncTodo {
 			let retain = nodes.contains(&my_id);
 			if !retain {
 				// Check if we have some data to send, otherwise skip
-				if data.store.range(begin..end).next().is_none() {
-					continue;
+				match data.store.range(begin..end) {
+					Ok(mut iter) => {
+						if iter.next().is_none() {
+							continue;
+						}
+					}
+					Err(e) => {
+						warn!("DB error in add_full_sync: {}", e);
+						continue;
+					}
 				}
 			}
 
@@ -615,6 +544,8 @@ impl SyncTodo {
 				retain,
 			});
 		}
+
+		self.next_full_sync = Instant::now() + ANTI_ENTROPY_INTERVAL;
 	}
 
 	fn pop_task(&mut self) -> Option<TodoPartition> {
@@ -633,6 +564,62 @@ impl SyncTodo {
 	}
 }
 
+#[async_trait]
+impl<F: TableSchema + 'static, R: TableReplication + 'static> Worker for SyncWorker<F, R> {
+	fn name(&self) -> String {
+		format!("{} sync", F::TABLE_NAME)
+	}
+
+	fn info(&self) -> Option<String> {
+		let l = self.todo.len();
+		if l > 0 {
+			Some(format!("{} partitions remaining", l))
+		} else {
+			None
+		}
+	}
+
+	async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+		if let Some(partition) = self.pop_task() {
+			self.syncer.sync_partition(&partition, must_exit).await?;
+			Ok(WorkerState::Busy)
+		} else {
+			Ok(WorkerState::Idle)
+		}
+	}
+
+	async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
+		if *must_exit.borrow() {
+			return WorkerState::Done;
+		}
+		select! {
+			s = self.add_full_sync_rx.recv() => {
+				if let Some(()) = s {
+					self.add_full_sync();
+				}
+			},
+			_ = self.ring_recv.changed() => {
+				let new_ring = self.ring_recv.borrow();
+				if !Arc::ptr_eq(&new_ring, &self.ring) {
+					self.ring = new_ring.clone();
+					drop(new_ring);
+					debug!("({}) Ring changed, adding full sync to syncer todo list", F::TABLE_NAME);
+					self.add_full_sync();
+				}
+			},
+			_ = tokio::time::sleep_until(self.next_full_sync.into()) => {
+				self.add_full_sync();
+			}
+		}
+		match self.todo.is_empty() {
+			false => WorkerState::Busy,
+			true => WorkerState::Idle,
+		}
+	}
+}
+
+// ---- UTIL ----
+
 fn hash_of<T: Serialize>(x: &T) -> Result<Hash, Error> {
 	Ok(blake2sum(&rmp_to_vec_all_named(x)?[..]))
 }
diff --git a/src/table/table.rs b/src/table/table.rs
index 7f87a449..8a66c420 100644
--- a/src/table/table.rs
+++ b/src/table/table.rs
@@ -1,6 +1,6 @@
-use std::collections::{BTreeMap, HashMap};
+use std::borrow::Borrow;
+use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::sync::Arc;
-use std::time::Duration;
 
 use async_trait::async_trait;
 use futures::stream::*;
@@ -12,6 +12,8 @@ use opentelemetry::{
 	Context,
 };
 
+use garage_db as db;
+
 use garage_util::data::*;
 use garage_util::error::Error;
 use garage_util::metrics::RecordDuration;
@@ -26,8 +28,7 @@ use crate::merkle::*;
 use crate::replication::*;
 use crate::schema::*;
 use crate::sync::*;
-
-const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10);
+use crate::util::*;
 
 pub struct Table<F: TableSchema + 'static, R: TableReplication + 'static> {
 	pub system: Arc<System>,
@@ -45,7 +46,13 @@ pub(crate) enum TableRpc<F: TableSchema> {
 	ReadEntryResponse(Option<ByteBuf>),
 
 	// Read range: read all keys in partition P, possibly starting at a certain sort key offset
-	ReadRange(F::P, Option<F::S>, Option<F::Filter>, usize),
+	ReadRange {
+		partition: F::P,
+		begin_sort_key: Option<F::S>,
+		filter: Option<F::Filter>,
+		limit: usize,
+		enumeration_order: EnumerationOrder,
+	},
 
 	Update(Vec<Arc<ByteBuf>>),
 }
@@ -61,7 +68,7 @@ where
 {
 	// =============== PUBLIC INTERFACE FUNCTIONS (new, insert, get, etc) ===============
 
-	pub fn new(instance: F, replication: R, system: Arc<System>, db: &sled::Db) -> Arc<Self> {
+	pub fn new(instance: F, replication: R, system: Arc<System>, db: &db::Db) -> Arc<Self> {
 		let endpoint = system
 			.netapp
 			.endpoint(format!("garage_table/table.rs/Rpc:{}", F::TABLE_NAME));
@@ -103,7 +110,6 @@ where
 	async fn insert_internal(&self, e: &F::E) -> Result<(), Error> {
 		let hash = e.partition_key().hash();
 		let who = self.data.replication.write_nodes(&hash);
-		//eprintln!("insert who: {:?}", who);
 
 		let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(e)?));
 		let rpc = TableRpc::<F>::Update(vec![e_enc]);
@@ -115,17 +121,20 @@ where
 				&who[..],
 				rpc,
 				RequestStrategy::with_priority(PRIO_NORMAL)
-					.with_quorum(self.data.replication.write_quorum())
-					.with_timeout(TABLE_RPC_TIMEOUT),
+					.with_quorum(self.data.replication.write_quorum()),
 			)
 			.await?;
 
 		Ok(())
 	}
 
-	pub async fn insert_many(&self, entries: &[F::E]) -> Result<(), Error> {
+	pub async fn insert_many<I, IE>(&self, entries: I) -> Result<(), Error>
+	where
+		I: IntoIterator<Item = IE> + Send + Sync,
+		IE: Borrow<F::E> + Send + Sync,
+	{
 		let tracer = opentelemetry::global::tracer("garage_table");
-		let span = tracer.start(format!("{} insert_many {}", F::TABLE_NAME, entries.len()));
+		let span = tracer.start(format!("{} insert_many", F::TABLE_NAME));
 
 		self.insert_many_internal(entries)
 			.bound_record_duration(&self.data.metrics.put_request_duration)
@@ -137,10 +146,15 @@ where
 		Ok(())
 	}
 
-	async fn insert_many_internal(&self, entries: &[F::E]) -> Result<(), Error> {
+	async fn insert_many_internal<I, IE>(&self, entries: I) -> Result<(), Error>
+	where
+		I: IntoIterator<Item = IE> + Send + Sync,
+		IE: Borrow<F::E> + Send + Sync,
+	{
 		let mut call_list: HashMap<_, Vec<_>> = HashMap::new();
 
-		for entry in entries.iter() {
+		for entry in entries.into_iter() {
+			let entry = entry.borrow();
 			let hash = entry.partition_key().hash();
 			let who = self.data.replication.write_nodes(&hash);
 			let e_enc = Arc::new(ByteBuf::from(rmp_to_vec_all_named(entry)?));
@@ -159,7 +173,7 @@ where
 					&self.endpoint,
 					node,
 					rpc,
-					RequestStrategy::with_priority(PRIO_NORMAL).with_timeout(TABLE_RPC_TIMEOUT),
+					RequestStrategy::with_priority(PRIO_NORMAL),
 				)
 				.await?;
 			Ok::<_, Error>((node, resp))
@@ -216,7 +230,6 @@ where
 				rpc,
 				RequestStrategy::with_priority(PRIO_NORMAL)
 					.with_quorum(self.data.replication.read_quorum())
-					.with_timeout(TABLE_RPC_TIMEOUT)
 					.interrupt_after_quorum(true),
 			)
 			.await?;
@@ -261,12 +274,19 @@ where
 		begin_sort_key: Option<F::S>,
 		filter: Option<F::Filter>,
 		limit: usize,
+		enumeration_order: EnumerationOrder,
 	) -> Result<Vec<F::E>, Error> {
 		let tracer = opentelemetry::global::tracer("garage_table");
 		let span = tracer.start(format!("{} get_range", F::TABLE_NAME));
 
 		let res = self
-			.get_range_internal(partition_key, begin_sort_key, filter, limit)
+			.get_range_internal(
+				partition_key,
+				begin_sort_key,
+				filter,
+				limit,
+				enumeration_order,
+			)
 			.bound_record_duration(&self.data.metrics.get_request_duration)
 			.with_context(Context::current_with_span(span))
 			.await?;
@@ -282,11 +302,18 @@ where
 		begin_sort_key: Option<F::S>,
 		filter: Option<F::Filter>,
 		limit: usize,
+		enumeration_order: EnumerationOrder,
 	) -> Result<Vec<F::E>, Error> {
 		let hash = partition_key.hash();
 		let who = self.data.replication.read_nodes(&hash);
 
-		let rpc = TableRpc::<F>::ReadRange(partition_key.clone(), begin_sort_key, filter, limit);
+		let rpc = TableRpc::<F>::ReadRange {
+			partition: partition_key.clone(),
+			begin_sort_key,
+			filter,
+			limit,
+			enumeration_order,
+		};
 
 		let resps = self
 			.system
@@ -297,49 +324,69 @@ where
 				rpc,
 				RequestStrategy::with_priority(PRIO_NORMAL)
 					.with_quorum(self.data.replication.read_quorum())
-					.with_timeout(TABLE_RPC_TIMEOUT)
 					.interrupt_after_quorum(true),
 			)
 			.await?;
 
-		let mut ret = BTreeMap::new();
-		let mut to_repair = BTreeMap::new();
+		let mut ret: BTreeMap<Vec<u8>, F::E> = BTreeMap::new();
+		let mut to_repair = BTreeSet::new();
 		for resp in resps {
 			if let TableRpc::Update(entries) = resp {
 				for entry_bytes in entries.iter() {
 					let entry = self.data.decode_entry(entry_bytes.as_slice())?;
 					let entry_key = self.data.tree_key(entry.partition_key(), entry.sort_key());
-					match ret.remove(&entry_key) {
-						None => {
-							ret.insert(entry_key, Some(entry));
-						}
-						Some(Some(mut prev)) => {
-							let must_repair = prev != entry;
-							prev.merge(&entry);
-							if must_repair {
-								to_repair.insert(entry_key.clone(), Some(prev.clone()));
+					match ret.get_mut(&entry_key) {
+						Some(e) => {
+							if *e != entry {
+								e.merge(&entry);
+								to_repair.insert(entry_key.clone());
 							}
-							ret.insert(entry_key, Some(prev));
 						}
-						Some(None) => unreachable!(),
+						None => {
+							ret.insert(entry_key, entry);
+						}
 					}
 				}
+			} else {
+				return Err(Error::unexpected_rpc_message(resp));
 			}
 		}
+
 		if !to_repair.is_empty() {
 			let self2 = self.clone();
+			let to_repair = to_repair
+				.into_iter()
+				.map(|k| ret.get(&k).unwrap().clone())
+				.collect::<Vec<_>>();
 			self.system.background.spawn_cancellable(async move {
-				for (_, v) in to_repair.iter_mut() {
-					self2.repair_on_read(&who[..], v.take().unwrap()).await?;
+				for v in to_repair {
+					self2.repair_on_read(&who[..], v).await?;
 				}
 				Ok(())
 			});
 		}
-		let ret_vec = ret
-			.iter_mut()
-			.take(limit)
-			.map(|(_k, v)| v.take().unwrap())
-			.collect::<Vec<_>>();
+
+		// At this point, the `ret` btreemap might contain more than `limit`
+		// items, because nodes might have returned us each `limit` items
+		// but for different keys. We have to take only the first `limit` items
+		// in this map, in the specified enumeration order, for two reasons:
+		// 1. To return to the user no more than the number of items that they requested
+		// 2. To return only items for which we have a read quorum: we do not know
+		//    that we have a read quorum for the items after the first `limit`
+		//    of them
+		let ret_vec = match enumeration_order {
+			EnumerationOrder::Forward => ret
+				.into_iter()
+				.take(limit)
+				.map(|(_k, v)| v)
+				.collect::<Vec<_>>(),
+			EnumerationOrder::Reverse => ret
+				.into_iter()
+				.rev()
+				.take(limit)
+				.map(|(_k, v)| v)
+				.collect::<Vec<_>>(),
+		};
 		Ok(ret_vec)
 	}
 
@@ -353,9 +400,7 @@ where
 				&self.endpoint,
 				who,
 				TableRpc::<F>::Update(vec![what_enc]),
-				RequestStrategy::with_priority(PRIO_NORMAL)
-					.with_quorum(who.len())
-					.with_timeout(TABLE_RPC_TIMEOUT),
+				RequestStrategy::with_priority(PRIO_NORMAL).with_quorum(who.len()),
 			)
 			.await?;
 		Ok(())
@@ -378,8 +423,20 @@ where
 				let value = self.data.read_entry(key, sort_key)?;
 				Ok(TableRpc::ReadEntryResponse(value))
 			}
-			TableRpc::ReadRange(key, begin_sort_key, filter, limit) => {
-				let values = self.data.read_range(key, begin_sort_key, filter, *limit)?;
+			TableRpc::ReadRange {
+				partition,
+				begin_sort_key,
+				filter,
+				limit,
+				enumeration_order,
+			} => {
+				let values = self.data.read_range(
+					partition,
+					begin_sort_key,
+					filter,
+					*limit,
+					*enumeration_order,
+				)?;
 				Ok(TableRpc::Update(values))
 			}
 			TableRpc::Update(pairs) => {
diff --git a/src/table/util.rs b/src/table/util.rs
index 2a5c3afe..20595a94 100644
--- a/src/table/util.rs
+++ b/src/table/util.rs
@@ -17,7 +17,7 @@ impl PartitionKey for EmptyKey {
 	}
 }
 
-#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
+#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq)]
 pub enum DeletedFilter {
 	Any,
 	Deleted,
@@ -33,3 +33,19 @@ impl DeletedFilter {
 		}
 	}
 }
+
+#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq)]
+pub enum EnumerationOrder {
+	Forward,
+	Reverse,
+}
+
+impl EnumerationOrder {
+	pub fn from_reverse(reverse: bool) -> Self {
+		if reverse {
+			Self::Reverse
+		} else {
+			Self::Forward
+		}
+	}
+}
diff --git a/src/util/Cargo.toml b/src/util/Cargo.toml
index f13c1589..8e978fc2 100644
--- a/src/util/Cargo.toml
+++ b/src/util/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "garage_util"
-version = "0.7.0"
+version = "0.8.0"
 authors = ["Alex Auvolat <alex@adnab.me>"]
 edition = "2018"
 license = "AGPL-3.0"
@@ -14,15 +14,21 @@ path = "lib.rs"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
+garage_db = { version = "0.8.0", path = "../db" }
+
+arc-swap = "1.0"
+async-trait = "0.1"
 blake2 = "0.9"
+bytes = "1.0"
+digest = "0.10"
 err-derive = "0.3"
+git-version = "0.3.4"
 xxhash-rust = { version = "0.8", default-features = false, features = ["xxh3"] }
 hex = "0.4"
+lazy_static = "1.4"
 tracing = "0.1.30"
 rand = "0.8"
-sha2 = "0.9"
-
-sled = "0.34"
+sha2 = "0.10"
 
 chrono = "0.4"
 rmp-serde = "0.15"
@@ -33,11 +39,13 @@ toml = "0.5"
 futures = "0.3"
 tokio = { version = "1.0", default-features = false, features = ["rt", "rt-multi-thread", "io-util", "net", "time", "macros", "sync", "signal", "fs"] }
 
-#netapp = { version = "0.3.0", git = "https://git.deuxfleurs.fr/lx/netapp" }
-#netapp = { version = "0.4", path = "../../../netapp" }
-netapp = "0.4"
+netapp = "0.5"
 
 http = "0.2"
 hyper = "0.14"
 
 opentelemetry = { version = "0.17", features = [ "rt-tokio", "metrics", "trace" ] }
+
+
+[features]
+k2v = []
diff --git a/src/util/async_hash.rs b/src/util/async_hash.rs
new file mode 100644
index 00000000..5631ea6b
--- /dev/null
+++ b/src/util/async_hash.rs
@@ -0,0 +1,61 @@
+use bytes::Bytes;
+use digest::Digest;
+
+use tokio::sync::mpsc;
+use tokio::task::JoinHandle;
+
+use crate::data::*;
+
+/// Compute the sha256 of a slice,
+/// spawning on a tokio thread for CPU-intensive processing.
+/// The argument has to be an owned Bytes, as it is moved out to a new thread.
+pub async fn async_sha256sum(data: Bytes) -> Hash {
+	tokio::task::spawn_blocking(move || sha256sum(&data))
+		.await
+		.unwrap()
+}
+
+/// Compute the blake2sum of a slice,
+/// spawning on a tokio thread for CPU-intensive processing.
+/// The argument has to be an owned Bytes, as it is moved out to a new thread.
+pub async fn async_blake2sum(data: Bytes) -> Hash {
+	tokio::task::spawn_blocking(move || blake2sum(&data))
+		.await
+		.unwrap()
+}
+
+// ----
+
+pub struct AsyncHasher<D: Digest> {
+	sendblk: mpsc::Sender<Bytes>,
+	task: JoinHandle<digest::Output<D>>,
+}
+
+impl<D: Digest> AsyncHasher<D> {
+	pub fn new() -> Self {
+		let (sendblk, mut recvblk) = mpsc::channel::<Bytes>(1);
+		let task = tokio::task::spawn_blocking(move || {
+			let mut digest = D::new();
+			while let Some(blk) = recvblk.blocking_recv() {
+				digest.update(&blk[..]);
+			}
+			digest.finalize()
+		});
+		Self { sendblk, task }
+	}
+
+	pub async fn update(&self, b: Bytes) {
+		self.sendblk.send(b).await.unwrap();
+	}
+
+	pub async fn finalize(self) -> digest::Output<D> {
+		drop(self.sendblk);
+		self.task.await.unwrap()
+	}
+}
+
+impl<D: Digest> Default for AsyncHasher<D> {
+	fn default() -> Self {
+		Self::new()
+	}
+}
diff --git a/src/util/background.rs b/src/util/background.rs
deleted file mode 100644
index bfdaaf1e..00000000
--- a/src/util/background.rs
+++ /dev/null
@@ -1,153 +0,0 @@
-//! Job runner for futures and async functions
-use core::future::Future;
-use std::pin::Pin;
-use std::sync::Arc;
-use std::time::Duration;
-
-use futures::future::*;
-use futures::select;
-use tokio::sync::{mpsc, watch, Mutex};
-
-use crate::error::Error;
-
-type JobOutput = Result<(), Error>;
-type Job = Pin<Box<dyn Future<Output = JobOutput> + Send>>;
-
-/// Job runner for futures and async functions
-pub struct BackgroundRunner {
-	stop_signal: watch::Receiver<bool>,
-	queue_in: mpsc::UnboundedSender<(Job, bool)>,
-	worker_in: mpsc::UnboundedSender<tokio::task::JoinHandle<()>>,
-}
-
-impl BackgroundRunner {
-	/// Create a new BackgroundRunner
-	pub fn new(
-		n_runners: usize,
-		stop_signal: watch::Receiver<bool>,
-	) -> (Arc<Self>, tokio::task::JoinHandle<()>) {
-		let (worker_in, mut worker_out) = mpsc::unbounded_channel();
-
-		let stop_signal_2 = stop_signal.clone();
-		let await_all_done = tokio::spawn(async move {
-			loop {
-				let wkr = {
-					select! {
-						item = worker_out.recv().fuse() => {
-							match item {
-								Some(x) => x,
-								None => break,
-							}
-						}
-						_ = tokio::time::sleep(Duration::from_secs(5)).fuse() => {
-							if *stop_signal_2.borrow() {
-								break;
-							} else {
-								continue;
-							}
-						}
-					}
-				};
-				if let Err(e) = wkr.await {
-					error!("Error while awaiting for worker: {}", e);
-				}
-			}
-		});
-
-		let (queue_in, queue_out) = mpsc::unbounded_channel();
-		let queue_out = Arc::new(Mutex::new(queue_out));
-
-		for i in 0..n_runners {
-			let queue_out = queue_out.clone();
-			let stop_signal = stop_signal.clone();
-
-			worker_in
-				.send(tokio::spawn(async move {
-					loop {
-						let (job, cancellable) = {
-							select! {
-								item = wait_job(&queue_out).fuse() => match item {
-									// We received a task, process it
-									Some(x) => x,
-									// We received a signal that no more tasks will ever be sent
-									// because the sending side was dropped. Exit now.
-									None => break,
-								},
-								_ = tokio::time::sleep(Duration::from_secs(5)).fuse() => {
-									if *stop_signal.borrow() {
-										// Nothing has been going on for 5 secs, and we are shutting
-										// down. Exit now.
-										break;
-									} else {
-										// Nothing is going on but we don't want to exit.
-										continue;
-									}
-								}
-							}
-						};
-						if cancellable && *stop_signal.borrow() {
-							continue;
-						}
-						if let Err(e) = job.await {
-							error!("Job failed: {}", e)
-						}
-					}
-					info!("Background worker {} exiting", i);
-				}))
-				.unwrap();
-		}
-
-		let bgrunner = Arc::new(Self {
-			stop_signal,
-			queue_in,
-			worker_in,
-		});
-		(bgrunner, await_all_done)
-	}
-
-	/// Spawn a task to be run in background
-	pub fn spawn<T>(&self, job: T)
-	where
-		T: Future<Output = JobOutput> + Send + 'static,
-	{
-		let boxed: Job = Box::pin(job);
-		self.queue_in
-			.send((boxed, false))
-			.map_err(|_| "could not put job in queue")
-			.unwrap();
-	}
-
-	/// Spawn a task to be run in background. It may get discarded before running if spawned while
-	/// the runner is stopping
-	pub fn spawn_cancellable<T>(&self, job: T)
-	where
-		T: Future<Output = JobOutput> + Send + 'static,
-	{
-		let boxed: Job = Box::pin(job);
-		self.queue_in
-			.send((boxed, true))
-			.map_err(|_| "could not put job in queue")
-			.unwrap();
-	}
-
-	pub fn spawn_worker<F, T>(&self, name: String, worker: F)
-	where
-		F: FnOnce(watch::Receiver<bool>) -> T + Send + 'static,
-		T: Future<Output = ()> + Send + 'static,
-	{
-		let stop_signal = self.stop_signal.clone();
-		let task = tokio::spawn(async move {
-			info!("Worker started: {}", name);
-			worker(stop_signal).await;
-			info!("Worker exited: {}", name);
-		});
-		self.worker_in
-			.send(task)
-			.map_err(|_| "could not put job in queue")
-			.unwrap();
-	}
-}
-
-async fn wait_job(q: &Mutex<mpsc::UnboundedReceiver<(Job, bool)>>) -> Option<(Job, bool)> {
-	q.lock().await.recv().await
-}
diff --git a/src/util/background/job_worker.rs b/src/util/background/job_worker.rs
new file mode 100644
index 00000000..2568ea11
--- /dev/null
+++ b/src/util/background/job_worker.rs
@@ -0,0 +1,48 @@
+//! Job worker: a generic worker that just processes incoming
+//! jobs one by one
+
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use tokio::sync::{mpsc, Mutex};
+
+use crate::background::worker::*;
+use crate::background::*;
+
+pub(crate) struct JobWorker {
+	pub(crate) index: usize,
+	pub(crate) job_chan: Arc<Mutex<mpsc::UnboundedReceiver<(Job, bool)>>>,
+	pub(crate) next_job: Option<Job>,
+}
+
+#[async_trait]
+impl Worker for JobWorker {
+	fn name(&self) -> String {
+		format!("Job worker #{}", self.index)
+	}
+
+	async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+		match self.next_job.take() {
+			None => return Ok(WorkerState::Idle),
+			Some(job) => {
+				job.await?;
+				Ok(WorkerState::Busy)
+			}
+		}
+	}
+
+	async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState {
+		loop {
+			match self.job_chan.lock().await.recv().await {
+				Some((job, cancellable)) => {
+					if cancellable && *must_exit.borrow() {
+						continue;
+					}
+					self.next_job = Some(job);
+					return WorkerState::Busy;
+				}
+				None => return WorkerState::Done,
+			}
+		}
+	}
+}
diff --git a/src/util/background/mod.rs b/src/util/background/mod.rs
new file mode 100644
index 00000000..619f5068
--- /dev/null
+++ b/src/util/background/mod.rs
@@ -0,0 +1,117 @@
+//! Job runner for futures and async functions
+
+pub mod job_worker;
+pub mod worker;
+
+use core::future::Future;
+
+use std::collections::HashMap;
+use std::pin::Pin;
+use std::sync::Arc;
+
+use serde::{Deserialize, Serialize};
+use tokio::sync::{mpsc, watch, Mutex};
+
+use crate::error::Error;
+use worker::WorkerProcessor;
+pub use worker::{Worker, WorkerState};
+
+pub(crate) type JobOutput = Result<(), Error>;
+pub(crate) type Job = Pin<Box<dyn Future<Output = JobOutput> + Send>>;
+
+/// Job runner for futures and async functions
+pub struct BackgroundRunner {
+	send_job: mpsc::UnboundedSender<(Job, bool)>,
+	send_worker: mpsc::UnboundedSender<Box<dyn Worker>>,
+	worker_info: Arc<std::sync::Mutex<HashMap<usize, WorkerInfo>>>,
+}
+
+#[derive(Clone, Serialize, Deserialize, Debug)]
+pub struct WorkerInfo {
+	pub name: String,
+	pub info: Option<String>,
+	pub state: WorkerState,
+	pub errors: usize,
+	pub consecutive_errors: usize,
+	pub last_error: Option<(String, u64)>,
+}
+
+impl BackgroundRunner {
+	/// Create a new BackgroundRunner
+	pub fn new(
+		n_runners: usize,
+		stop_signal: watch::Receiver<bool>,
+	) -> (Arc<Self>, tokio::task::JoinHandle<()>) {
+		let (send_worker, worker_out) = mpsc::unbounded_channel::<Box<dyn Worker>>();
+
+		let worker_info = Arc::new(std::sync::Mutex::new(HashMap::new()));
+		let mut worker_processor =
+			WorkerProcessor::new(worker_out, stop_signal, worker_info.clone());
+
+		let await_all_done = tokio::spawn(async move {
+			worker_processor.run().await;
+		});
+
+		let (send_job, queue_out) = mpsc::unbounded_channel();
+		let queue_out = Arc::new(Mutex::new(queue_out));
+
+		for i in 0..n_runners {
+			let queue_out = queue_out.clone();
+
+			send_worker
+				.send(Box::new(job_worker::JobWorker {
+					index: i,
+					job_chan: queue_out.clone(),
+					next_job: None,
+				}))
+				.ok()
+				.unwrap();
+		}
+
+		let bgrunner = Arc::new(Self {
+			send_job,
+			send_worker,
+			worker_info,
+		});
+		(bgrunner, await_all_done)
+	}
+
+	pub fn get_worker_info(&self) -> HashMap<usize, WorkerInfo> {
+		self.worker_info.lock().unwrap().clone()
+	}
+
+	/// Spawn a task to be run in background
+	pub fn spawn<T>(&self, job: T)
+	where
+		T: Future<Output = JobOutput> + Send + 'static,
+	{
+		let boxed: Job = Box::pin(job);
+		self.send_job
+			.send((boxed, false))
+			.ok()
+			.expect("Could not put job in queue");
+	}
+
+	/// Spawn a task to be run in background. It may get discarded before running if spawned while
+	/// the runner is stopping
+	pub fn spawn_cancellable<T>(&self, job: T)
+	where
+		T: Future<Output = JobOutput> + Send + 'static,
+	{
+		let boxed: Job = Box::pin(job);
+		self.send_job
+			.send((boxed, true))
+			.ok()
+			.expect("Could not put job in queue");
+	}
+
+	pub fn spawn_worker<W>(&self, worker: W)
+	where
+		W: Worker + 'static,
+	{
+		self.send_worker
+			.send(Box::new(worker))
+			.ok()
+			.expect("Could not put worker in queue");
+	}
+}
diff --git a/src/util/background/worker.rs b/src/util/background/worker.rs
new file mode 100644
index 00000000..f5e3addb
--- /dev/null
+++ b/src/util/background/worker.rs
@@ -0,0 +1,260 @@
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use async_trait::async_trait;
+use futures::future::*;
+use futures::stream::FuturesUnordered;
+use futures::StreamExt;
+use serde::{Deserialize, Serialize};
+use tokio::select;
+use tokio::sync::{mpsc, watch};
+
+use crate::background::WorkerInfo;
+use crate::error::Error;
+use crate::time::now_msec;
+
+#[derive(PartialEq, Copy, Clone, Serialize, Deserialize, Debug)]
+pub enum WorkerState {
+	Busy,
+	Throttled(f32),
+	Idle,
+	Done,
+}
+
+impl std::fmt::Display for WorkerState {
+	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+		match self {
+			WorkerState::Busy => write!(f, "Busy"),
+			WorkerState::Throttled(t) => write!(f, "Thr:{:.3}", t),
+			WorkerState::Idle => write!(f, "Idle"),
+			WorkerState::Done => write!(f, "Done"),
+		}
+	}
+}
+
+#[async_trait]
+pub trait Worker: Send {
+	fn name(&self) -> String;
+
+	fn info(&self) -> Option<String> {
+		None
+	}
+
+	/// Work: do a basic unit of work, if one is available (otherwise, should return
+	/// WorkerState::Idle immediately).  We will do our best to not interrupt this future in the
+	/// middle of processing, it will only be interrupted at the last minute when Garage is trying
+	/// to exit and this hasn't returned yet. This function may return an error to indicate that
+	/// its unit of work could not be processed due to an error: the error will be logged and
+	/// .work() will be called again after a short delay.
+	async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error>;
+
+	/// Wait for work: await for some task to become available.  This future can be interrupted in
+	/// the middle for any reason.  This future doesn't have to await on must_exit.changed(), we
+	/// are doing it for you.  Therefore it only receives a read reference to must_exit which allows
+	/// it to check if we are exiting.
+	async fn wait_for_work(&mut self, must_exit: &watch::Receiver<bool>) -> WorkerState;
+}
+
+pub(crate) struct WorkerProcessor {
+	stop_signal: watch::Receiver<bool>,
+	worker_chan: mpsc::UnboundedReceiver<Box<dyn Worker>>,
+	worker_info: Arc<std::sync::Mutex<HashMap<usize, WorkerInfo>>>,
+}
+
+impl WorkerProcessor {
+	pub(crate) fn new(
+		worker_chan: mpsc::UnboundedReceiver<Box<dyn Worker>>,
+		stop_signal: watch::Receiver<bool>,
+		worker_info: Arc<std::sync::Mutex<HashMap<usize, WorkerInfo>>>,
+	) -> Self {
+		Self {
+			stop_signal,
+			worker_chan,
+			worker_info,
+		}
+	}
+
+	pub(crate) async fn run(&mut self) {
+		let mut workers = FuturesUnordered::new();
+		let mut next_task_id = 1;
+
+		while !*self.stop_signal.borrow() {
+			let await_next_worker = async {
+				if workers.is_empty() {
+					futures::future::pending().await
+				} else {
+					workers.next().await
+				}
+			};
+			select! {
+				new_worker_opt = self.worker_chan.recv() => {
+					if let Some(new_worker) = new_worker_opt {
+						let task_id = next_task_id;
+						next_task_id += 1;
+						let stop_signal = self.stop_signal.clone();
+						let stop_signal_worker = self.stop_signal.clone();
+						let mut worker = WorkerHandler {
+								task_id,
+								stop_signal,
+								stop_signal_worker,
+								worker: new_worker,
+								state: WorkerState::Busy,
+								errors: 0,
+								consecutive_errors: 0,
+								last_error: None,
+							};
+						workers.push(async move {
+							worker.step().await;
+							worker
+						}.boxed());
+					}
+				}
+				worker = await_next_worker => {
+					if let Some(mut worker) = worker {
+						trace!("{} (TID {}): {:?}", worker.worker.name(), worker.task_id, worker.state);
+
+						// Save worker info
+						let mut wi = self.worker_info.lock().unwrap();
+						match wi.get_mut(&worker.task_id) {
+							Some(i) => {
+								i.state = worker.state;
+								i.info = worker.worker.info();
+								i.errors = worker.errors;
+								i.consecutive_errors = worker.consecutive_errors;
+								if worker.last_error.is_some() {
+									i.last_error = worker.last_error.take();
+								}
+							}
+							None => {
+								wi.insert(worker.task_id, WorkerInfo {
+									name: worker.worker.name(),
+									state: worker.state,
+									info: worker.worker.info(),
+									errors: worker.errors,
+									consecutive_errors: worker.consecutive_errors,
+									last_error: worker.last_error.take(),
+								});
+							}
+						}
+
+						if worker.state == WorkerState::Done {
+							info!("Worker {} (TID {}) exited", worker.worker.name(), worker.task_id);
+						} else {
+							workers.push(async move {
+								worker.step().await;
+								worker
+							}.boxed());
+						}
+					}
+				}
+				_ = self.stop_signal.changed() => (),
+			}
+		}
+
+		// We are exiting, drain everything
+		let drain_half_time = Instant::now() + Duration::from_secs(5);
+		let drain_everything = async move {
+			while let Some(mut worker) = workers.next().await {
+				if worker.state == WorkerState::Done {
+					info!(
+						"Worker {} (TID {}) exited",
+						worker.worker.name(),
+						worker.task_id
+					);
+				} else if Instant::now() > drain_half_time {
+					warn!("Worker {} (TID {}) interrupted between two iterations in state {:?} (this should be fine)", worker.worker.name(), worker.task_id, worker.state);
+				} else {
+					workers.push(
+						async move {
+							worker.step().await;
+							worker
+						}
+						.boxed(),
+					);
+				}
+			}
+		};
+
+		select! {
+			_ = drain_everything => {
+				info!("All workers exited peacefully \\o/");
+			}
+			_ = tokio::time::sleep(Duration::from_secs(9)) => {
+				error!("Some workers could not exit in time, we are cancelling some things in the middle");
+			}
+		}
+	}
+}
+
+struct WorkerHandler {
+	task_id: usize,
+	stop_signal: watch::Receiver<bool>,
+	stop_signal_worker: watch::Receiver<bool>,
+	worker: Box<dyn Worker>,
+	state: WorkerState,
+	errors: usize,
+	consecutive_errors: usize,
+	last_error: Option<(String, u64)>,
+}
+
+impl WorkerHandler {
+	async fn step(&mut self) {
+		match self.state {
+			WorkerState::Busy => match self.worker.work(&mut self.stop_signal).await {
+				Ok(s) => {
+					self.state = s;
+					self.consecutive_errors = 0;
+				}
+				Err(e) => {
+					error!(
+						"Error in worker {} (TID {}): {}",
+						self.worker.name(),
+						self.task_id,
+						e
+					);
+					self.errors += 1;
+					self.consecutive_errors += 1;
+					self.last_error = Some((format!("{}", e), now_msec()));
+					// Sleep a bit so that error won't repeat immediately, exponential backoff
+					// strategy (min 1sec, max ~60sec)
+					self.state = WorkerState::Throttled(
+						(1.5f32).powf(std::cmp::min(10, self.consecutive_errors - 1) as f32),
+					);
+				}
+			},
+			WorkerState::Throttled(delay) => {
+				// Sleep for given delay and go back to busy state
+				if !*self.stop_signal.borrow() {
+					select! {
+						_ = tokio::time::sleep(Duration::from_secs_f32(delay)) => (),
+						_ = self.stop_signal.changed() => (),
+					}
+				}
+				self.state = WorkerState::Busy;
+			}
+			WorkerState::Idle => {
+				if *self.stop_signal.borrow() {
+					select! {
+						new_st = self.worker.wait_for_work(&self.stop_signal_worker) => {
+							self.state = new_st;
+						}
+						_ = tokio::time::sleep(Duration::from_secs(1)) => {
+							// stay in Idle state
+						}
+					}
+				} else {
+					select! {
+						new_st = self.worker.wait_for_work(&self.stop_signal_worker) => {
+							self.state = new_st;
+						}
+						_ = self.stop_signal.changed() => {
+							// stay in Idle state
+						}
+					}
+				}
+			}
+			WorkerState::Done => unreachable!(),
+		}
+	}
+}
diff --git a/src/util/config.rs b/src/util/config.rs
index e4d96476..2d4b4f57 100644
--- a/src/util/config.rs
+++ b/src/util/config.rs
@@ -3,12 +3,8 @@ use std::io::Read;
 use std::net::SocketAddr;
 use std::path::PathBuf;
 
-use serde::de::Error as SerdeError;
 use serde::{de, Deserialize};
 
-use netapp::util::parse_and_resolve_peer_addr;
-use netapp::NodeID;
-
 use crate::error::Error;
 
 /// Represent the whole configuration
@@ -23,10 +19,6 @@ pub struct Config {
 	#[serde(default = "default_block_size")]
 	pub block_size: usize,
 
-	/// Size of data blocks to save to disk
-	#[serde(default = "default_block_manager_background_tranquility")]
-	pub block_manager_background_tranquility: u32,
-
 	/// Replication mode. Supported values:
 	/// - none, 1 -> no replication
 	/// - 2 -> 2-way replication
@@ -47,11 +39,16 @@ pub struct Config {
 	/// Address to bind for RPC
 	pub rpc_bind_addr: SocketAddr,
 	/// Public IP address of this node
-	pub rpc_public_addr: Option<SocketAddr>,
+	pub rpc_public_addr: Option<String>,
+
+	/// Timeout for Netapp's ping messages
+	pub rpc_ping_timeout_msec: Option<u64>,
+	/// Timeout for Netapp RPC calls
+	pub rpc_timeout_msec: Option<u64>,
 
 	/// Bootstrap peers RPC address
-	#[serde(deserialize_with = "deserialize_vec_addr", default)]
-	pub bootstrap_peers: Vec<(NodeID, SocketAddr)>,
+	#[serde(default)]
+	pub bootstrap_peers: Vec<String>,
 	/// Consul host to connect to to discover more peers
 	pub consul_host: Option<String>,
 	/// Consul service name to use
@@ -64,19 +61,27 @@ pub struct Config {
 	#[serde(default)]
 	pub kubernetes_skip_crd: bool,
 
+	// -- DB
+	/// Database engine to use for metadata (options: sled, sqlite, lmdb)
+	#[serde(default = "default_db_engine")]
+	pub db_engine: String,
+
 	/// Sled cache size, in bytes
 	#[serde(default = "default_sled_cache_capacity")]
 	pub sled_cache_capacity: u64,
-
 	/// Sled flush interval in milliseconds
 	#[serde(default = "default_sled_flush_every_ms")]
 	pub sled_flush_every_ms: u64,
 
+	// -- APIs
 	/// Configuration for S3 api
-	pub s3_api: ApiConfig,
+	pub s3_api: S3ApiConfig,
+
+	/// Configuration for K2V api
+	pub k2v_api: Option<K2VApiConfig>,
 
 	/// Configuration for serving files as normal web server
-	pub s3_web: WebConfig,
+	pub s3_web: Option<WebConfig>,
 
 	/// Configuration for the admin API endpoint
 	#[serde(default = "Default::default")]
@@ -85,9 +90,9 @@ pub struct Config {
 
 /// Configuration for S3 api
 #[derive(Deserialize, Debug, Clone)]
-pub struct ApiConfig {
+pub struct S3ApiConfig {
 	/// Address and port to bind for api serving
-	pub api_bind_addr: SocketAddr,
+	pub api_bind_addr: Option<SocketAddr>,
 	/// S3 region to use
 	pub s3_region: String,
 	/// Suffix to remove from domain name to find bucket. If None,
@@ -95,6 +100,13 @@ pub struct ApiConfig {
 	pub root_domain: Option<String>,
 }
 
+/// Configuration for K2V api
+#[derive(Deserialize, Debug, Clone)]
+pub struct K2VApiConfig {
+	/// Address and port to bind for api serving
+	pub api_bind_addr: SocketAddr,
+}
+
 /// Configuration for serving files as normal web server
 #[derive(Deserialize, Debug, Clone)]
 pub struct WebConfig {
@@ -109,10 +121,18 @@ pub struct WebConfig {
 pub struct AdminConfig {
 	/// Address and port to bind for admin API serving
 	pub api_bind_addr: Option<SocketAddr>,
+	/// Bearer token to use to scrape metrics
+	pub metrics_token: Option<String>,
+	/// Bearer token to use to access Admin API endpoints
+	pub admin_token: Option<String>,
 	/// OTLP server to where to export traces
 	pub trace_sink: Option<String>,
 }
 
+fn default_db_engine() -> String {
+	"sled".into()
+}
+
 fn default_sled_cache_capacity() -> u64 {
 	128 * 1024 * 1024
 }
@@ -122,9 +142,6 @@ fn default_sled_flush_every_ms() -> u64 {
 fn default_block_size() -> usize {
 	1048576
 }
-fn default_block_manager_background_tranquility() -> u32 {
-	2
-}
 
 /// Read and parse configuration
 pub fn read_config(config_file: PathBuf) -> Result<Config, Error> {
@@ -138,24 +155,6 @@ pub fn read_config(config_file: PathBuf) -> Result<Config, Error> {
 	Ok(toml::from_str(&config)?)
 }
 
-fn deserialize_vec_addr<'de, D>(deserializer: D) -> Result<Vec<(NodeID, SocketAddr)>, D::Error>
-where
-	D: de::Deserializer<'de>,
-{
-	let mut ret = vec![];
-
-	for peer in <Vec<&str>>::deserialize(deserializer)? {
-		let (pubkey, addrs) = parse_and_resolve_peer_addr(peer).ok_or_else(|| {
-			D::Error::custom(format!("Unable to parse or resolve peer: {}", peer))
-		})?;
-		for ip in addrs {
-			ret.push((pubkey, ip));
-		}
-	}
-
-	Ok(ret)
-}
-
 fn default_compression() -> Option<i32> {
 	Some(1)
 }
diff --git a/src/util/crdt/bool.rs b/src/util/crdt/bool.rs
index 53af8f82..111eb5f1 100644
--- a/src/util/crdt/bool.rs
+++ b/src/util/crdt/bool.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use crate::crdt::crdt::*;
 
 /// Boolean, where `true` is an absorbing state
-#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq)]
 pub struct Bool(bool);
 
 impl Bool {
diff --git a/src/util/crdt/deletable.rs b/src/util/crdt/deletable.rs
index c76f5cbb..e771aceb 100644
--- a/src/util/crdt/deletable.rs
+++ b/src/util/crdt/deletable.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use crate::crdt::crdt::*;
 
 /// Deletable object (once deleted, cannot go back)
-#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq)]
 pub enum Deletable<T> {
 	Present(T),
 	Deleted,
diff --git a/src/util/crdt/lww.rs b/src/util/crdt/lww.rs
index 254abe8e..958844c9 100644
--- a/src/util/crdt/lww.rs
+++ b/src/util/crdt/lww.rs
@@ -37,7 +37,7 @@ use crate::crdt::crdt::*;
 ///
 /// This scheme is used by AWS S3 or Soundcloud and often without knowing
 /// in enterprise when reconciliating databases with ad-hoc scripts.
-#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
 pub struct Lww<T> {
 	ts: u64,
 	v: T,
diff --git a/src/util/crdt/lww_map.rs b/src/util/crdt/lww_map.rs
index c155c3a8..88113856 100644
--- a/src/util/crdt/lww_map.rs
+++ b/src/util/crdt/lww_map.rs
@@ -23,7 +23,7 @@ use crate::crdt::crdt::*;
 /// However, note that even if we were using a more efficient data structure such as a `BTreeMap`,
 /// the serialization cost `O(n)` would still have to be paid at each modification, so we are
 /// actually not losing anything here.
-#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
 pub struct LwwMap<K, V> {
 	vals: Vec<(K, u64, V)>,
 }
@@ -140,6 +140,11 @@ where
 		self.vals.clear();
 	}
 
+	/// Retain only values that match a certain predicate
+	pub fn retain(&mut self, pred: impl FnMut(&(K, u64, V)) -> bool) {
+		self.vals.retain(pred);
+	}
+
 	/// Get a reference to the value assigned to a key
 	pub fn get(&self, k: &K) -> Option<&V> {
 		match self.vals.binary_search_by(|(k2, _, _)| k2.cmp(k)) {
diff --git a/src/util/crdt/map.rs b/src/util/crdt/map.rs
index f9ed19b6..5d1e1520 100644
--- a/src/util/crdt/map.rs
+++ b/src/util/crdt/map.rs
@@ -16,7 +16,7 @@ use crate::crdt::crdt::*;
 /// However, note that even if we were using a more efficient data structure such as a `BTreeMap`,
 /// the serialization cost `O(n)` would still have to be paid at each modification, so we are
 /// actually not losing anything here.
-#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
 pub struct Map<K, V> {
 	vals: Vec<(K, V)>,
 }
diff --git a/src/util/error.rs b/src/util/error.rs
index bdb3a69b..9995c746 100644
--- a/src/util/error.rs
+++ b/src/util/error.rs
@@ -26,8 +26,8 @@ pub enum Error {
 	#[error(display = "Netapp error: {}", _0)]
 	Netapp(#[error(source)] netapp::error::Error),
 
-	#[error(display = "Sled error: {}", _0)]
-	Sled(#[error(source)] sled::Error),
+	#[error(display = "DB error: {}", _0)]
+	Db(#[error(source)] garage_db::Error),
 
 	#[error(display = "Messagepack encode error: {}", _0)]
 	RmpEncode(#[error(source)] rmp_serde::encode::Error),
@@ -44,6 +44,9 @@ pub enum Error {
 	#[error(display = "Tokio semaphore acquire error: {}", _0)]
 	TokioSemAcquire(#[error(source)] tokio::sync::AcquireError),
 
+	#[error(display = "Tokio broadcast receive error: {}", _0)]
+	TokioBcastRecv(#[error(source)] tokio::sync::broadcast::error::RecvError),
+
 	#[error(display = "Remote error: {}", _0)]
 	RemoteError(String),
 
@@ -75,11 +78,11 @@ impl Error {
 	}
 }
 
-impl From<sled::transaction::TransactionError<Error>> for Error {
-	fn from(e: sled::transaction::TransactionError<Error>) -> Error {
+impl From<garage_db::TxError<Error>> for Error {
+	fn from(e: garage_db::TxError<Error>) -> Error {
 		match e {
-			sled::transaction::TransactionError::Abort(x) => x,
-			sled::transaction::TransactionError::Storage(x) => Error::Sled(x),
+			garage_db::TxError::Abort(x) => x,
+			garage_db::TxError::Db(x) => Error::Db(x),
 		}
 	}
 }
diff --git a/src/util/formater.rs b/src/util/formater.rs
new file mode 100644
index 00000000..95324f9a
--- /dev/null
+++ b/src/util/formater.rs
@@ -0,0 +1,28 @@
+pub fn format_table(data: Vec<String>) {
+	let data = data
+		.iter()
+		.map(|s| s.split('\t').collect::<Vec<_>>())
+		.collect::<Vec<_>>();
+
+	let columns = data.iter().map(|row| row.len()).fold(0, std::cmp::max);
+	let mut column_size = vec![0; columns];
+
+	let mut out = String::new();
+
+	for row in data.iter() {
+		for (i, col) in row.iter().enumerate() {
+			column_size[i] = std::cmp::max(column_size[i], col.chars().count());
+		}
+	}
+
+	for row in data.iter() {
+		for (col, col_len) in row[..row.len() - 1].iter().zip(column_size.iter()) {
+			out.push_str(col);
+			(0..col_len - col.chars().count() + 2).for_each(|_| out.push(' '));
+		}
+		out.push_str(row[row.len() - 1]);
+		out.push('\n');
+	}
+
+	print!("{}", out);
+}
diff --git a/src/util/lib.rs b/src/util/lib.rs
index e83fc2e6..264cc192 100644
--- a/src/util/lib.rs
+++ b/src/util/lib.rs
@@ -3,14 +3,16 @@
 #[macro_use]
 extern crate tracing;
 
+pub mod async_hash;
 pub mod background;
 pub mod config;
 pub mod crdt;
 pub mod data;
 pub mod error;
+pub mod formater;
 pub mod metrics;
 pub mod persister;
-pub mod sled_counter;
 pub mod time;
 pub mod token_bucket;
 pub mod tranquilizer;
+pub mod version;
diff --git a/src/util/metrics.rs b/src/util/metrics.rs
index 1b05eabe..b882a886 100644
--- a/src/util/metrics.rs
+++ b/src/util/metrics.rs
@@ -1,4 +1,4 @@
-use std::time::SystemTime;
+use std::time::Instant;
 
 use futures::{future::BoxFuture, Future, FutureExt};
 use rand::Rng;
@@ -28,10 +28,12 @@ where
 		attributes: &'a [KeyValue],
 	) -> BoxFuture<'a, Self::Output> {
 		async move {
-			let request_start = SystemTime::now();
+			let request_start = Instant::now();
 			let res = self.await;
 			r.record(
-				request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()),
+				Instant::now()
+					.saturating_duration_since(request_start)
+					.as_secs_f64(),
 				attributes,
 			);
 			res
@@ -41,9 +43,13 @@ where
 
 	fn bound_record_duration(self, r: &'a BoundValueRecorder<f64>) -> BoxFuture<'a, Self::Output> {
 		async move {
-			let request_start = SystemTime::now();
+			let request_start = Instant::now();
 			let res = self.await;
-			r.record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64()));
+			r.record(
+				Instant::now()
+					.saturating_duration_since(request_start)
+					.as_secs_f64(),
+			);
 			res
 		}
 		.boxed()
diff --git a/src/util/sled_counter.rs b/src/util/sled_counter.rs
deleted file mode 100644
index bc54cea0..00000000
--- a/src/util/sled_counter.rs
+++ /dev/null
@@ -1,100 +0,0 @@
-use std::sync::{
-	atomic::{AtomicUsize, Ordering},
-	Arc,
-};
-
-use sled::{CompareAndSwapError, IVec, Iter, Result, Tree};
-
-#[derive(Clone)]
-pub struct SledCountedTree(Arc<SledCountedTreeInternal>);
-
-struct SledCountedTreeInternal {
-	tree: Tree,
-	len: AtomicUsize,
-}
-
-impl SledCountedTree {
-	pub fn new(tree: Tree) -> Self {
-		let len = tree.len();
-		Self(Arc::new(SledCountedTreeInternal {
-			tree,
-			len: AtomicUsize::new(len),
-		}))
-	}
-
-	pub fn len(&self) -> usize {
-		self.0.len.load(Ordering::Relaxed)
-	}
-
-	pub fn is_empty(&self) -> bool {
-		self.0.tree.is_empty()
-	}
-
-	pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<IVec>> {
-		self.0.tree.get(key)
-	}
-
-	pub fn iter(&self) -> Iter {
-		self.0.tree.iter()
-	}
-
-	// ---- writing functions ----
-
-	pub fn insert<K, V>(&self, key: K, value: V) -> Result<Option<IVec>>
-	where
-		K: AsRef<[u8]>,
-		V: Into<IVec>,
-	{
-		let res = self.0.tree.insert(key, value);
-		if res == Ok(None) {
-			self.0.len.fetch_add(1, Ordering::Relaxed);
-		}
-		res
-	}
-
-	pub fn remove<K: AsRef<[u8]>>(&self, key: K) -> Result<Option<IVec>> {
-		let res = self.0.tree.remove(key);
-		if matches!(res, Ok(Some(_))) {
-			self.0.len.fetch_sub(1, Ordering::Relaxed);
-		}
-		res
-	}
-
-	pub fn pop_min(&self) -> Result<Option<(IVec, IVec)>> {
-		let res = self.0.tree.pop_min();
-		if let Ok(Some(_)) = &res {
-			self.0.len.fetch_sub(1, Ordering::Relaxed);
-		};
-		res
-	}
-
-	pub fn compare_and_swap<K, OV, NV>(
-		&self,
-		key: K,
-		old: Option<OV>,
-		new: Option<NV>,
-	) -> Result<std::result::Result<(), CompareAndSwapError>>
-	where
-		K: AsRef<[u8]>,
-		OV: AsRef<[u8]>,
-		NV: Into<IVec>,
-	{
-		let old_some = old.is_some();
-		let new_some = new.is_some();
-
-		let res = self.0.tree.compare_and_swap(key, old, new);
-
-		if res == Ok(Ok(())) {
-			match (old_some, new_some) {
-				(false, true) => {
-					self.0.len.fetch_add(1, Ordering::Relaxed);
-				}
-				(true, false) => {
-					self.0.len.fetch_sub(1, Ordering::Relaxed);
-				}
-				_ => (),
-			}
-		}
-		res
-	}
-}
diff --git a/src/util/tranquilizer.rs b/src/util/tranquilizer.rs
index 28711387..8a96cbb3 100644
--- a/src/util/tranquilizer.rs
+++ b/src/util/tranquilizer.rs
@@ -3,6 +3,8 @@ use std::time::{Duration, Instant};
 
 use tokio::time::sleep;
 
+use crate::background::WorkerState;
+
 /// A tranquilizer is a helper object that is used to make
 /// background operations not take up too much time.
 ///
@@ -33,8 +35,8 @@ impl Tranquilizer {
 		}
 	}
 
-	pub async fn tranquilize(&mut self, tranquility: u32) {
-		let observation = Instant::now() - self.last_step_begin;
+	fn tranquilize_internal(&mut self, tranquility: u32) -> Option<Duration> {
+		let observation = Instant::now().saturating_duration_since(self.last_step_begin);
 
 		self.observations.push_back(observation);
 		self.sum_observations += observation;
@@ -45,13 +47,32 @@ impl Tranquilizer {
 
 		if !self.observations.is_empty() {
 			let delay = (tranquility * self.sum_observations) / (self.observations.len() as u32);
+			Some(delay)
+		} else {
+			None
+		}
+	}
+
+	pub async fn tranquilize(&mut self, tranquility: u32) {
+		if let Some(delay) = self.tranquilize_internal(tranquility) {
 			sleep(delay).await;
+			self.reset();
 		}
+	}
 
-		self.reset();
+	#[must_use]
+	pub fn tranquilize_worker(&mut self, tranquility: u32) -> WorkerState {
+		match self.tranquilize_internal(tranquility) {
+			Some(delay) => WorkerState::Throttled(delay.as_secs_f32()),
+			None => WorkerState::Busy,
+		}
 	}
 
 	pub fn reset(&mut self) {
 		self.last_step_begin = Instant::now();
 	}
+
+	pub fn clear(&mut self) {
+		self.observations.clear();
+	}
 }
diff --git a/src/util/version.rs b/src/util/version.rs
new file mode 100644
index 00000000..b515dccc
--- /dev/null
+++ b/src/util/version.rs
@@ -0,0 +1,28 @@
+use std::sync::Arc;
+
+use arc_swap::{ArcSwap, ArcSwapOption};
+
+lazy_static::lazy_static! {
+	static ref VERSION: ArcSwap<&'static str> = ArcSwap::new(Arc::new(git_version::git_version!(
+		prefix = "git:",
+		cargo_prefix = "cargo:",
+		fallback = "unknown"
+	)));
+	static ref FEATURES: ArcSwapOption<&'static [&'static str]> = ArcSwapOption::new(None);
+}
+
+pub fn garage_version() -> &'static str {
+	&VERSION.load()
+}
+
+pub fn garage_features() -> Option<&'static [&'static str]> {
+	FEATURES.load().as_ref().map(|f| &f[..])
+}
+
+pub fn init_version(version: &'static str) {
+	VERSION.store(Arc::new(version));
+}
+
+pub fn init_features(features: &'static [&'static str]) {
+	FEATURES.store(Some(Arc::new(features)));
+}
diff --git a/src/web/Cargo.toml b/src/web/Cargo.toml
index 59a1231d..7bf70c55 100644
--- a/src/web/Cargo.toml
+++ b/src/web/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "garage_web"
-version = "0.7.0"
+version = "0.8.0"
 authors = ["Alex Auvolat <alex@adnab.me>", "Quentin Dufour <quentin@dufour.io>"]
 edition = "2018"
 license = "AGPL-3.0"
@@ -14,10 +14,10 @@ path = "lib.rs"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-garage_api = { version = "0.7.0", path = "../api" }
-garage_model = { version = "0.7.0", path = "../model" }
-garage_util = { version = "0.7.0", path = "../util" }
-garage_table = { version = "0.7.0", path = "../table" }
+garage_api = { version = "0.8.0", path = "../api" }
+garage_model = { version = "0.8.0", path = "../model" }
+garage_util = { version = "0.8.0", path = "../util" }
+garage_table = { version = "0.8.0", path = "../table" }
 
 err-derive = "0.3"
 tracing = "0.1.30"
diff --git a/src/web/error.rs b/src/web/error.rs
index 55990e9d..bd8f17b5 100644
--- a/src/web/error.rs
+++ b/src/web/error.rs
@@ -2,57 +2,47 @@ use err_derive::Error;
 use hyper::header::HeaderValue;
 use hyper::{HeaderMap, StatusCode};
 
-use garage_util::error::Error as GarageError;
+use garage_api::generic_server::ApiError;
 
 /// Errors of this crate
 #[derive(Debug, Error)]
 pub enum Error {
 	/// An error received from the API crate
 	#[error(display = "API error: {}", _0)]
-	ApiError(#[error(source)] garage_api::Error),
-
-	// Category: internal error
-	/// Error internal to garage
-	#[error(display = "Internal error: {}", _0)]
-	InternalError(#[error(source)] GarageError),
+	ApiError(garage_api::s3::error::Error),
 
 	/// The file does not exist
 	#[error(display = "Not found")]
 	NotFound,
 
-	/// The request contained an invalid UTF-8 sequence in its path or in other parameters
-	#[error(display = "Invalid UTF-8: {}", _0)]
-	InvalidUtf8(#[error(source)] std::str::Utf8Error),
-
-	/// The client send a header with invalid value
-	#[error(display = "Invalid header value: {}", _0)]
-	InvalidHeader(#[error(source)] hyper::header::ToStrError),
-
 	/// The client sent a request without host, or with unsupported method
 	#[error(display = "Bad request: {}", _0)]
 	BadRequest(String),
 }
 
+impl<T> From<T> for Error
+where
+	garage_api::s3::error::Error: From<T>,
+{
+	fn from(err: T) -> Self {
+		Error::ApiError(garage_api::s3::error::Error::from(err))
+	}
+}
+
 impl Error {
 	/// Transform errors into http status code
 	pub fn http_status_code(&self) -> StatusCode {
 		match self {
 			Error::NotFound => StatusCode::NOT_FOUND,
 			Error::ApiError(e) => e.http_status_code(),
-			Error::InternalError(
-				GarageError::Timeout
-				| GarageError::RemoteError(_)
-				| GarageError::Quorum(_, _, _, _),
-			) => StatusCode::SERVICE_UNAVAILABLE,
-			Error::InternalError(_) => StatusCode::INTERNAL_SERVER_ERROR,
-			_ => StatusCode::BAD_REQUEST,
+			Error::BadRequest(_) => StatusCode::BAD_REQUEST,
 		}
 	}
 
 	pub fn add_headers(&self, header_map: &mut HeaderMap<HeaderValue>) {
 		#[allow(clippy::single_match)]
 		match self {
-			Error::ApiError(e) => e.add_headers(header_map),
+			Error::ApiError(e) => e.add_http_headers(header_map),
 			_ => (),
 		}
 	}
diff --git a/src/web/lib.rs b/src/web/lib.rs
index 9b7c8573..7207c365 100644
--- a/src/web/lib.rs
+++ b/src/web/lib.rs
@@ -6,4 +6,4 @@ mod error;
 pub use error::Error;
 
 mod web_server;
-pub use web_server::run_web_server;
+pub use web_server::WebServer;
diff --git a/src/web/web_server.rs b/src/web/web_server.rs
index c3d691d0..c2322073 100644
--- a/src/web/web_server.rs
+++ b/src/web/web_server.rs
@@ -18,10 +18,12 @@ use opentelemetry::{
 
 use crate::error::*;
 
-use garage_api::error::{Error as ApiError, OkOrBadRequest, OkOrInternalError};
 use garage_api::helpers::{authority_to_host, host_to_bucket};
-use garage_api::s3_cors::{add_cors_headers, find_matching_cors_rule, handle_options_for_bucket};
-use garage_api::s3_get::{handle_get, handle_head};
+use garage_api::s3::cors::{add_cors_headers, find_matching_cors_rule, handle_options_for_bucket};
+use garage_api::s3::error::{
+	CommonErrorDerivative, Error as ApiError, OkOrBadRequest, OkOrInternalError,
+};
+use garage_api::s3::get::{handle_get, handle_head};
 
 use garage_model::garage::Garage;
 
@@ -55,90 +57,226 @@ impl WebMetrics {
 	}
 }
 
-/// Run a web server
-pub async fn run_web_server(
+pub struct WebServer {
 	garage: Arc<Garage>,
-	shutdown_signal: impl Future<Output = ()>,
-) -> Result<(), GarageError> {
-	let addr = &garage.config.s3_web.bind_addr;
+	metrics: Arc<WebMetrics>,
+	root_domain: String,
+}
 
-	let metrics = Arc::new(WebMetrics::new());
+impl WebServer {
+	/// Run a web server
+	pub async fn run(
+		garage: Arc<Garage>,
+		addr: SocketAddr,
+		root_domain: String,
+		shutdown_signal: impl Future<Output = ()>,
+	) -> Result<(), GarageError> {
+		let metrics = Arc::new(WebMetrics::new());
+		let web_server = Arc::new(WebServer {
+			garage,
+			metrics,
+			root_domain,
+		});
+
+		let service = make_service_fn(|conn: &AddrStream| {
+			let web_server = web_server.clone();
+
+			let client_addr = conn.remote_addr();
+			async move {
+				Ok::<_, Error>(service_fn(move |req: Request<Body>| {
+					let web_server = web_server.clone();
+
+					web_server.handle_request(req, client_addr)
+				}))
+			}
+		});
 
-	let service = make_service_fn(|conn: &AddrStream| {
-		let garage = garage.clone();
-		let metrics = metrics.clone();
+		let server = Server::bind(&addr).serve(service);
+		let graceful = server.with_graceful_shutdown(shutdown_signal);
+		info!("Web server listening on http://{}", addr);
 
-		let client_addr = conn.remote_addr();
-		async move {
-			Ok::<_, Error>(service_fn(move |req: Request<Body>| {
-				let garage = garage.clone();
-				let metrics = metrics.clone();
+		graceful.await?;
+		Ok(())
+	}
 
-				handle_request(garage, metrics, req, client_addr)
-			}))
+	async fn handle_request(
+		self: Arc<Self>,
+		req: Request<Body>,
+		addr: SocketAddr,
+	) -> Result<Response<Body>, Infallible> {
+		info!("{} {} {}", addr, req.method(), req.uri());
+
+		// Lots of instrumentation
+		let tracer = opentelemetry::global::tracer("garage");
+		let span = tracer
+			.span_builder(format!("Web {} request", req.method()))
+			.with_trace_id(gen_trace_id())
+			.with_attributes(vec![
+				KeyValue::new("method", format!("{}", req.method())),
+				KeyValue::new("uri", req.uri().to_string()),
+			])
+			.start(&tracer);
+
+		let metrics_tags = &[KeyValue::new("method", req.method().to_string())];
+
+		// The actual handler
+		let res = self
+			.serve_file(&req)
+			.with_context(Context::current_with_span(span))
+			.record_duration(&self.metrics.request_duration, &metrics_tags[..])
+			.await;
+
+		// More instrumentation
+		self.metrics.request_counter.add(1, &metrics_tags[..]);
+
+		// Returning the result
+		match res {
+			Ok(res) => {
+				debug!("{} {} {}", req.method(), res.status(), req.uri());
+				Ok(res)
+			}
+			Err(error) => {
+				info!(
+					"{} {} {} {}",
+					req.method(),
+					error.http_status_code(),
+					req.uri(),
+					error
+				);
+				self.metrics.error_counter.add(
+					1,
+					&[
+						metrics_tags[0].clone(),
+						KeyValue::new("status_code", error.http_status_code().to_string()),
+					],
+				);
+				Ok(error_to_res(error))
+			}
 		}
-	});
+	}
 
-	let server = Server::bind(addr).serve(service);
-	let graceful = server.with_graceful_shutdown(shutdown_signal);
-	info!("Web server listening on http://{}", addr);
+	async fn serve_file(self: &Arc<Self>, req: &Request<Body>) -> Result<Response<Body>, Error> {
+		// Get http authority string (eg. [::1]:3902 or garage.tld:80)
+		let authority = req
+			.headers()
+			.get(HOST)
+			.ok_or_bad_request("HOST header required")?
+			.to_str()?;
+
+		// Get bucket
+		let host = authority_to_host(authority)?;
+
+		let bucket_name = host_to_bucket(&host, &self.root_domain).unwrap_or(&host);
+		let bucket_id = self
+			.garage
+			.bucket_alias_table
+			.get(&EmptyKey, &bucket_name.to_string())
+			.await?
+			.and_then(|x| x.state.take())
+			.ok_or(Error::NotFound)?;
+
+		// Check bucket isn't deleted and has website access enabled
+		let bucket = self
+			.garage
+			.bucket_table
+			.get(&EmptyKey, &bucket_id)
+			.await?
+			.ok_or(Error::NotFound)?;
+
+		let website_config = bucket
+			.params()
+			.ok_or(Error::NotFound)?
+			.website_config
+			.get()
+			.as_ref()
+			.ok_or(Error::NotFound)?;
+
+		// Get path
+		let path = req.uri().path().to_string();
+		let index = &website_config.index_document;
+		let key = path_to_key(&path, index)?;
+
+		debug!(
+			"Selected bucket: \"{}\" {:?}, selected key: \"{}\"",
+			bucket_name, bucket_id, key
+		);
+
+		let ret_doc = match *req.method() {
+			Method::OPTIONS => handle_options_for_bucket(req, &bucket),
+			Method::HEAD => handle_head(self.garage.clone(), req, bucket_id, &key, None).await,
+			Method::GET => handle_get(self.garage.clone(), req, bucket_id, &key, None).await,
+			_ => Err(ApiError::bad_request("HTTP method not supported")),
+		}
+		.map_err(Error::from);
+
+		match ret_doc {
+			Err(error) => {
+				// For a HEAD or OPTIONS method, and for non-4xx errors,
+				// we don't return the error document as content,
+				// we return above and just return the error message
+				// by relying on err_to_res that is called when we return an Err.
+				if *req.method() == Method::HEAD
+					|| *req.method() == Method::OPTIONS
+					|| !error.http_status_code().is_client_error()
+				{
+					return Err(error);
+				}
 
-	graceful.await?;
-	Ok(())
-}
+				// If no error document is set: just return the error directly
+				let error_document = match &website_config.error_document {
+					Some(ed) => ed.trim_start_matches('/').to_owned(),
+					None => return Err(error),
+				};
+
+				// We want to return the error document
+				// Create a fake HTTP request with path = the error document
+				let req2 = Request::builder()
+					.uri(format!("http://{}/{}", host, &error_document))
+					.body(Body::empty())
+					.unwrap();
+
+				match handle_get(self.garage.clone(), &req2, bucket_id, &error_document, None).await
+				{
+					Ok(mut error_doc) => {
+						// The error won't be logged back in handle_request,
+						// so log it here
+						info!(
+							"{} {} {} {}",
+							req.method(),
+							req.uri(),
+							error.http_status_code(),
+							error
+						);
+
+						*error_doc.status_mut() = error.http_status_code();
+						error.add_headers(error_doc.headers_mut());
+
+						// Preserve error message in a special header
+						for error_line in error.to_string().split('\n') {
+							if let Ok(v) = HeaderValue::from_bytes(error_line.as_bytes()) {
+								error_doc.headers_mut().append("X-Garage-Error", v);
+							}
+						}
 
-async fn handle_request(
-	garage: Arc<Garage>,
-	metrics: Arc<WebMetrics>,
-	req: Request<Body>,
-	addr: SocketAddr,
-) -> Result<Response<Body>, Infallible> {
-	info!("{} {} {}", addr, req.method(), req.uri());
-
-	// Lots of instrumentation
-	let tracer = opentelemetry::global::tracer("garage");
-	let span = tracer
-		.span_builder(format!("Web {} request", req.method()))
-		.with_trace_id(gen_trace_id())
-		.with_attributes(vec![
-			KeyValue::new("method", format!("{}", req.method())),
-			KeyValue::new("uri", req.uri().to_string()),
-		])
-		.start(&tracer);
-
-	let metrics_tags = &[KeyValue::new("method", req.method().to_string())];
-
-	// The actual handler
-	let res = serve_file(garage, &req)
-		.with_context(Context::current_with_span(span))
-		.record_duration(&metrics.request_duration, &metrics_tags[..])
-		.await;
-
-	// More instrumentation
-	metrics.request_counter.add(1, &metrics_tags[..]);
-
-	// Returning the result
-	match res {
-		Ok(res) => {
-			debug!("{} {} {}", req.method(), res.status(), req.uri());
-			Ok(res)
-		}
-		Err(error) => {
-			info!(
-				"{} {} {} {}",
-				req.method(),
-				error.http_status_code(),
-				req.uri(),
-				error
-			);
-			metrics.error_counter.add(
-				1,
-				&[
-					metrics_tags[0].clone(),
-					KeyValue::new("status_code", error.http_status_code().to_string()),
-				],
-			);
-			Ok(error_to_res(error))
+						Ok(error_doc)
+					}
+					Err(error_doc_error) => {
+						warn!(
+							"Couldn't get error document {} for bucket {:?}: {}",
+							error_document, bucket_id, error_doc_error
+						);
+						Err(error)
+					}
+				}
+			}
+			Ok(mut resp) => {
+				// Maybe add CORS headers
+				if let Some(rule) = find_matching_cors_rule(&bucket, req)? {
+					add_cors_headers(&mut resp, rule)
+						.ok_or_internal_error("Invalid bucket CORS configuration")?;
+				}
+				Ok(resp)
+			}
 		}
 	}
 }
@@ -158,129 +296,6 @@ fn error_to_res(e: Error) -> Response<Body> {
 	http_error
 }
 
-async fn serve_file(garage: Arc<Garage>, req: &Request<Body>) -> Result<Response<Body>, Error> {
-	// Get http authority string (eg. [::1]:3902 or garage.tld:80)
-	let authority = req
-		.headers()
-		.get(HOST)
-		.ok_or_bad_request("HOST header required")?
-		.to_str()?;
-
-	// Get bucket
-	let host = authority_to_host(authority)?;
-	let root = &garage.config.s3_web.root_domain;
-
-	let bucket_name = host_to_bucket(&host, root).unwrap_or(&host);
-	let bucket_id = garage
-		.bucket_alias_table
-		.get(&EmptyKey, &bucket_name.to_string())
-		.await?
-		.and_then(|x| x.state.take())
-		.ok_or(Error::NotFound)?;
-
-	// Check bucket isn't deleted and has website access enabled
-	let bucket = garage
-		.bucket_table
-		.get(&EmptyKey, &bucket_id)
-		.await?
-		.ok_or(Error::NotFound)?;
-
-	let website_config = bucket
-		.params()
-		.ok_or(Error::NotFound)?
-		.website_config
-		.get()
-		.as_ref()
-		.ok_or(Error::NotFound)?;
-
-	// Get path
-	let path = req.uri().path().to_string();
-	let index = &website_config.index_document;
-	let key = path_to_key(&path, index)?;
-
-	debug!(
-		"Selected bucket: \"{}\" {:?}, selected key: \"{}\"",
-		bucket_name, bucket_id, key
-	);
-
-	let ret_doc = match *req.method() {
-		Method::OPTIONS => handle_options_for_bucket(req, &bucket),
-		Method::HEAD => handle_head(garage.clone(), req, bucket_id, &key, None).await,
-		Method::GET => handle_get(garage.clone(), req, bucket_id, &key, None).await,
-		_ => Err(ApiError::BadRequest("HTTP method not supported".into())),
-	}
-	.map_err(Error::from);
-
-	match ret_doc {
-		Err(error) => {
-			// For a HEAD or OPTIONS method, and for non-4xx errors,
-			// we don't return the error document as content,
-			// we return above and just return the error message
-			// by relying on err_to_res that is called when we return an Err.
-			if *req.method() == Method::HEAD
-				|| *req.method() == Method::OPTIONS
-				|| !error.http_status_code().is_client_error()
-			{
-				return Err(error);
-			}
-
-			// If no error document is set: just return the error directly
-			let error_document = match &website_config.error_document {
-				Some(ed) => ed.trim_start_matches('/').to_owned(),
-				None => return Err(error),
-			};
-
-			// We want to return the error document
-			// Create a fake HTTP request with path = the error document
-			let req2 = Request::builder()
-				.uri(format!("http://{}/{}", host, &error_document))
-				.body(Body::empty())
-				.unwrap();
-
-			match handle_get(garage, &req2, bucket_id, &error_document, None).await {
-				Ok(mut error_doc) => {
-					// The error won't be logged back in handle_request,
-					// so log it here
-					info!(
-						"{} {} {} {}",
-						req.method(),
-						req.uri(),
-						error.http_status_code(),
-						error
-					);
-
-					*error_doc.status_mut() = error.http_status_code();
-					error.add_headers(error_doc.headers_mut());
-
-					// Preserve error message in a special header
-					for error_line in error.to_string().split('\n') {
-						if let Ok(v) = HeaderValue::from_bytes(error_line.as_bytes()) {
-							error_doc.headers_mut().append("X-Garage-Error", v);
-						}
-					}
-
-					Ok(error_doc)
-				}
-				Err(error_doc_error) => {
-					warn!(
-						"Couldn't get error document {} for bucket {:?}: {}",
-						error_document, bucket_id, error_doc_error
-					);
-					Err(error)
-				}
-			}
-		}
-		Ok(mut resp) => {
-			// Maybe add CORS headers
-			if let Some(rule) = find_matching_cors_rule(&bucket, req)? {
-				add_cors_headers(&mut resp, rule)
-					.ok_or_internal_error("Invalid bucket CORS configuration")?;
-			}
-			Ok(resp)
-		}
-	}
-}
-
 /// Path to key
 ///
 /// Convert the provided path to the internal key
@@ -290,9 +305,7 @@ fn path_to_key<'a>(path: &'a str, index: &str) -> Result<Cow<'a, str>, Error> {
 	let path_utf8 = percent_encoding::percent_decode_str(path).decode_utf8()?;
 
 	if !path_utf8.starts_with('/') {
-		return Err(Error::BadRequest(
-			"Path must start with a / (slash)".to_string(),
-		));
+		return Err(Error::BadRequest("Path must start with a / (slash)".into()));
 	}
 
 	match path_utf8.chars().last() {
author	Mendes <mendes.oulamara@pm.me>	2022-10-04 18:14:49 +0200
committer	Mendes <mendes.oulamara@pm.me>	2022-10-04 18:14:49 +0200
commit	829f815a897b04986559910bbcbf53625adcdf20 (patch)
tree	6db3c27cff2aded754a641d1f2b05c83be701267 /src
parent	99f96b9564c9c841dc6c56f1255a6e70ff884d46 (diff)
parent	a096ced35562bd0a8877a1ee2f755be1edafe343 (diff)
download	garage-829f815a897b04986559910bbcbf53625adcdf20.tar.gz garage-829f815a897b04986559910bbcbf53625adcdf20.zip