From 5768bf362262f78376af14517c4921941986192e Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 10 May 2022 13:16:57 +0200 Subject: First implementation of K2V (#293) **Specification:** View spec at [this URL](https://git.deuxfleurs.fr/Deuxfleurs/garage/src/branch/k2v/doc/drafts/k2v-spec.md) - [x] Specify the structure of K2V triples - [x] Specify the DVVS format used for causality detection - [x] Specify the K2V index (just a counter of number of values per partition key) - [x] Specify single-item endpoints: ReadItem, InsertItem, DeleteItem - [x] Specify index endpoint: ReadIndex - [x] Specify multi-item endpoints: InsertBatch, ReadBatch, DeleteBatch - [x] Move to JSON objects instead of tuples - [x] Specify endpoints for polling for updates on single values (PollItem) **Implementation:** - [x] Table for K2V items, causal contexts - [x] Indexing mechanism and table for K2V index - [x] Make API handlers a bit more generic - [x] K2V API endpoint - [x] K2V API router - [x] ReadItem - [x] InsertItem - [x] DeleteItem - [x] PollItem - [x] ReadIndex - [x] InsertBatch - [x] ReadBatch - [x] DeleteBatch **Testing:** - [x] Just a simple Python script that does some requests to check visually that things are going right (does not contain parsing of results or assertions on returned values) - [x] Actual tests: - [x] Adapt testing framework - [x] Simple test with InsertItem + ReadItem - [x] Test with several Insert/Read/DeleteItem + ReadIndex - [x] Test all combinations of return formats for ReadItem - [x] Test with ReadBatch, InsertBatch, DeleteBatch - [x] Test with PollItem - [x] Test error codes - [ ] Fix most broken stuff - [x] test PollItem broken randomly - [x] when invalid causality tokens are given, errors should be 4xx not 5xx **Improvements:** - [x] Descending range queries - [x] Specify - [x] Implement - [x] Add test - [x] Batch updates to index counter - [x] Put K2V behind `k2v` feature flag Co-authored-by: Alex Auvolat Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/293 Co-authored-by: Alex Co-committed-by: Alex --- src/api/s3/get.rs | 461 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 461 insertions(+) create mode 100644 src/api/s3/get.rs (limited to 'src/api/s3/get.rs') diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs new file mode 100644 index 00000000..3edf22a6 --- /dev/null +++ b/src/api/s3/get.rs @@ -0,0 +1,461 @@ +//! Function related to GET and HEAD requests +use std::sync::Arc; +use std::time::{Duration, UNIX_EPOCH}; + +use futures::stream::*; +use http::header::{ + ACCEPT_RANGES, CONTENT_LENGTH, CONTENT_RANGE, CONTENT_TYPE, ETAG, IF_MODIFIED_SINCE, + IF_NONE_MATCH, LAST_MODIFIED, RANGE, +}; +use hyper::body::Bytes; +use hyper::{Body, Request, Response, StatusCode}; + +use garage_table::EmptyKey; +use garage_util::data::*; + +use garage_model::garage::Garage; +use garage_model::s3::object_table::*; +use garage_model::s3::version_table::*; + +use crate::error::*; + +const X_AMZ_MP_PARTS_COUNT: &str = "x-amz-mp-parts-count"; + +fn object_headers( + version: &ObjectVersion, + version_meta: &ObjectVersionMeta, +) -> http::response::Builder { + debug!("Version meta: {:?}", version_meta); + + let date = UNIX_EPOCH + Duration::from_millis(version.timestamp); + let date_str = httpdate::fmt_http_date(date); + + let mut resp = Response::builder() + .header(CONTENT_TYPE, version_meta.headers.content_type.to_string()) + .header(LAST_MODIFIED, date_str) + .header(ACCEPT_RANGES, "bytes".to_string()); + + if !version_meta.etag.is_empty() { + resp = resp.header(ETAG, format!("\"{}\"", version_meta.etag)); + } + + for (k, v) in version_meta.headers.other.iter() { + resp = resp.header(k, v.to_string()); + } + + resp +} + +fn try_answer_cached( + version: &ObjectVersion, + version_meta: &ObjectVersionMeta, + req: &Request, +) -> Option> { + // It is possible, and is even usually the case, [that both If-None-Match and + // If-Modified-Since] are present in a request. In this situation If-None-Match takes + // precedence and If-Modified-Since is ignored (as per 6.Precedence from rfc7232). The rational + // being that etag based matching is more accurate, it has no issue with sub-second precision + // for instance (in case of very fast updates) + let cached = if let Some(none_match) = req.headers().get(IF_NONE_MATCH) { + let none_match = none_match.to_str().ok()?; + let expected = format!("\"{}\"", version_meta.etag); + let found = none_match + .split(',') + .map(str::trim) + .any(|etag| etag == expected || etag == "\"*\""); + found + } else if let Some(modified_since) = req.headers().get(IF_MODIFIED_SINCE) { + let modified_since = modified_since.to_str().ok()?; + let client_date = httpdate::parse_http_date(modified_since).ok()?; + let server_date = UNIX_EPOCH + Duration::from_millis(version.timestamp); + client_date >= server_date + } else { + false + }; + + if cached { + Some( + Response::builder() + .status(StatusCode::NOT_MODIFIED) + .body(Body::empty()) + .unwrap(), + ) + } else { + None + } +} + +/// Handle HEAD request +pub async fn handle_head( + garage: Arc, + req: &Request, + bucket_id: Uuid, + key: &str, + part_number: Option, +) -> Result, Error> { + let object = garage + .object_table + .get(&bucket_id, &key.to_string()) + .await? + .ok_or(Error::NoSuchKey)?; + + let object_version = object + .versions() + .iter() + .rev() + .find(|v| v.is_data()) + .ok_or(Error::NoSuchKey)?; + + let version_data = match &object_version.state { + ObjectVersionState::Complete(c) => c, + _ => unreachable!(), + }; + + let version_meta = match version_data { + ObjectVersionData::Inline(meta, _) => meta, + ObjectVersionData::FirstBlock(meta, _) => meta, + _ => unreachable!(), + }; + + if let Some(cached) = try_answer_cached(object_version, version_meta, req) { + return Ok(cached); + } + + if let Some(pn) = part_number { + match version_data { + ObjectVersionData::Inline(_, bytes) => { + if pn != 1 { + return Err(Error::InvalidPart); + } + Ok(object_headers(object_version, version_meta) + .header(CONTENT_LENGTH, format!("{}", bytes.len())) + .header( + CONTENT_RANGE, + format!("bytes 0-{}/{}", bytes.len() - 1, bytes.len()), + ) + .header(X_AMZ_MP_PARTS_COUNT, "1") + .status(StatusCode::PARTIAL_CONTENT) + .body(Body::empty())?) + } + ObjectVersionData::FirstBlock(_, _) => { + let version = garage + .version_table + .get(&object_version.uuid, &EmptyKey) + .await? + .ok_or(Error::NoSuchKey)?; + + let (part_offset, part_end) = + calculate_part_bounds(&version, pn).ok_or(Error::InvalidPart)?; + let n_parts = version.parts_etags.items().len(); + + Ok(object_headers(object_version, version_meta) + .header(CONTENT_LENGTH, format!("{}", part_end - part_offset)) + .header( + CONTENT_RANGE, + format!( + "bytes {}-{}/{}", + part_offset, + part_end - 1, + version_meta.size + ), + ) + .header(X_AMZ_MP_PARTS_COUNT, format!("{}", n_parts)) + .status(StatusCode::PARTIAL_CONTENT) + .body(Body::empty())?) + } + _ => unreachable!(), + } + } else { + Ok(object_headers(object_version, version_meta) + .header(CONTENT_LENGTH, format!("{}", version_meta.size)) + .status(StatusCode::OK) + .body(Body::empty())?) + } +} + +/// Handle GET request +pub async fn handle_get( + garage: Arc, + req: &Request, + bucket_id: Uuid, + key: &str, + part_number: Option, +) -> Result, Error> { + let object = garage + .object_table + .get(&bucket_id, &key.to_string()) + .await? + .ok_or(Error::NoSuchKey)?; + + let last_v = object + .versions() + .iter() + .rev() + .find(|v| v.is_complete()) + .ok_or(Error::NoSuchKey)?; + + let last_v_data = match &last_v.state { + ObjectVersionState::Complete(x) => x, + _ => unreachable!(), + }; + let last_v_meta = match last_v_data { + ObjectVersionData::DeleteMarker => return Err(Error::NoSuchKey), + ObjectVersionData::Inline(meta, _) => meta, + ObjectVersionData::FirstBlock(meta, _) => meta, + }; + + if let Some(cached) = try_answer_cached(last_v, last_v_meta, req) { + return Ok(cached); + } + + match (part_number, parse_range_header(req, last_v_meta.size)?) { + (Some(_), Some(_)) => { + return Err(Error::BadRequest( + "Cannot specify both partNumber and Range header".into(), + )); + } + (Some(pn), None) => { + return handle_get_part(garage, last_v, last_v_data, last_v_meta, pn).await; + } + (None, Some(range)) => { + return handle_get_range( + garage, + last_v, + last_v_data, + last_v_meta, + range.start, + range.start + range.length, + ) + .await; + } + (None, None) => (), + } + + let resp_builder = object_headers(last_v, last_v_meta) + .header(CONTENT_LENGTH, format!("{}", last_v_meta.size)) + .status(StatusCode::OK); + + match &last_v_data { + ObjectVersionData::DeleteMarker => unreachable!(), + ObjectVersionData::Inline(_, bytes) => { + let body: Body = Body::from(bytes.to_vec()); + Ok(resp_builder.body(body)?) + } + ObjectVersionData::FirstBlock(_, first_block_hash) => { + let read_first_block = garage.block_manager.rpc_get_block(first_block_hash); + let get_next_blocks = garage.version_table.get(&last_v.uuid, &EmptyKey); + + let (first_block, version) = futures::try_join!(read_first_block, get_next_blocks)?; + let version = version.ok_or(Error::NoSuchKey)?; + + let mut blocks = version + .blocks + .items() + .iter() + .map(|(_, vb)| (vb.hash, None)) + .collect::>(); + blocks[0].1 = Some(first_block); + + let body_stream = futures::stream::iter(blocks) + .map(move |(hash, data_opt)| { + let garage = garage.clone(); + async move { + if let Some(data) = data_opt { + Ok(Bytes::from(data)) + } else { + garage + .block_manager + .rpc_get_block(&hash) + .await + .map(Bytes::from) + } + } + }) + .buffered(2); + + let body = hyper::body::Body::wrap_stream(body_stream); + Ok(resp_builder.body(body)?) + } + } +} + +async fn handle_get_range( + garage: Arc, + version: &ObjectVersion, + version_data: &ObjectVersionData, + version_meta: &ObjectVersionMeta, + begin: u64, + end: u64, +) -> Result, Error> { + let resp_builder = object_headers(version, version_meta) + .header(CONTENT_LENGTH, format!("{}", end - begin)) + .header( + CONTENT_RANGE, + format!("bytes {}-{}/{}", begin, end - 1, version_meta.size), + ) + .status(StatusCode::PARTIAL_CONTENT); + + match &version_data { + ObjectVersionData::DeleteMarker => unreachable!(), + ObjectVersionData::Inline(_meta, bytes) => { + if end as usize <= bytes.len() { + let body: Body = Body::from(bytes[begin as usize..end as usize].to_vec()); + Ok(resp_builder.body(body)?) + } else { + None.ok_or_internal_error( + "Requested range not present in inline bytes when it should have been", + ) + } + } + ObjectVersionData::FirstBlock(_meta, _first_block_hash) => { + let version = garage + .version_table + .get(&version.uuid, &EmptyKey) + .await? + .ok_or(Error::NoSuchKey)?; + + let body = body_from_blocks_range(garage, version.blocks.items(), begin, end); + Ok(resp_builder.body(body)?) + } + } +} + +async fn handle_get_part( + garage: Arc, + object_version: &ObjectVersion, + version_data: &ObjectVersionData, + version_meta: &ObjectVersionMeta, + part_number: u64, +) -> Result, Error> { + let resp_builder = + object_headers(object_version, version_meta).status(StatusCode::PARTIAL_CONTENT); + + match version_data { + ObjectVersionData::Inline(_, bytes) => { + if part_number != 1 { + return Err(Error::InvalidPart); + } + Ok(resp_builder + .header(CONTENT_LENGTH, format!("{}", bytes.len())) + .header( + CONTENT_RANGE, + format!("bytes {}-{}/{}", 0, bytes.len() - 1, bytes.len()), + ) + .header(X_AMZ_MP_PARTS_COUNT, "1") + .body(Body::from(bytes.to_vec()))?) + } + ObjectVersionData::FirstBlock(_, _) => { + let version = garage + .version_table + .get(&object_version.uuid, &EmptyKey) + .await? + .ok_or(Error::NoSuchKey)?; + + let (begin, end) = + calculate_part_bounds(&version, part_number).ok_or(Error::InvalidPart)?; + let n_parts = version.parts_etags.items().len(); + + let body = body_from_blocks_range(garage, version.blocks.items(), begin, end); + + Ok(resp_builder + .header(CONTENT_LENGTH, format!("{}", end - begin)) + .header( + CONTENT_RANGE, + format!("bytes {}-{}/{}", begin, end - 1, version_meta.size), + ) + .header(X_AMZ_MP_PARTS_COUNT, format!("{}", n_parts)) + .body(body)?) + } + _ => unreachable!(), + } +} + +fn parse_range_header( + req: &Request, + total_size: u64, +) -> Result, Error> { + let range = match req.headers().get(RANGE) { + Some(range) => { + let range_str = range.to_str()?; + let mut ranges = + http_range::HttpRange::parse(range_str, total_size).map_err(|e| (e, total_size))?; + if ranges.len() > 1 { + // garage does not support multi-range requests yet, so we respond with the entire + // object when multiple ranges are requested + None + } else { + ranges.pop() + } + } + None => None, + }; + Ok(range) +} + +fn calculate_part_bounds(v: &Version, part_number: u64) -> Option<(u64, u64)> { + let mut offset = 0; + for (i, (bk, bv)) in v.blocks.items().iter().enumerate() { + if bk.part_number == part_number { + let size: u64 = v.blocks.items()[i..] + .iter() + .take_while(|(k, _)| k.part_number == part_number) + .map(|(_, v)| v.size) + .sum(); + return Some((offset, offset + size)); + } + offset += bv.size; + } + None +} + +fn body_from_blocks_range( + garage: Arc, + all_blocks: &[(VersionBlockKey, VersionBlock)], + begin: u64, + end: u64, +) -> Body { + // We will store here the list of blocks that have an intersection with the requested + // range, as well as their "true offset", which is their actual offset in the complete + // file (whereas block.offset designates the offset of the block WITHIN THE PART + // block.part_number, which is not the same in the case of a multipart upload) + let mut blocks: Vec<(VersionBlock, u64)> = Vec::with_capacity(std::cmp::min( + all_blocks.len(), + 4 + ((end - begin) / std::cmp::max(all_blocks[0].1.size as u64, 1024)) as usize, + )); + let mut true_offset = 0; + for (_, b) in all_blocks.iter() { + if true_offset >= end { + break; + } + // Keep only blocks that have an intersection with the requested range + if true_offset < end && true_offset + b.size > begin { + blocks.push((*b, true_offset)); + } + true_offset += b.size; + } + + let body_stream = futures::stream::iter(blocks) + .map(move |(block, true_offset)| { + let garage = garage.clone(); + async move { + let data = garage.block_manager.rpc_get_block(&block.hash).await?; + let data = Bytes::from(data); + let start_in_block = if true_offset > begin { + 0 + } else { + begin - true_offset + }; + let end_in_block = if true_offset + block.size < end { + block.size + } else { + end - true_offset + }; + Result::::Ok( + data.slice(start_in_block as usize..end_in_block as usize), + ) + } + }) + .buffered(2); + + hyper::body::Body::wrap_stream(body_stream) +} -- cgit v1.2.3 From 382e74c798263d042b1c6ca3788c866a8c69c4f4 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 24 May 2022 12:16:39 +0200 Subject: First version of admin API (#298) **Spec:** - [x] Start writing - [x] Specify all layout endpoints - [x] Specify all endpoints for operations on keys - [x] Specify all endpoints for operations on key/bucket permissions - [x] Specify all endpoints for operations on buckets - [x] Specify all endpoints for operations on bucket aliases View rendered spec at **Code:** - [x] Refactor code for admin api to use common api code that was created for K2V **General endpoints:** - [x] Metrics - [x] GetClusterStatus - [x] ConnectClusterNodes - [x] GetClusterLayout - [x] UpdateClusterLayout - [x] ApplyClusterLayout - [x] RevertClusterLayout **Key-related endpoints:** - [x] ListKeys - [x] CreateKey - [x] ImportKey - [x] GetKeyInfo - [x] UpdateKey - [x] DeleteKey **Bucket-related endpoints:** - [x] ListBuckets - [x] CreateBucket - [x] GetBucketInfo - [x] DeleteBucket - [x] PutBucketWebsite - [x] DeleteBucketWebsite **Operations on key/bucket permissions:** - [x] BucketAllowKey - [x] BucketDenyKey **Operations on bucket aliases:** - [x] GlobalAliasBucket - [x] GlobalUnaliasBucket - [x] LocalAliasBucket - [x] LocalUnaliasBucket **And also:** - [x] Separate error type for the admin API (this PR includes a quite big refactoring of error handling) - [x] Add management of website access - [ ] Check that nothing is missing wrt what can be done using the CLI - [ ] Improve formatting of the spec - [x] Make sure everyone is cool with the API design Fix #231 Fix #295 Co-authored-by: Alex Auvolat Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/298 Co-authored-by: Alex Co-committed-by: Alex --- src/api/s3/get.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'src/api/s3/get.rs') diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index 3edf22a6..7fa1a177 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -17,7 +17,7 @@ use garage_model::garage::Garage; use garage_model::s3::object_table::*; use garage_model::s3::version_table::*; -use crate::error::*; +use crate::s3::error::*; const X_AMZ_MP_PARTS_COUNT: &str = "x-amz-mp-parts-count"; @@ -210,8 +210,8 @@ pub async fn handle_get( match (part_number, parse_range_header(req, last_v_meta.size)?) { (Some(_), Some(_)) => { - return Err(Error::BadRequest( - "Cannot specify both partNumber and Range header".into(), + return Err(Error::bad_request( + "Cannot specify both partNumber and Range header", )); } (Some(pn), None) => { @@ -302,9 +302,9 @@ async fn handle_get_range( let body: Body = Body::from(bytes[begin as usize..end as usize].to_vec()); Ok(resp_builder.body(body)?) } else { - None.ok_or_internal_error( + Err(Error::internal_error( "Requested range not present in inline bytes when it should have been", - ) + )) } } ObjectVersionData::FirstBlock(_meta, _first_block_hash) => { -- cgit v1.2.3 From 605a630333c8ee60c55fe011a375c01277bba173 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 22 Jul 2022 18:20:27 +0200 Subject: Use streaming in block manager --- src/api/s3/get.rs | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'src/api/s3/get.rs') diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index 7fa1a177..7d118f89 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -242,10 +242,13 @@ pub async fn handle_get( Ok(resp_builder.body(body)?) } ObjectVersionData::FirstBlock(_, first_block_hash) => { - let read_first_block = garage.block_manager.rpc_get_block(first_block_hash); + let read_first_block = garage + .block_manager + .rpc_get_block_streaming(first_block_hash); let get_next_blocks = garage.version_table.get(&last_v.uuid, &EmptyKey); - let (first_block, version) = futures::try_join!(read_first_block, get_next_blocks)?; + let (first_block_stream, version) = + futures::try_join!(read_first_block, get_next_blocks)?; let version = version.ok_or(Error::NoSuchKey)?; let mut blocks = version @@ -254,24 +257,32 @@ pub async fn handle_get( .iter() .map(|(_, vb)| (vb.hash, None)) .collect::>(); - blocks[0].1 = Some(first_block); + blocks[0].1 = Some(first_block_stream); let body_stream = futures::stream::iter(blocks) - .map(move |(hash, data_opt)| { + .map(move |(hash, stream_opt)| { let garage = garage.clone(); async move { - if let Some(data) = data_opt { - Ok(Bytes::from(data)) + if let Some(stream) = stream_opt { + stream } else { garage .block_manager - .rpc_get_block(&hash) + .rpc_get_block_streaming(&hash) .await - .map(Bytes::from) + .unwrap_or_else(|_| { + Box::pin(futures::stream::once(async move { + Err(std::io::Error::new( + std::io::ErrorKind::Other, + "Could not get next block", + )) + })) + }) } } }) - .buffered(2); + .buffered(3) + .flatten(); let body = hyper::body::Body::wrap_stream(body_stream); Ok(resp_builder.body(body)?) -- cgit v1.2.3 From 68087ee13dc22dbaeb0c1fa8dcb4bdbaa82098a6 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 22 Jul 2022 19:06:56 +0200 Subject: Fix clippy --- src/api/s3/get.rs | 1 - 1 file changed, 1 deletion(-) (limited to 'src/api/s3/get.rs') diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index 7d118f89..c7621ade 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -450,7 +450,6 @@ fn body_from_blocks_range( let garage = garage.clone(); async move { let data = garage.block_manager.rpc_get_block(&block.hash).await?; - let data = Bytes::from(data); let start_in_block = if true_offset > begin { 0 } else { -- cgit v1.2.3 From bc977f9a7a7a5bd87ccf5fe96d64b397591f8ba0 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 1 Sep 2022 12:58:20 +0200 Subject: Update to Netapp with OrderTag support and exploit OrderTags --- src/api/s3/get.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'src/api/s3/get.rs') diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index c7621ade..dfc284fe 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -10,6 +10,7 @@ use http::header::{ use hyper::body::Bytes; use hyper::{Body, Request, Response, StatusCode}; +use garage_rpc::rpc_helper::OrderTag; use garage_table::EmptyKey; use garage_util::data::*; @@ -242,9 +243,11 @@ pub async fn handle_get( Ok(resp_builder.body(body)?) } ObjectVersionData::FirstBlock(_, first_block_hash) => { + let order_stream = OrderTag::stream(); + let read_first_block = garage .block_manager - .rpc_get_block_streaming(first_block_hash); + .rpc_get_block_streaming(first_block_hash, Some(order_stream.order(0))); let get_next_blocks = garage.version_table.get(&last_v.uuid, &EmptyKey); let (first_block_stream, version) = @@ -260,7 +263,8 @@ pub async fn handle_get( blocks[0].1 = Some(first_block_stream); let body_stream = futures::stream::iter(blocks) - .map(move |(hash, stream_opt)| { + .enumerate() + .map(move |(i, (hash, stream_opt))| { let garage = garage.clone(); async move { if let Some(stream) = stream_opt { @@ -268,7 +272,7 @@ pub async fn handle_get( } else { garage .block_manager - .rpc_get_block_streaming(&hash) + .rpc_get_block_streaming(&hash, Some(order_stream.order(i as u64))) .await .unwrap_or_else(|_| { Box::pin(futures::stream::once(async move { @@ -281,7 +285,7 @@ pub async fn handle_get( } } }) - .buffered(3) + .buffered(2) .flatten(); let body = hyper::body::Body::wrap_stream(body_stream); @@ -445,11 +449,16 @@ fn body_from_blocks_range( true_offset += b.size; } + let order_stream = OrderTag::stream(); let body_stream = futures::stream::iter(blocks) - .map(move |(block, true_offset)| { + .enumerate() + .map(move |(i, (block, true_offset))| { let garage = garage.clone(); async move { - let data = garage.block_manager.rpc_get_block(&block.hash).await?; + let data = garage + .block_manager + .rpc_get_block(&block.hash, Some(order_stream.order(i as u64))) + .await?; let start_in_block = if true_offset > begin { 0 } else { -- cgit v1.2.3 From 907054775dc71a10a92ab96112889db9113130ab Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 6 Sep 2022 22:25:23 +0200 Subject: Faster copy, better get error message --- src/api/s3/get.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/api/s3/get.rs') diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index dfc284fe..dd95f6e7 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -274,11 +274,11 @@ pub async fn handle_get( .block_manager .rpc_get_block_streaming(&hash, Some(order_stream.order(i as u64))) .await - .unwrap_or_else(|_| { + .unwrap_or_else(|e| { Box::pin(futures::stream::once(async move { Err(std::io::Error::new( std::io::ErrorKind::Other, - "Could not get next block", + format!("Could not get block {}: {}", i, e), )) })) }) -- cgit v1.2.3 From ff30891999b5be5421b80b89da1037e943179d2d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 13 Sep 2022 15:13:07 +0200 Subject: Use streaming block API for get with Range requests --- src/api/s3/get.rs | 93 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 33 deletions(-) (limited to 'src/api/s3/get.rs') diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index dd95f6e7..ae4c287d 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -7,10 +7,9 @@ use http::header::{ ACCEPT_RANGES, CONTENT_LENGTH, CONTENT_RANGE, CONTENT_TYPE, ETAG, IF_MODIFIED_SINCE, IF_NONE_MATCH, LAST_MODIFIED, RANGE, }; -use hyper::body::Bytes; use hyper::{Body, Request, Response, StatusCode}; -use garage_rpc::rpc_helper::OrderTag; +use garage_rpc::rpc_helper::{netapp::stream::ByteStream, OrderTag}; use garage_table::EmptyKey; use garage_util::data::*; @@ -274,14 +273,7 @@ pub async fn handle_get( .block_manager .rpc_get_block_streaming(&hash, Some(order_stream.order(i as u64))) .await - .unwrap_or_else(|e| { - Box::pin(futures::stream::once(async move { - Err(std::io::Error::new( - std::io::ErrorKind::Other, - format!("Could not get block {}: {}", i, e), - )) - })) - }) + .unwrap_or_else(|e| error_stream(i, e)) } } }) @@ -437,44 +429,79 @@ fn body_from_blocks_range( all_blocks.len(), 4 + ((end - begin) / std::cmp::max(all_blocks[0].1.size as u64, 1024)) as usize, )); - let mut true_offset = 0; + let mut block_offset: u64 = 0; for (_, b) in all_blocks.iter() { - if true_offset >= end { + if block_offset >= end { break; } // Keep only blocks that have an intersection with the requested range - if true_offset < end && true_offset + b.size > begin { - blocks.push((*b, true_offset)); + if block_offset < end && block_offset + b.size > begin { + blocks.push((*b, block_offset)); } - true_offset += b.size; + block_offset += b.size as u64; } let order_stream = OrderTag::stream(); let body_stream = futures::stream::iter(blocks) .enumerate() - .map(move |(i, (block, true_offset))| { + .map(move |(i, (block, block_offset))| { let garage = garage.clone(); async move { - let data = garage + garage .block_manager - .rpc_get_block(&block.hash, Some(order_stream.order(i as u64))) - .await?; - let start_in_block = if true_offset > begin { - 0 - } else { - begin - true_offset - }; - let end_in_block = if true_offset + block.size < end { - block.size - } else { - end - true_offset - }; - Result::::Ok( - data.slice(start_in_block as usize..end_in_block as usize), - ) + .rpc_get_block_streaming(&block.hash, Some(order_stream.order(i as u64))) + .await + .unwrap_or_else(|e| error_stream(i, e)) + .scan(block_offset, move |chunk_offset, chunk| { + let r = match chunk { + Ok(chunk_bytes) => { + let chunk_len = chunk_bytes.len() as u64; + let r = if *chunk_offset >= end { + // The current chunk is after the part we want to read. + // Returning None here will stop the scan, the rest of the + // stream will be ignored + None + } else if *chunk_offset + chunk_len <= begin { + // The current chunk is before the part we want to read. + // We return a None that will be removed by the filter_map + // below. + Some(None) + } else { + // The chunk has an intersection with the requested range + let start_in_chunk = if *chunk_offset > begin { + 0 + } else { + begin - *chunk_offset + }; + let end_in_chunk = if *chunk_offset + chunk_len < end { + chunk_len + } else { + end - *chunk_offset + }; + Some(Some(Ok(chunk_bytes + .slice(start_in_chunk as usize..end_in_chunk as usize)))) + }; + *chunk_offset += chunk_bytes.len() as u64; + r + } + Err(e) => Some(Some(Err(e))), + }; + futures::future::ready(r) + }) + .filter_map(futures::future::ready) } }) - .buffered(2); + .buffered(2) + .flatten(); hyper::body::Body::wrap_stream(body_stream) } + +fn error_stream(i: usize, e: garage_util::error::Error) -> ByteStream { + Box::pin(futures::stream::once(async move { + Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!("Could not get block {}: {}", i, e), + )) + })) +} -- cgit v1.2.3 From 5d4b6f2173344d59d59c7f6336c5d21799f8b37d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 19 Sep 2022 12:16:38 +0200 Subject: Faster GetObject workflow for getting entire objects --- src/api/s3/get.rs | 86 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 35 deletions(-) (limited to 'src/api/s3/get.rs') diff --git a/src/api/s3/get.rs b/src/api/s3/get.rs index ae4c287d..2a99551a 100644 --- a/src/api/s3/get.rs +++ b/src/api/s3/get.rs @@ -2,16 +2,19 @@ use std::sync::Arc; use std::time::{Duration, UNIX_EPOCH}; -use futures::stream::*; +use futures::future; +use futures::stream::{self, StreamExt}; use http::header::{ ACCEPT_RANGES, CONTENT_LENGTH, CONTENT_RANGE, CONTENT_TYPE, ETAG, IF_MODIFIED_SINCE, IF_NONE_MATCH, LAST_MODIFIED, RANGE, }; use hyper::{Body, Request, Response, StatusCode}; +use tokio::sync::mpsc; use garage_rpc::rpc_helper::{netapp::stream::ByteStream, OrderTag}; use garage_table::EmptyKey; use garage_util::data::*; +use garage_util::error::OkOrMessage; use garage_model::garage::Garage; use garage_model::s3::object_table::*; @@ -242,43 +245,56 @@ pub async fn handle_get( Ok(resp_builder.body(body)?) } ObjectVersionData::FirstBlock(_, first_block_hash) => { - let order_stream = OrderTag::stream(); - - let read_first_block = garage - .block_manager - .rpc_get_block_streaming(first_block_hash, Some(order_stream.order(0))); - let get_next_blocks = garage.version_table.get(&last_v.uuid, &EmptyKey); + let (tx, rx) = mpsc::channel(2); - let (first_block_stream, version) = - futures::try_join!(read_first_block, get_next_blocks)?; - let version = version.ok_or(Error::NoSuchKey)?; + let order_stream = OrderTag::stream(); + let first_block_hash = *first_block_hash; + let version_uuid = last_v.uuid; + + tokio::spawn(async move { + match async { + let garage2 = garage.clone(); + let version_fut = tokio::spawn(async move { + garage2.version_table.get(&version_uuid, &EmptyKey).await + }); + + let stream_block_0 = garage + .block_manager + .rpc_get_block_streaming(&first_block_hash, Some(order_stream.order(0))) + .await?; + tx.send(stream_block_0) + .await + .ok_or_message("channel closed")?; + + let version = version_fut.await.unwrap()?.ok_or(Error::NoSuchKey)?; + for (i, (_, vb)) in version.blocks.items().iter().enumerate().skip(1) { + let stream_block_i = garage + .block_manager + .rpc_get_block_streaming(&vb.hash, Some(order_stream.order(i as u64))) + .await?; + tx.send(stream_block_i) + .await + .ok_or_message("channel closed")?; + } - let mut blocks = version - .blocks - .items() - .iter() - .map(|(_, vb)| (vb.hash, None)) - .collect::>(); - blocks[0].1 = Some(first_block_stream); - - let body_stream = futures::stream::iter(blocks) - .enumerate() - .map(move |(i, (hash, stream_opt))| { - let garage = garage.clone(); - async move { - if let Some(stream) = stream_opt { - stream - } else { - garage - .block_manager - .rpc_get_block_streaming(&hash, Some(order_stream.order(i as u64))) - .await - .unwrap_or_else(|e| error_stream(i, e)) - } + Ok::<(), Error>(()) + } + .await + { + Ok(()) => (), + Err(e) => { + let err = std::io::Error::new( + std::io::ErrorKind::Other, + format!("Error while getting object data: {}", e), + ); + let _ = tx + .send(Box::pin(stream::once(future::ready(Err(err))))) + .await; } - }) - .buffered(2) - .flatten(); + } + }); + + let body_stream = tokio_stream::wrappers::ReceiverStream::new(rx).flatten(); let body = hyper::body::Body::wrap_stream(body_stream); Ok(resp_builder.body(body)?) -- cgit v1.2.3