From 231cb32955080557b05c7dde7d7adee664457e0e Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 22 Apr 2020 19:25:15 +0000 Subject: Do not delete block if just a single replication error. Write TODO stuff. --- TODO | 34 ++++++++++++++++++++++++++-------- src/block.rs | 20 ++++++++++---------- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/TODO b/TODO index 1b5f466d..a8ac6a49 100644 --- a/TODO +++ b/TODO @@ -1,18 +1,36 @@ -Replication ------------ - -Finish the thing that sends blocks to other nodes if needed before deleting them locally. +Testing +------- How are we going to test that our replication method works correctly? We will have to introduce lots of dummy data and then add/remove nodes many times. -Repair: -- re-propagate block ref table to rc + +Improvements +------------ + +Membership: keep IP addresses of failed nodes and try to re-ping them regularly + +RPC client/server: do not go through the serialization+HTTP+TLS+deserialization when making a request to ourselves. + +RPC requests: unify quorum + timeout in a "RequestStrategy" class, +and add to the request strategy whether or not the request should continue in the background +once `quorum` valid responses have been received + + +Attaining S3 compatibility +-------------------------- + +- table for access keys +- S3 request signature verification +- api_server following the S3 semantics for get/put/delete +- implement object listing +- possibly other necessary endpoints?
-To do list ----------- +Lower priority +-------------- - less a priority: hinted handoff +- repair: re-propagate block ref table to rc - FIXME in rpc_server when garage shuts down and futures can be interrupted (tokio::spawn should be replaced by a new function background::spawn_joinable) diff --git a/src/block.rs b/src/block.rs index 46abcf02..6c785f89 100644 --- a/src/block.rs +++ b/src/block.rs @@ -278,25 +278,25 @@ impl BlockManager { let who_needs = join_all(who_needs_fut).await; let mut need_nodes = vec![]; - let mut errors = 0; for (node, needed) in who.into_iter().zip(who_needs.iter()) { match needed { Ok(Message::NeedBlockReply(true)) => { need_nodes.push(node); } - Err(_) => { - errors += 1; + Err(e) => { + return Err(Error::Message(format!( + "Should delete block, but unable to confirm that all other nodes that need it have it: {}", + e + ))); + } + _ => { + return Err(Error::Message(format!( + "Unexpected response to NeedBlockQuery RPC" + ))); } - _ => (), } } - if errors > (garage.system.config.data_replication_factor - 1) / 2 { - return Err(Error::Message(format!( - "Should delete block, but not enough nodes confirm that they have it." - ))); - } - if need_nodes.len() > 0 { let put_block_message = self.read_block(hash).await?; let put_responses = self -- cgit v1.2.3