From 94f3d287742ff90f179f528421c690b00b71a912 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 11 Mar 2021 16:54:15 +0100 Subject: WIP big refactoring --- src/table/replication/fullcopy.rs | 59 +++++++++++++++++++++++++++++++++++++ src/table/replication/mod.rs | 6 ++++ src/table/replication/parameters.rs | 22 ++++++++++++++ src/table/replication/sharded.rs | 54 +++++++++++++++++++++++++++++++++ 4 files changed, 141 insertions(+) create mode 100644 src/table/replication/fullcopy.rs create mode 100644 src/table/replication/mod.rs create mode 100644 src/table/replication/parameters.rs create mode 100644 src/table/replication/sharded.rs (limited to 'src/table/replication') diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs new file mode 100644 index 00000000..a62a6c3c --- /dev/null +++ b/src/table/replication/fullcopy.rs @@ -0,0 +1,59 @@ +use std::sync::Arc; + +use garage_rpc::membership::System; +use garage_rpc::ring::Ring; +use garage_util::data::*; + +use crate::replication::*; + +#[derive(Clone)] +pub struct TableFullReplication { + pub max_faults: usize, +} + +#[derive(Clone)] +struct Neighbors { + ring: Arc, + neighbors: Vec, +} + +impl TableFullReplication { + pub fn new(max_faults: usize) -> Self { + TableFullReplication { max_faults } + } +} + +impl TableReplication for TableFullReplication { + // Full replication schema: all nodes store everything + // Writes are disseminated in an epidemic manner in the network + + // Advantage: do all reads locally, extremely fast + // Inconvenient: only suitable to reasonably small tables + + fn read_nodes(&self, _hash: &Hash, system: &System) -> Vec { + vec![system.id] + } + fn read_quorum(&self) -> usize { + 1 + } + + fn write_nodes(&self, hash: &Hash, system: &System) -> Vec { + self.replication_nodes(hash, system.ring.borrow().as_ref()) + } + fn write_quorum(&self, system: &System) -> usize { + system.ring.borrow().config.members.len() - self.max_faults + } + fn max_write_errors(&self) -> usize { + self.max_faults + } + + fn replication_nodes(&self, _hash: &Hash, ring: &Ring) -> Vec { + ring.config.members.keys().cloned().collect::>() + } + fn split_points(&self, _ring: &Ring) -> Vec { + let mut ret = vec![]; + ret.push([0u8; 32].into()); + ret.push([0xFFu8; 32].into()); + ret + } +} diff --git a/src/table/replication/mod.rs b/src/table/replication/mod.rs new file mode 100644 index 00000000..d43d7f19 --- /dev/null +++ b/src/table/replication/mod.rs @@ -0,0 +1,6 @@ +mod parameters; + +pub mod fullcopy; +pub mod sharded; + +pub use parameters::*; diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs new file mode 100644 index 00000000..4607b050 --- /dev/null +++ b/src/table/replication/parameters.rs @@ -0,0 +1,22 @@ +use garage_rpc::membership::System; +use garage_rpc::ring::Ring; + +use garage_util::data::*; + +pub trait TableReplication: Send + Sync { + // See examples in table_sharded.rs and table_fullcopy.rs + // To understand various replication methods + + // Which nodes to send reads from + fn read_nodes(&self, hash: &Hash, system: &System) -> Vec; + fn read_quorum(&self) -> usize; + + // Which nodes to send writes to + fn write_nodes(&self, hash: &Hash, system: &System) -> Vec; + fn write_quorum(&self, system: &System) -> usize; + fn max_write_errors(&self) -> usize; + + // Which are the nodes that do actually replicate the data + fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec; + fn split_points(&self, ring: &Ring) -> Vec; +} diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs new file mode 100644 index 00000000..42a742cd --- /dev/null +++ b/src/table/replication/sharded.rs @@ -0,0 +1,54 @@ +use garage_rpc::membership::System; +use garage_rpc::ring::Ring; +use garage_util::data::*; + +use crate::replication::*; + +#[derive(Clone)] +pub struct TableShardedReplication { + pub replication_factor: usize, + pub read_quorum: usize, + pub write_quorum: usize, +} + +impl TableReplication for TableShardedReplication { + // Sharded replication schema: + // - based on the ring of nodes, a certain set of neighbors + // store entries, given as a function of the position of the + // entry's hash in the ring + // - reads are done on all of the nodes that replicate the data + // - writes as well + + fn read_nodes(&self, hash: &Hash, system: &System) -> Vec { + let ring = system.ring.borrow().clone(); + ring.walk_ring(&hash, self.replication_factor) + } + fn read_quorum(&self) -> usize { + self.read_quorum + } + + fn write_nodes(&self, hash: &Hash, system: &System) -> Vec { + let ring = system.ring.borrow().clone(); + ring.walk_ring(&hash, self.replication_factor) + } + fn write_quorum(&self, _system: &System) -> usize { + self.write_quorum + } + fn max_write_errors(&self) -> usize { + self.replication_factor - self.write_quorum + } + + fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec { + ring.walk_ring(&hash, self.replication_factor) + } + fn split_points(&self, ring: &Ring) -> Vec { + let mut ret = vec![]; + + ret.push([0u8; 32].into()); + for entry in ring.ring.iter() { + ret.push(entry.location); + } + ret.push([0xFFu8; 32].into()); + ret + } +} -- cgit v1.2.3 From 046b649bcc3b147140fc2b0af0e071d3dd1b2c8d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 11 Mar 2021 18:28:03 +0100 Subject: (not well tested) use merkle tree for sync --- src/table/replication/fullcopy.rs | 1 - src/table/replication/sharded.rs | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'src/table/replication') diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index a62a6c3c..a20f20b7 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -53,7 +53,6 @@ impl TableReplication for TableFullReplication { fn split_points(&self, _ring: &Ring) -> Vec { let mut ret = vec![]; ret.push([0u8; 32].into()); - ret.push([0xFFu8; 32].into()); ret } } diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 42a742cd..886c7c08 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -44,11 +44,13 @@ impl TableReplication for TableShardedReplication { fn split_points(&self, ring: &Ring) -> Vec { let mut ret = vec![]; - ret.push([0u8; 32].into()); for entry in ring.ring.iter() { ret.push(entry.location); } - ret.push([0xFFu8; 32].into()); + if ret.len() > 0 { + assert_eq!(ret[0], [0u8; 32].into()); + } + ret } } -- cgit v1.2.3 From 3f7a496355bdbeeeee859912fa6fa7a95cb47f3b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 11 Mar 2021 19:06:27 +0100 Subject: More security: don't delete stuff too easily --- src/table/replication/fullcopy.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'src/table/replication') diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index a20f20b7..a5faece9 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -41,7 +41,12 @@ impl TableReplication for TableFullReplication { self.replication_nodes(hash, system.ring.borrow().as_ref()) } fn write_quorum(&self, system: &System) -> usize { - system.ring.borrow().config.members.len() - self.max_faults + let nmembers = system.ring.borrow().config.members.len(); + if nmembers > self.max_faults { + nmembers - self.max_faults + } else { + 1 + } } fn max_write_errors(&self) -> usize { self.max_faults -- cgit v1.2.3 From 1d9961e4118af0e26068e1d6c5c6c009a1292a88 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 16 Mar 2021 11:14:27 +0100 Subject: Simplify replication logic --- src/table/replication/fullcopy.rs | 33 ++++++++++++--------------------- src/table/replication/parameters.rs | 13 +++++++------ src/table/replication/sharded.rs | 20 ++++++++++++-------- 3 files changed, 31 insertions(+), 35 deletions(-) (limited to 'src/table/replication') diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index a5faece9..aea8c1f3 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -8,21 +8,10 @@ use crate::replication::*; #[derive(Clone)] pub struct TableFullReplication { + pub system: Arc, pub max_faults: usize, } -#[derive(Clone)] -struct Neighbors { - ring: Arc, - neighbors: Vec, -} - -impl TableFullReplication { - pub fn new(max_faults: usize) -> Self { - TableFullReplication { max_faults } - } -} - impl TableReplication for TableFullReplication { // Full replication schema: all nodes store everything // Writes are disseminated in an epidemic manner in the network @@ -30,18 +19,23 @@ impl TableReplication for TableFullReplication { // Advantage: do all reads locally, extremely fast // Inconvenient: only suitable to reasonably small tables - fn read_nodes(&self, _hash: &Hash, system: &System) -> Vec { - vec![system.id] + fn partition_of(&self, _hash: &Hash) -> u16 { + 0u16 + } + + fn read_nodes(&self, _hash: &Hash) -> Vec { + vec![self.system.id] } fn read_quorum(&self) -> usize { 1 } - fn write_nodes(&self, hash: &Hash, system: &System) -> Vec { - self.replication_nodes(hash, system.ring.borrow().as_ref()) + fn write_nodes(&self, _hash: &Hash) -> Vec { + let ring = self.system.ring.borrow(); + ring.config.members.keys().cloned().collect::>() } - fn write_quorum(&self, system: &System) -> usize { - let nmembers = system.ring.borrow().config.members.len(); + fn write_quorum(&self) -> usize { + let nmembers = self.system.ring.borrow().config.members.len(); if nmembers > self.max_faults { nmembers - self.max_faults } else { @@ -52,9 +46,6 @@ impl TableReplication for TableFullReplication { self.max_faults } - fn replication_nodes(&self, _hash: &Hash, ring: &Ring) -> Vec { - ring.config.members.keys().cloned().collect::>() - } fn split_points(&self, _ring: &Ring) -> Vec { let mut ret = vec![]; ret.push([0u8; 32].into()); diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index 4607b050..ace82bd9 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -1,4 +1,3 @@ -use garage_rpc::membership::System; use garage_rpc::ring::Ring; use garage_util::data::*; @@ -7,16 +6,18 @@ pub trait TableReplication: Send + Sync { // See examples in table_sharded.rs and table_fullcopy.rs // To understand various replication methods + // Partition number of data item (for Merkle tree) + fn partition_of(&self, hash: &Hash) -> u16; + // Which nodes to send reads from - fn read_nodes(&self, hash: &Hash, system: &System) -> Vec; + fn read_nodes(&self, hash: &Hash) -> Vec; fn read_quorum(&self) -> usize; // Which nodes to send writes to - fn write_nodes(&self, hash: &Hash, system: &System) -> Vec; - fn write_quorum(&self, system: &System) -> usize; + fn write_nodes(&self, hash: &Hash) -> Vec; + fn write_quorum(&self) -> usize; fn max_write_errors(&self) -> usize; - // Which are the nodes that do actually replicate the data - fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec; + // Get partition boundaries fn split_points(&self, ring: &Ring) -> Vec; } diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 886c7c08..966be31a 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use garage_rpc::membership::System; use garage_rpc::ring::Ring; use garage_util::data::*; @@ -6,6 +8,7 @@ use crate::replication::*; #[derive(Clone)] pub struct TableShardedReplication { + pub system: Arc, pub replication_factor: usize, pub read_quorum: usize, pub write_quorum: usize, @@ -19,28 +22,29 @@ impl TableReplication for TableShardedReplication { // - reads are done on all of the nodes that replicate the data // - writes as well - fn read_nodes(&self, hash: &Hash, system: &System) -> Vec { - let ring = system.ring.borrow().clone(); + fn partition_of(&self, hash: &Hash) -> u16 { + self.system.ring.borrow().partition_of(hash) + } + + fn read_nodes(&self, hash: &Hash) -> Vec { + let ring = self.system.ring.borrow().clone(); ring.walk_ring(&hash, self.replication_factor) } fn read_quorum(&self) -> usize { self.read_quorum } - fn write_nodes(&self, hash: &Hash, system: &System) -> Vec { - let ring = system.ring.borrow().clone(); + fn write_nodes(&self, hash: &Hash) -> Vec { + let ring = self.system.ring.borrow(); ring.walk_ring(&hash, self.replication_factor) } - fn write_quorum(&self, _system: &System) -> usize { + fn write_quorum(&self) -> usize { self.write_quorum } fn max_write_errors(&self) -> usize { self.replication_factor - self.write_quorum } - fn replication_nodes(&self, hash: &Hash, ring: &Ring) -> Vec { - ring.walk_ring(&hash, self.replication_factor) - } fn split_points(&self, ring: &Ring) -> Vec { let mut ret = vec![]; -- cgit v1.2.3 From 2a41b8238496dfeac5ee0f273445299cbd112ff6 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 16 Mar 2021 12:18:03 +0100 Subject: Simpler Merkle & sync --- src/table/replication/fullcopy.rs | 15 ++++++--------- src/table/replication/parameters.rs | 10 ++++------ src/table/replication/sharded.rs | 22 ++++++---------------- 3 files changed, 16 insertions(+), 31 deletions(-) (limited to 'src/table/replication') diff --git a/src/table/replication/fullcopy.rs b/src/table/replication/fullcopy.rs index aea8c1f3..bd658f63 100644 --- a/src/table/replication/fullcopy.rs +++ b/src/table/replication/fullcopy.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use garage_rpc::membership::System; -use garage_rpc::ring::Ring; +use garage_rpc::ring::*; use garage_util::data::*; use crate::replication::*; @@ -19,10 +19,6 @@ impl TableReplication for TableFullReplication { // Advantage: do all reads locally, extremely fast // Inconvenient: only suitable to reasonably small tables - fn partition_of(&self, _hash: &Hash) -> u16 { - 0u16 - } - fn read_nodes(&self, _hash: &Hash) -> Vec { vec![self.system.id] } @@ -46,9 +42,10 @@ impl TableReplication for TableFullReplication { self.max_faults } - fn split_points(&self, _ring: &Ring) -> Vec { - let mut ret = vec![]; - ret.push([0u8; 32].into()); - ret + fn partition_of(&self, _hash: &Hash) -> Partition { + 0u16 + } + fn partitions(&self) -> Vec<(Partition, Hash)> { + vec![(0u16, [0u8; 32].into())] } } diff --git a/src/table/replication/parameters.rs b/src/table/replication/parameters.rs index ace82bd9..e46bd172 100644 --- a/src/table/replication/parameters.rs +++ b/src/table/replication/parameters.rs @@ -1,4 +1,4 @@ -use garage_rpc::ring::Ring; +use garage_rpc::ring::*; use garage_util::data::*; @@ -6,9 +6,6 @@ pub trait TableReplication: Send + Sync { // See examples in table_sharded.rs and table_fullcopy.rs // To understand various replication methods - // Partition number of data item (for Merkle tree) - fn partition_of(&self, hash: &Hash) -> u16; - // Which nodes to send reads from fn read_nodes(&self, hash: &Hash) -> Vec; fn read_quorum(&self) -> usize; @@ -18,6 +15,7 @@ pub trait TableReplication: Send + Sync { fn write_quorum(&self) -> usize; fn max_write_errors(&self) -> usize; - // Get partition boundaries - fn split_points(&self, ring: &Ring) -> Vec; + // Accessing partitions, for Merkle tree & sync + fn partition_of(&self, hash: &Hash) -> Partition; + fn partitions(&self) -> Vec<(Partition, Hash)>; } diff --git a/src/table/replication/sharded.rs b/src/table/replication/sharded.rs index 966be31a..dce74b03 100644 --- a/src/table/replication/sharded.rs +++ b/src/table/replication/sharded.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use garage_rpc::membership::System; -use garage_rpc::ring::Ring; +use garage_rpc::ring::*; use garage_util::data::*; use crate::replication::*; @@ -22,10 +22,6 @@ impl TableReplication for TableShardedReplication { // - reads are done on all of the nodes that replicate the data // - writes as well - fn partition_of(&self, hash: &Hash) -> u16 { - self.system.ring.borrow().partition_of(hash) - } - fn read_nodes(&self, hash: &Hash) -> Vec { let ring = self.system.ring.borrow().clone(); ring.walk_ring(&hash, self.replication_factor) @@ -45,16 +41,10 @@ impl TableReplication for TableShardedReplication { self.replication_factor - self.write_quorum } - fn split_points(&self, ring: &Ring) -> Vec { - let mut ret = vec![]; - - for entry in ring.ring.iter() { - ret.push(entry.location); - } - if ret.len() > 0 { - assert_eq!(ret[0], [0u8; 32].into()); - } - - ret + fn partition_of(&self, hash: &Hash) -> Partition { + self.system.ring.borrow().partition_of(hash) + } + fn partitions(&self) -> Vec<(Partition, Hash)> { + self.system.ring.borrow().partitions() } } -- cgit v1.2.3