diff options
Diffstat (limited to 'src/rpc/ring.rs')
-rw-r--r-- | src/rpc/ring.rs | 36 |
1 files changed, 30 insertions, 6 deletions
diff --git a/src/rpc/ring.rs b/src/rpc/ring.rs index 2e997523..bffd7f1f 100644 --- a/src/rpc/ring.rs +++ b/src/rpc/ring.rs @@ -1,3 +1,5 @@ +//! Module containing types related to computing nodes which should receive a copy of data blocks +//! and metadata use std::collections::{HashMap, HashSet}; use std::convert::TryInto; @@ -8,23 +10,30 @@ use garage_util::data::*; // A partition number is encoded on 16 bits, // i.e. we have up to 2**16 partitions. // (in practice we have exactly 2**PARTITION_BITS partitions) +/// A partition id, stored on 16 bits pub type Partition = u16; // TODO: make this constant parametrizable in the config file // For deployments with many nodes it might make sense to bump // it up to 10. // Maximum value : 16 +/// How many bits from the hash are used to make partitions. Higher numbers means more fairness in +/// presence of numerous nodes, but exponentially bigger ring. Max 16 pub const PARTITION_BITS: usize = 8; const PARTITION_MASK_U16: u16 = ((1 << PARTITION_BITS) - 1) << (16 - PARTITION_BITS); // TODO: make this constant paraetrizable in the config file // (most deployments use a replication factor of 3, so...) +/// The maximum number of time an object might get replicated pub const MAX_REPLICATION: usize = 3; +/// The user-defined configuration of the cluster's nodes #[derive(Clone, Debug, Serialize, Deserialize)] pub struct NetworkConfig { + /// Map of each node's id to it's configuration pub members: HashMap<UUID, NetworkConfigEntry>, + /// Version of this config pub version: u64, } @@ -37,26 +46,40 @@ impl NetworkConfig { } } +/// The overall configuration of one (possibly remote) node #[derive(Clone, Debug, Serialize, Deserialize)] pub struct NetworkConfigEntry { + /// Datacenter at which this entry belong. This infromation might be used to perform a better + /// geodistribution pub datacenter: String, + /// The (relative) capacity of the node pub capacity: u32, + /// A tag to recognize the entry, not used for other things than display pub tag: String, } +/// A ring distributing fairly objects to nodes #[derive(Clone)] pub struct Ring { + /// The network configuration used to generate this ring pub config: NetworkConfig, + /// The list of entries in the ring pub ring: Vec<RingEntry>, } +/// An entry in the ring #[derive(Clone, Debug)] pub struct RingEntry { + /// The prefix of the Hash of object which should use this entry pub location: Hash, + /// The nodes in which a matching object should get stored pub nodes: [UUID; MAX_REPLICATION], } impl Ring { + // TODO this function MUST be refactored, it's 100 lines long, with a 50 lines loop, going up to 6 + // levels of imbrication. It is basically impossible to test, maintain, or understand for an + // outsider. pub(crate) fn new(config: NetworkConfig) -> Self { // Create a vector of partition indices (0 to 2**PARTITION_BITS-1) let partitions_idx = (0usize..(1usize << PARTITION_BITS)).collect::<Vec<_>>(); @@ -166,20 +189,16 @@ impl Ring { }) .collect::<Vec<_>>(); - // eprintln!("RING: --"); - // for e in ring.iter() { - // eprintln!("{:?}", e); - // } - // eprintln!("END --"); - Self { config, ring } } + /// Get the partition in which data would fall on pub fn partition_of(&self, from: &Hash) -> Partition { let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap()); top >> (16 - PARTITION_BITS) } + /// Get the list of partitions and the first hash of a partition key that would fall in it pub fn partitions(&self) -> Vec<(Partition, Hash)> { let mut ret = vec![]; @@ -193,6 +212,8 @@ impl Ring { ret } + // TODO rename this function as it no longer walk the ring + /// Walk the ring to find the n servers in which data should be replicated pub fn walk_ring(&self, from: &Hash, n: usize) -> Vec<UUID> { if self.ring.len() != 1 << PARTITION_BITS { warn!("Ring not yet ready, read/writes will be lost!"); @@ -201,12 +222,15 @@ impl Ring { let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap()); let partition_idx = (top >> (16 - PARTITION_BITS)) as usize; + // TODO why computing two time in the same way and asserting? assert_eq!(partition_idx, self.partition_of(from) as usize); let partition = &self.ring[partition_idx]; let partition_top = u16::from_be_bytes(partition.location.as_slice()[0..2].try_into().unwrap()); + // TODO is this an assertion on the validity of PARTITION_MASK_U16? If so, it should + // probably be a test more than a runtime assertion assert_eq!(partition_top & PARTITION_MASK_U16, top & PARTITION_MASK_U16); assert!(n <= partition.nodes.len()); |