aboutsummaryrefslogtreecommitdiff
path: root/src/rpc/ring.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/rpc/ring.rs')
-rw-r--r--src/rpc/ring.rs36
1 files changed, 30 insertions, 6 deletions
diff --git a/src/rpc/ring.rs b/src/rpc/ring.rs
index 2e997523..bffd7f1f 100644
--- a/src/rpc/ring.rs
+++ b/src/rpc/ring.rs
@@ -1,3 +1,5 @@
+//! Module containing types related to computing nodes which should receive a copy of data blocks
+//! and metadata
use std::collections::{HashMap, HashSet};
use std::convert::TryInto;
@@ -8,23 +10,30 @@ use garage_util::data::*;
// A partition number is encoded on 16 bits,
// i.e. we have up to 2**16 partitions.
// (in practice we have exactly 2**PARTITION_BITS partitions)
+/// A partition id, stored on 16 bits
pub type Partition = u16;
// TODO: make this constant parametrizable in the config file
// For deployments with many nodes it might make sense to bump
// it up to 10.
// Maximum value : 16
+/// How many bits from the hash are used to make partitions. Higher numbers means more fairness in
+/// presence of numerous nodes, but exponentially bigger ring. Max 16
pub const PARTITION_BITS: usize = 8;
const PARTITION_MASK_U16: u16 = ((1 << PARTITION_BITS) - 1) << (16 - PARTITION_BITS);
// TODO: make this constant paraetrizable in the config file
// (most deployments use a replication factor of 3, so...)
+/// The maximum number of time an object might get replicated
pub const MAX_REPLICATION: usize = 3;
+/// The user-defined configuration of the cluster's nodes
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct NetworkConfig {
+ /// Map of each node's id to it's configuration
pub members: HashMap<UUID, NetworkConfigEntry>,
+ /// Version of this config
pub version: u64,
}
@@ -37,26 +46,40 @@ impl NetworkConfig {
}
}
+/// The overall configuration of one (possibly remote) node
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct NetworkConfigEntry {
+ /// Datacenter at which this entry belong. This infromation might be used to perform a better
+ /// geodistribution
pub datacenter: String,
+ /// The (relative) capacity of the node
pub capacity: u32,
+ /// A tag to recognize the entry, not used for other things than display
pub tag: String,
}
+/// A ring distributing fairly objects to nodes
#[derive(Clone)]
pub struct Ring {
+ /// The network configuration used to generate this ring
pub config: NetworkConfig,
+ /// The list of entries in the ring
pub ring: Vec<RingEntry>,
}
+/// An entry in the ring
#[derive(Clone, Debug)]
pub struct RingEntry {
+ /// The prefix of the Hash of object which should use this entry
pub location: Hash,
+ /// The nodes in which a matching object should get stored
pub nodes: [UUID; MAX_REPLICATION],
}
impl Ring {
+ // TODO this function MUST be refactored, it's 100 lines long, with a 50 lines loop, going up to 6
+ // levels of imbrication. It is basically impossible to test, maintain, or understand for an
+ // outsider.
pub(crate) fn new(config: NetworkConfig) -> Self {
// Create a vector of partition indices (0 to 2**PARTITION_BITS-1)
let partitions_idx = (0usize..(1usize << PARTITION_BITS)).collect::<Vec<_>>();
@@ -166,20 +189,16 @@ impl Ring {
})
.collect::<Vec<_>>();
- // eprintln!("RING: --");
- // for e in ring.iter() {
- // eprintln!("{:?}", e);
- // }
- // eprintln!("END --");
-
Self { config, ring }
}
+ /// Get the partition in which data would fall on
pub fn partition_of(&self, from: &Hash) -> Partition {
let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap());
top >> (16 - PARTITION_BITS)
}
+ /// Get the list of partitions and the first hash of a partition key that would fall in it
pub fn partitions(&self) -> Vec<(Partition, Hash)> {
let mut ret = vec![];
@@ -193,6 +212,8 @@ impl Ring {
ret
}
+ // TODO rename this function as it no longer walk the ring
+ /// Walk the ring to find the n servers in which data should be replicated
pub fn walk_ring(&self, from: &Hash, n: usize) -> Vec<UUID> {
if self.ring.len() != 1 << PARTITION_BITS {
warn!("Ring not yet ready, read/writes will be lost!");
@@ -201,12 +222,15 @@ impl Ring {
let top = u16::from_be_bytes(from.as_slice()[0..2].try_into().unwrap());
let partition_idx = (top >> (16 - PARTITION_BITS)) as usize;
+ // TODO why computing two time in the same way and asserting?
assert_eq!(partition_idx, self.partition_of(from) as usize);
let partition = &self.ring[partition_idx];
let partition_top =
u16::from_be_bytes(partition.location.as_slice()[0..2].try_into().unwrap());
+ // TODO is this an assertion on the validity of PARTITION_MASK_U16? If so, it should
+ // probably be a test more than a runtime assertion
assert_eq!(partition_top & PARTITION_MASK_U16, top & PARTITION_MASK_U16);
assert!(n <= partition.nodes.len());