aboutsummaryrefslogtreecommitdiff
path: root/src/model/k2v
diff options
context:
space:
mode:
authorAlex <alex@adnab.me>2022-05-10 13:16:57 +0200
committerAlex <alex@adnab.me>2022-05-10 13:16:57 +0200
commit5768bf362262f78376af14517c4921941986192e (patch)
treeb4baf3051eade0f63649443278bb3a3f4c38ec25 /src/model/k2v
parentdef78c5e6f5da37a0d17b5652c525fbeccbc2e86 (diff)
downloadgarage-5768bf362262f78376af14517c4921941986192e.tar.gz
garage-5768bf362262f78376af14517c4921941986192e.zip
First implementation of K2V (#293)
**Specification:** View spec at [this URL](https://git.deuxfleurs.fr/Deuxfleurs/garage/src/branch/k2v/doc/drafts/k2v-spec.md) - [x] Specify the structure of K2V triples - [x] Specify the DVVS format used for causality detection - [x] Specify the K2V index (just a counter of number of values per partition key) - [x] Specify single-item endpoints: ReadItem, InsertItem, DeleteItem - [x] Specify index endpoint: ReadIndex - [x] Specify multi-item endpoints: InsertBatch, ReadBatch, DeleteBatch - [x] Move to JSON objects instead of tuples - [x] Specify endpoints for polling for updates on single values (PollItem) **Implementation:** - [x] Table for K2V items, causal contexts - [x] Indexing mechanism and table for K2V index - [x] Make API handlers a bit more generic - [x] K2V API endpoint - [x] K2V API router - [x] ReadItem - [x] InsertItem - [x] DeleteItem - [x] PollItem - [x] ReadIndex - [x] InsertBatch - [x] ReadBatch - [x] DeleteBatch **Testing:** - [x] Just a simple Python script that does some requests to check visually that things are going right (does not contain parsing of results or assertions on returned values) - [x] Actual tests: - [x] Adapt testing framework - [x] Simple test with InsertItem + ReadItem - [x] Test with several Insert/Read/DeleteItem + ReadIndex - [x] Test all combinations of return formats for ReadItem - [x] Test with ReadBatch, InsertBatch, DeleteBatch - [x] Test with PollItem - [x] Test error codes - [ ] Fix most broken stuff - [x] test PollItem broken randomly - [x] when invalid causality tokens are given, errors should be 4xx not 5xx **Improvements:** - [x] Descending range queries - [x] Specify - [x] Implement - [x] Add test - [x] Batch updates to index counter - [x] Put K2V behind `k2v` feature flag Co-authored-by: Alex Auvolat <alex@adnab.me> Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/293 Co-authored-by: Alex <alex@adnab.me> Co-committed-by: Alex <alex@adnab.me>
Diffstat (limited to 'src/model/k2v')
-rw-r--r--src/model/k2v/causality.rs96
-rw-r--r--src/model/k2v/counter_table.rs20
-rw-r--r--src/model/k2v/item_table.rs291
-rw-r--r--src/model/k2v/mod.rs7
-rw-r--r--src/model/k2v/poll.rs50
-rw-r--r--src/model/k2v/rpc.rs343
6 files changed, 807 insertions, 0 deletions
diff --git a/src/model/k2v/causality.rs b/src/model/k2v/causality.rs
new file mode 100644
index 00000000..8c76a32b
--- /dev/null
+++ b/src/model/k2v/causality.rs
@@ -0,0 +1,96 @@
+use std::collections::BTreeMap;
+use std::convert::TryInto;
+
+use serde::{Deserialize, Serialize};
+
+use garage_util::data::*;
+
+/// Node IDs used in K2V are u64 integers that are the abbreviation
+/// of full Garage node IDs which are 256-bit UUIDs.
+pub type K2VNodeId = u64;
+
+pub fn make_node_id(node_id: Uuid) -> K2VNodeId {
+ let mut tmp = [0u8; 8];
+ tmp.copy_from_slice(&node_id.as_slice()[..8]);
+ u64::from_be_bytes(tmp)
+}
+
+#[derive(PartialEq, Debug, Serialize, Deserialize)]
+pub struct CausalContext {
+ pub vector_clock: BTreeMap<K2VNodeId, u64>,
+}
+
+impl CausalContext {
+ /// Empty causality context
+ pub fn new_empty() -> Self {
+ Self {
+ vector_clock: BTreeMap::new(),
+ }
+ }
+ /// Make binary representation and encode in base64
+ pub fn serialize(&self) -> String {
+ let mut ints = Vec::with_capacity(2 * self.vector_clock.len());
+ for (node, time) in self.vector_clock.iter() {
+ ints.push(*node);
+ ints.push(*time);
+ }
+ let checksum = ints.iter().fold(0, |acc, v| acc ^ *v);
+
+ let mut bytes = u64::to_be_bytes(checksum).to_vec();
+ for i in ints {
+ bytes.extend(u64::to_be_bytes(i));
+ }
+
+ base64::encode_config(bytes, base64::URL_SAFE_NO_PAD)
+ }
+ /// Parse from base64-encoded binary representation
+ pub fn parse(s: &str) -> Result<Self, String> {
+ let bytes = base64::decode_config(s, base64::URL_SAFE_NO_PAD)
+ .map_err(|e| format!("bad causality token base64: {}", e))?;
+ if bytes.len() % 16 != 8 || bytes.len() < 8 {
+ return Err("bad causality token length".into());
+ }
+
+ let checksum = u64::from_be_bytes(bytes[..8].try_into().unwrap());
+ let mut ret = CausalContext {
+ vector_clock: BTreeMap::new(),
+ };
+
+ for i in 0..(bytes.len() / 16) {
+ let node_id = u64::from_be_bytes(bytes[8 + i * 16..16 + i * 16].try_into().unwrap());
+ let time = u64::from_be_bytes(bytes[16 + i * 16..24 + i * 16].try_into().unwrap());
+ ret.vector_clock.insert(node_id, time);
+ }
+
+ let check = ret.vector_clock.iter().fold(0, |acc, (n, t)| acc ^ *n ^ *t);
+
+ if check != checksum {
+ return Err("bad causality token checksum".into());
+ }
+
+ Ok(ret)
+ }
+ /// Check if this causal context contains newer items than another one
+ pub fn is_newer_than(&self, other: &Self) -> bool {
+ self.vector_clock
+ .iter()
+ .any(|(k, v)| v > other.vector_clock.get(k).unwrap_or(&0))
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_causality_token_serialization() {
+ let ct = CausalContext {
+ vector_clock: [(4, 42), (1928131023, 76), (0xefc0c1c47f9de433, 2)]
+ .iter()
+ .cloned()
+ .collect(),
+ };
+
+ assert_eq!(CausalContext::parse(&ct.serialize()).unwrap(), ct);
+ }
+}
diff --git a/src/model/k2v/counter_table.rs b/src/model/k2v/counter_table.rs
new file mode 100644
index 00000000..4856eb2b
--- /dev/null
+++ b/src/model/k2v/counter_table.rs
@@ -0,0 +1,20 @@
+use garage_util::data::*;
+
+use crate::index_counter::*;
+
+pub const ENTRIES: &str = "entries";
+pub const CONFLICTS: &str = "conflicts";
+pub const VALUES: &str = "values";
+pub const BYTES: &str = "bytes";
+
+#[derive(PartialEq, Clone)]
+pub struct K2VCounterTable;
+
+impl CounterSchema for K2VCounterTable {
+ const NAME: &'static str = "k2v_index_counter";
+
+ // Partition key = bucket id
+ type P = Uuid;
+ // Sort key = K2V item's partition key
+ type S = String;
+}
diff --git a/src/model/k2v/item_table.rs b/src/model/k2v/item_table.rs
new file mode 100644
index 00000000..8b7cc08a
--- /dev/null
+++ b/src/model/k2v/item_table.rs
@@ -0,0 +1,291 @@
+use serde::{Deserialize, Serialize};
+use std::collections::BTreeMap;
+use std::sync::Arc;
+
+use garage_util::data::*;
+
+use garage_table::crdt::*;
+use garage_table::*;
+
+use crate::index_counter::*;
+use crate::k2v::causality::*;
+use crate::k2v::counter_table::*;
+use crate::k2v::poll::*;
+
+#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+pub struct K2VItem {
+ pub partition: K2VItemPartition,
+ pub sort_key: String,
+
+ items: BTreeMap<K2VNodeId, DvvsEntry>,
+}
+
+#[derive(PartialEq, Clone, Debug, Serialize, Deserialize, Hash, Eq)]
+pub struct K2VItemPartition {
+ pub bucket_id: Uuid,
+ pub partition_key: String,
+}
+
+#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+struct DvvsEntry {
+ t_discard: u64,
+ values: Vec<(u64, DvvsValue)>,
+}
+
+#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
+pub enum DvvsValue {
+ Value(#[serde(with = "serde_bytes")] Vec<u8>),
+ Deleted,
+}
+
+impl K2VItem {
+ /// Creates a new K2VItem when no previous entry existed in the db
+ pub fn new(bucket_id: Uuid, partition_key: String, sort_key: String) -> Self {
+ Self {
+ partition: K2VItemPartition {
+ bucket_id,
+ partition_key,
+ },
+ sort_key,
+ items: BTreeMap::new(),
+ }
+ }
+ /// Updates a K2VItem with a new value or a deletion event
+ pub fn update(
+ &mut self,
+ this_node: Uuid,
+ context: &Option<CausalContext>,
+ new_value: DvvsValue,
+ ) {
+ if let Some(context) = context {
+ for (node, t_discard) in context.vector_clock.iter() {
+ if let Some(e) = self.items.get_mut(node) {
+ e.t_discard = std::cmp::max(e.t_discard, *t_discard);
+ } else {
+ self.items.insert(
+ *node,
+ DvvsEntry {
+ t_discard: *t_discard,
+ values: vec![],
+ },
+ );
+ }
+ }
+ }
+
+ self.discard();
+
+ let node_id = make_node_id(this_node);
+ let e = self.items.entry(node_id).or_insert(DvvsEntry {
+ t_discard: 0,
+ values: vec![],
+ });
+ let t_prev = e.max_time();
+ e.values.push((t_prev + 1, new_value));
+ }
+
+ /// Extract the causality context of a K2V Item
+ pub fn causal_context(&self) -> CausalContext {
+ let mut cc = CausalContext::new_empty();
+ for (node, ent) in self.items.iter() {
+ cc.vector_clock.insert(*node, ent.max_time());
+ }
+ cc
+ }
+
+ /// Extract the list of values
+ pub fn values(&'_ self) -> Vec<&'_ DvvsValue> {
+ let mut ret = vec![];
+ for (_, ent) in self.items.iter() {
+ for (_, v) in ent.values.iter() {
+ if !ret.contains(&v) {
+ ret.push(v);
+ }
+ }
+ }
+ ret
+ }
+
+ fn discard(&mut self) {
+ for (_, ent) in self.items.iter_mut() {
+ ent.discard();
+ }
+ }
+
+ // returns counters: (non-deleted entries, conflict entries, non-tombstone values, bytes used)
+ fn stats(&self) -> (i64, i64, i64, i64) {
+ let values = self.values();
+
+ let n_entries = if self.is_tombstone() { 0 } else { 1 };
+ let n_conflicts = if values.len() > 1 { 1 } else { 0 };
+ let n_values = values
+ .iter()
+ .filter(|v| matches!(v, DvvsValue::Value(_)))
+ .count() as i64;
+ let n_bytes = values
+ .iter()
+ .map(|v| match v {
+ DvvsValue::Deleted => 0,
+ DvvsValue::Value(v) => v.len() as i64,
+ })
+ .sum();
+
+ (n_entries, n_conflicts, n_values, n_bytes)
+ }
+}
+
+impl DvvsEntry {
+ fn max_time(&self) -> u64 {
+ self.values
+ .iter()
+ .fold(self.t_discard, |acc, (vts, _)| std::cmp::max(acc, *vts))
+ }
+
+ fn discard(&mut self) {
+ self.values = std::mem::take(&mut self.values)
+ .into_iter()
+ .filter(|(t, _)| *t > self.t_discard)
+ .collect::<Vec<_>>();
+ }
+}
+
+impl Crdt for K2VItem {
+ fn merge(&mut self, other: &Self) {
+ for (node, e2) in other.items.iter() {
+ if let Some(e) = self.items.get_mut(node) {
+ e.merge(e2);
+ } else {
+ self.items.insert(*node, e2.clone());
+ }
+ }
+ }
+}
+
+impl Crdt for DvvsEntry {
+ fn merge(&mut self, other: &Self) {
+ self.t_discard = std::cmp::max(self.t_discard, other.t_discard);
+ self.discard();
+
+ let t_max = self.max_time();
+ for (vt, vv) in other.values.iter() {
+ if *vt > t_max {
+ self.values.push((*vt, vv.clone()));
+ }
+ }
+ }
+}
+
+impl PartitionKey for K2VItemPartition {
+ fn hash(&self) -> Hash {
+ use blake2::{Blake2b, Digest};
+
+ let mut hasher = Blake2b::new();
+ hasher.update(self.bucket_id.as_slice());
+ hasher.update(self.partition_key.as_bytes());
+ let mut hash = [0u8; 32];
+ hash.copy_from_slice(&hasher.finalize()[..32]);
+ hash.into()
+ }
+}
+
+impl Entry<K2VItemPartition, String> for K2VItem {
+ fn partition_key(&self) -> &K2VItemPartition {
+ &self.partition
+ }
+ fn sort_key(&self) -> &String {
+ &self.sort_key
+ }
+ fn is_tombstone(&self) -> bool {
+ self.values()
+ .iter()
+ .all(|v| matches!(v, DvvsValue::Deleted))
+ }
+}
+
+pub struct K2VItemTable {
+ pub(crate) counter_table: Arc<IndexCounter<K2VCounterTable>>,
+ pub(crate) subscriptions: Arc<SubscriptionManager>,
+}
+
+#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
+pub struct ItemFilter {
+ pub exclude_only_tombstones: bool,
+ pub conflicts_only: bool,
+}
+
+impl TableSchema for K2VItemTable {
+ const TABLE_NAME: &'static str = "k2v_item";
+
+ type P = K2VItemPartition;
+ type S = String;
+ type E = K2VItem;
+ type Filter = ItemFilter;
+
+ fn updated(&self, old: Option<&Self::E>, new: Option<&Self::E>) {
+ // 1. Count
+ let (old_entries, old_conflicts, old_values, old_bytes) = match old {
+ None => (0, 0, 0, 0),
+ Some(e) => e.stats(),
+ };
+ let (new_entries, new_conflicts, new_values, new_bytes) = match new {
+ None => (0, 0, 0, 0),
+ Some(e) => e.stats(),
+ };
+
+ let count_pk = old
+ .map(|e| e.partition.bucket_id)
+ .unwrap_or_else(|| new.unwrap().partition.bucket_id);
+ let count_sk = old
+ .map(|e| &e.partition.partition_key)
+ .unwrap_or_else(|| &new.unwrap().partition.partition_key);
+
+ if let Err(e) = self.counter_table.count(
+ &count_pk,
+ count_sk,
+ &[
+ (ENTRIES, new_entries - old_entries),
+ (CONFLICTS, new_conflicts - old_conflicts),
+ (VALUES, new_values - old_values),
+ (BYTES, new_bytes - old_bytes),
+ ],
+ ) {
+ error!("Could not update K2V counter for bucket {:?} partition {}; counts will now be inconsistent. {}", count_pk, count_sk, e);
+ }
+
+ // 2. Notify
+ if let Some(new_ent) = new {
+ self.subscriptions.notify(new_ent);
+ }
+ }
+
+ #[allow(clippy::nonminimal_bool)]
+ fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool {
+ let v = entry.values();
+ !(filter.conflicts_only && v.len() < 2)
+ && !(filter.exclude_only_tombstones && entry.is_tombstone())
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_dvvsentry_merge_simple() {
+ let e1 = DvvsEntry {
+ t_discard: 4,
+ values: vec![
+ (5, DvvsValue::Value(vec![15])),
+ (6, DvvsValue::Value(vec![16])),
+ ],
+ };
+ let e2 = DvvsEntry {
+ t_discard: 5,
+ values: vec![(6, DvvsValue::Value(vec![16])), (7, DvvsValue::Deleted)],
+ };
+
+ let mut e3 = e1.clone();
+ e3.merge(&e2);
+ assert_eq!(e2, e3);
+ }
+}
diff --git a/src/model/k2v/mod.rs b/src/model/k2v/mod.rs
new file mode 100644
index 00000000..664172a6
--- /dev/null
+++ b/src/model/k2v/mod.rs
@@ -0,0 +1,7 @@
+pub mod causality;
+
+pub mod counter_table;
+pub mod item_table;
+
+pub mod poll;
+pub mod rpc;
diff --git a/src/model/k2v/poll.rs b/src/model/k2v/poll.rs
new file mode 100644
index 00000000..93105207
--- /dev/null
+++ b/src/model/k2v/poll.rs
@@ -0,0 +1,50 @@
+use std::collections::HashMap;
+use std::sync::Mutex;
+
+use serde::{Deserialize, Serialize};
+use tokio::sync::broadcast;
+
+use crate::k2v::item_table::*;
+
+#[derive(Debug, Hash, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct PollKey {
+ pub partition: K2VItemPartition,
+ pub sort_key: String,
+}
+
+#[derive(Default)]
+pub struct SubscriptionManager {
+ subscriptions: Mutex<HashMap<PollKey, broadcast::Sender<K2VItem>>>,
+}
+
+impl SubscriptionManager {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ pub fn subscribe(&self, key: &PollKey) -> broadcast::Receiver<K2VItem> {
+ let mut subs = self.subscriptions.lock().unwrap();
+ if let Some(s) = subs.get(key) {
+ s.subscribe()
+ } else {
+ let (tx, rx) = broadcast::channel(8);
+ subs.insert(key.clone(), tx);
+ rx
+ }
+ }
+
+ pub fn notify(&self, item: &K2VItem) {
+ let key = PollKey {
+ partition: item.partition.clone(),
+ sort_key: item.sort_key.clone(),
+ };
+ let mut subs = self.subscriptions.lock().unwrap();
+ if let Some(s) = subs.get(&key) {
+ if s.send(item.clone()).is_err() {
+ // no more subscribers, remove channel from here
+ // (we will re-create it later if we need to subscribe again)
+ subs.remove(&key);
+ }
+ }
+ }
+}
diff --git a/src/model/k2v/rpc.rs b/src/model/k2v/rpc.rs
new file mode 100644
index 00000000..90101d0f
--- /dev/null
+++ b/src/model/k2v/rpc.rs
@@ -0,0 +1,343 @@
+//! Module that implements RPCs specific to K2V.
+//! This is necessary for insertions into the K2V store,
+//! as they have to be transmitted to one of the nodes responsible
+//! for storing the entry to be processed (the API entry
+//! node does not process the entry directly, as this would
+//! mean the vector clock gets much larger than needed).
+
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use futures::stream::FuturesUnordered;
+use futures::StreamExt;
+use serde::{Deserialize, Serialize};
+use tokio::select;
+
+use garage_util::crdt::*;
+use garage_util::data::*;
+use garage_util::error::*;
+
+use garage_rpc::system::System;
+use garage_rpc::*;
+
+use garage_table::replication::{TableReplication, TableShardedReplication};
+use garage_table::table::TABLE_RPC_TIMEOUT;
+use garage_table::{PartitionKey, Table};
+
+use crate::k2v::causality::*;
+use crate::k2v::item_table::*;
+use crate::k2v::poll::*;
+
+/// RPC messages for K2V
+#[derive(Debug, Serialize, Deserialize)]
+enum K2VRpc {
+ Ok,
+ InsertItem(InsertedItem),
+ InsertManyItems(Vec<InsertedItem>),
+ PollItem {
+ key: PollKey,
+ causal_context: CausalContext,
+ timeout_msec: u64,
+ },
+ PollItemResponse(Option<K2VItem>),
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct InsertedItem {
+ partition: K2VItemPartition,
+ sort_key: String,
+ causal_context: Option<CausalContext>,
+ value: DvvsValue,
+}
+
+impl Rpc for K2VRpc {
+ type Response = Result<K2VRpc, Error>;
+}
+
+/// The block manager, handling block exchange between nodes, and block storage on local node
+pub struct K2VRpcHandler {
+ system: Arc<System>,
+ item_table: Arc<Table<K2VItemTable, TableShardedReplication>>,
+ endpoint: Arc<Endpoint<K2VRpc, Self>>,
+ subscriptions: Arc<SubscriptionManager>,
+}
+
+impl K2VRpcHandler {
+ pub fn new(
+ system: Arc<System>,
+ item_table: Arc<Table<K2VItemTable, TableShardedReplication>>,
+ subscriptions: Arc<SubscriptionManager>,
+ ) -> Arc<Self> {
+ let endpoint = system.netapp.endpoint("garage_model/k2v/Rpc".to_string());
+
+ let rpc_handler = Arc::new(Self {
+ system,
+ item_table,
+ endpoint,
+ subscriptions,
+ });
+ rpc_handler.endpoint.set_handler(rpc_handler.clone());
+
+ rpc_handler
+ }
+
+ // ---- public interface ----
+
+ pub async fn insert(
+ &self,
+ bucket_id: Uuid,
+ partition_key: String,
+ sort_key: String,
+ causal_context: Option<CausalContext>,
+ value: DvvsValue,
+ ) -> Result<(), Error> {
+ let partition = K2VItemPartition {
+ bucket_id,
+ partition_key,
+ };
+ let mut who = self
+ .item_table
+ .data
+ .replication
+ .write_nodes(&partition.hash());
+ who.sort();
+
+ self.system
+ .rpc
+ .try_call_many(
+ &self.endpoint,
+ &who[..],
+ K2VRpc::InsertItem(InsertedItem {
+ partition,
+ sort_key,
+ causal_context,
+ value,
+ }),
+ RequestStrategy::with_priority(PRIO_NORMAL)
+ .with_quorum(1)
+ .with_timeout(TABLE_RPC_TIMEOUT)
+ .interrupt_after_quorum(true),
+ )
+ .await?;
+
+ Ok(())
+ }
+
+ pub async fn insert_batch(
+ &self,
+ bucket_id: Uuid,
+ items: Vec<(String, String, Option<CausalContext>, DvvsValue)>,
+ ) -> Result<(), Error> {
+ let n_items = items.len();
+
+ let mut call_list: HashMap<_, Vec<_>> = HashMap::new();
+
+ for (partition_key, sort_key, causal_context, value) in items {
+ let partition = K2VItemPartition {
+ bucket_id,
+ partition_key,
+ };
+ let mut who = self
+ .item_table
+ .data
+ .replication
+ .write_nodes(&partition.hash());
+ who.sort();
+
+ call_list.entry(who).or_default().push(InsertedItem {
+ partition,
+ sort_key,
+ causal_context,
+ value,
+ });
+ }
+
+ debug!(
+ "K2V insert_batch: {} requests to insert {} items",
+ call_list.len(),
+ n_items
+ );
+ let call_futures = call_list.into_iter().map(|(nodes, items)| async move {
+ let resp = self
+ .system
+ .rpc
+ .try_call_many(
+ &self.endpoint,
+ &nodes[..],
+ K2VRpc::InsertManyItems(items),
+ RequestStrategy::with_priority(PRIO_NORMAL)
+ .with_quorum(1)
+ .with_timeout(TABLE_RPC_TIMEOUT)
+ .interrupt_after_quorum(true),
+ )
+ .await?;
+ Ok::<_, Error>((nodes, resp))
+ });
+
+ let mut resps = call_futures.collect::<FuturesUnordered<_>>();
+ while let Some(resp) = resps.next().await {
+ resp?;
+ }
+
+ Ok(())
+ }
+
+ pub async fn poll(
+ &self,
+ bucket_id: Uuid,
+ partition_key: String,
+ sort_key: String,
+ causal_context: CausalContext,
+ timeout_msec: u64,
+ ) -> Result<Option<K2VItem>, Error> {
+ let poll_key = PollKey {
+ partition: K2VItemPartition {
+ bucket_id,
+ partition_key,
+ },
+ sort_key,
+ };
+ let nodes = self
+ .item_table
+ .data
+ .replication
+ .write_nodes(&poll_key.partition.hash());
+
+ let resps = self
+ .system
+ .rpc
+ .try_call_many(
+ &self.endpoint,
+ &nodes[..],
+ K2VRpc::PollItem {
+ key: poll_key,
+ causal_context,
+ timeout_msec,
+ },
+ RequestStrategy::with_priority(PRIO_NORMAL)
+ .with_quorum(self.item_table.data.replication.read_quorum())
+ .with_timeout(Duration::from_millis(timeout_msec) + TABLE_RPC_TIMEOUT),
+ )
+ .await?;
+
+ let mut resp: Option<K2VItem> = None;
+ for v in resps {
+ match v {
+ K2VRpc::PollItemResponse(Some(x)) => {
+ if let Some(y) = &mut resp {
+ y.merge(&x);
+ } else {
+ resp = Some(x);
+ }
+ }
+ K2VRpc::PollItemResponse(None) => {
+ return Ok(None);
+ }
+ v => return Err(Error::unexpected_rpc_message(v)),
+ }
+ }
+
+ Ok(resp)
+ }
+
+ // ---- internal handlers ----
+
+ async fn handle_insert(&self, item: &InsertedItem) -> Result<K2VRpc, Error> {
+ let new = self.local_insert(item)?;
+
+ // Propagate to rest of network
+ if let Some(updated) = new {
+ self.item_table.insert(&updated).await?;
+ }
+
+ Ok(K2VRpc::Ok)
+ }
+
+ async fn handle_insert_many(&self, items: &[InsertedItem]) -> Result<K2VRpc, Error> {
+ let mut updated_vec = vec![];
+
+ for item in items {
+ let new = self.local_insert(item)?;
+
+ if let Some(updated) = new {
+ updated_vec.push(updated);
+ }
+ }
+
+ // Propagate to rest of network
+ if !updated_vec.is_empty() {
+ self.item_table.insert_many(&updated_vec).await?;
+ }
+
+ Ok(K2VRpc::Ok)
+ }
+
+ fn local_insert(&self, item: &InsertedItem) -> Result<Option<K2VItem>, Error> {
+ let tree_key = self
+ .item_table
+ .data
+ .tree_key(&item.partition, &item.sort_key);
+
+ self.item_table
+ .data
+ .update_entry_with(&tree_key[..], |ent| {
+ let mut ent = ent.unwrap_or_else(|| {
+ K2VItem::new(
+ item.partition.bucket_id,
+ item.partition.partition_key.clone(),
+ item.sort_key.clone(),
+ )
+ });
+ ent.update(self.system.id, &item.causal_context, item.value.clone());
+ ent
+ })
+ }
+
+ async fn handle_poll(&self, key: &PollKey, ct: &CausalContext) -> Result<K2VItem, Error> {
+ let mut chan = self.subscriptions.subscribe(key);
+
+ let mut value = self
+ .item_table
+ .data
+ .read_entry(&key.partition, &key.sort_key)?
+ .map(|bytes| self.item_table.data.decode_entry(&bytes[..]))
+ .transpose()?
+ .unwrap_or_else(|| {
+ K2VItem::new(
+ key.partition.bucket_id,
+ key.partition.partition_key.clone(),
+ key.sort_key.clone(),
+ )
+ });
+
+ while !value.causal_context().is_newer_than(ct) {
+ value = chan.recv().await?;
+ }
+
+ Ok(value)
+ }
+}
+
+#[async_trait]
+impl EndpointHandler<K2VRpc> for K2VRpcHandler {
+ async fn handle(self: &Arc<Self>, message: &K2VRpc, _from: NodeID) -> Result<K2VRpc, Error> {
+ match message {
+ K2VRpc::InsertItem(item) => self.handle_insert(item).await,
+ K2VRpc::InsertManyItems(items) => self.handle_insert_many(&items[..]).await,
+ K2VRpc::PollItem {
+ key,
+ causal_context,
+ timeout_msec,
+ } => {
+ let delay = tokio::time::sleep(Duration::from_millis(*timeout_msec));
+ select! {
+ ret = self.handle_poll(key, causal_context) => ret.map(Some).map(K2VRpc::PollItemResponse),
+ _ = delay => Ok(K2VRpc::PollItemResponse(None)),
+ }
+ }
+ m => Err(Error::unexpected_rpc_message(m)),
+ }
+ }
+}