aboutsummaryrefslogtreecommitdiff
path: root/src/table/data.rs
diff options
context:
space:
mode:
authorAlex <alex@adnab.me>2022-05-10 13:16:57 +0200
committerAlex <alex@adnab.me>2022-05-10 13:16:57 +0200
commit5768bf362262f78376af14517c4921941986192e (patch)
treeb4baf3051eade0f63649443278bb3a3f4c38ec25 /src/table/data.rs
parentdef78c5e6f5da37a0d17b5652c525fbeccbc2e86 (diff)
downloadgarage-5768bf362262f78376af14517c4921941986192e.tar.gz
garage-5768bf362262f78376af14517c4921941986192e.zip
First implementation of K2V (#293)
**Specification:** View spec at [this URL](https://git.deuxfleurs.fr/Deuxfleurs/garage/src/branch/k2v/doc/drafts/k2v-spec.md) - [x] Specify the structure of K2V triples - [x] Specify the DVVS format used for causality detection - [x] Specify the K2V index (just a counter of number of values per partition key) - [x] Specify single-item endpoints: ReadItem, InsertItem, DeleteItem - [x] Specify index endpoint: ReadIndex - [x] Specify multi-item endpoints: InsertBatch, ReadBatch, DeleteBatch - [x] Move to JSON objects instead of tuples - [x] Specify endpoints for polling for updates on single values (PollItem) **Implementation:** - [x] Table for K2V items, causal contexts - [x] Indexing mechanism and table for K2V index - [x] Make API handlers a bit more generic - [x] K2V API endpoint - [x] K2V API router - [x] ReadItem - [x] InsertItem - [x] DeleteItem - [x] PollItem - [x] ReadIndex - [x] InsertBatch - [x] ReadBatch - [x] DeleteBatch **Testing:** - [x] Just a simple Python script that does some requests to check visually that things are going right (does not contain parsing of results or assertions on returned values) - [x] Actual tests: - [x] Adapt testing framework - [x] Simple test with InsertItem + ReadItem - [x] Test with several Insert/Read/DeleteItem + ReadIndex - [x] Test all combinations of return formats for ReadItem - [x] Test with ReadBatch, InsertBatch, DeleteBatch - [x] Test with PollItem - [x] Test error codes - [ ] Fix most broken stuff - [x] test PollItem broken randomly - [x] when invalid causality tokens are given, errors should be 4xx not 5xx **Improvements:** - [x] Descending range queries - [x] Specify - [x] Implement - [x] Add test - [x] Batch updates to index counter - [x] Put K2V behind `k2v` feature flag Co-authored-by: Alex Auvolat <alex@adnab.me> Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/293 Co-authored-by: Alex <alex@adnab.me> Co-committed-by: Alex <alex@adnab.me>
Diffstat (limited to 'src/table/data.rs')
-rw-r--r--src/table/data.rs98
1 files changed, 73 insertions, 25 deletions
diff --git a/src/table/data.rs b/src/table/data.rs
index ff7965f5..5cb10066 100644
--- a/src/table/data.rs
+++ b/src/table/data.rs
@@ -1,8 +1,9 @@
use core::borrow::Borrow;
+use std::convert::TryInto;
use std::sync::Arc;
use serde_bytes::ByteBuf;
-use sled::Transactional;
+use sled::{IVec, Transactional};
use tokio::sync::Notify;
use garage_util::data::*;
@@ -16,12 +17,13 @@ use crate::gc::GcTodoEntry;
use crate::metrics::*;
use crate::replication::*;
use crate::schema::*;
+use crate::util::*;
pub struct TableData<F: TableSchema, R: TableReplication> {
system: Arc<System>,
- pub(crate) instance: F,
- pub(crate) replication: R,
+ pub instance: F,
+ pub replication: R,
pub store: sled::Tree,
@@ -83,18 +85,48 @@ where
pub fn read_range(
&self,
- p: &F::P,
- s: &Option<F::S>,
+ partition_key: &F::P,
+ start: &Option<F::S>,
+ filter: &Option<F::Filter>,
+ limit: usize,
+ enumeration_order: EnumerationOrder,
+ ) -> Result<Vec<Arc<ByteBuf>>, Error> {
+ let partition_hash = partition_key.hash();
+ match enumeration_order {
+ EnumerationOrder::Forward => {
+ let first_key = match start {
+ None => partition_hash.to_vec(),
+ Some(sk) => self.tree_key(partition_key, sk),
+ };
+ let range = self.store.range(first_key..);
+ self.read_range_aux(partition_hash, range, filter, limit)
+ }
+ EnumerationOrder::Reverse => match start {
+ Some(sk) => {
+ let last_key = self.tree_key(partition_key, sk);
+ let range = self.store.range(..=last_key).rev();
+ self.read_range_aux(partition_hash, range, filter, limit)
+ }
+ None => {
+ let mut last_key = partition_hash.to_vec();
+ let lower = u128::from_be_bytes(last_key[16..32].try_into().unwrap());
+ last_key[16..32].copy_from_slice(&u128::to_be_bytes(lower + 1));
+ let range = self.store.range(..last_key).rev();
+ self.read_range_aux(partition_hash, range, filter, limit)
+ }
+ },
+ }
+ }
+
+ fn read_range_aux(
+ &self,
+ partition_hash: Hash,
+ range: impl Iterator<Item = sled::Result<(IVec, IVec)>>,
filter: &Option<F::Filter>,
limit: usize,
) -> Result<Vec<Arc<ByteBuf>>, Error> {
- let partition_hash = p.hash();
- let first_key = match s {
- None => partition_hash.to_vec(),
- Some(sk) => self.tree_key(p, sk),
- };
let mut ret = vec![];
- for item in self.store.range(first_key..) {
+ for item in range {
let (key, value) = item?;
if &key[..32] != partition_hash.as_slice() {
break;
@@ -136,17 +168,31 @@ where
let update = self.decode_entry(update_bytes)?;
let tree_key = self.tree_key(update.partition_key(), update.sort_key());
+ self.update_entry_with(&tree_key[..], |ent| match ent {
+ Some(mut ent) => {
+ ent.merge(&update);
+ ent
+ }
+ None => update.clone(),
+ })?;
+ Ok(())
+ }
+
+ pub fn update_entry_with(
+ &self,
+ tree_key: &[u8],
+ f: impl Fn(Option<F::E>) -> F::E,
+ ) -> Result<Option<F::E>, Error> {
let changed = (&self.store, &self.merkle_todo).transaction(|(store, mkl_todo)| {
- let (old_entry, old_bytes, new_entry) = match store.get(&tree_key)? {
+ let (old_entry, old_bytes, new_entry) = match store.get(tree_key)? {
Some(old_bytes) => {
let old_entry = self
.decode_entry(&old_bytes)
.map_err(sled::transaction::ConflictableTransactionError::Abort)?;
- let mut new_entry = old_entry.clone();
- new_entry.merge(&update);
+ let new_entry = f(Some(old_entry.clone()));
(Some(old_entry), Some(old_bytes), new_entry)
}
- None => (None, None, update.clone()),
+ None => (None, None, f(None)),
};
// Scenario 1: the value changed, so of course there is a change
@@ -163,8 +209,8 @@ where
if value_changed || encoding_changed {
let new_bytes_hash = blake2sum(&new_bytes[..]);
- mkl_todo.insert(tree_key.clone(), new_bytes_hash.as_slice())?;
- store.insert(tree_key.clone(), new_bytes)?;
+ mkl_todo.insert(tree_key.to_vec(), new_bytes_hash.as_slice())?;
+ store.insert(tree_key.to_vec(), new_bytes)?;
Ok(Some((old_entry, new_entry, new_bytes_hash)))
} else {
Ok(None)
@@ -175,7 +221,7 @@ where
self.metrics.internal_update_counter.add(1);
let is_tombstone = new_entry.is_tombstone();
- self.instance.updated(old_entry, Some(new_entry));
+ self.instance.updated(old_entry.as_ref(), Some(&new_entry));
self.merkle_todo_notify.notify_one();
if is_tombstone {
// We are only responsible for GC'ing this item if we are the
@@ -187,12 +233,14 @@ where
let pk_hash = Hash::try_from(&tree_key[..32]).unwrap();
let nodes = self.replication.write_nodes(&pk_hash);
if nodes.first() == Some(&self.system.id) {
- GcTodoEntry::new(tree_key, new_bytes_hash).save(&self.gc_todo)?;
+ GcTodoEntry::new(tree_key.to_vec(), new_bytes_hash).save(&self.gc_todo)?;
}
}
- }
- Ok(())
+ Ok(Some(new_entry))
+ } else {
+ Ok(None)
+ }
}
pub(crate) fn delete_if_equal(self: &Arc<Self>, k: &[u8], v: &[u8]) -> Result<bool, Error> {
@@ -211,7 +259,7 @@ where
self.metrics.internal_delete_counter.add(1);
let old_entry = self.decode_entry(v)?;
- self.instance.updated(Some(old_entry), None);
+ self.instance.updated(Some(&old_entry), None);
self.merkle_todo_notify.notify_one();
}
Ok(removed)
@@ -235,7 +283,7 @@ where
if let Some(old_v) = removed {
let old_entry = self.decode_entry(&old_v[..])?;
- self.instance.updated(Some(old_entry), None);
+ self.instance.updated(Some(&old_entry), None);
self.merkle_todo_notify.notify_one();
Ok(true)
} else {
@@ -245,13 +293,13 @@ where
// ---- Utility functions ----
- pub(crate) fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
+ pub fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
let mut ret = p.hash().to_vec();
ret.extend(s.sort_key());
ret
}
- pub(crate) fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
+ pub fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
match rmp_serde::decode::from_read_ref::<_, F::E>(bytes) {
Ok(x) => Ok(x),
Err(e) => match F::try_migrate(bytes) {