aboutsummaryrefslogblamecommitdiff
path: root/src/table/data.rs
blob: e76836ca45cdb703a2b7f1a46de2d8522d110d78 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
                         
                          

                   
                         
                        
 


                                              

                          
                                  
 
                               
 
                      
                           
                      
                          
                     
                   
 
                                                           
                            
 

                           
 
                            
 

                                         
                                              



                                               
                                        

                                         

 
                                                           
                                                                                                
                              
                                                                      

                                                          
                                    
                                                                            
                                                                      
                                    
                                                                            
                                                                      
 
                                     
                                                                             

                                                                       
                                
                                                                           
                                                             
                                                                                          
 






                                                
 
                               
                               
                                 
                                    
                              


                                                          

                                                           
                                
                                






                                                                                        
                                                                







                                                               












                                                                                     
                                                                           




                                                                                         
                                                                                       





                                                                                                              
                                                                                      





                                                                                                 
                          

                                     
                                     


                                               
                                     
                                   











                                                                                       
                                                                         








                                               








                                                                                                  
                                                    
                                                                             





                                                                                     
 










                                                           




                                 

                                     
                                                                                               
                                          

                                                                      
                                                                    
                                                                                                       
                                                    
                                                                                                                   
                                                                                                     
                                                                                     
                                 
                                                                                

                          


                                                                                  
                                                                                            


                                                                                             

                                                 
                                                          
                                                              
                                                                                   
                                        
 



                                                                                                    




                                                                                                 



                                        
 
                                                                    

                                                                    
                                                                    
                                                             
                                         








                                                                                                      
                                                                                                        
                                 
                         
 



                                           


                                                                                                    




                                                                             

                                                                                                          


                                                                                 

                                                                                                
                                 

                                               

                            
                                                                    
                                                             



                           




                                           




                                                                                  

                                                                                                                   


                                                                                 

                                                                                                
                                 

                                               
 

                                                                    
                                                             
                 
                           

         












                                                                                                       
                                              
                         
                                             
                  


                                                      
                                                                     
                                                      



                      

                                      
                                                               




                                                
                                                                         





                                                                                      
                                 




                                                                       

                 
 

                                                           
         
 
use core::borrow::Borrow;
use std::convert::TryInto;
use std::sync::Arc;

use serde_bytes::ByteBuf;
use tokio::sync::Notify;

use garage_db as db;
use garage_db::counted_tree_hack::CountedTree;

use garage_util::data::*;
use garage_util::error::*;
use garage_util::migrate::Migrate;

use garage_rpc::system::System;

use crate::crdt::Crdt;
use crate::gc::GcTodoEntry;
use crate::metrics::*;
use crate::replication::*;
use crate::schema::*;
use crate::util::*;

/// Local storage of one table: the main entry store plus the auxiliary trees
/// (Merkle todo list, insert queue, GC todo list) that every mutation has to
/// keep in sync with it.
pub struct TableData<F: TableSchema, R: TableReplication> {
	// Node-level system handle; this file only reads the local node id from it.
	system: Arc<System>,

	/// Schema instance; its `updated()` hook is called inside every mutating transaction.
	pub instance: F,
	/// Replication strategy, queried for the write nodes of a partition.
	pub replication: R,

	/// Main tree (`<TABLE_NAME>:table`): tree key -> encoded entry.
	pub store: db::Tree,

	// Tree `<TABLE_NAME>:merkle_tree`. It is not written by this file;
	// presumably maintained by the Merkle updater -- TODO confirm.
	pub(crate) merkle_tree: db::Tree,
	// Tree `<TABLE_NAME>:merkle_todo`: for each modified key, the hash of its new
	// encoded value, or an empty value when the key was deleted.
	pub(crate) merkle_todo: db::Tree,
	// Notified after a transaction that added items to `merkle_todo`.
	pub(crate) merkle_todo_notify: Notify,

	// Tree `<TABLE_NAME>:insert_queue`: entries queued by `queue_insert`,
	// merged per tree key.
	pub(crate) insert_queue: db::Tree,
	// Notified each time `queue_insert` adds or merges an entry.
	pub(crate) insert_queue_notify: Notify,

	// Counted tree `<TABLE_NAME>:gc_todo_v2`: tombstones this node is in charge
	// of garbage-collecting.
	pub(crate) gc_todo: CountedTree,

	// Metrics for this table (update/delete counters are bumped from this file).
	pub(crate) metrics: TableMetrics,
}

impl<F: TableSchema, R: TableReplication> TableData<F, R> {
	/// Opens every DB tree backing table `F::TABLE_NAME` and bundles them with
	/// the schema instance and replication strategy into a shared `TableData`.
	///
	/// # Panics
	/// Panics if one of the trees cannot be opened, or if the GC todo tree
	/// cannot be counted.
	pub fn new(system: Arc<System>, instance: F, replication: R, db: &db::Db) -> Arc<Self> {
		// Opens one tree, aborting with `failure` as the panic message on error.
		let open = |name: String, failure: &str| db.open_tree(name).expect(failure);

		let store = open(format!("{}:table", F::TABLE_NAME), "Unable to open DB tree");

		let merkle_tree = open(
			format!("{}:merkle_tree", F::TABLE_NAME),
			"Unable to open DB Merkle tree tree",
		);
		let merkle_todo = open(
			format!("{}:merkle_todo", F::TABLE_NAME),
			"Unable to open DB Merkle TODO tree",
		);

		let insert_queue = open(
			format!("{}:insert_queue", F::TABLE_NAME),
			"Unable to open insert queue DB tree",
		);

		let gc_tree = open(
			format!("{}:gc_todo_v2", F::TABLE_NAME),
			"Unable to open GC DB tree",
		);
		let gc_todo = CountedTree::new(gc_tree).expect("Cannot count gc_todo_v2");

		// The metrics object keeps its own handles on the trees it observes.
		let metrics = TableMetrics::new(
			F::TABLE_NAME,
			store.clone(),
			merkle_tree.clone(),
			merkle_todo.clone(),
			gc_todo.clone(),
		);

		let data = Self {
			system,
			instance,
			replication,
			store,
			merkle_tree,
			merkle_todo,
			merkle_todo_notify: Notify::new(),
			insert_queue,
			insert_queue_notify: Notify::new(),
			gc_todo,
			metrics,
		};
		Arc::new(data)
	}

	// Read functions

	/// Fetches the raw encoded entry stored under partition key `p` and sort key `s`.
	///
	/// Returns `Ok(None)` when no entry exists for that key; DB errors are propagated.
	pub fn read_entry(&self, p: &F::P, s: &F::S) -> Result<Option<ByteBuf>, Error> {
		let tree_key = self.tree_key(p, s);
		let found = self.store.get(tree_key)?;
		Ok(found.map(|bytes| ByteBuf::from(bytes.to_vec())))
	}

	/// Reads up to `limit` encoded entries of the partition `partition_key`.
	///
	/// * `start` - sort key to start from (inclusive). `None` means the first
	///   entry of the partition when enumerating forward, and the last entry of
	///   the partition when enumerating in reverse.
	/// * `filter` - when set, only entries accepted by `F::matches_filter` are returned.
	/// * `limit` - maximum number of entries to return.
	/// * `enumeration_order` - direction in which sort keys are walked.
	///
	/// # Errors
	/// Propagates DB errors and entry decoding errors (decoding only happens
	/// when a filter is set).
	pub fn read_range(
		&self,
		partition_key: &F::P,
		start: &Option<F::S>,
		filter: &Option<F::Filter>,
		limit: usize,
		enumeration_order: EnumerationOrder,
	) -> Result<Vec<Arc<ByteBuf>>, Error> {
		let partition_hash = partition_key.hash();
		let range = match enumeration_order {
			EnumerationOrder::Forward => {
				let first_key = match start {
					None => partition_hash.to_vec(),
					Some(sk) => self.tree_key(partition_key, sk),
				};
				self.store.range(first_key..)?
			}
			EnumerationOrder::Reverse => match start {
				Some(sk) => {
					let last_key = self.tree_key(partition_key, sk);
					self.store.range_rev(..=last_key)?
				}
				None => {
					// Walk backwards from the exclusive bound "partition hash + 1",
					// the hash being seen as a 256-bit big-endian integer. The
					// increment is done with explicit carry handling: a plain
					// `+ 1` on the low half would panic (debug) or wrap around to
					// a bound *below* the partition (release) when the low
					// 128 bits are all ones.
					let mut end_key = partition_hash.to_vec();
					let low = u128::from_be_bytes(end_key[16..32].try_into().unwrap());
					let (low_next, carry) = low.overflowing_add(1);
					end_key[16..32].copy_from_slice(&low_next.to_be_bytes());

					let mut past_max_key = false;
					if carry {
						let high = u128::from_be_bytes(end_key[..16].try_into().unwrap());
						let (high_next, overflow) = high.overflowing_add(1);
						end_key[..16].copy_from_slice(&high_next.to_be_bytes());
						past_max_key = overflow;
					}

					let lower: std::ops::Bound<Vec<u8>> = std::ops::Bound::Unbounded;
					let upper: std::ops::Bound<Vec<u8>> = if past_max_key {
						// The partition hash is the largest possible one:
						// nothing can come after it, so start from the very end.
						std::ops::Bound::Unbounded
					} else {
						std::ops::Bound::Excluded(end_key)
					};
					self.store.range_rev((lower, upper))?
				}
			},
		};
		self.read_range_aux(partition_hash, range, filter, limit)
	}

	/// Collects entries from `range` for as long as they belong to the
	/// partition identified by `partition_hash`.
	///
	/// Iteration stops at the first key whose 32-byte prefix differs from
	/// `partition_hash`, or as soon as `limit` entries have been collected.
	/// When `filter` is set, each value is decoded and kept only if
	/// `F::matches_filter` accepts it; rejected entries do not count towards
	/// `limit`.
	///
	/// # Errors
	/// Propagates iterator (DB) errors and decoding errors.
	fn read_range_aux(
		&self,
		partition_hash: Hash,
		range: db::ValueIter,
		filter: &Option<F::Filter>,
		limit: usize,
	) -> Result<Vec<Arc<ByteBuf>>, Error> {
		let mut ret = vec![];
		// The limit is only re-checked after a push below, so a zero limit has
		// to be handled up front or one entry would still be returned.
		if limit == 0 {
			return Ok(ret);
		}
		for item in range {
			let (key, value) = item?;
			// A key shorter than a partition hash cannot belong to the
			// partition; `get` treats it as "outside" instead of panicking.
			if key.get(..32) != Some(partition_hash.as_slice()) {
				break;
			}
			let keep = match filter {
				None => true,
				Some(f) => {
					let entry = self.decode_entry(value.as_ref())?;
					F::matches_filter(&entry, f)
				}
			};
			if keep {
				ret.push(Arc::new(ByteBuf::from(value)));
			}
			if ret.len() >= limit {
				break;
			}
		}
		Ok(ret)
	}

	// Mutation functions
	// When changing this code, take care of propagating modifications correctly:
	// - When an entry is modified or deleted, call the updated() function
	//   on the table instance
	// - When an entry is modified or deleted, add it to the merkle updater's todo list.
	//   This has to be done atomically with the modification for the merkle updater
	//   to maintain consistency. The merkle updater must then be notified with todo_notify.
	// - When an entry is updated to be a tombstone, add it to the gc_todo tree

	pub(crate) fn update_many<T: Borrow<ByteBuf>>(&self, entries: &[T]) -> Result<(), Error> {
		for update_bytes in entries.iter() {
			self.update_entry(update_bytes.borrow().as_slice())?;
		}
		Ok(())
	}

	/// Decodes `update_bytes` and merges the resulting entry into the store.
	///
	/// If an entry already exists under the same key, the update is merged into
	/// it (CRDT merge); otherwise the decoded update is stored as is.
	pub(crate) fn update_entry(&self, update_bytes: &[u8]) -> Result<(), Error> {
		let update = self.decode_entry(update_bytes)?;

		self.update_entry_with(
			update.partition_key(),
			update.sort_key(),
			|_tx, existing| {
				let merged = match existing {
					None => update.clone(),
					Some(mut current) => {
						current.merge(&update);
						current
					}
				};
				Ok(merged)
			},
		)
		.map(|_| ())
	}

	/// Atomically applies `update_fn` to the entry stored under
	/// (`partition_key`, `sort_key`).
	///
	/// Inside a single DB transaction, the current entry (if any) is decoded and
	/// handed to `update_fn`, and the entry it returns is re-encoded. If the
	/// encoded bytes differ from the stored ones, the new bytes are written to
	/// the store, their hash is queued in `merkle_todo`, and the schema's
	/// `updated()` hook is called, all in that same transaction.
	///
	/// Returns `Ok(Some(new_entry))` when something was written, and `Ok(None)`
	/// when the stored bytes were already identical.
	///
	/// # Errors
	/// Fails if the stored entry cannot be decoded, the new entry cannot be
	/// encoded, `update_fn`, the `updated()` hook or the transaction fails, or
	/// the GC todo entry cannot be saved. Note that the GC todo entry is saved
	/// after the transaction call has returned, so a failure there does not undo
	/// the update.
	pub fn update_entry_with(
		&self,
		partition_key: &F::P,
		sort_key: &F::S,
		update_fn: impl Fn(&mut db::Transaction, Option<F::E>) -> db::TxOpResult<F::E>,
	) -> Result<Option<F::E>, Error> {
		let tree_key = self.tree_key(partition_key, sort_key);

		let changed = self.store.db().transaction(|mut tx| {
			// Load the current state and compute the new entry from it.
			let (old_entry, old_bytes, new_entry) = match tx.get(&self.store, &tree_key)? {
				Some(old_bytes) => {
					let old_entry = self.decode_entry(&old_bytes).map_err(db::TxError::Abort)?;
					let new_entry = update_fn(&mut tx, Some(old_entry.clone()))?;
					(Some(old_entry), Some(old_bytes), new_entry)
				}
				None => (None, None, update_fn(&mut tx, None)?),
			};

			// Changed can be true in two scenarios
			// Scenario 1: the actual represented value changed,
			//   so of course the messagepack encoding changed as well
			// Scenario 2: the value didn't change but due to a migration in the
			//   data format, the messagepack encoding changed. In this case,
			//   we also have to write the migrated value in the table and update
			//   the associated Merkle tree entry.
			let new_bytes = new_entry
				.encode()
				.map_err(Error::RmpEncode)
				.map_err(db::TxError::Abort)?;
			// A missing old value (`None`) always counts as a change.
			let changed = Some(&new_bytes[..]) != old_bytes.as_deref();
			drop(old_bytes);

			if changed {
				// The Merkle todo item and the store write go through the same
				// transaction, so they are committed (or dropped) together.
				let new_bytes_hash = blake2sum(&new_bytes);
				tx.insert(&self.merkle_todo, &tree_key, new_bytes_hash.as_slice())?;
				tx.insert(&self.store, &tree_key, new_bytes)?;

				self.instance
					.updated(&mut tx, old_entry.as_ref(), Some(&new_entry))?;

				Ok(Some((new_entry, new_bytes_hash)))
			} else {
				Ok(None)
			}
		})?;

		// Post-transaction bookkeeping: metrics, Merkle updater wake-up, GC queue.
		if let Some((new_entry, new_bytes_hash)) = changed {
			self.metrics.internal_update_counter.add(1);

			let is_tombstone = new_entry.is_tombstone();
			self.merkle_todo_notify.notify_one();
			if is_tombstone {
				// We are only responsible for GC'ing this item if we are the
				// "leader" of the partition, i.e. the first node in the
				// set of nodes that replicates this partition.
				// This avoids GC loops and does not change the termination properties
				// of the GC algorithm, as in all cases GC is suspended if
				// any node of the partition is unavailable.
				// The first 32 bytes of a tree key are the partition key hash
				// (see `tree_key`).
				let pk_hash = Hash::try_from(&tree_key[..32]).unwrap();
				let nodes = self.replication.write_nodes(&pk_hash);
				if nodes.first() == Some(&self.system.id) {
					GcTodoEntry::new(tree_key, new_bytes_hash).save(&self.gc_todo)?;
				}
			}

			Ok(Some(new_entry))
		} else {
			Ok(None)
		}
	}

	/// Deletes the entry stored under key `k`, but only if its stored bytes are
	/// exactly `v`.
	///
	/// On deletion, an empty value is queued in `merkle_todo` for `k` and the
	/// schema's `updated()` hook is called with the old entry, all in one
	/// transaction. Returns whether the entry was removed.
	pub(crate) fn delete_if_equal(self: &Arc<Self>, k: &[u8], v: &[u8]) -> Result<bool, Error> {
		let removed = self.store.db().transaction(|mut tx| {
			// Compare-and-delete: leave everything untouched unless the stored
			// bytes match the expected value.
			match tx.get(&self.store, k)? {
				Some(cur_v) if cur_v == v => (),
				_ => return Ok(false),
			}

			let old_entry = self.decode_entry(v).map_err(db::TxError::Abort)?;

			tx.remove(&self.store, k)?;
			tx.insert(&self.merkle_todo, k, vec![])?;

			self.instance.updated(&mut tx, Some(&old_entry), None)?;
			Ok(true)
		})?;

		if !removed {
			return Ok(false);
		}

		self.metrics.internal_delete_counter.add(1);
		self.merkle_todo_notify.notify_one();
		Ok(true)
	}

	/// Deletes the entry stored under key `k`, but only if the `blake2sum` hash
	/// of its stored bytes equals `vhash`.
	///
	/// On deletion, an empty value is queued in `merkle_todo` for `k` and the
	/// schema's `updated()` hook is called with the old entry, all in one
	/// transaction. Returns whether the entry was removed.
	pub(crate) fn delete_if_equal_hash(
		self: &Arc<Self>,
		k: &[u8],
		vhash: Hash,
	) -> Result<bool, Error> {
		let removed = self.store.db().transaction(|mut tx| {
			// Keep the stored bytes only when their hash is the expected one;
			// any other situation leaves the store untouched.
			let cur_v = match tx.get(&self.store, k)? {
				Some(cur_v) if blake2sum(&cur_v[..]) == vhash => cur_v,
				_ => return Ok(false),
			};

			let old_entry = self.decode_entry(&cur_v[..]).map_err(db::TxError::Abort)?;

			tx.remove(&self.store, k)?;
			tx.insert(&self.merkle_todo, k, vec![])?;

			self.instance.updated(&mut tx, Some(&old_entry), None)?;
			Ok(true)
		})?;

		if !removed {
			return Ok(false);
		}

		self.metrics.internal_delete_counter.add(1);
		self.merkle_todo_notify.notify_one();
		Ok(true)
	}

	// ---- Insert queue functions ----

	/// Queues `ins` in the insert queue, as part of the caller's transaction `tx`.
	///
	/// If an entry is already queued under the same tree key, `ins` is merged
	/// into it; otherwise `ins` is queued as is. The insert queue notifier is
	/// triggered before returning.
	pub(crate) fn queue_insert(
		&self,
		tx: &mut db::Transaction,
		ins: &F::E,
	) -> db::TxResult<(), Error> {
		let tree_key = self.tree_key(ins.partition_key(), ins.sort_key());

		// Fold the new value into whatever is already queued under this key.
		let encoded = if let Some(queued) = tx.get(&self.insert_queue, &tree_key)? {
			let mut merged = self.decode_entry(&queued).map_err(db::TxError::Abort)?;
			merged.merge(ins);
			merged.encode()
		} else {
			ins.encode()
		};
		let payload = encoded
			.map_err(Error::RmpEncode)
			.map_err(db::TxError::Abort)?;

		tx.insert(&self.insert_queue, &tree_key, payload)?;
		self.insert_queue_notify.notify_one();

		Ok(())
	}

	// ---- Utility functions ----

	/// Builds the key under which an entry is stored: the hash of the partition
	/// key `p`, immediately followed by the bytes of the sort key `s`.
	pub fn tree_key(&self, p: &F::P, s: &F::S) -> Vec<u8> {
		let partition_hash = p.hash();
		let mut key = Vec::from(partition_hash.as_slice());
		key.extend(s.sort_key());
		key
	}

	/// Decodes `bytes` into an entry of this table.
	///
	/// # Errors
	/// When decoding fails, an error is logged, the raw bytes are hex-dumped at
	/// debug level, and an `Error::Message` naming the table is returned.
	pub fn decode_entry(&self, bytes: &[u8]) -> Result<F::E, Error> {
		if let Some(entry) = F::E::decode(bytes) {
			return Ok(entry);
		}

		error!("Unable to decode entry of {}", F::TABLE_NAME);
		for line in hexdump::hexdump_iter(bytes) {
			debug!("{}", line);
		}
		Err(Error::Message(format!(
			"Unable to decode entry of {}",
			F::TABLE_NAME
		)))
	}

	/// Returns the number of items currently in the GC todo tree.
	pub fn gc_todo_len(&self) -> Result<usize, Error> {
		let pending = self.gc_todo.len();
		Ok(pending)
	}
}