aboutsummaryrefslogblamecommitdiff
path: root/src/model/k2v/rpc.rs
blob: 37e142f682d9b4c7740365b593b45018fd9a1888 (plain) (tree)
1
2
3
4
5
6
7
8
9





                                                                  
                                          
                          
                                        
                                   





                                      
                    

                          
                                



                                                                           
                                        
                                               
                              
                        
                       
 
                                                                    
                                          
 









                                              



                                         
                                          
                                              
















                                                                                             


                                                                                   
                                              
 





                                                
                            

                                                                              

                                                                                  



                                                                                          
                                                                               








































                                                                           

















































                                                                                          












                                                                              
                               




                                              
                                             











                                                                 















                                                                                                           









                                                                      
                                                                     
                                                                                  




                         



                                         
                                                                               
                                                         
                                                                                        
                                       
                                                            


                                             
                                                                                                   



                                                              



                                                                            
                  














































                                                                                                                                    













                                                                                                      
                                                                                    

                         
                                                            




                                                             

                                                                                     


                                                                                             










                                                                                             


                                                                                             
 

                                                                  









                                                                         



                                                                
                                     
                               
                                                                                       
                                                                                  


                                                                        





                                                                                     






                                                                                
                                                             



                                                                              

                          
                                                                                                       



















                                                                                   




                                          
                                                  
                                                                                                          


























                                                                                                   
 




                                           
 

















                                                                                        
                             













                                                                                                     
                                                                                                                                        

                                                                                        









                                                                                                                                                           


                                                                   
//! Module that implements RPCs specific to K2V.
//! This is necessary for insertions into the K2V store,
//! as they have to be transmitted to one of the nodes responsible
//! for storing the entry to be processed (the API entry
//! node does not process the entry directly, as this would
//! mean the vector clock gets much larger than needed).

use std::collections::{BTreeMap, HashMap};
use std::convert::TryInto;
use std::sync::{Arc, Mutex, MutexGuard};
use std::time::{Duration, Instant};

use async_trait::async_trait;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use serde::{Deserialize, Serialize};
use tokio::select;

use garage_db as db;

use garage_util::crdt::*;
use garage_util::data::*;
use garage_util::error::*;
use garage_util::time::now_msec;

use garage_rpc::system::System;
use garage_rpc::*;

use garage_table::replication::{TableReplication, TableShardedReplication};
use garage_table::{PartitionKey, Table};

use crate::helper::error::Error as HelperError;
use crate::k2v::causality::*;
use crate::k2v::item_table::*;
use crate::k2v::seen::*;
use crate::k2v::sub::*;

const POLL_RANGE_EXTRA_DELAY: Duration = Duration::from_millis(200);

const TIMESTAMP_KEY: &[u8] = b"timestamp";

/// RPC messages for K2V
#[derive(Debug, Serialize, Deserialize)]
enum K2VRpc {
	Ok,
	InsertItem(InsertedItem),
	InsertManyItems(Vec<InsertedItem>),
	PollItem {
		key: PollKey,
		causal_context: CausalContext,
		timeout_msec: u64,
	},
	PollRange {
		range: PollRange,
		seen_str: Option<String>,
		timeout_msec: u64,
	},
	PollItemResponse(Option<K2VItem>),
	PollRangeResponse(Uuid, Vec<K2VItem>),
}

#[derive(Debug, Serialize, Deserialize)]
struct InsertedItem {
	partition: K2VItemPartition,
	sort_key: String,
	causal_context: Option<CausalContext>,
	value: DvvsValue,
}

impl Rpc for K2VRpc {
	type Response = Result<K2VRpc, Error>;
}

/// The block manager, handling block exchange between nodes, and block storage on local node
pub struct K2VRpcHandler {
	system: Arc<System>,
	item_table: Arc<Table<K2VItemTable, TableShardedReplication>>,

	// Using a mutex on the local_timestamp_tree is not strictly necessary,
	// but it helps to not try to do several inserts at the same time,
	// which would create transaction conflicts and force many useless retries.
	local_timestamp_tree: Mutex<db::Tree>,

	endpoint: Arc<Endpoint<K2VRpc, Self>>,
	subscriptions: Arc<SubscriptionManager>,
}

impl K2VRpcHandler {
	pub fn new(
		system: Arc<System>,
		db: &db::Db,
		item_table: Arc<Table<K2VItemTable, TableShardedReplication>>,
		subscriptions: Arc<SubscriptionManager>,
	) -> Arc<Self> {
		let local_timestamp_tree = db
			.open_tree("k2v_local_timestamp")
			.expect("Unable to open DB tree for k2v local timestamp");
		let endpoint = system.netapp.endpoint("garage_model/k2v/Rpc".to_string());

		let rpc_handler = Arc::new(Self {
			system,
			item_table,
			local_timestamp_tree: Mutex::new(local_timestamp_tree),
			endpoint,
			subscriptions,
		});
		rpc_handler.endpoint.set_handler(rpc_handler.clone());

		rpc_handler
	}

	// ---- public interface ----

	pub async fn insert(
		&self,
		bucket_id: Uuid,
		partition_key: String,
		sort_key: String,
		causal_context: Option<CausalContext>,
		value: DvvsValue,
	) -> Result<(), Error> {
		let partition = K2VItemPartition {
			bucket_id,
			partition_key,
		};
		let mut who = self
			.item_table
			.data
			.replication
			.write_nodes(&partition.hash());
		who.sort();

		self.system
			.rpc
			.try_call_many(
				&self.endpoint,
				&who[..],
				K2VRpc::InsertItem(InsertedItem {
					partition,
					sort_key,
					causal_context,
					value,
				}),
				RequestStrategy::with_priority(PRIO_NORMAL)
					.with_quorum(1)
					.interrupt_after_quorum(true),
			)
			.await?;

		Ok(())
	}

	pub async fn insert_batch(
		&self,
		bucket_id: Uuid,
		items: Vec<(String, String, Option<CausalContext>, DvvsValue)>,
	) -> Result<(), Error> {
		let n_items = items.len();

		let mut call_list: HashMap<_, Vec<_>> = HashMap::new();

		for (partition_key, sort_key, causal_context, value) in items {
			let partition = K2VItemPartition {
				bucket_id,
				partition_key,
			};
			let mut who = self
				.item_table
				.data
				.replication
				.write_nodes(&partition.hash());
			who.sort();

			call_list.entry(who).or_default().push(InsertedItem {
				partition,
				sort_key,
				causal_context,
				value,
			});
		}

		debug!(
			"K2V insert_batch: {} requests to insert {} items",
			call_list.len(),
			n_items
		);
		let call_futures = call_list.into_iter().map(|(nodes, items)| async move {
			let resp = self
				.system
				.rpc
				.try_call_many(
					&self.endpoint,
					&nodes[..],
					K2VRpc::InsertManyItems(items),
					RequestStrategy::with_priority(PRIO_NORMAL)
						.with_quorum(1)
						.interrupt_after_quorum(true),
				)
				.await?;
			Ok::<_, Error>((nodes, resp))
		});

		let mut resps = call_futures.collect::<FuturesUnordered<_>>();
		while let Some(resp) = resps.next().await {
			resp?;
		}

		Ok(())
	}

	pub async fn poll_item(
		&self,
		bucket_id: Uuid,
		partition_key: String,
		sort_key: String,
		causal_context: CausalContext,
		timeout_msec: u64,
	) -> Result<Option<K2VItem>, Error> {
		let poll_key = PollKey {
			partition: K2VItemPartition {
				bucket_id,
				partition_key,
			},
			sort_key,
		};
		let nodes = self
			.item_table
			.data
			.replication
			.write_nodes(&poll_key.partition.hash());

		let rpc = self.system.rpc.try_call_many(
			&self.endpoint,
			&nodes[..],
			K2VRpc::PollItem {
				key: poll_key,
				causal_context,
				timeout_msec,
			},
			RequestStrategy::with_priority(PRIO_NORMAL)
				.with_quorum(self.item_table.data.replication.read_quorum())
				.without_timeout(),
		);
		let timeout_duration = Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout();
		let resps = select! {
			r = rpc => r?,
			_ = tokio::time::sleep(timeout_duration) => return Ok(None),
		};

		let mut resp: Option<K2VItem> = None;
		for v in resps {
			match v {
				K2VRpc::PollItemResponse(Some(x)) => {
					if let Some(y) = &mut resp {
						y.merge(&x);
					} else {
						resp = Some(x);
					}
				}
				K2VRpc::PollItemResponse(None) => (),
				v => return Err(Error::unexpected_rpc_message(v)),
			}
		}

		Ok(resp)
	}

	pub async fn poll_range(
		&self,
		range: PollRange,
		seen_str: Option<String>,
		timeout_msec: u64,
	) -> Result<Option<(BTreeMap<String, K2VItem>, String)>, HelperError> {
		let has_seen_marker = seen_str.is_some();

		// Parse seen marker, we will use it below. This is also the first check
		// that it is valid, which returns a bad request error if not.
		let mut seen = seen_str
			.as_deref()
			.map(RangeSeenMarker::decode_helper)
			.transpose()?
			.unwrap_or_default();
		seen.restrict(&range);

		// Prepare PollRange RPC to send to the storage nodes responsible for the parititon
		let nodes = self
			.item_table
			.data
			.replication
			.write_nodes(&range.partition.hash());
		let quorum = self.item_table.data.replication.read_quorum();
		let msg = K2VRpc::PollRange {
			range,
			seen_str,
			timeout_msec,
		};

		// Send the request to all nodes, use FuturesUnordered to get the responses in any order
		let msg = msg.into_req().map_err(netapp::error::Error::from)?;
		let rs = RequestStrategy::with_priority(PRIO_NORMAL).without_timeout();
		let mut requests = nodes
			.iter()
			.map(|node| self.system.rpc.call(&self.endpoint, *node, msg.clone(), rs))
			.collect::<FuturesUnordered<_>>();

		// Fetch responses. This procedure stops fetching responses when any of the following
		// conditions arise:
		// - we have a response to all requests
		// - we have a response to a read quorum of requests (e.g. 2/3), and an extra delay
		//   has passed since the quorum was achieved
		// - a global RPC timeout expired
		// The extra delay after a quorum was received is usefull if the third response was to
		// arrive during this short interval: this would allow us to consider all the data seen
		// by that last node in the response we produce, and would likely help reduce the
		// size of the seen marker that we will return (because we would have an info of the
		// kind: all items produced by that node until time ts have been returned, so we can
		// bump the entry in the global vector clock and possibly remove some item-specific
		// vector clocks)
		let mut deadline =
			Instant::now() + Duration::from_millis(timeout_msec) + self.system.rpc.rpc_timeout();
		let mut resps = vec![];
		let mut errors = vec![];
		loop {
			select! {
				_ =	tokio::time::sleep_until(deadline.into()) => {
					break;
				}
				res = requests.next() => match res {
					None => break,
					Some(Err(e)) => errors.push(e),
					Some(Ok(r)) => {
						resps.push(r);
						if resps.len() >= quorum {
							deadline = std::cmp::min(deadline, Instant::now() + POLL_RANGE_EXTRA_DELAY);
						}
					}
				}
			}
		}
		if errors.len() > nodes.len() - quorum {
			let errors = errors.iter().map(|e| format!("{}", e)).collect::<Vec<_>>();
			return Err(Error::Quorum(quorum, resps.len(), nodes.len(), errors).into());
		}

		// Take all returned items into account to produce the response.
		let mut new_items = BTreeMap::<String, K2VItem>::new();
		for v in resps {
			if let K2VRpc::PollRangeResponse(node, items) = v {
				seen.mark_seen_node_items(node, items.iter());
				for item in items.into_iter() {
					match new_items.get_mut(&item.sort_key) {
						Some(ent) => {
							ent.merge(&item);
						}
						None => {
							new_items.insert(item.sort_key.clone(), item);
						}
					}
				}
			} else {
				return Err(Error::unexpected_rpc_message(v).into());
			}
		}

		if new_items.is_empty() && has_seen_marker {
			Ok(None)
		} else {
			Ok(Some((new_items, seen.encode()?)))
		}
	}

	// ---- internal handlers ----

	async fn handle_insert(&self, item: &InsertedItem) -> Result<K2VRpc, Error> {
		let new = {
			let local_timestamp_tree = self.local_timestamp_tree.lock().unwrap();
			self.local_insert(&local_timestamp_tree, item)?
		};

		// Propagate to rest of network
		if let Some(updated) = new {
			self.item_table.insert(&updated).await?;
		}

		Ok(K2VRpc::Ok)
	}

	async fn handle_insert_many(&self, items: &[InsertedItem]) -> Result<K2VRpc, Error> {
		let mut updated_vec = vec![];

		{
			let local_timestamp_tree = self.local_timestamp_tree.lock().unwrap();
			for item in items {
				let new = self.local_insert(&local_timestamp_tree, item)?;

				if let Some(updated) = new {
					updated_vec.push(updated);
				}
			}
		}

		// Propagate to rest of network
		if !updated_vec.is_empty() {
			self.item_table.insert_many(&updated_vec).await?;
		}

		Ok(K2VRpc::Ok)
	}

	fn local_insert(
		&self,
		local_timestamp_tree: &MutexGuard<'_, db::Tree>,
		item: &InsertedItem,
	) -> Result<Option<K2VItem>, Error> {
		let now = now_msec();

		self.item_table
			.data
			.update_entry_with(&item.partition, &item.sort_key, |tx, ent| {
				let old_local_timestamp = tx
					.get(local_timestamp_tree, TIMESTAMP_KEY)?
					.and_then(|x| x.try_into().ok())
					.map(u64::from_be_bytes)
					.unwrap_or_default();

				let mut ent = ent.unwrap_or_else(|| {
					K2VItem::new(
						item.partition.bucket_id,
						item.partition.partition_key.clone(),
						item.sort_key.clone(),
					)
				});
				let new_local_timestamp = ent.update(
					self.system.id,
					&item.causal_context,
					item.value.clone(),
					std::cmp::max(old_local_timestamp, now),
				);

				tx.insert(
					local_timestamp_tree,
					TIMESTAMP_KEY,
					u64::to_be_bytes(new_local_timestamp),
				)?;

				Ok(ent)
			})
	}

	async fn handle_poll_item(&self, key: &PollKey, ct: &CausalContext) -> Result<K2VItem, Error> {
		let mut chan = self.subscriptions.subscribe_item(key);

		let mut value = self
			.item_table
			.data
			.read_entry(&key.partition, &key.sort_key)?
			.map(|bytes| self.item_table.data.decode_entry(&bytes[..]))
			.transpose()?
			.unwrap_or_else(|| {
				K2VItem::new(
					key.partition.bucket_id,
					key.partition.partition_key.clone(),
					key.sort_key.clone(),
				)
			});

		while !value.causal_context().is_newer_than(ct) {
			value = chan.recv().await?;
		}

		Ok(value)
	}

	async fn handle_poll_range(
		&self,
		range: &PollRange,
		seen_str: &Option<String>,
	) -> Result<Vec<K2VItem>, Error> {
		if let Some(seen_str) = seen_str {
			let seen = RangeSeenMarker::decode(seen_str).ok_or_message("Invalid seenMarker")?;

			// Subscribe now to all changes on that partition,
			// so that new items that are inserted while we are reading the range
			// will be seen in the loop below
			let mut chan = self.subscriptions.subscribe_partition(&range.partition);

			// Check for the presence of any new items already stored in the item table
			let mut new_items = self.poll_range_read_range(range, &seen)?;

			// If we found no new items, wait for a matching item to arrive
			// on the channel
			while new_items.is_empty() {
				let item = chan.recv().await?;
				if range.matches(&item) && seen.is_new_item(&item) {
					new_items.push(item);
				}
			}

			Ok(new_items)
		} else {
			// If no seen marker was specified, we do not poll for anything.
			// We return immediately with the set of known items (even if
			// it is empty), which will give the client an inital view of
			// the dataset and an initial seen marker for further
			// PollRange calls.
			self.poll_range_read_range(range, &RangeSeenMarker::default())
		}
	}

	fn poll_range_read_range(
		&self,
		range: &PollRange,
		seen: &RangeSeenMarker,
	) -> Result<Vec<K2VItem>, Error> {
		let mut new_items = vec![];

		let partition_hash = range.partition.hash();
		let first_key = match &range.start {
			None => partition_hash.to_vec(),
			Some(sk) => self.item_table.data.tree_key(&range.partition, sk),
		};
		for item in self.item_table.data.store.range(first_key..)? {
			let (key, value) = item?;
			if &key[..32] != partition_hash.as_slice() {
				break;
			}
			let item = self.item_table.data.decode_entry(&value)?;
			if !range.matches(&item) {
				break;
			}
			if seen.is_new_item(&item) {
				new_items.push(item);
			}
		}

		Ok(new_items)
	}
}

#[async_trait]
impl EndpointHandler<K2VRpc> for K2VRpcHandler {
	async fn handle(self: &Arc<Self>, message: &K2VRpc, _from: NodeID) -> Result<K2VRpc, Error> {
		match message {
			K2VRpc::InsertItem(item) => self.handle_insert(item).await,
			K2VRpc::InsertManyItems(items) => self.handle_insert_many(&items[..]).await,
			K2VRpc::PollItem {
				key,
				causal_context,
				timeout_msec,
			} => {
				let delay = tokio::time::sleep(Duration::from_millis(*timeout_msec));
				select! {
					ret = self.handle_poll_item(key, causal_context) => ret.map(Some).map(K2VRpc::PollItemResponse),
					_ = delay => Ok(K2VRpc::PollItemResponse(None)),
				}
			}
			K2VRpc::PollRange {
				range,
				seen_str,
				timeout_msec,
			} => {
				let delay = tokio::time::sleep(Duration::from_millis(*timeout_msec));
				select! {
					ret = self.handle_poll_range(range, seen_str) => ret.map(|items| K2VRpc::PollRangeResponse(self.system.id, items)),
					_ = delay => Ok(K2VRpc::PollRangeResponse(self.system.id, vec![])),
				}
			}
			m => Err(Error::unexpected_rpc_message(m)),
		}
	}
}