use core::ops::Bound; use std::collections::{hash_map, BTreeMap, HashMap}; use std::marker::PhantomData; use std::sync::Arc; use async_trait::async_trait; use serde::{Deserialize, Serialize}; use tokio::sync::{mpsc, watch}; use garage_db as db; use garage_rpc::ring::Ring; use garage_rpc::system::System; use garage_util::background::*; use garage_util::data::*; use garage_util::error::*; use garage_util::time::*; use garage_table::crdt::*; use garage_table::replication::*; use garage_table::*; pub trait CountedItem: Clone + PartialEq + Send + Sync + 'static { const COUNTER_TABLE_NAME: &'static str; type CP: PartitionKey + Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync; type CS: SortKey + Clone + PartialEq + Serialize + for<'de> Deserialize<'de> + Send + Sync; fn counter_partition_key(&self) -> &Self::CP; fn counter_sort_key(&self) -> &Self::CS; fn counts(&self) -> Vec<(&'static str, i64)>; } /// A counter entry in the global table #[derive(Clone, PartialEq, Debug, Serialize, Deserialize)] pub struct CounterEntry<T: CountedItem> { pub pk: T::CP, pub sk: T::CS, pub values: BTreeMap<String, CounterValue>, } impl<T: CountedItem> Entry<T::CP, T::CS> for CounterEntry<T> { fn partition_key(&self) -> &T::CP { &self.pk } fn sort_key(&self) -> &T::CS { &self.sk } fn is_tombstone(&self) -> bool { self.values .iter() .all(|(_, v)| v.node_values.iter().all(|(_, (_, v))| *v == 0)) } } impl<T: CountedItem> CounterEntry<T> { pub fn filtered_values(&self, ring: &Ring) -> HashMap<String, i64> { let nodes = &ring.layout.node_id_vec[..]; self.filtered_values_with_nodes(nodes) } pub fn filtered_values_with_nodes(&self, nodes: &[Uuid]) -> HashMap<String, i64> { let mut ret = HashMap::new(); for (name, vals) in self.values.iter() { let new_vals = vals .node_values .iter() .filter(|(n, _)| nodes.contains(n)) .map(|(_, (_, v))| *v) .collect::<Vec<_>>(); if !new_vals.is_empty() { ret.insert( name.clone(), new_vals.iter().fold(i64::MIN, |a, b| std::cmp::max(a, *b)), ); } } ret } } /// A counter entry in the global table #[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] pub struct CounterValue { pub node_values: BTreeMap<Uuid, (u64, i64)>, } impl<T: CountedItem> Crdt for CounterEntry<T> { fn merge(&mut self, other: &Self) { for (name, e2) in other.values.iter() { if let Some(e) = self.values.get_mut(name) { e.merge(e2); } else { self.values.insert(name.clone(), e2.clone()); } } } } impl Crdt for CounterValue { fn merge(&mut self, other: &Self) { for (node, (t2, e2)) in other.node_values.iter() { if let Some((t, e)) = self.node_values.get_mut(node) { if t2 > t { *e = *e2; } } else { self.node_values.insert(*node, (*t2, *e2)); } } } } pub struct CounterTable<T: CountedItem> { _phantom_t: PhantomData<T>, } impl<T: CountedItem> TableSchema for CounterTable<T> { const TABLE_NAME: &'static str = T::COUNTER_TABLE_NAME; type P = T::CP; type S = T::CS; type E = CounterEntry<T>; type Filter = (DeletedFilter, Vec<Uuid>); fn matches_filter(entry: &Self::E, filter: &Self::Filter) -> bool { if filter.0 == DeletedFilter::Any { return true; } let is_tombstone = entry .filtered_values_with_nodes(&filter.1[..]) .iter() .all(|(_, v)| *v == 0); filter.0.apply(is_tombstone) } } // ---- pub struct IndexCounter<T: CountedItem> { this_node: Uuid, local_counter: db::Tree, propagate_tx: mpsc::UnboundedSender<(T::CP, T::CS, LocalCounterEntry<T>)>, pub table: Arc<Table<CounterTable<T>, TableShardedReplication>>, } impl<T: CountedItem> IndexCounter<T> { pub fn new( system: Arc<System>, replication: TableShardedReplication, db: &db::Db, ) -> Arc<Self> { let background = system.background.clone(); let (propagate_tx, propagate_rx) = mpsc::unbounded_channel(); let this = Arc::new(Self { this_node: system.id, local_counter: db .open_tree(format!("local_counter_v2:{}", T::COUNTER_TABLE_NAME)) .expect("Unable to open local counter tree"), propagate_tx, table: Table::new( CounterTable { _phantom_t: Default::default(), }, replication, system, db, ), }); background.spawn_worker(IndexPropagatorWorker { index_counter: this.clone(), propagate_rx, buf: HashMap::new(), errors: 0, }); this } pub fn count( &self, tx: &mut db::Transaction, old: Option<&T>, new: Option<&T>, ) -> db::TxResult<(), Error> { let pk = old .map(|e| e.counter_partition_key()) .unwrap_or_else(|| new.unwrap().counter_partition_key()); let sk = old .map(|e| e.counter_sort_key()) .unwrap_or_else(|| new.unwrap().counter_sort_key()); // calculate counter differences let mut counts = HashMap::new(); for (k, v) in old.map(|x| x.counts()).unwrap_or_default() { *counts.entry(k).or_insert(0) -= v; } for (k, v) in new.map(|x| x.counts()).unwrap_or_default() { *counts.entry(k).or_insert(0) += v; } // update local counter table let tree_key = self.table.data.tree_key(pk, sk); let mut entry = match tx.get(&self.local_counter, &tree_key[..])? { Some(old_bytes) => { rmp_serde::decode::from_read_ref::<_, LocalCounterEntry<T>>(&old_bytes) .map_err(Error::RmpDecode) .map_err(db::TxError::Abort)? } None => LocalCounterEntry { pk: pk.clone(), sk: sk.clone(), values: BTreeMap::new(), }, }; let now = now_msec(); for (s, inc) in counts.iter() { let mut ent = entry.values.entry(s.to_string()).or_insert((0, 0)); ent.0 = std::cmp::max(ent.0 + 1, now); ent.1 += *inc; } let new_entry_bytes = rmp_to_vec_all_named(&entry) .map_err(Error::RmpEncode) .map_err(db::TxError::Abort)?; tx.insert(&self.local_counter, &tree_key[..], new_entry_bytes)?; if let Err(e) = self.propagate_tx.send((pk.clone(), sk.clone(), entry)) { error!( "Could not propagate updated counter values, failed to send to channel: {}", e ); } Ok(()) } pub fn offline_recount_all<TS, TR>( &self, counted_table: &Arc<Table<TS, TR>>, ) -> Result<(), Error> where TS: TableSchema<E = T>, TR: TableReplication, { let save_counter_entry = |entry: CounterEntry<T>| -> Result<(), Error> { let entry_k = self .table .data .tree_key(entry.partition_key(), entry.sort_key()); self.table .data .update_entry_with(&entry_k, |ent| match ent { Some(mut ent) => { ent.merge(&entry); ent } None => entry.clone(), })?; Ok(()) }; // 1. Set all old local counters to zero let now = now_msec(); let mut next_start: Option<Vec<u8>> = None; loop { let low_bound = match next_start.take() { Some(v) => Bound::Excluded(v), None => Bound::Unbounded, }; let mut batch = vec![]; for item in self.local_counter.range((low_bound, Bound::Unbounded))? { batch.push(item?); if batch.len() > 1000 { break; } } if batch.is_empty() { break; } info!("zeroing old counters... ({})", hex::encode(&batch[0].0)); for (local_counter_k, local_counter) in batch { let mut local_counter = rmp_serde::decode::from_read_ref::<_, LocalCounterEntry<T>>(&local_counter)?; for (_, tv) in local_counter.values.iter_mut() { tv.0 = std::cmp::max(tv.0 + 1, now); tv.1 = 0; } let local_counter_bytes = rmp_to_vec_all_named(&local_counter)?; self.local_counter .insert(&local_counter_k, &local_counter_bytes)?; let counter_entry = local_counter.into_counter_entry(self.this_node); save_counter_entry(counter_entry)?; next_start = Some(local_counter_k); } } // 2. Recount all table entries let now = now_msec(); let mut next_start: Option<Vec<u8>> = None; loop { let low_bound = match next_start.take() { Some(v) => Bound::Excluded(v), None => Bound::Unbounded, }; let mut batch = vec![]; for item in counted_table .data .store .range((low_bound, Bound::Unbounded))? { batch.push(item?); if batch.len() > 1000 { break; } } if batch.is_empty() { break; } info!("counting entries... ({})", hex::encode(&batch[0].0)); for (counted_entry_k, counted_entry) in batch { let counted_entry = counted_table.data.decode_entry(&counted_entry)?; let pk = counted_entry.counter_partition_key(); let sk = counted_entry.counter_sort_key(); let counts = counted_entry.counts(); let local_counter_key = self.table.data.tree_key(pk, sk); let mut local_counter = match self.local_counter.get(&local_counter_key)? { Some(old_bytes) => { let ent = rmp_serde::decode::from_read_ref::<_, LocalCounterEntry<T>>( &old_bytes, )?; assert!(ent.pk == *pk); assert!(ent.sk == *sk); ent } None => LocalCounterEntry { pk: pk.clone(), sk: sk.clone(), values: BTreeMap::new(), }, }; for (s, v) in counts.iter() { let mut tv = local_counter.values.entry(s.to_string()).or_insert((0, 0)); tv.0 = std::cmp::max(tv.0 + 1, now); tv.1 += v; } let local_counter_bytes = rmp_to_vec_all_named(&local_counter)?; self.local_counter .insert(&local_counter_key, local_counter_bytes)?; let counter_entry = local_counter.into_counter_entry(self.this_node); save_counter_entry(counter_entry)?; next_start = Some(counted_entry_k); } } // Done Ok(()) } } struct IndexPropagatorWorker<T: CountedItem> { index_counter: Arc<IndexCounter<T>>, propagate_rx: mpsc::UnboundedReceiver<(T::CP, T::CS, LocalCounterEntry<T>)>, buf: HashMap<Vec<u8>, CounterEntry<T>>, errors: usize, } impl<T: CountedItem> IndexPropagatorWorker<T> { fn add_ent(&mut self, pk: T::CP, sk: T::CS, counters: LocalCounterEntry<T>) { let tree_key = self.index_counter.table.data.tree_key(&pk, &sk); let dist_entry = counters.into_counter_entry(self.index_counter.this_node); match self.buf.entry(tree_key) { hash_map::Entry::Vacant(e) => { e.insert(dist_entry); } hash_map::Entry::Occupied(mut e) => { e.get_mut().merge(&dist_entry); } } } } #[async_trait] impl<T: CountedItem> Worker for IndexPropagatorWorker<T> { fn name(&self) -> String { format!("{} index counter propagator", T::COUNTER_TABLE_NAME) } fn info(&self) -> Option<String> { if !self.buf.is_empty() { Some(format!("{} items in queue", self.buf.len())) } else { None } } async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> { // This loop batches updates to counters to be sent all at once. // They are sent once the propagate_rx channel has been emptied (or is closed). let closed = loop { match self.propagate_rx.try_recv() { Ok((pk, sk, counters)) => { self.add_ent(pk, sk, counters); } Err(mpsc::error::TryRecvError::Empty) => break false, Err(mpsc::error::TryRecvError::Disconnected) => break true, } }; if !self.buf.is_empty() { let entries_k = self.buf.keys().take(100).cloned().collect::<Vec<_>>(); let entries = entries_k.iter().map(|k| self.buf.get(k).unwrap()); if let Err(e) = self.index_counter.table.insert_many(entries).await { self.errors += 1; if self.errors >= 2 && *must_exit.borrow() { error!("({}) Could not propagate {} counter values: {}, these counters will not be updated correctly.", T::COUNTER_TABLE_NAME, self.buf.len(), e); return Ok(WorkerState::Done); } // Propagate error up to worker manager, it will log it, increment a counter, // and sleep for a certain delay (with exponential backoff), waiting for // things to go back to normal return Err(e); } else { for k in entries_k { self.buf.remove(&k); } self.errors = 0; } return Ok(WorkerState::Busy); } else if closed { return Ok(WorkerState::Done); } else { return Ok(WorkerState::Idle); } } async fn wait_for_work(&mut self, _must_exit: &watch::Receiver<bool>) -> WorkerState { match self.propagate_rx.recv().await { Some((pk, sk, counters)) => { self.add_ent(pk, sk, counters); WorkerState::Busy } None => match self.buf.is_empty() { false => WorkerState::Busy, true => WorkerState::Done, }, } } } #[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] struct LocalCounterEntry<T: CountedItem> { pk: T::CP, sk: T::CS, values: BTreeMap<String, (u64, i64)>, } impl<T: CountedItem> LocalCounterEntry<T> { fn into_counter_entry(self, this_node: Uuid) -> CounterEntry<T> { CounterEntry { pk: self.pk, sk: self.sk, values: self .values .into_iter() .map(|(name, (ts, v))| { let mut node_values = BTreeMap::new(); node_values.insert(this_node, (ts, v)); (name, CounterValue { node_values }) }) .collect(), } } }