diff options
Diffstat (limited to 'src/bayou.rs')
-rw-r--r-- | src/bayou.rs | 514 |
1 files changed, 0 insertions, 514 deletions
diff --git a/src/bayou.rs b/src/bayou.rs deleted file mode 100644 index 9faff5a..0000000 --- a/src/bayou.rs +++ /dev/null @@ -1,514 +0,0 @@ -use std::sync::{Arc, Weak}; -use std::time::{Duration, Instant}; - -use anyhow::{anyhow, bail, Result}; -use log::error; -use rand::prelude::*; -use serde::{Deserialize, Serialize}; -use tokio::sync::{watch, Notify}; - -use crate::cryptoblob::*; -use crate::login::Credentials; -use crate::storage; -use crate::timestamp::*; - -const KEEP_STATE_EVERY: usize = 64; - -// Checkpointing interval constants: a checkpoint is not made earlier -// than CHECKPOINT_INTERVAL time after the last one, and is not made -// if there are less than CHECKPOINT_MIN_OPS new operations since last one. -const CHECKPOINT_INTERVAL: Duration = Duration::from_secs(6 * 3600); -const CHECKPOINT_MIN_OPS: usize = 16; -// HYPOTHESIS: processes are able to communicate in a synchronous -// fashion in times that are small compared to CHECKPOINT_INTERVAL. -// More precisely, if a process tried to save an operation within the last -// CHECKPOINT_INTERVAL, we are sure to read it from storage if it was -// successfully saved (and if we don't read it, it means it has been -// definitely discarded due to an error). - -// Keep at least two checkpoints, here three, to avoid race conditions -// between processes doing .checkpoint() and those doing .sync() -const CHECKPOINTS_TO_KEEP: usize = 3; - -const WATCH_SK: &str = "watch"; - -pub trait BayouState: - Default + Clone + Serialize + for<'de> Deserialize<'de> + Send + Sync + 'static -{ - type Op: Clone + Serialize + for<'de> Deserialize<'de> + std::fmt::Debug + Send + Sync + 'static; - - fn apply(&self, op: &Self::Op) -> Self; -} - -pub struct Bayou<S: BayouState> { - path: String, - key: Key, - - storage: storage::Store, - - checkpoint: (Timestamp, S), - history: Vec<(Timestamp, S::Op, Option<S>)>, - - last_sync: Option<Instant>, - last_try_checkpoint: Option<Instant>, - - watch: Arc<K2vWatch>, - last_sync_watch_ct: storage::RowRef, -} - -impl<S: BayouState> Bayou<S> { - pub async fn new(creds: &Credentials, path: String) -> Result<Self> { - let storage = creds.storage.build().await?; - - //let target = k2v_client.row(&path, WATCH_SK); - let target = storage::RowRef::new(&path, WATCH_SK); - let watch = K2vWatch::new(creds, target.clone()).await?; - - Ok(Self { - path, - storage, - key: creds.keys.master.clone(), - checkpoint: (Timestamp::zero(), S::default()), - history: vec![], - last_sync: None, - last_try_checkpoint: None, - watch, - last_sync_watch_ct: target, - }) - } - - /// Re-reads the state from persistent storage backend - pub async fn sync(&mut self) -> Result<()> { - let new_last_sync = Some(Instant::now()); - let new_last_sync_watch_ct = self.watch.rx.borrow().clone(); - - // 1. List checkpoints - let checkpoints = self.list_checkpoints().await?; - tracing::debug!("(sync) listed checkpoints: {:?}", checkpoints); - - // 2. Load last checkpoint if different from currently used one - let checkpoint = if let Some((ts, key)) = checkpoints.last() { - if *ts == self.checkpoint.0 { - (*ts, None) - } else { - tracing::debug!("(sync) loading checkpoint: {}", key); - - let buf = self - .storage - .blob_fetch(&storage::BlobRef(key.to_string())) - .await? - .value; - tracing::debug!("(sync) checkpoint body length: {}", buf.len()); - - let ck = open_deserialize::<S>(&buf, &self.key)?; - (*ts, Some(ck)) - } - } else { - (Timestamp::zero(), None) - }; - - if self.checkpoint.0 > checkpoint.0 { - bail!("Loaded checkpoint is more recent than stored one"); - } - - if let Some(ck) = checkpoint.1 { - tracing::debug!( - "(sync) updating checkpoint to loaded state at {:?}", - checkpoint.0 - ); - self.checkpoint = (checkpoint.0, ck); - }; - - // remove from history events before checkpoint - self.history = std::mem::take(&mut self.history) - .into_iter() - .skip_while(|(ts, _, _)| *ts < self.checkpoint.0) - .collect(); - - // 3. List all operations starting from checkpoint - let ts_ser = self.checkpoint.0.to_string(); - tracing::debug!("(sync) looking up operations starting at {}", ts_ser); - let ops_map = self - .storage - .row_fetch(&storage::Selector::Range { - shard: &self.path, - sort_begin: &ts_ser, - sort_end: WATCH_SK, - }) - .await?; - - let mut ops = vec![]; - for row_value in ops_map { - let row = row_value.row_ref; - let sort_key = row.uid.sort; - let ts = sort_key - .parse::<Timestamp>() - .map_err(|_| anyhow!("Invalid operation timestamp: {}", sort_key))?; - - let val = row_value.value; - if val.len() != 1 { - bail!("Invalid operation, has {} values", val.len()); - } - match &val[0] { - storage::Alternative::Value(v) => { - let op = open_deserialize::<S::Op>(v, &self.key)?; - tracing::trace!("(sync) operation {}: {:?}", sort_key, op); - ops.push((ts, op)); - } - storage::Alternative::Tombstone => { - continue; - } - } - } - ops.sort_by_key(|(ts, _)| *ts); - tracing::debug!("(sync) {} operations", ops.len()); - - if ops.len() < self.history.len() { - bail!("Some operations have disappeared from storage!"); - } - - // 4. Check that first operation has same timestamp as checkpoint (if not zero) - if self.checkpoint.0 != Timestamp::zero() && ops[0].0 != self.checkpoint.0 { - bail!( - "First operation in listing doesn't have timestamp that corresponds to checkpoint" - ); - } - - // 5. Apply all operations in order - // Hypothesis: before the loaded checkpoint, operations haven't changed - // between what's on storage and what we used to calculate the state in RAM here. - let i0 = self - .history - .iter() - .zip(ops.iter()) - .take_while(|((ts1, _, _), (ts2, _))| ts1 == ts2) - .count(); - - if ops.len() > i0 { - // Remove operations from first position where histories differ - self.history.truncate(i0); - - // Look up last calculated state which we have saved and start from there. - let mut last_state = (0, &self.checkpoint.1); - for (i, (_, _, state_opt)) in self.history.iter().enumerate().rev() { - if let Some(state) = state_opt { - last_state = (i + 1, state); - break; - } - } - - // Calculate state at the end of this common part of the history - let mut state = last_state.1.clone(); - for (_, op, _) in self.history[last_state.0..].iter() { - state = state.apply(op); - } - - // Now, apply all operations retrieved from storage after the common part - for (ts, op) in ops.drain(i0..) { - state = state.apply(&op); - if (self.history.len() + 1) % KEEP_STATE_EVERY == 0 { - self.history.push((ts, op, Some(state.clone()))); - } else { - self.history.push((ts, op, None)); - } - } - - // Always save final state as result of last operation - self.history.last_mut().unwrap().2 = Some(state); - } - - // Save info that sync has been done - self.last_sync = new_last_sync; - self.last_sync_watch_ct = new_last_sync_watch_ct; - Ok(()) - } - - /// Does a sync() if either of the two conditions is met: - /// - last sync was more than CHECKPOINT_INTERVAL/5 ago - /// - a change was detected - pub async fn opportunistic_sync(&mut self) -> Result<()> { - let too_old = match self.last_sync { - Some(t) => Instant::now() > t + (CHECKPOINT_INTERVAL / 5), - _ => true, - }; - let changed = self.last_sync_watch_ct != *self.watch.rx.borrow(); - if too_old || changed { - self.sync().await?; - } - Ok(()) - } - - pub fn notifier(&self) -> std::sync::Weak<Notify> { - Arc::downgrade(&self.watch.learnt_remote_update) - } - - /// Applies a new operation on the state. Once this function returns, - /// the operation has been safely persisted to storage backend. - /// Make sure to call `.opportunistic_sync()` before doing this, - /// and even before calculating the `op` argument given here. - pub async fn push(&mut self, op: S::Op) -> Result<()> { - tracing::debug!("(push) add operation: {:?}", op); - - let ts = Timestamp::after( - self.history - .last() - .map(|(ts, _, _)| ts) - .unwrap_or(&self.checkpoint.0), - ); - - let row_val = storage::RowVal::new( - storage::RowRef::new(&self.path, &ts.to_string()), - seal_serialize(&op, &self.key)?, - ); - self.storage.row_insert(vec![row_val]).await?; - self.watch.propagate_local_update.notify_one(); - - let new_state = self.state().apply(&op); - self.history.push((ts, op, Some(new_state))); - - // Clear previously saved state in history if not required - let hlen = self.history.len(); - if hlen >= 2 && (hlen - 1) % KEEP_STATE_EVERY != 0 { - self.history[hlen - 2].2 = None; - } - - self.checkpoint().await?; - - Ok(()) - } - - /// Save a new checkpoint if previous checkpoint is too old - pub async fn checkpoint(&mut self) -> Result<()> { - match self.last_try_checkpoint { - Some(ts) if Instant::now() - ts < CHECKPOINT_INTERVAL / 5 => Ok(()), - _ => { - let res = self.checkpoint_internal().await; - if res.is_ok() { - self.last_try_checkpoint = Some(Instant::now()); - } - res - } - } - } - - async fn checkpoint_internal(&mut self) -> Result<()> { - self.sync().await?; - - // Check what would be the possible time for a checkpoint in the history we have - let now = now_msec() as i128; - let i_cp = match self - .history - .iter() - .enumerate() - .rev() - .skip_while(|(_, (ts, _, _))| { - (now - ts.msec as i128) < CHECKPOINT_INTERVAL.as_millis() as i128 - }) - .map(|(i, _)| i) - .next() - { - Some(i) => i, - None => { - tracing::debug!("(cp) Oldest operation is too recent to trigger checkpoint"); - return Ok(()); - } - }; - - if i_cp < CHECKPOINT_MIN_OPS { - tracing::debug!("(cp) Not enough old operations to trigger checkpoint"); - return Ok(()); - } - - let ts_cp = self.history[i_cp].0; - tracing::debug!( - "(cp) we could checkpoint at time {} (index {} in history)", - ts_cp.to_string(), - i_cp - ); - - // Check existing checkpoints: if last one is too recent, don't checkpoint again. - let existing_checkpoints = self.list_checkpoints().await?; - tracing::debug!("(cp) listed checkpoints: {:?}", existing_checkpoints); - - if let Some(last_cp) = existing_checkpoints.last() { - if (ts_cp.msec as i128 - last_cp.0.msec as i128) - < CHECKPOINT_INTERVAL.as_millis() as i128 - { - tracing::debug!( - "(cp) last checkpoint is too recent: {}, not checkpointing", - last_cp.0.to_string() - ); - return Ok(()); - } - } - - tracing::debug!("(cp) saving checkpoint at {}", ts_cp.to_string()); - - // Calculate state at time of checkpoint - let mut last_known_state = (0, &self.checkpoint.1); - for (i, (_, _, st)) in self.history[..i_cp].iter().enumerate() { - if let Some(s) = st { - last_known_state = (i + 1, s); - } - } - let mut state_cp = last_known_state.1.clone(); - for (_, op, _) in self.history[last_known_state.0..i_cp].iter() { - state_cp = state_cp.apply(op); - } - - // Serialize and save checkpoint - let cryptoblob = seal_serialize(&state_cp, &self.key)?; - tracing::debug!("(cp) checkpoint body length: {}", cryptoblob.len()); - - let blob_val = storage::BlobVal::new( - storage::BlobRef(format!("{}/checkpoint/{}", self.path, ts_cp.to_string())), - cryptoblob.into(), - ); - self.storage.blob_insert(blob_val).await?; - - // Drop old checkpoints (but keep at least CHECKPOINTS_TO_KEEP of them) - let ecp_len = existing_checkpoints.len(); - if ecp_len + 1 > CHECKPOINTS_TO_KEEP { - let last_to_keep = ecp_len + 1 - CHECKPOINTS_TO_KEEP; - - // Delete blobs - for (_ts, key) in existing_checkpoints[..last_to_keep].iter() { - tracing::debug!("(cp) drop old checkpoint {}", key); - self.storage - .blob_rm(&storage::BlobRef(key.to_string())) - .await?; - } - - // Delete corresponding range of operations - let ts_ser = existing_checkpoints[last_to_keep].0.to_string(); - self.storage - .row_rm(&storage::Selector::Range { - shard: &self.path, - sort_begin: "", - sort_end: &ts_ser, - }) - .await? - } - - Ok(()) - } - - pub fn state(&self) -> &S { - if let Some(last) = self.history.last() { - last.2.as_ref().unwrap() - } else { - &self.checkpoint.1 - } - } - - // ---- INTERNAL ---- - - async fn list_checkpoints(&self) -> Result<Vec<(Timestamp, String)>> { - let prefix = format!("{}/checkpoint/", self.path); - - let checkpoints_res = self.storage.blob_list(&prefix).await?; - - let mut checkpoints = vec![]; - for object in checkpoints_res { - let key = object.0; - if let Some(ckid) = key.strip_prefix(&prefix) { - if let Ok(ts) = ckid.parse::<Timestamp>() { - checkpoints.push((ts, key.into())); - } - } - } - checkpoints.sort_by_key(|(ts, _)| *ts); - Ok(checkpoints) - } -} - -// ---- Bayou watch in K2V ---- - -struct K2vWatch { - target: storage::RowRef, - rx: watch::Receiver<storage::RowRef>, - propagate_local_update: Notify, - learnt_remote_update: Arc<Notify>, -} - -impl K2vWatch { - /// Creates a new watch and launches subordinate threads. - /// These threads hold Weak pointers to the struct; - /// they exit when the Arc is dropped. - async fn new(creds: &Credentials, target: storage::RowRef) -> Result<Arc<Self>> { - let storage = creds.storage.build().await?; - - let (tx, rx) = watch::channel::<storage::RowRef>(target.clone()); - let propagate_local_update = Notify::new(); - let learnt_remote_update = Arc::new(Notify::new()); - - let watch = Arc::new(K2vWatch { - target, - rx, - propagate_local_update, - learnt_remote_update, - }); - - tokio::spawn(Self::background_task(Arc::downgrade(&watch), storage, tx)); - - Ok(watch) - } - - async fn background_task( - self_weak: Weak<Self>, - storage: storage::Store, - tx: watch::Sender<storage::RowRef>, - ) { - let (mut row, remote_update) = match Weak::upgrade(&self_weak) { - Some(this) => (this.target.clone(), this.learnt_remote_update.clone()), - None => return, - }; - - while let Some(this) = Weak::upgrade(&self_weak) { - tracing::debug!( - "bayou k2v watch bg loop iter ({}, {})", - this.target.uid.shard, - this.target.uid.sort - ); - tokio::select!( - // Needed to exit: will force a loop iteration every minutes, - // that will stop the loop if other Arc references have been dropped - // and free resources. Otherwise we would be blocked waiting forever... - _ = tokio::time::sleep(Duration::from_secs(60)) => continue, - - // Watch if another instance has modified the log - update = storage.row_poll(&row) => { - match update { - Err(e) => { - error!("Error in bayou k2v wait value changed: {}", e); - tokio::time::sleep(Duration::from_secs(30)).await; - } - Ok(new_value) => { - row = new_value.row_ref; - if let Err(e) = tx.send(row.clone()) { - tracing::warn!(err=?e, "(watch) can't record the new log ref"); - break; - } - tracing::debug!(row=?row, "(watch) learnt remote update"); - this.learnt_remote_update.notify_waiters(); - } - } - } - - // It appears we have modified the log, informing other people - _ = this.propagate_local_update.notified() => { - let rand = u128::to_be_bytes(thread_rng().gen()).to_vec(); - let row_val = storage::RowVal::new(row.clone(), rand); - if let Err(e) = storage.row_insert(vec![row_val]).await - { - tracing::error!("Error in bayou k2v watch updater loop: {}", e); - tokio::time::sleep(Duration::from_secs(30)).await; - } - } - ); - } - // unblock listeners - remote_update.notify_waiters(); - tracing::info!("bayou k2v watch bg loop exiting"); - } -} |