From 046b649bcc3b147140fc2b0af0e071d3dd1b2c8d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 11 Mar 2021 18:28:03 +0100 Subject: (not well tested) use merkle tree for sync --- src/table/sync.rs | 632 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 632 insertions(+) create mode 100644 src/table/sync.rs (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs new file mode 100644 index 00000000..9c37c286 --- /dev/null +++ b/src/table/sync.rs @@ -0,0 +1,632 @@ +use std::collections::VecDeque; +use std::convert::TryInto; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use futures::future::join_all; +use futures::{pin_mut, select}; +use futures_util::future::*; +use futures_util::stream::*; +use rand::Rng; +use serde::{Deserialize, Serialize}; +use serde_bytes::ByteBuf; +use tokio::sync::{mpsc, watch}; + +use garage_rpc::ring::Ring; +use garage_util::data::*; +use garage_util::error::Error; + +use crate::data::*; +use crate::merkle::*; +use crate::replication::*; +use crate::*; + +const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30); + +// Do anti-entropy every 10 minutes +const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60); + +pub struct TableSyncer { + data: Arc>, + aux: Arc>, + + todo: Mutex, +} + +type RootCk = Vec<(MerklePartition, Hash)>; + +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct PartitionRange { + begin: MerklePartition, + // if end is None, go all the way to partition 0xFFFF included + end: Option, +} + +#[derive(Serialize, Deserialize)] +pub(crate) enum SyncRPC { + RootCkHash(PartitionRange, Hash), + RootCkList(PartitionRange, RootCk), + CkNoDifference, + GetNode(MerkleNodeKey), + Node(MerkleNodeKey, MerkleNode), + Items(Vec>), +} + +struct SyncTodo { + todo: Vec, +} + +#[derive(Debug, Clone)] +struct TodoPartition { + range: PartitionRange, + + // Are we a node that stores this partition or not? + retain: bool, +} + +impl TableSyncer +where + F: TableSchema + 'static, + R: TableReplication + 'static, +{ + pub(crate) fn launch(data: Arc>, aux: Arc>) -> Arc { + let todo = SyncTodo { todo: vec![] }; + + let syncer = Arc::new(Self { + data: data.clone(), + aux: aux.clone(), + todo: Mutex::new(todo), + }); + + let (busy_tx, busy_rx) = mpsc::unbounded_channel(); + + let s1 = syncer.clone(); + aux.system.background.spawn_worker( + format!("table sync watcher for {}", data.name), + move |must_exit: watch::Receiver| s1.watcher_task(must_exit, busy_rx), + ); + + let s2 = syncer.clone(); + aux.system.background.spawn_worker( + format!("table syncer for {}", data.name), + move |must_exit: watch::Receiver| s2.syncer_task(must_exit, busy_tx), + ); + + let s3 = syncer.clone(); + tokio::spawn(async move { + tokio::time::delay_for(Duration::from_secs(20)).await; + s3.add_full_sync(); + }); + + syncer + } + + async fn watcher_task( + self: Arc, + mut must_exit: watch::Receiver, + mut busy_rx: mpsc::UnboundedReceiver, + ) -> Result<(), Error> { + let mut ring_recv: watch::Receiver> = self.aux.system.ring.clone(); + let mut nothing_to_do_since = Some(Instant::now()); + + while !*must_exit.borrow() { + let s_ring_recv = ring_recv.recv().fuse(); + let s_busy = busy_rx.recv().fuse(); + let s_must_exit = must_exit.recv().fuse(); + let s_timeout = tokio::time::delay_for(Duration::from_secs(1)).fuse(); + pin_mut!(s_ring_recv, s_busy, s_must_exit, s_timeout); + + select! 
{ + new_ring_r = s_ring_recv => { + if new_ring_r.is_some() { + debug!("({}) Adding ring difference to syncer todo list", self.data.name); + self.add_full_sync(); + } + } + busy_opt = s_busy => { + if let Some(busy) = busy_opt { + if busy { + nothing_to_do_since = None; + } else { + if nothing_to_do_since.is_none() { + nothing_to_do_since = Some(Instant::now()); + } + } + } + } + must_exit_v = s_must_exit => { + if must_exit_v.unwrap_or(false) { + break; + } + } + _ = s_timeout => { + if nothing_to_do_since.map(|t| Instant::now() - t >= ANTI_ENTROPY_INTERVAL).unwrap_or(false) { + nothing_to_do_since = None; + debug!("({}) Adding full sync to syncer todo list", self.data.name); + self.add_full_sync(); + } + } + } + } + Ok(()) + } + + pub fn add_full_sync(&self) { + self.todo + .lock() + .unwrap() + .add_full_sync(&self.data, &self.aux); + } + + async fn syncer_task( + self: Arc, + mut must_exit: watch::Receiver, + busy_tx: mpsc::UnboundedSender, + ) -> Result<(), Error> { + while !*must_exit.borrow() { + let task = self.todo.lock().unwrap().pop_task(); + if let Some(partition) = task { + busy_tx.send(true)?; + let res = self + .clone() + .sync_partition(&partition, &mut must_exit) + .await; + if let Err(e) = res { + warn!( + "({}) Error while syncing {:?}: {}", + self.data.name, partition, e + ); + } + } else { + busy_tx.send(false)?; + tokio::time::delay_for(Duration::from_secs(1)).await; + } + } + Ok(()) + } + + async fn sync_partition( + self: Arc, + partition: &TodoPartition, + must_exit: &mut watch::Receiver, + ) -> Result<(), Error> { + if partition.retain { + let my_id = self.aux.system.id; + + let nodes = self + .aux + .replication + .write_nodes( + &hash_of_merkle_partition(partition.range.begin), + &self.aux.system, + ) + .into_iter() + .filter(|node| *node != my_id) + .collect::>(); + + debug!( + "({}) Syncing {:?} with {:?}...", + self.data.name, partition, nodes + ); + let mut sync_futures = nodes + .iter() + .map(|node| { + self.clone() + .do_sync_with(partition.clone(), *node, must_exit.clone()) + }) + .collect::>(); + + let mut n_errors = 0; + while let Some(r) = sync_futures.next().await { + if let Err(e) = r { + n_errors += 1; + warn!("({}) Sync error: {}", self.data.name, e); + } + } + if n_errors > self.aux.replication.max_write_errors() { + return Err(Error::Message(format!( + "Sync failed with too many nodes (should have been: {:?}).", + nodes + ))); + } + } else { + self.offload_partition( + &hash_of_merkle_partition(partition.range.begin), + &hash_of_merkle_partition_opt(partition.range.end), + must_exit, + ) + .await?; + } + + Ok(()) + } + + // Offload partition: this partition is not something we are storing, + // so send it out to all other nodes that store it and delete items locally. + // We don't bother checking if the remote nodes already have the items, + // we just batch-send everything. Offloading isn't supposed to happen very often. + // If any of the nodes that are supposed to store the items is unable to + // save them, we interrupt the process. 
+ async fn offload_partition( + self: &Arc, + begin: &Hash, + end: &Hash, + must_exit: &mut watch::Receiver, + ) -> Result<(), Error> { + let mut counter: usize = 0; + + while !*must_exit.borrow() { + let mut items = Vec::new(); + + for item in self.data.store.range(begin.to_vec()..end.to_vec()) { + let (key, value) = item?; + items.push((key.to_vec(), Arc::new(ByteBuf::from(value.as_ref())))); + + if items.len() >= 1024 { + break; + } + } + + if items.len() > 0 { + let nodes = self + .aux + .replication + .write_nodes(&begin, &self.aux.system) + .into_iter() + .collect::>(); + if nodes.contains(&self.aux.system.id) { + warn!("Interrupting offload as partitions seem to have changed"); + break; + } + + counter += 1; + debug!( + "Offloading {} items from {:?}..{:?} ({})", + items.len(), + begin, + end, + counter + ); + self.offload_items(&items, &nodes[..]).await?; + } else { + break; + } + } + + Ok(()) + } + + async fn offload_items( + self: &Arc, + items: &Vec<(Vec, Arc)>, + nodes: &[UUID], + ) -> Result<(), Error> { + let values = items.iter().map(|(_k, v)| v.clone()).collect::>(); + let update_msg = Arc::new(TableRPC::::Update(values)); + + for res in join_all(nodes.iter().map(|to| { + self.aux + .rpc_client + .call_arc(*to, update_msg.clone(), TABLE_SYNC_RPC_TIMEOUT) + })) + .await + { + res?; + } + + // All remote nodes have written those items, now we can delete them locally + let mut not_removed = 0; + for (k, v) in items.iter() { + if !self.data.delete_if_equal(&k[..], &v[..])? { + not_removed += 1; + } + } + + if not_removed > 0 { + debug!("{} items not removed during offload because they changed in between (trying again...)", not_removed); + } + + Ok(()) + } + + // ======= SYNCHRONIZATION PROCEDURE -- DRIVER SIDE ====== + + fn get_root_ck(&self, range: PartitionRange) -> Result { + let begin = u16::from_be_bytes(range.begin); + let range_iter = match range.end { + Some(end) => { + let end = u16::from_be_bytes(end); + begin..=(end - 1) + } + None => begin..=0xFFFF, + }; + + let mut ret = vec![]; + for i in range_iter { + let key = MerkleNodeKey { + partition: u16::to_be_bytes(i), + prefix: vec![], + }; + match self.data.merkle_updater.read_node(&key)? 
{ + MerkleNode::Empty => (), + x => { + ret.push((key.partition, hash_of(&x)?)); + } + } + } + Ok(ret) + } + + async fn do_sync_with( + self: Arc, + partition: TodoPartition, + who: UUID, + must_exit: watch::Receiver, + ) -> Result<(), Error> { + let root_ck = self.get_root_ck(partition.range)?; + let root_ck_hash = hash_of(&root_ck)?; + + // If their root checksum has level > than us, use that as a reference + let root_resp = self + .aux + .rpc_client + .call( + who, + TableRPC::::SyncRPC(SyncRPC::RootCkHash(partition.range, root_ck_hash)), + TABLE_SYNC_RPC_TIMEOUT, + ) + .await?; + + let mut todo = match root_resp { + TableRPC::::SyncRPC(SyncRPC::CkNoDifference) => { + debug!( + "({}) Sync {:?} with {:?}: no difference", + self.data.name, partition, who + ); + return Ok(()); + } + TableRPC::::SyncRPC(SyncRPC::RootCkList(_, their_root_ck)) => { + let join = join_ordered(&root_ck[..], &their_root_ck[..]); + let mut todo = VecDeque::new(); + for (p, v1, v2) in join.iter() { + let diff = match (v1, v2) { + (Some(_), None) | (None, Some(_)) => true, + (Some(a), Some(b)) => a != b, + _ => false, + }; + if diff { + todo.push_back(MerkleNodeKey { + partition: **p, + prefix: vec![], + }); + } + } + debug!( + "({}) Sync {:?} with {:?}: todo.len() = {}", + self.data.name, + partition, + who, + todo.len() + ); + todo + } + x => { + return Err(Error::Message(format!( + "Invalid respone to RootCkHash RPC: {}", + debug_serialize(x) + ))); + } + }; + + let mut todo_items = vec![]; + + while !todo.is_empty() && !*must_exit.borrow() { + let key = todo.pop_front().unwrap(); + let node = self.data.merkle_updater.read_node(&key)?; + + match node { + MerkleNode::Empty => { + // They have items we don't have. + // We don't request those items from them, they will send them. + // We only bother with pushing items that differ + } + MerkleNode::Leaf(ik, _) => { + // Just send that item directly + if let Some(val) = self.data.store.get(ik)? { + todo_items.push(val.to_vec()); + } + } + MerkleNode::Intermediate(l) => { + let remote_node = match self + .aux + .rpc_client + .call( + who, + TableRPC::::SyncRPC(SyncRPC::GetNode(key.clone())), + TABLE_SYNC_RPC_TIMEOUT, + ) + .await? + { + TableRPC::::SyncRPC(SyncRPC::Node(_, node)) => node, + x => { + return Err(Error::Message(format!( + "Invalid respone to GetNode RPC: {}", + debug_serialize(x) + ))); + } + }; + let int_l2 = match remote_node { + MerkleNode::Intermediate(l2) => l2, + _ => vec![], + }; + + let join = join_ordered(&l[..], &int_l2[..]); + for (p, v1, v2) in join.into_iter() { + let diff = match (v1, v2) { + (Some(_), None) | (None, Some(_)) => true, + (Some(a), Some(b)) => a != b, + _ => false, + }; + if diff { + todo.push_back(key.add_byte(*p)); + } + } + } + } + + if todo_items.len() >= 256 { + self.send_items(who, std::mem::replace(&mut todo_items, vec![])) + .await?; + } + } + + if !todo_items.is_empty() { + self.send_items(who, todo_items).await?; + } + + Ok(()) + } + + async fn send_items(&self, who: UUID, item_list: Vec>) -> Result<(), Error> { + info!( + "({}) Sending {} items to {:?}", + self.data.name, + item_list.len(), + who + ); + + let mut values = vec![]; + for item in item_list.iter() { + if let Some(v) = self.data.store.get(&item[..])? 
{ + values.push(Arc::new(ByteBuf::from(v.as_ref()))); + } + } + let rpc_resp = self + .aux + .rpc_client + .call(who, TableRPC::::Update(values), TABLE_SYNC_RPC_TIMEOUT) + .await?; + if let TableRPC::::Ok = rpc_resp { + Ok(()) + } else { + Err(Error::Message(format!( + "Unexpected response to RPC Update: {}", + debug_serialize(&rpc_resp) + ))) + } + } + + // ======= SYNCHRONIZATION PROCEDURE -- RECEIVER SIDE ====== + + pub(crate) async fn handle_rpc(self: &Arc, message: &SyncRPC) -> Result { + match message { + SyncRPC::RootCkHash(range, h) => { + let root_ck = self.get_root_ck(*range)?; + let hash = hash_of(&root_ck)?; + if hash == *h { + Ok(SyncRPC::CkNoDifference) + } else { + Ok(SyncRPC::RootCkList(*range, root_ck)) + } + } + SyncRPC::GetNode(k) => { + let node = self.data.merkle_updater.read_node(&k)?; + Ok(SyncRPC::Node(k.clone(), node)) + } + _ => Err(Error::Message(format!("Unexpected sync RPC"))), + } + } +} + +impl SyncTodo { + fn add_full_sync( + &mut self, + data: &TableData, + aux: &TableAux, + ) { + let my_id = aux.system.id; + + self.todo.clear(); + + let ring = aux.system.ring.borrow().clone(); + let split_points = aux.replication.split_points(&ring); + + for i in 0..split_points.len() { + let begin: MerklePartition = { + let b = split_points[i]; + assert_eq!(b.as_slice()[2..], [0u8; 30][..]); + b.as_slice()[..2].try_into().unwrap() + }; + + let end: Option = if i + 1 < split_points.len() { + let e = split_points[i + 1]; + assert_eq!(e.as_slice()[2..], [0u8; 30][..]); + Some(e.as_slice()[..2].try_into().unwrap()) + } else { + None + }; + + let begin_hash = hash_of_merkle_partition(begin); + let end_hash = hash_of_merkle_partition_opt(end); + + let nodes = aux.replication.replication_nodes(&begin_hash, &ring); + + let retain = nodes.contains(&my_id); + if !retain { + // Check if we have some data to send, otherwise skip + if data.store.range(begin_hash..end_hash).next().is_none() { + continue; + } + } + + self.todo.push(TodoPartition { + range: PartitionRange { begin, end }, + retain, + }); + } + } + + fn pop_task(&mut self) -> Option { + if self.todo.is_empty() { + return None; + } + + let i = rand::thread_rng().gen_range::(0, self.todo.len()); + if i == self.todo.len() - 1 { + self.todo.pop() + } else { + let replacement = self.todo.pop().unwrap(); + let ret = std::mem::replace(&mut self.todo[i], replacement); + Some(ret) + } + } +} + +fn hash_of(x: &T) -> Result { + Ok(blake2sum(&rmp_to_vec_all_named(x)?[..])) +} + +fn join_ordered<'a, K: Ord + Eq, V1, V2>( + x: &'a [(K, V1)], + y: &'a [(K, V2)], +) -> Vec<(&'a K, Option<&'a V1>, Option<&'a V2>)> { + let mut ret = vec![]; + let mut i = 0; + let mut j = 0; + while i < x.len() || j < y.len() { + if i < x.len() && j < y.len() && x[i].0 == y[j].0 { + ret.push((&x[i].0, Some(&x[i].1), Some(&y[j].1))); + i += 1; + j += 1; + } else if i < x.len() && (j == y.len() || x[i].0 < y[j].0) { + ret.push((&x[i].0, Some(&x[i].1), None)); + i += 1; + } else if j < y.len() && (i == x.len() || x[i].0 > y[j].0) { + ret.push((&x[i].0, None, Some(&y[j].1))); + j += 1; + } else { + unreachable!(); + } + } + ret +} -- cgit v1.2.3 From db7a9d4948d41e4b641f9c50f6ff8921a436431d Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 11 Mar 2021 18:45:26 +0100 Subject: Tiny changes --- src/table/sync.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index 9c37c286..42321ac6 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -49,7 +49,6 @@ 
pub(crate) enum SyncRPC { CkNoDifference, GetNode(MerkleNodeKey), Node(MerkleNodeKey, MerkleNode), - Items(Vec>), } struct SyncTodo { @@ -119,7 +118,7 @@ where select! { new_ring_r = s_ring_recv => { if new_ring_r.is_some() { - debug!("({}) Adding ring difference to syncer todo list", self.data.name); + debug!("({}) Ring changed, adding full sync to syncer todo list", self.data.name); self.add_full_sync(); } } @@ -142,7 +141,7 @@ where _ = s_timeout => { if nothing_to_do_since.map(|t| Instant::now() - t >= ANTI_ENTROPY_INTERVAL).unwrap_or(false) { nothing_to_do_since = None; - debug!("({}) Adding full sync to syncer todo list", self.data.name); + debug!("({}) Interval passed, adding full sync to syncer todo list", self.data.name); self.add_full_sync(); } } @@ -330,6 +329,10 @@ where } // ======= SYNCHRONIZATION PROCEDURE -- DRIVER SIDE ====== + // The driver side is only concerned with sending out the item it has + // and the other side might not have. Receiving items that differ from one + // side to the other will happen when the other side syncs with us, + // which they also do regularly. fn get_root_ck(&self, range: PartitionRange) -> Result { let begin = u16::from_be_bytes(range.begin); -- cgit v1.2.3 From fae5104a2cf91206f995b183c5f217ea6729a551 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 11 Mar 2021 18:50:32 +0100 Subject: Add a nice warning --- src/table/sync.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index 42321ac6..68fc9fcb 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -433,9 +433,12 @@ where // We don't request those items from them, they will send them. // We only bother with pushing items that differ } - MerkleNode::Leaf(ik, _) => { + MerkleNode::Leaf(ik, ivhash) => { // Just send that item directly - if let Some(val) = self.data.store.get(ik)? { + if let Some(val) = self.data.store.get(&ik[..])? 
{ + if blake2sum(&val[..]) != ivhash { + warn!("Hashes differ between stored value and Merkle tree, key: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", ik); + } todo_items.push(val.to_vec()); } } -- cgit v1.2.3 From f7c2cd1cd7ee15b9c97b9fbdef25c0644b3523bb Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 11 Mar 2021 18:55:17 +0100 Subject: Add comment, and also whoops, this wasn't doing what we expected --- src/table/sync.rs | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index 68fc9fcb..b5044a4e 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -440,9 +440,13 @@ where warn!("Hashes differ between stored value and Merkle tree, key: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", ik); } todo_items.push(val.to_vec()); + } else { + warn!("Item from Merkle tree not found in store: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", ik); } } MerkleNode::Intermediate(l) => { + // Get Merkle node for this tree position at remote node + // and compare it with local node let remote_node = match self .aux .rpc_client @@ -462,7 +466,11 @@ where } }; let int_l2 = match remote_node { + // If they have an intermediate node at this tree position, + // we can compare them to find differences MerkleNode::Intermediate(l2) => l2, + // Otherwise, treat it as if they have nothing for this subtree, + // which will have the consequence of sending them everything _ => vec![], }; @@ -493,20 +501,18 @@ where Ok(()) } - async fn send_items(&self, who: UUID, item_list: Vec>) -> Result<(), Error> { + async fn send_items(&self, who: UUID, item_value_list: Vec>) -> Result<(), Error> { info!( "({}) Sending {} items to {:?}", self.data.name, - item_list.len(), + item_value_list.len(), who ); - let mut values = vec![]; - for item in item_list.iter() { - if let Some(v) = self.data.store.get(&item[..])? 
{ - values.push(Arc::new(ByteBuf::from(v.as_ref()))); - } - } + let values = item_value_list.into_iter() + .map(|x| Arc::new(ByteBuf::from(x))) + .collect::>(); + let rpc_resp = self .aux .rpc_client -- cgit v1.2.3 From 3f7a496355bdbeeeee859912fa6fa7a95cb47f3b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 11 Mar 2021 19:06:27 +0100 Subject: More security: don't delete stuff too easily --- src/table/sync.rs | 3 +++ 1 file changed, 3 insertions(+) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index b5044a4e..f8ebb2f0 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -277,6 +277,9 @@ where warn!("Interrupting offload as partitions seem to have changed"); break; } + if nodes.len() < self.aux.replication.write_quorum(&self.aux.system) { + return Err(Error::Message(format!("Not offloading as we don't have a quorum of nodes to write to."))); + } counter += 1; debug!( -- cgit v1.2.3 From 1ec49980ec876ef9395a9ae088f82d86a1a0d9f6 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Thu, 11 Mar 2021 19:30:24 +0100 Subject: whoops --- src/table/sync.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index f8ebb2f0..07d48155 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -370,6 +370,14 @@ where must_exit: watch::Receiver, ) -> Result<(), Error> { let root_ck = self.get_root_ck(partition.range)?; + if root_ck.is_empty() { + debug!( + "({}) Sync {:?} with {:?}: partition is empty.", + self.data.name, partition, who + ); + return Ok(()) + } + let root_ck_hash = hash_of(&root_ck)?; // If their root checksum has level > than us, use that as a reference @@ -637,7 +645,7 @@ fn join_ordered<'a, K: Ord + Eq, V1, V2>( ret.push((&x[i].0, Some(&x[i].1), None)); i += 1; } else if j < y.len() && (i == x.len() || x[i].0 > y[j].0) { - ret.push((&x[i].0, None, Some(&y[j].1))); + ret.push((&y[j].0, None, Some(&y[j].1))); j += 1; } else { unreachable!(); -- cgit v1.2.3 From 7fdaf7aef0c2aa8b38dbc7dac630f6f9baf8f0a4 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 12 Mar 2021 14:37:46 +0100 Subject: Fix merkle updater not being notified; improved logging --- src/table/sync.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index 07d48155..dbfa0a9f 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -274,7 +274,7 @@ where .into_iter() .collect::>(); if nodes.contains(&self.aux.system.id) { - warn!("Interrupting offload as partitions seem to have changed"); + warn!("({}) Interrupting offload as partitions seem to have changed", self.data.name); break; } if nodes.len() < self.aux.replication.write_quorum(&self.aux.system) { @@ -282,8 +282,9 @@ where } counter += 1; - debug!( - "Offloading {} items from {:?}..{:?} ({})", + info!( + "({}) Offloading {} items from {:?}..{:?} ({})", + self.data.name, items.len(), begin, end, @@ -325,7 +326,7 @@ where } if not_removed > 0 { - debug!("{} items not removed during offload because they changed in between (trying again...)", not_removed); + debug!("({}) {} items not removed during offload because they changed in between (trying again...)", self.data.name, not_removed); } Ok(()) @@ -448,11 +449,11 @@ where // Just send that item directly if let Some(val) = self.data.store.get(&ik[..])? 
{ if blake2sum(&val[..]) != ivhash { - warn!("Hashes differ between stored value and Merkle tree, key: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", ik); + warn!("({}) Hashes differ between stored value and Merkle tree, key: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", self.data.name, ik); } todo_items.push(val.to_vec()); } else { - warn!("Item from Merkle tree not found in store: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", ik); + warn!("({}) Item from Merkle tree not found in store: {:?} (if your server is very busy, don't worry, this happens when the Merkle tree can't be updated fast enough)", self.data.name, ik); } } MerkleNode::Intermediate(l) => { -- cgit v1.2.3 From 1fea257291bdbf447f9918274ebf73848afb3a0c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 12 Mar 2021 14:51:17 +0100 Subject: Don't sync at beginning --- src/table/sync.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index dbfa0a9f..049a16ae 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -105,6 +105,7 @@ where mut must_exit: watch::Receiver, mut busy_rx: mpsc::UnboundedReceiver, ) -> Result<(), Error> { + let mut prev_ring: Arc = self.aux.system.ring.borrow().clone(); let mut ring_recv: watch::Receiver> = self.aux.system.ring.clone(); let mut nothing_to_do_since = Some(Instant::now()); @@ -117,9 +118,12 @@ where select! { new_ring_r = s_ring_recv => { - if new_ring_r.is_some() { - debug!("({}) Ring changed, adding full sync to syncer todo list", self.data.name); - self.add_full_sync(); + if let Some(new_ring) = new_ring_r { + if !Arc::ptr_eq(&new_ring, &prev_ring) { + debug!("({}) Ring changed, adding full sync to syncer todo list", self.data.name); + self.add_full_sync(); + prev_ring = new_ring; + } } } busy_opt = s_busy => { -- cgit v1.2.3 From 8860aa19b867183b83ee48efd9990cd34e567f53 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 12 Mar 2021 15:05:26 +0100 Subject: Make syncer have its own rpc client/server --- src/table/sync.rs | 81 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 23 deletions(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index 049a16ae..23161d15 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -12,10 +12,13 @@ use serde::{Deserialize, Serialize}; use serde_bytes::ByteBuf; use tokio::sync::{mpsc, watch}; -use garage_rpc::ring::Ring; use garage_util::data::*; use garage_util::error::Error; +use garage_rpc::ring::Ring; +use garage_rpc::rpc_client::*; +use garage_rpc::rpc_server::*; + use crate::data::*; use crate::merkle::*; use crate::replication::*; @@ -31,6 +34,7 @@ pub struct TableSyncer { aux: Arc>, todo: Mutex, + rpc_client: Arc>, } type RootCk = Vec<(MerklePartition, Hash)>; @@ -49,8 +53,12 @@ pub(crate) enum SyncRPC { CkNoDifference, GetNode(MerkleNodeKey), Node(MerkleNodeKey, MerkleNode), + Items(Vec>), + Ok, } +impl RpcMessage for SyncRPC {} + struct SyncTodo { todo: Vec, } @@ -68,15 +76,25 @@ where F: TableSchema + 'static, R: TableReplication + 'static, { - pub(crate) fn launch(data: Arc>, aux: Arc>) -> Arc { + pub(crate) fn launch( + data: Arc>, + aux: Arc>, + rpc_server: &mut RpcServer, + ) -> Arc { + let rpc_path = format!("table_{}/sync", data.name); + let rpc_client = 
aux.system.rpc_client::(&rpc_path); + let todo = SyncTodo { todo: vec![] }; let syncer = Arc::new(Self { data: data.clone(), aux: aux.clone(), todo: Mutex::new(todo), + rpc_client, }); + syncer.register_handler(rpc_server, rpc_path); + let (busy_tx, busy_rx) = mpsc::unbounded_channel(); let s1 = syncer.clone(); @@ -100,6 +118,21 @@ where syncer } + fn register_handler(self: &Arc, rpc_server: &mut RpcServer, path: String) { + let self2 = self.clone(); + rpc_server.add_handler::(path, move |msg, _addr| { + let self2 = self2.clone(); + async move { self2.handle_rpc(&msg).await } + }); + + let self2 = self.clone(); + self.rpc_client + .set_local_handler(self.aux.system.id, move |msg| { + let self2 = self2.clone(); + async move { self2.handle_rpc(&msg).await } + }); + } + async fn watcher_task( self: Arc, mut must_exit: watch::Receiver, @@ -278,11 +311,16 @@ where .into_iter() .collect::>(); if nodes.contains(&self.aux.system.id) { - warn!("({}) Interrupting offload as partitions seem to have changed", self.data.name); + warn!( + "({}) Interrupting offload as partitions seem to have changed", + self.data.name + ); break; } if nodes.len() < self.aux.replication.write_quorum(&self.aux.system) { - return Err(Error::Message(format!("Not offloading as we don't have a quorum of nodes to write to."))); + return Err(Error::Message(format!( + "Not offloading as we don't have a quorum of nodes to write to." + ))); } counter += 1; @@ -309,11 +347,10 @@ where nodes: &[UUID], ) -> Result<(), Error> { let values = items.iter().map(|(_k, v)| v.clone()).collect::>(); - let update_msg = Arc::new(TableRPC::::Update(values)); + let update_msg = Arc::new(SyncRPC::Items(values)); for res in join_all(nodes.iter().map(|to| { - self.aux - .rpc_client + self.rpc_client .call_arc(*to, update_msg.clone(), TABLE_SYNC_RPC_TIMEOUT) })) .await @@ -380,31 +417,30 @@ where "({}) Sync {:?} with {:?}: partition is empty.", self.data.name, partition, who ); - return Ok(()) + return Ok(()); } let root_ck_hash = hash_of(&root_ck)?; // If their root checksum has level > than us, use that as a reference let root_resp = self - .aux .rpc_client .call( who, - TableRPC::::SyncRPC(SyncRPC::RootCkHash(partition.range, root_ck_hash)), + SyncRPC::RootCkHash(partition.range, root_ck_hash), TABLE_SYNC_RPC_TIMEOUT, ) .await?; let mut todo = match root_resp { - TableRPC::::SyncRPC(SyncRPC::CkNoDifference) => { + SyncRPC::CkNoDifference => { debug!( "({}) Sync {:?} with {:?}: no difference", self.data.name, partition, who ); return Ok(()); } - TableRPC::::SyncRPC(SyncRPC::RootCkList(_, their_root_ck)) => { + SyncRPC::RootCkList(_, their_root_ck) => { let join = join_ordered(&root_ck[..], &their_root_ck[..]); let mut todo = VecDeque::new(); for (p, v1, v2) in join.iter() { @@ -464,16 +500,11 @@ where // Get Merkle node for this tree position at remote node // and compare it with local node let remote_node = match self - .aux .rpc_client - .call( - who, - TableRPC::::SyncRPC(SyncRPC::GetNode(key.clone())), - TABLE_SYNC_RPC_TIMEOUT, - ) + .call(who, SyncRPC::GetNode(key.clone()), TABLE_SYNC_RPC_TIMEOUT) .await? 
{ - TableRPC::::SyncRPC(SyncRPC::Node(_, node)) => node, + SyncRPC::Node(_, node) => node, x => { return Err(Error::Message(format!( "Invalid respone to GetNode RPC: {}", @@ -525,16 +556,16 @@ where who ); - let values = item_value_list.into_iter() + let values = item_value_list + .into_iter() .map(|x| Arc::new(ByteBuf::from(x))) .collect::>(); let rpc_resp = self - .aux .rpc_client - .call(who, TableRPC::::Update(values), TABLE_SYNC_RPC_TIMEOUT) + .call(who, SyncRPC::Items(values), TABLE_SYNC_RPC_TIMEOUT) .await?; - if let TableRPC::::Ok = rpc_resp { + if let SyncRPC::Ok = rpc_resp { Ok(()) } else { Err(Error::Message(format!( @@ -561,6 +592,10 @@ where let node = self.data.merkle_updater.read_node(&k)?; Ok(SyncRPC::Node(k.clone(), node)) } + SyncRPC::Items(items) => { + self.data.update_many(items)?; + Ok(SyncRPC::Ok) + } _ => Err(Error::Message(format!("Unexpected sync RPC"))), } } -- cgit v1.2.3 From cbe7e1a66a9dceaaeae0467b4eefe51afd5b297c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 12 Mar 2021 15:07:23 +0100 Subject: Move table rpc client out of tableaux --- src/table/sync.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index 23161d15..4be8cd10 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -31,7 +31,7 @@ const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60); pub struct TableSyncer { data: Arc>, - aux: Arc>, + aux: Arc>, todo: Mutex, rpc_client: Arc>, @@ -78,7 +78,7 @@ where { pub(crate) fn launch( data: Arc>, - aux: Arc>, + aux: Arc>, rpc_server: &mut RpcServer, ) -> Arc { let rpc_path = format!("table_{}/sync", data.name); @@ -605,7 +605,7 @@ impl SyncTodo { fn add_full_sync( &mut self, data: &TableData, - aux: &TableAux, + aux: &TableAux, ) { let my_id = aux.system.id; -- cgit v1.2.3 From c475471e7a8e7544f2be490898f4249cf27a17e9 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 12 Mar 2021 19:57:37 +0100 Subject: Implement table gc, currently for block_ref and version only --- src/table/sync.rs | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index 4be8cd10..aae65852 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -3,7 +3,6 @@ use std::convert::TryInto; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; -use futures::future::join_all; use futures::{pin_mut, select}; use futures_util::future::*; use futures_util::stream::*; @@ -347,16 +346,11 @@ where nodes: &[UUID], ) -> Result<(), Error> { let values = items.iter().map(|(_k, v)| v.clone()).collect::>(); - let update_msg = Arc::new(SyncRPC::Items(values)); - - for res in join_all(nodes.iter().map(|to| { - self.rpc_client - .call_arc(*to, update_msg.clone(), TABLE_SYNC_RPC_TIMEOUT) - })) - .await - { - res?; - } + + self.rpc_client.try_call_many( + &nodes[..], + SyncRPC::Items(values), + RequestStrategy::with_quorum(nodes.len()).with_timeout(TABLE_SYNC_RPC_TIMEOUT)).await?; // All remote nodes have written those items, now we can delete them locally let mut not_removed = 0; @@ -577,7 +571,7 @@ where // ======= SYNCHRONIZATION PROCEDURE -- RECEIVER SIDE ====== - pub(crate) async fn handle_rpc(self: &Arc, message: &SyncRPC) -> Result { + async fn handle_rpc(self: &Arc, message: &SyncRPC) -> Result { match message { SyncRPC::RootCkHash(range, h) => { let root_ck = self.get_root_ck(*range)?; -- cgit v1.2.3 From 831eb35763fdaeecb7b6d6aa13ebd78da14db04e Mon Sep 17 
00:00:00 2001 From: Alex Auvolat Date: Fri, 12 Mar 2021 21:52:19 +0100 Subject: cargo fmt --- src/table/sync.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index aae65852..6c8792d2 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -347,10 +347,13 @@ where ) -> Result<(), Error> { let values = items.iter().map(|(_k, v)| v.clone()).collect::>(); - self.rpc_client.try_call_many( - &nodes[..], - SyncRPC::Items(values), - RequestStrategy::with_quorum(nodes.len()).with_timeout(TABLE_SYNC_RPC_TIMEOUT)).await?; + self.rpc_client + .try_call_many( + &nodes[..], + SyncRPC::Items(values), + RequestStrategy::with_quorum(nodes.len()).with_timeout(TABLE_SYNC_RPC_TIMEOUT), + ) + .await?; // All remote nodes have written those items, now we can delete them locally let mut not_removed = 0; -- cgit v1.2.3 From 4d4117f2b4eb69b63e2329f6e0b8929e6a8b5b31 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 15 Mar 2021 20:09:44 +0100 Subject: Refactor block resync loop; make workers infaillible --- src/table/sync.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index 6c8792d2..b344eb88 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -136,7 +136,7 @@ where self: Arc, mut must_exit: watch::Receiver, mut busy_rx: mpsc::UnboundedReceiver, - ) -> Result<(), Error> { + ) { let mut prev_ring: Arc = self.aux.system.ring.borrow().clone(); let mut ring_recv: watch::Receiver> = self.aux.system.ring.clone(); let mut nothing_to_do_since = Some(Instant::now()); @@ -183,7 +183,6 @@ where } } } - Ok(()) } pub fn add_full_sync(&self) { @@ -197,11 +196,11 @@ where self: Arc, mut must_exit: watch::Receiver, busy_tx: mpsc::UnboundedSender, - ) -> Result<(), Error> { + ) { while !*must_exit.borrow() { let task = self.todo.lock().unwrap().pop_task(); if let Some(partition) = task { - busy_tx.send(true)?; + busy_tx.send(true).unwrap(); let res = self .clone() .sync_partition(&partition, &mut must_exit) @@ -213,11 +212,10 @@ where ); } } else { - busy_tx.send(false)?; + busy_tx.send(false).unwrap(); tokio::time::delay_for(Duration::from_secs(1)).await; } } - Ok(()) } async fn sync_partition( -- cgit v1.2.3 From 0cd5b2ae19965b8c1f3176afeb8f678c4d8366dd Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 15 Mar 2021 22:36:41 +0100 Subject: WIP migrate to tokio 1 --- src/table/sync.rs | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index b344eb88..65231cd5 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -3,7 +3,7 @@ use std::convert::TryInto; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; -use futures::{pin_mut, select}; +use futures::{select}; use futures_util::future::*; use futures_util::stream::*; use rand::Rng; @@ -110,7 +110,7 @@ where let s3 = syncer.clone(); tokio::spawn(async move { - tokio::time::delay_for(Duration::from_secs(20)).await; + tokio::time::sleep(Duration::from_secs(20)).await; s3.add_full_sync(); }); @@ -142,23 +142,16 @@ where let mut nothing_to_do_since = Some(Instant::now()); while !*must_exit.borrow() { - let s_ring_recv = ring_recv.recv().fuse(); - let s_busy = busy_rx.recv().fuse(); - let s_must_exit = must_exit.recv().fuse(); - let s_timeout = tokio::time::delay_for(Duration::from_secs(1)).fuse(); - pin_mut!(s_ring_recv, s_busy, 
s_must_exit, s_timeout); - select! { - new_ring_r = s_ring_recv => { - if let Some(new_ring) = new_ring_r { - if !Arc::ptr_eq(&new_ring, &prev_ring) { - debug!("({}) Ring changed, adding full sync to syncer todo list", self.data.name); - self.add_full_sync(); - prev_ring = new_ring; - } + _ = ring_recv.changed().fuse() => { + let new_ring = ring_recv.borrow(); + if !Arc::ptr_eq(&new_ring, &prev_ring) { + debug!("({}) Ring changed, adding full sync to syncer todo list", self.data.name); + self.add_full_sync(); + prev_ring = new_ring.clone(); } } - busy_opt = s_busy => { + busy_opt = busy_rx.recv().fuse() => { if let Some(busy) = busy_opt { if busy { nothing_to_do_since = None; @@ -169,12 +162,8 @@ where } } } - must_exit_v = s_must_exit => { - if must_exit_v.unwrap_or(false) { - break; - } - } - _ = s_timeout => { + _ = must_exit.changed().fuse() => (), + _ = tokio::time::sleep(Duration::from_secs(1)).fuse() => { if nothing_to_do_since.map(|t| Instant::now() - t >= ANTI_ENTROPY_INTERVAL).unwrap_or(false) { nothing_to_do_since = None; debug!("({}) Interval passed, adding full sync to syncer todo list", self.data.name); @@ -213,7 +202,7 @@ where } } else { busy_tx.send(false).unwrap(); - tokio::time::delay_for(Duration::from_secs(1)).await; + tokio::time::sleep(Duration::from_secs(1)).await; } } } -- cgit v1.2.3 From 6a8439fd1345ecae7414386f76dda7a03eb14df2 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 15 Mar 2021 23:14:12 +0100 Subject: Some improvements in background worker but we terminate late --- src/table/sync.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index 65231cd5..f8fef53c 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -3,7 +3,7 @@ use std::convert::TryInto; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; -use futures::{select}; +use futures::select; use futures_util::future::*; use futures_util::stream::*; use rand::Rng; -- cgit v1.2.3 From 1d9961e4118af0e26068e1d6c5c6c009a1292a88 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 16 Mar 2021 11:14:27 +0100 Subject: Simplify replication logic --- src/table/sync.rs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index f8fef53c..ac0305e2 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -218,10 +218,7 @@ where let nodes = self .aux .replication - .write_nodes( - &hash_of_merkle_partition(partition.range.begin), - &self.aux.system, - ) + .write_nodes(&hash_of_merkle_partition(partition.range.begin)) .into_iter() .filter(|node| *node != my_id) .collect::>(); @@ -293,7 +290,7 @@ where let nodes = self .aux .replication - .write_nodes(&begin, &self.aux.system) + .write_nodes(&begin) .into_iter() .collect::>(); if nodes.contains(&self.aux.system.id) { @@ -303,7 +300,7 @@ where ); break; } - if nodes.len() < self.aux.replication.write_quorum(&self.aux.system) { + if nodes.len() < self.aux.replication.write_quorum() { return Err(Error::Message(format!( "Not offloading as we don't have a quorum of nodes to write to." 
))); @@ -616,7 +613,7 @@ impl SyncTodo { let begin_hash = hash_of_merkle_partition(begin); let end_hash = hash_of_merkle_partition_opt(end); - let nodes = aux.replication.replication_nodes(&begin_hash, &ring); + let nodes = aux.replication.write_nodes(&begin_hash); let retain = nodes.contains(&my_id); if !retain { -- cgit v1.2.3 From 515029d026937d29395379c76188f509984b8ace Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 16 Mar 2021 11:43:58 +0100 Subject: Refactor code --- src/table/sync.rs | 58 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 27 deletions(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index ac0305e2..9c148393 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -14,6 +14,7 @@ use tokio::sync::{mpsc, watch}; use garage_util::data::*; use garage_util::error::Error; +use garage_rpc::membership::System; use garage_rpc::ring::Ring; use garage_rpc::rpc_client::*; use garage_rpc::rpc_server::*; @@ -29,8 +30,9 @@ const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30); const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60); pub struct TableSyncer { - data: Arc>, - aux: Arc>, + system: Arc, + data: Arc>, + merkle: Arc>, todo: Mutex, rpc_client: Arc>, @@ -76,18 +78,20 @@ where R: TableReplication + 'static, { pub(crate) fn launch( - data: Arc>, - aux: Arc>, + system: Arc, + data: Arc>, + merkle: Arc>, rpc_server: &mut RpcServer, ) -> Arc { let rpc_path = format!("table_{}/sync", data.name); - let rpc_client = aux.system.rpc_client::(&rpc_path); + let rpc_client = system.rpc_client::(&rpc_path); let todo = SyncTodo { todo: vec![] }; let syncer = Arc::new(Self { + system: system.clone(), data: data.clone(), - aux: aux.clone(), + merkle, todo: Mutex::new(todo), rpc_client, }); @@ -97,13 +101,13 @@ where let (busy_tx, busy_rx) = mpsc::unbounded_channel(); let s1 = syncer.clone(); - aux.system.background.spawn_worker( + system.background.spawn_worker( format!("table sync watcher for {}", data.name), move |must_exit: watch::Receiver| s1.watcher_task(must_exit, busy_rx), ); let s2 = syncer.clone(); - aux.system.background.spawn_worker( + system.background.spawn_worker( format!("table syncer for {}", data.name), move |must_exit: watch::Receiver| s2.syncer_task(must_exit, busy_tx), ); @@ -126,7 +130,7 @@ where let self2 = self.clone(); self.rpc_client - .set_local_handler(self.aux.system.id, move |msg| { + .set_local_handler(self.system.id, move |msg| { let self2 = self2.clone(); async move { self2.handle_rpc(&msg).await } }); @@ -137,8 +141,8 @@ where mut must_exit: watch::Receiver, mut busy_rx: mpsc::UnboundedReceiver, ) { - let mut prev_ring: Arc = self.aux.system.ring.borrow().clone(); - let mut ring_recv: watch::Receiver> = self.aux.system.ring.clone(); + let mut prev_ring: Arc = self.system.ring.borrow().clone(); + let mut ring_recv: watch::Receiver> = self.system.ring.clone(); let mut nothing_to_do_since = Some(Instant::now()); while !*must_exit.borrow() { @@ -178,7 +182,7 @@ where self.todo .lock() .unwrap() - .add_full_sync(&self.data, &self.aux); + .add_full_sync(&self.data, &self.system); } async fn syncer_task( @@ -213,10 +217,10 @@ where must_exit: &mut watch::Receiver, ) -> Result<(), Error> { if partition.retain { - let my_id = self.aux.system.id; + let my_id = self.system.id; let nodes = self - .aux + .data .replication .write_nodes(&hash_of_merkle_partition(partition.range.begin)) .into_iter() @@ -242,7 +246,7 @@ where warn!("({}) Sync error: {}", 
self.data.name, e); } } - if n_errors > self.aux.replication.max_write_errors() { + if n_errors > self.data.replication.max_write_errors() { return Err(Error::Message(format!( "Sync failed with too many nodes (should have been: {:?}).", nodes @@ -288,19 +292,19 @@ where if items.len() > 0 { let nodes = self - .aux + .data .replication .write_nodes(&begin) .into_iter() .collect::>(); - if nodes.contains(&self.aux.system.id) { + if nodes.contains(&self.system.id) { warn!( "({}) Interrupting offload as partitions seem to have changed", self.data.name ); break; } - if nodes.len() < self.aux.replication.write_quorum() { + if nodes.len() < self.data.replication.write_quorum() { return Err(Error::Message(format!( "Not offloading as we don't have a quorum of nodes to write to." ))); @@ -376,7 +380,7 @@ where partition: u16::to_be_bytes(i), prefix: vec![], }; - match self.data.merkle_updater.read_node(&key)? { + match self.merkle.read_node(&key)? { MerkleNode::Empty => (), x => { ret.push((key.partition, hash_of(&x)?)); @@ -458,7 +462,7 @@ where while !todo.is_empty() && !*must_exit.borrow() { let key = todo.pop_front().unwrap(); - let node = self.data.merkle_updater.read_node(&key)?; + let node = self.merkle.read_node(&key)?; match node { MerkleNode::Empty => { @@ -570,7 +574,7 @@ where } } SyncRPC::GetNode(k) => { - let node = self.data.merkle_updater.read_node(&k)?; + let node = self.merkle.read_node(&k)?; Ok(SyncRPC::Node(k.clone(), node)) } SyncRPC::Items(items) => { @@ -585,15 +589,15 @@ where impl SyncTodo { fn add_full_sync( &mut self, - data: &TableData, - aux: &TableAux, + data: &TableData, + system: &System, ) { - let my_id = aux.system.id; + let my_id = system.id; self.todo.clear(); - let ring = aux.system.ring.borrow().clone(); - let split_points = aux.replication.split_points(&ring); + let ring = system.ring.borrow().clone(); + let split_points = data.replication.split_points(&ring); for i in 0..split_points.len() { let begin: MerklePartition = { @@ -613,7 +617,7 @@ impl SyncTodo { let begin_hash = hash_of_merkle_partition(begin); let end_hash = hash_of_merkle_partition_opt(end); - let nodes = aux.replication.write_nodes(&begin_hash); + let nodes = data.replication.write_nodes(&begin_hash); let retain = nodes.contains(&my_id); if !retain { -- cgit v1.2.3 From 2a41b8238496dfeac5ee0f273445299cbd112ff6 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 16 Mar 2021 12:18:03 +0100 Subject: Simpler Merkle & sync --- src/table/sync.rs | 134 +++++++++++++++--------------------------------------- 1 file changed, 37 insertions(+), 97 deletions(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index 9c148393..f5c2ef33 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -1,5 +1,4 @@ use std::collections::VecDeque; -use std::convert::TryInto; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; @@ -15,7 +14,7 @@ use garage_util::data::*; use garage_util::error::Error; use garage_rpc::membership::System; -use garage_rpc::ring::Ring; +use garage_rpc::ring::*; use garage_rpc::rpc_client::*; use garage_rpc::rpc_server::*; @@ -38,20 +37,10 @@ pub struct TableSyncer { rpc_client: Arc>, } -type RootCk = Vec<(MerklePartition, Hash)>; - -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct PartitionRange { - begin: MerklePartition, - // if end is None, go all the way to partition 0xFFFF included - end: Option, -} - #[derive(Serialize, Deserialize)] pub(crate) enum SyncRPC { - RootCkHash(PartitionRange, Hash), - 
RootCkList(PartitionRange, RootCk), - CkNoDifference, + RootCkHash(Partition, Hash), + RootCkDifferent(bool), GetNode(MerkleNodeKey), Node(MerkleNodeKey, MerkleNode), Items(Vec>), @@ -66,7 +55,9 @@ struct SyncTodo { #[derive(Debug, Clone)] struct TodoPartition { - range: PartitionRange, + partition: Partition, + begin: Hash, + end: Hash, // Are we a node that stores this partition or not? retain: bool, @@ -222,7 +213,7 @@ where let nodes = self .data .replication - .write_nodes(&hash_of_merkle_partition(partition.range.begin)) + .write_nodes(&partition.begin) .into_iter() .filter(|node| *node != my_id) .collect::>(); @@ -254,8 +245,8 @@ where } } else { self.offload_partition( - &hash_of_merkle_partition(partition.range.begin), - &hash_of_merkle_partition_opt(partition.range.end), + &partition.begin, + &partition.end, must_exit, ) .await?; @@ -364,30 +355,13 @@ where // side to the other will happen when the other side syncs with us, // which they also do regularly. - fn get_root_ck(&self, range: PartitionRange) -> Result { - let begin = u16::from_be_bytes(range.begin); - let range_iter = match range.end { - Some(end) => { - let end = u16::from_be_bytes(end); - begin..=(end - 1) - } - None => begin..=0xFFFF, + fn get_root_ck(&self, partition: Partition) -> Result<(MerkleNodeKey, MerkleNode), Error> { + let key = MerkleNodeKey { + partition, + prefix: vec![], }; - - let mut ret = vec![]; - for i in range_iter { - let key = MerkleNodeKey { - partition: u16::to_be_bytes(i), - prefix: vec![], - }; - match self.merkle.read_node(&key)? { - MerkleNode::Empty => (), - x => { - ret.push((key.partition, hash_of(&x)?)); - } - } - } - Ok(ret) + let node = self.merkle.read_node(&key)?; + Ok((key, node)) } async fn do_sync_with( @@ -396,7 +370,7 @@ where who: UUID, must_exit: watch::Receiver, ) -> Result<(), Error> { - let root_ck = self.get_root_ck(partition.range)?; + let (root_ck_key, root_ck) = self.get_root_ck(partition.partition)?; if root_ck.is_empty() { debug!( "({}) Sync {:?} with {:?}: partition is empty.", @@ -404,51 +378,29 @@ where ); return Ok(()); } + let root_ck_hash = hash_of::(&root_ck)?; - let root_ck_hash = hash_of(&root_ck)?; - - // If their root checksum has level > than us, use that as a reference + // Check if they have the same root checksum + // If so, do nothing. 
let root_resp = self .rpc_client .call( who, - SyncRPC::RootCkHash(partition.range, root_ck_hash), + SyncRPC::RootCkHash(partition.partition, root_ck_hash), TABLE_SYNC_RPC_TIMEOUT, ) .await?; let mut todo = match root_resp { - SyncRPC::CkNoDifference => { + SyncRPC::RootCkDifferent(false) => { debug!( "({}) Sync {:?} with {:?}: no difference", self.data.name, partition, who ); return Ok(()); } - SyncRPC::RootCkList(_, their_root_ck) => { - let join = join_ordered(&root_ck[..], &their_root_ck[..]); - let mut todo = VecDeque::new(); - for (p, v1, v2) in join.iter() { - let diff = match (v1, v2) { - (Some(_), None) | (None, Some(_)) => true, - (Some(a), Some(b)) => a != b, - _ => false, - }; - if diff { - todo.push_back(MerkleNodeKey { - partition: **p, - prefix: vec![], - }); - } - } - debug!( - "({}) Sync {:?} with {:?}: todo.len() = {}", - self.data.name, - partition, - who, - todo.len() - ); - todo + SyncRPC::RootCkDifferent(true) => { + VecDeque::from(vec![root_ck_key]) } x => { return Err(Error::Message(format!( @@ -565,13 +517,9 @@ where async fn handle_rpc(self: &Arc, message: &SyncRPC) -> Result { match message { SyncRPC::RootCkHash(range, h) => { - let root_ck = self.get_root_ck(*range)?; - let hash = hash_of(&root_ck)?; - if hash == *h { - Ok(SyncRPC::CkNoDifference) - } else { - Ok(SyncRPC::RootCkList(*range, root_ck)) - } + let (_root_ck_key, root_ck) = self.get_root_ck(*range)?; + let hash = hash_of::(&root_ck)?; + Ok(SyncRPC::RootCkDifferent(hash != *h)) } SyncRPC::GetNode(k) => { let node = self.merkle.read_node(&k)?; @@ -596,39 +544,31 @@ impl SyncTodo { self.todo.clear(); - let ring = system.ring.borrow().clone(); - let split_points = data.replication.split_points(&ring); + let partitions = data.replication.partitions(); - for i in 0..split_points.len() { - let begin: MerklePartition = { - let b = split_points[i]; - assert_eq!(b.as_slice()[2..], [0u8; 30][..]); - b.as_slice()[..2].try_into().unwrap() - }; + for i in 0..partitions.len() { + let begin = partitions[i].1; - let end: Option = if i + 1 < split_points.len() { - let e = split_points[i + 1]; - assert_eq!(e.as_slice()[2..], [0u8; 30][..]); - Some(e.as_slice()[..2].try_into().unwrap()) + let end = if i + 1 < partitions.len() { + partitions[i+1].1 } else { - None + [0xFFu8; 32].into() }; - let begin_hash = hash_of_merkle_partition(begin); - let end_hash = hash_of_merkle_partition_opt(end); - - let nodes = data.replication.write_nodes(&begin_hash); + let nodes = data.replication.write_nodes(&begin); let retain = nodes.contains(&my_id); if !retain { // Check if we have some data to send, otherwise skip - if data.store.range(begin_hash..end_hash).next().is_none() { + if data.store.range(begin..end).next().is_none() { continue; } } self.todo.push(TodoPartition { - range: PartitionRange { begin, end }, + partition: partitions[i].0, + begin, + end, retain, }); } -- cgit v1.2.3 From f4346cc5f45839ace93d2d11ce6beea632fd8f2c Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 16 Mar 2021 15:58:40 +0100 Subject: Update dependencies --- src/table/sync.rs | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'src/table/sync.rs') diff --git a/src/table/sync.rs b/src/table/sync.rs index f5c2ef33..3130abe8 100644 --- a/src/table/sync.rs +++ b/src/table/sync.rs @@ -244,12 +244,8 @@ where ))); } } else { - self.offload_partition( - &partition.begin, - &partition.end, - must_exit, - ) - .await?; + self.offload_partition(&partition.begin, &partition.end, must_exit) + .await?; } Ok(()) @@ -399,9 +395,7 @@ 
where ); return Ok(()); } - SyncRPC::RootCkDifferent(true) => { - VecDeque::from(vec![root_ck_key]) - } + SyncRPC::RootCkDifferent(true) => VecDeque::from(vec![root_ck_key]), x => { return Err(Error::Message(format!( "Invalid respone to RootCkHash RPC: {}", @@ -550,7 +544,7 @@ impl SyncTodo { let begin = partitions[i].1; let end = if i + 1 < partitions.len() { - partitions[i+1].1 + partitions[i + 1].1 } else { [0xFFu8; 32].into() }; @@ -579,7 +573,7 @@ impl SyncTodo { return None; } - let i = rand::thread_rng().gen_range::(0, self.todo.len()); + let i = rand::thread_rng().gen_range(0..self.todo.len()); if i == self.todo.len() - 1 { self.todo.pop() } else { -- cgit v1.2.3
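
Read in order, the patches above converge on a fairly small driver-side procedure: hash the Merkle root of each partition, ask the peer whether its root hash differs (RootCkHash / RootCkDifferent), walk down only the subtrees whose hashes disagree (GetNode / Node), and push the leaf items found along the way (Items). The sketch below condenses that descent into one self-contained program. It is an illustration only, not the garage_table API: the MerkleNode layout is simplified, MerkleSide, items_to_send and MapTree are invented names for the example, and in the real code the remote read_node is a SyncRPC::GetNode round-trip while must_exit handling, value hashing and batching are omitted.

use std::collections::{HashMap, VecDeque};

type Hash = [u8; 32];

// A Merkle tree node, one tree per partition; nodes are addressed by a key prefix.
#[derive(Clone)]
enum MerkleNode {
    Empty,                         // nothing stored under this prefix
    Leaf(Vec<u8>, Hash),           // a single item: its key and the hash of its value
    Intermediate(Vec<(u8, Hash)>), // children: next key byte and subtree hash
}

// Read access to one side's tree (locally a DB read, remotely a GetNode RPC).
trait MerkleSide {
    fn read_node(&self, prefix: &[u8]) -> MerkleNode;
}

// Walk both trees top-down and collect the keys of items the local side should push:
// descend only into subtrees whose hashes differ between the two sides.
fn items_to_send(local: &impl MerkleSide, remote: &impl MerkleSide) -> Vec<Vec<u8>> {
    let mut todo: VecDeque<Vec<u8>> = VecDeque::from(vec![Vec::new()]);
    let mut to_send = Vec::new();
    while let Some(prefix) = todo.pop_front() {
        match local.read_node(&prefix) {
            // We have nothing here: the remote will push its own items
            // when it runs the same procedure against us.
            MerkleNode::Empty => {}
            // A single local item below this prefix: send it.
            MerkleNode::Leaf(key, _value_hash) => to_send.push(key),
            MerkleNode::Intermediate(children) => {
                let remote_children = match remote.read_node(&prefix) {
                    MerkleNode::Intermediate(c) => c,
                    // Remote has nothing comparable: treat every subtree as differing.
                    _ => Vec::new(),
                };
                for (byte, hash) in children {
                    let differs = remote_children
                        .iter()
                        .find(|(b, _)| *b == byte)
                        .map(|(_, h)| *h != hash)
                        .unwrap_or(true);
                    if differs {
                        let mut child = prefix.clone();
                        child.push(byte);
                        todo.push_back(child);
                    }
                }
            }
        }
    }
    to_send
}

// Toy in-memory tree used only to exercise the walk.
struct MapTree(HashMap<Vec<u8>, MerkleNode>);
impl MerkleSide for MapTree {
    fn read_node(&self, prefix: &[u8]) -> MerkleNode {
        self.0.get(prefix).cloned().unwrap_or(MerkleNode::Empty)
    }
}

fn main() {
    // Local side stores one item under the child byte 7; remote side is empty.
    let mut local = MapTree(HashMap::new());
    local.0.insert(vec![], MerkleNode::Intermediate(vec![(7u8, [1u8; 32])]));
    local.0.insert(vec![7], MerkleNode::Leaf(b"item-key".to_vec(), [2u8; 32]));
    let remote = MapTree(HashMap::new());
    // The walk finds that the remote lacks the subtree under byte 7
    // and reports the leaf key as something to push.
    assert_eq!(items_to_send(&local, &remote), vec![b"item-key".to_vec()]);
    println!("ok");
}

In the actual patches the same walk is driven by SyncRPC::GetNode / Node round-trips, pending items are shipped with SyncRPC::Items once 256 of them have accumulated, the receiver applies them through the regular table update path, and the offload case additionally refuses to run without a write quorum of target nodes and deletes local copies only after every target has acknowledged the batch.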