author     Alex Auvolat <alex@adnab.me>    2022-03-15 16:47:13 +0100
committer  Alex Auvolat <alex@adnab.me>    2022-03-23 10:25:39 +0100
commit     3dc9214172302f658fc6e69f1a008286ecde94b7 (patch)
tree       f59c97ea44ea61085cdde880a35b5b8c7404a912
parent     077dd1cde95debb41a6cd153407e17e48ceb71bf (diff)
download   garage-3dc9214172302f658fc6e69f1a008286ecde94b7.tar.gz
           garage-3dc9214172302f658fc6e69f1a008286ecde94b7.zip
Add lots of comments on how the resync queue works
(I don't really want to change/refactor that code though)
-rw-r--r--  src/block/manager.rs  101
1 file changed, 96 insertions, 5 deletions
diff --git a/src/block/manager.rs b/src/block/manager.rs
index 3215d27e..9cf72019 100644
--- a/src/block/manager.rs
+++ b/src/block/manager.rs
@@ -86,7 +86,7 @@ pub struct BlockManager {
mutation_lock: Mutex<BlockManagerLocked>,
- pub rc: BlockRc,
+ rc: BlockRc,
resync_queue: SledCountedTree,
resync_notify: Notify,
@@ -231,7 +231,10 @@ impl BlockManager {
// so that we can offload them if necessary and then delete them locally.
self.for_each_file(
(),
- move |_, hash| async move { self.put_to_resync(&hash, Duration::from_secs(0)) },
+ move |_, hash| async move {
+ self.put_to_resync(&hash, Duration::from_secs(0))
+ .map_err(Into::into)
+ },
must_exit,
)
.await
@@ -410,6 +413,77 @@ impl BlockManager {
// ---- Resync loop ----
+ // This part manages a queue of blocks that need to be
+ // "resynchronized", i.e. blocks for which we need to check
+ // that they are present if we need them, or that they are
+ // deleted once the garbage collection delay has passed.
+ //
+ // Here are some explanations on how the resync queue works.
+ // There are two Sled trees that are used to have information
+ // about the status of blocks that need to be resynchronized:
+ //
+ // - resync_queue: a tree that is ordered first by a timestamp
+ // (in milliseconds since Unix epoch) that is the time at which
+ // the resync must be done, and second by block hash.
+ // The key in this tree is just:
+ // concat(timestamp (8 bytes), hash (32 bytes))
+ // The value is the same 32-byte hash.
+ //
+ // - resync_errors: a tree that indicates, for each block,
+ //   whether the last resync resulted in an error, and if so,
+ //   the following two pieces of information (see the ErrorCounter struct):
+ //   - how many consecutive resync errors occurred for this block?
+ //   - when was the last attempt?
+ //   These two pieces of information are used to implement an
+ //   exponential backoff retry strategy.
+ //   The key in this tree is the 32-byte hash of the block,
+ //   and the value is the encoded ErrorCounter value.
+ //
+ // We need these two trees because the resync queue
+ // is not just a queue of items to process, but a set of items
+ // that are each waiting for a specific delay before we can process them
+ // (the delay being necessary both internally for the exponential
+ // backoff strategy, and exposed as a parameter when adding items
+ // to the queue, e.g. to wait until the GC delay has passed).
+ // This is why we need one tree ordered by time, and one
+ // ordered by the identifier of the item to be processed (the block hash).
+ //
+ // When the worker takes an item from resync_queue to process it,
+ // it first checks in resync_errors whether an exponential
+ // back-off delay applies to that block, and if so, whether that
+ // delay has already passed. If it has not, the item is skipped
+ // (but added back to the queue for later processing, once the
+ // delay has elapsed).
+ //
+ // An alternative that might have seemed natural is to
+ // only add items to resync_queue with a processing time that is
+ // after the back-off delay, but there are several issues with this:
+ // - It would require synchronizing updates to resync_queue and
+ //   resync_errors by putting them both behind a lock (with the
+ //   current model, only one thread, the worker thread, accesses
+ //   resync_errors, so no synchronization is needed).
+ //   This would mean that block_incref might need to take a lock
+ //   before doing its work, giving it a much higher chance of
+ //   not completing successfully if something bad happens to Garage.
+ //   Currently, Garage is not able to recover from a block_incref
+ //   that doesn't complete successfully, because consistency must be
+ //   ensured between the state of the block manager and the
+ //   information in the BlockRef table.
+ // - If a resync fails, we put that block in the resync_errors table,
+ //   and also add it back to resync_queue to be processed after
+ //   the exponential back-off delay.
+ //   However, the block might already be scheduled to be resynced
+ //   at another time that is before the exponential back-off delay
+ //   expires, and we have no easy way to check that. This means that
+ //   in all cases, we need to check the resync_errors table
+ //   in the resync loop anyway, at the moment a block is popped from
+ //   resync_queue.
+ // Overall, the current design is therefore simpler and more robust
+ // because it tolerates inconsistencies between the resync_queue
+ // and resync_errors trees (items being scheduled in resync_queue
+ // for times earlier than the exponential back-off delay expires
+ // is a natural condition that is handled properly).
+
fn spawn_background_worker(self: Arc<Self>) {
// Launch a background worker for background resync loop processing
let background = self.system.background.clone();
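The resync_errors values described in the comment above are encoded ErrorCounter entries, but the ErrorCounter struct itself does not appear in this diff. The following is only a minimal sketch of what such a per-block counter could look like; the field names and the exact back-off formula (a doubling delay, capped at one hour) are assumptions made for illustration, not Garage's actual code.

    // Hypothetical sketch of a per-block error counter with exponential
    // back-off; field names and the back-off formula are assumed here.
    #[derive(Clone, Copy)]
    struct ErrorCounter {
        errors: u64,   // consecutive failed resync attempts for this block
        last_try: u64, // time of the last attempt, in ms since the Unix epoch
    }

    impl ErrorCounter {
        // Record one more failure happening at time `now` (ms since epoch).
        fn add_error(self, now: u64) -> Self {
            Self {
                errors: self.errors + 1,
                last_try: now,
            }
        }

        // Earliest time (ms since epoch) at which a retry may happen:
        // the delay doubles with each consecutive error, capped at one hour.
        fn next_try(&self) -> u64 {
            let base_ms: u64 = 60_000; // one minute after the first error (assumed)
            let exp = self.errors.saturating_sub(1).min(10);
            self.last_try + (base_ms << exp).min(3_600_000)
        }
    }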
@@ -421,12 +495,12 @@ impl BlockManager {
});
}
- fn put_to_resync(&self, hash: &Hash, delay: Duration) -> Result<(), Error> {
+ fn put_to_resync(&self, hash: &Hash, delay: Duration) -> Result<(), sled::Error> {
let when = now_msec() + delay.as_millis() as u64;
self.put_to_resync_at(hash, when)
}
- fn put_to_resync_at(&self, hash: &Hash, when: u64) -> Result<(), Error> {
+ fn put_to_resync_at(&self, hash: &Hash, when: u64) -> Result<(), sled::Error> {
trace!("Put resync_queue: {} {:?}", when, hash);
let mut key = u64::to_be_bytes(when).to_vec();
key.extend(hash.as_ref());
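put_to_resync_at builds the queue key exactly as the comment describes: an 8-byte big-endian timestamp followed by the 32-byte block hash. Because the timestamp comes first and is big-endian, byte-wise lexicographic order of the keys coincides with chronological order, which is what lets the worker simply take the first entry of the Sled tree. A small self-contained illustration of that property (the helper name is made up for this sketch, it is not Garage's):

    // Illustrative only: build a resync_queue key from a scheduled time
    // (ms since the Unix epoch) and a 32-byte block hash.
    fn resync_key(when_msec: u64, hash: &[u8; 32]) -> Vec<u8> {
        let mut key = u64::to_be_bytes(when_msec).to_vec(); // 8 bytes, big-endian
        key.extend_from_slice(hash); // followed by the 32-byte hash
        key
    }

    fn main() {
        let hash = [0u8; 32];
        let earlier = resync_key(1_000, &hash);
        let later = resync_key(2_000, &hash);
        // Big-endian timestamps make byte order match chronological order,
        // so iterating the tree yields the earliest scheduled entry first.
        assert!(earlier < later);
    }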
@@ -461,7 +535,14 @@ impl BlockManager {
}
}
- async fn resync_iter(&self, must_exit: &mut watch::Receiver<bool>) -> Result<bool, Error> {
+ // The result of resync_iter is:
+ // - Ok(true) -> a block was processed (successfully or not)
+ // - Ok(false) -> no block was processed, but we are ready for the next iteration
+ // - Err(_) -> a Sled error occurred when reading/writing from resync_queue/resync_errors
+ async fn resync_iter(
+ &self,
+ must_exit: &mut watch::Receiver<bool>,
+ ) -> Result<bool, sled::Error> {
if let Some(first_pair_res) = self.resync_queue.iter().next() {
let (time_bytes, hash_bytes) = first_pair_res?;
@@ -480,6 +561,8 @@ impl BlockManager {
self.put_to_resync_at(&hash, ec.next_try())?;
// ec.next_try() > now >= time_msec, so this remove
// is not removing the one we added just above
+ // (we want to do the remove after the insert to ensure
+ // that the item is not lost if we crash in-between)
self.resync_queue.remove(time_bytes)?;
return Ok(false);
}
@@ -539,7 +622,15 @@ impl BlockManager {
Ok(false)
}
} else {
+ // Here we wait either for a notification that an item has been
+ // added to the queue, or for a constant delay of 10 seconds to expire.
+ // The delay avoids a race condition where the notification happens
+ // between the time we check the queue and the first poll of
+ // resync_notify.notified(): if that happens, we'll just loop
+ // back 10 seconds later, which is fine.
+ let delay = tokio::time::sleep(Duration::from_secs(10));
select! {
+ _ = delay.fuse() => {},
_ = self.resync_notify.notified().fuse() => {},
_ = must_exit.changed().fuse() => {},
}
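For reference, the same wait pattern (wake on a notification, or after a fixed 10-second delay so that a notification racing with the queue check costs at most one extra 10-second wait) can be reproduced in a self-contained way with tokio::select! instead of the futures select!/fuse() combination used above. Everything around the select here is invented purely for illustration, and it assumes tokio with its full feature set; resync_notify itself is a tokio::sync::Notify, as shown in the struct definition above.

    use std::sync::Arc;
    use std::time::Duration;
    use tokio::sync::Notify;

    // Wait until either an item is (probably) available or 10 seconds pass.
    // The fixed delay bounds the damage if a notification slips in between
    // checking the queue and starting to wait on `notified()`.
    async fn wait_for_work(notify: &Notify) {
        tokio::select! {
            _ = tokio::time::sleep(Duration::from_secs(10)) => {
                // Timed out: the caller loops back and re-checks the queue.
            }
            _ = notify.notified() => {
                // An item was pushed: re-check the queue immediately.
            }
        }
    }

    #[tokio::main]
    async fn main() {
        let notify = Arc::new(Notify::new());
        let pusher = notify.clone();
        tokio::spawn(async move {
            tokio::time::sleep(Duration::from_millis(100)).await;
            pusher.notify_one(); // simulate an item being added to the queue
        });
        wait_for_work(&notify).await;
    }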