Merge pull request 'metadata db snapshotting' (#775) from db-snapshot into main

Reviewed-on: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/775
author: Alex <alex@adnab.me> 2024-03-15 13:17:53 +0000
committer: Alex <alex@adnab.me> 2024-03-15 13:17:53 +0000
commit: fd2e19bf1bf301bc03aa29ffa3fe1e71008cbe50 (patch)
tree: c92172dee172941c3daf32a08927f8ebab0ded9e
parent: a80ce6ab5ad9834c3721eeb4f626d53c9a8bb1f4 (diff)
parent: 8cf3d24875d41d79ab08d637cd38d2a5b9e527dd (diff)
download: garage-fd2e19bf1bf301bc03aa29ffa3fe1e71008cbe50.tar.gz
garage-fd2e19bf1bf301bc03aa29ffa3fe1e71008cbe50.zip
21 files changed, 380 insertions, 11 deletions
diff --git a/Cargo.lock b/Cargo.lock
index f9bf0c0a..573ff5ec 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1438,6 +1438,7 @@ dependencies = [
  "garage_util",
  "hex",
  "opentelemetry",
+ "parse_duration",
  "rand",
  "serde",
  "serde_bytes",
diff --git a/Cargo.nix b/Cargo.nix
index 0a4ca99a..a3ac4c90 100644
--- a/Cargo.nix
+++ b/Cargo.nix
@@ -34,7 +34,7 @@ args@{
   ignoreLockHash,
 }:
 let
-  nixifiedLockHash = "8112e20b0e356bed77a9769600c2b2952662ec8af9548eecf8a2d46fe8433189";
+  nixifiedLockHash = "f99156ba9724d370b33258f076f078fefc945f0af79292b1a246bd48bef2a9b2";
   workspaceSrc = if args.workspaceSrc == null then ./. else args.workspaceSrc;
   currentLockHash = builtins.hashFile "sha256" (workspaceSrc + /Cargo.lock);
   lockHashIgnored = if ignoreLockHash
@@ -2093,6 +2093,7 @@ in
       garage_util = (rustPackages."unknown".garage_util."0.9.3" { inherit profileName; }).out;
       hex = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".hex."0.4.3" { inherit profileName; }).out;
       opentelemetry = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".opentelemetry."0.17.0" { inherit profileName; }).out;
+      parse_duration = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".parse_duration."2.1.1" { inherit profileName; }).out;
       rand = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".rand."0.8.5" { inherit profileName; }).out;
       serde = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde."1.0.196" { inherit profileName; }).out;
       serde_bytes = (rustPackages."registry+https://github.com/rust-lang/crates.io-index".serde_bytes."0.11.14" { inherit profileName; }).out;
@@ -4769,6 +4770,7 @@ in
     registry = "registry+https://github.com/rust-lang/crates.io-index";
     src = fetchCratesIo { inherit name version; sha256 = "a78046161564f5e7cd9008aff3b2990b3850dc8e0349119b98e8f251e099f24d"; };
     features = builtins.concatLists [
+      (lib.optional (rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage/sqlite" || rootFeatures' ? "garage_db/bundled-libs" || rootFeatures' ? "garage_db/default" || rootFeatures' ? "garage_db/rusqlite" || rootFeatures' ? "garage_db/sqlite" || rootFeatures' ? "garage_model/default" || rootFeatures' ? "garage_model/sqlite") "backup")
       (lib.optional (rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage_db/bundled-libs") "bundled")
       (lib.optional (rootFeatures' ? "garage/bundled-libs" || rootFeatures' ? "garage/default" || rootFeatures' ? "garage_db/bundled-libs") "modern_sqlite")
     ];
diff --git a/doc/book/cookbook/real-world.md b/doc/book/cookbook/real-world.md
index 15a58b9b..9e226030 100644
--- a/doc/book/cookbook/real-world.md
+++ b/doc/book/cookbook/real-world.md
@@ -72,13 +72,14 @@ to store 2 TB of data in total.
   to RAID, see [our dedicated documentation page](@/documentation/operations/multi-hdd.md).
 
 - For the metadata storage, Garage does not do checksumming and integrity
-  verification on its own. Users have reported that when using the LMDB
-  database engine (the default), database files have a tendency of becoming
-  corrupted after an unclean shutdown (e.g. a power outage), so you should use
-  a robust filesystem such as BTRFS or ZFS for the metadata partition, and take
-  regular snapshots so that you can restore to a recent known-good state in
-  case of an incident.  If you cannot do so, you might want to switch to Sqlite
-  which is more robust.
+  verification on its own, so it is better to use a robust filesystem such as
+  BTRFS or ZFS. Users have reported that when using the LMDB database engine
+  (the default), database files have a tendency of becoming corrupted after an
+  unclean shutdown (e.g. a power outage), so you should take regular snapshots
+  to be able to recover from such a situation.  This can be done using Garage's
+  built-in automatic snapshotting (since v0.9.4), or by using filesystem level
+  snapshots. If you cannot do so, you might want to switch to Sqlite which is
+  more robust.
 
 - LMDB is the fastest and most tested database engine, but it has the following
   weaknesses: 1/ data files are not architecture-independent, you cannot simply
@@ -124,6 +125,7 @@ A valid `/etc/garage.toml` for our cluster would look as follows:
 metadata_dir = "/var/lib/garage/meta"
 data_dir = "/var/lib/garage/data"
 db_engine = "lmdb"
+metadata_auto_snapshot_interval = "6h"
 
 replication_mode = "3"
 
diff --git a/doc/book/operations/durability-repairs.md b/doc/book/operations/durability-repairs.md
index f4450dae..c76dc39e 100644
--- a/doc/book/operations/durability-repairs.md
+++ b/doc/book/operations/durability-repairs.md
@@ -104,6 +104,24 @@ operation will also move out all data from locations marked as read-only.
 
 # Metadata operations
 
+## Metadata snapshotting
+
+It is good practice to set up automatic snapshotting of your metadata database
+file, to recover from situations where it becomes corrupted on disk. This can
+be done at the filesystem level if you are using ZFS or BTRFS.
+
+Since Garage v0.9.4, Garage is able to take snapshots of the metadata database
+itself. This basically amounts to copying the database file, except that it can
+be run live while Garage is running without the risk of corruption or
+inconsistencies.  This can be set up to run automatically on a schedule using
+[`metadata_auto_snapshot_interval`](@/documentation/reference-manual/configuration.md#metadata_auto_snapshot_interval).
+A snapshot can also be triggered manually using the `garage meta snapshot`
+command. Note that taking a snapshot using this method is very intensive as it
+requires making a full copy of the database file, so you might prefer using
+filesystem-level snapshots if possible. To recover a corrupted node from such a
+snapshot, read the instructions
+[here](@/documentation/operations/recovering.md#corrupted_meta).
+
 ## Metadata table resync
 
 Garage automatically resyncs all entries stored in the metadata tables every hour,
diff --git a/doc/book/operations/recovering.md b/doc/book/operations/recovering.md
index 7a830788..6e19db0e 100644
--- a/doc/book/operations/recovering.md
+++ b/doc/book/operations/recovering.md
@@ -108,3 +108,57 @@ garage layout apply   # once satisfied, apply the changes
 
 Garage will then start synchronizing all required data on the new node.
 This process can be monitored using the `garage stats -a` command.
+
+## Replacement scenario 3: corrupted metadata {#corrupted_meta}
+
+In some cases, your metadata DB file might become corrupted, for instance if
+your node suffered a power outage and did not shut down properly. In this case,
+you can recover without having to change the node ID and rebuild the cluster
+layout. This means that data blocks will not need to be shuffled around: you
+simply need to find a way to repair the metadata file. The best way is generally
+to discard the corrupted file and recover it from another source.
+
+First of all, start by locating the database file in your metadata directory,
+which [depends on your `db_engine`
+choice](@/documentation/reference-manual/configuration.md#db_engine).  Then,
+your recovery options are as follows:
+
+- **Option 1: resyncing from other nodes.** In case your cluster is replicated
+  with two or three copies, you can simply delete the database file, and Garage
+  will resync from other nodes. To do so, stop Garage, delete the database file
+  or directory, and restart Garage. Then, do a full table repair by calling
+  `garage repair -a --yes tables`.  This will take a bit of time to complete as
+  the new node will need to receive copies of the metadata tables from the
+  network.
+
+- **Option 2: restoring a snapshot taken by Garage.** Since v0.9.4, Garage can
+  [automatically take regular
+  snapshots](@/documentation/reference-manual/configuration.md#metadata_auto_snapshot_interval)
+  of your metadata DB file. This file or directory should be located under
+  `<metadata_dir>/snapshots`, and is named according to the UTC time at which it
+  was taken. Stop Garage, discard the database file/directory and replace it with the
+  snapshot you want to use. For instance, in the case of LMDB:
+
+  ```bash
+  cd $METADATA_DIR
+  mv db.lmdb db.lmdb.bak
+  cp -r snapshots/2024-03-15T12:13:52Z db.lmdb
+  ```
+
+  And for Sqlite:
+
+  ```bash
+  cd $METADATA_DIR
+  mv db.sqlite db.sqlite.bak
+  cp snapshots/2024-03-15T12:13:52Z db.sqlite
+  ```
+
+  Then, restart Garage and run a full table repair by calling `garage repair -a
+  --yes tables`.  This should run relatively fast as only the changes that
+  occurred since the snapshot was taken will need to be resynchronized. Of
+  course, if your cluster is not replicated, you will lose all changes that
+  occurred since the snapshot was taken.
+
+- **Option 3: restoring a filesystem-level snapshot.** If you are using ZFS or
+  BTRFS to snapshot your metadata partition, refer to their specific
+  documentation on rolling back or copying files from an old snapshot.
diff --git a/doc/book/operations/upgrading.md b/doc/book/operations/upgrading.md
index 6b6ea26d..c239bfe4 100644
--- a/doc/book/operations/upgrading.md
+++ b/doc/book/operations/upgrading.md
@@ -73,6 +73,18 @@ The entire procedure would look something like this:
   You can do all of the nodes in a single zone at once as that won't impact global cluster availability.
   Do not try to make a backup of the metadata folder of a running node.
 
+  **Since Garage v0.9.4,** you can use the `garage meta snapshot --all` command
+  to take a simultaneous snapshot of the metadata database files of all your
+  nodes.  This avoids the tedious process of having to take them down one by
+  one before upgrading. Be careful that if automatic snapshotting is enabled,
+  Garage only keeps the last two snapshots and deletes older ones, so you might
+  want to disable automatic snapshotting in your upgraded configuration file
+  until you have confirmed that the upgrade ran successfully.  In addition to
+  snapshotting the metadata databases of your nodes, you should back up at
+  least the `cluster_layout` file of one of your Garage instances (this file
+  should be the same on all nodes and you can copy it safely while Garage is
+  running).
+
 3. Prepare your binaries and configuration files for the new Garage version
 
 4. Restart all nodes simultaneously in the new version
diff --git a/doc/book/reference-manual/configuration.md b/doc/book/reference-manual/configuration.md
index 8e87b7d8..de800ec0 100644
--- a/doc/book/reference-manual/configuration.md
+++ b/doc/book/reference-manual/configuration.md
@@ -15,6 +15,7 @@ data_dir = "/var/lib/garage/data"
 metadata_fsync = true
 data_fsync = false
 disable_scrub = false
+metadata_auto_snapshot_interval = "6h"
 
 db_engine = "lmdb"
 
@@ -90,6 +91,7 @@ Top-level configuration options:
 [`db_engine`](#db_engine),
 [`disable_scrub`](#disable_scrub),
 [`lmdb_map_size`](#lmdb_map_size),
+[`metadata_auto_snapshot_interval`](#metadata_auto_snapshot_interval),
 [`metadata_dir`](#metadata_dir),
 [`metadata_fsync`](#metadata_fsync),
 [`replication_mode`](#replication_mode),
@@ -346,6 +348,25 @@ at the cost of a moderate drop in write performance.
 Similarly to `metatada_fsync`, this is likely not necessary
 if geographical replication is used.
 
+#### `metadata_auto_snapshot_interval` (since Garage v0.9.4) {#metadata_auto_snapshot_interval}
+
+If this value is set, Garage will automatically take a snapshot of the metadata
+DB file at a regular interval and save it in the metadata directory.
+This makes it possible to recover from situations where the metadata DB file is corrupted,
+for instance after an unclean shutdown.
+See [this page](@/documentation/operations/recovering.md#corrupted_meta) for details.
+
+Garage keeps only the two most recent snapshots of the metadata DB and deletes
+older ones automatically.
+
+Note that taking a metadata snapshot is a relatively intensive operation as the
+entire data file is copied. A snapshot being taken might have performance
+impacts on the Garage node while it is running. If the cluster is under heavy
+write load when a snapshot operation is running, this might also cause the
+database file to grow in size significantly as pages cannot be recycled easily.
+For this reason, it might be better to use filesystem-level snapshots instead
+if possible.
+
 #### `disable_scrub` {#disable_scrub}
 
 By default, Garage runs a scrub of the data directory approximately once per
diff --git a/src/db/Cargo.toml b/src/db/Cargo.toml
index d7c89620..324de74c 100644
--- a/src/db/Cargo.toml
+++ b/src/db/Cargo.toml
@@ -17,7 +17,7 @@ hexdump.workspace = true
 tracing.workspace = true
 
 heed = { workspace = true, optional = true }
-rusqlite = { workspace = true, optional = true }
+rusqlite = { workspace = true, optional = true, features = ["backup"] }
 sled = { workspace = true, optional = true }
 
 [dev-dependencies]
diff --git a/src/db/lib.rs b/src/db/lib.rs
index 0fb457ce..7f19172f 100644
--- a/src/db/lib.rs
+++ b/src/db/lib.rs
@@ -19,6 +19,7 @@ use core::ops::{Bound, RangeBounds};
 
 use std::borrow::Cow;
 use std::cell::Cell;
+use std::path::PathBuf;
 use std::sync::Arc;
 
 use err_derive::Error;
@@ -48,6 +49,12 @@ pub type TxValueIter<'a> = Box<dyn std::iter::Iterator<Item = TxOpResult<(Value,
 #[error(display = "{}", _0)]
 pub struct Error(pub Cow<'static, str>);
 
+impl From<std::io::Error> for Error {
+	fn from(io_err: std::io::Error) -> Self {
+		Self(Cow::Owned(format!("IO: {}", io_err))) // keep the cause, tagged as I/O
+	}
+}
+
 pub type Result<T> = std::result::Result<T, Error>;
 
 #[derive(Debug, Error)]
@@ -129,6 +136,10 @@ impl Db {
 		}
 	}
 
+	pub fn snapshot(&self, path: &PathBuf) -> Result<()> {
+		self.0.snapshot(path) // forwarded as-is to the wrapped backend (`self.0`)
+	}
+
 	pub fn import(&self, other: &Db) -> Result<()> {
 		let existing_trees = self.list_trees()?;
 		if !existing_trees.is_empty() {
@@ -325,6 +336,7 @@ pub(crate) trait IDb: Send + Sync {
 	fn engine(&self) -> String;
 	fn open_tree(&self, name: &str) -> Result<usize>;
 	fn list_trees(&self) -> Result<Vec<String>>;
+	fn snapshot(&self, path: &PathBuf) -> Result<()>;
 
 	fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>>;
 	fn len(&self, tree: usize) -> Result<usize>;
diff --git a/src/db/lmdb_adapter.rs b/src/db/lmdb_adapter.rs
index 59fa132d..4b131aff 100644
--- a/src/db/lmdb_adapter.rs
+++ b/src/db/lmdb_adapter.rs
@@ -3,6 +3,7 @@ use core::ptr::NonNull;
 
 use std::collections::HashMap;
 use std::convert::TryInto;
+use std::path::PathBuf;
 use std::sync::{Arc, RwLock};
 
 use heed::types::ByteSlice;
@@ -102,6 +103,15 @@ impl IDb for LmdbDb {
 		Ok(ret2)
 	}
 
+	fn snapshot(&self, to: &PathBuf) -> Result<()> {
+		// A snapshot is a directory holding one uncompacted copy named `data.mdb`.
+		std::fs::create_dir_all(to)?;
+		let target = to.join("data.mdb");
+		self.db
+			.copy_to_path(target, heed::CompactionOption::Disabled)?;
+		Ok(())
+	}
+
 	// ----
 
 	fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
diff --git a/src/db/sled_adapter.rs b/src/db/sled_adapter.rs
index 84f2001b..c34b4d81 100644
--- a/src/db/sled_adapter.rs
+++ b/src/db/sled_adapter.rs
@@ -2,6 +2,7 @@ use core::ops::Bound;
 
 use std::cell::Cell;
 use std::collections::HashMap;
+use std::path::PathBuf;
 use std::sync::{Arc, RwLock};
 
 use sled::transaction::{
@@ -96,6 +97,13 @@ impl IDb for SledDb {
 		Ok(trees)
 	}
 
+	fn snapshot(&self, to: &PathBuf) -> Result<()> {
+		// Replay a full export of this database into the sled db opened at `to`.
+		let snapshot_db = sled::open(to)?;
+		snapshot_db.import(self.db.export());
+		Ok(())
+	}
+
 	// ----
 
 	fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
diff --git a/src/db/sqlite_adapter.rs b/src/db/sqlite_adapter.rs
index 9f967c66..827f3cc3 100644
--- a/src/db/sqlite_adapter.rs
+++ b/src/db/sqlite_adapter.rs
@@ -2,6 +2,7 @@ use core::ops::Bound;
 
 use std::borrow::BorrowMut;
 use std::marker::PhantomPinned;
+use std::path::PathBuf;
 use std::pin::Pin;
 use std::ptr::NonNull;
 use std::sync::{Arc, Mutex, MutexGuard};
@@ -119,6 +120,17 @@ impl IDb for SqliteDb {
 		Ok(trees)
 	}
 
+	fn snapshot(&self, to: &PathBuf) -> Result<()> {
+		// Progress is computed in i64 with a divisor of at least 1: no overflow, no div by 0.
+		fn progress(p: rusqlite::backup::Progress) {
+			let (total, left) = (i64::from(p.pagecount), i64::from(p.remaining));
+			info!("Sqlite snapshot progress: {}%", (total - left) * 100 / total.max(1));
+		}
+		let this = self.0.lock().unwrap();
+		this.db.backup(rusqlite::DatabaseName::Main, to, Some(progress))?;
+		Ok(())
+	}
+
 	// ----
 
 	fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>> {
diff --git a/src/garage/admin/mod.rs b/src/garage/admin/mod.rs
index b6f9c426..f01ef3d6 100644
--- a/src/garage/admin/mod.rs
+++ b/src/garage/admin/mod.rs
@@ -46,6 +46,7 @@ pub enum AdminRpc {
 	Stats(StatsOpt),
 	Worker(WorkerOperation),
 	BlockOperation(BlockOperation),
+	MetaOperation(MetaOperation),
 
 	// Replies
 	Ok(String),
@@ -518,6 +519,44 @@ impl AdminRpcHandler {
 			)]))
 		}
 	}
+
+	// ================ META DB COMMANDS ====================
+
+	/// Handles `garage meta` operations. `Snapshot { all: false }` snapshots the
+	/// local metadata db; `Snapshot { all: true }` asks every node of the layout
+	/// to do so and returns one status line per node.
+	async fn handle_meta_cmd(self: &Arc<Self>, mo: &MetaOperation) -> Result<AdminRpc, Error> {
+		match mo {
+			MetaOperation::Snapshot { all: true } => {
+				let ring = self.garage.system.ring.borrow().clone();
+				let nodes = ring.layout.node_ids().to_vec();
+
+				let resps = futures::future::join_all(nodes.iter().map(|node| async move {
+					let node = (*node).into();
+					let msg = AdminRpc::MetaOperation(MetaOperation::Snapshot { all: false });
+					self.endpoint.call(&node, msg, PRIO_NORMAL).await
+				}))
+				.await;
+
+				let mut ret = vec![];
+				for (node, resp) in nodes.iter().zip(resps.iter()) {
+					// Outer result: the RPC itself; inner result: the remote handler.
+					// A node is "ok" only if both succeeded.
+					let res_str = match resp {
+						Ok(Ok(_)) => "ok".to_string(),
+						Ok(Err(e)) => format!("error: {}", e),
+						Err(e) => format!("error: {}", e),
+					};
+					ret.push(format!("{:?}\t{}", node, res_str));
+				}
+				Ok(AdminRpc::Ok(format_table_to_string(ret)))
+			}
+			MetaOperation::Snapshot { all: false } => {
+				garage_model::snapshot::async_snapshot_metadata(&self.garage).await?;
+				Ok(AdminRpc::Ok("Snapshot has been saved.".into()))
+			}
+		}
+	}
 }
 
 #[async_trait]
@@ -535,6 +574,7 @@ impl EndpointHandler<AdminRpc> for AdminRpcHandler {
 			AdminRpc::Stats(opt) => self.handle_stats(opt.clone()).await,
 			AdminRpc::Worker(wo) => self.handle_worker_cmd(wo).await,
 			AdminRpc::BlockOperation(bo) => self.handle_block_cmd(bo).await,
+			AdminRpc::MetaOperation(mo) => self.handle_meta_cmd(mo).await,
 			m => Err(GarageError::unexpected_rpc_message(m).into()),
 		}
 	}
diff --git a/src/garage/cli/cmd.rs b/src/garage/cli/cmd.rs
index 48359614..4c0a5322 100644
--- a/src/garage/cli/cmd.rs
+++ b/src/garage/cli/cmd.rs
@@ -44,6 +44,9 @@ pub async fn cli_command_dispatch(
 		Command::Block(bo) => {
 			cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::BlockOperation(bo)).await
 		}
+		Command::Meta(mo) => {
+			cmd_admin(admin_rpc_endpoint, rpc_host, AdminRpc::MetaOperation(mo)).await
+		}
 		_ => unreachable!(),
 	}
 }
diff --git a/src/garage/cli/structs.rs b/src/garage/cli/structs.rs
index be4d5bd6..51d2bed3 100644
--- a/src/garage/cli/structs.rs
+++ b/src/garage/cli/structs.rs
@@ -57,6 +57,10 @@ pub enum Command {
 	#[structopt(name = "block", version = garage_version())]
 	Block(BlockOperation),
 
+	/// Operations on the metadata db
+	#[structopt(name = "meta", version = garage_version())]
+	Meta(MetaOperation),
+
 	/// Convert metadata db between database engine formats
 	#[structopt(name = "convert-db", version = garage_version())]
 	ConvertDb(convert_db::ConvertDbOpt),
@@ -617,3 +621,14 @@ pub enum BlockOperation {
 		blocks: Vec<String>,
 	},
 }
+
+#[derive(Serialize, Deserialize, StructOpt, Debug, Eq, PartialEq, Clone, Copy)]
+pub enum MetaOperation { // also carried over RPC inside `AdminRpc::MetaOperation`
+	/// Save a snapshot of the metadata db file
+	#[structopt(name = "snapshot", version = garage_version())]
+	Snapshot {
+		/// Run on all nodes instead of only local node
+		#[structopt(long = "all")]
+		all: bool,
+	},
+}
diff --git a/src/garage/server.rs b/src/garage/server.rs
index 6323f957..65bf34db 100644
--- a/src/garage/server.rs
+++ b/src/garage/server.rs
@@ -51,7 +51,7 @@ pub async fn run_server(config_file: PathBuf, secrets: Secrets) -> Result<(), Er
 	let (background, await_background_done) = BackgroundRunner::new(watch_cancel.clone());
 
 	info!("Spawning Garage workers...");
-	garage.spawn_workers(&background);
+	garage.spawn_workers(&background)?;
 
 	if config.admin.trace_sink.is_some() {
 		info!("Initialize tracing...");
diff --git a/src/model/Cargo.toml b/src/model/Cargo.toml
index 2e5b047d..bde354b5 100644
--- a/src/model/Cargo.toml
+++ b/src/model/Cargo.toml
@@ -28,6 +28,7 @@ chrono.workspace = true
 err-derive.workspace = true
 hex.workspace = true
 base64.workspace = true
+parse_duration.workspace = true
 tracing.workspace = true
 rand.workspace = true
 zstd.workspace = true
diff --git a/src/model/garage.rs b/src/model/garage.rs
index acf943f6..a6f60546 100644
--- a/src/model/garage.rs
+++ b/src/model/garage.rs
@@ -278,7 +278,7 @@ impl Garage {
 		}))
 	}
 
-	pub fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) {
+	pub fn spawn_workers(self: &Arc<Self>, bg: &BackgroundRunner) -> Result<(), Error> {
 		self.block_manager.spawn_workers(bg);
 
 		self.bucket_table.spawn_workers(bg);
@@ -299,6 +299,23 @@ impl Garage {
 
 		#[cfg(feature = "k2v")]
 		self.k2v.spawn_workers(bg);
+
+		if let Some(itv) = self.config.metadata_auto_snapshot_interval.as_deref() {
+			let interval = parse_duration::parse(itv)
+				.ok_or_message("Invalid `metadata_auto_snapshot_interval`")?;
+			if interval < std::time::Duration::from_secs(600) {
+				return Err(Error::Message(
+					"metadata_auto_snapshot_interval too small or negative".into(),
+				));
+			}
+
+			bg.spawn_worker(crate::snapshot::AutoSnapshotWorker::new(
+				self.clone(),
+				interval,
+			));
+		}
+
+		Ok(())
 	}
 
 	pub fn bucket_helper(&self) -> helper::bucket::BucketHelper {
diff --git a/src/model/lib.rs b/src/model/lib.rs
index 4f20ea46..8ec338da 100644
--- a/src/model/lib.rs
+++ b/src/model/lib.rs
@@ -19,3 +19,4 @@ pub mod s3;
 pub mod garage;
 pub mod helper;
 pub mod migrate;
+pub mod snapshot;
diff --git a/src/model/snapshot.rs b/src/model/snapshot.rs
new file mode 100644
index 00000000..36f9ec7d
--- /dev/null
+++ b/src/model/snapshot.rs
@@ -0,0 +1,136 @@
+use std::fs;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::sync::Mutex;
+use std::time::{Duration, Instant};
+
+use async_trait::async_trait;
+use rand::prelude::*;
+use tokio::sync::watch;
+
+use garage_util::background::*;
+use garage_util::error::*;
+
+use crate::garage::Garage;
+
+// The two most recent snapshots are kept
+const KEEP_SNAPSHOTS: usize = 2;
+
+static SNAPSHOT_MUTEX: Mutex<()> = Mutex::new(()); // held while `snapshot_metadata` runs
+
+// ================ snapshotting logic =====================
+
+/// Runs [`snapshot_metadata`] on a blocking thread and awaits its completion.
+/// Panics if the blocking task cannot be joined.
+pub async fn async_snapshot_metadata(garage: &Arc<Garage>) -> Result<(), Error> {
+	let garage = Arc::clone(garage);
+	let joined = tokio::task::spawn_blocking(move || snapshot_metadata(&garage)).await;
+	joined.unwrap()
+}
+
+/// Takes a snapshot of the metadata database under `<metadata_dir>/snapshots`
+/// and then erases older snapshots (a failure of that cleanup is only logged).
+/// This is not an async function, it should be spawned on a thread pool
+pub fn snapshot_metadata(garage: &Garage) -> Result<(), Error> {
+	// The mutex protects no data, so a lock poisoned by a snapshot that panicked
+	// can be reused; only real contention means a snapshot is in progress.
+	let _lock = match SNAPSHOT_MUTEX.try_lock() {
+		Ok(guard) => guard,
+		Err(std::sync::TryLockError::Poisoned(poisoned)) => poisoned.into_inner(),
+		Err(std::sync::TryLockError::WouldBlock) => {
+			return Err(Error::Message(
+				"Cannot acquire lock, another snapshot might be in progress".into(),
+			))
+		}
+	};
+
+	let snapshots_dir = garage.config.metadata_dir.join("snapshots");
+	fs::create_dir_all(&snapshots_dir)?;
+
+	let timestamp = chrono::Utc::now().to_rfc3339_opts(chrono::SecondsFormat::Secs, true);
+	let new_path = snapshots_dir.join(timestamp);
+
+	info!("Snapshotting metadata db to {}", new_path.display());
+	garage.db.snapshot(&new_path)?;
+	info!("Metadata db snapshot finished");
+
+	if let Err(e) = cleanup_snapshots(&snapshots_dir) {
+		error!("Failed to do cleanup in snapshots directory: {}", e);
+	}
+
+	Ok(())
+}
+
+/// Removes every snapshot except the `KEEP_SNAPSHOTS` most recent (by file name).
+fn cleanup_snapshots(snapshots_dir: &PathBuf) -> Result<(), Error> {
+	let mut snapshots =
+		fs::read_dir(&snapshots_dir)?.collect::<Result<Vec<fs::DirEntry>, std::io::Error>>()?;
+	snapshots.retain(|x| x.file_name().len() > 8);
+	snapshots.sort_by_key(|x| x.file_name());
+	for to_delete in snapshots.iter().rev().skip(KEEP_SNAPSHOTS) {
+		// `DirEntry::path()` already includes the listed directory; do not re-join it.
+		let path = to_delete.path();
+		if to_delete.metadata()?.file_type().is_dir() {
+			for file in fs::read_dir(&path)? {
+				let file = file?;
+				if file.metadata()?.is_file() {
+					fs::remove_file(file.path())?;
+				}
+			}
+			fs::remove_dir(&path)?;
+		} else {
+			fs::remove_file(&path)?;
+		}
+	}
+	Ok(())
+}
+
+// ================ auto snapshot worker =====================
+
+pub struct AutoSnapshotWorker {
+	garage: Arc<Garage>,
+	next_snapshot: Instant, // `work` does nothing before this instant
+	snapshot_interval: Duration, // base delay between snapshots, before the random factor
+}
+
+impl AutoSnapshotWorker {
+	/// Creates a worker taking a metadata snapshot roughly every
+	/// `snapshot_interval` (a random extra delay is added in `work`).
+	/// The first snapshot is due after half of that interval.
+	pub(crate) fn new(garage: Arc<Garage>, snapshot_interval: Duration) -> Self {
+		let next_snapshot = Instant::now() + snapshot_interval / 2;
+		Self { garage, snapshot_interval, next_snapshot }
+	}
+}
+
+#[async_trait]
+impl Worker for AutoSnapshotWorker {
+	fn name(&self) -> String {
+		"Metadata snapshot worker".into()
+	}
+	fn status(&self) -> WorkerStatus {
+		// Saturate explicitly so that an overdue snapshot is reported as due now,
+		// whatever `Instant - Instant` does when the deadline is already past.
+		let remaining = self.next_snapshot.saturating_duration_since(Instant::now());
+		let due = (chrono::Utc::now() + remaining).to_rfc3339();
+		WorkerStatus {
+			freeform: vec![format!("Next snapshot: {}", due)],
+			..Default::default()
+		}
+	}
+	async fn work(&mut self, _must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
+		if Instant::now() < self.next_snapshot {
+			return Ok(WorkerState::Idle);
+		}
+		async_snapshot_metadata(&self.garage).await?;
+		// Schedule the next snapshot 1.0 to 1.2 intervals from now.
+		let rand_factor = 1f32 + thread_rng().gen::<f32>() / 5f32;
+		self.next_snapshot = Instant::now() + self.snapshot_interval.mul_f32(rand_factor);
+		Ok(WorkerState::Idle)
+	}
+	// Sleeps until the next snapshot is due, then reports that there is work to do.
+	async fn wait_for_work(&mut self) -> WorkerState {
+		tokio::time::sleep_until(self.next_snapshot.into()).await;
+		WorkerState::Busy
+	}
+}
diff --git a/src/util/config.rs b/src/util/config.rs
index 7338a506..8ecbdfbb 100644
--- a/src/util/config.rs
+++ b/src/util/config.rs
@@ -27,6 +27,10 @@ pub struct Config {
 	#[serde(default)]
 	pub disable_scrub: bool,
 
+	/// Automatic snapshot interval for metadata
+	#[serde(default)]
+	pub metadata_auto_snapshot_interval: Option<String>,
+
 	/// Size of data blocks to save to disk
 	#[serde(
 		deserialize_with = "deserialize_capacity",
author	Alex <alex@adnab.me>	2024-03-15 13:17:53 +0000
committer	Alex <alex@adnab.me>	2024-03-15 13:17:53 +0000
commit	fd2e19bf1bf301bc03aa29ffa3fe1e71008cbe50 (patch)
tree	c92172dee172941c3daf32a08927f8ebab0ded9e
parent	a80ce6ab5ad9834c3721eeb4f626d53c9a8bb1f4 (diff)
parent	8cf3d24875d41d79ab08d637cd38d2a5b9e527dd (diff)
download	garage-fd2e19bf1bf301bc03aa29ffa3fe1e71008cbe50.tar.gz garage-fd2e19bf1bf301bc03aa29ffa3fe1e71008cbe50.zip