aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlex Auvolat <alex@adnab.me>2023-06-09 16:23:21 +0200
committerAlex Auvolat <alex@adnab.me>2023-06-09 16:23:21 +0200
commite7e164a280dfc1c4adf9d6da6f3b2a9674eca4bd (patch)
tree610408aedb12063faf49c28de0f3071f4e1f703e
parent1e466b11eb9a3d5de2b8247fc6b635f9278bc3ac (diff)
downloadgarage-e7e164a280dfc1c4adf9d6da6f3b2a9674eca4bd.tar.gz
garage-e7e164a280dfc1c4adf9d6da6f3b2a9674eca4bd.zip
Make fsync an option for meta and data
-rw-r--r--doc/book/reference-manual/configuration.md45
-rw-r--r--src/block/manager.rs35
-rw-r--r--src/model/garage.rs15
-rw-r--r--src/util/config.rs7
4 files changed, 88 insertions, 14 deletions
diff --git a/doc/book/reference-manual/configuration.md b/doc/book/reference-manual/configuration.md
index 38062bab..de253393 100644
--- a/doc/book/reference-manual/configuration.md
+++ b/doc/book/reference-manual/configuration.md
@@ -10,6 +10,8 @@ Here is an example `garage.toml` configuration file that illustrates all of the
```toml
metadata_dir = "/var/lib/garage/meta"
data_dir = "/var/lib/garage/data"
+metadata_fsync = true
+data_fsync = false
db_engine = "lmdb"
@@ -124,6 +126,49 @@ convert-db -a <input db engine> -i <input db path> \
Make sure to specify the full database path as presented in the table above,
and not just the path to the metadata directory.
+### `metadata_fsync`
+
+Whether to enable synchronous mode for the database engine or not.
+This is disabled (`false`) by default.
+
+Enabling this option reduces the risk of metadata corruption in case of power failures,
+at the cost of a significant drop in write performance,
+as Garage will have to pause to sync data to disk much more often
+(several times for API calls such as PutObject).
+
+Using this option reduces the risk of simultaneous metadata corruption on several
+cluster nodes, which could lead to data loss.
+
+If multi-site replication is used, this option is most likely not necessary, as
+it is extremely unlikely that two nodes in different locations will have a
+power failure at the exact same time.
+
+(Metadata corruption on a single node is not an issue: the corrupted data file
+can always be deleted and reconstructed from the other nodes in the cluster.)
+
+Here is how this option impacts the different database engines:
+
+| Database | `metadata_fsync = false` (default) | `metadata_fsync = true` |
+|----------|------------------------------------|-------------------------------|
+| Sled | default options | *unsupported* |
+| Sqlite | `PRAGMA synchronous = OFF` | `PRAGMA synchronous = NORMAL` |
+| LMDB | `MDB_NOMETASYNC` + `MDB_NOSYNC` | `MDB_NOMETASYNC` |
+
+Note that the Sqlite database is always run in `WAL` mode (`PRAGMA journal_mode = WAL`).
+
+### `data_fsync`
+
+Whether to `fsync` data blocks and their containing directory after they are
+saved to disk.
+This is disabled (`false`) by default.
+
+This might reduce the risk that a data block is lost in rare
+situations such as several nodes losing power simultaneously,
+at the cost of a moderate drop in write performance.
+
+Similarly to `metadata_fsync`, this is likely not necessary
+if geographical replication is used.
+
### `block_size`
Garage splits stored objects in consecutive chunks of size `block_size`
diff --git a/src/block/manager.rs b/src/block/manager.rs
index 3ece9a8a..c7e4cd03 100644
--- a/src/block/manager.rs
+++ b/src/block/manager.rs
@@ -80,6 +80,7 @@ pub struct BlockManager {
/// Directory in which block are stored
pub data_dir: PathBuf,
+ data_fsync: bool,
compression_level: Option<i32>,
mutation_lock: [Mutex<BlockManagerLocked>; 256],
@@ -114,6 +115,7 @@ impl BlockManager {
pub fn new(
db: &db::Db,
data_dir: PathBuf,
+ data_fsync: bool,
compression_level: Option<i32>,
replication: TableShardedReplication,
system: Arc<System>,
@@ -141,6 +143,7 @@ impl BlockManager {
let block_manager = Arc::new(Self {
replication,
data_dir,
+ data_fsync,
compression_level,
mutation_lock: [(); 256].map(|_| Mutex::new(BlockManagerLocked())),
rc,
@@ -713,7 +716,11 @@ impl BlockManagerLocked {
let mut f = fs::File::create(&path_tmp).await?;
f.write_all(data).await?;
- f.sync_all().await?;
+
+ if mgr.data_fsync {
+ f.sync_all().await?;
+ }
+
drop(f);
fs::rename(path_tmp, path).await?;
@@ -724,18 +731,20 @@ impl BlockManagerLocked {
fs::remove_file(to_delete).await?;
}
- // We want to ensure that when this function returns, data is properly persisted
- // to disk. The first step is the sync_all above that does an fsync on the data file.
- // Now, we do an fsync on the containing directory, to ensure that the rename
- // is persisted properly. See:
- // http://thedjbway.b0llix.net/qmail/syncdir.html
- let dir = fs::OpenOptions::new()
- .read(true)
- .mode(0)
- .open(directory)
- .await?;
- dir.sync_all().await?;
- drop(dir);
+ if mgr.data_fsync {
+ // We want to ensure that when this function returns, data is properly persisted
+ // to disk. The first step is the sync_all above that does an fsync on the data file.
+ // Now, we do an fsync on the containing directory, to ensure that the rename
+ // is persisted properly. See:
+ // http://thedjbway.b0llix.net/qmail/syncdir.html
+ let dir = fs::OpenOptions::new()
+ .read(true)
+ .mode(0)
+ .open(directory)
+ .await?;
+ dir.sync_all().await?;
+ drop(dir);
+ }
Ok(())
}
diff --git a/src/model/garage.rs b/src/model/garage.rs
index 0fbcf334..9b7121db 100644
--- a/src/model/garage.rs
+++ b/src/model/garage.rs
@@ -91,6 +91,11 @@ impl Garage {
// ---- Sled DB ----
#[cfg(feature = "sled")]
"sled" => {
+ if config.metadata_fsync {
+ return Err(Error::Message(format!(
+ "`metadata_fsync = true` is not supported with the Sled database engine"
+ )));
+ }
db_path.push("db");
info!("Opening Sled database at: {}", db_path.display());
let db = db::sled_adapter::sled::Config::default()
@@ -111,7 +116,11 @@ impl Garage {
let db = db::sqlite_adapter::rusqlite::Connection::open(db_path)
.and_then(|db| {
db.pragma_update(None, "journal_mode", &"WAL")?;
- db.pragma_update(None, "synchronous", &"NORMAL")?;
+ if config.metadata_fsync {
+ db.pragma_update(None, "synchronous", &"NORMAL")?;
+ } else {
+ db.pragma_update(None, "synchronous", &"OFF")?;
+ }
Ok(db)
})
.ok_or_message("Unable to open sqlite DB")?;
@@ -139,6 +148,9 @@ impl Garage {
env_builder.map_size(map_size);
unsafe {
env_builder.flag(heed::flags::Flags::MdbNoMetaSync);
+ if !config.metadata_fsync {
+ env_builder.flag(heed::flags::Flags::MdbNoSync);
+ }
}
let db = match env_builder.open(&db_path) {
Err(heed::Error::Io(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => {
@@ -208,6 +220,7 @@ impl Garage {
let block_manager = BlockManager::new(
&db,
config.data_dir.clone(),
+ config.data_fsync,
config.compression_level,
data_rep_param,
system.clone(),
diff --git a/src/util/config.rs b/src/util/config.rs
index 77952356..009f0574 100644
--- a/src/util/config.rs
+++ b/src/util/config.rs
@@ -15,6 +15,13 @@ pub struct Config {
/// Path where to store data. Can be slower, but need higher volume
pub data_dir: PathBuf,
+ /// Whether to fsync after all metadata transactions (disabled by default)
+ #[serde(default)]
+ pub metadata_fsync: bool,
+ /// Whether to fsync after all data block writes (disabled by default)
+ #[serde(default)]
+ pub data_fsync: bool,
+
/// Size of data blocks to save to disk
#[serde(default = "default_block_size")]
pub block_size: usize,