about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Alex Auvolat <alex@adnab.me> 2022-09-01 16:30:44 +0200
committer: Alex Auvolat <alex@adnab.me> 2022-09-01 16:30:44 +0200
commit: df094bd8075332bb765b8b44c9b19cf2485e9ca8 (patch)
tree: 3cc838ad263c10960903b8b865e356d14eef9f60
parent: f3bf34b6a18c547c5fb29346787648048c093d52 (diff)
download: garage-df094bd8075332bb765b8b44c9b19cf2485e9ca8.tar.gz
          garage-df094bd8075332bb765b8b44c9b19cf2485e9ca8.zip
Less strict timeouts
-rw-r--r--  Cargo.lock             2
-rw-r--r--  src/block/manager.rs   8
-rw-r--r--  src/rpc/rpc_helper.rs  2
-rw-r--r--  src/rpc/system.rs      6
-rw-r--r--  src/table/gc.rs        3
-rw-r--r--  src/table/sync.rs      3
-rw-r--r--  src/table/table.rs     2
7 files changed, 16 insertions, 10 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 4c31d697..632c2131 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2176,7 +2176,7 @@ dependencies = [
[[package]]
name = "netapp"
version = "0.5.0"
-source = "git+https://git.deuxfleurs.fr/lx/netapp?branch=stream-body#22d96929d5416750e1f5889ee6cc16b382293104"
+source = "git+https://git.deuxfleurs.fr/lx/netapp?branch=stream-body#f6ad1d0fab340e77fbfcb3488a98c342d334838e"
dependencies = [
"arc-swap",
"async-trait",
diff --git a/src/block/manager.rs b/src/block/manager.rs
index b9f6fc0f..00438648 100644
--- a/src/block/manager.rs
+++ b/src/block/manager.rs
@@ -48,10 +48,14 @@ use crate::repair::*;
pub const INLINE_THRESHOLD: usize = 3072;
// Timeout for RPCs that read and write blocks to remote nodes
-const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(30);
+const BLOCK_RW_TIMEOUT: Duration = Duration::from_secs(60);
// Timeout for RPCs that ask other nodes whether they need a copy
// of a given block before we delete it locally
-const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(5);
+// The timeout here is relatively low because we don't want to block
+// the entire resync loop when some nodes are not responding.
+// Nothing will be deleted if the nodes don't answer the queries,
+// we will just retry later.
+const NEED_BLOCK_QUERY_TIMEOUT: Duration = Duration::from_secs(15);
// The delay between the time where a resync operation fails
// and the time when it is retried, with exponential backoff
diff --git a/src/rpc/rpc_helper.rs b/src/rpc/rpc_helper.rs
index 6c79c502..e9575261 100644
--- a/src/rpc/rpc_helper.rs
+++ b/src/rpc/rpc_helper.rs
@@ -31,7 +31,7 @@ use garage_util::metrics::RecordDuration;
use crate::metrics::RpcMetrics;
use crate::ring::Ring;
-const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
+const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30);
// Don't allow more than 100 concurrent outgoing RPCs.
const MAX_CONCURRENT_REQUESTS: usize = 100;
diff --git a/src/rpc/system.rs b/src/rpc/system.rs
index 5858660e..d7ef2140 100644
--- a/src/rpc/system.rs
+++ b/src/rpc/system.rs
@@ -38,7 +38,7 @@ use crate::rpc_helper::*;
const DISCOVERY_INTERVAL: Duration = Duration::from_secs(60);
const STATUS_EXCHANGE_INTERVAL: Duration = Duration::from_secs(10);
-const PING_TIMEOUT: Duration = Duration::from_secs(2);
+const SYSTEM_RPC_TIMEOUT: Duration = Duration::from_secs(15);
/// Version tag used for version check upon Netapp connection
pub const GARAGE_VERSION_TAG: u64 = 0x6761726167650007; // garage 0x0007
@@ -561,7 +561,7 @@ impl System {
.broadcast(
&self.system_endpoint,
SystemRpc::AdvertiseStatus(local_status),
- RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT),
+ RequestStrategy::with_priority(PRIO_HIGH).with_timeout(SYSTEM_RPC_TIMEOUT),
)
.await;
@@ -685,7 +685,7 @@ impl System {
&self.system_endpoint,
peer,
SystemRpc::PullClusterLayout,
- RequestStrategy::with_priority(PRIO_HIGH).with_timeout(PING_TIMEOUT),
+ RequestStrategy::with_priority(PRIO_HIGH).with_timeout(SYSTEM_RPC_TIMEOUT),
)
.await;
if let Ok(SystemRpc::AdvertiseClusterLayout(layout)) = resp {
diff --git a/src/table/gc.rs b/src/table/gc.rs
index 12218d97..6cae9701 100644
--- a/src/table/gc.rs
+++ b/src/table/gc.rs
@@ -25,7 +25,8 @@ use crate::replication::*;
use crate::schema::*;
const TABLE_GC_BATCH_SIZE: usize = 1024;
-const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
+// Same timeout as NEED_BLOCK_QUERY_TIMEOUT in block manager
+const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(15);
// GC delay for table entries: 1 day (24 hours)
// (the delay before the entry is added in the GC todo list
diff --git a/src/table/sync.rs b/src/table/sync.rs
index b3756a5e..62b88a58 100644
--- a/src/table/sync.rs
+++ b/src/table/sync.rs
@@ -24,7 +24,8 @@ use crate::merkle::*;
use crate::replication::*;
use crate::*;
-const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
+// Sync RPC can contain a lot of data, so have a 1min timeout
+const TABLE_SYNC_RPC_TIMEOUT: Duration = Duration::from_secs(60);
// Do anti-entropy every 10 minutes
const ANTI_ENTROPY_INTERVAL: Duration = Duration::from_secs(10 * 60);
diff --git a/src/table/table.rs b/src/table/table.rs
index 3c211728..51f3837f 100644
--- a/src/table/table.rs
+++ b/src/table/table.rs
@@ -31,7 +31,7 @@ use crate::schema::*;
use crate::sync::*;
use crate::util::*;
-pub const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(10);
+pub const TABLE_RPC_TIMEOUT: Duration = Duration::from_secs(30);
pub struct Table<F: TableSchema + 'static, R: TableReplication + 'static> {
pub system: Arc<System>,