The super large grace period of 1 day has proved to be harmful on

fulmicoton · guilload · commit 00b22bf83f52 · 2024-06-17T11:36:34.000-04:00
Cicada.

This PR lowers it to 2h.
As a reminder, once a node is detected as dead,
it enters a zombie state for 1h.

During that first hour, we still share its KVs.

From timeofdeath+1h to timeofdeath+2h, we won't share the node.

After 2h, we will delete the node from the state.
diff --git a/quickwit/quickwit-cluster/src/lib.rs b/quickwit/quickwit-cluster/src/lib.rs
@@ -28,6 +28,7 @@ mod metrics;
 mod node;
 
 use std::net::SocketAddr;
+use std::time::Duration;
 
 use async_trait::async_trait;
 pub use chitchat::transport::ChannelTransport;
@@ -147,13 +148,17 @@ pub async fn start_cluster_service(node_config: &NodeConfig) -> anyhow::Result<C
         indexing_tasks,
         indexing_cpu_capacity,
     };
+    let failure_detector_config = FailureDetectorConfig {
+        dead_node_grace_period: Duration::from_secs(2 * 60 * 60), // 2 hours
+        ..Default::default()
+    };
     let cluster = Cluster::join(
         cluster_id,
         self_node,
         gossip_listen_addr,
         peer_seed_addrs,
         node_config.gossip_interval,
-        FailureDetectorConfig::default(),
+        failure_detector_config,
         &CountingUdpTransport,
     )
     .await?;