diff --git a/CHANGELOG.md b/CHANGELOG.md index fdcc6fd10e01..362fe7e807d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,8 @@ - [#6613](https://github.com/ChainSafe/forest/pull/6613): Fixed chain sync getting stuck when encountering time-travelling blocks by not marking the corresponding tipsets as permanently bad. +- [#6594](https://github.com/ChainSafe/forest/issues/6594): Added a random GC delay to prevent a cluster of nodes from running GC and rebooting RPC services at the same time. + ## Forest v0.32.1 "Malfoy" This is a non-mandatory release for all node operators. It sets F3 initial power table on calibnet for late F3 participation and F3 data verification scenarios. It also includes new V2 RPC methods, a few bug fixes and `lotus-gateway` compatibility fixes. diff --git a/docs/docs/users/guides/gc.md b/docs/docs/users/guides/gc.md index 7000b975d9bc..b15734cd27be 100644 --- a/docs/docs/users/guides/gc.md +++ b/docs/docs/users/guides/gc.md @@ -5,7 +5,8 @@ sidebar_position: 5 ### Enabling/Disabling Automatic Garbage Collection -By default, automatic garbage collection is enabled in Forest to ensure that unnecessary data is regularly cleared out, optimizing disk usage and performance. By default, it runs every 7 days (20160 epochs). The interval can be overridden by setting environment variable `FOREST_SNAPSHOT_GC_INTERVAL_EPOCHS` +By default, automatic garbage collection is enabled in Forest to ensure that unnecessary data is regularly cleared out, optimizing disk usage and performance. The default GC interval is 20160 epochs (7 days). The interval can be overridden by setting the environment variable `FOREST_SNAPSHOT_GC_INTERVAL_EPOCHS`. +Note that an extra small random delay is added to the GC interval on every GC cycle to prevent a cluster of nodes from running GC and rebooting RPC services at the same time. 
If you want to disable the automatic GC, for example, while testing new features or running performance benchmarks where GC may cause unnecessary overhead, you can do so by starting the Forest daemon with the `--no-gc` flag. diff --git a/src/db/gc/snapshot.rs b/src/db/gc/snapshot.rs index 099c7410a863..5c0dd60b830e 100644 --- a/src/db/gc/snapshot.rs +++ b/src/db/gc/snapshot.rs @@ -57,6 +57,7 @@ use anyhow::Context as _; use cid::Cid; use fvm_ipld_blockstore::Blockstore; use parking_lot::RwLock; +use rand::Rng as _; use sha2::Sha256; use std::path::PathBuf; use std::sync::{ @@ -196,16 +197,21 @@ where let sync_status = &*sync_status.read(); let network_head_epoch = sync_status.network_head_epoch; let head_epoch = sync_status.current_head_epoch; + // Add some random delay to the GC interval to prevent a cluster of nodes from running GC and rebooting RPC services at the same time. + // This will no longer be needed once is implemented. + // 0..=30 epochs is 0-15min on mainnet and calibnet. + let gc_interval_random_delay_epochs = crate::utils::rand::forest_rng() + .gen_range(0..=30.min(snap_gc_interval_epochs / 5)); if head_epoch > 0 // sync_status has been initialized && head_epoch <= network_head_epoch // head epoch is within a sane range && sync_status.is_synced() // chain is in sync && sync_status.active_forks.is_empty() // no active fork - && head_epoch - car_db_head_epoch >= snap_gc_interval_epochs // the gap between chain head and car_db head is above threshold + && head_epoch - car_db_head_epoch >= snap_gc_interval_epochs + gc_interval_random_delay_epochs // the gap between chain head and car_db head is above threshold && self.trigger_tx.try_send(()).is_ok() { - tracing::info!(%car_db_head_epoch, %head_epoch, %network_head_epoch, %snap_gc_interval_epochs, "Snap GC scheduled"); + tracing::info!(%car_db_head_epoch, %head_epoch, %network_head_epoch, %snap_gc_interval_epochs, %gc_interval_random_delay_epochs, "Snap GC scheduled"); } else { - tracing::debug!(%car_db_head_epoch, 
%head_epoch, %network_head_epoch, %snap_gc_interval_epochs, "Snap GC not scheduled"); + tracing::debug!(%car_db_head_epoch, %head_epoch, %network_head_epoch, %snap_gc_interval_epochs, %gc_interval_random_delay_epochs, "Snap GC not scheduled"); } } tokio::time::sleep(snap_gc_check_interval).await;