Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
ce79f6c
fix: no reboot after GC
hanabi1224 Apr 1, 2026
f1a24de
Merge branch 'main' into hm/no-reboot-after-gc
hanabi1224 Apr 2, 2026
ee8f85b
max retries
hanabi1224 Apr 2, 2026
011f36a
Merge branch 'hm/no-reboot-after-gc' of github.com:ChainSafe/forest i…
hanabi1224 Apr 2, 2026
9b83119
resolve AI comments
hanabi1224 Apr 2, 2026
de2dfca
simplify ZstdFrameCache usage
hanabi1224 Apr 2, 2026
e694674
do proper cleanup
hanabi1224 Apr 2, 2026
568838e
refine the logic in gc_once
hanabi1224 Apr 2, 2026
374173f
fix
hanabi1224 Apr 2, 2026
974f40e
fix
hanabi1224 Apr 2, 2026
0f33f69
mitigate test timeout issue
hanabi1224 Apr 2, 2026
ae513c5
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 3, 2026
b3d5366
fix
hanabi1224 Apr 3, 2026
d72546c
Merge branch 'main' into hm/no-reboot-after-gc
hanabi1224 Apr 6, 2026
906356d
resolve comments
hanabi1224 Apr 6, 2026
ac7ef39
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 6, 2026
b0d5e06
fix
hanabi1224 Apr 6, 2026
e8a6070
tokio spawn blocking for reset_gc_columns
hanabi1224 Apr 7, 2026
ff816df
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 7, 2026
f6d8586
used cached state compute result
hanabi1224 Apr 8, 2026
a43093b
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 8, 2026
0ed0cc9
verify result state tree is loadable in ForestStateCompute
hanabi1224 Apr 8, 2026
188aa9d
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 8, 2026
5f9cff2
fix log
hanabi1224 Apr 9, 2026
f2faaff
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 9, 2026
0efe91f
Merge branch 'main' into hm/no-reboot-after-gc
hanabi1224 Apr 9, 2026
9f66d61
Merge branch 'main' into hm/no-reboot-after-gc
hanabi1224 Apr 10, 2026
46b0937
reset chain follower after GC
hanabi1224 Apr 13, 2026
3db595c
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 13, 2026
7e5ab8a
handle executed tipset handle for head reset
hanabi1224 Apr 13, 2026
60fa31f
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 13, 2026
4bb3998
Merge branch 'main' into hm/no-reboot-after-gc
hanabi1224 Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions docs/docs/users/guides/gc.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,8 @@ GC can be triggered manually with `forest-cli chain prune snap`, regardless of whethe
Garbage Collection (GC) runs on a regular schedule and follows these steps:

- Export an effective standard lite snapshot in `.forest.car.zst` format.
- Stop the node.
- Purge parity-db columns that serve as non-persistent blockstore.
- Purge old CAR database files.
- Restart the node.

This process keeps the system clean by regularly removing old, unused data.

Expand Down Expand Up @@ -67,7 +65,6 @@ While GC runs in the background, it can cause some delays or pauses, particularl

- **Syncing Pauses**: There may be brief interruptions in syncing as resources are allocated for the GC process.
- **Performance Overhead**: While relatively efficient, the chain traversal algorithm could slow down operations slightly.
- **Reboot pauses**: The GC stops the node before cleaning up parity-db and CAR snapshots and then restarts the node, which could take `~10s-~30s` on mainnet

## Disk Usage

Expand Down
2 changes: 1 addition & 1 deletion src/chain/store/chain_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ pub struct ChainStore<DB> {
genesis_block_header: CachingBlockHeader,

/// validated blocks
validated_blocks: Mutex<HashSet<Cid>>,
pub(crate) validated_blocks: Mutex<HashSet<Cid>>,

/// Ethereum mappings store
eth_mappings: Arc<dyn EthMappingsStore + Sync + Send>,
Expand Down
4 changes: 4 additions & 0 deletions src/chain_sync/bad_block_cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ impl BadBlockCache {
pub fn peek(&self, c: &Cid) -> Option<()> {
self.cache.peek_cloned(&(*c).into())
}

/// Removes every entry from the underlying bad-block cache,
/// allowing previously rejected blocks to be re-validated
/// (e.g. after a chain-follower reset following GC).
pub fn clear(&self) {
self.cache.clear()
}
}

/// Thread-safe LRU cache for tracking recently seen gossip block CIDs.
Expand Down
105 changes: 71 additions & 34 deletions src/chain_sync/chain_follower.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,17 @@ use tokio::{sync::Notify, task::JoinSet};
use tracing::{debug, error, info, trace, warn};

pub struct ChainFollower<DB> {
/// Tasks
tasks: Arc<Mutex<HashSet<SyncTask>>>,

/// State machine
state_machine: Arc<Mutex<SyncStateMachine<DB>>>,

/// Syncing status of the chain
pub sync_status: SyncStatus,

/// manages retrieving and updates state objects
state_manager: Arc<StateManager<DB>>,
pub state_manager: Arc<StateManager<DB>>,

/// Context to be able to send requests to P2P network
pub network: SyncNetworkContext<DB>,
Expand Down Expand Up @@ -93,17 +99,26 @@ impl<DB: Blockstore + Sync + Send + 'static> ChainFollower<DB> {
) -> Self {
crate::def_is_env_truthy!(cache_disabled, "FOREST_DISABLE_BAD_BLOCK_CACHE");
let (tipset_sender, tipset_receiver) = flume::bounded(20);
let tasks: Arc<Mutex<HashSet<SyncTask>>> = Arc::new(Mutex::new(HashSet::default()));
let bad_blocks = if cache_disabled() {
tracing::warn!("bad block cache is disabled by `FOREST_DISABLE_BAD_BLOCK_CACHE`");
None
} else {
Some(Default::default())
};
let state_machine = Arc::new(Mutex::new(SyncStateMachine::new(
state_manager.chain_store().clone(),
bad_blocks.clone(),
stateless_mode,
)));
Self {
tasks,
state_machine,
sync_status: Arc::new(RwLock::new(SyncStatusReport::init())),
state_manager,
network,
genesis,
bad_blocks: if cache_disabled() {
tracing::warn!("bad block cache is disabled by `FOREST_DISABLE_BAD_BLOCK_CACHE`");
None
} else {
Some(Default::default())
},
bad_blocks,
net_handler,
tipset_sender,
tipset_receiver,
Expand All @@ -112,16 +127,37 @@ impl<DB: Blockstore + Sync + Send + 'static> ChainFollower<DB> {
}
}

pub async fn run(self) -> anyhow::Result<()> {
/// Resets the chain follower's in-memory sync state so syncing can restart
/// cleanly without rebooting the node (used after garbage collection
/// invalidates previously cached data).
///
/// Clears, in order:
/// - the set of pending sync tasks,
/// - the chain store's set of already-validated block CIDs,
/// - the sync state machine's tracked tipsets,
/// - the bad-block cache, when one is enabled.
///
/// NOTE(review): assumes no sync task is concurrently relying on the cleared
/// entries — TODO confirm callers quiesce syncing around this call.
pub fn reset(&self) {
// Measure how long the reset takes for the log line below.
let start = Instant::now();
self.tasks.lock().clear();
// Forget which blocks were already validated; they will be re-validated
// on demand after the reset.
self.state_manager
.chain_store()
.validated_blocks
.lock()
.clear();
self.state_machine.lock().tipsets.clear();
// The bad-block cache is optional (it can be disabled via the
// FOREST_DISABLE_BAD_BLOCK_CACHE environment variable).
if let Some(bad_blocks) = &self.bad_blocks {
bad_blocks.clear();
}
tracing::info!(
"chain follower reset, took {}",
humantime::format_duration(start.elapsed())
);
}

pub async fn run(&self) -> anyhow::Result<()> {
chain_follower(
self.state_manager,
self.bad_blocks,
self.net_handler,
self.tipset_receiver,
self.network,
self.mem_pool,
self.sync_status,
self.genesis,
&self.tasks,
&self.state_machine,
&self.state_manager,
self.bad_blocks.clone(),
self.net_handler.clone(),
self.tipset_receiver.clone(),
&self.network,
&self.mem_pool,
&self.sync_status,
&self.genesis,
self.stateless_mode,
)
.await
Expand All @@ -130,24 +166,21 @@ impl<DB: Blockstore + Sync + Send + 'static> ChainFollower<DB> {

#[allow(clippy::too_many_arguments)]
// We receive new full tipsets from the p2p swarm, and from miners that use Forest as their frontend.
pub async fn chain_follower<DB: Blockstore + Sync + Send + 'static>(
state_manager: Arc<StateManager<DB>>,
async fn chain_follower<DB: Blockstore + Sync + Send + 'static>(
tasks: &Arc<Mutex<HashSet<SyncTask>>>,
state_machine: &Arc<Mutex<SyncStateMachine<DB>>>,
state_manager: &Arc<StateManager<DB>>,
bad_block_cache: Option<Arc<BadBlockCache>>,
network_rx: flume::Receiver<NetworkEvent>,
tipset_receiver: flume::Receiver<FullTipset>,
network: SyncNetworkContext<DB>,
mem_pool: Arc<MessagePool<Arc<ChainStore<DB>>>>,
sync_status: SyncStatus,
genesis: Tipset,
network: &SyncNetworkContext<DB>,
mem_pool: &Arc<MessagePool<Arc<ChainStore<DB>>>>,
sync_status: &SyncStatus,
genesis: &Tipset,
stateless_mode: bool,
) -> anyhow::Result<()> {
let state_changed = Arc::new(Notify::new());
let state_machine = Arc::new(Mutex::new(SyncStateMachine::new(
state_manager.chain_store().clone(),
bad_block_cache.clone(),
stateless_mode,
)));
let tasks: Arc<Mutex<HashSet<SyncTask>>> = Arc::new(Mutex::new(HashSet::default()));

let seen_block_cache = SeenBlockCache::default();

let mut set = JoinSet::new();
Expand All @@ -158,6 +191,8 @@ pub async fn chain_follower<DB: Blockstore + Sync + Send + 'static>(
let state_changed = state_changed.shallow_clone();
let state_machine = state_machine.shallow_clone();
let network = network.shallow_clone();
let mem_pool = mem_pool.shallow_clone();
let genesis = genesis.shallow_clone();
let bad_block_cache = bad_block_cache.shallow_clone();
let seen_block_cache = seen_block_cache.shallow_clone();
async move {
Expand Down Expand Up @@ -244,11 +279,13 @@ pub async fn chain_follower<DB: Blockstore + Sync + Send + 'static>(

// When the state machine is updated, we need to update the sync status and spawn tasks
set.spawn({
let state_manager = state_manager.clone();
let state_machine = state_machine.clone();
let state_changed = state_changed.clone();
let tasks = tasks.clone();
let bad_block_cache = bad_block_cache.clone();
let state_manager = state_manager.shallow_clone();
let state_machine = state_machine.shallow_clone();
let network = network.shallow_clone();
let sync_status = sync_status.shallow_clone();
let state_changed = state_changed.shallow_clone();
let tasks = tasks.shallow_clone();
let bad_block_cache = bad_block_cache.shallow_clone();
async move {
loop {
state_changed.notified().await;
Expand Down Expand Up @@ -726,7 +763,7 @@ impl<DB: Blockstore> SyncStateMachine<DB> {

fn mark_validated_tipset(&mut self, tipset: FullTipset, is_proposed_head: bool) {
if !self.is_parent_validated(&tipset) {
tracing::error!(epoch = %tipset.epoch(), tsk = %tipset.key(), "Tipset must be validated");
tracing::error!(epoch = %tipset.epoch(), tsk = %tipset.key(), parent_state = %tipset.parent_state(), "Parent tipset must be validated");
return;
}

Expand Down
15 changes: 11 additions & 4 deletions src/chain_sync/tipset_syncer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,9 @@ pub async fn validate_tipset<DB: Blockstore + Send + Sync + 'static>(
let timer = metrics::TIPSET_PROCESSING_TIME.start_timer();

let epoch = full_tipset.epoch();
let full_tipset_key = full_tipset.key().clone();
trace!("Tipset keys: {full_tipset_key}");
let parent_state = *full_tipset.parent_state();
let tipset_key = full_tipset.key();
trace!("Tipset keys: {tipset_key}");
let blocks = full_tipset.into_blocks();
let mut validations = JoinSet::new();
for b in blocks {
Expand All @@ -127,14 +128,20 @@ pub async fn validate_tipset<DB: Blockstore + Send + Sync + 'static>(
.add_to_tipset_tracker(block.header());
}
Err((cid, why)) => {
warn!("Validating block [CID = {cid}] in EPOCH = {epoch} failed: {why}");
warn!(
"Validating block [CID = {cid}, PARENT_STATE = {parent_state}] in EPOCH = {epoch} failed: {why}",
);
match &why {
TipsetSyncerError::TimeTravellingBlock(_, _) => {
// Do not mark a block as bad for temporary errors.
// See <https://github.com/filecoin-project/lotus/blob/v1.34.1/chain/sync.go#L602> in Lotus
}
_ => {
if let Some(bad_block_cache) = bad_block_cache {
// Do not mark block as bad if the parent state tree does not exist
if StateTree::new_from_root(state_manager.blockstore_owned(), &parent_state)
.is_ok()
&& let Some(bad_block_cache) = bad_block_cache
{
bad_block_cache.push(cid);
}
}
Expand Down
11 changes: 7 additions & 4 deletions src/daemon/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ use crate::daemon::asyncify;
use crate::daemon::bundle::load_actor_bundles;
use crate::daemon::db_util::load_all_forest_cars_with_cleanup;
use crate::db::car::ManyCar;
use crate::db::db_engine::{db_root, open_db};
use crate::db::parity_db::ParityDb;
use crate::db::db_engine::db_root;
use crate::db::parity_db::{GarbageCollectableParityDb, ParityDb};
use crate::db::{CAR_DB_DIR_NAME, DummyStore, EthMappingsStore};
use crate::genesis::read_genesis_header;
use crate::libp2p::{Keypair, PeerId};
Expand Down Expand Up @@ -178,7 +178,7 @@ fn maybe_migrate_db(config: &Config) {
}
}

pub type DbType = ManyCar<Arc<ParityDb>>;
pub type DbType = ManyCar<Arc<GarbageCollectableParityDb>>;

pub(crate) struct DbMetadata {
db_root_dir: PathBuf,
Expand All @@ -204,7 +204,10 @@ async fn setup_db(opts: &CliOpts, config: &Config) -> anyhow::Result<(Arc<DbType
maybe_migrate_db(config);
let chain_data_path = chain_path(config);
let db_root_dir = db_root(&chain_data_path)?;
let db_writer = Arc::new(open_db(db_root_dir.clone(), config.db_config())?);
let db_writer = Arc::new(GarbageCollectableParityDb::new(ParityDb::to_options(
db_root_dir.clone(),
config.db_config(),
))?);
let db = Arc::new(ManyCar::new(db_writer.clone()));
let forest_car_db_dir = db_root_dir.join(CAR_DB_DIR_NAME);
load_all_forest_cars_with_cleanup(&db, &forest_car_db_dir)?;
Expand Down
Loading
Loading