Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
ce79f6c
fix: no reboot after GC
hanabi1224 Apr 1, 2026
f1a24de
Merge branch 'main' into hm/no-reboot-after-gc
hanabi1224 Apr 2, 2026
ee8f85b
max retries
hanabi1224 Apr 2, 2026
011f36a
Merge branch 'hm/no-reboot-after-gc' of github.com:ChainSafe/forest i…
hanabi1224 Apr 2, 2026
9b83119
resolve AI comments
hanabi1224 Apr 2, 2026
de2dfca
simplify ZstdFrameCache usage
hanabi1224 Apr 2, 2026
e694674
do proper cleanup
hanabi1224 Apr 2, 2026
568838e
refine the logic in gc_once
hanabi1224 Apr 2, 2026
374173f
fix
hanabi1224 Apr 2, 2026
974f40e
fix
hanabi1224 Apr 2, 2026
0f33f69
mitigate test timeout issue
hanabi1224 Apr 2, 2026
ae513c5
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 3, 2026
b3d5366
fix
hanabi1224 Apr 3, 2026
d72546c
Merge branch 'main' into hm/no-reboot-after-gc
hanabi1224 Apr 6, 2026
906356d
resolve comments
hanabi1224 Apr 6, 2026
ac7ef39
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 6, 2026
b0d5e06
fix
hanabi1224 Apr 6, 2026
e8a6070
tokio spawn blocking for reset_gc_columns
hanabi1224 Apr 7, 2026
ff816df
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 7, 2026
f6d8586
used cached state compute result
hanabi1224 Apr 8, 2026
a43093b
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 8, 2026
0ed0cc9
verify result state tree is loadable in ForestStateCompute
hanabi1224 Apr 8, 2026
188aa9d
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 8, 2026
5f9cff2
fix log
hanabi1224 Apr 9, 2026
f2faaff
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 9, 2026
0efe91f
Merge branch 'main' into hm/no-reboot-after-gc
hanabi1224 Apr 9, 2026
9f66d61
Merge branch 'main' into hm/no-reboot-after-gc
hanabi1224 Apr 10, 2026
46b0937
reset chain follower after GC
hanabi1224 Apr 13, 2026
3db595c
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 13, 2026
7e5ab8a
handle executed tipset handle for head reset
hanabi1224 Apr 13, 2026
60fa31f
Merge remote-tracking branch 'origin/main' into hm/no-reboot-after-gc
hanabi1224 Apr 13, 2026
4bb3998
Merge branch 'main' into hm/no-reboot-after-gc
hanabi1224 Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions docs/docs/users/guides/gc.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,8 @@ GC can be triggered manually with `forest-cli chain prune snap`, regardless of whethe
Garbage Collection (GC) runs on a regular schedule and follows these steps:

- Export an effective standard lite snapshot in `.forest.car.zst` format.
- Stop the node.
- Purge parity-db columns that serve as non-persistent blockstore.
- Purge old CAR database files.
- Restart the node.

This process keeps the system clean by regularly removing old, unused data.

Expand Down Expand Up @@ -67,7 +65,6 @@ While GC runs in the background, it can cause some delays or pauses, particularl

- **Syncing Pauses**: There may be brief interruptions in syncing as resources are allocated for the GC process.
- **Performance Overhead**: While relatively efficient, the chain traversal algorithm could slow down operations slightly.
- **Reboot pauses**: The GC stops the node before cleaning up parity-db and CAR snapshots and then restarts the node, which could take `~10s-~30s` on mainnet

## Disk Usage

Expand Down
2 changes: 1 addition & 1 deletion src/chain/store/chain_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ pub struct ChainStore<DB> {
genesis_block_header: CachingBlockHeader,

/// validated blocks
validated_blocks: Mutex<HashSet<Cid>>,
pub(crate) validated_blocks: Mutex<HashSet<Cid>>,

/// Ethereum mappings store
eth_mappings: Arc<dyn EthMappingsStore + Sync + Send>,
Expand Down
4 changes: 4 additions & 0 deletions src/chain_sync/bad_block_cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ impl BadBlockCache {
pub fn peek(&self, c: &Cid) -> Option<()> {
self.cache.peek_cloned(&(*c).into())
}

/// Removes every entry from the underlying bad-block cache,
/// allowing previously rejected blocks to be re-validated
/// (e.g. after a chain-follower reset following GC).
pub fn clear(&self) {
self.cache.clear()
}
}

/// Thread-safe LRU cache for tracking recently seen gossip block CIDs.
Expand Down
105 changes: 71 additions & 34 deletions src/chain_sync/chain_follower.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,17 @@ use tokio::{sync::Notify, task::JoinSet};
use tracing::{debug, error, info, trace, warn};

pub struct ChainFollower<DB> {
/// Tasks
tasks: Arc<Mutex<HashSet<SyncTask>>>,

/// State machine
state_machine: Arc<Mutex<SyncStateMachine<DB>>>,

/// Syncing status of the chain
pub sync_status: SyncStatus,

/// manages retrieving and updates state objects
state_manager: Arc<StateManager<DB>>,
pub state_manager: Arc<StateManager<DB>>,

/// Context to be able to send requests to P2P network
pub network: SyncNetworkContext<DB>,
Expand Down Expand Up @@ -93,17 +99,26 @@ impl<DB: Blockstore + Sync + Send + 'static> ChainFollower<DB> {
) -> Self {
crate::def_is_env_truthy!(cache_disabled, "FOREST_DISABLE_BAD_BLOCK_CACHE");
let (tipset_sender, tipset_receiver) = flume::bounded(20);
let tasks: Arc<Mutex<HashSet<SyncTask>>> = Arc::new(Mutex::new(HashSet::default()));
let bad_blocks = if cache_disabled() {
tracing::warn!("bad block cache is disabled by `FOREST_DISABLE_BAD_BLOCK_CACHE`");
None
} else {
Some(Default::default())
};
let state_machine = Arc::new(Mutex::new(SyncStateMachine::new(
state_manager.chain_store().clone(),
bad_blocks.clone(),
stateless_mode,
)));
Self {
tasks,
state_machine,
sync_status: Arc::new(RwLock::new(SyncStatusReport::init())),
state_manager,
network,
genesis,
bad_blocks: if cache_disabled() {
tracing::warn!("bad block cache is disabled by `FOREST_DISABLE_BAD_BLOCK_CACHE`");
None
} else {
Some(Default::default())
},
bad_blocks,
net_handler,
tipset_sender,
tipset_receiver,
Expand All @@ -112,16 +127,37 @@ impl<DB: Blockstore + Sync + Send + 'static> ChainFollower<DB> {
}
}

pub async fn run(self) -> anyhow::Result<()> {
/// Resets the chain follower's in-memory sync state so syncing can restart
/// cleanly without rebooting the node (used after garbage collection
/// invalidates previously cached data).
///
/// Clears, in order:
/// - the set of pending sync tasks,
/// - the chain store's set of already-validated block CIDs,
/// - the sync state machine's tracked tipsets,
/// - the bad-block cache, when one is enabled.
///
/// NOTE(review): assumes no sync task is concurrently relying on the cleared
/// entries — TODO confirm callers quiesce syncing around this call.
pub fn reset(&self) {
// Measure how long the reset takes for the log line below.
let start = Instant::now();
self.tasks.lock().clear();
// Forget which blocks were already validated; they will be re-validated
// on demand after the reset.
self.state_manager
.chain_store()
.validated_blocks
.lock()
.clear();
self.state_machine.lock().tipsets.clear();
// The bad-block cache is optional (it can be disabled via the
// FOREST_DISABLE_BAD_BLOCK_CACHE environment variable).
if let Some(bad_blocks) = &self.bad_blocks {
bad_blocks.clear();
}
tracing::info!(
"chain follower reset, took {}",
humantime::format_duration(start.elapsed())
);
}

pub async fn run(&self) -> anyhow::Result<()> {
chain_follower(
self.state_manager,
self.bad_blocks,
self.net_handler,
self.tipset_receiver,
self.network,
self.mem_pool,
self.sync_status,
self.genesis,
&self.tasks,
&self.state_machine,
&self.state_manager,
self.bad_blocks.clone(),
self.net_handler.clone(),
self.tipset_receiver.clone(),
&self.network,
&self.mem_pool,
&self.sync_status,
&self.genesis,
self.stateless_mode,
)
.await
Expand All @@ -130,24 +166,21 @@ impl<DB: Blockstore + Sync + Send + 'static> ChainFollower<DB> {

#[allow(clippy::too_many_arguments)]
// We receive new full tipsets from the p2p swarm, and from miners that use Forest as their frontend.
pub async fn chain_follower<DB: Blockstore + Sync + Send + 'static>(
state_manager: Arc<StateManager<DB>>,
async fn chain_follower<DB: Blockstore + Sync + Send + 'static>(
tasks: &Arc<Mutex<HashSet<SyncTask>>>,
state_machine: &Arc<Mutex<SyncStateMachine<DB>>>,
state_manager: &Arc<StateManager<DB>>,
bad_block_cache: Option<Arc<BadBlockCache>>,
network_rx: flume::Receiver<NetworkEvent>,
tipset_receiver: flume::Receiver<FullTipset>,
network: SyncNetworkContext<DB>,
mem_pool: Arc<MessagePool<Arc<ChainStore<DB>>>>,
sync_status: SyncStatus,
genesis: Tipset,
network: &SyncNetworkContext<DB>,
mem_pool: &Arc<MessagePool<Arc<ChainStore<DB>>>>,
sync_status: &SyncStatus,
genesis: &Tipset,
stateless_mode: bool,
) -> anyhow::Result<()> {
let state_changed = Arc::new(Notify::new());
let state_machine = Arc::new(Mutex::new(SyncStateMachine::new(
state_manager.chain_store().clone(),
bad_block_cache.clone(),
stateless_mode,
)));
let tasks: Arc<Mutex<HashSet<SyncTask>>> = Arc::new(Mutex::new(HashSet::default()));

let seen_block_cache = SeenBlockCache::default();

let mut set = JoinSet::new();
Expand All @@ -158,6 +191,8 @@ pub async fn chain_follower<DB: Blockstore + Sync + Send + 'static>(
let state_changed = state_changed.shallow_clone();
let state_machine = state_machine.shallow_clone();
let network = network.shallow_clone();
let mem_pool = mem_pool.shallow_clone();
let genesis = genesis.shallow_clone();
let bad_block_cache = bad_block_cache.shallow_clone();
let seen_block_cache = seen_block_cache.shallow_clone();
async move {
Expand Down Expand Up @@ -244,11 +279,13 @@ pub async fn chain_follower<DB: Blockstore + Sync + Send + 'static>(

// When the state machine is updated, we need to update the sync status and spawn tasks
set.spawn({
let state_manager = state_manager.clone();
let state_machine = state_machine.clone();
let state_changed = state_changed.clone();
let tasks = tasks.clone();
let bad_block_cache = bad_block_cache.clone();
let state_manager = state_manager.shallow_clone();
let state_machine = state_machine.shallow_clone();
let network = network.shallow_clone();
let sync_status = sync_status.shallow_clone();
let state_changed = state_changed.shallow_clone();
let tasks = tasks.shallow_clone();
let bad_block_cache = bad_block_cache.shallow_clone();
async move {
loop {
state_changed.notified().await;
Expand Down Expand Up @@ -726,7 +763,7 @@ impl<DB: Blockstore> SyncStateMachine<DB> {

fn mark_validated_tipset(&mut self, tipset: FullTipset, is_proposed_head: bool) {
if !self.is_parent_validated(&tipset) {
tracing::error!(epoch = %tipset.epoch(), tsk = %tipset.key(), "Tipset must be validated");
tracing::error!(epoch = %tipset.epoch(), tsk = %tipset.key(), parent_state = %tipset.parent_state(), "Parent tipset must be validated");
return;
}

Expand Down
15 changes: 11 additions & 4 deletions src/chain_sync/tipset_syncer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,9 @@ pub async fn validate_tipset<DB: Blockstore + Send + Sync + 'static>(
let timer = metrics::TIPSET_PROCESSING_TIME.start_timer();

let epoch = full_tipset.epoch();
let full_tipset_key = full_tipset.key().clone();
trace!("Tipset keys: {full_tipset_key}");
let parent_state = *full_tipset.parent_state();
let tipset_key = full_tipset.key();
trace!("Tipset keys: {tipset_key}");
let blocks = full_tipset.into_blocks();
let mut validations = JoinSet::new();
for b in blocks {
Expand All @@ -127,14 +128,20 @@ pub async fn validate_tipset<DB: Blockstore + Send + Sync + 'static>(
.add_to_tipset_tracker(block.header());
}
Err((cid, why)) => {
warn!("Validating block [CID = {cid}] in EPOCH = {epoch} failed: {why}");
warn!(
"Validating block [CID = {cid}, PARENT_STATE = {parent_state}] in EPOCH = {epoch} failed: {why}",
);
match &why {
TipsetSyncerError::TimeTravellingBlock(_, _) => {
// Do not mark a block as bad for temporary errors.
// See <https://github.com/filecoin-project/lotus/blob/v1.34.1/chain/sync.go#L602> in Lotus
}
_ => {
if let Some(bad_block_cache) = bad_block_cache {
// Do not mark block as bad if the parent state tree does not exist
if StateTree::new_from_root(state_manager.blockstore_owned(), &parent_state)
.is_ok()
&& let Some(bad_block_cache) = bad_block_cache
{
bad_block_cache.push(cid);
}
}
Expand Down
11 changes: 7 additions & 4 deletions src/daemon/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ use crate::daemon::asyncify;
use crate::daemon::bundle::load_actor_bundles;
use crate::daemon::db_util::load_all_forest_cars_with_cleanup;
use crate::db::car::ManyCar;
use crate::db::db_engine::{db_root, open_db};
use crate::db::parity_db::ParityDb;
use crate::db::db_engine::db_root;
use crate::db::parity_db::{GarbageCollectableParityDb, ParityDb};
use crate::db::{CAR_DB_DIR_NAME, DummyStore, EthMappingsStore};
use crate::genesis::read_genesis_header;
use crate::libp2p::{Keypair, PeerId};
Expand Down Expand Up @@ -178,7 +178,7 @@ fn maybe_migrate_db(config: &Config) {
}
}

pub type DbType = ManyCar<Arc<ParityDb>>;
pub type DbType = ManyCar<Arc<GarbageCollectableParityDb>>;

pub(crate) struct DbMetadata {
db_root_dir: PathBuf,
Expand All @@ -204,7 +204,10 @@ async fn setup_db(opts: &CliOpts, config: &Config) -> anyhow::Result<(Arc<DbType
maybe_migrate_db(config);
let chain_data_path = chain_path(config);
let db_root_dir = db_root(&chain_data_path)?;
let db_writer = Arc::new(open_db(db_root_dir.clone(), config.db_config())?);
let db_writer = Arc::new(GarbageCollectableParityDb::new(ParityDb::to_options(
db_root_dir.clone(),
config.db_config(),
))?);
let db = Arc::new(ManyCar::new(db_writer.clone()));
let forest_car_db_dir = db_root_dir.join(CAR_DB_DIR_NAME);
load_all_forest_cars_with_cleanup(&db, &forest_car_db_dir)?;
Expand Down
Loading
Loading