Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 6 additions & 4 deletions src/daemon/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ pub mod main;
use crate::blocks::Tipset;
use crate::chain::HeadChange;
use crate::chain::index::ResolveNullTipset;
use crate::chain_sync::ChainFollower;
use crate::chain_sync::network_context::SyncNetworkContext;
use crate::chain_sync::{ChainFollower, SyncStatusReport};
use crate::cli_shared::snapshot;
use crate::cli_shared::{
chain_path,
Expand All @@ -36,6 +36,7 @@ use crate::utils::{proofs_api::ensure_proof_params_downloaded, version::FOREST_V
use anyhow::{Context as _, bail};
use dialoguer::theme::ColorfulTheme;
use futures::{Future, FutureExt, select};
use parking_lot::RwLock;
use std::path::Path;
use std::sync::Arc;
use std::sync::OnceLock;
Expand Down Expand Up @@ -561,8 +562,9 @@ pub(super) async fn start(
_ = snap_gc_reboot_rx.recv_async() => {
snap_gc.cleanup_before_reboot().await;
}
result = start_services(start_time, &opts, config.clone(), shutdown_send.clone(), |ctx| {
result = start_services(start_time, &opts, config.clone(), shutdown_send.clone(), |ctx, sync_status| {
snap_gc.set_db(ctx.db.clone());
snap_gc.set_sync_status(sync_status);
snap_gc.set_car_db_head_epoch(ctx.db.heaviest_tipset().map(|ts|ts.epoch()).unwrap_or_default());
}) => {
break result
Expand All @@ -576,7 +578,7 @@ pub(super) async fn start_services(
opts: &CliOpts,
mut config: Config,
shutdown_send: mpsc::Sender<()>,
on_app_context_and_db_initialized: impl Fn(&AppContext),
on_app_context_and_db_initialized: impl FnOnce(&AppContext, Arc<RwLock<SyncStatusReport>>),
) -> anyhow::Result<()> {
// Cleanup the collector prometheus metrics registry on start
crate::metrics::reset_collector_registry();
Expand Down Expand Up @@ -608,7 +610,7 @@ pub(super) async fn start_services(
services.shutdown().await;
return Ok(());
}
on_app_context_and_db_initialized(&ctx);
on_app_context_and_db_initialized(&ctx, chain_follower.sync_status.clone());
warmup_in_background(&ctx);
ctx.state_manager.populate_cache();
maybe_start_metrics_service(&mut services, &config, &ctx).await?;
Expand Down
32 changes: 24 additions & 8 deletions src/db/gc/snapshot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ pub struct SnapshotGarbageCollector<DB> {
running: AtomicBool,
blessed_lite_snapshot: RwLock<Option<PathBuf>>,
db: RwLock<Option<Arc<DB>>>,
sync_status: RwLock<Option<Arc<RwLock<crate::chain_sync::SyncStatusReport>>>>,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think this will be better here?

RwLock<Option<Arc<RwLock<crate::chain_sync::SyncStatusReport>>>> -> OnceLock<Arc<RwLock<crate::chain_sync::SyncStatusReport>>>

This way we can avoid doing: &*sync_status.read()

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's set more than once after every GC

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I missed this GC restarts the node.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need two RwLocks here?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we simplify this type?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@elmattic simplified type by adding pub type SyncStatus = Arc<RwLock<SyncStatusReport>>;

// On mainnet, it takes ~50MiB-200MiB RAM, depending on the time cost of snapshot export
memory_db: RwLock<Option<HashMap<Cid, Vec<u8>>>>,
memory_db_head_key: RwLock<Option<TipsetKey>>,
Expand Down Expand Up @@ -111,6 +112,7 @@ where
running: AtomicBool::new(false),
blessed_lite_snapshot: RwLock::new(None),
db: RwLock::new(None),
sync_status: RwLock::new(None),
memory_db: RwLock::new(None),
memory_db_head_key: RwLock::new(None),
exported_head_key: RwLock::new(None),
Expand All @@ -132,13 +134,18 @@ where
*self.car_db_head_epoch.write() = Some(epoch);
}

pub fn set_sync_status(&self, sync_status: Arc<RwLock<crate::chain_sync::SyncStatusReport>>) {
*self.sync_status.write() = Some(sync_status)
}

pub async fn event_loop(&self) {
while self.trigger_rx.recv_async().await.is_ok() {
if self.running.load(Ordering::Relaxed) {
Comment thread
akaladarshi marked this conversation as resolved.
tracing::warn!("snap gc has already been running");
} else {
self.running.store(true, Ordering::Relaxed);
if let Err(e) = self.export_snapshot().await {
self.running.store(false, Ordering::Relaxed);
tracing::warn!("{e}");
}
}
Expand Down Expand Up @@ -170,18 +177,22 @@ where
);
loop {
if !self.running.load(Ordering::Relaxed)
&& let Some(db) = &*self.db.read()
&& let Some(car_db_head_epoch) = *self.car_db_head_epoch.read()
&& let Ok(head_key) = HeaviestTipsetKeyProvider::heaviest_tipset_key(db)
&& let Ok(head) = Tipset::load_required(db, &head_key)
&& let Some(sync_status) = &*self.sync_status.read()
{
let head_epoch = head.epoch();
if head_epoch - car_db_head_epoch >= snap_gc_interval_epochs
let sync_status = &*sync_status.read();
let network_head_epoch = sync_status.network_head_epoch;
let head_epoch = sync_status.current_head_epoch;
if head_epoch > 0
Copy link
Copy Markdown
Contributor

@elmattic elmattic Oct 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: this test condition is quite a beast, maybe it's time to break it down into functions and add some comments.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added comment to each condition

&& head_epoch <= network_head_epoch
&& sync_status.is_synced()
&& sync_status.active_forks.is_empty()
&& head_epoch - car_db_head_epoch >= snap_gc_interval_epochs
&& self.trigger_tx.try_send(()).is_ok()
{
tracing::info!(%car_db_head_epoch, %head_epoch, %snap_gc_interval_epochs, "Snap GC scheduled");
tracing::info!(%car_db_head_epoch, %head_epoch, %network_head_epoch, %snap_gc_interval_epochs, "Snap GC scheduled");
} else {
tracing::trace!(%car_db_head_epoch, %head_epoch, %snap_gc_interval_epochs, "Snap GC not scheduled");
tracing::debug!(%car_db_head_epoch, %head_epoch, %network_head_epoch, %snap_gc_interval_epochs, "Snap GC not scheduled");
}
}
tokio::time::sleep(snap_gc_check_interval).await;
Expand Down Expand Up @@ -219,6 +230,7 @@ where
}
map
});
let start = Instant::now();
let (head_ts, _) = crate::chain::export_from_head::<Sha256>(
&db,
self.recent_state_roots,
Expand All @@ -235,7 +247,11 @@ where
head_ts.epoch()
));
temp_path.persist(&target_path)?;
tracing::info!("exported lite snapshot at {}", target_path.display());
tracing::info!(
"exported lite snapshot at {}, took {}",
target_path.display(),
humantime::format_duration(start.elapsed())
);
*self.blessed_lite_snapshot.write() = Some(target_path);
*self.exported_head_key.write() = Some(head_ts.key().clone());

Expand Down
Loading