-
Notifications
You must be signed in to change notification settings - Fork 206
fix(l1): clearer sync diagnostics during catch-up #6733
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
665a577
fbb285b
d4d1d86
52bc365
487e1d0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,7 +5,7 @@ | |
|
|
||
| use std::cmp::min; | ||
| use std::sync::Arc; | ||
| use std::time::Duration; | ||
| use std::time::{Duration, SystemTime, UNIX_EPOCH}; | ||
|
|
||
| use ethrex_blockchain::{ | ||
| BatchBlockProcessingFailure, Blockchain, | ||
|
|
@@ -25,6 +25,31 @@ use crate::snap::constants::{MAX_BLOCK_BODIES_TO_REQUEST, MAX_HEADER_FETCH_ATTEM | |
|
|
||
| use super::{EXECUTE_BATCH_SIZE, SyncError}; | ||
|
|
||
| /// Forkchoice heads older than this (in seconds) trigger a "consensus is behind" | ||
| /// warning during sync. A synced consensus client always advertises a head only | ||
| /// a few seconds old, so a large age means the consensus client itself is lagging | ||
| /// chain head and is the sync bottleneck. | ||
| const STALE_FORKCHOICE_HEAD_SECS: u64 = 1800; | ||
|
|
||
| /// Distance (in blocks) below which the node is considered to be following head. | ||
| /// Below this we suppress the per-cycle sync-target logging to avoid noise on an | ||
| /// already-synced node, which runs a sync cycle on every slot. | ||
| const FOLLOW_DISTANCE: u64 = 8; | ||
|
|
||
| /// Render a duration in seconds as a compact human string, e.g. "13d 4h". | ||
| fn humanize_secs(secs: u64) -> String { | ||
| let days = secs / 86_400; | ||
| let hours = (secs % 86_400) / 3_600; | ||
| let mins = (secs % 3_600) / 60; | ||
| if days > 0 { | ||
| format!("{days}d {hours}h") | ||
| } else if hours > 0 { | ||
| format!("{hours}h {mins}m") | ||
| } else { | ||
| format!("{mins}m") | ||
| } | ||
| } | ||
|
|
||
| /// Performs full sync cycle - fetches and executes all blocks between current head and sync head | ||
| /// | ||
| /// # Returns | ||
|
|
@@ -60,6 +85,12 @@ pub async fn sync_cycle_full( | |
|
|
||
| let mut attempts = 0; | ||
|
|
||
| // Tracks whether this cycle started meaningfully behind the consensus-provided | ||
| // head, so we can log progress and a final "caught up" message without spamming | ||
| // a synced node. Set on the first batch of headers we fetch. | ||
| let mut started_behind = false; | ||
| let mut sync_target_logged = false; | ||
|
|
||
| // Request and store all block headers from the advertised sync head | ||
| loop { | ||
| let Some(mut block_headers) = peers | ||
|
|
@@ -92,6 +123,35 @@ pub async fn sync_cycle_full( | |
| first_header.number, | ||
| last_header.number, | ||
| ); | ||
|
|
||
| // On the first batch, report the distance to the consensus-provided head and | ||
| // warn if that head is stale (a strong signal the consensus client is behind). | ||
| if !sync_target_logged { | ||
| sync_target_logged = true; | ||
| let target = first_header.number; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: For a log line that's fine — it errs toward under-warning, not false alarms. But if you want the distance to reflect the actual FCU head (matching the body's claim), resolve |
||
| let target_ts = first_header.timestamp; | ||
| let local_head = store.get_latest_block_number().await?; | ||
| let behind = target.saturating_sub(local_head); | ||
| if behind > FOLLOW_DISTANCE { | ||
| started_behind = true; | ||
| info!( | ||
| "Sync target from consensus forkchoice: block {target} ({behind} blocks ahead of local head {local_head})" | ||
| ); | ||
| let now = SystemTime::now() | ||
| .duration_since(UNIX_EPOCH) | ||
| .unwrap_or_default() | ||
| .as_secs(); | ||
| let age = now.saturating_sub(target_ts); | ||
| if age > STALE_FORKCHOICE_HEAD_SECS { | ||
| warn!( | ||
| "Consensus forkchoice head (block {target}) is {} old. This can happen while the consensus client is still catching up to chain head; \ | ||
| if so, execution will only advance as fast as it does. If sync seems slow, it may be worth checking the consensus client's sync status.", | ||
| humanize_secs(age) | ||
| ); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| end_block_number = end_block_number.max(first_header.number); | ||
| start_block_number = last_header.number; | ||
|
|
||
|
|
@@ -266,6 +326,15 @@ pub async fn sync_cycle_full( | |
| .await?; | ||
| } | ||
|
|
||
| // If this cycle started behind, announce that we've caught up to the head the | ||
| // consensus client gave us, so the operator can tell idle-waiting from a hang. | ||
| if started_behind { | ||
| let local_head = store.get_latest_block_number().await?; | ||
| info!( | ||
| "Reached consensus-provided head at block {local_head}. Waiting for the next forkchoice update from the consensus client." | ||
| ); | ||
| } | ||
|
|
||
| store.clear_fullsync_headers().await?; | ||
| Ok(()) | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -58,13 +58,22 @@ impl RpcHandler for Syncing { | |
| if context.blockchain.is_synced() { | ||
| Ok(Value::Bool(!context.blockchain.is_synced())) | ||
| } else { | ||
| let current_block = context.storage.get_latest_block_number().await?; | ||
| // `get_last_fcu_head` returns the head *hash* from the last forkchoiceUpdated. | ||
| // Resolve it to a block number; if the head's header isn't stored yet (e.g. | ||
| // mid snap-sync), fall back to the current block instead of reporting garbage. | ||
| let head_hash = syncer | ||
| .get_last_fcu_head() | ||
| .map_err(|error| RpcErr::Internal(error.to_string()))?; | ||
| let highest_block = context | ||
| .storage | ||
| .get_block_number(head_hash) | ||
| .await? | ||
| .unwrap_or(current_block); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
When the FCU head hash isn't in storage yet (e.g. during snap-sync), Prompt To Fix With AIThis is a comment left during a code review.
Path: crates/networking/rpc/eth/client.rs
Line: 68-72
Comment:
**`highest_block` falls back to `current_block`, hiding sync progress from consumers**
When the FCU head hash isn't in storage yet (e.g. during snap-sync), `get_block_number` returns `None` and `highest_block` is set to `current_block`. The `eth_syncing` response then shows `highestBlock == currentBlock` while the node is still actively syncing. Monitoring tools and dashboards that use the `highestBlock − currentBlock` gap to render a progress bar will show 0 % remaining and may treat the node as caught-up. Returning `0` (or any sentinel that is clearly ≤ `current_block`) would at least not falsely indicate full sync, though the PR description explicitly acknowledges this as an intentional trade-off over the previous garbage value.
How can I resolve this? If you propose a fix, please make it concise.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The PR description says this resolves the head "via storage, then the pending-block store, falling back to That middle tier is exactly the interesting case: a forkchoice head that's been received but not yet executed (the mid-snap-sync scenario this PR is about) isn't in Two options:
Non-blocking, but worth closing the gap since accurate |
||
| let syncing_status = SyncingStatusRpc { | ||
| starting_block: context.storage.get_earliest_block_number().await?, | ||
| current_block: context.storage.get_latest_block_number().await?, | ||
| highest_block: syncer | ||
| .get_last_fcu_head() | ||
| .map_err(|error| RpcErr::Internal(error.to_string()))? | ||
| .to_low_u64_be(), | ||
| current_block, | ||
| highest_block, | ||
| }; | ||
| serde_json::to_value(syncing_status) | ||
| .map_err(|error| RpcErr::Internal(error.to_string())) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
humanize_secsreturns"0m"for sub-minute inputsIf
secsis 0–59,days,hours, andminsare all 0, so the function returns"0m". In the current call site this is unreachable (the guard isage > STALE_FORKCHOICE_HEAD_SECS= 1 800 s, somins ≥ 30), but if the function is ever called with a smaller value — e.g. for a future diagnostic or unit test — it will silently produce a misleading output instead of e.g."< 1m"or a seconds fallback.Prompt To Fix With AI