diff --git a/src/node/miner/bsc_miner.rs b/src/node/miner/bsc_miner.rs index a8eab2b3..182be084 100644 --- a/src/node/miner/bsc_miner.rs +++ b/src/node/miner/bsc_miner.rs @@ -41,8 +41,8 @@ use reth_provider::{ use reth_revm::cancelled::ManualCancel; use reth_tasks::TaskExecutor; use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::{Arc, Mutex}; -use std::time::Duration; +use std::sync::{Arc, Mutex, OnceLock}; +use std::time::{Duration, Instant}; use tokio::sync::mpsc; use tokio::task::JoinSet; use tokio_stream::StreamExt; @@ -109,6 +109,99 @@ fn is_network_ready_to_mine(tip_number: u64) -> bool { true } +/// Maximum blocks the local tip is allowed to lag behind the highest known peer +/// before mining is suppressed. Catches the case where the node has fallen behind +/// (sync recovery failed / bsc/2 disrupted / etc.) and would otherwise extend a +/// stale tip onto a divergent fork that peers do not accept. The threshold is +/// loose enough to tolerate the inevitable one-block staleness between an +/// in-flight `NewBlock` and our canonical-head update. +const MINING_LAG_THRESHOLD: u64 = 5; + +/// Maximum time the gate waits for any peer to publish a head before falling +/// through. Bounds the "all validators restarted simultaneously" deadlock where +/// every peer reports `best_number = None` until somebody produces a block. +const PEER_HEAD_WAIT_TIMEOUT_SECS: u64 = 5; + +/// Records the first instant the gate observed "no peer head yet". Used to +/// time-bound the deadlock-break window above. +static NO_PEER_HEAD_FIRST_HIT: OnceLock = OnceLock::new(); + +/// Verify the local tip is within `MINING_LAG_THRESHOLD` of the highest head +/// reported by any connected peer. Returning `false` suppresses mining and +/// prevents the validator from extending a stale tip into a divergent fork +/// while other recovery paths (`fork_recover`, staged-sync pipeline) catch up. +/// +/// The deadlock-break window kicks in only when **no** peer has published a +/// head at all — the typical all-validators-restart cold-start. As soon as one +/// peer publishes a head, the lag check is authoritative and the timer is +/// irrelevant; reaching the fallthrough later requires the network to genuinely +/// produce zero head announcements for `PEER_HEAD_WAIT_TIMEOUT_SECS`. +async fn is_caught_up_to_peers(local_tip: u64) -> bool { + use reth_network::Peers; + + let Some(network) = crate::shared::get_network_handle() else { + debug!( + target: "bsc::miner", + local_tip, + "Skip mining: network handle not yet available (post peer-ready gate)" + ); + return false; + }; + + let peers = match network.get_all_peers().await { + Ok(p) => p, + Err(e) => { + debug!( + target: "bsc::miner", + local_tip, + error = %e, + "Skip mining: get_all_peers failed" + ); + return false; + } + }; + + let max_peer_best: Option = peers.iter().filter_map(|p| p.best_number).max(); + + match max_peer_best { + Some(peer_best) => { + let lag = peer_best.saturating_sub(local_tip); + if lag > MINING_LAG_THRESHOLD { + debug!( + target: "bsc::miner", + local_tip, + peer_best, + lag, + threshold = MINING_LAG_THRESHOLD, + "Skip mining: local tip lags peers' best head beyond threshold" + ); + return false; + } + true + } + None => { + let first_hit = NO_PEER_HEAD_FIRST_HIT.get_or_init(Instant::now); + let elapsed = first_hit.elapsed(); + if elapsed < Duration::from_secs(PEER_HEAD_WAIT_TIMEOUT_SECS) { + debug!( + target: "bsc::miner", + local_tip, + elapsed_secs = elapsed.as_secs(), + "Skip mining: no peer head observed yet (cold-start grace window)" + ); + return false; + } + warn!( + target: "bsc::miner", + local_tip, + elapsed_secs = elapsed.as_secs(), + "Mining gate fallthrough: no peer head after grace window; permitting mining to break the all-validators-restart deadlock" + ); + true + } + } +} + impl NewWorkWorker where Provider: HeaderProvider
@@ -454,6 +547,10 @@ where return; } + if !is_caught_up_to_peers(tip.number()).await { + return; + } + let parent_header = match self.provider.sealed_header_by_hash(tip.hash()) { Ok(Some(header)) => { trace!( diff --git a/src/node/network/bsc_protocol/stream.rs b/src/node/network/bsc_protocol/stream.rs index 111f5621..035cc30b 100644 --- a/src/node/network/bsc_protocol/stream.rs +++ b/src/node/network/bsc_protocol/stream.rs @@ -14,8 +14,18 @@ use tokio::sync::{mpsc::UnboundedReceiver, oneshot}; use tokio::time::{Duration, Sleep}; use tokio_stream::wrappers::UnboundedReceiverStream; -/// Handshake timeout, mirroring the Go implementation. -const HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(5); +/// Maximum time a freshly-opened bsc/n sub-stream waits for the peer's +/// `Capability` packet before tearing itself down. The previous 5 s value +/// (chosen to mirror geth-bsc) was tight enough to fire on its own during the +/// cold-start race when multiple validators boot together: protocol +/// negotiation contends with eth/68 handshake, RLPx hello, discovery, and +/// snapshot/triedb init, and the peer's first bsc/n frame can land later than +/// 5 s after our own. A timeout here drops the sub-stream without taking down +/// the RLPx connection, leaving a stale sender in the protocol registry that +/// downstream `GetBlocksByRange` calls fail against. 30 s gives the cold-start +/// race plenty of slack; truly dead peers are still surfaced by RLPx-level +/// keepalive/disconnect signaling, so the looser cap costs almost nothing. +const HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(30); /// TTL for pending range requests before being pruned const PENDING_REQ_TTL: Duration = Duration::from_secs(15); /// Minimum interval between pending-request pruning passes