From 490b627074fbd60ac27ace62b38089ac61d4f764 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 13 Aug 2025 16:36:56 -0700 Subject: [PATCH 01/49] Penalize if invalid EL block --- .../lighthouse_network/src/service/mod.rs | 2 +- .../src/network_beacon_processor/rpc_methods.rs | 2 +- .../network_beacon_processor/sync_methods.rs | 17 +++++++++++++++-- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/beacon_node/lighthouse_network/src/service/mod.rs b/beacon_node/lighthouse_network/src/service/mod.rs index 54c01ae16a5..bf233a3ec31 100644 --- a/beacon_node/lighthouse_network/src/service/mod.rs +++ b/beacon_node/lighthouse_network/src/service/mod.rs @@ -1911,7 +1911,7 @@ impl Network { } }, }; - debug!(our_addr = %local_addr, from = %send_back_addr, error = error_repr, "Failed incoming connection"); + tracing::trace!(our_addr = %local_addr, from = %send_back_addr, error = error_repr, "Failed incoming connection"); None } SwarmEvent::OutgoingConnectionError { diff --git a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs index 117377c9245..e38fa6f842c 100644 --- a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs @@ -212,7 +212,7 @@ impl NetworkBeaconProcessor { send_block_count += 1; } Ok(None) => { - debug!( + tracing::trace!( %peer_id, request_root = ?root, "Peer requested unknown block" diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 306a184627e..9967f9c5e2b 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -10,7 +10,8 @@ use beacon_chain::data_availability_checker::AvailabilityCheckError; use beacon_chain::data_availability_checker::MaybeAvailableBlock; use beacon_chain::{ AvailabilityProcessingStatus, BeaconChainTypes, BlockError, ChainSegmentResult, - HistoricalBlockError, NotifyExecutionLayer, validator_monitor::get_slot_delay_ms, + ExecutionPayloadError, HistoricalBlockError, NotifyExecutionLayer, + validator_monitor::get_slot_delay_ms, }; use beacon_processor::{ AsyncFn, BlockingFn, DuplicateCache, @@ -774,7 +775,19 @@ impl NetworkBeaconProcessor { }) } ref err @ BlockError::ExecutionPayloadError(ref epe) => { - if !epe.penalize_peer() { + if matches!(epe, ExecutionPayloadError::RejectedByExecutionEngine { .. }) { + debug!( + error = ?err, + "Invalid execution payload rejected by EE" + ); + Err(ChainSegmentFailed { + message: format!( + "Peer sent a block containing invalid execution payload. Reason: {:?}", + err + ), + peer_action: Some(PeerAction::LowToleranceError), + }) + } else if !epe.penalize_peer() { // These errors indicate an issue with the EL and not the `ChainSegment`. 
// Pause the syncing while the EL recovers debug!( From 836f9c6c1979a9f5bb4f5efd7674e4e74a314094 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 13 Aug 2025 16:37:12 -0700 Subject: [PATCH 02/49] Priorotize status v2 --- .../src/peer_manager/mod.rs | 50 ++++++++++++++++++- .../lighthouse_network/src/rpc/protocol.rs | 2 +- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/beacon_node/lighthouse_network/src/peer_manager/mod.rs b/beacon_node/lighthouse_network/src/peer_manager/mod.rs index 13367a3e997..d0f3fd1c7c3 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/mod.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/mod.rs @@ -727,7 +727,7 @@ impl PeerManager { } } else { // we have no meta-data for this peer, update - debug!(%peer_id, new_seq_no = meta_data.seq_number(), "Obtained peer's metadata"); + debug!(%peer_id, new_seq_no = meta_data.seq_number(), cgc=?meta_data.custody_group_count().ok(), "Obtained peer's metadata"); } let known_custody_group_count = peer_info @@ -743,7 +743,7 @@ impl PeerManager { if let Some(custody_group_count) = custody_group_count_opt { match self.compute_peer_custody_groups(peer_id, custody_group_count) { Ok(custody_groups) => { - let custody_subnets = custody_groups + let custody_subnets: HashSet = custody_groups .into_iter() .flat_map(|custody_index| { self.subnets_by_custody_group @@ -759,6 +759,13 @@ impl PeerManager { }) }) .collect(); + let cgc = if custody_subnets.len() == 128 { + "supernode".to_string() + } else { + format!("{:?}", custody_subnets) + }; + + debug!(cgc, ?peer_id, "Peer custodied subnets"); peer_info.set_custody_subnets(custody_subnets); updated_cgc = Some(custody_group_count) != known_custody_group_count; @@ -947,6 +954,42 @@ impl PeerManager { } } + /// Run discovery query for additional custody peers if we fall below `TARGET_PEERS`. + fn maintain_custody_peers(&mut self) { + let subnets_to_discover: Vec = self + .network_globals + .sampling_subnets() + .iter() + .filter_map(|custody_subnet| { + if self + .network_globals + .peers + .read() + .good_range_sync_custody_subnet_peers(*custody_subnet) + .count() + < 2 + { + Some(SubnetDiscovery { + subnet: Subnet::DataColumn(*custody_subnet), + min_ttl: None, + }) + } else { + None + } + }) + .collect(); + + // request the subnet query from discovery + if !subnets_to_discover.is_empty() { + debug!( + subnets = ?subnets_to_discover.iter().map(|s| s.subnet).collect::>(), + "Making subnet queries for maintaining custody peers" + ); + self.events + .push(PeerManagerEvent::DiscoverSubnetPeers(subnets_to_discover)); + } + } + fn maintain_trusted_peers(&mut self) { let trusted_peers = self.trusted_peers.clone(); for trusted_peer in trusted_peers { @@ -1269,6 +1312,9 @@ impl PeerManager { // Update peer score metrics; self.update_peer_score_metrics(); + // Maintain minimum count for custody peers. + self.maintain_custody_peers(); + // Maintain minimum count for sync committee peers. 
self.maintain_sync_committee_peers(); diff --git a/beacon_node/lighthouse_network/src/rpc/protocol.rs b/beacon_node/lighthouse_network/src/rpc/protocol.rs index 388dbe63ef0..52dae3af351 100644 --- a/beacon_node/lighthouse_network/src/rpc/protocol.rs +++ b/beacon_node/lighthouse_network/src/rpc/protocol.rs @@ -825,8 +825,8 @@ impl RequestType { match self { // add more protocols when versions/encodings are supported RequestType::Status(_) => vec![ - ProtocolId::new(SupportedProtocol::StatusV1, Encoding::SSZSnappy), ProtocolId::new(SupportedProtocol::StatusV2, Encoding::SSZSnappy), + ProtocolId::new(SupportedProtocol::StatusV1, Encoding::SSZSnappy), ], RequestType::Goodbye(_) => vec![ProtocolId::new( SupportedProtocol::GoodbyeV1, From 156449ca265a66b44d65f1862e53de4ed228e2c9 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Thu, 14 Aug 2025 09:10:28 -0700 Subject: [PATCH 03/49] Increase columns_by_root quota --- beacon_node/lighthouse_network/src/rpc/config.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/beacon_node/lighthouse_network/src/rpc/config.rs b/beacon_node/lighthouse_network/src/rpc/config.rs index ef9347a1197..cdfb06284e3 100644 --- a/beacon_node/lighthouse_network/src/rpc/config.rs +++ b/beacon_node/lighthouse_network/src/rpc/config.rs @@ -120,10 +120,10 @@ impl RateLimiterConfig { // Range sync load balances when requesting blocks, and each batch is 32 blocks. pub const DEFAULT_DATA_COLUMNS_BY_RANGE_QUOTA: Quota = Quota::n_every(NonZeroU64::new(5120).unwrap(), 10); - // 512 columns per request from spec. This should be plenty as peers are unlikely to send all - // sampling requests to a single peer. + // 128 columns * 32 blocks in an epoch. Many clients try to request an entire batch by + // root instead of by range, so we should account for honest behaviour. pub const DEFAULT_DATA_COLUMNS_BY_ROOT_QUOTA: Quota = - Quota::n_every(NonZeroU64::new(512).unwrap(), 10); + Quota::n_every(NonZeroU64::new(4096).unwrap(), 10); pub const DEFAULT_LIGHT_CLIENT_BOOTSTRAP_QUOTA: Quota = Quota::one_every(10); pub const DEFAULT_LIGHT_CLIENT_OPTIMISTIC_UPDATE_QUOTA: Quota = Quota::one_every(10); pub const DEFAULT_LIGHT_CLIENT_FINALITY_UPDATE_QUOTA: Quota = Quota::one_every(10); From 6bd8944cef7dfeb6850742a85c8de5c82d659e27 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Fri, 15 Aug 2025 14:23:48 -0700 Subject: [PATCH 04/49] Reduce backfill buffer size --- beacon_node/network/src/sync/backfill_sync/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index ae9ac2e7705..dc70b08aec5 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -40,7 +40,7 @@ use types::{ColumnIndex, Epoch, EthSpec}; pub const BACKFILL_EPOCHS_PER_BATCH: u64 = 1; /// The maximum number of batches to queue before requesting more. -const BACKFILL_BATCH_BUFFER_SIZE: u8 = 20; +const BACKFILL_BATCH_BUFFER_SIZE: u8 = 3; /// The number of times to retry a batch before it is considered failed. 
const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 10; From 9455153542874d18ed1011a8f8b9037777ba79de Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Mon, 18 Aug 2025 14:48:44 -0700 Subject: [PATCH 05/49] Without retries --- .../src/peer_manager/peerdb.rs | 28 ++ .../src/service/api_types.rs | 15 ++ .../src/sync/block_sidecar_coupling.rs | 142 ++++++++++ beacon_node/network/src/sync/manager.rs | 25 +- .../network/src/sync/network_context.rs | 250 +++++++++++++++++- .../src/sync/network_context/requests.rs | 3 +- .../requests/data_columns_by_root.rs | 95 ++++++- .../network/src/sync/range_sync/chain.rs | 2 +- 8 files changed, 545 insertions(+), 15 deletions(-) diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs index 430ad2f6dae..1fa11357dc0 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs @@ -323,6 +323,34 @@ impl PeerDB { .map(|(peer_id, _)| peer_id) } + pub fn good_custody_subnet_peer_range_sync( + &self, + subnet: DataColumnSubnetId, + epoch: Epoch, + ) -> impl Iterator { + self.peers + .iter() + .filter(move |(_, info)| { + // The custody_subnets hashset can be populated via enr or metadata + let is_custody_subnet_peer = info.is_assigned_to_custody_subnet(&subnet); + + info.is_connected() + && is_custody_subnet_peer + && match info.sync_status() { + SyncStatus::Synced { info } => { + info.has_slot(epoch.end_slot(E::slots_per_epoch())) + } + SyncStatus::Advanced { info } => { + info.has_slot(epoch.end_slot(E::slots_per_epoch())) + } + SyncStatus::IrrelevantPeer + | SyncStatus::Behind { .. } + | SyncStatus::Unknown => false, + } + }) + .map(|(peer_id, _)| peer_id) + } + /// Returns an iterator of all peers that are supposed to be custodying /// the given subnet id. 
pub fn good_range_sync_custody_subnet_peers( diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index 0f5fd99c279..41119029904 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -38,6 +38,7 @@ pub enum SyncRequestId { pub struct DataColumnsByRootRequestId { pub id: Id, pub requester: DataColumnsByRootRequester, + pub peer: PeerId, } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] @@ -48,6 +49,18 @@ pub struct BlocksByRangeRequestId { pub parent_request_id: ComponentsByRangeRequestId, } +impl BlocksByRangeRequestId { + pub fn batch_id(&self) -> Epoch { + match self.parent_request_id.requester { + RangeRequestId::BackfillSync { batch_id } => batch_id, + RangeRequestId::RangeSync { + chain_id: _, + batch_id, + } => batch_id, + } + } +} + #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct BlobsByRangeRequestId { /// Id to identify this attempt at a blobs_by_range request for `parent_request_id` @@ -92,6 +105,7 @@ pub enum RangeRequestId { #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub enum DataColumnsByRootRequester { Custody(CustodyId), + RangeSync { parent: ComponentsByRangeRequestId }, } #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] @@ -222,6 +236,7 @@ impl Display for DataColumnsByRootRequester { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { Self::Custody(id) => write!(f, "Custody/{id}"), + Self::RangeSync { parent } => write!(f, "Range/{parent}"), } } } diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index 605da3b4bda..b807c2a0b21 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -5,6 +5,7 @@ use lighthouse_network::{ PeerAction, PeerId, service::api_types::{ BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId, + DataColumnsByRootRequestId, }, }; use std::{collections::HashMap, sync::Arc}; @@ -51,6 +52,17 @@ enum RangeBlockDataRequest { expected_custody_columns: Vec, attempt: usize, }, + DataColumnsFromRoot { + requests: HashMap< + DataColumnsByRootRequestId, + ByRangeRequest>, + >, + init: bool, + /// The column indices corresponding to the request + column_peers: HashMap>, + expected_custody_columns: Vec, + attempt: usize, + }, } #[derive(Debug)] @@ -81,6 +93,7 @@ impl RangeBlockComponentsRequest { Vec<(DataColumnsByRangeRequestId, Vec)>, Vec, )>, + data_columns_from_root: bool, ) -> Self { let block_data_request = if let Some(blobs_req_id) = blobs_req_id { RangeBlockDataRequest::Blobs(ByRangeRequest::Active(blobs_req_id)) @@ -95,6 +108,14 @@ impl RangeBlockComponentsRequest { expected_custody_columns, attempt: 0, } + } else if data_columns_from_root { + RangeBlockDataRequest::DataColumnsFromRoot { + requests: HashMap::new(), + init: false, + attempt: 0, + column_peers: HashMap::new(), + expected_custody_columns: Vec::new(), + } } else { RangeBlockDataRequest::NoData }; @@ -128,6 +149,35 @@ impl RangeBlockComponentsRequest { } } + /// `column_requests`: each element represents a request id and the columns requested under that request. 
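+    /// Returns an error if this coupling request was not initialized to expect data columns by root.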
+ pub fn insert_column_request_after_block_request( + &mut self, + column_requests: Vec<(DataColumnsByRootRequestId, Vec)>, + custody_columns: &[ColumnIndex], + ) -> Result<(), String> { + match &mut self.block_data_request { + RangeBlockDataRequest::DataColumnsFromRoot { + init, + requests, + attempt: _, + column_peers, + expected_custody_columns, + } => { + *init = true; + for (request, peers) in column_requests { + requests.insert(request, ByRangeRequest::Active(request)); + column_peers.insert(request, peers); + } + for column in custody_columns { + expected_custody_columns.push(*column); + } + + Ok(()) + } + _ => Err("Invalid initialization".to_string()), + } + } + /// Adds received blocks to the request. /// /// Returns an error if the request ID doesn't match the expected blocks request. @@ -150,6 +200,9 @@ impl RangeBlockComponentsRequest { ) -> Result<(), String> { match &mut self.block_data_request { RangeBlockDataRequest::NoData => Err("received blobs but expected no data".to_owned()), + RangeBlockDataRequest::DataColumnsFromRoot { .. } => { + Err("received blobs but expected no data columns by root".to_owned()) + } RangeBlockDataRequest::Blobs(req) => req.finish(req_id, blobs), RangeBlockDataRequest::DataColumns { .. } => { Err("received blobs but expected data columns".to_owned()) @@ -173,6 +226,9 @@ impl RangeBlockComponentsRequest { RangeBlockDataRequest::Blobs(_) => { Err("received data columns but expected blobs".to_owned()) } + RangeBlockDataRequest::DataColumnsFromRoot { .. } => { + Err("received data columns by root but expected range".to_owned()) + } RangeBlockDataRequest::DataColumns { requests, .. } => { let req = requests .get_mut(&req_id) @@ -182,6 +238,34 @@ impl RangeBlockComponentsRequest { } } + /// Adds received custody columns to the request. + /// + /// Returns an error if this request expects blobs instead of data columns, + /// or if the request ID is unknown. + pub fn add_custody_columns_by_root( + &mut self, + req_id: DataColumnsByRootRequestId, + columns: Vec>>, + ) -> Result<(), String> { + match &mut self.block_data_request { + RangeBlockDataRequest::NoData => { + Err("received data columns but expected no data".to_owned()) + } + RangeBlockDataRequest::Blobs(_) => { + Err("received data columns but expected blobs".to_owned()) + } + RangeBlockDataRequest::DataColumns { .. } => { + Err("received data columns by range but expected root".to_owned()) + } + RangeBlockDataRequest::DataColumnsFromRoot { requests, .. } => { + let req = requests + .get_mut(&req_id) + .ok_or(format!("unknown data columns by range req_id {req_id}"))?; + req.finish(req_id, columns) + } + } + } + /// Attempts to construct RPC blocks from all received components. /// /// Returns `None` if not all expected requests have completed. @@ -210,6 +294,64 @@ impl RangeBlockComponentsRequest { spec, )) } + RangeBlockDataRequest::DataColumnsFromRoot { + init, + attempt, + column_peers, + expected_custody_columns, + requests, + } => { + if !*init { + return None; + } + + let mut data_columns = vec![]; + let mut column_to_peer_id: HashMap = HashMap::new(); + for req in requests.values() { + let Some(data) = req.to_finished() else { + return None; + }; + data_columns.extend(data.clone()) + } + + // An "attempt" is complete here after we have received a response for all the + // requests we made. i.e. `req.to_finished()` returns Some for all requests. + *attempt += 1; + + // Note: this assumes that only 1 peer is responsible for a column + // with a batch. 
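+                // If two requests ever covered the same column, the later `insert` below would simply overwrite the earlier peer.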
+ for (id, columns) in column_peers { + for column in columns { + column_to_peer_id.insert(*column, id.peer); + } + } + + let resp = Self::responses_with_custody_columns( + blocks.to_vec(), + data_columns, + column_to_peer_id, + expected_custody_columns, + *attempt, + spec, + ); + + if let Err(CouplingError::DataColumnPeerFailure { + error: _, + faulty_peers, + action: _, + exceeded_retries: _, + }) = &resp + { + for (_, peer) in faulty_peers.iter() { + // find the req id associated with the peer and + // delete it from the entries as we are going to make + // a separate attempt for those components. + requests.retain(|&k, _| k.peer != *peer); + } + } + + Some(resp) + } RangeBlockDataRequest::DataColumns { requests, expected_custody_columns, diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 448e784ab6d..cc1a6a51d89 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -1116,12 +1116,12 @@ impl SyncManager { peer_id: PeerId, data_column: RpcEvent>>, ) { - if let Some(resp) = - self.network - .on_data_columns_by_root_response(req_id, peer_id, data_column) - { - match req_id.requester { - DataColumnsByRootRequester::Custody(custody_id) => { + match req_id.requester { + DataColumnsByRootRequester::Custody(custody_id) => { + if let Some(resp) = + self.network + .on_data_columns_by_root_response(req_id, peer_id, data_column) + { if let Some(result) = self .network .on_custody_by_root_response(custody_id, req_id, peer_id, resp) @@ -1130,6 +1130,19 @@ impl SyncManager { } } } + DataColumnsByRootRequester::RangeSync { parent } => { + if let Some(resp) = self.network.on_data_columns_by_root_range_response( + req_id, + peer_id, + data_column, + ) { + self.on_range_components_response( + parent, + peer_id, + RangeBlockComponent::CustodyColumnsFromRoot(req_id, resp), + ); + } + } } } diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 76e5ed3f5d9..0e9ce0676f2 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -15,12 +15,16 @@ use crate::service::NetworkMessage; use crate::status::ToStatusMessage; use crate::sync::block_lookups::SingleLookupId; use crate::sync::block_sidecar_coupling::CouplingError; -use crate::sync::network_context::requests::BlobsByRootSingleBlockRequest; +use crate::sync::network_context::requests::{ + BlobsByRootSingleBlockRequest, DataColumnsByRootRangeRequestItems, +}; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; use custody::CustodyRequestResult; use fnv::FnvHashMap; -use lighthouse_network::rpc::methods::{BlobsByRangeRequest, DataColumnsByRangeRequest}; +use lighthouse_network::rpc::methods::{ + BlobsByRangeRequest, DataColumnsByRangeRequest, DataColumnsByRootRequest, +}; use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError, RequestType}; pub use lighthouse_network::service::api_types::RangeRequestId; use lighthouse_network::service::api_types::{ @@ -33,7 +37,8 @@ use parking_lot::RwLock; pub use requests::LookupVerifyError; use requests::{ ActiveRequests, BlobsByRangeRequestItems, BlobsByRootRequestItems, BlocksByRangeRequestItems, - BlocksByRootRequestItems, DataColumnsByRangeRequestItems, DataColumnsByRootRequestItems, + BlocksByRootRequestItems, DataColumnsByRangeRequestItems, DataColumnsByRootBatchBlockRequest, + 
DataColumnsByRootRequestItems, }; #[cfg(test)] use slot_clock::SlotClock; @@ -48,8 +53,8 @@ use tokio::sync::mpsc; use tracing::{debug, error, warn}; use types::blob_sidecar::FixedBlobSidecarList; use types::{ - BlobSidecar, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, ForkContext, - Hash256, SignedBeaconBlock, Slot, + BlobSidecar, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, DataColumnSubnetId, + EthSpec, ForkContext, Hash256, SignedBeaconBlock, Slot, }; pub mod custody; @@ -198,7 +203,8 @@ pub struct SyncNetworkContext { /// A mapping of active DataColumnsByRange requests data_columns_by_range_requests: ActiveRequests>, - + data_columns_by_root_range_requests: + ActiveRequests>, /// Mapping of active custody column requests for a block root custody_by_root_requests: FnvHashMap>, @@ -232,6 +238,10 @@ pub enum RangeBlockComponent { DataColumnsByRangeRequestId, RpcResponseResult>>>, ), + CustodyColumnsFromRoot( + DataColumnsByRootRequestId, + RpcResponseResult>>>, + ), } #[cfg(test)] @@ -277,6 +287,7 @@ impl SyncNetworkContext { blocks_by_root_requests: ActiveRequests::new("blocks_by_root"), blobs_by_root_requests: ActiveRequests::new("blobs_by_root"), data_columns_by_root_requests: ActiveRequests::new("data_columns_by_root"), + data_columns_by_root_range_requests: ActiveRequests::new("data_columns_by_root_range"), blocks_by_range_requests: ActiveRequests::new("blocks_by_range"), blobs_by_range_requests: ActiveRequests::new("blobs_by_range"), data_columns_by_range_requests: ActiveRequests::new("data_columns_by_range"), @@ -307,6 +318,7 @@ impl SyncNetworkContext { blocks_by_range_requests, blobs_by_range_requests, data_columns_by_range_requests, + data_columns_by_root_range_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, // components_by_range_requests is a meta request of various _by_range requests @@ -342,12 +354,18 @@ impl SyncNetworkContext { .into_iter() .map(|req_id| SyncRequestId::DataColumnsByRange(*req_id)); + let data_column_by_root_range_ids = data_columns_by_root_range_requests + .active_requests_of_peer(peer_id) + .into_iter() + .map(|req_id| SyncRequestId::DataColumnsByRoot(*req_id)); + blocks_by_root_ids .chain(blobs_by_root_ids) .chain(data_column_by_root_ids) .chain(blocks_by_range_ids) .chain(blobs_by_range_ids) .chain(data_column_by_range_ids) + .chain(data_column_by_root_range_ids) .collect() } @@ -404,6 +422,7 @@ impl SyncNetworkContext { blocks_by_range_requests, blobs_by_range_requests, data_columns_by_range_requests, + data_columns_by_root_range_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, // components_by_range_requests is a meta request of various _by_range requests @@ -425,6 +444,7 @@ impl SyncNetworkContext { .chain(blocks_by_range_requests.iter_request_peers()) .chain(blobs_by_range_requests.iter_request_peers()) .chain(data_columns_by_range_requests.iter_request_peers()) + .chain(data_columns_by_root_range_requests.iter_request_peers()) { *active_request_count_by_peer.entry(peer_id).or_default() += 1; } @@ -605,6 +625,73 @@ impl SyncNetworkContext { self.chain.sampling_columns_for_epoch(epoch).to_vec(), ) }), + false, + ); + self.components_by_range_requests.insert(id, info); + + Ok(id.id) + } + + /// A blocks by range request sent by the range sync algorithm + pub fn block_components_by_range_request_without_components( + &mut self, + batch_type: ByRangeRequestType, + request: 
BlocksByRangeRequest, + requester: RangeRequestId, + peers: &HashSet, + peers_to_deprioritize: &HashSet, + ) -> Result { + let active_request_count_by_peer = self.active_request_count_by_peer(); + + let Some(block_peer) = peers + .iter() + .map(|peer| { + ( + // If contains -> 1 (order after), not contains -> 0 (order first) + peers_to_deprioritize.contains(peer), + // Prefer peers with less overall requests + active_request_count_by_peer.get(peer).copied().unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, _, peer)| *peer) + else { + // Backfill and forward sync handle this condition gracefully. + // - Backfill sync: will pause waiting for more peers to join + // - Forward sync: can never happen as the chain is dropped when removing the last peer. + return Err(RpcRequestSendError::NoPeer(NoPeerError::BlockPeer)); + }; + + // Create the overall components_by_range request ID before its individual components + let id = ComponentsByRangeRequestId { + id: self.next_id(), + requester, + }; + + let blocks_req_id = self.send_blocks_by_range_request(block_peer, request.clone(), id)?; + + let blobs_req_id = if matches!(batch_type, ByRangeRequestType::BlocksAndBlobs) { + Some(self.send_blobs_by_range_request( + block_peer, + BlobsByRangeRequest { + start_slot: *request.start_slot(), + count: *request.count(), + }, + id, + )?) + } else { + None + }; + + let data_columns_by_root = matches!(batch_type, ByRangeRequestType::BlocksAndColumns); + let info = RangeBlockComponentsRequest::new( + blocks_req_id, + blobs_req_id, + None, + data_columns_by_root, ); self.components_by_range_requests.insert(id, info); @@ -706,6 +793,17 @@ impl SyncNetworkContext { }) }) } + RangeBlockComponent::CustodyColumnsFromRoot(req_id, resp) => { + resp.and_then(|(custody_columns, _)| { + request + .add_custody_columns_by_root(req_id, custody_columns) + .map_err(|e| { + RpcResponseError::BlockComponentCouplingError( + CouplingError::InternalError(e), + ) + }) + }) + } } } { entry.remove(); @@ -943,6 +1041,7 @@ impl SyncNetworkContext { let id = DataColumnsByRootRequestId { id: self.next_id(), requester, + peer: peer_id, }; self.send_network_msg(NetworkMessage::SendRequest { @@ -1180,6 +1279,48 @@ impl SyncNetworkContext { Ok((id, requested_columns)) } + fn send_data_columns_by_root_range_requests( + &mut self, + peer_id: PeerId, + request: DataColumnsByRootBatchBlockRequest, + requester: DataColumnsByRootRequester, + ) -> Result { + let id = DataColumnsByRootRequestId { + id: self.next_id(), + requester, + peer: peer_id, + }; + + self.send_network_msg(NetworkMessage::SendRequest { + peer_id, + request: RequestType::DataColumnsByRoot( + request + .clone() + .try_into_request(self.fork_context.current_fork_name(), &self.chain.spec) + .expect("should work"), + ), + app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRoot(id)), + }) + .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; + + debug!( + method = "DataColumnsByRoot", + ?request, + peer = %peer_id, + %id, + "Sync RPC request sent" + ); + + self.data_columns_by_root_range_requests.insert( + id, + peer_id, + // true = we are only requesting if we know there are blobs. 
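+            // (block roots are only collected for blocks whose bodies carry blob KZG commitments)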
+ true, + DataColumnsByRootRangeRequestItems::new(request), + ); + Ok(id) + } + pub fn is_execution_engine_online(&self) -> bool { self.execution_engine_state == EngineState::Online } @@ -1369,6 +1510,19 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "DataColumnsByRoot", resp, peer_id, |_| 1) } + #[allow(clippy::type_complexity)] + pub(crate) fn on_data_columns_by_root_range_response( + &mut self, + id: DataColumnsByRootRequestId, + peer_id: PeerId, + rpc_event: RpcEvent>>, + ) -> Option>>>> { + let resp = self + .data_columns_by_root_range_requests + .on_response(id, rpc_event); + self.on_rpc_response_result(id, "DataColumnsByRootRange", resp, peer_id, |_| 1) + } + #[allow(clippy::type_complexity)] pub(crate) fn on_blocks_by_range_response( &mut self, @@ -1377,6 +1531,89 @@ impl SyncNetworkContext { rpc_event: RpcEvent>>, ) -> Option>>>> { let resp = self.blocks_by_range_requests.on_response(id, rpc_event); + match &resp { + // todo(pawan): send the data column request as soon as you get each chunk to spread out requests + Some(Ok((blocks, _))) => { + // We have blocks here, check if they need data columns and request them + let mut block_roots = Vec::new(); + let batch_epoch = id.batch_id(); + if !self.chain.spec.is_peer_das_enabled_for_epoch(batch_epoch) { + return self + .on_rpc_response_result(id, "BlocksByRange", resp, peer_id, |b| b.len()); + } + for block in blocks.iter() { + // Request columns only if the blob_kzg_commitments is non-empty + if let Ok(commitments) = block.message().body().blob_kzg_commitments() { + if !commitments.is_empty() { + block_roots.push(block.canonical_root()); + } + } + } + // Generate the data column by root requests + let mut peer_to_columns: HashMap> = HashMap::new(); + let mut no_peers_for_column: Vec = Vec::new(); + for column in self.chain.sampling_columns_for_epoch(batch_epoch).iter() { + let data_column = DataColumnSubnetId::new(*column); + if let Some(custody_peer) = self + .network_globals() + .peers + .read() + .good_custody_subnet_peer_range_sync(data_column, batch_epoch) + .next() + { + peer_to_columns + .entry(*custody_peer) + .or_default() + .push(*column); + } else { + debug!( + ?data_column, + ?id, + "Not enough column peers for batch, need to retry" + ); + no_peers_for_column.push(*column); + continue; + } + } + + // todo(pawan): no_peers_for_column nned to be requested once peers + // become available + let mut data_column_requests = Vec::new(); + for (peer, indices) in peer_to_columns.into_iter() { + let data_columns_by_root_request = DataColumnsByRootBatchBlockRequest { + block_roots: block_roots.clone(), + indices: indices.clone(), + }; + + let requester = DataColumnsByRootRequester::RangeSync { + parent: id.parent_request_id, + }; + + data_column_requests.push(( + self.send_data_columns_by_root_range_requests( + peer, + data_columns_by_root_request, + requester, + ) + .expect("should be able to send request"), + indices, + )); + } + + if let Some(req) = self + .components_by_range_requests + .get_mut(&id.parent_request_id) + { + req.insert_column_request_after_block_request( + data_column_requests, + self.chain.sampling_columns_for_epoch(batch_epoch), + ) + .expect("should be in the right state"); + } + } + None => {} + Some(Err(_)) => {} + } self.on_rpc_response_result(id, "BlocksByRange", resp, peer_id, |b| b.len()) } @@ -1431,6 +1668,7 @@ impl SyncNetworkContext { ); } } + if let Some(Err(RpcResponseError::VerifyError(e))) = &resp { self.report_peer(peer_id, PeerAction::LowToleranceError, e.into()); } diff --git 
a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index f42595fb690..cea636157c6 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -12,7 +12,8 @@ pub use blocks_by_range::BlocksByRangeRequestItems; pub use blocks_by_root::{BlocksByRootRequestItems, BlocksByRootSingleRequest}; pub use data_columns_by_range::DataColumnsByRangeRequestItems; pub use data_columns_by_root::{ - DataColumnsByRootRequestItems, DataColumnsByRootSingleBlockRequest, + DataColumnsByRootBatchBlockRequest, DataColumnsByRootRequestItems, + DataColumnsByRootSingleBlockRequest, DataColumnsByRootRangeRequestItems }; use crate::metrics; diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs index 09d7f4b3b77..fc5d4fa5e9f 100644 --- a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs @@ -1,5 +1,5 @@ use lighthouse_network::rpc::methods::DataColumnsByRootRequest; -use std::sync::Arc; +use std::{collections::HashMap, sync::Arc}; use types::{ ChainSpec, DataColumnSidecar, DataColumnsByRootIdentifier, EthSpec, ForkName, Hash256, RuntimeVariableList, @@ -7,6 +7,38 @@ use types::{ use super::{ActiveRequestItems, LookupVerifyError}; +#[derive(Debug, Clone)] +pub struct DataColumnsByRootBatchBlockRequest { + pub block_roots: Vec, + pub indices: Vec, +} + +impl DataColumnsByRootBatchBlockRequest { + pub fn try_into_request( + self, + fork_name: ForkName, + spec: &ChainSpec, + ) -> Result { + let number_of_columns = spec.number_of_columns as usize; + let columns = RuntimeVariableList::new(self.indices, number_of_columns) + .map_err(|_| "Number of indices exceeds total number of columns")?; + let ids: Vec<_> = self + .block_roots + .into_iter() + .map(|block_root| DataColumnsByRootIdentifier { + block_root, + columns: columns.clone(), + }) + .collect(); + tracing::debug!(?ids, "Length ids"); + assert!(ids.len() <= 32); + Ok(DataColumnsByRootRequest::new( + ids, + spec.max_request_blocks(fork_name), + )) + } +} + #[derive(Debug, Clone)] pub struct DataColumnsByRootSingleBlockRequest { pub block_root: Hash256, @@ -79,3 +111,64 @@ impl ActiveRequestItems for DataColumnsByRootRequestItems { std::mem::take(&mut self.items) } } + +pub struct DataColumnsByRootRangeRequestItems { + request: DataColumnsByRootBatchBlockRequest, + items: HashMap>>>, +} + +impl DataColumnsByRootRangeRequestItems { + pub fn new(request: DataColumnsByRootBatchBlockRequest) -> Self { + Self { + request, + items: HashMap::new(), + } + } +} + +impl ActiveRequestItems for DataColumnsByRootRangeRequestItems { + type Item = Arc>; + + /// Appends a chunk to this multi-item request. If all expected chunks are received, this + /// method returns `Some`, resolving the request before the stream terminator. 
+ /// The active request SHOULD be dropped after `add_response` returns an error + fn add(&mut self, data_column: Self::Item) -> Result { + let block_root = data_column.block_root(); + if !self.request.block_roots.contains(&block_root) { + return Err(LookupVerifyError::UnrequestedBlockRoot(block_root)); + } + if !data_column.verify_inclusion_proof() { + return Err(LookupVerifyError::InvalidInclusionProof); + } + if !self.request.indices.contains(&data_column.index) { + return Err(LookupVerifyError::UnrequestedIndex(data_column.index)); + } + if self + .items + .values() + .flatten() + .any(|d| d.index == data_column.index) + { + return Err(LookupVerifyError::DuplicatedData( + data_column.slot(), + data_column.index, + )); + } + + self.items.entry(block_root).or_default().push(data_column); + + Ok(self + .items + .values() + .map(|columns| columns.len()) + .sum::() + >= self.request.indices.len() * self.request.block_roots.len()) + } + + fn consume(&mut self) -> Vec { + std::mem::take(&mut self.items) + .into_values() + .flatten() + .collect() + } +} diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index cdbb9f25883..0656ecf9cc9 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -924,7 +924,7 @@ impl SyncingChain { .cloned() .collect::>(); - match network.block_components_by_range_request( + match network.block_components_by_range_request_without_components( batch_type, request, RangeRequestId::RangeSync { From 5337e4602e0bb46243e8c94ffa72f2babf4f9e4d Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Mon, 18 Aug 2025 22:59:38 -0700 Subject: [PATCH 06/49] Add a function to retry column requests that could not be made --- .../src/service/api_types.rs | 11 ++ beacon_node/network/src/sync/manager.rs | 3 + .../network/src/sync/network_context.rs | 103 +++++++++++++++++- .../requests/data_columns_by_root.rs | 8 +- 4 files changed, 119 insertions(+), 6 deletions(-) diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index 41119029904..77d29133709 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -99,6 +99,17 @@ pub enum RangeRequestId { RangeSync { chain_id: Id, batch_id: Epoch }, BackfillSync { batch_id: Epoch }, } +impl RangeRequestId { + pub fn batch_id(&self) -> Epoch { + match &self { + RangeRequestId::BackfillSync { batch_id } => *batch_id, + RangeRequestId::RangeSync { + chain_id: _, + batch_id, + } => *batch_id, + } + } +} // TODO(das) refactor in a separate PR. We might be able to remove this and replace // [`DataColumnsByRootRequestId`] with a [`SingleLookupReqId`]. diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index cc1a6a51d89..9c5e116efeb 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -403,6 +403,9 @@ impl SyncManager { for (id, result) in self.network.continue_custody_by_root_requests() { self.on_custody_by_root_result(id, result); } + + // Try to make range requests that we failed to make because of lack of peers. + self.network.retry_pending_requests(); } /// Trigger range sync for a set of peers that claim to have imported a head unknown to us. 
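// Editorial sketch, not part of the patch: simplified, hypothetical types illustrating the
// per-batch flow added in the previous commit and wired up to the retry driver here. After a
// BlocksByRange response, the roots of blocks that carry blob KZG commitments are grouped by
// one good custody peer per sampling column; one batched DataColumnsByRoot request goes out
// per peer, and columns with no available peer are stashed in `requests_to_retry` for the
// periodic retry.
use std::collections::HashMap;

type PeerId = u64;
type ColumnIndex = u64;

/// Assumed stand-in for "find a good custody-subnet peer for this column, if any".
fn custody_peer_for(column: ColumnIndex) -> Option<PeerId> {
    (column % 2 == 0).then_some(100 + column)
}

/// Group sampling columns by the peer that will serve them; columns without a peer are
/// returned separately so the caller can retry them later.
fn group_columns_by_peer(
    sampling_columns: &[ColumnIndex],
) -> (HashMap<PeerId, Vec<ColumnIndex>>, Vec<ColumnIndex>) {
    let mut peer_to_columns: HashMap<PeerId, Vec<ColumnIndex>> = HashMap::new();
    let mut no_peer = Vec::new();
    for &column in sampling_columns {
        match custody_peer_for(column) {
            Some(peer) => peer_to_columns.entry(peer).or_default().push(column),
            None => no_peer.push(column),
        }
    }
    (peer_to_columns, no_peer)
}

fn main() {
    let (by_peer, missing) = group_columns_by_peer(&[0, 1, 2, 3]);
    assert_eq!(by_peer.len(), 2); // columns 0 and 2 each found a custody peer
    assert_eq!(missing, vec![1, 3]); // columns 1 and 3 must be retried once peers appear
}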
diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 0e9ce0676f2..750bd8c4adb 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -22,9 +22,7 @@ use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; use custody::CustodyRequestResult; use fnv::FnvHashMap; -use lighthouse_network::rpc::methods::{ - BlobsByRangeRequest, DataColumnsByRangeRequest, DataColumnsByRootRequest, -}; +use lighthouse_network::rpc::methods::{BlobsByRangeRequest, DataColumnsByRangeRequest}; use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError, RequestType}; pub use lighthouse_network::service::api_types::RangeRequestId; use lighthouse_network::service::api_types::{ @@ -212,6 +210,14 @@ pub struct SyncNetworkContext { components_by_range_requests: FnvHashMap>, + // todo(pawan): make this a bounded queue, make the types better, add better docs + // A hashmap with the key being the parent request and the value being the data column by root + // requests that we have to retry because of one of the following reasons: + // 1. The root requests couldn't be made after the parent blocks request because there were no + // column peers available + // 2. The root request errored (either peer sent an RPC error or an empty response) + requests_to_retry: HashMap, + /// Whether the ee is online. If it's not, we don't allow access to the /// `beacon_processor_send`. execution_engine_state: EngineState, @@ -293,6 +299,7 @@ impl SyncNetworkContext { data_columns_by_range_requests: ActiveRequests::new("data_columns_by_range"), custody_by_root_requests: <_>::default(), components_by_range_requests: FnvHashMap::default(), + requests_to_retry: Default::default(), network_beacon_processor, chain, fork_context, @@ -323,6 +330,7 @@ impl SyncNetworkContext { custody_by_root_requests: _, // components_by_range_requests is a meta request of various _by_range requests components_by_range_requests: _, + requests_to_retry: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -429,6 +437,7 @@ impl SyncNetworkContext { components_by_range_requests: _, execution_engine_state: _, network_beacon_processor: _, + requests_to_retry: _, chain: _, fork_context: _, // Don't use a fallback match. We want to be sure that all requests are considered when @@ -522,6 +531,84 @@ impl SyncNetworkContext { Ok(()) } + /// Try to make all the requests that we failed to make earlier because of lack of peers + /// in the required subnets. + /// + /// This function must be manually invoked at regular intervals. 
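+    /// (In this series it is driven from the sync manager: see the `retry_pending_requests` call added
+    /// to `manager.rs` in this patch, right after custody-by-root requests are continued.)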
+ pub fn retry_pending_requests(&mut self) -> Result<(), String> { + let active_requests = self.active_request_count_by_peer(); + + // Collect entries to process and remove from requests_to_retry + let entries_to_process: Vec<_> = self.requests_to_retry.drain().collect(); + let mut entries_to_keep = Vec::new(); + + for (parent_request, requests) in entries_to_process { + let mut data_column_requests = Vec::new(); + let requester = DataColumnsByRootRequester::RangeSync { + parent: parent_request.clone(), + }; + let custody_indices = requests.indices.iter().cloned().collect(); + let synced_peers = self + .network_globals() + .peers + .read() + .synced_peers_for_epoch(parent_request.requester.batch_id(), None) + .cloned() + .collect(); + + match self.select_columns_by_range_peers_to_request( + &custody_indices, + &synced_peers, + active_requests.clone(), + &HashSet::new(), + ) { + Ok(peer_to_columns) => { + for (peer, indices) in peer_to_columns.into_iter() { + let data_columns_by_root_request = DataColumnsByRootBatchBlockRequest { + block_roots: requests.block_roots.clone(), + indices: indices.clone(), + }; + + data_column_requests.push(( + self.send_data_columns_by_root_range_requests( + peer, + data_columns_by_root_request, + requester, + ) + .expect("should be able to send request"), + indices, + )); + } + // we have sent out requests to peers, register these requests with the coupling service. + if let Some(req) = self.components_by_range_requests.get_mut(&parent_request) { + req.insert_column_request_after_block_request( + data_column_requests, + self.chain + .sampling_columns_for_epoch(parent_request.requester.batch_id()), + ) + .expect("should be in the right state"); + } + debug!(?requests, "Successfully retried requests"); + // Successfully processed, don't keep this entry + } + Err(err) => { + debug!( + ?err, + ?parent_request, + "Failed to retry request, no peers in subnets", + ); + // Failed to process, keep this entry for next retry + entries_to_keep.push((parent_request, requests)); + } + } + } + + // Re-insert entries that still need to be retried + self.requests_to_retry.extend(entries_to_keep); + + Ok(()) + } + /// A blocks by range request sent by the range sync algorithm pub fn block_components_by_range_request( &mut self, @@ -1568,7 +1655,7 @@ impl SyncNetworkContext { } else { debug!( ?data_column, - ?id, + block_request_id=?id, "Not enough column peers for batch, need to retry" ); no_peers_for_column.push(*column); @@ -1600,6 +1687,14 @@ impl SyncNetworkContext { )); } + let data_columns_by_root_request = DataColumnsByRootBatchBlockRequest { + block_roots: block_roots.clone(), + indices: no_peers_for_column, + }; + + self.requests_to_retry + .insert(id.parent_request_id, data_columns_by_root_request); + if let Some(req) = self .components_by_range_requests .get_mut(&id.parent_request_id) diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs index fc5d4fa5e9f..c8cf1847c25 100644 --- a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs @@ -30,7 +30,6 @@ impl DataColumnsByRootBatchBlockRequest { columns: columns.clone(), }) .collect(); - tracing::debug!(?ids, "Length ids"); assert!(ids.len() <= 32); Ok(DataColumnsByRootRequest::new( ids, @@ -147,8 +146,13 @@ impl ActiveRequestItems for DataColumnsByRootRangeRequestItems { .items .values() .flatten() 
- .any(|d| d.index == data_column.index) + .any(|d| d.index == data_column.index && d.block_root() == block_root) { + tracing::debug!( + ?data_column, + existing_items=?self.items, + "Duplicated data", + ); return Err(LookupVerifyError::DuplicatedData( data_column.slot(), data_column.index, From ca9cfd5d739bc4ea12013941dfeb0e5ac91490ca Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Tue, 19 Aug 2025 12:17:59 -0700 Subject: [PATCH 07/49] Small fixes --- .../src/sync/block_sidecar_coupling.rs | 9 ++++--- beacon_node/network/src/sync/manager.rs | 2 +- .../network/src/sync/network_context.rs | 26 ++++++++++++------- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index b807c2a0b21..d064ad8ccd1 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -163,14 +163,17 @@ impl RangeBlockComponentsRequest { column_peers, expected_custody_columns, } => { - *init = true; for (request, peers) in column_requests { requests.insert(request, ByRangeRequest::Active(request)); column_peers.insert(request, peers); } - for column in custody_columns { - expected_custody_columns.push(*column); + // expected custody columns should be populated only once during initialization + if !*init { + for column in custody_columns { + expected_custody_columns.push(*column); + } } + *init = true; Ok(()) } diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 9c5e116efeb..f9019026524 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -405,7 +405,7 @@ impl SyncManager { } // Try to make range requests that we failed to make because of lack of peers. - self.network.retry_pending_requests(); + let _ = self.network.retry_pending_requests(); } /// Trigger range sync for a set of peers that claim to have imported a head unknown to us. 
diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 750bd8c4adb..5cc4236394d 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -755,7 +755,7 @@ impl SyncNetworkContext { // Create the overall components_by_range request ID before its individual components let id = ComponentsByRangeRequestId { id: self.next_id(), - requester, + requester: requester.clone(), }; let blocks_req_id = self.send_blocks_by_range_request(block_peer, request.clone(), id)?; @@ -774,6 +774,8 @@ impl SyncNetworkContext { }; let data_columns_by_root = matches!(batch_type, ByRangeRequestType::BlocksAndColumns); + + debug!(?requester, data_columns_by_root, "Batch type"); let info = RangeBlockComponentsRequest::new( blocks_req_id, blobs_req_id, @@ -1607,7 +1609,7 @@ impl SyncNetworkContext { let resp = self .data_columns_by_root_range_requests .on_response(id, rpc_event); - self.on_rpc_response_result(id, "DataColumnsByRootRange", resp, peer_id, |_| 1) + self.on_rpc_response_result(id, "DataColumnsByRootRange", resp, peer_id, |b| b.len()) } #[allow(clippy::type_complexity)] @@ -1624,7 +1626,10 @@ impl SyncNetworkContext { // We have blocks here, check if they need data columns and request them let mut block_roots = Vec::new(); let batch_epoch = id.batch_id(); - if !self.chain.spec.is_peer_das_enabled_for_epoch(batch_epoch) { + if !matches!( + self.batch_type(batch_epoch), + ByRangeRequestType::BlocksAndColumns + ) { return self .on_rpc_response_result(id, "BlocksByRange", resp, peer_id, |b| b.len()); } @@ -1659,7 +1664,6 @@ impl SyncNetworkContext { "Not enough column peers for batch, need to retry" ); no_peers_for_column.push(*column); - continue; } } @@ -1687,13 +1691,15 @@ impl SyncNetworkContext { )); } - let data_columns_by_root_request = DataColumnsByRootBatchBlockRequest { - block_roots: block_roots.clone(), - indices: no_peers_for_column, - }; + if !no_peers_for_column.is_empty() { + let data_columns_by_root_request = DataColumnsByRootBatchBlockRequest { + block_roots: block_roots.clone(), + indices: no_peers_for_column, + }; - self.requests_to_retry - .insert(id.parent_request_id, data_columns_by_root_request); + self.requests_to_retry + .insert(id.parent_request_id, data_columns_by_root_request); + } if let Some(req) = self .components_by_range_requests From 68cce376ecfcb735431952a9dbd4b34d004192bd Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 20 Aug 2025 14:14:11 -0700 Subject: [PATCH 08/49] Try to avoid chains failing for rpc errors --- .../network/src/sync/range_sync/chain.rs | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 0656ecf9cc9..2da3cfdb699 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -327,7 +327,8 @@ impl SyncingChain { return Ok(KeepChain); } BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Processing(_) | BatchState::AwaitingDownload | BatchState::Failed => { + BatchState::AwaitingDownload => return Ok(KeepChain), + BatchState::Processing(_) | BatchState::Failed => { // these are all inconsistent states: // - Processing -> `self.current_processing_batch` is None // - Failed -> non recoverable batch. 
For an optimistic batch, it should @@ -361,7 +362,8 @@ impl SyncingChain { // Batch is not ready, nothing to process } BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(_) => { + BatchState::AwaitingDownload => return Ok(KeepChain), + BatchState::Failed | BatchState::Processing(_) => { // these are all inconsistent states: // - Failed -> non recoverable batch. Chain should have been removed // - AwaitingDownload -> A recoverable failed batch should have been @@ -559,7 +561,7 @@ impl SyncingChain { batch.processing_completed(BatchProcessingResult::NonFaultyFailure)?; // Simply re-download the batch. - self.send_batch(network, batch_id) + self.attempt_send_awaiting_download_batches(network, "non-faulty-failure") } } } @@ -729,7 +731,6 @@ impl SyncingChain { } // this is our robust `processing_target`. All previous batches must be awaiting // validation - let mut redownload_queue = Vec::new(); for (id, batch) in self.batches.range_mut(..batch_id) { if let BatchOperationOutcome::Failed { blacklist } = batch.validation_failed()? { @@ -739,18 +740,14 @@ impl SyncingChain { failing_batch: *id, }); } - redownload_queue.push(*id); } // no batch maxed out it process attempts, so now the chain's volatile progress must be // reset self.processing_target = self.start_epoch; - for id in redownload_queue { - self.send_batch(network, id)?; - } // finally, re-request the failed batch. - self.send_batch(network, batch_id) + self.attempt_send_awaiting_download_batches(network, "handle_invalid_batch") } pub fn stop_syncing(&mut self) { @@ -891,7 +888,7 @@ impl SyncingChain { failing_batch: batch_id, }); } - self.send_batch(network, batch_id) + self.attempt_send_awaiting_download_batches(network, "injecting error") } else { debug!( batch_epoch = %batch_id, @@ -905,6 +902,24 @@ impl SyncingChain { } } + pub fn attempt_send_awaiting_download_batches( + &mut self, + network: &mut SyncNetworkContext, + src: &str, + ) -> ProcessingResult { + debug!(?src, "In attempt_send_awaiting download batches"); + // Check all batches in AwaitingDownload state and see if they can be sent + for (batch_id, batch) in self.batches.iter() { + if matches!(batch.state(), BatchState::AwaitingDownload) { + debug!(?src, ?batch_id, "Sending batch"); + if self.good_peers_on_sampling_subnets(*batch_id, network) { + return self.send_batch(network, *batch_id); + } + } + } + Ok(KeepChain) + } + /// Requests the batch assigned to the given id from a given peer. 
pub fn send_batch( &mut self, From 6da924b1288977ad0def3dfa26228175d7c52433 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 20 Aug 2025 14:14:26 -0700 Subject: [PATCH 09/49] Fix bug in initialization code --- beacon_node/network/src/sync/block_sidecar_coupling.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index d064ad8ccd1..fc9c4567ae3 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -155,6 +155,10 @@ impl RangeBlockComponentsRequest { column_requests: Vec<(DataColumnsByRootRequestId, Vec)>, custody_columns: &[ColumnIndex], ) -> Result<(), String> { + // Nothing to insert, do not initialize + if column_requests.is_empty() { + return Ok(()); + } match &mut self.block_data_request { RangeBlockDataRequest::DataColumnsFromRoot { init, From 1a0df3042cfc8f996c3f3fad2c68d95cbdc8eaf6 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 20 Aug 2025 14:14:48 -0700 Subject: [PATCH 10/49] Also penalize all batch peers for availability check errors --- .../network_beacon_processor/sync_methods.rs | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 9967f9c5e2b..f1840cde242 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -6,8 +6,10 @@ use crate::sync::{ manager::{BlockProcessType, SyncMessage}, }; use beacon_chain::block_verification_types::{AsBlock, RpcBlock}; -use beacon_chain::data_availability_checker::AvailabilityCheckError; use beacon_chain::data_availability_checker::MaybeAvailableBlock; +use beacon_chain::data_availability_checker::{ + AvailabilityCheckError, AvailabilityCheckErrorCategory, +}; use beacon_chain::{ AvailabilityProcessingStatus, BeaconChainTypes, BlockError, ChainSegmentResult, ExecutionPayloadError, HistoricalBlockError, NotifyExecutionLayer, @@ -836,6 +838,27 @@ impl NetworkBeaconProcessor { peer_action: Some(PeerAction::Fatal), }) } + BlockError::AvailabilityCheck(err) => { + if matches!(err.category(), AvailabilityCheckErrorCategory::Malicious) { + debug!( + msg = "peer sent invalid block", + outcome = ?err, + "Invalid block received" + ); + + Err(ChainSegmentFailed { + message: format!("Peer sent invalid block. Reason: {:?}", err), + // Do not penalize peers for internal errors. + peer_action: Some(PeerAction::MidToleranceError), + }) + } else { + Err(ChainSegmentFailed { + message: format!("Peer sent invalid block. Reason: {:?}", err), + // Do not penalize peers for internal errors. 
+ peer_action: None, + }) + } + } other => { debug!( msg = "peer sent invalid block", From 17c4e348f53466cadd435c15e155fb78464b5fe8 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 20 Aug 2025 14:18:15 -0700 Subject: [PATCH 11/49] Avoid root requests for backfill sync --- beacon_node/network/src/sync/network_context.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 5cc4236394d..b4c145d5144 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1623,6 +1623,14 @@ impl SyncNetworkContext { match &resp { // todo(pawan): send the data column request as soon as you get each chunk to spread out requests Some(Ok((blocks, _))) => { + // Return early if this is a backfill batch, backfill batches are handled by range requests instead of root + if matches!( + id.parent_request_id.requester, + RangeRequestId::BackfillSync { .. } + ) { + return self + .on_rpc_response_result(id, "BlocksByRange", resp, peer_id, |b| b.len()); + } // We have blocks here, check if they need data columns and request them let mut block_roots = Vec::new(); let batch_epoch = id.batch_id(); From fdce537747507b218ae3c0738ad2f3f7f9153028 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 20 Aug 2025 18:37:44 -0700 Subject: [PATCH 12/49] Implement responsible peer tracking --- .../src/service/api_types.rs | 2 + .../network_beacon_processor/sync_methods.rs | 84 +++++++++++++------ .../network/src/sync/backfill_sync/mod.rs | 45 +++++++--- .../src/sync/block_sidecar_coupling.rs | 24 +++++- beacon_node/network/src/sync/manager.rs | 20 +++-- .../network/src/sync/network_context.rs | 16 +++- .../network/src/sync/range_sync/batch.rs | 84 +++++++++++++------ .../network/src/sync/range_sync/chain.rs | 56 +++++++++---- .../network/src/sync/range_sync/mod.rs | 2 +- .../network/src/sync/range_sync/range.rs | 9 +- 10 files changed, 251 insertions(+), 91 deletions(-) diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index 77d29133709..65a6cf61c5d 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -47,6 +47,8 @@ pub struct BlocksByRangeRequestId { pub id: Id, /// The Id of the overall By Range request for block components. 
pub parent_request_id: ComponentsByRangeRequestId, + /// The peer that we made this request to + pub peer_id: PeerId, } impl BlocksByRangeRequestId { diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index f1840cde242..e722e3cf7eb 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -1,15 +1,14 @@ use crate::metrics::{self, register_process_result_metrics}; use crate::network_beacon_processor::{FUTURE_SLOT_TOLERANCE, NetworkBeaconProcessor}; use crate::sync::BatchProcessResult; +use crate::sync::manager::FaultyComponent; use crate::sync::{ ChainId, manager::{BlockProcessType, SyncMessage}, }; use beacon_chain::block_verification_types::{AsBlock, RpcBlock}; +use beacon_chain::data_availability_checker::AvailabilityCheckError; use beacon_chain::data_availability_checker::MaybeAvailableBlock; -use beacon_chain::data_availability_checker::{ - AvailabilityCheckError, AvailabilityCheckErrorCategory, -}; use beacon_chain::{ AvailabilityProcessingStatus, BeaconChainTypes, BlockError, ChainSegmentResult, ExecutionPayloadError, HistoricalBlockError, NotifyExecutionLayer, @@ -44,6 +43,8 @@ struct ChainSegmentFailed { message: String, /// Used to penalize peers. peer_action: Option, + /// Used to identify the faulty component + faulty_component: Option, } impl NetworkBeaconProcessor { @@ -471,6 +472,7 @@ impl NetworkBeaconProcessor { Some(penalty) => BatchProcessResult::FaultyFailure { imported_blocks, penalty, + faulty_component: e.faulty_component, }, None => BatchProcessResult::NonFaultyFailure, } @@ -523,6 +525,7 @@ impl NetworkBeaconProcessor { Some(penalty) => BatchProcessResult::FaultyFailure { imported_blocks: 0, penalty, + faulty_component: e.faulty_component, }, None => BatchProcessResult::NonFaultyFailure, } @@ -595,15 +598,18 @@ impl NetworkBeaconProcessor { Err(ChainSegmentFailed { peer_action: None, message: "Failed to check block availability".into(), + faulty_component: None, }), ); } + e => { return ( 0, Err(ChainSegmentFailed { peer_action: Some(PeerAction::LowToleranceError), message: format!("Failed to check block availability : {:?}", e), + faulty_component: None, // Todo(pawan): replicate behaviour in forward sync once its proven }), ); } @@ -620,6 +626,7 @@ impl NetworkBeaconProcessor { (total_blocks - available_blocks.len()), total_blocks ), + faulty_component: Some(FaultyComponent::Blocks), }), ); } @@ -635,7 +642,7 @@ impl NetworkBeaconProcessor { metrics::inc_counter( &metrics::BEACON_PROCESSOR_BACKFILL_CHAIN_SEGMENT_FAILED_TOTAL, ); - let peer_action = match &e { + let (peer_action, faulty_component) = match &e { HistoricalBlockError::MismatchedBlockRoot { block_root, expected_block_root, @@ -647,7 +654,10 @@ impl NetworkBeaconProcessor { "Backfill batch processing error" ); // The peer is faulty if they send blocks with bad roots. - Some(PeerAction::LowToleranceError) + ( + Some(PeerAction::LowToleranceError), + Some(FaultyComponent::Blocks), + ) } HistoricalBlockError::InvalidSignature | HistoricalBlockError::SignatureSet(_) => { @@ -656,7 +666,10 @@ impl NetworkBeaconProcessor { "Backfill batch processing error" ); // The peer is faulty if they bad signatures. 
- Some(PeerAction::LowToleranceError) + ( + Some(PeerAction::LowToleranceError), + Some(FaultyComponent::Blocks), + ) } HistoricalBlockError::ValidatorPubkeyCacheTimeout => { warn!( @@ -664,7 +677,7 @@ impl NetworkBeaconProcessor { "Backfill batch processing error" ); // This is an internal error, do not penalize the peer. - None + (None, None) } HistoricalBlockError::IndexOutOfBounds => { error!( @@ -672,12 +685,12 @@ impl NetworkBeaconProcessor { "Backfill batch OOB error" ); // This should never occur, don't penalize the peer. - None + (None, None) } HistoricalBlockError::StoreError(e) => { warn!(error = ?e, "Backfill batch processing error"); // This is an internal error, don't penalize the peer. - None + (None, None) } // // Do not use a fallback match, handle all errors explicitly }; @@ -688,6 +701,7 @@ impl NetworkBeaconProcessor { message: format!("{:?}", err_str), // This is an internal error, don't penalize the peer. peer_action, + faulty_component, }), ) } @@ -702,7 +716,8 @@ impl NetworkBeaconProcessor { Err(ChainSegmentFailed { message: format!("Block has an unknown parent: {}", parent_root), // Peers are faulty if they send non-sequential blocks. - peer_action: Some(PeerAction::LowToleranceError), + peer_action: Some(PeerAction::LowToleranceError), // todo(pawan): revise this + faulty_component: Some(FaultyComponent::Blocks), }) } BlockError::DuplicateFullyImported(_) @@ -741,6 +756,7 @@ impl NetworkBeaconProcessor { ), // Peers are faulty if they send blocks from the future. peer_action: Some(PeerAction::LowToleranceError), + faulty_component: Some(FaultyComponent::Blocks), }) } BlockError::WouldRevertFinalizedSlot { .. } => { @@ -757,6 +773,7 @@ impl NetworkBeaconProcessor { block_parent_root ), peer_action: Some(PeerAction::Fatal), + faulty_component: Some(FaultyComponent::Blocks), }) } BlockError::GenesisBlock => { @@ -774,6 +791,7 @@ impl NetworkBeaconProcessor { message: format!("Internal error whilst processing block: {:?}", e), // Do not penalize peers for internal errors. peer_action: None, + faulty_component: None, }) } ref err @ BlockError::ExecutionPayloadError(ref epe) => { @@ -788,6 +806,7 @@ impl NetworkBeaconProcessor { err ), peer_action: Some(PeerAction::LowToleranceError), + faulty_component: Some(FaultyComponent::Blocks), // todo(pawan): recheck this }) } else if !epe.penalize_peer() { // These errors indicate an issue with the EL and not the `ChainSegment`. @@ -801,6 +820,7 @@ impl NetworkBeaconProcessor { message: format!("Execution layer offline. Reason: {:?}", err), // Do not penalize peers for internal errors. peer_action: None, + faulty_component: None, }) } else { debug!( @@ -813,6 +833,7 @@ impl NetworkBeaconProcessor { err ), peer_action: Some(PeerAction::LowToleranceError), + faulty_component: Some(FaultyComponent::Blocks), }) } } @@ -828,6 +849,7 @@ impl NetworkBeaconProcessor { // of a faulty EL it will usually require manual intervention to fix anyway, so // it's not too bad if we drop most of our peers. peer_action: Some(PeerAction::LowToleranceError), + faulty_component: Some(FaultyComponent::Blocks), }) } // Penalise peers for sending us banned blocks. 
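The hunks above thread a `faulty_component` through every `ChainSegmentFailed`, alongside the existing `peer_action`. The point, as wired up in the range sync and backfill changes later in this patch, is that a failed batch can be attributed to the peer or peers that actually served the faulty component, rather than blaming whichever peer supplied the blocks. Below is a rough, self-contained sketch of that attribution step: `FaultyComponent` and `ResponsiblePeers` mirror the types added in this patch, while `peers_to_penalize` is an illustrative helper (not a function in the tree) and `PeerId`/`ColumnIndex` are plain stand-ins for the real `libp2p`/`types` aliases.

use std::collections::HashMap;

// Stand-ins for `libp2p::PeerId` and `types::ColumnIndex`.
type PeerId = u64;
type ColumnIndex = u64;

// Mirrors the enum added to `sync/manager.rs` in this patch.
enum FaultyComponent {
    Blocks,
    Blobs,
    Columns(Vec<ColumnIndex>),
}

// Mirrors the struct added to `range_sync/batch.rs` in this patch.
struct ResponsiblePeers {
    block_blob: PeerId,
    data_columns: HashMap<PeerId, Vec<ColumnIndex>>,
}

// Illustrative helper: pick which peers to report for a faulty batch.
fn peers_to_penalize(responsible: &ResponsiblePeers, faulty: &FaultyComponent) -> Vec<PeerId> {
    match faulty {
        // Blocks and blobs for a batch come from a single peer.
        FaultyComponent::Blocks | FaultyComponent::Blobs => vec![responsible.block_blob],
        // Columns may be spread across several peers; only report the peers
        // that served one of the faulty column indices.
        FaultyComponent::Columns(faulty_columns) => responsible
            .data_columns
            .iter()
            .filter(|(_, columns)| faulty_columns.iter().any(|c| columns.contains(c)))
            .map(|(peer, _)| *peer)
            .collect(),
    }
}

fn main() {
    let responsible = ResponsiblePeers {
        block_blob: 1,
        data_columns: HashMap::from([(2, vec![0, 1]), (3, vec![2, 3])]),
    };
    // A batch that fails its availability check on column 3 only reports peer 3.
    assert_eq!(
        peers_to_penalize(&responsible, &FaultyComponent::Columns(vec![3])),
        vec![3]
    );
}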
@@ -836,27 +858,40 @@ impl NetworkBeaconProcessor { Err(ChainSegmentFailed { message: format!("Banned block: {block_root:?}"), peer_action: Some(PeerAction::Fatal), + faulty_component: Some(FaultyComponent::Blocks), }) } - BlockError::AvailabilityCheck(err) => { - if matches!(err.category(), AvailabilityCheckErrorCategory::Malicious) { - debug!( - msg = "peer sent invalid block", - outcome = ?err, - "Invalid block received" - ); - - Err(ChainSegmentFailed { + ref err @ BlockError::AvailabilityCheck(ref e) => { + match &e { + AvailabilityCheckError::InvalidBlobs(_) + | AvailabilityCheckError::BlobIndexInvalid(_) => Err(ChainSegmentFailed { + message: format!("Peer sent invalid block. Reason: {:?}", err), + // Do not penalize peers for internal errors. + peer_action: Some(PeerAction::LowToleranceError), + faulty_component: Some(FaultyComponent::Blobs), + }), + AvailabilityCheckError::InvalidColumn(columns) => Err(ChainSegmentFailed { message: format!("Peer sent invalid block. Reason: {:?}", err), // Do not penalize peers for internal errors. peer_action: Some(PeerAction::MidToleranceError), - }) - } else { - Err(ChainSegmentFailed { + faulty_component: Some(FaultyComponent::Columns( + columns.iter().map(|v| v.0).collect(), + )), + }), + AvailabilityCheckError::DataColumnIndexInvalid(column) => { + Err(ChainSegmentFailed { + message: format!("Peer sent invalid block. Reason: {:?}", err), + // Do not penalize peers for internal errors. + peer_action: Some(PeerAction::MidToleranceError), + faulty_component: Some(FaultyComponent::Columns(vec![*column])), + }) + } + _ => Err(ChainSegmentFailed { message: format!("Peer sent invalid block. Reason: {:?}", err), // Do not penalize peers for internal errors. - peer_action: None, - }) + peer_action: Some(PeerAction::MidToleranceError), + faulty_component: None, + }), } } other => { @@ -870,6 +905,7 @@ impl NetworkBeaconProcessor { message: format!("Peer sent invalid block. Reason: {:?}", other), // Do not penalize peers for internal errors. 
peer_action: None, + faulty_component: None, }) } } diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index dc70b08aec5..ac47310b3f0 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -10,12 +10,13 @@ use crate::network_beacon_processor::ChainSegmentProcessId; use crate::sync::block_sidecar_coupling::CouplingError; -use crate::sync::manager::BatchProcessResult; +use crate::sync::manager::{BatchProcessResult, FaultyComponent}; use crate::sync::network_context::{ RangeRequestId, RpcRequestSendError, RpcResponseError, SyncNetworkContext, }; use crate::sync::range_sync::{ BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, + ResponsiblePeers, }; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; @@ -380,9 +381,9 @@ impl BackFillSync { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer_id: &PeerId, request_id: Id, blocks: Vec>, + responsible_peers: ResponsiblePeers, ) -> Result { // check if we have this batch let Some(batch) = self.batches.get_mut(&batch_id) else { @@ -401,7 +402,7 @@ impl BackFillSync { return Ok(ProcessResult::Successful); } - match batch.download_completed(blocks, *peer_id) { + match batch.download_completed(blocks, responsible_peers) { Ok(received) => { let awaiting_batches = self.processing_target.saturating_sub(batch_id) / BACKFILL_EPOCHS_PER_BATCH; @@ -557,7 +558,7 @@ impl BackFillSync { } }; - let Some(peer) = batch.processing_peer() else { + let Some(responsible_peers) = batch.processing_peers() else { self.fail_sync(BackFillError::BatchInvalidState( batch_id, String::from("Peer does not exist"), @@ -569,8 +570,8 @@ impl BackFillSync { ?result, %batch, batch_epoch = %batch_id, - %peer, - client = %network.client_type(peer), + ?responsible_peers, + // client = %network.client_type(peer), "Backfill batch processed" ); @@ -613,7 +614,31 @@ impl BackFillSync { BatchProcessResult::FaultyFailure { imported_blocks, penalty, + faulty_component, } => { + let Some(responsible_peers) = batch.responsible_peers() else { + crit!("Shouldn't happen"); + return self + .fail_sync(BackFillError::BatchProcessingFailed(batch_id)) + .map(|_| ProcessResult::Successful); + }; + // Penalize the peer appropriately. + match faulty_component { + Some(FaultyComponent::Blocks) | Some(FaultyComponent::Blobs) => { + network.report_peer(responsible_peers.block_blob, *penalty, "faulty_batch"); + } + // todo(pawan): clean this up + Some(FaultyComponent::Columns(faulty_columns)) => { + for (peer, columns) in responsible_peers.data_columns.iter() { + for faulty_column in faulty_columns { + if columns.contains(faulty_column) { + network.report_peer(*peer, *penalty, "faulty_batch"); + } + } + } + } + None => {} + } match batch.processing_completed(BatchProcessingResult::FaultyFailure) { Err(e) => { // Batch was in the wrong state @@ -687,7 +712,7 @@ impl BackFillSync { // Batch is not ready, nothing to process } BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(_) => { + BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(..) => { // these are all inconsistent states: // - Failed -> non recoverable batch. 
Chain should have been removed // - AwaitingDownload -> A recoverable failed batch should have been @@ -698,7 +723,7 @@ impl BackFillSync { )))?; return Ok(ProcessResult::Successful); } - BatchState::AwaitingValidation(_) => { + BatchState::AwaitingValidation(_, _) => { // TODO: I don't think this state is possible, log a CRIT just in case. // If this is not observed, add it to the failed state branch above. crit!( @@ -748,7 +773,7 @@ impl BackFillSync { // only for batches awaiting validation can we be sure the last attempt is // right, and thus, that any different attempt is wrong match batch.state() { - BatchState::AwaitingValidation(processed_attempt) => { + BatchState::AwaitingValidation(processed_attempt, _) => { for attempt in batch.attempts() { // The validated batch has been re-processed if attempt.hash != processed_attempt.hash { @@ -794,7 +819,7 @@ impl BackFillSync { crit!("batch indicates inconsistent chain state while advancing chain") } BatchState::AwaitingProcessing(..) => {} - BatchState::Processing(_) => { + BatchState::Processing(..) => { debug!(batch = %id, %batch, "Advancing chain while processing a batch"); if let Some(processing_id) = self.current_processing_batch && id >= processing_id diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index fc9c4567ae3..9e72090561d 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -1,3 +1,5 @@ +use crate::sync::network_context::MAX_COLUMN_RETRIES; +use crate::sync::range_sync::ResponsiblePeers; use beacon_chain::{ block_verification_types::RpcBlock, data_column_verification::CustodyDataColumn, get_block_root, }; @@ -14,8 +16,6 @@ use types::{ Hash256, RuntimeVariableList, SignedBeaconBlock, }; -use crate::sync::network_context::MAX_COLUMN_RETRIES; - /// Accumulates and couples beacon blocks with their associated data (blobs or data columns) /// from range sync network responses. /// @@ -30,6 +30,7 @@ use crate::sync::network_context::MAX_COLUMN_RETRIES; pub struct RangeBlockComponentsRequest { /// Blocks we have received awaiting for their corresponding sidecar. blocks_request: ByRangeRequest>>>, + block_peer: PeerId, /// Sidecars we have received awaiting for their corresponding block. block_data_request: RangeBlockDataRequest, } @@ -95,6 +96,7 @@ impl RangeBlockComponentsRequest { )>, data_columns_from_root: bool, ) -> Self { + let block_peer = blocks_req_id.peer_id; let block_data_request = if let Some(blobs_req_id) = blobs_req_id { RangeBlockDataRequest::Blobs(ByRangeRequest::Active(blobs_req_id)) } else if let Some((requests, expected_custody_columns)) = data_columns { @@ -122,10 +124,28 @@ impl RangeBlockComponentsRequest { Self { blocks_request: ByRangeRequest::Active(blocks_req_id), + block_peer, block_data_request, } } + pub fn responsible_peers(&self) -> ResponsiblePeers { + ResponsiblePeers { + block_blob: self.block_peer, + data_columns: match &self.block_data_request { + RangeBlockDataRequest::NoData | RangeBlockDataRequest::Blobs(_) => HashMap::new(), + RangeBlockDataRequest::DataColumns { column_peers, .. } => column_peers + .iter() + .map(|(k, v)| (k.peer, v.clone())) + .collect(), + RangeBlockDataRequest::DataColumnsFromRoot { column_peers, .. } => column_peers + .iter() + .map(|(k, v)| (k.peer, v.clone())) + .collect(), + }, + } + } + /// Modifies `self` by inserting a new `DataColumnsByRangeRequestId` for a formerly failed /// request for some columns. 
pub fn reinsert_failed_column_requests( diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index f9019026524..3bda91ad1c5 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -70,7 +70,8 @@ use std::time::Duration; use tokio::sync::mpsc; use tracing::{debug, error, info, trace}; use types::{ - BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, Hash256, SignedBeaconBlock, Slot, + BlobSidecar, ColumnIndex, DataColumnSidecar, EthSpec, ForkContext, Hash256, SignedBeaconBlock, + Slot, }; /// The number of slots ahead of us that is allowed before requesting a long-range (batch) Sync @@ -205,10 +206,19 @@ pub enum BatchProcessResult { FaultyFailure { imported_blocks: usize, penalty: PeerAction, + faulty_component: Option, }, NonFaultyFailure, } +/// Identifies the specific component that was faulty if the batch was a faulty failure. +#[derive(Debug)] +pub enum FaultyComponent { + Blocks, + Blobs, + Columns(Vec), +} + /// The primary object for handling and driving all the current syncing logic. It maintains the /// current state of the syncing process, the number of useful peers, downloaded blocks and /// controls the logic behind both the long-range (batch) sync and the on-going potential parent @@ -1218,7 +1228,7 @@ impl SyncManager { peer_id: PeerId, range_block_component: RangeBlockComponent, ) { - if let Some(resp) = self + if let Some((resp, responsible_peers)) = self .network .range_block_component_response(range_request_id, range_block_component) { @@ -1228,7 +1238,7 @@ impl SyncManager { RangeRequestId::RangeSync { chain_id, batch_id } => { self.range_sync.blocks_by_range_response( &mut self.network, - peer_id, + responsible_peers, chain_id, batch_id, range_request_id.id, @@ -1240,9 +1250,9 @@ impl SyncManager { match self.backfill_sync.on_block_response( &mut self.network, batch_id, - &peer_id, range_request_id.id, blocks, + responsible_peers, ) { Ok(ProcessResult::SyncCompleted) => self.update_sync_state(), Ok(ProcessResult::Successful) => {} @@ -1259,7 +1269,7 @@ impl SyncManager { RangeRequestId::RangeSync { chain_id, batch_id } => { self.range_sync.inject_error( &mut self.network, - peer_id, + responsible_peers, batch_id, chain_id, range_request_id.id, diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index b4c145d5144..c50769ddf6d 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -18,6 +18,7 @@ use crate::sync::block_sidecar_coupling::CouplingError; use crate::sync::network_context::requests::{ BlobsByRootSingleBlockRequest, DataColumnsByRootRangeRequestItems, }; +use crate::sync::range_sync::ResponsiblePeers; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; use custody::CustodyRequestResult; @@ -848,7 +849,10 @@ impl SyncNetworkContext { &mut self, id: ComponentsByRangeRequestId, range_block_component: RangeBlockComponent, - ) -> Option>, RpcResponseError>> { + ) -> Option<( + Result>, RpcResponseError>, + ResponsiblePeers, + )> { let Entry::Occupied(mut entry) = self.components_by_range_requests.entry(id) else { metrics::inc_counter_vec(&metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, &["range_blocks"]); return None; @@ -895,12 +899,14 @@ impl SyncNetworkContext { } } } { + let responsible_peers = entry.get().responsible_peers(); entry.remove(); - return Some(Err(e)); + return 
Some((Err(e), responsible_peers)); } let range_req = entry.get_mut(); if let Some(blocks_result) = range_req.responses(&self.chain.spec) { + let responsible_peers = range_req.responsible_peers(); if let Err(CouplingError::DataColumnPeerFailure { action: _, error, @@ -923,7 +929,10 @@ impl SyncNetworkContext { entry.remove(); } // If the request is finished, dequeue everything - Some(blocks_result.map_err(RpcResponseError::BlockComponentCouplingError)) + Some(( + blocks_result.map_err(RpcResponseError::BlockComponentCouplingError), + responsible_peers, + )) } else { None } @@ -1256,6 +1265,7 @@ impl SyncNetworkContext { let id = BlocksByRangeRequestId { id: self.next_id(), parent_request_id, + peer_id, }; self.network_send .send(NetworkMessage::SendRequest { diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 1f516139969..14dd07ae31b 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -2,13 +2,13 @@ use beacon_chain::block_verification_types::RpcBlock; use lighthouse_network::PeerId; use lighthouse_network::rpc::methods::BlocksByRangeRequest; use lighthouse_network::service::api_types::Id; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fmt; use std::hash::{Hash, Hasher}; use std::ops::Sub; use std::time::{Duration, Instant}; use strum::Display; -use types::{Epoch, EthSpec, Slot}; +use types::{ColumnIndex, Epoch, EthSpec, Slot}; /// The number of times to retry a batch before it is considered failed. const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; @@ -127,6 +127,15 @@ impl fmt::Display for BatchInfo { } } +/// The peers that we got responses for this batch from. +/// +/// This is used for penalizing in case of invalid batches. +#[derive(Debug, Clone)] +pub struct ResponsiblePeers { + pub block_blob: PeerId, + pub data_columns: HashMap>, +} + #[derive(Display)] /// Current state of a batch pub enum BatchState { @@ -135,15 +144,15 @@ pub enum BatchState { /// The batch is being downloaded. Downloading(Id), /// The batch has been completely downloaded and is ready for processing. - AwaitingProcessing(PeerId, Vec>, Instant), + AwaitingProcessing(ResponsiblePeers, Vec>, Instant), /// The batch is being processed. - Processing(Attempt), + Processing(Attempt, ResponsiblePeers), // todo(pawan): attempt contains the peer, remove that /// The batch was successfully processed and is waiting to be validated. /// /// It is not sufficient to process a batch successfully to consider it correct. This is /// because batches could be erroneously empty, or incomplete. Therefore, a batch is considered /// valid, only if the next sequential batch imports at least a block. - AwaitingValidation(Attempt), + AwaitingValidation(Attempt, ResponsiblePeers), /// Intermediate state for inner state handling. Poisoned, /// The batch has maxed out the allowed attempts for either downloading or processing. It @@ -213,13 +222,15 @@ impl BatchInfo { false } - /// Returns the peer that is currently responsible for progressing the state of the batch. - pub fn processing_peer(&self) -> Option<&PeerId> { + /// Returns the peers that are currently responsible for progressing the state of the batch. + pub fn processing_peers(&self) -> Option<&ResponsiblePeers> { match &self.state { BatchState::AwaitingDownload | BatchState::Failed | BatchState::Downloading(..) => None, - BatchState::AwaitingProcessing(peer_id, _, _) - | BatchState::Processing(Attempt { peer_id, .. 
}) - | BatchState::AwaitingValidation(Attempt { peer_id, .. }) => Some(peer_id), + BatchState::AwaitingProcessing(responsible_peers, _, _) + | BatchState::Processing(Attempt { .. }, responsible_peers) + | BatchState::AwaitingValidation(Attempt { .. }, responsible_peers) => { + Some(responsible_peers) + } BatchState::Poisoned => unreachable!("Poisoned batch"), } } @@ -276,12 +287,13 @@ impl BatchInfo { pub fn download_completed( &mut self, blocks: Vec>, - peer: PeerId, + responsible_peers: ResponsiblePeers, ) -> Result { match self.state.poison() { BatchState::Downloading(_) => { let received = blocks.len(); - self.state = BatchState::AwaitingProcessing(peer, blocks, Instant::now()); + self.state = + BatchState::AwaitingProcessing(responsible_peers, blocks, Instant::now()); Ok(received) } BatchState::Poisoned => unreachable!("Poisoned batch"), @@ -350,8 +362,11 @@ impl BatchInfo { pub fn start_processing(&mut self) -> Result<(Vec>, Duration), WrongState> { match self.state.poison() { - BatchState::AwaitingProcessing(peer, blocks, start_instant) => { - self.state = BatchState::Processing(Attempt::new::(peer, &blocks)); + BatchState::AwaitingProcessing(responsible_peers, blocks, start_instant) => { + self.state = BatchState::Processing( + Attempt::new::(responsible_peers.block_blob, &blocks), + responsible_peers, + ); Ok((blocks, start_instant.elapsed())) } BatchState::Poisoned => unreachable!("Poisoned batch"), @@ -365,14 +380,28 @@ impl BatchInfo { } } + pub fn responsible_peers(&self) -> Option<&ResponsiblePeers> { + match &self.state { + BatchState::AwaitingDownload + | BatchState::Failed + | BatchState::Poisoned + | BatchState::Downloading(_) => None, + BatchState::AwaitingProcessing(r, _, _) + | BatchState::AwaitingValidation(_, r) + | BatchState::Processing(_, r) => Some(r), + } + } + pub fn processing_completed( &mut self, processing_result: BatchProcessingResult, ) -> Result { match self.state.poison() { - BatchState::Processing(attempt) => { + BatchState::Processing(attempt, responsible_peers) => { self.state = match processing_result { - BatchProcessingResult::Success => BatchState::AwaitingValidation(attempt), + BatchProcessingResult::Success => { + BatchState::AwaitingValidation(attempt, responsible_peers) + } BatchProcessingResult::FaultyFailure => { // register the failed attempt self.failed_processing_attempts.push(attempt); @@ -408,7 +437,7 @@ impl BatchInfo { #[must_use = "Batch may have failed"] pub fn validation_failed(&mut self) -> Result { match self.state.poison() { - BatchState::AwaitingValidation(attempt) => { + BatchState::AwaitingValidation(attempt, responsible_peers) => { self.failed_processing_attempts.push(attempt); // check if the batch can be downloaded again @@ -459,16 +488,21 @@ impl Attempt { impl std::fmt::Debug for BatchState { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - BatchState::Processing(Attempt { peer_id, hash: _ }) => { - write!(f, "Processing({})", peer_id) + BatchState::Processing(Attempt { peer_id, hash: _ }, responsible_peers) => { + write!(f, "Processing({}) {:?}", peer_id, responsible_peers) } - BatchState::AwaitingValidation(Attempt { peer_id, hash: _ }) => { - write!(f, "AwaitingValidation({})", peer_id) + BatchState::AwaitingValidation(Attempt { peer_id, hash: _ }, responsible_peers) => { + write!(f, "AwaitingValidation({}) {:?}", peer_id, responsible_peers) } BatchState::AwaitingDownload => f.write_str("AwaitingDownload"), BatchState::Failed => f.write_str("Failed"), - 
BatchState::AwaitingProcessing(peer, blocks, _) => { - write!(f, "AwaitingProcessing({}, {} blocks)", peer, blocks.len()) + BatchState::AwaitingProcessing(responsible_peers, blocks, _) => { + write!( + f, + "AwaitingProcessing({:?}, {:?} blocks)", + responsible_peers, + blocks.len() + ) } BatchState::Downloading(request_id) => { write!(f, "Downloading({})", request_id) @@ -484,8 +518,8 @@ impl BatchState { fn visualize(&self) -> char { match self { BatchState::Downloading(..) => 'D', - BatchState::Processing(_) => 'P', - BatchState::AwaitingValidation(_) => 'v', + BatchState::Processing(_, _) => 'P', + BatchState::AwaitingValidation(_, _) => 'v', BatchState::AwaitingDownload => 'd', BatchState::Failed => 'F', BatchState::AwaitingProcessing(..) => 'p', diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 2da3cfdb699..bd176a0708c 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -3,7 +3,9 @@ use super::batch::{BatchInfo, BatchProcessingResult, BatchState}; use crate::metrics; use crate::network_beacon_processor::ChainSegmentProcessId; use crate::sync::block_sidecar_coupling::CouplingError; +use crate::sync::manager::FaultyComponent; use crate::sync::network_context::{RangeRequestId, RpcRequestSendError, RpcResponseError}; +use crate::sync::range_sync::batch::ResponsiblePeers; use crate::sync::{BatchOperationOutcome, BatchProcessResult, network_context::SyncNetworkContext}; use beacon_chain::BeaconChainTypes; use beacon_chain::block_verification_types::RpcBlock; @@ -209,9 +211,9 @@ impl SyncingChain { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer_id: &PeerId, request_id: Id, blocks: Vec>, + responsible_peers: ResponsiblePeers, ) -> ProcessingResult { // check if we have this batch let batch = match self.batches.get_mut(&batch_id) { @@ -238,7 +240,7 @@ impl SyncingChain { // Remove the request from the peer's active batches // TODO(das): should use peer group here https://github.com/sigp/lighthouse/issues/6258 - let received = batch.download_completed(blocks, *peer_id)?; + let received = batch.download_completed(blocks, responsible_peers)?; let awaiting_batches = batch_id .saturating_sub(self.optimistic_start.unwrap_or(self.processing_target)) / EPOCHS_PER_BATCH; @@ -328,7 +330,7 @@ impl SyncingChain { } BatchState::Poisoned => unreachable!("Poisoned batch"), BatchState::AwaitingDownload => return Ok(KeepChain), - BatchState::Processing(_) | BatchState::Failed => { + BatchState::Processing(_, _) | BatchState::Failed => { // these are all inconsistent states: // - Processing -> `self.current_processing_batch` is None // - Failed -> non recoverable batch. For an optimistic batch, it should @@ -340,7 +342,7 @@ impl SyncingChain { state ))); } - BatchState::AwaitingValidation(_) => { + BatchState::AwaitingValidation(_, _) => { // If an optimistic start is given to the chain after the corresponding // batch has been requested and processed we can land here. We drop the // optimistic candidate since we can't conclude whether the batch included @@ -363,7 +365,7 @@ impl SyncingChain { } BatchState::Poisoned => unreachable!("Poisoned batch"), BatchState::AwaitingDownload => return Ok(KeepChain), - BatchState::Failed | BatchState::Processing(_) => { + BatchState::Failed | BatchState::Processing(_, _) => { // these are all inconsistent states: // - Failed -> non recoverable batch. 
Chain should have been removed // - AwaitingDownload -> A recoverable failed batch should have been @@ -374,7 +376,7 @@ impl SyncingChain { state ))); } - BatchState::AwaitingValidation(_) => { + BatchState::AwaitingValidation(_, _) => { // we can land here if an empty optimistic batch succeeds processing and is // inside the download buffer (between `self.processing_target` and // `self.to_be_downloaded`). In this case, eventually the chain advances to the @@ -441,7 +443,7 @@ impl SyncingChain { } }; - let peer = batch.processing_peer().cloned().ok_or_else(|| { + let peers = batch.processing_peers().cloned().ok_or_else(|| { RemoveChain::WrongBatchState(format!( "Processing target is in wrong state: {:?}", batch.state(), @@ -452,7 +454,7 @@ impl SyncingChain { debug!( result = ?result, batch_epoch = %batch_id, - client = %network.client_type(&peer), + ?peers, batch_state = ?batch_state, ?batch, "Batch processing result" @@ -516,9 +518,29 @@ impl SyncingChain { BatchProcessResult::FaultyFailure { imported_blocks, penalty, + faulty_component, } => { + let Some(responsible_peers) = batch.responsible_peers() else { + crit!("Shouldn't happen"); + return Ok(KeepChain); + }; // Penalize the peer appropriately. - network.report_peer(peer, *penalty, "faulty_batch"); + match faulty_component { + Some(FaultyComponent::Blocks) | Some(FaultyComponent::Blobs) => { + network.report_peer(responsible_peers.block_blob, *penalty, "faulty_batch"); + } + // todo(pawan): clean this up + Some(FaultyComponent::Columns(faulty_columns)) => { + for (peer, columns) in responsible_peers.data_columns.iter() { + for faulty_column in faulty_columns { + if columns.contains(faulty_column) { + network.report_peer(*peer, *penalty, "faulty_batch"); + } + } + } + } + None => {} + } // Check if this batch is allowed to continue match batch.processing_completed(BatchProcessingResult::FaultyFailure)? { @@ -621,11 +643,11 @@ impl SyncingChain { // only for batches awaiting validation can we be sure the last attempt is // right, and thus, that any different attempt is wrong match batch.state() { - BatchState::AwaitingValidation(processed_attempt) => { + BatchState::AwaitingValidation(processed_attempt, responsible_peers) => { for attempt in batch.attempts() { // The validated batch has been re-processed if attempt.hash != processed_attempt.hash { - // The re-downloaded version was different + // The re-downloaded version had a different block peer if processed_attempt.peer_id != attempt.peer_id { // A different peer sent the correct batch, the previous peer did not // We negatively score the original peer. @@ -665,7 +687,7 @@ impl SyncingChain { crit!("batch indicates inconsistent chain state while advancing chain") } BatchState::AwaitingProcessing(..) 
=> {} - BatchState::Processing(_) => { + BatchState::Processing(_, _) => { debug!(batch = %id, %batch, "Advancing chain while processing a batch"); if let Some(processing_id) = self.current_processing_batch && id <= processing_id @@ -812,9 +834,9 @@ impl SyncingChain { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer_id: &PeerId, request_id: Id, err: RpcResponseError, + responsible_peers: ResponsiblePeers, ) -> ProcessingResult { let batch_state = self.visualize_batch_state(); if let Some(batch) = self.batches.get_mut(&batch_id) { @@ -865,7 +887,7 @@ impl SyncingChain { debug!( batch_epoch = %batch_id, batch_state = ?batch.state(), - %peer_id, + ?responsible_peers, %request_id, ?batch_state, "Batch not expecting block" @@ -876,12 +898,12 @@ impl SyncingChain { batch_epoch = %batch_id, batch_state = ?batch.state(), error = ?err, - %peer_id, + ?responsible_peers, %request_id, "Batch download error" ); if let BatchOperationOutcome::Failed { blacklist } = - batch.download_failed(Some(*peer_id))? + batch.download_failed(Some(responsible_peers.block_blob))? { return Err(RemoveChain::ChainFailed { blacklist, @@ -892,7 +914,7 @@ impl SyncingChain { } else { debug!( batch_epoch = %batch_id, - %peer_id, + ?responsible_peers, %request_id, batch_state, "Batch not found" diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index 8f881fba90f..04b622cb42f 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -9,7 +9,7 @@ mod sync_type; pub use batch::{ BatchConfig, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, - ByRangeRequestType, + ByRangeRequestType, ResponsiblePeers, }; pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; #[cfg(test)] diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index 465edd3697f..cd523d3e193 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -44,6 +44,7 @@ use super::chain_collection::{ChainCollection, SyncChainStatus}; use super::sync_type::RangeSyncType; use crate::metrics; use crate::status::ToStatusMessage; +use crate::sync::range_sync::ResponsiblePeers; use crate::sync::BatchProcessResult; use crate::sync::network_context::{RpcResponseError, SyncNetworkContext}; use beacon_chain::block_verification_types::RpcBlock; @@ -203,7 +204,7 @@ where pub fn blocks_by_range_response( &mut self, network: &mut SyncNetworkContext, - peer_id: PeerId, + responsible_peers: ResponsiblePeers, chain_id: ChainId, batch_id: BatchId, request_id: Id, @@ -211,7 +212,7 @@ where ) { // check if this chunk removes the chain match self.chains.call_by_id(chain_id, |chain| { - chain.on_block_response(network, batch_id, &peer_id, request_id, blocks) + chain.on_block_response(network, batch_id, request_id, blocks, responsible_peers) }) { Ok((removed_chain, sync_type)) => { if let Some((removed_chain, remove_reason)) = removed_chain { @@ -295,7 +296,7 @@ where pub fn inject_error( &mut self, network: &mut SyncNetworkContext, - peer_id: PeerId, + responsible_peers: ResponsiblePeers, batch_id: BatchId, chain_id: ChainId, request_id: Id, @@ -303,7 +304,7 @@ where ) { // check that this request is pending match self.chains.call_by_id(chain_id, |chain| { - chain.inject_error(network, batch_id, &peer_id, request_id, err) + chain.inject_error(network, batch_id, request_id, err, responsible_peers) }) { Ok((removed_chain, 
sync_type)) => { if let Some((removed_chain, remove_reason)) = removed_chain { From 45401958713d9ef75da09096664426757c37dfd3 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 13 Aug 2025 17:33:47 -0700 Subject: [PATCH 13/49] Request columns from global peer pool --- .../src/peer_manager/peerdb.rs | 2 ++ .../network/src/sync/range_sync/chain.rs | 20 +++++++++---------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs index 1fa11357dc0..4d191645d63 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs @@ -323,6 +323,8 @@ impl PeerDB { .map(|(peer_id, _)| peer_id) } + /// Returns an iterator of all good gossipsub peers that are supposed to be custodying + /// the given subnet id. pub fn good_custody_subnet_peer_range_sync( &self, subnet: DataColumnSubnetId, diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index bd176a0708c..fe9305d83ad 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -957,7 +957,7 @@ impl SyncingChain { .network_globals() .peers .read() - .synced_peers_for_epoch(batch_id, Some(&self.peers)) + .synced_peers_for_epoch(batch_id, None) .cloned() .collect::>(); @@ -1034,7 +1034,7 @@ impl SyncingChain { .network_globals() .peers .read() - .synced_peers_for_epoch(batch_id, Some(&self.peers)) + .synced_peers_for_epoch(batch_id, None) .cloned() .collect::>(); @@ -1129,21 +1129,21 @@ impl SyncingChain { ) -> bool { if network.chain.spec.is_peer_das_enabled_for_epoch(epoch) { // Require peers on all sampling column subnets before sending batches - network + let peers_on_all_custody_subnets = network .network_globals() .sampling_subnets() .iter() .all(|subnet_id| { - let peer_db = network.network_globals().peers.read(); - let peer_count = self + let peer_count = network + .network_globals() .peers - .iter() - .filter(|peer| { - peer_db.is_good_range_sync_custody_subnet_peer(*subnet_id, peer) - }) + .read() + .good_custody_subnet_peer_range_sync(*subnet_id, epoch) .count(); + peer_count > 0 - }) + }); + peers_on_all_custody_subnets } else { true } From 521778b0a5749c21e144f26410b77ba8930f2d00 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Thu, 21 Aug 2025 07:03:58 -0700 Subject: [PATCH 14/49] Random logs --- .../src/rpc/self_limiter.rs | 4 ++-- .../network_beacon_processor/rpc_methods.rs | 24 +++++++++---------- .../network/src/sync/range_sync/chain.rs | 1 + 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/beacon_node/lighthouse_network/src/rpc/self_limiter.rs b/beacon_node/lighthouse_network/src/rpc/self_limiter.rs index 90e2db91357..6b1f759c795 100644 --- a/beacon_node/lighthouse_network/src/rpc/self_limiter.rs +++ b/beacon_node/lighthouse_network/src/rpc/self_limiter.rs @@ -90,7 +90,7 @@ impl SelfRateLimiter { let protocol = req.versioned_protocol().protocol(); // First check that there are not already other requests waiting to be sent. 
if let Some(queued_requests) = self.delayed_requests.get_mut(&(peer_id, protocol)) { - debug!(%peer_id, protocol = %req.protocol(), "Self rate limiting since there are already other requests waiting to be sent"); + tracing::trace!(%peer_id, protocol = %req.protocol(), "Self rate limiting since there are already other requests waiting to be sent"); queued_requests.push_back(QueuedRequest { req, request_id, @@ -134,7 +134,7 @@ impl SelfRateLimiter { && let Some(count) = active_request.get(&req.protocol()) && *count >= MAX_CONCURRENT_REQUESTS { - debug!( + tracing::trace!( %peer_id, protocol = %req.protocol(), "Self rate limiting due to the number of concurrent requests" diff --git a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs index e38fa6f842c..64e863be6c9 100644 --- a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs @@ -393,7 +393,7 @@ impl NetworkBeaconProcessor { } } - debug!( + tracing::trace!( %peer_id, request = ?request.data_column_ids, returned = send_data_column_count, @@ -430,7 +430,7 @@ impl NetworkBeaconProcessor { inbound_request_id: InboundRequestId, req: LightClientUpdatesByRangeRequest, ) -> Result<(), (RpcErrorResponse, &'static str)> { - debug!( + tracing::trace!( %peer_id, count = req.count, start_period = req.start_period, @@ -473,7 +473,7 @@ impl NetworkBeaconProcessor { let lc_updates_sent = lc_updates.len(); if lc_updates_sent < req.count as usize { - debug!( + tracing::trace!( peer = %peer_id, info = "Failed to return all requested light client updates. The peer may have requested data ahead of whats currently available", start_period = req.start_period, @@ -482,7 +482,7 @@ impl NetworkBeaconProcessor { "LightClientUpdatesByRange outgoing response processed" ); } else { - debug!( + tracing::trace!( peer = %peer_id, start_period = req.start_period, requested = req.count, @@ -603,7 +603,7 @@ impl NetworkBeaconProcessor { let req_start_slot = *req.start_slot(); let req_count = *req.count(); - debug!( + tracing::trace!( %peer_id, count = req_count, start_slot = %req_start_slot, @@ -636,7 +636,7 @@ impl NetworkBeaconProcessor { let log_results = |peer_id, blocks_sent| { if blocks_sent < (req_count as usize) { - debug!( + tracing::trace!( %peer_id, msg = "Failed to return all requested blocks", start_slot = %req_start_slot, @@ -646,7 +646,7 @@ impl NetworkBeaconProcessor { "BlocksByRange outgoing response processed" ); } else { - debug!( + tracing::trace!( %peer_id, start_slot = %req_start_slot, %current_slot, @@ -790,7 +790,7 @@ impl NetworkBeaconProcessor { elapsed, ); - debug!( + tracing::trace!( req_type, start_slot = %req_start_slot, req_count, @@ -885,7 +885,7 @@ impl NetworkBeaconProcessor { inbound_request_id: InboundRequestId, req: BlobsByRangeRequest, ) -> Result<(), (RpcErrorResponse, &'static str)> { - debug!( + tracing::trace!( ?peer_id, count = req.count, start_slot = req.start_slot, @@ -938,7 +938,7 @@ impl NetworkBeaconProcessor { .unwrap_or_else(|_| self.chain.slot_clock.genesis_slot()); let log_results = |peer_id, req: BlobsByRangeRequest, blobs_sent| { - debug!( + tracing::trace!( %peer_id, start_slot = req.start_slot, %current_slot, @@ -1013,7 +1013,7 @@ impl NetworkBeaconProcessor { inbound_request_id: InboundRequestId, req: DataColumnsByRangeRequest, ) -> Result<(), (RpcErrorResponse, &'static str)> { - debug!( + tracing::trace!( %peer_id, count = req.count, start_slot = 
req.start_slot, @@ -1112,7 +1112,7 @@ impl NetworkBeaconProcessor { .slot() .unwrap_or_else(|_| self.chain.slot_clock.genesis_slot()); - debug!( + tracing::trace!( %peer_id, start_slot = req.start_slot, %current_slot, diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index fe9305d83ad..7fbb8317c20 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -717,6 +717,7 @@ impl SyncingChain { previous_start = %old_start, new_start = %self.start_epoch, processing_target = %self.processing_target, + id=%self.id, "Chain advanced" ); } From 52762b91e12ef050a9e7445179fcfdcae72ec590 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Fri, 22 Aug 2025 11:52:56 -0700 Subject: [PATCH 15/49] Handle 0 blobs per epoch case --- .../src/sync/block_sidecar_coupling.rs | 10 + .../network/src/sync/network_context.rs | 234 ++++++++++-------- .../requests/data_columns_by_root.rs | 2 +- .../network/src/sync/range_sync/chain.rs | 46 +++- 4 files changed, 183 insertions(+), 109 deletions(-) diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index e92bb0686ad..e5bb84813a3 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -210,6 +210,16 @@ impl RangeBlockComponentsRequest { } } + pub fn no_columns_for_batch(&mut self) -> Result<(), String> { + match self.block_data_request { + RangeBlockDataRequest::DataColumnsFromRoot { .. } => { + self.block_data_request = RangeBlockDataRequest::NoData; + Ok(()) + } + _ => Err("Invalid state: expected DataColumnsFromRoot".to_owned()), + } + } + /// Adds received blocks to the request. /// /// Returns an error if the request ID doesn't match the expected blocks request. diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 749adaf64d0..d590f9998e3 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -103,6 +103,7 @@ pub enum RpcResponseError { VerifyError(LookupVerifyError), CustodyRequestError(#[allow(dead_code)] CustodyRequestError), BlockComponentCouplingError(CouplingError), + InternalError(String), } #[derive(Debug, PartialEq, Eq)] @@ -1715,9 +1716,143 @@ impl SyncNetworkContext { let resp = self .data_columns_by_root_range_requests .on_response(id, rpc_event); + // This error implies we asked the peer for a specific root and it did not give it to us + // if let Some(Err(RpcResponseError::VerifyError( + // LookupVerifyError::NotEnoughResponsesReturned { .. }, + // ))) = resp + // { + + // } self.on_rpc_response_result(id, "DataColumnsByRootRange", resp, peer_id, |b| b.len()) } + fn request_columns_on_successful_blocks( + &mut self, + id: BlocksByRangeRequestId, + blocks: &Vec>>, + ) -> Result<(), RpcResponseError> { + let batch_epoch = id.batch_id(); + // Return early if no columns are required for this epoch + if !matches!( + self.batch_type(batch_epoch), + ByRangeRequestType::BlocksAndColumns + ) { + return Ok(()); + } + // Return early if this is a backfill batch, backfill batches are handled by range requests instead of root + if matches!( + id.parent_request_id.requester, + RangeRequestId::BackfillSync { .. 
} + ) { + return Ok(()); + } + // todo(pawan): send the data column request as soon as you get each chunk to spread out requests + debug!(count = blocks.len(), "Received blocks from byrange query"); + // We have blocks here, check if they need data columns and request them + let mut block_roots = Vec::new(); + + for block in blocks.iter() { + // Request columns only if the blob_kzg_commitments is non-empty + if let Ok(commitments) = block.message().body().blob_kzg_commitments() { + if !commitments.is_empty() { + block_roots.push(block.canonical_root()); + } + } + } + if block_roots.is_empty() { + // No blobs for the entire epoch, let the coupling logic know not to expect anything + // and return early + if let Some(req) = self + .components_by_range_requests + .get_mut(&id.parent_request_id) + { + if let Err(e) = req.no_columns_for_batch() { + debug!(?e, "Created range request in inconsistent state"); + return Err(RpcResponseError::InternalError(e)); + } + return Ok(()); + } else { + return Err(RpcResponseError::InternalError( + "Request sent without creating an entry".to_string(), + )); + } + } + // Generate the data column by root requests + let mut peer_to_columns: HashMap> = HashMap::new(); + let mut no_peers_for_column: Vec = Vec::new(); + for column in self.chain.sampling_columns_for_epoch(batch_epoch).iter() { + let data_column = DataColumnSubnetId::new(*column); + if let Some(custody_peer) = self + .network_globals() + .peers + .read() + .good_custody_subnet_peer_range_sync(data_column, batch_epoch) + .next() + { + peer_to_columns + .entry(*custody_peer) + .or_default() + .push(*column); + } else { + debug!( + ?data_column, + block_request_id=?id, + "Not enough column peers for batch, need to retry" + ); + no_peers_for_column.push(*column); + } + } + + let mut data_column_requests = Vec::new(); + for (peer, indices) in peer_to_columns.into_iter() { + let data_columns_by_root_request = DataColumnsByRootBatchBlockRequest { + block_roots: block_roots.clone(), + indices: indices.clone(), + }; + + let requester = DataColumnsByRootRequester::RangeSync { + parent: id.parent_request_id, + }; + + data_column_requests.push(( + self.send_data_columns_by_root_range_requests( + peer, + data_columns_by_root_request, + requester, + Span::none(), + ) + .expect("should be able to send request"), + indices, + )); + } + + if !no_peers_for_column.is_empty() { + let data_columns_by_root_request = DataColumnsByRootBatchBlockRequest { + block_roots: block_roots.clone(), + indices: no_peers_for_column, + }; + + self.requests_to_retry + .insert(id.parent_request_id, data_columns_by_root_request); + } + + if let Some(req) = self + .components_by_range_requests + .get_mut(&id.parent_request_id) + { + req.insert_column_request_after_block_request( + data_column_requests, + self.chain.sampling_columns_for_epoch(batch_epoch), + ) + .expect("should be in the right state"); + } else { + return Err(RpcResponseError::InternalError( + "Request sent without creating an entry".to_string(), + )); + } + Ok(()) + } + #[allow(clippy::type_complexity)] pub(crate) fn on_blocks_by_range_response( &mut self, @@ -1727,104 +1862,9 @@ impl SyncNetworkContext { ) -> Option>>>> { let resp = self.blocks_by_range_requests.on_response(id, rpc_event); match &resp { - // todo(pawan): send the data column request as soon as you get each chunk to spread out requests Some(Ok((blocks, _))) => { - // Return early if this is a backfill batch, backfill batches are handled by range requests instead of root - if matches!( - 
id.parent_request_id.requester, - RangeRequestId::BackfillSync { .. } - ) { - return self - .on_rpc_response_result(id, "BlocksByRange", resp, peer_id, |b| b.len()); - } - // We have blocks here, check if they need data columns and request them - let mut block_roots = Vec::new(); - let batch_epoch = id.batch_id(); - if !matches!( - self.batch_type(batch_epoch), - ByRangeRequestType::BlocksAndColumns - ) { - return self - .on_rpc_response_result(id, "BlocksByRange", resp, peer_id, |b| b.len()); - } - for block in blocks.iter() { - // Request columns only if the blob_kzg_commitments is non-empty - if let Ok(commitments) = block.message().body().blob_kzg_commitments() { - if !commitments.is_empty() { - block_roots.push(block.canonical_root()); - } - } - } - // Generate the data column by root requests - let mut peer_to_columns: HashMap> = HashMap::new(); - let mut no_peers_for_column: Vec = Vec::new(); - for column in self.chain.sampling_columns_for_epoch(batch_epoch).iter() { - let data_column = DataColumnSubnetId::new(*column); - if let Some(custody_peer) = self - .network_globals() - .peers - .read() - .good_custody_subnet_peer_range_sync(data_column, batch_epoch) - .next() - { - peer_to_columns - .entry(*custody_peer) - .or_default() - .push(*column); - } else { - debug!( - ?data_column, - block_request_id=?id, - "Not enough column peers for batch, need to retry" - ); - no_peers_for_column.push(*column); - } - } - - // todo(pawan): no_peers_for_column nned to be requested once peers - // become available - let mut data_column_requests = Vec::new(); - for (peer, indices) in peer_to_columns.into_iter() { - let data_columns_by_root_request = DataColumnsByRootBatchBlockRequest { - block_roots: block_roots.clone(), - indices: indices.clone(), - }; - - let requester = DataColumnsByRootRequester::RangeSync { - parent: id.parent_request_id, - }; - - data_column_requests.push(( - self.send_data_columns_by_root_range_requests( - peer, - data_columns_by_root_request, - requester, - Span::none(), - ) - .expect("should be able to send request"), - indices, - )); - } - - if !no_peers_for_column.is_empty() { - let data_columns_by_root_request = DataColumnsByRootBatchBlockRequest { - block_roots: block_roots.clone(), - indices: no_peers_for_column, - }; - - self.requests_to_retry - .insert(id.parent_request_id, data_columns_by_root_request); - } - - if let Some(req) = self - .components_by_range_requests - .get_mut(&id.parent_request_id) - { - req.insert_column_request_after_block_request( - data_column_requests, - self.chain.sampling_columns_for_epoch(batch_epoch), - ) - .expect("should be in the right state"); + if let Err(e) = self.request_columns_on_successful_blocks(id, blocks) { + return Some(Err(e)); } } None => {} diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs index 642b88c9d9d..17faee4fd9b 100644 --- a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs @@ -146,7 +146,7 @@ impl ActiveRequestItems for DataColumnsByRootRangeRequestItems { .flatten() .any(|d| d.index == data_column.index && d.block_root() == block_root) { - tracing::debug!( + tracing::trace!( ?data_column, existing_items=?self.items, "Duplicated data", diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 2d0362f2676..b955c0b0ab3 100644 
--- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -829,6 +829,7 @@ impl SyncingChain { let optimistic_epoch = align(optimistic_start_epoch); // advance the chain to the new validating epoch + debug!("Advancing chain"); self.advance_chain(network, validating_epoch); if self.optimistic_start.is_none() && optimistic_epoch > self.processing_target @@ -841,6 +842,7 @@ impl SyncingChain { self.state = ChainSyncingState::Syncing; // begin requesting blocks from the peer pool, until all peers are exhausted. + debug!("Requesting batches from inside start syncing"); self.request_batches(network)?; // start processing batches if needed @@ -965,13 +967,18 @@ impl SyncingChain { src: &str, ) -> ProcessingResult { debug!(?src, "In attempt_send_awaiting download batches"); - // Check all batches in AwaitingDownload state and see if they can be sent - for (batch_id, batch) in self.batches.iter() { - if matches!(batch.state(), BatchState::AwaitingDownload) { - debug!(?src, ?batch_id, "Sending batch"); - if self.good_peers_on_sampling_subnets(*batch_id, network) { - return self.send_batch(network, *batch_id); - } + // Collect all batches in AwaitingDownload state and see if they can be sent + let awaiting_downloads: Vec<_> = self + .batches + .iter() + .filter(|(_, batch)| matches!(batch.state(), BatchState::AwaitingDownload)) + .map(|(batch_id, _)| batch_id) + .copied() + .collect(); + for batch_id in awaiting_downloads { + debug!(?src, ?batch_id, "Sending batch"); + if self.good_peers_on_sampling_subnets(batch_id, network) { + self.send_batch(network, batch_id)?; } } Ok(KeepChain) @@ -1127,12 +1134,14 @@ impl SyncingChain { if !matches!(self.state, ChainSyncingState::Syncing) { return Ok(KeepChain); } + debug!("In request batches"); // find the next pending batch and request it from the peer // check if we have the batch for our optimistic start. If not, request it first. // We wait for this batch before requesting any other batches. if let Some(epoch) = self.optimistic_start { + debug!("In request batches optimistic start"); if !self.good_peers_on_sampling_subnets(epoch, network) { debug!("Waiting for peers to be available on sampling column subnets"); return Ok(KeepChain); @@ -1143,10 +1152,15 @@ impl SyncingChain { let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type); entry.insert(optimistic_batch); self.send_batch(network, epoch)?; + } else { + debug!(batch=?self.batches.get(&epoch), "Optimistic batch info"); + self.attempt_send_awaiting_download_batches(network, "optimisitc"); } return Ok(KeepChain); } + debug!("In request batches checking if can send batch"); + // find the next pending batch and request it from the peer // Note: for this function to not infinite loop we must: // - If `include_next_batch` returns Some we MUST increase the count of batches that are @@ -1193,6 +1207,8 @@ impl SyncingChain { /// Creates the next required batch from the chain. If there are no more batches required, /// `false` is returned. fn include_next_batch(&mut self, network: &mut SyncNetworkContext) -> Option { + debug!("In include_next_batch"); + // don't request batches beyond the target head slot if self .to_be_downloaded @@ -1211,13 +1227,20 @@ impl SyncingChain { BatchState::Downloading(..) | BatchState::AwaitingProcessing(..) 
) }; - if self + let in_buffer_batches: Vec<_> = self .batches .iter() .filter(|&(_epoch, batch)| in_buffer(batch)) - .count() - > BATCH_BUFFER_SIZE as usize - { + .map(|(epoch, _)| epoch) + .collect(); + + if in_buffer_batches.len() > BATCH_BUFFER_SIZE as usize { + debug!( + ?in_buffer_batches, + ?self.processing_target, + ?self.to_be_downloaded, "Too many batches already" + ); + return None; } @@ -1230,6 +1253,7 @@ impl SyncingChain { return None; } + debug!(?self.to_be_downloaded, "Trying to check next batch id"); // If no batch needs a retry, attempt to send the batch of the next epoch to download let next_batch_id = self.to_be_downloaded; // this batch could have been included already being an optimistic batch From 27d0b3666d1b65674357f3377d478e29fd90de47 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Tue, 26 Aug 2025 16:06:27 -0700 Subject: [PATCH 16/49] Remove debug statements --- .../src/peer_manager/mod.rs | 50 +------------------ .../src/rpc/self_limiter.rs | 4 +- .../lighthouse_network/src/service/mod.rs | 2 +- .../network_beacon_processor/rpc_methods.rs | 26 +++++----- 4 files changed, 18 insertions(+), 64 deletions(-) diff --git a/beacon_node/lighthouse_network/src/peer_manager/mod.rs b/beacon_node/lighthouse_network/src/peer_manager/mod.rs index 0f0249eed10..93515ed5f6b 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/mod.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/mod.rs @@ -729,7 +729,7 @@ impl PeerManager { } } else { // we have no meta-data for this peer, update - debug!(%peer_id, new_seq_no = meta_data.seq_number(), cgc=?meta_data.custody_group_count().ok(), "Obtained peer's metadata"); + debug!(%peer_id, new_seq_no = meta_data.seq_number(), "Obtained peer's metadata"); } let known_custody_group_count = peer_info @@ -745,7 +745,7 @@ impl PeerManager { if let Some(custody_group_count) = custody_group_count_opt { match self.compute_peer_custody_groups(peer_id, custody_group_count) { Ok(custody_groups) => { - let custody_subnets: HashSet = custody_groups + let custody_subnets = custody_groups .into_iter() .flat_map(|custody_index| { self.subnets_by_custody_group @@ -761,13 +761,6 @@ impl PeerManager { }) }) .collect(); - let cgc = if custody_subnets.len() == 128 { - "supernode".to_string() - } else { - format!("{:?}", custody_subnets) - }; - - debug!(cgc, ?peer_id, "Peer custodied subnets"); peer_info.set_custody_subnets(custody_subnets); updated_cgc = Some(custody_group_count) != known_custody_group_count; @@ -956,42 +949,6 @@ impl PeerManager { } } - /// Run discovery query for additional custody peers if we fall below `TARGET_PEERS`. 
- fn maintain_custody_peers(&mut self) { - let subnets_to_discover: Vec = self - .network_globals - .sampling_subnets() - .iter() - .filter_map(|custody_subnet| { - if self - .network_globals - .peers - .read() - .good_range_sync_custody_subnet_peers(*custody_subnet) - .count() - < 2 - { - Some(SubnetDiscovery { - subnet: Subnet::DataColumn(*custody_subnet), - min_ttl: None, - }) - } else { - None - } - }) - .collect(); - - // request the subnet query from discovery - if !subnets_to_discover.is_empty() { - debug!( - subnets = ?subnets_to_discover.iter().map(|s| s.subnet).collect::>(), - "Making subnet queries for maintaining custody peers" - ); - self.events - .push(PeerManagerEvent::DiscoverSubnetPeers(subnets_to_discover)); - } - } - fn maintain_trusted_peers(&mut self) { let trusted_peers = self.trusted_peers.clone(); for trusted_peer in trusted_peers { @@ -1314,9 +1271,6 @@ impl PeerManager { // Update peer score metrics; self.update_peer_score_metrics(); - // Maintain minimum count for custody peers. - self.maintain_custody_peers(); - // Maintain minimum count for sync committee peers. self.maintain_sync_committee_peers(); diff --git a/beacon_node/lighthouse_network/src/rpc/self_limiter.rs b/beacon_node/lighthouse_network/src/rpc/self_limiter.rs index 6b1f759c795..90e2db91357 100644 --- a/beacon_node/lighthouse_network/src/rpc/self_limiter.rs +++ b/beacon_node/lighthouse_network/src/rpc/self_limiter.rs @@ -90,7 +90,7 @@ impl SelfRateLimiter { let protocol = req.versioned_protocol().protocol(); // First check that there are not already other requests waiting to be sent. if let Some(queued_requests) = self.delayed_requests.get_mut(&(peer_id, protocol)) { - tracing::trace!(%peer_id, protocol = %req.protocol(), "Self rate limiting since there are already other requests waiting to be sent"); + debug!(%peer_id, protocol = %req.protocol(), "Self rate limiting since there are already other requests waiting to be sent"); queued_requests.push_back(QueuedRequest { req, request_id, @@ -134,7 +134,7 @@ impl SelfRateLimiter { && let Some(count) = active_request.get(&req.protocol()) && *count >= MAX_CONCURRENT_REQUESTS { - tracing::trace!( + debug!( %peer_id, protocol = %req.protocol(), "Self rate limiting due to the number of concurrent requests" diff --git a/beacon_node/lighthouse_network/src/service/mod.rs b/beacon_node/lighthouse_network/src/service/mod.rs index efac129724b..eebc2f02009 100644 --- a/beacon_node/lighthouse_network/src/service/mod.rs +++ b/beacon_node/lighthouse_network/src/service/mod.rs @@ -1909,7 +1909,7 @@ impl Network { } }, }; - tracing::trace!(our_addr = %local_addr, from = %send_back_addr, error = error_repr, "Failed incoming connection"); + debug!(our_addr = %local_addr, from = %send_back_addr, error = error_repr, "Failed incoming connection"); None } SwarmEvent::OutgoingConnectionError { diff --git a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs index 8c0acb255c0..85e4f046410 100644 --- a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs @@ -228,7 +228,7 @@ impl NetworkBeaconProcessor { send_block_count += 1; } Ok(None) => { - tracing::trace!( + debug!( %peer_id, request_root = ?root, "Peer requested unknown block" @@ -449,7 +449,7 @@ impl NetworkBeaconProcessor { } } - tracing::trace!( + debug!( %peer_id, request = ?request.data_column_ids, returned = send_data_column_count, @@ -495,7 +495,7 @@ impl 
NetworkBeaconProcessor { inbound_request_id: InboundRequestId, req: LightClientUpdatesByRangeRequest, ) -> Result<(), (RpcErrorResponse, &'static str)> { - tracing::trace!( + debug!( %peer_id, count = req.count, start_period = req.start_period, @@ -538,7 +538,7 @@ impl NetworkBeaconProcessor { let lc_updates_sent = lc_updates.len(); if lc_updates_sent < req.count as usize { - tracing::trace!( + debug!( peer = %peer_id, info = "Failed to return all requested light client updates. The peer may have requested data ahead of whats currently available", start_period = req.start_period, @@ -547,7 +547,7 @@ impl NetworkBeaconProcessor { "LightClientUpdatesByRange outgoing response processed" ); } else { - tracing::trace!( + debug!( peer = %peer_id, start_period = req.start_period, requested = req.count, @@ -704,7 +704,7 @@ impl NetworkBeaconProcessor { let req_start_slot = *req.start_slot(); let req_count = *req.count(); - tracing::trace!( + debug!( %peer_id, count = req_count, start_slot = %req_start_slot, @@ -737,7 +737,7 @@ impl NetworkBeaconProcessor { let log_results = |peer_id, blocks_sent| { if blocks_sent < (req_count as usize) { - tracing::trace!( + debug!( %peer_id, msg = "Failed to return all requested blocks", start_slot = %req_start_slot, @@ -747,7 +747,7 @@ impl NetworkBeaconProcessor { "BlocksByRange outgoing response processed" ); } else { - tracing::trace!( + debug!( %peer_id, start_slot = %req_start_slot, %current_slot, @@ -891,7 +891,7 @@ impl NetworkBeaconProcessor { elapsed, ); - tracing::trace!( + debug!( req_type, start_slot = %req_start_slot, req_count, @@ -995,7 +995,7 @@ impl NetworkBeaconProcessor { inbound_request_id: InboundRequestId, req: BlobsByRangeRequest, ) -> Result<(), (RpcErrorResponse, &'static str)> { - tracing::trace!( + debug!( ?peer_id, count = req.count, start_slot = req.start_slot, @@ -1048,7 +1048,7 @@ impl NetworkBeaconProcessor { .unwrap_or_else(|_| self.chain.slot_clock.genesis_slot()); let log_results = |peer_id, req: BlobsByRangeRequest, blobs_sent| { - tracing::trace!( + debug!( %peer_id, start_slot = req.start_slot, %current_slot, @@ -1137,7 +1137,7 @@ impl NetworkBeaconProcessor { inbound_request_id: InboundRequestId, req: DataColumnsByRangeRequest, ) -> Result<(), (RpcErrorResponse, &'static str)> { - tracing::trace!( + debug!( %peer_id, count = req.count, start_slot = req.start_slot, @@ -1242,7 +1242,7 @@ impl NetworkBeaconProcessor { .slot() .unwrap_or_else(|_| self.chain.slot_clock.genesis_slot()); - tracing::trace!( + debug!( %peer_id, start_slot = req.start_slot, %current_slot, From a97cf880f80f529d1a4b726aad8631add28acb45 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Tue, 26 Aug 2025 18:53:51 -0700 Subject: [PATCH 17/49] Add docs --- .../src/sync/block_sidecar_coupling.rs | 55 +++++++--- .../network/src/sync/network_context.rs | 102 +++++++++++++----- 2 files changed, 112 insertions(+), 45 deletions(-) diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index e5bb84813a3..9db4c5d5e07 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -56,11 +56,18 @@ enum RangeBlockDataRequest { expected_custody_columns: Vec, attempt: usize, }, + /// These are data columns fetched by root instead of by range like the previous variant. 
+ /// + /// Note: this variant starts out in an uninitialized state because we typically make + /// the column requests by root only **after** we have fetched the corresponding blocks. + /// We can initialize this variant only after the columns requests have been made. DataColumnsFromRoot { requests: HashMap< DataColumnsByRootRequestId, ByRangeRequest>, >, + // Indicates if this variant has been initialized by sending columns by root requests. + // We only start expecting columns once this is set to true. init: bool, /// The column indices corresponding to the request column_peers: HashMap>, @@ -89,6 +96,8 @@ impl RangeBlockComponentsRequest { /// * `blocks_req_id` - Request ID for the blocks /// * `blobs_req_id` - Optional request ID for blobs (pre-Fulu fork) /// * `data_columns` - Optional tuple of (request_id->column_indices pairs, expected_custody_columns) for Fulu fork + /// * `request_columns_by_root` - Creates an uninitialized `RangeBlockDataRequest::DataColumnsFromRoot` variant if this is true. + /// Note: this is only relevant is `data_columns == None`. #[allow(clippy::type_complexity)] pub fn new( blocks_req_id: BlocksByRangeRequestId, @@ -97,7 +106,7 @@ impl RangeBlockComponentsRequest { Vec<(DataColumnsByRangeRequestId, Vec)>, Vec, )>, - data_columns_from_root: bool, + request_columns_by_root: bool, request_span: Span, ) -> Self { let block_peer = blocks_req_id.peer_id; @@ -114,7 +123,7 @@ impl RangeBlockComponentsRequest { expected_custody_columns, attempt: 0, } - } else if data_columns_from_root { + } else if request_columns_by_root { RangeBlockDataRequest::DataColumnsFromRoot { requests: HashMap::new(), init: false, @@ -134,6 +143,7 @@ impl RangeBlockComponentsRequest { } } + /// Returns the peers that we requested the blocks, blobs and columns for this component. pub fn responsible_peers(&self) -> ResponsiblePeers { ResponsiblePeers { block_blob: self.block_peer, @@ -174,8 +184,8 @@ impl RangeBlockComponentsRequest { } } - /// `column_requests`: each element represents a request id and the columns requested under that request. - pub fn insert_column_request_after_block_request( + /// Initialize the entries for this component after the column requests have been sent. + pub fn initialize_data_columns_from_root_component( &mut self, column_requests: Vec<(DataColumnsByRootRequestId, Vec)>, custody_columns: &[ColumnIndex], @@ -210,6 +220,13 @@ impl RangeBlockComponentsRequest { } } + /// This modifies the internal variant to `NoData`. + /// + /// Once we make the block request for a batch and get responses, it is possible + /// that the entire batch contained no blobs based on the values of `expected_kzg_commitments`. + /// + /// At this point, we do not need to make any requests and the blocks correspond to all the + /// available data for this batch. Hence, we indicate here that this component requires no data. pub fn no_columns_for_batch(&mut self) -> Result<(), String> { match self.block_data_request { RangeBlockDataRequest::DataColumnsFromRoot { .. 
} => { @@ -336,17 +353,13 @@ impl RangeBlockComponentsRequest { spec, )) } - RangeBlockDataRequest::DataColumnsFromRoot { - init, - attempt, - column_peers, - expected_custody_columns, + + RangeBlockDataRequest::DataColumns { requests, + expected_custody_columns, + column_peers, + attempt, } => { - if !*init { - return None; - } - let mut data_columns = vec![]; let mut column_to_peer_id: HashMap = HashMap::new(); for req in requests.values() { @@ -393,12 +406,20 @@ impl RangeBlockComponentsRequest { Some(resp) } - RangeBlockDataRequest::DataColumns { - requests, - expected_custody_columns, - column_peers, + // Reuse same logic that we use for coupling data columns for now. + // todo(pawan): we should never get a coupling error here, so simplify this + // variant's handling. + RangeBlockDataRequest::DataColumnsFromRoot { + init, attempt, + column_peers, + expected_custody_columns, + requests, } => { + if !*init { + return None; + } + let mut data_columns = vec![]; let mut column_to_peer_id: HashMap = HashMap::new(); for req in requests.values() { diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index d590f9998e3..c99008ebf18 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -559,9 +559,10 @@ impl SyncNetworkContext { } /// Try to make all the requests that we failed to make earlier because of lack of peers - /// in the required subnets. + /// in the required columns. /// - /// This function must be manually invoked at regular intervals. + /// This function must be manually invoked at regular intervals or when a new peer + /// gets added. pub fn retry_pending_requests(&mut self) -> Result<(), String> { let active_requests = self.active_request_count_by_peer(); @@ -603,21 +604,27 @@ impl SyncNetworkContext { requester, Span::none(), ) - .expect("should be able to send request"), + .map_err(|e| { + format!("Failed to send data columns by root request {:?}", e) + }), indices, - )); + )?); } // we have sent out requests to peers, register these requests with the coupling service. if let Some(req) = self.components_by_range_requests.get_mut(&parent_request) { - req.insert_column_request_after_block_request( + req.initialize_data_columns_from_root_component( data_column_requests, self.chain .sampling_columns_for_epoch(parent_request.requester.batch_id()), ) - .expect("should be in the right state"); + .map_err(|e| { + format!( + "Inconsistent state when inserting columns by root request {:?}", + e + ) + })?; } debug!(?requests, "Successfully retried requests"); - // Successfully processed, don't keep this entry } Err(err) => { debug!( @@ -625,7 +632,7 @@ impl SyncNetworkContext { ?parent_request, "Failed to retry request, no peers in subnets", ); - // Failed to process, keep this entry for next retry + // Still no peers, keep this entry for next retry entries_to_keep.push((parent_request, requests)); } } @@ -769,6 +776,7 @@ impl SyncNetworkContext { self.chain.sampling_columns_for_epoch(epoch).to_vec(), ) }), + // We are requesting data columns by range here false, range_request_span, ); @@ -778,6 +786,9 @@ impl SyncNetworkContext { } /// A blocks by range request sent by the range sync algorithm + /// + /// This function is used when we want to request data columns by root instead of range. + /// Pre-fulu, it works similar to `Self::block_components_by_range_request`. 
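The `DataColumnsFromRoot` coupling above deliberately bails out (`if !*init { return None; }`) until the by-root column requests have actually been issued; a later patch in this series renames that flag to `all_requests_made` and only sets it once the requests cover every expected custody column. A minimal, self-contained model of that two-phase lifecycle, using stand-in types (`u64` request ids and column indices) rather than the real Lighthouse ones:

```rust
// Reduced model of the "columns fetched by root" coupling gate described above.
// All names and types here are simplified stand-ins, not the real Lighthouse API.
use std::collections::{HashMap, HashSet};

type RequestId = u64;
type ColumnIndex = u64;

#[derive(Default)]
struct ColumnsFromRoot {
    // Column indices requested under each by-root request id.
    requests: HashMap<RequestId, Vec<ColumnIndex>>,
    // Set once the outstanding requests cover every expected custody column.
    all_requests_made: bool,
    expected_custody_columns: HashSet<ColumnIndex>,
}

impl ColumnsFromRoot {
    fn new(expected_custody_columns: HashSet<ColumnIndex>) -> Self {
        Self {
            expected_custody_columns,
            ..Default::default()
        }
    }

    // Called once the block response is in and by-root column requests go out;
    // may be called several times if some columns had no serving peer at first.
    fn register_requests(&mut self, sent: Vec<(RequestId, Vec<ColumnIndex>)>) {
        for (id, columns) in sent {
            self.requests.insert(id, columns);
        }
        let requested: HashSet<ColumnIndex> =
            self.requests.values().flatten().copied().collect();
        self.all_requests_made = requested == self.expected_custody_columns;
    }

    // Coupling blocks with columns is only attempted once every expected
    // custody column has a request in flight.
    fn ready_to_couple(&self) -> bool {
        self.all_requests_made
    }
}

fn main() {
    let mut state = ColumnsFromRoot::new(HashSet::from([1, 5, 9]));
    state.register_requests(vec![(0, vec![1, 5])]);
    assert!(!state.ready_to_couple()); // column 9 still has no request out
    state.register_requests(vec![(1, vec![9])]);
    assert!(state.ready_to_couple());
}
```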
pub fn block_components_by_range_request_without_components( &mut self, batch_type: ByRangeRequestType, @@ -854,14 +865,13 @@ impl SyncNetworkContext { None }; - let data_columns_by_root = matches!(batch_type, ByRangeRequestType::BlocksAndColumns); - debug!(?requester, data_columns_by_root, "Batch type"); let info = RangeBlockComponentsRequest::new( blocks_req_id, blobs_req_id, None, - data_columns_by_root, + // request data columns by root only if this batch requires requesting columns + matches!(batch_type, ByRangeRequestType::BlocksAndColumns), range_request_span, ); self.components_by_range_requests.insert(id, info); @@ -1473,6 +1483,7 @@ impl SyncNetworkContext { Ok((id, requested_columns)) } + /// Send `DataColumnsByRoot` requests for progressing range sync. fn send_data_columns_by_root_range_requests( &mut self, peer_id: PeerId, @@ -1492,7 +1503,7 @@ impl SyncNetworkContext { request .clone() .try_into_request(self.fork_context.current_fork_name(), &self.chain.spec) - .expect("should work"), + .map_err(|e| RpcRequestSendError::InternalError(e.to_string()))?, ), app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRoot(id)), }) @@ -1716,16 +1727,27 @@ impl SyncNetworkContext { let resp = self .data_columns_by_root_range_requests .on_response(id, rpc_event); - // This error implies we asked the peer for a specific root and it did not give it to us - // if let Some(Err(RpcResponseError::VerifyError( - // LookupVerifyError::NotEnoughResponsesReturned { .. }, - // ))) = resp - // { - - // } self.on_rpc_response_result(id, "DataColumnsByRootRange", resp, peer_id, |b| b.len()) } + /// Requests data columns for the given blocks by root. + /// + /// We request by root because it is much easier to reason about + /// and handle for failure cases when we ask for the same roots that + /// we are trying to sync the blocks for. + /// + /// This is specially relevant in periods of non-finality when there are multiple + /// head chains to sync. + /// + /// This function piggybacks on the existing parent block request and inserts the + /// column requests made into `self.components_by_range_requests` such that when + /// the column requests complete, we return the coupled batch to range sync to progress. + /// + /// If there are no peers to serve the column requests, we add them to a queue for retrying + /// the requests once more peers become available. + /// + /// Note: we do not use the by root syncing mechanism for backfill since there is only + /// one canonical chain to sync. 
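A compact, self-contained sketch of the by-root flow this comment describes, with stand-in types in place of `SignedBeaconBlock`, peer ids and the sync network context: only blocks that actually carry blob KZG commitments contribute roots, custody columns are grouped by a peer able to serve them, columns with no current peer are parked for a later retry, and a batch with no commitments at all is flagged as needing no data.

```rust
use std::collections::HashMap;

type Root = [u8; 32];
type ColumnIndex = u64;
type PeerId = u8; // stand-in for the real libp2p PeerId

struct Block {
    root: Root,
    blob_commitment_count: usize,
}

enum ColumnPlan {
    // No block in the batch has blob commitments: nothing to fetch, the blocks
    // alone are the complete batch.
    NoData,
    // Per-peer by-root requests, plus columns that currently have no peer and
    // must be queued for retry.
    Requests {
        roots: Vec<Root>,
        by_peer: HashMap<PeerId, Vec<ColumnIndex>>,
        waiting_for_peers: Vec<ColumnIndex>,
    },
}

fn plan_column_requests(
    blocks: &[Block],
    custody_columns: &[ColumnIndex],
    peer_for_column: impl Fn(ColumnIndex) -> Option<PeerId>,
) -> ColumnPlan {
    let roots: Vec<Root> = blocks
        .iter()
        .filter(|block| block.blob_commitment_count > 0)
        .map(|block| block.root)
        .collect();
    if roots.is_empty() {
        return ColumnPlan::NoData;
    }
    let mut by_peer: HashMap<PeerId, Vec<ColumnIndex>> = HashMap::new();
    let mut waiting_for_peers = Vec::new();
    for &column in custody_columns {
        match peer_for_column(column) {
            Some(peer) => by_peer.entry(peer).or_default().push(column),
            None => waiting_for_peers.push(column),
        }
    }
    ColumnPlan::Requests {
        roots,
        by_peer,
        waiting_for_peers,
    }
}

fn main() {
    let blocks = [
        Block { root: [0; 32], blob_commitment_count: 0 },
        Block { root: [1; 32], blob_commitment_count: 2 },
    ];
    // Pretend only columns 0 and 1 currently have a serving peer.
    let plan = plan_column_requests(&blocks, &[0, 1, 7], |column| (column < 2).then_some(42));
    if let ColumnPlan::Requests { waiting_for_peers, .. } = plan {
        assert_eq!(waiting_for_peers, vec![7]); // column 7 is retried later
    }
}
```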
fn request_columns_on_successful_blocks( &mut self, id: BlocksByRangeRequestId, @@ -1746,11 +1768,11 @@ impl SyncNetworkContext { ) { return Ok(()); } - // todo(pawan): send the data column request as soon as you get each chunk to spread out requests debug!(count = blocks.len(), "Received blocks from byrange query"); - // We have blocks here, check if they need data columns and request them + let mut block_roots = Vec::new(); + // We have blocks here, check if they need data columns and request them for block in blocks.iter() { // Request columns only if the blob_kzg_commitments is non-empty if let Ok(commitments) = block.message().body().blob_kzg_commitments() { @@ -1759,9 +1781,10 @@ impl SyncNetworkContext { } } } + + // No blobs for the entire epoch, let the coupling logic know not to expect anything + // and return early if block_roots.is_empty() { - // No blobs for the entire epoch, let the coupling logic know not to expect anything - // and return early if let Some(req) = self .components_by_range_requests .get_mut(&id.parent_request_id) @@ -1773,7 +1796,7 @@ impl SyncNetworkContext { return Ok(()); } else { return Err(RpcResponseError::InternalError( - "Request sent without creating an entry".to_string(), + "Block request sent without creating a components_by_range entry".to_string(), )); } } @@ -1803,6 +1826,7 @@ impl SyncNetworkContext { } } + // Send the requests for all columns that we have peers for let mut data_column_requests = Vec::new(); for (peer, indices) in peer_to_columns.into_iter() { let data_columns_by_root_request = DataColumnsByRootBatchBlockRequest { @@ -1821,11 +1845,17 @@ impl SyncNetworkContext { requester, Span::none(), ) - .expect("should be able to send request"), + .map_err(|e| { + RpcResponseError::InternalError(format!( + "Failed to send data columns by root request {:?}", + e + )) + }), indices, - )); + )?); } + // There are columns for which we have no peers, queue them up for retry later if !no_peers_for_column.is_empty() { let data_columns_by_root_request = DataColumnsByRootBatchBlockRequest { block_roots: block_roots.clone(), @@ -1836,15 +1866,21 @@ impl SyncNetworkContext { .insert(id.parent_request_id, data_columns_by_root_request); } + // Insert the requests into the existing block parent request if let Some(req) = self .components_by_range_requests .get_mut(&id.parent_request_id) { - req.insert_column_request_after_block_request( + req.initialize_data_columns_from_root_component( data_column_requests, self.chain.sampling_columns_for_epoch(batch_epoch), ) - .expect("should be in the right state"); + .map_err(|e| { + format!( + "Inconsistent state when inserting columns by root request {:?}", + e + ) + })?; } else { return Err(RpcResponseError::InternalError( "Request sent without creating an entry".to_string(), @@ -1863,6 +1899,16 @@ impl SyncNetworkContext { let resp = self.blocks_by_range_requests.on_response(id, rpc_event); match &resp { Some(Ok((blocks, _))) => { + // On receving a successful response for a blocks by range request, + // request the corresponding data columns for this batch by root (if required). + // + // We request the columns by root instead of by range to avoid peers responding + // with the columns corresponding to their view of the canonical chain + // instead of the chain that we are trying to sync. Requesting by root allows + // us to be more specific and reduces the number of failure cases we have to handle. 
+ // + // This is specially relevant when we are syncing at times when there are a lot of + // head chains in a non-finality scenario. if let Err(e) = self.request_columns_on_successful_blocks(id, blocks) { return Some(Err(e)); } From 05adb7195d7c1eb320ac2cd532cdd4818536c8d2 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 27 Aug 2025 14:26:28 -0700 Subject: [PATCH 18/49] Fix bug with partial column responses before all column requests sent --- .../src/sync/block_sidecar_coupling.rs | 44 ++++++++------- .../network/src/sync/network_context.rs | 53 ++++++++++--------- 2 files changed, 54 insertions(+), 43 deletions(-) diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index 9db4c5d5e07..59c0ebc81d2 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -10,7 +10,10 @@ use lighthouse_network::{ DataColumnsByRootRequestId, }, }; -use std::{collections::HashMap, sync::Arc}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, +}; use tracing::Span; use types::{ BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, @@ -66,12 +69,11 @@ enum RangeBlockDataRequest { DataColumnsByRootRequestId, ByRangeRequest>, >, - // Indicates if this variant has been initialized by sending columns by root requests. - // We only start expecting columns once this is set to true. - init: bool, + // Indicates if we have made column requests for each of the `expected_custody_columns` or not + all_requests_made: bool, /// The column indices corresponding to the request column_peers: HashMap>, - expected_custody_columns: Vec, + expected_custody_columns: HashSet, attempt: usize, }, } @@ -106,7 +108,7 @@ impl RangeBlockComponentsRequest { Vec<(DataColumnsByRangeRequestId, Vec)>, Vec, )>, - request_columns_by_root: bool, + data_columns_by_root: Option>, request_span: Span, ) -> Self { let block_peer = blocks_req_id.peer_id; @@ -123,13 +125,13 @@ impl RangeBlockComponentsRequest { expected_custody_columns, attempt: 0, } - } else if request_columns_by_root { + } else if let Some(expected_custody_columns) = data_columns_by_root { RangeBlockDataRequest::DataColumnsFromRoot { requests: HashMap::new(), - init: false, + all_requests_made: false, attempt: 0, column_peers: HashMap::new(), - expected_custody_columns: Vec::new(), + expected_custody_columns, } } else { RangeBlockDataRequest::NoData @@ -188,7 +190,6 @@ impl RangeBlockComponentsRequest { pub fn initialize_data_columns_from_root_component( &mut self, column_requests: Vec<(DataColumnsByRootRequestId, Vec)>, - custody_columns: &[ColumnIndex], ) -> Result<(), String> { // Nothing to insert, do not initialize if column_requests.is_empty() { @@ -196,9 +197,9 @@ impl RangeBlockComponentsRequest { } match &mut self.block_data_request { RangeBlockDataRequest::DataColumnsFromRoot { - init, requests, attempt: _, + all_requests_made, column_peers, expected_custody_columns, } => { @@ -206,13 +207,14 @@ impl RangeBlockComponentsRequest { requests.insert(request, ByRangeRequest::Active(request)); column_peers.insert(request, peers); } - // expected custody columns should be populated only once during initialization - if !*init { - for column in custody_columns { - expected_custody_columns.push(*column); + + if !*all_requests_made { + let mut all_columns_requested = HashSet::new(); + for columns in column_peers.values() { + all_columns_requested.extend(columns.iter()); } + 
*all_requests_made = all_columns_requested == *expected_custody_columns; } - *init = true; Ok(()) } @@ -410,13 +412,15 @@ impl RangeBlockComponentsRequest { // todo(pawan): we should never get a coupling error here, so simplify this // variant's handling. RangeBlockDataRequest::DataColumnsFromRoot { - init, + all_requests_made, attempt, column_peers, expected_custody_columns, requests, } => { - if !*init { + // Do not couple until requests covering all required columns + // have been made + if !*all_requests_made { return None; } @@ -441,11 +445,13 @@ impl RangeBlockComponentsRequest { } } + let expected_custody_columns: Vec<_> = + expected_custody_columns.iter().copied().collect(); let resp = Self::responses_with_custody_columns( blocks.to_vec(), data_columns, column_to_peer_id, - expected_custody_columns, + &expected_custody_columns, *attempt, ); diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index c99008ebf18..c6bd3ce2f57 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -606,16 +606,14 @@ impl SyncNetworkContext { ) .map_err(|e| { format!("Failed to send data columns by root request {:?}", e) - }), + })?, indices, - )?); + )); } // we have sent out requests to peers, register these requests with the coupling service. if let Some(req) = self.components_by_range_requests.get_mut(&parent_request) { req.initialize_data_columns_from_root_component( data_column_requests, - self.chain - .sampling_columns_for_epoch(parent_request.requester.batch_id()), ) .map_err(|e| { format!( @@ -777,7 +775,7 @@ impl SyncNetworkContext { ) }), // We are requesting data columns by range here - false, + None, range_request_span, ); self.components_by_range_requests.insert(id, info); @@ -865,13 +863,19 @@ impl SyncNetworkContext { None }; - + let epoch = Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()); let info = RangeBlockComponentsRequest::new( blocks_req_id, blobs_req_id, None, // request data columns by root only if this batch requires requesting columns - matches!(batch_type, ByRangeRequestType::BlocksAndColumns), + if matches!(batch_type, ByRangeRequestType::BlocksAndColumns) { + Some(HashSet::from_iter( + self.chain.sampling_columns_for_epoch(epoch).iter().copied(), + )) + } else { + None + }, range_request_span, ); self.components_by_range_requests.insert(id, info); @@ -1817,11 +1821,6 @@ impl SyncNetworkContext { .or_default() .push(*column); } else { - debug!( - ?data_column, - block_request_id=?id, - "Not enough column peers for batch, need to retry" - ); no_peers_for_column.push(*column); } } @@ -1850,13 +1849,18 @@ impl SyncNetworkContext { "Failed to send data columns by root request {:?}", e )) - }), + })?, indices, - )?); + )); } // There are columns for which we have no peers, queue them up for retry later if !no_peers_for_column.is_empty() { + debug!( + block_request_id=?id, + ?no_peers_for_column, + "Not enough column peers for batch, will retry request" + ); let data_columns_by_root_request = DataColumnsByRootBatchBlockRequest { block_roots: block_roots.clone(), indices: no_peers_for_column, @@ -1871,16 +1875,13 @@ impl SyncNetworkContext { .components_by_range_requests .get_mut(&id.parent_request_id) { - req.initialize_data_columns_from_root_component( - data_column_requests, - self.chain.sampling_columns_for_epoch(batch_epoch), - ) - .map_err(|e| { - format!( - "Inconsistent state when inserting columns by root request {:?}", - e - ) - 
})?; + req.initialize_data_columns_from_root_component(data_column_requests) + .map_err(|e| { + RpcResponseError::InternalError(format!( + "Inconsistent state when inserting columns by root request {:?}", + e + )) + })?; } else { return Err(RpcResponseError::InternalError( "Request sent without creating an entry".to_string(), @@ -1910,6 +1911,10 @@ impl SyncNetworkContext { // This is specially relevant when we are syncing at times when there are a lot of // head chains in a non-finality scenario. if let Err(e) = self.request_columns_on_successful_blocks(id, blocks) { + debug!( + ?e, + "Error requesting columns on succesful blocks by range request" + ); return Some(Err(e)); } } From b4bc7fed69f6cff410624b68e73c89d33d0b0f89 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 27 Aug 2025 14:30:46 -0700 Subject: [PATCH 19/49] Remove more debug logs --- .../network/src/sync/range_sync/chain.rs | 21 +------------------ 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index b955c0b0ab3..a31a24f7d08 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -966,7 +966,6 @@ impl SyncingChain { network: &mut SyncNetworkContext, src: &str, ) -> ProcessingResult { - debug!(?src, "In attempt_send_awaiting download batches"); // Collect all batches in AwaitingDownload state and see if they can be sent let awaiting_downloads: Vec<_> = self .batches @@ -976,7 +975,6 @@ impl SyncingChain { .copied() .collect(); for batch_id in awaiting_downloads { - debug!(?src, ?batch_id, "Sending batch"); if self.good_peers_on_sampling_subnets(batch_id, network) { self.send_batch(network, batch_id)?; } @@ -991,7 +989,6 @@ impl SyncingChain { batch_id: BatchId, ) -> ProcessingResult { let _guard = self.span.clone().entered(); - debug!(batch_epoch = %batch_id, "Requesting batch"); let batch_state = self.visualize_batch_state(); if let Some(batch) = self.batches.get_mut(&batch_id) { let (request, batch_type) = batch.to_blocks_by_range_request(); @@ -1121,7 +1118,6 @@ impl SyncingChain { network: &mut SyncNetworkContext, ) -> Result { let _guard = self.span.clone().entered(); - debug!("Resuming chain"); // Request more batches if needed. self.request_batches(network)?; // If there is any batch ready for processing, send it. @@ -1134,14 +1130,11 @@ impl SyncingChain { if !matches!(self.state, ChainSyncingState::Syncing) { return Ok(KeepChain); } - debug!("In request batches"); - // find the next pending batch and request it from the peer // check if we have the batch for our optimistic start. If not, request it first. // We wait for this batch before requesting any other batches. 
if let Some(epoch) = self.optimistic_start { - debug!("In request batches optimistic start"); if !self.good_peers_on_sampling_subnets(epoch, network) { debug!("Waiting for peers to be available on sampling column subnets"); return Ok(KeepChain); @@ -1153,14 +1146,11 @@ impl SyncingChain { entry.insert(optimistic_batch); self.send_batch(network, epoch)?; } else { - debug!(batch=?self.batches.get(&epoch), "Optimistic batch info"); - self.attempt_send_awaiting_download_batches(network, "optimisitc"); + self.attempt_send_awaiting_download_batches(network, "optimistic"); } return Ok(KeepChain); } - debug!("In request batches checking if can send batch"); - // find the next pending batch and request it from the peer // Note: for this function to not infinite loop we must: // - If `include_next_batch` returns Some we MUST increase the count of batches that are @@ -1207,8 +1197,6 @@ impl SyncingChain { /// Creates the next required batch from the chain. If there are no more batches required, /// `false` is returned. fn include_next_batch(&mut self, network: &mut SyncNetworkContext) -> Option { - debug!("In include_next_batch"); - // don't request batches beyond the target head slot if self .to_be_downloaded @@ -1235,12 +1223,6 @@ impl SyncingChain { .collect(); if in_buffer_batches.len() > BATCH_BUFFER_SIZE as usize { - debug!( - ?in_buffer_batches, - ?self.processing_target, - ?self.to_be_downloaded, "Too many batches already" - ); - return None; } @@ -1253,7 +1235,6 @@ impl SyncingChain { return None; } - debug!(?self.to_be_downloaded, "Trying to check next batch id"); // If no batch needs a retry, attempt to send the batch of the next epoch to download let next_batch_id = self.to_be_downloaded; // this batch could have been included already being an optimistic batch From 73313239c7f9278c6fce5b9e5bdb311603699e4d Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 27 Aug 2025 17:30:46 -0700 Subject: [PATCH 20/49] AwaitingValidation state only needs block peer --- .../network/src/sync/backfill_sync/mod.rs | 4 +-- .../network/src/sync/network_context.rs | 2 +- .../src/sync/network_context/requests.rs | 4 +-- .../requests/data_columns_by_root.rs | 5 +-- .../network/src/sync/range_sync/batch.rs | 34 +++++++++---------- .../network/src/sync/range_sync/chain.rs | 13 +++---- .../network/src/sync/range_sync/range.rs | 4 +-- 7 files changed, 31 insertions(+), 35 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index ac47310b3f0..f8572a6eb0e 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -723,7 +723,7 @@ impl BackFillSync { )))?; return Ok(ProcessResult::Successful); } - BatchState::AwaitingValidation(_, _) => { + BatchState::AwaitingValidation(_) => { // TODO: I don't think this state is possible, log a CRIT just in case. // If this is not observed, add it to the failed state branch above. 
crit!( @@ -773,7 +773,7 @@ impl BackFillSync { // only for batches awaiting validation can we be sure the last attempt is // right, and thus, that any different attempt is wrong match batch.state() { - BatchState::AwaitingValidation(processed_attempt, _) => { + BatchState::AwaitingValidation(processed_attempt) => { for attempt in batch.attempts() { // The validated batch has been re-processed if attempt.hash != processed_attempt.hash { diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 19c2beaf8ef..139bf54109c 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -103,7 +103,7 @@ pub enum RpcResponseError { VerifyError(LookupVerifyError), CustodyRequestError(#[allow(dead_code)] CustodyRequestError), BlockComponentCouplingError(CouplingError), - InternalError(String), + InternalError(#[allow(dead_code)] String), } #[derive(Debug, PartialEq, Eq)] diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index 18cd00bfda3..950fc3db312 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -13,8 +13,8 @@ pub use blocks_by_range::BlocksByRangeRequestItems; pub use blocks_by_root::{BlocksByRootRequestItems, BlocksByRootSingleRequest}; pub use data_columns_by_range::DataColumnsByRangeRequestItems; pub use data_columns_by_root::{ - DataColumnsByRootBatchBlockRequest, DataColumnsByRootRequestItems, - DataColumnsByRootSingleBlockRequest, DataColumnsByRootRangeRequestItems + DataColumnsByRootBatchBlockRequest, DataColumnsByRootRangeRequestItems, + DataColumnsByRootRequestItems, DataColumnsByRootSingleBlockRequest, }; use crate::metrics; diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs index 879c1036a67..22a91e23792 100644 --- a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs @@ -30,10 +30,7 @@ impl DataColumnsByRootBatchBlockRequest { }) .collect(); assert!(ids.len() <= 32); - Ok(DataColumnsByRootRequest::new( - ids, - spec.max_request_blocks(fork_name), - )) + DataColumnsByRootRequest::new(ids, spec.max_request_blocks(fork_name)) } } diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 14dd07ae31b..fb7689ed392 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -152,7 +152,7 @@ pub enum BatchState { /// It is not sufficient to process a batch successfully to consider it correct. This is /// because batches could be erroneously empty, or incomplete. Therefore, a batch is considered /// valid, only if the next sequential batch imports at least a block. - AwaitingValidation(Attempt, ResponsiblePeers), + AwaitingValidation(Attempt), /// Intermediate state for inner state handling. Poisoned, /// The batch has maxed out the allowed attempts for either downloading or processing. It @@ -225,12 +225,12 @@ impl BatchInfo { /// Returns the peers that are currently responsible for progressing the state of the batch. 
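The `AwaitingValidation` change in this patch can be reduced to a small model (types simplified; the real `BatchState` also tracks download and processing data): once processing succeeds, only the `Attempt` survives, and the attempt records the peer the blocks were requested from, so a later validation failure has exactly that one peer to hold accountable.

```rust
// Reduced model of the state change in this patch; all types are stand-ins.
type PeerId = u8;

#[derive(Debug, PartialEq)]
struct Attempt {
    // The peer the blocks were requested from.
    peer_id: PeerId,
    // Hash of the downloaded blocks, used to tell attempts apart.
    hash: u64,
}

#[derive(Debug)]
enum BatchState {
    AwaitingDownload,
    Downloading(PeerId),
    Processing(Attempt),
    // Only the block attempt is kept: blob/column peers are no longer tracked
    // once the batch has been processed.
    AwaitingValidation(Attempt),
    Failed,
}

impl BatchState {
    // On validation failure, the peer that served the blocks for the validated
    // attempt is the one held accountable before the batch is re-downloaded.
    fn validation_failed(self) -> (Option<PeerId>, BatchState) {
        match self {
            BatchState::AwaitingValidation(attempt) => {
                (Some(attempt.peer_id), BatchState::AwaitingDownload)
            }
            other => (None, other),
        }
    }
}

fn main() {
    let state = BatchState::AwaitingValidation(Attempt { peer_id: 7, hash: 0xdead });
    let (accountable, next) = state.validation_failed();
    assert_eq!(accountable, Some(7));
    println!("next state: {next:?}");
}
```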
pub fn processing_peers(&self) -> Option<&ResponsiblePeers> { match &self.state { - BatchState::AwaitingDownload | BatchState::Failed | BatchState::Downloading(..) => None, + BatchState::AwaitingDownload + | BatchState::Failed + | BatchState::Downloading(..) + | BatchState::AwaitingValidation(..) => None, BatchState::AwaitingProcessing(responsible_peers, _, _) - | BatchState::Processing(Attempt { .. }, responsible_peers) - | BatchState::AwaitingValidation(Attempt { .. }, responsible_peers) => { - Some(responsible_peers) - } + | BatchState::Processing(Attempt { .. }, responsible_peers) => Some(responsible_peers), BatchState::Poisoned => unreachable!("Poisoned batch"), } } @@ -385,10 +385,9 @@ impl BatchInfo { BatchState::AwaitingDownload | BatchState::Failed | BatchState::Poisoned - | BatchState::Downloading(_) => None, - BatchState::AwaitingProcessing(r, _, _) - | BatchState::AwaitingValidation(_, r) - | BatchState::Processing(_, r) => Some(r), + | BatchState::Downloading(_) + | BatchState::AwaitingValidation(_) => None, + BatchState::AwaitingProcessing(r, _, _) | BatchState::Processing(_, r) => Some(r), } } @@ -397,11 +396,9 @@ impl BatchInfo { processing_result: BatchProcessingResult, ) -> Result { match self.state.poison() { - BatchState::Processing(attempt, responsible_peers) => { + BatchState::Processing(attempt, _responsible_peers) => { self.state = match processing_result { - BatchProcessingResult::Success => { - BatchState::AwaitingValidation(attempt, responsible_peers) - } + BatchProcessingResult::Success => BatchState::AwaitingValidation(attempt), BatchProcessingResult::FaultyFailure => { // register the failed attempt self.failed_processing_attempts.push(attempt); @@ -437,7 +434,7 @@ impl BatchInfo { #[must_use = "Batch may have failed"] pub fn validation_failed(&mut self) -> Result { match self.state.poison() { - BatchState::AwaitingValidation(attempt, responsible_peers) => { + BatchState::AwaitingValidation(attempt) => { self.failed_processing_attempts.push(attempt); // check if the batch can be downloaded again @@ -473,6 +470,7 @@ impl BatchInfo { #[derive(PartialEq, Debug)] pub struct Attempt { /// The peer that made the attempt. + /// This peer is effectively the peer that we requested the blocks from. pub peer_id: PeerId, /// The hash of the blocks of the attempt. pub hash: u64, @@ -491,8 +489,8 @@ impl std::fmt::Debug for BatchState { BatchState::Processing(Attempt { peer_id, hash: _ }, responsible_peers) => { write!(f, "Processing({}) {:?}", peer_id, responsible_peers) } - BatchState::AwaitingValidation(Attempt { peer_id, hash: _ }, responsible_peers) => { - write!(f, "AwaitingValidation({}) {:?}", peer_id, responsible_peers) + BatchState::AwaitingValidation(Attempt { peer_id, hash: _ }) => { + write!(f, "AwaitingValidation({})", peer_id) } BatchState::AwaitingDownload => f.write_str("AwaitingDownload"), BatchState::Failed => f.write_str("Failed"), @@ -519,7 +517,7 @@ impl BatchState { match self { BatchState::Downloading(..) => 'D', BatchState::Processing(_, _) => 'P', - BatchState::AwaitingValidation(_, _) => 'v', + BatchState::AwaitingValidation(_) => 'v', BatchState::AwaitingDownload => 'd', BatchState::Failed => 'F', BatchState::AwaitingProcessing(..) 
=> 'p', diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index a31a24f7d08..390a81b1cbf 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -365,7 +365,7 @@ impl SyncingChain { state ))); } - BatchState::AwaitingValidation(_, _) => { + BatchState::AwaitingValidation(_) => { // If an optimistic start is given to the chain after the corresponding // batch has been requested and processed we can land here. We drop the // optimistic candidate since we can't conclude whether the batch included @@ -399,7 +399,7 @@ impl SyncingChain { state ))); } - BatchState::AwaitingValidation(_, _) => { + BatchState::AwaitingValidation(_) => { // we can land here if an empty optimistic batch succeeds processing and is // inside the download buffer (between `self.processing_target` and // `self.to_be_downloaded`). In this case, eventually the chain advances to the @@ -667,7 +667,7 @@ impl SyncingChain { // only for batches awaiting validation can we be sure the last attempt is // right, and thus, that any different attempt is wrong match batch.state() { - BatchState::AwaitingValidation(processed_attempt, responsible_peers) => { + BatchState::AwaitingValidation(processed_attempt) => { for attempt in batch.attempts() { // The validated batch has been re-processed if attempt.hash != processed_attempt.hash { @@ -793,7 +793,7 @@ impl SyncingChain { // reset self.processing_target = self.start_epoch; - // finally, re-request the failed batch. + // finally, re-request the failed batch and all other batches in `AwaitingDownload` state. self.attempt_send_awaiting_download_batches(network, "handle_invalid_batch") } @@ -961,10 +961,11 @@ impl SyncingChain { } } + /// Attempts to send all batches that are in `AwaitingDownload` state. 
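Whether a batch stuck in `AwaitingDownload` can actually be sent comes down to `good_peers_on_sampling_subnets`. A hedged sketch of that gate, with a plain map standing in for the peer DB (the real lookup also checks, for range sync, that the peer's status message covers the epoch): a batch is only sendable if every sampling subnet has at least one good custody peer.

```rust
use std::collections::{HashMap, HashSet};

type SubnetId = u64;
type PeerId = u8;

// `good_peers_per_subnet` stands in for the peer-DB query used by the real code.
fn batch_is_sendable(
    sampling_subnets: &HashSet<SubnetId>,
    good_peers_per_subnet: &HashMap<SubnetId, Vec<PeerId>>,
) -> bool {
    sampling_subnets.iter().all(|subnet| {
        good_peers_per_subnet
            .get(subnet)
            .is_some_and(|peers| !peers.is_empty())
    })
}

fn main() {
    let subnets = HashSet::from([3, 17]);
    let mut peers = HashMap::new();
    peers.insert(3, vec![1]);
    // Subnet 17 has no good custody peer yet, so the batch stays in AwaitingDownload.
    assert!(!batch_is_sendable(&subnets, &peers));
    peers.insert(17, vec![2]);
    assert!(batch_is_sendable(&subnets, &peers));
}
```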
pub fn attempt_send_awaiting_download_batches( &mut self, network: &mut SyncNetworkContext, - src: &str, + _src: &str, ) -> ProcessingResult { // Collect all batches in AwaitingDownload state and see if they can be sent let awaiting_downloads: Vec<_> = self @@ -1146,7 +1147,7 @@ impl SyncingChain { entry.insert(optimistic_batch); self.send_batch(network, epoch)?; } else { - self.attempt_send_awaiting_download_batches(network, "optimistic"); + self.attempt_send_awaiting_download_batches(network, "optimistic")?; } return Ok(KeepChain); } diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index cd523d3e193..703164d6874 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -44,9 +44,9 @@ use super::chain_collection::{ChainCollection, SyncChainStatus}; use super::sync_type::RangeSyncType; use crate::metrics; use crate::status::ToStatusMessage; -use crate::sync::range_sync::ResponsiblePeers; use crate::sync::BatchProcessResult; use crate::sync::network_context::{RpcResponseError, SyncNetworkContext}; +use crate::sync::range_sync::ResponsiblePeers; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; use lighthouse_network::rpc::GoodbyeReason; @@ -212,7 +212,7 @@ where ) { // check if this chunk removes the chain match self.chains.call_by_id(chain_id, |chain| { - chain.on_block_response(network, batch_id, request_id, blocks, responsible_peers) + chain.on_block_response(network, batch_id, request_id, blocks, responsible_peers) }) { Ok((removed_chain, sync_type)) => { if let Some((removed_chain, remove_reason)) = removed_chain { From da1aabab7383f82efe2338be51710883a4e9d8bc Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 27 Aug 2025 17:34:27 -0700 Subject: [PATCH 21/49] Revise error tolerance --- .../src/network_beacon_processor/sync_methods.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 820cf3ab75f..322cfdc23e4 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -887,31 +887,31 @@ impl NetworkBeaconProcessor { match &e { AvailabilityCheckError::InvalidBlobs(_) | AvailabilityCheckError::BlobIndexInvalid(_) => Err(ChainSegmentFailed { - message: format!("Peer sent invalid block. Reason: {:?}", err), + message: format!("Peer sent invalid blobs. Reason: {:?}", err), // Do not penalize peers for internal errors. peer_action: Some(PeerAction::LowToleranceError), faulty_component: Some(FaultyComponent::Blobs), }), AvailabilityCheckError::InvalidColumn(columns) => Err(ChainSegmentFailed { - message: format!("Peer sent invalid block. Reason: {:?}", err), + message: format!("Peer sent invalid columns. Reason: {:?}", err), // Do not penalize peers for internal errors. - peer_action: Some(PeerAction::MidToleranceError), + peer_action: Some(PeerAction::LowToleranceError), faulty_component: Some(FaultyComponent::Columns( columns.iter().map(|v| v.0).collect(), )), }), AvailabilityCheckError::DataColumnIndexInvalid(column) => { Err(ChainSegmentFailed { - message: format!("Peer sent invalid block. Reason: {:?}", err), + message: format!("Peer sent invalid columns. Reason: {:?}", err), // Do not penalize peers for internal errors. 
- peer_action: Some(PeerAction::MidToleranceError), + peer_action: Some(PeerAction::LowToleranceError), faulty_component: Some(FaultyComponent::Columns(vec![*column])), }) } _ => Err(ChainSegmentFailed { message: format!("Peer sent invalid block. Reason: {:?}", err), // Do not penalize peers for internal errors. - peer_action: Some(PeerAction::MidToleranceError), + peer_action: None, faulty_component: None, }), } From b07bc6d4e4e97b02be85cc939d8990be9156cd28 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Fri, 29 Aug 2025 16:18:53 -0700 Subject: [PATCH 22/49] Force requests if batch buffer is full under certain conditions --- .../network/src/sync/range_sync/chain.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 390a81b1cbf..12cfef28564 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1220,11 +1220,25 @@ impl SyncingChain { .batches .iter() .filter(|&(_epoch, batch)| in_buffer(batch)) - .map(|(epoch, _)| epoch) .collect(); if in_buffer_batches.len() > BATCH_BUFFER_SIZE as usize { - return None; + // Force the request to avoid stalling the chain if the batch to be downloaded is less + // than all batches sitting inside the buffer awaiting downloaded/processing. + let should_force_request = in_buffer_batches + .iter() + .all(|(epoch, _)| **epoch > self.to_be_downloaded); + debug!( + ?in_buffer_batches, + ?self.to_be_downloaded, + ?self.processing_target, + ?self.optimistic_start, + should_force_request, + "Batch buffer full, not able to make new requests" + ); + if !should_force_request { + return None; + } } // don't send batch requests until we have peers on sampling subnets From 4f60e86dc70b27b125f880eec76c93dd458fdef4 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Sun, 31 Aug 2025 14:54:44 -0700 Subject: [PATCH 23/49] Add logs to debug stuck range sync --- beacon_node/network/src/sync/range_sync/chain.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 12cfef28564..88a7e78baa6 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -829,8 +829,9 @@ impl SyncingChain { let optimistic_epoch = align(optimistic_start_epoch); // advance the chain to the new validating epoch - debug!("Advancing chain"); + debug!(?self.to_be_downloaded, ?self.processing_target,"Advancing chain"); self.advance_chain(network, validating_epoch); + debug!(?self.to_be_downloaded, ?self.processing_target,"Advanced chain"); if self.optimistic_start.is_none() && optimistic_epoch > self.processing_target && !self.attempted_optimistic_starts.contains(&optimistic_epoch) @@ -967,6 +968,7 @@ impl SyncingChain { network: &mut SyncNetworkContext, _src: &str, ) -> ProcessingResult { + debug!(?self.processing_target,?self.to_be_downloaded,"In attempt"); // Collect all batches in AwaitingDownload state and see if they can be sent let awaiting_downloads: Vec<_> = self .batches @@ -989,6 +991,7 @@ impl SyncingChain { network: &mut SyncNetworkContext, batch_id: BatchId, ) -> ProcessingResult { + debug!(?self.processing_target,?self.to_be_downloaded,"In send_batch"); let _guard = self.span.clone().entered(); let batch_state = self.visualize_batch_state(); if let Some(batch) = self.batches.get_mut(&batch_id) { @@ 
-1054,6 +1057,8 @@ impl SyncingChain { } }, } + } else { + debug!(?self.to_be_downloaded, ?self.processing_target,"Did not get batch"); } Ok(KeepChain) @@ -1131,22 +1136,26 @@ impl SyncingChain { if !matches!(self.state, ChainSyncingState::Syncing) { return Ok(KeepChain); } + debug!(?self.to_be_downloaded, ?self.processing_target,"Requesting batches"); // find the next pending batch and request it from the peer // check if we have the batch for our optimistic start. If not, request it first. // We wait for this batch before requesting any other batches. if let Some(epoch) = self.optimistic_start { + debug!(?self.to_be_downloaded, ?self.processing_target,"Optimistic start in request_batches"); if !self.good_peers_on_sampling_subnets(epoch, network) { debug!("Waiting for peers to be available on sampling column subnets"); return Ok(KeepChain); } if let Entry::Vacant(entry) = self.batches.entry(epoch) { + debug!(?self.to_be_downloaded, ?self.processing_target,"Vacant entry in request_batches"); let batch_type = network.batch_type(epoch); let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type); entry.insert(optimistic_batch); self.send_batch(network, epoch)?; } else { + debug!(?self.to_be_downloaded, ?self.processing_target,"Not vacant in request_batches"); self.attempt_send_awaiting_download_batches(network, "optimistic")?; } return Ok(KeepChain); @@ -1159,6 +1168,7 @@ impl SyncingChain { // that function. while let Some(batch_id) = self.include_next_batch(network) { // send the batch + debug!(?self.to_be_downloaded, ?self.processing_target,"Got a batch to send"); self.send_batch(network, batch_id)?; } @@ -1198,6 +1208,7 @@ impl SyncingChain { /// Creates the next required batch from the chain. If there are no more batches required, /// `false` is returned. 
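A reduced sketch of the buffer check that `include_next_batch` applies (the constant and the batch map below are stand-ins): only batches currently downloading or awaiting processing count toward the buffer, and no new epoch is included while that buffer is full or the target has been passed.

```rust
use std::collections::BTreeMap;

type Epoch = u64;

#[derive(Debug)]
enum BatchState {
    AwaitingDownload,
    Downloading,
    AwaitingProcessing,
    Processing,
}

// Stand-in for the real buffer limit.
const BATCH_BUFFER_SIZE: usize = 5;

// Returns the next epoch to request, or `None` if too many batches are already
// in flight or awaiting processing, or the target epoch has been passed.
fn include_next_batch(
    batches: &BTreeMap<Epoch, BatchState>,
    to_be_downloaded: Epoch,
    target_epoch: Epoch,
) -> Option<Epoch> {
    if to_be_downloaded > target_epoch {
        return None;
    }
    let in_buffer = batches
        .values()
        .filter(|state| matches!(state, BatchState::Downloading | BatchState::AwaitingProcessing))
        .count();
    if in_buffer > BATCH_BUFFER_SIZE {
        return None;
    }
    Some(to_be_downloaded)
}

fn main() {
    let mut batches = BTreeMap::new();
    for epoch in 0..3u64 {
        batches.insert(epoch, BatchState::Downloading);
    }
    assert_eq!(include_next_batch(&batches, 3, 100), Some(3));
}
```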
fn include_next_batch(&mut self, network: &mut SyncNetworkContext) -> Option { + debug!(?self.to_be_downloaded, ?self.processing_target,"In include next batch"); // don't request batches beyond the target head slot if self .to_be_downloaded From 7a6d0d9215431ef8e30e8f42b52fa4ba11ddbcfb Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Sun, 31 Aug 2025 19:07:39 -0700 Subject: [PATCH 24/49] Force processing_target request --- .../network/src/sync/range_sync/chain.rs | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 88a7e78baa6..67684a7bf52 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1161,6 +1161,28 @@ impl SyncingChain { return Ok(KeepChain); } + // Try to force requesting the `processing_batch` to progress sync + if !self.batches.contains_key(&self.processing_target) { + debug!(?self.to_be_downloaded, ?self.processing_target,"Processing start in request_batches"); + if !self.good_peers_on_sampling_subnets(self.processing_target, network) { + debug!("Waiting for peers to be available on sampling column subnets"); + return Ok(KeepChain); + } + + if let Entry::Vacant(entry) = self.batches.entry(self.processing_target) { + debug!(?self.to_be_downloaded, ?self.processing_target,"Vacant entry in request_batches for processing"); + let batch_type = network.batch_type(self.processing_target); + let processing_batch = + BatchInfo::new(&self.processing_target, EPOCHS_PER_BATCH, batch_type); + entry.insert(processing_batch); + self.send_batch(network, self.processing_target)?; + } else { + debug!(?self.to_be_downloaded, ?self.processing_target,"Not vacant in request_batches processing"); + self.attempt_send_awaiting_download_batches(network, "optimistic")?; + } + return Ok(KeepChain); + } + // find the next pending batch and request it from the peer // Note: for this function to not infinite loop we must: // - If `include_next_batch` returns Some we MUST increase the count of batches that are From 8458df67526c38e04355ffe501151cdd1b5c2835 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Mon, 1 Sep 2025 12:53:01 -0700 Subject: [PATCH 25/49] Attempt sending awaitingDownload batches when restarting sync --- beacon_node/network/src/sync/range_sync/chain.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 67684a7bf52..91943049502 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -832,6 +832,7 @@ impl SyncingChain { debug!(?self.to_be_downloaded, ?self.processing_target,"Advancing chain"); self.advance_chain(network, validating_epoch); debug!(?self.to_be_downloaded, ?self.processing_target,"Advanced chain"); + self.attempt_send_awaiting_download_batches(network, "start_syncing")?; if self.optimistic_start.is_none() && optimistic_epoch > self.processing_target && !self.attempted_optimistic_starts.contains(&optimistic_epoch) @@ -968,7 +969,6 @@ impl SyncingChain { network: &mut SyncNetworkContext, _src: &str, ) -> ProcessingResult { - debug!(?self.processing_target,?self.to_be_downloaded,"In attempt"); // Collect all batches in AwaitingDownload state and see if they can be sent let awaiting_downloads: Vec<_> = self .batches @@ -977,6 +977,8 @@ impl SyncingChain { .map(|(batch_id, _)| batch_id) .copied() 
.collect(); + debug!(?self.processing_target,?self.to_be_downloaded,_src, ?awaiting_downloads, "In attempt"); + for batch_id in awaiting_downloads { if self.good_peers_on_sampling_subnets(batch_id, network) { self.send_batch(network, batch_id)?; From 29c2f83bee3d68ae32d678ff17bad8a191d62a78 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Tue, 2 Sep 2025 13:29:19 -0700 Subject: [PATCH 26/49] Cleanup SyncingChain --- .../network/src/sync/range_sync/chain.rs | 115 +++++++++--------- 1 file changed, 59 insertions(+), 56 deletions(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 91943049502..821dcf9538e 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -352,6 +352,8 @@ impl SyncingChain { return Ok(KeepChain); } BatchState::Poisoned => unreachable!("Poisoned batch"), + // Batches can be in `AwaitingDownload` state if there weren't good data column subnet + // peers to send the request to. BatchState::AwaitingDownload => return Ok(KeepChain), BatchState::Processing(_, _) | BatchState::Failed => { // these are all inconsistent states: @@ -387,6 +389,8 @@ impl SyncingChain { // Batch is not ready, nothing to process } BatchState::Poisoned => unreachable!("Poisoned batch"), + // Batches can be in `AwaitingDownload` state if there weren't good data column subnet + // peers to send the request to. BatchState::AwaitingDownload => return Ok(KeepChain), BatchState::Failed | BatchState::Processing(_, _) => { // these are all inconsistent states: @@ -545,15 +549,20 @@ impl SyncingChain { faulty_component, } => { let Some(responsible_peers) = batch.responsible_peers() else { - crit!("Shouldn't happen"); - return Ok(KeepChain); + crit!( + current_state = ?batch.state(), + "Inconsistent state, batch must have been in processing state" + ); + return Err(RemoveChain::ChainFailed { + blacklist: false, + failing_batch: batch_id, + }); }; // Penalize the peer appropriately. match faulty_component { Some(FaultyComponent::Blocks) | Some(FaultyComponent::Blobs) => { network.report_peer(responsible_peers.block_blob, *penalty, "faulty_batch"); } - // todo(pawan): clean this up Some(FaultyComponent::Columns(faulty_columns)) => { for (peer, columns) in responsible_peers.data_columns.iter() { for faulty_column in faulty_columns { @@ -606,7 +615,7 @@ impl SyncingChain { BatchProcessResult::NonFaultyFailure => { batch.processing_completed(BatchProcessingResult::NonFaultyFailure)?; - // Simply re-download the batch. + // Simply re-download all batches in `AwaitingDownload` state. self.attempt_send_awaiting_download_batches(network, "non-faulty-failure") } } @@ -829,9 +838,9 @@ impl SyncingChain { let optimistic_epoch = align(optimistic_start_epoch); // advance the chain to the new validating epoch - debug!(?self.to_be_downloaded, ?self.processing_target,"Advancing chain"); self.advance_chain(network, validating_epoch); - debug!(?self.to_be_downloaded, ?self.processing_target,"Advanced chain"); + // attempt to download any batches stuck in the `AwaitingDownload` state because of + // a lack of peers earlier self.attempt_send_awaiting_download_batches(network, "start_syncing")?; if self.optimistic_start.is_none() && optimistic_epoch > self.processing_target @@ -844,7 +853,6 @@ impl SyncingChain { self.state = ChainSyncingState::Syncing; // begin requesting blocks from the peer pool, until all peers are exhausted. 
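The `faulty_component` match above routes the penalty to whichever peer actually served the bad data: block or blob faults fall on the single block/blob peer, column faults only on the peers that served a faulty column. A reduced sketch of that routing, with simplified types in place of the real `ResponsiblePeers` and penalty reporting:

```rust
use std::collections::HashMap;

type PeerId = u8;
type ColumnIndex = u64;

// Which peers served which parts of the batch.
struct ResponsiblePeers {
    block_blob: PeerId,
    data_columns: HashMap<PeerId, Vec<ColumnIndex>>,
}

enum FaultyComponent {
    Blocks,
    Blobs,
    Columns(Vec<ColumnIndex>),
}

// Returns the peers to report for a failed batch: the block/blob peer for block
// or blob faults, and only the peers that served a faulty column otherwise.
fn peers_to_report(peers: &ResponsiblePeers, faulty: &FaultyComponent) -> Vec<PeerId> {
    match faulty {
        FaultyComponent::Blocks | FaultyComponent::Blobs => vec![peers.block_blob],
        FaultyComponent::Columns(faulty_columns) => peers
            .data_columns
            .iter()
            .filter(|(_, served)| served.iter().any(|c| faulty_columns.contains(c)))
            .map(|(peer, _)| *peer)
            .collect(),
    }
}

fn main() {
    let peers = ResponsiblePeers {
        block_blob: 1,
        data_columns: HashMap::from([(2, vec![0, 4]), (3, vec![8])]),
    };
    // Only peer 2 served column 4, so only peer 2 is reported.
    assert_eq!(
        peers_to_report(&peers, &FaultyComponent::Columns(vec![4])),
        vec![2]
    );
}
```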
- debug!("Requesting batches from inside start syncing"); self.request_batches(network)?; // start processing batches if needed @@ -964,10 +972,13 @@ impl SyncingChain { } /// Attempts to send all batches that are in `AwaitingDownload` state. + /// + /// Batches might get stuck in `AwaitingDownload` post peerdas because of lack of peers + /// in required subnets. We need to progress them if peers are available at a later point. pub fn attempt_send_awaiting_download_batches( &mut self, network: &mut SyncNetworkContext, - _src: &str, + src: &str, ) -> ProcessingResult { // Collect all batches in AwaitingDownload state and see if they can be sent let awaiting_downloads: Vec<_> = self @@ -977,11 +988,19 @@ impl SyncingChain { .map(|(batch_id, _)| batch_id) .copied() .collect(); - debug!(?self.processing_target,?self.to_be_downloaded,_src, ?awaiting_downloads, "In attempt"); + debug!( + ?awaiting_downloads, + src, "Attempting to send batches awaiting downlaod" + ); for batch_id in awaiting_downloads { if self.good_peers_on_sampling_subnets(batch_id, network) { self.send_batch(network, batch_id)?; + } else { + debug!( + src = "attempt_send_awaiting_download_batches", + "Waiting for peers to be available on sampling column subnets" + ); } } Ok(KeepChain) @@ -993,7 +1012,6 @@ impl SyncingChain { network: &mut SyncNetworkContext, batch_id: BatchId, ) -> ProcessingResult { - debug!(?self.processing_target,?self.to_be_downloaded,"In send_batch"); let _guard = self.span.clone().entered(); let batch_state = self.visualize_batch_state(); if let Some(batch) = self.batches.get_mut(&batch_id) { @@ -1138,62 +1156,60 @@ impl SyncingChain { if !matches!(self.state, ChainSyncingState::Syncing) { return Ok(KeepChain); } - debug!(?self.to_be_downloaded, ?self.processing_target,"Requesting batches"); // find the next pending batch and request it from the peer // check if we have the batch for our optimistic start. If not, request it first. // We wait for this batch before requesting any other batches. if let Some(epoch) = self.optimistic_start { - debug!(?self.to_be_downloaded, ?self.processing_target,"Optimistic start in request_batches"); if !self.good_peers_on_sampling_subnets(epoch, network) { - debug!("Waiting for peers to be available on sampling column subnets"); + debug!( + src = "request_batches_optimistic", + "Waiting for peers to be available on sampling column subnets" + ); return Ok(KeepChain); } if let Entry::Vacant(entry) = self.batches.entry(epoch) { - debug!(?self.to_be_downloaded, ?self.processing_target,"Vacant entry in request_batches"); let batch_type = network.batch_type(epoch); let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type); entry.insert(optimistic_batch); self.send_batch(network, epoch)?; } else { - debug!(?self.to_be_downloaded, ?self.processing_target,"Not vacant in request_batches"); - self.attempt_send_awaiting_download_batches(network, "optimistic")?; + self.attempt_send_awaiting_download_batches(network, "request_batches_optimistic")?; } return Ok(KeepChain); } - // Try to force requesting the `processing_batch` to progress sync + // find the next pending batch and request it from the peer + // Note: for this function to not infinite loop we must: + // - If `include_next_batch` returns Some we MUST increase the count of batches that are + // accounted in the `BACKFILL_BATCH_BUFFER_SIZE` limit in the `matches!` statement of + // that function. 
+ while let Some(batch_id) = self.include_next_batch(network) { + // send the batch + self.send_batch(network, batch_id)?; + } + + // Force requesting the `processing_batch` to progress sync if required if !self.batches.contains_key(&self.processing_target) { - debug!(?self.to_be_downloaded, ?self.processing_target,"Processing start in request_batches"); + debug!(?self.processing_target,"Forcing requesting processing_target to progress sync"); if !self.good_peers_on_sampling_subnets(self.processing_target, network) { - debug!("Waiting for peers to be available on sampling column subnets"); + debug!( + src = "request_batches_processing", + "Waiting for peers to be available on sampling column subnets" + ); return Ok(KeepChain); } if let Entry::Vacant(entry) = self.batches.entry(self.processing_target) { - debug!(?self.to_be_downloaded, ?self.processing_target,"Vacant entry in request_batches for processing"); let batch_type = network.batch_type(self.processing_target); let processing_batch = BatchInfo::new(&self.processing_target, EPOCHS_PER_BATCH, batch_type); entry.insert(processing_batch); self.send_batch(network, self.processing_target)?; } else { - debug!(?self.to_be_downloaded, ?self.processing_target,"Not vacant in request_batches processing"); - self.attempt_send_awaiting_download_batches(network, "optimistic")?; + self.attempt_send_awaiting_download_batches(network, "request_batches_processing")?; } - return Ok(KeepChain); - } - - // find the next pending batch and request it from the peer - // Note: for this function to not infinite loop we must: - // - If `include_next_batch` returns Some we MUST increase the count of batches that are - // accounted in the `BACKFILL_BATCH_BUFFER_SIZE` limit in the `matches!` statement of - // that function. - while let Some(batch_id) = self.include_next_batch(network) { - // send the batch - debug!(?self.to_be_downloaded, ?self.processing_target,"Got a batch to send"); - self.send_batch(network, batch_id)?; } // No more batches, simply stop @@ -1232,7 +1248,6 @@ impl SyncingChain { /// Creates the next required batch from the chain. If there are no more batches required, /// `false` is returned. fn include_next_batch(&mut self, network: &mut SyncNetworkContext) -> Option { - debug!(?self.to_be_downloaded, ?self.processing_target,"In include next batch"); // don't request batches beyond the target head slot if self .to_be_downloaded @@ -1251,29 +1266,14 @@ impl SyncingChain { BatchState::Downloading(..) | BatchState::AwaitingProcessing(..) ) }; - let in_buffer_batches: Vec<_> = self + if self .batches .iter() .filter(|&(_epoch, batch)| in_buffer(batch)) - .collect(); - - if in_buffer_batches.len() > BATCH_BUFFER_SIZE as usize { - // Force the request to avoid stalling the chain if the batch to be downloaded is less - // than all batches sitting inside the buffer awaiting downloaded/processing. - let should_force_request = in_buffer_batches - .iter() - .all(|(epoch, _)| **epoch > self.to_be_downloaded); - debug!( - ?in_buffer_batches, - ?self.to_be_downloaded, - ?self.processing_target, - ?self.optimistic_start, - should_force_request, - "Batch buffer full, not able to make new requests" - ); - if !should_force_request { - return None; - } + .count() + > BATCH_BUFFER_SIZE as usize + { + return None; } // don't send batch requests until we have peers on sampling subnets @@ -1281,7 +1281,10 @@ impl SyncingChain { // block and data column requests are currently coupled. 
This can be removed once we find a // way to decouple the requests and do retries individually, see issue #6258. if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) { - debug!("Waiting for peers to be available on custody column subnets"); + debug!( + src = "include_next_batch", + "Waiting for peers to be available on custody column subnets" + ); return None; } From e0d8f047ec13ca55c81be2b48d6b5f14bb53d6d1 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Thu, 4 Sep 2025 18:09:07 -0700 Subject: [PATCH 27/49] Tests compile --- .../lighthouse_network/src/peer_manager/peerdb.rs | 2 +- .../lighthouse_network/src/service/api_types.rs | 1 + .../network/src/sync/block_sidecar_coupling.rs | 11 +++++++++-- beacon_node/network/src/sync/manager.rs | 8 +++----- beacon_node/network/src/sync/network_context.rs | 15 ++++++++------- .../network/src/sync/network_context/custody.rs | 14 +++++++------- beacon_node/network/src/sync/range_sync/chain.rs | 5 ++--- 7 files changed, 31 insertions(+), 25 deletions(-) diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs index 17c070c3d70..3e5b637220b 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs @@ -328,7 +328,7 @@ impl PeerDB { } /// Returns an iterator of all good gossipsub peers that are supposed to be custodying - /// the given subnet id. + /// the given subnet id and have the epoch according to their status messages. pub fn good_custody_subnet_peer_range_sync( &self, subnet: DataColumnSubnetId, diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index 65a6cf61c5d..645ab69ce50 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -283,6 +283,7 @@ mod tests { lookup_id: 101, }), }), + peer: PeerId::random(), }; assert_eq!(format!("{id}"), "123/Custody/121/Lookup/101"); } diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index 59c0ebc81d2..6c453547093 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -99,7 +99,7 @@ impl RangeBlockComponentsRequest { /// * `blobs_req_id` - Optional request ID for blobs (pre-Fulu fork) /// * `data_columns` - Optional tuple of (request_id->column_indices pairs, expected_custody_columns) for Fulu fork /// * `request_columns_by_root` - Creates an uninitialized `RangeBlockDataRequest::DataColumnsFromRoot` variant if this is true. - /// Note: this is only relevant is `data_columns == None`. + /// Note: this is only relevant is `data_columns == None`. 
#[allow(clippy::type_complexity)] pub fn new( blocks_req_id: BlocksByRangeRequestId, @@ -698,6 +698,7 @@ mod tests { BlocksByRangeRequestId { id: 1, parent_request_id, + peer_id: PeerId::random(), } } @@ -738,7 +739,7 @@ mod tests { let blocks_req_id = blocks_id(components_id()); let mut info = - RangeBlockComponentsRequest::::new(blocks_req_id, None, None, Span::none()); + RangeBlockComponentsRequest::::new(blocks_req_id, None, None, None, Span::none()); // Send blocks and complete terminate response info.add_blocks(blocks_req_id, blocks).unwrap(); @@ -772,6 +773,7 @@ mod tests { blocks_req_id, Some(blobs_req_id), None, + None, Span::none(), ); @@ -813,6 +815,7 @@ mod tests { blocks_req_id, None, Some((columns_req_id.clone(), expects_custody_columns.clone())), + None, Span::none(), ); // Send blocks and complete terminate response @@ -873,6 +876,7 @@ mod tests { blocks_req_id, None, Some((columns_req_id.clone(), expects_custody_columns.clone())), + None, Span::none(), ); @@ -953,6 +957,7 @@ mod tests { blocks_req_id, None, Some((columns_req_id.clone(), expected_custody_columns.clone())), + None, Span::none(), ); @@ -1033,6 +1038,7 @@ mod tests { blocks_req_id, None, Some((columns_req_id.clone(), expected_custody_columns.clone())), + None, Span::none(), ); @@ -1115,6 +1121,7 @@ mod tests { blocks_req_id, None, Some((columns_req_id.clone(), expected_custody_columns.clone())), + None, Span::none(), ); diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 3bda91ad1c5..2b376402a19 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -1134,13 +1134,11 @@ impl SyncManager { if let Some(resp) = self.network .on_data_columns_by_root_response(req_id, peer_id, data_column) - { - if let Some(result) = self + && let Some(result) = self .network .on_custody_by_root_response(custody_id, req_id, peer_id, resp) - { - self.on_custody_by_root_result(custody_id.requester, result); - } + { + self.on_custody_by_root_result(custody_id.requester, result); } } DataColumnsByRootRequester::RangeSync { parent } => { diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 139bf54109c..187a81e9b1f 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -573,7 +573,7 @@ impl SyncNetworkContext { for (parent_request, requests) in entries_to_process { let mut data_column_requests = Vec::new(); let requester = DataColumnsByRootRequester::RangeSync { - parent: parent_request.clone(), + parent: parent_request, }; let custody_indices = requests.indices.iter().cloned().collect(); let synced_peers = self @@ -829,7 +829,7 @@ impl SyncNetworkContext { // Create the overall components_by_range request ID before its individual components let id = ComponentsByRangeRequestId { id: self.next_id(), - requester: requester.clone(), + requester, }; let blocks_req_id = self.send_blocks_by_range_request( @@ -940,6 +940,7 @@ impl SyncNetworkContext { /// Received a blocks by range or blobs by range response for a request that couples blocks ' /// and blobs. 
+ #[allow(clippy::type_complexity)] pub fn range_block_component_response( &mut self, id: ComponentsByRangeRequestId, @@ -1766,7 +1767,7 @@ impl SyncNetworkContext { fn request_columns_on_successful_blocks( &mut self, id: BlocksByRangeRequestId, - blocks: &Vec>>, + blocks: &[Arc>], ) -> Result<(), RpcResponseError> { let batch_epoch = id.batch_id(); // Return early if no columns are required for this epoch @@ -1790,10 +1791,10 @@ impl SyncNetworkContext { // We have blocks here, check if they need data columns and request them for block in blocks.iter() { // Request columns only if the blob_kzg_commitments is non-empty - if let Ok(commitments) = block.message().body().blob_kzg_commitments() { - if !commitments.is_empty() { - block_roots.push(block.canonical_root()); - } + if let Ok(commitments) = block.message().body().blob_kzg_commitments() + && !commitments.is_empty() + { + block_roots.push(block.canonical_root()); } } diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index d973e83cea7..337fde619ac 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -5,7 +5,7 @@ use beacon_chain::BeaconChainTypes; use beacon_chain::validator_monitor::timestamp_now; use fnv::FnvHashMap; use lighthouse_network::PeerId; -use lighthouse_network::service::api_types::{CustodyId, DataColumnsByRootRequester}; +use lighthouse_network::service::api_types::{CustodyId, DataColumnsByRootRequester, Id}; use lighthouse_tracing::SPAN_OUTGOING_CUSTODY_REQUEST; use lru_cache::LRUTimeCache; use parking_lot::RwLock; @@ -50,8 +50,8 @@ pub enum Error { /// There should only exist a single request at a time. Having multiple requests is a bug and /// can result in undefined state, so it's treated as a hard error and the lookup is dropped. 
UnexpectedRequestId { - expected_req_id: DataColumnsByRootRequestId, - req_id: DataColumnsByRootRequestId, + expected_req_id: Id, + req_id: Id, }, } @@ -401,8 +401,8 @@ impl ColumnRequest { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { return Err(Error::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, + expected_req_id: expected_req_id.id, + req_id: req_id.id, }); } self.status = Status::NotStarted(Instant::now()); @@ -434,8 +434,8 @@ impl ColumnRequest { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { return Err(Error::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, + expected_req_id: expected_req_id.id, + req_id: req_id.id, }); } self.status = Status::Downloaded(peer_id, data_column, seen_timestamp); diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 821dcf9538e..e762bfb55f0 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1225,7 +1225,7 @@ impl SyncingChain { ) -> bool { if network.chain.spec.is_peer_das_enabled_for_epoch(epoch) { // Require peers on all sampling column subnets before sending batches - let peers_on_all_custody_subnets = network + network .network_globals() .sampling_subnets() .iter() @@ -1238,8 +1238,7 @@ impl SyncingChain { .count(); peer_count > 0 - }); - peers_on_all_custody_subnets + }) } else { true } From 6a2a33d459a54ffa94c9d0e09be5c04e767167dc Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Fri, 5 Sep 2025 12:48:05 -0700 Subject: [PATCH 28/49] Fix some issues from review --- .../network/src/sync/backfill_sync/mod.rs | 2 +- .../src/sync/block_sidecar_coupling.rs | 22 +++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 8c93b824244..dc27892092f 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -617,7 +617,7 @@ impl BackFillSync { faulty_component, } => { let Some(responsible_peers) = batch.responsible_peers() else { - crit!("Shouldn't happen"); + error!(?batch_id, "Responsible peers not found for a failed batch"); return self .fail_sync(BackFillError::BatchProcessingFailed(batch_id)) .map(|_| ProcessResult::Successful); diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index 6c453547093..c155609e81e 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -72,7 +72,7 @@ enum RangeBlockDataRequest { // Indicates if we have made column requests for each of the `expected_custody_columns` or not all_requests_made: bool, /// The column indices corresponding to the request - column_peers: HashMap>, + request_to_column_indices: HashMap>, expected_custody_columns: HashSet, attempt: usize, }, @@ -130,7 +130,7 @@ impl RangeBlockComponentsRequest { requests: HashMap::new(), all_requests_made: false, attempt: 0, - column_peers: HashMap::new(), + request_to_column_indices: HashMap::new(), expected_custody_columns, } } else { @@ -155,7 +155,10 @@ impl RangeBlockComponentsRequest { .iter() .map(|(k, v)| (k.peer, v.clone())) .collect(), - RangeBlockDataRequest::DataColumnsFromRoot { column_peers, .. 
} => column_peers + RangeBlockDataRequest::DataColumnsFromRoot { + request_to_column_indices: column_peers, + .. + } => column_peers .iter() .map(|(k, v)| (k.peer, v.clone())) .collect(), @@ -200,17 +203,17 @@ impl RangeBlockComponentsRequest { requests, attempt: _, all_requests_made, - column_peers, + request_to_column_indices, expected_custody_columns, } => { - for (request, peers) in column_requests { + for (request, indices) in column_requests { requests.insert(request, ByRangeRequest::Active(request)); - column_peers.insert(request, peers); + request_to_column_indices.insert(request, indices); } if !*all_requests_made { let mut all_columns_requested = HashSet::new(); - for columns in column_peers.values() { + for columns in request_to_column_indices.values() { all_columns_requested.extend(columns.iter()); } *all_requests_made = all_columns_requested == *expected_custody_columns; @@ -414,7 +417,7 @@ impl RangeBlockComponentsRequest { RangeBlockDataRequest::DataColumnsFromRoot { all_requests_made, attempt, - column_peers, + request_to_column_indices, expected_custody_columns, requests, } => { @@ -439,7 +442,7 @@ impl RangeBlockComponentsRequest { // Note: this assumes that only 1 peer is responsible for a column // with a batch. - for (id, columns) in column_peers { + for (id, columns) in request_to_column_indices.iter() { for column in columns { column_to_peer_id.insert(*column, id.peer); } @@ -467,6 +470,7 @@ impl RangeBlockComponentsRequest { // delete it from the entries as we are going to make // a separate attempt for those components. requests.retain(|&k, _| k.peer != *peer); + request_to_column_indices.retain(|&k, _| k.peer != *peer); } } From e259ecdf912e1605440ebd7708d6c4d00c22d5b6 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Fri, 5 Sep 2025 12:58:49 -0700 Subject: [PATCH 29/49] More renamings --- .../src/sync/block_sidecar_coupling.rs | 26 +++++++++++-------- .../network/src/sync/network_context.rs | 22 +++++++++------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index c155609e81e..b0c2588e292 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -55,7 +55,7 @@ enum RangeBlockDataRequest { ByRangeRequest>, >, /// The column indices corresponding to the request - column_peers: HashMap>, + request_to_column_indices: HashMap>, expected_custody_columns: Vec, attempt: usize, }, @@ -115,13 +115,13 @@ impl RangeBlockComponentsRequest { let block_data_request = if let Some(blobs_req_id) = blobs_req_id { RangeBlockDataRequest::Blobs(ByRangeRequest::Active(blobs_req_id)) } else if let Some((requests, expected_custody_columns)) = data_columns { - let column_peers: HashMap<_, _> = requests.into_iter().collect(); + let request_to_column_indices: HashMap<_, _> = requests.into_iter().collect(); RangeBlockDataRequest::DataColumns { - requests: column_peers + requests: request_to_column_indices .keys() .map(|id| (*id, ByRangeRequest::Active(*id))) .collect(), - column_peers, + request_to_column_indices, expected_custody_columns, attempt: 0, } @@ -151,14 +151,17 @@ impl RangeBlockComponentsRequest { block_blob: self.block_peer, data_columns: match &self.block_data_request { RangeBlockDataRequest::NoData | RangeBlockDataRequest::Blobs(_) => HashMap::new(), - RangeBlockDataRequest::DataColumns { column_peers, .. 
} => column_peers + RangeBlockDataRequest::DataColumns { + request_to_column_indices, + .. + } => request_to_column_indices .iter() .map(|(k, v)| (k.peer, v.clone())) .collect(), RangeBlockDataRequest::DataColumnsFromRoot { - request_to_column_indices: column_peers, + request_to_column_indices, .. - } => column_peers + } => request_to_column_indices .iter() .map(|(k, v)| (k.peer, v.clone())) .collect(), @@ -176,12 +179,12 @@ impl RangeBlockComponentsRequest { RangeBlockDataRequest::DataColumns { requests, expected_custody_columns: _, - column_peers, + request_to_column_indices, attempt: _, } => { for (request, columns) in failed_column_requests.into_iter() { requests.insert(request, ByRangeRequest::Active(request)); - column_peers.insert(request, columns); + request_to_column_indices.insert(request, columns); } Ok(()) } @@ -362,7 +365,7 @@ impl RangeBlockComponentsRequest { RangeBlockDataRequest::DataColumns { requests, expected_custody_columns, - column_peers, + request_to_column_indices, attempt, } => { let mut data_columns = vec![]; @@ -380,7 +383,7 @@ impl RangeBlockComponentsRequest { // Note: this assumes that only 1 peer is responsible for a column // with a batch. - for (id, columns) in column_peers { + for (id, columns) in request_to_column_indices.iter() { for column in columns { column_to_peer_id.insert(*column, id.peer); } @@ -406,6 +409,7 @@ impl RangeBlockComponentsRequest { // delete it from the entries as we are going to make // a separate attempt for those components. requests.retain(|&k, _| k.peer != *peer); + request_to_column_indices.retain(|&k, _| k.peer != *peer); } } diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 187a81e9b1f..c4ba20a2ccb 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -225,13 +225,13 @@ pub struct SyncNetworkContext { components_by_range_requests: FnvHashMap>, - // todo(pawan): make this a bounded queue, make the types better, add better docs // A hashmap with the key being the parent request and the value being the data column by root // requests that we have to retry because of one of the following reasons: // 1. The root requests couldn't be made after the parent blocks request because there were no // column peers available // 2. The root request errored (either peer sent an RPC error or an empty response) - requests_to_retry: HashMap, + pending_column_by_root_range_requests: + HashMap, /// Whether the ee is online. If it's not, we don't allow access to the /// `beacon_processor_send`. 
@@ -314,7 +314,7 @@ impl SyncNetworkContext { data_columns_by_range_requests: ActiveRequests::new("data_columns_by_range"), custody_by_root_requests: <_>::default(), components_by_range_requests: FnvHashMap::default(), - requests_to_retry: Default::default(), + pending_column_by_root_range_requests: Default::default(), network_beacon_processor, chain, fork_context, @@ -345,7 +345,7 @@ impl SyncNetworkContext { custody_by_root_requests: _, // components_by_range_requests is a meta request of various _by_range requests components_by_range_requests: _, - requests_to_retry: _, + pending_column_by_root_range_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -452,7 +452,7 @@ impl SyncNetworkContext { components_by_range_requests: _, execution_engine_state: _, network_beacon_processor: _, - requests_to_retry: _, + pending_column_by_root_range_requests: _, chain: _, fork_context: _, // Don't use a fallback match. We want to be sure that all requests are considered when @@ -567,7 +567,8 @@ impl SyncNetworkContext { let active_requests = self.active_request_count_by_peer(); // Collect entries to process and remove from requests_to_retry - let entries_to_process: Vec<_> = self.requests_to_retry.drain().collect(); + let entries_to_process: Vec<_> = + self.pending_column_by_root_range_requests.drain().collect(); let mut entries_to_keep = Vec::new(); for (parent_request, requests) in entries_to_process { @@ -637,7 +638,8 @@ impl SyncNetworkContext { } // Re-insert entries that still need to be retried - self.requests_to_retry.extend(entries_to_keep); + self.pending_column_by_root_range_requests + .extend(entries_to_keep); Ok(()) } @@ -1820,12 +1822,12 @@ impl SyncNetworkContext { let mut peer_to_columns: HashMap> = HashMap::new(); let mut no_peers_for_column: Vec = Vec::new(); for column in self.chain.sampling_columns_for_epoch(batch_epoch).iter() { - let data_column = DataColumnSubnetId::new(*column); + let subnet_id = DataColumnSubnetId::new(*column); if let Some(custody_peer) = self .network_globals() .peers .read() - .good_custody_subnet_peer_range_sync(data_column, batch_epoch) + .good_custody_subnet_peer_range_sync(subnet_id, batch_epoch) .next() { peer_to_columns @@ -1878,7 +1880,7 @@ impl SyncNetworkContext { indices: no_peers_for_column, }; - self.requests_to_retry + self.pending_column_by_root_range_requests .insert(id.parent_request_id, data_columns_by_root_request); } From 04398ad267e2cc304e4bc291b32d43305cde6c1c Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Mon, 8 Sep 2025 15:37:26 -0700 Subject: [PATCH 30/49] Fix some more issues from review --- beacon_node/network/src/sync/manager.rs | 2 +- .../network/src/sync/network_context.rs | 6 +++--- .../network/src/sync/range_sync/batch.rs | 2 +- .../network/src/sync/range_sync/chain.rs | 18 +++++------------- 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 2b376402a19..c68c506b8bb 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -415,7 +415,7 @@ impl SyncManager { } // Try to make range requests that we failed to make because of lack of peers. - let _ = self.network.retry_pending_requests(); + let _ = self.network.retry_pending_root_range_requests(); } /// Trigger range sync for a set of peers that claim to have imported a head unknown to us. 
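The map renamed above holds columns-by-root requests that could not be sent yet, either because no custody peer was available once the parent blocks arrived or because an earlier attempt errored; `retry_pending_root_range_requests` drains it and re-attempts whatever now has a peer, driven from the sync manager as the hunk above shows. A minimal sketch of that defer-and-retry pattern, assuming illustrative stand-in types (`ParentRequestId`, `PendingColumnsByRoot`, the `has_peer_for` and `send` closures) rather than Lighthouse's real request IDs:

use std::collections::HashMap;

type ParentRequestId = u32;

#[derive(Debug)]
struct PendingColumnsByRoot {
    // Roots of the blocks whose columns are still needed.
    block_roots: Vec<[u8; 32]>,
    // Column indices that still lack a custody peer.
    indices: Vec<u64>,
}

#[derive(Default)]
struct RetryQueue {
    pending: HashMap<ParentRequestId, PendingColumnsByRoot>,
}

impl RetryQueue {
    // Park a request that could not be sent because no custody peer was available.
    fn defer(&mut self, parent: ParentRequestId, request: PendingColumnsByRoot) {
        self.pending.insert(parent, request);
    }

    // Drain the queue, send whatever now has a peer, and keep the rest queued.
    // `has_peer_for` stands in for the custody-peer lookup; `send` for the RPC call.
    fn retry(
        &mut self,
        has_peer_for: impl Fn(u64) -> bool,
        mut send: impl FnMut(ParentRequestId, &[[u8; 32]], u64),
    ) {
        let entries: Vec<_> = self.pending.drain().collect();
        for (parent, mut request) in entries {
            let roots = request.block_roots.clone();
            request.indices.retain(|&index| {
                if has_peer_for(index) {
                    send(parent, roots.as_slice(), index);
                    false // sent, drop from the pending set
                } else {
                    true // still no peer, keep it queued
                }
            });
            if !request.indices.is_empty() {
                self.pending.insert(parent, request);
            }
        }
    }
}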
diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index c4ba20a2ccb..999fafcff8c 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -34,6 +34,7 @@ use lighthouse_network::service::api_types::{ use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource}; use lighthouse_tracing::SPAN_OUTGOING_RANGE_REQUEST; use parking_lot::RwLock; +use rand::seq::IteratorRandom; pub use requests::LookupVerifyError; use requests::{ ActiveRequests, BlobsByRangeRequestItems, BlobsByRootRequestItems, BlocksByRangeRequestItems, @@ -563,7 +564,7 @@ impl SyncNetworkContext { /// /// This function must be manually invoked at regular intervals or when a new peer /// gets added. - pub fn retry_pending_requests(&mut self) -> Result<(), String> { + pub fn retry_pending_root_range_requests(&mut self) -> Result<(), String> { let active_requests = self.active_request_count_by_peer(); // Collect entries to process and remove from requests_to_retry @@ -1827,8 +1828,7 @@ impl SyncNetworkContext { .network_globals() .peers .read() - .good_custody_subnet_peer_range_sync(subnet_id, batch_epoch) - .next() + .good_custody_subnet_peer_range_sync(subnet_id, batch_epoch).choose(&mut rand::rng()) { peer_to_columns .entry(*custody_peer) diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index fb7689ed392..695262523a5 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -146,7 +146,7 @@ pub enum BatchState { /// The batch has been completely downloaded and is ready for processing. AwaitingProcessing(ResponsiblePeers, Vec>, Instant), /// The batch is being processed. - Processing(Attempt, ResponsiblePeers), // todo(pawan): attempt contains the peer, remove that + Processing(Attempt, ResponsiblePeers), /// The batch was successfully processed and is waiting to be validated. /// /// It is not sufficient to process a batch successfully to consider it correct. 
This is diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index e762bfb55f0..8006a501de7 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -990,7 +990,7 @@ impl SyncingChain { .collect(); debug!( ?awaiting_downloads, - src, "Attempting to send batches awaiting downlaod" + src, "Attempting to send batches awaiting download" ); for batch_id in awaiting_downloads { @@ -1225,20 +1225,12 @@ impl SyncingChain { ) -> bool { if network.chain.spec.is_peer_das_enabled_for_epoch(epoch) { // Require peers on all sampling column subnets before sending batches + let sampling_subnets = network.network_globals().sampling_subnets(); network .network_globals() - .sampling_subnets() - .iter() - .all(|subnet_id| { - let peer_count = network - .network_globals() - .peers - .read() - .good_custody_subnet_peer_range_sync(*subnet_id, epoch) - .count(); - - peer_count > 0 - }) + .peers + .read() + .has_good_custody_range_sync_peer(&sampling_subnets, epoch) } else { true } From bf09d57e29a45409172b5e7cd267785a06544e59 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Thu, 11 Sep 2025 15:06:07 -0700 Subject: [PATCH 31/49] Fix some issues from lion's review --- .../network/src/sync/backfill_sync/mod.rs | 16 +++--- .../src/sync/block_sidecar_coupling.rs | 10 ++-- .../network/src/sync/network_context.rs | 4 +- .../network/src/sync/range_sync/batch.rs | 41 +++++++------- .../network/src/sync/range_sync/chain.rs | 53 ++++++------------- .../network/src/sync/range_sync/mod.rs | 2 +- .../network/src/sync/range_sync/range.rs | 10 ++-- 7 files changed, 56 insertions(+), 80 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index dc27892092f..e953244976b 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -16,7 +16,7 @@ use crate::sync::network_context::{ }; use crate::sync::range_sync::{ BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, - ResponsiblePeers, + BatchPeers, }; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; @@ -383,7 +383,7 @@ impl BackFillSync { batch_id: BatchId, request_id: Id, blocks: Vec>, - responsible_peers: ResponsiblePeers, + batch_peers: BatchPeers, ) -> Result { // check if we have this batch let Some(batch) = self.batches.get_mut(&batch_id) else { @@ -402,7 +402,7 @@ impl BackFillSync { return Ok(ProcessResult::Successful); } - match batch.download_completed(blocks, responsible_peers) { + match batch.download_completed(blocks, batch_peers) { Ok(received) => { let awaiting_batches = self.processing_target.saturating_sub(batch_id) / BACKFILL_EPOCHS_PER_BATCH; @@ -558,7 +558,7 @@ impl BackFillSync { } }; - let Some(responsible_peers) = batch.processing_peers() else { + let Some(batch_peers) = batch.processing_peers() else { self.fail_sync(BackFillError::BatchInvalidState( batch_id, String::from("Peer does not exist"), @@ -570,7 +570,7 @@ impl BackFillSync { ?result, %batch, batch_epoch = %batch_id, - ?responsible_peers, + ?batch_peers, // client = %network.client_type(peer), "Backfill batch processed" ); @@ -616,7 +616,7 @@ impl BackFillSync { penalty, faulty_component, } => { - let Some(responsible_peers) = batch.responsible_peers() else { + let Some(batch_peers) = batch.processing_peers() else { error!(?batch_id, "Responsible 
peers not found for a failed batch"); return self .fail_sync(BackFillError::BatchProcessingFailed(batch_id)) @@ -625,11 +625,11 @@ impl BackFillSync { // Penalize the peer appropriately. match faulty_component { Some(FaultyComponent::Blocks) | Some(FaultyComponent::Blobs) => { - network.report_peer(responsible_peers.block_blob, *penalty, "faulty_batch"); + network.report_peer(batch_peers.block_and_blob, *penalty, "faulty_batch"); } // todo(pawan): clean this up Some(FaultyComponent::Columns(faulty_columns)) => { - for (peer, columns) in responsible_peers.data_columns.iter() { + for (peer, columns) in batch_peers.data_columns.iter() { for faulty_column in faulty_columns { if columns.contains(faulty_column) { network.report_peer(*peer, *penalty, "faulty_batch"); diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index b0c2588e292..6a1c162e674 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -1,5 +1,5 @@ use crate::sync::network_context::MAX_COLUMN_RETRIES; -use crate::sync::range_sync::ResponsiblePeers; +use crate::sync::range_sync::BatchPeers; use beacon_chain::{ block_verification_types::RpcBlock, data_column_verification::CustodyDataColumn, get_block_root, }; @@ -146,9 +146,9 @@ impl RangeBlockComponentsRequest { } /// Returns the peers that we requested the blocks, blobs and columns for this component. - pub fn responsible_peers(&self) -> ResponsiblePeers { - ResponsiblePeers { - block_blob: self.block_peer, + pub fn responsible_peers(&self) -> BatchPeers { + BatchPeers { + block_and_blob: self.block_peer, data_columns: match &self.block_data_request { RangeBlockDataRequest::NoData | RangeBlockDataRequest::Blobs(_) => HashMap::new(), RangeBlockDataRequest::DataColumns { @@ -224,7 +224,7 @@ impl RangeBlockComponentsRequest { Ok(()) } - _ => Err("Invalid initialization".to_string()), + _ => Err("Invalid state: expected DataColumnsFromRoot".to_string()), } } diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 999fafcff8c..840e6ab6ae2 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -18,7 +18,7 @@ use crate::sync::block_sidecar_coupling::CouplingError; use crate::sync::network_context::requests::{ BlobsByRootSingleBlockRequest, DataColumnsByRootRangeRequestItems, }; -use crate::sync::range_sync::ResponsiblePeers; +use crate::sync::range_sync::BatchPeers; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; use custody::CustodyRequestResult; @@ -950,7 +950,7 @@ impl SyncNetworkContext { range_block_component: RangeBlockComponent, ) -> Option<( Result>, RpcResponseError>, - ResponsiblePeers, + BatchPeers, )> { let Entry::Occupied(mut entry) = self.components_by_range_requests.entry(id) else { metrics::inc_counter_vec(&metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, &["range_blocks"]); diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 695262523a5..d2fa0d4eb96 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -131,8 +131,10 @@ impl fmt::Display for BatchInfo { /// /// This is used for penalizing in case of invalid batches. 
#[derive(Debug, Clone)] -pub struct ResponsiblePeers { - pub block_blob: PeerId, +pub struct BatchPeers { + /// Note: we send the blob request to the same peer as the block request + /// Hence, block and blob peers would be the same. + pub block_and_blob: PeerId, pub data_columns: HashMap>, } @@ -144,9 +146,9 @@ pub enum BatchState { /// The batch is being downloaded. Downloading(Id), /// The batch has been completely downloaded and is ready for processing. - AwaitingProcessing(ResponsiblePeers, Vec>, Instant), + AwaitingProcessing(BatchPeers, Vec>, Instant), /// The batch is being processed. - Processing(Attempt, ResponsiblePeers), + Processing(Attempt, BatchPeers), /// The batch was successfully processed and is waiting to be validated. /// /// It is not sufficient to process a batch successfully to consider it correct. This is @@ -223,7 +225,7 @@ impl BatchInfo { } /// Returns the peers that are currently responsible for progressing the state of the batch. - pub fn processing_peers(&self) -> Option<&ResponsiblePeers> { + pub fn processing_peers(&self) -> Option<&BatchPeers> { match &self.state { BatchState::AwaitingDownload | BatchState::Failed @@ -287,7 +289,7 @@ impl BatchInfo { pub fn download_completed( &mut self, blocks: Vec>, - responsible_peers: ResponsiblePeers, + responsible_peers: BatchPeers, ) -> Result { match self.state.poison() { BatchState::Downloading(_) => { @@ -364,7 +366,7 @@ impl BatchInfo { match self.state.poison() { BatchState::AwaitingProcessing(responsible_peers, blocks, start_instant) => { self.state = BatchState::Processing( - Attempt::new::(responsible_peers.block_blob, &blocks), + Attempt::new::(responsible_peers.block_and_blob, &blocks), responsible_peers, ); Ok((blocks, start_instant.elapsed())) @@ -380,17 +382,6 @@ impl BatchInfo { } } - pub fn responsible_peers(&self) -> Option<&ResponsiblePeers> { - match &self.state { - BatchState::AwaitingDownload - | BatchState::Failed - | BatchState::Poisoned - | BatchState::Downloading(_) - | BatchState::AwaitingValidation(_) => None, - BatchState::AwaitingProcessing(r, _, _) | BatchState::Processing(_, r) => Some(r), - } - } - pub fn processing_completed( &mut self, processing_result: BatchProcessingResult, @@ -486,19 +477,23 @@ impl Attempt { impl std::fmt::Debug for BatchState { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - BatchState::Processing(Attempt { peer_id, hash: _ }, responsible_peers) => { - write!(f, "Processing({}) {:?}", peer_id, responsible_peers) + BatchState::Processing(Attempt { peer_id, hash: _ }, batch_peers) => { + write!( + f, + "Processing({}) {}", + peer_id, batch_peers.block_and_blob + ) } BatchState::AwaitingValidation(Attempt { peer_id, hash: _ }) => { write!(f, "AwaitingValidation({})", peer_id) } BatchState::AwaitingDownload => f.write_str("AwaitingDownload"), BatchState::Failed => f.write_str("Failed"), - BatchState::AwaitingProcessing(responsible_peers, blocks, _) => { + BatchState::AwaitingProcessing(batch_peers, blocks, _) => { write!( f, - "AwaitingProcessing({:?}, {:?} blocks)", - responsible_peers, + "AwaitingProcessing({}, {:?} blocks)", + batch_peers.block_and_blob, blocks.len() ) } diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 8006a501de7..9d954c0dc72 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -5,7 +5,7 @@ use crate::network_beacon_processor::ChainSegmentProcessId; use 
crate::sync::block_sidecar_coupling::CouplingError; use crate::sync::manager::FaultyComponent; use crate::sync::network_context::{RangeRequestId, RpcRequestSendError, RpcResponseError}; -use crate::sync::range_sync::batch::ResponsiblePeers; +use crate::sync::range_sync::batch::BatchPeers; use crate::sync::{BatchOperationOutcome, BatchProcessResult, network_context::SyncNetworkContext}; use beacon_chain::BeaconChainTypes; use beacon_chain::block_verification_types::RpcBlock; @@ -225,10 +225,10 @@ impl SyncingChain { pub fn on_block_response( &mut self, network: &mut SyncNetworkContext, + batch_peers: BatchPeers, batch_id: BatchId, request_id: Id, blocks: Vec>, - responsible_peers: ResponsiblePeers, ) -> ProcessingResult { let _guard = self.span.clone().entered(); // check if we have this batch @@ -255,8 +255,7 @@ impl SyncingChain { // A stream termination has been sent. This batch has ended. Process a completed batch. // Remove the request from the peer's active batches - // TODO(das): should use peer group here https://github.com/sigp/lighthouse/issues/6258 - let received = batch.download_completed(blocks, responsible_peers.clone())?; + let received = batch.download_completed(blocks, batch_peers.clone())?; let awaiting_batches = batch_id .saturating_sub(self.optimistic_start.unwrap_or(self.processing_target)) / EPOCHS_PER_BATCH; @@ -265,7 +264,7 @@ impl SyncingChain { blocks = received, batch_state = self.visualize_batch_state(), %awaiting_batches, - ?responsible_peers, + ?batch_peers, "Batch downloaded" ); @@ -548,7 +547,7 @@ impl SyncingChain { penalty, faulty_component, } => { - let Some(responsible_peers) = batch.responsible_peers() else { + let Some(batch_peers) = batch.processing_peers() else { crit!( current_state = ?batch.state(), "Inconsistent state, batch must have been in processing state" @@ -561,10 +560,10 @@ impl SyncingChain { // Penalize the peer appropriately. match faulty_component { Some(FaultyComponent::Blocks) | Some(FaultyComponent::Blobs) => { - network.report_peer(responsible_peers.block_blob, *penalty, "faulty_batch"); + network.report_peer(batch_peers.block_and_blob, *penalty, "faulty_batch"); } Some(FaultyComponent::Columns(faulty_columns)) => { - for (peer, columns) in responsible_peers.data_columns.iter() { + for (peer, columns) in batch_peers.data_columns.iter() { for faulty_column in faulty_columns { if columns.contains(faulty_column) { network.report_peer(*peer, *penalty, "faulty_batch"); @@ -879,10 +878,10 @@ impl SyncingChain { pub fn inject_error( &mut self, network: &mut SyncNetworkContext, + batch_peers: BatchPeers, batch_id: BatchId, request_id: Id, err: RpcResponseError, - responsible_peers: ResponsiblePeers, ) -> ProcessingResult { let _guard = self.span.clone().entered(); let batch_state = self.visualize_batch_state(); @@ -934,7 +933,7 @@ impl SyncingChain { debug!( batch_epoch = %batch_id, batch_state = ?batch.state(), - ?responsible_peers, + ?batch_peers, %request_id, ?batch_state, "Batch not expecting block" @@ -945,12 +944,12 @@ impl SyncingChain { batch_epoch = %batch_id, batch_state = ?batch.state(), error = ?err, - ?responsible_peers, + ?batch_peers, %request_id, "Batch download error" ); if let BatchOperationOutcome::Failed { blacklist } = - batch.download_failed(Some(responsible_peers.block_blob))? + batch.download_failed(Some(batch_peers.block_and_blob))? 
{ return Err(RemoveChain::ChainFailed { blacklist, @@ -961,7 +960,7 @@ impl SyncingChain { } else { debug!( batch_epoch = %batch_id, - ?responsible_peers, + ?batch_peers, %request_id, batch_state, "Batch not found" @@ -1078,7 +1077,7 @@ impl SyncingChain { }, } } else { - debug!(?self.to_be_downloaded, ?self.processing_target,"Did not get batch"); + debug!(?self.to_be_downloaded, ?self.processing_target, "Did not get batch"); } Ok(KeepChain) @@ -1144,6 +1143,10 @@ impl SyncingChain { network: &mut SyncNetworkContext, ) -> Result { let _guard = self.span.clone().entered(); + debug!("Resuming chain"); + // attempt to download any batches stuck in the `AwaitingDownload` state because of + // a lack of peers earlier + self.attempt_send_awaiting_download_batches(network, "resume")?; // Request more batches if needed. self.request_batches(network)?; // If there is any batch ready for processing, send it. @@ -1190,28 +1193,6 @@ impl SyncingChain { self.send_batch(network, batch_id)?; } - // Force requesting the `processing_batch` to progress sync if required - if !self.batches.contains_key(&self.processing_target) { - debug!(?self.processing_target,"Forcing requesting processing_target to progress sync"); - if !self.good_peers_on_sampling_subnets(self.processing_target, network) { - debug!( - src = "request_batches_processing", - "Waiting for peers to be available on sampling column subnets" - ); - return Ok(KeepChain); - } - - if let Entry::Vacant(entry) = self.batches.entry(self.processing_target) { - let batch_type = network.batch_type(self.processing_target); - let processing_batch = - BatchInfo::new(&self.processing_target, EPOCHS_PER_BATCH, batch_type); - entry.insert(processing_batch); - self.send_batch(network, self.processing_target)?; - } else { - self.attempt_send_awaiting_download_batches(network, "request_batches_processing")?; - } - } - // No more batches, simply stop Ok(KeepChain) } diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index 04b622cb42f..265840166ca 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -9,7 +9,7 @@ mod sync_type; pub use batch::{ BatchConfig, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, - ByRangeRequestType, ResponsiblePeers, + ByRangeRequestType, BatchPeers, }; pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; #[cfg(test)] diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index 703164d6874..8f18f4e8f06 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -46,7 +46,7 @@ use crate::metrics; use crate::status::ToStatusMessage; use crate::sync::BatchProcessResult; use crate::sync::network_context::{RpcResponseError, SyncNetworkContext}; -use crate::sync::range_sync::ResponsiblePeers; +use crate::sync::range_sync::BatchPeers; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; use lighthouse_network::rpc::GoodbyeReason; @@ -204,7 +204,7 @@ where pub fn blocks_by_range_response( &mut self, network: &mut SyncNetworkContext, - responsible_peers: ResponsiblePeers, + batch_peers: BatchPeers, chain_id: ChainId, batch_id: BatchId, request_id: Id, @@ -212,7 +212,7 @@ where ) { // check if this chunk removes the chain match self.chains.call_by_id(chain_id, |chain| { - chain.on_block_response(network, batch_id, request_id, blocks, 
responsible_peers) + chain.on_block_response(network, batch_peers, batch_id, request_id, blocks) }) { Ok((removed_chain, sync_type)) => { if let Some((removed_chain, remove_reason)) = removed_chain { @@ -296,7 +296,7 @@ where pub fn inject_error( &mut self, network: &mut SyncNetworkContext, - responsible_peers: ResponsiblePeers, + batch_peers: BatchPeers, batch_id: BatchId, chain_id: ChainId, request_id: Id, @@ -304,7 +304,7 @@ where ) { // check that this request is pending match self.chains.call_by_id(chain_id, |chain| { - chain.inject_error(network, batch_id, request_id, err, responsible_peers) + chain.inject_error(network, batch_peers, batch_id, request_id, err) }) { Ok((removed_chain, sync_type)) => { if let Some((removed_chain, remove_reason)) = removed_chain { From cffbd34f3f8c7ea40887e9ff24563a3c687e11be Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Thu, 11 Sep 2025 18:19:16 -0700 Subject: [PATCH 32/49] Reduce code duplication --- .../src/service/api_types.rs | 13 ++ .../network/src/sync/backfill_sync/mod.rs | 8 +- .../network/src/sync/network_context.rs | 136 ++++-------------- .../network/src/sync/range_sync/batch.rs | 8 ++ .../network/src/sync/range_sync/chain.rs | 8 +- 5 files changed, 58 insertions(+), 115 deletions(-) diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index 645ab69ce50..efcbcaf9561 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -9,6 +9,12 @@ use types::{ pub type Id = u32; +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub enum RangeRequestType { + ForwardSync, + BackfillSync, +} + #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct SingleLookupReqId { pub lookup_id: Id, @@ -111,6 +117,13 @@ impl RangeRequestId { } => *batch_id, } } + + pub fn batch_type(&self) -> RangeRequestType { + match &self { + RangeRequestId::BackfillSync { .. } => RangeRequestType::BackfillSync, + RangeRequestId::RangeSync { .. } => RangeRequestType::ForwardSync, + } + } } // TODO(das) refactor in a separate PR. 
We might be able to remove this and replace diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index e953244976b..6c94c1821a7 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -15,12 +15,12 @@ use crate::sync::network_context::{ RangeRequestId, RpcRequestSendError, RpcResponseError, SyncNetworkContext, }; use crate::sync::range_sync::{ - BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, - BatchPeers, + BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchPeers, BatchProcessingResult, + BatchState, }; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; -use lighthouse_network::service::api_types::Id; +use lighthouse_network::service::api_types::{Id, RangeRequestType}; use lighthouse_network::types::{BackFillState, NetworkGlobals}; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; @@ -1114,7 +1114,7 @@ impl BackFillSync { self.include_next_batch(network) } Entry::Vacant(entry) => { - let batch_type = network.batch_type(batch_id); + let batch_type = network.batch_type(batch_id, RangeRequestType::BackfillSync); entry.insert(BatchInfo::new( &batch_id, BACKFILL_EPOCHS_PER_BATCH, diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 840e6ab6ae2..a68927d811d 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -29,7 +29,7 @@ pub use lighthouse_network::service::api_types::RangeRequestId; use lighthouse_network::service::api_types::{ AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, CustodyId, CustodyRequester, DataColumnsByRangeRequestId, DataColumnsByRootRequestId, - DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, + DataColumnsByRootRequester, Id, RangeRequestType, SingleLookupReqId, SyncRequestId, }; use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource}; use lighthouse_tracing::SPAN_OUTGOING_RANGE_REQUEST; @@ -768,6 +768,17 @@ impl SyncNetworkContext { .transpose()?; let epoch = Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()); + + let data_column_by_root_range_request = + // with this variant, we request columns by root after we receive + // a successful blocks by range response. + if matches!(batch_type, ByRangeRequestType::BlocksAndColumnsSeparate) { + Some(HashSet::from_iter( + self.chain.sampling_columns_for_epoch(epoch).iter().copied(), + )) + } else { + None + }; let info = RangeBlockComponentsRequest::new( blocks_req_id, blobs_req_id, @@ -777,108 +788,7 @@ impl SyncNetworkContext { self.chain.sampling_columns_for_epoch(epoch).to_vec(), ) }), - // We are requesting data columns by range here - None, - range_request_span, - ); - self.components_by_range_requests.insert(id, info); - - Ok(id.id) - } - - /// A blocks by range request sent by the range sync algorithm - /// - /// This function is used when we want to request data columns by root instead of range. - /// Pre-fulu, it works similar to `Self::block_components_by_range_request`. 
- pub fn block_components_by_range_request_without_components( - &mut self, - batch_type: ByRangeRequestType, - request: BlocksByRangeRequest, - requester: RangeRequestId, - peers: &HashSet, - peers_to_deprioritize: &HashSet, - ) -> Result { - let range_request_span = debug_span!( - parent: None, - SPAN_OUTGOING_RANGE_REQUEST, - range_req_id = %requester, - peers = peers.len() - ); - let _guard = range_request_span.clone().entered(); - let active_request_count_by_peer = self.active_request_count_by_peer(); - - let Some(block_peer) = peers - .iter() - .map(|peer| { - ( - // If contains -> 1 (order after), not contains -> 0 (order first) - peers_to_deprioritize.contains(peer), - // Prefer peers with less overall requests - active_request_count_by_peer.get(peer).copied().unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::random::(), - peer, - ) - }) - .min() - .map(|(_, _, _, peer)| *peer) - else { - // Backfill and forward sync handle this condition gracefully. - // - Backfill sync: will pause waiting for more peers to join - // - Forward sync: can never happen as the chain is dropped when removing the last peer. - return Err(RpcRequestSendError::NoPeer(NoPeerError::BlockPeer)); - }; - - // Create the overall components_by_range request ID before its individual components - let id = ComponentsByRangeRequestId { - id: self.next_id(), - requester, - }; - - let blocks_req_id = self.send_blocks_by_range_request( - block_peer, - request.clone(), - id, - new_range_request_span!( - self, - "outgoing_blocks_by_range", - range_request_span.clone(), - block_peer - ), - )?; - - let blobs_req_id = if matches!(batch_type, ByRangeRequestType::BlocksAndBlobs) { - Some(self.send_blobs_by_range_request( - block_peer, - BlobsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - }, - id, - new_range_request_span!( - self, - "outgoing_blobs_by_range", - range_request_span.clone(), - block_peer - ), - )?) - } else { - None - }; - - let epoch = Slot::new(*request.start_slot()).epoch(T::EthSpec::slots_per_epoch()); - let info = RangeBlockComponentsRequest::new( - blocks_req_id, - blobs_req_id, - None, - // request data columns by root only if this batch requires requesting columns - if matches!(batch_type, ByRangeRequestType::BlocksAndColumns) { - Some(HashSet::from_iter( - self.chain.sampling_columns_for_epoch(epoch).iter().copied(), - )) - } else { - None - }, + data_column_by_root_range_request, range_request_span, ); self.components_by_range_requests.insert(id, info); @@ -1618,7 +1528,11 @@ impl SyncNetworkContext { /// Check whether a batch for this epoch (and only this epoch) should request just blocks or /// blocks and blobs. - pub fn batch_type(&self, epoch: types::Epoch) -> ByRangeRequestType { + pub fn batch_type( + &self, + epoch: types::Epoch, + request_type: RangeRequestType, + ) -> ByRangeRequestType { // Induces a compile time panic if this doesn't hold true. #[allow(clippy::assertions_on_constants)] const _: () = assert!( @@ -1632,7 +1546,14 @@ impl SyncNetworkContext { .data_availability_checker .data_columns_required_for_epoch(epoch) { - ByRangeRequestType::BlocksAndColumns + match request_type { + // Currently, we download blocks and columns separately when we forward sync as + // requesting columns by root is less ambiguous when there are multiple heads. + // For backfill, since there is just one chain, it makes more sense to download + // blocks and columns together. 
+ RangeRequestType::BackfillSync => ByRangeRequestType::BlocksAndColumns, + RangeRequestType::ForwardSync => ByRangeRequestType::BlocksAndColumnsSeparate, + } } else if self .chain .data_availability_checker @@ -1775,7 +1696,7 @@ impl SyncNetworkContext { let batch_epoch = id.batch_id(); // Return early if no columns are required for this epoch if !matches!( - self.batch_type(batch_epoch), + self.batch_type(batch_epoch, id.parent_request_id.requester.batch_type()), ByRangeRequestType::BlocksAndColumns ) { return Ok(()); @@ -1828,7 +1749,8 @@ impl SyncNetworkContext { .network_globals() .peers .read() - .good_custody_subnet_peer_range_sync(subnet_id, batch_epoch).choose(&mut rand::rng()) + .good_custody_subnet_peer_range_sync(subnet_id, batch_epoch) + .choose(&mut rand::rng()) { peer_to_columns .entry(*custody_peer) diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index d2fa0d4eb96..748cf8ac1e4 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -21,7 +21,15 @@ const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; #[derive(Debug, Copy, Clone, Display)] #[strum(serialize_all = "snake_case")] pub enum ByRangeRequestType { + /// This variant requests the blocks and columns + /// simaltaneously and then tries to couple the + /// responses. BlocksAndColumns, + /// This variant requests the blocks first using + /// a byrange request and then requests the data columns + /// for the received blocks using the `DataColumnsByRoot` + /// root request. + BlocksAndColumnsSeparate, BlocksAndBlobs, Blocks, } diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 9d954c0dc72..894c2756206 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -9,7 +9,7 @@ use crate::sync::range_sync::batch::BatchPeers; use crate::sync::{BatchOperationOutcome, BatchProcessResult, network_context::SyncNetworkContext}; use beacon_chain::BeaconChainTypes; use beacon_chain::block_verification_types::RpcBlock; -use lighthouse_network::service::api_types::Id; +use lighthouse_network::service::api_types::{Id, RangeRequestType}; use lighthouse_network::{PeerAction, PeerId}; use lighthouse_tracing::SPAN_SYNCING_CHAIN; use logging::crit; @@ -1025,7 +1025,7 @@ impl SyncingChain { .cloned() .collect::>(); - match network.block_components_by_range_request_without_components( + match network.block_components_by_range_request( batch_type, request, RangeRequestId::RangeSync { @@ -1173,7 +1173,7 @@ impl SyncingChain { } if let Entry::Vacant(entry) = self.batches.entry(epoch) { - let batch_type = network.batch_type(epoch); + let batch_type = network.batch_type(epoch, RangeRequestType::ForwardSync); let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type); entry.insert(optimistic_batch); self.send_batch(network, epoch)?; @@ -1270,7 +1270,7 @@ impl SyncingChain { self.include_next_batch(network) } Entry::Vacant(entry) => { - let batch_type = network.batch_type(next_batch_id); + let batch_type = network.batch_type(next_batch_id, RangeRequestType::ForwardSync); entry.insert(BatchInfo::new(&next_batch_id, EPOCHS_PER_BATCH, batch_type)); self.to_be_downloaded += EPOCHS_PER_BATCH; Some(next_batch_id) From 08bba3f3beda4e2de048edb2ccd4f426acbb7570 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Thu, 11 Sep 2025 18:24:25 -0700 Subject: [PATCH 33/49] fmt --- 
beacon_node/network/src/sync/range_sync/batch.rs | 6 +----- beacon_node/network/src/sync/range_sync/mod.rs | 4 ++-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 748cf8ac1e4..24f21a9441d 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -486,11 +486,7 @@ impl std::fmt::Debug for BatchState { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { BatchState::Processing(Attempt { peer_id, hash: _ }, batch_peers) => { - write!( - f, - "Processing({}) {}", - peer_id, batch_peers.block_and_blob - ) + write!(f, "Processing({}) {}", peer_id, batch_peers.block_and_blob) } BatchState::AwaitingValidation(Attempt { peer_id, hash: _ }) => { write!(f, "AwaitingValidation({})", peer_id) diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index 265840166ca..1218e0cd09c 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -8,8 +8,8 @@ mod range; mod sync_type; pub use batch::{ - BatchConfig, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, - ByRangeRequestType, BatchPeers, + BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeers, BatchProcessingResult, BatchState, + ByRangeRequestType, }; pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; #[cfg(test)] From 9db4c3071de79c22889769531766963eb6ffc751 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Tue, 16 Sep 2025 16:38:19 -0700 Subject: [PATCH 34/49] Fix small bug --- beacon_node/network/src/sync/network_context.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index a68927d811d..7b76e3c32ff 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1697,14 +1697,7 @@ impl SyncNetworkContext { // Return early if no columns are required for this epoch if !matches!( self.batch_type(batch_epoch, id.parent_request_id.requester.batch_type()), - ByRangeRequestType::BlocksAndColumns - ) { - return Ok(()); - } - // Return early if this is a backfill batch, backfill batches are handled by range requests instead of root - if matches!( - id.parent_request_id.requester, - RangeRequestId::BackfillSync { .. 
} + ByRangeRequestType::BlocksAndColumnsSeparate ) { return Ok(()); } From e3aed89749a5f0a92ba8a45d48c3e34f181c1cb6 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Tue, 16 Sep 2025 17:46:21 -0700 Subject: [PATCH 35/49] Remove retry test that we do not use anymore --- .../src/sync/block_sidecar_coupling.rs | 90 ------------------- 1 file changed, 90 deletions(-) diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index 6a1c162e674..2400e05c800 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -676,7 +676,6 @@ impl ByRangeRequest { #[cfg(test)] mod tests { use super::RangeBlockComponentsRequest; - use crate::sync::network_context::MAX_COLUMN_RETRIES; use beacon_chain::test_utils::{ NumBlobs, generate_rand_block_and_blobs, generate_rand_block_and_data_columns, test_spec, }; @@ -1100,93 +1099,4 @@ mod tests { let rpc_blocks = result.unwrap(); assert_eq!(rpc_blocks.len(), 2); } - - #[test] - fn max_retries_exceeded_behavior() { - // GIVEN: A request where peers consistently fail to provide required columns - let spec = test_spec::(); - let expected_custody_columns = vec![1, 2]; - let mut rng = XorShiftRng::from_seed([42; 16]); - let blocks = (0..1) - .map(|_| { - generate_rand_block_and_data_columns::( - ForkName::Fulu, - NumBlobs::Number(1), - &mut rng, - &spec, - ) - }) - .collect::>(); - - let components_id = components_id(); - let blocks_req_id = blocks_id(components_id); - let columns_req_id = expected_custody_columns - .iter() - .enumerate() - .map(|(i, column)| (columns_id(i as Id, components_id), vec![*column])) - .collect::>(); - let mut info = RangeBlockComponentsRequest::::new( - blocks_req_id, - None, - Some((columns_req_id.clone(), expected_custody_columns.clone())), - None, - Span::none(), - ); - - // AND: All blocks are received - info.add_blocks( - blocks_req_id, - blocks.iter().map(|b| b.0.clone().into()).collect(), - ) - .unwrap(); - - // AND: Only partial custody columns are provided (column 1 but not 2) - let (req1, _) = columns_req_id.first().unwrap(); - info.add_custody_columns( - *req1, - blocks - .iter() - .flat_map(|b| b.1.iter().filter(|d| d.index == 1).cloned()) - .collect(), - ) - .unwrap(); - - // AND: Column 2 request completes with empty data (persistent peer failure) - let (req2, _) = columns_req_id.get(1).unwrap(); - info.add_custody_columns(*req2, vec![]).unwrap(); - - // WHEN: Multiple retry attempts are made (up to max retries) - for _ in 0..MAX_COLUMN_RETRIES { - let result = info.responses(&spec).unwrap(); - assert!(result.is_err()); - - if let Err(super::CouplingError::DataColumnPeerFailure { - exceeded_retries, .. 
- }) = &result - && *exceeded_retries - { - break; - } - } - - // AND: One final attempt after exceeding max retries - let result = info.responses(&spec).unwrap(); - - // THEN: Should fail with exceeded_retries = true - assert!(result.is_err()); - if let Err(super::CouplingError::DataColumnPeerFailure { - error: _, - faulty_peers, - action, - exceeded_retries, - }) = result - { - assert_eq!(faulty_peers.len(), 1); // column 2 missing - assert_eq!(faulty_peers[0].0, 2); // column index 2 - assert!(matches!(action, PeerAction::LowToleranceError)); - assert!(exceeded_retries); // Should be true after max retries - } else { - panic!("Expected PeerFailure error with exceeded_retries=true"); - } - } } From b3b3756e6f2565acb80599b5cc0c00cd3a333cbd Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 17 Sep 2025 15:01:46 -0700 Subject: [PATCH 36/49] Fix tests --- beacon_node/network/src/sync/tests/range.rs | 84 +++++++++++++++++++-- 1 file changed, 78 insertions(+), 6 deletions(-) diff --git a/beacon_node/network/src/sync/tests/range.rs b/beacon_node/network/src/sync/tests/range.rs index cb728a90c1b..516b66c45eb 100644 --- a/beacon_node/network/src/sync/tests/range.rs +++ b/beacon_node/network/src/sync/tests/range.rs @@ -16,7 +16,7 @@ use lighthouse_network::rpc::methods::{ }; use lighthouse_network::service::api_types::{ AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId, - SyncRequestId, + DataColumnsByRootRequestId, SyncRequestId, }; use lighthouse_network::{PeerId, SyncInfo}; use std::time::Duration; @@ -36,6 +36,7 @@ enum ByRangeDataRequestIds { PreDeneb, PrePeerDAS(BlobsByRangeRequestId, PeerId), PostPeerDAS(Vec<(DataColumnsByRangeRequestId, PeerId)>), + PostPeerDASByRoot(Vec<(DataColumnsByRootRequestId, PeerId)>), } /// Sync tests are usually written in the form: @@ -233,7 +234,8 @@ impl TestRig { }); let by_range_data_requests = if self.after_fulu() { - let mut data_columns_requests = vec![]; + // First check for DataColumnsByRange requests (old paradigm) + let mut data_columns_range_requests = vec![]; while let Ok(data_columns_request) = self.pop_received_network_event(|ev| match ev { NetworkMessage::SendRequest { peer_id, @@ -245,12 +247,34 @@ impl TestRig { } if filter_f(*peer_id, *start_slot) => Some((*id, *peer_id)), _ => None, }) { - data_columns_requests.push(data_columns_request); + data_columns_range_requests.push(data_columns_request); } - if data_columns_requests.is_empty() { - panic!("Found zero DataColumnsByRange requests, filter {request_filter:?}"); + + // If we found range requests, use the `ByRangeRequestType::BlocksAndColumns` paradigm + if !data_columns_range_requests.is_empty() { + ByRangeDataRequestIds::PostPeerDAS(data_columns_range_requests) + } else { + // Try to find the byroot requests associated with the `ByRangeRequestType::BlocksAndColumnsSeparate` + let mut data_columns_root_requests = vec![]; + while let Ok(data_columns_request) = self.pop_received_network_event(|ev| match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::DataColumnsByRoot(_), + app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRoot(id)), + } => Some((*id, *peer_id)), + _ => None, + }) { + data_columns_root_requests.push(data_columns_request); + } + + if !data_columns_root_requests.is_empty() { + ByRangeDataRequestIds::PostPeerDASByRoot(data_columns_root_requests) + } else { + // No data column requests found - this is expected for the new paradigm + // since DataColumnsByRoot requests are sent after blocks are 
received + ByRangeDataRequestIds::PostPeerDASByRoot(vec![]) + } } - ByRangeDataRequestIds::PostPeerDAS(data_columns_requests) } else if self.after_deneb() { let (id, peer) = self .pop_received_network_event(|ev| match ev { @@ -318,11 +342,54 @@ impl TestRig { }); } } + ByRangeDataRequestIds::PostPeerDASByRoot(data_column_req_ids) => { + // Complete the DataColumnsByRoot requests with stream termination + for (id, peer_id) in data_column_req_ids { + self.log(&format!( + "Completing DataColumnsByRoot request {id:?} with empty stream" + )); + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: SyncRequestId::DataColumnsByRoot(id), + peer_id, + data_column: None, + seen_timestamp: D, + }); + } + } } blocks_req_id.parent_request_id.requester } + fn find_and_complete_data_columns_by_root_requests(&mut self) { + // In the new paradigm, DataColumnsByRoot requests are sent after blocks are received + // We need to complete any pending DataColumnsByRoot requests + let mut data_columns_root_requests = vec![]; + while let Ok(data_columns_request) = self.pop_received_network_event(|ev| match ev { + NetworkMessage::SendRequest { + peer_id, + request: RequestType::DataColumnsByRoot(_), + app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRoot(id)), + } => Some((*id, *peer_id)), + _ => None, + }) { + data_columns_root_requests.push(data_columns_request); + } + + // Complete the DataColumnsByRoot requests + for (id, peer_id) in data_columns_root_requests { + self.log(&format!( + "Completing DataColumnsByRoot request {id:?} with empty stream" + )); + self.send_sync_message(SyncMessage::RpcDataColumn { + sync_request_id: SyncRequestId::DataColumnsByRoot(id), + peer_id, + data_column: None, + seen_timestamp: D, + }); + } + } + fn find_and_complete_processing_chain_segment(&mut self, id: ChainSegmentProcessId) { self.pop_received_processor_event(|ev| { (ev.work_type() == WorkType::ChainSegment).then_some(()) @@ -366,6 +433,11 @@ impl TestRig { }; self.find_and_complete_processing_chain_segment(id); + + // In the new paradigm, DataColumnsByRoot requests are sent after blocks are processed + // We need to complete any pending DataColumnsByRoot requests + self.find_and_complete_data_columns_by_root_requests(); + if epoch < last_epoch - 1 { self.assert_state(RangeSyncType::Finalized); } else { From 2f35c360b2ccc8adf83f71fd4759e673c38cbf92 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 17 Sep 2025 15:13:40 -0700 Subject: [PATCH 37/49] Add some metrics --- beacon_node/network/src/metrics.rs | 6 ++++++ beacon_node/network/src/sync/network_context.rs | 13 +++++++++++++ 2 files changed, 19 insertions(+) diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index a2b5af8b086..6878d1f0755 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -484,6 +484,12 @@ pub static SYNC_ACTIVE_NETWORK_REQUESTS: LazyLock> = LazyLoc &["type"], ) }); +pub static SYNC_PENDING_ROOT_RANGE_REQUESTS: LazyLock> = LazyLock::new(|| { + try_create_int_gauge( + "sync_pending_root_range_requests", + "Current count of pending columns by root requests waiting for peers", + ) +}); pub static SYNC_UNKNOWN_NETWORK_REQUESTS: LazyLock> = LazyLock::new(|| { try_create_int_counter_vec( "sync_unknwon_network_request", diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 5a70f61a39a..3c8e01b7e77 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ 
b/beacon_node/network/src/sync/network_context.rs @@ -638,6 +638,10 @@ impl SyncNetworkContext { } } + metrics::set_gauge( + &metrics::SYNC_PENDING_ROOT_RANGE_REQUESTS, + self.pending_column_by_root_range_requests.len() as i64, + ); // Re-insert entries that still need to be retried self.pending_column_by_root_range_requests .extend(entries_to_keep); @@ -1801,6 +1805,11 @@ impl SyncNetworkContext { self.pending_column_by_root_range_requests .insert(id.parent_request_id, data_columns_by_root_request); + + metrics::set_gauge( + &metrics::SYNC_PENDING_ROOT_RANGE_REQUESTS, + self.pending_column_by_root_range_requests.len() as i64, + ); } // Insert the requests into the existing block parent request @@ -2074,6 +2083,10 @@ impl SyncNetworkContext { "data_columns_by_range", self.data_columns_by_range_requests.len(), ), + ( + "data_columns_by_root_range", + self.data_columns_by_root_range_requests.len(), + ), ("custody_by_root", self.custody_by_root_requests.len()), ( "components_by_range", From aa6a1bc850884476018ab647dae4d9413b8a307a Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 24 Sep 2025 12:16:05 -0700 Subject: [PATCH 38/49] Create a custom penalize_sync_peer method for clarity --- .../beacon_chain/src/block_verification.rs | 35 ++++++++++++++++++- .../gossip_methods.rs | 6 ++-- .../network_beacon_processor/sync_methods.rs | 20 ++--------- .../network/src/sync/block_lookups/mod.rs | 4 ++- 4 files changed, 44 insertions(+), 21 deletions(-) diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index 1d10fae0a49..97b850e917e 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -418,7 +418,7 @@ pub enum ExecutionPayloadError { } impl ExecutionPayloadError { - pub fn penalize_peer(&self) -> bool { + pub fn penalize_gossip_peer(&self) -> bool { // This match statement should never have a default case so that we are // always forced to consider here whether or not to penalize a peer when // we add a new error condition. @@ -447,6 +447,39 @@ impl ExecutionPayloadError { ExecutionPayloadError::UnverifiedNonOptimisticCandidate => false, } } + + pub fn penalize_sync_peer(&self) -> bool { + // This match statement should never have a default case so that we are + // always forced to consider here whether or not to penalize a peer when + // we add a new error condition. + match self { + // The peer has nothing to do with this error, do not penalize them. + ExecutionPayloadError::NoExecutionConnection => false, + // The peer has nothing to do with this error, do not penalize them. + ExecutionPayloadError::RequestFailed(_) => false, + // For the sync case, we do not want a peer to keep sending us blocks that our + // execution engine considers invalid. + // + // Also, we ask peers for blocks over sync/rpc only when they indicate + // that they have fully validated a given block (using their status message). + // + // Hence, we should penalize for this error in the sync case. + ExecutionPayloadError::RejectedByExecutionEngine { .. } => true, + // There is no reason for an honest peer to propagate a block with an invalid + // payload time stamp. + ExecutionPayloadError::InvalidPayloadTimestamp { .. } => true, + // We do not want to receive these blocks over rpc even though the gossip + // case is still allowed. + ExecutionPayloadError::InvalidTerminalPoWBlock { .. 
} => true, + // We should penalize RPC blocks, since even an optimistic node shouldn't + // verify this block. + ExecutionPayloadError::InvalidActivationEpoch { .. } => true, + // As per `Self::InvalidActivationEpoch`. + ExecutionPayloadError::InvalidTerminalBlockHash { .. } => true, + // Do not penalize the peer since it's not their fault that *we're* optimistic. + ExecutionPayloadError::UnverifiedNonOptimisticCandidate => false, + } + } } impl From for ExecutionPayloadError { diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index 5fc94c29587..20ed7a884a2 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -1330,7 +1330,9 @@ impl NetworkBeaconProcessor { self.propagate_validation_result(message_id, peer_id, MessageAcceptance::Ignore); return None; } - Err(ref e @ BlockError::ExecutionPayloadError(ref epe)) if !epe.penalize_peer() => { + Err(ref e @ BlockError::ExecutionPayloadError(ref epe)) + if !epe.penalize_gossip_peer() => + { debug!(error = %e, "Could not verify block for gossip. Ignoring the block"); self.propagate_validation_result(message_id, peer_id, MessageAcceptance::Ignore); return None; @@ -1562,7 +1564,7 @@ impl NetworkBeaconProcessor { "Block with unknown parent attempted to be processed" ); } - Err(e @ BlockError::ExecutionPayloadError(epe)) if !epe.penalize_peer() => { + Err(e @ BlockError::ExecutionPayloadError(epe)) if !epe.penalize_gossip_peer() => { debug!( error = %e, "Failed to verify execution payload" diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 897220ae8cd..c8bc1b0ef44 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -11,8 +11,7 @@ use beacon_chain::data_availability_checker::AvailabilityCheckError; use beacon_chain::data_availability_checker::MaybeAvailableBlock; use beacon_chain::{ AvailabilityProcessingStatus, BeaconChainTypes, BlockError, ChainSegmentResult, - ExecutionPayloadError, HistoricalBlockError, NotifyExecutionLayer, - validator_monitor::get_slot_delay_ms, + HistoricalBlockError, NotifyExecutionLayer, validator_monitor::get_slot_delay_ms, }; use beacon_processor::{ AsyncFn, BlockingFn, DuplicateCache, @@ -773,7 +772,7 @@ impl NetworkBeaconProcessor { Err(ChainSegmentFailed { message: format!("Block has an unknown parent: {}", parent_root), // Peers are faulty if they send non-sequential blocks. - peer_action: Some(PeerAction::LowToleranceError), // todo(pawan): revise this + peer_action: Some(PeerAction::LowToleranceError), faulty_component: Some(FaultyComponent::Blocks), }) } @@ -852,20 +851,7 @@ impl NetworkBeaconProcessor { }) } ref err @ BlockError::ExecutionPayloadError(ref epe) => { - if matches!(epe, ExecutionPayloadError::RejectedByExecutionEngine { .. }) { - debug!( - error = ?err, - "Invalid execution payload rejected by EE" - ); - Err(ChainSegmentFailed { - message: format!( - "Peer sent a block containing invalid execution payload. Reason: {:?}", - err - ), - peer_action: Some(PeerAction::LowToleranceError), - faulty_component: Some(FaultyComponent::Blocks), // todo(pawan): recheck this - }) - } else if !epe.penalize_peer() { + if !epe.penalize_sync_peer() { // These errors indicate an issue with the EL and not the `ChainSegment`. 
// Pause the syncing while the EL recovers debug!( diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index f8ffd298caf..dfc106383ed 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -617,7 +617,9 @@ impl BlockLookups { request_state.revert_to_awaiting_processing()?; Action::ParentUnknown { parent_root } } - ref e @ BlockError::ExecutionPayloadError(ref epe) if !epe.penalize_peer() => { + ref e @ BlockError::ExecutionPayloadError(ref epe) + if !epe.penalize_sync_peer() => + { // These errors indicate that the execution layer is offline // and failed to validate the execution payload. Do not downscore peer. debug!( From 4b0b6550ae8b1b046d998024f8427f5f00212002 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 24 Sep 2025 12:31:06 -0700 Subject: [PATCH 39/49] Fix nits --- .../network/src/sync/block_sidecar_coupling.rs | 14 +++++++++----- beacon_node/network/src/sync/network_context.rs | 8 ++++---- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index 2400e05c800..8f5e6aafeca 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -34,6 +34,9 @@ use types::{ pub struct RangeBlockComponentsRequest { /// Blocks we have received awaiting for their corresponding sidecar. blocks_request: ByRangeRequest>>>, + /// We store the peer that we requested the blocks from for this particular `RangeBlockComponentsRequest`. + /// This is to ensure that we penalize the block peer if the blocks turn out to be invalid + /// during processing. block_peer: PeerId, /// Sidecars we have received awaiting for their corresponding block. block_data_request: RangeBlockDataRequest, @@ -49,7 +52,8 @@ enum ByRangeRequest { enum RangeBlockDataRequest { NoData, Blobs(ByRangeRequest>>>), - DataColumns { + /// These are data columns fetched by a range request. + DataColumnsFromRange { requests: HashMap< DataColumnsByRangeRequestId, ByRangeRequest>, @@ -98,13 +102,13 @@ impl RangeBlockComponentsRequest { /// * `blocks_req_id` - Request ID for the blocks /// * `blobs_req_id` - Optional request ID for blobs (pre-Fulu fork) /// * `data_columns` - Optional tuple of (request_id->column_indices pairs, expected_custody_columns) for Fulu fork - /// * `request_columns_by_root` - Creates an uninitialized `RangeBlockDataRequest::DataColumnsFromRoot` variant if this is true. + /// * `data_columns_by_root` - Creates an uninitialized `RangeBlockDataRequest::DataColumnsFromRoot` variant if this is `Some`. /// Note: this is only relevant is `data_columns == None`. 
#[allow(clippy::type_complexity)] pub fn new( blocks_req_id: BlocksByRangeRequestId, blobs_req_id: Option, - data_columns: Option<( + data_columns_by_range: Option<( Vec<(DataColumnsByRangeRequestId, Vec)>, Vec, )>, @@ -114,7 +118,7 @@ impl RangeBlockComponentsRequest { let block_peer = blocks_req_id.peer_id; let block_data_request = if let Some(blobs_req_id) = blobs_req_id { RangeBlockDataRequest::Blobs(ByRangeRequest::Active(blobs_req_id)) - } else if let Some((requests, expected_custody_columns)) = data_columns { + } else if let Some((requests, expected_custody_columns)) = data_columns_by_range { let request_to_column_indices: HashMap<_, _> = requests.into_iter().collect(); RangeBlockDataRequest::DataColumns { requests: request_to_column_indices @@ -268,7 +272,7 @@ impl RangeBlockComponentsRequest { match &mut self.block_data_request { RangeBlockDataRequest::NoData => Err("received blobs but expected no data".to_owned()), RangeBlockDataRequest::DataColumnsFromRoot { .. } => { - Err("received blobs but expected no data columns by root".to_owned()) + Err("received blobs but expected data columns by root".to_owned()) } RangeBlockDataRequest::Blobs(req) => req.finish(req_id, blobs), RangeBlockDataRequest::DataColumns { .. } => { diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 3c8e01b7e77..217208138af 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -514,7 +514,7 @@ impl SyncNetworkContext { // Attempt to find all required custody peers to request the failed columns from let columns_by_range_peers_to_request = self - .select_columns_by_range_peers_to_request( + .select_column_peers_to_request( failed_columns, peers, active_request_count_by_peer, @@ -586,7 +586,7 @@ impl SyncNetworkContext { .cloned() .collect(); - match self.select_columns_by_range_peers_to_request( + match self.select_column_peers_to_request( &custody_indices, &synced_peers, active_requests.clone(), @@ -699,7 +699,7 @@ impl SyncNetworkContext { .iter() .cloned() .collect(); - Some(self.select_columns_by_range_peers_to_request( + Some(self.select_column_peers_to_request( &column_indexes, peers, active_request_count_by_peer, @@ -800,7 +800,7 @@ impl SyncNetworkContext { Ok(id.id) } - fn select_columns_by_range_peers_to_request( + fn select_column_peers_to_request( &self, custody_indexes: &HashSet, peers: &HashSet, From 7650032898178a8e33a0a5c37411a749ec0b671d Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 25 Sep 2025 23:52:47 +0200 Subject: [PATCH 40/49] Rename DataColumnsFromRange --- .../network/src/sync/block_sidecar_coupling.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index 8f5e6aafeca..9caf84be20d 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -120,7 +120,7 @@ impl RangeBlockComponentsRequest { RangeBlockDataRequest::Blobs(ByRangeRequest::Active(blobs_req_id)) } else if let Some((requests, expected_custody_columns)) = data_columns_by_range { let request_to_column_indices: HashMap<_, _> = requests.into_iter().collect(); - RangeBlockDataRequest::DataColumns { + RangeBlockDataRequest::DataColumnsFromRange { requests: request_to_column_indices .keys() .map(|id| (*id, ByRangeRequest::Active(*id))) @@ -155,7 
+155,7 @@ impl RangeBlockComponentsRequest { block_and_blob: self.block_peer, data_columns: match &self.block_data_request { RangeBlockDataRequest::NoData | RangeBlockDataRequest::Blobs(_) => HashMap::new(), - RangeBlockDataRequest::DataColumns { + RangeBlockDataRequest::DataColumnsFromRange { request_to_column_indices, .. } => request_to_column_indices @@ -180,7 +180,7 @@ impl RangeBlockComponentsRequest { failed_column_requests: Vec<(DataColumnsByRangeRequestId, Vec)>, ) -> Result<(), String> { match &mut self.block_data_request { - RangeBlockDataRequest::DataColumns { + RangeBlockDataRequest::DataColumnsFromRange { requests, expected_custody_columns: _, request_to_column_indices, @@ -275,7 +275,7 @@ impl RangeBlockComponentsRequest { Err("received blobs but expected data columns by root".to_owned()) } RangeBlockDataRequest::Blobs(req) => req.finish(req_id, blobs), - RangeBlockDataRequest::DataColumns { .. } => { + RangeBlockDataRequest::DataColumnsFromRange { .. } => { Err("received blobs but expected data columns".to_owned()) } } @@ -300,7 +300,7 @@ impl RangeBlockComponentsRequest { RangeBlockDataRequest::DataColumnsFromRoot { .. } => { Err("received data columns by root but expected range".to_owned()) } - RangeBlockDataRequest::DataColumns { requests, .. } => { + RangeBlockDataRequest::DataColumnsFromRange { requests, .. } => { let req = requests .get_mut(&req_id) .ok_or(format!("unknown data columns by range req_id {req_id}"))?; @@ -325,7 +325,7 @@ impl RangeBlockComponentsRequest { RangeBlockDataRequest::Blobs(_) => { Err("received data columns but expected blobs".to_owned()) } - RangeBlockDataRequest::DataColumns { .. } => { + RangeBlockDataRequest::DataColumnsFromRange { .. } => { Err("received data columns by range but expected root".to_owned()) } RangeBlockDataRequest::DataColumnsFromRoot { requests, .. 
} => { @@ -366,7 +366,7 @@ impl RangeBlockComponentsRequest { )) } - RangeBlockDataRequest::DataColumns { + RangeBlockDataRequest::DataColumnsFromRange { requests, expected_custody_columns, request_to_column_indices, From 7488755e833d725de4de6e450cbb84aa0a3968b8 Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 25 Sep 2025 22:09:06 +0200 Subject: [PATCH 41/49] De-duplicate data columns by root request type --- .../lighthouse_network/src/rpc/methods.rs | 12 ++ .../src/sync/block_sidecar_coupling.rs | 2 +- beacon_node/network/src/sync/manager.rs | 26 ++-- .../network/src/sync/network_context.rs | 115 +++--------------- .../src/sync/network_context/custody.rs | 66 +++++----- .../src/sync/network_context/requests.rs | 5 +- .../requests/data_columns_by_root.rs | 76 +----------- 7 files changed, 76 insertions(+), 226 deletions(-) diff --git a/beacon_node/lighthouse_network/src/rpc/methods.rs b/beacon_node/lighthouse_network/src/rpc/methods.rs index 9319973e597..4b930a091f4 100644 --- a/beacon_node/lighthouse_network/src/rpc/methods.rs +++ b/beacon_node/lighthouse_network/src/rpc/methods.rs @@ -535,6 +535,18 @@ impl DataColumnsByRootRequest { Ok(Self { data_column_ids }) } + pub fn from_single_block(block_root: Hash256, indices: Vec) -> Result { + let columns = VariableList::new(indices) + .map_err(|_| "Number of indices exceeds total number of columns")?; + DataColumnsByRootRequest::new( + vec![DataColumnsByRootIdentifier { + block_root, + columns, + }], + 1, + ) + } + pub fn max_requested(&self) -> usize { self.data_column_ids.iter().map(|id| id.columns.len()).sum() } diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index 9caf84be20d..fd221efc99d 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -53,7 +53,7 @@ enum RangeBlockDataRequest { NoData, Blobs(ByRangeRequest>>>), /// These are data columns fetched by a range request. 
- DataColumnsFromRange { + DataColumns { requests: HashMap< DataColumnsByRangeRequestId, ByRangeRequest>, diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 34de8aa45bc..2bf4f831e82 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -1129,24 +1129,20 @@ impl SyncManager { peer_id: PeerId, data_column: RpcEvent>>, ) { - match req_id.requester { - DataColumnsByRootRequester::Custody(custody_id) => { - if let Some(resp) = - self.network - .on_data_columns_by_root_response(req_id, peer_id, data_column) - && let Some(result) = self + if let Some(resp) = + self.network + .on_data_columns_by_root_response(req_id, peer_id, data_column) + { + match req_id.requester { + DataColumnsByRootRequester::Custody(custody_id) => { + if let Some(result) = self .network .on_custody_by_root_response(custody_id, req_id, peer_id, resp) - { - self.on_custody_by_root_result(custody_id.requester, result); + { + self.on_custody_by_root_result(custody_id.requester, result); + } } - } - DataColumnsByRootRequester::RangeSync { parent } => { - if let Some(resp) = self.network.on_data_columns_by_root_range_response( - req_id, - peer_id, - data_column, - ) { + DataColumnsByRootRequester::RangeSync { parent } => { self.on_range_components_response( parent, peer_id, diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 217208138af..1aa3813284b 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -2,7 +2,7 @@ //! channel and stores a global RPC ID to perform requests. use self::custody::{ActiveCustodyRequest, Error as CustodyRequestError}; -pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlockRequest}; +pub use self::requests::BlocksByRootSingleRequest; use super::SyncMessage; use super::block_sidecar_coupling::RangeBlockComponentsRequest; use super::manager::BlockProcessType; @@ -15,9 +15,7 @@ use crate::service::NetworkMessage; use crate::status::ToStatusMessage; use crate::sync::block_lookups::SingleLookupId; use crate::sync::block_sidecar_coupling::CouplingError; -use crate::sync::network_context::requests::{ - BlobsByRootSingleBlockRequest, DataColumnsByRootRangeRequestItems, -}; +use crate::sync::network_context::requests::BlobsByRootSingleBlockRequest; use crate::sync::range_sync::BatchPeers; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; @@ -217,8 +215,6 @@ pub struct SyncNetworkContext { /// A mapping of active DataColumnsByRange requests data_columns_by_range_requests: ActiveRequests>, - data_columns_by_root_range_requests: - ActiveRequests>, /// Mapping of active custody column requests for a block root custody_by_root_requests: FnvHashMap>, @@ -309,7 +305,6 @@ impl SyncNetworkContext { blocks_by_root_requests: ActiveRequests::new("blocks_by_root"), blobs_by_root_requests: ActiveRequests::new("blobs_by_root"), data_columns_by_root_requests: ActiveRequests::new("data_columns_by_root"), - data_columns_by_root_range_requests: ActiveRequests::new("data_columns_by_root_range"), blocks_by_range_requests: ActiveRequests::new("blocks_by_range"), blobs_by_range_requests: ActiveRequests::new("blobs_by_range"), data_columns_by_range_requests: ActiveRequests::new("data_columns_by_range"), @@ -341,7 +336,6 @@ impl SyncNetworkContext { blocks_by_range_requests, blobs_by_range_requests, 
data_columns_by_range_requests, - data_columns_by_root_range_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, // components_by_range_requests is a meta request of various _by_range requests @@ -378,18 +372,12 @@ impl SyncNetworkContext { .into_iter() .map(|req_id| SyncRequestId::DataColumnsByRange(*req_id)); - let data_column_by_root_range_ids = data_columns_by_root_range_requests - .active_requests_of_peer(peer_id) - .into_iter() - .map(|req_id| SyncRequestId::DataColumnsByRoot(*req_id)); - blocks_by_root_ids .chain(blobs_by_root_ids) .chain(data_column_by_root_ids) .chain(blocks_by_range_ids) .chain(blobs_by_range_ids) .chain(data_column_by_range_ids) - .chain(data_column_by_root_range_ids) .collect() } @@ -446,7 +434,6 @@ impl SyncNetworkContext { blocks_by_range_requests, blobs_by_range_requests, data_columns_by_range_requests, - data_columns_by_root_range_requests, // custody_by_root_requests is a meta request of data_columns_by_root_requests custody_by_root_requests: _, // components_by_range_requests is a meta request of various _by_range requests @@ -469,7 +456,6 @@ impl SyncNetworkContext { .chain(blocks_by_range_requests.iter_request_peers()) .chain(blobs_by_range_requests.iter_request_peers()) .chain(data_columns_by_range_requests.iter_request_peers()) - .chain(data_columns_by_root_range_requests.iter_request_peers()) { *active_request_count_by_peer.entry(peer_id).or_default() += 1; } @@ -600,11 +586,11 @@ impl SyncNetworkContext { }; data_column_requests.push(( - self.send_data_columns_by_root_range_requests( + self.send_data_columns_by_root_request( + requester, peer, data_columns_by_root_request, - requester, - Span::none(), + true, ) .map_err(|e| { format!("Failed to send data columns by root request {:?}", e) @@ -1161,13 +1147,13 @@ impl SyncNetworkContext { } /// Request to send a single `data_columns_by_root` request to the network. - pub fn data_column_lookup_request( + pub fn send_data_columns_by_root_request( &mut self, requester: DataColumnsByRootRequester, peer_id: PeerId, - request: DataColumnsByRootSingleBlockRequest, + request: DataColumnsByRootBatchBlockRequest, expect_max_responses: bool, - ) -> Result, &'static str> { + ) -> Result { let id = DataColumnsByRootRequestId { id: self.next_id(), requester, @@ -1177,17 +1163,18 @@ impl SyncNetworkContext { self.send_network_msg(NetworkMessage::SendRequest { peer_id, request: RequestType::DataColumnsByRoot( - request.clone().try_into_request::( - self.fork_context.current_fork_name(), - &self.chain.spec, - )?, + request + .clone() + .try_into_request(self.fork_context.current_fork_name(), &self.chain.spec) + .map_err(|_| "invalid count of data column indices")?, ), app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRoot(id)), - })?; + }) + .map_err(|_| "network send error")?; debug!( method = "DataColumnsByRoot", - block_root = ?request.block_root, + block_roots = ?request.block_roots, indices = ?request.indices, peer = %peer_id, %id, @@ -1199,12 +1186,10 @@ impl SyncNetworkContext { peer_id, expect_max_responses, DataColumnsByRootRequestItems::new(request), - // Span is tracked in `self.custody_columns_by_root_requests` in the - // `ActiveCustodyRequest` struct. Span::none(), ); - Ok(LookupRequestResult::RequestSent(id)) + Ok(id) } /// Request to fetch all needed custody columns of a specific block. 
This function may not send @@ -1420,51 +1405,6 @@ impl SyncNetworkContext { Ok((id, requested_columns)) } - /// Send `DataColumnsByRoot` requests for progressing range sync. - fn send_data_columns_by_root_range_requests( - &mut self, - peer_id: PeerId, - request: DataColumnsByRootBatchBlockRequest, - requester: DataColumnsByRootRequester, - request_span: Span, - ) -> Result { - let id = DataColumnsByRootRequestId { - id: self.next_id(), - requester, - peer: peer_id, - }; - - self.send_network_msg(NetworkMessage::SendRequest { - peer_id, - request: RequestType::DataColumnsByRoot( - request - .clone() - .try_into_request(self.fork_context.current_fork_name(), &self.chain.spec) - .map_err(|e| RpcRequestSendError::InternalError(e.to_string()))?, - ), - app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRoot(id)), - }) - .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; - - debug!( - method = "DataColumnsByRoot", - ?request, - peer = %peer_id, - %id, - "Sync RPC request sent" - ); - - self.data_columns_by_root_range_requests.insert( - id, - peer_id, - // true = we are only requesting if we know there are blobs. - true, - DataColumnsByRootRangeRequestItems::new(request), - request_span, - ); - Ok(id) - } - pub fn is_execution_engine_online(&self) -> bool { self.execution_engine_state == EngineState::Online } @@ -1665,19 +1605,6 @@ impl SyncNetworkContext { self.on_rpc_response_result(id, "DataColumnsByRoot", resp, peer_id, |_| 1) } - #[allow(clippy::type_complexity)] - pub(crate) fn on_data_columns_by_root_range_response( - &mut self, - id: DataColumnsByRootRequestId, - peer_id: PeerId, - rpc_event: RpcEvent>>, - ) -> Option>>>> { - let resp = self - .data_columns_by_root_range_requests - .on_response(id, rpc_event); - self.on_rpc_response_result(id, "DataColumnsByRootRange", resp, peer_id, |b| b.len()) - } - /// Requests data columns for the given blocks by root. 
/// /// We request by root because it is much easier to reason about @@ -1775,11 +1702,11 @@ impl SyncNetworkContext { }; data_column_requests.push(( - self.send_data_columns_by_root_range_requests( + self.send_data_columns_by_root_request( + requester, peer, data_columns_by_root_request, - requester, - Span::none(), + true, ) .map_err(|e| { RpcResponseError::InternalError(format!( @@ -2083,10 +2010,6 @@ impl SyncNetworkContext { "data_columns_by_range", self.data_columns_by_range_requests.len(), ), - ( - "data_columns_by_root_range", - self.data_columns_by_root_range_requests.len(), - ), ("custody_by_root", self.custody_by_root_requests.len()), ( "components_by_range", diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index d8d30fd1941..147948a20ee 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -1,5 +1,5 @@ use crate::sync::network_context::{ - DataColumnsByRootRequestId, DataColumnsByRootSingleBlockRequest, + DataColumnsByRootBatchBlockRequest, DataColumnsByRootRequestId, }; use beacon_chain::BeaconChainTypes; use beacon_chain::validator_monitor::timestamp_now; @@ -16,7 +16,7 @@ use tracing::{Span, debug, debug_span, warn}; use types::{DataColumnSidecar, Hash256, data_column_sidecar::ColumnIndex}; use types::{DataColumnSidecarList, EthSpec}; -use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContext}; +use super::{PeerGroup, RpcResponseResult, SyncNetworkContext}; const MAX_STALE_NO_PEERS_DURATION: Duration = Duration::from_secs(30); @@ -279,12 +279,12 @@ impl ActiveCustodyRequest { } for (peer_id, indices) in columns_to_request_by_peer.into_iter() { - let request_result = cx - .data_column_lookup_request( + let req_id = cx + .send_data_columns_by_root_request( DataColumnsByRootRequester::Custody(self.custody_id), peer_id, - DataColumnsByRootSingleBlockRequest { - block_root: self.block_root, + DataColumnsByRootBatchBlockRequest { + block_roots: vec![self.block_root], indices: indices.clone(), }, // If peer is in the lookup peer set, it claims to have imported the block and @@ -295,38 +295,32 @@ impl ActiveCustodyRequest { ) .map_err(Error::SendFailed)?; - match request_result { - LookupRequestResult::RequestSent(req_id) => { - *self.peer_attempts.entry(peer_id).or_insert(0) += 1; + *self.peer_attempts.entry(peer_id).or_insert(0) += 1; - let client = cx.network_globals().client(&peer_id).kind; - let batch_columns_req_span = debug_span!( - "batch_columns_req", - %peer_id, - %client, - ); - let _guard = batch_columns_req_span.clone().entered(); - for column_index in &indices { - let column_request = self - .column_requests - .get_mut(column_index) - // Should never happen: column_index is iterated from column_requests - .ok_or(Error::BadState("unknown column_index".to_owned()))?; - - column_request.on_download_start(req_id)?; - } - - self.active_batch_columns_requests.insert( - req_id, - ActiveBatchColumnsRequest { - indices, - span: batch_columns_req_span, - }, - ); - } - LookupRequestResult::NoRequestNeeded(_) => unreachable!(), - LookupRequestResult::Pending(_) => unreachable!(), + let client = cx.network_globals().client(&peer_id).kind; + let batch_columns_req_span = debug_span!( + "batch_columns_req", + %peer_id, + %client, + ); + let _guard = batch_columns_req_span.clone().entered(); + for column_index in &indices { + let column_request = self + .column_requests + .get_mut(column_index) + // Should 
never happen: column_index is iterated from column_requests + .ok_or(Error::BadState("unknown column_index".to_owned()))?; + + column_request.on_download_start(req_id)?; } + + self.active_batch_columns_requests.insert( + req_id, + ActiveBatchColumnsRequest { + indices, + span: batch_columns_req_span, + }, + ); } Ok(None) diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index 950fc3db312..2134860ef44 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -12,10 +12,7 @@ pub use blobs_by_root::{BlobsByRootRequestItems, BlobsByRootSingleBlockRequest}; pub use blocks_by_range::BlocksByRangeRequestItems; pub use blocks_by_root::{BlocksByRootRequestItems, BlocksByRootSingleRequest}; pub use data_columns_by_range::DataColumnsByRangeRequestItems; -pub use data_columns_by_root::{ - DataColumnsByRootBatchBlockRequest, DataColumnsByRootRangeRequestItems, - DataColumnsByRootRequestItems, DataColumnsByRootSingleBlockRequest, -}; +pub use data_columns_by_root::{DataColumnsByRootBatchBlockRequest, DataColumnsByRootRequestItems}; use crate::metrics; diff --git a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs index 22a91e23792..c8bea7cc186 100644 --- a/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs +++ b/beacon_node/network/src/sync/network_context/requests/data_columns_by_root.rs @@ -34,84 +34,12 @@ impl DataColumnsByRootBatchBlockRequest { } } -#[derive(Debug, Clone)] -pub struct DataColumnsByRootSingleBlockRequest { - pub block_root: Hash256, - pub indices: Vec, -} - -impl DataColumnsByRootSingleBlockRequest { - pub fn try_into_request( - self, - fork_name: ForkName, - spec: &ChainSpec, - ) -> Result, &'static str> { - let columns = VariableList::new(self.indices) - .map_err(|_| "Number of indices exceeds total number of columns")?; - DataColumnsByRootRequest::new( - vec![DataColumnsByRootIdentifier { - block_root: self.block_root, - columns, - }], - spec.max_request_blocks(fork_name), - ) - } -} - pub struct DataColumnsByRootRequestItems { - request: DataColumnsByRootSingleBlockRequest, - items: Vec>>, -} - -impl DataColumnsByRootRequestItems { - pub fn new(request: DataColumnsByRootSingleBlockRequest) -> Self { - Self { - request, - items: vec![], - } - } -} - -impl ActiveRequestItems for DataColumnsByRootRequestItems { - type Item = Arc>; - - /// Appends a chunk to this multi-item request. If all expected chunks are received, this - /// method returns `Some`, resolving the request before the stream terminator. 
- /// The active request SHOULD be dropped after `add_response` returns an error - fn add(&mut self, data_column: Self::Item) -> Result { - let block_root = data_column.block_root(); - if self.request.block_root != block_root { - return Err(LookupVerifyError::UnrequestedBlockRoot(block_root)); - } - if !data_column.verify_inclusion_proof() { - return Err(LookupVerifyError::InvalidInclusionProof); - } - if !self.request.indices.contains(&data_column.index) { - return Err(LookupVerifyError::UnrequestedIndex(data_column.index)); - } - if self.items.iter().any(|d| d.index == data_column.index) { - return Err(LookupVerifyError::DuplicatedData( - data_column.slot(), - data_column.index, - )); - } - - self.items.push(data_column); - - Ok(self.items.len() >= self.request.indices.len()) - } - - fn consume(&mut self) -> Vec { - std::mem::take(&mut self.items) - } -} - -pub struct DataColumnsByRootRangeRequestItems { request: DataColumnsByRootBatchBlockRequest, items: HashMap>>>, } -impl DataColumnsByRootRangeRequestItems { +impl DataColumnsByRootRequestItems { pub fn new(request: DataColumnsByRootBatchBlockRequest) -> Self { Self { request, @@ -120,7 +48,7 @@ impl DataColumnsByRootRangeRequestItems { } } -impl ActiveRequestItems for DataColumnsByRootRangeRequestItems { +impl ActiveRequestItems for DataColumnsByRootRequestItems { type Item = Arc>; /// Appends a chunk to this multi-item request. If all expected chunks are received, this From c2aa4ae8be614d183edfb2c3848bed58d36749ca Mon Sep 17 00:00:00 2001 From: dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 25 Sep 2025 23:58:31 +0200 Subject: [PATCH 42/49] Revert type change in UnexpectedRequestId --- .../network/src/sync/network_context/custody.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index 147948a20ee..eb34aae56c9 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -5,7 +5,7 @@ use beacon_chain::BeaconChainTypes; use beacon_chain::validator_monitor::timestamp_now; use fnv::FnvHashMap; use lighthouse_network::PeerId; -use lighthouse_network::service::api_types::{CustodyId, DataColumnsByRootRequester, Id}; +use lighthouse_network::service::api_types::{CustodyId, DataColumnsByRootRequester}; use lighthouse_tracing::SPAN_OUTGOING_CUSTODY_REQUEST; use parking_lot::RwLock; use std::collections::HashSet; @@ -46,8 +46,8 @@ pub enum Error { /// There should only exist a single request at a time. Having multiple requests is a bug and /// can result in undefined state, so it's treated as a hard error and the lookup is dropped. 
UnexpectedRequestId { - expected_req_id: Id, - req_id: Id, + expected_req_id: DataColumnsByRootRequestId, + req_id: DataColumnsByRootRequestId, }, } @@ -424,8 +424,8 @@ impl ColumnRequest { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { return Err(Error::UnexpectedRequestId { - expected_req_id: expected_req_id.id, - req_id: req_id.id, + expected_req_id: *expected_req_id, + req_id, }); } self.status = Status::NotStarted(Instant::now()); @@ -457,8 +457,8 @@ impl ColumnRequest { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { return Err(Error::UnexpectedRequestId { - expected_req_id: expected_req_id.id, - req_id: req_id.id, + expected_req_id: *expected_req_id, + req_id, }); } self.status = Status::Downloaded(peer_id, data_column, seen_timestamp); From cf46d103cddbeee37bcf33952726a3f50865b52a Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Thu, 25 Sep 2025 15:54:32 -0700 Subject: [PATCH 43/49] Fix issues from review --- .../network/src/sync/backfill_sync/mod.rs | 37 ++++++++++--------- .../network/src/sync/range_sync/chain.rs | 10 +---- 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 6c94c1821a7..c4bd55ff8e1 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -616,28 +616,29 @@ impl BackFillSync { penalty, faulty_component, } => { - let Some(batch_peers) = batch.processing_peers() else { - error!(?batch_id, "Responsible peers not found for a failed batch"); - return self - .fail_sync(BackFillError::BatchProcessingFailed(batch_id)) - .map(|_| ProcessResult::Successful); - }; - // Penalize the peer appropriately. - match faulty_component { - Some(FaultyComponent::Blocks) | Some(FaultyComponent::Blobs) => { - network.report_peer(batch_peers.block_and_blob, *penalty, "faulty_batch"); - } - // todo(pawan): clean this up - Some(FaultyComponent::Columns(faulty_columns)) => { - for (peer, columns) in batch_peers.data_columns.iter() { - for faulty_column in faulty_columns { - if columns.contains(faulty_column) { - network.report_peer(*peer, *penalty, "faulty_batch"); + if let Some(batch_peers) = batch.processing_peers() { + // Penalize the peer appropriately. + match faulty_component { + Some(FaultyComponent::Blocks) | Some(FaultyComponent::Blobs) => { + network.report_peer( + batch_peers.block_and_blob, + *penalty, + "faulty_batch", + ); + } + Some(FaultyComponent::Columns(faulty_columns)) => { + for (peer, columns) in batch_peers.data_columns.iter() { + for faulty_column in faulty_columns { + if columns.contains(faulty_column) { + network.report_peer(*peer, *penalty, "faulty_batch"); + } } } } + None => {} } - None => {} + } else { + warn!(?batch_id, "Responsible peers not found for a failed batch"); } match batch.processing_completed(BatchProcessingResult::FaultyFailure) { Err(e) => { diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index b2eb1ef1b9c..8013a38ff62 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -476,18 +476,10 @@ impl SyncingChain { } }; - let peers = batch.processing_peers().cloned().ok_or_else(|| { - RemoveChain::WrongBatchState(format!( - "Processing target is in wrong state: {:?}", - batch.state(), - )) - })?; - // Log the process result and the batch for debugging purposes. 
debug!( result = ?result, batch_epoch = %batch_id, - ?peers, batch_state = ?batch_state, ?batch, "Batch processing result" ); @@ -554,7 +546,7 @@ impl SyncingChain { faulty_component, } => { let Some(batch_peers) = batch.processing_peers() else { - crit!( + warn!( current_state = ?batch.state(), "Inconsistent state, batch must have been in processing state" ); From d99df0af4f59c0c9a11b32cb26cb95fa8b8944c4 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Thu, 25 Sep 2025 12:52:07 +1000 Subject: [PATCH 44/49] Only send data column subnet discovery requests after PeerDAS is scheduled (#8109) #8105 (to be confirmed) I noticed a large number of failed discovery requests after deploying the latest `unstable` to some of our testnet and mainnet nodes. This is because of a recent PeerDAS change to attempt to maintain sufficient peers across data column subnets - this shouldn't be enabled on networks without PeerDAS scheduled, otherwise it will keep retrying discovery on these subnets and never succeed. Also removed some unused files. Co-Authored-By: Jimmy Chen Co-Authored-By: Jimmy Chen --- .../src/peer_manager/mod.rs | 69 +- .../src/subnet_service/attestation_subnets.rs | 681 ------------------ .../src/subnet_service/sync_subnets.rs | 345 --------- 3 files changed, 67 insertions(+), 1028 deletions(-) delete mode 100644 beacon_node/network/src/subnet_service/attestation_subnets.rs delete mode 100644 beacon_node/network/src/subnet_service/sync_subnets.rs diff --git a/beacon_node/lighthouse_network/src/peer_manager/mod.rs b/beacon_node/lighthouse_network/src/peer_manager/mod.rs index 592fccdc741..ad16bb0421c 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/mod.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/mod.rs @@ -23,6 +23,7 @@ pub use libp2p::identity::Keypair; pub mod peerdb; use crate::peer_manager::peerdb::client::ClientKind; +use crate::types::GossipKind; use libp2p::multiaddr; use network_utils::discovery_metrics; use network_utils::enr_ext::{EnrExt, peer_id_to_node_id}; @@ -1434,8 +1435,16 @@ impl PeerManager { // Update peer score metrics; self.update_peer_score_metrics(); - // Maintain minimum count for custody peers. - self.maintain_custody_peers(); + // Maintain minimum count for custody peers if we are subscribed to any data column topics (i.e. PeerDAS activated) + let peerdas_enabled = self + .network_globals + .gossipsub_subscriptions + .read() + .iter() + .any(|topic| matches!(topic.kind(), &GossipKind::DataColumnSidecar(_))); + if peerdas_enabled { + self.maintain_custody_peers(); + } // Maintain minimum count for sync committee peers. 
self.maintain_sync_committee_peers(); @@ -3140,4 +3149,60 @@ mod tests { }) } } + + #[tokio::test] + async fn test_custody_peer_logic_only_runs_when_peerdas_enabled() { + use crate::types::{GossipEncoding, GossipTopic}; + + let mut peer_manager = build_peer_manager(5).await; + + // Set up sampling subnets so maintain_custody_peers would have work to do + *peer_manager.network_globals.sampling_subnets.write() = std::collections::HashSet::from([ + DataColumnSubnetId::new(0), + DataColumnSubnetId::new(1), + ]); + + // Test 1: No data column subscriptions - custody peer logic should NOT run + peer_manager.heartbeat(); + + // Should be no new DiscoverSubnetPeers events since PeerDAS is not enabled + let discovery_events: Vec<_> = peer_manager + .events + .iter() + .filter(|event| matches!(event, PeerManagerEvent::DiscoverSubnetPeers(_))) + .collect(); + assert!( + discovery_events.is_empty(), + "Should not generate discovery events when PeerDAS is disabled, but found: {:?}", + discovery_events + ); + + // Test 2: Add data column subscription - custody peer logic should run + let data_column_topic = GossipTopic::new( + GossipKind::DataColumnSidecar(DataColumnSubnetId::new(0)), + GossipEncoding::SSZSnappy, + [0, 0, 0, 0], // fork_digest + ); + peer_manager + .network_globals + .gossipsub_subscriptions + .write() + .insert(data_column_topic); + + // Clear any existing events to isolate the test + peer_manager.events.clear(); + + peer_manager.heartbeat(); + + // Should now have DiscoverSubnetPeers events since PeerDAS is enabled + let discovery_events: Vec<_> = peer_manager + .events + .iter() + .filter(|event| matches!(event, PeerManagerEvent::DiscoverSubnetPeers(_))) + .collect(); + assert!( + !discovery_events.is_empty(), + "Should generate discovery events when PeerDAS is enabled, but found no discovery events" + ); + } } diff --git a/beacon_node/network/src/subnet_service/attestation_subnets.rs b/beacon_node/network/src/subnet_service/attestation_subnets.rs deleted file mode 100644 index 0da27c6a21f..00000000000 --- a/beacon_node/network/src/subnet_service/attestation_subnets.rs +++ /dev/null @@ -1,681 +0,0 @@ -//! This service keeps track of which shard subnet the beacon node should be subscribed to at any -//! given time. It schedules subscriptions to shard subnets, requests peer discoveries and -//! determines whether attestations should be aggregated and/or passed to the beacon node. - -use super::SubnetServiceMessage; -use std::collections::HashSet; -use std::collections::{HashMap, VecDeque}; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; -use std::time::Duration; - -use beacon_chain::{BeaconChain, BeaconChainTypes}; -use delay_map::{HashMapDelay, HashSetDelay}; -use futures::prelude::*; -use lighthouse_network::{discv5::enr::NodeId, NetworkConfig, Subnet, SubnetDiscovery}; -use slot_clock::SlotClock; -use tracing::{debug, error, info, trace, warn}; -use types::{Attestation, EthSpec, Slot, SubnetId, ValidatorSubscription}; - -use crate::metrics; - -/// The minimum number of slots ahead that we attempt to discover peers for a subscription. If the -/// slot is less than this number, skip the peer discovery process. -/// Subnet discovery query takes at most 30 secs, 2 slots take 24s. -pub(crate) const MIN_PEER_DISCOVERY_SLOT_LOOK_AHEAD: u64 = 2; -/// The fraction of a slot that we subscribe to a subnet before the required slot. -/// -/// Currently a whole slot ahead. 
-const ADVANCE_SUBSCRIBE_SLOT_FRACTION: u32 = 1; - -/// The number of slots after an aggregator duty where we remove the entry from -/// `aggregate_validators_on_subnet` delay map. -const UNSUBSCRIBE_AFTER_AGGREGATOR_DUTY: u32 = 2; - -#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] -pub(crate) enum SubscriptionKind { - /// Long lived subscriptions. - /// - /// These have a longer duration and are advertised in our ENR. - LongLived, - /// Short lived subscriptions. - /// - /// Subscribing to these subnets has a short duration and we don't advertise it in our ENR. - ShortLived, -} - -/// A particular subnet at a given slot. -#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy)] -pub struct ExactSubnet { - /// The `SubnetId` associated with this subnet. - pub subnet_id: SubnetId, - /// The `Slot` associated with this subnet. - pub slot: Slot, -} - -pub struct AttestationService { - /// Queued events to return to the driving service. - events: VecDeque, - - /// A reference to the beacon chain to process received attestations. - pub(crate) beacon_chain: Arc>, - - /// Subnets we are currently subscribed to as short lived subscriptions. - /// - /// Once they expire, we unsubscribe from these. - /// We subscribe to subnets when we are an aggregator for an exact subnet. - short_lived_subscriptions: HashMapDelay, - - /// Subnets we are currently subscribed to as long lived subscriptions. - /// - /// We advertise these in our ENR. When these expire, the subnet is removed from our ENR. - /// These are required of all beacon nodes. The exact number is determined by the chain - /// specification. - long_lived_subscriptions: HashSet, - - /// Short lived subscriptions that need to be executed in the future. - scheduled_short_lived_subscriptions: HashSetDelay, - - /// A collection timeouts to track the existence of aggregate validator subscriptions at an - /// `ExactSubnet`. - aggregate_validators_on_subnet: Option>, - - /// The waker for the current thread. - waker: Option, - - /// The discovery mechanism of lighthouse is disabled. - discovery_disabled: bool, - - /// We are always subscribed to all subnets. - subscribe_all_subnets: bool, - - /// Our Discv5 node_id. - node_id: NodeId, - - /// Future used to manage subscribing and unsubscribing from long lived subnets. - next_long_lived_subscription_event: Pin>, - - /// Whether this node is a block proposer-only node. - proposer_only: bool, -} - -impl AttestationService { - /* Public functions */ - - /// Establish the service based on the passed configuration. 
- pub fn new(beacon_chain: Arc>, node_id: NodeId, config: &NetworkConfig) -> Self { - let slot_duration = beacon_chain.slot_clock.slot_duration(); - - if config.subscribe_all_subnets { - info!("Subscribing to all subnets"); - } else { - info!( - subnets_per_node = beacon_chain.spec.subnets_per_node, - subscription_duration_in_epochs = beacon_chain.spec.epochs_per_subnet_subscription, - "Deterministic long lived subnets enabled" - ); - } - - let track_validators = !config.import_all_attestations; - let aggregate_validators_on_subnet = - track_validators.then(|| HashSetDelay::new(slot_duration)); - let mut service = AttestationService { - events: VecDeque::with_capacity(10), - beacon_chain, - short_lived_subscriptions: HashMapDelay::new(slot_duration), - long_lived_subscriptions: HashSet::default(), - scheduled_short_lived_subscriptions: HashSetDelay::default(), - aggregate_validators_on_subnet, - waker: None, - discovery_disabled: config.disable_discovery, - subscribe_all_subnets: config.subscribe_all_subnets, - node_id, - next_long_lived_subscription_event: { - // Set a dummy sleep. Calculating the current subnet subscriptions will update this - // value with a smarter timing - Box::pin(tokio::time::sleep(Duration::from_secs(1))) - }, - proposer_only: config.proposer_only, - }; - - // If we are not subscribed to all subnets, handle the deterministic set of subnets - if !config.subscribe_all_subnets { - service.recompute_long_lived_subnets(); - } - - service - } - - /// Return count of all currently subscribed subnets (long-lived **and** short-lived). - #[cfg(test)] - pub fn subscription_count(&self) -> usize { - if self.subscribe_all_subnets { - self.beacon_chain.spec.attestation_subnet_count as usize - } else { - let count = self - .short_lived_subscriptions - .keys() - .chain(self.long_lived_subscriptions.iter()) - .collect::>() - .len(); - count - } - } - - /// Returns whether we are subscribed to a subnet for testing purposes. - #[cfg(test)] - pub(crate) fn is_subscribed( - &self, - subnet_id: &SubnetId, - subscription_kind: SubscriptionKind, - ) -> bool { - match subscription_kind { - SubscriptionKind::LongLived => self.long_lived_subscriptions.contains(subnet_id), - SubscriptionKind::ShortLived => self.short_lived_subscriptions.contains_key(subnet_id), - } - } - - #[cfg(test)] - pub(crate) fn long_lived_subscriptions(&self) -> &HashSet { - &self.long_lived_subscriptions - } - - /// Processes a list of validator subscriptions. - /// - /// This will: - /// - Register new validators as being known. - /// - Search for peers for required subnets. - /// - Request subscriptions for subnets on specific slots when required. - /// - Build the timeouts for each of these events. - /// - /// This returns a result simply for the ergonomics of using ?. The result can be - /// safely dropped. - pub fn validator_subscriptions( - &mut self, - subscriptions: impl Iterator, - ) -> Result<(), String> { - // If the node is in a proposer-only state, we ignore all subnet subscriptions. - if self.proposer_only { - return Ok(()); - } - - // Maps each subnet_id subscription to it's highest slot - let mut subnets_to_discover: HashMap = HashMap::new(); - - // Registers the validator with the attestation service. 
- for subscription in subscriptions { - metrics::inc_counter(&metrics::SUBNET_SUBSCRIPTION_REQUESTS); - - trace!(?subscription, "Validator subscription"); - - // Compute the subnet that is associated with this subscription - let subnet_id = match SubnetId::compute_subnet::( - subscription.slot, - subscription.attestation_committee_index, - subscription.committee_count_at_slot, - &self.beacon_chain.spec, - ) { - Ok(subnet_id) => subnet_id, - Err(e) => { - warn!( - error = ?e, - "Failed to compute subnet id for validator subscription" - ); - continue; - } - }; - // Ensure each subnet_id inserted into the map has the highest slot as it's value. - // Higher slot corresponds to higher min_ttl in the `SubnetDiscovery` entry. - if let Some(slot) = subnets_to_discover.get(&subnet_id) { - if subscription.slot > *slot { - subnets_to_discover.insert(subnet_id, subscription.slot); - } - } else if !self.discovery_disabled { - subnets_to_discover.insert(subnet_id, subscription.slot); - } - - let exact_subnet = ExactSubnet { - subnet_id, - slot: subscription.slot, - }; - - // Determine if the validator is an aggregator. If so, we subscribe to the subnet and - // if successful add the validator to a mapping of known aggregators for that exact - // subnet. - - if subscription.is_aggregator { - metrics::inc_counter(&metrics::SUBNET_SUBSCRIPTION_AGGREGATOR_REQUESTS); - if let Err(e) = self.subscribe_to_short_lived_subnet(exact_subnet) { - warn!(error = e, "Subscription to subnet error"); - } else { - trace!(?exact_subnet, "Subscribed to subnet for aggregator duties"); - } - } - } - - // If the discovery mechanism isn't disabled, attempt to set up a peer discovery for the - // required subnets. - if !self.discovery_disabled { - if let Err(e) = self.discover_peers_request( - subnets_to_discover - .into_iter() - .map(|(subnet_id, slot)| ExactSubnet { subnet_id, slot }), - ) { - warn!(error = e, "Discovery lookup request error"); - }; - } - - Ok(()) - } - - fn recompute_long_lived_subnets(&mut self) { - // Ensure the next computation is scheduled even if assigning subnets fails. - let next_subscription_event = self - .recompute_long_lived_subnets_inner() - .unwrap_or_else(|_| self.beacon_chain.slot_clock.slot_duration()); - - debug!("Recomputing deterministic long lived subnets"); - self.next_long_lived_subscription_event = - Box::pin(tokio::time::sleep(next_subscription_event)); - - if let Some(waker) = self.waker.as_ref() { - waker.wake_by_ref(); - } - } - - /// Gets the long lived subnets the node should be subscribed to during the current epoch and - /// the remaining duration for which they remain valid. 
- fn recompute_long_lived_subnets_inner(&mut self) -> Result { - let current_epoch = self.beacon_chain.epoch().map_err(|e| { - if !self - .beacon_chain - .slot_clock - .is_prior_to_genesis() - .unwrap_or(false) - { - error!(err = ?e,"Failed to get the current epoch from clock") - } - })?; - - let (subnets, next_subscription_epoch) = SubnetId::compute_subnets_for_epoch::( - self.node_id.raw(), - current_epoch, - &self.beacon_chain.spec, - ) - .map_err(|e| error!(err = e, "Could not compute subnets for current epoch"))?; - - let next_subscription_slot = - next_subscription_epoch.start_slot(T::EthSpec::slots_per_epoch()); - let next_subscription_event = self - .beacon_chain - .slot_clock - .duration_to_slot(next_subscription_slot) - .ok_or_else(|| { - error!("Failed to compute duration to next to long lived subscription event") - })?; - - self.update_long_lived_subnets(subnets.collect()); - - Ok(next_subscription_event) - } - - /// Updates the long lived subnets. - /// - /// New subnets are registered as subscribed, removed subnets as unsubscribed and the Enr - /// updated accordingly. - fn update_long_lived_subnets(&mut self, mut subnets: HashSet) { - info!(subnets = ?subnets.iter().collect::>(),"Subscribing to long-lived subnets"); - for subnet in &subnets { - // Add the events for those subnets that are new as long lived subscriptions. - if !self.long_lived_subscriptions.contains(subnet) { - // Check if this subnet is new and send the subscription event if needed. - if !self.short_lived_subscriptions.contains_key(subnet) { - debug!( - ?subnet, - subscription_kind = ?SubscriptionKind::LongLived, - "Subscribing to subnet" - ); - self.queue_event(SubnetServiceMessage::Subscribe(Subnet::Attestation( - *subnet, - ))); - } - self.queue_event(SubnetServiceMessage::EnrAdd(Subnet::Attestation(*subnet))); - if !self.discovery_disabled { - self.queue_event(SubnetServiceMessage::DiscoverPeers(vec![SubnetDiscovery { - subnet: Subnet::Attestation(*subnet), - min_ttl: None, - }])) - } - } - } - - // Update the long_lived_subnets set and check for subnets that are being removed - std::mem::swap(&mut self.long_lived_subscriptions, &mut subnets); - for subnet in subnets { - if !self.long_lived_subscriptions.contains(&subnet) { - self.handle_removed_subnet(subnet, SubscriptionKind::LongLived); - } - } - } - - /// Checks if we have subscribed aggregate validators for the subnet. If not, checks the gossip - /// verification, re-propagates and returns false. - pub fn should_process_attestation( - &self, - subnet: SubnetId, - attestation: &Attestation, - ) -> bool { - // Proposer-only mode does not need to process attestations - if self.proposer_only { - return false; - } - self.aggregate_validators_on_subnet - .as_ref() - .map(|tracked_vals| { - tracked_vals.contains_key(&ExactSubnet { - subnet_id: subnet, - slot: attestation.data().slot, - }) - }) - .unwrap_or(true) - } - - /* Internal private functions */ - - /// Adds an event to the event queue and notifies that this service is ready to be polled - /// again. - fn queue_event(&mut self, ev: SubnetServiceMessage) { - self.events.push_back(ev); - if let Some(waker) = &self.waker { - waker.wake_by_ref() - } - } - /// Checks if there are currently queued discovery requests and the time required to make the - /// request. - /// - /// If there is sufficient time, queues a peer discovery request for all the required subnets. 
- fn discover_peers_request( - &mut self, - exact_subnets: impl Iterator, - ) -> Result<(), &'static str> { - let current_slot = self - .beacon_chain - .slot_clock - .now() - .ok_or("Could not get the current slot")?; - - let discovery_subnets: Vec = exact_subnets - .filter_map(|exact_subnet| { - // Check if there is enough time to perform a discovery lookup. - if exact_subnet.slot - >= current_slot.saturating_add(MIN_PEER_DISCOVERY_SLOT_LOOK_AHEAD) - { - // Send out an event to start looking for peers. - // Require the peer for an additional slot to ensure we keep the peer for the - // duration of the subscription. - let min_ttl = self - .beacon_chain - .slot_clock - .duration_to_slot(exact_subnet.slot + 1) - .map(|duration| std::time::Instant::now() + duration); - Some(SubnetDiscovery { - subnet: Subnet::Attestation(exact_subnet.subnet_id), - min_ttl, - }) - } else { - // We may want to check the global PeerInfo to see estimated timeouts for each - // peer before they can be removed. - warn!( - subnet_id = ?exact_subnet, - "Not enough time for a discovery search" - ); - None - } - }) - .collect(); - - if !discovery_subnets.is_empty() { - self.queue_event(SubnetServiceMessage::DiscoverPeers(discovery_subnets)); - } - Ok(()) - } - - // Subscribes to the subnet if it should be done immediately, or schedules it if required. - fn subscribe_to_short_lived_subnet( - &mut self, - ExactSubnet { subnet_id, slot }: ExactSubnet, - ) -> Result<(), &'static str> { - let slot_duration = self.beacon_chain.slot_clock.slot_duration(); - - // The short time we schedule the subscription before it's actually required. This - // ensures we are subscribed on time, and allows consecutive subscriptions to the same - // subnet to overlap, reducing subnet churn. - let advance_subscription_duration = slot_duration / ADVANCE_SUBSCRIBE_SLOT_FRACTION; - // The time to the required slot. - let time_to_subscription_slot = self - .beacon_chain - .slot_clock - .duration_to_slot(slot) - .unwrap_or_default(); // If this is a past slot we will just get a 0 duration. - - // Calculate how long before we need to subscribe to the subnet. - let time_to_subscription_start = - time_to_subscription_slot.saturating_sub(advance_subscription_duration); - - // The time after a duty slot where we no longer need it in the `aggregate_validators_on_subnet` - // delay map. - let time_to_unsubscribe = - time_to_subscription_slot + UNSUBSCRIBE_AFTER_AGGREGATOR_DUTY * slot_duration; - if let Some(tracked_vals) = self.aggregate_validators_on_subnet.as_mut() { - tracked_vals.insert_at(ExactSubnet { subnet_id, slot }, time_to_unsubscribe); - } - - // If the subscription should be done in the future, schedule it. Otherwise subscribe - // immediately. - if time_to_subscription_start.is_zero() { - // This is a current or past slot, we subscribe immediately. - self.subscribe_to_short_lived_subnet_immediately(subnet_id, slot + 1)?; - } else { - // This is a future slot, schedule subscribing. - trace!(subnet = ?subnet_id, ?time_to_subscription_start,"Scheduling subnet subscription"); - self.scheduled_short_lived_subscriptions - .insert_at(ExactSubnet { subnet_id, slot }, time_to_subscription_start); - } - - Ok(()) - } - - /* A collection of functions that handle the various timeouts */ - - /// Registers a subnet as subscribed. - /// - /// Checks that the time in which the subscription would end is not in the past. If we are - /// already subscribed, extends the timeout if necessary. 
If this is a new subscription, we send - /// out the appropriate events. - /// - /// On determinist long lived subnets, this is only used for short lived subscriptions. - fn subscribe_to_short_lived_subnet_immediately( - &mut self, - subnet_id: SubnetId, - end_slot: Slot, - ) -> Result<(), &'static str> { - if self.subscribe_all_subnets { - // Case not handled by this service. - return Ok(()); - } - - let time_to_subscription_end = self - .beacon_chain - .slot_clock - .duration_to_slot(end_slot) - .unwrap_or_default(); - - // First check this is worth doing. - if time_to_subscription_end.is_zero() { - return Err("Time when subscription would end has already passed."); - } - - let subscription_kind = SubscriptionKind::ShortLived; - - // We need to check and add a subscription for the right kind, regardless of the presence - // of the subnet as a subscription of the other kind. This is mainly since long lived - // subscriptions can be removed at any time when a validator goes offline. - - let (subscriptions, already_subscribed_as_other_kind) = ( - &mut self.short_lived_subscriptions, - self.long_lived_subscriptions.contains(&subnet_id), - ); - - match subscriptions.get(&subnet_id) { - Some(current_end_slot) => { - // We are already subscribed. Check if we need to extend the subscription. - if &end_slot > current_end_slot { - trace!( - subnet = ?subnet_id, - prev_end_slot = %current_end_slot, - new_end_slot = %end_slot, - ?subscription_kind, - "Extending subscription to subnet" - ); - subscriptions.insert_at(subnet_id, end_slot, time_to_subscription_end); - } - } - None => { - // This is a new subscription. Add with the corresponding timeout and send the - // notification. - subscriptions.insert_at(subnet_id, end_slot, time_to_subscription_end); - - // Inform of the subscription. - if !already_subscribed_as_other_kind { - debug!( - subnet = ?subnet_id, - %end_slot, - ?subscription_kind, - "Subscribing to subnet" - ); - self.queue_event(SubnetServiceMessage::Subscribe(Subnet::Attestation( - subnet_id, - ))); - } - } - } - - Ok(()) - } - - // Unsubscribes from a subnet that was removed if it does not continue to exist as a - // subscription of the other kind. For long lived subscriptions, it also removes the - // advertisement from our ENR. - fn handle_removed_subnet(&mut self, subnet_id: SubnetId, subscription_kind: SubscriptionKind) { - let exists_in_other_subscriptions = match subscription_kind { - SubscriptionKind::LongLived => self.short_lived_subscriptions.contains_key(&subnet_id), - SubscriptionKind::ShortLived => self.long_lived_subscriptions.contains(&subnet_id), - }; - - if !exists_in_other_subscriptions { - // Subscription no longer exists as short lived or long lived. - debug!( - subnet = ?subnet_id, - ?subscription_kind, - "Unsubscribing from subnet" - ); - self.queue_event(SubnetServiceMessage::Unsubscribe(Subnet::Attestation( - subnet_id, - ))); - } - - if subscription_kind == SubscriptionKind::LongLived { - // Remove from our ENR even if we remain subscribed in other way. - self.queue_event(SubnetServiceMessage::EnrRemove(Subnet::Attestation( - subnet_id, - ))); - } - } -} - -impl Stream for AttestationService { - type Item = SubnetServiceMessage; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - // Update the waker if needed. - if let Some(waker) = &self.waker { - if !waker.will_wake(cx.waker()) { - self.waker = Some(cx.waker().clone()); - } - } else { - self.waker = Some(cx.waker().clone()); - } - - // Send out any generated events. 
- if let Some(event) = self.events.pop_front() { - return Poll::Ready(Some(event)); - } - - // If we aren't subscribed to all subnets, handle the deterministic long-lived subnets - if !self.subscribe_all_subnets { - match self.next_long_lived_subscription_event.as_mut().poll(cx) { - Poll::Ready(_) => { - self.recompute_long_lived_subnets(); - // We re-wake the task as there could be other subscriptions to process - self.waker - .as_ref() - .expect("Waker has been set") - .wake_by_ref(); - } - Poll::Pending => {} - } - } - - // Process scheduled subscriptions that might be ready, since those can extend a soon to - // expire subscription. - match self.scheduled_short_lived_subscriptions.poll_next_unpin(cx) { - Poll::Ready(Some(Ok(ExactSubnet { subnet_id, slot }))) => { - if let Err(e) = - self.subscribe_to_short_lived_subnet_immediately(subnet_id, slot + 1) - { - debug!(subnet = ?subnet_id, err = e,"Failed to subscribe to short lived subnet"); - } - self.waker - .as_ref() - .expect("Waker has been set") - .wake_by_ref(); - } - Poll::Ready(Some(Err(e))) => { - error!( - error = e, - "Failed to check for scheduled subnet subscriptions" - ); - } - Poll::Ready(None) | Poll::Pending => {} - } - - // Finally process any expired subscriptions. - match self.short_lived_subscriptions.poll_next_unpin(cx) { - Poll::Ready(Some(Ok((subnet_id, _end_slot)))) => { - self.handle_removed_subnet(subnet_id, SubscriptionKind::ShortLived); - // We re-wake the task as there could be other subscriptions to process - self.waker - .as_ref() - .expect("Waker has been set") - .wake_by_ref(); - } - Poll::Ready(Some(Err(e))) => { - error!(error = e, "Failed to check for subnet unsubscription times"); - } - Poll::Ready(None) | Poll::Pending => {} - } - - // Poll to remove entries on expiration, no need to act on expiration events. - if let Some(tracked_vals) = self.aggregate_validators_on_subnet.as_mut() { - if let Poll::Ready(Some(Err(e))) = tracked_vals.poll_next_unpin(cx) { - error!( - error = e, - "Failed to check for aggregate validator on subnet expirations" - ); - } - } - - Poll::Pending - } -} diff --git a/beacon_node/network/src/subnet_service/sync_subnets.rs b/beacon_node/network/src/subnet_service/sync_subnets.rs deleted file mode 100644 index 6b3834e1958..00000000000 --- a/beacon_node/network/src/subnet_service/sync_subnets.rs +++ /dev/null @@ -1,345 +0,0 @@ -//! This service keeps track of which sync committee subnet the beacon node should be subscribed to at any -//! given time. It schedules subscriptions to sync committee subnets and requests peer discoveries. - -use std::collections::{hash_map::Entry, HashMap, VecDeque}; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; -use std::time::Duration; - -use futures::prelude::*; -use tracing::{debug, error, trace, warn}; - -use super::SubnetServiceMessage; -use beacon_chain::{BeaconChain, BeaconChainTypes}; -use delay_map::HashSetDelay; -use lighthouse_network::{NetworkConfig, Subnet, SubnetDiscovery}; -use slot_clock::SlotClock; -use types::{Epoch, EthSpec, SyncCommitteeSubscription, SyncSubnetId}; - -use crate::metrics; - -/// The minimum number of slots ahead that we attempt to discover peers for a subscription. If the -/// slot is less than this number, skip the peer discovery process. -/// Subnet discovery query takes at most 30 secs, 2 slots take 24s. -const MIN_PEER_DISCOVERY_SLOT_LOOK_AHEAD: u64 = 2; - -/// A particular subnet at a given slot. 
-#[derive(PartialEq, Eq, Hash, Clone, Debug)] -pub struct ExactSubnet { - /// The `SyncSubnetId` associated with this subnet. - pub subnet_id: SyncSubnetId, - /// The epoch until which we need to stay subscribed to the subnet. - pub until_epoch: Epoch, -} -pub struct SyncCommitteeService { - /// Queued events to return to the driving service. - events: VecDeque, - - /// A reference to the beacon chain to process received attestations. - pub(crate) beacon_chain: Arc>, - - /// The collection of all currently subscribed subnets. - subscriptions: HashMap, - - /// A collection of timeouts for when to unsubscribe from a subnet. - unsubscriptions: HashSetDelay, - - /// The waker for the current thread. - waker: Option, - - /// The discovery mechanism of lighthouse is disabled. - discovery_disabled: bool, - - /// We are always subscribed to all subnets. - subscribe_all_subnets: bool, - - /// Whether this node is a block proposer-only node. - proposer_only: bool, -} - -impl SyncCommitteeService { - /* Public functions */ - - pub fn new(beacon_chain: Arc>, config: &NetworkConfig) -> Self { - let spec = &beacon_chain.spec; - let epoch_duration_secs = - beacon_chain.slot_clock.slot_duration().as_secs() * T::EthSpec::slots_per_epoch(); - let default_timeout = - epoch_duration_secs.saturating_mul(spec.epochs_per_sync_committee_period.as_u64()); - - SyncCommitteeService { - events: VecDeque::with_capacity(10), - beacon_chain, - subscriptions: HashMap::new(), - unsubscriptions: HashSetDelay::new(Duration::from_secs(default_timeout)), - waker: None, - subscribe_all_subnets: config.subscribe_all_subnets, - discovery_disabled: config.disable_discovery, - proposer_only: config.proposer_only, - } - } - - /// Return count of all currently subscribed subnets. - #[cfg(test)] - pub fn subscription_count(&self) -> usize { - use types::consts::altair::SYNC_COMMITTEE_SUBNET_COUNT; - if self.subscribe_all_subnets { - SYNC_COMMITTEE_SUBNET_COUNT as usize - } else { - self.subscriptions.len() - } - } - - /// Processes a list of sync committee subscriptions. - /// - /// This will: - /// - Search for peers for required subnets. - /// - Request subscriptions required subnets. - /// - Build the timeouts for each of these events. - /// - /// This returns a result simply for the ergonomics of using ?. The result can be - /// safely dropped. - pub fn validator_subscriptions( - &mut self, - subscriptions: Vec, - ) -> Result<(), String> { - // A proposer-only node does not subscribe to any sync-committees - if self.proposer_only { - return Ok(()); - } - - let mut subnets_to_discover = Vec::new(); - for subscription in subscriptions { - metrics::inc_counter(&metrics::SYNC_COMMITTEE_SUBSCRIPTION_REQUESTS); - //NOTE: We assume all subscriptions have been verified before reaching this service - - // Registers the validator with the subnet service. - // This will subscribe to long-lived random subnets if required. 
- trace!(?subscription, "Sync committee subscription"); - - let subnet_ids = match SyncSubnetId::compute_subnets_for_sync_committee::( - &subscription.sync_committee_indices, - ) { - Ok(subnet_ids) => subnet_ids, - Err(e) => { - warn!( - error = ?e, - validator_index = subscription.validator_index, - "Failed to compute subnet id for sync committee subscription" - ); - continue; - } - }; - - for subnet_id in subnet_ids { - let exact_subnet = ExactSubnet { - subnet_id, - until_epoch: subscription.until_epoch, - }; - subnets_to_discover.push(exact_subnet.clone()); - if let Err(e) = self.subscribe_to_subnet(exact_subnet.clone()) { - warn!( - error = e, - validator_index = subscription.validator_index, - "Subscription to sync subnet error" - ); - } else { - trace!( - ?exact_subnet, - validator_index = subscription.validator_index, - "Subscribed to subnet for sync committee duties" - ); - } - } - } - // If the discovery mechanism isn't disabled, attempt to set up a peer discovery for the - // required subnets. - if !self.discovery_disabled { - if let Err(e) = self.discover_peers_request(subnets_to_discover.iter()) { - warn!(error = e, "Discovery lookup request error"); - }; - } - - // pre-emptively wake the thread to check for new events - if let Some(waker) = &self.waker { - waker.wake_by_ref(); - } - Ok(()) - } - - /* Internal private functions */ - - /// Checks if there are currently queued discovery requests and the time required to make the - /// request. - /// - /// If there is sufficient time, queues a peer discovery request for all the required subnets. - fn discover_peers_request<'a>( - &mut self, - exact_subnets: impl Iterator, - ) -> Result<(), &'static str> { - let current_slot = self - .beacon_chain - .slot_clock - .now() - .ok_or("Could not get the current slot")?; - - let slots_per_epoch = T::EthSpec::slots_per_epoch(); - - let discovery_subnets: Vec = exact_subnets - .filter_map(|exact_subnet| { - let until_slot = exact_subnet.until_epoch.end_slot(slots_per_epoch); - // check if there is enough time to perform a discovery lookup - if until_slot >= current_slot.saturating_add(MIN_PEER_DISCOVERY_SLOT_LOOK_AHEAD) { - // if the slot is more than epoch away, add an event to start looking for peers - // add one slot to ensure we keep the peer for the subscription slot - let min_ttl = self - .beacon_chain - .slot_clock - .duration_to_slot(until_slot + 1) - .map(|duration| std::time::Instant::now() + duration); - Some(SubnetDiscovery { - subnet: Subnet::SyncCommittee(exact_subnet.subnet_id), - min_ttl, - }) - } else { - // We may want to check the global PeerInfo to see estimated timeouts for each - // peer before they can be removed. - warn!( - subnet_id = ?exact_subnet, - "Not enough time for a discovery search" - ); - None - } - }) - .collect(); - - if !discovery_subnets.is_empty() { - self.events - .push_back(SubnetServiceMessage::DiscoverPeers(discovery_subnets)); - } - Ok(()) - } - - /// Adds a subscription event and an associated unsubscription event if required. - fn subscribe_to_subnet(&mut self, exact_subnet: ExactSubnet) -> Result<(), &'static str> { - // Return if we have subscribed to all subnets - if self.subscribe_all_subnets { - return Ok(()); - } - - // Return if we already have a subscription for exact_subnet - if self.subscriptions.get(&exact_subnet.subnet_id) == Some(&exact_subnet.until_epoch) { - return Ok(()); - } - - // Return if we already have subscription set to expire later than the current request. 
- if let Some(until_epoch) = self.subscriptions.get(&exact_subnet.subnet_id) { - if *until_epoch >= exact_subnet.until_epoch { - return Ok(()); - } - } - - // initialise timing variables - let current_slot = self - .beacon_chain - .slot_clock - .now() - .ok_or("Could not get the current slot")?; - - let slots_per_epoch = T::EthSpec::slots_per_epoch(); - let until_slot = exact_subnet.until_epoch.end_slot(slots_per_epoch); - // Calculate the duration to the unsubscription event. - let expected_end_subscription_duration = if current_slot >= until_slot { - warn!( - %current_slot, - ?exact_subnet, - "Sync committee subscription is past expiration" - ); - return Ok(()); - } else { - let slot_duration = self.beacon_chain.slot_clock.slot_duration(); - - // the duration until we no longer need this subscription. We assume a single slot is - // sufficient. - self.beacon_chain - .slot_clock - .duration_to_slot(until_slot) - .ok_or("Unable to determine duration to unsubscription slot")? - + slot_duration - }; - - if let Entry::Vacant(e) = self.subscriptions.entry(exact_subnet.subnet_id) { - // We are not currently subscribed and have no waiting subscription, create one - debug!(subnet = *exact_subnet.subnet_id, until_epoch = ?exact_subnet.until_epoch, "Subscribing to subnet"); - e.insert(exact_subnet.until_epoch); - self.events - .push_back(SubnetServiceMessage::Subscribe(Subnet::SyncCommittee( - exact_subnet.subnet_id, - ))); - - // add the subnet to the ENR bitfield - self.events - .push_back(SubnetServiceMessage::EnrAdd(Subnet::SyncCommittee( - exact_subnet.subnet_id, - ))); - - // add an unsubscription event to remove ourselves from the subnet once completed - self.unsubscriptions - .insert_at(exact_subnet.subnet_id, expected_end_subscription_duration); - } else { - // We are already subscribed, extend the unsubscription duration - self.unsubscriptions - .update_timeout(&exact_subnet.subnet_id, expected_end_subscription_duration); - } - - Ok(()) - } - - /// A queued unsubscription is ready. 
-    fn handle_unsubscriptions(&mut self, subnet_id: SyncSubnetId) {
-        debug!(subnet = *subnet_id, "Unsubscribing from subnet");
-
-        self.subscriptions.remove(&subnet_id);
-        self.events
-            .push_back(SubnetServiceMessage::Unsubscribe(Subnet::SyncCommittee(
-                subnet_id,
-            )));
-
-        self.events
-            .push_back(SubnetServiceMessage::EnrRemove(Subnet::SyncCommittee(
-                subnet_id,
-            )));
-    }
-}
-
-impl<T: BeaconChainTypes> Stream for SyncCommitteeService<T> {
-    type Item = SubnetServiceMessage;
-
-    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        // update the waker if needed
-        if let Some(waker) = &self.waker {
-            if !waker.will_wake(cx.waker()) {
-                self.waker = Some(cx.waker().clone());
-            }
-        } else {
-            self.waker = Some(cx.waker().clone());
-        }
-
-        // process any un-subscription events
-        match self.unsubscriptions.poll_next_unpin(cx) {
-            Poll::Ready(Some(Ok(exact_subnet))) => self.handle_unsubscriptions(exact_subnet),
-            Poll::Ready(Some(Err(e))) => {
-                error!(error = e, "Failed to check for subnet unsubscription times");
-            }
-            Poll::Ready(None) | Poll::Pending => {}
-        }
-
-        // process any generated events
-        if let Some(event) = self.events.pop_front() {
-            return Poll::Ready(Some(event));
-        }
-
-        Poll::Pending
-    }
-}

From 3f8998f11fecc864882e98142b84b67be74e1572 Mon Sep 17 00:00:00 2001
From: Lion - dapplion <35266934+dapplion@users.noreply.github.com>
Date: Thu, 25 Sep 2025 05:52:27 +0200
Subject: [PATCH 45/49] Only mark block lookups as pending if block is
 importing from gossip (#8112)

- PR https://github.com/sigp/lighthouse/pull/8045 introduced a regression in
  how lookup sync interacts with the da_checker. Now on `unstable`, block
  import from the HTTP API also inserts the block into the da_checker while
  the block is being execution verified. If lookup sync finds the block in the
  da_checker in `NotValidated` state it expects a `GossipBlockProcessResult`
  message sometime later. That message is only sent after block import via
  gossip. I confirmed in our node's logs that 4/4 cases of stuck lookups were
  caused by this sequence of events:
  - Receive block through the API, insert into da_checker in fn process_block
    via put_pre_execution_block
  - Create lookup and leave it in AwaitingDownload(block in processing cache)
    state
  - Block from the HTTP API finishes importing
  - Lookup is left stuck

Closes https://github.com/sigp/lighthouse/issues/8104

- https://github.com/sigp/lighthouse/pull/8110 was my initial solution
  attempt, but we can't send the `GossipBlockProcessResult` event from the
  `http_api` crate without adding new channels, which seems messy. For a given
  node it's rare that a lookup is created at the same time that a block is
  being published. This PR solves https://github.com/sigp/lighthouse/issues/8104
  by allowing lookup sync to import the block twice in that case.
Co-Authored-By: dapplion <35266934+dapplion@users.noreply.github.com> --- .../beacon_chain/src/beacon_block_streamer.rs | 2 +- beacon_node/beacon_chain/src/beacon_chain.rs | 9 +++-- .../src/data_availability_checker.rs | 7 ++-- .../overflow_lru_cache.rs | 37 +++++++++++++------ .../sync/block_lookups/single_block_lookup.rs | 2 +- .../network/src/sync/network_context.rs | 34 +++++++++++------ beacon_node/network/src/sync/tests/lookups.rs | 6 +-- consensus/types/src/beacon_block.rs | 1 + 8 files changed, 64 insertions(+), 34 deletions(-) diff --git a/beacon_node/beacon_chain/src/beacon_block_streamer.rs b/beacon_node/beacon_chain/src/beacon_block_streamer.rs index d4ce38927b2..c816a0b29f3 100644 --- a/beacon_node/beacon_chain/src/beacon_block_streamer.rs +++ b/beacon_node/beacon_chain/src/beacon_block_streamer.rs @@ -404,7 +404,7 @@ impl BeaconBlockStreamer { if self.check_caches == CheckCaches::Yes { match self.beacon_chain.get_block_process_status(&root) { BlockProcessStatus::Unknown => None, - BlockProcessStatus::NotValidated(block) + BlockProcessStatus::NotValidated(block, _) | BlockProcessStatus::ExecutionValidated(block) => { metrics::inc_counter(&metrics::BEACON_REQRESP_PRE_IMPORT_CACHE_HITS); Some(block) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index 4f0c6aada0a..08e0d1c6745 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -334,7 +334,7 @@ pub enum BlockProcessStatus { /// Block is not in any pre-import cache. Block may be in the data-base or in the fork-choice. Unknown, /// Block is currently processing but not yet validated. - NotValidated(Arc>), + NotValidated(Arc>, BlockImportSource), /// Block is fully valid, but not yet imported. It's cached in the da_checker while awaiting /// missing block components. ExecutionValidated(Arc>), @@ -3351,8 +3351,11 @@ impl BeaconChain { ); } - self.data_availability_checker - .put_pre_execution_block(block_root, unverified_block.block_cloned())?; + self.data_availability_checker.put_pre_execution_block( + block_root, + unverified_block.block_cloned(), + block_source, + )?; // Start the Prometheus timer. let _full_timer = metrics::start_timer(&metrics::BLOCK_PROCESSING_TIMES); diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index a0ad1c2112d..43b7d8f7ea3 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -21,8 +21,8 @@ use task_executor::TaskExecutor; use tracing::{debug, error, instrument}; use types::blob_sidecar::{BlobIdentifier, BlobSidecar, FixedBlobSidecarList}; use types::{ - BlobSidecarList, ChainSpec, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, Hash256, - SignedBeaconBlock, Slot, + BlobSidecarList, BlockImportSource, ChainSpec, DataColumnSidecar, DataColumnSidecarList, Epoch, + EthSpec, Hash256, SignedBeaconBlock, Slot, }; mod error; @@ -354,9 +354,10 @@ impl DataAvailabilityChecker { &self, block_root: Hash256, block: Arc>, + source: BlockImportSource, ) -> Result<(), Error> { self.availability_cache - .put_pre_execution_block(block_root, block) + .put_pre_execution_block(block_root, block, source) } /// Removes a pre-execution block from the cache. 
diff --git a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs index bb440096627..42f6dbd8567 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs @@ -19,13 +19,14 @@ use tracing::{Span, debug, debug_span}; use types::beacon_block_body::KzgCommitments; use types::blob_sidecar::BlobIdentifier; use types::{ - BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, - Hash256, RuntimeFixedVector, RuntimeVariableList, SignedBeaconBlock, + BlobSidecar, BlockImportSource, ChainSpec, ColumnIndex, DataColumnSidecar, + DataColumnSidecarList, Epoch, EthSpec, Hash256, RuntimeFixedVector, RuntimeVariableList, + SignedBeaconBlock, }; #[derive(Clone)] pub enum CachedBlock { - PreExecution(Arc>), + PreExecution(Arc>, BlockImportSource), Executed(Box>), } @@ -42,7 +43,7 @@ impl CachedBlock { fn as_block(&self) -> &SignedBeaconBlock { match self { - CachedBlock::PreExecution(b) => b, + CachedBlock::PreExecution(b, _) => b, CachedBlock::Executed(b) => b.as_block(), } } @@ -135,9 +136,13 @@ impl PendingComponents { /// Inserts a pre-execution block into the cache. /// This does NOT override an existing executed block. - pub fn insert_pre_execution_block(&mut self, block: Arc>) { + pub fn insert_pre_execution_block( + &mut self, + block: Arc>, + source: BlockImportSource, + ) { if self.block.is_none() { - self.block = Some(CachedBlock::PreExecution(block)) + self.block = Some(CachedBlock::PreExecution(block, source)) } } @@ -433,7 +438,9 @@ impl DataAvailabilityCheckerInner { .peek(block_root) .and_then(|pending_components| { pending_components.block.as_ref().map(|block| match block { - CachedBlock::PreExecution(b) => BlockProcessStatus::NotValidated(b.clone()), + CachedBlock::PreExecution(b, source) => { + BlockProcessStatus::NotValidated(b.clone(), *source) + } CachedBlock::Executed(b) => { BlockProcessStatus::ExecutionValidated(b.block_cloned()) } @@ -693,11 +700,12 @@ impl DataAvailabilityCheckerInner { &self, block_root: Hash256, block: Arc>, + source: BlockImportSource, ) -> Result<(), AvailabilityCheckError> { let epoch = block.epoch(); let pending_components = self.update_or_insert_pending_components(block_root, epoch, |pending_components| { - pending_components.insert_pre_execution_block(block); + pending_components.insert_pre_execution_block(block, source); Ok(()) })?; @@ -718,7 +726,7 @@ impl DataAvailabilityCheckerInner { /// This does NOT remove an existing executed block. pub fn remove_pre_execution_block(&self, block_root: &Hash256) { // The read lock is immediately dropped so we can safely remove the block from the cache. 
- if let Some(BlockProcessStatus::NotValidated(_)) = self.get_cached_block(block_root) { + if let Some(BlockProcessStatus::NotValidated(_, _)) = self.get_cached_block(block_root) { self.critical.write().pop(block_root); } } @@ -1459,9 +1467,13 @@ mod pending_components_tests { let mut pending_component = >::empty(block_root, max_len); let pre_execution_block = Arc::new(pre_execution_block); - pending_component.insert_pre_execution_block(pre_execution_block.clone()); + pending_component + .insert_pre_execution_block(pre_execution_block.clone(), BlockImportSource::Gossip); assert!( - matches!(pending_component.block, Some(CachedBlock::PreExecution(_))), + matches!( + pending_component.block, + Some(CachedBlock::PreExecution(_, _)) + ), "pre execution block inserted" ); @@ -1471,7 +1483,8 @@ mod pending_components_tests { "executed block inserted" ); - pending_component.insert_pre_execution_block(pre_execution_block); + pending_component + .insert_pre_execution_block(pre_execution_block, BlockImportSource::Gossip); assert!( matches!(pending_component.block, Some(CachedBlock::Executed(_))), "executed block should remain" diff --git a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs index 36509d2563e..8fb3248a871 100644 --- a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs +++ b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs @@ -219,7 +219,7 @@ impl SingleBlockLookup { // can assert that this is the correct value of `blob_kzg_commitments_count`. match cx.chain.get_block_process_status(&self.block_root) { BlockProcessStatus::Unknown => None, - BlockProcessStatus::NotValidated(block) + BlockProcessStatus::NotValidated(block, _) | BlockProcessStatus::ExecutionValidated(block) => Some(block.clone()), } }) { diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 1aa3813284b..20b927724ec 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -52,8 +52,8 @@ use tokio::sync::mpsc; use tracing::{Span, debug, debug_span, error, warn}; use types::blob_sidecar::FixedBlobSidecarList; use types::{ - BlobSidecar, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, DataColumnSubnetId, - EthSpec, ForkContext, Hash256, SignedBeaconBlock, Slot, + BlobSidecar, BlockImportSource, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, + DataColumnSubnetId, Epoch, EthSpec, ForkContext, Hash256, SignedBeaconBlock, Slot, }; pub mod custody; @@ -975,14 +975,26 @@ impl SyncNetworkContext { match self.chain.get_block_process_status(&block_root) { // Unknown block, continue request to download BlockProcessStatus::Unknown => {} - // Block is known are currently processing, expect a future event with the result of - // processing. - BlockProcessStatus::NotValidated { .. } => { - // Lookup sync event safety: If the block is currently in the processing cache, we - // are guaranteed to receive a `SyncMessage::GossipBlockProcessResult` that will - // make progress on this lookup - return Ok(LookupRequestResult::Pending("block in processing cache")); - } + // Block is known and currently processing. Imports from gossip and HTTP API insert the + // block in the da_cache. However, HTTP API is unable to notify sync when it completes + // block import. Returning `Pending` here will result in stuck lookups if the block is + // importing from sync. 
+ BlockProcessStatus::NotValidated(_, source) => match source { + BlockImportSource::Gossip => { + // Lookup sync event safety: If the block is currently in the processing cache, we + // are guaranteed to receive a `SyncMessage::GossipBlockProcessResult` that will + // make progress on this lookup + return Ok(LookupRequestResult::Pending("block in processing cache")); + } + BlockImportSource::Lookup + | BlockImportSource::RangeSync + | BlockImportSource::HttpApi => { + // Lookup, RangeSync or HttpApi block import don't emit the GossipBlockProcessResult + // event. If a lookup happens to be created during block import from one of + // those sources just import the block twice. Otherwise the lookup will get + // stuck. Double imports are fine, they just waste resources. + } + }, // Block is fully validated. If it's not yet imported it's waiting for missing block // components. Consider this request completed and do nothing. BlockProcessStatus::ExecutionValidated { .. } => { @@ -1478,7 +1490,7 @@ impl SyncNetworkContext { /// blocks and blobs. pub fn batch_type( &self, - epoch: types::Epoch, + epoch: Epoch, request_type: RangeRequestType, ) -> ByRangeRequestType { // Induces a compile time panic if this doesn't hold true. diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 27968a06351..fc641861754 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -41,8 +41,8 @@ use slot_clock::{SlotClock, TestingSlotClock}; use tokio::sync::mpsc; use tracing::info; use types::{ - BeaconState, BeaconStateBase, BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, ForkName, - Hash256, MinimalEthSpec as E, SignedBeaconBlock, Slot, + BeaconState, BeaconStateBase, BlobSidecar, BlockImportSource, DataColumnSidecar, EthSpec, + ForkContext, ForkName, Hash256, MinimalEthSpec as E, SignedBeaconBlock, Slot, data_column_sidecar::ColumnIndex, test_utils::{SeedableRng, TestRandom, XorShiftRng}, }; @@ -1113,7 +1113,7 @@ impl TestRig { self.harness .chain .data_availability_checker - .put_pre_execution_block(block.canonical_root(), block) + .put_pre_execution_block(block.canonical_root(), block, BlockImportSource::Gossip) .unwrap(); } diff --git a/consensus/types/src/beacon_block.rs b/consensus/types/src/beacon_block.rs index f4e4e369661..61c32dd4ac9 100644 --- a/consensus/types/src/beacon_block.rs +++ b/consensus/types/src/beacon_block.rs @@ -843,6 +843,7 @@ impl<'de, E: EthSpec, Payload: AbstractExecPayload> ContextDeserialize<'de, F } } +#[derive(Clone, Copy)] pub enum BlockImportSource { Gossip, Lookup, From 421e954c291fa78e6713776d1f1ef948231005dd Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Thu, 25 Sep 2025 16:05:45 -0700 Subject: [PATCH 46/49] Revert "Revert type change in UnexpectedRequestId" This reverts commit 6ea14016f3d164456bc4c3cae0355ab532fe1a86. 
--- .../network/src/sync/network_context/custody.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index eb34aae56c9..147948a20ee 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -5,7 +5,7 @@ use beacon_chain::BeaconChainTypes; use beacon_chain::validator_monitor::timestamp_now; use fnv::FnvHashMap; use lighthouse_network::PeerId; -use lighthouse_network::service::api_types::{CustodyId, DataColumnsByRootRequester}; +use lighthouse_network::service::api_types::{CustodyId, DataColumnsByRootRequester, Id}; use lighthouse_tracing::SPAN_OUTGOING_CUSTODY_REQUEST; use parking_lot::RwLock; use std::collections::HashSet; @@ -46,8 +46,8 @@ pub enum Error { /// There should only exist a single request at a time. Having multiple requests is a bug and /// can result in undefined state, so it's treated as a hard error and the lookup is dropped. UnexpectedRequestId { - expected_req_id: DataColumnsByRootRequestId, - req_id: DataColumnsByRootRequestId, + expected_req_id: Id, + req_id: Id, }, } @@ -424,8 +424,8 @@ impl ColumnRequest { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { return Err(Error::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, + expected_req_id: expected_req_id.id, + req_id: req_id.id, }); } self.status = Status::NotStarted(Instant::now()); @@ -457,8 +457,8 @@ impl ColumnRequest { Status::Downloading(expected_req_id) => { if req_id != *expected_req_id { return Err(Error::UnexpectedRequestId { - expected_req_id: *expected_req_id, - req_id, + expected_req_id: expected_req_id.id, + req_id: req_id.id, }); } self.status = Status::Downloaded(peer_id, data_column, seen_timestamp); From 826a06eb632cea424853facef5edb0151a6ef4e5 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Thu, 25 Sep 2025 16:21:55 -0700 Subject: [PATCH 47/49] Fix variant name --- beacon_node/network/src/sync/block_sidecar_coupling.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index fd221efc99d..9caf84be20d 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -53,7 +53,7 @@ enum RangeBlockDataRequest { NoData, Blobs(ByRangeRequest>>>), /// These are data columns fetched by a range request. 
- DataColumns { + DataColumnsFromRange { requests: HashMap< DataColumnsByRangeRequestId, ByRangeRequest>, From 5c562c6543353aea0e4c71cf496cc3f362d7d47e Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Mon, 6 Oct 2025 11:17:57 -0700 Subject: [PATCH 48/49] Fix some more issues --- .../network/src/sync/backfill_sync/mod.rs | 9 ---- .../src/sync/block_sidecar_coupling.rs | 6 ++- .../network/src/sync/range_sync/chain.rs | 43 ++++++++++--------- 3 files changed, 26 insertions(+), 32 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index c4bd55ff8e1..f92c666832b 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -558,19 +558,10 @@ impl BackFillSync { } }; - let Some(batch_peers) = batch.processing_peers() else { - self.fail_sync(BackFillError::BatchInvalidState( - batch_id, - String::from("Peer does not exist"), - ))?; - return Ok(ProcessResult::Successful); - }; - debug!( ?result, %batch, batch_epoch = %batch_id, - ?batch_peers, // client = %network.client_type(peer), "Backfill batch processed" ); diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index 9caf84be20d..2981b08be79 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -594,7 +594,8 @@ impl RangeBlockComponentsRequest { return Err(CouplingError::DataColumnPeerFailure { error: format!("No columns for block {block_root:?} with data"), faulty_peers: responsible_peers, - action: PeerAction::LowToleranceError, + // The block peer might be malcicious so don't downscore the column peer too bad + action: PeerAction::MidToleranceError, exceeded_retries, }); @@ -619,7 +620,8 @@ impl RangeBlockComponentsRequest { return Err(CouplingError::DataColumnPeerFailure { error: format!("Peers did not return column for block_root {block_root:?} {naughty_peers:?}"), faulty_peers: naughty_peers, - action: PeerAction::LowToleranceError, + // The block peer might be malcicious so don't downscore the column peer too bad + action: PeerAction::MidToleranceError, exceeded_retries }); } diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 8013a38ff62..08413120559 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -545,32 +545,33 @@ impl SyncingChain { penalty, faulty_component, } => { - let Some(batch_peers) = batch.processing_peers() else { + if let Some(batch_peers) = batch.processing_peers() { + // Penalize the peer appropriately. + match faulty_component { + Some(FaultyComponent::Blocks) | Some(FaultyComponent::Blobs) => { + network.report_peer( + batch_peers.block_and_blob, + *penalty, + "faulty_batch", + ); + } + Some(FaultyComponent::Columns(faulty_columns)) => { + for (peer, columns) in batch_peers.data_columns.iter() { + for faulty_column in faulty_columns { + if columns.contains(faulty_column) { + network.report_peer(*peer, *penalty, "faulty_batch"); + } + } + } + } + None => {} + } + } else { warn!( current_state = ?batch.state(), "Inconsistent state, batch must have been in processing state" ); - return Err(RemoveChain::ChainFailed { - blacklist: false, - failing_batch: batch_id, - }); }; - // Penalize the peer appropriately. 
- match faulty_component { - Some(FaultyComponent::Blocks) | Some(FaultyComponent::Blobs) => { - network.report_peer(batch_peers.block_and_blob, *penalty, "faulty_batch"); - } - Some(FaultyComponent::Columns(faulty_columns)) => { - for (peer, columns) in batch_peers.data_columns.iter() { - for faulty_column in faulty_columns { - if columns.contains(faulty_column) { - network.report_peer(*peer, *penalty, "faulty_batch"); - } - } - } - } - None => {} - } // Check if this batch is allowed to continue match batch.processing_completed(BatchProcessingResult::FaultyFailure)? { From 9b2de095c5b825a77953b007e2f5f58cd5ecbff8 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Mon, 6 Oct 2025 17:22:44 -0700 Subject: [PATCH 49/49] Rethink peer scoring --- .../src/sync/block_sidecar_coupling.rs | 4 +-- .../network/src/sync/network_context.rs | 36 ++++++++++++++++++- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index 2981b08be79..40bd9717a3c 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -595,7 +595,7 @@ impl RangeBlockComponentsRequest { error: format!("No columns for block {block_root:?} with data"), faulty_peers: responsible_peers, // The block peer might be malcicious so don't downscore the column peer too bad - action: PeerAction::MidToleranceError, + action: PeerAction::HighToleranceError, exceeded_retries, }); @@ -621,7 +621,7 @@ impl RangeBlockComponentsRequest { error: format!("Peers did not return column for block_root {block_root:?} {naughty_peers:?}"), faulty_peers: naughty_peers, // The block peer might be malcicious so don't downscore the column peer too bad - action: PeerAction::MidToleranceError, + action: PeerAction::HighToleranceError, exceeded_retries }); } diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index b02dd2f850b..7870a0b4904 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -1854,7 +1854,41 @@ impl SyncNetworkContext { } if let Some(Err(RpcResponseError::VerifyError(e))) = &resp { - self.report_peer(peer_id, PeerAction::LowToleranceError, e.into()); + warn!(?e, "Verification error on rpc response"); + match e { + LookupVerifyError::NotEnoughResponsesReturned { .. } => { + // This is a special case because in the case of a columns by root requests, there are 3 cases + // 1. the columns peer is honest and doesn't have the columns that we requested from it + // because its on a different chain. + // 2. the columns peer is honest but the block peer maliciously fed us bogus blocks for which + // there are no corresponding columns. + // 3. The column peer is buggy but non-malicious + // + // There is no way to differentiate between these 3 cases until we can verify the block + // before requesting the columns. + // Hence, we currently do not downscore them with a `LowToleranceError`. + // + // However, since majority of these errors are of type 3 currently, we downscore these errors with a + // HighTolerance error to avoid getting stuck in sync with buggy peers. 
+ if method.contains("DataColumns") { + self.report_peer(peer_id, PeerAction::HighToleranceError, e.into()) + } else { + self.report_peer(peer_id, PeerAction::LowToleranceError, e.into()) + } + } + LookupVerifyError::UnrequestedSlot(_) + | LookupVerifyError::DuplicatedData(_, _) + | LookupVerifyError::TooManyResponses + | LookupVerifyError::UnrequestedBlockRoot(_) + | LookupVerifyError::UnrequestedIndex(_) => { + // Recoverable errors, don't downscore heavily + self.report_peer(peer_id, PeerAction::HighToleranceError, e.into()) + } + LookupVerifyError::InternalError(_) => {} // do not downscore peer for internal errors + LookupVerifyError::InvalidInclusionProof => { + self.report_peer(peer_id, PeerAction::LowToleranceError, e.into()) + } + } } resp }
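
A standalone sketch of the rule PATCH 45 introduces: a lookup may only be parked as
"pending" when the cached block was imported via gossip, because only the gossip
import path notifies sync when processing completes. The enum, function and
assertions below are simplified, hypothetical stand-ins for illustration; they are
not the Lighthouse types or API.

// Illustrative only: minimal stand-ins, not the lighthouse_network/beacon_chain types.
#[derive(Clone, Copy, Debug, PartialEq)]
enum BlockImportSource {
    Gossip,
    Lookup,
    RangeSync,
    HttpApi,
}

#[derive(Debug, PartialEq)]
enum LookupRequestResult {
    // Park the lookup and wait for a later "block processed" notification.
    Pending(&'static str),
    // Download (and possibly re-import) the block ourselves.
    RequestBlock,
}

// Only gossip import emits a completion event back to sync, so only a
// gossip-sourced cached block may leave the lookup pending; any other source
// must fall through to a (possibly duplicate) download and import.
fn lookup_action_for_cached_block(source: BlockImportSource) -> LookupRequestResult {
    match source {
        BlockImportSource::Gossip => LookupRequestResult::Pending("block in processing cache"),
        BlockImportSource::Lookup | BlockImportSource::RangeSync | BlockImportSource::HttpApi => {
            LookupRequestResult::RequestBlock
        }
    }
}

fn main() {
    // A block published over the HTTP API must not park the lookup, otherwise
    // it waits forever for an event that is never emitted.
    assert_eq!(
        lookup_action_for_cached_block(BlockImportSource::HttpApi),
        LookupRequestResult::RequestBlock
    );
    assert_eq!(
        lookup_action_for_cached_block(BlockImportSource::Gossip),
        LookupRequestResult::Pending("block in processing cache")
    );
    println!("lookup is parked only for gossip-imported blocks");
}

The duplicate import in the non-gossip case wastes some work, but per the PATCH 45
message it is the cheaper trade-off compared to threading a new notification channel
out of the `http_api` crate.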
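
A standalone sketch of the scoring tiers PATCH 49 describes: a missing-response
failure on a DataColumns request cannot yet distinguish a buggy-but-honest column
peer (or one on a different chain) from a malicious block peer, so it is penalised
gently; recoverable protocol slips get a high-tolerance penalty; internal errors are
never penalised; and provably invalid data such as a bad inclusion proof gets the
low-tolerance penalty. The `VerifyError` and `PeerAction` enums below are
simplified, hypothetical stand-ins rather than the `lighthouse_network` definitions.

// Illustrative only: reduced set of error variants and actions.
#[derive(Debug)]
enum VerifyError {
    NotEnoughResponsesReturned,
    UnrequestedSlot,
    DuplicatedData,
    TooManyResponses,
    InternalError,
    InvalidInclusionProof,
}

#[derive(Debug, PartialEq)]
enum PeerAction {
    // Small penalty that tolerates buggy-but-honest peers.
    HighToleranceError,
    // Heavy penalty reserved for provably bad responses.
    LowToleranceError,
    // Our own fault; never punish the peer.
    NoAction,
}

// Map a verification failure to a penalty tier, mirroring the rationale in the
// patch: stay gentle wherever the blame could lie with a different peer.
fn score(error: &VerifyError, is_data_columns_request: bool) -> PeerAction {
    match error {
        VerifyError::NotEnoughResponsesReturned if is_data_columns_request => {
            PeerAction::HighToleranceError
        }
        VerifyError::NotEnoughResponsesReturned => PeerAction::LowToleranceError,
        VerifyError::UnrequestedSlot
        | VerifyError::DuplicatedData
        | VerifyError::TooManyResponses => PeerAction::HighToleranceError,
        VerifyError::InternalError => PeerAction::NoAction,
        VerifyError::InvalidInclusionProof => PeerAction::LowToleranceError,
    }
}

fn main() {
    assert_eq!(
        score(&VerifyError::NotEnoughResponsesReturned, true),
        PeerAction::HighToleranceError
    );
    assert_eq!(
        score(&VerifyError::InvalidInclusionProof, false),
        PeerAction::LowToleranceError
    );
    println!("scoring tiers match the behaviour described in the patch");
}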