Merged
15 changes: 4 additions & 11 deletions beacon_node/lighthouse_network/src/peer_manager/peerdb.rs
@@ -247,23 +247,16 @@ impl<E: EthSpec> PeerDB<E> {
             .map(|(peer_id, _)| peer_id)
     }
 
-    /// Returns all the synced peers from the list of allowed peers that claim to have the block
+    /// Returns all the synced peers from the peer db that claim to have the block
     /// components for the given epoch based on `status.earliest_available_slot`.
     ///
     /// If `earliest_available_slot` info is not available, then return peer anyway assuming it has the
     /// required data.
-    ///
-    /// If `allowed_peers` is `Some`, then filters for the epoch only for those peers.
-    pub fn synced_peers_for_epoch<'a>(
-        &'a self,
-        epoch: Epoch,
-        allowed_peers: Option<&'a HashSet<PeerId>>,
-    ) -> impl Iterator<Item = &'a PeerId> {
+    pub fn synced_peers_for_epoch(&self, epoch: Epoch) -> impl Iterator<Item = &PeerId> {
        self.peers
            .iter()
-           .filter(move |(peer_id, info)| {
-               allowed_peers.is_none_or(|allowed| allowed.contains(peer_id))
-                   && info.is_connected()
+           .filter(move |(_, info)| {
+               info.is_connected()
                    && match info.sync_status() {
                        SyncStatus::Synced { info } => {
                            info.has_slot(epoch.end_slot(E::slots_per_epoch()))
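Aside: the availability rule in the doc comment can be modeled in isolation. This is a sketch, not the PeerDB implementation: it assumes `has_slot` reduces to comparing `earliest_available_slot` against the epoch's end slot, and every name in it is hypothetical.

// Standalone model of the doc comment's rule: a synced peer qualifies for an
// epoch when its advertised earliest available slot is at or before the
// epoch's end slot; a peer that advertises nothing is assumed to have the data.
fn claims_epoch(earliest_available_slot: Option<u64>, epoch_end_slot: u64) -> bool {
    match earliest_available_slot {
        // The peer advertises the slot its data starts at.
        Some(earliest) => earliest <= epoch_end_slot,
        // No info: optimistically assume the peer has the required data.
        None => true,
    }
}

fn main() {
    assert!(claims_epoch(Some(64), 95)); // data from slot 64, epoch ends at slot 95
    assert!(!claims_epoch(Some(96), 95)); // data starts only after the epoch ends
    assert!(claims_epoch(None, 95)); // unknown availability: peer is returned anyway
}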
13 changes: 5 additions & 8 deletions beacon_node/network/src/sync/backfill_sync/mod.rs
@@ -210,7 +210,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
            .network_globals
            .peers
            .read()
-           .synced_peers_for_epoch(self.to_be_downloaded, None)
+           .synced_peers_for_epoch(self.to_be_downloaded)
            .next()
            .is_some()
        // backfill can't progress if we do not have peers in the required subnets post peerdas.
@@ -313,7 +313,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
                CouplingError::DataColumnPeerFailure {
                    error,
                    faulty_peers,
-                   action,
                    exceeded_retries,
                } => {
                    debug!(?batch_id, error, "Block components coupling error");
@@ -325,11 +324,8 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
                        failed_columns.insert(*column);
                        failed_peers.insert(*peer);
                    }
-                   for peer in failed_peers.iter() {
-                       network.report_peer(*peer, *action, "failed to return columns");
-                   }
 
-                   // Only retry if peer failure **and** retries have been exceeded
+                   // Only retry if peer failure **and** retries haven't been exceeded
                    if !*exceeded_retries {
                        return self.retry_partial_batch(
                            network,
@@ -888,7 +884,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
            .network_globals
            .peers
            .read()
-           .synced_peers_for_epoch(batch_id, None)
+           .synced_peers_for_epoch(batch_id)
            .cloned()
            .collect::<HashSet<_>>();
 
@@ -899,6 +895,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
            request,
            RangeRequestId::BackfillSync { batch_id },
            &synced_peers,
+           &synced_peers, // All synced peers have imported up to the finalized slot so they must have their custody columns available
            &failed_peers,
        ) {
            Ok(request_id) => {
@@ -964,7 +961,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
            .network_globals()
            .peers
            .read()
-           .synced_peers_for_epoch(batch_id, None)
+           .synced_peers_for_epoch(batch_id)
            .cloned()
            .collect::<HashSet<_>>();
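For reference, the coupling-error handling above first splits the reported (column, peer) pairs into the columns to re-request and the peers to avoid on retry. A standalone sketch of that loop, with stand-in types in place of the real `ColumnIndex` and `PeerId`:

use std::collections::HashSet;

// Stand-ins for the real ColumnIndex and PeerId types.
type ColumnIndex = u64;
type PeerId = u64;

// Mirrors the loop in the handler: collect the failed columns and the peers
// that failed to serve them.
fn split_faulty(
    faulty_peers: &[(ColumnIndex, PeerId)],
) -> (HashSet<ColumnIndex>, HashSet<PeerId>) {
    let mut failed_columns = HashSet::new();
    let mut failed_peers = HashSet::new();
    for (column, peer) in faulty_peers {
        failed_columns.insert(*column);
        failed_peers.insert(*peer);
    }
    (failed_columns, failed_peers)
}

fn main() {
    // Columns 3 and 4 failed; peer 11 failed twice but is recorded once.
    let (columns, peers) = split_faulty(&[(3, 10), (4, 11), (3, 11)]);
    assert_eq!(columns, HashSet::from([3, 4]));
    assert_eq!(peers, HashSet::from([10, 11]));
}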
12 changes: 2 additions & 10 deletions beacon_node/network/src/sync/block_sidecar_coupling.rs
@@ -2,7 +2,7 @@ use beacon_chain::{
     block_verification_types::RpcBlock, data_column_verification::CustodyDataColumn, get_block_root,
 };
 use lighthouse_network::{
-    PeerAction, PeerId,
+    PeerId,
     service::api_types::{
         BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId,
     },
@@ -63,7 +63,6 @@ pub(crate) enum CouplingError {
    DataColumnPeerFailure {
        error: String,
        faulty_peers: Vec<(ColumnIndex, PeerId)>,
-       action: PeerAction,
        exceeded_retries: bool,
    },
    BlobPeerFailure(String),
@@ -253,7 +252,6 @@ impl<E: EthSpec> RangeBlockComponentsRequest<E> {
        if let Err(CouplingError::DataColumnPeerFailure {
            error: _,
            faulty_peers,
-           action: _,
            exceeded_retries: _,
        }) = &resp
        {
@@ -377,7 +375,6 @@ impl<E: EthSpec> RangeBlockComponentsRequest<E> {
            return Err(CouplingError::DataColumnPeerFailure {
                error: format!("No columns for block {block_root:?} with data"),
                faulty_peers: responsible_peers,
-               action: PeerAction::LowToleranceError,
                exceeded_retries,
 
            });
@@ -402,7 +399,6 @@ impl<E: EthSpec> RangeBlockComponentsRequest<E> {
            return Err(CouplingError::DataColumnPeerFailure {
                error: format!("Peers did not return column for block_root {block_root:?} {naughty_peers:?}"),
                faulty_peers: naughty_peers,
-               action: PeerAction::LowToleranceError,
                exceeded_retries
            });
        }
@@ -468,7 +464,7 @@ mod tests {
        NumBlobs, generate_rand_block_and_blobs, generate_rand_block_and_data_columns, test_spec,
    };
    use lighthouse_network::{
-       PeerAction, PeerId,
+       PeerId,
        service::api_types::{
            BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId,
            DataColumnsByRangeRequestId, Id, RangeRequestId,
@@ -785,15 +781,13 @@ mod tests {
        if let Err(super::CouplingError::DataColumnPeerFailure {
            error,
            faulty_peers,
-           action,
            exceeded_retries,
        }) = result
        {
            assert!(error.contains("Peers did not return column"));
            assert_eq!(faulty_peers.len(), 2); // columns 3 and 4 missing
            assert_eq!(faulty_peers[0].0, 3); // column index 3
            assert_eq!(faulty_peers[1].0, 4); // column index 4
-           assert!(matches!(action, PeerAction::LowToleranceError));
            assert!(!exceeded_retries); // First attempt, should be false
        } else {
            panic!("Expected PeerFailure error");
@@ -957,13 +951,11 @@ mod tests {
        if let Err(super::CouplingError::DataColumnPeerFailure {
            error: _,
            faulty_peers,
-           action,
            exceeded_retries,
        }) = result
        {
            assert_eq!(faulty_peers.len(), 1); // column 2 missing
            assert_eq!(faulty_peers[0].0, 2); // column index 2
-           assert!(matches!(action, PeerAction::LowToleranceError));
            assert!(exceeded_retries); // Should be true after max retries
        } else {
            panic!("Expected PeerFailure error with exceeded_retries=true");
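To see the shape of the variant after this change in one place, here is a compilable sketch with the types reduced to stand-ins (not the real lighthouse_network definitions): with `action` removed, a coupling failure carries only the error text, the (column, peer) pairs at fault, and whether the retry budget is spent.

type ColumnIndex = u64;

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(dead_code)]
struct PeerId(u64);

#[derive(Debug)]
#[allow(dead_code)]
enum CouplingError {
    DataColumnPeerFailure {
        error: String,
        faulty_peers: Vec<(ColumnIndex, PeerId)>,
        exceeded_retries: bool,
    },
    BlobPeerFailure(String),
}

fn main() {
    let err = CouplingError::DataColumnPeerFailure {
        error: "Peers did not return column".into(),
        faulty_peers: vec![(3, PeerId(1)), (4, PeerId(2))],
        exceeded_retries: false,
    };
    // Callers now destructure without an `action` field and only decide
    // whether to retry, leaving peer scoring out of the coupling layer.
    if let CouplingError::DataColumnPeerFailure { faulty_peers, exceeded_retries, .. } = &err {
        assert_eq!(faulty_peers.len(), 2);
        assert!(!*exceeded_retries);
    }
}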
11 changes: 6 additions & 5 deletions beacon_node/network/src/sync/network_context.rs
@@ -533,19 +533,21 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
        batch_type: ByRangeRequestType,
        request: BlocksByRangeRequest,
        requester: RangeRequestId,
-       peers: &HashSet<PeerId>,
+       block_peers: &HashSet<PeerId>,
+       column_peers: &HashSet<PeerId>,
        peers_to_deprioritize: &HashSet<PeerId>,
    ) -> Result<Id, RpcRequestSendError> {
        let range_request_span = debug_span!(
            parent: None,
            SPAN_OUTGOING_RANGE_REQUEST,
            range_req_id = %requester,
-           peers = peers.len()
+           block_peers = block_peers.len(),
+           column_peers = column_peers.len()
        );
        let _guard = range_request_span.clone().entered();
        let active_request_count_by_peer = self.active_request_count_by_peer();
 
-       let Some(block_peer) = peers
+       let Some(block_peer) = block_peers
            .iter()
            .map(|peer| {
                (
@@ -579,7 +581,7 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
                .collect();
            Some(self.select_columns_by_range_peers_to_request(
                &column_indexes,
-               peers,
+               column_peers,
                active_request_count_by_peer,
                peers_to_deprioritize,
            )?)
@@ -770,7 +772,6 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
        let range_req = entry.get_mut();
        if let Some(blocks_result) = range_req.responses(&self.chain.spec) {
            if let Err(CouplingError::DataColumnPeerFailure {
-               action: _,
                error,
                faulty_peers: _,
                exceeded_retries,
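A sketch of the peer-set split this signature change enables, with the load balancing heavily simplified and all names hypothetical: the block peer must come from `block_peers`, while columns may be requested from the wider `column_peers` set. Here deprioritized peers are skipped outright, whereas the real selection may only rank them last.

use std::collections::{HashMap, HashSet};

type PeerId = u64; // stand-in for the real PeerId type

// Pick the least-loaded block peer from the chain's own peers, skipping
// deprioritized ones. Column peers would be picked from a separate, wider set.
fn pick_block_peer(
    block_peers: &HashSet<PeerId>,
    active_requests: &HashMap<PeerId, usize>,
    peers_to_deprioritize: &HashSet<PeerId>,
) -> Option<PeerId> {
    block_peers
        .iter()
        .filter(|peer| !peers_to_deprioritize.contains(*peer))
        .min_by_key(|peer| active_requests.get(*peer).copied().unwrap_or(0))
        .copied()
}

fn main() {
    let block_peers = HashSet::from([1, 2, 3]);
    let active_requests = HashMap::from([(1, 3), (2, 0), (3, 5)]);
    let deprioritized = HashSet::from([2]);
    // Peer 2 is least loaded but deprioritized, so peer 1 wins.
    assert_eq!(pick_block_peer(&block_peers, &active_requests, &deprioritized), Some(1));
}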
25 changes: 25 additions & 0 deletions beacon_node/network/src/sync/range_sync/batch.rs
@@ -334,6 +334,31 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
        }
    }
 
+    /// Change the batch state from `Self::Downloading` to `Self::AwaitingDownload` without
+    /// registering a failed attempt.
+    ///
+    /// Note: this must be used cautiously, with some level of retry protection,
+    /// as not registering a failed attempt could lead to requesting in a loop.
+    #[must_use = "Batch may have failed"]
+    pub fn downloading_to_awaiting_download(
+        &mut self,
+    ) -> Result<BatchOperationOutcome, WrongState> {
+        match self.state.poison() {
+            BatchState::Downloading(_) => {
+                self.state = BatchState::AwaitingDownload;
+                Ok(self.outcome())
+            }
+            BatchState::Poisoned => unreachable!("Poisoned batch"),
+            other => {
+                self.state = other;
+                Err(WrongState(format!(
+                    "Download failed for batch in wrong state {:?}",
+                    self.state
+                )))
+            }
+        }
+    }
+
    pub fn start_downloading(&mut self, request_id: Id) -> Result<(), WrongState> {
        match self.state.poison() {
            BatchState::AwaitingDownload => {
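The transition above follows the poison-and-restore pattern used by the other state changes in this file. A standalone model of just that mechanic, with the enum pared down to the relevant states and all types reduced to stand-ins:

// Downloading moves back to AwaitingDownload without counting a failed
// attempt; any other state is restored and reported as a WrongState error.
#[derive(Debug)]
enum BatchState {
    AwaitingDownload,
    Downloading(u64 /* request id */),
    Poisoned,
}

#[derive(Debug)]
#[allow(dead_code)]
struct WrongState(String);

impl BatchState {
    // Temporarily take ownership of the state, leaving Poisoned behind.
    fn poison(&mut self) -> BatchState {
        std::mem::replace(self, BatchState::Poisoned)
    }
}

fn downloading_to_awaiting_download(state: &mut BatchState) -> Result<(), WrongState> {
    match state.poison() {
        BatchState::Downloading(_) => {
            *state = BatchState::AwaitingDownload;
            Ok(())
        }
        BatchState::Poisoned => unreachable!("Poisoned batch"),
        other => {
            *state = other;
            Err(WrongState(format!("wrong state {:?}", state)))
        }
    }
}

fn main() {
    let mut state = BatchState::Downloading(7);
    assert!(downloading_to_awaiting_download(&mut state).is_ok());
    assert!(matches!(state, BatchState::AwaitingDownload));
    // Calling again from AwaitingDownload is a WrongState error.
    assert!(downloading_to_awaiting_download(&mut state).is_err());
}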
45 changes: 35 additions & 10 deletions beacon_node/network/src/sync/range_sync/chain.rs
@@ -871,7 +871,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
                CouplingError::DataColumnPeerFailure {
                    error,
                    faulty_peers,
-                   action,
                    exceeded_retries,
                } => {
                    debug!(?batch_id, error, "Block components coupling error");
@@ -883,12 +882,22 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
                        failed_columns.insert(*column);
                        failed_peers.insert(*peer);
                    }
-                   for peer in failed_peers.iter() {
-                       network.report_peer(*peer, *action, "failed to return columns");
-                   }
+                   // Retry the failed columns if the column requests haven't exceeded the
+                   // max retries. Otherwise, treat it as a failed batch below.
                    if !*exceeded_retries {
+                       // Set the batch back to `AwaitingDownload` before retrying.
+                       // This is to ensure that the batch doesn't get stuck in the `Downloading` state.
+                       //
+                       // DataColumn retries have a retry limit, so calling `downloading_to_awaiting_download`
+                       // is safe.
+                       if let BatchOperationOutcome::Failed { blacklist } =
+                           batch.downloading_to_awaiting_download()?
+                       {
+                           return Err(RemoveChain::ChainFailed {
+                               blacklist,
+                               failing_batch: batch_id,
+                           });
+                       }
                        return self.retry_partial_batch(
                            network,
                            batch_id,

Review comment (Member): `BatchOperationOutcome::Failed` is no longer reachable after changing the call to `downloading_to_awaiting_download`. Also I think the function name conveys clearly what it does, but not its intended usage, but I also struggle to come up with something better. `batch.download_failed_skip_attempt_count` seems a bit too wordy.

Review comment (Member): I think handling `BatchOperationOutcome::Failed` is a good idea though.

@@ -936,7 +945,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
                            failing_batch: batch_id,
                        });
                    }
-                   self.send_batch(network, batch_id)
+                   // The errored batch is set to AwaitingDownload above.
+                   // We now just attempt to download all batches stuck in the `AwaitingDownload`
+                   // state in the right order.
+                   self.attempt_send_awaiting_download_batches(network, "injecting error")
                } else {
                    debug!(
                        batch_epoch = %batch_id,
@@ -969,7 +981,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
            .collect();
        debug!(
            ?awaiting_downloads,
-           src, "Attempting to send batches awaiting downlaod"
+           src, "Attempting to send batches awaiting download"
        );
 
        for batch_id in awaiting_downloads {
@@ -998,11 +1010,11 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
        let (request, batch_type) = batch.to_blocks_by_range_request();
        let failed_peers = batch.failed_peers();
 
-       let synced_peers = network
+       let synced_column_peers = network
            .network_globals()
            .peers
            .read()
-           .synced_peers_for_epoch(batch_id, Some(&self.peers))
+           .synced_peers_for_epoch(batch_id)
            .cloned()
            .collect::<HashSet<_>>();
 
@@ -1013,7 +1025,13 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
                chain_id: self.id,
                batch_id,
            },
-           &synced_peers,
+           // Request blocks only from peers of this specific chain.
+           &self.peers,
+           // Request columns from all synced peers, even if they are not part of this chain.
+           // This is to avoid splitting good column peers across many head chains in a heavily
+           // forking environment. If the column peers and block peer are on different chains, then
+           // we return a coupling error and retry only the columns that failed to couple.
+           // See `Self::retry_partial_batch`.
+           &synced_column_peers,
            &failed_peers,
        ) {
            Ok(request_id) => {
@@ -1081,7 +1099,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
            .network_globals()
            .peers
            .read()
-           .synced_peers_for_epoch(batch_id, Some(&self.peers))
+           .synced_peers_for_epoch(batch_id)
            .cloned()
            .collect::<HashSet<_>>();
 
@@ -1093,13 +1111,17 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
            &failed_columns,
        ) {
            Ok(_) => {
+               // Inform the batch about the new request.
+               batch.start_downloading(id)?;
                debug!(
                    ?batch_id,
                    id, "Retried column requests from different peers"
                );
                return Ok(KeepChain);
            }
            Err(e) => {
+               // No need to explicitly fail the batch since it's in the `AwaitingDownload` state
+               // before we attempted to retry.
                debug!(?batch_id, id, e, "Failed to retry partial batch");
            }
        }
@@ -1123,6 +1145,9 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
    ) -> Result<KeepChain, RemoveChain> {
        let _guard = self.span.clone().entered();
        debug!("Resuming chain");
+       // Attempt to download any batches stuck in the `AwaitingDownload` state because of
+       // an earlier lack of peers.
+       self.attempt_send_awaiting_download_batches(network, "resume")?;
        // Request more batches if needed.
        self.request_batches(network)?;
        // If there is any batch ready for processing, send it.
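Closing note: the resume path added above drains batches stuck in `AwaitingDownload` before requesting new ones. A standalone sketch of the selection, assuming batches live in a `BTreeMap` keyed by batch id so that iteration yields ascending ids, which is one plausible reading of "in the right order" in the comments; all names here are illustrative.

use std::collections::BTreeMap;

#[derive(PartialEq)]
enum BatchState {
    AwaitingDownload,
    Downloading,
}

// Return the batch ids to send, in ascending order: BTreeMap iteration is
// sorted by key, so stuck batches are retried oldest-epoch first.
fn awaiting_download_batches(batches: &BTreeMap<u64, BatchState>) -> Vec<u64> {
    batches
        .iter()
        .filter(|(_, state)| **state == BatchState::AwaitingDownload)
        .map(|(id, _)| *id)
        .collect()
}

fn main() {
    let mut batches = BTreeMap::new();
    batches.insert(32, BatchState::Downloading);
    batches.insert(8, BatchState::AwaitingDownload);
    batches.insert(16, BatchState::AwaitingDownload);
    // Batches 8 and 16 are resent, in order, before any new batch is requested.
    assert_eq!(awaiting_download_batches(&batches), vec![8, 16]);
}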