Skip to content

Commit f4ffa9e

Browse files
Handle processing results of non faulty batches (#3439)
## Issue Addressed Solves #3390 So after checking some logs @pawanjay176 got, we conclude that this happened because we blacklisted a chain after trying it "too much". Now here, in all occurrences it seems that "too much" means we got too many download failures. This happened very slowly, exactly because the batch is allowed to stay alive for very long times after not counting penalties when the ee is offline. The error here then was not that the batch failed because of offline ee errors, but that we blacklisted a chain because of download errors, which we can't pin on the chain but on the peer. This PR fixes that. ## Proposed Changes Adds a missing piece of logic so that if a chain fails for errors that can't be attributed to an objectively bad behavior from the peer, it is not blacklisted. The issue at hand occurred when new peers arrived claiming a head that had wrongfully blacklisted, even if the original peers participating in the chain were not penalized. Another notable change is that we need to consider a batch invalid if it processed correctly but its next non empty batch fails processing. Now since a batch can fail processing in non empty ways, there is no need to mark as invalid previous batches. Improves some logging as well. ## Additional Info We should do this regardless of pausing sync on ee offline/unsynced state. This is because I think it's almost impossible to ensure a processing result will reach in a predictable order with a synced notification from the ee. Doing this handles what I think are inevitable data races when we actually pause sync This also fixes a return that reports which batch failed and caused us some confusion checking the logs
1 parent a476ae4 commit f4ffa9e

File tree

12 files changed

+298
-274
lines changed

12 files changed

+298
-274
lines changed

beacon_node/network/src/beacon_processor/mod.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,7 @@ mod work_reprocessing_queue;
7676
mod worker;
7777

7878
use crate::beacon_processor::work_reprocessing_queue::QueuedGossipBlock;
79-
pub use worker::{
80-
ChainSegmentProcessId, FailureMode, GossipAggregatePackage, GossipAttestationPackage,
81-
};
79+
pub use worker::{ChainSegmentProcessId, GossipAggregatePackage, GossipAttestationPackage};
8280

8381
/// The maximum size of the channel for work events to the `BeaconProcessor`.
8482
///

beacon_node/network/src/beacon_processor/worker/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ mod rpc_methods;
1010
mod sync_methods;
1111

1212
pub use gossip_methods::{GossipAggregatePackage, GossipAttestationPackage};
13-
pub use sync_methods::{ChainSegmentProcessId, FailureMode};
13+
pub use sync_methods::ChainSegmentProcessId;
1414

1515
pub(crate) const FUTURE_SLOT_TOLERANCE: u64 = 1;
1616

beacon_node/network/src/beacon_processor/worker/sync_methods.rs

Lines changed: 27 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,6 @@ struct ChainSegmentFailed {
3434
message: String,
3535
/// Used to penalize peers.
3636
peer_action: Option<PeerAction>,
37-
/// Failure mode
38-
mode: FailureMode,
39-
}
40-
41-
/// Represents if a block processing failure was on the consensus or execution side.
42-
#[derive(Debug)]
43-
pub enum FailureMode {
44-
ExecutionLayer { pause_sync: bool },
45-
ConsensusLayer,
4637
}
4738

4839
impl<T: BeaconChainTypes> Worker<T> {
@@ -150,7 +141,9 @@ impl<T: BeaconChainTypes> Worker<T> {
150141
"last_block_slot" => end_slot,
151142
"processed_blocks" => sent_blocks,
152143
"service"=> "sync");
153-
BatchProcessResult::Success(sent_blocks > 0)
144+
BatchProcessResult::Success {
145+
was_non_empty: sent_blocks > 0,
146+
}
154147
}
155148
(imported_blocks, Err(e)) => {
156149
debug!(self.log, "Batch processing failed";
@@ -161,11 +154,12 @@ impl<T: BeaconChainTypes> Worker<T> {
161154
"imported_blocks" => imported_blocks,
162155
"error" => %e.message,
163156
"service" => "sync");
164-
165-
BatchProcessResult::Failed {
166-
imported_blocks: imported_blocks > 0,
167-
peer_action: e.peer_action,
168-
mode: e.mode,
157+
match e.peer_action {
158+
Some(penalty) => BatchProcessResult::FaultyFailure {
159+
imported_blocks: imported_blocks > 0,
160+
penalty,
161+
},
162+
None => BatchProcessResult::NonFaultyFailure,
169163
}
170164
}
171165
}
@@ -184,7 +178,9 @@ impl<T: BeaconChainTypes> Worker<T> {
184178
"last_block_slot" => end_slot,
185179
"processed_blocks" => sent_blocks,
186180
"service"=> "sync");
187-
BatchProcessResult::Success(sent_blocks > 0)
181+
BatchProcessResult::Success {
182+
was_non_empty: sent_blocks > 0,
183+
}
188184
}
189185
(_, Err(e)) => {
190186
debug!(self.log, "Backfill batch processing failed";
@@ -193,10 +189,12 @@ impl<T: BeaconChainTypes> Worker<T> {
193189
"last_block_slot" => end_slot,
194190
"error" => %e.message,
195191
"service" => "sync");
196-
BatchProcessResult::Failed {
197-
imported_blocks: false,
198-
peer_action: e.peer_action,
199-
mode: e.mode,
192+
match e.peer_action {
193+
Some(penalty) => BatchProcessResult::FaultyFailure {
194+
imported_blocks: false,
195+
penalty,
196+
},
197+
None => BatchProcessResult::NonFaultyFailure,
200198
}
201199
}
202200
}
@@ -216,15 +214,19 @@ impl<T: BeaconChainTypes> Worker<T> {
216214
{
217215
(imported_blocks, Err(e)) => {
218216
debug!(self.log, "Parent lookup failed"; "error" => %e.message);
219-
BatchProcessResult::Failed {
220-
imported_blocks: imported_blocks > 0,
221-
peer_action: e.peer_action,
222-
mode: e.mode,
217+
match e.peer_action {
218+
Some(penalty) => BatchProcessResult::FaultyFailure {
219+
imported_blocks: imported_blocks > 0,
220+
penalty,
221+
},
222+
None => BatchProcessResult::NonFaultyFailure,
223223
}
224224
}
225225
(imported_blocks, Ok(_)) => {
226226
debug!(self.log, "Parent lookup processed successfully");
227-
BatchProcessResult::Success(imported_blocks > 0)
227+
BatchProcessResult::Success {
228+
was_non_empty: imported_blocks > 0,
229+
}
228230
}
229231
}
230232
}
@@ -307,7 +309,6 @@ impl<T: BeaconChainTypes> Worker<T> {
307309
message: String::from("mismatched_block_root"),
308310
// The peer is faulty if they send blocks with bad roots.
309311
peer_action: Some(PeerAction::LowToleranceError),
310-
mode: FailureMode::ConsensusLayer,
311312
}
312313
}
313314
HistoricalBlockError::InvalidSignature
@@ -322,7 +323,6 @@ impl<T: BeaconChainTypes> Worker<T> {
322323
message: "invalid_signature".into(),
323324
// The peer is faulty if they bad signatures.
324325
peer_action: Some(PeerAction::LowToleranceError),
325-
mode: FailureMode::ConsensusLayer,
326326
}
327327
}
328328
HistoricalBlockError::ValidatorPubkeyCacheTimeout => {
@@ -336,7 +336,6 @@ impl<T: BeaconChainTypes> Worker<T> {
336336
message: "pubkey_cache_timeout".into(),
337337
// This is an internal error, do not penalize the peer.
338338
peer_action: None,
339-
mode: FailureMode::ConsensusLayer,
340339
}
341340
}
342341
HistoricalBlockError::NoAnchorInfo => {
@@ -347,7 +346,6 @@ impl<T: BeaconChainTypes> Worker<T> {
347346
// There is no need to do a historical sync, this is not a fault of
348347
// the peer.
349348
peer_action: None,
350-
mode: FailureMode::ConsensusLayer,
351349
}
352350
}
353351
HistoricalBlockError::IndexOutOfBounds => {
@@ -360,7 +358,6 @@ impl<T: BeaconChainTypes> Worker<T> {
360358
message: String::from("logic_error"),
361359
// This should never occur, don't penalize the peer.
362360
peer_action: None,
363-
mode: FailureMode::ConsensusLayer,
364361
}
365362
}
366363
HistoricalBlockError::BlockOutOfRange { .. } => {
@@ -373,7 +370,6 @@ impl<T: BeaconChainTypes> Worker<T> {
373370
message: String::from("unexpected_error"),
374371
// This should never occur, don't penalize the peer.
375372
peer_action: None,
376-
mode: FailureMode::ConsensusLayer,
377373
}
378374
}
379375
},
@@ -383,7 +379,6 @@ impl<T: BeaconChainTypes> Worker<T> {
383379
message: format!("{:?}", other),
384380
// This is an internal error, don't penalize the peer.
385381
peer_action: None,
386-
mode: FailureMode::ConsensusLayer,
387382
}
388383
}
389384
};
@@ -404,7 +399,6 @@ impl<T: BeaconChainTypes> Worker<T> {
404399
message: format!("Block has an unknown parent: {}", block.parent_root()),
405400
// Peers are faulty if they send non-sequential blocks.
406401
peer_action: Some(PeerAction::LowToleranceError),
407-
mode: FailureMode::ConsensusLayer,
408402
})
409403
}
410404
BlockError::BlockIsAlreadyKnown => {
@@ -442,7 +436,6 @@ impl<T: BeaconChainTypes> Worker<T> {
442436
),
443437
// Peers are faulty if they send blocks from the future.
444438
peer_action: Some(PeerAction::LowToleranceError),
445-
mode: FailureMode::ConsensusLayer,
446439
})
447440
}
448441
BlockError::WouldRevertFinalizedSlot { .. } => {
@@ -464,7 +457,6 @@ impl<T: BeaconChainTypes> Worker<T> {
464457
message: format!("Internal error whilst processing block: {:?}", e),
465458
// Do not penalize peers for internal errors.
466459
peer_action: None,
467-
mode: FailureMode::ConsensusLayer,
468460
})
469461
}
470462
ref err @ BlockError::ExecutionPayloadError(ref epe) => {
@@ -480,7 +472,6 @@ impl<T: BeaconChainTypes> Worker<T> {
480472
message: format!("Execution layer offline. Reason: {:?}", err),
481473
// Do not penalize peers for internal errors.
482474
peer_action: None,
483-
mode: FailureMode::ExecutionLayer { pause_sync: true },
484475
})
485476
} else {
486477
debug!(self.log,
@@ -493,7 +484,6 @@ impl<T: BeaconChainTypes> Worker<T> {
493484
err
494485
),
495486
peer_action: Some(PeerAction::LowToleranceError),
496-
mode: FailureMode::ExecutionLayer { pause_sync: false },
497487
})
498488
}
499489
}
@@ -508,7 +498,6 @@ impl<T: BeaconChainTypes> Worker<T> {
508498
message: format!("Peer sent invalid block. Reason: {:?}", other),
509499
// Do not penalize peers for internal errors.
510500
peer_action: None,
511-
mode: FailureMode::ConsensusLayer,
512501
})
513502
}
514503
}

0 commit comments

Comments
 (0)