From 18ea131c1bd46b0c721059c14724f253c57f13e3 Mon Sep 17 00:00:00 2001 From: Agnish Ghosh Date: Tue, 14 Oct 2025 18:45:54 +0530 Subject: [PATCH 1/4] reconstruction timeout + vcus detection on no column sidecars in quarantine --- beacon_chain/nimbus_beacon_node.nim | 4 +-- beacon_chain/spec/peerdas_helpers.nim | 38 ++++++++++++++++++++------- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/beacon_chain/nimbus_beacon_node.nim b/beacon_chain/nimbus_beacon_node.nim index 53b78b5a7e..d1bfdc3088 100644 --- a/beacon_chain/nimbus_beacon_node.nim +++ b/beacon_chain/nimbus_beacon_node.nim @@ -9,7 +9,7 @@ import system/ansi_c, - std/[os, random, terminal, times], + std/[os, random, strutils, terminal, times], chronos, chronicles, metrics, metrics/chronos_httpserver, stew/[byteutils, io2], @@ -1974,7 +1974,7 @@ proc onSlotEnd(node: BeaconNode, slot: Slot) {.async.} = if (not node.config.peerdasSupernode) and (slot.epoch() + 1).start_slot() - slot == 1 and - node.quarantine.sidecarless.len == 0 and + node.dataColumnQuarantine[].len == 0 and node.attachedValidatorBalanceTotal > 0.Gwei: # Detect new validator custody at the last slot of every epoch node.validatorCustody.detectNewValidatorCustody(slot, diff --git a/beacon_chain/spec/peerdas_helpers.nim b/beacon_chain/spec/peerdas_helpers.nim index ba0738dd2f..656d1ebc09 100644 --- a/beacon_chain/spec/peerdas_helpers.nim +++ b/beacon_chain/spec/peerdas_helpers.nim @@ -9,7 +9,7 @@ # Uncategorized helper functions from the spec import - chronicles, results, taskpools, + chronos, chronicles, results, taskpools, eth/p2p/discoveryv5/node, kzg4844/kzg, ssz_serialization/[ @@ -163,30 +163,50 @@ proc recover_cells_and_proofs_parallel*( for column in dataColumns: if not (blobCount == column.column.len): - return err ("DataColumns do not have the same length") + return err("DataColumns do not have the same length") - # spawn threads for recovery var - pendingFuts = newSeq[Flowvar[Result[CellsAndProofs, void]]](blobCount) + pendingFuts: seq[Flowvar[Result[CellsAndProofs, void]]] res = newSeq[CellsAndProofs](blobCount) - for blobIdx in 0.. 2.seconds: + warn "Aborting reconstruction: spawn phase exceeded 2s", + spawned = pendingFuts.len, total = blobCount + break # Stop spawning new tasks + var cellIndices = newSeq[CellIndex](columnCount) cells = newSeq[Cell](columnCount) for i in 0 ..< dataColumns.len: cellIndices[i] = dataColumns[i][].index cells[i] = dataColumns[i][].column[blobIdx] - pendingFuts[blobIdx] = - tp.spawn recoverCellsAndKzgProofsTask(cellIndices, cells) + pendingFuts.add(tp.spawn recoverCellsAndKzgProofsTask(cellIndices, cells)) + + # ---- Sync phase ---- + for i in 0 ..< pendingFuts.len: + let now = Moment.now() + if (now - startTime) > 2.seconds: + warn "Aborting reconstruction: sync phase exceeded 2s", + completed = i, totalSpawned = pendingFuts.len + return err("Data column reconstruction aborted after timeout during sync") - # sync threads - for i in 0.. Date: Fri, 17 Oct 2025 18:48:21 +0530 Subject: [PATCH 2/4] reworked some logging --- beacon_chain/gossip_processing/block_processor.nim | 2 -- beacon_chain/nimbus_beacon_node.nim | 4 ++-- beacon_chain/spec/peerdas_helpers.nim | 13 +++++++------ 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/beacon_chain/gossip_processing/block_processor.nim b/beacon_chain/gossip_processing/block_processor.nim index a3c004fb90..58a2f305ca 100644 --- a/beacon_chain/gossip_processing/block_processor.nim +++ b/beacon_chain/gossip_processing/block_processor.nim @@ -222,13 +222,11 @@ proc verifySidecars( proc storeSidecars(self: BlockProcessor, sidecarsOpt: Opt[BlobSidecars]) = if sidecarsOpt.isSome(): - debug "Inserting blobs into database", blobs = sidecarsOpt[].len for b in sidecarsOpt[]: self.consensusManager.dag.db.putBlobSidecar(b[]) proc storeSidecars(self: BlockProcessor, sidecarsOpt: Opt[DataColumnSidecars]) = if sidecarsOpt.isSome(): - debug "Inserting columns into database", columns = sidecarsOpt[].len for c in sidecarsOpt[]: self.consensusManager.dag.db.putDataColumnSidecar(c[]) diff --git a/beacon_chain/nimbus_beacon_node.nim b/beacon_chain/nimbus_beacon_node.nim index d1bfdc3088..540e56662e 100644 --- a/beacon_chain/nimbus_beacon_node.nim +++ b/beacon_chain/nimbus_beacon_node.nim @@ -1725,7 +1725,7 @@ proc reconstructDataColumns(node: BeaconNode, slot: Slot) = if node.dag.db.getDataColumnSidecar(forkyBlck.root, i, colData): columns.add(newClone(colData)) indices.incl(i) - debug "Stored data columns", columns = indices.len + debug "PeerDAS: Data columns before reconstruction", columns = indices.len # Make sure the node has obtained 50%+ of all the columns if columns.lenu64 < (maxColCount div 2): @@ -1741,7 +1741,7 @@ proc reconstructDataColumns(node: BeaconNode, slot: Slot) = # Reconstruct columns let recovered = recover_cells_and_proofs_parallel( node.batchVerifier[].taskpool, columns).valueOr: - error "Error in data column reconstruction" + error "Data column reconstruction incomplete" return let rowCount = recovered.len var reconCounter = 0 diff --git a/beacon_chain/spec/peerdas_helpers.nim b/beacon_chain/spec/peerdas_helpers.nim index 656d1ebc09..fc6750d886 100644 --- a/beacon_chain/spec/peerdas_helpers.nim +++ b/beacon_chain/spec/peerdas_helpers.nim @@ -170,12 +170,13 @@ proc recover_cells_and_proofs_parallel*( res = newSeq[CellsAndProofs](blobCount) let startTime = Moment.now() + const reconstructionTimeout = 2.seconds # ---- Spawn phase with time limit ---- for blobIdx in 0 ..< blobCount: let now = Moment.now() - if (now - startTime) > 2.seconds: - warn "Aborting reconstruction: spawn phase exceeded 2s", + if (now - startTime) > reconstructionTimeout: + debug "PeerDAS column reconstruction timed out while preparing columns", spawned = pendingFuts.len, total = blobCount break # Stop spawning new tasks @@ -190,10 +191,10 @@ proc recover_cells_and_proofs_parallel*( # ---- Sync phase ---- for i in 0 ..< pendingFuts.len: let now = Moment.now() - if (now - startTime) > 2.seconds: - warn "Aborting reconstruction: sync phase exceeded 2s", + if (now - startTime) > reconstructionTimeout: + debug "PeerDAS column reconstruction timed out while preparing columns", completed = i, totalSpawned = pendingFuts.len - return err("Data column reconstruction aborted after timeout during sync") + return err("Data column reconstruction timed out") let futRes = sync pendingFuts[i] if futRes.isErr: @@ -202,7 +203,7 @@ proc recover_cells_and_proofs_parallel*( res[i] = futRes.get if pendingFuts.len < blobCount: - return err("KZG recovery aborted: timeout before completing all blobs") + return err("Data column reconstruction timed out") ok(res) From d46e13cb3d38ad3bcc6ece5af66acd6d9f8b8c39 Mon Sep 17 00:00:00 2001 From: tersec Date: Fri, 17 Oct 2025 23:36:10 +0000 Subject: [PATCH 3/4] Update beacon_chain/spec/peerdas_helpers.nim --- beacon_chain/spec/peerdas_helpers.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beacon_chain/spec/peerdas_helpers.nim b/beacon_chain/spec/peerdas_helpers.nim index fc6750d886..eb7641abaa 100644 --- a/beacon_chain/spec/peerdas_helpers.nim +++ b/beacon_chain/spec/peerdas_helpers.nim @@ -192,7 +192,7 @@ proc recover_cells_and_proofs_parallel*( for i in 0 ..< pendingFuts.len: let now = Moment.now() if (now - startTime) > reconstructionTimeout: - debug "PeerDAS column reconstruction timed out while preparing columns", + debug "PeerDAS reconstruction timed out", completed = i, totalSpawned = pendingFuts.len return err("Data column reconstruction timed out") From ed0432355831b3c1118ec5a5918d89963366848e Mon Sep 17 00:00:00 2001 From: tersec Date: Fri, 17 Oct 2025 23:36:18 +0000 Subject: [PATCH 4/4] Update beacon_chain/spec/peerdas_helpers.nim --- beacon_chain/spec/peerdas_helpers.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beacon_chain/spec/peerdas_helpers.nim b/beacon_chain/spec/peerdas_helpers.nim index eb7641abaa..cf43fd386d 100644 --- a/beacon_chain/spec/peerdas_helpers.nim +++ b/beacon_chain/spec/peerdas_helpers.nim @@ -176,7 +176,7 @@ proc recover_cells_and_proofs_parallel*( for blobIdx in 0 ..< blobCount: let now = Moment.now() if (now - startTime) > reconstructionTimeout: - debug "PeerDAS column reconstruction timed out while preparing columns", + debug "PeerDAS reconstruction timed out while preparing columns", spawned = pendingFuts.len, total = blobCount break # Stop spawning new tasks