From 63a5269457b58212bd3007c62346caa39a8585f8 Mon Sep 17 00:00:00 2001 From: Alexandru Vasile <60601340+lexnv@users.noreply.github.com> Date: Thu, 12 Mar 2026 13:40:55 +0200 Subject: [PATCH] aura/import: Skip block execution when collators have no parent block state (#11330) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR skips the execution of blocks when they are propagated to importing via `StateAction::Skip`. There is a bug in the import queue that is affecting collators, which is that they should not execute blocks for non-archive collators that are part of Gap Sync. The bug has surfaced by changing the `import_existing` from false to true in: - https://github.com/paritytech/polkadot-sdk/pull/10373 ### Issue The issue manifests for collators that have an unfilled block gap in their DB. During restarting with #10373, a collator would try the following: - client info has detected a gap at block 5800 with length 1 - collator [X] requests the block 5800 with `fields: HEADER | BODY | JUSTIFICATION, from: Number(5800)` - the other 2 collators respond with the full block, including the body, because by default collators will keep around the canonical chain but discard the block state - collator [X] tries to import the block because `import_existing` is true and we continue execution after the following check: https://github.com/paritytech/polkadot-sdk/blob/2b9576c163b1c2408291e2b6c98ae0f2465b4818/substrate/client/service/src/client/client.rs#L1809-L1812 - Before the changes, the code returned `return Ok(ImportResult::AlreadyInChain)` which short-circuited the importing of the block - collator [X] imports the block but fails with `State already discarded` - the error is propagated back to the sync engine that decides to restart the sync process with the same block gap `Restarting sync with client ...` - This results in a vicious cycle where the collator [X] requests the same block again, then restarts the sync engine - Eventually at the 3 request the other collators will notice that this behavior is malicious and ban and disconnect the peers. ### Fix The fix is to skip executing blocks when the gap sync has marked blocks as `StateAction::Skip`. Please note we are still dealing with the following, which should be part of a different PR: - Gap Sync was never closed from the database - When the node starts with a block gap, the node will always initiate a block request over the sync protocol to close the gap - Before the gap was marked as `import_existing: false` which short ciruited the circuit and returned `AlreadyInChain` - Effectively nodes would re-request the gap on reboot wasting networking bandwidth to close the gap "in memory" only, but this was never commited to the DB ### Full Logs ```rust 2026-03-10 13:43:41.138 DEBUG main sync: [Parachain] Restarting sync with client info Info { best_hash: 0xcb03c2aa7dd61f84b27d4c7db42ab848d2eaee9da77ddedc827e070ece063102, best_number: 5883392, genesis_hash: 0x8692fdabb7e55c3347c0f887343e3c0f3fbb560c5f52c9cdc1f7660a1f183c5d, finalized_hash: 0x43664710059a72b37c11db9f99a0f38323b478fbdc82afac058c530c7b002e4d, finalized_number: 5883372, finalized_state: Some((0x43664710059a72b37c11db9f99a0f38323b478fbdc82afac058c530c7b002e4d, 5883372)), number_leaves: 1, block_gap: Some(BlockGap { start: 5800, end: 5800, gap_type: MissingBody }) } 2026-03-10 13:43:41.138 DEBUG main sync: [Parachain] Starting gap sync #5800 - #5800 (old gap best and target: None) 2026-03-10 13:43:41.138 TRACE main sync: [Parachain] Restarted sync at #5883392 (0xcb03c2aa7dd61f84b27d4c7db42ab848d2eaee9da77ddedc827e070ece063102) 2026-03-10 13:45:17.775 TRACE tokio-runtime-worker sync: [Parachain] New gap block request for 12D3KooWRejf1JYYjaaKhHAn28VJJR9ryZqs3wiGPsVjk6eFLLrn, (best:5883362, common:5883362) BlockRequest { id: 0, fields: HEADER | BODY | JUSTIFICATION, from: Number(5800), direction: Descending, max: Some(1) } 2026-03-10 13:45:17.784 DEBUG tokio-runtime-worker sync::import-queue: [Parachain] Starting import of 1 blocks (5800) (origin: GapSync) 2026-03-10 13:45:17.784 TRACE tokio-runtime-worker sync::import-queue: [Parachain] Block 5800 (0x26dc…1cda) has 4 logs (origin: GapSync) 2026-03-10 13:45:17.792 DEBUG tokio-runtime-worker sync::import-queue: [Parachain] Error importing block 5800: 0x26dca166cfefe439262d201b10a8d2679edc4bd98ae59fe12d7f7eef9b871cda: Api called for an unknown Block: State already discarded for 0x4739cf07649d6383bb19d2adccbe9d3f5b1ed91ef5fd6530bc8e69e560b5be91 2026-03-10 13:45:17.792 WARN tokio-runtime-worker sync: [Parachain] šŸ’” Error importing block 0x26dca166cfefe439262d201b10a8d2679edc4bd98ae59fe12d7f7eef9b871cda: consensus error: Api called for an unknown Block: State already discarded for 0x4739cf07649d6383bb19d2adccbe9d3f5b1ed91ef5fd6530bc8e69e560b5be91 2026-03-10 13:45:17.792 DEBUG tokio-runtime-worker sync: [Parachain] Restarting sync with client info Info { best_hash: 0xcb03c2aa7dd61f84b27d4c7db42ab848d2eaee9da77ddedc827e070ece063102, best_number: 5883392, genesis_hash: 0x8692fdabb7e55c3347c0f887343e3c0f3fbb560c5f52c9cdc1f7660a1f183c5d, finalized_hash: 0xcb03c2aa7dd61f84b27d4c7db42ab848d2eaee9da77ddedc827e070ece063102, finalized_number: 5883392, finalized_state: Some((0xcb03c2aa7dd61f84b27d4c7db42ab848d2eaee9da77ddedc827e070ece063102, 5883392)), number_leaves: 1, block_gap: Some(BlockGap { start: 5800, end: 5800, gap_type: MissingBody }) } 2026-03-10 13:45:17.792 DEBUG tokio-runtime-worker sync: [Parachain] Starting gap sync #5800 - #5800 (old gap best and target: Some((5800, 5800))) 2026-03-10 13:45:17.792 TRACE tokio-runtime-worker sync: [Parachain] Restarted sync at #5883392 (0xcb03c2aa7dd61f84b27d4c7db42ab848d2eaee9da77ddedc827e070ece063102) ``` ### Testing Done - unblocks kusama yap 3392: https://grafana.teleport.parity.io/goto/KBKfuhKDR?orgId=1 - left side of the graph is origin/master, right side is the patch applied with connected peers Closes: - https://github.com/paritytech/polkadot-sdk/issues/11299 --------- Signed-off-by: Alexandru Vasile Co-authored-by: cmd[bot] <41898282+github-actions[bot]@users.noreply.github.com> (cherry picked from commit 3c93291ee80a9dae849d5892184069ef429da0ae) --- .../src/collators/slot_based/block_import.rs | 146 +++++++++++++++++- prdoc/pr_11330.prdoc | 10 ++ 2 files changed, 153 insertions(+), 3 deletions(-) create mode 100644 prdoc/pr_11330.prdoc diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/block_import.rs b/cumulus/client/consensus/aura/src/collators/slot_based/block_import.rs index d22171f2586cc..a3e849a45b5bf 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/block_import.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/block_import.rs @@ -97,9 +97,16 @@ where ) -> Result { // If the channel exists and it is required to execute the block, we will execute the block // here. This is done to collect the storage proof and to prevent re-execution, we push - // downwards the state changes. `StateAction::ApplyChanges` is ignored, because it either - // means that the node produced the block itself or the block was imported via state sync. - if !self.sender.is_closed() && !matches!(params.state_action, StateAction::ApplyChanges(_)) + // downwards the state changes. + // + // The following states are ignored: + // - `StateAction::ApplyChanges`: means that the node produced the block itself or the + // block was imported via state sync. + // - `StateAction::Skip`: means that the block should be skipped. This is evident in the + // context of gap-sync with collators running in non-archive modes. The state of the + // parent block has already been discarded and therefore any import would fail. + if !self.sender.is_closed() && + !matches!(params.state_action, StateAction::ApplyChanges(_) | StateAction::Skip) { let mut runtime_api = self.client.runtime_api(); @@ -143,3 +150,136 @@ where self.inner.import_block(params).await.map_err(Into::into) } } + +#[cfg(test)] +mod tests { + use super::*; + use codec::Encode; + use cumulus_test_client::{ + runtime::Block, DefaultTestClientBuilderExt, InitBlockBuilder, TestClientBuilder, + TestClientBuilderExt, + }; + use cumulus_test_relay_sproof_builder::RelayStateSproofBuilder; + use polkadot_primitives::HeadData; + use sc_consensus::{BlockImportParams, ImportResult, StateAction}; + use sp_blockchain::HeaderBackend; + use sp_consensus::BlockOrigin; + + fn sproof_with_best_parent(client: &cumulus_test_client::Client) -> RelayStateSproofBuilder { + let best_hash = client.info().best_hash; + let header = client.header(best_hash).ok().flatten().expect("No header for best block"); + let mut builder = RelayStateSproofBuilder::default(); + builder.para_id = cumulus_test_client::runtime::PARACHAIN_ID.into(); + builder.included_para_head = Some(HeadData(header.encode())); + builder + } + + /// Mock inner block import that always succeeds. + #[derive(Clone)] + struct MockBlockImport; + + #[async_trait::async_trait] + impl BlockImport for MockBlockImport { + type Error = sp_consensus::Error; + + async fn check_block( + &self, + _block: sc_consensus::BlockCheckParams, + ) -> Result { + Ok(ImportResult::imported(false)) + } + + async fn import_block( + &self, + _block: BlockImportParams, + ) -> Result { + Ok(ImportResult::imported(true)) + } + } + + /// Regression test for the gap-sync infinite loop issue. + /// + /// When a non-archive collator has a block gap of size 1, gap-sync downloads + /// the block and marks it with `skip_execution: true` (which translates to + /// `StateAction::Skip`). Before the fix, `SlotBasedBlockImport` would attempt + /// to execute such blocks, fail with a consensus error ("State already + /// discarded for parent"), and trigger a chain-sync restart that re-creates + /// the same gap — leading to an infinite retry loop. + /// + /// This test verifies that `StateAction::Skip` blocks are forwarded to the + /// inner block import without attempting runtime execution. + #[tokio::test] + async fn gap_sync_block_with_skip_execution_does_not_attempt_runtime_call() { + sp_tracing::try_init_simple(); + + let client = Arc::new(TestClientBuilder::new().build()); + + // Build a valid block so we have realistic headers/bodies. + let sproof = sproof_with_best_parent(&client); + let block_builder_data = client.init_block_builder(None, sproof); + let block = block_builder_data.block_builder.build().unwrap().block; + + let (slot_based_import, mut handle) = + SlotBasedBlockImport::new(MockBlockImport, client.clone()); + + // Simulate the gap-sync scenario: a block arrives with StateAction::Skip + // because the parent state has been pruned. + let mut params = BlockImportParams::new(BlockOrigin::NetworkInitialSync, block.header); + params.body = Some(block.extrinsics); + params.state_action = StateAction::Skip; + params.import_existing = true; + + // Before the fix, this would fail with a consensus error because + // SlotBasedBlockImport would try to call `runtime_api.execute_block()` + // on the parent hash whose state is no longer available. + // + // After the fix, StateAction::Skip is recognized and the block is + // forwarded directly to the inner import without execution. + let result = slot_based_import.import_block(params).await; + assert!(result.is_ok(), "Gap-sync block with StateAction::Skip must not fail: {result:?}"); + + // The channel must be empty — execution should have been skipped entirely, + // so no (block, proof) was sent. This is the key assertion: without the + // StateAction::Skip guard, execute_block() would run and send a message. + // + // Drop the sender side so the channel closes, then verify no message was queued. + drop(slot_based_import); + assert!( + handle.receiver.next().await.is_none(), + "No block+proof should be sent through the channel for StateAction::Skip" + ); + } + + /// Verify that `StateAction::Execute` still triggers runtime execution. + /// + /// This complements the gap-sync regression test by ensuring we did not + /// accidentally disable execution for normal blocks. + #[tokio::test] + async fn normal_block_with_execute_action_triggers_runtime_execution() { + sp_tracing::try_init_simple(); + + let client = Arc::new(TestClientBuilder::new().build()); + + let sproof = sproof_with_best_parent(&client); + let block_builder_data = client.init_block_builder(None, sproof); + let block = block_builder_data.block_builder.build().unwrap().block; + + let (slot_based_import, mut handle) = + SlotBasedBlockImport::new(MockBlockImport, client.clone()); + + // Normal import with StateAction::Execute should trigger execution + // and send the block + proof through the channel. + let mut params = + BlockImportParams::new(BlockOrigin::NetworkInitialSync, block.header.clone()); + params.body = Some(block.extrinsics.clone()); + params.state_action = StateAction::Execute; + + let result = slot_based_import.import_block(params).await; + assert!(result.is_ok(), "Normal block import should succeed: {result:?}"); + + // The block and proof should have been sent through the channel, + // confirming that execution actually happened. + let (received_block, _proof) = handle.next().await; + assert_eq!(*received_block.header(), block.header); + } +} diff --git a/prdoc/pr_11330.prdoc b/prdoc/pr_11330.prdoc new file mode 100644 index 0000000000000..244571edd700d --- /dev/null +++ b/prdoc/pr_11330.prdoc @@ -0,0 +1,10 @@ +title: 'aura/import: Skip block execution when collators have no parent block state' +doc: +- audience: Node Dev + description: "Skip block execution in `SlotBasedBlockImport` when gap-sync marks + blocks with `StateAction::Skip`. This fixes an infinite retry loop where non-archive + collators failed to import gap-sync blocks because the parent state was already + pruned, causing repeated sync restarts and eventual peer bans." +crates: +- name: cumulus-client-consensus-aura + bump: patch