From 1dd1358e571408ffd23c5ce91cdb9d2c862b0672 Mon Sep 17 00:00:00 2001 From: Alexandru Vasile Date: Fri, 20 Mar 2026 12:03:42 +0000 Subject: [PATCH 01/13] aura/slot_based: Allow building blocks in the effective parachain slots Signed-off-by: Alexandru Vasile --- .../slot_based/block_builder_task.rs | 7 +++-- .../src/collators/slot_based/slot_timer.rs | 28 +++++++++++++++---- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs index 7cfde76a7921e..9888493dee451 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs @@ -421,8 +421,11 @@ where validation_data.max_pov_size * 85 / 100 } as usize; - let adjusted_authoring_duration = - slot_timer.adjust_authoring_duration(authoring_duration); + let adjusted_authoring_duration = slot_timer.adjust_authoring_duration( + authoring_duration, + para_slot.slot, + relay_parent_offset, + ); tracing::debug!(target: crate::LOG_TARGET, duration = ?adjusted_authoring_duration, "Adjusted proposal duration."); let Some(adjusted_authoring_duration) = adjusted_authoring_duration else { diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs index 5686a7b68b4e0..80c3c6ea0f178 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs @@ -285,12 +285,13 @@ where fn time_until_next_slot_change( &mut self, slot_duration: SlotDuration, + effective_slot: Slot, ) -> Option<(Duration, Slot)> { compute_time_until_next_slot_change( slot_duration, duration_now(), self.time_offset, - self.last_reported_slot.unwrap_or_default(), + effective_slot, ) } @@ -322,14 +323,29 @@ where /// Adjust the authoring duration to fit into the slot timing. /// /// Returns the adjusted authoring duration and the slot that it corresponds to. - pub fn adjust_authoring_duration(&mut self, authoring_duration: Duration) -> Option { + pub fn adjust_authoring_duration( + &mut self, + authoring_duration: Duration, + para_slot: Slot, + relay_parent_offset: u32, + ) -> Option { let Ok(slot_duration) = crate::slot_duration(&*self.client) else { tracing::error!(target: LOG_TARGET, "Failed to fetch slot duration from runtime."); return None; }; let next_block = self.time_until_next_block(slot_duration); - let Some(next_slot_change) = self.time_until_next_slot_change(slot_duration) else { + + // The effective parachain slot must count for the relay parent offset as well. + // + // For offset = 1, we build on RP n - 1 when RP n arrives. + // Assume the RP n - 1 has a slot of 803, the wall clock detects 804 (aura slot) + // then our deadline is effectively in slot 805. + let effective_slot = para_slot + Slot::from(relay_parent_offset as u64); + + let Some(next_slot_change) = + self.time_until_next_slot_change(slot_duration, effective_slot) + else { tracing::error!( target: LOG_TARGET, "Failed to compute time until next slot change. Using unadjusted authoring duration." @@ -337,9 +353,9 @@ where return Some(authoring_duration); }; - // Check if authors at current and next slots are different - let current_slot = self.last_reported_slot.unwrap_or(next_block.1); - let different_authors = self.check_different_slot_authors(current_slot, next_slot_change.1); + // Check if authors at current effective slot and next slots are different + let different_authors = + self.check_different_slot_authors(effective_slot, next_slot_change.1); adjust_authoring_duration( authoring_duration, From 35ffc0d03af6637290ebc115ef7da2028c82b784 Mon Sep 17 00:00:00 2001 From: Alexandru Vasile Date: Fri, 20 Mar 2026 12:12:13 +0000 Subject: [PATCH 02/13] aura/slot_based: Use saturating add for overflows Signed-off-by: Alexandru Vasile --- .../consensus/aura/src/collators/slot_based/slot_timer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs index 80c3c6ea0f178..d39d48c000545 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs @@ -341,7 +341,7 @@ where // For offset = 1, we build on RP n - 1 when RP n arrives. // Assume the RP n - 1 has a slot of 803, the wall clock detects 804 (aura slot) // then our deadline is effectively in slot 805. - let effective_slot = para_slot + Slot::from(relay_parent_offset as u64); + let effective_slot = para_slot.saturating_add(Slot::from(relay_parent_offset as u64)); let Some(next_slot_change) = self.time_until_next_slot_change(slot_duration, effective_slot) From 3c730de7aa64538f024d8320b548b9339e906e24 Mon Sep 17 00:00:00 2001 From: Alexandru Vasile Date: Fri, 20 Mar 2026 12:16:23 +0000 Subject: [PATCH 03/13] aura/slot_based: Log effective slot for debuggability Signed-off-by: Alexandru Vasile --- .../consensus/aura/src/collators/slot_based/slot_timer.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs index d39d48c000545..f5caeda3b33dc 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs @@ -154,6 +154,7 @@ fn adjust_authoring_duration( next_block: (Duration, Slot), next_slot_change: (Duration, Slot), different_authors: bool, + effective_slot: Slot, ) -> Option { let (duration, next_block_slot) = next_block; let (duration_until_next_slot, next_slot) = next_slot_change; @@ -168,6 +169,7 @@ fn adjust_authoring_duration( ?next_block_slot, ?duration_until_next_slot, ?next_slot, + ?effective_slot, ?duration_until_deadline, ?different_authors, "Adjusting authoring duration for slot.", @@ -362,6 +364,7 @@ where next_block, next_slot_change, different_authors, + effective_slot, ) } From a963b3901fdace13f85078cccfb10e988031f16d Mon Sep 17 00:00:00 2001 From: Alexandru Vasile Date: Fri, 20 Mar 2026 12:20:06 +0000 Subject: [PATCH 04/13] aura/slot_based: Detect stale effective slots on different authors Signed-off-by: Alexandru Vasile --- .../aura/src/collators/slot_based/slot_timer.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs index f5caeda3b33dc..0e10a13024440 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs @@ -355,6 +355,16 @@ where return Some(authoring_duration); }; + if next_slot_change.0 == Duration::ZERO { + tracing::warn!( + target: LOG_TARGET, + ?effective_slot, + ?next_slot_change, + ?relay_parent_offset, + "Effective slot is already past. Relay parent may be stale." + ); + } + // Check if authors at current effective slot and next slots are different let different_authors = self.check_different_slot_authors(effective_slot, next_slot_change.1); From e9fdc0bb91e554981c4b1d0a86a3c308c5ba04ef Mon Sep 17 00:00:00 2001 From: Alexandru Vasile Date: Fri, 20 Mar 2026 13:00:36 +0000 Subject: [PATCH 05/13] aura/slot_based: Add tests Signed-off-by: Alexandru Vasile --- .../src/collators/slot_based/slot_timer.rs | 405 ++++++++++++++++++ 1 file changed, 405 insertions(+) diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs index 0e10a13024440..f08c1ec0e766e 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs @@ -553,6 +553,7 @@ mod tests { (Duration::from_millis(2000), Slot::from(1)), // Next block (Duration::from_millis(4000), Slot::from(2)), // Next slot change true, // Different authors + Slot::from(1), // Effective slot Some(Duration::from_millis(2000)), // Expected )] #[case::blocks_2s_closer_next_slot( @@ -560,6 +561,7 @@ mod tests { (Duration::from_millis(1950), Slot::from(1)), // Next block (Duration::from_millis(4000), Slot::from(2)), // Next slot change true, // Different authors + Slot::from(1), // Effective slot Some(Duration::from_millis(1950)), // Expected )] #[case::blocks_2s_closer_next_slot_bigger( @@ -567,6 +569,7 @@ mod tests { (Duration::from_millis(1500), Slot::from(1)), // Next block (Duration::from_millis(4000), Slot::from(2)), // Next slot change true, // Different authors + Slot::from(1), // Effective slot Some(Duration::from_millis(1500)), // Expected )] #[case::blocks_2s_reduce_by_1s( @@ -574,6 +577,7 @@ mod tests { (Duration::from_millis(2000), Slot::from(1)), // Next block (Duration::from_millis(2000), Slot::from(2)), // Next slot change true, // Different authors + Slot::from(1), // Effective slot Some(Duration::from_millis(1000)), // Expected )] #[case::blocks_2s_reduce_by_1s_plus_offset( @@ -581,6 +585,7 @@ mod tests { (Duration::from_millis(1950), Slot::from(1)), // Next block (Duration::from_millis(1950), Slot::from(2)), // Next slot change true, // Different authors + Slot::from(1), // Effective slot Some(Duration::from_millis(950)), // Expected )] #[case::blocks_2s_reduce_to_minimum( @@ -588,6 +593,7 @@ mod tests { (Duration::from_millis(1400), Slot::from(1)), // Next block (Duration::from_millis(1400), Slot::from(2)), // Next slot change true, // Different authors + Slot::from(1), // Effective slot Some(Duration::from_millis(400)), // Expected )] #[case::blocks_2s_reduce_below_minimum( @@ -595,6 +601,7 @@ mod tests { (Duration::from_millis(1300), Slot::from(1)), // Next block (Duration::from_millis(1300), Slot::from(2)), // Next slot change true, // Different authors + Slot::from(1), // Effective slot None, // Expected to reduce below minimum )] #[case::blocks_2s_same_author( @@ -602,6 +609,7 @@ mod tests { (Duration::from_millis(1400), Slot::from(1)), // Next block (Duration::from_millis(1400), Slot::from(2)), // Next slot change false, // Different authors + Slot::from(1), // Effective slot Some(Duration::from_millis(1400)), // Expected no adjustment for last second. )] // Various scenarios for 500ms block production adjustment. @@ -610,6 +618,7 @@ mod tests { (Duration::from_millis(500), Slot::from(1)), // Next block (Duration::from_millis(2000), Slot::from(2)), // Next slot change true, // Different authors + Slot::from(1), // Effective slot Some(Duration::from_millis(500)), // Expected )] #[case::blocks_500ms_closer_next_slot( @@ -617,6 +626,7 @@ mod tests { (Duration::from_millis(450), Slot::from(1)), // Next block (Duration::from_millis(2000), Slot::from(2)), // Next slot change true, // Different authors + Slot::from(1), // Effective slot Some(Duration::from_millis(450)), // Expected )] #[case::blocks_500ms_closer_next_slot_bigger( @@ -624,6 +634,7 @@ mod tests { (Duration::from_millis(400), Slot::from(1)), // Next block (Duration::from_millis(1500), Slot::from(2)), // Next slot change true, // Different authors + Slot::from(1), // Effective slot Some(Duration::from_millis(400)), // Expected )] #[case::blocks_500ms_reduce_by_1s( @@ -631,6 +642,7 @@ mod tests { (Duration::from_millis(500), Slot::from(1)), // Next block (Duration::from_millis(1000), Slot::from(2)), // Next slot change true, // Different authors + Slot::from(1), // Effective slot None, // Expected )] #[case::blocks_500ms_reduce_by_1s_closer( @@ -638,6 +650,7 @@ mod tests { (Duration::from_millis(500), Slot::from(1)), // Next block (Duration::from_millis(500), Slot::from(2)), // Next slot change true, // Different authors + Slot::from(1), // Effective slot None, // Expected )] // If we are producing with 1 collator for 500ms authoring duration, @@ -647,6 +660,7 @@ mod tests { (Duration::from_millis(410), Slot::from(1)), // Next block (Duration::from_millis(1000), Slot::from(2)), // Next slot change false, // Different authors + Slot::from(1), // Effective slot Some(Duration::from_millis(410)), // Expected no adjustment for last second. )] #[case::blocks_500ms_same_author_closer( @@ -654,6 +668,7 @@ mod tests { (Duration::from_millis(400), Slot::from(1)), // Next block (Duration::from_millis(400), Slot::from(2)), // Next slot change false, // Different authors + Slot::from(1), // Effective slot Some(Duration::from_millis(400)), // Expected no adjustment for last second. )] fn test_adjust_authoring_duration( @@ -661,6 +676,7 @@ mod tests { #[case] next_block: (Duration, Slot), #[case] next_slot_change: (Duration, Slot), #[case] different_authors: bool, + #[case] effective_slot: Slot, #[case] expected: Option, ) { sp_tracing::init_for_tests(); @@ -670,8 +686,397 @@ mod tests { next_block, next_slot_change, different_authors, + effective_slot, ); tracing::debug!("Adjusted authoring duration: {:?}", result); assert_eq!(result, expected); } + + /// Tests the combined behavior of `relay_parent_offset` on effective slot computation + /// and authoring duration adjustment. + /// + /// This simulates what `SlotTimer::adjust_authoring_duration()` does internally: + /// 1. Compute `effective_slot = para_slot + relay_parent_offset` + /// 2. Compute time until next slot change using `effective_slot` + /// 3. Call `adjust_authoring_duration` with the results + #[rstest] + // relay_parent_offset = 0: normal behavior, block fits comfortably in slot. + // Wall clock at 31000ms (slot 5), effective_slot=5. + // Next slot change: 36000-31000 = 5000ms, deadline = 4000ms. + // Authoring duration 2000ms < 4000ms deadline → produce. + #[case::offset_0_normal( + 6000, // para_slot_duration_ms + 31000, // time_now_ms + Slot::from(5), // para_slot + 0u32, // relay_parent_offset + Duration::from_millis(2000), // authoring_duration + (Duration::from_millis(2000), Slot::from(6)), // next_block + true, // different_authors + Some(Duration::from_millis(2000)), + )] + // relay_parent_offset = 1, fresh relay parent: extended deadline. + // Wall clock at 31000ms, effective_slot=6. + // Next slot change: 42000-31000 = 11000ms, deadline = 10000ms. + // 2000ms < 10000ms → produce. + #[case::offset_1_fresh( + 6000, + 31000, + Slot::from(5), + 1u32, + Duration::from_millis(2000), + (Duration::from_millis(2000), Slot::from(6)), + true, + Some(Duration::from_millis(2000)), + )] + // relay_parent_offset = 1 with wall clock near end of current slot. + // Wall clock at 35500ms (near slot 5 boundary at 36000ms), effective_slot=6. + // Next slot change: 42000-35500 = 6500ms, deadline = 5500ms. + // 2000ms < 5500ms → produce. + #[case::offset_1_near_boundary( + 6000, + 35500, + Slot::from(5), + 1u32, + Duration::from_millis(2000), + (Duration::from_millis(2000), Slot::from(6)), + true, + Some(Duration::from_millis(2000)), + )] + // relay_parent_offset = 0 at slot boundary → stale effective slot, different authors → skip. + // Wall clock at 37000ms (past slot 5 boundary), effective_slot=5. + // Next slot change: 36000-37000 = 0 (saturating), deadline = 0. + // Different authors and deadline=0 → None. + #[case::offset_0_stale_different_authors( + 6000, + 37000, + Slot::from(5), + 0u32, + Duration::from_millis(2000), + (Duration::from_millis(2000), Slot::from(7)), + true, + None, + )] + // relay_parent_offset = 1 rescues a stale relay parent. + // Wall clock at 37000ms, para_slot=5, effective_slot=6. + // Next slot change: 42000-37000 = 5000ms, deadline = 4000ms. + // 2000ms < 4000ms → produce (offset=1 saved us from skipping!). + #[case::offset_1_rescues_stale( + 6000, + 37000, + Slot::from(5), + 1u32, + Duration::from_millis(2000), + (Duration::from_millis(2000), Slot::from(7)), + true, + Some(Duration::from_millis(2000)), + )] + // relay_parent_offset = 1 but relay parent is very stale (2+ slots behind). + // Wall clock at 43000ms (slot 7), para_slot=5, effective_slot=6. + // Next slot change: 42000-43000 = 0 (saturating), deadline = 0. + // Different authors and deadline=0 → None. + #[case::offset_1_very_stale( + 6000, + 43000, + Slot::from(5), + 1u32, + Duration::from_millis(2000), + (Duration::from_millis(2000), Slot::from(8)), + true, + None, + )] + // relay_parent_offset = 0, stale, but same author → still produce. + // Wall clock at 37000ms, effective_slot=5, next slot change = 0ms. + // Same author → return Some(min(authoring, next_block_duration)). + #[case::offset_0_stale_same_author( + 6000, + 37000, + Slot::from(5), + 0u32, + Duration::from_millis(2000), + (Duration::from_millis(2000), Slot::from(7)), + false, + Some(Duration::from_millis(2000)), + )] + // Large relay_parent_offset = 5, plenty of room. + // Wall clock at 31000ms, effective_slot=10. + // Next slot change: 66000-31000 = 35000ms, deadline = 34000ms. + // 2000ms < 34000ms → produce. + #[case::offset_5_large( + 6000, + 31000, + Slot::from(5), + 5u32, + Duration::from_millis(2000), + (Duration::from_millis(2000), Slot::from(6)), + true, + Some(Duration::from_millis(2000)), + )] + // relay_parent_offset = 0, near deadline → authoring clamped to deadline. + // Wall clock at 34500ms, effective_slot=5. + // Next slot change: 36000-34500 = 1500ms, deadline = 500ms. + // 2000ms > 500ms → clamped to 500ms. 500ms >= 400ms (min-threshold) → produce. + #[case::offset_0_clamped_to_minimum( + 6000, + 34500, + Slot::from(5), + 0u32, + Duration::from_millis(2000), + (Duration::from_millis(2000), Slot::from(6)), + true, + Some(Duration::from_millis(500)), + )] + // relay_parent_offset = 0, too close to deadline → below minimum threshold. + // Wall clock at 34800ms, effective_slot=5. + // Next slot change: 36000-34800 = 1200ms, deadline = 200ms. + // 2000ms > 200ms → clamped to 200ms. 200ms < 400ms (min-threshold) → None. + #[case::offset_0_below_minimum( + 6000, + 34800, + Slot::from(5), + 0u32, + Duration::from_millis(2000), + (Duration::from_millis(2000), Slot::from(6)), + true, + None, + )] + // relay_parent_offset = 1 rescues from below-minimum case. + // Same wall clock at 34800ms, but effective_slot=6. + // Next slot change: 42000-34800 = 7200ms, deadline = 6200ms. + // 2000ms < 6200ms → produce. + #[case::offset_1_rescues_below_minimum( + 6000, + 34800, + Slot::from(5), + 1u32, + Duration::from_millis(2000), + (Duration::from_millis(2000), Slot::from(6)), + true, + Some(Duration::from_millis(2000)), + )] + fn test_adjust_authoring_duration_with_relay_offset( + #[case] para_slot_duration_ms: u64, + #[case] time_now_ms: u64, + #[case] para_slot: Slot, + #[case] relay_parent_offset: u32, + #[case] authoring_duration: Duration, + #[case] next_block: (Duration, Slot), + #[case] different_authors: bool, + #[case] expected: Option, + ) { + sp_tracing::init_for_tests(); + + let slot_duration = SlotDuration::from_millis(para_slot_duration_ms); + let time_now = Duration::from_millis(time_now_ms); + let time_offset = Duration::ZERO; + + // Step 1: Compute effective slot (mirrors SlotTimer::adjust_authoring_duration). + let effective_slot = para_slot.saturating_add(Slot::from(relay_parent_offset as u64)); + + // Step 2: Compute time until next slot change using effective_slot. + let next_slot_change = compute_time_until_next_slot_change( + slot_duration, + time_now, + time_offset, + effective_slot, + ) + .expect("Should compute next slot change"); + + // Step 3: Call adjust_authoring_duration with computed values. + let result = adjust_authoring_duration( + authoring_duration, + next_block, + next_slot_change, + different_authors, + effective_slot, + ); + + tracing::debug!( + ?effective_slot, + ?next_slot_change, + ?relay_parent_offset, + ?result, + "Test result for relay_parent_offset scenario" + ); + assert_eq!(result, expected); + } + + /// Regression test for the "wrongful skip near slot boundary" bug. + /// + /// Reproduces a real-world scenario where the 1-second buffer + /// (`BLOCK_PRODUCTION_ADJUSTMENT_MS`) caused incorrect decisions: + /// + /// Attempt 1 — Wrongful SKIP (offset=0): + /// para_slot=803, wall_clock near end of slot 803 (491ms until slot 804). + /// deadline = 491ms - 1000ms = 0 → skip, even though we're still in our slot. + /// + /// Attempt 1 — Correct APPROVE (offset=1): + /// effective_slot=804, deadline based on slot 805 boundary (6491ms away). + /// deadline = 5491ms → plenty of room to produce. + /// + /// Attempt 3 — Wrongful APPROVE (offset=0, using wall clock slot): + /// para_slot=803 (stale), wall_clock=805, deadline based on slot 806 (5990ms). + /// Would approve despite para_slot being 2 slots behind. + /// + /// Attempt 3 — Correct SKIP (offset=1): + /// effective_slot=804, wall_clock past slot 805 boundary → remaining=0. + /// deadline = 0, different authors → skip. + #[test] + fn test_wrongful_skip_near_slot_boundary() { + sp_tracing::init_for_tests(); + + let slot_duration = SlotDuration::from_millis(6000); + let para_slot = Slot::from(803); + // Slot 803 = [4_818_000ms, 4_824_000ms) + // Slot 804 = [4_824_000ms, 4_830_000ms) + // Slot 805 = [4_830_000ms, 4_836_000ms) + + // --- Attempt 1: wall clock at 4_823_509ms (491ms before slot 804) --- + let time_now_attempt1 = Duration::from_millis(4_823_509); + let next_block = (Duration::from_millis(500), Slot::from(804)); + + // With offset=0: effective_slot=803, next_slot=804 at 4_824_000ms. + // remaining = 491ms, deadline = 0 → wrongful SKIP. + { + let effective_slot = para_slot; // offset=0 + let next_slot_change = compute_time_until_next_slot_change( + slot_duration, + time_now_attempt1, + Duration::ZERO, + effective_slot, + ) + .unwrap(); + assert_eq!(next_slot_change.0.as_millis(), 491); + + let result = adjust_authoring_duration( + Duration::from_millis(500), + next_block, + next_slot_change, + true, // different authors + effective_slot, + ); + // Deadline = 491 - 1000 = 0 → skip (the wrongful behavior without offset). + assert_eq!(result, None, "offset=0 near boundary: wrongful skip"); + } + + // With offset=1: effective_slot=804, next_slot=805 at 4_830_000ms. + // remaining = 6491ms, deadline = 5491ms → correct APPROVE. + { + let effective_slot = para_slot.saturating_add(Slot::from(1u64)); // offset=1 + let next_slot_change = compute_time_until_next_slot_change( + slot_duration, + time_now_attempt1, + Duration::ZERO, + effective_slot, + ) + .unwrap(); + assert_eq!(next_slot_change.0.as_millis(), 6491); + + let result = adjust_authoring_duration( + Duration::from_millis(500), + next_block, + next_slot_change, + true, // different authors + effective_slot, + ); + assert_eq!( + result, + Some(Duration::from_millis(500)), + "offset=1 near boundary: correctly approved" + ); + } + + // --- Attempt 3: wall clock at 4_831_000ms (in slot 805, para_slot=803 is stale) --- + let time_now_attempt3 = Duration::from_millis(4_831_000); + let next_block_attempt3 = (Duration::from_millis(500), Slot::from(806)); + + // With offset=0: effective_slot=803, next_slot=804 at 4_824_000ms. + // remaining = 0 (stale), deadline = 0 → correct SKIP. + { + let effective_slot = para_slot; // offset=0 + let next_slot_change = compute_time_until_next_slot_change( + slot_duration, + time_now_attempt3, + Duration::ZERO, + effective_slot, + ) + .unwrap(); + assert_eq!(next_slot_change.0, Duration::ZERO); + + let result = adjust_authoring_duration( + Duration::from_millis(500), + next_block_attempt3, + next_slot_change, + true, + effective_slot, + ); + assert_eq!(result, None, "offset=0 stale: correctly skipped"); + } + + // With offset=1: effective_slot=804, next_slot=805 at 4_830_000ms. + // remaining = 0 (stale, wall clock past 805) → correct SKIP. + { + let effective_slot = para_slot.saturating_add(Slot::from(1u64)); // offset=1 + let next_slot_change = compute_time_until_next_slot_change( + slot_duration, + time_now_attempt3, + Duration::ZERO, + effective_slot, + ) + .unwrap(); + assert_eq!(next_slot_change.0, Duration::ZERO); + + let result = adjust_authoring_duration( + Duration::from_millis(500), + next_block_attempt3, + next_slot_change, + true, + effective_slot, + ); + assert_eq!(result, None, "offset=1 stale: correctly skipped"); + } + } + + /// Test that `compute_time_until_next_slot_change` returns zero remaining time + /// when the effective slot is already past (stale relay parent scenario). + #[rstest] + // Effective slot is current → positive remaining time. + #[case::current_slot(6000, 31000, Slot::from(5), 5000)] + // Effective slot is one ahead → even more remaining time. + #[case::one_ahead(6000, 31000, Slot::from(6), 11000)] + // Effective slot is behind wall clock → zero remaining time (stale). + #[case::stale_behind(6000, 37000, Slot::from(5), 0)] + // Effective slot is far behind → zero remaining time. + #[case::very_stale(6000, 50000, Slot::from(5), 0)] + // Effective slot exactly at boundary → remaining time for next full slot. + #[case::at_boundary(6000, 36000, Slot::from(5), 0)] + fn test_stale_effective_slot_detection( + #[case] para_slot_millis: u64, + #[case] time_now_ms: u64, + #[case] effective_slot: Slot, + #[case] expected_remaining_ms: u128, + ) { + let slot_duration = SlotDuration::from_millis(para_slot_millis); + let time_now = Duration::from_millis(time_now_ms); + + let result = compute_time_until_next_slot_change( + slot_duration, + time_now, + Duration::ZERO, + effective_slot, + ) + .expect("Should compute next slot change"); + + assert_eq!( + result.0.as_millis(), + expected_remaining_ms, + "Remaining time mismatch for effective_slot={:?} at time={}ms", + effective_slot, + time_now_ms, + ); + + // Verify stale detection: Duration::ZERO means the effective slot is past. + if expected_remaining_ms == 0 { + assert_eq!(result.0, Duration::ZERO, "Stale effective slot should have zero remaining"); + } + } } From 68067d40510b8b9a9dd3e76000a9c8d8852c9692 Mon Sep 17 00:00:00 2001 From: Alexandru Vasile Date: Fri, 20 Mar 2026 14:07:13 +0000 Subject: [PATCH 06/13] aura/slot_based: Adjust the limits based on kusama data Signed-off-by: Alexandru Vasile --- .../src/collators/slot_based/slot_timer.rs | 34 +++++-------------- 1 file changed, 8 insertions(+), 26 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs index f08c1ec0e766e..562bcc3b9b82e 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs @@ -37,20 +37,6 @@ use std::{ /// Defensive mechanism, corresponds to 12 cores at 6 second block time. const BLOCK_PRODUCTION_MINIMUM_INTERVAL_MS: Duration = Duration::from_millis(500); -/// Theoretically, the block production is capped at `BLOCK_PRODUCTION_MINIMUM_INTERVAL_MS`. -/// In practice, there might be slight deviations due to timing inaccuracies and delays. -/// -/// This constant is taken into account while adjusting the authoring duration to fit into the slot. -/// Therefore, it will only reduce the authoring duration if we are within the -/// `BLOCK_PRODUCTION_ADJUSTMENT_MS` threshold of the next slot. -/// -/// ### 12 cores 500ms blocks -/// -/// For example, for 12 cores 500ms blocks: the next slot is scheduled in 490ms due to delays. -/// In that case, we still want to attempt producing the block, as missing the slot would be worse -/// than producing slightly too fast. -const BLOCK_PRODUCTION_THRESHOLD_MS: Duration = Duration::from_millis(100); - /// The amount of time the authoring duration of the last block production attempt /// should be reduced by to fit into the slot timing. const BLOCK_PRODUCTION_ADJUSTMENT_MS: Duration = Duration::from_millis(1000); @@ -199,16 +185,12 @@ fn adjust_authoring_duration( if different_authors && authoring_duration >= duration_until_deadline { authoring_duration = duration_until_deadline; - // Ensure we are not going below the minimum interval within a reasonable threshold. - // For 12 cores, we might have a scenario where the last 3 blocks are skipped: - // - Block 10: next slot change in 1.493s: - // - After adjusting the deadline: 1.493s - 1s = 0.493s the block could be produced - // without issues. - // - Block 11: next slot change in 0.993s - skipped by the deadline - // - Block 12: next slot change in 0.493s - skipped by the deadline - if authoring_duration < - BLOCK_PRODUCTION_MINIMUM_INTERVAL_MS.saturating_sub(BLOCK_PRODUCTION_THRESHOLD_MS) - { + // Ensure we are not going below the minimum block production interval. + // For 12+ cores at 500ms intervals with 1s buffer, the last 2 blocks + // in the slot are skipped: + // - Block 11: next slot change in 1.493s, deadline = 0.493s < 500ms → skip + // - Block 12: next slot change in 0.993s, deadline = 0 → skip + if authoring_duration < BLOCK_PRODUCTION_MINIMUM_INTERVAL_MS { tracing::debug!( target: LOG_TARGET, ?authoring_duration, @@ -588,13 +570,13 @@ mod tests { Slot::from(1), // Effective slot Some(Duration::from_millis(950)), // Expected )] - #[case::blocks_2s_reduce_to_minimum( + #[case::blocks_2s_reduce_below_minimum_at_400ms( Duration::from_millis(2000), // Authoring duration (Duration::from_millis(1400), Slot::from(1)), // Next block (Duration::from_millis(1400), Slot::from(2)), // Next slot change true, // Different authors Slot::from(1), // Effective slot - Some(Duration::from_millis(400)), // Expected + None, // Expected: deadline=400ms < 500ms minimum → skip )] #[case::blocks_2s_reduce_below_minimum( Duration::from_millis(2000), // Authoring duration From 30a70de56332c6923cac7e02700bedc5761d440f Mon Sep 17 00:00:00 2001 From: Alexandru Vasile Date: Mon, 23 Mar 2026 11:17:22 +0000 Subject: [PATCH 07/13] aura/slot_based: Revert the minimum duration Signed-off-by: Alexandru Vasile --- .../src/collators/slot_based/slot_timer.rs | 255 ++++++++++++++++-- 1 file changed, 236 insertions(+), 19 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs index 562bcc3b9b82e..f06053440e9f9 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs @@ -37,6 +37,20 @@ use std::{ /// Defensive mechanism, corresponds to 12 cores at 6 second block time. const BLOCK_PRODUCTION_MINIMUM_INTERVAL_MS: Duration = Duration::from_millis(500); +/// Theoretically, the block production is capped at `BLOCK_PRODUCTION_MINIMUM_INTERVAL_MS`. +/// In practice, there might be slight deviations due to timing inaccuracies and delays. +/// +/// This constant is taken into account while adjusting the authoring duration to fit into the slot. +/// Therefore, it will only reduce the authoring duration if we are within the +/// `BLOCK_PRODUCTION_ADJUSTMENT_MS` threshold of the next slot. +/// +/// ### 12 cores 500ms blocks +/// +/// For example, for 12 cores 500ms blocks: the next slot is scheduled in 490ms due to delays. +/// In that case, we still want to attempt producing the block, as missing the slot would be worse +/// than producing slightly too fast. +const BLOCK_PRODUCTION_THRESHOLD_MS: Duration = Duration::from_millis(100); + /// The amount of time the authoring duration of the last block production attempt /// should be reduced by to fit into the slot timing. const BLOCK_PRODUCTION_ADJUSTMENT_MS: Duration = Duration::from_millis(1000); @@ -185,12 +199,16 @@ fn adjust_authoring_duration( if different_authors && authoring_duration >= duration_until_deadline { authoring_duration = duration_until_deadline; - // Ensure we are not going below the minimum block production interval. - // For 12+ cores at 500ms intervals with 1s buffer, the last 2 blocks - // in the slot are skipped: - // - Block 11: next slot change in 1.493s, deadline = 0.493s < 500ms → skip - // - Block 12: next slot change in 0.993s, deadline = 0 → skip - if authoring_duration < BLOCK_PRODUCTION_MINIMUM_INTERVAL_MS { + // Ensure we are not going below the minimum interval within a reasonable threshold. + // For 12 cores, we might have a scenario where the last 3 blocks are skipped: + // - Block 10: next slot change in 1.493s: + // - After adjusting the deadline: 1.493s - 1s = 0.493s the block could be produced + // without issues. + // - Block 11: next slot change in 0.993s - skipped by the deadline + // - Block 12: next slot change in 0.493s - skipped by the deadline + if authoring_duration < + BLOCK_PRODUCTION_MINIMUM_INTERVAL_MS.saturating_sub(BLOCK_PRODUCTION_THRESHOLD_MS) + { tracing::debug!( target: LOG_TARGET, ?authoring_duration, @@ -337,6 +355,12 @@ where return Some(authoring_duration); }; + // Cap the remaining time at the slot duration. Arriving early (before the + // effective slot boundary) does not extend the slot. We never have more + // than one slot's worth of time to prduce blocks (6 seconds). + let next_slot_change = + (next_slot_change.0.min(slot_duration.as_duration()), next_slot_change.1); + if next_slot_change.0 == Duration::ZERO { tracing::warn!( target: LOG_TARGET, @@ -570,13 +594,13 @@ mod tests { Slot::from(1), // Effective slot Some(Duration::from_millis(950)), // Expected )] - #[case::blocks_2s_reduce_below_minimum_at_400ms( + #[case::blocks_2s_reduce_to_minimum( Duration::from_millis(2000), // Authoring duration (Duration::from_millis(1400), Slot::from(1)), // Next block (Duration::from_millis(1400), Slot::from(2)), // Next slot change true, // Different authors Slot::from(1), // Effective slot - None, // Expected: deadline=400ms < 500ms minimum → skip + Some(Duration::from_millis(400)), // Expected )] #[case::blocks_2s_reduce_below_minimum( Duration::from_millis(2000), // Authoring duration @@ -696,10 +720,10 @@ mod tests { true, // different_authors Some(Duration::from_millis(2000)), )] - // relay_parent_offset = 1, fresh relay parent: extended deadline. + // relay_parent_offset = 1, fresh relay parent: capped to slot duration. // Wall clock at 31000ms, effective_slot=6. - // Next slot change: 42000-31000 = 11000ms, deadline = 10000ms. - // 2000ms < 10000ms → produce. + // Next slot change: 42000-31000 = 11000ms, capped to 6000ms, deadline = 5000ms. + // 2000ms < 5000ms → produce. #[case::offset_1_fresh( 6000, 31000, @@ -712,8 +736,8 @@ mod tests { )] // relay_parent_offset = 1 with wall clock near end of current slot. // Wall clock at 35500ms (near slot 5 boundary at 36000ms), effective_slot=6. - // Next slot change: 42000-35500 = 6500ms, deadline = 5500ms. - // 2000ms < 5500ms → produce. + // Next slot change: 42000-35500 = 6500ms, capped to 6000ms, deadline = 5000ms. + // 2000ms < 5000ms → produce. #[case::offset_1_near_boundary( 6000, 35500, @@ -779,10 +803,10 @@ mod tests { false, Some(Duration::from_millis(2000)), )] - // Large relay_parent_offset = 5, plenty of room. + // Large relay_parent_offset = 5, capped to slot duration. // Wall clock at 31000ms, effective_slot=10. - // Next slot change: 66000-31000 = 35000ms, deadline = 34000ms. - // 2000ms < 34000ms → produce. + // Next slot change: 66000-31000 = 35000ms, capped to 6000ms, deadline = 5000ms. + // 2000ms < 5000ms → produce. #[case::offset_5_large( 6000, 31000, @@ -823,8 +847,8 @@ mod tests { )] // relay_parent_offset = 1 rescues from below-minimum case. // Same wall clock at 34800ms, but effective_slot=6. - // Next slot change: 42000-34800 = 7200ms, deadline = 6200ms. - // 2000ms < 6200ms → produce. + // Next slot change: 42000-34800 = 7200ms, capped to 6000ms, deadline = 5000ms. + // 2000ms < 5000ms → produce. #[case::offset_1_rescues_below_minimum( 6000, 34800, @@ -863,6 +887,12 @@ mod tests { ) .expect("Should compute next slot change"); + // Step 2b: Cap remaining time at slot duration (mirrors SlotTimer). + let next_slot_change = ( + next_slot_change.0.min(slot_duration.as_duration()), + next_slot_change.1, + ); + // Step 3: Call adjust_authoring_duration with computed values. let result = adjust_authoring_duration( authoring_duration, @@ -941,7 +971,7 @@ mod tests { } // With offset=1: effective_slot=804, next_slot=805 at 4_830_000ms. - // remaining = 6491ms, deadline = 5491ms → correct APPROVE. + // remaining = 6491ms, capped to 6000ms, deadline = 5000ms → correct APPROVE. { let effective_slot = para_slot.saturating_add(Slot::from(1u64)); // offset=1 let next_slot_change = compute_time_until_next_slot_change( @@ -952,6 +982,12 @@ mod tests { ) .unwrap(); assert_eq!(next_slot_change.0.as_millis(), 6491); + // Cap at slot duration. + let next_slot_change = ( + next_slot_change.0.min(slot_duration.as_duration()), + next_slot_change.1, + ); + assert_eq!(next_slot_change.0.as_millis(), 6000); let result = adjust_authoring_duration( Duration::from_millis(500), @@ -1018,6 +1054,187 @@ mod tests { } } + /// Regression test reproducing the kusama yap-3392 log pattern. + /// + /// Pre-fix behavior (no effective_slot, old 400ms threshold): + /// - First attempt at each relay parent: wrongful SKIP (490ms remaining, 1s buffer → + /// deadline=0) + /// - 11th block produced with deadline=492ms (passes old 400ms threshold) + /// - 12th block correctly skipped (deadline=0) + /// + /// Post-fix behavior (effective_slot + 500ms threshold + slot duration cap): + /// - First attempt: APPROVE (effective_slot pushes deadline, capped to 6000ms) + /// - 10 blocks produced, last two correctly skipped (deadline < 500ms) + /// + /// Actual kusama log values: + /// slot_duration=6000ms, 13 cores, relay_parent_offset=1 + /// Slot 295637405 = para_slot from relay parent + /// First attempt: duration_until_next_slot=490.969359ms next_slot=Slot(295637406) + /// 11th block: duration_until_next_slot=1.492010569s deadline=492.010569ms + /// 12th attempt: duration_until_next_slot=991.923868ms deadline=0ns → skip + #[test] + fn test_kusama_yap_3392_block_production_pattern() { + sp_tracing::init_for_tests(); + + let slot_duration = SlotDuration::from_millis(6000); + // Slot 295637405 starts at 295637405 * 6000 = 1773824430000ms + // Slot 295637406 starts at 1773824436000ms + // Slot 295637407 starts at 1773824442000ms + let para_slot = Slot::from(295637405u64); + + // Wall clock ~490ms before slot 295637406. + // In the old code: last_reported_slot = 295637405, next_slot = 295637406, + // duration_until_next_slot ≈ 491ms, deadline = 0 → wrongful skip. + let time_first_attempt = Duration::from_millis(1_773_824_435_509); + let next_block = (Duration::from_millis(491), Slot::from(295637406u64)); + + // --- Pre-fix (offset=0): wrongful SKIP on first attempt --- + { + let effective_slot = para_slot; // no offset + let next_slot_change = compute_time_until_next_slot_change( + slot_duration, + time_first_attempt, + Duration::ZERO, + effective_slot, + ) + .unwrap(); + // ~491ms until slot 295637406 + assert_eq!(next_slot_change.0.as_millis(), 491); + assert_eq!(next_slot_change.1, Slot::from(295637406u64)); + + let result = adjust_authoring_duration( + Duration::from_millis(2000), + next_block, + next_slot_change, + true, + effective_slot, + ); + assert_eq!(result, None, "Pre-fix: wrongful skip on first attempt"); + } + + // --- Post-fix (offset=1): first attempt correctly approved --- + { + let effective_slot = para_slot.saturating_add(Slot::from(1u64)); // 295637406 + let next_slot_change = compute_time_until_next_slot_change( + slot_duration, + time_first_attempt, + Duration::ZERO, + effective_slot, + ) + .unwrap(); + // ~6491ms until slot 295637407 + assert_eq!(next_slot_change.0.as_millis(), 6491); + assert_eq!(next_slot_change.1, Slot::from(295637407u64)); + // Cap at slot duration. + let next_slot_change = ( + next_slot_change.0.min(slot_duration.as_duration()), + next_slot_change.1, + ); + assert_eq!(next_slot_change.0.as_millis(), 6000); + + let result = adjust_authoring_duration( + Duration::from_millis(2000), + next_block, + next_slot_change, + true, + effective_slot, + ); + assert_eq!( + result, + Some(Duration::from_millis(491)), + "Post-fix: first attempt approved" + ); + } + + // --- 11th block attempt (with offset=1): ~1492ms until next slot --- + // Wall clock advanced ~5s from first attempt. + let time_11th = Duration::from_millis(1_773_824_440_508); + let next_block_11th = (Duration::from_millis(493), Slot::from(295637406u64)); + { + let effective_slot = para_slot.saturating_add(Slot::from(1u64)); // 295637406 + let next_slot_change = compute_time_until_next_slot_change( + slot_duration, + time_11th, + Duration::ZERO, + effective_slot, + ) + .unwrap(); + // ~1492ms until slot 295637407 + assert_eq!(next_slot_change.0.as_millis(), 1492); + + let result = adjust_authoring_duration( + Duration::from_millis(2000), + next_block_11th, + next_slot_change, + true, + effective_slot, + ); + // deadline = 1492 - 1000 = 492ms >= 400ms (minimum - threshold) + // → produced with the 100ms headroom. + assert_eq!( + result, + Some(Duration::from_millis(492)), + "11th block: deadline 492ms >= 400ms threshold → produce" + ); + } + + // --- 10th block attempt (with offset=1): ~1992ms until next slot --- + let time_10th = Duration::from_millis(1_773_824_440_008); + let next_block_10th = (Duration::from_millis(493), Slot::from(295637406u64)); + { + let effective_slot = para_slot.saturating_add(Slot::from(1u64)); + let next_slot_change = compute_time_until_next_slot_change( + slot_duration, + time_10th, + Duration::ZERO, + effective_slot, + ) + .unwrap(); + // ~1992ms until slot 295637407 + assert_eq!(next_slot_change.0.as_millis(), 1992); + + let result = adjust_authoring_duration( + Duration::from_millis(2000), + next_block_10th, + next_slot_change, + true, + effective_slot, + ); + // deadline = 1992 - 1000 = 992ms >= 500ms → produce. + assert_eq!( + result, + Some(Duration::from_millis(493)), + "10th block: deadline 992ms >= 500ms → produce" + ); + } + + // --- 12th block attempt (with offset=1): ~992ms until next slot --- + let time_12th = Duration::from_millis(1_773_824_441_008); + let next_block_12th = (Duration::from_millis(492), Slot::from(295637406u64)); + { + let effective_slot = para_slot.saturating_add(Slot::from(1u64)); + let next_slot_change = compute_time_until_next_slot_change( + slot_duration, + time_12th, + Duration::ZERO, + effective_slot, + ) + .unwrap(); + // ~992ms until slot 295637407 + assert_eq!(next_slot_change.0.as_millis(), 992); + + let result = adjust_authoring_duration( + Duration::from_millis(2000), + next_block_12th, + next_slot_change, + true, + effective_slot, + ); + // deadline = 992 - 1000 = 0 → skip. + assert_eq!(result, None, "12th block: deadline 0 → skip"); + } + } + /// Test that `compute_time_until_next_slot_change` returns zero remaining time /// when the effective slot is already past (stale relay parent scenario). #[rstest] From fead047ae6bb4d141aac5cf1e7ae875ffca87604 Mon Sep 17 00:00:00 2001 From: Alexandru Vasile Date: Mon, 23 Mar 2026 13:12:58 +0000 Subject: [PATCH 08/13] Revert to origin master Signed-off-by: Alexandru Vasile --- .../slot_based/block_builder_task.rs | 7 +- .../src/collators/slot_based/slot_timer.rs | 645 +----------------- 2 files changed, 8 insertions(+), 644 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs index 9888493dee451..7cfde76a7921e 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs @@ -421,11 +421,8 @@ where validation_data.max_pov_size * 85 / 100 } as usize; - let adjusted_authoring_duration = slot_timer.adjust_authoring_duration( - authoring_duration, - para_slot.slot, - relay_parent_offset, - ); + let adjusted_authoring_duration = + slot_timer.adjust_authoring_duration(authoring_duration); tracing::debug!(target: crate::LOG_TARGET, duration = ?adjusted_authoring_duration, "Adjusted proposal duration."); let Some(adjusted_authoring_duration) = adjusted_authoring_duration else { diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs index f06053440e9f9..5686a7b68b4e0 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs @@ -154,7 +154,6 @@ fn adjust_authoring_duration( next_block: (Duration, Slot), next_slot_change: (Duration, Slot), different_authors: bool, - effective_slot: Slot, ) -> Option { let (duration, next_block_slot) = next_block; let (duration_until_next_slot, next_slot) = next_slot_change; @@ -169,7 +168,6 @@ fn adjust_authoring_duration( ?next_block_slot, ?duration_until_next_slot, ?next_slot, - ?effective_slot, ?duration_until_deadline, ?different_authors, "Adjusting authoring duration for slot.", @@ -287,13 +285,12 @@ where fn time_until_next_slot_change( &mut self, slot_duration: SlotDuration, - effective_slot: Slot, ) -> Option<(Duration, Slot)> { compute_time_until_next_slot_change( slot_duration, duration_now(), self.time_offset, - effective_slot, + self.last_reported_slot.unwrap_or_default(), ) } @@ -325,29 +322,14 @@ where /// Adjust the authoring duration to fit into the slot timing. /// /// Returns the adjusted authoring duration and the slot that it corresponds to. - pub fn adjust_authoring_duration( - &mut self, - authoring_duration: Duration, - para_slot: Slot, - relay_parent_offset: u32, - ) -> Option { + pub fn adjust_authoring_duration(&mut self, authoring_duration: Duration) -> Option { let Ok(slot_duration) = crate::slot_duration(&*self.client) else { tracing::error!(target: LOG_TARGET, "Failed to fetch slot duration from runtime."); return None; }; let next_block = self.time_until_next_block(slot_duration); - - // The effective parachain slot must count for the relay parent offset as well. - // - // For offset = 1, we build on RP n - 1 when RP n arrives. - // Assume the RP n - 1 has a slot of 803, the wall clock detects 804 (aura slot) - // then our deadline is effectively in slot 805. - let effective_slot = para_slot.saturating_add(Slot::from(relay_parent_offset as u64)); - - let Some(next_slot_change) = - self.time_until_next_slot_change(slot_duration, effective_slot) - else { + let Some(next_slot_change) = self.time_until_next_slot_change(slot_duration) else { tracing::error!( target: LOG_TARGET, "Failed to compute time until next slot change. Using unadjusted authoring duration." @@ -355,32 +337,15 @@ where return Some(authoring_duration); }; - // Cap the remaining time at the slot duration. Arriving early (before the - // effective slot boundary) does not extend the slot. We never have more - // than one slot's worth of time to prduce blocks (6 seconds). - let next_slot_change = - (next_slot_change.0.min(slot_duration.as_duration()), next_slot_change.1); - - if next_slot_change.0 == Duration::ZERO { - tracing::warn!( - target: LOG_TARGET, - ?effective_slot, - ?next_slot_change, - ?relay_parent_offset, - "Effective slot is already past. Relay parent may be stale." - ); - } - - // Check if authors at current effective slot and next slots are different - let different_authors = - self.check_different_slot_authors(effective_slot, next_slot_change.1); + // Check if authors at current and next slots are different + let current_slot = self.last_reported_slot.unwrap_or(next_block.1); + let different_authors = self.check_different_slot_authors(current_slot, next_slot_change.1); adjust_authoring_duration( authoring_duration, next_block, next_slot_change, different_authors, - effective_slot, ) } @@ -559,7 +524,6 @@ mod tests { (Duration::from_millis(2000), Slot::from(1)), // Next block (Duration::from_millis(4000), Slot::from(2)), // Next slot change true, // Different authors - Slot::from(1), // Effective slot Some(Duration::from_millis(2000)), // Expected )] #[case::blocks_2s_closer_next_slot( @@ -567,7 +531,6 @@ mod tests { (Duration::from_millis(1950), Slot::from(1)), // Next block (Duration::from_millis(4000), Slot::from(2)), // Next slot change true, // Different authors - Slot::from(1), // Effective slot Some(Duration::from_millis(1950)), // Expected )] #[case::blocks_2s_closer_next_slot_bigger( @@ -575,7 +538,6 @@ mod tests { (Duration::from_millis(1500), Slot::from(1)), // Next block (Duration::from_millis(4000), Slot::from(2)), // Next slot change true, // Different authors - Slot::from(1), // Effective slot Some(Duration::from_millis(1500)), // Expected )] #[case::blocks_2s_reduce_by_1s( @@ -583,7 +545,6 @@ mod tests { (Duration::from_millis(2000), Slot::from(1)), // Next block (Duration::from_millis(2000), Slot::from(2)), // Next slot change true, // Different authors - Slot::from(1), // Effective slot Some(Duration::from_millis(1000)), // Expected )] #[case::blocks_2s_reduce_by_1s_plus_offset( @@ -591,7 +552,6 @@ mod tests { (Duration::from_millis(1950), Slot::from(1)), // Next block (Duration::from_millis(1950), Slot::from(2)), // Next slot change true, // Different authors - Slot::from(1), // Effective slot Some(Duration::from_millis(950)), // Expected )] #[case::blocks_2s_reduce_to_minimum( @@ -599,7 +559,6 @@ mod tests { (Duration::from_millis(1400), Slot::from(1)), // Next block (Duration::from_millis(1400), Slot::from(2)), // Next slot change true, // Different authors - Slot::from(1), // Effective slot Some(Duration::from_millis(400)), // Expected )] #[case::blocks_2s_reduce_below_minimum( @@ -607,7 +566,6 @@ mod tests { (Duration::from_millis(1300), Slot::from(1)), // Next block (Duration::from_millis(1300), Slot::from(2)), // Next slot change true, // Different authors - Slot::from(1), // Effective slot None, // Expected to reduce below minimum )] #[case::blocks_2s_same_author( @@ -615,7 +573,6 @@ mod tests { (Duration::from_millis(1400), Slot::from(1)), // Next block (Duration::from_millis(1400), Slot::from(2)), // Next slot change false, // Different authors - Slot::from(1), // Effective slot Some(Duration::from_millis(1400)), // Expected no adjustment for last second. )] // Various scenarios for 500ms block production adjustment. @@ -624,7 +581,6 @@ mod tests { (Duration::from_millis(500), Slot::from(1)), // Next block (Duration::from_millis(2000), Slot::from(2)), // Next slot change true, // Different authors - Slot::from(1), // Effective slot Some(Duration::from_millis(500)), // Expected )] #[case::blocks_500ms_closer_next_slot( @@ -632,7 +588,6 @@ mod tests { (Duration::from_millis(450), Slot::from(1)), // Next block (Duration::from_millis(2000), Slot::from(2)), // Next slot change true, // Different authors - Slot::from(1), // Effective slot Some(Duration::from_millis(450)), // Expected )] #[case::blocks_500ms_closer_next_slot_bigger( @@ -640,7 +595,6 @@ mod tests { (Duration::from_millis(400), Slot::from(1)), // Next block (Duration::from_millis(1500), Slot::from(2)), // Next slot change true, // Different authors - Slot::from(1), // Effective slot Some(Duration::from_millis(400)), // Expected )] #[case::blocks_500ms_reduce_by_1s( @@ -648,7 +602,6 @@ mod tests { (Duration::from_millis(500), Slot::from(1)), // Next block (Duration::from_millis(1000), Slot::from(2)), // Next slot change true, // Different authors - Slot::from(1), // Effective slot None, // Expected )] #[case::blocks_500ms_reduce_by_1s_closer( @@ -656,7 +609,6 @@ mod tests { (Duration::from_millis(500), Slot::from(1)), // Next block (Duration::from_millis(500), Slot::from(2)), // Next slot change true, // Different authors - Slot::from(1), // Effective slot None, // Expected )] // If we are producing with 1 collator for 500ms authoring duration, @@ -666,7 +618,6 @@ mod tests { (Duration::from_millis(410), Slot::from(1)), // Next block (Duration::from_millis(1000), Slot::from(2)), // Next slot change false, // Different authors - Slot::from(1), // Effective slot Some(Duration::from_millis(410)), // Expected no adjustment for last second. )] #[case::blocks_500ms_same_author_closer( @@ -674,7 +625,6 @@ mod tests { (Duration::from_millis(400), Slot::from(1)), // Next block (Duration::from_millis(400), Slot::from(2)), // Next slot change false, // Different authors - Slot::from(1), // Effective slot Some(Duration::from_millis(400)), // Expected no adjustment for last second. )] fn test_adjust_authoring_duration( @@ -682,7 +632,6 @@ mod tests { #[case] next_block: (Duration, Slot), #[case] next_slot_change: (Duration, Slot), #[case] different_authors: bool, - #[case] effective_slot: Slot, #[case] expected: Option, ) { sp_tracing::init_for_tests(); @@ -692,590 +641,8 @@ mod tests { next_block, next_slot_change, different_authors, - effective_slot, ); tracing::debug!("Adjusted authoring duration: {:?}", result); assert_eq!(result, expected); } - - /// Tests the combined behavior of `relay_parent_offset` on effective slot computation - /// and authoring duration adjustment. - /// - /// This simulates what `SlotTimer::adjust_authoring_duration()` does internally: - /// 1. Compute `effective_slot = para_slot + relay_parent_offset` - /// 2. Compute time until next slot change using `effective_slot` - /// 3. Call `adjust_authoring_duration` with the results - #[rstest] - // relay_parent_offset = 0: normal behavior, block fits comfortably in slot. - // Wall clock at 31000ms (slot 5), effective_slot=5. - // Next slot change: 36000-31000 = 5000ms, deadline = 4000ms. - // Authoring duration 2000ms < 4000ms deadline → produce. - #[case::offset_0_normal( - 6000, // para_slot_duration_ms - 31000, // time_now_ms - Slot::from(5), // para_slot - 0u32, // relay_parent_offset - Duration::from_millis(2000), // authoring_duration - (Duration::from_millis(2000), Slot::from(6)), // next_block - true, // different_authors - Some(Duration::from_millis(2000)), - )] - // relay_parent_offset = 1, fresh relay parent: capped to slot duration. - // Wall clock at 31000ms, effective_slot=6. - // Next slot change: 42000-31000 = 11000ms, capped to 6000ms, deadline = 5000ms. - // 2000ms < 5000ms → produce. - #[case::offset_1_fresh( - 6000, - 31000, - Slot::from(5), - 1u32, - Duration::from_millis(2000), - (Duration::from_millis(2000), Slot::from(6)), - true, - Some(Duration::from_millis(2000)), - )] - // relay_parent_offset = 1 with wall clock near end of current slot. - // Wall clock at 35500ms (near slot 5 boundary at 36000ms), effective_slot=6. - // Next slot change: 42000-35500 = 6500ms, capped to 6000ms, deadline = 5000ms. - // 2000ms < 5000ms → produce. - #[case::offset_1_near_boundary( - 6000, - 35500, - Slot::from(5), - 1u32, - Duration::from_millis(2000), - (Duration::from_millis(2000), Slot::from(6)), - true, - Some(Duration::from_millis(2000)), - )] - // relay_parent_offset = 0 at slot boundary → stale effective slot, different authors → skip. - // Wall clock at 37000ms (past slot 5 boundary), effective_slot=5. - // Next slot change: 36000-37000 = 0 (saturating), deadline = 0. - // Different authors and deadline=0 → None. - #[case::offset_0_stale_different_authors( - 6000, - 37000, - Slot::from(5), - 0u32, - Duration::from_millis(2000), - (Duration::from_millis(2000), Slot::from(7)), - true, - None, - )] - // relay_parent_offset = 1 rescues a stale relay parent. - // Wall clock at 37000ms, para_slot=5, effective_slot=6. - // Next slot change: 42000-37000 = 5000ms, deadline = 4000ms. - // 2000ms < 4000ms → produce (offset=1 saved us from skipping!). - #[case::offset_1_rescues_stale( - 6000, - 37000, - Slot::from(5), - 1u32, - Duration::from_millis(2000), - (Duration::from_millis(2000), Slot::from(7)), - true, - Some(Duration::from_millis(2000)), - )] - // relay_parent_offset = 1 but relay parent is very stale (2+ slots behind). - // Wall clock at 43000ms (slot 7), para_slot=5, effective_slot=6. - // Next slot change: 42000-43000 = 0 (saturating), deadline = 0. - // Different authors and deadline=0 → None. - #[case::offset_1_very_stale( - 6000, - 43000, - Slot::from(5), - 1u32, - Duration::from_millis(2000), - (Duration::from_millis(2000), Slot::from(8)), - true, - None, - )] - // relay_parent_offset = 0, stale, but same author → still produce. - // Wall clock at 37000ms, effective_slot=5, next slot change = 0ms. - // Same author → return Some(min(authoring, next_block_duration)). - #[case::offset_0_stale_same_author( - 6000, - 37000, - Slot::from(5), - 0u32, - Duration::from_millis(2000), - (Duration::from_millis(2000), Slot::from(7)), - false, - Some(Duration::from_millis(2000)), - )] - // Large relay_parent_offset = 5, capped to slot duration. - // Wall clock at 31000ms, effective_slot=10. - // Next slot change: 66000-31000 = 35000ms, capped to 6000ms, deadline = 5000ms. - // 2000ms < 5000ms → produce. - #[case::offset_5_large( - 6000, - 31000, - Slot::from(5), - 5u32, - Duration::from_millis(2000), - (Duration::from_millis(2000), Slot::from(6)), - true, - Some(Duration::from_millis(2000)), - )] - // relay_parent_offset = 0, near deadline → authoring clamped to deadline. - // Wall clock at 34500ms, effective_slot=5. - // Next slot change: 36000-34500 = 1500ms, deadline = 500ms. - // 2000ms > 500ms → clamped to 500ms. 500ms >= 400ms (min-threshold) → produce. - #[case::offset_0_clamped_to_minimum( - 6000, - 34500, - Slot::from(5), - 0u32, - Duration::from_millis(2000), - (Duration::from_millis(2000), Slot::from(6)), - true, - Some(Duration::from_millis(500)), - )] - // relay_parent_offset = 0, too close to deadline → below minimum threshold. - // Wall clock at 34800ms, effective_slot=5. - // Next slot change: 36000-34800 = 1200ms, deadline = 200ms. - // 2000ms > 200ms → clamped to 200ms. 200ms < 400ms (min-threshold) → None. - #[case::offset_0_below_minimum( - 6000, - 34800, - Slot::from(5), - 0u32, - Duration::from_millis(2000), - (Duration::from_millis(2000), Slot::from(6)), - true, - None, - )] - // relay_parent_offset = 1 rescues from below-minimum case. - // Same wall clock at 34800ms, but effective_slot=6. - // Next slot change: 42000-34800 = 7200ms, capped to 6000ms, deadline = 5000ms. - // 2000ms < 5000ms → produce. - #[case::offset_1_rescues_below_minimum( - 6000, - 34800, - Slot::from(5), - 1u32, - Duration::from_millis(2000), - (Duration::from_millis(2000), Slot::from(6)), - true, - Some(Duration::from_millis(2000)), - )] - fn test_adjust_authoring_duration_with_relay_offset( - #[case] para_slot_duration_ms: u64, - #[case] time_now_ms: u64, - #[case] para_slot: Slot, - #[case] relay_parent_offset: u32, - #[case] authoring_duration: Duration, - #[case] next_block: (Duration, Slot), - #[case] different_authors: bool, - #[case] expected: Option, - ) { - sp_tracing::init_for_tests(); - - let slot_duration = SlotDuration::from_millis(para_slot_duration_ms); - let time_now = Duration::from_millis(time_now_ms); - let time_offset = Duration::ZERO; - - // Step 1: Compute effective slot (mirrors SlotTimer::adjust_authoring_duration). - let effective_slot = para_slot.saturating_add(Slot::from(relay_parent_offset as u64)); - - // Step 2: Compute time until next slot change using effective_slot. - let next_slot_change = compute_time_until_next_slot_change( - slot_duration, - time_now, - time_offset, - effective_slot, - ) - .expect("Should compute next slot change"); - - // Step 2b: Cap remaining time at slot duration (mirrors SlotTimer). - let next_slot_change = ( - next_slot_change.0.min(slot_duration.as_duration()), - next_slot_change.1, - ); - - // Step 3: Call adjust_authoring_duration with computed values. - let result = adjust_authoring_duration( - authoring_duration, - next_block, - next_slot_change, - different_authors, - effective_slot, - ); - - tracing::debug!( - ?effective_slot, - ?next_slot_change, - ?relay_parent_offset, - ?result, - "Test result for relay_parent_offset scenario" - ); - assert_eq!(result, expected); - } - - /// Regression test for the "wrongful skip near slot boundary" bug. - /// - /// Reproduces a real-world scenario where the 1-second buffer - /// (`BLOCK_PRODUCTION_ADJUSTMENT_MS`) caused incorrect decisions: - /// - /// Attempt 1 — Wrongful SKIP (offset=0): - /// para_slot=803, wall_clock near end of slot 803 (491ms until slot 804). - /// deadline = 491ms - 1000ms = 0 → skip, even though we're still in our slot. - /// - /// Attempt 1 — Correct APPROVE (offset=1): - /// effective_slot=804, deadline based on slot 805 boundary (6491ms away). - /// deadline = 5491ms → plenty of room to produce. - /// - /// Attempt 3 — Wrongful APPROVE (offset=0, using wall clock slot): - /// para_slot=803 (stale), wall_clock=805, deadline based on slot 806 (5990ms). - /// Would approve despite para_slot being 2 slots behind. - /// - /// Attempt 3 — Correct SKIP (offset=1): - /// effective_slot=804, wall_clock past slot 805 boundary → remaining=0. - /// deadline = 0, different authors → skip. - #[test] - fn test_wrongful_skip_near_slot_boundary() { - sp_tracing::init_for_tests(); - - let slot_duration = SlotDuration::from_millis(6000); - let para_slot = Slot::from(803); - // Slot 803 = [4_818_000ms, 4_824_000ms) - // Slot 804 = [4_824_000ms, 4_830_000ms) - // Slot 805 = [4_830_000ms, 4_836_000ms) - - // --- Attempt 1: wall clock at 4_823_509ms (491ms before slot 804) --- - let time_now_attempt1 = Duration::from_millis(4_823_509); - let next_block = (Duration::from_millis(500), Slot::from(804)); - - // With offset=0: effective_slot=803, next_slot=804 at 4_824_000ms. - // remaining = 491ms, deadline = 0 → wrongful SKIP. - { - let effective_slot = para_slot; // offset=0 - let next_slot_change = compute_time_until_next_slot_change( - slot_duration, - time_now_attempt1, - Duration::ZERO, - effective_slot, - ) - .unwrap(); - assert_eq!(next_slot_change.0.as_millis(), 491); - - let result = adjust_authoring_duration( - Duration::from_millis(500), - next_block, - next_slot_change, - true, // different authors - effective_slot, - ); - // Deadline = 491 - 1000 = 0 → skip (the wrongful behavior without offset). - assert_eq!(result, None, "offset=0 near boundary: wrongful skip"); - } - - // With offset=1: effective_slot=804, next_slot=805 at 4_830_000ms. - // remaining = 6491ms, capped to 6000ms, deadline = 5000ms → correct APPROVE. - { - let effective_slot = para_slot.saturating_add(Slot::from(1u64)); // offset=1 - let next_slot_change = compute_time_until_next_slot_change( - slot_duration, - time_now_attempt1, - Duration::ZERO, - effective_slot, - ) - .unwrap(); - assert_eq!(next_slot_change.0.as_millis(), 6491); - // Cap at slot duration. - let next_slot_change = ( - next_slot_change.0.min(slot_duration.as_duration()), - next_slot_change.1, - ); - assert_eq!(next_slot_change.0.as_millis(), 6000); - - let result = adjust_authoring_duration( - Duration::from_millis(500), - next_block, - next_slot_change, - true, // different authors - effective_slot, - ); - assert_eq!( - result, - Some(Duration::from_millis(500)), - "offset=1 near boundary: correctly approved" - ); - } - - // --- Attempt 3: wall clock at 4_831_000ms (in slot 805, para_slot=803 is stale) --- - let time_now_attempt3 = Duration::from_millis(4_831_000); - let next_block_attempt3 = (Duration::from_millis(500), Slot::from(806)); - - // With offset=0: effective_slot=803, next_slot=804 at 4_824_000ms. - // remaining = 0 (stale), deadline = 0 → correct SKIP. - { - let effective_slot = para_slot; // offset=0 - let next_slot_change = compute_time_until_next_slot_change( - slot_duration, - time_now_attempt3, - Duration::ZERO, - effective_slot, - ) - .unwrap(); - assert_eq!(next_slot_change.0, Duration::ZERO); - - let result = adjust_authoring_duration( - Duration::from_millis(500), - next_block_attempt3, - next_slot_change, - true, - effective_slot, - ); - assert_eq!(result, None, "offset=0 stale: correctly skipped"); - } - - // With offset=1: effective_slot=804, next_slot=805 at 4_830_000ms. - // remaining = 0 (stale, wall clock past 805) → correct SKIP. - { - let effective_slot = para_slot.saturating_add(Slot::from(1u64)); // offset=1 - let next_slot_change = compute_time_until_next_slot_change( - slot_duration, - time_now_attempt3, - Duration::ZERO, - effective_slot, - ) - .unwrap(); - assert_eq!(next_slot_change.0, Duration::ZERO); - - let result = adjust_authoring_duration( - Duration::from_millis(500), - next_block_attempt3, - next_slot_change, - true, - effective_slot, - ); - assert_eq!(result, None, "offset=1 stale: correctly skipped"); - } - } - - /// Regression test reproducing the kusama yap-3392 log pattern. - /// - /// Pre-fix behavior (no effective_slot, old 400ms threshold): - /// - First attempt at each relay parent: wrongful SKIP (490ms remaining, 1s buffer → - /// deadline=0) - /// - 11th block produced with deadline=492ms (passes old 400ms threshold) - /// - 12th block correctly skipped (deadline=0) - /// - /// Post-fix behavior (effective_slot + 500ms threshold + slot duration cap): - /// - First attempt: APPROVE (effective_slot pushes deadline, capped to 6000ms) - /// - 10 blocks produced, last two correctly skipped (deadline < 500ms) - /// - /// Actual kusama log values: - /// slot_duration=6000ms, 13 cores, relay_parent_offset=1 - /// Slot 295637405 = para_slot from relay parent - /// First attempt: duration_until_next_slot=490.969359ms next_slot=Slot(295637406) - /// 11th block: duration_until_next_slot=1.492010569s deadline=492.010569ms - /// 12th attempt: duration_until_next_slot=991.923868ms deadline=0ns → skip - #[test] - fn test_kusama_yap_3392_block_production_pattern() { - sp_tracing::init_for_tests(); - - let slot_duration = SlotDuration::from_millis(6000); - // Slot 295637405 starts at 295637405 * 6000 = 1773824430000ms - // Slot 295637406 starts at 1773824436000ms - // Slot 295637407 starts at 1773824442000ms - let para_slot = Slot::from(295637405u64); - - // Wall clock ~490ms before slot 295637406. - // In the old code: last_reported_slot = 295637405, next_slot = 295637406, - // duration_until_next_slot ≈ 491ms, deadline = 0 → wrongful skip. - let time_first_attempt = Duration::from_millis(1_773_824_435_509); - let next_block = (Duration::from_millis(491), Slot::from(295637406u64)); - - // --- Pre-fix (offset=0): wrongful SKIP on first attempt --- - { - let effective_slot = para_slot; // no offset - let next_slot_change = compute_time_until_next_slot_change( - slot_duration, - time_first_attempt, - Duration::ZERO, - effective_slot, - ) - .unwrap(); - // ~491ms until slot 295637406 - assert_eq!(next_slot_change.0.as_millis(), 491); - assert_eq!(next_slot_change.1, Slot::from(295637406u64)); - - let result = adjust_authoring_duration( - Duration::from_millis(2000), - next_block, - next_slot_change, - true, - effective_slot, - ); - assert_eq!(result, None, "Pre-fix: wrongful skip on first attempt"); - } - - // --- Post-fix (offset=1): first attempt correctly approved --- - { - let effective_slot = para_slot.saturating_add(Slot::from(1u64)); // 295637406 - let next_slot_change = compute_time_until_next_slot_change( - slot_duration, - time_first_attempt, - Duration::ZERO, - effective_slot, - ) - .unwrap(); - // ~6491ms until slot 295637407 - assert_eq!(next_slot_change.0.as_millis(), 6491); - assert_eq!(next_slot_change.1, Slot::from(295637407u64)); - // Cap at slot duration. - let next_slot_change = ( - next_slot_change.0.min(slot_duration.as_duration()), - next_slot_change.1, - ); - assert_eq!(next_slot_change.0.as_millis(), 6000); - - let result = adjust_authoring_duration( - Duration::from_millis(2000), - next_block, - next_slot_change, - true, - effective_slot, - ); - assert_eq!( - result, - Some(Duration::from_millis(491)), - "Post-fix: first attempt approved" - ); - } - - // --- 11th block attempt (with offset=1): ~1492ms until next slot --- - // Wall clock advanced ~5s from first attempt. - let time_11th = Duration::from_millis(1_773_824_440_508); - let next_block_11th = (Duration::from_millis(493), Slot::from(295637406u64)); - { - let effective_slot = para_slot.saturating_add(Slot::from(1u64)); // 295637406 - let next_slot_change = compute_time_until_next_slot_change( - slot_duration, - time_11th, - Duration::ZERO, - effective_slot, - ) - .unwrap(); - // ~1492ms until slot 295637407 - assert_eq!(next_slot_change.0.as_millis(), 1492); - - let result = adjust_authoring_duration( - Duration::from_millis(2000), - next_block_11th, - next_slot_change, - true, - effective_slot, - ); - // deadline = 1492 - 1000 = 492ms >= 400ms (minimum - threshold) - // → produced with the 100ms headroom. - assert_eq!( - result, - Some(Duration::from_millis(492)), - "11th block: deadline 492ms >= 400ms threshold → produce" - ); - } - - // --- 10th block attempt (with offset=1): ~1992ms until next slot --- - let time_10th = Duration::from_millis(1_773_824_440_008); - let next_block_10th = (Duration::from_millis(493), Slot::from(295637406u64)); - { - let effective_slot = para_slot.saturating_add(Slot::from(1u64)); - let next_slot_change = compute_time_until_next_slot_change( - slot_duration, - time_10th, - Duration::ZERO, - effective_slot, - ) - .unwrap(); - // ~1992ms until slot 295637407 - assert_eq!(next_slot_change.0.as_millis(), 1992); - - let result = adjust_authoring_duration( - Duration::from_millis(2000), - next_block_10th, - next_slot_change, - true, - effective_slot, - ); - // deadline = 1992 - 1000 = 992ms >= 500ms → produce. - assert_eq!( - result, - Some(Duration::from_millis(493)), - "10th block: deadline 992ms >= 500ms → produce" - ); - } - - // --- 12th block attempt (with offset=1): ~992ms until next slot --- - let time_12th = Duration::from_millis(1_773_824_441_008); - let next_block_12th = (Duration::from_millis(492), Slot::from(295637406u64)); - { - let effective_slot = para_slot.saturating_add(Slot::from(1u64)); - let next_slot_change = compute_time_until_next_slot_change( - slot_duration, - time_12th, - Duration::ZERO, - effective_slot, - ) - .unwrap(); - // ~992ms until slot 295637407 - assert_eq!(next_slot_change.0.as_millis(), 992); - - let result = adjust_authoring_duration( - Duration::from_millis(2000), - next_block_12th, - next_slot_change, - true, - effective_slot, - ); - // deadline = 992 - 1000 = 0 → skip. - assert_eq!(result, None, "12th block: deadline 0 → skip"); - } - } - - /// Test that `compute_time_until_next_slot_change` returns zero remaining time - /// when the effective slot is already past (stale relay parent scenario). - #[rstest] - // Effective slot is current → positive remaining time. - #[case::current_slot(6000, 31000, Slot::from(5), 5000)] - // Effective slot is one ahead → even more remaining time. - #[case::one_ahead(6000, 31000, Slot::from(6), 11000)] - // Effective slot is behind wall clock → zero remaining time (stale). - #[case::stale_behind(6000, 37000, Slot::from(5), 0)] - // Effective slot is far behind → zero remaining time. - #[case::very_stale(6000, 50000, Slot::from(5), 0)] - // Effective slot exactly at boundary → remaining time for next full slot. - #[case::at_boundary(6000, 36000, Slot::from(5), 0)] - fn test_stale_effective_slot_detection( - #[case] para_slot_millis: u64, - #[case] time_now_ms: u64, - #[case] effective_slot: Slot, - #[case] expected_remaining_ms: u128, - ) { - let slot_duration = SlotDuration::from_millis(para_slot_millis); - let time_now = Duration::from_millis(time_now_ms); - - let result = compute_time_until_next_slot_change( - slot_duration, - time_now, - Duration::ZERO, - effective_slot, - ) - .expect("Should compute next slot change"); - - assert_eq!( - result.0.as_millis(), - expected_remaining_ms, - "Remaining time mismatch for effective_slot={:?} at time={}ms", - effective_slot, - time_now_ms, - ); - - // Verify stale detection: Duration::ZERO means the effective slot is past. - if expected_remaining_ms == 0 { - assert_eq!(result.0, Duration::ZERO, "Stale effective slot should have zero remaining"); - } - } } From dcb9520487c14bad123b7144dc93f7bf8b0766d5 Mon Sep 17 00:00:00 2001 From: Alexandru Vasile Date: Tue, 24 Mar 2026 14:35:28 +0000 Subject: [PATCH 09/13] aura: Ensure block builder marks the slot as terminated after adjusting to none Signed-off-by: Alexandru Vasile --- .../slot_based/block_builder_task.rs | 50 ++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs index 7cfde76a7921e..98f0aad32c0bd 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs @@ -51,7 +51,7 @@ use sp_application_crypto::AppPublic; use sp_blockchain::HeaderBackend; use sp_consensus::Environment; use sp_consensus_aura::AuraApi; -use sp_core::crypto::Pair; +use sp_core::{crypto::Pair, H256}; use sp_inherents::CreateInherentDataProviders; use sp_keystore::KeystorePtr; use sp_runtime::{ @@ -188,6 +188,21 @@ where collator_util::Collator::::new(params) }; + // Tracker helper to ensure we do not build an extra block at slot boundaries + // with edge-cases of importing blocks after the `slot_timer` ticks. + struct ParentTracker { + /// Hash of the relay block. + hash: Option, + /// True if the collator built a block for the current relay parent, false otherwise. + has_built: bool, + /// True if the collator adjusted the slot timer and decided there is no time to build + /// a block, false otherwise. + has_terminated: bool, + } + + let mut parent_tracker = + ParentTracker { hash: None, has_built: false, has_terminated: false }; + let mut relay_chain_data_cache = RelayChainDataCache::new(relay_client.clone(), para_id); let mut connection_helper = BackingGroupConnectionHelper::new( keystore.clone(), @@ -240,6 +255,28 @@ where let relay_parent = rp_data.relay_parent().hash(); let relay_parent_header = rp_data.relay_parent().clone(); + // Reset the state of the tracker in case of new blocks or re-orgs. + if parent_tracker.hash != Some(relay_parent) { + parent_tracker = ParentTracker { + hash: Some(relay_parent), + has_built: false, + has_terminated: false, + }; + } + + // The collator has terminated the building time, the `adjust_authoring_duration` has + // determined that there is no time left to build the block. This block must be skipped + // to avoid building blocks wrongfully when the import races with + // `wait_until_next_slot`. + if parent_tracker.has_terminated { + tracing::debug!( + target: crate::LOG_TARGET, + ?relay_parent, + "Skipping block production on terminated relay parent." + ); + continue; + } + let Some(parent_search_result) = crate::collators::find_parent(relay_parent, para_id, &*para_backend, &relay_client) .await @@ -438,6 +475,14 @@ where "Not building block due to insufficient authoring duration." ); + // In practice with 12 cores, it is possible that the wall clock (aura_slot) aligns + // perfectly with the parachain slot deduced from the relay block. For those cases, + // the first block production opportunity will be skipped immediately. This field + // ensures the block is marked as terminated only after building. + if parent_tracker.has_built { + parent_tracker.has_terminated = true; + } + continue; }; @@ -461,6 +506,9 @@ where continue; }; + // Mark the collator has built a block on this relay parent. + parent_tracker.has_built = true; + let new_block_hash = candidate.block.header().hash(); // Announce the newly built block to our peers. From 542d7e76880570215f3a4a39b52c8eba460132bd Mon Sep 17 00:00:00 2001 From: Alexandru Vasile Date: Tue, 24 Mar 2026 14:46:41 +0000 Subject: [PATCH 10/13] aura: Add more documentation into the race condition Signed-off-by: Alexandru Vasile --- .../slot_based/block_builder_task.rs | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs index 98f0aad32c0bd..ac928a665efc0 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs @@ -190,13 +190,38 @@ where // Tracker helper to ensure we do not build an extra block at slot boundaries // with edge-cases of importing blocks after the `slot_timer` ticks. + // + // The following scenarios are degrading block confidence (happening on the same instance): + // + // [ wall slot | para slot | next wall slot] + // opportunity 1: [ 803 | 803 | 490ms ] + // - The wall slot is behind the para slot deduced by the relay block + // - The next slot 804 arrives in 490ms leaving no room for the 1s authoring duration + // - collator must skip the building the first block for this relay block + // + // opportunity 2-10: [ 804 | 803 | 6s ] + // - This is the proper case where we have sufficient time to build and build 10 blocks + // + // opportunity 11-12: [ 804 | 803 | 990ms-500ms ] + // - At this point we skip building the last 2 blocks to give room for the 1s authoring + // duration before the next slot arrives. + // + // opportunity 13: [ 805 | 803 | 6s ] + // - The new relay block was not imported soon enough, therefore we detect the same relay + // parent that we just skipped. Since the import of a new block is usually ~15ms after + // the wall slot ticks. + // - We don't want to build on this relay parent and instead skip until the next relay + // block arrives. struct ParentTracker { /// Hash of the relay block. hash: Option, /// True if the collator built a block for the current relay parent, false otherwise. + /// + /// This state is needed, otherwise the opportunity 1 might mark the block as + /// terminated wrongfully. has_built: bool, /// True if the collator adjusted the slot timer and decided there is no time to build - /// a block, false otherwise. + /// a block, false otherwise. This is set to true on opportunity 11. has_terminated: bool, } From 2ee71e4535da68aa992a1364440865cc1687b683 Mon Sep 17 00:00:00 2001 From: Alexandru Vasile Date: Thu, 26 Mar 2026 14:21:28 +0000 Subject: [PATCH 11/13] Use block numbers Signed-off-by: Alexandru Vasile --- .../slot_based/block_builder_task.rs | 109 ++++++++---------- .../src/collators/slot_based/slot_timer.rs | 4 +- 2 files changed, 48 insertions(+), 65 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs index ac928a665efc0..9e7b8ae130865 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs @@ -51,7 +51,7 @@ use sp_application_crypto::AppPublic; use sp_blockchain::HeaderBackend; use sp_consensus::Environment; use sp_consensus_aura::AuraApi; -use sp_core::{crypto::Pair, H256}; +use sp_core::crypto::Pair; use sp_inherents::CreateInherentDataProviders; use sp_keystore::KeystorePtr; use sp_runtime::{ @@ -188,45 +188,41 @@ where collator_util::Collator::::new(params) }; - // Tracker helper to ensure we do not build an extra block at slot boundaries - // with edge-cases of importing blocks after the `slot_timer` ticks. + // Prevents a "stale parent" race condition that degrades block confidence. // - // The following scenarios are degrading block confidence (happening on the same instance): + // Block production is driven by two decoupled clocks: + // I. Aura slot (wall clock): Triggers wake-ups and enforces a 1-second hard deadline at + // the end of the slot to safely stop authoring. + // II. Parachain slot: Derived from the imported relay parent, which determins if we can + // build or not. // - // [ wall slot | para slot | next wall slot] - // opportunity 1: [ 803 | 803 | 490ms ] - // - The wall slot is behind the para slot deduced by the relay block - // - The next slot 804 arrives in 490ms leaving no room for the 1s authoring duration - // - collator must skip the building the first block for this relay block + // Because network imports race against the local wall clock, a node running + // elastic scaling can easily desync. // - // opportunity 2-10: [ 804 | 803 | 6s ] - // - This is the proper case where we have sufficient time to build and build 10 blocks + // Timeline of the failure scenario (observed on yap-polkadot): // - // opportunity 11-12: [ 804 | 803 | 990ms-500ms ] - // - At this point we skip building the last 2 blocks to give room for the 1s authoring - // duration before the next slot arrives. + // - T0: Aura 803 begins. The relay parent is still old (0xA). The node skips. // - // opportunity 13: [ 805 | 803 | 6s ] - // - The new relay block was not imported soon enough, therefore we detect the same relay - // parent that we just skipped. Since the import of a new block is usually ~15ms after - // the wall slot ticks. - // - We don't want to build on this relay parent and instead skip until the next relay - // block arrives. - struct ParentTracker { - /// Hash of the relay block. - hash: Option, - /// True if the collator built a block for the current relay parent, false otherwise. - /// - /// This state is needed, otherwise the opportunity 1 might mark the block as - /// terminated wrongfully. - has_built: bool, - /// True if the collator adjusted the slot timer and decided there is no time to build - /// a block, false otherwise. This is set to true on opportunity 11. - has_terminated: bool, - } - - let mut parent_tracker = - ParentTracker { hash: None, has_built: false, has_terminated: false }; + // - T1 (soft failure): Aura 803 last block production opportunity. Relay parent 0xB (para + // slot 803) is picked. However, aura 803 now has < 1000ms remaining. The 1s deadline + // triggers, skipping the block. + // + // - T2: Aura 804 begins. The node successfully builds 10 blocks on 0xB. As aura 804 ends, + // the 1s deadline triggers again, skipping the final 2 cores. + // + // - T3 (critical failure): Aura 805 begins. The wall clock resets, the next aura slot + // arrives in 6000ms. However, the network hasn't delivered a new relay parent yet. + // Because the timer reset, the deadline check passes, and the node erroneously builds a + // stale block on top of 0xB. + // + // To prevent T1, we rely on the 1-second hard deadline to skip block production. + // + // To prevent T3, we track the `last_slot_number` and the `last_rp_built_number`. If the + // aura wall clock advances to a new slot, but the relay parent number hasn't increased + // since our last successful build, we intentionally skip production until the network + // delivers the new state. + let mut last_rp_built_number = 0; + let mut last_slot_number = 0.into(); let mut relay_chain_data_cache = RelayChainDataCache::new(relay_client.clone(), para_id); let mut connection_helper = BackingGroupConnectionHelper::new( @@ -240,7 +236,7 @@ where loop { // We wait here until the next slot arrives. - if slot_timer.wait_until_next_slot().await.is_err() { + let Ok(aura_slot) = slot_timer.wait_until_next_slot().await else { tracing::error!(target: LOG_TARGET, "Unable to wait for next slot."); return; }; @@ -279,25 +275,21 @@ where let relay_parent = rp_data.relay_parent().hash(); let relay_parent_header = rp_data.relay_parent().clone(); + let relay_parent_number: u32 = (*relay_parent_header.number()).into(); - // Reset the state of the tracker in case of new blocks or re-orgs. - if parent_tracker.hash != Some(relay_parent) { - parent_tracker = ParentTracker { - hash: Some(relay_parent), - has_built: false, - has_terminated: false, - }; - } - - // The collator has terminated the building time, the `adjust_authoring_duration` has - // determined that there is no time left to build the block. This block must be skipped - // to avoid building blocks wrongfully when the import races with - // `wait_until_next_slot`. - if parent_tracker.has_terminated { + // Wall slot is guaranteed to be monotonically increasing. + // If the wall slot changes, and the relay parent number is smaller than the last built + // relay parent, it means we are in a new slot but a new relay parent has not been + // updated on yet (races with import). + if aura_slot > last_slot_number && relay_parent_number <= last_rp_built_number { tracing::debug!( - target: crate::LOG_TARGET, + target: LOG_TARGET, ?relay_parent, - "Skipping block production on terminated relay parent." + relay_parent_num = relay_parent_number, + last_rp_built_number, + ?aura_slot, + ?last_slot_number, + "Skipping block production on same relay parent as last built block." ); continue; } @@ -499,15 +491,6 @@ where slot = ?para_slot.slot, "Not building block due to insufficient authoring duration." ); - - // In practice with 12 cores, it is possible that the wall clock (aura_slot) aligns - // perfectly with the parachain slot deduced from the relay block. For those cases, - // the first block production opportunity will be skipped immediately. This field - // ensures the block is marked as terminated only after building. - if parent_tracker.has_built { - parent_tracker.has_terminated = true; - } - continue; }; @@ -531,8 +514,8 @@ where continue; }; - // Mark the collator has built a block on this relay parent. - parent_tracker.has_built = true; + last_rp_built_number = relay_parent_number; + last_slot_number = aura_slot; let new_block_hash = candidate.block.header().hash(); diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs index 5686a7b68b4e0..45c77daba32f7 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs @@ -350,7 +350,7 @@ where } /// Returns a future that resolves when the next block production should be attempted. - pub async fn wait_until_next_slot(&mut self) -> Result<(), ()> { + pub async fn wait_until_next_slot(&mut self) -> Result { let slot_duration = match crate::slot_duration(&*self.client) { Ok(d) => d, Err(error) => { @@ -391,7 +391,7 @@ where self.last_reported_slot = Some(next_aura_slot); - Ok(()) + Ok(next_aura_slot) } } From df8c3c7b1ea52ec8e2cff0a7126e23dd51c95e77 Mon Sep 17 00:00:00 2001 From: Alexandru Vasile Date: Tue, 31 Mar 2026 10:39:58 +0000 Subject: [PATCH 12/13] Revert "Use block numbers" This reverts commit 2ee71e4535da68aa992a1364440865cc1687b683. --- .../slot_based/block_builder_task.rs | 109 ++++++++++-------- .../src/collators/slot_based/slot_timer.rs | 4 +- 2 files changed, 65 insertions(+), 48 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs index 9e7b8ae130865..ac928a665efc0 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs @@ -51,7 +51,7 @@ use sp_application_crypto::AppPublic; use sp_blockchain::HeaderBackend; use sp_consensus::Environment; use sp_consensus_aura::AuraApi; -use sp_core::crypto::Pair; +use sp_core::{crypto::Pair, H256}; use sp_inherents::CreateInherentDataProviders; use sp_keystore::KeystorePtr; use sp_runtime::{ @@ -188,41 +188,45 @@ where collator_util::Collator::::new(params) }; - // Prevents a "stale parent" race condition that degrades block confidence. + // Tracker helper to ensure we do not build an extra block at slot boundaries + // with edge-cases of importing blocks after the `slot_timer` ticks. // - // Block production is driven by two decoupled clocks: - // I. Aura slot (wall clock): Triggers wake-ups and enforces a 1-second hard deadline at - // the end of the slot to safely stop authoring. - // II. Parachain slot: Derived from the imported relay parent, which determins if we can - // build or not. + // The following scenarios are degrading block confidence (happening on the same instance): // - // Because network imports race against the local wall clock, a node running - // elastic scaling can easily desync. + // [ wall slot | para slot | next wall slot] + // opportunity 1: [ 803 | 803 | 490ms ] + // - The wall slot is behind the para slot deduced by the relay block + // - The next slot 804 arrives in 490ms leaving no room for the 1s authoring duration + // - collator must skip the building the first block for this relay block // - // Timeline of the failure scenario (observed on yap-polkadot): + // opportunity 2-10: [ 804 | 803 | 6s ] + // - This is the proper case where we have sufficient time to build and build 10 blocks // - // - T0: Aura 803 begins. The relay parent is still old (0xA). The node skips. + // opportunity 11-12: [ 804 | 803 | 990ms-500ms ] + // - At this point we skip building the last 2 blocks to give room for the 1s authoring + // duration before the next slot arrives. // - // - T1 (soft failure): Aura 803 last block production opportunity. Relay parent 0xB (para - // slot 803) is picked. However, aura 803 now has < 1000ms remaining. The 1s deadline - // triggers, skipping the block. - // - // - T2: Aura 804 begins. The node successfully builds 10 blocks on 0xB. As aura 804 ends, - // the 1s deadline triggers again, skipping the final 2 cores. - // - // - T3 (critical failure): Aura 805 begins. The wall clock resets, the next aura slot - // arrives in 6000ms. However, the network hasn't delivered a new relay parent yet. - // Because the timer reset, the deadline check passes, and the node erroneously builds a - // stale block on top of 0xB. - // - // To prevent T1, we rely on the 1-second hard deadline to skip block production. - // - // To prevent T3, we track the `last_slot_number` and the `last_rp_built_number`. If the - // aura wall clock advances to a new slot, but the relay parent number hasn't increased - // since our last successful build, we intentionally skip production until the network - // delivers the new state. - let mut last_rp_built_number = 0; - let mut last_slot_number = 0.into(); + // opportunity 13: [ 805 | 803 | 6s ] + // - The new relay block was not imported soon enough, therefore we detect the same relay + // parent that we just skipped. Since the import of a new block is usually ~15ms after + // the wall slot ticks. + // - We don't want to build on this relay parent and instead skip until the next relay + // block arrives. + struct ParentTracker { + /// Hash of the relay block. + hash: Option, + /// True if the collator built a block for the current relay parent, false otherwise. + /// + /// This state is needed, otherwise the opportunity 1 might mark the block as + /// terminated wrongfully. + has_built: bool, + /// True if the collator adjusted the slot timer and decided there is no time to build + /// a block, false otherwise. This is set to true on opportunity 11. + has_terminated: bool, + } + + let mut parent_tracker = + ParentTracker { hash: None, has_built: false, has_terminated: false }; let mut relay_chain_data_cache = RelayChainDataCache::new(relay_client.clone(), para_id); let mut connection_helper = BackingGroupConnectionHelper::new( @@ -236,7 +240,7 @@ where loop { // We wait here until the next slot arrives. - let Ok(aura_slot) = slot_timer.wait_until_next_slot().await else { + if slot_timer.wait_until_next_slot().await.is_err() { tracing::error!(target: LOG_TARGET, "Unable to wait for next slot."); return; }; @@ -275,21 +279,25 @@ where let relay_parent = rp_data.relay_parent().hash(); let relay_parent_header = rp_data.relay_parent().clone(); - let relay_parent_number: u32 = (*relay_parent_header.number()).into(); - // Wall slot is guaranteed to be monotonically increasing. - // If the wall slot changes, and the relay parent number is smaller than the last built - // relay parent, it means we are in a new slot but a new relay parent has not been - // updated on yet (races with import). - if aura_slot > last_slot_number && relay_parent_number <= last_rp_built_number { + // Reset the state of the tracker in case of new blocks or re-orgs. + if parent_tracker.hash != Some(relay_parent) { + parent_tracker = ParentTracker { + hash: Some(relay_parent), + has_built: false, + has_terminated: false, + }; + } + + // The collator has terminated the building time, the `adjust_authoring_duration` has + // determined that there is no time left to build the block. This block must be skipped + // to avoid building blocks wrongfully when the import races with + // `wait_until_next_slot`. + if parent_tracker.has_terminated { tracing::debug!( - target: LOG_TARGET, + target: crate::LOG_TARGET, ?relay_parent, - relay_parent_num = relay_parent_number, - last_rp_built_number, - ?aura_slot, - ?last_slot_number, - "Skipping block production on same relay parent as last built block." + "Skipping block production on terminated relay parent." ); continue; } @@ -491,6 +499,15 @@ where slot = ?para_slot.slot, "Not building block due to insufficient authoring duration." ); + + // In practice with 12 cores, it is possible that the wall clock (aura_slot) aligns + // perfectly with the parachain slot deduced from the relay block. For those cases, + // the first block production opportunity will be skipped immediately. This field + // ensures the block is marked as terminated only after building. + if parent_tracker.has_built { + parent_tracker.has_terminated = true; + } + continue; }; @@ -514,8 +531,8 @@ where continue; }; - last_rp_built_number = relay_parent_number; - last_slot_number = aura_slot; + // Mark the collator has built a block on this relay parent. + parent_tracker.has_built = true; let new_block_hash = candidate.block.header().hash(); diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs index 45c77daba32f7..5686a7b68b4e0 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/slot_timer.rs @@ -350,7 +350,7 @@ where } /// Returns a future that resolves when the next block production should be attempted. - pub async fn wait_until_next_slot(&mut self) -> Result { + pub async fn wait_until_next_slot(&mut self) -> Result<(), ()> { let slot_duration = match crate::slot_duration(&*self.client) { Ok(d) => d, Err(error) => { @@ -391,7 +391,7 @@ where self.last_reported_slot = Some(next_aura_slot); - Ok(next_aura_slot) + Ok(()) } } From 0b47879606aa1f6d055083e35e893c36dc90ee04 Mon Sep 17 00:00:00 2001 From: Alexandru Vasile Date: Tue, 31 Mar 2026 16:43:19 +0000 Subject: [PATCH 13/13] Update comment Signed-off-by: Alexandru Vasile --- .../slot_based/block_builder_task.rs | 57 +++++++++++-------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs index ac928a665efc0..4f4c518af910c 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs @@ -188,40 +188,47 @@ where collator_util::Collator::::new(params) }; - // Tracker helper to ensure we do not build an extra block at slot boundaries - // with edge-cases of importing blocks after the `slot_timer` ticks. + // Prevents a "stale parent" race condition that degrades block confidence. // - // The following scenarios are degrading block confidence (happening on the same instance): + // Block production is driven by two decoupled clocks: + // I. Aura slot (wall clock): Triggers wake-ups and enforces a 1-second hard deadline + // at the end of the slot to safely stop authoring. + // II. Parachain slot: Derived from the imported relay parent, which determines if we + // can build or not. // - // [ wall slot | para slot | next wall slot] - // opportunity 1: [ 803 | 803 | 490ms ] - // - The wall slot is behind the para slot deduced by the relay block - // - The next slot 804 arrives in 490ms leaving no room for the 1s authoring duration - // - collator must skip the building the first block for this relay block + // Because network imports race against the local wall clock, a node running + // elastic scaling can easily desync. // - // opportunity 2-10: [ 804 | 803 | 6s ] - // - This is the proper case where we have sufficient time to build and build 10 blocks + // Timeline of the failure scenario (observed on yap-polkadot): // - // opportunity 11-12: [ 804 | 803 | 990ms-500ms ] - // - At this point we skip building the last 2 blocks to give room for the 1s authoring - // duration before the next slot arrives. + // - T0: Aura 803 begins. The relay parent is still old (0xA). The node skips. // - // opportunity 13: [ 805 | 803 | 6s ] - // - The new relay block was not imported soon enough, therefore we detect the same relay - // parent that we just skipped. Since the import of a new block is usually ~15ms after - // the wall slot ticks. - // - We don't want to build on this relay parent and instead skip until the next relay - // block arrives. + // - T1 (soft failure): Aura 803 last block production opportunity. Relay parent 0xB + // (para slot 803) is picked. However, aura 803 now has < 1000ms remaining. The 1s + // deadline triggers, skipping the block. + // + // - T2: Aura 804 begins. The node successfully builds 10 blocks on 0xB. As aura 804 + // ends, the 1s deadline triggers again, skipping the final 2 cores. + // + // - T3 (critical failure): Aura 805 begins. The wall clock resets, the next aura slot + // arrives in 6000ms. However, the network hasn't delivered a new relay parent yet. + // Because the timer reset, the deadline check passes, and the node erroneously + // builds a stale block on top of 0xB. + // + // To prevent T1, we rely on the 1-second hard deadline to skip block production. + // + // To prevent T3, we track the relay parent hash and whether we've already built and + // terminated on it. If the relay parent hasn't changed since our last terminated build, + // we skip production until the network delivers a new relay block. + // + // See: https://github.com/paritytech/polkadot-sdk/pull/11453 struct ParentTracker { /// Hash of the relay block. hash: Option, - /// True if the collator built a block for the current relay parent, false otherwise. - /// - /// This state is needed, otherwise the opportunity 1 might mark the block as - /// terminated wrongfully. + /// True if the collator built a block for the current relay parent. + /// Needed so that T1 (first opportunity skip) doesn't wrongfully mark as terminated. has_built: bool, - /// True if the collator adjusted the slot timer and decided there is no time to build - /// a block, false otherwise. This is set to true on opportunity 11. + /// True if the slot timer determined there is no time left to build. has_terminated: bool, }