From b3e7b47c03f84779e8886f48710060b9482ab6f4 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Mon, 2 Dec 2024 09:45:11 +0200 Subject: [PATCH 1/6] Fix order of resending messages after restart The way we build the messages we need to send to approval-distribution can result in a situation where is we have multiple assignments covered by a coalesced approval, the messages are sent in this order: ASSIGNMENT1, APPROVAL, ASSIGNMENT2, because we iterate over each candidate and add to the queue of messages both the assignment and the approval for that candidate, and when the approval reaches the approval-distribution subsystem it won't be imported and gossiped because one of the assignment for it is not known. So in a network where a lot of nodes are restarting at the same time we could end up in a situation where a set of the nodes correctly received the assignments and approvals before the restart and approve their blocks and don't trigger their assignments. The other set of nodes should receive the assignments and approvals after the restart, but because the approvals never get broacasted anymore because of this bug, the only way they could approve is if other nodes start broadcasting their assignments. I think this bug contribute to the reason the network did not recovered on `25-11-25 15:55:40` after the restarts. Signed-off-by: Alexandru Gheorghe --- polkadot/node/core/approval-voting/src/lib.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/polkadot/node/core/approval-voting/src/lib.rs b/polkadot/node/core/approval-voting/src/lib.rs index 2176cc7675be..7fa2e6296dce 100644 --- a/polkadot/node/core/approval-voting/src/lib.rs +++ b/polkadot/node/core/approval-voting/src/lib.rs @@ -1582,8 +1582,9 @@ async fn handle_actions< session_info_provider, ) .await?; - - approval_voting_sender.send_messages(messages.into_iter()).await; + for mesasge in messages.into_iter() { + approval_voting_sender.send_unbounded_message(message); + } let next_actions: Vec = next_actions.into_iter().map(|v| v.clone()).chain(actions_iter).collect(); @@ -1668,6 +1669,7 @@ async fn distribution_messages_for_activation &tick_now), next_no_show, ) - .map(|tick| Action::ScheduleWakeup { - block_hash, - block_number, - candidate_hash, - tick, - }) + .map(|tick| Action::ScheduleWakeup { block_hash, block_number, candidate_hash, tick }) }, RequiredTranches::Pending { considered, next_no_show, clock_drift, .. } => { // select the minimum of `next_no_show`, or the tick of the next non-empty tranche From 62904f181d8f41c663fc40abe401fa51f425ad1b Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Mon, 9 Dec 2024 10:30:31 +0200 Subject: [PATCH 2/6] Add unittests Signed-off-by: Alexandru Gheorghe --- polkadot/node/core/approval-voting/src/lib.rs | 2 +- .../node/core/approval-voting/src/tests.rs | 314 ++++++++++++++++++ 2 files changed, 315 insertions(+), 1 deletion(-) diff --git a/polkadot/node/core/approval-voting/src/lib.rs b/polkadot/node/core/approval-voting/src/lib.rs index 7fa2e6296dce..36370455342d 100644 --- a/polkadot/node/core/approval-voting/src/lib.rs +++ b/polkadot/node/core/approval-voting/src/lib.rs @@ -1582,7 +1582,7 @@ async fn handle_actions< session_info_provider, ) .await?; - for mesasge in messages.into_iter() { + for message in messages.into_iter() { approval_voting_sender.send_unbounded_message(message); } let next_actions: Vec = diff --git a/polkadot/node/core/approval-voting/src/tests.rs b/polkadot/node/core/approval-voting/src/tests.rs index 099ab419dfbf..be569a1de3ec 100644 --- a/polkadot/node/core/approval-voting/src/tests.rs +++ b/polkadot/node/core/approval-voting/src/tests.rs @@ -4459,6 +4459,114 @@ async fn setup_overseer_with_two_blocks_each_with_one_assignment_triggered( assert!(our_assignment.triggered()); } +// Builds a chain with a fork where both relay blocks include the same candidate. +async fn build_chain_with_block_with_two_candidates( + block_hash1: Hash, + slot: Slot, + sync_oracle_handle: TestSyncOracleHandle, + candidate_receipt: Vec, +) -> (ChainBuilder, SessionInfo) { + let validators = vec![ + Sr25519Keyring::Alice, + Sr25519Keyring::Bob, + Sr25519Keyring::Charlie, + Sr25519Keyring::Dave, + Sr25519Keyring::Eve, + ]; + let session_info = SessionInfo { + validator_groups: IndexedVec::>::from(vec![ + vec![ValidatorIndex(0), ValidatorIndex(1)], + vec![ValidatorIndex(2)], + vec![ValidatorIndex(3), ValidatorIndex(4)], + ]), + ..session_info(&validators) + }; + + let candidates = Some( + candidate_receipt + .iter() + .enumerate() + .map(|(i, receipt)| (receipt.clone(), CoreIndex(i as u32), GroupIndex(i as u32))) + .collect(), + ); + let mut chain_builder = ChainBuilder::new(); + + chain_builder + .major_syncing(sync_oracle_handle.is_major_syncing.clone()) + .add_block( + block_hash1, + ChainBuilder::GENESIS_HASH, + 1, + BlockConfig { + slot, + candidates: candidates.clone(), + session_info: Some(session_info.clone()), + end_syncing: true, + }, + ); + (chain_builder, session_info) +} + +async fn setup_overseer_with_blocks_with_two_assignments_triggered( + virtual_overseer: &mut VirtualOverseer, + store: TestStore, + clock: &Arc, + sync_oracle_handle: TestSyncOracleHandle, +) { + assert_matches!( + overseer_recv(virtual_overseer).await, + AllMessages::ChainApi(ChainApiMessage::FinalizedBlockNumber(rx)) => { + rx.send(Ok(0)).unwrap(); + } + ); + + let block_hash = Hash::repeat_byte(0x01); + let candidate_commitments = CandidateCommitments::default(); + let mut candidate_receipt = dummy_candidate_receipt_v2(block_hash); + candidate_receipt.commitments_hash = candidate_commitments.hash(); + let candidate_hash = candidate_receipt.hash(); + + let mut candidate_commitments2 = CandidateCommitments::default(); + candidate_commitments2.processed_downward_messages = 3; + let mut candidate_receipt2 = dummy_candidate_receipt_v2(block_hash); + candidate_receipt2.commitments_hash = candidate_commitments2.hash(); + let candidate_hash2 = candidate_receipt2.hash(); + + let slot = Slot::from(1); + let (chain_builder, _session_info) = build_chain_with_block_with_two_candidates( + block_hash, + slot, + sync_oracle_handle, + vec![candidate_receipt, candidate_receipt2], + ) + .await; + chain_builder.build(virtual_overseer).await; + + assert!(!clock.inner.lock().current_wakeup_is(1)); + clock.inner.lock().wakeup_all(1); + + assert!(clock.inner.lock().current_wakeup_is(slot_to_tick(slot))); + clock.inner.lock().wakeup_all(slot_to_tick(slot)); + + futures_timer::Delay::new(Duration::from_millis(200)).await; + + clock.inner.lock().wakeup_all(slot_to_tick(slot + 2)); + + assert_eq!(clock.inner.lock().wakeups.len(), 0); + + futures_timer::Delay::new(Duration::from_millis(200)).await; + + let candidate_entry = store.load_candidate_entry(&candidate_hash).unwrap().unwrap(); + let our_assignment = + candidate_entry.approval_entry(&block_hash).unwrap().our_assignment().unwrap(); + assert!(our_assignment.triggered()); + + let candidate_entry = store.load_candidate_entry(&candidate_hash2).unwrap().unwrap(); + let our_assignment = + candidate_entry.approval_entry(&block_hash).unwrap().our_assignment().unwrap(); + assert!(our_assignment.triggered()); +} + // Tests that for candidates that we did not approve yet, for which we triggered the assignment and // the approval work we restart the work to approve it. #[test] @@ -4920,6 +5028,212 @@ fn subsystem_sends_pending_approvals_on_approval_restart() { }); } +// Test that after restart approvals are sent after all assignments have been distributed. +#[test] +fn subsystem_sends_assignment_approval_in_correct_order_on_approval_restart() { + let assignment_criteria = Box::new(MockAssignmentCriteria( + || { + let mut assignments = HashMap::new(); + + let _ = assignments.insert( + CoreIndex(0), + approval_db::v2::OurAssignment { + cert: garbage_assignment_cert_v2(AssignmentCertKindV2::RelayVRFModuloCompact { + core_bitfield: vec![CoreIndex(0), CoreIndex(2)].try_into().unwrap(), + }), + tranche: 0, + validator_index: ValidatorIndex(0), + triggered: false, + } + .into(), + ); + + let _ = assignments.insert( + CoreIndex(1), + approval_db::v2::OurAssignment { + cert: garbage_assignment_cert_v2(AssignmentCertKindV2::RelayVRFDelay { + core_index: CoreIndex(1), + }), + tranche: 0, + validator_index: ValidatorIndex(0), + triggered: false, + } + .into(), + ); + assignments + }, + |_| Ok(0), + )); + let config = HarnessConfigBuilder::default().assignment_criteria(assignment_criteria).build(); + let store = config.backend(); + let store_clone = config.backend(); + + test_harness(config, |test_harness| async move { + let TestHarness { mut virtual_overseer, clock, sync_oracle_handle } = test_harness; + + setup_overseer_with_blocks_with_two_assignments_triggered( + &mut virtual_overseer, + store, + &clock, + sync_oracle_handle, + ) + .await; + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeAssignment( + _, + _, + )) => { + } + ); + + recover_available_data(&mut virtual_overseer).await; + fetch_validation_code(&mut virtual_overseer).await; + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeAssignment( + _, + _ + )) => { + } + ); + + recover_available_data(&mut virtual_overseer).await; + fetch_validation_code(&mut virtual_overseer).await; + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::CandidateValidation(CandidateValidationMessage::ValidateFromExhaustive { + exec_kind, + response_sender, + .. + }) if exec_kind == PvfExecKind::Approval => { + response_sender.send(Ok(ValidationResult::Valid(Default::default(), Default::default()))) + .unwrap(); + } + ); + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::CandidateValidation(CandidateValidationMessage::ValidateFromExhaustive { + exec_kind, + response_sender, + .. + }) if exec_kind == PvfExecKind::Approval => { + response_sender.send(Ok(ValidationResult::Valid(Default::default(), Default::default()))) + .unwrap(); + } + ); + + // Configure a big coalesce number, so that the signature is cached instead of being sent to + // approval-distribution. + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::RuntimeApi(RuntimeApiMessage::Request(_, RuntimeApiRequest::ApprovalVotingParams(_, sender))) => { + let _ = sender.send(Ok(ApprovalVotingParams { + max_approval_coalesce_count: 2, + })); + } + ); + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::RuntimeApi(RuntimeApiMessage::Request(_, RuntimeApiRequest::ApprovalVotingParams(_, sender))) => { + let _ = sender.send(Ok(ApprovalVotingParams { + max_approval_coalesce_count: 2, + })); + } + ); + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeApproval(_)) + ); + + // Assert that there are no more messages being sent by the subsystem + assert!(overseer_recv(&mut virtual_overseer).timeout(TIMEOUT / 2).await.is_none()); + + virtual_overseer + }); + + let config = HarnessConfigBuilder::default().backend(store_clone).major_syncing(true).build(); + // On restart we should first distribute all assignments covering a coalesced approval. + test_harness(config, |test_harness| async move { + let TestHarness { mut virtual_overseer, clock, sync_oracle_handle } = test_harness; + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ChainApi(ChainApiMessage::FinalizedBlockNumber(rx)) => { + rx.send(Ok(0)).unwrap(); + } + ); + + let block_hash = Hash::repeat_byte(0x01); + let candidate_commitments = CandidateCommitments::default(); + let mut candidate_receipt = dummy_candidate_receipt_v2(block_hash); + candidate_receipt.commitments_hash = candidate_commitments.hash(); + + let mut candidate_commitments2 = CandidateCommitments::default(); + candidate_commitments2.processed_downward_messages = 3; + let mut candidate_receipt2 = dummy_candidate_receipt_v2(block_hash); + candidate_receipt2.commitments_hash = candidate_commitments2.hash(); + + let slot = Slot::from(1); + + clock.inner.lock().set_tick(slot_to_tick(slot + 2)); + let (chain_builder, _session_info) = build_chain_with_block_with_two_candidates( + block_hash, + slot, + sync_oracle_handle, + vec![candidate_receipt.into(), candidate_receipt2.into()], + ) + .await; + chain_builder.build(&mut virtual_overseer).await; + + futures_timer::Delay::new(Duration::from_millis(2000)).await; + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ApprovalDistribution(ApprovalDistributionMessage::NewBlocks( + _, + )) => { + } + ); + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeAssignment( + _, + _, + )) => { + } + ); + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeAssignment( + _, + _, + )) => { + } + ); + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeApproval(approval)) => { + assert_eq!(approval.candidate_indices.count_ones(), 2); + } + ); + + // Assert that there are no more messages being sent by the subsystem + assert!(overseer_recv(&mut virtual_overseer).timeout(TIMEOUT / 2).await.is_none()); + + virtual_overseer + }); +} + // Test we correctly update the timer when we mark the beginning of gathering assignments. #[test] fn test_gathering_assignments_statements() { From 39d0acca86bcbd9bda2dfe0da1f4c4b56aa04479 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe <49718502+alexggh@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:38:15 +0200 Subject: [PATCH 3/6] Update lib.rs --- polkadot/node/core/approval-voting/src/lib.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/polkadot/node/core/approval-voting/src/lib.rs b/polkadot/node/core/approval-voting/src/lib.rs index 36370455342d..c09bad5b2a62 100644 --- a/polkadot/node/core/approval-voting/src/lib.rs +++ b/polkadot/node/core/approval-voting/src/lib.rs @@ -1866,6 +1866,9 @@ async fn distribution_messages_for_activation Date: Tue, 10 Dec 2024 10:51:32 +0200 Subject: [PATCH 4/6] Fix formatting Signed-off-by: Alexandru Gheorghe --- polkadot/node/core/approval-voting/src/lib.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/polkadot/node/core/approval-voting/src/lib.rs b/polkadot/node/core/approval-voting/src/lib.rs index c09bad5b2a62..7cea22d1a6a7 100644 --- a/polkadot/node/core/approval-voting/src/lib.rs +++ b/polkadot/node/core/approval-voting/src/lib.rs @@ -2472,7 +2472,12 @@ fn schedule_wakeup_action( last_assignment_tick.map(|l| l + APPROVAL_DELAY).filter(|t| t > &tick_now), next_no_show, ) - .map(|tick| Action::ScheduleWakeup { block_hash, block_number, candidate_hash, tick }) + .map(|tick| Action::ScheduleWakeup { + block_hash, + block_number, + candidate_hash, + tick, + }) }, RequiredTranches::Pending { considered, next_no_show, clock_drift, .. } => { // select the minimum of `next_no_show`, or the tick of the next non-empty tranche From 1a24bd22a104c22ac9286a546b36d46fa91b2cce Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Tue, 10 Dec 2024 10:57:28 +0200 Subject: [PATCH 5/6] Add prdoc Signed-off-by: Alexandru Gheorghe --- prdoc/pr_6729.prdoc | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 prdoc/pr_6729.prdoc diff --git a/prdoc/pr_6729.prdoc b/prdoc/pr_6729.prdoc new file mode 100644 index 000000000000..23cb863c3432 --- /dev/null +++ b/prdoc/pr_6729.prdoc @@ -0,0 +1,15 @@ +# Schema: Polkadot SDK PRDoc Schema (prdoc) v1.0.0 +# See doc at https://raw.githubusercontent.com/paritytech/polkadot-sdk/master/prdoc/schema_user.json + +title: Fix order of resending messages after restart + +doc: + - audience: Node Dev + description: | + At restart when dealing with a coalesced approval we might end up in a situation where we sent to + approval-distribution the approval before all assignments covering it, in that case, the approval + is ignored and never distribute, which will lead to no-shows. + +crates: + - name: polkadot-approval-voting + bump: minor From c95be6742a168a04b03b89d3dd645d910e2c40a2 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe <49718502+alexggh@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:52:29 +0200 Subject: [PATCH 6/6] Update pr_6729.prdoc --- prdoc/pr_6729.prdoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prdoc/pr_6729.prdoc b/prdoc/pr_6729.prdoc index 23cb863c3432..9eaa67363c9a 100644 --- a/prdoc/pr_6729.prdoc +++ b/prdoc/pr_6729.prdoc @@ -11,5 +11,5 @@ doc: is ignored and never distribute, which will lead to no-shows. crates: - - name: polkadot-approval-voting + - name: polkadot-node-core-approval-voting bump: minor