From 97555c751e3bd32eaec7e5f43242f3bc99047c03 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Fri, 11 Jul 2025 17:24:41 +0300 Subject: [PATCH 01/59] collator-protocol: cleanup connecting to backing group Signed-off-by: Alexandru Gheorghe --- .../src/collator_side/mod.rs | 45 ++++++++++++++----- .../src/collator_side/validators_buffer.rs | 30 +------------ 2 files changed, 36 insertions(+), 39 deletions(-) diff --git a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs index a9d4116e132ac..29cb4023c5dc4 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs @@ -120,7 +120,7 @@ enum ShouldAdvertiseTo { /// Info about validators we are currently connected to. /// /// It keeps track to which validators we advertised our collation. -#[derive(Debug, Default)] +#[derive(Debug, Default, Clone)] struct ValidatorGroup { /// Validators discovery ids. Lazily initialized when first /// distributing a collation. @@ -386,7 +386,10 @@ async fn distribute_collation( ) -> Result<()> { let candidate_relay_parent = receipt.descriptor.relay_parent(); let candidate_hash = receipt.hash(); - let cores_assigned = has_assigned_cores(&state.implicit_view, &state.per_relay_parent); + + // We should already be connected to the validators, but if we aren't, we will connect to them + // now. + connect_to_validators(ctx, &state.implicit_view, &state.per_relay_parent).await; let per_relay_parent = match state.per_relay_parent.get_mut(&candidate_relay_parent) { Some(per_relay_parent) => per_relay_parent, @@ -498,9 +501,6 @@ async fn distribute_collation( group }); - // Update a set of connected validators if necessary. 
- connect_to_validators(ctx, cores_assigned, &state.validator_groups_buf).await; - if let Some(result_sender) = result_sender { state.collation_result_senders.insert(candidate_hash, result_sender); } @@ -647,19 +647,41 @@ fn has_assigned_cores( false } +fn list_of_backing_validators_in_view( + implicit_view: &Option, + per_relay_parent: &HashMap, +) -> Vec { + let mut backing_validators = HashSet::new(); + let Some(implicit_view) = implicit_view else { return vec![] }; + + for leaf in implicit_view.leaves() { + if let Some(relay_parent) = per_relay_parent.get(leaf) { + for group in relay_parent.validator_group.values() { + backing_validators.extend(group.validators.iter().cloned()); + } + } + } + + backing_validators.into_iter().collect() +} + /// Updates a set of connected validators based on their advertisement-bits /// in a validators buffer. #[overseer::contextbounds(CollatorProtocol, prefix = self::overseer)] async fn connect_to_validators( ctx: &mut Context, - cores_assigned: bool, - validator_groups_buf: &ValidatorGroupsBuffer, + implicit_view: &Option, + per_relay_parent: &HashMap, ) { + let cores_assigned = has_assigned_cores(implicit_view, per_relay_parent); // If no cores are assigned to the para, we still need to send a ConnectToValidators request to // the network bridge passing an empty list of validator ids. Otherwise, it will keep connecting // to the last requested validators until a new request is issued. 
- let validator_ids = - if cores_assigned { validator_groups_buf.validators_to_connect() } else { Vec::new() }; + let validator_ids = if cores_assigned { + list_of_backing_validators_in_view(implicit_view, per_relay_parent) + } else { + Vec::new() + }; gum::trace!( target: LOG_TARGET, @@ -1243,6 +1265,7 @@ async fn handle_network_msg( OurViewChange(view) => { gum::trace!(target: LOG_TARGET, ?view, "Own view change"); handle_our_view_change(ctx, state, view).await?; + connect_to_validators(ctx, &state.implicit_view, &state.per_relay_parent).await; }, PeerMessage(remote, msg) => { handle_incoming_peer_message(ctx, runtime, state, remote, msg).await?; @@ -1668,8 +1691,8 @@ async fn run_inner( } } _ = reconnect_timeout => { - let cores_assigned = has_assigned_cores(&state.implicit_view, &state.per_relay_parent); - connect_to_validators(&mut ctx, cores_assigned, &state.validator_groups_buf).await; + + connect_to_validators(&mut ctx, &state.implicit_view, &state.per_relay_parent).await; gum::trace!( target: LOG_TARGET, diff --git a/polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs b/polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs index 35202fc96299b..8ed4a3c266c0f 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs @@ -63,7 +63,7 @@ pub const VALIDATORS_BUFFER_CAPACITY: NonZeroUsize = }; /// Unique identifier of a validators group. -#[derive(Debug)] +#[derive(Debug, Clone)] struct ValidatorsGroupInfo { /// Number of validators in the group. len: usize, @@ -74,7 +74,7 @@ struct ValidatorsGroupInfo { /// Ring buffer of validator groups. /// /// Tracks which peers we want to be connected to with respect to advertised collations. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct ValidatorGroupsBuffer { /// Validator groups identifiers we **had** advertisements for. 
group_infos: VecDeque, @@ -98,32 +98,6 @@ impl ValidatorGroupsBuffer { } } - /// Returns discovery ids of validators we are assigned to in this backing group window. - pub fn validators_to_connect(&self) -> Vec { - let validators_num = self.validators.len(); - let bits = self - .should_be_connected - .values() - .fold(bitvec![0; validators_num], |acc, next| acc | next); - - let mut should_be_connected: Vec = self - .validators - .iter() - .enumerate() - .filter_map(|(idx, authority_id)| bits[idx].then(|| authority_id.clone())) - .collect(); - - if let Some(last_group) = self.group_infos.iter().last() { - for validator in self.validators.iter().rev().take(last_group.len) { - if !should_be_connected.contains(validator) { - should_be_connected.push(validator.clone()); - } - } - } - - should_be_connected - } - /// Note a new advertisement, marking that we want to be connected to validators /// from this group. /// From 2fc08e1aaffb492a2c5aaa7f25f305aa2fc898f0 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Tue, 15 Jul 2025 17:55:00 +0300 Subject: [PATCH 02/59] fixup connection to validators Signed-off-by: Alexandru Gheorghe --- .../src/collator_side/mod.rs | 53 ++++++++++++++----- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs index 29cb4023c5dc4..30d7fb11d2a3e 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs @@ -244,11 +244,15 @@ struct PerRelayParent { } impl PerRelayParent { - fn new( + #[overseer::contextbounds(CollatorProtocol, prefix = self::overseer)] + async fn new( + ctx: &mut Context, + runtime: &mut RuntimeInfo, para_id: ParaId, claim_queue: ClaimQueueSnapshot, block_number: Option, - ) -> Self { + block_hash: Hash, + ) -> Result { let assignments = claim_queue.iter_all_claims().fold(HashMap::new(), |mut 
acc, (core, claims)| { let n_claims = claims.iter().filter(|para| para == &¶_id).count(); @@ -258,12 +262,22 @@ impl PerRelayParent { acc }); - Self { - validator_group: HashMap::default(), + let mut validator_groups = HashMap::default(); + + for (core, _) in &assignments { + let GroupValidators { validators, session_index: _, group_index: _ } = + determine_our_validators(ctx, runtime, *core, block_hash).await?; + let mut group = ValidatorGroup::default(); + group.validators = validators; + validator_groups.insert(*core, group); + } + + Ok(Self { + validator_group: validator_groups, collations: HashMap::new(), assignments, block_number, - } + }) } } @@ -387,8 +401,8 @@ async fn distribute_collation( let candidate_relay_parent = receipt.descriptor.relay_parent(); let candidate_hash = receipt.hash(); - // We should already be connected to the validators, but if we aren't, we will connect to them - // now. + // We should already be connected to the validators, but if we aren't, we will try to connect to + // them now. 
connect_to_validators(ctx, &state.implicit_view, &state.per_relay_parent).await; let per_relay_parent = match state.per_relay_parent.get_mut(&candidate_relay_parent) { @@ -1264,7 +1278,7 @@ async fn handle_network_msg( }, OurViewChange(view) => { gum::trace!(target: LOG_TARGET, ?view, "Own view change"); - handle_our_view_change(ctx, state, view).await?; + handle_our_view_change(ctx, runtime, state, view).await?; connect_to_validators(ctx, &state.implicit_view, &state.per_relay_parent).await; }, PeerMessage(remote, msg) => { @@ -1344,6 +1358,7 @@ async fn process_block_events( #[overseer::contextbounds(CollatorProtocol, prefix = crate::overseer)] async fn handle_our_view_change( ctx: &mut Context, + runtime: &mut RuntimeInfo, state: &mut State, view: OurView, ) -> Result<()> { @@ -1363,9 +1378,10 @@ async fn handle_our_view_change( .map_err(Error::ImplicitViewFetchError)?; let block_number = implicit_view.block_number(leaf); - state - .per_relay_parent - .insert(*leaf, PerRelayParent::new(para_id, claim_queue, block_number)); + state.per_relay_parent.insert( + *leaf, + PerRelayParent::new(ctx, runtime, para_id, claim_queue, block_number, *leaf).await?, + ); process_block_events( ctx, @@ -1393,9 +1409,18 @@ async fn handle_our_view_change( if state.per_relay_parent.get(block_hash).is_none() { let claim_queue = fetch_claim_queue(ctx.sender(), *block_hash).await?; - state - .per_relay_parent - .insert(*block_hash, PerRelayParent::new(para_id, claim_queue, block_number)); + state.per_relay_parent.insert( + *block_hash, + PerRelayParent::new( + ctx, + runtime, + para_id, + claim_queue, + block_number, + *block_hash, + ) + .await?, + ); } let per_relay_parent = From 48ab6d0363cf5d2791a77cb560a097e1d97843ad Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Wed, 16 Jul 2025 14:50:40 +0300 Subject: [PATCH 03/59] fix tests and cleanup unneeded code Signed-off-by: Alexandru Gheorghe --- .../src/collator_side/mod.rs | 58 +-- .../src/collator_side/tests/mod.rs | 334 
+++++++++++------- .../tests/prospective_parachains.rs | 159 +++++++-- .../src/collator_side/validators_buffer.rs | 283 +-------------- 4 files changed, 359 insertions(+), 475 deletions(-) diff --git a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs index 30d7fb11d2a3e..a684cb5e29602 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs @@ -53,8 +53,8 @@ use polkadot_node_subsystem_util::{ }; use polkadot_primitives::{ vstaging::{CandidateEvent, CandidateReceiptV2 as CandidateReceipt}, - AuthorityDiscoveryId, BlockNumber, CandidateHash, CollatorPair, CoreIndex, GroupIndex, Hash, - HeadData, Id as ParaId, SessionIndex, + AuthorityDiscoveryId, BlockNumber, CandidateHash, CollatorPair, CoreIndex, Hash, HeadData, + Id as ParaId, }; use crate::{modify_reputation, LOG_TARGET, LOG_TARGET_STATS}; @@ -71,9 +71,7 @@ use collation::{ VersionedCollationRequest, WaitingCollationFetches, }; use error::{log_error, Error, FatalError, Result}; -use validators_buffer::{ - ResetInterestTimeout, ValidatorGroupsBuffer, RESET_INTEREST_TIMEOUT, VALIDATORS_BUFFER_CAPACITY, -}; +use validators_buffer::{ResetInterestTimeout, RESET_INTEREST_TIMEOUT}; pub use metrics::Metrics; @@ -265,7 +263,7 @@ impl PerRelayParent { let mut validator_groups = HashMap::default(); for (core, _) in &assignments { - let GroupValidators { validators, session_index: _, group_index: _ } = + let GroupValidators { validators } = determine_our_validators(ctx, runtime, *core, block_hash).await?; let mut group = ValidatorGroup::default(); group.validators = validators; @@ -312,9 +310,6 @@ struct State { /// as we learn the [`PeerId`]'s by `PeerConnected` events. peer_ids: HashMap>, - /// Tracks which validators we want to stay connected to. 
- validator_groups_buf: ValidatorGroupsBuffer, - /// Timeout-future which is reset after every leaf to [`RECONNECT_AFTER_LEAF_TIMEOUT`] seconds. /// When it fires, we update our reserved peers. reconnect_timeout: ReconnectTimeout, @@ -367,7 +362,6 @@ impl State { per_relay_parent: Default::default(), collation_result_senders: Default::default(), peer_ids: Default::default(), - validator_groups_buf: ValidatorGroupsBuffer::with_capacity(VALIDATORS_BUFFER_CAPACITY), reconnect_timeout: Fuse::terminated(), waiting_collation_fetches: Default::default(), active_collation_fetches: Default::default(), @@ -472,7 +466,7 @@ async fn distribute_collation( let our_core = core_index; // Determine the group on that core. - let GroupValidators { validators, session_index, group_index } = + let GroupValidators { validators } = determine_our_validators(ctx, runtime, our_core, candidate_relay_parent).await?; if validators.is_empty() { @@ -485,18 +479,6 @@ async fn distribute_collation( return Ok(()) } - // It's important to insert new collation interests **before** - // issuing a connection request. - // - // If a validator managed to fetch all the relevant collations - // but still assigned to our core, we keep the connection alive. - state.validator_groups_buf.note_collation_advertised( - candidate_hash, - session_index, - group_index, - &validators, - ); - gum::debug!( target: LOG_TARGET, para_id = %id, @@ -577,9 +559,6 @@ async fn distribute_collation( struct GroupValidators { /// The validators of above group (their discovery keys). validators: Vec, - - session_index: SessionIndex, - group_index: GroupIndex, } /// Figure out current group of validators assigned to the para being collated on. 
@@ -611,11 +590,7 @@ async fn determine_our_validators( let current_validators = current_validators.iter().map(|i| validators[i.0 as usize].clone()).collect(); - let current_validators = GroupValidators { - validators: current_validators, - session_index, - group_index: current_group_index, - }; + let current_validators = GroupValidators { validators: current_validators }; Ok(current_validators) } @@ -1409,6 +1384,7 @@ async fn handle_our_view_change( if state.per_relay_parent.get(block_hash).is_none() { let claim_queue = fetch_claim_queue(ctx.sender(), *block_hash).await?; + state.per_relay_parent.insert( *block_hash, PerRelayParent::new( @@ -1469,7 +1445,6 @@ async fn handle_our_view_change( let candidate_hash: CandidateHash = collation.receipt.hash(); state.collation_result_senders.remove(&candidate_hash); - state.validator_groups_buf.remove_candidate(&candidate_hash); process_out_of_view_collation(&mut state.collation_tracker, collation_with_core); } @@ -1637,10 +1612,6 @@ async fn run_inner( // timeout, we simply start processing next request. // The request it still alive, it should be kept in a waiting queue. } else { - for authority_id in state.peer_ids.get(&peer_id).into_iter().flatten() { - // This peer has received the candidate. Not interested anymore. - state.validator_groups_buf.reset_validator_interest(candidate_hash, authority_id); - } waiting.waiting_peers.remove(&(peer_id, candidate_hash)); // Update collation status to fetched. @@ -1705,15 +1676,12 @@ async fn run_inner( } }, (candidate_hash, peer_id) = state.advertisement_timeouts.select_next_some() => { - // NOTE: it doesn't necessarily mean that a validator gets disconnected, - // it only will if there're no other advertisements we want to send. - // - // No-op if the collation was already fetched or went out of view. 
- for authority_id in state.peer_ids.get(&peer_id).into_iter().flatten() { - state - .validator_groups_buf - .reset_validator_interest(candidate_hash, &authority_id); - } + gum::debug!( + target: LOG_TARGET, + ?candidate_hash, + ?peer_id, + "Advertising to peer timed out" + ); } _ = reconnect_timeout => { diff --git a/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs index 0123ccb39a195..1627e93c75dec 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs @@ -99,7 +99,7 @@ impl Default for TestState { let validator_peer_id = std::iter::repeat_with(|| PeerId::random()).take(discovery_keys.len()).collect(); - let validator_groups = vec![vec![2, 0, 4], vec![1, 3]] + let validator_groups = vec![vec![2, 0, 4], vec![1, 3], vec![], vec![]] .into_iter() .map(|g| g.into_iter().map(ValidatorIndex).collect()) .collect(); @@ -162,6 +162,12 @@ impl TestState { &self.session_info.validator_groups.get(GroupIndex::from(group_idx)).unwrap() } + fn validator_indices_for_core(&self, core: CoreIndex) -> &[ValidatorIndex] { + let core_num = self.claim_queue.len(); + let GroupIndex(group_idx) = self.group_rotation_info.group_for_core(core, core_num); + &self.session_info.validator_groups.get(GroupIndex::from(group_idx)).unwrap() + } + fn current_session_index(&self) -> SessionIndex { self.session_index } @@ -179,6 +185,13 @@ impl TestState { .map(|i| self.session_info.discovery_keys[i.0 as usize].clone()) .collect() } + + fn validator_authority_ids_for_core(&self, core: CoreIndex) -> Vec { + self.validator_indices_for_core(core) + .iter() + .map(|i| self.session_info.discovery_keys[i.0 as usize].clone()) + .collect() + } } type VirtualOverseer = @@ -293,28 +306,31 @@ struct DistributeCollation { pov_block: PoV, } -async fn distribute_collation_with_receipt( +// Check that the next received 
message is a connection request to validators. +async fn check_connected_to_validators( virtual_overseer: &mut VirtualOverseer, - test_state: &TestState, - relay_parent: Hash, - should_connect: bool, - candidate: CandidateReceipt, - pov: PoV, - parent_head_data_hash: Hash, -) -> DistributeCollation { - overseer_send( - virtual_overseer, - CollatorProtocolMessage::DistributeCollation { - candidate_receipt: candidate.clone(), - parent_head_data_hash, - pov: pov.clone(), - parent_head_data: HeadData(vec![1, 2, 3]), - result_sender: None, - core_index: candidate.descriptor.core_index().unwrap(), - }, - ) - .await; + expected_connected: Vec, +) { + assert_matches!( + overseer_recv(virtual_overseer).await, + AllMessages::NetworkBridgeTx( + NetworkBridgeTxMessage::ConnectToValidators { + validator_ids, peer_set: _, failed: _, + } + ) => { + assert_eq!(validator_ids.len(), expected_connected.len()); + for validator in expected_connected.iter() { + assert!(validator_ids.contains(validator)); + } + } + ); +} +// Expect that the next received messages are the ones necessary to determine the validator group. +async fn expect_determine_validator_group( + virtual_overseer: &mut VirtualOverseer, + test_state: &TestState, +) { // We don't know precisely what is going to come as session info might be cached: loop { match overseer_recv(virtual_overseer).await { @@ -355,7 +371,6 @@ async fn distribute_collation_with_receipt( _relay_parent, RuntimeApiRequest::ValidatorGroups(tx), )) => { - assert_eq!(_relay_parent, relay_parent); tx.send(Ok(( test_state.session_info.validator_groups.to_vec(), test_state.group_rotation_info.clone(), @@ -367,28 +382,40 @@ async fn distribute_collation_with_receipt( other => panic!("Unexpected message received: {:?}", other), } } +} - if should_connect { - assert_matches!( - overseer_recv(virtual_overseer).await, - AllMessages::NetworkBridgeTx( - NetworkBridgeTxMessage::ConnectToValidators { - .. 
- } - ) => {} - ); - } +async fn distribute_collation_with_receipt( + virtual_overseer: &mut VirtualOverseer, + expected_connected: Vec, + test_state: &TestState, + candidate: CandidateReceipt, + pov: PoV, + parent_head_data_hash: Hash, +) -> DistributeCollation { + overseer_send( + virtual_overseer, + CollatorProtocolMessage::DistributeCollation { + candidate_receipt: candidate.clone(), + parent_head_data_hash, + pov: pov.clone(), + parent_head_data: HeadData(vec![1, 2, 3]), + result_sender: None, + core_index: candidate.descriptor.core_index().unwrap(), + }, + ) + .await; + check_connected_to_validators(virtual_overseer, expected_connected).await; + expect_determine_validator_group(virtual_overseer, test_state).await; DistributeCollation { candidate, pov_block: pov } } /// Create some PoV and distribute it. async fn distribute_collation( virtual_overseer: &mut VirtualOverseer, + expected_connected: Vec, test_state: &TestState, relay_parent: Hash, - // whether or not we expect a connection request or not. 
- should_connect: bool, ) -> DistributeCollation { // Now we want to distribute a `PoVBlock` let pov_block = PoV { block_data: BlockData(vec![42, 43, 44]) }; @@ -406,9 +433,8 @@ async fn distribute_collation( distribute_collation_with_receipt( virtual_overseer, + expected_connected, test_state, - relay_parent, - should_connect, candidate, pov_block, parent_head_data_hash, @@ -545,11 +571,22 @@ fn v1_protocol_rejected() { overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id)) .await; - update_view(&test_state, virtual_overseer, vec![(test_state.relay_parent, 10)], 1) - .await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + virtual_overseer, + vec![(test_state.relay_parent, 10)], + 1, + ) + .await; - distribute_collation(virtual_overseer, &test_state, test_state.relay_parent, true) - .await; + distribute_collation( + virtual_overseer, + test_state.current_group_validator_authority_ids(), + &test_state, + test_state.relay_parent, + ) + .await; for (val, peer) in test_state .current_group_validator_authority_ids() @@ -591,15 +628,20 @@ fn advertise_and_send_collation() { CollatorProtocolMessage::CollateOn(test_state.para_id), ) .await; - - update_view(&test_state, &mut virtual_overseer, vec![(test_state.relay_parent, 10)], 1) - .await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + &mut virtual_overseer, + vec![(test_state.relay_parent, 10)], + 1, + ) + .await; let DistributeCollation { candidate, pov_block } = distribute_collation( &mut virtual_overseer, + test_state.current_group_validator_authority_ids(), &test_state, test_state.relay_parent, - true, ) .await; @@ -702,8 +744,14 @@ fn advertise_and_send_collation() { test_state.relay_parent.randomize(); // Update our view, making the old relay parent go out of the implicit view. 
- update_view(&test_state, &mut virtual_overseer, vec![(test_state.relay_parent, 20)], 1) - .await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + &mut virtual_overseer, + vec![(test_state.relay_parent, 20)], + 1, + ) + .await; let peer = test_state.validator_peer_id[2]; @@ -733,9 +781,9 @@ fn advertise_and_send_collation() { let DistributeCollation { candidate, .. } = distribute_collation( &mut virtual_overseer, + test_state.current_group_validator_authority_ids(), &test_state, test_state.relay_parent, - true, ) .await; @@ -781,14 +829,20 @@ fn delay_reputation_change() { ) .await; - update_view(&test_state, &mut virtual_overseer, vec![(test_state.relay_parent, 10)], 1) - .await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + &mut virtual_overseer, + vec![(test_state.relay_parent, 10)], + 1, + ) + .await; let DistributeCollation { candidate, .. } = distribute_collation( &mut virtual_overseer, + test_state.current_group_validator_authority_ids(), &test_state, test_state.relay_parent, - true, ) .await; @@ -929,6 +983,7 @@ fn collators_declare_to_connected_peers() { .await; update_view( + Some(test_state.current_group_validator_authority_ids()), &test_state, &mut test_harness.virtual_overseer, vec![(test_state.relay_parent, 10)], @@ -972,8 +1027,14 @@ fn collations_are_only_advertised_to_validators_with_correct_view() { overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id)) .await; - update_view(&test_state, virtual_overseer, vec![(test_state.relay_parent, 10)], 1) - .await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + virtual_overseer, + vec![(test_state.relay_parent, 10)], + 1, + ) + .await; // A validator connected to us connect_peer(virtual_overseer, peer, CollationVersion::V2, Some(validator_id)).await; @@ -987,9 +1048,13 @@ fn collations_are_only_advertised_to_validators_with_correct_view() { 
// And let it tell us that it is has the same view. send_peer_view_change(virtual_overseer, &peer2, vec![test_state.relay_parent]).await; - let DistributeCollation { candidate, .. } = - distribute_collation(virtual_overseer, &test_state, test_state.relay_parent, true) - .await; + let DistributeCollation { candidate, .. } = distribute_collation( + virtual_overseer, + test_state.current_group_validator_authority_ids(), + &test_state, + test_state.relay_parent, + ) + .await; expect_advertise_collation_msg( virtual_overseer, @@ -1037,8 +1102,14 @@ fn collate_on_two_different_relay_chain_blocks() { overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id)) .await; - update_view(&test_state, virtual_overseer, vec![(test_state.relay_parent, 10)], 1) - .await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + virtual_overseer, + vec![(test_state.relay_parent, 10)], + 1, + ) + .await; // A validator connected to us connect_peer(virtual_overseer, peer, CollationVersion::V2, Some(validator_id)).await; @@ -1049,9 +1120,13 @@ fn collate_on_two_different_relay_chain_blocks() { expect_declare_msg(virtual_overseer, &test_state, &peer).await; expect_declare_msg(virtual_overseer, &test_state, &peer2).await; - let DistributeCollation { candidate: old_candidate, .. } = - distribute_collation(virtual_overseer, &test_state, test_state.relay_parent, true) - .await; + let DistributeCollation { candidate: old_candidate, .. } = distribute_collation( + virtual_overseer, + test_state.current_group_validator_authority_ids(), + &test_state, + test_state.relay_parent, + ) + .await; let old_relay_parent = test_state.relay_parent; @@ -1059,6 +1134,7 @@ fn collate_on_two_different_relay_chain_blocks() { // parent are active. 
test_state.relay_parent.randomize(); update_view( + Some(test_state.current_group_validator_authority_ids()), &test_state, virtual_overseer, vec![(old_relay_parent, 10), (test_state.relay_parent, 10)], @@ -1066,9 +1142,13 @@ fn collate_on_two_different_relay_chain_blocks() { ) .await; - let DistributeCollation { candidate: new_candidate, .. } = - distribute_collation(virtual_overseer, &test_state, test_state.relay_parent, true) - .await; + let DistributeCollation { candidate: new_candidate, .. } = distribute_collation( + virtual_overseer, + test_state.current_group_validator_authority_ids(), + &test_state, + test_state.relay_parent, + ) + .await; send_peer_view_change(virtual_overseer, &peer, vec![old_relay_parent]).await; expect_advertise_collation_msg( @@ -1112,17 +1192,27 @@ fn validator_reconnect_does_not_advertise_a_second_time() { overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id)) .await; - update_view(&test_state, virtual_overseer, vec![(test_state.relay_parent, 10)], 1) - .await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + virtual_overseer, + vec![(test_state.relay_parent, 10)], + 1, + ) + .await; // A validator connected to us connect_peer(virtual_overseer, peer, CollationVersion::V2, Some(validator_id.clone())) .await; expect_declare_msg(virtual_overseer, &test_state, &peer).await; - let DistributeCollation { candidate, .. } = - distribute_collation(virtual_overseer, &test_state, test_state.relay_parent, true) - .await; + let DistributeCollation { candidate, .. 
} = distribute_collation( + virtual_overseer, + test_state.current_group_validator_authority_ids(), + &test_state, + test_state.relay_parent, + ) + .await; send_peer_view_change(virtual_overseer, &peer, vec![test_state.relay_parent]).await; expect_advertise_collation_msg( @@ -1166,8 +1256,14 @@ fn collators_reject_declare_messages() { overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id)) .await; - update_view(&test_state, virtual_overseer, vec![(test_state.relay_parent, 10)], 1) - .await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + virtual_overseer, + vec![(test_state.relay_parent, 10)], + 1, + ) + .await; // A validator connected to us connect_peer(virtual_overseer, peer, CollationVersion::V2, Some(validator_id)).await; @@ -1227,12 +1323,22 @@ where overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id)) .await; - update_view(&test_state, virtual_overseer, vec![(test_state.relay_parent, 10)], 1) - .await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + virtual_overseer, + vec![(test_state.relay_parent, 10)], + 1, + ) + .await; - let DistributeCollation { candidate, pov_block } = - distribute_collation(virtual_overseer, &test_state, test_state.relay_parent, true) - .await; + let DistributeCollation { candidate, pov_block } = distribute_collation( + virtual_overseer, + test_state.current_group_validator_authority_ids(), + &test_state, + test_state.relay_parent, + ) + .await; for (val, peer) in test_state .current_group_validator_authority_ids() @@ -1352,7 +1458,7 @@ where } #[test] -fn connect_to_buffered_groups() { +fn connect_to_group_in_view() { let mut test_state = TestState::default(); let local_peer_id = test_state.local_peer_id; let collator_pair = test_state.collator_pair.clone(); @@ -1371,8 +1477,14 @@ fn connect_to_buffered_groups() { ) .await; - update_view(&test_state, &mut virtual_overseer, 
vec![(test_state.relay_parent, 10)], 1) - .await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + &mut virtual_overseer, + vec![(test_state.relay_parent, 10)], + 1, + ) + .await; let group_a = test_state.current_group_validator_authority_ids(); let peers_a = test_state.current_group_validator_peer_ids(); @@ -1380,21 +1492,12 @@ fn connect_to_buffered_groups() { let DistributeCollation { candidate, .. } = distribute_collation( &mut virtual_overseer, + test_state.current_group_validator_authority_ids(), &test_state, test_state.relay_parent, - false, ) .await; - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridgeTx( - NetworkBridgeTxMessage::ConnectToValidators { validator_ids, .. } - ) => { - assert_eq!(group_a, validator_ids); - } - ); - let head_a = test_state.relay_parent; for (val, peer) in group_a.iter().zip(&peers_a) { @@ -1453,8 +1556,13 @@ fn connect_to_buffered_groups() { let old_relay_parent = test_state.relay_parent; test_state.relay_parent.randomize(); + test_state.group_rotation_info = test_state.group_rotation_info.bump_rotation(); + // Update our view. + let mut expected_group = group_a.clone(); + expected_group.extend(test_state.current_group_validator_authority_ids()); update_view( + Some(expected_group.clone()), &test_state, &mut virtual_overseer, vec![(old_relay_parent, 10), (test_state.relay_parent, 20)], @@ -1462,8 +1570,6 @@ fn connect_to_buffered_groups() { ) .await; - test_state.group_rotation_info = test_state.group_rotation_info.bump_rotation(); - let head_b = test_state.relay_parent; let group_b = test_state.current_group_validator_authority_ids(); assert_ne!(head_a, head_b); @@ -1471,27 +1577,12 @@ fn connect_to_buffered_groups() { distribute_collation( &mut virtual_overseer, + expected_group, &test_state, test_state.relay_parent, - false, ) .await; - // Should be connected to both groups except for the validator that fetched advertised - // collation. 
- assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridgeTx( - NetworkBridgeTxMessage::ConnectToValidators { validator_ids, .. } - ) => { - assert!(!validator_ids.contains(&group_a[0])); - - for validator in group_a[1..].iter().chain(&group_b) { - assert!(validator_ids.contains(validator)); - } - } - ); - TestHarness { virtual_overseer, req_v2_cfg: req_cfg } }, ); @@ -1517,35 +1608,38 @@ fn connect_with_no_cores_assigned() { ) .await; - update_view(&test_state, &mut virtual_overseer, vec![(test_state.relay_parent, 10)], 1) - .await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + &mut virtual_overseer, + vec![(test_state.relay_parent, 10)], + 1, + ) + .await; let group_a = test_state.current_group_validator_authority_ids(); assert!(group_a.len() > 1); distribute_collation( &mut virtual_overseer, + test_state.current_group_validator_authority_ids(), &test_state, test_state.relay_parent, - false, ) .await; - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridgeTx( - NetworkBridgeTxMessage::ConnectToValidators { validator_ids, .. } - ) => { - assert_eq!(group_a, validator_ids); - } - ); - // Create a new relay parent and remove the core assignments. test_state.relay_parent.randomize(); test_state.claim_queue.clear(); - update_view(&test_state, &mut virtual_overseer, vec![(test_state.relay_parent, 20)], 1) - .await; + update_view( + Some(vec![]), + &test_state, + &mut virtual_overseer, + vec![(test_state.relay_parent, 20)], + 1, + ) + .await; // Send the ActiveLeaves signal to trigger the reconnect timeout. 
overseer_signal( diff --git a/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs b/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs index 8be5c4e876b5b..fbf514f39105e 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs @@ -28,6 +28,7 @@ fn get_parent_hash(hash: Hash) -> Hash { /// Handle a view update. pub(super) async fn update_view( + expected_connected: Option>, test_state: &TestState, virtual_overseer: &mut VirtualOverseer, new_view: Vec<(Hash, u32)>, // Hash and block number. @@ -63,7 +64,6 @@ pub(super) async fn update_view( .take(ancestry_len as usize); let ancestry_numbers = (min_number..=leaf_number).rev(); let mut ancestry_iter = ancestry_hashes.clone().zip(ancestry_numbers).peekable(); - if let Some((hash, number)) = ancestry_iter.next() { assert_matches!( overseer_recv_with_timeout(virtual_overseer, Duration::from_millis(50)).await.unwrap(), @@ -180,6 +180,14 @@ pub(super) async fn update_view( ); } + for (_core, _paras) in test_state + .claim_queue + .iter() + .filter(|(_, paras)| paras.contains(&test_state.para_id)) + { + expect_determine_validator_group(virtual_overseer, &test_state).await; + } + for _ in ancestry_iter { while let Some(msg) = overseer_peek_with_timeout(virtual_overseer, Duration::from_millis(50)).await @@ -196,10 +204,33 @@ pub(super) async fn update_view( _, RuntimeApiRequest::CandidateEvents(_), )) + ) && !matches!( + &msg, + AllMessages::RuntimeApi(RuntimeApiMessage::Request( + _, + RuntimeApiRequest::SessionIndexForChild(_), + )) ) { break } + if matches!( + &msg, + AllMessages::RuntimeApi(RuntimeApiMessage::Request( + _, + RuntimeApiRequest::SessionIndexForChild(_), + )) + ) { + for (_core, _paras) in test_state + .claim_queue + .iter() + .filter(|(_, paras)| paras.contains(&test_state.para_id)) + { + 
expect_determine_validator_group(virtual_overseer, &test_state).await; + } + break; + } + match overseer_recv_with_timeout(virtual_overseer, Duration::from_millis(50)) .await .unwrap() @@ -223,6 +254,9 @@ pub(super) async fn update_view( } } } + if let Some(expected_connected) = expected_connected { + check_connected_to_validators(virtual_overseer, expected_connected).await; + } } /// Check that the next received message is a `Declare` message. @@ -303,12 +337,25 @@ fn distribute_collation_from_implicit_view(#[case] validator_sends_view_first: b if validator_sends_view_first { // Activate leaf `c` to accept at least the collation. - update_view(&test_state, virtual_overseer, vec![(head_c, head_c_num)], 1).await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + virtual_overseer, + vec![(head_c, head_c_num)], + 1, + ) + .await; } else { // Activated leaf is `b`, but the collation will be based on `c`. - update_view(&test_state, virtual_overseer, vec![(head_b, head_b_num)], 1).await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + virtual_overseer, + vec![(head_b, head_b_num)], + 1, + ) + .await; } - let validator_peer_ids = test_state.current_group_validator_peer_ids(); for (val, peer) in test_state .current_group_validator_authority_ids() @@ -332,27 +379,17 @@ fn distribute_collation_from_implicit_view(#[case] validator_sends_view_first: b ..Default::default() } .build(); + let DistributeCollation { candidate, pov_block: _ } = distribute_collation_with_receipt( virtual_overseer, + test_state.current_group_validator_authority_ids(), &test_state, - head_c, - false, // Check the group manually. candidate, pov, parent_head_data_hash, ) .await; - assert_matches!( - overseer_recv(virtual_overseer).await, - AllMessages::NetworkBridgeTx( - NetworkBridgeTxMessage::ConnectToValidators { validator_ids, .. 
} - ) => { - let expected_validators = test_state.current_group_validator_authority_ids(); - - assert_eq!(expected_validators, validator_ids); - } - ); let candidate_hash = candidate.hash(); @@ -373,7 +410,8 @@ fn distribute_collation_from_implicit_view(#[case] validator_sends_view_first: b if validator_sends_view_first { // Activated leaf is `b`, but the collation will be based on `c`. - update_view(&test_state, virtual_overseer, vec![(head_b, head_b_num)], 1).await; + update_view(None, &test_state, virtual_overseer, vec![(head_b, head_b_num)], 1) + .await; for _ in &validator_peer_ids { expect_advertise_collation_msg( @@ -384,11 +422,24 @@ fn distribute_collation_from_implicit_view(#[case] validator_sends_view_first: b ) .await; } + + check_connected_to_validators( + virtual_overseer, + test_state.current_group_validator_authority_ids(), + ) + .await; } // Head `c` goes out of view. // Build a different candidate for this relay parent and attempt to distribute it. - update_view(&test_state, virtual_overseer, vec![(head_a, head_a_num)], 1).await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + virtual_overseer, + vec![(head_a, head_a_num)], + 1, + ) + .await; let pov = PoV { block_data: BlockData(vec![4, 5, 6]) }; let parent_head_data_hash = Hash::repeat_byte(0xBB); @@ -412,6 +463,12 @@ fn distribute_collation_from_implicit_view(#[case] validator_sends_view_first: b ) .await; + check_connected_to_validators( + virtual_overseer, + test_state.current_group_validator_authority_ids(), + ) + .await; + // Parent out of view, nothing happens. assert!(overseer_recv_with_timeout(virtual_overseer, Duration::from_millis(100)) .await @@ -453,7 +510,14 @@ fn distribute_collation_up_to_limit() { overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id)) .await; // Activated leaf is `a`, but the collation will be based on `b`. 
- update_view(&test_state, virtual_overseer, vec![(head_a, head_a_num)], 1).await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + virtual_overseer, + vec![(head_a, head_a_num)], + 1, + ) + .await; for i in 0..expected_assignments { let pov = PoV { block_data: BlockData(vec![i as u8]) }; @@ -468,9 +532,8 @@ fn distribute_collation_up_to_limit() { .build(); distribute_collation_with_receipt( virtual_overseer, + test_state.current_group_validator_authority_ids(), &test_state, - head_b, - true, candidate, pov, parent_head_data_hash, @@ -501,6 +564,11 @@ fn distribute_collation_up_to_limit() { ) .await; + check_connected_to_validators( + virtual_overseer, + test_state.current_group_validator_authority_ids(), + ) + .await; // Limit has been reached. assert!(overseer_recv_with_timeout(virtual_overseer, Duration::from_millis(100)) .await @@ -531,6 +599,12 @@ fn distribute_collation_up_to_limit() { ) .await; + check_connected_to_validators( + virtual_overseer, + test_state.current_group_validator_authority_ids(), + ) + .await; + assert!(overseer_recv_with_timeout(virtual_overseer, Duration::from_millis(100)) .await .is_none()); @@ -566,7 +640,24 @@ fn send_parent_head_data_for_elastic_scaling() { CollatorProtocolMessage::CollateOn(test_state.para_id), ) .await; - update_view(&test_state, &mut virtual_overseer, vec![(head_b, head_b_num)], 1).await; + let expected_connected = [CoreIndex(0), CoreIndex(2), CoreIndex(3)] + .into_iter() + .map(|core| test_state.validator_authority_ids_for_core(core)) + .fold(HashSet::new(), |mut acc, res| { + acc.extend(res.into_iter()); + acc + }) + .into_iter() + .collect::>(); + + update_view( + Some(expected_connected.clone()), + &test_state, + &mut virtual_overseer, + vec![(head_b, head_b_num)], + 1, + ) + .await; let pov_data = PoV { block_data: BlockData(vec![1 as u8]) }; let candidate = TestCandidateBuilder { @@ -582,9 +673,8 @@ fn send_parent_head_data_for_elastic_scaling() { 
distribute_collation_with_receipt( &mut virtual_overseer, + expected_connected, &test_state, - head_b, - true, candidate.clone(), pov_data.clone(), phdh, @@ -679,8 +769,22 @@ fn advertise_and_send_collation_by_hash() { CollatorProtocolMessage::CollateOn(test_state.para_id), ) .await; - update_view(&test_state, &mut virtual_overseer, vec![(head_b, head_b_num)], 1).await; - update_view(&test_state, &mut virtual_overseer, vec![(head_a, head_a_num)], 1).await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + &mut virtual_overseer, + vec![(head_b, head_b_num)], + 1, + ) + .await; + update_view( + Some(test_state.current_group_validator_authority_ids()), + &test_state, + &mut virtual_overseer, + vec![(head_a, head_a_num)], + 1, + ) + .await; let candidates: Vec<_> = (0..2) .map(|i| { @@ -699,9 +803,8 @@ fn advertise_and_send_collation_by_hash() { for (candidate, pov) in &candidates { distribute_collation_with_receipt( &mut virtual_overseer, + test_state.current_group_validator_authority_ids(), &test_state, - head_b, - true, candidate.clone(), pov.clone(), Hash::zero(), diff --git a/polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs b/polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs index 8ed4a3c266c0f..ec467c0ab843e 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs @@ -30,185 +30,16 @@ //! The bitwise OR over known advertisements gives us validators indices for connection request. 
use std::{ - collections::{HashMap, VecDeque}, future::Future, - num::NonZeroUsize, - ops::Range, pin::Pin, task::{Context, Poll}, time::Duration, }; -use bitvec::{bitvec, vec::BitVec}; use futures::FutureExt; use polkadot_node_network_protocol::PeerId; -use polkadot_primitives::{AuthorityDiscoveryId, CandidateHash, GroupIndex, SessionIndex}; - -/// Elastic scaling: how many candidates per relay chain block the collator supports building. -pub const MAX_CHAINED_CANDIDATES_PER_RCB: NonZeroUsize = match NonZeroUsize::new(3) { - Some(cap) => cap, - None => panic!("max candidates per rcb cannot be zero"), -}; - -/// The ring buffer stores at most this many unique validator groups. -/// -/// This value should be chosen in way that all groups assigned to our para -/// in the view can fit into the buffer multiplied by amount of candidates we support per relay -/// chain block in the case of elastic scaling. -pub const VALIDATORS_BUFFER_CAPACITY: NonZeroUsize = - match NonZeroUsize::new(3 * MAX_CHAINED_CANDIDATES_PER_RCB.get()) { - Some(cap) => cap, - None => panic!("buffer capacity must be non-zero"), - }; - -/// Unique identifier of a validators group. -#[derive(Debug, Clone)] -struct ValidatorsGroupInfo { - /// Number of validators in the group. - len: usize, - session_index: SessionIndex, - group_index: GroupIndex, -} - -/// Ring buffer of validator groups. -/// -/// Tracks which peers we want to be connected to with respect to advertised collations. -#[derive(Debug, Clone)] -pub struct ValidatorGroupsBuffer { - /// Validator groups identifiers we **had** advertisements for. - group_infos: VecDeque, - /// Continuous buffer of validators discovery keys. - validators: VecDeque, - /// Mapping from candidate hashes to bit-vectors with bits for all `validators`. - /// Invariants kept: All bit-vectors are guaranteed to have the same size. - should_be_connected: HashMap, - /// Buffer capacity, limits the number of **groups** tracked. 
- cap: NonZeroUsize, -} - -impl ValidatorGroupsBuffer { - /// Creates a new buffer with a non-zero capacity. - pub fn with_capacity(cap: NonZeroUsize) -> Self { - Self { - group_infos: VecDeque::new(), - validators: VecDeque::new(), - should_be_connected: HashMap::new(), - cap, - } - } - - /// Note a new advertisement, marking that we want to be connected to validators - /// from this group. - /// - /// If max capacity is reached and the group is new, drops validators from the back - /// of the buffer. - pub fn note_collation_advertised( - &mut self, - candidate_hash: CandidateHash, - session_index: SessionIndex, - group_index: GroupIndex, - validators: &[AuthorityDiscoveryId], - ) { - if validators.is_empty() { - return - } - - match self.group_infos.iter().enumerate().find(|(_, group)| { - group.session_index == session_index && group.group_index == group_index - }) { - Some((idx, group)) => { - let group_start_idx = self.group_lengths_iter().take(idx).sum(); - self.set_bits(candidate_hash, group_start_idx..(group_start_idx + group.len)); - }, - None => self.push(candidate_hash, session_index, group_index, validators), - } - } - - /// Note that a validator is no longer interested in a given candidate. - pub fn reset_validator_interest( - &mut self, - candidate_hash: CandidateHash, - authority_id: &AuthorityDiscoveryId, - ) { - let bits = match self.should_be_connected.get_mut(&candidate_hash) { - Some(bits) => bits, - None => return, - }; - - for (idx, auth_id) in self.validators.iter().enumerate() { - if auth_id == authority_id { - bits.set(idx, false); - } - } - } - - /// Remove advertised candidate from the buffer. - /// - /// The buffer will no longer track which validators are interested in a corresponding - /// advertisement. - pub fn remove_candidate(&mut self, candidate_hash: &CandidateHash) { - self.should_be_connected.remove(candidate_hash); - } - - /// Pushes a new group to the buffer along with advertisement, setting all validators - /// bits to 1. 
- /// - /// If the buffer is full, drops group from the tail. - fn push( - &mut self, - candidate_hash: CandidateHash, - session_index: SessionIndex, - group_index: GroupIndex, - validators: &[AuthorityDiscoveryId], - ) { - let new_group_info = - ValidatorsGroupInfo { len: validators.len(), session_index, group_index }; - - let buf = &mut self.group_infos; - let cap = self.cap.get(); - - if buf.len() >= cap { - let pruned_group = buf.pop_front().expect("buf is not empty; qed"); - self.validators.drain(..pruned_group.len); - - self.should_be_connected.values_mut().for_each(|bits| { - bits.as_mut_bitslice().shift_left(pruned_group.len); - }); - } - - self.validators.extend(validators.iter().cloned()); - buf.push_back(new_group_info); - let buf_len = buf.len(); - let group_start_idx = self.group_lengths_iter().take(buf_len - 1).sum(); - - let new_len = self.validators.len(); - self.should_be_connected - .values_mut() - .for_each(|bits| bits.resize(new_len, false)); - self.set_bits(candidate_hash, group_start_idx..(group_start_idx + validators.len())); - } - - /// Sets advertisement bits to 1 in a given range (usually corresponding to some group). - /// If the relay parent is unknown, inserts 0-initialized bitvec first. - /// - /// The range must be ensured to be within bounds. - fn set_bits(&mut self, candidate_hash: CandidateHash, range: Range) { - let bits = self - .should_be_connected - .entry(candidate_hash) - .or_insert_with(|| bitvec![0; self.validators.len()]); - - bits[range].fill(true); - } - - /// Returns iterator over numbers of validators in groups. - /// - /// Useful for getting an index of the first validator in i-th group. - fn group_lengths_iter(&self) -> impl Iterator + '_ { - self.group_infos.iter().map(|group| group.len) - } -} +use polkadot_primitives::CandidateHash; /// A timeout for resetting validators' interests in collations. 
pub const RESET_INTEREST_TIMEOUT: Duration = Duration::from_secs(6); @@ -239,115 +70,3 @@ impl Future for ResetInterestTimeout { self.fut.poll_unpin(cx).map(|_| (self.candidate_hash, self.peer_id)) } } - -#[cfg(test)] -mod tests { - use super::*; - use polkadot_primitives::Hash; - use sp_keyring::Sr25519Keyring; - - #[test] - fn one_capacity_buffer() { - let cap = NonZeroUsize::new(1).unwrap(); - let mut buf = ValidatorGroupsBuffer::with_capacity(cap); - - let hash_a = CandidateHash(Hash::repeat_byte(0x1)); - let hash_b = CandidateHash(Hash::repeat_byte(0x2)); - - let validators: Vec<_> = [ - Sr25519Keyring::Alice, - Sr25519Keyring::Bob, - Sr25519Keyring::Charlie, - Sr25519Keyring::Dave, - Sr25519Keyring::Ferdie, - ] - .into_iter() - .map(|key| AuthorityDiscoveryId::from(key.public())) - .collect(); - - assert!(buf.validators_to_connect().is_empty()); - - buf.note_collation_advertised(hash_a, 0, GroupIndex(0), &validators[..2]); - assert_eq!(buf.validators_to_connect(), validators[..2].to_vec()); - - buf.reset_validator_interest(hash_a, &validators[1]); - assert_eq!(buf.validators_to_connect(), validators[0..2].to_vec()); - - buf.note_collation_advertised(hash_b, 0, GroupIndex(1), &validators[2..]); - assert_eq!(buf.validators_to_connect(), validators[2..].to_vec()); - - for validator in &validators[2..] 
{ - buf.reset_validator_interest(hash_b, validator); - } - let mut expected = validators[2..].to_vec(); - expected.sort(); - let mut result = buf.validators_to_connect(); - result.sort(); - assert_eq!(result, expected); - } - - #[test] - fn buffer_works() { - let cap = NonZeroUsize::new(3).unwrap(); - let mut buf = ValidatorGroupsBuffer::with_capacity(cap); - - let hashes: Vec<_> = (0..5).map(|i| CandidateHash(Hash::repeat_byte(i))).collect(); - - let validators: Vec<_> = [ - Sr25519Keyring::Alice, - Sr25519Keyring::Bob, - Sr25519Keyring::Charlie, - Sr25519Keyring::Dave, - Sr25519Keyring::Ferdie, - ] - .into_iter() - .map(|key| AuthorityDiscoveryId::from(key.public())) - .collect(); - - buf.note_collation_advertised(hashes[0], 0, GroupIndex(0), &validators[..2]); - buf.note_collation_advertised(hashes[1], 0, GroupIndex(0), &validators[..2]); - buf.note_collation_advertised(hashes[2], 0, GroupIndex(1), &validators[2..4]); - buf.note_collation_advertised(hashes[2], 0, GroupIndex(1), &validators[2..4]); - - assert_eq!(buf.validators_to_connect(), validators[..4].to_vec()); - - for validator in &validators[2..4] { - buf.reset_validator_interest(hashes[2], validator); - } - - buf.reset_validator_interest(hashes[1], &validators[0]); - let mut expected: Vec<_> = validators[..4].iter().cloned().collect(); - let mut result = buf.validators_to_connect(); - expected.sort(); - result.sort(); - assert_eq!(result, expected); - - buf.reset_validator_interest(hashes[0], &validators[0]); - let mut expected: Vec<_> = validators[1..4].iter().cloned().collect(); - expected.sort(); - let mut result = buf.validators_to_connect(); - result.sort(); - assert_eq!(result, expected); - - buf.note_collation_advertised(hashes[3], 0, GroupIndex(1), &validators[2..4]); - buf.note_collation_advertised( - hashes[4], - 0, - GroupIndex(2), - std::slice::from_ref(&validators[4]), - ); - - buf.reset_validator_interest(hashes[3], &validators[2]); - buf.note_collation_advertised( - hashes[4], - 0, - 
GroupIndex(3), - std::slice::from_ref(&validators[0]), - ); - - assert_eq!( - buf.validators_to_connect(), - vec![validators[3].clone(), validators[4].clone(), validators[0].clone()] - ); - } -} From 76d7cdfa2de9ed072b43b40513763137905a238c Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Thu, 2 Oct 2025 14:27:28 +0300 Subject: [PATCH 04/59] ensure we consider all valid relay parents Signed-off-by: Andrei Sandu --- .../src/collator_side/mod.rs | 46 ++++++++++++++----- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs index 6ec1227abb984..9f22ea0c76a63 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs @@ -53,8 +53,8 @@ use polkadot_node_subsystem_util::{ }; use polkadot_primitives::{ AuthorityDiscoveryId, BlockNumber, CandidateEvent, CandidateHash, - CandidateReceiptV2 as CandidateReceipt, CollatorPair, CoreIndex, Hash, HeadData, - Id as ParaId, SessionIndex, + CandidateReceiptV2 as CandidateReceipt, CollatorPair, CoreIndex, Hash, HeadData, Id as ParaId, + SessionIndex, }; use crate::{modify_reputation, LOG_TARGET, LOG_TARGET_STATS}; @@ -402,7 +402,7 @@ async fn distribute_collation( // We should already be connected to the validators, but if we aren't, we will try to connect to // them now. 
- connect_to_validators(ctx, &state.implicit_view, &state.per_relay_parent).await; + connect_to_validators(ctx, &state.implicit_view, &state.per_relay_parent, id).await; let per_relay_parent = match state.per_relay_parent.get_mut(&candidate_relay_parent) { Some(per_relay_parent) => per_relay_parent, @@ -645,14 +645,21 @@ fn has_assigned_cores( fn list_of_backing_validators_in_view( implicit_view: &Option, per_relay_parent: &HashMap, + para_id: ParaId, ) -> Vec { let mut backing_validators = HashSet::new(); let Some(implicit_view) = implicit_view else { return vec![] }; for leaf in implicit_view.leaves() { - if let Some(relay_parent) = per_relay_parent.get(leaf) { - for group in relay_parent.validator_group.values() { - backing_validators.extend(group.validators.iter().cloned()); + let allowed_ancestry = implicit_view + .known_allowed_relay_parents_under(leaf, Some(para_id)) + .unwrap_or_default(); + + for allowed_relay_parent in allowed_ancestry { + if let Some(relay_parent) = per_relay_parent.get(allowed_relay_parent) { + for group in relay_parent.validator_group.values() { + backing_validators.extend(group.validators.iter().cloned()); + } } } } @@ -667,13 +674,14 @@ async fn connect_to_validators( ctx: &mut Context, implicit_view: &Option, per_relay_parent: &HashMap, + para_id: ParaId, ) { let cores_assigned = has_assigned_cores(implicit_view, per_relay_parent); // If no cores are assigned to the para, we still need to send a ConnectToValidators request to // the network bridge passing an empty list of validator ids. Otherwise, it will keep connecting // to the last requested validators until a new request is issued. 
let validator_ids = if cores_assigned { - list_of_backing_validators_in_view(implicit_view, per_relay_parent) + list_of_backing_validators_in_view(implicit_view, per_relay_parent, para_id) } else { Vec::new() }; @@ -1260,7 +1268,11 @@ async fn handle_network_msg( OurViewChange(view) => { gum::trace!(target: LOG_TARGET, ?view, "Own view change"); handle_our_view_change(ctx, runtime, state, view).await?; - connect_to_validators(ctx, &state.implicit_view, &state.per_relay_parent).await; + // Connect only if we are collating on a para. + if let Some(para_id) = state.collating_on { + connect_to_validators(ctx, &state.implicit_view, &state.per_relay_parent, para_id) + .await; + } }, PeerMessage(remote, msg) => { handle_incoming_peer_message(ctx, runtime, state, remote, msg).await?; @@ -1363,7 +1375,16 @@ async fn handle_our_view_change( state.per_relay_parent.insert( *leaf, - PerRelayParent::new(ctx, runtime, para_id, claim_queue, block_number, *leaf, session_index).await?, + PerRelayParent::new( + ctx, + runtime, + para_id, + claim_queue, + block_number, + *leaf, + session_index, + ) + .await?, ); process_block_events( @@ -1426,7 +1447,7 @@ async fn handle_our_view_change( claim_queue, block_number, *block_hash, - session_index + session_index, ) .await?, ) @@ -1740,7 +1761,10 @@ async fn run_inner( } _ = reconnect_timeout => { - connect_to_validators(&mut ctx, &state.implicit_view, &state.per_relay_parent).await; + // Connect only if we are collating on a para. 
+ if let Some(para_id) = state.collating_on { + connect_to_validators(&mut ctx, &state.implicit_view, &state.per_relay_parent, para_id).await; + } gum::trace!( target: LOG_TARGET, From 1ac73b5dd9858845246b3f1400acea4d10a576c8 Mon Sep 17 00:00:00 2001 From: "cmd[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 2 Oct 2025 12:09:44 +0000 Subject: [PATCH 05/59] Update from github-actions[bot] running command 'prdoc generate --bump patch' --- prdoc/pr_9178.prdoc | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 prdoc/pr_9178.prdoc diff --git a/prdoc/pr_9178.prdoc b/prdoc/pr_9178.prdoc new file mode 100644 index 0000000000000..fd4f0aa52f63f --- /dev/null +++ b/prdoc/pr_9178.prdoc @@ -0,0 +1,22 @@ +title: 'collator-protocol: cleanup connecting to backing group' +doc: +- audience: Todo + description: |- + There are a few things wrong with the way we are handling connecting the validators in the backing group: + 1. `validators_to_connect` returns only validators in groups we already have a block to advertise and the last backing groups we advertised something to, that means that if our backing group changes, but we don't have anything to advertise it will continue to try to connect to the previous backing group and validator will log this and disconnect it immediately. + On the validator you will see`Declared as collator for unneeded para` and on the collator you will see Connect/Disconnect requests. This will continue every reconnect_timeout(4s from each active signal) until the collator advertises something to the new backing group. This is harmless, but it pollutes both the collator and the validator logs. + + 2. A collator connects only when it has something to advertise to its backing group, this is a bit too late and we can improve it by connecting the collators to the backing group immediately after they notice their assigned backing group. + + 3. 
Staying connected to the last backing group we advertised something to does not work for elastic scaling because we have different backing groups and if the collator set is big enough that collators author just one block per group rotation, then we will always connect just when we have a candidate to advertise. + + ## Proposal to fix: + + Have collators always connect to the backing group they got assigned to and keep the connection open until backing group changes. Also, try to connect when we have something to advertise or on timeout to have more chances of being correctly connected. + + ## Todo + - [x] Confirm that proposal does not have other undesired side effects. + - [x] Tests +crates: +- name: polkadot-collator-protocol + bump: patch From 39fb8e12bc3c8607b833e52360f783a795068720 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 3 Oct 2025 18:17:25 +0300 Subject: [PATCH 06/59] Support ConnectToBackingGroups Signed-off-by: Andrei Sandu --- Cargo.lock | 6 ++ cumulus/client/service/Cargo.toml | 6 ++ cumulus/client/service/src/lib.rs | 66 +++++++++++++++++++ .../polkadot-omni-node/lib/src/nodes/aura.rs | 35 ++++++++-- .../src/collator_side/mod.rs | 40 ++++++++++- .../src/validator_side/mod.rs | 12 ++++ polkadot/node/subsystem-types/src/messages.rs | 6 ++ 7 files changed, 164 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 62b2015942dfc..4be25ebdeaf82 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4577,10 +4577,14 @@ dependencies = [ "cumulus-relay-chain-minimal-node", "cumulus-relay-chain-streams", "futures", + "parity-scale-codec", + "polkadot-node-subsystem", + "polkadot-overseer", "polkadot-primitives", "prometheus", "sc-client-api", "sc-consensus", + "sc-consensus-aura", "sc-network", "sc-network-sync", "sc-network-transactions", @@ -4593,8 +4597,10 @@ dependencies = [ "sp-api", "sp-blockchain", "sp-consensus", + "sp-consensus-aura", "sp-core 28.0.0", "sp-io", + "sp-keystore", "sp-runtime", "sp-transaction-pool", ] diff --git 
a/cumulus/client/service/Cargo.toml b/cumulus/client/service/Cargo.toml index 3ea36d70b42b6..739b30270adaf 100644 --- a/cumulus/client/service/Cargo.toml +++ b/cumulus/client/service/Cargo.toml @@ -15,10 +15,12 @@ workspace = true async-channel = { workspace = true } futures = { workspace = true } prometheus = { workspace = true } +codec = { workspace = true, default-features = true } # Substrate sc-client-api = { workspace = true, default-features = true } sc-consensus = { workspace = true, default-features = true } +sc-consensus-aura = { workspace = true, default-features = true } sc-network = { workspace = true, default-features = true } sc-network-sync = { workspace = true, default-features = true } sc-network-transactions = { workspace = true, default-features = true } @@ -31,13 +33,17 @@ sc-utils = { workspace = true, default-features = true } sp-api = { workspace = true, default-features = true } sp-blockchain = { workspace = true, default-features = true } sp-consensus = { workspace = true, default-features = true } +sp-consensus-aura = { workspace = true, default-features = true } sp-core = { workspace = true, default-features = true } sp-io = { workspace = true, default-features = true } sp-runtime = { workspace = true, default-features = true } sp-transaction-pool = { workspace = true, default-features = true } +sp-keystore = { workspace = true, default-features = true } # Polkadot polkadot-primitives = { workspace = true, default-features = true } +polkadot-node-subsystem = { workspace = true, default-features = true } +polkadot-overseer = { workspace = true, default-features = true } # Cumulus cumulus-client-cli = { workspace = true, default-features = true } diff --git a/cumulus/client/service/src/lib.rs b/cumulus/client/service/src/lib.rs index 62199f2704162..aecfd50778273 100644 --- a/cumulus/client/service/src/lib.rs +++ b/cumulus/client/service/src/lib.rs @@ -59,6 +59,18 @@ use std::{ time::{Duration, Instant}, }; +use polkadot_overseer::Handle as 
OverseerHandle; +use sc_consensus_aura::AuraApi; +use sp_consensus_aura::Slot; + +use sp_core::Pair; +use sp_keystore::KeystorePtr; +use sp_runtime::DigestItem; + +use codec::Codec; +use polkadot_node_subsystem::messages::CollatorProtocolMessage; +use sc_consensus_aura::{standalone::claim_slot, CompatibleDigestItem}; + /// Host functions that should be used in parachain nodes. /// /// Contains the standard substrate host functions, as well as a @@ -73,6 +85,7 @@ pub type ParachainHostFunctions = ( // In practice here we expect no more than one queued messages. const RECOVERY_CHAN_SIZE: usize = 8; const LOG_TARGET_SYNC: &str = "sync::cumulus"; +const LOG_TARGET_COLLATOR_HELPER: &str = "aura::collator-helper"; /// A hint about how long the node should wait before attempting to recover missing block data /// from the data availability layer. @@ -458,6 +471,59 @@ where Err("Stopping following imported blocks. Could not determine parachain target block".into()) } +/// Task for triggering backing group connections early. 
+pub async fn collator_protocol_helper( + client: Arc, + keystore: KeystorePtr, + mut overseer_handle: OverseerHandle, +) where + Client: HeaderBackend + + Send + + Sync + + BlockBackend + + BlockchainEvents + + ProvideRuntimeApi + + 'static, + Client::Api: AuraApi, + P: Pair + Send + Sync, + P::Public: Codec, +{ + let mut import_notifications = client.import_notification_stream().fuse(); + + log::debug!(target: LOG_TARGET_COLLATOR_HELPER, "Started collator protocol helper"); + + while let Some(notification) = import_notifications.next().await { + // Determine if this node is the next author + let digest = notification.header.digest(); + let slot = digest + .logs() + .iter() + .find_map(|log| >::as_aura_pre_digest(log)); + + log::debug!(target: LOG_TARGET_COLLATOR_HELPER, "Imported block with slot: {:?}", slot); + + if let Some(slot) = slot { + let authorities = + client.runtime_api().authorities(notification.header.hash()).unwrap_or_default(); + + // Determine if this node is the next author. + if claim_slot::

(slot + 2, &authorities, &keystore).await.is_some() { + log::debug!(target: LOG_TARGET_COLLATOR_HELPER, "Our slot comes next, sending preconnect message"); + + // Send a message to the collator protocol to pre-connect to backing groups + overseer_handle + .send_msg( + CollatorProtocolMessage::ConnectToBackingGroups, + "SlotBasedBlockImport", + ) + .await; + } + } + + // TODO: Send disconnect after we've imported the last block of our slot. + } +} + /// Task for logging candidate events and some related metrics. async fn parachain_informant( para_id: ParaId, diff --git a/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs b/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs index 39234f5adf6a2..baade2b2e66c9 100644 --- a/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs +++ b/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs @@ -48,7 +48,7 @@ use cumulus_client_consensus_aura::{ use cumulus_client_consensus_proposer::ProposerInterface; use cumulus_client_consensus_relay_chain::Verifier as RelayChainVerifier; #[allow(deprecated)] -use cumulus_client_service::CollatorSybilResistance; +use cumulus_client_service::{collator_protocol_helper, CollatorSybilResistance}; use cumulus_primitives_core::{relay_chain::ValidationCode, GetParachainInfo, ParaId}; use cumulus_relay_chain_interface::{OverseerHandle, RelayChainInterface}; use futures::prelude::*; @@ -217,7 +217,8 @@ where + pallet_transaction_payment_rpc::TransactionPaymentRuntimeApi + substrate_frame_rpc_system::AccountNonceApi + GetParachainInfo, - AuraId: AuraIdT + Sync, + AuraId: AuraIdT + Sync + Send, + ::Pair: Send + Sync, { if extra_args.authoring_policy == AuthoringPolicy::SlotBased { Box::new(AuraNode::< @@ -299,7 +300,8 @@ impl, RuntimeApi, AuraId> where RuntimeApi: ConstructNodeRuntimeApi>, RuntimeApi::RuntimeApi: AuraRuntimeApi, - AuraId: AuraIdT + Sync, + AuraId: AuraIdT + Sync + Send, + ::Pair: Send + Sync, { fn start_consensus( client: Arc>, @@ -320,7 +322,7 @@ where relay_chain_slot_duration: Duration, 
para_id: ParaId, collator_key: CollatorPair, - _overseer_handle: OverseerHandle, + overseer_handle: OverseerHandle, announce_block: Arc>) + Send + Sync>, backend: Arc>, node_extra_args: NodeExtraArgs, @@ -341,6 +343,17 @@ where client.clone(), ); + // Spawn the collator protocol helper task + task_manager.spawn_essential_handle().spawn( + "collator-protocol-helper", + None, + collator_protocol_helper::<_, _, ::Pair>( + client.clone(), + keystore.clone(), + overseer_handle, + ), + ); + let client_for_aura = client.clone(); let params = SlotBasedParams { create_inherent_data_providers: move |_, ()| async move { Ok(()) }, @@ -430,7 +443,8 @@ impl, RuntimeApi, AuraId> where RuntimeApi: ConstructNodeRuntimeApi>, RuntimeApi::RuntimeApi: AuraRuntimeApi, - AuraId: AuraIdT + Sync, + AuraId: AuraIdT + Sync + Send, + ::Pair: Send + Sync, { fn start_consensus( client: Arc>, @@ -465,6 +479,17 @@ where client.clone(), ); + // Spawn the collator protocol helper task + task_manager.spawn_essential_handle().spawn( + "collator-protocol-helper", + None, + collator_protocol_helper::<_, _, ::Pair>( + client.clone(), + keystore.clone(), + overseer_handle.clone(), + ), + ); + let params = aura::ParamsWithExport { export_pov: node_extra_args.export_pov, params: AuraParams { diff --git a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs index 9f22ea0c76a63..523f35b8def3f 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs @@ -346,6 +346,9 @@ struct State { /// An utility for tracking all collations produced by the collator. collation_tracker: CollationTracker, + + /// Should we be connected to backers ? 
+ connect_to_backers: bool, } impl State { @@ -373,6 +376,7 @@ impl State { advertisement_timeouts: Default::default(), reputation, collation_tracker: Default::default(), + connect_to_backers: false, } } } @@ -796,6 +800,29 @@ async fn process_msg( use CollatorProtocolMessage::*; match msg { + ConnectToBackingGroups => { + // This message is used to instruct the collator to pre-connect to backing groups. + // For now, we do not need to take any action here. + gum::debug!( + target: LOG_TARGET, + "Received PreConnectToBackingGroups message." + ); + state.connect_to_backers = true; + + if let Some(para_id) = state.collating_on { + connect_to_validators(ctx, &state.implicit_view, &state.per_relay_parent, para_id) + .await; + } + }, + DisconnectFromBackingGroups => { + // This message is used to instruct the collator to disconnect from backing groups. + // For now, we do not need to take any action here. + gum::debug!( + target: LOG_TARGET, + "Received DisconnectFromBackingGroups message." + ); + state.connect_to_backers = false; + }, CollateOn(id) => { state.collating_on = Some(id); state.implicit_view = Some(ImplicitView::new(Some(id))); @@ -1270,8 +1297,15 @@ async fn handle_network_msg( handle_our_view_change(ctx, runtime, state, view).await?; // Connect only if we are collating on a para. if let Some(para_id) = state.collating_on { - connect_to_validators(ctx, &state.implicit_view, &state.per_relay_parent, para_id) + if state.connect_to_backers { + connect_to_validators( + ctx, + &state.implicit_view, + &state.per_relay_parent, + para_id, + ) .await; + } } }, PeerMessage(remote, msg) => { @@ -1763,7 +1797,9 @@ async fn run_inner( // Connect only if we are collating on a para. 
if let Some(para_id) = state.collating_on { - connect_to_validators(&mut ctx, &state.implicit_view, &state.per_relay_parent, para_id).await; + if state.connect_to_backers { + connect_to_validators(&mut ctx, &state.implicit_view, &state.per_relay_parent, para_id).await; + } } gum::trace!( diff --git a/polkadot/node/network/collator-protocol/src/validator_side/mod.rs b/polkadot/node/network/collator-protocol/src/validator_side/mod.rs index 8ed62bb32208d..bc7e5ab87dec5 100644 --- a/polkadot/node/network/collator-protocol/src/validator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/validator_side/mod.rs @@ -1464,6 +1464,18 @@ async fn process_msg( let _timer = state.metrics.time_process_msg(); match msg { + ConnectToBackingGroups => { + gum::warn!( + target: LOG_TARGET, + "PreConnectToBackingGroups message is not expected on the validator side of the protocol", + ); + }, + DisconnectFromBackingGroups => { + gum::warn!( + target: LOG_TARGET, + "DisconnectFromBackingGroups message is not expected on the validator side of the protocol", + ); + }, CollateOn(id) => { gum::warn!( target: LOG_TARGET, diff --git a/polkadot/node/subsystem-types/src/messages.rs b/polkadot/node/subsystem-types/src/messages.rs index 28d8c0ebf7675..6866979fa8dce 100644 --- a/polkadot/node/subsystem-types/src/messages.rs +++ b/polkadot/node/subsystem-types/src/messages.rs @@ -262,6 +262,12 @@ pub enum CollatorProtocolMessage { /// /// The hash is the relay parent. Seconded(Hash, SignedFullStatement), + /// A message sent by Cumulus consensus engine to the collator protocol to + /// pre-connect to backing groups at all allowed relay parents. + ConnectToBackingGroups, + /// A message sent by Cumulus consensus engine to the collator protocol to + /// disconnect from backing groups. 
+ DisconnectFromBackingGroups, } impl Default for CollatorProtocolMessage { From 47923b6f1751a6d5e3ff9c7b6c8e24049765d9e8 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 3 Oct 2025 18:25:53 +0300 Subject: [PATCH 07/59] fix origin Signed-off-by: Andrei Sandu --- cumulus/client/service/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cumulus/client/service/src/lib.rs b/cumulus/client/service/src/lib.rs index aecfd50778273..345822bdad6ef 100644 --- a/cumulus/client/service/src/lib.rs +++ b/cumulus/client/service/src/lib.rs @@ -514,7 +514,7 @@ pub async fn collator_protocol_helper( overseer_handle .send_msg( CollatorProtocolMessage::ConnectToBackingGroups, - "SlotBasedBlockImport", + "CollatorProtocolHelper", ) .await; } From 0ede4bb6fdc4ec04d8f34d141229b67cc48d0b02 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 3 Oct 2025 19:05:05 +0300 Subject: [PATCH 08/59] add disconnect mechanism Signed-off-by: Andrei Sandu --- cumulus/client/service/src/lib.rs | 42 +++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/cumulus/client/service/src/lib.rs b/cumulus/client/service/src/lib.rs index 345822bdad6ef..e9227533da6dc 100644 --- a/cumulus/client/service/src/lib.rs +++ b/cumulus/client/service/src/lib.rs @@ -492,6 +492,9 @@ pub async fn collator_protocol_helper( log::debug!(target: LOG_TARGET_COLLATOR_HELPER, "Started collator protocol helper"); + // Our own slot number, known in advance. 
+ let mut our_slot = None; + while let Some(notification) = import_notifications.next().await { // Determine if this node is the next author let digest = notification.header.digest(); @@ -502,25 +505,44 @@ pub async fn collator_protocol_helper( log::debug!(target: LOG_TARGET_COLLATOR_HELPER, "Imported block with slot: {:?}", slot); - if let Some(slot) = slot { - let authorities = - client.runtime_api().authorities(notification.header.hash()).unwrap_or_default(); + let Some(slot) = slot else { + continue; + }; + + let authorities = + client.runtime_api().authorities(notification.header.hash()).unwrap_or_default(); - // Determine if this node is the next author. - if claim_slot::
<P>
(slot + 2, &authorities, &keystore).await.is_some() { - log::debug!(target: LOG_TARGET_COLLATOR_HELPER, "Our slot comes next, sending preconnect message"); + // Check if our slot has passed and we are not expected to author again in next slot. + match (our_slot, claim_slot::
<P>
(slot + 1, &authorities, &keystore).await.is_none()) { + (Some(last_slot), true) if slot > last_slot => { + log::debug!(target: LOG_TARGET_COLLATOR_HELPER, "Our slot {} has passed, current slot is {}, sending disconnect message", last_slot, slot); - // Send a message to the collator protocol to pre-connect to backing groups + // Send a message to the collator protocol to stop pre-connecting to backing + // groups overseer_handle .send_msg( - CollatorProtocolMessage::ConnectToBackingGroups, + CollatorProtocolMessage::DisconnectFromBackingGroups, "CollatorProtocolHelper", ) .await; - } + + our_slot = None; + }, + _ => {}, } - // TODO: Send disconnect after we've imported the last block of our slot. + // Check if our slot is coming up next. This means that there is still another slot + // before our turn. + let target_slot = slot + 2; + if claim_slot::
<P>
(target_slot, &authorities, &keystore).await.is_some() { + log::debug!(target: LOG_TARGET_COLLATOR_HELPER, "Our slot {} comes next, sending preconnect message ", target_slot ); + // Send a message to the collator protocol to pre-connect to backing groups + overseer_handle + .send_msg(CollatorProtocolMessage::ConnectToBackingGroups, "CollatorProtocolHelper") + .await; + + our_slot = Some(target_slot); + } } } From eb51a3e2f005577a091acc0a0432570141a3c98a Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Sat, 4 Oct 2025 12:58:34 +0300 Subject: [PATCH 09/59] move collator_protocol_helper Signed-off-by: Andrei Sandu --- cumulus/client/consensus/aura/src/collator.rs | 92 +++++++++++++++++++ .../polkadot-omni-node/lib/src/nodes/aura.rs | 3 +- 2 files changed, 94 insertions(+), 1 deletion(-) diff --git a/cumulus/client/consensus/aura/src/collator.rs b/cumulus/client/consensus/aura/src/collator.rs index e372162f21332..c4d348c0ad319 100644 --- a/cumulus/client/consensus/aura/src/collator.rs +++ b/cumulus/client/consensus/aura/src/collator.rs @@ -433,3 +433,95 @@ where Ok(block_import_params) } + +/// Task for triggering backing group connections early. +/// +/// This helper monitors block imports and proactively connects to backing groups +/// when the collator's slot is approaching, improving network connectivity for +/// slot-based collation. 
+pub async fn collator_protocol_helper( + client: std::sync::Arc, + keystore: sp_keystore::KeystorePtr, + mut overseer_handle: cumulus_relay_chain_interface::OverseerHandle, +) where + Block: sp_runtime::traits::Block, + Client: sc_client_api::HeaderBackend + + Send + + Sync + + sc_client_api::BlockBackend + + sc_client_api::BlockchainEvents + + ProvideRuntimeApi + + 'static, + Client::Api: AuraApi, + P: sp_core::Pair + Send + Sync, + P::Public: Codec, +{ + use polkadot_node_subsystem::messages::CollatorProtocolMessage; + use sc_consensus_aura::CompatibleDigestItem; + use sp_runtime::DigestItem; + + let mut import_notifications = client.import_notification_stream().fuse(); + + tracing::debug!(target: crate::LOG_TARGET, "Started collator protocol helper"); + + // Our own slot number, known in advance. + let mut our_slot = None; + + while let Some(notification) = import_notifications.next().await { + // Determine if this node is the next author + let digest = notification.header.digest(); + let slot = digest + .logs() + .iter() + .find_map(|log| >::as_aura_pre_digest(log)); + + tracing::debug!(target: crate::LOG_TARGET, "Imported block with slot: {:?}", slot); + + let Some(slot) = slot else { + continue; + }; + + let authorities = + client.runtime_api().authorities(notification.header.hash()).unwrap_or_default(); + + // Check if our slot has passed and we are not expected to author again in next slot. + match ( + our_slot, + aura_internal::claim_slot::
<P>
(slot + 1, &authorities, &keystore) + .await + .is_none(), + ) { + (Some(last_slot), true) if slot > last_slot => { + tracing::debug!(target: crate::LOG_TARGET, "Our slot {} has passed, current slot is {}, sending disconnect message", last_slot, slot); + + // Send a message to the collator protocol to stop pre-connecting to backing + // groups + overseer_handle + .send_msg( + CollatorProtocolMessage::DisconnectFromBackingGroups, + "CollatorProtocolHelper", + ) + .await; + + our_slot = None; + }, + _ => {}, + } + + // Check if our slot is coming up next. This means that there is still another slot + // before our turn. + let target_slot = slot + 2; + if aura_internal::claim_slot::
<P>
(target_slot, &authorities, &keystore) + .await + .is_some() + { + tracing::debug!(target: crate::LOG_TARGET, "Our slot {} comes next, sending preconnect message ", target_slot ); + // Send a message to the collator protocol to pre-connect to backing groups + overseer_handle + .send_msg(CollatorProtocolMessage::ConnectToBackingGroups, "CollatorProtocolHelper") + .await; + + our_slot = Some(target_slot); + } + } +} diff --git a/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs b/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs index baade2b2e66c9..28c5301fcf343 100644 --- a/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs +++ b/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs @@ -39,6 +39,7 @@ use cumulus_client_consensus_aura::collators::slot_based::{ self as slot_based, Params as SlotBasedParams, }; use cumulus_client_consensus_aura::{ + collator::collator_protocol_helper, collators::{ lookahead::{self as aura, Params as AuraParams}, slot_based::{SlotBasedBlockImport, SlotBasedBlockImportHandle}, @@ -48,7 +49,7 @@ use cumulus_client_consensus_aura::{ use cumulus_client_consensus_proposer::ProposerInterface; use cumulus_client_consensus_relay_chain::Verifier as RelayChainVerifier; #[allow(deprecated)] -use cumulus_client_service::{collator_protocol_helper, CollatorSybilResistance}; +use cumulus_client_service::CollatorSybilResistance; use cumulus_primitives_core::{relay_chain::ValidationCode, GetParachainInfo, ParaId}; use cumulus_relay_chain_interface::{OverseerHandle, RelayChainInterface}; use futures::prelude::*; From 17d1c6c53225fcf01d6c32f0c18664b4028cff55 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Sat, 4 Oct 2025 13:03:25 +0300 Subject: [PATCH 10/59] comment Signed-off-by: Andrei Sandu --- cumulus/client/consensus/aura/src/collator.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collator.rs b/cumulus/client/consensus/aura/src/collator.rs index c4d348c0ad319..952433853305d 100644 --- 
a/cumulus/client/consensus/aura/src/collator.rs +++ b/cumulus/client/consensus/aura/src/collator.rs @@ -437,8 +437,7 @@ where /// Task for triggering backing group connections early. /// /// This helper monitors block imports and proactively connects to backing groups -/// when the collator's slot is approaching, improving network connectivity for -/// slot-based collation. +/// when the collator's slot is approaching, improving network connectivity. pub async fn collator_protocol_helper( client: std::sync::Arc, keystore: sp_keystore::KeystorePtr, From 06c444f2e71f0442917cc0d447df5365b1a96241 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Sat, 4 Oct 2025 13:03:52 +0300 Subject: [PATCH 11/59] remove helper from service Signed-off-by: Andrei Sandu --- cumulus/client/service/src/lib.rs | 86 ------------------------------- 1 file changed, 86 deletions(-) diff --git a/cumulus/client/service/src/lib.rs b/cumulus/client/service/src/lib.rs index e9227533da6dc..79fd63192971b 100644 --- a/cumulus/client/service/src/lib.rs +++ b/cumulus/client/service/src/lib.rs @@ -60,16 +60,6 @@ use std::{ }; use polkadot_overseer::Handle as OverseerHandle; -use sc_consensus_aura::AuraApi; -use sp_consensus_aura::Slot; - -use sp_core::Pair; -use sp_keystore::KeystorePtr; -use sp_runtime::DigestItem; - -use codec::Codec; -use polkadot_node_subsystem::messages::CollatorProtocolMessage; -use sc_consensus_aura::{standalone::claim_slot, CompatibleDigestItem}; /// Host functions that should be used in parachain nodes. /// @@ -85,7 +75,6 @@ pub type ParachainHostFunctions = ( // In practice here we expect no more than one queued messages. const RECOVERY_CHAN_SIZE: usize = 8; const LOG_TARGET_SYNC: &str = "sync::cumulus"; -const LOG_TARGET_COLLATOR_HELPER: &str = "aura::collator-helper"; /// A hint about how long the node should wait before attempting to recover missing block data /// from the data availability layer. @@ -471,81 +460,6 @@ where Err("Stopping following imported blocks. 
Could not determine parachain target block".into()) } -/// Task for triggering backing group connections early. -pub async fn collator_protocol_helper( - client: Arc, - keystore: KeystorePtr, - mut overseer_handle: OverseerHandle, -) where - Client: HeaderBackend - + Send - + Sync - + BlockBackend - + BlockchainEvents - + ProvideRuntimeApi - + 'static, - Client::Api: AuraApi, - P: Pair + Send + Sync, - P::Public: Codec, -{ - let mut import_notifications = client.import_notification_stream().fuse(); - - log::debug!(target: LOG_TARGET_COLLATOR_HELPER, "Started collator protocol helper"); - - // Our own slot number, known in advance. - let mut our_slot = None; - - while let Some(notification) = import_notifications.next().await { - // Determine if this node is the next author - let digest = notification.header.digest(); - let slot = digest - .logs() - .iter() - .find_map(|log| >::as_aura_pre_digest(log)); - - log::debug!(target: LOG_TARGET_COLLATOR_HELPER, "Imported block with slot: {:?}", slot); - - let Some(slot) = slot else { - continue; - }; - - let authorities = - client.runtime_api().authorities(notification.header.hash()).unwrap_or_default(); - - // Check if our slot has passed and we are not expected to author again in next slot. - match (our_slot, claim_slot::
<P>
(slot + 1, &authorities, &keystore).await.is_none()) { - (Some(last_slot), true) if slot > last_slot => { - log::debug!(target: LOG_TARGET_COLLATOR_HELPER, "Our slot {} has passed, current slot is {}, sending disconnect message", last_slot, slot); - - // Send a message to the collator protocol to stop pre-connecting to backing - // groups - overseer_handle - .send_msg( - CollatorProtocolMessage::DisconnectFromBackingGroups, - "CollatorProtocolHelper", - ) - .await; - - our_slot = None; - }, - _ => {}, - } - - // Check if our slot is coming up next. This means that there is still another slot - // before our turn. - let target_slot = slot + 2; - if claim_slot::
<P>
(target_slot, &authorities, &keystore).await.is_some() { - log::debug!(target: LOG_TARGET_COLLATOR_HELPER, "Our slot {} comes next, sending preconnect message ", target_slot ); - // Send a message to the collator protocol to pre-connect to backing groups - overseer_handle - .send_msg(CollatorProtocolMessage::ConnectToBackingGroups, "CollatorProtocolHelper") - .await; - - our_slot = Some(target_slot); - } - } -} - /// Task for logging candidate events and some related metrics. async fn parachain_informant( para_id: ParaId, From 8fe9db55ab6eb9ef4012a01def6c394b29791133 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Sat, 4 Oct 2025 13:05:49 +0300 Subject: [PATCH 12/59] unused Signed-off-by: Andrei Sandu --- cumulus/client/service/src/lib.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/cumulus/client/service/src/lib.rs b/cumulus/client/service/src/lib.rs index 79fd63192971b..62199f2704162 100644 --- a/cumulus/client/service/src/lib.rs +++ b/cumulus/client/service/src/lib.rs @@ -59,8 +59,6 @@ use std::{ time::{Duration, Instant}, }; -use polkadot_overseer::Handle as OverseerHandle; - /// Host functions that should be used in parachain nodes. 
/// /// Contains the standard substrate host functions, as well as a From 3b632659d135ebf0452359e95ac66d40069c15e8 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Sat, 4 Oct 2025 13:17:19 +0300 Subject: [PATCH 13/59] unused deps in crates Signed-off-by: Andrei Sandu --- Cargo.lock | 5 ----- cumulus/client/service/Cargo.toml | 5 ----- 2 files changed, 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4be25ebdeaf82..26f4df522c18f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4577,14 +4577,11 @@ dependencies = [ "cumulus-relay-chain-minimal-node", "cumulus-relay-chain-streams", "futures", - "parity-scale-codec", - "polkadot-node-subsystem", "polkadot-overseer", "polkadot-primitives", "prometheus", "sc-client-api", "sc-consensus", - "sc-consensus-aura", "sc-network", "sc-network-sync", "sc-network-transactions", @@ -4597,10 +4594,8 @@ dependencies = [ "sp-api", "sp-blockchain", "sp-consensus", - "sp-consensus-aura", "sp-core 28.0.0", "sp-io", - "sp-keystore", "sp-runtime", "sp-transaction-pool", ] diff --git a/cumulus/client/service/Cargo.toml b/cumulus/client/service/Cargo.toml index 739b30270adaf..bcbf3c1ee536e 100644 --- a/cumulus/client/service/Cargo.toml +++ b/cumulus/client/service/Cargo.toml @@ -15,12 +15,10 @@ workspace = true async-channel = { workspace = true } futures = { workspace = true } prometheus = { workspace = true } -codec = { workspace = true, default-features = true } # Substrate sc-client-api = { workspace = true, default-features = true } sc-consensus = { workspace = true, default-features = true } -sc-consensus-aura = { workspace = true, default-features = true } sc-network = { workspace = true, default-features = true } sc-network-sync = { workspace = true, default-features = true } sc-network-transactions = { workspace = true, default-features = true } @@ -33,16 +31,13 @@ sc-utils = { workspace = true, default-features = true } sp-api = { workspace = true, default-features = true } sp-blockchain = { workspace = true, default-features 
= true } sp-consensus = { workspace = true, default-features = true } -sp-consensus-aura = { workspace = true, default-features = true } sp-core = { workspace = true, default-features = true } sp-io = { workspace = true, default-features = true } sp-runtime = { workspace = true, default-features = true } sp-transaction-pool = { workspace = true, default-features = true } -sp-keystore = { workspace = true, default-features = true } # Polkadot polkadot-primitives = { workspace = true, default-features = true } -polkadot-node-subsystem = { workspace = true, default-features = true } polkadot-overseer = { workspace = true, default-features = true } # Cumulus From bda8c384e1649f41b77ed0d4d66c2fd31a556ea0 Mon Sep 17 00:00:00 2001 From: "cmd[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 4 Oct 2025 10:25:03 +0000 Subject: [PATCH 14/59] Update from github-actions[bot] running command 'prdoc generate --bump major' --- prdoc/pr_9929.prdoc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 prdoc/pr_9929.prdoc diff --git a/prdoc/pr_9929.prdoc b/prdoc/pr_9929.prdoc new file mode 100644 index 0000000000000..9494bd809a28e --- /dev/null +++ b/prdoc/pr_9929.prdoc @@ -0,0 +1,19 @@ +title: '[WIP] Cumulus: pre-connect to backers before own slot' +doc: +- audience: Todo + description: |- + On top of https://github.com/paritytech/polkadot-sdk/pull/9178. Implements a mechanism to pre-connect to backers, see https://github.com/paritytech/polkadot-sdk/issues/9767#issuecomment-3306292493 + + TODO: + - [ ] improve logic to handle 24s slots, we'd want to start connecting later than the begging of the prev author slot. 
+crates: +- name: cumulus-client-service + bump: major +- name: polkadot-omni-node-lib + bump: major +- name: polkadot-collator-protocol + bump: major +- name: polkadot-node-subsystem-types + bump: major +- name: cumulus-client-consensus-aura + bump: major From 9cfd857dc093207b329f164a6a87d456d7cf1117 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Sat, 4 Oct 2025 15:45:03 +0300 Subject: [PATCH 15/59] fix tests Signed-off-by: Andrei Sandu --- .../src/collator_side/tests/mod.rs | 29 +++++++++++++++++++ .../tests/prospective_parachains.rs | 8 +++++ 2 files changed, 37 insertions(+) diff --git a/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs index 1627e93c75dec..9c3423ba176b2 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs @@ -567,6 +567,7 @@ fn v1_protocol_rejected() { ReputationAggregator::new(|_| true), |mut test_harness| async move { let virtual_overseer = &mut test_harness.virtual_overseer; + overseer_send(virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await; overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id)) .await; @@ -623,6 +624,9 @@ fn advertise_and_send_collation() { let mut virtual_overseer = test_harness.virtual_overseer; let mut req_v2_cfg = test_harness.req_v2_cfg; + overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups) + .await; + overseer_send( &mut virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id), @@ -823,6 +827,9 @@ fn delay_reputation_change() { let mut virtual_overseer = test_harness.virtual_overseer; let mut req_v2_cfg = test_harness.req_v2_cfg; + overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups) + .await; + overseer_send( &mut virtual_overseer, 
CollatorProtocolMessage::CollateOn(test_state.para_id), @@ -976,6 +983,12 @@ fn collators_declare_to_connected_peers() { let peer = test_state.validator_peer_id[0]; let validator_id = test_state.current_group_validator_authority_ids()[0].clone(); + overseer_send( + &mut test_harness.virtual_overseer, + CollatorProtocolMessage::ConnectToBackingGroups, + ) + .await; + overseer_send( &mut test_harness.virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id), @@ -1024,6 +1037,8 @@ fn collations_are_only_advertised_to_validators_with_correct_view() { let peer2 = test_state.current_group_validator_peer_ids()[1]; let validator_id2 = test_state.current_group_validator_authority_ids()[1].clone(); + overseer_send(virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await; + overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id)) .await; @@ -1099,6 +1114,8 @@ fn collate_on_two_different_relay_chain_blocks() { let peer2 = test_state.current_group_validator_peer_ids()[1]; let validator_id2 = test_state.current_group_validator_authority_ids()[1].clone(); + overseer_send(virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await; + overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id)) .await; @@ -1189,6 +1206,8 @@ fn validator_reconnect_does_not_advertise_a_second_time() { let peer = test_state.current_group_validator_peer_ids()[0]; let validator_id = test_state.current_group_validator_authority_ids()[0].clone(); + overseer_send(virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await; + overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id)) .await; @@ -1253,6 +1272,8 @@ fn collators_reject_declare_messages() { let peer = test_state.current_group_validator_peer_ids()[0]; let validator_id = test_state.current_group_validator_authority_ids()[0].clone(); + overseer_send(virtual_overseer, 
CollatorProtocolMessage::ConnectToBackingGroups).await; + overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id)) .await; @@ -1320,6 +1341,8 @@ where let virtual_overseer = &mut test_harness.virtual_overseer; let req_cfg = &mut test_harness.req_v2_cfg; + overseer_send(virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await; + overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id)) .await; @@ -1471,6 +1494,9 @@ fn connect_to_group_in_view() { let mut virtual_overseer = test_harness.virtual_overseer; let mut req_cfg = test_harness.req_v2_cfg; + overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups) + .await; + overseer_send( &mut virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id), @@ -1602,6 +1628,9 @@ fn connect_with_no_cores_assigned() { let mut virtual_overseer = test_harness.virtual_overseer; let req_cfg = test_harness.req_v2_cfg; + overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups) + .await; + overseer_send( &mut virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id), diff --git a/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs b/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs index 48f40fec6f05c..b307d0bf97186 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs @@ -341,6 +341,8 @@ fn distribute_collation_from_implicit_view(#[case] validator_sends_view_first: b |mut test_harness| async move { let virtual_overseer = &mut test_harness.virtual_overseer; + overseer_send(virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await; + // Set collating para id. 
overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id)) .await; @@ -516,6 +518,8 @@ fn distribute_collation_up_to_limit() { // Grandparent of head `a`. let head_b = Hash::from_low_u64_be(130); + overseer_send(virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await; + // Set collating para id. overseer_send(virtual_overseer, CollatorProtocolMessage::CollateOn(test_state.para_id)) .await; @@ -644,6 +648,8 @@ fn send_parent_head_data_for_elastic_scaling() { let head_b = Hash::from_low_u64_be(129); let head_b_num: u32 = 63; + overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await; + // Set collating para id. overseer_send( &mut virtual_overseer, @@ -773,6 +779,8 @@ fn advertise_and_send_collation_by_hash() { let head_b = Hash::from_low_u64_be(129); let head_b_num: u32 = 63; + overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await; + // Set collating para id. overseer_send( &mut virtual_overseer, From c2dce30db1974faed59f2a603828e8e6b83ae3d8 Mon Sep 17 00:00:00 2001 From: "cmd[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 4 Oct 2025 14:17:20 +0000 Subject: [PATCH 16/59] Update from github-actions[bot] running command 'fmt' --- cumulus/client/service/Cargo.toml | 2 +- .../src/collator_side/tests/prospective_parachains.rs | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cumulus/client/service/Cargo.toml b/cumulus/client/service/Cargo.toml index bcbf3c1ee536e..ccd866b46e314 100644 --- a/cumulus/client/service/Cargo.toml +++ b/cumulus/client/service/Cargo.toml @@ -37,8 +37,8 @@ sp-runtime = { workspace = true, default-features = true } sp-transaction-pool = { workspace = true, default-features = true } # Polkadot -polkadot-primitives = { workspace = true, default-features = true } polkadot-overseer = { workspace = true, default-features = true } +polkadot-primitives = { workspace = true, 
default-features = true } # Cumulus cumulus-client-cli = { workspace = true, default-features = true } diff --git a/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs b/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs index b307d0bf97186..cde84136d3342 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs @@ -648,7 +648,8 @@ fn send_parent_head_data_for_elastic_scaling() { let head_b = Hash::from_low_u64_be(129); let head_b_num: u32 = 63; - overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await; + overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups) + .await; // Set collating para id. overseer_send( @@ -779,7 +780,8 @@ fn advertise_and_send_collation_by_hash() { let head_b = Hash::from_low_u64_be(129); let head_b_num: u32 = 63; - overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await; + overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups) + .await; // Set collating para id. 
overseer_send( From 2d78334763a8e09c3cb64fca4d268bbdffa5baac Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 6 Oct 2025 11:48:22 +0300 Subject: [PATCH 17/59] fmt Signed-off-by: Andrei Sandu --- .../src/collator_side/tests/prospective_parachains.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs b/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs index b307d0bf97186..cde84136d3342 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs @@ -648,7 +648,8 @@ fn send_parent_head_data_for_elastic_scaling() { let head_b = Hash::from_low_u64_be(129); let head_b_num: u32 = 63; - overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await; + overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups) + .await; // Set collating para id. overseer_send( @@ -779,7 +780,8 @@ fn advertise_and_send_collation_by_hash() { let head_b = Hash::from_low_u64_be(129); let head_b_num: u32 = 63; - overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups).await; + overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups) + .await; // Set collating para id. 
overseer_send( From 6bec87250bf537e168fa7fda416931a60cf20ba1 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 6 Oct 2025 11:52:47 +0300 Subject: [PATCH 18/59] add offset Signed-off-by: Andrei Sandu --- Cargo.lock | 1 + cumulus/client/consensus/aura/Cargo.toml | 1 + cumulus/client/consensus/aura/src/collator.rs | 64 ++++++++++++++++--- .../polkadot-omni-node/lib/src/nodes/aura.rs | 16 +++-- 4 files changed, 66 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 26f4df522c18f..acc5a141b6e28 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4358,6 +4358,7 @@ dependencies = [ "cumulus-test-client", "cumulus-test-relay-sproof-builder", "futures", + "futures-timer", "parity-scale-codec", "parking_lot 0.12.3", "polkadot-node-primitives", diff --git a/cumulus/client/consensus/aura/Cargo.toml b/cumulus/client/consensus/aura/Cargo.toml index 8dca303ffebdb..6dd43eeeaa40c 100644 --- a/cumulus/client/consensus/aura/Cargo.toml +++ b/cumulus/client/consensus/aura/Cargo.toml @@ -15,6 +15,7 @@ workspace = true async-trait = { workspace = true } codec = { features = ["derive"], workspace = true, default-features = true } futures = { workspace = true } +futures-timer = { workspace = true } parking_lot = { workspace = true } schnellru = { workspace = true } tokio = { workspace = true, features = ["macros"] } diff --git a/cumulus/client/consensus/aura/src/collator.rs b/cumulus/client/consensus/aura/src/collator.rs index 952433853305d..2b082692277af 100644 --- a/cumulus/client/consensus/aura/src/collator.rs +++ b/cumulus/client/consensus/aura/src/collator.rs @@ -59,6 +59,13 @@ use sp_state_machine::StorageChanges; use sp_timestamp::Timestamp; use std::{error::Error, time::Duration}; +/// The slot offset to start pre-connecting to backing groups. Represented as number +/// of seconds before own slot starts. +const PRE_CONNECT_SLOT_OFFSET: Duration = Duration::from_secs(6); + +/// Task name for the collator protocol helper. 
+pub const COLLATOR_PROTOCOL_HELPER_TASK_GROUP: &str = "collator-protocol-helper"; + /// Parameters for instantiating a [`Collator`]. pub struct Params { /// A builder for inherent data builders. @@ -438,10 +445,11 @@ where /// /// This helper monitors block imports and proactively connects to backing groups /// when the collator's slot is approaching, improving network connectivity. -pub async fn collator_protocol_helper( +pub async fn collator_protocol_helper( client: std::sync::Arc, keystore: sp_keystore::KeystorePtr, mut overseer_handle: cumulus_relay_chain_interface::OverseerHandle, + spawn_handle: Spawner, ) where Block: sp_runtime::traits::Block, Client: sc_client_api::HeaderBackend @@ -454,6 +462,7 @@ pub async fn collator_protocol_helper( Client::Api: AuraApi, P: sp_core::Pair + Send + Sync, P::Public: Codec, + Spawner: sp_core::traits::SpawnNamed, { use polkadot_node_subsystem::messages::CollatorProtocolMessage; use sc_consensus_aura::CompatibleDigestItem; @@ -480,6 +489,12 @@ pub async fn collator_protocol_helper( continue; }; + // Determine current slot duration. + let Ok(slot_duration) = client.runtime_api().slot_duration(notification.header.hash()) + else { + continue; + }; + let authorities = client.runtime_api().authorities(notification.header.hash()).unwrap_or_default(); @@ -512,15 +527,46 @@ pub async fn collator_protocol_helper( let target_slot = slot + 2; if aura_internal::claim_slot::

(target_slot, &authorities, &keystore) .await - .is_some() + .is_none() { - tracing::debug!(target: crate::LOG_TARGET, "Our slot {} comes next, sending preconnect message ", target_slot ); - // Send a message to the collator protocol to pre-connect to backing groups - overseer_handle - .send_msg(CollatorProtocolMessage::ConnectToBackingGroups, "CollatorProtocolHelper") - .await; - - our_slot = Some(target_slot); + continue; } + + tracing::debug!(target: crate::LOG_TARGET, "Our slot {} is due soon", target_slot ); + + // Determine our own slot timestamp. + let Some(own_slot_ts) = target_slot.timestamp(slot_duration) else { + tracing::warn!(target: crate::LOG_TARGET, "Failed to get own slot duration"); + + continue; + }; + + let pre_connect_delay = own_slot_ts + .saturating_sub(*Timestamp::current()) + .saturating_sub(PRE_CONNECT_SLOT_OFFSET.as_millis() as u64); + + tracing::debug!(target: crate::LOG_TARGET, "Pre-connecting to backing groups in {}ms", pre_connect_delay); + + let mut overseer_handle_clone = overseer_handle.clone(); + spawn_handle.spawn( + "send-pre-connect-message", + Some(COLLATOR_PROTOCOL_HELPER_TASK_GROUP), + async move { + tracing::debug!(target: crate::LOG_TARGET, "Sending pre-connect message"); + + if pre_connect_delay > 0 { + futures_timer::Delay::new(std::time::Duration::from_millis(pre_connect_delay)).await; + } + // Send a message to the collator protocol to pre-connect to backing groups + overseer_handle_clone + .send_msg( + CollatorProtocolMessage::ConnectToBackingGroups, + "CollatorProtocolHelper", + ) + .await; + } + .boxed(), + ); + our_slot = Some(target_slot); } } diff --git a/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs b/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs index 28c5301fcf343..6ec7267eb3e06 100644 --- a/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs +++ b/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs @@ -39,7 +39,7 @@ use cumulus_client_consensus_aura::collators::slot_based::{ self as slot_based, Params as 
SlotBasedParams, }; use cumulus_client_consensus_aura::{ - collator::collator_protocol_helper, + collator::{collator_protocol_helper, COLLATOR_PROTOCOL_HELPER_TASK_GROUP}, collators::{ lookahead::{self as aura, Params as AuraParams}, slot_based::{SlotBasedBlockImport, SlotBasedBlockImportHandle}, @@ -346,12 +346,13 @@ where // Spawn the collator protocol helper task task_manager.spawn_essential_handle().spawn( - "collator-protocol-helper", - None, - collator_protocol_helper::<_, _, ::Pair>( + COLLATOR_PROTOCOL_HELPER_TASK_GROUP, + Some(COLLATOR_PROTOCOL_HELPER_TASK_GROUP), + collator_protocol_helper::<_, _, ::Pair, _>( client.clone(), keystore.clone(), overseer_handle, + task_manager.spawn_handle(), ), ); @@ -482,12 +483,13 @@ where // Spawn the collator protocol helper task task_manager.spawn_essential_handle().spawn( - "collator-protocol-helper", - None, - collator_protocol_helper::<_, _, ::Pair>( + COLLATOR_PROTOCOL_HELPER_TASK_GROUP, + Some(COLLATOR_PROTOCOL_HELPER_TASK_GROUP), + collator_protocol_helper::<_, _, ::Pair, _>( client.clone(), keystore.clone(), overseer_handle.clone(), + task_manager.spawn_handle(), ), ); From b977e320084ce7158f15ec2ed45a773c1d0a6a02 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 6 Oct 2025 12:01:50 +0300 Subject: [PATCH 19/59] fmt Signed-off-by: Andrei Sandu --- cumulus/client/consensus/aura/src/collator.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cumulus/client/consensus/aura/src/collator.rs b/cumulus/client/consensus/aura/src/collator.rs index 2b082692277af..b7408c4b283c9 100644 --- a/cumulus/client/consensus/aura/src/collator.rs +++ b/cumulus/client/consensus/aura/src/collator.rs @@ -555,7 +555,8 @@ pub async fn collator_protocol_helper( tracing::debug!(target: crate::LOG_TARGET, "Sending pre-connect message"); if pre_connect_delay > 0 { - futures_timer::Delay::new(std::time::Duration::from_millis(pre_connect_delay)).await; + 
futures_timer::Delay::new(std::time::Duration::from_millis(pre_connect_delay)) + .await; } // Send a message to the collator protocol to pre-connect to backing groups overseer_handle_clone From f0a76d5396b725aaab08145d86a08078c6bfdbbe Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 6 Oct 2025 12:13:39 +0300 Subject: [PATCH 20/59] prdoc Signed-off-by: Andrei Sandu --- prdoc/pr_9929.prdoc | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/prdoc/pr_9929.prdoc b/prdoc/pr_9929.prdoc index 9494bd809a28e..742172b90bb07 100644 --- a/prdoc/pr_9929.prdoc +++ b/prdoc/pr_9929.prdoc @@ -1,19 +1,17 @@ -title: '[WIP] Cumulus: pre-connect to backers before own slot' +title: 'Cumulus: pre-connect to backers before own slot' doc: -- audience: Todo +- audience: Node Dev description: |- - On top of https://github.com/paritytech/polkadot-sdk/pull/9178. Implements a mechanism to pre-connect to backers, see https://github.com/paritytech/polkadot-sdk/issues/9767#issuecomment-3306292493 - - TODO: - - [ ] improve logic to handle 24s slots, we'd want to start connecting later than the begging of the prev author slot. + Implements a mechanism to pre-connect to backers, see https://github.com/paritytech/polkadot-sdk/issues/9767#issuecomment-3306292493. + Improve backing group connectivity. 
crates: - name: cumulus-client-service - bump: major + bump: minor - name: polkadot-omni-node-lib - bump: major + bump: minor - name: polkadot-collator-protocol - bump: major + bump: patch - name: polkadot-node-subsystem-types bump: major - name: cumulus-client-consensus-aura - bump: major + bump: minor From 0dbdd9c6f70486d4ef016443c536095a3a4946d1 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 6 Oct 2025 12:13:56 +0300 Subject: [PATCH 21/59] move log Signed-off-by: Andrei Sandu --- cumulus/client/consensus/aura/src/collator.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collator.rs b/cumulus/client/consensus/aura/src/collator.rs index b7408c4b283c9..1d9ff1abf4e9b 100644 --- a/cumulus/client/consensus/aura/src/collator.rs +++ b/cumulus/client/consensus/aura/src/collator.rs @@ -552,12 +552,13 @@ pub async fn collator_protocol_helper( "send-pre-connect-message", Some(COLLATOR_PROTOCOL_HELPER_TASK_GROUP), async move { - tracing::debug!(target: crate::LOG_TARGET, "Sending pre-connect message"); - if pre_connect_delay > 0 { futures_timer::Delay::new(std::time::Duration::from_millis(pre_connect_delay)) .await; } + + tracing::debug!(target: crate::LOG_TARGET, "Sending pre-connect message"); + // Send a message to the collator protocol to pre-connect to backing groups overseer_handle_clone .send_msg( From 699b00a8d9e663b8f58b501f8988822c61722be2 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 6 Oct 2025 13:26:14 +0300 Subject: [PATCH 22/59] impl disconnect Signed-off-by: Andrei Sandu --- .../src/collator_side/mod.rs | 111 +++++++++++++----- 1 file changed, 79 insertions(+), 32 deletions(-) diff --git a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs index 523f35b8def3f..0c151f5e18505 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs +++ 
b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs @@ -406,7 +406,15 @@ async fn distribute_collation( // We should already be connected to the validators, but if we aren't, we will try to connect to // them now. - connect_to_validators(ctx, &state.implicit_view, &state.per_relay_parent, id).await; + update_validator_connections( + ctx, + &state.peer_ids, + &state.implicit_view, + &state.per_relay_parent, + id, + true, + ) + .await; let per_relay_parent = match state.per_relay_parent.get_mut(&candidate_relay_parent) { Some(per_relay_parent) => per_relay_parent, @@ -671,14 +679,15 @@ fn list_of_backing_validators_in_view( backing_validators.into_iter().collect() } -/// Updates a set of connected validators based on their advertisement-bits -/// in a validators buffer. +/// Connect or disconnect to/from all backers at all viable relay parents. #[overseer::contextbounds(CollatorProtocol, prefix = self::overseer)] -async fn connect_to_validators( +async fn update_validator_connections( ctx: &mut Context, + peer_ids: &HashMap>, implicit_view: &Option, per_relay_parent: &HashMap, para_id: ParaId, + connect: bool, ) { let cores_assigned = has_assigned_cores(implicit_view, per_relay_parent); // If no cores are assigned to the para, we still need to send a ConnectToValidators request to @@ -690,22 +699,36 @@ async fn connect_to_validators( Vec::new() }; - gum::trace!( - target: LOG_TARGET, - ?cores_assigned, - "Sending connection request to validators: {:?}", - validator_ids, - ); - // ignore address resolution failure // will reissue a new request on new collation let (failed, _) = oneshot::channel(); - ctx.send_message(NetworkBridgeTxMessage::ConnectToValidators { - validator_ids, - peer_set: PeerSet::Collation, - failed, - }) - .await; + + let msg = if connect { + gum::trace!( + target: LOG_TARGET, + ?cores_assigned, + "Sending connection request to validators: {:?}", + validator_ids, + ); + NetworkBridgeTxMessage::ConnectToValidators { + validator_ids, + 
peer_set: PeerSet::Collation, + failed, + } + } else { + // Get all connected peer_ids on the Collation peer set. + let connected_validator_peer_ids: Vec<_> = peer_ids.keys().cloned().collect(); + + gum::trace!( + target: LOG_TARGET, + ?cores_assigned, + "Disconnecting from validators: {:?}", + connected_validator_peer_ids, + ); + NetworkBridgeTxMessage::DisconnectPeers(connected_validator_peer_ids, PeerSet::Collation) + }; + + ctx.send_message(msg).await; } /// Advertise collation to the given `peer`. @@ -810,8 +833,15 @@ async fn process_msg( state.connect_to_backers = true; if let Some(para_id) = state.collating_on { - connect_to_validators(ctx, &state.implicit_view, &state.per_relay_parent, para_id) - .await; + update_validator_connections( + ctx, + &state.peer_ids, + &state.implicit_view, + &state.per_relay_parent, + para_id, + state.connect_to_backers, + ) + .await; } }, DisconnectFromBackingGroups => { @@ -822,6 +852,18 @@ async fn process_msg( "Received DisconnectFromBackingGroups message." ); state.connect_to_backers = false; + + if let Some(para_id) = state.collating_on { + update_validator_connections( + ctx, + &state.peer_ids, + &state.implicit_view, + &state.per_relay_parent, + para_id, + state.connect_to_backers, + ) + .await; + } }, CollateOn(id) => { state.collating_on = Some(id); @@ -1297,15 +1339,15 @@ async fn handle_network_msg( handle_our_view_change(ctx, runtime, state, view).await?; // Connect only if we are collating on a para. if let Some(para_id) = state.collating_on { - if state.connect_to_backers { - connect_to_validators( - ctx, - &state.implicit_view, - &state.per_relay_parent, - para_id, - ) - .await; - } + update_validator_connections( + ctx, + &state.peer_ids, + &state.implicit_view, + &state.per_relay_parent, + para_id, + state.connect_to_backers, + ) + .await; } }, PeerMessage(remote, msg) => { @@ -1794,12 +1836,17 @@ async fn run_inner( ); } _ = reconnect_timeout => { - // Connect only if we are collating on a para. 
if let Some(para_id) = state.collating_on { - if state.connect_to_backers { - connect_to_validators(&mut ctx, &state.implicit_view, &state.per_relay_parent, para_id).await; - } + update_validator_connections( + &mut ctx, + &state.peer_ids, + &state.implicit_view, + &state.per_relay_parent, + para_id, + state.connect_to_backers, + ) + .await; } gum::trace!( From 597a50c523ecfa8daada7612c1e0fe153ba02fbd Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 6 Oct 2025 13:31:24 +0300 Subject: [PATCH 23/59] fix some stuff Signed-off-by: Andrei Sandu --- cumulus/client/consensus/aura/src/collator.rs | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collator.rs b/cumulus/client/consensus/aura/src/collator.rs index 1d9ff1abf4e9b..a65adfd6ec940 100644 --- a/cumulus/client/consensus/aura/src/collator.rs +++ b/cumulus/client/consensus/aura/src/collator.rs @@ -38,12 +38,13 @@ use cumulus_primitives_core::{ use cumulus_relay_chain_interface::RelayChainInterface; use polkadot_node_primitives::{Collation, MaybeCompressedPoV}; +use polkadot_node_subsystem::messages::CollatorProtocolMessage; use polkadot_primitives::{Header as PHeader, Id as ParaId}; use crate::collators::RelayParentData; use futures::prelude::*; use sc_consensus::{BlockImport, BlockImportParams, ForkChoiceStrategy, StateAction}; -use sc_consensus_aura::standalone as aura_internal; +use sc_consensus_aura::{standalone as aura_internal, CompatibleDigestItem}; use sp_api::ProvideRuntimeApi; use sp_application_crypto::AppPublic; use sp_consensus::BlockOrigin; @@ -455,19 +456,14 @@ pub async fn collator_protocol_helper( Client: sc_client_api::HeaderBackend + Send + Sync - + sc_client_api::BlockBackend - + sc_client_api::BlockchainEvents + ProvideRuntimeApi + + sc_client_api::BlockchainEvents + 'static, Client::Api: AuraApi, P: sp_core::Pair + Send + Sync, P::Public: Codec, Spawner: sp_core::traits::SpawnNamed, { - use 
polkadot_node_subsystem::messages::CollatorProtocolMessage; - use sc_consensus_aura::CompatibleDigestItem; - use sp_runtime::DigestItem; - let mut import_notifications = client.import_notification_stream().fuse(); tracing::debug!(target: crate::LOG_TARGET, "Started collator protocol helper"); From bf35a8e661894e777260d3bf845c6dd3fdcdae0c Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 6 Oct 2025 18:04:05 +0300 Subject: [PATCH 24/59] add some tests for connect/disconnect Signed-off-by: Andrei Sandu --- Cargo.lock | 1 + .../node/network/collator-protocol/Cargo.toml | 2 + .../src/collator_side/mod.rs | 4 + .../src/collator_side/tests/mod.rs | 260 +++++++++++++++++- 4 files changed, 266 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index acc5a141b6e28..1bede150f60e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -15048,6 +15048,7 @@ dependencies = [ "fatality", "futures", "futures-timer", + "itertools 0.11.0", "parity-scale-codec", "polkadot-node-network-protocol", "polkadot-node-primitives", diff --git a/polkadot/node/network/collator-protocol/Cargo.toml b/polkadot/node/network/collator-protocol/Cargo.toml index cd9ea08446a1f..ea41065ecb0a6 100644 --- a/polkadot/node/network/collator-protocol/Cargo.toml +++ b/polkadot/node/network/collator-protocol/Cargo.toml @@ -49,6 +49,8 @@ sp-keyring = { workspace = true, default-features = true } polkadot-node-subsystem-test-helpers = { workspace = true } polkadot-primitives-test-helpers = { workspace = true } +itertools = { workspace = true } + [features] default = [] diff --git a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs index 0c151f5e18505..9b78ad7684952 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs @@ -719,6 +719,10 @@ async fn update_validator_connections( // Get all connected peer_ids on the Collation peer set. 
let connected_validator_peer_ids: Vec<_> = peer_ids.keys().cloned().collect(); + if connected_validator_peer_ids.is_empty() { + return + } + gum::trace!( target: LOG_TARGET, ?cores_assigned, diff --git a/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs index 9c3423ba176b2..182709c658a05 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs @@ -32,13 +32,14 @@ use sp_core::crypto::Pair; use sp_keyring::Sr25519Keyring; use sp_runtime::traits::AppVerify; +use itertools::Itertools; use polkadot_node_network_protocol::{ peer_set::CollationVersion, request_response::{ v2::{CollationFetchingRequest, CollationFetchingResponse}, IncomingRequest, ReqProtocolNames, }, - view, + view, ObservedRole, }; use polkadot_node_primitives::BlockData; use polkadot_node_subsystem::{ @@ -1700,3 +1701,260 @@ fn connect_with_no_cores_assigned() { }, ); } + +#[test] +fn no_connection_without_preconnect_message() { + let test_state = TestState::default(); + let local_peer_id = test_state.local_peer_id; + let collator_pair = test_state.collator_pair.clone(); + + test_harness( + local_peer_id, + collator_pair, + ReputationAggregator::new(|_| true), + |test_harness| async move { + let mut virtual_overseer = test_harness.virtual_overseer; + let req_cfg = test_harness.req_v2_cfg; + + // NOTE: We intentionally DO NOT send ConnectToBackingGroups here + // to verify that connections are not made without the pre-connect message. 
+ + overseer_send( + &mut virtual_overseer, + CollatorProtocolMessage::CollateOn(test_state.para_id), + ) + .await; + + // Update view without expecting any connections (None parameter) + update_view( + None, // No connections should be made + &test_state, + &mut virtual_overseer, + vec![(test_state.relay_parent, 10)], + 1, + ) + .await; + + // Verify that no ConnectToValidators message was sent + // by attempting to receive a message with a short timeout. + // We should either timeout or receive messages that are NOT ConnectToValidators. + let timeout = Duration::from_millis(250); + match overseer_recv_with_timeout(&mut virtual_overseer, timeout).await { + None => { + // Timeout is fine - no messages were sent + }, + Some(msg) => { + // If we received a message, it should NOT be ConnectToValidators + panic!("Unexpected message was sent by subsystem: {:?}", msg); + }, + } + + TestHarness { virtual_overseer, req_v2_cfg: req_cfg } + }, + ); +} + +#[test] +fn distribute_collation_forces_connect() { + let test_state = TestState::default(); + let local_peer_id = test_state.local_peer_id; + let collator_pair = test_state.collator_pair.clone(); + + test_harness( + local_peer_id, + collator_pair, + ReputationAggregator::new(|_| true), + |test_harness| async move { + let mut virtual_overseer = test_harness.virtual_overseer; + let req_cfg = test_harness.req_v2_cfg; + + // NOTE: We intentionally DO NOT send ConnectToBackingGroups here + // to verify that connections are not made without the pre-connect message. + + overseer_send( + &mut virtual_overseer, + CollatorProtocolMessage::CollateOn(test_state.para_id), + ) + .await; + + // Update view without expecting any connections (None parameter) + update_view( + None, // No connections should be made + &test_state, + &mut virtual_overseer, + vec![(test_state.relay_parent, 10)], + 1, + ) + .await; + + // Verify that no ConnectToValidators message was sent + // by attempting to receive a message with a short timeout. 
+ // We should either timeout or receive messages that are NOT ConnectToValidators. + let timeout = Duration::from_millis(250); + match overseer_recv_with_timeout(&mut virtual_overseer, timeout).await { + None => { + // Timeout is fine - no messages were sent + }, + Some(msg) => { + // If we received a message, it should NOT be ConnectToValidators + panic!("Unexpected message was sent by subsystem: {:?}", msg); + }, + } + + // Distribute a collation + let _ = distribute_collation( + &mut virtual_overseer, + test_state.current_group_validator_authority_ids(), + &test_state, + test_state.relay_parent, + ) + .await; + + for (val, peer) in test_state + .current_group_validator_authority_ids() + .into_iter() + .zip(test_state.current_group_validator_peer_ids()) + { + connect_peer(&mut virtual_overseer, peer, CollationVersion::V2, Some(val.clone())) + .await; + } + + // Expect advertisement for the candidate + expect_declare_msg( + &mut virtual_overseer, + &test_state, + &test_state.current_group_validator_peer_ids()[0], + ) + .await; + + TestHarness { virtual_overseer, req_v2_cfg: req_cfg } + }, + ); +} + +#[test] +fn connect_advertise_disconnect_three_backing_groups() { + // Create a test state with 3 non-empty backing groups + let mut test_state = TestState::default(); + let para_id = test_state.para_id; + + // We have 5 validators total (indices 0-4) + // Group 0: validators [0, 1] + // Group 1: validators [2, 3] + // Group 2: validators [4] + test_state.session_info.validator_groups = vec![ + vec![ValidatorIndex(0), ValidatorIndex(1)], + vec![ValidatorIndex(2), ValidatorIndex(3)], + vec![ValidatorIndex(4)], + vec![], + ] + .into_iter() + .collect(); + + // Assign our para_id to 3 cores (0, 1, 2) which will map to 3 groups + test_state.claim_queue.clear(); + test_state.claim_queue.insert(CoreIndex(0), [para_id].into_iter().collect()); + test_state.claim_queue.insert(CoreIndex(1), [para_id].into_iter().collect()); + test_state.claim_queue.insert(CoreIndex(2), 
[para_id].into_iter().collect()); + test_state.claim_queue.insert(CoreIndex(3), VecDeque::new()); + + let local_peer_id = test_state.local_peer_id; + let collator_pair = test_state.collator_pair.clone(); + + test_harness( + local_peer_id, + collator_pair, + ReputationAggregator::new(|_| true), + |test_harness| async move { + let mut virtual_overseer = test_harness.virtual_overseer; + let req_cfg = test_harness.req_v2_cfg; + + // Send the pre-connect message + overseer_send(&mut virtual_overseer, CollatorProtocolMessage::ConnectToBackingGroups) + .await; + + overseer_send( + &mut virtual_overseer, + CollatorProtocolMessage::CollateOn(test_state.para_id), + ) + .await; + + // Get validators from all 3 backing groups + let mut expected_validators = Vec::new(); + for core_idx in [0, 1, 2] { + let validators = test_state.validator_authority_ids_for_core(CoreIndex(core_idx)); + expected_validators.extend(validators); + } + + // Remove duplicates while preserving order + let mut seen = std::collections::HashSet::new(); + expected_validators.retain(|v| seen.insert(v.clone())); + + // Verify we're connecting to all 5 validators (from 3 groups) + // Group 0 (Core 0): 2 validators + // Group 1 (Core 1): 2 validators + // Group 2 (Core 2): 1 validator + assert_eq!( + expected_validators.len(), + 5, + "Expected 5 unique validators from 3 backing groups" + ); + + // Update view and expect connections to all validators from all 3 backing groups + update_view( + Some(expected_validators.clone()), + &test_state, + &mut virtual_overseer, + vec![(test_state.relay_parent, 10)], + 1, + ) + .await; + + // Generate NetworkBridgeEvent::PeerConnected messages for each expected validator peer + // Use some random peer ids + let validator_peer_ids: Vec<_> = + (0..expected_validators.len()).map(|_| PeerId::random()).sorted().collect(); + + for (auth_id, peer_id) in expected_validators.iter().zip(validator_peer_ids.iter()) { + overseer_send( + &mut virtual_overseer, + 
CollatorProtocolMessage::NetworkBridgeUpdate( + NetworkBridgeEvent::PeerConnected( + *peer_id, + ObservedRole::Authority, + CollationVersion::V2.into(), + Some(HashSet::from([auth_id.clone()])), + ), + ), + ) + .await; + } + + // Expect collation advertisement for each validator + for peer_id in validator_peer_ids.iter() { + expect_declare_msg(&mut virtual_overseer, &test_state, peer_id).await; + } + + // Send the disconnect message + overseer_send( + &mut virtual_overseer, + CollatorProtocolMessage::DisconnectFromBackingGroups, + ) + .await; + + // Expect a DisconnectPeers for all connected peers + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::NetworkBridgeTx(NetworkBridgeTxMessage::DisconnectPeers( + peer_ids, + PeerSet::Collation, + )) => { + // We should disconnect from all peers we were connected to + assert_eq!(peer_ids.into_iter().sorted().collect::>(), validator_peer_ids, "Expected to disconnect from all validators"); + } + ); + + TestHarness { virtual_overseer, req_v2_cfg: req_cfg } + }, + ); +} From 334fd921f14ce4974919915a91a22efe948f9852 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 6 Oct 2025 18:32:18 +0300 Subject: [PATCH 25/59] fix cargo toml Signed-off-by: Andrei Sandu --- polkadot/node/network/collator-protocol/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polkadot/node/network/collator-protocol/Cargo.toml b/polkadot/node/network/collator-protocol/Cargo.toml index ea41065ecb0a6..daa76f3c01111 100644 --- a/polkadot/node/network/collator-protocol/Cargo.toml +++ b/polkadot/node/network/collator-protocol/Cargo.toml @@ -47,9 +47,9 @@ sc-network = { workspace = true, default-features = true } sp-core = { features = ["std"], workspace = true, default-features = true } sp-keyring = { workspace = true, default-features = true } +itertools = { workspace = true } polkadot-node-subsystem-test-helpers = { workspace = true } polkadot-primitives-test-helpers = { workspace = true } -itertools 
= { workspace = true } [features] From 9b7b37f904d80b4c71feb592d48ee4add6c10d15 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 6 Oct 2025 18:34:38 +0300 Subject: [PATCH 26/59] fix prdoc Signed-off-by: Andrei Sandu --- prdoc/pr_9929.prdoc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/prdoc/pr_9929.prdoc b/prdoc/pr_9929.prdoc index 742172b90bb07..af9dec337b381 100644 --- a/prdoc/pr_9929.prdoc +++ b/prdoc/pr_9929.prdoc @@ -1,9 +1,10 @@ -title: 'Cumulus: pre-connect to backers before own slot' +title: Pre-connect to backers before own slot doc: - audience: Node Dev description: |- Implements a mechanism to pre-connect to backers, see https://github.com/paritytech/polkadot-sdk/issues/9767#issuecomment-3306292493. Improve backing group connectivity. + crates: - name: cumulus-client-service bump: minor From 7112e9f76b9ef3be2a335be35306ad355d5ba50b Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 6 Oct 2025 18:49:35 +0300 Subject: [PATCH 27/59] fix prdoc Signed-off-by: Andrei Sandu --- prdoc/pr_9178.prdoc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/prdoc/pr_9178.prdoc b/prdoc/pr_9178.prdoc index fd4f0aa52f63f..5f5e11996d140 100644 --- a/prdoc/pr_9178.prdoc +++ b/prdoc/pr_9178.prdoc @@ -1,6 +1,6 @@ title: 'collator-protocol: cleanup connecting to backing group' doc: -- audience: Todo +- audience: Node Dev description: |- There are a few things wrong with the way we are handling connecting the validators in the backing group: 1. `validators_to_connect` returns only validators in groups we already have a block to advertise and the last backing groups we advertised something to, that means that if our backing group changes, but we don't have anything to advertise it will continue to try to connect to the previous backing group and validator will log this and disconnect it immediately. @@ -10,13 +10,10 @@ doc: 3. 
Staying connected to the last backingroup we advertised something does not work for elastic scaling because we have different backing groups and if the collator set is big enough that collators author just one block per group rotation, then we will always connect just when we have a candidate to advertise. - ## Proposal to fix: + ## Fix: Have collators always connect to the backing group they got assigned to and keep the connection open until backing group changes. Also, try to connect when have something to advertise or on timeout to have more chances of being correctly connected. - ## Todo - - [x] Confirm that proposal does not have other undesired side effects. - - [x] Tests crates: - name: polkadot-collator-protocol bump: patch From 89fb028e1d2782946c8e2cbbd9d2153e7667d0e0 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Tue, 7 Oct 2025 11:40:43 +0300 Subject: [PATCH 28/59] review feedback Signed-off-by: Andrei Sandu --- cumulus/client/consensus/aura/src/collator.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collator.rs b/cumulus/client/consensus/aura/src/collator.rs index a65adfd6ec940..3d6795148266d 100644 --- a/cumulus/client/consensus/aura/src/collator.rs +++ b/cumulus/client/consensus/aura/src/collator.rs @@ -474,10 +474,9 @@ pub async fn collator_protocol_helper( while let Some(notification) = import_notifications.next().await { // Determine if this node is the next author let digest = notification.header.digest(); - let slot = digest - .logs() - .iter() - .find_map(|log| >::as_aura_pre_digest(log)); + + let slot = + digest.convert_first(>::as_aura_pre_digest); tracing::debug!(target: crate::LOG_TARGET, "Imported block with slot: {:?}", slot); @@ -515,6 +514,11 @@ pub async fn collator_protocol_helper( our_slot = None; }, + (Some(_), false) => { + // `last_slot` is `Some` means we alredy sent pre-connect message, no need to + // proceed further. 
+ continue + }, _ => {}, } From cae1c25e1d07cc9e50da624f180dccf466d77632 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Tue, 7 Oct 2025 11:42:32 +0300 Subject: [PATCH 29/59] fix comment Signed-off-by: Andrei Sandu --- cumulus/client/consensus/aura/src/collator.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cumulus/client/consensus/aura/src/collator.rs b/cumulus/client/consensus/aura/src/collator.rs index 3d6795148266d..fc02c80cc954d 100644 --- a/cumulus/client/consensus/aura/src/collator.rs +++ b/cumulus/client/consensus/aura/src/collator.rs @@ -515,7 +515,7 @@ pub async fn collator_protocol_helper( our_slot = None; }, (Some(_), false) => { - // `last_slot` is `Some` means we alredy sent pre-connect message, no need to + // `our_slot` is `Some` means we alredy sent pre-connect message, no need to // proceed further. continue }, From 471b8980e0d36e81ae629ee5d25b53f66f5ce8db Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Tue, 7 Oct 2025 11:48:12 +0300 Subject: [PATCH 30/59] fix test comments Signed-off-by: Andrei Sandu --- .../collator-protocol/src/collator_side/tests/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs index 182709c658a05..36d1c7820eaae 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs @@ -1744,7 +1744,7 @@ fn no_connection_without_preconnect_message() { // Timeout is fine - no messages were sent }, Some(msg) => { - // If we received a message, it should NOT be ConnectToValidators + // No message expected here panic!("Unexpected message was sent by subsystem: {:?}", msg); }, } @@ -1789,14 +1789,14 @@ fn distribute_collation_forces_connect() { // Verify that no ConnectToValidators message was sent // by attempting to receive a message with a short 
timeout. - // We should either timeout or receive messages that are NOT ConnectToValidators. + // We expect timeout here. let timeout = Duration::from_millis(250); match overseer_recv_with_timeout(&mut virtual_overseer, timeout).await { None => { // Timeout is fine - no messages were sent }, Some(msg) => { - // If we received a message, it should NOT be ConnectToValidators + // No message expected here panic!("Unexpected message was sent by subsystem: {:?}", msg); }, } From b4e6db2681071d9e62702064225f6514c541ae65 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Thu, 9 Oct 2025 17:17:47 +0300 Subject: [PATCH 31/59] review Signed-off-by: Andrei Sandu --- cumulus/client/consensus/aura/src/collator.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collator.rs b/cumulus/client/consensus/aura/src/collator.rs index fc02c80cc954d..9bba5b69da4b1 100644 --- a/cumulus/client/consensus/aura/src/collator.rs +++ b/cumulus/client/consensus/aura/src/collator.rs @@ -552,10 +552,8 @@ pub async fn collator_protocol_helper( "send-pre-connect-message", Some(COLLATOR_PROTOCOL_HELPER_TASK_GROUP), async move { - if pre_connect_delay > 0 { - futures_timer::Delay::new(std::time::Duration::from_millis(pre_connect_delay)) - .await; - } + futures_timer::Delay::new(std::time::Duration::from_millis(pre_connect_delay)) + .await; tracing::debug!(target: crate::LOG_TARGET, "Sending pre-connect message"); From 85c05979847f0fdf172b14002fb51b9ae2cbc648 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Thu, 9 Oct 2025 18:07:37 +0300 Subject: [PATCH 32/59] fix validator disconnect Signed-off-by: Andrei Sandu --- .../collator-protocol/src/collator_side/mod.rs | 10 ++++++++-- .../src/collator_side/tests/mod.rs | 16 +++++++++------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs index 
9b78ad7684952..57f3361a26d6c 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs @@ -716,7 +716,8 @@ async fn update_validator_connections( failed, } } else { - // Get all connected peer_ids on the Collation peer set. + // Get all connected peer_ids on the Collation peer set + // This is just for logging purposes. let connected_validator_peer_ids: Vec<_> = peer_ids.keys().cloned().collect(); if connected_validator_peer_ids.is_empty() { @@ -729,7 +730,12 @@ async fn update_validator_connections( "Disconnecting from validators: {:?}", connected_validator_peer_ids, ); - NetworkBridgeTxMessage::DisconnectPeers(connected_validator_peer_ids, PeerSet::Collation) + + NetworkBridgeTxMessage::ConnectToValidators { + validator_ids: vec![], + peer_set: PeerSet::Collation, + failed, + } }; ctx.send_message(msg).await; diff --git a/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs index 36d1c7820eaae..754a8c960f55a 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs @@ -1942,15 +1942,17 @@ fn connect_advertise_disconnect_three_backing_groups() { ) .await; - // Expect a DisconnectPeers for all connected peers + // Expect a DisconnectPeers for all connected validators assert_matches!( overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridgeTx(NetworkBridgeTxMessage::DisconnectPeers( - peer_ids, - PeerSet::Collation, - )) => { - // We should disconnect from all peers we were connected to - assert_eq!(peer_ids.into_iter().sorted().collect::>(), validator_peer_ids, "Expected to disconnect from all validators"); + AllMessages::NetworkBridgeTx(NetworkBridgeTxMessage::ConnectToValidators{ + validator_ids, + peer_set, + failed: _, + }) => { + // We should disconnect from all 
validators we were connected to + assert_eq!(validator_ids, vec![], "Expected to disconnect from all validators"); + assert_eq!(peer_set, PeerSet::Collation); } ); From dbcd11afed2fcd49acf0e2ab696be419783b2623 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Thu, 9 Oct 2025 20:48:33 +0300 Subject: [PATCH 33/59] make Basti happy Signed-off-by: Andrei Sandu --- cumulus/client/consensus/aura/src/collator.rs | 139 +----------------- .../consensus/aura/src/collators/lookahead.rs | 71 ++++++--- .../consensus/aura/src/collators/mod.rs | 108 +++++++++++++- .../slot_based/block_builder_task.rs | 54 ++++++- .../aura/src/collators/slot_based/mod.rs | 12 +- .../polkadot-omni-node/lib/src/nodes/aura.rs | 35 +---- 6 files changed, 222 insertions(+), 197 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collator.rs b/cumulus/client/consensus/aura/src/collator.rs index 9bba5b69da4b1..e372162f21332 100644 --- a/cumulus/client/consensus/aura/src/collator.rs +++ b/cumulus/client/consensus/aura/src/collator.rs @@ -38,13 +38,12 @@ use cumulus_primitives_core::{ use cumulus_relay_chain_interface::RelayChainInterface; use polkadot_node_primitives::{Collation, MaybeCompressedPoV}; -use polkadot_node_subsystem::messages::CollatorProtocolMessage; use polkadot_primitives::{Header as PHeader, Id as ParaId}; use crate::collators::RelayParentData; use futures::prelude::*; use sc_consensus::{BlockImport, BlockImportParams, ForkChoiceStrategy, StateAction}; -use sc_consensus_aura::{standalone as aura_internal, CompatibleDigestItem}; +use sc_consensus_aura::standalone as aura_internal; use sp_api::ProvideRuntimeApi; use sp_application_crypto::AppPublic; use sp_consensus::BlockOrigin; @@ -60,13 +59,6 @@ use sp_state_machine::StorageChanges; use sp_timestamp::Timestamp; use std::{error::Error, time::Duration}; -/// The slot offset to start pre-connecting to backing groups. Represented as number -/// of seconds before own slot starts. 
-const PRE_CONNECT_SLOT_OFFSET: Duration = Duration::from_secs(6); - -/// Task name for the collator protocol helper. -pub const COLLATOR_PROTOCOL_HELPER_TASK_GROUP: &str = "collator-protocol-helper"; - /// Parameters for instantiating a [`Collator`]. pub struct Params { /// A builder for inherent data builders. @@ -441,132 +433,3 @@ where Ok(block_import_params) } - -/// Task for triggering backing group connections early. -/// -/// This helper monitors block imports and proactively connects to backing groups -/// when the collator's slot is approaching, improving network connectivity. -pub async fn collator_protocol_helper( - client: std::sync::Arc, - keystore: sp_keystore::KeystorePtr, - mut overseer_handle: cumulus_relay_chain_interface::OverseerHandle, - spawn_handle: Spawner, -) where - Block: sp_runtime::traits::Block, - Client: sc_client_api::HeaderBackend - + Send - + Sync - + ProvideRuntimeApi - + sc_client_api::BlockchainEvents - + 'static, - Client::Api: AuraApi, - P: sp_core::Pair + Send + Sync, - P::Public: Codec, - Spawner: sp_core::traits::SpawnNamed, -{ - let mut import_notifications = client.import_notification_stream().fuse(); - - tracing::debug!(target: crate::LOG_TARGET, "Started collator protocol helper"); - - // Our own slot number, known in advance. - let mut our_slot = None; - - while let Some(notification) = import_notifications.next().await { - // Determine if this node is the next author - let digest = notification.header.digest(); - - let slot = - digest.convert_first(>::as_aura_pre_digest); - - tracing::debug!(target: crate::LOG_TARGET, "Imported block with slot: {:?}", slot); - - let Some(slot) = slot else { - continue; - }; - - // Determine current slot duration. 
- let Ok(slot_duration) = client.runtime_api().slot_duration(notification.header.hash()) - else { - continue; - }; - - let authorities = - client.runtime_api().authorities(notification.header.hash()).unwrap_or_default(); - - // Check if our slot has passed and we are not expected to author again in next slot. - match ( - our_slot, - aura_internal::claim_slot::

(slot + 1, &authorities, &keystore) - .await - .is_none(), - ) { - (Some(last_slot), true) if slot > last_slot => { - tracing::debug!(target: crate::LOG_TARGET, "Our slot {} has passed, current slot is {}, sending disconnect message", last_slot, slot); - - // Send a message to the collator protocol to stop pre-connecting to backing - // groups - overseer_handle - .send_msg( - CollatorProtocolMessage::DisconnectFromBackingGroups, - "CollatorProtocolHelper", - ) - .await; - - our_slot = None; - }, - (Some(_), false) => { - // `our_slot` is `Some` means we alredy sent pre-connect message, no need to - // proceed further. - continue - }, - _ => {}, - } - - // Check if our slot is coming up next. This means that there is still another slot - // before our turn. - let target_slot = slot + 2; - if aura_internal::claim_slot::

(target_slot, &authorities, &keystore) - .await - .is_none() - { - continue; - } - - tracing::debug!(target: crate::LOG_TARGET, "Our slot {} is due soon", target_slot ); - - // Determine our own slot timestamp. - let Some(own_slot_ts) = target_slot.timestamp(slot_duration) else { - tracing::warn!(target: crate::LOG_TARGET, "Failed to get own slot duration"); - - continue; - }; - - let pre_connect_delay = own_slot_ts - .saturating_sub(*Timestamp::current()) - .saturating_sub(PRE_CONNECT_SLOT_OFFSET.as_millis() as u64); - - tracing::debug!(target: crate::LOG_TARGET, "Pre-connecting to backing groups in {}ms", pre_connect_delay); - - let mut overseer_handle_clone = overseer_handle.clone(); - spawn_handle.spawn( - "send-pre-connect-message", - Some(COLLATOR_PROTOCOL_HELPER_TASK_GROUP), - async move { - futures_timer::Delay::new(std::time::Duration::from_millis(pre_connect_delay)) - .await; - - tracing::debug!(target: crate::LOG_TARGET, "Sending pre-connect message"); - - // Send a message to the collator protocol to pre-connect to backing groups - overseer_handle_clone - .send_msg( - CollatorProtocolMessage::ConnectToBackingGroups, - "CollatorProtocolHelper", - ) - .await; - } - .boxed(), - ); - our_slot = Some(target_slot); - } -} diff --git a/cumulus/client/consensus/aura/src/collators/lookahead.rs b/cumulus/client/consensus/aura/src/collators/lookahead.rs index 55835ef4dcb8b..ad68893e97550 100644 --- a/cumulus/client/consensus/aura/src/collators/lookahead.rs +++ b/cumulus/client/consensus/aura/src/collators/lookahead.rs @@ -45,7 +45,11 @@ use polkadot_node_subsystem::messages::CollationGenerationMessage; use polkadot_overseer::Handle as OverseerHandle; use polkadot_primitives::{CollatorPair, Id as ParaId, OccupiedCoreAssumption}; -use crate::{collator as collator_util, collators::claim_queue_at, export_pov_to_path}; +use crate::{ + collator as collator_util, + collators::{claim_queue_at, collator_protocol_helper}, + export_pov_to_path, +}; use futures::prelude::*; 
use sc_client_api::{backend::AuxStore, BlockBackend, BlockOf}; use sc_consensus::BlockImport; @@ -53,14 +57,14 @@ use sp_api::ProvideRuntimeApi; use sp_application_crypto::AppPublic; use sp_blockchain::HeaderBackend; use sp_consensus_aura::{AuraApi, Slot}; -use sp_core::crypto::Pair; +use sp_core::{crypto::Pair, traits::SpawnNamed}; use sp_inherents::CreateInherentDataProviders; use sp_keystore::KeystorePtr; use sp_runtime::traits::{Block as BlockT, Header as HeaderT, Member}; use std::{path::PathBuf, sync::Arc, time::Duration}; /// Parameters for [`run`]. -pub struct Params { +pub struct Params { /// Inherent data providers. Only non-consensus inherent data should be provided, i.e. /// the timestamp, slot, and paras inherents should be omitted, as they are set by this /// collator. @@ -96,11 +100,13 @@ pub struct Params { /// The maximum percentage of the maximum PoV size that the collator can use. /// It will be removed once is fixed. pub max_pov_percentage: Option, + /// Spawner for spawning tasks. + pub spawner: Spawner, } /// Run async-backing-friendly Aura. -pub fn run( - params: Params, +pub fn run( + params: Params, ) -> impl Future + Send + 'static where Block: BlockT, @@ -122,17 +128,21 @@ where Proposer: ProposerInterface + Send + Sync + 'static, CS: CollatorServiceInterface + Send + Sync + 'static, CHP: consensus_common::ValidationCodeHashProvider + Send + 'static, - P: Pair, + Spawner: SpawnNamed + Clone + 'static, + P: Pair + Send + Sync + 'static, P::Public: AppPublic + Member + Codec, P::Signature: TryFrom> + Member + Codec, { - run_with_export::<_, P, _, _, _, _, _, _, _, _>(ParamsWithExport { params, export_pov: None }) + run_with_export::<_, P, _, _, _, _, _, _, _, _, _>(ParamsWithExport { + params, + export_pov: None, + }) } /// Parameters for [`run_with_export`]. -pub struct ParamsWithExport { +pub struct ParamsWithExport { /// The parameters. 
- pub params: Params, + pub params: Params, /// When set, the collator will export every produced `POV` to this folder. pub export_pov: Option, @@ -142,7 +152,7 @@ pub struct ParamsWithExport( +pub fn run_with_export( ParamsWithExport { mut params, export_pov }: ParamsWithExport< BI, CIDP, @@ -152,6 +162,7 @@ pub fn run_with_export, ) -> impl Future + Send + 'static where @@ -174,7 +185,8 @@ where Proposer: ProposerInterface + Send + Sync + 'static, CS: CollatorServiceInterface + Send + Sync + 'static, CHP: consensus_common::ValidationCodeHashProvider + Send + 'static, - P: Pair, + Spawner: SpawnNamed + Clone + 'static, + P: Pair + Send + Sync + 'static, P::Public: AppPublic + Member + Codec, P::Signature: TryFrom> + Member + Codec, { @@ -215,6 +227,8 @@ where collator_util::Collator::::new(params) }; + let mut our_slot = None; + while let Some(relay_parent_header) = import_notifications.next().await { let relay_parent = relay_parent_header.hash(); @@ -290,14 +304,18 @@ where relay_chain_slot_duration = ?params.relay_chain_slot_duration, "Adjusted relay-chain slot to parachain slot" ); - Some(super::can_build_upon::<_, _, P>( + Some(( slot_now, - relay_slot, - timestamp, - block_hash, - included_block.hash(), - para_client, - &keystore, + slot_duration, + super::can_build_upon::<_, _, P>( + slot_now, + relay_slot, + timestamp, + block_hash, + included_block.hash(), + para_client, + &keystore, + ), )) }; @@ -316,8 +334,21 @@ where // scheduled chains this ensures that the backlog will grow steadily. 
for n_built in 0..2 { let slot_claim = match can_build_upon(parent_hash) { - Some(fut) => match fut.await { - None => break, + Some((current_slot, slot_duration, fut)) => match fut.await { + None => { + our_slot = collator_protocol_helper::<_, _, P, _>( + params.para_client.clone(), + params.keystore.clone(), + params.overseer_handle.clone(), + params.spawner.clone(), + parent_hash, + slot_duration, + current_slot, + our_slot, + ) + .await; + break + }, Some(c) => c, }, None => break, diff --git a/cumulus/client/consensus/aura/src/collators/mod.rs b/cumulus/client/consensus/aura/src/collators/mod.rs index 2ce38d0c07b68..4e1c14a42ad6d 100644 --- a/cumulus/client/consensus/aura/src/collators/mod.rs +++ b/cumulus/client/consensus/aura/src/collators/mod.rs @@ -27,7 +27,8 @@ use cumulus_client_consensus_common::{self as consensus_common, ParentSearchPara use cumulus_primitives_aura::{AuraUnincludedSegmentApi, Slot}; use cumulus_primitives_core::{relay_chain::Header as RelayHeader, BlockT}; use cumulus_relay_chain_interface::RelayChainInterface; -use polkadot_node_subsystem::messages::RuntimeApiRequest; +use futures::prelude::*; +use polkadot_node_subsystem::messages::{CollatorProtocolMessage, RuntimeApiRequest}; use polkadot_node_subsystem_util::runtime::ClaimQueueSnapshot; use polkadot_primitives::{ Hash as RelayHash, Id as ParaId, OccupiedCoreAssumption, ValidationCodeHash, @@ -35,9 +36,11 @@ use polkadot_primitives::{ }; use sc_consensus_aura::{standalone as aura_internal, AuraApi}; use sp_api::{ApiExt, ProvideRuntimeApi, RuntimeApiInfo}; +use sp_consensus_aura::SlotDuration; use sp_core::Pair; use sp_keystore::KeystorePtr; use sp_timestamp::Timestamp; +use std::time::Duration; pub mod basic; pub mod lookahead; @@ -52,6 +55,109 @@ pub mod slot_based; // sanity check. const PARENT_SEARCH_DEPTH: usize = 30; +/// The slot offset to start pre-connecting to backing groups. Represented as number +/// of seconds before own slot starts. 
+const PRE_CONNECT_SLOT_OFFSET: Duration = Duration::from_secs(6); + +/// Task name for the collator protocol helper. +pub const COLLATOR_PROTOCOL_HELPER_TASK_GROUP: &str = "collator-protocol-helper"; + +/// Helper for triggering backing group connections early. +/// +/// Returns the updated `our_slot` value. +pub async fn collator_protocol_helper( + client: std::sync::Arc, + keystore: sp_keystore::KeystorePtr, + mut overseer_handle: cumulus_relay_chain_interface::OverseerHandle, + spawn_handle: Spawner, + best_block: Block::Hash, + slot_duration: SlotDuration, + current_slot: Slot, + our_slot: Option, +) -> Option +where + Block: sp_runtime::traits::Block, + Client: sc_client_api::HeaderBackend + Send + Sync + ProvideRuntimeApi + 'static, + Client::Api: AuraApi, + P: sp_core::Pair + Send + Sync, + P::Public: Codec, + Spawner: sp_core::traits::SpawnNamed, +{ + let authorities = client.runtime_api().authorities(best_block).unwrap_or_default(); + + // Check if our slot has passed and we are not expected to author again in next slot. + match ( + our_slot, + aura_internal::claim_slot::

(current_slot + 1, &authorities, &keystore) + .await + .is_none(), + ) { + (Some(last_slot), true) if current_slot > last_slot => { + tracing::debug!(target: crate::LOG_TARGET, "Our slot {} has passed, current slot is {}, sending disconnect message", last_slot, current_slot); + + // Send a message to the collator protocol to stop pre-connecting to backing + // groups + overseer_handle + .send_msg( + CollatorProtocolMessage::DisconnectFromBackingGroups, + "CollatorProtocolHelper", + ) + .await; + + return None; + }, + (Some(_), false) => { + // `our_slot` is `Some` means we alredy sent pre-connect message, no need to + // proceed further. + return our_slot + }, + _ => {}, + } + + // Check if our slot is coming up next. This means that there is still another slot + // before our turn. + let target_slot = current_slot + 2; + if aura_internal::claim_slot::

(target_slot, &authorities, &keystore) + .await + .is_none() + { + return our_slot + } + + tracing::debug!(target: crate::LOG_TARGET, "Our slot {} is due soon", target_slot ); + + // Determine our own slot timestamp. + let Some(own_slot_ts) = target_slot.timestamp(slot_duration) else { + tracing::warn!(target: crate::LOG_TARGET, "Failed to get own slot timestamp"); + + return our_slot; + }; + + let pre_connect_delay = own_slot_ts + .saturating_sub(*Timestamp::current()) + .saturating_sub(PRE_CONNECT_SLOT_OFFSET.as_millis() as u64); + + tracing::debug!(target: crate::LOG_TARGET, "Pre-connecting to backing groups in {}ms", pre_connect_delay); + + let mut overseer_handle_clone = overseer_handle.clone(); + spawn_handle.spawn( + "send-pre-connect-message", + Some(COLLATOR_PROTOCOL_HELPER_TASK_GROUP), + async move { + futures_timer::Delay::new(std::time::Duration::from_millis(pre_connect_delay)).await; + + tracing::debug!(target: crate::LOG_TARGET, "Sending pre-connect message"); + + // Send a message to the collator protocol to pre-connect to backing groups + overseer_handle_clone + .send_msg(CollatorProtocolMessage::ConnectToBackingGroups, "CollatorProtocolHelper") + .await; + } + .boxed(), + ); + Some(target_slot) +} + /// Check the `local_validation_code_hash` against the validation code hash in the relay chain /// state. 
/// diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs index e6360a3c8408e..bb2260e94fc57 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs @@ -21,7 +21,7 @@ use super::CollatorMessage; use crate::{ collator as collator_util, collators::{ - check_validation_code_or_log, + check_validation_code_or_log, collator_protocol_helper, slot_based::{ relay_chain_data_cache::{RelayChainData, RelayChainDataCache}, slot_timer::{SlotInfo, SlotTimer}, @@ -50,7 +50,7 @@ use sp_api::ProvideRuntimeApi; use sp_application_crypto::AppPublic; use sp_blockchain::HeaderBackend; use sp_consensus_aura::AuraApi; -use sp_core::crypto::Pair; +use sp_core::{crypto::Pair, traits::SpawnNamed}; use sp_inherents::CreateInherentDataProviders; use sp_keystore::KeystorePtr; use sp_runtime::traits::{Block as BlockT, Header as HeaderT, Member, Zero}; @@ -67,6 +67,7 @@ pub struct BuilderTaskParams< CHP, Proposer, CS, + Spawner, > { /// Inherent data providers. Only non-consensus inherent data should be provided, i.e. /// the timestamp, slot, and paras inherents should be omitted, as they are set by this @@ -107,11 +108,38 @@ pub struct BuilderTaskParams< /// The maximum percentage of the maximum PoV size that the collator can use. /// It will be removed once https://github.com/paritytech/polkadot-sdk/issues/6020 is fixed. pub max_pov_percentage: Option, + /// Spawner for spawning tasks. + pub spawn_handle: Spawner, + /// Handle to the overseer for sending messages. + pub overseer_handle: cumulus_relay_chain_interface::OverseerHandle, } /// Run block-builder. 
-pub fn run_block_builder( - params: BuilderTaskParams, +pub fn run_block_builder< + Block, + P, + BI, + CIDP, + Client, + Backend, + RelayClient, + CHP, + Proposer, + CS, + Spawner, +>( + params: BuilderTaskParams< + Block, + BI, + CIDP, + Client, + Backend, + RelayClient, + CHP, + Proposer, + CS, + Spawner, + >, ) -> impl Future + Send + 'static where Block: BlockT, @@ -134,9 +162,10 @@ where Proposer: ProposerInterface + Send + Sync + 'static, CS: CollatorServiceInterface + Send + Sync + 'static, CHP: consensus_common::ValidationCodeHashProvider + Send + 'static, - P: Pair, + P: Pair + Send + Sync + 'static, P::Public: AppPublic + Member + Codec, P::Signature: TryFrom> + Member + Codec, + Spawner: SpawnNamed + Clone + 'static, { async move { tracing::info!(target: LOG_TARGET, "Starting slot-based block-builder task."); @@ -156,6 +185,8 @@ where para_backend, slot_offset, max_pov_percentage, + spawn_handle, + overseer_handle, } = params; let mut slot_timer = SlotTimer::<_, _, P>::new_with_offset( @@ -179,7 +210,7 @@ where }; let mut relay_chain_data_cache = RelayChainDataCache::new(relay_client.clone(), para_id); - + let mut our_slot = None; loop { // We wait here until the next slot arrives. if slot_timer.wait_until_next_slot().await.is_err() { @@ -319,6 +350,17 @@ where slot = ?para_slot.slot, "Not building block." 
); + our_slot = collator_protocol_helper::<_, _, P, _>( + para_client.clone(), + keystore.clone(), + overseer_handle.clone(), + spawn_handle.clone(), + best_hash, + para_slot_duration, + para_slot.slot, + our_slot, + ) + .await; continue }, }; diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/mod.rs b/cumulus/client/consensus/aura/src/collators/slot_based/mod.rs index c939fb8d1275a..fb09079d024d5 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/mod.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/mod.rs @@ -172,10 +172,10 @@ pub fn run + Send + Sync + 'static, CS: CollatorServiceInterface + Send + Sync + Clone + 'static, CHP: consensus_common::ValidationCodeHashProvider + Send + 'static, - P: Pair + 'static, + P: Pair + Send + Sync + 'static, P::Public: AppPublic + Member + Codec, P::Signature: TryFrom> + Member + Codec, - Spawner: SpawnNamed, + Spawner: SpawnNamed + Clone + 'static, { let Params { create_inherent_data_providers, @@ -213,6 +213,10 @@ pub fn run(collator_task_params); + let overseer_handle = relay_client + .overseer_handle() + .expect("Relay chain interface should provide overseer handle"); + let block_builder_params = block_builder_task::BuilderTaskParams { create_inherent_data_providers, block_import, @@ -229,10 +233,12 @@ pub fn run(block_builder_params); + run_block_builder::(block_builder_params); spawner.spawn_blocking( "slot-based-block-builder", diff --git a/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs b/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs index 6ec7267eb3e06..70d4084a28992 100644 --- a/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs +++ b/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs @@ -39,7 +39,6 @@ use cumulus_client_consensus_aura::collators::slot_based::{ self as slot_based, Params as SlotBasedParams, }; use cumulus_client_consensus_aura::{ - collator::{collator_protocol_helper, COLLATOR_PROTOCOL_HELPER_TASK_GROUP}, collators::{ lookahead::{self as aura, Params as 
AuraParams}, slot_based::{SlotBasedBlockImport, SlotBasedBlockImportHandle}, @@ -250,7 +249,8 @@ impl, RuntimeApi, AuraId> where RuntimeApi: ConstructNodeRuntimeApi>, RuntimeApi::RuntimeApi: AuraRuntimeApi, - AuraId: AuraIdT + Sync, + AuraId: AuraIdT + Sync + Send, + ::Pair: Send + Sync, { #[docify::export_content] fn launch_slot_based_collator( @@ -279,7 +279,7 @@ where CHP: cumulus_client_consensus_common::ValidationCodeHashProvider + Send + 'static, Proposer: ProposerInterface + Send + Sync + 'static, CS: CollatorServiceInterface + Send + Sync + Clone + 'static, - Spawner: SpawnNamed, + Spawner: SpawnNamed + Clone + 'static, { slot_based::run::::Pair, _, _, _, _, _, _, _, _, _>( params_with_export, @@ -323,7 +323,7 @@ where relay_chain_slot_duration: Duration, para_id: ParaId, collator_key: CollatorPair, - overseer_handle: OverseerHandle, + _overseer_handle: OverseerHandle, announce_block: Arc>) + Send + Sync>, backend: Arc>, node_extra_args: NodeExtraArgs, @@ -344,18 +344,6 @@ where client.clone(), ); - // Spawn the collator protocol helper task - task_manager.spawn_essential_handle().spawn( - COLLATOR_PROTOCOL_HELPER_TASK_GROUP, - Some(COLLATOR_PROTOCOL_HELPER_TASK_GROUP), - collator_protocol_helper::<_, _, ::Pair, _>( - client.clone(), - keystore.clone(), - overseer_handle, - task_manager.spawn_handle(), - ), - ); - let client_for_aura = client.clone(); let params = SlotBasedParams { create_inherent_data_providers: move |_, ()| async move { Ok(()) }, @@ -481,18 +469,6 @@ where client.clone(), ); - // Spawn the collator protocol helper task - task_manager.spawn_essential_handle().spawn( - COLLATOR_PROTOCOL_HELPER_TASK_GROUP, - Some(COLLATOR_PROTOCOL_HELPER_TASK_GROUP), - collator_protocol_helper::<_, _, ::Pair, _>( - client.clone(), - keystore.clone(), - overseer_handle.clone(), - task_manager.spawn_handle(), - ), - ); - let params = aura::ParamsWithExport { export_pov: node_extra_args.export_pov, params: AuraParams { @@ -517,12 +493,13 @@ where 
authoring_duration: Duration::from_millis(2000), reinitialize: false, max_pov_percentage: node_extra_args.max_pov_percentage, + spawner: task_manager.spawn_handle(), }, }; let fut = async move { wait_for_aura(client).await; - aura::run_with_export::::Pair, _, _, _, _, _, _, _, _>( + aura::run_with_export::::Pair, _, _, _, _, _, _, _, _, _>( params, ) .await; From d3afbebd6868150d397968194fc658f32606c6c0 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Thu, 9 Oct 2025 20:51:22 +0300 Subject: [PATCH 34/59] fix comments Signed-off-by: Andrei Sandu --- .../node/network/collator-protocol/src/collator_side/mod.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs index 57f3361a26d6c..1ade4528c75da 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs @@ -834,8 +834,6 @@ async fn process_msg( match msg { ConnectToBackingGroups => { - // This message is used to instruct the collator to pre-connect to backing groups. - // For now, we do not need to take any action here. gum::debug!( target: LOG_TARGET, "Received PreConnectToBackingGroups message." @@ -855,8 +853,6 @@ async fn process_msg( } }, DisconnectFromBackingGroups => { - // This message is used to instruct the collator to disconnect from backing groups. - // For now, we do not need to take any action here. gum::debug!( target: LOG_TARGET, "Received DisconnectFromBackingGroups message." 
From d4b17d9414406c2eebe50aefbd06c2477be8229e Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 10 Oct 2025 11:19:16 +0300 Subject: [PATCH 35/59] fix cumulus-test-service Signed-off-by: Andrei Sandu --- cumulus/test/service/src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cumulus/test/service/src/lib.rs b/cumulus/test/service/src/lib.rs index d0d2325c21b6c..cd57f9b1bb188 100644 --- a/cumulus/test/service/src/lib.rs +++ b/cumulus/test/service/src/lib.rs @@ -495,9 +495,10 @@ where authoring_duration: Duration::from_millis(2000), reinitialize: false, max_pov_percentage: None, + spawner: task_manager.spawn_handle(), }; - let fut = aura::run::(params); + let fut = aura::run::(params); task_manager.spawn_essential_handle().spawn("aura", None, fut); } } From 920a015072050773692c8ba31523f349f102b82e Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 10 Oct 2025 11:48:05 +0300 Subject: [PATCH 36/59] fix node tenplate Signed-off-by: Andrei Sandu --- templates/parachain/node/src/service.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/templates/parachain/node/src/service.rs b/templates/parachain/node/src/service.rs index 4ce1bba376e8d..ad67f97d6bc57 100644 --- a/templates/parachain/node/src/service.rs +++ b/templates/parachain/node/src/service.rs @@ -219,10 +219,12 @@ fn start_consensus( authoring_duration: Duration::from_millis(2000), reinitialize: false, max_pov_percentage: None, + spawner: task_manager.spawn_handle(), }; - let fut = aura::run::( - params, - ); + let fut = + aura::run::( + params, + ); task_manager.spawn_essential_handle().spawn("aura", None, fut); Ok(()) From ff8f2c79a87160fedabe75e5ac8b2e059f645844 Mon Sep 17 00:00:00 2001 From: Andrei Sandu <54316454+sandreim@users.noreply.github.com> Date: Fri, 10 Oct 2025 11:49:43 +0300 Subject: [PATCH 37/59] Update polkadot/node/network/collator-protocol/Cargo.toml Co-authored-by: Alexandru Gheorghe <49718502+alexggh@users.noreply.github.com> --- 
polkadot/node/network/collator-protocol/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/polkadot/node/network/collator-protocol/Cargo.toml b/polkadot/node/network/collator-protocol/Cargo.toml index daa76f3c01111..92592045c0e44 100644 --- a/polkadot/node/network/collator-protocol/Cargo.toml +++ b/polkadot/node/network/collator-protocol/Cargo.toml @@ -51,7 +51,6 @@ itertools = { workspace = true } polkadot-node-subsystem-test-helpers = { workspace = true } polkadot-primitives-test-helpers = { workspace = true } - [features] default = [] experimental-collator-protocol = ["async-trait", "tokio"] From 339e80ebcac0dc66ce6207a03cb81cf77ec72c5f Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 10 Oct 2025 11:53:03 +0300 Subject: [PATCH 38/59] to much spawning Signed-off-by: Andrei Sandu --- cumulus/client/consensus/aura/src/collators/lookahead.rs | 2 +- .../aura/src/collators/slot_based/block_builder_task.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collators/lookahead.rs b/cumulus/client/consensus/aura/src/collators/lookahead.rs index ad68893e97550..a99c3fb9801f1 100644 --- a/cumulus/client/consensus/aura/src/collators/lookahead.rs +++ b/cumulus/client/consensus/aura/src/collators/lookahead.rs @@ -100,7 +100,7 @@ pub struct Params is fixed. pub max_pov_percentage: Option, - /// Spawner for spawning tasks. + /// Required for pre-connecting to backing groups task. pub spawner: Spawner, } diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs index bb2260e94fc57..192cb4740d77b 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs @@ -108,7 +108,7 @@ pub struct BuilderTaskParams< /// The maximum percentage of the maximum PoV size that the collator can use. 
/// It will be removed once https://github.com/paritytech/polkadot-sdk/issues/6020 is fixed. pub max_pov_percentage: Option, - /// Spawner for spawning tasks. + /// Required for pre-connecting to backing groups task. pub spawn_handle: Spawner, /// Handle to the overseer for sending messages. pub overseer_handle: cumulus_relay_chain_interface::OverseerHandle, From fb0e4635f6ce852ceffca035f222edecb8a5d67e Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 10 Oct 2025 12:19:03 +0300 Subject: [PATCH 39/59] review Signed-off-by: Andrei Sandu --- .../consensus/aura/src/collators/lookahead.rs | 12 ++++---- .../consensus/aura/src/collators/mod.rs | 12 ++++---- .../slot_based/block_builder_task.rs | 30 +++++++++---------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collators/lookahead.rs b/cumulus/client/consensus/aura/src/collators/lookahead.rs index a99c3fb9801f1..bcc50e8078c41 100644 --- a/cumulus/client/consensus/aura/src/collators/lookahead.rs +++ b/cumulus/client/consensus/aura/src/collators/lookahead.rs @@ -47,7 +47,7 @@ use polkadot_primitives::{CollatorPair, Id as ParaId, OccupiedCoreAssumption}; use crate::{ collator as collator_util, - collators::{claim_queue_at, collator_protocol_helper}, + collators::{claim_queue_at, update_backing_group_connections}, export_pov_to_path, }; use futures::prelude::*; @@ -336,11 +336,11 @@ where let slot_claim = match can_build_upon(parent_hash) { Some((current_slot, slot_duration, fut)) => match fut.await { None => { - our_slot = collator_protocol_helper::<_, _, P, _>( - params.para_client.clone(), - params.keystore.clone(), - params.overseer_handle.clone(), - params.spawner.clone(), + our_slot = update_backing_group_connections::<_, _, P, _>( + ¶ms.para_client, + ¶ms.keystore, + &mut params.overseer_handle.clone(), + ¶ms.spawner, parent_hash, slot_duration, current_slot, diff --git a/cumulus/client/consensus/aura/src/collators/mod.rs 
b/cumulus/client/consensus/aura/src/collators/mod.rs index 4e1c14a42ad6d..292f0dcf5bdc0 100644 --- a/cumulus/client/consensus/aura/src/collators/mod.rs +++ b/cumulus/client/consensus/aura/src/collators/mod.rs @@ -65,11 +65,11 @@ pub const COLLATOR_PROTOCOL_HELPER_TASK_GROUP: &str = "collator-protocol-helper" /// Helper for triggering backing group connections early. /// /// Returns the updated `our_slot` value. -pub async fn collator_protocol_helper( - client: std::sync::Arc, - keystore: sp_keystore::KeystorePtr, - mut overseer_handle: cumulus_relay_chain_interface::OverseerHandle, - spawn_handle: Spawner, +pub async fn update_backing_group_connections( + client: &std::sync::Arc, + keystore: &sp_keystore::KeystorePtr, + overseer_handle: &mut cumulus_relay_chain_interface::OverseerHandle, + spawn_handle: &Spawner, best_block: Block::Hash, slot_duration: SlotDuration, current_slot: Slot, @@ -81,7 +81,7 @@ where Client::Api: AuraApi, P: sp_core::Pair + Send + Sync, P::Public: Codec, - Spawner: sp_core::traits::SpawnNamed, + Spawner: sp_core::traits::SpawnNamed + Clone, { let authorities = client.runtime_api().authorities(best_block).unwrap_or_default(); diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs index 192cb4740d77b..ec992bc956d6b 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs @@ -21,12 +21,12 @@ use super::CollatorMessage; use crate::{ collator as collator_util, collators::{ - check_validation_code_or_log, collator_protocol_helper, + check_validation_code_or_log, slot_based::{ relay_chain_data_cache::{RelayChainData, RelayChainDataCache}, slot_timer::{SlotInfo, SlotTimer}, }, - RelayParentData, + update_backing_group_connections, RelayParentData, }, LOG_TARGET, }; @@ -340,21 +340,21 @@ where Some(slot) => slot, None => { 
tracing::debug!( - target: crate::LOG_TARGET, - unincluded_segment_len = parent.depth, - relay_parent = ?relay_parent, - relay_parent_num = %relay_parent_header.number(), - included_hash = ?included_header_hash, - included_num = %included_header.number(), - parent = ?parent_hash, - slot = ?para_slot.slot, + target: crate::LOG_TARGET, + unincluded_segment_len = parent.depth, + relay_parent = ?relay_parent, + relay_parent_num = %relay_parent_header.number(), + included_hash = ?included_header_hash, + included_num = %included_header.number(), + parent = ?parent_hash, + slot = ?para_slot.slot, "Not building block." ); - our_slot = collator_protocol_helper::<_, _, P, _>( - para_client.clone(), - keystore.clone(), - overseer_handle.clone(), - spawn_handle.clone(), + our_slot = update_backing_group_connections::<_, _, P, _>( + ¶_client, + &keystore, + &mut overseer_handle.clone(), + &spawn_handle, best_hash, para_slot_duration, para_slot.slot, From 5261e54f6f3d8cf71aee7065f2bc35abd34d33f4 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 10 Oct 2025 17:50:23 +0300 Subject: [PATCH 40/59] test comment Signed-off-by: Andrei Sandu --- .../network/collator-protocol/src/collator_side/tests/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs index 754a8c960f55a..38e089dafc5c2 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs @@ -1737,7 +1737,6 @@ fn no_connection_without_preconnect_message() { // Verify that no ConnectToValidators message was sent // by attempting to receive a message with a short timeout. - // We should either timeout or receive messages that are NOT ConnectToValidators. 
let timeout = Duration::from_millis(250); match overseer_recv_with_timeout(&mut virtual_overseer, timeout).await { None => { From f106131debe045b1f0ff082b4e21548e9ea1cdf1 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 10 Oct 2025 18:17:29 +0300 Subject: [PATCH 41/59] remove validator buffer file Signed-off-by: Andrei Sandu --- .../src/collator_side/mod.rs | 35 ++++++++- .../src/collator_side/validators_buffer.rs | 72 ------------------- 2 files changed, 33 insertions(+), 74 deletions(-) delete mode 100644 polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs diff --git a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs index bcfb6d9fbca7c..a390f12edae2c 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs @@ -66,14 +66,12 @@ mod error; mod metrics; #[cfg(test)] mod tests; -mod validators_buffer; use collation::{ ActiveCollationFetches, Collation, CollationSendResult, CollationStatus, VersionedCollationRequest, WaitingCollationFetches, }; use error::{log_error, Error, FatalError, Result}; -use validators_buffer::{ResetInterestTimeout, RESET_INTEREST_TIMEOUT}; pub use metrics::Metrics; @@ -90,6 +88,9 @@ const COST_APPARENT_FLOOD: Rep = /// For considerations on this value, see: https://github.com/paritytech/polkadot/issues/4386 const MAX_UNSHARED_UPLOAD_TIME: Duration = Duration::from_millis(150); +/// A timeout for resetting validators' interests in collations. +const RESET_INTEREST_TIMEOUT: Duration = Duration::from_secs(6); + /// Ensure that collator updates its connection requests to validators /// this long after the most recent leaf. /// @@ -113,6 +114,36 @@ const MAX_PARALLEL_CHAIN_API_REQUESTS: usize = 10; /// connected. 
type ReconnectTimeout = Fuse; +/// A future that returns a candidate hash along with validator discovery +/// keys once a timeout hit. +/// +/// If a validator doesn't manage to fetch a collation within this timeout +/// we should reset its interest in this advertisement in a buffer. For example, +/// when the PoV was already requested from another peer. +struct ResetInterestTimeout { + fut: futures_timer::Delay, + candidate_hash: CandidateHash, + peer_id: PeerId, +} + +impl ResetInterestTimeout { + /// Returns new `ResetInterestTimeout` that resolves after given timeout. + fn new(candidate_hash: CandidateHash, peer_id: PeerId, delay: Duration) -> Self { + Self { fut: futures_timer::Delay::new(delay), candidate_hash, peer_id } + } +} + +impl std::future::Future for ResetInterestTimeout { + type Output = (CandidateHash, PeerId); + + fn poll( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll { + self.fut.poll_unpin(cx).map(|_| (self.candidate_hash, self.peer_id)) + } +} + #[derive(Debug)] enum ShouldAdvertiseTo { Yes, diff --git a/polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs b/polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs deleted file mode 100644 index ec467c0ab843e..0000000000000 --- a/polkadot/node/network/collator-protocol/src/collator_side/validators_buffer.rs +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (C) Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -//! Validator groups buffer for connection managements. -//! -//! Solves 2 problems: -//! 1. A collator may want to stay connected to multiple groups on rotation boundaries. -//! 2. It's important to disconnect from validator when there're no collations to be fetched. -//! -//! We keep a simple FIFO buffer of N validator groups and a bitvec for each advertisement, -//! 1 indicating we want to be connected to i-th validator in a buffer, 0 otherwise. -//! -//! The bit is set to 1 for the whole **group** whenever it's inserted into the buffer. Given a -//! relay parent, one can reset a bit back to 0 for particular **validator**. For example, if a -//! collation was fetched or some timeout has been hit. -//! -//! The bitwise OR over known advertisements gives us validators indices for connection request. - -use std::{ - future::Future, - pin::Pin, - task::{Context, Poll}, - time::Duration, -}; - -use futures::FutureExt; - -use polkadot_node_network_protocol::PeerId; -use polkadot_primitives::CandidateHash; - -/// A timeout for resetting validators' interests in collations. -pub const RESET_INTEREST_TIMEOUT: Duration = Duration::from_secs(6); - -/// A future that returns a candidate hash along with validator discovery -/// keys once a timeout hit. -/// -/// If a validator doesn't manage to fetch a collation within this timeout -/// we should reset its interest in this advertisement in a buffer. For example, -/// when the PoV was already requested from another peer. -pub struct ResetInterestTimeout { - fut: futures_timer::Delay, - candidate_hash: CandidateHash, - peer_id: PeerId, -} - -impl ResetInterestTimeout { - /// Returns new `ResetInterestTimeout` that resolves after given timeout. 
- pub fn new(candidate_hash: CandidateHash, peer_id: PeerId, delay: Duration) -> Self { - Self { fut: futures_timer::Delay::new(delay), candidate_hash, peer_id } - } -} - -impl Future for ResetInterestTimeout { - type Output = (CandidateHash, PeerId); - - fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { - self.fut.poll_unpin(cx).map(|_| (self.candidate_hash, self.peer_id)) - } -} From 05a498914aed758d6c94829e68e457966214db38 Mon Sep 17 00:00:00 2001 From: Andrei Sandu <54316454+sandreim@users.noreply.github.com> Date: Fri, 10 Oct 2025 19:00:56 +0300 Subject: [PATCH 42/59] Update cumulus/client/consensus/aura/src/collators/mod.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bastian Köcher --- cumulus/client/consensus/aura/src/collators/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cumulus/client/consensus/aura/src/collators/mod.rs b/cumulus/client/consensus/aura/src/collators/mod.rs index a5ec43bcac242..f08a0d1278150 100644 --- a/cumulus/client/consensus/aura/src/collators/mod.rs +++ b/cumulus/client/consensus/aura/src/collators/mod.rs @@ -69,7 +69,7 @@ pub const COLLATOR_PROTOCOL_HELPER_TASK_GROUP: &str = "collator-protocol-helper" /// Helper for triggering backing group connections early. /// /// Returns the updated `our_slot` value. 
-pub async fn update_backing_group_connections( +pub(crate) async fn update_backing_group_connections( client: &std::sync::Arc, keystore: &sp_keystore::KeystorePtr, overseer_handle: &mut cumulus_relay_chain_interface::OverseerHandle, From 762e1d5940936497de77dbfcd27e0542bd521ad5 Mon Sep 17 00:00:00 2001 From: Andrei Sandu <54316454+sandreim@users.noreply.github.com> Date: Fri, 10 Oct 2025 19:01:09 +0300 Subject: [PATCH 43/59] Update polkadot/node/network/collator-protocol/src/collator_side/mod.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bastian Köcher --- .../network/collator-protocol/src/collator_side/mod.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs index ad08bd741e514..75fd28fcd7f01 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs @@ -774,7 +774,12 @@ async fn update_validator_connections( "Disconnecting from validators: {:?}", connected_validator_peer_ids, ); - + // Disconnect from all connected validators on the `Collation` protocol. 
+ NetworkBridgeTxMessage::ConnectToValidators { + validator_ids: vec![], + peer_set: PeerSet::Collation, + failed, + } NetworkBridgeTxMessage::ConnectToValidators { validator_ids: vec![], peer_set: PeerSet::Collation, From c66197358c90c38996d361f57c04653ce7263e0a Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 10 Oct 2025 19:02:56 +0300 Subject: [PATCH 44/59] review Signed-off-by: Andrei Sandu --- .../collator-protocol/src/collator_side/mod.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs index a390f12edae2c..874c3a7b77017 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs @@ -424,7 +424,6 @@ impl State { #[overseer::contextbounds(CollatorProtocol, prefix = self::overseer)] async fn distribute_collation( ctx: &mut Context, - runtime: &mut RuntimeInfo, state: &mut State, id: ParaId, receipt: CandidateReceipt, @@ -504,16 +503,16 @@ async fn distribute_collation( ); } - let our_core = core_index; - - // Determine the group on that core. - let GroupValidators { validators } = - determine_our_validators(ctx, runtime, our_core, candidate_relay_parent).await?; + let validators = per_relay_parent + .validator_group + .get(&core_index) + .map(|v| v.validators.clone()) + .unwrap_or_default(); if validators.is_empty() { gum::warn!( target: LOG_TARGET, - core = ?our_core, + core = ?core_index, "there are no validators assigned to core", ); @@ -526,7 +525,7 @@ async fn distribute_collation( candidate_relay_parent = %candidate_relay_parent, ?candidate_hash, pov_hash = ?pov.hash(), - core = ?our_core, + ?core_index, current_validators = ?validators, "Accepted collation, connecting to validators." 
); @@ -867,7 +866,6 @@ async fn process_msg( let _ = state.metrics.time_collation_distribution("distribute"); distribute_collation( ctx, - runtime, state, id, candidate_receipt, From c07e11738dd948159c7af8cefa70a112c87164af Mon Sep 17 00:00:00 2001 From: Andrei Sandu <54316454+sandreim@users.noreply.github.com> Date: Fri, 10 Oct 2025 19:24:34 +0300 Subject: [PATCH 45/59] Update cumulus/client/consensus/aura/src/collators/mod.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bastian Köcher --- cumulus/client/consensus/aura/src/collators/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cumulus/client/consensus/aura/src/collators/mod.rs b/cumulus/client/consensus/aura/src/collators/mod.rs index f08a0d1278150..f854068c0c824 100644 --- a/cumulus/client/consensus/aura/src/collators/mod.rs +++ b/cumulus/client/consensus/aura/src/collators/mod.rs @@ -87,7 +87,7 @@ where P::Public: Codec, Spawner: sp_core::traits::SpawnNamed + Clone, { - let authorities = client.runtime_api().authorities(best_block).unwrap_or_default(); + let authorities = client.runtime_api().authorities(best_block).ok()?; // Check if our slot has passed and we are not expected to author again in next slot. 
match ( From 7435522f18e7a5f5b98242b31b794937e88b49fd Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 20 Oct 2025 13:33:20 +0300 Subject: [PATCH 46/59] review feedback Signed-off-by: Andrei Sandu --- .../consensus/aura/src/collators/lookahead.rs | 48 +-- .../consensus/aura/src/collators/mod.rs | 368 +++++++++++++----- .../slot_based/block_builder_task.rs | 65 +--- .../aura/src/collators/slot_based/mod.rs | 8 +- .../polkadot-omni-node/lib/src/nodes/aura.rs | 3 +- cumulus/test/service/src/lib.rs | 3 +- .../src/collator_side/mod.rs | 38 +- templates/parachain/node/src/service.rs | 8 +- 8 files changed, 323 insertions(+), 218 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collators/lookahead.rs b/cumulus/client/consensus/aura/src/collators/lookahead.rs index bcc50e8078c41..86f8eddbb4a9c 100644 --- a/cumulus/client/consensus/aura/src/collators/lookahead.rs +++ b/cumulus/client/consensus/aura/src/collators/lookahead.rs @@ -47,7 +47,7 @@ use polkadot_primitives::{CollatorPair, Id as ParaId, OccupiedCoreAssumption}; use crate::{ collator as collator_util, - collators::{claim_queue_at, update_backing_group_connections}, + collators::{claim_queue_at, BackingGroupConnectionHelper}, export_pov_to_path, }; use futures::prelude::*; @@ -57,14 +57,14 @@ use sp_api::ProvideRuntimeApi; use sp_application_crypto::AppPublic; use sp_blockchain::HeaderBackend; use sp_consensus_aura::{AuraApi, Slot}; -use sp_core::{crypto::Pair, traits::SpawnNamed}; +use sp_core::crypto::Pair; use sp_inherents::CreateInherentDataProviders; use sp_keystore::KeystorePtr; use sp_runtime::traits::{Block as BlockT, Header as HeaderT, Member}; use std::{path::PathBuf, sync::Arc, time::Duration}; /// Parameters for [`run`]. -pub struct Params { +pub struct Params { /// Inherent data providers. Only non-consensus inherent data should be provided, i.e. /// the timestamp, slot, and paras inherents should be omitted, as they are set by this /// collator. 
@@ -100,13 +100,11 @@ pub struct Params is fixed. pub max_pov_percentage: Option, - /// Required for pre-connecting to backing groups task. - pub spawner: Spawner, } /// Run async-backing-friendly Aura. -pub fn run( - params: Params, +pub fn run( + params: Params, ) -> impl Future + Send + 'static where Block: BlockT, @@ -128,21 +126,17 @@ where Proposer: ProposerInterface + Send + Sync + 'static, CS: CollatorServiceInterface + Send + Sync + 'static, CHP: consensus_common::ValidationCodeHashProvider + Send + 'static, - Spawner: SpawnNamed + Clone + 'static, P: Pair + Send + Sync + 'static, P::Public: AppPublic + Member + Codec, P::Signature: TryFrom> + Member + Codec, { - run_with_export::<_, P, _, _, _, _, _, _, _, _, _>(ParamsWithExport { - params, - export_pov: None, - }) + run_with_export::<_, P, _, _, _, _, _, _, _, _>(ParamsWithExport { params, export_pov: None }) } /// Parameters for [`run_with_export`]. -pub struct ParamsWithExport { +pub struct ParamsWithExport { /// The parameters. - pub params: Params, + pub params: Params, /// When set, the collator will export every produced `POV` to this folder. 
pub export_pov: Option, @@ -152,7 +146,7 @@ pub struct ParamsWithExport( +pub fn run_with_export( ParamsWithExport { mut params, export_pov }: ParamsWithExport< BI, CIDP, @@ -162,7 +156,6 @@ pub fn run_with_export, ) -> impl Future + Send + 'static where @@ -185,7 +178,6 @@ where Proposer: ProposerInterface + Send + Sync + 'static, CS: CollatorServiceInterface + Send + Sync + 'static, CHP: consensus_common::ValidationCodeHashProvider + Send + 'static, - Spawner: SpawnNamed + Clone + 'static, P: Pair + Send + Sync + 'static, P::Public: AppPublic + Member + Codec, P::Signature: TryFrom> + Member + Codec, @@ -227,7 +219,12 @@ where collator_util::Collator::::new(params) }; - let mut our_slot = None; + let mut connection_helper: BackingGroupConnectionHelper = + BackingGroupConnectionHelper::new( + params.para_client.clone(), + params.keystore.clone(), + params.overseer_handle.clone(), + ); while let Some(relay_parent_header) = import_notifications.next().await { let relay_parent = relay_parent_header.hash(); @@ -306,7 +303,6 @@ where ); Some(( slot_now, - slot_duration, super::can_build_upon::<_, _, P>( slot_now, relay_slot, @@ -334,19 +330,9 @@ where // scheduled chains this ensures that the backlog will grow steadily. 
for n_built in 0..2 { let slot_claim = match can_build_upon(parent_hash) { - Some((current_slot, slot_duration, fut)) => match fut.await { + Some((current_slot, fut)) => match fut.await { None => { - our_slot = update_backing_group_connections::<_, _, P, _>( - ¶ms.para_client, - ¶ms.keystore, - &mut params.overseer_handle.clone(), - ¶ms.spawner, - parent_hash, - slot_duration, - current_slot, - our_slot, - ) - .await; + connection_helper.update::(current_slot, parent_hash).await; break }, Some(c) => c, diff --git a/cumulus/client/consensus/aura/src/collators/mod.rs b/cumulus/client/consensus/aura/src/collators/mod.rs index f854068c0c824..a24d3ecf908d8 100644 --- a/cumulus/client/consensus/aura/src/collators/mod.rs +++ b/cumulus/client/consensus/aura/src/collators/mod.rs @@ -26,8 +26,7 @@ use codec::Codec; use cumulus_client_consensus_common::{self as consensus_common, ParentSearchParams}; use cumulus_primitives_aura::{AuraUnincludedSegmentApi, Slot}; use cumulus_primitives_core::{relay_chain::Header as RelayHeader, BlockT}; -use cumulus_relay_chain_interface::RelayChainInterface; -use futures::prelude::*; +use cumulus_relay_chain_interface::{OverseerHandle, RelayChainInterface}; use polkadot_node_subsystem::messages::{CollatorProtocolMessage, RuntimeApiRequest}; use polkadot_node_subsystem_util::runtime::ClaimQueueSnapshot; use polkadot_primitives::{ @@ -36,11 +35,9 @@ use polkadot_primitives::{ }; use sc_consensus_aura::{standalone as aura_internal, AuraApi}; use sp_api::{ApiExt, ProvideRuntimeApi, RuntimeApiInfo}; -use sp_consensus_aura::SlotDuration; use sp_core::Pair; use sp_keystore::KeystorePtr; use sp_timestamp::Timestamp; -use std::time::Duration; pub mod basic; pub mod lookahead; @@ -58,108 +55,73 @@ pub mod slot_based; // rules specified by the parachain's runtime and thus will never be too deep. This is just an extra // sanity check. const PARENT_SEARCH_DEPTH: usize = 40; - -/// The slot offset to start pre-connecting to backing groups. 
Represented as number -/// of seconds before own slot starts. -const PRE_CONNECT_SLOT_OFFSET: Duration = Duration::from_secs(6); - -/// Task name for the collator protocol helper. -pub const COLLATOR_PROTOCOL_HELPER_TASK_GROUP: &str = "collator-protocol-helper"; - -/// Helper for triggering backing group connections early. -/// -/// Returns the updated `our_slot` value. -pub(crate) async fn update_backing_group_connections( - client: &std::sync::Arc, - keystore: &sp_keystore::KeystorePtr, - overseer_handle: &mut cumulus_relay_chain_interface::OverseerHandle, - spawn_handle: &Spawner, - best_block: Block::Hash, - slot_duration: SlotDuration, - current_slot: Slot, +struct BackingGroupConnectionHelper { + client: std::sync::Arc, + keystore: sp_keystore::KeystorePtr, + overseer_handle: OverseerHandle, our_slot: Option, -) -> Option -where - Block: sp_runtime::traits::Block, - Client: sc_client_api::HeaderBackend + Send + Sync + ProvideRuntimeApi + 'static, - Client::Api: AuraApi, - P: sp_core::Pair + Send + Sync, - P::Public: Codec, - Spawner: sp_core::traits::SpawnNamed + Clone, -{ - let authorities = client.runtime_api().authorities(best_block).ok()?; +} - // Check if our slot has passed and we are not expected to author again in next slot. - match ( - our_slot, - aura_internal::claim_slot::

(current_slot + 1, &authorities, &keystore) - .await - .is_none(), - ) { - (Some(last_slot), true) if current_slot > last_slot => { - tracing::debug!(target: crate::LOG_TARGET, "Our slot {} has passed, current slot is {}, sending disconnect message", last_slot, current_slot); - - // Send a message to the collator protocol to stop pre-connecting to backing - // groups - overseer_handle - .send_msg( - CollatorProtocolMessage::DisconnectFromBackingGroups, - "CollatorProtocolHelper", - ) - .await; - - return None; - }, - (Some(_), false) => { - // `our_slot` is `Some` means we alredy sent pre-connect message, no need to - // proceed further. - return our_slot - }, - _ => {}, +impl BackingGroupConnectionHelper { + pub fn new( + client: std::sync::Arc, + keystore: sp_keystore::KeystorePtr, + overseer_handle: OverseerHandle, + ) -> Self { + Self { client, keystore, overseer_handle, our_slot: None } } - // Check if our slot is coming up next. This means that there is still another slot - // before our turn. - let target_slot = current_slot + 2; - if aura_internal::claim_slot::

(target_slot, &authorities, &keystore) - .await - .is_none() - { - return our_slot + async fn send_connect_message(&mut self) { + self.overseer_handle + .send_msg(CollatorProtocolMessage::ConnectToBackingGroups, "CollatorProtocolHelper") + .await; } - tracing::debug!(target: crate::LOG_TARGET, "Our slot {} is due soon", target_slot ); - - // Determine our own slot timestamp. - let Some(own_slot_ts) = target_slot.timestamp(slot_duration) else { - tracing::warn!(target: crate::LOG_TARGET, "Failed to get own slot timestamp"); - - return our_slot; - }; - - let pre_connect_delay = own_slot_ts - .saturating_sub(*Timestamp::current()) - .saturating_sub(PRE_CONNECT_SLOT_OFFSET.as_millis() as u64); - - tracing::debug!(target: crate::LOG_TARGET, "Pre-connecting to backing groups in {}ms", pre_connect_delay); + async fn send_disconnect_message(&mut self) { + self.overseer_handle + .send_msg( + CollatorProtocolMessage::DisconnectFromBackingGroups, + "CollatorProtocolHelper", + ) + .await; + } - let mut overseer_handle_clone = overseer_handle.clone(); - spawn_handle.spawn( - "send-pre-connect-message", - Some(COLLATOR_PROTOCOL_HELPER_TASK_GROUP), - async move { - futures_timer::Delay::new(std::time::Duration::from_millis(pre_connect_delay)).await; + /// Update the current slot and initiate connections to backing groups if needed. + pub async fn update(&mut self, current_slot: Slot, best_block: Block::Hash) + where + Block: sp_runtime::traits::Block, + Client: + sc_client_api::HeaderBackend + Send + Sync + ProvideRuntimeApi + 'static, + Client::Api: AuraApi, + P: sp_core::Pair + Send + Sync, + P::Public: Codec, + { + if Some(current_slot) <= self.our_slot { + // Current slot or next slot is ours. + // We already sent pre-connect message, no need to proceed further. 
+ return; + } - tracing::debug!(target: crate::LOG_TARGET, "Sending pre-connect message"); + let Some(authorities) = self.client.runtime_api().authorities(best_block).ok() else { + return; + }; - // Send a message to the collator protocol to pre-connect to backing groups - overseer_handle_clone - .send_msg(CollatorProtocolMessage::ConnectToBackingGroups, "CollatorProtocolHelper") - .await; + match aura_internal::claim_slot::

(current_slot + 1, &authorities, &self.keystore).await { + Some(_) => { + // Next slot is ours, send connect message. + tracing::debug!(target: crate::LOG_TARGET, "Our slot {} is next, connecting to backing groups", current_slot + 1); + self.send_connect_message().await; + self.our_slot = Some(current_slot + 1); + }, + None => { + // Next slot is not ours, send disconnect only if we had a slot before. + if self.our_slot.take().is_some() { + tracing::debug!(target: crate::LOG_TARGET, "Current slot = {}, disconnecting from backing groups", current_slot); + self.send_disconnect_message().await; + } + }, } - .boxed(), - ); - Some(target_slot) + } } /// Check the `local_validation_code_hash` against the validation code hash in the relay chain @@ -372,7 +334,8 @@ where #[cfg(test)] mod tests { - use crate::collators::can_build_upon; + use super::*; + use crate::collators::{can_build_upon, BackingGroupConnectionHelper}; use codec::Encode; use cumulus_primitives_aura::Slot; use cumulus_primitives_core::BlockT; @@ -383,12 +346,14 @@ mod tests { TestClientBuilderExt, }; use cumulus_test_relay_sproof_builder::RelayStateSproofBuilder; + use futures::StreamExt; + use polkadot_overseer::{Event, Handle}; use polkadot_primitives::HeadData; use sc_consensus::{BlockImport, BlockImportParams, ForkChoiceStrategy}; use sp_consensus::BlockOrigin; use sp_keystore::{Keystore, KeystorePtr}; use sp_timestamp::Timestamp; - use std::sync::Arc; + use std::sync::{Arc, Mutex}; async fn import_block>( importer: &I, @@ -425,9 +390,9 @@ mod tests { block } - fn set_up_components() -> (Arc, KeystorePtr) { + fn set_up_components(num_authorities: usize) -> (Arc, KeystorePtr) { let keystore = Arc::new(sp_keystore::testing::MemoryKeystore::new()) as Arc<_>; - for key in sp_keyring::Sr25519Keyring::iter() { + for key in sp_keyring::Sr25519Keyring::iter().take(num_authorities) { Keystore::sr25519_generate_new( &*keystore, sp_application_crypto::key_types::AURA, @@ -445,7 +410,7 @@ mod tests { /// we are 
ensuring on the node side that we are are always able to build on the included block. #[tokio::test] async fn test_can_build_upon() { - let (client, keystore) = set_up_components(); + let (client, keystore) = set_up_components(6); let genesis_hash = client.chain_info().genesis_hash; let mut last_hash = genesis_hash; @@ -481,6 +446,207 @@ mod tests { .await; assert!(result.is_some()); } + + /// Helper to create a mock overseer handle and message recorder + fn create_overseer_handle() -> (OverseerHandle, Arc>>) { + let messages = Arc::new(Mutex::new(Vec::new())); + let messages_clone = messages.clone(); + + let (tx, mut rx) = polkadot_node_subsystem_util::metered::channel(100); + + // Spawn a task to receive and record overseer messages + tokio::spawn(async move { + while let Some(event) = rx.next().await { + if let Event::MsgToSubsystem { msg, .. } = event { + if let polkadot_node_subsystem::AllMessages::CollatorProtocol(cp_msg) = msg { + messages_clone.lock().unwrap().push(cp_msg); + } + } + } + }); + + (Handle::new(tx), messages) + } + + #[tokio::test] + async fn preconnect_when_next_slot_is_ours() { + let (client, keystore) = set_up_components(6); + let genesis_hash = client.chain_info().genesis_hash; + let (overseer_handle, messages_recorder) = create_overseer_handle(); + + let mut helper = BackingGroupConnectionHelper::new(client, keystore, overseer_handle); + + // Update with slot 0, next slot (1) should be ours + helper + .update::(Slot::from(0), genesis_hash) + .await; + + // Give time for message to be processed + tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + + let messages = messages_recorder.lock().unwrap(); + assert_eq!(messages.len(), 1); + assert!(matches!(messages[0], CollatorProtocolMessage::ConnectToBackingGroups)); + assert_eq!(helper.our_slot, Some(Slot::from(1))); + } + + #[tokio::test] + async fn preconnect_no_duplicate_connect_message() { + let (client, keystore) = set_up_components(6); + let genesis_hash = 
client.chain_info().genesis_hash; + let (overseer_handle, messages_recorder) = create_overseer_handle(); + + let mut helper = BackingGroupConnectionHelper::new(client, keystore, overseer_handle); + + // Update with slot 0, next slot (1) is ours + helper + .update::(Slot::from(0), genesis_hash) + .await; + + // Give time for message to be processed + tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + assert_eq!(messages_recorder.lock().unwrap().len(), 1); + messages_recorder.lock().unwrap().clear(); + + // Update with slot 0 again - should not send another message + helper + .update::(Slot::from(0), genesis_hash) + .await; + tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + assert_eq!(messages_recorder.lock().unwrap().len(), 0); + + // Update with slot 1 (our slot) - should not send another message + helper + .update::(Slot::from(1), genesis_hash) + .await; + tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + assert_eq!(messages_recorder.lock().unwrap().len(), 0); + } + + #[tokio::test] + async fn preconnect_disconnect_when_slot_passes() { + let (client, keystore) = set_up_components(1); + let genesis_hash = client.chain_info().genesis_hash; + let (overseer_handle, messages_recorder) = create_overseer_handle(); + + let mut helper = BackingGroupConnectionHelper::new(client, keystore, overseer_handle); + + // Slot 0 -> Alice, Slot 1 -> Bob, Slot 2 -> Charlie, Slot 3 -> Dave, Slot 4 -> Eve, + // Slot 5 -> Ferdie, Slot 6 -> Alice + + // Update with slot 5, next slot (6) is ours -> should connect + helper + .update::(Slot::from(5), genesis_hash) + .await; + tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + assert_eq!(helper.our_slot, Some(Slot::from(6))); + messages_recorder.lock().unwrap().clear(); + + // Update with slot 8, next slot (9) is Charlie's -> should disconnect + helper + .update::(Slot::from(8), genesis_hash) + .await; + tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + + 
let messages = messages_recorder.lock().unwrap(); + assert_eq!(messages.len(), 1, "Expected exactly one disconnect message"); + assert!(matches!(messages[0], CollatorProtocolMessage::DisconnectFromBackingGroups)); + assert_eq!(helper.our_slot, None); + } + + #[tokio::test] + async fn preconnect_no_disconnect_without_previous_connection() { + let (client, keystore) = set_up_components(1); + let genesis_hash = client.chain_info().genesis_hash; + let (overseer_handle, messages_recorder) = create_overseer_handle(); + + let mut helper = BackingGroupConnectionHelper::new(client, keystore, overseer_handle); + + // Slot 0 -> Alice, Slot 1 -> Bob, Slot 2 -> Charlie, Slot 3 -> Dave, Slot 4 -> Eve, + // Slot 5 -> Ferdie + + // Update with slot 1 (Bob's slot), next slot (2) is Charlie's + // Since we never connected before (our_slot is None), we should not send disconnect + helper + .update::(Slot::from(1), genesis_hash) + .await; + + tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + // Should not send any message since we never connected + assert_eq!(messages_recorder.lock().unwrap().len(), 0); + assert_eq!(helper.our_slot, None); + } + + #[tokio::test] + async fn preconnect_multiple_cycles() { + let (client, keystore) = set_up_components(1); + let genesis_hash = client.chain_info().genesis_hash; + let (overseer_handle, messages_recorder) = create_overseer_handle(); + + let mut helper = BackingGroupConnectionHelper::new(client, keystore, overseer_handle); + + // Slot 0 -> Alice, Slot 1 -> Bob, Slot 2 -> Charlie, Slot 3 -> Dave, Slot 4 -> Eve, + // Slot 5 -> Ferdie, Slot 6 -> Alice, Slot 7 -> Bob, ... 
+ + // Cycle 1: Connect at slot 5, next slot (6) is ours + helper + .update::(Slot::from(5), genesis_hash) + .await; + tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + { + let messages = messages_recorder.lock().unwrap(); + assert_eq!(messages.len(), 1); + assert!(matches!(messages[0], CollatorProtocolMessage::ConnectToBackingGroups)); + } + assert_eq!(helper.our_slot, Some(Slot::from(6))); + messages_recorder.lock().unwrap().clear(); + + // Cycle 1: Disconnect at slot 7, next slot (8) is Charlie's + helper + .update::(Slot::from(7), genesis_hash) + .await; + tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + { + let messages = messages_recorder.lock().unwrap(); + assert_eq!(messages.len(), 1); + assert!(matches!(messages[0], CollatorProtocolMessage::DisconnectFromBackingGroups)); + } + assert_eq!(helper.our_slot, None); + messages_recorder.lock().unwrap().clear(); + + // Cycle 2: Connect again at slot 11, next slot (12) is ours + helper + .update::( + Slot::from(11), + genesis_hash, + ) + .await; + tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + { + let messages = messages_recorder.lock().unwrap(); + assert_eq!(messages.len(), 1); + assert!(matches!(messages[0], CollatorProtocolMessage::ConnectToBackingGroups)); + } + assert_eq!(helper.our_slot, Some(Slot::from(12))); + } + + #[tokio::test] + async fn preconnect_handles_runtime_api_error() { + let keystore = Arc::new(sp_keystore::testing::MemoryKeystore::new()) as Arc<_>; + let client = Arc::new(TestClientBuilder::new().build()); + let (overseer_handle, messages_recorder) = create_overseer_handle(); + + let mut helper = BackingGroupConnectionHelper::new(client, keystore, overseer_handle); + + let invalid_hash = Hash::default(); + helper + .update::(Slot::from(0), invalid_hash) + .await; + + tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + // Should not send any message if runtime API fails + 
assert_eq!(messages_recorder.lock().unwrap().len(), 0); + } } /// Holds a relay parent and its descendants. diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs index ec992bc956d6b..1129d5283699b 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs @@ -26,7 +26,7 @@ use crate::{ relay_chain_data_cache::{RelayChainData, RelayChainDataCache}, slot_timer::{SlotInfo, SlotTimer}, }, - update_backing_group_connections, RelayParentData, + BackingGroupConnectionHelper, RelayParentData, }, LOG_TARGET, }; @@ -50,7 +50,7 @@ use sp_api::ProvideRuntimeApi; use sp_application_crypto::AppPublic; use sp_blockchain::HeaderBackend; use sp_consensus_aura::AuraApi; -use sp_core::{crypto::Pair, traits::SpawnNamed}; +use sp_core::crypto::Pair; use sp_inherents::CreateInherentDataProviders; use sp_keystore::KeystorePtr; use sp_runtime::traits::{Block as BlockT, Header as HeaderT, Member, Zero}; @@ -67,7 +67,6 @@ pub struct BuilderTaskParams< CHP, Proposer, CS, - Spawner, > { /// Inherent data providers. Only non-consensus inherent data should be provided, i.e. /// the timestamp, slot, and paras inherents should be omitted, as they are set by this @@ -108,38 +107,11 @@ pub struct BuilderTaskParams< /// The maximum percentage of the maximum PoV size that the collator can use. /// It will be removed once https://github.com/paritytech/polkadot-sdk/issues/6020 is fixed. pub max_pov_percentage: Option, - /// Required for pre-connecting to backing groups task. - pub spawn_handle: Spawner, - /// Handle to the overseer for sending messages. - pub overseer_handle: cumulus_relay_chain_interface::OverseerHandle, } /// Run block-builder. 
-pub fn run_block_builder< - Block, - P, - BI, - CIDP, - Client, - Backend, - RelayClient, - CHP, - Proposer, - CS, - Spawner, ->( - params: BuilderTaskParams< - Block, - BI, - CIDP, - Client, - Backend, - RelayClient, - CHP, - Proposer, - CS, - Spawner, - >, +pub fn run_block_builder( + params: BuilderTaskParams, ) -> impl Future + Send + 'static where Block: BlockT, @@ -165,7 +137,6 @@ where P: Pair + Send + Sync + 'static, P::Public: AppPublic + Member + Codec, P::Signature: TryFrom> + Member + Codec, - Spawner: SpawnNamed + Clone + 'static, { async move { tracing::info!(target: LOG_TARGET, "Starting slot-based block-builder task."); @@ -185,8 +156,6 @@ where para_backend, slot_offset, max_pov_percentage, - spawn_handle, - overseer_handle, } = params; let mut slot_timer = SlotTimer::<_, _, P>::new_with_offset( @@ -210,7 +179,17 @@ where }; let mut relay_chain_data_cache = RelayChainDataCache::new(relay_client.clone(), para_id); - let mut our_slot = None; + + let mut maybe_connection_helper = relay_client + .overseer_handle() + .ok() + .map(|h| BackingGroupConnectionHelper::new(para_client.clone(), keystore.clone(), h.clone())) + .or_else(|| { + tracing::warn!(target: LOG_TARGET, + "Relay chain interface does not provide overseer handle. Backing group pre-connect is disabled."); + None + }); + loop { // We wait here until the next slot arrives. if slot_timer.wait_until_next_slot().await.is_err() { @@ -350,17 +329,9 @@ where slot = ?para_slot.slot, "Not building block." 
); - our_slot = update_backing_group_connections::<_, _, P, _>( - ¶_client, - &keystore, - &mut overseer_handle.clone(), - &spawn_handle, - best_hash, - para_slot_duration, - para_slot.slot, - our_slot, - ) - .await; + if let Some(ref mut connection_helper) = maybe_connection_helper { + connection_helper.update::(para_slot.slot, parent_hash).await; + } continue }, }; diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/mod.rs b/cumulus/client/consensus/aura/src/collators/slot_based/mod.rs index fb09079d024d5..2fcb662f88e91 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/mod.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/mod.rs @@ -213,10 +213,6 @@ pub fn run(collator_task_params); - let overseer_handle = relay_client - .overseer_handle() - .expect("Relay chain interface should provide overseer handle"); - let block_builder_params = block_builder_task::BuilderTaskParams { create_inherent_data_providers, block_import, @@ -233,12 +229,10 @@ pub fn run(block_builder_params); + run_block_builder::(block_builder_params); spawner.spawn_blocking( "slot-based-block-builder", diff --git a/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs b/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs index 33327fa53d450..927ea9b997dc2 100644 --- a/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs +++ b/cumulus/polkadot-omni-node/lib/src/nodes/aura.rs @@ -716,13 +716,12 @@ where authoring_duration: Duration::from_millis(2000), reinitialize: false, max_pov_percentage: node_extra_args.max_pov_percentage, - spawner: task_manager.spawn_handle(), }, }; let fut = async move { wait_for_aura(client).await; - aura::run_with_export::::Pair, _, _, _, _, _, _, _, _, _>( + aura::run_with_export::::Pair, _, _, _, _, _, _, _, _>( params, ) .await; diff --git a/cumulus/test/service/src/lib.rs b/cumulus/test/service/src/lib.rs index cd57f9b1bb188..d0d2325c21b6c 100644 --- a/cumulus/test/service/src/lib.rs +++ b/cumulus/test/service/src/lib.rs @@ -495,10 
+495,9 @@ where authoring_duration: Duration::from_millis(2000), reinitialize: false, max_pov_percentage: None, - spawner: task_manager.spawn_handle(), }; - let fut = aura::run::(params); + let fut = aura::run::(params); task_manager.spawn_essential_handle().spawn("aura", None, fut); } } diff --git a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs index c88b96cee19e0..80e8a673de5b9 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs @@ -732,21 +732,22 @@ async fn update_validator_connections( para_id: ParaId, connect: bool, ) { - let cores_assigned = has_assigned_cores(implicit_view, per_relay_parent); - // If no cores are assigned to the para, we still need to send a ConnectToValidators request to - // the network bridge passing an empty list of validator ids. Otherwise, it will keep connecting - // to the last requested validators until a new request is issued. - let validator_ids = if cores_assigned { - list_of_backing_validators_in_view(implicit_view, per_relay_parent, para_id) - } else { - Vec::new() - }; + gum::trace!(target: LOG_TARGET, ?para_id, ?connect, "update_validator_connections"); - // ignore address resolution failure - // will reissue a new request on new collation + // Ignore address resolution failure, will reissue a new request on new collation. let (failed, _) = oneshot::channel(); let msg = if connect { + let cores_assigned = has_assigned_cores(implicit_view, per_relay_parent); + // If no cores are assigned to the para, we still need to send a ConnectToValidators request + // to the network bridge passing an empty list of validator ids. Otherwise, it will keep + // connecting to the last requested validators until a new request is issued. 
+ let validator_ids = if cores_assigned { + list_of_backing_validators_in_view(implicit_view, per_relay_parent, para_id) + } else { + Vec::new() + }; + gum::trace!( target: LOG_TARGET, ?cores_assigned, @@ -759,31 +760,22 @@ async fn update_validator_connections( failed, } } else { - // Get all connected peer_ids on the Collation peer set - // This is just for logging purposes. - let connected_validator_peer_ids: Vec<_> = peer_ids.keys().cloned().collect(); - - if connected_validator_peer_ids.is_empty() { + if peer_ids.is_empty() { return } gum::trace!( target: LOG_TARGET, - ?cores_assigned, "Disconnecting from validators: {:?}", - connected_validator_peer_ids, + peer_ids.keys(), ); + // Disconnect from all connected validators on the `Collation` protocol. NetworkBridgeTxMessage::ConnectToValidators { validator_ids: vec![], peer_set: PeerSet::Collation, failed, } - NetworkBridgeTxMessage::ConnectToValidators { - validator_ids: vec![], - peer_set: PeerSet::Collation, - failed, - } }; ctx.send_message(msg).await; diff --git a/templates/parachain/node/src/service.rs b/templates/parachain/node/src/service.rs index ad67f97d6bc57..4ce1bba376e8d 100644 --- a/templates/parachain/node/src/service.rs +++ b/templates/parachain/node/src/service.rs @@ -219,12 +219,10 @@ fn start_consensus( authoring_duration: Duration::from_millis(2000), reinitialize: false, max_pov_percentage: None, - spawner: task_manager.spawn_handle(), }; - let fut = - aura::run::( - params, - ); + let fut = aura::run::( + params, + ); task_manager.spawn_essential_handle().spawn("aura", None, fut); Ok(()) From efa7ae7d5238aade14c70d8b2bf5cbd4dd6d30d5 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 20 Oct 2025 13:33:59 +0300 Subject: [PATCH 47/59] update CI test Signed-off-by: Andrei Sandu --- .../elastic_scaling/slot_based_12cores.rs | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/polkadot/zombienet-sdk-tests/tests/elastic_scaling/slot_based_12cores.rs 
b/polkadot/zombienet-sdk-tests/tests/elastic_scaling/slot_based_12cores.rs index cbcb0ebc2dd9d..1f8816170e99e 100644 --- a/polkadot/zombienet-sdk-tests/tests/elastic_scaling/slot_based_12cores.rs +++ b/polkadot/zombienet-sdk-tests/tests/elastic_scaling/slot_based_12cores.rs @@ -4,6 +4,8 @@ // Test that a parachain that uses a single slot-based collator with elastic scaling can use 12 // cores in order to achieve 500ms blocks. +use std::time::Duration; + use anyhow::anyhow; use cumulus_zombienet_sdk_helpers::{ @@ -11,6 +13,7 @@ use cumulus_zombienet_sdk_helpers::{ }; use polkadot_primitives::Id as ParaId; use serde_json::json; +use zombienet_orchestrator::network::node::LogLineCountOptions; use zombienet_sdk::{ subxt::{OnlineClient, PolkadotConfig}, subxt_signer::sr25519::dev, @@ -60,9 +63,14 @@ async fn slot_based_12cores_test() -> Result<(), anyhow::Error> { .with_chain("elastic-scaling-500ms") .with_default_args(vec![ "--authoring=slot-based".into(), - ("-lparachain=debug,aura=debug").into(), + ("-lparachain=debug,aura=debug,parachain::collator-protocol=trace").into(), ]) - .with_collator(|n| n.with_name("collator-elastic")) + .with_collator(|n| n.with_name("collator-0")) + .with_collator(|n| n.with_name("collator-1")) + .with_collator(|n| n.with_name("collator-2")) + .with_collator(|n| n.with_name("collator-3")) + .with_collator(|n| n.with_name("collator-4")) + .with_collator(|n| n.with_name("collator-5")) }) .build() .map_err(|e| { @@ -74,7 +82,7 @@ async fn slot_based_12cores_test() -> Result<(), anyhow::Error> { let network = spawn_fn(config).await?; let relay_node = network.get_node("validator-0")?; - let para_node = network.get_node("collator-elastic")?; + let para_node = network.get_node("collator-5")?; let relay_client: OnlineClient = relay_node.wait_client().await?; let alice = dev::alice(); @@ -105,6 +113,26 @@ async fn slot_based_12cores_test() -> Result<(), anyhow::Error> { ) .await?; + // Wait for collator to claim a slot. 
+ let result = para_node + .wait_log_line_count_with_timeout( + "*Received PreConnectToBackingGroups message*", + true, + LogLineCountOptions::new(|n| n >= 3, Duration::from_secs(120), false), + ) + .await?; + assert!(result.success()); + + // Wait for collator slot to pass. + let result = para_node + .wait_log_line_count_with_timeout( + "*Received DisconnectFromBackingGroups message*", + true, + LogLineCountOptions::new(|n| n >= 3, Duration::from_secs(120), false), + ) + .await?; + assert!(result.success()); + // Assert the parachain finalized block height is also on par with the number of backed // candidates. assert_finality_lag(¶_node.wait_client().await?, 60).await?; From 43285d617610e60192a86c3c59162f1737a71d65 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 20 Oct 2025 13:37:17 +0300 Subject: [PATCH 48/59] fix ident Signed-off-by: Andrei Sandu --- .../collators/slot_based/block_builder_task.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs index 1129d5283699b..20ce8b284e4c2 100644 --- a/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs +++ b/cumulus/client/consensus/aura/src/collators/slot_based/block_builder_task.rs @@ -319,14 +319,14 @@ where Some(slot) => slot, None => { tracing::debug!( - target: crate::LOG_TARGET, - unincluded_segment_len = parent.depth, - relay_parent = ?relay_parent, - relay_parent_num = %relay_parent_header.number(), - included_hash = ?included_header_hash, - included_num = %included_header.number(), - parent = ?parent_hash, - slot = ?para_slot.slot, + target: crate::LOG_TARGET, + unincluded_segment_len = parent.depth, + relay_parent = ?relay_parent, + relay_parent_num = %relay_parent_header.number(), + included_hash = ?included_header_hash, + included_num = %included_header.number(), + parent = ?parent_hash, + 
slot = ?para_slot.slot, "Not building block." ); if let Some(ref mut connection_helper) = maybe_connection_helper { From 64e00657326538462b1b53289457fdb0c08ecb4d Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 20 Oct 2025 13:50:20 +0300 Subject: [PATCH 49/59] ; Signed-off-by: Andrei Sandu --- cumulus/client/consensus/aura/src/collators/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collators/mod.rs b/cumulus/client/consensus/aura/src/collators/mod.rs index a24d3ecf908d8..7aba25d171c4f 100644 --- a/cumulus/client/consensus/aura/src/collators/mod.rs +++ b/cumulus/client/consensus/aura/src/collators/mod.rs @@ -99,11 +99,11 @@ impl BackingGroupConnectionHelper { if Some(current_slot) <= self.our_slot { // Current slot or next slot is ours. // We already sent pre-connect message, no need to proceed further. - return; + return } let Some(authorities) = self.client.runtime_api().authorities(best_block).ok() else { - return; + return }; match aura_internal::claim_slot::

(current_slot + 1, &authorities, &self.keystore).await { From 386ca1506739d24bf81fb354efcac8d4a85a94dd Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 20 Oct 2025 14:12:49 +0300 Subject: [PATCH 50/59] fix tests Signed-off-by: Andrei Sandu --- .../network/collator-protocol/src/collator_side/tests/mod.rs | 3 --- .../src/collator_side/tests/prospective_parachains.rs | 4 ---- 2 files changed, 7 deletions(-) diff --git a/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs index 1627e93c75dec..de14b5bb0389e 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/tests/mod.rs @@ -387,7 +387,6 @@ async fn expect_determine_validator_group( async fn distribute_collation_with_receipt( virtual_overseer: &mut VirtualOverseer, expected_connected: Vec, - test_state: &TestState, candidate: CandidateReceipt, pov: PoV, parent_head_data_hash: Hash, @@ -406,7 +405,6 @@ async fn distribute_collation_with_receipt( .await; check_connected_to_validators(virtual_overseer, expected_connected).await; - expect_determine_validator_group(virtual_overseer, test_state).await; DistributeCollation { candidate, pov_block: pov } } @@ -434,7 +432,6 @@ async fn distribute_collation( distribute_collation_with_receipt( virtual_overseer, expected_connected, - test_state, candidate, pov_block, parent_head_data_hash, diff --git a/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs b/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs index 48f40fec6f05c..4f4c3526d7f2f 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/tests/prospective_parachains.rs @@ -394,7 +394,6 @@ fn distribute_collation_from_implicit_view(#[case] 
validator_sends_view_first: b distribute_collation_with_receipt( virtual_overseer, test_state.current_group_validator_authority_ids(), - &test_state, candidate, pov, parent_head_data_hash, @@ -543,7 +542,6 @@ fn distribute_collation_up_to_limit() { distribute_collation_with_receipt( virtual_overseer, test_state.current_group_validator_authority_ids(), - &test_state, candidate, pov, parent_head_data_hash, @@ -684,7 +682,6 @@ fn send_parent_head_data_for_elastic_scaling() { distribute_collation_with_receipt( &mut virtual_overseer, expected_connected, - &test_state, candidate.clone(), pov_data.clone(), phdh, @@ -814,7 +811,6 @@ fn advertise_and_send_collation_by_hash() { distribute_collation_with_receipt( &mut virtual_overseer, test_state.current_group_validator_authority_ids(), - &test_state, candidate.clone(), pov.clone(), Hash::zero(), From 02362a9f1f539a56f68aeed7b64e5bbb920fddc3 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 20 Oct 2025 14:16:02 +0300 Subject: [PATCH 51/59] remove clone derive Signed-off-by: Andrei Sandu --- .../node/network/collator-protocol/src/collator_side/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs index 874c3a7b77017..8bd0c3bf2da61 100644 --- a/polkadot/node/network/collator-protocol/src/collator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/collator_side/mod.rs @@ -154,7 +154,7 @@ enum ShouldAdvertiseTo { /// Info about validators we are currently connected to. /// /// It keeps track to which validators we advertised our collation. -#[derive(Debug, Default, Clone)] +#[derive(Debug, Default)] struct ValidatorGroup { /// Validators discovery ids. Lazily initialized when first /// distributing a collation. 
From 0169bd68d3fc4f27dc0420693b6bfbf808c0254e Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 20 Oct 2025 14:18:40 +0300 Subject: [PATCH 52/59] update prdoc Signed-off-by: Andrei Sandu --- prdoc/pr_9178.prdoc | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/prdoc/pr_9178.prdoc b/prdoc/pr_9178.prdoc index 5f5e11996d140..55d09d20e42f5 100644 --- a/prdoc/pr_9178.prdoc +++ b/prdoc/pr_9178.prdoc @@ -2,17 +2,10 @@ title: 'collator-protocol: cleanup connecting to backing group' doc: - audience: Node Dev description: |- - There are a few things wrong with the way we are handling connecting the validators in the backing group: - 1. `validators_to_connect` returns only validators in groups we already have a block to advertise and the last backing groups we advertised something to, that means that if our backing group changes, but we don't have anything to advertise it will continue to try to connect to the previous backing group and validator will log this and disconnect it immediately. - On the validator you will see`Declared as collator for unneeded para` and on the collator you will see Connect/Disconnect requests. This will continue every reconnect_timeout(4s from each active signal) until the collator advertises something to the new backing group. This is harmless, but it pollutes both the collator and the validator logs. - - 2. A collator connects only when it has something to advertise to its backing group, this is a bit too late and we can improve it by connecting the collators to the backing group immediately after they notice their assigned backing group. - - 3. Staying connected to the last backingroup we advertised something does not work for elastic scaling because we have different backing groups and if the collator set is big enough that collators author just one block per group rotation, then we will always connect just when we have a candidate to advertise. 
- - ## Fix: - - Have collators always connect to the backing group they got assigned to and keep the connection open until backing group changes. Also, try to connect when have something to advertise or on timeout to have more chances of being correctly connected. + + Have collators always connect to the backing group they got assigned to and keep the connection open + until backing group changes. Also, try to connect when have something to advertise or on timeout to + have more chances of being correctly connected. crates: - name: polkadot-collator-protocol From afaee88fed5e1280fff10b96bf3665bd42fd3a03 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 20 Oct 2025 14:44:53 +0300 Subject: [PATCH 53/59] accurate comments Signed-off-by: Andrei Sandu --- .../tests/elastic_scaling/slot_based_12cores.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/polkadot/zombienet-sdk-tests/tests/elastic_scaling/slot_based_12cores.rs b/polkadot/zombienet-sdk-tests/tests/elastic_scaling/slot_based_12cores.rs index 1f8816170e99e..9cfcab0b77150 100644 --- a/polkadot/zombienet-sdk-tests/tests/elastic_scaling/slot_based_12cores.rs +++ b/polkadot/zombienet-sdk-tests/tests/elastic_scaling/slot_based_12cores.rs @@ -113,7 +113,7 @@ async fn slot_based_12cores_test() -> Result<(), anyhow::Error> { ) .await?; - // Wait for collator to claim a slot. + // Expect that `collator-5` claims at least 3 slots during this run. let result = para_node .wait_log_line_count_with_timeout( "*Received PreConnectToBackingGroups message*", @@ -123,7 +123,7 @@ async fn slot_based_12cores_test() -> Result<(), anyhow::Error> { .await?; assert!(result.success()); - // Wait for collator slot to pass. + // It should disconnect at least 3 times after it's slot passes. 
let result = para_node .wait_log_line_count_with_timeout( "*Received DisconnectFromBackingGroups message*", From 0f800b7f5e1e171e43c58c5ca3bb54004af0dd7f Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 20 Oct 2025 17:11:15 +0300 Subject: [PATCH 54/59] comment Signed-off-by: Andrei Sandu --- cumulus/client/consensus/aura/src/collators/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cumulus/client/consensus/aura/src/collators/mod.rs b/cumulus/client/consensus/aura/src/collators/mod.rs index 7aba25d171c4f..2317f736dd4d5 100644 --- a/cumulus/client/consensus/aura/src/collators/mod.rs +++ b/cumulus/client/consensus/aura/src/collators/mod.rs @@ -55,6 +55,9 @@ pub mod slot_based; // rules specified by the parachain's runtime and thus will never be too deep. This is just an extra // sanity check. const PARENT_SEARCH_DEPTH: usize = 40; + +// Helper to pre-connect to the backing group we got assigned to and keep the connection +// open until backing group changes or own slot ends. struct BackingGroupConnectionHelper { client: std::sync::Arc, keystore: sp_keystore::KeystorePtr, From a9c986a7ee5dba70779dff78455e9343401f04e8 Mon Sep 17 00:00:00 2001 From: "cmd[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 Oct 2025 08:59:12 +0000 Subject: [PATCH 55/59] Update from github-actions[bot] running command 'fmt' --- cumulus/client/consensus/aura/src/collators/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cumulus/client/consensus/aura/src/collators/mod.rs b/cumulus/client/consensus/aura/src/collators/mod.rs index 2317f736dd4d5..e7e38376e4b58 100644 --- a/cumulus/client/consensus/aura/src/collators/mod.rs +++ b/cumulus/client/consensus/aura/src/collators/mod.rs @@ -56,7 +56,7 @@ pub mod slot_based; // sanity check. 
const PARENT_SEARCH_DEPTH: usize = 40; -// Helper to pre-connect to the backing group we got assigned to and keep the connection +// Helper to pre-connect to the backing group we got assigned to and keep the connection // open until backing group changes or own slot ends. struct BackingGroupConnectionHelper { client: std::sync::Arc, From 15728d83ed5d7d9774c90e1319df818b9d83baea Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Wed, 22 Oct 2025 15:06:02 +0300 Subject: [PATCH 56/59] unused dep Signed-off-by: Andrei Sandu --- Cargo.lock | 1 - cumulus/client/consensus/aura/Cargo.toml | 1 - 2 files changed, 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ac15a5b1544a6..61e43246e429c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4415,7 +4415,6 @@ dependencies = [ "cumulus-test-client", "cumulus-test-relay-sproof-builder", "futures", - "futures-timer", "parity-scale-codec", "parking_lot 0.12.3", "polkadot-node-primitives", diff --git a/cumulus/client/consensus/aura/Cargo.toml b/cumulus/client/consensus/aura/Cargo.toml index 6dd43eeeaa40c..8dca303ffebdb 100644 --- a/cumulus/client/consensus/aura/Cargo.toml +++ b/cumulus/client/consensus/aura/Cargo.toml @@ -15,7 +15,6 @@ workspace = true async-trait = { workspace = true } codec = { features = ["derive"], workspace = true, default-features = true } futures = { workspace = true } -futures-timer = { workspace = true } parking_lot = { workspace = true } schnellru = { workspace = true } tokio = { workspace = true, features = ["macros"] } From 82a0658e78939862b39418d1729c857ba693bd8a Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Wed, 22 Oct 2025 15:22:33 +0300 Subject: [PATCH 57/59] simplify and more coverage in tests Signed-off-by: Andrei Sandu --- .../consensus/aura/src/collators/mod.rs | 47 +++++++++++-------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collators/mod.rs b/cumulus/client/consensus/aura/src/collators/mod.rs index e7e38376e4b58..305faaaa08885 
100644 --- a/cumulus/client/consensus/aura/src/collators/mod.rs +++ b/cumulus/client/consensus/aura/src/collators/mod.rs @@ -74,19 +74,8 @@ impl BackingGroupConnectionHelper { Self { client, keystore, overseer_handle, our_slot: None } } - async fn send_connect_message(&mut self) { - self.overseer_handle - .send_msg(CollatorProtocolMessage::ConnectToBackingGroups, "CollatorProtocolHelper") - .await; - } - - async fn send_disconnect_message(&mut self) { - self.overseer_handle - .send_msg( - CollatorProtocolMessage::DisconnectFromBackingGroups, - "CollatorProtocolHelper", - ) - .await; + async fn send_subsystem_message(&mut self, message: CollatorProtocolMessage) { + self.overseer_handle.send_msg(message, "CollatorProtocolHelper").await; } /// Update the current slot and initiate connections to backing groups if needed. @@ -109,18 +98,23 @@ impl BackingGroupConnectionHelper { return }; - match aura_internal::claim_slot::

(current_slot + 1, &authorities, &self.keystore).await { + let next_slot = current_slot + 1; + match aura_internal::claim_slot::

(next_slot, &authorities, &self.keystore).await { Some(_) => { // Next slot is ours, send connect message. tracing::debug!(target: crate::LOG_TARGET, "Our slot {} is next, connecting to backing groups", current_slot + 1); - self.send_connect_message().await; - self.our_slot = Some(current_slot + 1); + self.send_subsystem_message(CollatorProtocolMessage::ConnectToBackingGroups) + .await; + self.our_slot = Some(next_slot); }, None => { // Next slot is not ours, send disconnect only if we had a slot before. if self.our_slot.take().is_some() { tracing::debug!(target: crate::LOG_TARGET, "Current slot = {}, disconnecting from backing groups", current_slot); - self.send_disconnect_message().await; + self.send_subsystem_message( + CollatorProtocolMessage::DisconnectFromBackingGroups, + ) + .await; } }, } @@ -551,9 +545,24 @@ mod tests { .await; tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + { + let messages = messages_recorder.lock().unwrap(); + assert_eq!(messages.len(), 1, "Expected exactly one disconnect message"); + assert!(matches!(messages[0], CollatorProtocolMessage::DisconnectFromBackingGroups)); + assert_eq!(helper.our_slot, None); + } + + messages_recorder.lock().unwrap().clear(); + + // Update again with slot 8, next slot (9) is Charlie's -> should not send another + // disconnect message + helper + .update::(Slot::from(8), genesis_hash) + .await; + tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + let messages = messages_recorder.lock().unwrap(); - assert_eq!(messages.len(), 1, "Expected exactly one disconnect message"); - assert!(matches!(messages[0], CollatorProtocolMessage::DisconnectFromBackingGroups)); + assert_eq!(messages.len(), 0, "Expected no messages"); assert_eq!(helper.our_slot, None); } From cce0b12ee9977669f9f7d84551d3cfe5386ba19f Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Thu, 23 Oct 2025 13:11:47 +0300 Subject: [PATCH 58/59] change string Signed-off-by: Andrei Sandu --- 
cumulus/client/consensus/aura/src/collators/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cumulus/client/consensus/aura/src/collators/mod.rs b/cumulus/client/consensus/aura/src/collators/mod.rs index 305faaaa08885..39b3abb49e74f 100644 --- a/cumulus/client/consensus/aura/src/collators/mod.rs +++ b/cumulus/client/consensus/aura/src/collators/mod.rs @@ -75,7 +75,7 @@ impl BackingGroupConnectionHelper { } async fn send_subsystem_message(&mut self, message: CollatorProtocolMessage) { - self.overseer_handle.send_msg(message, "CollatorProtocolHelper").await; + self.overseer_handle.send_msg(message, "BackingGroupConnectionHelper").await; } /// Update the current slot and initiate connections to backing groups if needed. From 9093bab6e7b164eea4092044f925bd37dd5b8c5c Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Thu, 23 Oct 2025 13:17:43 +0300 Subject: [PATCH 59/59] even simpler Signed-off-by: Andrei Sandu --- .../consensus/aura/src/collators/mod.rs | 34 +++++++++---------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/cumulus/client/consensus/aura/src/collators/mod.rs b/cumulus/client/consensus/aura/src/collators/mod.rs index 39b3abb49e74f..1a945236b392c 100644 --- a/cumulus/client/consensus/aura/src/collators/mod.rs +++ b/cumulus/client/consensus/aura/src/collators/mod.rs @@ -99,24 +99,22 @@ impl BackingGroupConnectionHelper { }; let next_slot = current_slot + 1; - match aura_internal::claim_slot::
<P>
(next_slot, &authorities, &self.keystore).await { - Some(_) => { - // Next slot is ours, send connect message. - tracing::debug!(target: crate::LOG_TARGET, "Our slot {} is next, connecting to backing groups", current_slot + 1); - self.send_subsystem_message(CollatorProtocolMessage::ConnectToBackingGroups) - .await; - self.our_slot = Some(next_slot); - }, - None => { - // Next slot is not ours, send disconnect only if we had a slot before. - if self.our_slot.take().is_some() { - tracing::debug!(target: crate::LOG_TARGET, "Current slot = {}, disconnecting from backing groups", current_slot); - self.send_subsystem_message( - CollatorProtocolMessage::DisconnectFromBackingGroups, - ) - .await; - } - }, + let next_slot_is_ours = + aura_internal::claim_slot::
<P>
(next_slot, &authorities, &self.keystore) + .await + .is_some(); + + if next_slot_is_ours { + // Next slot is ours, send connect message. + tracing::debug!(target: crate::LOG_TARGET, "Our slot {} is next, connecting to backing groups", next_slot); + self.send_subsystem_message(CollatorProtocolMessage::ConnectToBackingGroups) + .await; + self.our_slot = Some(next_slot); + } else if self.our_slot.take().is_some() { + // Next slot is not ours, send disconnect only if we had a slot before. + tracing::debug!(target: crate::LOG_TARGET, "Current slot = {}, disconnecting from backing groups", current_slot); + self.send_subsystem_message(CollatorProtocolMessage::DisconnectFromBackingGroups) + .await; } } }