diff --git a/polkadot/node/network/collator-protocol/src/validator_side/mod.rs b/polkadot/node/network/collator-protocol/src/validator_side/mod.rs index e320b6a50aba2..a205998e7592a 100644 --- a/polkadot/node/network/collator-protocol/src/validator_side/mod.rs +++ b/polkadot/node/network/collator-protocol/src/validator_side/mod.rs @@ -386,6 +386,15 @@ impl PeerData { return Err(InsertAdvertisementError::PeerLimitReached); } + gum::debug!( + target: LOG_TARGET, + ?on_scheduling_parent, + ?candidate_hash, + current_ads = candidates.len(), + max_ads, + "Tracking new advertisement from peer", + ); + candidates.insert(candidate_hash); } else { if self.version != CollationVersion::V1 { @@ -1659,6 +1668,14 @@ where if peer_data.version == CollationVersion::V1 && !state.leaf_claim_queues.contains_key(&scheduling_parent) { + gum::debug!( + target: LOG_TARGET, + ?peer_id, + ?scheduling_parent, + ?prospective_candidate, + "V1 advertisement must use an active leaf as relay parent", + ); + return Err(AdvertisementError::ProtocolMisuse); } @@ -1680,7 +1697,18 @@ where &per_scheduling_parent, &state.leaf_claim_queues, ) - .map_err(AdvertisementError::Invalid)?; + .map_err(|error| { + gum::debug!( + target: LOG_TARGET, + ?peer_id, + ?scheduling_parent, + ?prospective_candidate, + ?error, + "Failed to insert advertisement", + ); + + AdvertisementError::Invalid(error) + })?; if hold_off_asset_hub_collation_if_needed( state, @@ -1691,6 +1719,14 @@ where scheduling_parent, // For V1/V2, the relay parent is the same as the scheduling parent None, // V1/V2 don't have advertised descriptor version ) { + gum::debug!( + target: LOG_TARGET, + ?peer_id, + ?scheduling_parent, + ?prospective_candidate, + "Collation held off, not processing advertisement further for now", + ); + return Ok(()); } @@ -1802,7 +1838,16 @@ where // Check if there's a free slot accounting for obsolete positions and capacity. // This happens AFTER hold-off logic (for AssetHub) has run, so held-off advertisements // can be queued even when capacity is temporarily full. - is_slot_available(&scheduling_parent, para_id, state)?; + is_slot_available(&scheduling_parent, para_id, state).inspect_err(|error| { + gum::debug!( + target: LOG_TARGET, + ?peer_id, + ?scheduling_parent, + ?para_id, + ?error, + "Slot is not available", + ); + })?; if let Some((candidate_hash, parent_head_data_hash)) = prospective_candidate { // Check if backing subsystem allows to second this candidate. @@ -1813,6 +1858,16 @@ where .await; if !can_second { + gum::debug!( + target: LOG_TARGET, + ?peer_id, + ?scheduling_parent, + ?para_id, + ?candidate_hash, + ?parent_head_data_hash, + "Backing subsystem does not allow seconding this candidate", + ); + return Err(AdvertisementError::BlockedByBacking); } } diff --git a/prdoc/pr_11629.prdoc b/prdoc/pr_11629.prdoc new file mode 100644 index 0000000000000..56a752b6c86b0 --- /dev/null +++ b/prdoc/pr_11629.prdoc @@ -0,0 +1,20 @@ +title: 'collator-protocol: Enhance visibility into failed to fetch collations' +doc: +- audience: Node Dev + description: |- + This PR enhances the observability of our logs when validators refuse to fetch advertised collations. + + The validator keeps track of advertised collations from collators. + Then, the validator checks if it has sufficient slots (`is_slot_available`) and if the collation can be seconded (`can_second`). + If either of the mentioned functions fails, the validator silently ignores the collation. + + This PR aims to surface this behavior and help investigate further issues. + + This PR is already deployed on validators and has surfaced: + - https://github.com/paritytech/polkadot-sdk/issues/11625 + + Part of: + - https://github.com/paritytech/polkadot-sdk/issues/11377 +crates: +- name: polkadot-collator-protocol + bump: patch