From 51588850e2a0d797b023db19b302cf89446d6c3d Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe <49718502+alexggh@users.noreply.github.com> Date: Mon, 21 Jul 2025 12:57:39 +0300 Subject: [PATCH] gossip-support: make low connectivity message an error (#9264) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All is not well when a validator is not properly connected. Examples of things that might happen: - Finality might be slightly delayed because the validator will be a no-show when it can't retrieve PoVs to validate approval work: https://github.com/paritytech/polkadot-sdk/issues/8915. - When they author blocks they won't back things, because gossiping of backing statements happens using the grid topology, e.g. blocks authored by validators with a low number of peers: https://polkadot.js.org/apps/?rpc=wss%3A%2F%2Frpc-polkadot.helixstreet.io#/explorer/query/26931262 https://polkadot.js.org/apps/?rpc=wss%3A%2F%2Frpc-polkadot.helixstreet.io#/explorer/query/26931260 https://polkadot.js.org/apps/?rpc=wss%3A%2F%2Fpolkadot.api.onfinality.io%2Fpublic-ws#/explorer/query/26931334 https://polkadot.js.org/apps/?rpc=wss%3A%2F%2Fpolkadot-public-rpc.blockops.network%2Fws#/explorer/query/26931314 https://polkadot.js.org/apps/?rpc=wss%3A%2F%2Fpolkadot-public-rpc.blockops.network%2Fws#/explorer/query/26931292 https://polkadot.js.org/apps/?rpc=wss%3A%2F%2Fpolkadot-public-rpc.blockops.network%2Fws#/explorer/query/26931447 The problem is seen in `polkadot_parachain_peer_count` metrics, but it seems people are not monitoring that well enough, so let's make it more visible that nodes with low connectivity are not working in good conditions. I also reduced the threshold to 85%, so that we don't trigger this error too eagerly. 
--------- Signed-off-by: Alexandru Gheorghe Co-authored-by: Bastian Köcher Co-authored-by: cmd[bot] <41898282+github-actions[bot]@users.noreply.github.com> (cherry picked from commit c7f9908c2eeb1be70e57819537058beb53664446) --- .../node/network/gossip-support/src/lib.rs | 18 +++++++++------- prdoc/pr_9264.prdoc | 21 +++++++++++++++++++ 2 files changed, 32 insertions(+), 7 deletions(-) create mode 100644 prdoc/pr_9264.prdoc diff --git a/polkadot/node/network/gossip-support/src/lib.rs b/polkadot/node/network/gossip-support/src/lib.rs index 48ca61f4cd22a..5ab3a435822c3 100644 --- a/polkadot/node/network/gossip-support/src/lib.rs +++ b/polkadot/node/network/gossip-support/src/lib.rs @@ -90,13 +90,15 @@ const TRY_RERESOLVE_AUTHORITIES: Duration = Duration::from_secs(2); const LOW_CONNECTIVITY_WARN_DELAY: Duration = Duration::from_secs(600); /// If connectivity is lower than this in percent, issue warning in logs. -const LOW_CONNECTIVITY_WARN_THRESHOLD: usize = 90; +const LOW_CONNECTIVITY_WARN_THRESHOLD: usize = 85; /// The Gossip Support subsystem. pub struct GossipSupport { keystore: KeystorePtr, last_session_index: Option, + /// Whether we are currently an authority or not. + is_authority_now: bool, /// The minimum known session we build the topology for. min_known_session: SessionIndex, // Some(timestamp) if we failed to resolve @@ -163,6 +165,7 @@ where min_known_session: u32::MAX, authority_discovery, finalized_needed_session: None, + is_authority_now: false, metrics, } } @@ -282,6 +285,9 @@ where "New session detected", ); self.last_session_index = Some(session_index); + self.is_authority_now = + ensure_i_am_an_authority(&self.keystore, &session_info.discovery_keys) + .is_ok(); } // Connect to authorities from the past/present/future. 
@@ -705,13 +711,11 @@ where .resolved_authorities .iter() .filter(|(a, _)| !self.connected_authorities.contains_key(a)); - // TODO: Make that warning once connectivity issues are fixed (no point in warning, if - // we already know it is broken. - // https://github.com/paritytech/polkadot/issues/3921 - if connected_ratio <= LOW_CONNECTIVITY_WARN_THRESHOLD { - gum::debug!( + if connected_ratio <= LOW_CONNECTIVITY_WARN_THRESHOLD && self.is_authority_now { + gum::error!( target: LOG_TARGET, - "Connectivity seems low, we are only connected to {}% of available validators (see debug logs for details)", connected_ratio + session_index = self.last_session_index.as_ref().map(|s| *s).unwrap_or_default(), + "Connectivity seems low, we are only connected to {connected_ratio}% of available validators (see debug logs for details), if this persists more than a session action needs to be taken" ); } let pretty = PrettyAuthorities(unconnected_authorities); diff --git a/prdoc/pr_9264.prdoc b/prdoc/pr_9264.prdoc new file mode 100644 index 0000000000000..7e28fd6dbf9da --- /dev/null +++ b/prdoc/pr_9264.prdoc @@ -0,0 +1,21 @@ +title: 'gossip-support: make low connectivity message an error' +doc: +- audience: Node Dev + description: |- + All is not well when a validator is not properly connected, e.g: of things that might happen: + - Finality might be slightly delay because validator will be no-show because they can't retrieve PoVs to validate approval work: https://github.com/paritytech/polkadot-sdk/issues/8915. 
+ - When they author blocks they won't back things because gossiping of backing statements happen using the grid topology:, e.g blocks authored by validators with a low number of peers: + https://polkadot.js.org/apps/?rpc=wss%3A%2F%2Frpc-polkadot.helixstreet.io#/explorer/query/26931262 + https://polkadot.js.org/apps/?rpc=wss%3A%2F%2Frpc-polkadot.helixstreet.io#/explorer/query/26931260 + https://polkadot.js.org/apps/?rpc=wss%3A%2F%2Fpolkadot.api.onfinality.io%2Fpublic-ws#/explorer/query/26931334 + https://polkadot.js.org/apps/?rpc=wss%3A%2F%2Fpolkadot-public-rpc.blockops.network%2Fws#/explorer/query/26931314 + https://polkadot.js.org/apps/?rpc=wss%3A%2F%2Fpolkadot-public-rpc.blockops.network%2Fws#/explorer/query/26931292 + https://polkadot.js.org/apps/?rpc=wss%3A%2F%2Fpolkadot-public-rpc.blockops.network%2Fws#/explorer/query/26931447 + + + The problem is seen in `polkadot_parachain_peer_count` metrics, but it seems people are not monitoring that well enough, so let's make it more visible nodes with low connectivity are not working in good conditions. + + I also reduced the threshold to 85%, so that we don't trigger this error to eagerly. +crates: +- name: polkadot-gossip-support + bump: patch