diff --git a/substrate/client/network/statement/src/lib.rs b/substrate/client/network/statement/src/lib.rs index 40575cf91702e..4110b4f723b84 100644 --- a/substrate/client/network/statement/src/lib.rs +++ b/substrate/client/network/statement/src/lib.rs @@ -33,7 +33,8 @@ use codec::{Compact, Decode, Encode, MaxEncodedLen}; use futures::future::pending; use futures::{channel::oneshot, future::FusedFuture, prelude::*, stream::FuturesUnordered}; use prometheus_endpoint::{ - prometheus, register, Counter, Gauge, Histogram, HistogramOpts, PrometheusError, Registry, U64, + exponential_buckets, register, Counter, Gauge, Histogram, HistogramOpts, PrometheusError, + Registry, U64, }; use sc_network::{ config::{NonReservedPeerMode, SetConfig}, @@ -59,6 +60,7 @@ use std::{ num::NonZeroUsize, pin::Pin, sync::Arc, + time::Instant, }; use tokio::time::timeout; pub mod config; @@ -98,6 +100,15 @@ struct Metrics { propagated_statements_chunks: Histogram, pending_statements: Gauge, ignored_statements: Counter, + peers_connected: Gauge, + statements_received: Counter, + bytes_sent_total: Counter, + bytes_received_total: Counter, + sent_latency_seconds: Histogram, + initial_sync_statements_sent: Counter, + initial_sync_bursts_total: Counter, + initial_sync_peers_active: Gauge, + initial_sync_duration_seconds: Histogram, } impl Metrics { @@ -129,7 +140,8 @@ impl Metrics { HistogramOpts::new( "substrate_sync_propagated_statements_chunks", "Distribution of chunk sizes when propagating statements", - ).buckets(prometheus::exponential_buckets(1.0, 2.0, 14)?), + ) + .buckets(exponential_buckets(1.0, 2.0, 14)?), )?, r, )?, @@ -147,6 +159,76 @@ impl Metrics { )?, r, )?, + peers_connected: register( + Gauge::new( + "substrate_sync_statement_peers_connected", + "Number of peers connected using the statement protocol", + )?, + r, + )?, + statements_received: register( + Counter::new( + "substrate_sync_statements_received", + "Total number of statements received from peers", + )?, + r, + )?, + 
bytes_sent_total: register( + Counter::new( + "substrate_sync_statement_bytes_sent_total", + "Total bytes sent for statement protocol messages", + )?, + r, + )?, + bytes_received_total: register( + Counter::new( + "substrate_sync_statement_bytes_received_total", + "Total bytes received for statement protocol messages", + )?, + r, + )?, + sent_latency_seconds: register( + Histogram::with_opts( + HistogramOpts::new( + "substrate_sync_statement_sent_latency_seconds", + "Time to send statement messages to peers", + ) + // Buckets from 1μs to ~1s covering microsecond to millisecond range. + .buckets(vec![0.000_001, 0.000_01, 0.000_1, 0.001, 0.01, 0.1, 1.0]), + )?, + r, + )?, + initial_sync_statements_sent: register( + Counter::new( + "substrate_sync_initial_sync_statements_sent", + "Total statements sent during initial sync bursts to newly connected peers", + )?, + r, + )?, + initial_sync_bursts_total: register( + Counter::new( + "substrate_sync_initial_sync_bursts_total", + "Total number of initial sync burst rounds processed", + )?, + r, + )?, + initial_sync_peers_active: register( + Gauge::new( + "substrate_sync_initial_sync_peers_active", + "Number of peers currently being synced via initial sync", + )?, + r, + )?, + initial_sync_duration_seconds: register( + Histogram::with_opts( + HistogramOpts::new( + "substrate_sync_initial_sync_duration_seconds", + "Per-peer total duration of initial sync from start to completion", + ) + .buckets(vec![0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0]), + )?, + r, + )?, }) } } @@ -220,6 +302,9 @@ impl StatementHandlerPrototype { num_submission_workers = 1; } + let metrics = + if let Some(r) = metrics_registry { Some(Metrics::register(r)?) 
} else { None }; + for _ in 0..num_submission_workers { let store = statement_store.clone(); let mut queue_receiver = queue_receiver.clone(); @@ -260,11 +345,7 @@ impl StatementHandlerPrototype { peers: HashMap::new(), statement_store, queue_sender, - metrics: if let Some(r) = metrics_registry { - Some(Metrics::register(r)?) - } else { - None - }, + metrics, initial_sync_timeout: Box::pin(tokio::time::sleep(INITIAL_SYNC_BURST_INTERVAL).fuse()), pending_initial_syncs: HashMap::new(), initial_sync_peer_queue: VecDeque::new(), @@ -324,6 +405,7 @@ pub struct Peer { /// Tracks pending initial sync state for a peer (hashes only, statements fetched on-demand). struct PendingInitialSync { hashes: Vec, + started_at: Instant, } /// Result of finding a sendable chunk of statements. @@ -503,18 +585,27 @@ where ChunkResult::Send(0) => SendChunkResult::Empty, ChunkResult::Send(chunk_end) => { let chunk = &statements[..chunk_end]; - if let Err(e) = timeout( + let encoded = chunk.encode(); + let bytes_to_send = encoded.len() as u64; + + let sent_latency_timer = + self.metrics.as_ref().map(|m| m.sent_latency_seconds.start_timer()); + let send_result = timeout( SEND_TIMEOUT, - self.notification_service.send_async_notification(peer, chunk.encode()), + self.notification_service.send_async_notification(peer, encoded), ) - .await - { + .await; + drop(sent_latency_timer); + + if let Err(e) = send_result { log::debug!(target: LOG_TARGET, "Failed to send notification to {peer}: {e:?}"); return SendChunkResult::Failed; } + log::trace!(target: LOG_TARGET, "Sent {} statements to {}", chunk.len(), peer); self.metrics.as_ref().map(|metrics| { metrics.propagated_statements.inc_by(chunk.len() as u64); + metrics.bytes_sent_total.inc_by(bytes_to_send); metrics.propagated_statements_chunks.observe(chunk.len() as f64); }); SendChunkResult::Sent(chunk_end) @@ -581,21 +672,46 @@ where ); debug_assert!(_was_in.is_none()); - if !self.sync.is_major_syncing() && !role.is_light() { + 
self.metrics.as_ref().map(|metrics| { + metrics.peers_connected.set(self.peers.len() as u64); + }); + + if !self.sync.is_major_syncing() { let hashes = self.statement_store.statement_hashes(); if !hashes.is_empty() { - self.pending_initial_syncs.insert(peer, PendingInitialSync { hashes }); + self.pending_initial_syncs.insert( + peer, + PendingInitialSync { hashes, started_at: Instant::now() }, + ); self.initial_sync_peer_queue.push_back(peer); + self.metrics.as_ref().map(|metrics| { + metrics.initial_sync_peers_active.inc(); + }); } } }, NotificationEvent::NotificationStreamClosed { peer } => { let _peer = self.peers.remove(&peer); debug_assert!(_peer.is_some()); - self.pending_initial_syncs.remove(&peer); + if let Some(pending) = self.pending_initial_syncs.remove(&peer) { + self.metrics.as_ref().map(|metrics| { + metrics.initial_sync_peers_active.dec(); + metrics + .initial_sync_duration_seconds + .observe(pending.started_at.elapsed().as_secs_f64()); + }); + } self.initial_sync_peer_queue.retain(|p| *p != peer); + self.metrics.as_ref().map(|metrics| { + metrics.peers_connected.set(self.peers.len() as u64); + }); }, NotificationEvent::NotificationReceived { peer, notification } => { + let bytes_received = notification.len() as u64; + self.metrics.as_ref().map(|metrics| { + metrics.bytes_received_total.inc_by(bytes_received); + }); + // Accept statements only when node is not major syncing if self.sync.is_major_syncing() { log::trace!( @@ -618,6 +734,11 @@ where #[cfg_attr(not(any(test, feature = "test-helpers")), doc(hidden))] pub fn on_statements(&mut self, who: PeerId, statements: Statements) { log::trace!(target: LOG_TARGET, "Received {} statements from {}", statements.len(), who); + + self.metrics.as_ref().map(|metrics| { + metrics.statements_received.inc_by(statements.len() as u64); + }); + if let Some(ref mut peer) = self.peers.get_mut(&who) { let mut statements_left = statements.len() as u64; for s in statements { @@ -788,6 +909,16 @@ where } } + /// Record 
initial sync completion metrics for a peer being removed. + fn record_initial_sync_completion(&self, started_at: Instant) { + self.metrics.as_ref().map(|metrics| { + metrics.initial_sync_peers_active.dec(); + metrics + .initial_sync_duration_seconds + .observe(started_at.elapsed().as_secs_f64()); + }); + } + /// Process one batch of initial sync for the next peer in the queue (round-robin). async fn process_initial_sync_burst(&mut self) { if self.sync.is_major_syncing() { @@ -802,8 +933,14 @@ where return; }; + self.metrics.as_ref().map(|metrics| { + metrics.initial_sync_bursts_total.inc(); + }); + if entry.get().hashes.is_empty() { + let started_at = entry.get().started_at; entry.remove(); + self.record_initial_sync_completion(started_at); return; } @@ -823,7 +960,9 @@ where Ok(r) => r, Err(e) => { log::debug!(target: LOG_TARGET, "Failed to fetch statements for initial sync: {e:?}"); + let started_at = entry.get().started_at; entry.remove(); + self.record_initial_sync_completion(started_at); return; }, }; @@ -837,11 +976,16 @@ where let to_send: Vec<_> = statements.iter().map(|(_, stmt)| stmt).collect(); match self.send_statement_chunk(&peer_id, &to_send).await { SendChunkResult::Failed => { - self.pending_initial_syncs.remove(&peer_id); + if let Some(pending) = self.pending_initial_syncs.remove(&peer_id) { + self.record_initial_sync_completion(pending.started_at); + } return; }, SendChunkResult::Sent(sent) => { debug_assert_eq!(to_send.len(), sent); + self.metrics.as_ref().map(|metrics| { + metrics.initial_sync_statements_sent.inc_by(sent as u64); + }); // Mark statements as known if let Some(peer) = self.peers.get_mut(&peer_id) { for (hash, _) in &statements { @@ -856,7 +1000,9 @@ where if has_more { self.initial_sync_peer_queue.push_back(peer_id); } else { - self.pending_initial_syncs.remove(&peer_id); + if let Some(pending) = self.pending_initial_syncs.remove(&peer_id) { + self.record_initial_sync_completion(pending.started_at); + } } } } diff --git 
a/substrate/client/statement-store/Metrics.md b/substrate/client/statement-store/Metrics.md new file mode 100644 index 0000000000000..8245c9ca11cc2 --- /dev/null +++ b/substrate/client/statement-store/Metrics.md @@ -0,0 +1,331 @@ +# Statement Store & Network Dashboards + +This document provides a reference for every metric displayed in the two +Statement Store Grafana dashboards. For each metric, you will find: what it measures, +why it matters, how to read it, and what problems it helps diagnose. + +## Table of Contents + +- [Dashboard 1: Substrate Statement Store](#dashboard-1-substrate-statement-store) + - [Storage Overview](#storage-overview) + - [Throughput & Operations](#throughput--operations-use-method) + - [Errors & Rejections](#errors--rejections-red-method) + - [Capacity & Limits](#capacity--limits) +- [Dashboard 2: Substrate Statement Network](#dashboard-2-substrate-statement-network) + - [Network Overview](#network-overview) + - [Network Throughput & Bandwidth](#network-throughput--bandwidth) + - [Operation Latency](#operation-latency) + - [Network Health Indicators](#network-health-indicators) + - [Statement Expiration & Cleanup](#statement-expiration--cleanup) + - [Initial Sync](#initial-sync) +--- + +## Dashboard 1: Substrate Statement Store + +**UID:** `substrate-statement-store` + +This dashboard covers the storage engine: capacity, throughput, errors, and lifecycle +of statements in the database. + +### Storage Overview + +#### Total Statements +- **Metric:** `substrate_sub_statement_store_statements_total` +- **Type:** Gauge +- **What it measures:** Count of statements persisted in the ParityDB store. +- **Why it matters:** Primary capacity indicator. +- **How to read:** Single stat panel. Green = healthy, yellow = 70%+ of capacity, red = 90%+. + +#### Storage Used (Bytes) +- **Metric:** `substrate_sub_statement_store_bytes_total` +- **Type:** Gauge +- **What it measures:** Total bytes consumed by statement data in the store.
+- **Why it matters:** Metric detects byte-level saturation independently of count. +- **How to read:** Color thresholds match statement count: green/yellow/red at 70%/90% of capacity. + +#### Unique Accounts +- **Metric:** `substrate_sub_statement_store_accounts_total` +- **Type:** Gauge +- **What it measures:** Number of distinct accounts (public keys) that have at least one statement + in the store. +- **Why it matters:** Indicates the diversity of statement authors. A single account flooding + the store is a sign of abuse or misconfiguration. A healthy network shows many accounts. +- **How to read:** Single stat panel. No threshold coloring (informational). +- **Problems it solves:** + - Detect single-account flooding (one account with thousands of statements). + - Monitor adoption: how many unique participants are using the statement store. + +#### Expired Statements +- **Metric:** `substrate_sub_statement_store_expired_total` +- **Type:** Gauge +- **What it measures:** Statements that have been marked as expired but not yet purged (purge happens after 48h by default). +- **Why it matters:** A growing backlog of expired statements means the cleanup process + is falling behind, or the expiration rate is very high. +- **How to read:** Single stat. Green = 0-9, yellow = 10-99, red = 100+. +- **Problems it solves:** + - Detect stalled or slow cleanup processes per node. + - Identify nodes where expired statements accumulate (possible disk I/O issue). + +--- + +### Throughput & Operations + +#### Submission Rate (Successful) +- **Metric:** `rate(substrate_sub_statement_store_submitted_statements[$__rate_interval])` +- **Type:** Counter (displayed as rate) +- **What it measures:** Successful statement submissions per second. +- **Why it matters:** Throughput metric; it tells you how fast the system + is processing valid work. Drops indicate upstream issues (fewer clients, network problems); + spikes indicate bursts of activity.
+- **How to read:** Line chart showing submissions/sec. Legend shows mean/max/sum. +- **Problems it solves:** + - Detect throughput degradation after a release (compare before/after deployment). + +#### Throughput vs Errors +- **Metrics:** + - `rate(submitted_statements)` (green = successful) + - `rate(validations_invalid)` (red = invalid) + - `rate(rejections_total)` (orange = rejected) +- **What it measures:** Comparison of successful vs failed operations. +- **Why it matters:** A healthy system shows green >> red + orange. +- **How to read:** Overlapping lines; the ratio of green to red/orange is the key signal. +- **Problems it solves:** + - Detect capacity saturation. + - Compare error ratios across releases to ensure quality. + +--- + +### Errors & Rejections + +#### Invalid Validations Rate +- **Metric:** `rate(substrate_sub_statement_store_validations_invalid[$__rate_interval])` +- **Type:** Counter (displayed as rate) +- **What it measures:** Statements that failed proof verification (BadProof, NoProof) + or exceeded encoding size limits during submission. +- **Why it matters:** Invalid statements consume validation resources + (CPU for signature verification) without producing useful work. High rates indicate + malicious actors submitting garbage, network corruption, or client bugs. +- **How to read:** Line chart. Any sustained rate above 0 warrants investigation.
+- **Problems it solves:** + - Detect ongoing attacks + - Detect client bugs + - Detect network corruption + +#### Rejection Reasons Breakdown +- **Metric:** `rate(substrate_sub_statement_store_rejections_total[$__rate_interval])` with label `reason` +- **Type:** CounterVec (displayed as stacked rate) +- **Labels:** `reason` = `data_too_large` | `channel_priority_too_low` | `account_full` | `store_full` | `no_allowance` +- **What it measures:** Statements that passed validation but were rejected by the store. +- **Why it matters:** Each rejection reason points to a different problem: + - **`store_full`** (red): The global store is at capacity. All new submissions are rejected. + Action: increase capacity or reduce statement lifetimes. + - **`account_full`** (orange): A specific account has hit its per-account quota. The account + owner needs to remove old statements before submitting new ones. + - **`channel_priority_too_low`** (yellow): A statement tried to replace a higher-priority + statement in the same channel. The submitter should increase priority. + - **`data_too_large`** (purple): Statement data exceeds `max_size`. The client is submitting + oversized data. + - **`no_allowance`** (grey): The account has no statement allowance set by the runtime. + The runtime must grant an allowance before the account can submit statements. +- **How to read:** Bar chart; the color of the dominant bar tells you the primary + rejection cause. + +--- + +### Capacity & Limits + +#### Statement Capacity vs Usage +- **Metrics:** `capacity_statements` (dashed red line), `statements_total` (solid blue line) +- **What it measures:** Absolute statement count plotted against the configured maximum. +- **Why it matters:** Visual gap between the two lines shows remaining headroom. +- **How to read:** Two lines; the gap shows the remaining capacity.
+- **Problems it solves:** + - Verify that capacity configuration changes took effect after a deployment + +#### Byte Capacity vs Usage +- **Metrics:** `capacity_bytes` (dashed red line), `bytes_total` (solid purple line) +- **What it measures:** Same as above, but for byte-level capacity. + +--- + +## Dashboard 2: Substrate Statement Network +
+**UID:** `substrate-statement-network` + +This dashboard covers the gossip/networking layer: peer connectivity, statement propagation, +bandwidth, latency, and network health indicators. + +### Network Overview + +#### Peers Connected (Stat) +- **Metric:** `substrate_sync_statement_peers_connected` +- **Type:** Gauge +- **What it measures:** Current number of peers connected via the statement gossip protocol. +- **Why it matters:** Zero peers means complete isolation: the node cannot send or receive + statements. Low peer counts (<3) mean slow propagation and possible partitioning. +- **How to read:** Single stat. +- **Problems it solves:** + - Detect degraded connectivity: few peers mean slow statement propagation. + +#### Pending Validations (Stat) +- **Metric:** `substrate_sync_pending_statement_validations` +- **Type:** Gauge +- **What it measures:** Number of statements waiting in the validation queue. +- **Why it matters:** Backlog metric for the validation pipeline. +- **How to read:** Single stat. +- **Problems it solves:** + - Detect a validation bottleneck: consistently high values mean validation cannot keep up + +#### Statements Received vs Propagated (Time Series) +- **Metrics:** + - `rate(statements_received)` (blue = received from peers) + - `rate(propagated_statements)` (green = sent to peers) +- **What it measures:** Inbound vs outbound statement rates over time. +- **Why it matters:** Primary indicator of network participation balance: + - **Received ≈ Propagated**: Balanced node. Receiving and forwarding roughly equally. + - **Received >> Propagated**: Mostly consuming. The node may be new or catching up. + - **Propagated >> Received**: Mostly producing.
The node is generating more than it receives. +- **How to read:** Two-line chart. Mean and max values in the legend table. +- **Problems it solves:** + - Compare network activity across releases to detect propagation regressions. + - Identify nodes that are not propagating (broken outbound path). + +#### Peers Connected Over Time +- **Metric:** `statement_peers_connected` (time series) +- **What it measures:** Peer count connected over time. +- **Why it matters:** Drops indicate network instability, for example node restarts. +- **How to read:** Line chart with mean/max/last in the legend. Look for step changes and trends. +- **Problems it solves:** + - Detect network partitioning (drop to 0). + - Detect gradual peer loss. + +--- + +### Network Throughput & Bandwidth + +#### Network Bandwidth (Statement Protocol) +- **Metrics:** + - `rate(statement_bytes_sent_total)` (orange = bytes sent/sec) + - `rate(statement_bytes_received_total)` (blue = bytes received/sec) +- **Type:** Counter +- **What it measures:** Actual network bandwidth consumed by the statement protocol. +- **Why it matters:** Metric for sizing network requirements. +- **How to read:** Two-line chart in bytes/sec. Compare sent vs received for balance. +- **Problems it solves:** + - Detect bandwidth issues + - Compare bandwidth across releases to measure protocol efficiency improvements + +--- + +### Operation Latency + +Good for performance comparison across releases. Each shows p50/p90/p99 +percentiles, making it easy to spot tail latency regressions. + +#### Submit Latency (Total) +- **Metric:** `substrate_sub_statement_store_submit_duration_seconds` (histogram) +- **Buckets:** 1μs, 10μs, 100μs, 1ms, 10ms, 100ms, 1s +- **What it measures:** Time to submit a statement, including signature verification, + runtime validation, and database write. +- **Why it matters:** This is the single most important latency metric; it affects + how fast the node can process incoming statements.
If p99 exceeds the SLO, the node will build + up backpressure under load. +- **How to read:** Three lines (green=p50, yellow=p90, red=p99); the gap between p50 and p99 shows tail latency. +- **Problems it solves:** + - Detect latency regressions after deployments (compare p50/p90/p99 before and after). + - Identify which sub-operation is slow by comparing with verify/DB write panels. + - Set SLOs: "p99 submit latency < (N)ms". + +#### Statement Send Latency (Percentiles) +- **Metric:** `substrate_sync_statement_sent_latency_seconds` (histogram) +- **Buckets:** 1μs, 10μs, 100μs, 1ms, 10ms, 100ms, 1s +- **What it measures:** Time to send a statement notification to a peer via the network layer. +- **Why it matters:** Measures network-level latency; high values indicate network + congestion or slow peers. +- **How to read:** Three lines (p50/p90/p99). +- **Problems it solves:** + - Detect network congestion between specific peers + - Identify slow peers + - Compare send latency across releases to measure protocol improvements + - Set SLOs: "p99 send latency < 2s" + +--- + +### Network Health Indicators + +--- + +### Statement Expiration & Cleanup + +#### Expiration Check Latency +- **Metric:** `substrate_sub_statement_store_check_expiration_duration_seconds` (histogram) +- **Buckets:** 1μs, 10μs, 100μs, 1ms, 10ms, 100ms, 1s +- **What it measures:** Time spent in each expiration check cycle; expiration periodically + scans accounts and marks statements as expired. +- **Why it matters:** Expiration checks run on the main store thread. If they take too long, + they block statement submissions. +- **How to read:** Three lines (p50/p90/p99). Should typically be sub-millisecond.
+- **Problems it solves:** + - Detect expiration performance degradation as the store grows + - Identify if expiration is blocking the submission path + +#### Statements Expiration Rate +- **Metric:** `rate(substrate_sub_statement_store_statements_expired_total[$__rate_interval])` +- **What it measures:** Rate at which statements are expired (marked for later purge). +- **Why it matters:** A sudden spike means many statements expired + at once. +- **How to read:** Line chart showing expired/sec. Legend shows mean/max/sum. +- **Problems it solves:** + - Verify that the expiration system is working + - Detect mass expiration events + +--- + +### Initial Sync + +#### Initial Sync Active Peers +- **Metric:** `substrate_sync_initial_sync_peers_active` +- **Type:** Gauge +- **What it measures:** Number of peers currently being synced via the initial sync burst mechanism. +- **Why it matters:** Each active initial sync consumes bandwidth and CPU. Too many concurrent + syncs can starve normal gossip propagation. +- **How to read:** Single stat. Green = 0-4, yellow = 5-19, red = 20+. +- **Problems it solves:** + - Detect excessive initial syncs + - Capacity planning for nodes that frequently accept new peers + +#### Initial Sync Statements Sent +- **Metric:** `rate(substrate_sync_initial_sync_statements_sent[$__rate_interval])` +- **Type:** Counter (displayed as rate) +- **What it measures:** Rate of statements sent to peers during initial sync bursts. +- **Why it matters:** High rates indicate many new peers are connecting and receiving full + statement sets. Impacts network bandwidth. +- **How to read:** Line chart showing statements/sec. Legend shows mean/max. +- **Problems it solves:** + - Helps understand initial sync's share of network traffic + +#### Initial Sync Burst Rate +- **Metric:** `rate(substrate_sync_initial_sync_bursts_total[$__rate_interval])` +- **Type:** Counter (displayed as rate) +- **What it measures:** Rate of initial sync burst rounds being processed.
+- **Why it matters:** Each burst sends one batch of statements to one peer (round-robin); the + burst rate combined with statements-per-burst gives the effective sync throughput. +- **How to read:** Line chart showing bursts/sec. Legend shows mean/max. +- **Problems it solves:** + - Verify that the round-robin is distributing work across peers + - Detect stalled initial syncs (burst rate drops to 0 while active peers > 0) + +#### Per-Peer Initial Sync Duration (Percentiles) +- **Metric:** `substrate_sync_initial_sync_duration_seconds` (histogram) +- **Buckets:** 10ms, 50ms, 100ms, 250ms, 500ms, 1s, 2.5s, 5s, 10s, 30s, 60s +- **What it measures:** Total wall-clock time from the first burst to completion of initial + sync for each peer. +- **Why it matters:** Long sync durations mean new peers wait a long time before having + a complete view of the statement store. Threshold: 5s (yellow), 30s (red). +- **How to read:** Three lines (green=p50, yellow=p90, red=p99). +- **Problems it solves:** + - Detect slow initial syncs that delay new peer participation + - Compare sync durations as statement store size grows + - Identify network-level bottlenecks + +--- diff --git a/substrate/client/statement-store/src/lib.rs b/substrate/client/statement-store/src/lib.rs index f3dfd163dcf6f..7f9866d5e8e92 100644 --- a/substrate/client/statement-store/src/lib.rs +++ b/substrate/client/statement-store/src/lib.rs @@ -812,6 +812,7 @@ impl Store { // Statements are considered expired when their priority (which encodes the expiration // timestamp in the upper 32 bits) is less than the current timestamp.
fn check_expiration(&self) { + let _start_check_expiration_timer = self.metrics.start_check_expiration_timer(); let current_time = self.timestamp(); let (needs_expiry, num_accounts_checked) = { @@ -855,6 +856,8 @@ impl Store { (needs_expiry, num_accounts_checked) }; + let mut expired = 0; + for hash in needs_expiry { if let Err(e) = self.remove(&hash) { log::debug!( @@ -864,6 +867,7 @@ impl Store { e ); } else { + expired += 1; log::trace!( target: LOG_TARGET, "Marked statement {:?} as expired", @@ -871,21 +875,44 @@ impl Store { ); } } + let mut index = self.index.write(); let new_len = index .accounts_to_check_for_expiry_stmts .len() .saturating_sub(num_accounts_checked); index.accounts_to_check_for_expiry_stmts.truncate(new_len); + + drop(_start_check_expiration_timer); + + self.metrics.report(|metrics| { + metrics.statements_expired_total.inc_by(expired); + }); } /// Perform periodic store maintenance pub fn maintain(&self) { log::trace!(target: LOG_TARGET, "Started store maintenance"); - let (deleted, active_count, expired_count): (Vec<_>, usize, usize) = { + let ( + deleted, + active_count, + expired_count, + total_size, + accounts_count, + capacity_statements, + capacity_bytes, + ): (Vec<_>, usize, usize, usize, usize, usize, usize) = { let mut index = self.index.write(); let deleted = index.maintain(self.timestamp()); - (deleted, index.entries.len(), index.expired.len()) + ( + deleted, + index.entries.len(), + index.expired.len(), + index.total_size, + index.accounts.len(), + index.options.max_total_statements, + index.options.max_total_size, + ) }; let deleted: Vec<_> = deleted.into_iter().map(|hash| (col::EXPIRED, hash.to_vec(), None)).collect(); @@ -895,6 +922,16 @@ impl Store { } else { self.metrics.report(|metrics| metrics.statements_pruned.inc_by(deleted_count)); } + + self.metrics.report(|metrics| { + metrics.statements_total.set(active_count as u64); + metrics.bytes_total.set(total_size as u64); + metrics.accounts_total.set(accounts_count as u64); + 
metrics.expired_total.set(expired_count as u64); + metrics.capacity_statements.set(capacity_statements as u64); + metrics.capacity_bytes.set(capacity_bytes as u64); + }); + log::trace!( target: LOG_TARGET, "Completed store maintenance. Purged: {}, Active: {}, Expired: {}", @@ -1148,6 +1185,7 @@ impl StatementStore for Store { /// Submit a statement to the store. Validates the statement and returns validation result. fn submit(&self, statement: Statement, source: StatementSource) -> SubmitResult { + let _histogram_submit_start_timer = self.metrics.start_submit_timer(); let hash = statement.hash(); // Get unix timestamp if self.timestamp() >= statement.get_expiration_timestamp_secs().into() { @@ -1156,6 +1194,7 @@ impl StatementStore for Store { "Statement is already expired: {:?}", HexDisplay::from(&hash), ); + self.metrics.report(|metrics| metrics.validations_invalid.inc()); return SubmitResult::Invalid(InvalidReason::AlreadyExpired); } let encoded_size = statement.encoded_size(); @@ -1167,6 +1206,7 @@ impl StatementStore for Store { statement.encoded_size(), MAX_STATEMENT_SIZE ); + self.metrics.report(|metrics| metrics.validations_invalid.inc()); return SubmitResult::Invalid(InvalidReason::EncodingTooLarge { submitted_size: encoded_size, max_size: MAX_STATEMENT_SIZE, @@ -1261,7 +1301,12 @@ impl StatementStore for Store { let evicted = match index.insert(hash, &statement, &account_id, &validation, current_time) { Ok(evicted) => evicted, - Err(reason) => return SubmitResult::Rejected(reason), + Err(reason) => { + self.metrics.report(|metrics| { + metrics.rejections.with_label_values(&[reason.label()]).inc(); + }); + return SubmitResult::Rejected(reason); + }, }; commit.push((col::STATEMENTS, hash.to_vec(), Some(statement.encode()))); diff --git a/substrate/client/statement-store/src/metrics.rs b/substrate/client/statement-store/src/metrics.rs index cf191b79757ed..6b6250126e8dd 100644 --- a/substrate/client/statement-store/src/metrics.rs +++ 
b/substrate/client/statement-store/src/metrics.rs @@ -20,7 +20,10 @@ use std::sync::Arc; -use prometheus_endpoint::{register, Counter, PrometheusError, Registry, U64}; +use prometheus_endpoint::{ + prometheus::HistogramTimer, register, Counter, CounterVec, Gauge, Histogram, HistogramOpts, + Opts, PrometheusError, Registry, U64, +}; #[derive(Clone, Default)] pub struct MetricsLink(Arc>); @@ -41,6 +44,17 @@ impl MetricsLink { do_this(metrics); } } + + pub fn start_submit_timer(&self) -> Option { + self.0.as_ref().as_ref().map(|m| m.submit_duration_seconds.start_timer()) + } + + pub fn start_check_expiration_timer(&self) -> Option { + self.0 + .as_ref() + .as_ref() + .map(|m| m.check_expiration_duration_seconds.start_timer()) + } } /// Statement store Prometheus metrics. @@ -48,6 +62,16 @@ pub struct Metrics { pub submitted_statements: Counter, pub validations_invalid: Counter, pub statements_pruned: Counter, + pub statements_total: Gauge, + pub bytes_total: Gauge, + pub accounts_total: Gauge, + pub expired_total: Gauge, + pub capacity_statements: Gauge, + pub capacity_bytes: Gauge, + pub rejections: CounterVec, + pub submit_duration_seconds: Histogram, + pub check_expiration_duration_seconds: Histogram, + pub statements_expired_total: Counter, } impl Metrics { @@ -63,7 +87,7 @@ impl Metrics { validations_invalid: register( Counter::new( "substrate_sub_statement_store_validations_invalid", - "Total number of statements that were removed from the pool as invalid", + "Total number of statements that failed validation during submission", )?, registry, )?, @@ -74,6 +98,86 @@ impl Metrics { )?, registry, )?, + statements_total: register( + Gauge::new( + "substrate_sub_statement_store_statements_total", + "Current number of statements in the store", + )?, + registry, + )?, + capacity_statements: register( + Gauge::new( + "substrate_sub_statement_store_capacity_statements", + "Maximum number of statements the store can hold", + )?, + registry, + )?, + + capacity_bytes:
register( + Gauge::new( + "substrate_sub_statement_store_capacity_bytes", + "Maximum total size of statement data in bytes", + )?, + registry, + )?, + bytes_total: register( + Gauge::new( + "substrate_sub_statement_store_bytes_total", + "Current total size of all statement data in bytes", + )?, + registry, + )?, + accounts_total: register( + Gauge::new( + "substrate_sub_statement_store_accounts_total", + "Current number of unique accounts with statements", + )?, + registry, + )?, + expired_total: register( + Gauge::new( + "substrate_sub_statement_store_expired_total", + "Current number of expired statements awaiting purge", + )?, + registry, + )?, + rejections: register( + CounterVec::new( + Opts::new( + "substrate_sub_statement_store_rejections_total", + "Total statement rejections by reason", + ), + &["reason"], + )?, + registry, + )?, + submit_duration_seconds: register( + Histogram::with_opts( + HistogramOpts::new( + "substrate_sub_statement_store_submit_duration_seconds", + "Time to submit a statement", + ) + .buckets(vec![0.000_001, 0.000_01, 0.000_1, 0.001, 0.01, 0.1, 1.0]), + )?, + registry, + )?, + check_expiration_duration_seconds: register( + Histogram::with_opts( + HistogramOpts::new( + "substrate_sub_statement_store_check_expiration_duration_seconds", + "Time to check and process statement expiration", + ) + .buckets(vec![0.000_001, 0.000_01, 0.000_1, 0.001, 0.01, 0.1, 1.0]), + )?, + registry, + )?, + statements_expired_total: register( + Counter::new( + "substrate_sub_statement_store_statements_expired_total", + "Total number of statements that expired and were removed", + )?, + registry, + )?, }) } } diff --git a/substrate/primitives/statement-store/src/store_api.rs b/substrate/primitives/statement-store/src/store_api.rs index e16dfb104ec35..50a23a625588b 100644 --- a/substrate/primitives/statement-store/src/store_api.rs +++ b/substrate/primitives/statement-store/src/store_api.rs @@ -134,6 +134,19 @@ pub enum RejectionReason { NoAllowance, } +impl 
RejectionReason { + /// Returns a short string label suitable for use in metrics. + pub fn label(&self) -> &'static str { + match self { + RejectionReason::DataTooLarge { .. } => "data_too_large", + RejectionReason::ChannelPriorityTooLow { .. } => "channel_priority_too_low", + RejectionReason::AccountFull { .. } => "account_full", + RejectionReason::StoreFull => "store_full", + RejectionReason::NoAllowance => "no_allowance", + } + } +} + /// Reason why a statement failed validation. #[derive(Debug, Clone, Eq, PartialEq)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]