diff --git a/.maintain/prometheus.yml b/.maintain/prometheus.yml index 5c72ca8e17..414c5e89ca 100644 --- a/.maintain/prometheus.yml +++ b/.maintain/prometheus.yml @@ -13,7 +13,7 @@ alerting: # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. rule_files: - "rule_node.yml" - - "rule_polka.yml" + - "rule_para.yml" # A scrape configuration containing exactly one endpoint to scrape: # Here it's Prometheus itself. diff --git a/.maintain/rule_para.yaml b/.maintain/rule_para.yaml new file mode 100644 index 0000000000..a4b773111b --- /dev/null +++ b/.maintain/rule_para.yaml @@ -0,0 +1,162 @@ +groups: + - name: substrate.rules + rules: + + ############################################################################## + # Block production + ############################################################################## + + - alert: BlockProductionSlow + annotations: + message: 'Best block on instance {{ $labels.instance }} increases by + less than 1 per minute for more than 3 minutes.' + expr: increase(substrate_block_height{status="best"}[1m]) < 1 + for: 3m + labels: + severity: warning + - alert: BlockProductionSlow + annotations: + message: 'Best block on instance {{ $labels.instance }} increases by + less than 1 per minute for more than 10 minutes.' + expr: increase(substrate_block_height{status="best"}[1m]) < 1 + for: 10m + labels: + severity: critical + + ############################################################################## + # Block finalization + ############################################################################## + + - alert: BlockFinalizationSlow + expr: increase(substrate_block_height{status="finalized"}[1m]) < 1 + for: 3m + labels: + severity: warning + annotations: + message: 'Finalized block on instance {{ $labels.instance }} increases by + less than 1 per minute for more than 3 minutes.' + - alert: BlockFinalizationSlow + expr: increase(substrate_block_height{status="finalized"}[1m]) < 1 + for: 10m + labels: + severity: critical + annotations: + message: 'Finalized block on instance {{ $labels.instance }} increases by + less than 1 per minute for more than 10 minutes.' + - alert: BlockFinalizationLaggingBehind + # Under the assumption of an average block production of 6 seconds, + # "best" and "finalized" being more than 10 blocks apart would imply + # more than a 1 minute delay between block production and finalization. + expr: '(substrate_block_height{status="best"} - ignoring(status) + substrate_block_height{status="finalized"}) > 10' + for: 8m + labels: + severity: critical + annotations: + message: "Block finalization on instance {{ $labels.instance }} is behind + block production by {{ $value }} for more than 8 minutes." + + ############################################################################## + # Transaction queue + ############################################################################## + + - alert: TransactionQueueSizeIncreasing + expr: 'increase(substrate_sub_txpool_validations_scheduled[5m]) - + increase(substrate_sub_txpool_validations_finished[5m]) > 0' + for: 10m + labels: + severity: warning + annotations: + message: 'The transaction pool size on node {{ $labels.instance }} has + been monotonically increasing for more than 10 minutes.' + - alert: TransactionQueueSizeIncreasing + expr: 'increase(substrate_sub_txpool_validations_scheduled[5m]) - + increase(substrate_sub_txpool_validations_finished[5m]) > 0' + for: 30m + labels: + severity: warning + annotations: + message: 'The transaction pool size on node {{ $labels.instance }} has + been monotonically increasing for more than 30 minutes.' + - alert: TransactionQueueSizeHigh + expr: 'substrate_sub_txpool_validations_scheduled - + substrate_sub_txpool_validations_finished > 10000' + for: 5m + labels: + severity: warning + annotations: + message: 'The transaction pool size on node {{ $labels.instance }} has + been above 10_000 for more than 5 minutes.' + + ############################################################################## + # Networking + ############################################################################## + + - alert: NumberOfPeersLow + expr: substrate_sub_libp2p_peers_count < 3 + for: 3m + labels: + severity: warning + annotations: + message: 'The node {{ $labels.instance }} has less than 3 peers for more + than 3 minutes' + - alert: NumberOfPeersLow + expr: substrate_sub_libp2p_peers_count < 3 + for: 15m + labels: + severity: critical + annotations: + message: 'The node {{ $labels.instance }} has less than 3 peers for more + than 15 minutes' + - alert: NoIncomingConnection + expr: increase(substrate_sub_libp2p_incoming_connections_total[20m]) == 0 + labels: + severity: warning + annotations: + message: 'The node {{ $labels.instance }} has not received any new incoming + TCP connection in the past 20 minutes. Is it connected to the Internet?' + + ############################################################################## + # System + ############################################################################## + + + ############################################################################## + # Others + ############################################################################## + + - alert: ContinuousTaskEnded + expr: '(substrate_tasks_spawned_total{task_name != "basic-authorship-proposer", task_name != "substrate-rpc-subscription"} == 1) + - on(instance, task_name) group_left() (substrate_tasks_ended_total == 1)' + for: 5m + labels: + severity: warning + annotations: + message: 'Continuous task {{ $labels.task_name }} on node + {{ $labels.instance }} ended unexpectedly.' + + + - alert: UnboundedChannelPersistentlyLarge + expr: '( + (substrate_unbounded_channel_len{action = "send"} - + ignoring(action) substrate_unbounded_channel_len{action = "received"}) + or on(instance) substrate_unbounded_channel_len{action = "send"} + ) >= 200' + for: 5m + labels: + severity: warning + annotations: + message: 'Channel {{ $labels.entity }} on node {{ $labels.instance }} contains + more than 200 items for more than 5 minutes. Node might be frozen.' + + - alert: UnboundedChannelVeryLarge + expr: '( + (substrate_unbounded_channel_len{action = "send"} - + ignoring(action) substrate_unbounded_channel_len{action = "received"}) + or on(instance) substrate_unbounded_channel_len{action = "send"} + ) > 15000' + labels: + severity: warning + annotations: + message: 'Channel {{ $labels.entity }} on node {{ $labels.instance }} contains more than + 15000 items.'