diff --git a/.maintain/alertmanager.yml b/.maintain/alertmanager.yml
new file mode 100644
index 0000000000..4f814d4817
--- /dev/null
+++ b/.maintain/alertmanager.yml
@@ -0,0 +1,24 @@
+route:
+  group_by: ['alertname']
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 1h
+  receiver: 'slack'
+receivers:
+- name: 'web.hook'
+  webhook_configs:
+  - url: 'http://127.0.0.1:5001/'
+- name: 'slack'
+  slack_configs:
+  - api_url: "https://hooks.slack.com/services/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"  # FIXME(security): webhook token committed to VCS — rotate it and load from a secret store/env instead
+    channel: "#alert"
+    text: "{{ range .Alerts }} {{ .Annotations.description}}\n {{end}} {{ .CommonAnnotations.username}} <{{.CommonAnnotations.link}}| click here>"
+    title: "{{.CommonAnnotations.summary}}"
+    title_link: "{{.CommonAnnotations.link}}"
+    color: "{{.CommonAnnotations.color}}"
+inhibit_rules:
+  - source_match:
+      severity: 'critical'
+    target_match:
+      severity: 'warning'
+    equal: ['alertname', 'dev', 'instance']
diff --git a/.maintain/prometheus.yml b/.maintain/prometheus.yml
new file mode 100644
index 0000000000..5c72ca8e17
--- /dev/null
+++ b/.maintain/prometheus.yml
@@ -0,0 +1,38 @@
+# my global config
+global:
+  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
+  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
+  # scrape_timeout is set to the global default (10s).
+
+# Alertmanager configuration
+alerting:
+  alertmanagers:
+  - static_configs:
+    - targets: ['localhost:9093']
+
+# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
+rule_files:
+  - "rule_node.yml"
+  - "rule_polka.yaml"
+
+# A scrape configuration containing exactly one endpoint to scrape:
+# Here it's Prometheus itself.
+scrape_configs:
+  # The job name is for prometheus itself added as a label `job=` to any timeseries scraped from this config.
+  - job_name: 'prometheus'
+    static_configs:
+    - targets: ['localhost:9090']
+
+  # The job is for `node_exporter` https://github.com/prometheus/node_exporter with the prefix 'node_'
+  - job_name: 'node'
+    static_configs:
+    - targets: ['localhost:9100']
+
+  # The job is for `polka_substrate_node` with the prefix 'polkadot_'
+  - job_name: 'polka_node'
+    # Override the global default and scrape targets from this job every 5 seconds.
+    # ** NOTE: you want to have this *LESS THAN* the block time in order to ensure
+    # ** that you have a data point for every block!
+    scrape_interval: 5s
+    static_configs:
+    - targets: ['localhost:9615']
diff --git a/.maintain/rule_node.yml b/.maintain/rule_node.yml
new file mode 100644
index 0000000000..ecf3418d93
--- /dev/null
+++ b/.maintain/rule_node.yml
@@ -0,0 +1,35 @@
+groups:
+- name: hostStatsAlert
+  rules:
+  - alert: InstanceDown
+    expr: up == 0
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Instance [{{ $labels.instance }}] down"
+      description: "[{{ $labels.instance }}] of job [{{ $labels.job }}] has been down for more than 1 minute."
+  - alert: HostHighCpuLoad
+    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: Host high CPU load (instance {{ $labels.instance }})
+      description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+  - alert: HostOutOfDiskSpace
+    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host out of disk space (instance {{ $labels.instance }})
+      description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+  - alert: HostOutOfMemory
+    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host out of memory (instance {{ $labels.instance }})
+      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/.maintain/rule_polka.yaml b/.maintain/rule_polka.yaml
new file mode 100644
index 0000000000..7a69cba66c
--- /dev/null
+++ b/.maintain/rule_polka.yaml
@@ -0,0 +1,181 @@
+groups:
+- name: polkadot.rules
+  rules:
+
+  ##############################################################################
+  # Block production
+  ##############################################################################
+
+  - alert: BlockProductionSlow
+    annotations:
+      message: 'Best block on instance {{ $labels.instance }} increases by
+        less than 1 per minute for more than 3 minutes.'
+    expr: increase(polkadot_block_height{status="best"}[1m]) < 1
+    for: 3m
+    labels:
+      severity: warning
+  - alert: BlockProductionSlow
+    annotations:
+      message: 'Best block on instance {{ $labels.instance }} increases by
+        less than 1 per minute for more than 10 minutes.'
+    expr: increase(polkadot_block_height{status="best"}[1m]) < 1
+    for: 10m
+    labels:
+      severity: critical
+
+  ##############################################################################
+  # Block finalization
+  ##############################################################################
+
+  - alert: BlockFinalizationSlow
+    expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      message: 'Finalized block on instance {{ $labels.instance }} increases by
+        less than 1 per minute for more than 3 minutes.'
+  - alert: BlockFinalizationSlow
+    expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1
+    for: 10m
+    labels:
+      severity: critical
+    annotations:
+      message: 'Finalized block on instance {{ $labels.instance }} increases by
+        less than 1 per minute for more than 10 minutes.'
+  - alert: BlockFinalizationLaggingBehind
+    # Under the assumption of an average block production of 6 seconds,
+    # "best" and "finalized" being more than 10 blocks apart would imply
+    # more than a 1 minute delay between block production and finalization.
+    expr: '(polkadot_block_height{status="best"} - ignoring(status)
+      polkadot_block_height{status="finalized"}) > 10'
+    for: 8m
+    labels:
+      severity: critical
+    annotations:
+      message: "Block finalization on instance {{ $labels.instance }} is behind
+        block production by {{ $value }} for more than 8 minutes."
+
+  ##############################################################################
+  # Transaction queue
+  ##############################################################################
+
+  - alert: TransactionQueueSizeIncreasing
+    expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) -
+      increase(polkadot_sub_txpool_validations_finished[5m]) > 0'
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      message: 'The transaction pool size on node {{ $labels.instance }} has
+        been monotonically increasing for more than 10 minutes.'
+  - alert: TransactionQueueSizeIncreasing
+    expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) -
+      increase(polkadot_sub_txpool_validations_finished[5m]) > 0'
+    for: 30m
+    labels:
+      severity: critical
+    annotations:
+      message: 'The transaction pool size on node {{ $labels.instance }} has
+        been monotonically increasing for more than 30 minutes.'
+  - alert: TransactionQueueSizeHigh
+    expr: 'polkadot_sub_txpool_validations_scheduled -
+      polkadot_sub_txpool_validations_finished > 10000'
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      message: 'The transaction pool size on node {{ $labels.instance }} has
+        been above 10_000 for more than 5 minutes.'
+
+  ##############################################################################
+  # Networking
+  ##############################################################################
+
+  - alert: NumberOfPeersLow
+    expr: polkadot_sub_libp2p_peers_count < 3
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      message: 'The node {{ $labels.instance }} has less than 3 peers for more
+        than 3 minutes'
+  - alert: NumberOfPeersLow
+    expr: polkadot_sub_libp2p_peers_count < 3
+    for: 15m
+    labels:
+      severity: critical
+    annotations:
+      message: 'The node {{ $labels.instance }} has less than 3 peers for more
+        than 15 minutes'
+  - alert: NoIncomingConnection
+    expr: increase(polkadot_sub_libp2p_incoming_connections_total[20m]) == 0
+    labels:
+      severity: warning
+    annotations:
+      message: 'The node {{ $labels.instance }} has not received any new incoming
+        TCP connection in the past 20 minutes. Is it connected to the Internet?'
+
+  ##############################################################################
+  # System
+  ##############################################################################
+
+  - alert: NumberOfFileDescriptorsHigh
+    expr: 'node_filefd_allocated{domain=~"kusama|polkadot"} > 10000'
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      message: 'The node {{ $labels.instance }} has more than 10_000 file
+        descriptors allocated for more than 3 minutes'
+
+  ##############################################################################
+  # Others
+  ##############################################################################
+
+  - alert: ContinuousTaskEnded
+    expr: '(polkadot_tasks_spawned_total{task_name != "basic-authorship-proposer", task_name != "substrate-rpc-subscription"} == 1)
+      - on(instance, task_name) group_left() (polkadot_tasks_ended_total == 1)'
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      message: 'Continuous task {{ $labels.task_name }} on node
+        {{ $labels.instance }} ended unexpectedly.'
+
+  - alert: AuthorityDiscoveryDiscoveryFailureHigh
+    expr: 'polkadot_authority_discovery_handle_value_found_event_failure /
+      ignoring(name)
+      polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5'
+    for: 2h
+    labels:
+      severity: warning
+    annotations:
+      message: 'Authority discovery on node {{ $labels.instance }} fails to
+        process more than 50 % of the values found on the DHT for more than 2
+        hours.'
+
+  - alert: UnboundedChannelPersistentlyLarge
+    expr: '(
+      (polkadot_unbounded_channel_len{action = "send"} -
+      ignoring(action) polkadot_unbounded_channel_len{action = "received"})
+      or on(instance) polkadot_unbounded_channel_len{action = "send"}
+      ) >= 200'
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      message: 'Channel {{ $labels.entity }} on node {{ $labels.instance }} contains
+        more than 200 items for more than 5 minutes. Node might be frozen.'
+
+  - alert: UnboundedChannelVeryLarge
+    expr: '(
+      (polkadot_unbounded_channel_len{action = "send"} -
+      ignoring(action) polkadot_unbounded_channel_len{action = "received"})
+      or on(instance) polkadot_unbounded_channel_len{action = "send"}
+      ) > 15000'
+    labels:
+      severity: warning
+    annotations:
+      message: 'Channel {{ $labels.entity }} on node {{ $labels.instance }} contains more than
+        15000 items.'