Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions .maintain/alertmanager.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
route:
group_by: ['alertname']
group_wait: 30s
group_interval: 5m
repeat_interval: 1h
receiver: 'slack'
receivers:
- name: 'web.hook'
webhook_configs:
- url: 'http://127.0.0.1:5001/'
- name: 'slack'
slack_configs:
- api_url: "https://hooks.slack.com/services/REDACTED/REDACTED/REDACTED"  # NOTE(review): a live webhook URL was committed here — rotate it in Slack and load it from a secret/env var instead of VCS
channel: "#alert"
text: "{{ range .Alerts }} {{ .Annotations.description}}\n {{end}} {{ .CommonAnnotations.username}} <{{.CommonAnnotations.link}}| click here>"
title: "{{.CommonAnnotations.summary}}"
title_link: "{{.CommonAnnotations.link}}"
color: "{{.CommonAnnotations.color}}"
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
38 changes: 38 additions & 0 deletions .maintain/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets: ['localhost:9093']

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "rule_node.yml"
- "rule_polka.yaml"  # must match the actual file name (.maintain/rule_polka.yaml), otherwise Prometheus fails to load these rules

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']

# The job is for `node_exporter` https://github.com/prometheus/node_exporter with the prefix 'node_'
- job_name: 'node'
static_configs:
- targets: ['localhost:9100']

# The job is for `polka_substrate_node` with the prefix 'polkadot_'
- job_name: 'polka_node'
# Override the global default and scrape targets from this job every 5 seconds.
# ** NOTE: you want to have this *LESS THAN* the block time in order to ensure
# ** that you have a data point for every block!
scrape_interval: 5s
static_configs:
- targets: ['localhost:9615']
35 changes: 35 additions & 0 deletions .maintain/rule_node.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
groups:
- name: hostStatsAlert
rules:
- alert: InstanceDown
expr: up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Instance [{{ $labels.instance }}] down"
description: "[{{ $labels.instance }}] of job [{{ $labels.job }}] has been down for more than 1 minute."
- alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
for: 1m
labels:
severity: critical
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
181 changes: 181 additions & 0 deletions .maintain/rule_polka.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
groups:
- name: polkadot.rules
rules:

##############################################################################
# Block production
##############################################################################

- alert: BlockProductionSlow
annotations:
message: 'Best block on instance {{ $labels.instance }} increases by
less than 1 per minute for more than 3 minutes.'
expr: increase(polkadot_block_height{status="best"}[1m]) < 1
for: 3m
labels:
severity: warning
- alert: BlockProductionSlow
annotations:
message: 'Best block on instance {{ $labels.instance }} increases by
less than 1 per minute for more than 10 minutes.'
expr: increase(polkadot_block_height{status="best"}[1m]) < 1
for: 10m
labels:
severity: critical

##############################################################################
# Block finalization
##############################################################################

- alert: BlockFinalizationSlow
expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1
for: 3m
labels:
severity: warning
annotations:
message: 'Finalized block on instance {{ $labels.instance }} increases by
less than 1 per minute for more than 3 minutes.'
- alert: BlockFinalizationSlow
expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1
for: 10m
labels:
severity: critical
annotations:
message: 'Finalized block on instance {{ $labels.instance }} increases by
less than 1 per minute for more than 10 minutes.'
- alert: BlockFinalizationLaggingBehind
# Under the assumption of an average block production of 6 seconds,
# "best" and "finalized" being more than 10 blocks apart would imply
# more than a 1 minute delay between block production and finalization.
expr: '(polkadot_block_height{status="best"} - ignoring(status)
polkadot_block_height{status="finalized"}) > 10'
for: 8m
labels:
severity: critical
annotations:
message: "Block finalization on instance {{ $labels.instance }} is behind
block production by {{ $value }} for more than 8 minutes."

##############################################################################
# Transaction queue
##############################################################################

- alert: TransactionQueueSizeIncreasing
expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) -
increase(polkadot_sub_txpool_validations_finished[5m]) > 0'
for: 10m
labels:
severity: warning
annotations:
message: 'The transaction pool size on node {{ $labels.instance }} has
been monotonically increasing for more than 10 minutes.'
- alert: TransactionQueueSizeIncreasing
expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) -
increase(polkadot_sub_txpool_validations_finished[5m]) > 0'
for: 30m
labels:
severity: critical  # escalate at 30m, consistent with the other warning→critical pairs in this file
annotations:
message: 'The transaction pool size on node {{ $labels.instance }} has
been monotonically increasing for more than 30 minutes.'
- alert: TransactionQueueSizeHigh
expr: 'polkadot_sub_txpool_validations_scheduled -
polkadot_sub_txpool_validations_finished > 10000'
for: 5m
labels:
severity: warning
annotations:
message: 'The transaction pool size on node {{ $labels.instance }} has
been above 10_000 for more than 5 minutes.'

##############################################################################
# Networking
##############################################################################

- alert: NumberOfPeersLow
expr: polkadot_sub_libp2p_peers_count < 3
for: 3m
labels:
severity: warning
annotations:
message: 'The node {{ $labels.instance }} has less than 3 peers for more
than 3 minutes'
- alert: NumberOfPeersLow
expr: polkadot_sub_libp2p_peers_count < 3
for: 15m
labels:
severity: critical
annotations:
message: 'The node {{ $labels.instance }} has less than 3 peers for more
than 15 minutes'
- alert: NoIncomingConnection
expr: increase(polkadot_sub_libp2p_incoming_connections_total[20m]) == 0
labels:
severity: warning
annotations:
message: 'The node {{ $labels.instance }} has not received any new incoming
TCP connection in the past 20 minutes. Is it connected to the Internet?'

##############################################################################
# System
##############################################################################

- alert: NumberOfFileDescriptorsHigh
expr: 'node_filefd_allocated{domain=~"kusama|polkadot"} > 10000'
for: 3m
labels:
severity: warning
annotations:
message: 'The node {{ $labels.instance }} has more than 10_000 file
descriptors allocated for more than 3 minutes'

##############################################################################
# Others
##############################################################################

- alert: ContinuousTaskEnded
expr: '(polkadot_tasks_spawned_total{task_name != "basic-authorship-proposer", task_name != "substrate-rpc-subscription"} == 1)
- on(instance, task_name) group_left() (polkadot_tasks_ended_total == 1)'
for: 5m
labels:
severity: warning
annotations:
message: 'Continuous task {{ $labels.task_name }} on node
{{ $labels.instance }} ended unexpectedly.'

- alert: AuthorityDiscoveryDiscoveryFailureHigh
expr: 'polkadot_authority_discovery_handle_value_found_event_failure /
ignoring(name)
polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5'
for: 2h
labels:
severity: warning
annotations:
message: 'Authority discovery on node {{ $labels.instance }} fails to
process more than 50 % of the values found on the DHT for more than 2
hours.'

- alert: UnboundedChannelPersistentlyLarge
expr: '(
(polkadot_unbounded_channel_len{action = "send"} -
ignoring(action) polkadot_unbounded_channel_len{action = "received"})
or on(instance) polkadot_unbounded_channel_len{action = "send"}
) >= 200'
for: 5m
labels:
severity: warning
annotations:
message: 'Channel {{ $labels.entity }} on node {{ $labels.instance }} contains
more than 200 items for more than 5 minutes. Node might be frozen.'

- alert: UnboundedChannelVeryLarge
expr: '(
(polkadot_unbounded_channel_len{action = "send"} -
ignoring(action) polkadot_unbounded_channel_len{action = "received"})
or on(instance) polkadot_unbounded_channel_len{action = "send"}
) > 15000'
labels:
severity: warning
annotations:
message: 'Channel {{ $labels.entity }} on node {{ $labels.instance }} contains more than
15000 items.'