-
Notifications
You must be signed in to change notification settings - Fork 88
/
Copy pathrules.yml
160 lines (141 loc) · 5.75 KB
/
rules.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
---
# Prometheus alerting rules for SUSE HA clusters running SAP HANA.
# Metrics come from ha_cluster_exporter, hanadb_exporter and node_exporter.
groups:
# sap alerts
- name: sap-hana-resource-monitoring
  rules:
  # No active promoted (master) SAPHana resource is reported anywhere in
  # the cluster -> the primary HANA database is down.
  - alert: sap-hana-master-resource-down
    expr: absent(ha_cluster_pacemaker_resources{resource=~"rsc_SAPHana_.*",role="master",status="active"} == 1)
    labels:
      severity: page
    annotations:
      summary: Primary SAP-HANA resource down
  # No active demoted (slave) SAPHana resource -> the system-replication
  # secondary is not running.
  - alert: sap-hana-secondary-resource-absent
    expr: absent(ha_cluster_pacemaker_resources{resource=~"rsc_SAPHana_.*",role="slave",status="active"} == 1)
    labels:
      severity: page
    annotations:
      summary: Slave SAP-HANA resource absent
  # A HANA-internal alert with a rating above 3 was raised.
  # NOTE(review): presumably ratings follow HANA's 1-5 scale where >3 is
  # high/disaster severity -- confirm against hanadb_exporter documentation.
  - alert: sap-hana-internal-alerts
    expr: hanadb_alerts_current_rating > 3
    labels:
      severity: page
    annotations:
      summary: "HANA Internal alert raised for SID {{ $labels.sid }} InsNr {{ $labels.insnr }} DBName {{ $labels.database_name }}"
      description: "Alert Details: {{ $labels.alert_details }} User Action: {{ $labels.alert_useraction }}"
# ha cluster alerts
- name: cluster-resources-monitoring
  rules:
  # At least one Pacemaker resource reports status "failed".
  - alert: cluster-resources-a-resource-failed
    expr: count(ha_cluster_pacemaker_resources{status="failed"} == 1) > 0
    labels:
      severity: page
    annotations:
      summary: A cluster resource failed
# failcount exceeds migration threshold (example on saphana specific resource)
- name: resource-failcount-higher-threshold
  rules:
  # A resource's fail count passed its migration-threshold, so Pacemaker
  # will move the resource away from its current node.
  - alert: resource-failcount-higher-threshold
    expr: count(ha_cluster_pacemaker_fail_count > ha_cluster_pacemaker_migration_threshold) > 0
    labels:
      severity: page
    annotations:
      summary: a resource fail count exceeded its migration threshold
# drbd alerts
- name: drbd-alerts
  rules:
  # One or more DRBD connections report a degraded peer disk state.
  - alert: drbd-connections-in-a-bad-state
    expr: count(ha_cluster_drbd_connections{peer_disk_state=~"inconsistent|outdated|dunknown|failed"}) > 0
    labels:
      severity: page
    annotations:
      # FIX: "is an bad state" -> "is in a bad state".
      summary: a drbd connection is in a bad state inconsistent|outdated|dunknown|failed
  # A DRBD resynchronization has dropped below 90 percent completion.
  - alert: drbd-sync-connections-percentage-lower-than-expected
    expr: ha_cluster_drbd_connections_sync < 90
    labels:
      severity: page
    annotations:
      summary: a drbd disk sync is lower than 90 percent!
  # One or more DRBD resources report a degraded peer disk state.
  - alert: drbd-resource-in-a-bad-state
    expr: count(ha_cluster_drbd_resources{peer_disk_state=~"inconsistent|outdated|dunknown|failed"}) > 0
    labels:
      severity: page
    annotations:
      # FIX: "is an bad state" -> "is in a bad state".
      summary: a drbd resource is in a bad state inconsistent|outdated|dunknown|failed
# sbd alerts
- name: sbd-device-alerts
  rules:
  # At least one SBD (fencing watchdog) device reports status "unhealthy".
  - alert: a-sbd-device-unhealthy
    expr: count(ha_cluster_sbd_devices{status="unhealthy"} == 1) > 0
    labels:
      severity: page
    annotations:
      summary: An SBD device in the HA cluster is unhealthy
# systemd services alerts
# NOTE(review): besides the service checks, this group also carries the
# filesystem-usage, STONITH and location-constraint alerts -- consider
# moving those to their own groups.
- name: systemd-services-monitoring
  rules:
  # Each service-down alert fires when the unit's "active" state is 0.
  - alert: service-down-pacemaker
    expr: node_systemd_unit_state{name="pacemaker.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: Pacemaker service not running
  - alert: service-down-corosync
    expr: node_systemd_unit_state{name="corosync.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: Corosync service not running
  - alert: service-down-sbd
    expr: node_systemd_unit_state{name="sbd.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: SBD service not running
  - alert: service-down-hawk
    expr: node_systemd_unit_state{name="hawk.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: Hawk service not running
  - alert: service-down-hawk-backend
    expr: node_systemd_unit_state{name="hawk-backend.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: Hawk backend service not running
  - alert: service-down-node-exporter
    expr: node_systemd_unit_state{name="prometheus-node_exporter.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: Node exporter service not running
  - alert: service-down-ha-cluster-exporter
    expr: node_systemd_unit_state{name="prometheus-ha_cluster_exporter.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: HA Cluster Exporter service not running
  # hanadb_exporter is a templated unit (one instance per SID).
  # FIX: regex was "hanadb_exporter@.*.service" -- the unescaped "." matched
  # any character and ".*" allowed an empty instance name; ".+\\.service"
  # requires a non-empty instance and a literal ".service" suffix.
  - alert: service-down-hanadb-exporter
    expr: node_systemd_unit_state{name=~"hanadb_exporter@.+\\.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: HANA exporter service not running
  # Percentage used = (size - available) / size; tmpfs mounts are excluded.
  - alert: node-filesystem-space-low
    expr: ((node_filesystem_size_bytes{fstype!="tmpfs"} - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100) > 85
    labels:
      severity: page
    annotations:
      summary: Node filesystem space usage is higher than 85%
  # Fencing must always be enabled in a supported HA cluster.
  - alert: stonith-disabled
    expr: ha_cluster_pacemaker_stonith_enabled == 0
    labels:
      severity: page
    annotations:
      summary: STONITH is disabled! Clusters without a fencing mechanism are not supported and have increased risk of data loss.
  # A location constraint with a negative score bans a resource from a node.
  - alert: negative-location-constraint-detected
    expr: ha_cluster_pacemaker_location_constraints < 0
    labels:
      severity: warning
    annotations:
      summary: A negative resource location constraint has been detected. Please ensure that no resource has been mistakenly banned from a node.