-
Notifications
You must be signed in to change notification settings - Fork 88
/
Copy pathrules.yml
160 lines (141 loc) · 5.75 KB
/
rules.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
---
# Prometheus alerting rules for SUSE HA clusters running SAP HANA.
# Metrics come from ha_cluster_exporter, hanadb_exporter and node_exporter.
groups:
# sap alerts
- name: sap-hana-resource-monitoring
  rules:
  # No active promoted (master) SAPHana resource is reported anywhere in
  # the cluster -> the primary HANA database is down.
  - alert: sap-hana-master-resource-down
    expr: absent(ha_cluster_pacemaker_resources{resource=~"rsc_SAPHana_.*",role="master",status="active"} == 1)
    labels:
      severity: page
    annotations:
      summary: Primary SAP-HANA resource down
  # No active demoted (slave) SAPHana resource -> the system-replication
  # secondary is not running.
  - alert: sap-hana-secondary-resource-absent
    expr: absent(ha_cluster_pacemaker_resources{resource=~"rsc_SAPHana_.*",role="slave",status="active"} == 1)
    labels:
      severity: page
    annotations:
      summary: Slave SAP-HANA resource absent
  # A HANA-internal alert with a rating above 3 was raised.
  # NOTE(review): presumably ratings follow HANA's 1-5 scale where >3 is
  # high/disaster severity -- confirm against hanadb_exporter documentation.
  - alert: sap-hana-internal-alerts
    expr: hanadb_alerts_current_rating > 3
    labels:
      severity: page
    annotations:
      summary: "HANA Internal alert raised for SID {{ $labels.sid }} InsNr {{ $labels.insnr }} DBName {{ $labels.database_name }}"
      description: "Alert Details: {{ $labels.alert_details }} User Action: {{ $labels.alert_useraction }}"
# ha cluster alerts
- name: cluster-resources-monitoring
  rules:
  # At least one Pacemaker resource reports status "failed".
  - alert: cluster-resources-a-resource-failed
    expr: count(ha_cluster_pacemaker_resources{status="failed"} == 1) > 0
    labels:
      severity: page
    annotations:
      summary: A cluster resource failed
# failcount exceeds migration threshold (example on saphana specific resource)
- name: resource-failcount-higher-threshold
  rules:
  # A resource's fail count passed its migration-threshold, so Pacemaker
  # will move the resource away from its current node.
  - alert: resource-failcount-higher-threshold
    expr: count(ha_cluster_pacemaker_fail_count > ha_cluster_pacemaker_migration_threshold) > 0
    labels:
      severity: page
    annotations:
      summary: a resource fail count exceeded its migration threshold
# drbd alerts
- name: drbd-alerts
  rules:
  # One or more DRBD connections report a degraded peer disk state.
  - alert: drbd-connections-in-a-bad-state
    expr: count(ha_cluster_drbd_connections{peer_disk_state=~"inconsistent|outdated|dunknown|failed"}) > 0
    labels:
      severity: page
    annotations:
      # FIX: "is an bad state" -> "is in a bad state".
      summary: a drbd connection is in a bad state inconsistent|outdated|dunknown|failed
  # A DRBD resynchronization has dropped below 90 percent completion.
  - alert: drbd-sync-connections-percentage-lower-than-expected
    expr: ha_cluster_drbd_connections_sync < 90
    labels:
      severity: page
    annotations:
      summary: a drbd disk sync is lower than 90 percent!
  # One or more DRBD resources report a degraded peer disk state.
  - alert: drbd-resource-in-a-bad-state
    expr: count(ha_cluster_drbd_resources{peer_disk_state=~"inconsistent|outdated|dunknown|failed"}) > 0
    labels:
      severity: page
    annotations:
      # FIX: "is an bad state" -> "is in a bad state".
      summary: a drbd resource is in a bad state inconsistent|outdated|dunknown|failed
# sbd alerts
- name: sbd-device-alerts
  rules:
  # At least one SBD (fencing watchdog) device reports status "unhealthy".
  - alert: a-sbd-device-unhealthy
    expr: count(ha_cluster_sbd_devices{status="unhealthy"} == 1) > 0
    labels:
      severity: page
    annotations:
      summary: An SBD device in the HA cluster is unhealthy
# systemd services alerts
# NOTE(review): besides the service checks, this group also carries the
# filesystem-usage, STONITH and location-constraint alerts -- consider
# moving those to their own groups.
- name: systemd-services-monitoring
  rules:
  # Each service-down alert fires when the unit's "active" state is 0.
  - alert: service-down-pacemaker
    expr: node_systemd_unit_state{name="pacemaker.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: Pacemaker service not running
  - alert: service-down-corosync
    expr: node_systemd_unit_state{name="corosync.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: Corosync service not running
  - alert: service-down-sbd
    expr: node_systemd_unit_state{name="sbd.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: SBD service not running
  - alert: service-down-hawk
    expr: node_systemd_unit_state{name="hawk.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: Hawk service not running
  - alert: service-down-hawk-backend
    expr: node_systemd_unit_state{name="hawk-backend.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: Hawk backend service not running
  - alert: service-down-node-exporter
    expr: node_systemd_unit_state{name="prometheus-node_exporter.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: Node exporter service not running
  - alert: service-down-ha-cluster-exporter
    expr: node_systemd_unit_state{name="prometheus-ha_cluster_exporter.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: HA Cluster Exporter service not running
  # hanadb_exporter is a templated unit (one instance per SID).
  # FIX: regex was "hanadb_exporter@.*.service" -- the unescaped "." matched
  # any character and ".*" allowed an empty instance name; ".+\\.service"
  # requires a non-empty instance and a literal ".service" suffix.
  - alert: service-down-hanadb-exporter
    expr: node_systemd_unit_state{name=~"hanadb_exporter@.+\\.service", state="active"} == 0
    labels:
      severity: page
    annotations:
      summary: HANA exporter service not running
  # Percentage used = (size - available) / size; tmpfs mounts are excluded.
  - alert: node-filesystem-space-low
    expr: ((node_filesystem_size_bytes{fstype!="tmpfs"} - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100) > 85
    labels:
      severity: page
    annotations:
      summary: Node filesystem space usage is higher than 85%
  # Fencing must always be enabled in a supported HA cluster.
  - alert: stonith-disabled
    expr: ha_cluster_pacemaker_stonith_enabled == 0
    labels:
      severity: page
    annotations:
      summary: STONITH is disabled! Clusters without a fencing mechanism are not supported and have increased risk of data loss.
  # A location constraint with a negative score bans a resource from a node.
  - alert: negative-location-constraint-detected
    expr: ha_cluster_pacemaker_location_constraints < 0
    labels:
      severity: warning
    annotations:
      summary: A negative resource location constraint has been detected. Please ensure that no resource has been mistakenly banned from a node.