Skip to content

Commit 1537787

Browse files
committed
Add some examples of per-queue alerts/dashboards
1 parent 5b5ca6c commit 1537787

File tree

5 files changed

+361
-0
lines changed

5 files changed

+361
-0
lines changed
Lines changed: 278 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,278 @@
---
# Grafana dashboard "RabbitMQ-Queue": plots, for a single queue, the number of
# messages (left axis) and the number of consumers (right axis).
#
# Metrics used: rabbitmq_detailed_queue_messages and
# rabbitmq_detailed_queue_consumers, joined with rabbitmq_identity_info on
# (instance, job) to restrict the series to the selected RabbitMQ cluster.
#
# Template variables (evaluated in this order, each narrowing the next):
#   DS_PROMETHEUS    - hidden datasource picker (type "prometheus")
#   namespace        - Kubernetes namespace of the RabbitMQ cluster
#   rabbitmq_cluster - cluster name within $namespace
#   queue            - queue name within $namespace / $rabbitmq_cluster
#
# Every query and variable uses ${DS_PROMETHEUS}, so the dashboard does not
# depend on which datasource happens to be Grafana's default.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: kube-prometheus
  name: rabbitmq-queue-grafana-dashboard
  labels:
    # NOTE(review): presumably the label a Grafana dashboard sidecar watches
    # for in order to load this ConfigMap - confirm against your Grafana setup.
    grafana_dashboard: "1"
data:
  rabbitmq-queue-grafana-dashboard.json: |-
    {
      "annotations": {
        "list": [
          {
            "builtIn": 1,
            "datasource": "-- Grafana --",
            "enable": true,
            "hide": true,
            "iconColor": "rgba(0, 211, 255, 1)",
            "name": "Annotations & Alerts",
            "type": "dashboard"
          }
        ]
      },
      "editable": true,
      "gnetId": null,
      "graphTooltip": 0,
      "id": 31,
      "iteration": 1633436610573,
      "links": [],
      "panels": [
        {
          "datasource": "${DS_PROMETHEUS}",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisLabel": "Messages",
                "axisPlacement": "left",
                "axisSoftMin": 0,
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 0,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "auto",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              }
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Consumers"
                },
                "properties": [
                  {
                    "id": "custom.axisPlacement",
                    "value": "right"
                  },
                  {
                    "id": "unit",
                    "value": "prefix:"
                  },
                  {
                    "id": "custom.axisLabel",
                    "value": "Consumers"
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Messages"
                },
                "properties": [
                  {
                    "id": "custom.drawStyle",
                    "value": "line"
                  },
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 17,
            "w": 11,
            "x": 0,
            "y": 0
          },
          "id": 2,
          "options": {
            "legend": {
              "calcs": [],
              "displayMode": "list",
              "placement": "bottom"
            },
            "tooltip": {
              "mode": "single"
            }
          },
          "targets": [
            {
              "exemplar": true,
              "expr": "(rabbitmq_detailed_queue_messages{namespace=\"$namespace\", queue=\"$queue\"} * on (instance, job) rabbitmq_identity_info{namespace=\"$namespace\",rabbitmq_cluster=\"$rabbitmq_cluster\"})",
              "interval": "",
              "legendFormat": "Messages",
              "refId": "A"
            },
            {
              "exemplar": true,
              "expr": "rabbitmq_detailed_queue_consumers{namespace=\"$namespace\", queue=\"$queue\"} * on (instance, job) rabbitmq_identity_info{namespace=\"$namespace\",rabbitmq_cluster=\"$rabbitmq_cluster\"}",
              "hide": false,
              "interval": "",
              "legendFormat": "Consumers",
              "refId": "B"
            }
          ],
          "title": "Queue messages and consumers",
          "type": "timeseries"
        }
      ],
      "refresh": false,
      "schemaVersion": 30,
      "style": "dark",
      "tags": [],
      "templating": {
        "list": [
          {
            "current": {
              "selected": false,
              "text": "Prometheus",
              "value": "Prometheus"
            },
            "description": null,
            "error": null,
            "hide": 2,
            "includeAll": false,
            "label": "datasource",
            "multi": false,
            "name": "DS_PROMETHEUS",
            "options": [],
            "query": "prometheus",
            "refresh": 1,
            "regex": "",
            "skipUrlSync": false,
            "type": "datasource"
          },
          {
            "allValue": null,
            "current": {
              "selected": false,
              "text": "opportunity-92",
              "value": "opportunity-92"
            },
            "datasource": "${DS_PROMETHEUS}",
            "definition": "label_values(rabbitmq_identity_info, namespace)",
            "description": null,
            "error": null,
            "hide": 0,
            "includeAll": false,
            "label": "Namespace",
            "multi": false,
            "name": "namespace",
            "options": [],
            "query": {
              "query": "label_values(rabbitmq_identity_info, namespace)",
              "refId": "StandardVariableQuery"
            },
            "refresh": 2,
            "regex": "",
            "skipUrlSync": false,
            "sort": 1,
            "type": "query"
          },
          {
            "allValue": null,
            "current": {
              "selected": false,
              "text": "rabbitmq",
              "value": "rabbitmq"
            },
            "datasource": "${DS_PROMETHEUS}",
            "definition": "label_values(rabbitmq_identity_info{namespace=\"$namespace\"}, rabbitmq_cluster)",
            "description": null,
            "error": null,
            "hide": 0,
            "includeAll": false,
            "label": "RabbitMQ Cluster",
            "multi": false,
            "name": "rabbitmq_cluster",
            "options": [],
            "query": {
              "query": "label_values(rabbitmq_identity_info{namespace=\"$namespace\"}, rabbitmq_cluster)",
              "refId": "StandardVariableQuery"
            },
            "refresh": 2,
            "regex": "",
            "skipUrlSync": false,
            "sort": 1,
            "type": "query"
          },
          {
            "allValue": null,
            "current": {
              "selected": false,
              "text": "perf-test-1",
              "value": "perf-test-1"
            },
            "datasource": "${DS_PROMETHEUS}",
            "definition": "query_result(rabbitmq_detailed_queue_messages{namespace=\"$namespace\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\", rabbitmq_cluster=\"$rabbitmq_cluster\"})",
            "description": null,
            "error": null,
            "hide": 0,
            "includeAll": false,
            "label": "Queue",
            "multi": false,
            "name": "queue",
            "options": [],
            "query": {
              "query": "query_result(rabbitmq_detailed_queue_messages{namespace=\"$namespace\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\", rabbitmq_cluster=\"$rabbitmq_cluster\"})",
              "refId": "StandardVariableQuery"
            },
            "refresh": 2,
            "regex": "/.*queue=\"([^\"]+)\".*/",
            "skipUrlSync": false,
            "sort": 0,
            "type": "query"
          }
        ]
      },
      "time": {
        "from": "now-6h",
        "to": "now"
      },
      "timepicker": {},
      "timezone": "",
      "title": "RabbitMQ-Queue",
      "uid": "j9t8vwH7k",
      "version": 10
    }

observability/prometheus/monitors/rabbitmq-servicemonitor.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,30 @@ spec:
1616
scrapeTimeout: 14s
1717
tlsConfig:
1818
insecureSkipVerify: true
19+
- port: prometheus
20+
scheme: http
21+
path: /metrics/detailed
22+
params:
23+
per-object:
24+
- "true"
25+
family:
26+
- queue_coarse_metrics
27+
- queue_consumer_count
28+
interval: 15s
29+
scrapeTimeout: 14s
30+
- port: prometheus-tls
31+
scheme: https
32+
path: /metrics/detailed
33+
params:
34+
per-object:
35+
- "true"
36+
family:
37+
- queue_coarse_metrics
38+
- queue_consumer_count
39+
interval: 15s
40+
scrapeTimeout: 14s
41+
tlsConfig:
42+
insecureSkipVerify: true
1943
selector:
2044
matchLabels:
2145
app.kubernetes.io/component: rabbitmq
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# RabbitMQ per-object rules
2+
3+
RabbitMQ >= 3.9.7 is required for these alerts to function.
4+
5+
They are also highly opinionated and will probably require some tuning before being applied, e.g. filtering by specific queue names.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
---
# Alert: a queue holds messages but has had no consumers for 10 minutes.
#
# Relies on the per-object metrics rabbitmq_detailed_queue_consumers and
# rabbitmq_detailed_queue_messages, and on rabbitmq_identity_info for the
# `rabbitmq_cluster` label used in the alert description.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: rabbitmq-queue-has-no-consumers
  # If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
  labels:
    role: alert-rules
spec:
  groups:
  - name: rabbitmq
    rules:
    - alert: QueueHasNoConsumers
      # Inner expression: keep only queues whose consumer count is 0, add the
      # message count (0 + messages), and keep the result when it is > 0, so
      # $value is the number of messages in the queue.
      # Adjust the `vhost` / `queue` matchers to target specific queues.
      #
      # The join with rabbitmq_identity_info (whose sample value is expected
      # to be 1, so $value is unchanged) copies the `rabbitmq_cluster` label
      # onto the result; without it `{{ $labels.rabbitmq_cluster }}` in the
      # description would render empty.
      expr: |
        (
          ((rabbitmq_detailed_queue_consumers{vhost="/", queue=~".*"} == 0) + rabbitmq_detailed_queue_messages) > 0
        ) * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info
      for: 10m
      annotations:
        description: |
          Over the last 10 minutes, non-empty queue `{{ $labels.queue }}` with {{ $value }} messages
          in virtual host `{{ $labels.vhost }}` didn't have any consumers in
          RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` in namespace `{{ $labels.namespace }}`.
        summary: |
          Messages are sitting idle in the queue, without any processing.
          This alert is highly application specific (and e.g. doesn't make sense for stream queues).
      labels:
        rulesgroup: rabbitmq
        severity: warning
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
---
2+
apiVersion: monitoring.coreos.com/v1
3+
kind: PrometheusRule
4+
metadata:
5+
name: rabbitmq-queue-is-growing
6+
# If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
7+
labels:
8+
role: alert-rules
9+
spec:
10+
groups:
11+
- name: rabbitmq
12+
rules:
13+
- alert: QueueIsGrowing
14+
# `> 1` because of floating point rounding errors
15+
expr: |
16+
avg_over_time(rabbitmq_detailed_queue_messages[10m]) - avg_over_time(rabbitmq_detailed_queue_messages[10m] offset 1m) > 1
17+
for: 10m
18+
annotations:
19+
description: |
20+
Over the last 10 minutes, queue `{{ $labels.queue }}` in virtual host `{{ $labels.vhost }}`
21+
was growing. 10 minute moving average has grown by {{ $value }}.
22+
This happens in RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` in namespace `{{ $labels.namespace }}`.
23+
summary: |
24+
Queue size is steadily growing over time.
25+
labels:
26+
rulesgroup: rabbitmq
27+
severity: warning

0 commit comments

Comments
 (0)