Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
277 changes: 277 additions & 0 deletions observability/grafana/dashboards/rabbitmq-queue.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
---
# Grafana dashboard "RabbitMQ-Queue": message and consumer counts for a single
# queue, selected through the namespace / rabbitmq_cluster / queue variables.
#
# The dashboard JSON lives in a block scalar, so it cannot carry comments itself.
# Notes on the JSON below:
#   * Every panel and query variable references the hidden DS_PROMETHEUS
#     datasource variable instead of relying on the Grafana default datasource.
#   * "id" is null: the numeric id is instance-specific and must not be pinned
#     in an exported dashboard; "uid" is the stable identifier.
#   * The queue variable is filtered by both $namespace and $rabbitmq_cluster so
#     it only lists queues of the selected cluster.
#   * Panel queries join rabbitmq_identity_info with group_left so the join stays
#     valid when one instance exposes several series for the same queue name
#     (e.g. the same name in different vhosts).
apiVersion: v1
kind: ConfigMap
metadata:
  name: rabbitmq-queue-grafana-dashboard
  # NOTE(review): this label is presumably what the Grafana dashboard sidecar
  # selects on - confirm against your Grafana deployment.
  labels:
    grafana_dashboard: "1"
data:
  rabbitmq-queue-grafana-dashboard.json: |-
    {
      "annotations": {
        "list": [
          {
            "builtIn": 1,
            "datasource": "-- Grafana --",
            "enable": true,
            "hide": true,
            "iconColor": "rgba(0, 211, 255, 1)",
            "name": "Annotations & Alerts",
            "type": "dashboard"
          }
        ]
      },
      "editable": true,
      "gnetId": null,
      "graphTooltip": 0,
      "id": null,
      "iteration": 1633436610573,
      "links": [],
      "panels": [
        {
          "datasource": "${DS_PROMETHEUS}",
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "custom": {
                "axisLabel": "Messages",
                "axisPlacement": "left",
                "axisSoftMin": 0,
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 0,
                "gradientMode": "none",
                "hideFrom": {
                  "legend": false,
                  "tooltip": false,
                  "viz": false
                },
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": {
                  "type": "linear"
                },
                "showPoints": "auto",
                "spanNulls": false,
                "stacking": {
                  "group": "A",
                  "mode": "none"
                },
                "thresholdsStyle": {
                  "mode": "off"
                }
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "red",
                    "value": 80
                  }
                ]
              }
            },
            "overrides": [
              {
                "matcher": {
                  "id": "byName",
                  "options": "Consumers"
                },
                "properties": [
                  {
                    "id": "custom.axisPlacement",
                    "value": "right"
                  },
                  {
                    "id": "unit",
                    "value": "prefix:"
                  },
                  {
                    "id": "custom.axisLabel",
                    "value": "Consumers"
                  }
                ]
              },
              {
                "matcher": {
                  "id": "byName",
                  "options": "Messages"
                },
                "properties": [
                  {
                    "id": "custom.drawStyle",
                    "value": "line"
                  },
                  {
                    "id": "custom.fillOpacity",
                    "value": 0
                  }
                ]
              }
            ]
          },
          "gridPos": {
            "h": 17,
            "w": 11,
            "x": 0,
            "y": 0
          },
          "id": 2,
          "options": {
            "legend": {
              "calcs": [],
              "displayMode": "list",
              "placement": "bottom"
            },
            "tooltip": {
              "mode": "single"
            }
          },
          "targets": [
            {
              "exemplar": true,
              "expr": "(rabbitmq_detailed_queue_messages{namespace=\"$namespace\", queue=\"$queue\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\",rabbitmq_cluster=\"$rabbitmq_cluster\"})",
              "interval": "",
              "legendFormat": "Messages",
              "refId": "A"
            },
            {
              "exemplar": true,
              "expr": "rabbitmq_detailed_queue_consumers{namespace=\"$namespace\", queue=\"$queue\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\",rabbitmq_cluster=\"$rabbitmq_cluster\"}",
              "hide": false,
              "interval": "",
              "legendFormat": "Consumers",
              "refId": "B"
            }
          ],
          "title": "Queue messages and consumers",
          "type": "timeseries"
        }
      ],
      "refresh": false,
      "schemaVersion": 30,
      "style": "dark",
      "tags": [],
      "templating": {
        "list": [
          {
            "current": {
              "selected": false,
              "text": "Prometheus",
              "value": "Prometheus"
            },
            "description": null,
            "error": null,
            "hide": 2,
            "includeAll": false,
            "label": "datasource",
            "multi": false,
            "name": "DS_PROMETHEUS",
            "options": [],
            "query": "prometheus",
            "refresh": 1,
            "regex": "",
            "skipUrlSync": false,
            "type": "datasource"
          },
          {
            "allValue": null,
            "current": {
              "selected": false,
              "text": "opportunity-92",
              "value": "opportunity-92"
            },
            "datasource": "${DS_PROMETHEUS}",
            "definition": "label_values(rabbitmq_identity_info, namespace)",
            "description": null,
            "error": null,
            "hide": 0,
            "includeAll": false,
            "label": "Namespace",
            "multi": false,
            "name": "namespace",
            "options": [],
            "query": {
              "query": "label_values(rabbitmq_identity_info, namespace)",
              "refId": "StandardVariableQuery"
            },
            "refresh": 2,
            "regex": "",
            "skipUrlSync": false,
            "sort": 1,
            "type": "query"
          },
          {
            "allValue": null,
            "current": {
              "selected": false,
              "text": "rabbitmq",
              "value": "rabbitmq"
            },
            "datasource": "${DS_PROMETHEUS}",
            "definition": "label_values(rabbitmq_identity_info{namespace=\"$namespace\"}, rabbitmq_cluster)",
            "description": null,
            "error": null,
            "hide": 0,
            "includeAll": false,
            "label": "RabbitMQ Cluster",
            "multi": false,
            "name": "rabbitmq_cluster",
            "options": [],
            "query": {
              "query": "label_values(rabbitmq_identity_info{namespace=\"$namespace\"}, rabbitmq_cluster)",
              "refId": "StandardVariableQuery"
            },
            "refresh": 2,
            "regex": "",
            "skipUrlSync": false,
            "sort": 1,
            "type": "query"
          },
          {
            "allValue": null,
            "current": {
              "selected": false,
              "text": "perf-test-1",
              "value": "perf-test-1"
            },
            "datasource": "${DS_PROMETHEUS}",
            "definition": "query_result(rabbitmq_detailed_queue_messages{namespace=\"$namespace\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\", rabbitmq_cluster=\"$rabbitmq_cluster\"})",
            "description": null,
            "error": null,
            "hide": 0,
            "includeAll": false,
            "label": "Queue",
            "multi": false,
            "name": "queue",
            "options": [],
            "query": {
              "query": "query_result(rabbitmq_detailed_queue_messages{namespace=\"$namespace\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\", rabbitmq_cluster=\"$rabbitmq_cluster\"})",
              "refId": "StandardVariableQuery"
            },
            "refresh": 2,
            "regex": "/.*queue=\"([^\"]+)\".*/",
            "skipUrlSync": false,
            "sort": 0,
            "type": "query"
          }
        ]
      },
      "time": {
        "from": "now-6h",
        "to": "now"
      },
      "timepicker": {},
      "timezone": "",
      "title": "RabbitMQ-Queue",
      "uid": "j9t8vwH7k",
      "version": 10
    }
20 changes: 20 additions & 0 deletions observability/prometheus/monitors/rabbitmq-servicemonitor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,26 @@ spec:
scrapeTimeout: 14s
tlsConfig:
insecureSkipVerify: true
# Per-queue ("detailed") metrics over plain HTTP on the `prometheus` port.
# Scrapes /metrics/detailed and passes two `family` URL parameters so that only
# the queue_coarse_metrics and queue_metrics families are requested.
- port: prometheus
scheme: http
path: /metrics/detailed
params:
family:
- queue_coarse_metrics
- queue_metrics
# Timeout is kept just below the scrape interval.
interval: 15s
scrapeTimeout: 14s
# Per-queue ("detailed") metrics over HTTPS on the `prometheus-tls` port.
# Same path and `family` parameters as the plain-HTTP detailed endpoint.
- port: prometheus-tls
scheme: https
path: /metrics/detailed
params:
family:
- queue_coarse_metrics
- queue_metrics
# Timeout is kept just below the scrape interval.
interval: 15s
scrapeTimeout: 14s
# NOTE(review): server certificate verification is skipped here; confirm this
# is acceptable for your environment.
tlsConfig:
insecureSkipVerify: true
selector:
matchLabels:
app.kubernetes.io/component: rabbitmq
Expand Down
5 changes: 5 additions & 0 deletions observability/prometheus/rules/rabbitmq-per-object/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# RabbitMQ per-object rules

RabbitMQ >= 3.9.7 is required for these alerts to function.

They are also highly opinionated and will probably require some tuning before being applied, e.g. filtering by specific queue names.
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
# Alert: a non-empty queue has had no consumers for 10 minutes.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: rabbitmq-queue-has-no-consumers
  # If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
  labels:
    role: alert-rules
spec:
  groups:
    - name: rabbitmq
      rules:
        - alert: QueueHasNoConsumers
          # `consumers == 0` keeps only consumer-less queues (sample value 0), so
          # adding the message count yields the queue depth; `> 0` then drops
          # empty queues.
          # The join with rabbitmq_identity_info attaches the `rabbitmq_cluster`
          # label used in the description. It is an info-style metric (expected
          # constant value 1), so the multiplication leaves {{ $value }} intact.
          # Tune the `vhost` / `queue` matchers to the queues you care about.
          expr: |
            (
              ((rabbitmq_detailed_queue_consumers{vhost="/", queue=~".*"} == 0) + rabbitmq_detailed_queue_messages) > 0
            ) * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info
          for: 10m
          annotations:
            description: |
              Over the last 10 minutes, non-empty queue `{{ $labels.queue }}` with {{ $value }} messages
              in virtual host `{{ $labels.vhost }}` didn't have any consumers in
              RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` in namespace `{{ $labels.namespace }}`.
            summary: |
              Messages are sitting idle in the queue, without any processing.
              This alert is highly application specific (and e.g. doesn't make sense for stream queues).
          labels:
            rulesgroup: rabbitmq
            severity: warning
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
# Alert: the 10-minute moving average of a queue's message count keeps rising.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: rabbitmq-queue-is-growing
  # If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
  labels:
    role: alert-rules
spec:
  groups:
    - name: rabbitmq
      rules:
        - alert: QueueIsGrowing
          # Compares the current 10m moving average with the same average one
          # minute earlier. The threshold is `> 1` rather than `> 0` because of
          # floating point rounding errors.
          # The join with rabbitmq_identity_info attaches the `rabbitmq_cluster`
          # label used in the description. It is an info-style metric (expected
          # constant value 1), so the multiplication leaves {{ $value }} intact.
          expr: |
            (
              avg_over_time(rabbitmq_detailed_queue_messages[10m]) - avg_over_time(rabbitmq_detailed_queue_messages[10m] offset 1m) > 1
            ) * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info
          for: 10m
          annotations:
            description: |
              Over the last 10 minutes, queue `{{ $labels.queue }}` in virtual host `{{ $labels.vhost }}`
              was growing. 10 minute moving average has grown by {{ $value }}.
              This happens in RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` in namespace `{{ $labels.namespace }}`.
            summary: |
              Queue size is steadily growing over time.
          labels:
            rulesgroup: rabbitmq
            severity: warning