From e111f80d7fdaac0373642ba60d23f9aecc260896 Mon Sep 17 00:00:00 2001 From: Alexey Lebedeff Date: Tue, 5 Oct 2021 16:34:54 +0200 Subject: [PATCH 1/5] Add some examples of per-queue alerts/dashboards --- .../grafana/dashboards/rabbitmq-queue.yml | 277 ++++++++++++++++++ .../monitors/rabbitmq-servicemonitor.yml | 20 ++ .../rules/rabbitmq-per-object/README.md | 5 + .../queue-has-no-consumers.yml | 27 ++ .../rabbitmq-per-object/queue-is-growing.yml | 27 ++ 5 files changed, 356 insertions(+) create mode 100644 observability/grafana/dashboards/rabbitmq-queue.yml create mode 100644 observability/prometheus/rules/rabbitmq-per-object/README.md create mode 100644 observability/prometheus/rules/rabbitmq-per-object/queue-has-no-consumers.yml create mode 100644 observability/prometheus/rules/rabbitmq-per-object/queue-is-growing.yml diff --git a/observability/grafana/dashboards/rabbitmq-queue.yml b/observability/grafana/dashboards/rabbitmq-queue.yml new file mode 100644 index 000000000..1af7f37c5 --- /dev/null +++ b/observability/grafana/dashboards/rabbitmq-queue.yml @@ -0,0 +1,277 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: rabbitmq-queue-grafana-dashboard + labels: + grafana_dashboard: "1" +data: + rabbitmq-queue-grafana-dashboard.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 31, + "iteration": 1633436610573, + "links": [], + "panels": [ + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "Messages", + "axisPlacement": "left", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Consumers" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + }, + { + "id": "unit", + "value": "prefix:" + }, + { + "id": "custom.axisLabel", + "value": "Consumers" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Messages" + }, + "properties": [ + { + "id": "custom.drawStyle", + "value": "line" + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 17, + "w": 11, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "(rabbitmq_detailed_queue_messages{namespace=\"$namespace\", queue=\"$queue\"} * on (instance, job) rabbitmq_identity_info{namespace=\"$namespace\",rabbitmq_cluster=\"$rabbitmq_cluster\"})", + "interval": "", + "legendFormat": "Messages", + "refId": "A" + }, + { + "exemplar": true, + "expr": "rabbitmq_detailed_queue_consumers{namespace=\"$namespace\", queue=\"$queue\"} * on (instance, job) rabbitmq_identity_info{namespace=\"$namespace\",rabbitmq_cluster=\"$rabbitmq_cluster\"}", + "hide": false, + "interval": "", + "legendFormat": "Consumers", + "refId": "B" + } + ], + "title": "Queue messages and consumers", + "type": "timeseries" + } + ], + "refresh": false, + "schemaVersion": 30, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "description": null, + "error": null, + "hide": 2, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "opportunity-92", + "value": "opportunity-92" + }, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(rabbitmq_identity_info, namespace)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(rabbitmq_identity_info, namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "rabbitmq", + "value": "rabbitmq" + }, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(rabbitmq_identity_info{namespace=\"$namespace\"}, rabbitmq_cluster)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "RabbitMQ Cluster", + "multi": false, + "name": "rabbitmq_cluster", + "options": [], + "query": { + "query": "label_values(rabbitmq_identity_info{namespace=\"$namespace\"}, rabbitmq_cluster)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "perf-test-1", + "value": "perf-test-1" + }, + "datasource": null, + "definition": "query_result(rabbitmq_detailed_queue_messages{namespace=\"$namespace\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\"})", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "Queue", + "multi": false, + "name": "queue", + "options": [], + "query": { + "query": "query_result(rabbitmq_detailed_queue_messages{namespace=\"$namespace\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\"})", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "/.*queue=\"([^\"]+)\".*/", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "RabbitMQ-Queue", + "uid": "j9t8vwH7k", + "version": 10 + } diff --git a/observability/prometheus/monitors/rabbitmq-servicemonitor.yml b/observability/prometheus/monitors/rabbitmq-servicemonitor.yml index 5019de974..b4ba4aa09 100644 --- a/observability/prometheus/monitors/rabbitmq-servicemonitor.yml +++ b/observability/prometheus/monitors/rabbitmq-servicemonitor.yml @@ -16,6 +16,26 @@ spec: scrapeTimeout: 14s tlsConfig: insecureSkipVerify: true + - port: prometheus + scheme: http + path: /metrics/detailed + params: + family: + - queue_coarse_metrics + - queue_metrics + interval: 15s + scrapeTimeout: 14s + - port: prometheus-tls + scheme: https + path: /metrics/detailed + params: + family: + - queue_coarse_metrics + - queue_metrics + interval: 15s + scrapeTimeout: 14s + tlsConfig: + insecureSkipVerify: true selector: matchLabels: app.kubernetes.io/component: rabbitmq diff --git a/observability/prometheus/rules/rabbitmq-per-object/README.md b/observability/prometheus/rules/rabbitmq-per-object/README.md new file mode 100644 index 000000000..a843d196a --- /dev/null +++ b/observability/prometheus/rules/rabbitmq-per-object/README.md @@ -0,0 +1,5 @@ +# RabbitMQ per-object rules + +RabbitMQ >= 3.9.7 is required for functioning of these alerts. + +Also they are highly opionated and probably require some tuning before applying, e.g. filtering by specific queue names. diff --git a/observability/prometheus/rules/rabbitmq-per-object/queue-has-no-consumers.yml b/observability/prometheus/rules/rabbitmq-per-object/queue-has-no-consumers.yml new file mode 100644 index 000000000..4017faf51 --- /dev/null +++ b/observability/prometheus/rules/rabbitmq-per-object/queue-has-no-consumers.yml @@ -0,0 +1,27 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: rabbitmq-queue-has-no-consumers + # If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here. + labels: + role: alert-rules +spec: + groups: + - name: rabbitmq + rules: + - alert: QueueHasNoConsumers + expr: | + ((rabbitmq_detailed_queue_consumers{vhost="/", queue=~".*"} == 0) + rabbitmq_detailed_queue_messages) > 0 + for: 10m + annotations: + description: | + Over the last 10 minutes, non-empty queue `{{ $labels.queue }}` with {{ $value }} messages + in virtual host `{{ $labels.vhost }}` didn't have any consumers in + RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` in namespace `{{ $labels.namespace }}`. + summary: | + Messages are sitting idle in the queue, without any processing. + This alert is highly application specific (and e.g. doesn't make sense for stream queues). + labels: + rulesgroup: rabbitmq + severity: warning diff --git a/observability/prometheus/rules/rabbitmq-per-object/queue-is-growing.yml b/observability/prometheus/rules/rabbitmq-per-object/queue-is-growing.yml new file mode 100644 index 000000000..4f82b099a --- /dev/null +++ b/observability/prometheus/rules/rabbitmq-per-object/queue-is-growing.yml @@ -0,0 +1,27 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: rabbitmq-queue-is-growing + # If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here. + labels: + role: alert-rules +spec: + groups: + - name: rabbitmq + rules: + - alert: QueueIsGrowing + # `> 1` because of floating point rounding errors + expr: | + avg_over_time(rabbitmq_detailed_queue_messages[10m]) - avg_over_time(rabbitmq_detailed_queue_messages[10m] offset 1m) > 1 + for: 10m + annotations: + description: | + Over the last 10 minutes, queue `{{ $labels.queue }}` in virtual host `{{ $labels.vhost }}` + was growing. 10 minute moving average has grown by {{ $value }}. + This happens in RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` in namespace `{{ $labels.namespace }}`. + summary: | + Queue size is steadily growing over time. + labels: + rulesgroup: rabbitmq + severity: warning From 209e68270170d60ffc02996999dfe23efc6c6be1 Mon Sep 17 00:00:00 2001 From: Alexey Lebedeff Date: Wed, 6 Oct 2021 10:27:33 +0200 Subject: [PATCH 2/5] Fix typo --- observability/prometheus/rules/rabbitmq-per-object/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/observability/prometheus/rules/rabbitmq-per-object/README.md b/observability/prometheus/rules/rabbitmq-per-object/README.md index a843d196a..39d0e6e99 100644 --- a/observability/prometheus/rules/rabbitmq-per-object/README.md +++ b/observability/prometheus/rules/rabbitmq-per-object/README.md @@ -2,4 +2,4 @@ RabbitMQ >= 3.9.7 is required for functioning of these alerts. -Also they are highly opionated and probably require some tuning before applying, e.g. filtering by specific queue names. +Also they are highly opinionated and probably require some tuning before applying, e.g. filtering by specific queue names. From fa21cb0265cf33ff95f1618fc71d74c08111d016 Mon Sep 17 00:00:00 2001 From: Alexey Lebedeff Date: Wed, 6 Oct 2021 10:28:43 +0200 Subject: [PATCH 3/5] Fix cluster name in per-queue alert examples --- observability/prometheus/rules/rabbitmq-per-object/README.md | 2 +- .../rules/rabbitmq-per-object/queue-has-no-consumers.yml | 4 +++- .../prometheus/rules/rabbitmq-per-object/queue-is-growing.yml | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/observability/prometheus/rules/rabbitmq-per-object/README.md b/observability/prometheus/rules/rabbitmq-per-object/README.md index 39d0e6e99..c4f9b7136 100644 --- a/observability/prometheus/rules/rabbitmq-per-object/README.md +++ b/observability/prometheus/rules/rabbitmq-per-object/README.md @@ -1,5 +1,5 @@ # RabbitMQ per-object rules -RabbitMQ >= 3.9.7 is required for functioning of these alerts. +RabbitMQ >= 3.9.8 is required for functioning of these alerts. Also they are highly opinionated and probably require some tuning before applying, e.g. filtering by specific queue names. diff --git a/observability/prometheus/rules/rabbitmq-per-object/queue-has-no-consumers.yml b/observability/prometheus/rules/rabbitmq-per-object/queue-has-no-consumers.yml index 4017faf51..3f95b099d 100644 --- a/observability/prometheus/rules/rabbitmq-per-object/queue-has-no-consumers.yml +++ b/observability/prometheus/rules/rabbitmq-per-object/queue-has-no-consumers.yml @@ -12,7 +12,9 @@ spec: rules: - alert: QueueHasNoConsumers expr: | - ((rabbitmq_detailed_queue_consumers{vhost="/", queue=~".*"} == 0) + rabbitmq_detailed_queue_messages) > 0 + ( + ((rabbitmq_detailed_queue_consumers{vhost="/", queue=~".*"} == 0) + rabbitmq_detailed_queue_messages) > 0 + ) * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info for: 10m annotations: description: | diff --git a/observability/prometheus/rules/rabbitmq-per-object/queue-is-growing.yml b/observability/prometheus/rules/rabbitmq-per-object/queue-is-growing.yml index 4f82b099a..accbaf48b 100644 --- a/observability/prometheus/rules/rabbitmq-per-object/queue-is-growing.yml +++ b/observability/prometheus/rules/rabbitmq-per-object/queue-is-growing.yml @@ -13,7 +13,9 @@ spec: - alert: QueueIsGrowing # `> 1` because of floating point rounding errors expr: | - avg_over_time(rabbitmq_detailed_queue_messages[10m]) - avg_over_time(rabbitmq_detailed_queue_messages[10m] offset 1m) > 1 + ( + avg_over_time(rabbitmq_detailed_queue_messages[10m]) - avg_over_time(rabbitmq_detailed_queue_messages[10m] offset 1m) > 1 + ) * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info for: 10m annotations: description: | From 7db5cecb88f1f974ab755401dd2ce6fc4d187abc Mon Sep 17 00:00:00 2001 From: Alexey Lebedeff Date: Wed, 6 Oct 2021 10:29:35 +0200 Subject: [PATCH 4/5] Use cleanly exported grafana dashboard definition --- .../grafana/dashboards/rabbitmq-queue.yml | 89 ++++++++++++------- 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/observability/grafana/dashboards/rabbitmq-queue.yml b/observability/grafana/dashboards/rabbitmq-queue.yml index 1af7f37c5..58808108f 100644 --- a/observability/grafana/dashboards/rabbitmq-queue.yml +++ b/observability/grafana/dashboards/rabbitmq-queue.yml @@ -8,6 +8,36 @@ metadata: data: rabbitmq-queue-grafana-dashboard.json: |- { + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "7.5.3" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], "annotations": { "list": [ { @@ -24,8 +54,8 @@ data: "editable": true, "gnetId": null, "graphTooltip": 0, - "id": 31, - "iteration": 1633436610573, + "id": null, + "iteration": 1633508002435, "links": [], "panels": [ { @@ -44,6 +74,7 @@ data: "fillOpacity": 0, "gradientMode": "none", "hideFrom": { + "graph": false, "legend": false, "tooltip": false, "viz": false @@ -55,14 +86,7 @@ data: "type": "linear" }, "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "spanNulls": false }, "mappings": [], "thresholds": { @@ -133,6 +157,9 @@ data: }, "tooltip": { "mode": "single" + }, + "tooltipOptions": { + "mode": "single" } }, "targets": [ @@ -157,7 +184,7 @@ data: } ], "refresh": false, - "schemaVersion": 30, + "schemaVersion": 27, "style": "dark", "tags": [], "templating": { @@ -184,11 +211,7 @@ data: }, { "allValue": null, - "current": { - "selected": false, - "text": "opportunity-92", - "value": "opportunity-92" - }, + "current": {}, "datasource": "${DS_PROMETHEUS}", "definition": "label_values(rabbitmq_identity_info, namespace)", "description": null, @@ -207,15 +230,15 @@ data: "regex": "", "skipUrlSync": false, "sort": 1, - "type": "query" + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false }, { "allValue": null, - "current": { - "selected": false, - "text": "rabbitmq", - "value": "rabbitmq" - }, + "current": {}, "datasource": "${DS_PROMETHEUS}", "definition": "label_values(rabbitmq_identity_info{namespace=\"$namespace\"}, rabbitmq_cluster)", "description": null, @@ -234,16 +257,16 @@ data: "regex": "", "skipUrlSync": false, "sort": 1, - "type": "query" + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false }, { "allValue": null, - "current": { - "selected": false, - "text": "perf-test-1", - "value": "perf-test-1" - }, - "datasource": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", "definition": "query_result(rabbitmq_detailed_queue_messages{namespace=\"$namespace\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\"})", "description": null, "error": null, @@ -261,7 +284,11 @@ data: "regex": "/.*queue=\"([^\"]+)\".*/", "skipUrlSync": false, "sort": 0, - "type": "query" + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -273,5 +300,5 @@ data: "timezone": "", "title": "RabbitMQ-Queue", "uid": "j9t8vwH7k", - "version": 10 + "version": 1 } From aff28053cf9f5b05589af5e0894dc689967e4808 Mon Sep 17 00:00:00 2001 From: Alexey Lebedeff Date: Thu, 14 Oct 2021 10:58:10 +0200 Subject: [PATCH 5/5] Fix queue selector on a per-queue dashboard --- observability/grafana/dashboards/rabbitmq-queue.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/observability/grafana/dashboards/rabbitmq-queue.yml b/observability/grafana/dashboards/rabbitmq-queue.yml index 58808108f..82fac265d 100644 --- a/observability/grafana/dashboards/rabbitmq-queue.yml +++ b/observability/grafana/dashboards/rabbitmq-queue.yml @@ -277,7 +277,7 @@ data: "name": "queue", "options": [], "query": { - "query": "query_result(rabbitmq_detailed_queue_messages{namespace=\"$namespace\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\"})", + "query": "query_result(rabbitmq_detailed_queue_messages{namespace=\"$namespace\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\", rabbitmq_cluster=\"$rabbitmq_cluster\"})", "refId": "StandardVariableQuery" }, "refresh": 2,