diff --git a/CHANGELOG.md b/CHANGELOG.md index 79a51159dfb..b58629fcbc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,7 @@ * [CHANGE] Split `mimir_queries` rules group into `mimir_queries` and `mimir_ingester_queries` to keep number of rules per group within the default per-tenant limit. #1885 * [CHANGE] Dashboards: Expose full image tag in "Mimir / Rollout progress" dashboard's "Pod per version panel." #1932 * [CHANGE] Dashboards: Disabled gateway panels by default, because most users don't have a gateway exposing the metrics expected by Mimir dashboards. You can re-enable it setting `gateway_enabled: true` in the mixin config and recompiling the mixin running `make build-mixin`. #1954 +* [CHANGE] Alerts: adapt `MimirFrontendQueriesStuck` and `MimirSchedulerQueriesStuck` to consider ruler query path components. #1949 * [ENHANCEMENT] Dashboards: Add config option `datasource_regex` to customise the regular expression used to select valid datasources for Mimir dashboards. #1802 * [ENHANCEMENT] Dashboards: Added "Mimir / Remote ruler reads" and "Mimir / Remote ruler reads resources" dashboards. #1911 #1937 * [ENHANCEMENT] Dashboards: Make networking panels work for pods created by the mimir-distributed helm chart. #1927 diff --git a/docs/sources/operators-guide/mimir-runbooks/_index.md b/docs/sources/operators-guide/mimir-runbooks/_index.md index 6f5b5939b4e..51a0a53b2b3 100644 --- a/docs/sources/operators-guide/mimir-runbooks/_index.md +++ b/docs/sources/operators-guide/mimir-runbooks/_index.md @@ -318,6 +318,7 @@ There is a category of errors that is more important: errors due to failure to r How to **fix** it: - Investigate the ruler logs to find out the reason why ruler cannot evaluate queries. Note that ruler logs rule evaluation errors even for "user errors", but those are not causing the alert to fire. Focus on problems with ingesters or store-gateways. 
+- If remote operational mode is enabled, the problem could be in any of the ruler query path components (ruler-query-frontend, ruler-query-scheduler and ruler-querier). Check the `Mimir / Remote ruler reads` and `Mimir / Remote ruler reads resources` dashboards to find out which Mimir service the error originates from. ### MimirRulerMissedEvaluations diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index 2ec14e6ae57..28381bfe5a8 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -66,18 +66,18 @@ groups: - alert: MimirFrontendQueriesStuck annotations: message: | - There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} query-frontend. + There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. expr: | - sum by (cluster, namespace) (cortex_query_frontend_queue_length) > 1 + sum by (cluster, namespace, job) (cortex_query_frontend_queue_length) > 1 for: 5m labels: severity: critical - alert: MimirSchedulerQueriesStuck annotations: message: | - There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} query-scheduler. + There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. 
expr: | - sum by (cluster, namespace) (cortex_query_scheduler_queue_length) > 1 + sum by (cluster, namespace, job) (cortex_query_scheduler_queue_length) > 1 for: 5m labels: severity: critical diff --git a/operations/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet index d15f5d76157..e328a5b3027 100644 --- a/operations/mimir-mixin/alerts/alerts.libsonnet +++ b/operations/mimir-mixin/alerts/alerts.libsonnet @@ -127,7 +127,7 @@ { alert: $.alertName('FrontendQueriesStuck'), expr: ||| - sum by (%s) (cortex_query_frontend_queue_length) > 1 + sum by (%s, job) (cortex_query_frontend_queue_length) > 1 ||| % $._config.alert_aggregation_labels, 'for': '5m', // We don't want to block for longer. labels: { @@ -135,14 +135,14 @@ }, annotations: { message: ||| - There are {{ $value }} queued up queries in %(alert_aggregation_variables)s query-frontend. + There are {{ $value }} queued up queries in %(alert_aggregation_variables)s {{ $labels.job }}. ||| % $._config, }, }, { alert: $.alertName('SchedulerQueriesStuck'), expr: ||| - sum by (%s) (cortex_query_scheduler_queue_length) > 1 + sum by (%s, job) (cortex_query_scheduler_queue_length) > 1 ||| % $._config.alert_aggregation_labels, 'for': '5m', // We don't want to block for longer. labels: { @@ -150,7 +150,7 @@ }, annotations: { message: ||| - There are {{ $value }} queued up queries in %(alert_aggregation_variables)s query-scheduler. + There are {{ $value }} queued up queries in %(alert_aggregation_variables)s {{ $labels.job }}. ||| % $._config, }, },