From 7f7debe0665c21f220ce1defe76aa74074586d62 Mon Sep 17 00:00:00 2001 From: "Abdurrahman J. Allawala" Date: Mon, 18 Oct 2021 15:05:23 -0700 Subject: [PATCH 1/3] use alertmanager jobname for alertmanager dashboard panels --- cortex-mixin/config.libsonnet | 1 + .../dashboards/alertmanager.libsonnet | 54 +++++++++---------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/cortex-mixin/config.libsonnet b/cortex-mixin/config.libsonnet index ef8d156..4482958 100644 --- a/cortex-mixin/config.libsonnet +++ b/cortex-mixin/config.libsonnet @@ -37,6 +37,7 @@ store_gateway: '(store-gateway|cortex$)', gateway: '(gateway|cortex-gw|cortex-gw-internal)', compactor: 'compactor.*', // Match also custom compactor deployments. + alertmanager: '(alertmanager.*|cortex$)', }, // Grouping labels, to uniquely identify and group by {jobs, clusters} diff --git a/cortex-mixin/dashboards/alertmanager.libsonnet b/cortex-mixin/dashboards/alertmanager.libsonnet index 8897034..0bf88c4 100644 --- a/cortex-mixin/dashboards/alertmanager.libsonnet +++ b/cortex-mixin/dashboards/alertmanager.libsonnet @@ -11,15 +11,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; }) .addPanel( $.panel('Total Alerts') + - $.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short') ) .addPanel( $.panel('Total Silences') + - $.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short') ) .addPanel( $.panel('Tenants') + - $.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % 
$.jobMatcher('alertmanager'), format='short') + $.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher($._config.job_names.alertmanager), format='short') ) ) .addRow( @@ -32,8 +32,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; sum(cluster_job:cortex_alertmanager_alerts_received_total:rate5m{%s}) - sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s}) - ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})' % $.jobMatcher('alertmanager'), + ||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)], + 'sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager), ], ['success', 'failed'] ) @@ -49,8 +49,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s}) - sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) - ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'), + ||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)], + 'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager), ], ['success', 'failed'] ) @@ -66,15 +66,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration) ) > 0 or on () vector(0) - ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)' % $.jobMatcher('alertmanager'), + ||| % 
[$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)], + 'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)' % $.jobMatcher($._config.job_names.alertmanager), ], ['success - {{ integration }}', 'failed - {{ integration }}'] ) ) .addPanel( $.panel('Latency') + - $.latencyPanel('cortex_alertmanager_notification_latency_seconds', '{%s}' % $.jobMatcher('alertmanager')) + $.latencyPanel('cortex_alertmanager_notification_latency_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager)) ) ) .addRow( @@ -96,7 +96,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s Tenants' % $._config.per_instance_label) + $.queryPanel( - 'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + 'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label ) + $.stack @@ -104,7 +104,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s Alerts' % $._config.per_instance_label) + $.queryPanel( - 'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')], + 'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label ) + $.stack @@ -112,7 +112,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s Silences' % $._config.per_instance_label) + $.queryPanel( - 'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')], + 'sum by(%s) 
(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label ) + $.stack @@ -128,8 +128,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; sum(rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval])) - sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval])) - ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)], + 'sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), ], ['success', 'failed'] ) @@ -137,14 +137,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Syncs/sec (By Reason)') + $.queryPanel( - 'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), '{{reason}}' ) ) .addPanel( $.panel('Ring Check Errors/sec') + $.queryPanel( - 'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), 'errors' ) ) @@ -154,7 +154,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Initial syncs /sec') + $.queryPanel( - 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % 
$.jobMatcher($._config.job_names.alertmanager), '{{outcome}}' ) + { targets: [ @@ -167,7 +167,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Initial sync duration') + - $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher('alertmanager')) + { + $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager)) + { targets: [ target { interval: '1m', @@ -184,8 +184,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; sum(rate(cortex_alertmanager_state_fetch_replica_state_total{%s}[$__rate_interval])) - sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval])) - ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)], + 'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), ], ['success', 'failed'] ) + { @@ -208,8 +208,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; sum(cluster_job:cortex_alertmanager_state_replication_total:rate5m{%s}) - sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s}) - ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'), + ||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)], + 'sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager), ], ['success', 'failed'] ) @@ -222,8 +222,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; 
sum(cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m{%s}) - sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s}) - ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'), + ||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)], + 'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager), ], ['success', 'failed'] ) @@ -236,8 +236,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; sum(rate(cortex_alertmanager_state_persist_total{%s}[$__rate_interval])) - sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval])) - ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)], + 'sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), ], ['success', 'failed'] ) From ad49d11cffe4d31217223a68b496212a560788be Mon Sep 17 00:00:00 2001 From: "Abdurrahman J. Allawala" Date: Mon, 18 Oct 2021 15:07:27 -0700 Subject: [PATCH 2/3] add changelog entry --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index dbc581a..e75163d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -69,6 +69,7 @@ * [ENHANCEMENT] Added `CortexKVStoreFailure` alert. #406 * [ENHANCEMENT] Use configured `ruler` jobname for ruler dashboard panels. #409 * [ENHANCEMENT] Add ability to override `datasource` for generated dashboards. 
#407 +* [ENHANCEMENT] Use configured `alertmanager` jobname for alertmanager dashboard panels. #411 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335 From d8045394fce593b88542779a1ae8375b237b9682 Mon Sep 17 00:00:00 2001 From: "Abdurrahman J. Allawala" <35939863+aallawala@users.noreply.github.com> Date: Wed, 20 Oct 2021 10:39:39 -0700 Subject: [PATCH 3/3] Update cortex-mixin/config.libsonnet Co-authored-by: Marco Pracucci --- cortex-mixin/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex-mixin/config.libsonnet b/cortex-mixin/config.libsonnet index 4482958..06941b6 100644 --- a/cortex-mixin/config.libsonnet +++ b/cortex-mixin/config.libsonnet @@ -37,7 +37,7 @@ store_gateway: '(store-gateway|cortex$)', gateway: '(gateway|cortex-gw|cortex-gw-internal)', compactor: 'compactor.*', // Match also custom compactor deployments. - alertmanager: '(alertmanager.*|cortex$)', + alertmanager: 'alertmanager', }, // Grouping labels, to uniquely identify and group by {jobs, clusters}