From 8a54908b00a7b3ac1286adcf8ec94097ca67a254 Mon Sep 17 00:00:00 2001 From: Stan Kwong Date: Tue, 6 Oct 2020 14:48:55 -0700 Subject: [PATCH 1/7] Modify Alertmanager config metrics Signed-off-by: Stan Kwong --- pkg/alertmanager/multitenant.go | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 1d7a473c97f..e33445007b5 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -126,16 +126,23 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet) { } type multitenantAlertmanagerMetrics struct { - invalidConfig *prometheus.GaugeVec + lastReloadSuccessful *prometheus.GaugeVec + lastReloadSuccessfulTimestamp *prometheus.GaugeVec } func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAlertmanagerMetrics { m := &multitenantAlertmanagerMetrics{} - m.invalidConfig = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ + m.lastReloadSuccessful = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ Namespace: "cortex", - Name: "alertmanager_config_invalid", - Help: "Boolean set to 1 whenever the Alertmanager config is invalid for a user.", + Name: "alertmanager_config_last_reload_successful", + Help: "Boolean set to 1 whenever the last configuration reload attempt was successful.", + }, []string{"user"}) + + m.lastReloadSuccessfulTimestamp = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "cortex", + Name: "alertmanager_config_last_reload_successful_seconds", + Help: "Timestamp of the last successful configuration reload.", }, []string{"user"}) return m @@ -314,12 +321,13 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi for user, cfg := range cfgs { err := am.setConfig(cfg) if err != nil { - am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(1)) + am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(1)) + am.multitenantMetrics.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime() level.Warn(am.logger).Log("msg", "error applying config", "err", err) continue } - am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(0)) + am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(0)) } am.alertmanagersMtx.Lock() @@ -332,7 +340,8 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", user) userAM.Pause() delete(am.cfgs, user) - am.multitenantMetrics.invalidConfig.DeleteLabelValues(user) + am.multitenantMetrics.lastReloadSuccessful.DeleteLabelValues(user) + am.multitenantMetrics.lastReloadSuccessfulTimestamp.DeleteLabelValues(user) level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", user) } } From 331288303763df0ec74d6a6a694ec09687e97566 Mon Sep 17 00:00:00 2001 From: Stan Kwong Date: Tue, 6 Oct 2020 16:42:35 -0700 Subject: [PATCH 2/7] Fix gauge Signed-off-by: Stan Kwong --- pkg/alertmanager/multitenant.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index e33445007b5..3730e116567 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -321,13 +321,13 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi for user, cfg := range cfgs { err := am.setConfig(cfg) if err != nil { - am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(1)) + am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(0)) am.multitenantMetrics.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime() level.Warn(am.logger).Log("msg", "error applying config", "err", err) continue } - am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(0)) + am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(1)) } am.alertmanagersMtx.Lock() From 67405fa0e4c33ce7d4e68e93b3938c841ec7a052 Mon Sep 17 00:00:00 2001 From: Stan Kwong Date: Tue, 6 Oct 2020 16:50:01 -0700 Subject: [PATCH 3/7] Fix tests Signed-off-by: Stan Kwong --- pkg/alertmanager/multitenant_test.go | 44 ++++++++++++++-------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index c8acffd218b..be78223ab9f 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -96,11 +96,11 @@ func TestLoadAllConfigs(t *testing.T) { require.Equal(t, simpleConfigOne, currentConfig.RawConfig) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` - # HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user. - # TYPE cortex_alertmanager_config_invalid gauge - cortex_alertmanager_config_invalid{user="user1"} 0 - cortex_alertmanager_config_invalid{user="user2"} 0 - `), "cortex_alertmanager_config_invalid")) + # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. + # TYPE cortex_alertmanager_config_last_reload_successful gauge + cortex_alertmanager_config_last_reload_successful{user="user1"} 1 + cortex_alertmanager_config_last_reload_successful{user="user2"} 1 + `), "cortex_alertmanager_config_last_reload_successful")) // Ensure when a 3rd config is added, it is synced correctly mockStore.configs["user3"] = alerts.AlertConfigDesc{ @@ -113,12 +113,12 @@ func TestLoadAllConfigs(t *testing.T) { require.Len(t, am.alertmanagers, 3) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` - # HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user. - # TYPE cortex_alertmanager_config_invalid gauge - cortex_alertmanager_config_invalid{user="user1"} 0 - cortex_alertmanager_config_invalid{user="user2"} 0 - cortex_alertmanager_config_invalid{user="user3"} 0 - `), "cortex_alertmanager_config_invalid")) + # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. + # TYPE cortex_alertmanager_config_last_reload_successful gauge + cortex_alertmanager_config_last_reload_successful{user="user1"} 1 + cortex_alertmanager_config_last_reload_successful{user="user2"} 1 + cortex_alertmanager_config_last_reload_successful{user="user3"} 1 + `), "cortex_alertmanager_config_last_reload_successful")) // Ensure the config is updated mockStore.configs["user1"] = alerts.AlertConfigDesc{ @@ -146,11 +146,11 @@ func TestLoadAllConfigs(t *testing.T) { require.False(t, userAM.IsActive()) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` - # HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user. - # TYPE cortex_alertmanager_config_invalid gauge - cortex_alertmanager_config_invalid{user="user1"} 0 - cortex_alertmanager_config_invalid{user="user2"} 0 - `), "cortex_alertmanager_config_invalid")) + # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. + # TYPE cortex_alertmanager_config_last_reload_successful gauge + cortex_alertmanager_config_last_reload_successful{user="user1"} 1 + cortex_alertmanager_config_last_reload_successful{user="user2"} 1 + `), "cortex_alertmanager_config_last_reload_successful")) // Ensure when a 3rd config is re-added, it is synced correctly mockStore.configs["user3"] = alerts.AlertConfigDesc{ @@ -170,12 +170,12 @@ func TestLoadAllConfigs(t *testing.T) { require.True(t, userAM.IsActive()) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` - # HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user. - # TYPE cortex_alertmanager_config_invalid gauge - cortex_alertmanager_config_invalid{user="user1"} 0 - cortex_alertmanager_config_invalid{user="user2"} 0 - cortex_alertmanager_config_invalid{user="user3"} 0 - `), "cortex_alertmanager_config_invalid")) + # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. + # TYPE cortex_alertmanager_config_last_reload_successful gauge + cortex_alertmanager_config_last_reload_successful{user="user1"} 1 + cortex_alertmanager_config_last_reload_successful{user="user2"} 1 + cortex_alertmanager_config_last_reload_successful{user="user3"} 1 + `), "cortex_alertmanager_config_last_reload_successful")) } func TestAlertmanager_NoExternalURL(t *testing.T) { From 0fb014fe4850db170a467737a142e3950742e8d4 Mon Sep 17 00:00:00 2001 From: Stan Kwong Date: Mon, 12 Oct 2020 14:34:18 -0700 Subject: [PATCH 4/7] docs: add to CHANGELOG Signed-off-by: Stan Kwong --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index de8f3519d5e..2fb59c8ee34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,6 +47,7 @@ - `-ruler.ring.instance-interface` renamed to `-ruler.ring.instance-interface-names` * [CHANGE] Renamed `-.redis.enable-tls` CLI flag to `-.redis.tls-enabled`, and its respective YAML config option from `enable_tls` to `tls_enabled`. #3298 * [CHANGE] Increased default `-.redis.timeout` from `100ms` to `500ms`. #3301 +* [CHANGE] `cortex_alertmanager_config_invalid` has been removed in favor of `cortex_alertmanager_config_last_reload_successful`. #3289 * [FEATURE] Added support for shuffle-sharding queriers in the query-frontend. When configured (`-frontend.max-queriers-per-tenant` globally, or using per-tenant limit `max_queriers_per_tenant`), each tenants's requests will be handled by different set of queriers. #3113 #3257 * [FEATURE] Query-frontend: added `compression` config to support results cache with compression. #3217 * [ENHANCEMENT] Allow to specify multiple comma-separated Cortex services to `-target` CLI option (or its respective YAML config option). For example, `-target=all,compactor` can be used to start Cortex single-binary with compactor as well. #3275 @@ -77,6 +78,7 @@ * [ENHANCEMENT] Return an explicit error when the store-gateway is explicitly requested without a blocks storage engine. #3287 * [ENHANCEMENT] Ruler: only load rules that belong to the ruler. Improves rules synching performances when ruler sharding is enabled. #3269 * [ENHANCEMENT] Added `-.redis.tls-insecure-skip-verify` flag. #3298 +* [ENHANCEMENT] Added `cortex_alertmanager_config_last_reload_successful_seconds` metric to show timestamp of last successful AM config reload. #3289 * [BUGFIX] No-longer-needed ingester operations for queries triggered by queriers and rulers are now canceled. #3178 * [BUGFIX] Ruler: directories in the configured `rules-path` will be removed on startup and shutdown in order to ensure they don't persist between runs. #3195 * [BUGFIX] Handle hash-collisions in the query path. #3192 From b3b7df9b2500d28e2ba4a82e64d131498f49c237 Mon Sep 17 00:00:00 2001 From: Stan Kwong Date: Tue, 13 Oct 2020 11:57:09 -0700 Subject: [PATCH 5/7] fix: record timestamp on success Signed-off-by: Stan Kwong --- pkg/alertmanager/multitenant.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 3730e116567..5db09e63eeb 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -322,12 +322,12 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi err := am.setConfig(cfg) if err != nil { am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(0)) - am.multitenantMetrics.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime() level.Warn(am.logger).Log("msg", "error applying config", "err", err) continue } am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(1)) + am.multitenantMetrics.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime() } am.alertmanagersMtx.Lock() From 9bbc5c99c6bbdbc720b3e8fc6a48a845f5df44f2 Mon Sep 17 00:00:00 2001 From: Stan Kwong Date: Tue, 13 Oct 2020 16:39:45 -0700 Subject: [PATCH 6/7] test: fix integration tests Signed-off-by: Stan Kwong --- integration/alertmanager_test.go | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/integration/alertmanager_test.go b/integration/alertmanager_test.go index fde6f98cd23..60c83fdbc9b 100644 --- a/integration/alertmanager_test.go +++ b/integration/alertmanager_test.go @@ -31,7 +31,7 @@ func TestAlertmanager(t *testing.T) { "", ) require.NoError(t, s.StartAndWaitReady(alertmanager)) - require.NoError(t, alertmanager.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_config_invalid")) + require.NoError(t, alertmanager.WaitSumMetrics(e2e.Equals(1), "cortex_alertmanager_config_last_reload_successful")) c, err := e2ecortex.NewClient("", "", alertmanager.HTTPEndpoint(), "", "user-1") require.NoError(t, err) @@ -81,7 +81,10 @@ func TestAlertmanagerStoreAPI(t *testing.T) { err = c.SetAlertmanagerConfig(context.Background(), cortexAlertmanagerUserConfigYaml, map[string]string{}) require.NoError(t, err) - require.NoError(t, am.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_alertmanager_config_invalid"}, + require.NoError(t, am.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_alertmanager_config_last_reload_successful"}, + e2e.WithLabelMatchers(labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")), + e2e.WaitMissingMetrics)) + require.NoError(t, am.WaitSumMetricsWithOptions(e2e.Greater(0), []string{"cortex_alertmanager_config_last_reload_successful_seconds"}, e2e.WithLabelMatchers(labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")), e2e.WaitMissingMetrics)) @@ -102,13 +105,18 @@ func TestAlertmanagerStoreAPI(t *testing.T) { require.NoError(t, am.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_alertmanager_alerts_received_total"}, e2e.WithLabelMatchers(labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")), e2e.WaitMissingMetrics)) + require.NoError(t, am.WaitSumMetricsWithOptions(e2e.Greater(0), []string{"cortex_alertmanager_alerts_received_total"}, + e2e.WithLabelMatchers(labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")), + e2e.WaitMissingMetrics)) err = c.DeleteAlertmanagerConfig(context.Background()) require.NoError(t, err) // The deleted config is applied asynchronously, so we should wait until the metric // disappear for the specific user. - require.NoError(t, am.WaitRemovedMetric("cortex_alertmanager_config_invalid", e2e.WithLabelMatchers( + require.NoError(t, am.WaitRemovedMetric("cortex_alertmanager_config_last_reload_successful", e2e.WithLabelMatchers( + labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")))) + require.NoError(t, am.WaitRemovedMetric("cortex_alertmanager_config_last_reload_successful_seconds", e2e.WithLabelMatchers( labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")))) cfg, err = c.GetAlertmanagerConfig(context.Background()) From 3d62519c64d130271111d445f7325c277130360f Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 14 Oct 2020 08:50:36 +0200 Subject: [PATCH 7/7] Update integration/alertmanager_test.go Signed-off-by: Marco Pracucci --- integration/alertmanager_test.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/integration/alertmanager_test.go b/integration/alertmanager_test.go index 60c83fdbc9b..0d6313b808e 100644 --- a/integration/alertmanager_test.go +++ b/integration/alertmanager_test.go @@ -105,9 +105,6 @@ func TestAlertmanagerStoreAPI(t *testing.T) { require.NoError(t, am.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_alertmanager_alerts_received_total"}, e2e.WithLabelMatchers(labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")), e2e.WaitMissingMetrics)) - require.NoError(t, am.WaitSumMetricsWithOptions(e2e.Greater(0), []string{"cortex_alertmanager_alerts_received_total"}, - e2e.WithLabelMatchers(labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")), - e2e.WaitMissingMetrics)) err = c.DeleteAlertmanagerConfig(context.Background()) require.NoError(t, err)