diff --git a/CHANGELOG.md b/CHANGELOG.md index de8f3519d5e..2fb59c8ee34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,6 +47,7 @@ - `-ruler.ring.instance-interface` renamed to `-ruler.ring.instance-interface-names` * [CHANGE] Renamed `-.redis.enable-tls` CLI flag to `-.redis.tls-enabled`, and its respective YAML config option from `enable_tls` to `tls_enabled`. #3298 * [CHANGE] Increased default `-.redis.timeout` from `100ms` to `500ms`. #3301 +* [CHANGE] The `cortex_alertmanager_config_invalid` metric has been removed in favor of `cortex_alertmanager_config_last_reload_successful`. Note that the semantics are inverted: the new metric is `1` when the last config reload succeeded and `0` when it failed. #3289 * [FEATURE] Added support for shuffle-sharding queriers in the query-frontend. When configured (`-frontend.max-queriers-per-tenant` globally, or using per-tenant limit `max_queriers_per_tenant`), each tenants's requests will be handled by different set of queriers. #3113 #3257 * [FEATURE] Query-frontend: added `compression` config to support results cache with compression. #3217 * [ENHANCEMENT] Allow to specify multiple comma-separated Cortex services to `-target` CLI option (or its respective YAML config option). For example, `-target=all,compactor` can be used to start Cortex single-binary with compactor as well. #3275 @@ -77,6 +78,7 @@ * [ENHANCEMENT] Return an explicit error when the store-gateway is explicitly requested without a blocks storage engine. #3287 * [ENHANCEMENT] Ruler: only load rules that belong to the ruler. Improves rules synching performances when ruler sharding is enabled. #3269 * [ENHANCEMENT] Added `-.redis.tls-insecure-skip-verify` flag. #3298 +* [ENHANCEMENT] Added `cortex_alertmanager_config_last_reload_successful_seconds` metric to show the timestamp of the last successful Alertmanager config reload. #3289 * [BUGFIX] No-longer-needed ingester operations for queries triggered by queriers and rulers are now canceled. #3178 * [BUGFIX] Ruler: directories in the configured `rules-path` will be removed on startup and shutdown in order to ensure they don't persist between runs. 
#3195 * [BUGFIX] Handle hash-collisions in the query path. #3192 diff --git a/integration/alertmanager_test.go b/integration/alertmanager_test.go index fde6f98cd23..0d6313b808e 100644 --- a/integration/alertmanager_test.go +++ b/integration/alertmanager_test.go @@ -31,7 +31,7 @@ func TestAlertmanager(t *testing.T) { "", ) require.NoError(t, s.StartAndWaitReady(alertmanager)) - require.NoError(t, alertmanager.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_config_invalid")) + require.NoError(t, alertmanager.WaitSumMetrics(e2e.Equals(1), "cortex_alertmanager_config_last_reload_successful")) c, err := e2ecortex.NewClient("", "", alertmanager.HTTPEndpoint(), "", "user-1") require.NoError(t, err) @@ -81,7 +81,10 @@ func TestAlertmanagerStoreAPI(t *testing.T) { err = c.SetAlertmanagerConfig(context.Background(), cortexAlertmanagerUserConfigYaml, map[string]string{}) require.NoError(t, err) - require.NoError(t, am.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_alertmanager_config_invalid"}, + require.NoError(t, am.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_alertmanager_config_last_reload_successful"}, + e2e.WithLabelMatchers(labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")), + e2e.WaitMissingMetrics)) + require.NoError(t, am.WaitSumMetricsWithOptions(e2e.Greater(0), []string{"cortex_alertmanager_config_last_reload_successful_seconds"}, e2e.WithLabelMatchers(labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")), e2e.WaitMissingMetrics)) @@ -108,7 +111,9 @@ func TestAlertmanagerStoreAPI(t *testing.T) { // The deleted config is applied asynchronously, so we should wait until the metric // disappear for the specific user. 
- require.NoError(t, am.WaitRemovedMetric("cortex_alertmanager_config_invalid", e2e.WithLabelMatchers( + require.NoError(t, am.WaitRemovedMetric("cortex_alertmanager_config_last_reload_successful", e2e.WithLabelMatchers( + labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")))) + require.NoError(t, am.WaitRemovedMetric("cortex_alertmanager_config_last_reload_successful_seconds", e2e.WithLabelMatchers( labels.MustNewMatcher(labels.MatchEqual, "user", "user-1")))) cfg, err = c.GetAlertmanagerConfig(context.Background()) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 1d7a473c97f..5db09e63eeb 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -126,16 +126,23 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet) { } type multitenantAlertmanagerMetrics struct { - invalidConfig *prometheus.GaugeVec + lastReloadSuccessful *prometheus.GaugeVec + lastReloadSuccessfulTimestamp *prometheus.GaugeVec } func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAlertmanagerMetrics { m := &multitenantAlertmanagerMetrics{} - m.invalidConfig = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ + m.lastReloadSuccessful = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ Namespace: "cortex", - Name: "alertmanager_config_invalid", - Help: "Boolean set to 1 whenever the Alertmanager config is invalid for a user.", + Name: "alertmanager_config_last_reload_successful", + Help: "Boolean set to 1 whenever the last configuration reload attempt was successful.", + }, []string{"user"}) + + m.lastReloadSuccessfulTimestamp = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "cortex", + Name: "alertmanager_config_last_reload_successful_seconds", + Help: "Timestamp of the last successful configuration reload.", }, []string{"user"}) return m @@ -314,12 +321,13 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi for user, cfg := 
range cfgs { err := am.setConfig(cfg) if err != nil { - am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(1)) + am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(0)) level.Warn(am.logger).Log("msg", "error applying config", "err", err) continue } - am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(0)) + am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(1)) + am.multitenantMetrics.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime() } am.alertmanagersMtx.Lock() @@ -332,7 +340,8 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", user) userAM.Pause() delete(am.cfgs, user) - am.multitenantMetrics.invalidConfig.DeleteLabelValues(user) + am.multitenantMetrics.lastReloadSuccessful.DeleteLabelValues(user) + am.multitenantMetrics.lastReloadSuccessfulTimestamp.DeleteLabelValues(user) level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", user) } } diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index c8acffd218b..be78223ab9f 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -96,11 +96,11 @@ func TestLoadAllConfigs(t *testing.T) { require.Equal(t, simpleConfigOne, currentConfig.RawConfig) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` - # HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user. - # TYPE cortex_alertmanager_config_invalid gauge - cortex_alertmanager_config_invalid{user="user1"} 0 - cortex_alertmanager_config_invalid{user="user2"} 0 - `), "cortex_alertmanager_config_invalid")) + # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. 
+ # TYPE cortex_alertmanager_config_last_reload_successful gauge + cortex_alertmanager_config_last_reload_successful{user="user1"} 1 + cortex_alertmanager_config_last_reload_successful{user="user2"} 1 + `), "cortex_alertmanager_config_last_reload_successful")) // Ensure when a 3rd config is added, it is synced correctly mockStore.configs["user3"] = alerts.AlertConfigDesc{ @@ -113,12 +113,12 @@ func TestLoadAllConfigs(t *testing.T) { require.Len(t, am.alertmanagers, 3) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` - # HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user. - # TYPE cortex_alertmanager_config_invalid gauge - cortex_alertmanager_config_invalid{user="user1"} 0 - cortex_alertmanager_config_invalid{user="user2"} 0 - cortex_alertmanager_config_invalid{user="user3"} 0 - `), "cortex_alertmanager_config_invalid")) + # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. + # TYPE cortex_alertmanager_config_last_reload_successful gauge + cortex_alertmanager_config_last_reload_successful{user="user1"} 1 + cortex_alertmanager_config_last_reload_successful{user="user2"} 1 + cortex_alertmanager_config_last_reload_successful{user="user3"} 1 + `), "cortex_alertmanager_config_last_reload_successful")) // Ensure the config is updated mockStore.configs["user1"] = alerts.AlertConfigDesc{ @@ -146,11 +146,11 @@ func TestLoadAllConfigs(t *testing.T) { require.False(t, userAM.IsActive()) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` - # HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user. 
- # TYPE cortex_alertmanager_config_invalid gauge - cortex_alertmanager_config_invalid{user="user1"} 0 - cortex_alertmanager_config_invalid{user="user2"} 0 - `), "cortex_alertmanager_config_invalid")) + # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. + # TYPE cortex_alertmanager_config_last_reload_successful gauge + cortex_alertmanager_config_last_reload_successful{user="user1"} 1 + cortex_alertmanager_config_last_reload_successful{user="user2"} 1 + `), "cortex_alertmanager_config_last_reload_successful")) // Ensure when a 3rd config is re-added, it is synced correctly mockStore.configs["user3"] = alerts.AlertConfigDesc{ @@ -170,12 +170,12 @@ func TestLoadAllConfigs(t *testing.T) { require.True(t, userAM.IsActive()) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` - # HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user. - # TYPE cortex_alertmanager_config_invalid gauge - cortex_alertmanager_config_invalid{user="user1"} 0 - cortex_alertmanager_config_invalid{user="user2"} 0 - cortex_alertmanager_config_invalid{user="user3"} 0 - `), "cortex_alertmanager_config_invalid")) + # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. + # TYPE cortex_alertmanager_config_last_reload_successful gauge + cortex_alertmanager_config_last_reload_successful{user="user1"} 1 + cortex_alertmanager_config_last_reload_successful{user="user2"} 1 + cortex_alertmanager_config_last_reload_successful{user="user3"} 1 + `), "cortex_alertmanager_config_last_reload_successful")) } func TestAlertmanager_NoExternalURL(t *testing.T) {