Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
- `-ruler.ring.instance-interface` renamed to `-ruler.ring.instance-interface-names`
* [CHANGE] Renamed `-<prefix>.redis.enable-tls` CLI flag to `-<prefix>.redis.tls-enabled`, and its respective YAML config option from `enable_tls` to `tls_enabled`. #3298
* [CHANGE] Increased default `-<prefix>.redis.timeout` from `100ms` to `500ms`. #3301
* [CHANGE] `cortex_alertmanager_config_invalid` has been removed in favor of `cortex_alertmanager_config_last_reload_successful`. #3289
* [FEATURE] Added support for shuffle-sharding queriers in the query-frontend. When configured (`-frontend.max-queriers-per-tenant` globally, or using per-tenant limit `max_queriers_per_tenant`), each tenant's requests will be handled by a different set of queriers. #3113 #3257
* [FEATURE] Query-frontend: added `compression` config to support results cache with compression. #3217
* [ENHANCEMENT] Allow specifying multiple comma-separated Cortex services in the `-target` CLI option (or its respective YAML config option). For example, `-target=all,compactor` can be used to start Cortex single-binary with compactor as well. #3275
Expand Down Expand Up @@ -77,6 +78,7 @@
* [ENHANCEMENT] Return an explicit error when the store-gateway is explicitly requested without a blocks storage engine. #3287
* [ENHANCEMENT] Ruler: only load rules that belong to the ruler. Improves rules syncing performance when ruler sharding is enabled. #3269
* [ENHANCEMENT] Added `-<prefix>.redis.tls-insecure-skip-verify` flag. #3298
* [ENHANCEMENT] Added `cortex_alertmanager_config_last_reload_successful_seconds` metric to show the timestamp of the last successful Alertmanager config reload. #3289
* [BUGFIX] No-longer-needed ingester operations for queries triggered by queriers and rulers are now canceled. #3178
* [BUGFIX] Ruler: directories in the configured `rules-path` will be removed on startup and shutdown in order to ensure they don't persist between runs. #3195
* [BUGFIX] Handle hash-collisions in the query path. #3192
Expand Down
23 changes: 16 additions & 7 deletions pkg/alertmanager/multitenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,16 +126,23 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet) {
}

// multitenantAlertmanagerMetrics groups the per-tenant gauges that track
// Alertmanager configuration reloads. Each gauge is labelled by "user".
// NOTE(review): this listing is a rendered diff without +/- markers, so the
// pre-change field (invalidConfig) appears next to the fields that replace it.
type multitenantAlertmanagerMetrics struct {
// invalidConfig is the older gauge: 1 when applying a user's config failed, 0 otherwise.
invalidConfig *prometheus.GaugeVec
// lastReloadSuccessful is 1 when the user's last config reload succeeded, 0 when it failed.
lastReloadSuccessful *prometheus.GaugeVec
// lastReloadSuccessfulTimestamp is set to the current time whenever a user's config is applied successfully.
lastReloadSuccessfulTimestamp *prometheus.GaugeVec
}

func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAlertmanagerMetrics {
m := &multitenantAlertmanagerMetrics{}

m.invalidConfig = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
m.lastReloadSuccessful = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
Namespace: "cortex",
Name: "alertmanager_config_invalid",
Help: "Boolean set to 1 whenever the Alertmanager config is invalid for a user.",
Name: "alertmanager_config_last_reload_successful",
Help: "Boolean set to 1 whenever the last configuration reload attempt was successful.",
}, []string{"user"})

m.lastReloadSuccessfulTimestamp = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
Namespace: "cortex",
Name: "alertmanager_config_last_reload_successful_seconds",
Help: "Timestamp of the last successful configuration reload.",
}, []string{"user"})

return m
Expand Down Expand Up @@ -314,12 +321,13 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi
for user, cfg := range cfgs {
err := am.setConfig(cfg)
if err != nil {
am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(1))
am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(0))
level.Warn(am.logger).Log("msg", "error applying config", "err", err)
continue
}

am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(0))
am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(1))
am.multitenantMetrics.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime()
}

am.alertmanagersMtx.Lock()
Expand All @@ -332,7 +340,8 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi
level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", user)
userAM.Pause()
delete(am.cfgs, user)
am.multitenantMetrics.invalidConfig.DeleteLabelValues(user)
am.multitenantMetrics.lastReloadSuccessful.DeleteLabelValues(user)
am.multitenantMetrics.lastReloadSuccessfulTimestamp.DeleteLabelValues(user)
level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", user)
}
}
Expand Down
44 changes: 22 additions & 22 deletions pkg/alertmanager/multitenant_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,11 @@ func TestLoadAllConfigs(t *testing.T) {
require.Equal(t, simpleConfigOne, currentConfig.RawConfig)

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
`), "cortex_alertmanager_config_invalid"))
# HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful.
# TYPE cortex_alertmanager_config_last_reload_successful gauge
cortex_alertmanager_config_last_reload_successful{user="user1"} 1
cortex_alertmanager_config_last_reload_successful{user="user2"} 1
`), "cortex_alertmanager_config_last_reload_successful"))

// Ensure when a 3rd config is added, it is synced correctly
mockStore.configs["user3"] = alerts.AlertConfigDesc{
Expand All @@ -113,12 +113,12 @@ func TestLoadAllConfigs(t *testing.T) {
require.Len(t, am.alertmanagers, 3)

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
cortex_alertmanager_config_invalid{user="user3"} 0
`), "cortex_alertmanager_config_invalid"))
# HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful.
# TYPE cortex_alertmanager_config_last_reload_successful gauge
cortex_alertmanager_config_last_reload_successful{user="user1"} 1
cortex_alertmanager_config_last_reload_successful{user="user2"} 1
cortex_alertmanager_config_last_reload_successful{user="user3"} 1
`), "cortex_alertmanager_config_last_reload_successful"))

// Ensure the config is updated
mockStore.configs["user1"] = alerts.AlertConfigDesc{
Expand Down Expand Up @@ -146,11 +146,11 @@ func TestLoadAllConfigs(t *testing.T) {
require.False(t, userAM.IsActive())

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
`), "cortex_alertmanager_config_invalid"))
# HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful.
# TYPE cortex_alertmanager_config_last_reload_successful gauge
cortex_alertmanager_config_last_reload_successful{user="user1"} 1
cortex_alertmanager_config_last_reload_successful{user="user2"} 1
`), "cortex_alertmanager_config_last_reload_successful"))

// Ensure when a 3rd config is re-added, it is synced correctly
mockStore.configs["user3"] = alerts.AlertConfigDesc{
Expand All @@ -170,12 +170,12 @@ func TestLoadAllConfigs(t *testing.T) {
require.True(t, userAM.IsActive())

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_config_invalid gauge
cortex_alertmanager_config_invalid{user="user1"} 0
cortex_alertmanager_config_invalid{user="user2"} 0
cortex_alertmanager_config_invalid{user="user3"} 0
`), "cortex_alertmanager_config_invalid"))
# HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful.
# TYPE cortex_alertmanager_config_last_reload_successful gauge
cortex_alertmanager_config_last_reload_successful{user="user1"} 1
cortex_alertmanager_config_last_reload_successful{user="user2"} 1
cortex_alertmanager_config_last_reload_successful{user="user3"} 1
`), "cortex_alertmanager_config_last_reload_successful"))
}

func TestAlertmanager_NoExternalURL(t *testing.T) {
Expand Down