Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## master / unreleased

* [CHANGE] Replace the metric `cortex_alertmanager_configs` with `cortex_alertmanager_invalid_config` exposed by Alertmanager. #2960
* [CHANGE] Experimental Delete Series: Change target flag for purger from `data-purger` to `purger`. #2777
* [CHANGE] Experimental TSDB: The max concurrent queries against the long-term storage, configured via `-experimental.tsdb.bucket-store.max-concurrent`, is now a limit shared across all tenants and not a per-tenant limit anymore. The default value has changed from `20` to `100` and the following new metrics have been added: #2797
* `cortex_bucket_stores_gate_queries_concurrent_max`
Expand Down
11 changes: 7 additions & 4 deletions integration/alertmanager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package main
import (
"context"
"testing"
"time"

"github.com/stretchr/testify/require"

Expand All @@ -29,7 +30,7 @@ func TestAlertmanager(t *testing.T) {
"",
)
require.NoError(t, s.StartAndWaitReady(alertmanager))
require.NoError(t, alertmanager.WaitSumMetrics(e2e.Equals(1), "cortex_alertmanager_configs"))
require.NoError(t, alertmanager.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_invalid_config"))

c, err := e2ecortex.NewClient("", "", alertmanager.HTTPEndpoint(), "", "user-1")
require.NoError(t, err)
Expand Down Expand Up @@ -67,7 +68,7 @@ func TestAlertmanagerStoreAPI(t *testing.T) {
)

require.NoError(t, s.StartAndWaitReady(am))
require.NoError(t, am.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_configs"))
require.NoError(t, am.WaitSumMetrics(e2e.Equals(1), "alertmanager_cluster_members"))

c, err := e2ecortex.NewClient("", "", am.HTTPEndpoint(), "", "user-1")
require.NoError(t, err)
Expand All @@ -79,7 +80,8 @@ func TestAlertmanagerStoreAPI(t *testing.T) {
err = c.SetAlertmanagerConfig(context.Background(), cortexAlertmanagerUserConfigYaml, map[string]string{})
require.NoError(t, err)

require.NoError(t, am.WaitSumMetrics(e2e.Equals(1), "cortex_alertmanager_configs"))
time.Sleep(2 * time.Second)
Copy link
Contributor Author

@gotjosh gotjosh Aug 3, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I solved the CI failure using a quick hack, but I dislike Sleep in tests as it is a general source of flaky tests. The problem here is the metric does not get initially registered (given it is per tenant) until much later.

A solution I can see would be to create a new method (or an argument to WaitSumMetrics) that waits for a metric to appear instead of bailing out if we don't see the metric on the initial metrics page load. I can see this being useful in other areas.

Please let me know and I'll be happy to make the change.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's work on it in a separate PR. Could you open an issue, please? I have this pending PR #2522 which could be used as a baseline to add a functional option to enable the logic you suggest.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks Marco, filed in #2975

require.NoError(t, am.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_invalid_config"))

cfg, err := c.GetAlertmanagerConfig(context.Background())
require.NoError(t, err)
Expand All @@ -95,7 +97,8 @@ func TestAlertmanagerStoreAPI(t *testing.T) {
err = c.DeleteAlertmanagerConfig(context.Background())
require.NoError(t, err)

require.NoError(t, am.WaitSumMetrics(e2e.Equals(0), "cortex_alertmanager_configs"))
time.Sleep(2 * time.Second)

cfg, err = c.GetAlertmanagerConfig(context.Background())
require.Error(t, err)
require.Nil(t, cfg)
Expand Down
29 changes: 12 additions & 17 deletions pkg/alertmanager/multitenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,6 @@ const (
// a URL derived from Config.AutoWebhookRoot
autoWebhookURL = "http://internal.monitor"

configStatusValid = "valid"
configStatusInvalid = "invalid"

statusPage = `
<!doctype html>
<html>
Expand Down Expand Up @@ -129,19 +126,17 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet) {
}

type multitenantAlertmanagerMetrics struct {
totalConfigs *prometheus.GaugeVec
invalidConfig *prometheus.GaugeVec
}

func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAlertmanagerMetrics {
m := &multitenantAlertmanagerMetrics{}

m.totalConfigs = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
m.invalidConfig = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
Namespace: "cortex",
Name: "alertmanager_configs",
Help: "How many configs the multitenant alertmanager knows about.",
}, []string{"status"})
m.totalConfigs.WithLabelValues(configStatusInvalid).Set(0)
m.totalConfigs.WithLabelValues(configStatusValid).Set(0)
Name: "alertmanager_invalid_config",
Help: "Whenever the Alertmanager config is invalid for a user.",
}, []string{"user"})

return m
}
Expand Down Expand Up @@ -311,15 +306,16 @@ func (am *MultitenantAlertmanager) poll() (map[string]alerts.AlertConfigDesc, er
}

func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfigDesc) {
invalid := 0 // Count the number of invalid configs as we go.

level.Debug(am.logger).Log("msg", "adding configurations", "num_configs", len(cfgs))
for _, cfg := range cfgs {
for user, cfg := range cfgs {
err := am.setConfig(cfg)
if err != nil {
invalid++
am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(1))
level.Warn(am.logger).Log("msg", "error applying config", "err", err)
continue
}

am.multitenantMetrics.invalidConfig.WithLabelValues(user).Set(float64(0))
}

am.alertmanagersMtx.Lock()
Expand All @@ -332,11 +328,10 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alerts.AlertConfi
level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", user)
userAM.Pause()
delete(am.cfgs, user)
am.multitenantMetrics.invalidConfig.DeleteLabelValues(user)
level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", user)
}
}
am.multitenantMetrics.totalConfigs.WithLabelValues(configStatusInvalid).Set(float64(invalid))
am.multitenantMetrics.totalConfigs.WithLabelValues(configStatusValid).Set(float64(len(am.cfgs) - invalid))
}

func (am *MultitenantAlertmanager) transformConfig(userID string, amConfig *amconfig.Config) (*amconfig.Config, error) {
Expand Down Expand Up @@ -407,7 +402,7 @@ func (am *MultitenantAlertmanager) setConfig(cfg alerts.AlertConfigDesc) error {
if am.fallbackConfig == "" {
return fmt.Errorf("blank Alertmanager configuration for %v", cfg.User)
}
level.Info(am.logger).Log("msg", "blank Alertmanager configuration; using fallback", "user_id", cfg.User)
level.Info(am.logger).Log("msg", "blank Alertmanager configuration; using fallback", "user", cfg.User)
userAmConfig, err = amconfig.Load(am.fallbackConfig)
if err != nil {
return fmt.Errorf("unable to load fallback configuration for %v: %v", cfg.User, err)
Expand Down
42 changes: 22 additions & 20 deletions pkg/alertmanager/multitenant_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,11 @@ func TestLoadAllConfigs(t *testing.T) {
require.Equal(t, simpleConfigOne, currentConfig.RawConfig)

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
# TYPE cortex_alertmanager_configs gauge
cortex_alertmanager_configs{status="valid"} 2
cortex_alertmanager_configs{status="invalid"} 0
`), "cortex_alertmanager_configs"))
# HELP cortex_alertmanager_invalid_config Whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_invalid_config gauge
cortex_alertmanager_invalid_config{user="user1"} 0
cortex_alertmanager_invalid_config{user="user2"} 0
`), "cortex_alertmanager_invalid_config"))

// Ensure when a 3rd config is added, it is synced correctly
mockStore.configs["user3"] = alerts.AlertConfigDesc{
Expand All @@ -113,11 +113,12 @@ func TestLoadAllConfigs(t *testing.T) {
require.Len(t, am.alertmanagers, 3)

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
# TYPE cortex_alertmanager_configs gauge
cortex_alertmanager_configs{status="valid"} 3
cortex_alertmanager_configs{status="invalid"} 0
`), "cortex_alertmanager_configs"))
# HELP cortex_alertmanager_invalid_config Whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_invalid_config gauge
cortex_alertmanager_invalid_config{user="user1"} 0
cortex_alertmanager_invalid_config{user="user2"} 0
cortex_alertmanager_invalid_config{user="user3"} 0
`), "cortex_alertmanager_invalid_config"))

// Ensure the config is updated
mockStore.configs["user1"] = alerts.AlertConfigDesc{
Expand Down Expand Up @@ -145,11 +146,11 @@ func TestLoadAllConfigs(t *testing.T) {
require.False(t, userAM.IsActive())

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
# TYPE cortex_alertmanager_configs gauge
cortex_alertmanager_configs{status="valid"} 2
cortex_alertmanager_configs{status="invalid"} 0
`), "cortex_alertmanager_configs"))
# HELP cortex_alertmanager_invalid_config Whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_invalid_config gauge
cortex_alertmanager_invalid_config{user="user1"} 0
cortex_alertmanager_invalid_config{user="user2"} 0
`), "cortex_alertmanager_invalid_config"))

// Ensure when a 3rd config is re-added, it is synced correctly
mockStore.configs["user3"] = alerts.AlertConfigDesc{
Expand All @@ -169,9 +170,10 @@ func TestLoadAllConfigs(t *testing.T) {
require.True(t, userAM.IsActive())

assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP cortex_alertmanager_configs How many configs the multitenant alertmanager knows about.
# TYPE cortex_alertmanager_configs gauge
cortex_alertmanager_configs{status="valid"} 3
cortex_alertmanager_configs{status="invalid"} 0
`), "cortex_alertmanager_configs"))
# HELP cortex_alertmanager_invalid_config Whenever the Alertmanager config is invalid for a user.
# TYPE cortex_alertmanager_invalid_config gauge
cortex_alertmanager_invalid_config{user="user1"} 0
cortex_alertmanager_invalid_config{user="user2"} 0
cortex_alertmanager_invalid_config{user="user3"} 0
`), "cortex_alertmanager_invalid_config"))
}