From 96dd698ce49b18c2023e98a1f8a8552b7456fef1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 1 Jun 2021 16:47:14 +0200 Subject: [PATCH 1/9] Add store limits. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 1 + docs/configuration/config-file-reference.md | 10 ++ pkg/alertmanager/alertmanager.go | 130 +++++++++++++++++++- pkg/alertmanager/alertmanager_metrics.go | 7 ++ pkg/alertmanager/alertmanager_test.go | 96 +++++++++++++++ pkg/alertmanager/multitenant.go | 9 +- pkg/alertmanager/multitenant_test.go | 10 ++ pkg/util/validation/limits.go | 12 ++ 8 files changed, 273 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5632efe9aed..48402d661f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,7 @@ * `memberlist_client_kv_store_value_tombstones_removed_total` * `memberlist_client_messages_to_broadcast_dropped_total` * [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-dispatcher-aggregation-groups` option to control max number of active dispatcher groups in Alertmanager (per tenant, also overrideable). When the limit is reached, Dispatcher produces log message and increases `cortex_alertmanager_dispatcher_aggregation_group_limit_reached_total` metric. #4254 +* [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-alerts-count` and `-alertmanager.max-alerts-size-bytes` to control max number of alerts and total size of alerts that single user can have in Alertmanager's memory. Adding more alerts will fail with a log message and incrementing `cortex_alertmanager_insert_alert_failures_total` metric (per-user). These limits can be overrided by using per-tenant overrides. * [BUGFIX] Purger: fix `Invalid null value in condition for column range` caused by `nil` value in range for WriteBatch query. #4128 * [BUGFIX] Ingester: fixed infrequent panic caused by a race condition between TSDB mmap-ed head chunks truncation and queries. #4176 * [BUGFIX] Alertmanager: fix Alertmanager status page if clustering via gossip is disabled or sharding is enabled. #4184 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index a0a84f7f80c..98bd93e7797 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -4180,6 +4180,16 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s # 0 = no limit. # CLI flag: -alertmanager.max-dispatcher-aggregation-groups [alertmanager_max_dispatcher_aggregation_groups: | default = 0] + +# Maximum number of alerts that single user can have. Inserting more alerts will +# fail with a log message and metric increment. 0 = no limit. +# CLI flag: -alertmanager.max-alerts-count +[alertmanager_max_alerts_count: | default = 0] + +# Maximum total size of alerts that single user can have. Inserting more alerts +# will fail with a log message and metric increment. 0 = no limit. 
+# CLI flag: -alertmanager.max-alerts-size-bytes +[alertmanager_max_alerts_size_bytes: | default = 0] ``` ### `redis_config` diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index b10937ea179..7403b5b25ad 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -113,6 +113,7 @@ type Alertmanager struct { configHashMetric prometheus.Gauge rateLimitedNotifications *prometheus.CounterVec + insertAlertFailures *prometheus.CounterVec } var ( @@ -166,8 +167,16 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { Name: "alertmanager_notification_rate_limited_total", Help: "Number of rate-limited notifications per integration.", }, []string{"integration"}), // "integration" is consistent with other alertmanager metrics. + + insertAlertFailures: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "alertmanager_insert_alert_failures_total", + Help: "Number of failures to insert new alerts to in-memory alert store.", + }, []string{"reason"}), } + am.insertAlertFailures.WithLabelValues(insertFailureTooManyAlerts) + am.insertAlertFailures.WithLabelValues(insertFailureAlertsTooBig) + am.registry = reg // We currently have 3 operational modes: @@ -241,7 +250,12 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { am.wg.Done() }() - am.alerts, err = mem.NewAlerts(context.Background(), am.marker, 30*time.Minute, nil, am.logger) + var callback mem.AlertStoreCallback + if am.cfg.Limits != nil { + callback = newAlertsLimiter(am.cfg.UserID, am.cfg.Limits, am.insertAlertFailures) + } + + am.alerts, err = mem.NewAlerts(context.Background(), am.marker, 30*time.Minute, callback, am.logger) if err != nil { return nil, fmt.Errorf("failed to create alerts: %v", err) } @@ -584,3 +598,117 @@ type dispatcherLimits struct { func (g *dispatcherLimits) MaxNumberOfAggregationGroups() int { return g.limits.AlertmanagerMaxDispatcherAggregationGroups(g.tenant) } + +var ( + errTooManyAlerts = "too many alerts, limit: %d" + errAlertsTooBig = "alerts too big, total size limit: %d bytes" +) + +const ( + insertFailureTooManyAlerts = "too_many_alerts" + insertFailureAlertsTooBig = "alerts_too_big" +) + +type alertsLimiter struct { + tenant string + limits Limits + + failureCounter *prometheus.CounterVec + + mx sync.Mutex + sizes map[model.Fingerprint]int + count int + totalSize int +} + +func newAlertsLimiter(tenant string, limits Limits, failureCounter *prometheus.CounterVec) *alertsLimiter { + return &alertsLimiter{ + tenant: tenant, + limits: limits, + sizes: map[model.Fingerprint]int{}, + failureCounter: failureCounter, + } +} + +func (a *alertsLimiter) PreStore(alert *types.Alert, existing bool) error { + if alert == nil { + return nil + } + + countLimit := a.limits.AlertmanagerMaxAlertsCount(a.tenant) + sizeLimit := a.limits.AlertmanagerMaxAlertsSizeBytes(a.tenant) + + newSize := alertSize(alert.Alert) + + a.mx.Lock() + defer a.mx.Unlock() + + if !existing { + if countLimit > 0 && (a.count+1) > countLimit { + a.failureCounter.WithLabelValues(insertFailureTooManyAlerts).Inc() + return fmt.Errorf(errTooManyAlerts, countLimit) + } + + if sizeLimit > 0 && (a.totalSize+newSize) > sizeLimit { + a.failureCounter.WithLabelValues(insertFailureAlertsTooBig).Inc() + return fmt.Errorf(errAlertsTooBig, sizeLimit) + } + } + + return nil +} + +func (a *alertsLimiter) PostStore(alert *types.Alert, existing bool) { + if alert == nil { + return + } + + newSize := alertSize(alert.Alert) + + a.mx.Lock() + defer a.mx.Unlock() + 
+ fp := alert.Fingerprint() + if existing { + a.totalSize -= a.sizes[fp] + } else { + a.count++ + } + a.sizes[fp] = newSize + a.totalSize += newSize +} + +func (a *alertsLimiter) PostDelete(alert *types.Alert) { + if alert == nil { + return + } + + a.mx.Lock() + defer a.mx.Unlock() + + fp := alert.Fingerprint() + a.totalSize -= a.sizes[fp] + delete(a.sizes, fp) + a.count-- +} + +func (a *alertsLimiter) currentStats() (count, totalSize int) { + a.mx.Lock() + defer a.mx.Unlock() + + return a.count, a.totalSize +} + +func alertSize(alert model.Alert) int { + size := 0 + for l, v := range alert.Labels { + size += len(l) + size += len(v) + } + for l, v := range alert.Annotations { + size += len(l) + size += len(v) + } + size += len(alert.GeneratorURL) + return size +} diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go index dc7cc8c3f05..e3892165e0b 100644 --- a/pkg/alertmanager/alertmanager_metrics.go +++ b/pkg/alertmanager/alertmanager_metrics.go @@ -61,6 +61,7 @@ type alertmanagerMetrics struct { notificationRateLimited *prometheus.Desc dispatcherAggregationGroupsLimitReached *prometheus.Desc + insertAlertFailures *prometheus.Desc } func newAlertmanagerMetrics() *alertmanagerMetrics { @@ -214,6 +215,10 @@ func newAlertmanagerMetrics() *alertmanagerMetrics { "cortex_alertmanager_dispatcher_aggregation_group_limit_reached_total", "Number of times when dispatcher failed to create new aggregation group due to limit.", []string{"user"}, nil), + insertAlertFailures: prometheus.NewDesc( + "cortex_alertmanager_insert_alert_failures_total", + "Total number of failures to store alert due to hitting alertmanager limits.", + []string{"user"}, nil), } } @@ -265,6 +270,7 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) { out <- m.persistFailed out <- m.notificationRateLimited out <- m.dispatcherAggregationGroupsLimitReached + out <- m.insertAlertFailures } func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { @@ -313,4 +319,5 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfCountersPerUserWithLabels(out, m.notificationRateLimited, "alertmanager_notification_rate_limited_total", "integration") data.SendSumOfCountersPerUser(out, m.dispatcherAggregationGroupsLimitReached, "alertmanager_dispatcher_aggregation_group_limit_reached_total") + data.SendSumOfCountersPerUser(out, m.insertAlertFailures, "alertmanager_insert_alert_failures_total") } diff --git a/pkg/alertmanager/alertmanager_test.go b/pkg/alertmanager/alertmanager_test.go index 9464187ff29..b31770b70d3 100644 --- a/pkg/alertmanager/alertmanager_test.go +++ b/pkg/alertmanager/alertmanager_test.go @@ -13,6 +13,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/common/model" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/cortexproject/cortex/pkg/util/test" @@ -110,3 +111,98 @@ route: `, expectedFailures)), "alertmanager_dispatcher_aggregation_group_limit_reached_total") }) } + +var ( + alert1 = model.Alert{ + Labels: model.LabelSet{"alert": "first"}, + Annotations: model.LabelSet{"job": "test"}, + StartsAt: time.Now(), + EndsAt: time.Now(), + GeneratorURL: "some URL", + } + alert1Size = alertSize(alert1) + + alert2 = model.Alert{ + Labels: model.LabelSet{"alert": "second"}, + Annotations: model.LabelSet{"job": "test", "cluster": "prod"}, + StartsAt: time.Now(), + EndsAt: time.Now(), + GeneratorURL: 
"some URL", + } + alert2Size = alertSize(alert2) +) + +type callbackOp struct { + alert *types.Alert + existing bool + delete bool // true=delete, false=insert. + expectedInsertError error + + // expected values after operation. + expectedCount int + expectedTotalSize int +} + +func TestAlertsLimiterWithNoLimits(t *testing.T) { + ops := []callbackOp{ + {alert: &types.Alert{Alert: alert1}, existing: false, expectedCount: 1, expectedTotalSize: alert1Size}, + {alert: &types.Alert{Alert: alert2}, existing: false, expectedCount: 2, expectedTotalSize: alert1Size + alert2Size}, + {alert: &types.Alert{Alert: alert2}, delete: true, expectedCount: 1, expectedTotalSize: alert1Size}, + {alert: &types.Alert{Alert: alert1}, delete: true, expectedCount: 0, expectedTotalSize: 0}, + } + + testLimiter(t, &mockAlertManagerLimits{}, ops) +} + +func TestAlertsLimiterWithCountLimit(t *testing.T) { + ops := []callbackOp{ + {alert: &types.Alert{Alert: alert1}, existing: false, expectedCount: 1, expectedTotalSize: alert1Size}, + {alert: &types.Alert{Alert: alert2}, existing: false, expectedInsertError: fmt.Errorf(errTooManyAlerts, 1), expectedCount: 1, expectedTotalSize: alert1Size}, + {alert: &types.Alert{Alert: alert1}, delete: true, expectedCount: 0, expectedTotalSize: 0}, + + {alert: &types.Alert{Alert: alert2}, existing: false, expectedCount: 1, expectedTotalSize: alert2Size}, + {alert: &types.Alert{Alert: alert2}, delete: true, expectedCount: 0, expectedTotalSize: 0}, + } + + testLimiter(t, &mockAlertManagerLimits{maxAlertsCount: 1}, ops) +} + +func TestAlertsLimiterWithSizeLimit(t *testing.T) { + ops := []callbackOp{ + {alert: &types.Alert{Alert: alert1}, existing: false, expectedCount: 1, expectedTotalSize: alert1Size}, + {alert: &types.Alert{Alert: alert2}, existing: false, expectedInsertError: fmt.Errorf(errAlertsTooBig, alert2Size), expectedCount: 1, expectedTotalSize: alert1Size}, + {alert: &types.Alert{Alert: alert1}, delete: true, expectedCount: 0, expectedTotalSize: 0}, + + {alert: &types.Alert{Alert: alert2}, existing: false, expectedCount: 1, expectedTotalSize: alert2Size}, + {alert: &types.Alert{Alert: alert2}, delete: true, expectedCount: 0, expectedTotalSize: 0}, + } + + // Prerequisite for this test. We set size limit to alert2Size, but inserting alert1 first will prevent insertion of alert2. + require.True(t, alert2Size > alert1Size) + + testLimiter(t, &mockAlertManagerLimits{maxAlertsSizeBytes: alert2Size}, ops) +} + +// testLimiter sends sequence of alerts to limiter, and checks if limiter updated reacted correctly. 
+func testLimiter(t *testing.T, limits Limits, ops []callbackOp) { + counter := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user", "reason"}) + + limiter := newAlertsLimiter("test", limits, counter) + + for ix, op := range ops { + if op.delete { + limiter.PostDelete(op.alert) + } else { + err := limiter.PreStore(op.alert, op.existing) + require.Equal(t, op.expectedInsertError, err, "op %d", ix) + if err == nil { + limiter.PostStore(op.alert, op.existing) + } + } + + count, totalSize := limiter.currentStats() + + assert.Equal(t, op.expectedCount, count, "wrong count, op %d", ix) + assert.Equal(t, op.expectedTotalSize, totalSize, "wrong total size, op %d", ix) + } +} diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index ad9aa1b591f..a5f52ea1ed3 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -220,9 +220,16 @@ type Limits interface { // AlertmanagerMaxTemplateSize returns max size of individual template. 0 = no limit. AlertmanagerMaxTemplateSize(tenant string) int - // AlertmanagerMaxNumberOfDispatcherAggregationGroups returns maximum number of aggregation groups in Alertmanager's dispatcher that a tenant can have. + // AlertmanagerMaxDispatcherAggregationGroups returns maximum number of aggregation groups in Alertmanager's dispatcher that a tenant can have. // Each aggregation group consumes single goroutine. 0 = unlimited. AlertmanagerMaxDispatcherAggregationGroups(t string) int + + // AlertmanagerMaxAlertsCount returns max number of alerts that tenant can have active at the same time. 0 = no limit. + AlertmanagerMaxAlertsCount(tenant string) int + + // AlertmanagerMaxAlertsSizeBytes returns total max size of alerts that tenant can have active at the same time. 0 = no limit. + // Size of the alert is computed from alert labels and annotations. 
+ AlertmanagerMaxAlertsSizeBytes(tenant string) int } // A MultitenantAlertmanager manages Alertmanager instances for multiple diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index a4ec86f2b8f..dc60301eb8e 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -2025,6 +2025,8 @@ type mockAlertManagerLimits struct { maxTemplatesCount int maxSizeOfTemplate int maxDispatcherAggregationGroups int + maxAlertsCount int + maxAlertsSizeBytes int } func (m *mockAlertManagerLimits) AlertmanagerMaxConfigSize(tenant string) int { @@ -2058,3 +2060,11 @@ func (m *mockAlertManagerLimits) NotificationBurstSize(_ string, integration str func (m *mockAlertManagerLimits) AlertmanagerMaxDispatcherAggregationGroups(_ string) int { return m.maxDispatcherAggregationGroups } + +func (m *mockAlertManagerLimits) AlertmanagerMaxAlertsCount(_ string) int { + return m.maxAlertsCount +} + +func (m *mockAlertManagerLimits) AlertmanagerMaxAlertsSizeBytes(_ string) int { + return m.maxAlertsSizeBytes +} diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 4ea60b6b61c..44734a04a1e 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -110,6 +110,8 @@ type Limits struct { AlertmanagerMaxTemplatesCount int `yaml:"alertmanager_max_templates_count" json:"alertmanager_max_templates_count"` AlertmanagerMaxTemplateSizeBytes int `yaml:"alertmanager_max_template_size_bytes" json:"alertmanager_max_template_size_bytes"` AlertmanagerMaxDispatcherAggregationGroups int `yaml:"alertmanager_max_dispatcher_aggregation_groups" json:"alertmanager_max_dispatcher_aggregation_groups"` + AlertmanagerMaxAlertsCount int `yaml:"alertmanager_max_alerts_count" json:"alertmanager_max_alerts_count"` + AlertmanagerMaxAlertsSizeBytes int `yaml:"alertmanager_max_alerts_size_bytes" json:"alertmanager_max_alerts_size_bytes"` } // RegisterFlags adds the flags required to config this to the given FlagSet @@ -183,6 +185,8 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.IntVar(&l.AlertmanagerMaxTemplatesCount, "alertmanager.max-templates-count", 0, "Maximum number of templates in tenant's Alertmanager configuration uploaded via Alertmanager API. 0 = no limit.") f.IntVar(&l.AlertmanagerMaxTemplateSizeBytes, "alertmanager.max-template-size-bytes", 0, "Maximum size of single template in tenant's Alertmanager configuration uploaded via Alertmanager API. 0 = no limit.") f.IntVar(&l.AlertmanagerMaxDispatcherAggregationGroups, "alertmanager.max-dispatcher-aggregation-groups", 0, "Maximum number of aggregation groups in Alertmanager's dispatcher that a tenant can have. Each active aggregation group uses single goroutine. When the limit is reached, dispatcher will not dispatch alerts that belong to additional aggregation groups, but existing groups will keep working properly. 0 = no limit.") + f.IntVar(&l.AlertmanagerMaxAlertsCount, "alertmanager.max-alerts-count", 0, "Maximum number of alerts that single user can have. Inserting more alerts will fail with a log message and metric increment. 0 = no limit.") + f.IntVar(&l.AlertmanagerMaxAlertsSizeBytes, "alertmanager.max-alerts-size-bytes", 0, "Maximum total size of alerts that single user can have. Inserting more alerts will fail with a log message and metric increment. 
0 = no limit.") } // Validate the limits config and returns an error if the validation @@ -611,6 +615,14 @@ func (o *Overrides) AlertmanagerMaxDispatcherAggregationGroups(userID string) in return o.getOverridesForUser(userID).AlertmanagerMaxDispatcherAggregationGroups } +func (o *Overrides) AlertmanagerMaxAlertsCount(userID string) int { + return o.getOverridesForUser(userID).AlertmanagerMaxAlertsCount +} + +func (o *Overrides) AlertmanagerMaxAlertsSizeBytes(userID string) int { + return o.getOverridesForUser(userID).AlertmanagerMaxAlertsSizeBytes +} + func (o *Overrides) getOverridesForUser(userID string) *Limits { if o.tenantLimits != nil { l := o.tenantLimits.ByUserID(userID) From b6536dad54f8a213b2f5484f92d5aaeaee20e480 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Thu, 3 Jun 2021 14:59:41 +0200 Subject: [PATCH 2/9] Expose alerts limiter metrics. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/alertmanager/alertmanager.go | 36 +++++++++---- pkg/alertmanager/alertmanager_metrics.go | 14 +++++ pkg/alertmanager/alertmanager_metrics_test.go | 54 +++++++++++++++++++ 3 files changed, 95 insertions(+), 9 deletions(-) diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 7403b5b25ad..f32b80a4a34 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -113,7 +113,6 @@ type Alertmanager struct { configHashMetric prometheus.Gauge rateLimitedNotifications *prometheus.CounterVec - insertAlertFailures *prometheus.CounterVec } var ( @@ -168,15 +167,8 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { Help: "Number of rate-limited notifications per integration.", }, []string{"integration"}), // "integration" is consistent with other alertmanager metrics. 
- insertAlertFailures: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ - Name: "alertmanager_insert_alert_failures_total", - Help: "Number of failures to insert new alerts to in-memory alert store.", - }, []string{"reason"}), } - am.insertAlertFailures.WithLabelValues(insertFailureTooManyAlerts) - am.insertAlertFailures.WithLabelValues(insertFailureAlertsTooBig) - am.registry = reg // We currently have 3 operational modes: @@ -252,7 +244,33 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { var callback mem.AlertStoreCallback if am.cfg.Limits != nil { - callback = newAlertsLimiter(am.cfg.UserID, am.cfg.Limits, am.insertAlertFailures) + insertAlertFailures := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "alertmanager_insert_alert_failures_total", + Help: "Number of failures to insert new alerts to in-memory alert store.", + }, []string{"reason"}) + + insertAlertFailures.WithLabelValues(insertFailureTooManyAlerts) + insertAlertFailures.WithLabelValues(insertFailureAlertsTooBig) + + limiter := newAlertsLimiter(am.cfg.UserID, am.cfg.Limits, insertAlertFailures) + + promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{ + Name: "alertmanager_alerts_limiter_current_alerts_count", + Help: "Number of alerts tracked by alerts limiter.", + }, func() float64 { + c, _ := limiter.currentStats() + return float64(c) + }) + + promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{ + Name: "alertmanager_alerts_limiter_current_alerts_size_bytes", + Help: "Total size of alerts tracked by alerts limiter.", + }, func() float64 { + _, s := limiter.currentStats() + return float64(s) + }) + + callback = limiter } am.alerts, err = mem.NewAlerts(context.Background(), am.marker, 30*time.Minute, callback, am.logger) diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go index e3892165e0b..995041ddb62 100644 --- a/pkg/alertmanager/alertmanager_metrics.go +++ b/pkg/alertmanager/alertmanager_metrics.go @@ -62,6 +62,8 @@ type alertmanagerMetrics struct { notificationRateLimited *prometheus.Desc dispatcherAggregationGroupsLimitReached *prometheus.Desc insertAlertFailures *prometheus.Desc + alertsLimiterAlertsCount *prometheus.Desc + alertsLimiterAlertsSize *prometheus.Desc } func newAlertmanagerMetrics() *alertmanagerMetrics { @@ -219,6 +221,14 @@ func newAlertmanagerMetrics() *alertmanagerMetrics { "cortex_alertmanager_insert_alert_failures_total", "Total number of failures to store alert due to hitting alertmanager limits.", []string{"user"}, nil), + alertsLimiterAlertsCount: prometheus.NewDesc( + "cortex_alertmanager_alerts_limiter_current_alerts_count", + "Number of alerts tracked by alerts limiter.", + []string{"user"}, nil), + alertsLimiterAlertsSize: prometheus.NewDesc( + "cortex_alertmanager_alerts_limiter_current_alerts_size_bytes", + "Total size of alerts tracked by alerts limiter.", + []string{"user"}, nil), } } @@ -271,6 +281,8 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) { out <- m.notificationRateLimited out <- m.dispatcherAggregationGroupsLimitReached out <- m.insertAlertFailures + out <- m.alertsLimiterAlertsCount + out <- m.alertsLimiterAlertsSize } func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { @@ -320,4 +332,6 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfCountersPerUserWithLabels(out, m.notificationRateLimited, "alertmanager_notification_rate_limited_total", "integration") data.SendSumOfCountersPerUser(out, 
m.dispatcherAggregationGroupsLimitReached, "alertmanager_dispatcher_aggregation_group_limit_reached_total") data.SendSumOfCountersPerUser(out, m.insertAlertFailures, "alertmanager_insert_alert_failures_total") + data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsCount, "alertmanager_alerts_limiter_current_alerts_count") + data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsSize, "alertmanager_alerts_limiter_current_alerts_size_bytes") } diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go index 32d0071e2a5..3b0a88b3f65 100644 --- a/pkg/alertmanager/alertmanager_metrics_test.go +++ b/pkg/alertmanager/alertmanager_metrics_test.go @@ -274,6 +274,22 @@ func TestAlertmanagerMetricsStore(t *testing.T) { # HELP cortex_alertmanager_state_persist_total Number of times we have tried to persist the running state to storage. # TYPE cortex_alertmanager_state_persist_total counter cortex_alertmanager_state_persist_total 0 + + # HELP cortex_alertmanager_alerts_limiter_current_alerts_count Number of alerts tracked by alerts limiter. + # TYPE cortex_alertmanager_alerts_limiter_current_alerts_count gauge + cortex_alertmanager_alerts_limiter_current_alerts_count{user="user1"} 10 + cortex_alertmanager_alerts_limiter_current_alerts_count{user="user2"} 100 + cortex_alertmanager_alerts_limiter_current_alerts_count{user="user3"} 1000 + # HELP cortex_alertmanager_alerts_limiter_current_alerts_size_bytes Total size of alerts tracked by alerts limiter. + # TYPE cortex_alertmanager_alerts_limiter_current_alerts_size_bytes gauge + cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user1"} 100 + cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user2"} 1000 + cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user3"} 10000 + # HELP cortex_alertmanager_insert_alert_failures_total Total number of failures to store alert due to hitting alertmanager limits. 
+ # TYPE cortex_alertmanager_insert_alert_failures_total counter + cortex_alertmanager_insert_alert_failures_total{user="user1"} 11 + cortex_alertmanager_insert_alert_failures_total{user="user2"} 110 + cortex_alertmanager_insert_alert_failures_total{user="user3"} 1100 `)) require.NoError(t, err) } @@ -838,6 +854,12 @@ func populateAlertmanager(base float64) *prometheus.Registry { v2APIMetrics.invalid.Add(base) v2APIMetrics.resolved.Add(base * 3) + lm := newLimiterMetrics(reg) + lm.count.Set(10 * base) + lm.size.Set(100 * base) + lm.insertFailures.WithLabelValues(insertFailureTooManyAlerts).Add(7 * base) + lm.insertFailures.WithLabelValues(insertFailureAlertsTooBig).Add(4 * base) + return reg } @@ -1041,3 +1063,35 @@ func newAPIMetrics(version string, r prometheus.Registerer) *apiMetrics { invalid: numInvalidAlerts, } } + +type limiterMetrics struct { + count prometheus.Gauge + size prometheus.Gauge + insertFailures *prometheus.CounterVec +} + +func newLimiterMetrics(r prometheus.Registerer) *limiterMetrics { + count := promauto.With(r).NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_alerts_limiter_current_alerts_count", + Help: "Number of alerts tracked by alerts limiter.", + }) + + size := promauto.With(r).NewGauge(prometheus.GaugeOpts{ + Name: "alertmanager_alerts_limiter_current_alerts_size_bytes", + Help: "Total size of alerts tracked by alerts limiter.", + }) + + insertAlertFailures := promauto.With(r).NewCounterVec(prometheus.CounterOpts{ + Name: "alertmanager_insert_alert_failures_total", + Help: "Number of failures to insert new alerts to in-memory alert store.", + }, []string{"reason"}) + + insertAlertFailures.WithLabelValues(insertFailureTooManyAlerts) + insertAlertFailures.WithLabelValues(insertFailureAlertsTooBig) + + return &limiterMetrics{ + count: count, + size: size, + insertFailures: insertAlertFailures, + } +} From 354aa2e605caaab6bd4b52592c1c721b97da501c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Thu, 3 Jun 2021 15:04:45 +0200 Subject: [PATCH 3/9] Fix tests. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/alertmanager/alertmanager_metrics_test.go | 30 +++++++++++++++++++ pkg/alertmanager/alertmanager_test.go | 2 +- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go index 3b0a88b3f65..c1a79479484 100644 --- a/pkg/alertmanager/alertmanager_metrics_test.go +++ b/pkg/alertmanager/alertmanager_metrics_test.go @@ -573,6 +573,23 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) { # HELP cortex_alertmanager_state_persist_total Number of times we have tried to persist the running state to storage. # TYPE cortex_alertmanager_state_persist_total counter cortex_alertmanager_state_persist_total 0 + + # HELP cortex_alertmanager_alerts_limiter_current_alerts_count Number of alerts tracked by alerts limiter. + # TYPE cortex_alertmanager_alerts_limiter_current_alerts_count gauge + cortex_alertmanager_alerts_limiter_current_alerts_count{user="user1"} 10 + cortex_alertmanager_alerts_limiter_current_alerts_count{user="user2"} 100 + cortex_alertmanager_alerts_limiter_current_alerts_count{user="user3"} 1000 + # HELP cortex_alertmanager_alerts_limiter_current_alerts_size_bytes Total size of alerts tracked by alerts limiter. 
+ # TYPE cortex_alertmanager_alerts_limiter_current_alerts_size_bytes gauge + cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user1"} 100 + cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user2"} 1000 + cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user3"} 10000 + # HELP cortex_alertmanager_insert_alert_failures_total Total number of failures to store alert due to hitting alertmanager limits. + # TYPE cortex_alertmanager_insert_alert_failures_total counter + cortex_alertmanager_insert_alert_failures_total{user="user1"} 11 + cortex_alertmanager_insert_alert_failures_total{user="user2"} 110 + cortex_alertmanager_insert_alert_failures_total{user="user3"} 1100 + `)) require.NoError(t, err) @@ -804,6 +821,19 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) { # HELP cortex_alertmanager_state_persist_total Number of times we have tried to persist the running state to storage. # TYPE cortex_alertmanager_state_persist_total counter cortex_alertmanager_state_persist_total 0 + + # HELP cortex_alertmanager_alerts_limiter_current_alerts_count Number of alerts tracked by alerts limiter. + # TYPE cortex_alertmanager_alerts_limiter_current_alerts_count gauge + cortex_alertmanager_alerts_limiter_current_alerts_count{user="user1"} 10 + cortex_alertmanager_alerts_limiter_current_alerts_count{user="user2"} 100 + # HELP cortex_alertmanager_alerts_limiter_current_alerts_size_bytes Total size of alerts tracked by alerts limiter. + # TYPE cortex_alertmanager_alerts_limiter_current_alerts_size_bytes gauge + cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user1"} 100 + cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user2"} 1000 + # HELP cortex_alertmanager_insert_alert_failures_total Total number of failures to store alert due to hitting alertmanager limits. + # TYPE cortex_alertmanager_insert_alert_failures_total counter + cortex_alertmanager_insert_alert_failures_total{user="user1"} 11 + cortex_alertmanager_insert_alert_failures_total{user="user2"} 110 `)) require.NoError(t, err) } diff --git a/pkg/alertmanager/alertmanager_test.go b/pkg/alertmanager/alertmanager_test.go index b31770b70d3..dca81be1d53 100644 --- a/pkg/alertmanager/alertmanager_test.go +++ b/pkg/alertmanager/alertmanager_test.go @@ -185,7 +185,7 @@ func TestAlertsLimiterWithSizeLimit(t *testing.T) { // testLimiter sends sequence of alerts to limiter, and checks if limiter updated reacted correctly. 
func testLimiter(t *testing.T, limits Limits, ops []callbackOp) { - counter := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user", "reason"}) + counter := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"reason"}) limiter := newAlertsLimiter("test", limits, counter) From 4b51e9b1940d3c8a52790abe7c5a08402b6a9035 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Thu, 3 Jun 2021 15:23:41 +0200 Subject: [PATCH 4/9] CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 48402d661f2..ffd0bc88208 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,7 +39,7 @@ * `memberlist_client_kv_store_value_tombstones_removed_total` * `memberlist_client_messages_to_broadcast_dropped_total` * [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-dispatcher-aggregation-groups` option to control max number of active dispatcher groups in Alertmanager (per tenant, also overrideable). When the limit is reached, Dispatcher produces log message and increases `cortex_alertmanager_dispatcher_aggregation_group_limit_reached_total` metric. #4254 -* [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-alerts-count` and `-alertmanager.max-alerts-size-bytes` to control max number of alerts and total size of alerts that single user can have in Alertmanager's memory. Adding more alerts will fail with a log message and incrementing `cortex_alertmanager_insert_alert_failures_total` metric (per-user). These limits can be overrided by using per-tenant overrides. +* [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-alerts-count` and `-alertmanager.max-alerts-size-bytes` to control max number of alerts and total size of alerts that single user can have in Alertmanager's memory. Adding more alerts will fail with a log message and incrementing `cortex_alertmanager_insert_alert_failures_total` metric (per-user). These limits can be overrided by using per-tenant overrides. Current values are tracked in `cortex_alertmanager_alerts_limiter_current_alerts_count` and `cortex_alertmanager_alerts_limiter_current_alerts_size_bytes` metrics. #4253 * [BUGFIX] Purger: fix `Invalid null value in condition for column range` caused by `nil` value in range for WriteBatch query. #4128 * [BUGFIX] Ingester: fixed infrequent panic caused by a race condition between TSDB mmap-ed head chunks truncation and queries. #4176 * [BUGFIX] Alertmanager: fix Alertmanager status page if clustering via gossip is disabled or sharding is enabled. #4184 From 7ffe60b3aadca2b322a44967ddb7d9c00a24c1d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Tue, 8 Jun 2021 09:16:57 +0200 Subject: [PATCH 5/9] Address review feedback. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 2 +- docs/configuration/config-file-reference.md | 9 +++--- pkg/alertmanager/alertmanager.go | 10 +++++-- pkg/alertmanager/alertmanager_metrics.go | 4 +-- pkg/alertmanager/alertmanager_metrics_test.go | 30 +++++++++---------- pkg/alertmanager/multitenant.go | 2 +- pkg/util/validation/limits.go | 4 +-- 7 files changed, 33 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ffd0bc88208..fc1f328030e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,7 +39,7 @@ * `memberlist_client_kv_store_value_tombstones_removed_total` * `memberlist_client_messages_to_broadcast_dropped_total` * [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-dispatcher-aggregation-groups` option to control max number of active dispatcher groups in Alertmanager (per tenant, also overrideable). When the limit is reached, Dispatcher produces log message and increases `cortex_alertmanager_dispatcher_aggregation_group_limit_reached_total` metric. #4254 -* [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-alerts-count` and `-alertmanager.max-alerts-size-bytes` to control max number of alerts and total size of alerts that single user can have in Alertmanager's memory. Adding more alerts will fail with a log message and incrementing `cortex_alertmanager_insert_alert_failures_total` metric (per-user). These limits can be overrided by using per-tenant overrides. Current values are tracked in `cortex_alertmanager_alerts_limiter_current_alerts_count` and `cortex_alertmanager_alerts_limiter_current_alerts_size_bytes` metrics. #4253 +* [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-alerts-count` and `-alertmanager.max-alerts-size-bytes` to control max number of alerts and total size of alerts that a single user can have in Alertmanager's memory. Adding more alerts will fail with a log message and incrementing `cortex_alertmanager_alerts_insert_failures_total` metric (per-user). These limits can be overrided by using per-tenant overrides. Current values are tracked in `cortex_alertmanager_alerts_limiter_current_alerts_count` and `cortex_alertmanager_alerts_limiter_current_alerts_size_bytes` metrics. #4253 * [BUGFIX] Purger: fix `Invalid null value in condition for column range` caused by `nil` value in range for WriteBatch query. #4128 * [BUGFIX] Ingester: fixed infrequent panic caused by a race condition between TSDB mmap-ed head chunks truncation and queries. #4176 * [BUGFIX] Alertmanager: fix Alertmanager status page if clustering via gossip is disabled or sharding is enabled. #4184 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 98bd93e7797..56100e0621b 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -4181,13 +4181,14 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s # CLI flag: -alertmanager.max-dispatcher-aggregation-groups [alertmanager_max_dispatcher_aggregation_groups: | default = 0] -# Maximum number of alerts that single user can have. Inserting more alerts will -# fail with a log message and metric increment. 0 = no limit. +# Maximum number of alerts that a single user can have. Inserting more alerts +# will fail with a log message and metric increment. 0 = no limit. 
# CLI flag: -alertmanager.max-alerts-count [alertmanager_max_alerts_count: | default = 0] -# Maximum total size of alerts that single user can have. Inserting more alerts -# will fail with a log message and metric increment. 0 = no limit. +# Maximum total size of alerts that a single user can have, alert size is the +# sum of the bytes of its labels, annotations and generatorURL. Inserting more +# alerts will fail with a log message and metric increment. 0 = no limit. # CLI flag: -alertmanager.max-alerts-size-bytes [alertmanager_max_alerts_size_bytes: | default = 0] ``` diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index f32b80a4a34..d7ae352a69f 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -245,7 +245,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { var callback mem.AlertStoreCallback if am.cfg.Limits != nil { insertAlertFailures := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ - Name: "alertmanager_insert_alert_failures_total", + Name: "alertmanager_alerts_insert_failures_total", Help: "Number of failures to insert new alerts to in-memory alert store.", }, []string{"reason"}) @@ -627,6 +627,9 @@ const ( insertFailureAlertsTooBig = "alerts_too_big" ) +// alertsLimiter limits the number and size of alerts being received by the Alertmanager. +// We consider an alert unique based on its fingerprint (a hash of its labels) and +// its size it's determined by the sum of bytes of its labels, annotations, and generator URL. type alertsLimiter struct { tenant string limits Limits @@ -682,11 +685,11 @@ func (a *alertsLimiter) PostStore(alert *types.Alert, existing bool) { } newSize := alertSize(alert.Alert) + fp := alert.Fingerprint() a.mx.Lock() defer a.mx.Unlock() - fp := alert.Fingerprint() if existing { a.totalSize -= a.sizes[fp] } else { @@ -701,10 +704,11 @@ func (a *alertsLimiter) PostDelete(alert *types.Alert) { return } + fp := alert.Fingerprint() + a.mx.Lock() defer a.mx.Unlock() - fp := alert.Fingerprint() a.totalSize -= a.sizes[fp] delete(a.sizes, fp) a.count-- diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go index 995041ddb62..4d891c0d7ef 100644 --- a/pkg/alertmanager/alertmanager_metrics.go +++ b/pkg/alertmanager/alertmanager_metrics.go @@ -218,7 +218,7 @@ func newAlertmanagerMetrics() *alertmanagerMetrics { "Number of times when dispatcher failed to create new aggregation group due to limit.", []string{"user"}, nil), insertAlertFailures: prometheus.NewDesc( - "cortex_alertmanager_insert_alert_failures_total", + "cortex_alertmanager_alerts_insert_failures_total", "Total number of failures to store alert due to hitting alertmanager limits.", []string{"user"}, nil), alertsLimiterAlertsCount: prometheus.NewDesc( @@ -331,7 +331,7 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfCountersPerUserWithLabels(out, m.notificationRateLimited, "alertmanager_notification_rate_limited_total", "integration") data.SendSumOfCountersPerUser(out, m.dispatcherAggregationGroupsLimitReached, "alertmanager_dispatcher_aggregation_group_limit_reached_total") - data.SendSumOfCountersPerUser(out, m.insertAlertFailures, "alertmanager_insert_alert_failures_total") + data.SendSumOfCountersPerUser(out, m.insertAlertFailures, "alertmanager_alerts_insert_failures_total") data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsCount, "alertmanager_alerts_limiter_current_alerts_count") data.SendSumOfGaugesPerUser(out, 
m.alertsLimiterAlertsSize, "alertmanager_alerts_limiter_current_alerts_size_bytes") } diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go index c1a79479484..d869cab9a71 100644 --- a/pkg/alertmanager/alertmanager_metrics_test.go +++ b/pkg/alertmanager/alertmanager_metrics_test.go @@ -285,11 +285,11 @@ func TestAlertmanagerMetricsStore(t *testing.T) { cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user1"} 100 cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user2"} 1000 cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user3"} 10000 - # HELP cortex_alertmanager_insert_alert_failures_total Total number of failures to store alert due to hitting alertmanager limits. - # TYPE cortex_alertmanager_insert_alert_failures_total counter - cortex_alertmanager_insert_alert_failures_total{user="user1"} 11 - cortex_alertmanager_insert_alert_failures_total{user="user2"} 110 - cortex_alertmanager_insert_alert_failures_total{user="user3"} 1100 + # HELP cortex_alertmanager_alerts_insert_failures_total Total number of failures to store alert due to hitting alertmanager limits. + # TYPE cortex_alertmanager_alerts_insert_failures_total counter + cortex_alertmanager_alerts_insert_failures_total{user="user1"} 11 + cortex_alertmanager_alerts_insert_failures_total{user="user2"} 110 + cortex_alertmanager_alerts_insert_failures_total{user="user3"} 1100 `)) require.NoError(t, err) } @@ -584,11 +584,11 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) { cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user1"} 100 cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user2"} 1000 cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user3"} 10000 - # HELP cortex_alertmanager_insert_alert_failures_total Total number of failures to store alert due to hitting alertmanager limits. - # TYPE cortex_alertmanager_insert_alert_failures_total counter - cortex_alertmanager_insert_alert_failures_total{user="user1"} 11 - cortex_alertmanager_insert_alert_failures_total{user="user2"} 110 - cortex_alertmanager_insert_alert_failures_total{user="user3"} 1100 + # HELP cortex_alertmanager_alerts_insert_failures_total Total number of failures to store alert due to hitting alertmanager limits. + # TYPE cortex_alertmanager_alerts_insert_failures_total counter + cortex_alertmanager_alerts_insert_failures_total{user="user1"} 11 + cortex_alertmanager_alerts_insert_failures_total{user="user2"} 110 + cortex_alertmanager_alerts_insert_failures_total{user="user3"} 1100 `)) require.NoError(t, err) @@ -830,10 +830,10 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) { # TYPE cortex_alertmanager_alerts_limiter_current_alerts_size_bytes gauge cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user1"} 100 cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user2"} 1000 - # HELP cortex_alertmanager_insert_alert_failures_total Total number of failures to store alert due to hitting alertmanager limits. - # TYPE cortex_alertmanager_insert_alert_failures_total counter - cortex_alertmanager_insert_alert_failures_total{user="user1"} 11 - cortex_alertmanager_insert_alert_failures_total{user="user2"} 110 + # HELP cortex_alertmanager_alerts_insert_failures_total Total number of failures to store alert due to hitting alertmanager limits. 
+ # TYPE cortex_alertmanager_alerts_insert_failures_total counter + cortex_alertmanager_alerts_insert_failures_total{user="user1"} 11 + cortex_alertmanager_alerts_insert_failures_total{user="user2"} 110 `)) require.NoError(t, err) } @@ -1112,7 +1112,7 @@ func newLimiterMetrics(r prometheus.Registerer) *limiterMetrics { }) insertAlertFailures := promauto.With(r).NewCounterVec(prometheus.CounterOpts{ - Name: "alertmanager_insert_alert_failures_total", + Name: "alertmanager_alerts_insert_failures_total", Help: "Number of failures to insert new alerts to in-memory alert store.", }, []string{"reason"}) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index a5f52ea1ed3..096f9f26758 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -228,7 +228,7 @@ type Limits interface { AlertmanagerMaxAlertsCount(tenant string) int // AlertmanagerMaxAlertsSizeBytes returns total max size of alerts that tenant can have active at the same time. 0 = no limit. - // Size of the alert is computed from alert labels and annotations. + // Size of the alert is computed from alert labels, annotations and generator URL. AlertmanagerMaxAlertsSizeBytes(tenant string) int } diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 44734a04a1e..c3fb5aa9f94 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -185,8 +185,8 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.IntVar(&l.AlertmanagerMaxTemplatesCount, "alertmanager.max-templates-count", 0, "Maximum number of templates in tenant's Alertmanager configuration uploaded via Alertmanager API. 0 = no limit.") f.IntVar(&l.AlertmanagerMaxTemplateSizeBytes, "alertmanager.max-template-size-bytes", 0, "Maximum size of single template in tenant's Alertmanager configuration uploaded via Alertmanager API. 0 = no limit.") f.IntVar(&l.AlertmanagerMaxDispatcherAggregationGroups, "alertmanager.max-dispatcher-aggregation-groups", 0, "Maximum number of aggregation groups in Alertmanager's dispatcher that a tenant can have. Each active aggregation group uses single goroutine. When the limit is reached, dispatcher will not dispatch alerts that belong to additional aggregation groups, but existing groups will keep working properly. 0 = no limit.") - f.IntVar(&l.AlertmanagerMaxAlertsCount, "alertmanager.max-alerts-count", 0, "Maximum number of alerts that single user can have. Inserting more alerts will fail with a log message and metric increment. 0 = no limit.") - f.IntVar(&l.AlertmanagerMaxAlertsSizeBytes, "alertmanager.max-alerts-size-bytes", 0, "Maximum total size of alerts that single user can have. Inserting more alerts will fail with a log message and metric increment. 0 = no limit.") + f.IntVar(&l.AlertmanagerMaxAlertsCount, "alertmanager.max-alerts-count", 0, "Maximum number of alerts that a single user can have. Inserting more alerts will fail with a log message and metric increment. 0 = no limit.") + f.IntVar(&l.AlertmanagerMaxAlertsSizeBytes, "alertmanager.max-alerts-size-bytes", 0, "Maximum total size of alerts that a single user can have, alert size is the sum of the bytes of its labels, annotations and generatorURL. Inserting more alerts will fail with a log message and metric increment. 
0 = no limit.") } // Validate the limits config and returns an error if the validation From 803ad4ddffebfda33d57f3d3738433b6825f426c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Tue, 8 Jun 2021 09:22:05 +0200 Subject: [PATCH 6/9] Added comment. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/alertmanager/alertmanager.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index d7ae352a69f..bbf71ec94cc 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -664,6 +664,9 @@ func (a *alertsLimiter) PreStore(alert *types.Alert, existing bool) error { a.mx.Lock() defer a.mx.Unlock() + // We allow existing alerts in with no checks. Alert update currently cannot change labels, + // annotations or generator URL. Also we want to make sure that alerts already in + // store can be resolved. if !existing { if countLimit > 0 && (a.count+1) > countLimit { a.failureCounter.WithLabelValues(insertFailureTooManyAlerts).Inc() From 537b9ac5d3f2d2c58c2e60f8c4a71d784ccc1a91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 9 Jun 2021 08:23:32 +0200 Subject: [PATCH 7/9] Address review feedback. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- CHANGELOG.md | 2 +- pkg/alertmanager/alertmanager.go | 91 +++++++++---------- pkg/alertmanager/alertmanager_metrics.go | 8 +- pkg/alertmanager/alertmanager_metrics_test.go | 72 +++++++-------- pkg/alertmanager/alertmanager_test.go | 4 +- 5 files changed, 82 insertions(+), 95 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc1f328030e..31ccb682fa1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,7 +39,7 @@ * `memberlist_client_kv_store_value_tombstones_removed_total` * `memberlist_client_messages_to_broadcast_dropped_total` * [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-dispatcher-aggregation-groups` option to control max number of active dispatcher groups in Alertmanager (per tenant, also overrideable). When the limit is reached, Dispatcher produces log message and increases `cortex_alertmanager_dispatcher_aggregation_group_limit_reached_total` metric. #4254 -* [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-alerts-count` and `-alertmanager.max-alerts-size-bytes` to control max number of alerts and total size of alerts that a single user can have in Alertmanager's memory. Adding more alerts will fail with a log message and incrementing `cortex_alertmanager_alerts_insert_failures_total` metric (per-user). These limits can be overrided by using per-tenant overrides. Current values are tracked in `cortex_alertmanager_alerts_limiter_current_alerts_count` and `cortex_alertmanager_alerts_limiter_current_alerts_size_bytes` metrics. #4253 +* [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-alerts-count` and `-alertmanager.max-alerts-size-bytes` to control max number of alerts and total size of alerts that a single user can have in Alertmanager's memory. Adding more alerts will fail with a log message and incrementing `cortex_alertmanager_alerts_insert_limited_total` metric (per-user). These limits can be overrided by using per-tenant overrides. Current values are tracked in `cortex_alertmanager_alerts_limiter_current_alerts` and `cortex_alertmanager_alerts_limiter_current_alerts_size_bytes` metrics. 
#4253 * [BUGFIX] Purger: fix `Invalid null value in condition for column range` caused by `nil` value in range for WriteBatch query. #4128 * [BUGFIX] Ingester: fixed infrequent panic caused by a race condition between TSDB mmap-ed head chunks truncation and queries. #4176 * [BUGFIX] Alertmanager: fix Alertmanager status page if clustering via gossip is disabled or sharding is enabled. #4184 diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index bbf71ec94cc..dfdafb4c65b 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -244,33 +244,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { var callback mem.AlertStoreCallback if am.cfg.Limits != nil { - insertAlertFailures := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ - Name: "alertmanager_alerts_insert_failures_total", - Help: "Number of failures to insert new alerts to in-memory alert store.", - }, []string{"reason"}) - - insertAlertFailures.WithLabelValues(insertFailureTooManyAlerts) - insertAlertFailures.WithLabelValues(insertFailureAlertsTooBig) - - limiter := newAlertsLimiter(am.cfg.UserID, am.cfg.Limits, insertAlertFailures) - - promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{ - Name: "alertmanager_alerts_limiter_current_alerts_count", - Help: "Number of alerts tracked by alerts limiter.", - }, func() float64 { - c, _ := limiter.currentStats() - return float64(c) - }) - - promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{ - Name: "alertmanager_alerts_limiter_current_alerts_size_bytes", - Help: "Total size of alerts tracked by alerts limiter.", - }, func() float64 { - _, s := limiter.currentStats() - return float64(s) - }) - - callback = limiter + callback = newAlertsLimiter(am.cfg.UserID, am.cfg.Limits, reg) } am.alerts, err = mem.NewAlerts(context.Background(), am.marker, 30*time.Minute, callback, am.logger) @@ -622,11 +596,6 @@ var ( errAlertsTooBig = "alerts too big, total size limit: %d bytes" ) -const ( - insertFailureTooManyAlerts = "too_many_alerts" - insertFailureAlertsTooBig = "alerts_too_big" -) - // alertsLimiter limits the number and size of alerts being received by the Alertmanager. // We consider an alert unique based on its fingerprint (a hash of its labels) and // its size it's determined by the sum of bytes of its labels, annotations, and generator URL. 
@@ -634,7 +603,7 @@ type alertsLimiter struct { tenant string limits Limits - failureCounter *prometheus.CounterVec + failureCounter prometheus.Counter mx sync.Mutex sizes map[model.Fingerprint]int @@ -642,13 +611,34 @@ type alertsLimiter struct { totalSize int } -func newAlertsLimiter(tenant string, limits Limits, failureCounter *prometheus.CounterVec) *alertsLimiter { - return &alertsLimiter{ - tenant: tenant, - limits: limits, - sizes: map[model.Fingerprint]int{}, - failureCounter: failureCounter, +func newAlertsLimiter(tenant string, limits Limits, reg prometheus.Registerer) *alertsLimiter { + limiter := &alertsLimiter{ + tenant: tenant, + limits: limits, + sizes: map[model.Fingerprint]int{}, + failureCounter: promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_alerts_insert_limited_total", + Help: "Number of failures to insert new alerts to in-memory alert store.", + }), } + + promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{ + Name: "alertmanager_alerts_limiter_current_alerts", + Help: "Number of alerts tracked by alerts limiter.", + }, func() float64 { + c, _ := limiter.currentStats() + return float64(c) + }) + + promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{ + Name: "alertmanager_alerts_limiter_current_alerts_size_bytes", + Help: "Total size of alerts tracked by alerts limiter.", + }, func() float64 { + _, s := limiter.currentStats() + return float64(s) + }) + + return limiter } func (a *alertsLimiter) PreStore(alert *types.Alert, existing bool) error { @@ -664,19 +654,20 @@ func (a *alertsLimiter) PreStore(alert *types.Alert, existing bool) error { a.mx.Lock() defer a.mx.Unlock() - // We allow existing alerts in with no checks. Alert update currently cannot change labels, - // annotations or generator URL. Also we want to make sure that alerts already in + // We allow existing alerts in with no checks, as we want to make sure that alerts already in // store can be resolved. 
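Before the refactored check below, a distilled, self-contained restatement of the admission rule may help; the function name and arguments here are mine, only the logic mirrors PreStore (existing alerts are always accepted so they can still be updated and resolved, and a limit of 0 means no limit):

```go
package main

import (
	"errors"
	"fmt"
)

// admit mirrors the PreStore decision: only brand-new alerts are checked against
// the per-tenant count and total-size limits; updates to alerts already in the
// store are always admitted.
func admit(existing bool, curCount, curSize, newSize, countLimit, sizeLimit int) error {
	if existing {
		return nil
	}
	if countLimit > 0 && curCount+1 > countLimit {
		return errors.New("too many alerts")
	}
	if sizeLimit > 0 && curSize+newSize > sizeLimit {
		return errors.New("alerts too big")
	}
	return nil
}

func main() {
	// Tenant at its count limit (100 of 100 alerts stored, 200-byte incoming alert):
	fmt.Println(admit(false, 100, 4096, 200, 100, 0)) // rejected: too many alerts

	// The same alert re-sent as an update/resolution of one already in the store:
	fmt.Println(admit(true, 100, 4096, 200, 100, 0)) // <nil>: admitted, so it can still resolve
}
```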
- if !existing { - if countLimit > 0 && (a.count+1) > countLimit { - a.failureCounter.WithLabelValues(insertFailureTooManyAlerts).Inc() - return fmt.Errorf(errTooManyAlerts, countLimit) - } + if existing { + return nil + } - if sizeLimit > 0 && (a.totalSize+newSize) > sizeLimit { - a.failureCounter.WithLabelValues(insertFailureAlertsTooBig).Inc() - return fmt.Errorf(errAlertsTooBig, sizeLimit) - } + if countLimit > 0 && (a.count+1) > countLimit { + a.failureCounter.Inc() + return fmt.Errorf(errTooManyAlerts, countLimit) + } + + if sizeLimit > 0 && (a.totalSize+newSize) > sizeLimit { + a.failureCounter.Inc() + return fmt.Errorf(errAlertsTooBig, sizeLimit) } return nil diff --git a/pkg/alertmanager/alertmanager_metrics.go b/pkg/alertmanager/alertmanager_metrics.go index 4d891c0d7ef..a5371f6c95b 100644 --- a/pkg/alertmanager/alertmanager_metrics.go +++ b/pkg/alertmanager/alertmanager_metrics.go @@ -218,11 +218,11 @@ func newAlertmanagerMetrics() *alertmanagerMetrics { "Number of times when dispatcher failed to create new aggregation group due to limit.", []string{"user"}, nil), insertAlertFailures: prometheus.NewDesc( - "cortex_alertmanager_alerts_insert_failures_total", + "cortex_alertmanager_alerts_insert_limited_total", "Total number of failures to store alert due to hitting alertmanager limits.", []string{"user"}, nil), alertsLimiterAlertsCount: prometheus.NewDesc( - "cortex_alertmanager_alerts_limiter_current_alerts_count", + "cortex_alertmanager_alerts_limiter_current_alerts", "Number of alerts tracked by alerts limiter.", []string{"user"}, nil), alertsLimiterAlertsSize: prometheus.NewDesc( @@ -331,7 +331,7 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfCountersPerUserWithLabels(out, m.notificationRateLimited, "alertmanager_notification_rate_limited_total", "integration") data.SendSumOfCountersPerUser(out, m.dispatcherAggregationGroupsLimitReached, "alertmanager_dispatcher_aggregation_group_limit_reached_total") - data.SendSumOfCountersPerUser(out, m.insertAlertFailures, "alertmanager_alerts_insert_failures_total") - data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsCount, "alertmanager_alerts_limiter_current_alerts_count") + data.SendSumOfCountersPerUser(out, m.insertAlertFailures, "alertmanager_alerts_insert_limited_total") + data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsCount, "alertmanager_alerts_limiter_current_alerts") data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsSize, "alertmanager_alerts_limiter_current_alerts_size_bytes") } diff --git a/pkg/alertmanager/alertmanager_metrics_test.go b/pkg/alertmanager/alertmanager_metrics_test.go index d869cab9a71..a61518d995b 100644 --- a/pkg/alertmanager/alertmanager_metrics_test.go +++ b/pkg/alertmanager/alertmanager_metrics_test.go @@ -275,21 +275,21 @@ func TestAlertmanagerMetricsStore(t *testing.T) { # TYPE cortex_alertmanager_state_persist_total counter cortex_alertmanager_state_persist_total 0 - # HELP cortex_alertmanager_alerts_limiter_current_alerts_count Number of alerts tracked by alerts limiter. - # TYPE cortex_alertmanager_alerts_limiter_current_alerts_count gauge - cortex_alertmanager_alerts_limiter_current_alerts_count{user="user1"} 10 - cortex_alertmanager_alerts_limiter_current_alerts_count{user="user2"} 100 - cortex_alertmanager_alerts_limiter_current_alerts_count{user="user3"} 1000 + # HELP cortex_alertmanager_alerts_limiter_current_alerts Number of alerts tracked by alerts limiter. 
+ # TYPE cortex_alertmanager_alerts_limiter_current_alerts gauge + cortex_alertmanager_alerts_limiter_current_alerts{user="user1"} 10 + cortex_alertmanager_alerts_limiter_current_alerts{user="user2"} 100 + cortex_alertmanager_alerts_limiter_current_alerts{user="user3"} 1000 # HELP cortex_alertmanager_alerts_limiter_current_alerts_size_bytes Total size of alerts tracked by alerts limiter. # TYPE cortex_alertmanager_alerts_limiter_current_alerts_size_bytes gauge cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user1"} 100 cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user2"} 1000 cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user3"} 10000 - # HELP cortex_alertmanager_alerts_insert_failures_total Total number of failures to store alert due to hitting alertmanager limits. - # TYPE cortex_alertmanager_alerts_insert_failures_total counter - cortex_alertmanager_alerts_insert_failures_total{user="user1"} 11 - cortex_alertmanager_alerts_insert_failures_total{user="user2"} 110 - cortex_alertmanager_alerts_insert_failures_total{user="user3"} 1100 + # HELP cortex_alertmanager_alerts_insert_limited_total Total number of failures to store alert due to hitting alertmanager limits. + # TYPE cortex_alertmanager_alerts_insert_limited_total counter + cortex_alertmanager_alerts_insert_limited_total{user="user1"} 7 + cortex_alertmanager_alerts_insert_limited_total{user="user2"} 70 + cortex_alertmanager_alerts_insert_limited_total{user="user3"} 700 `)) require.NoError(t, err) } @@ -574,21 +574,21 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) { # TYPE cortex_alertmanager_state_persist_total counter cortex_alertmanager_state_persist_total 0 - # HELP cortex_alertmanager_alerts_limiter_current_alerts_count Number of alerts tracked by alerts limiter. - # TYPE cortex_alertmanager_alerts_limiter_current_alerts_count gauge - cortex_alertmanager_alerts_limiter_current_alerts_count{user="user1"} 10 - cortex_alertmanager_alerts_limiter_current_alerts_count{user="user2"} 100 - cortex_alertmanager_alerts_limiter_current_alerts_count{user="user3"} 1000 + # HELP cortex_alertmanager_alerts_limiter_current_alerts Number of alerts tracked by alerts limiter. + # TYPE cortex_alertmanager_alerts_limiter_current_alerts gauge + cortex_alertmanager_alerts_limiter_current_alerts{user="user1"} 10 + cortex_alertmanager_alerts_limiter_current_alerts{user="user2"} 100 + cortex_alertmanager_alerts_limiter_current_alerts{user="user3"} 1000 # HELP cortex_alertmanager_alerts_limiter_current_alerts_size_bytes Total size of alerts tracked by alerts limiter. # TYPE cortex_alertmanager_alerts_limiter_current_alerts_size_bytes gauge cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user1"} 100 cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user2"} 1000 cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user3"} 10000 - # HELP cortex_alertmanager_alerts_insert_failures_total Total number of failures to store alert due to hitting alertmanager limits. - # TYPE cortex_alertmanager_alerts_insert_failures_total counter - cortex_alertmanager_alerts_insert_failures_total{user="user1"} 11 - cortex_alertmanager_alerts_insert_failures_total{user="user2"} 110 - cortex_alertmanager_alerts_insert_failures_total{user="user3"} 1100 + # HELP cortex_alertmanager_alerts_insert_limited_total Total number of failures to store alert due to hitting alertmanager limits. 
+ # TYPE cortex_alertmanager_alerts_insert_limited_total counter + cortex_alertmanager_alerts_insert_limited_total{user="user1"} 7 + cortex_alertmanager_alerts_insert_limited_total{user="user2"} 70 + cortex_alertmanager_alerts_insert_limited_total{user="user3"} 700 `)) require.NoError(t, err) @@ -822,18 +822,18 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) { # TYPE cortex_alertmanager_state_persist_total counter cortex_alertmanager_state_persist_total 0 - # HELP cortex_alertmanager_alerts_limiter_current_alerts_count Number of alerts tracked by alerts limiter. - # TYPE cortex_alertmanager_alerts_limiter_current_alerts_count gauge - cortex_alertmanager_alerts_limiter_current_alerts_count{user="user1"} 10 - cortex_alertmanager_alerts_limiter_current_alerts_count{user="user2"} 100 + # HELP cortex_alertmanager_alerts_limiter_current_alerts Number of alerts tracked by alerts limiter. + # TYPE cortex_alertmanager_alerts_limiter_current_alerts gauge + cortex_alertmanager_alerts_limiter_current_alerts{user="user1"} 10 + cortex_alertmanager_alerts_limiter_current_alerts{user="user2"} 100 # HELP cortex_alertmanager_alerts_limiter_current_alerts_size_bytes Total size of alerts tracked by alerts limiter. # TYPE cortex_alertmanager_alerts_limiter_current_alerts_size_bytes gauge cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user1"} 100 cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user2"} 1000 - # HELP cortex_alertmanager_alerts_insert_failures_total Total number of failures to store alert due to hitting alertmanager limits. - # TYPE cortex_alertmanager_alerts_insert_failures_total counter - cortex_alertmanager_alerts_insert_failures_total{user="user1"} 11 - cortex_alertmanager_alerts_insert_failures_total{user="user2"} 110 + # HELP cortex_alertmanager_alerts_insert_limited_total Total number of failures to store alert due to hitting alertmanager limits. 
+ # TYPE cortex_alertmanager_alerts_insert_limited_total counter + cortex_alertmanager_alerts_insert_limited_total{user="user1"} 7 + cortex_alertmanager_alerts_insert_limited_total{user="user2"} 70 `)) require.NoError(t, err) } @@ -887,8 +887,7 @@ func populateAlertmanager(base float64) *prometheus.Registry { lm := newLimiterMetrics(reg) lm.count.Set(10 * base) lm.size.Set(100 * base) - lm.insertFailures.WithLabelValues(insertFailureTooManyAlerts).Add(7 * base) - lm.insertFailures.WithLabelValues(insertFailureAlertsTooBig).Add(4 * base) + lm.insertFailures.Add(7 * base) return reg } @@ -1097,12 +1096,12 @@ func newAPIMetrics(version string, r prometheus.Registerer) *apiMetrics { type limiterMetrics struct { count prometheus.Gauge size prometheus.Gauge - insertFailures *prometheus.CounterVec + insertFailures prometheus.Counter } func newLimiterMetrics(r prometheus.Registerer) *limiterMetrics { count := promauto.With(r).NewGauge(prometheus.GaugeOpts{ - Name: "alertmanager_alerts_limiter_current_alerts_count", + Name: "alertmanager_alerts_limiter_current_alerts", Help: "Number of alerts tracked by alerts limiter.", }) @@ -1111,13 +1110,10 @@ func newLimiterMetrics(r prometheus.Registerer) *limiterMetrics { Help: "Total size of alerts tracked by alerts limiter.", }) - insertAlertFailures := promauto.With(r).NewCounterVec(prometheus.CounterOpts{ - Name: "alertmanager_alerts_insert_failures_total", + insertAlertFailures := promauto.With(r).NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_alerts_insert_limited_total", Help: "Number of failures to insert new alerts to in-memory alert store.", - }, []string{"reason"}) - - insertAlertFailures.WithLabelValues(insertFailureTooManyAlerts) - insertAlertFailures.WithLabelValues(insertFailureAlertsTooBig) + }) return &limiterMetrics{ count: count, diff --git a/pkg/alertmanager/alertmanager_test.go b/pkg/alertmanager/alertmanager_test.go index dca81be1d53..81e2a75b317 100644 --- a/pkg/alertmanager/alertmanager_test.go +++ b/pkg/alertmanager/alertmanager_test.go @@ -185,9 +185,9 @@ func TestAlertsLimiterWithSizeLimit(t *testing.T) { // testLimiter sends sequence of alerts to limiter, and checks if limiter updated reacted correctly. func testLimiter(t *testing.T, limits Limits, ops []callbackOp) { - counter := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"reason"}) + reg := prometheus.NewPedanticRegistry() - limiter := newAlertsLimiter("test", limits, counter) + limiter := newAlertsLimiter("test", limits, reg) for ix, op := range ops { if op.delete { From 42ff3b11ae1079a9f911a3a4eaf08bdd5a634b16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 9 Jun 2021 09:02:11 +0200 Subject: [PATCH 8/9] Move check to the top. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/alertmanager/alertmanager.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index dfdafb4c65b..10b9121f478 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -646,6 +646,12 @@ func (a *alertsLimiter) PreStore(alert *types.Alert, existing bool) error { return nil } + // We allow existing alerts in with no checks, as we want to make sure that alerts already in + // store can be resolved. 
+ if existing { + return nil + } + countLimit := a.limits.AlertmanagerMaxAlertsCount(a.tenant) sizeLimit := a.limits.AlertmanagerMaxAlertsSizeBytes(a.tenant) @@ -654,12 +660,6 @@ func (a *alertsLimiter) PreStore(alert *types.Alert, existing bool) error { a.mx.Lock() defer a.mx.Unlock() - // We allow existing alerts in with no checks, as we want to make sure that alerts already in - // store can be resolved. - if existing { - return nil - } - if countLimit > 0 && (a.count+1) > countLimit { a.failureCounter.Inc() return fmt.Errorf(errTooManyAlerts, countLimit) From 314c64307d4be76da9a3e0d026ba0e060d8d6d41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Thu, 10 Jun 2021 17:43:18 +0200 Subject: [PATCH 9/9] When existing alert grows and doesn't fit the size limit anymore, it is rejected. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- pkg/alertmanager/alertmanager.go | 16 +++++++-------- pkg/alertmanager/alertmanager_test.go | 29 +++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 10b9121f478..fe89550f950 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -646,26 +646,26 @@ func (a *alertsLimiter) PreStore(alert *types.Alert, existing bool) error { return nil } - // We allow existing alerts in with no checks, as we want to make sure that alerts already in - // store can be resolved. - if existing { - return nil - } + fp := alert.Fingerprint() countLimit := a.limits.AlertmanagerMaxAlertsCount(a.tenant) sizeLimit := a.limits.AlertmanagerMaxAlertsSizeBytes(a.tenant) - newSize := alertSize(alert.Alert) + sizeDiff := alertSize(alert.Alert) a.mx.Lock() defer a.mx.Unlock() - if countLimit > 0 && (a.count+1) > countLimit { + if !existing && countLimit > 0 && (a.count+1) > countLimit { a.failureCounter.Inc() return fmt.Errorf(errTooManyAlerts, countLimit) } - if sizeLimit > 0 && (a.totalSize+newSize) > sizeLimit { + if existing { + sizeDiff -= a.sizes[fp] + } + + if sizeLimit > 0 && (a.totalSize+sizeDiff) > sizeLimit { a.failureCounter.Inc() return fmt.Errorf(errAlertsTooBig, sizeLimit) } diff --git a/pkg/alertmanager/alertmanager_test.go b/pkg/alertmanager/alertmanager_test.go index 81e2a75b317..402ec2edbe7 100644 --- a/pkg/alertmanager/alertmanager_test.go +++ b/pkg/alertmanager/alertmanager_test.go @@ -155,12 +155,18 @@ func TestAlertsLimiterWithNoLimits(t *testing.T) { } func TestAlertsLimiterWithCountLimit(t *testing.T) { + alert2WithMoreAnnotations := alert2 + alert2WithMoreAnnotations.Annotations = model.LabelSet{"job": "test", "cluster": "prod", "new": "super-long-annotation"} + alert2WithMoreAnnotationsSize := alertSize(alert2WithMoreAnnotations) + ops := []callbackOp{ {alert: &types.Alert{Alert: alert1}, existing: false, expectedCount: 1, expectedTotalSize: alert1Size}, {alert: &types.Alert{Alert: alert2}, existing: false, expectedInsertError: fmt.Errorf(errTooManyAlerts, 1), expectedCount: 1, expectedTotalSize: alert1Size}, {alert: &types.Alert{Alert: alert1}, delete: true, expectedCount: 0, expectedTotalSize: 0}, {alert: &types.Alert{Alert: alert2}, existing: false, expectedCount: 1, expectedTotalSize: alert2Size}, + // Update of existing alert works -- doesn't change count. 
+ {alert: &types.Alert{Alert: alert2WithMoreAnnotations}, existing: true, expectedCount: 1, expectedTotalSize: alert2WithMoreAnnotationsSize},
 {alert: &types.Alert{Alert: alert2}, delete: true, expectedCount: 0, expectedTotalSize: 0},
 }
@@ -168,9 +174,13 @@
 }
 
 func TestAlertsLimiterWithSizeLimit(t *testing.T) {
+ alert2WithMoreAnnotations := alert2
+ alert2WithMoreAnnotations.Annotations = model.LabelSet{"job": "test", "cluster": "prod", "new": "super-long-annotation"}
+
 ops := []callbackOp{
 {alert: &types.Alert{Alert: alert1}, existing: false, expectedCount: 1, expectedTotalSize: alert1Size},
 {alert: &types.Alert{Alert: alert2}, existing: false, expectedInsertError: fmt.Errorf(errAlertsTooBig, alert2Size), expectedCount: 1, expectedTotalSize: alert1Size},
+ {alert: &types.Alert{Alert: alert2WithMoreAnnotations}, existing: false, expectedInsertError: fmt.Errorf(errAlertsTooBig, alert2Size), expectedCount: 1, expectedTotalSize: alert1Size},
 {alert: &types.Alert{Alert: alert1}, delete: true, expectedCount: 0, expectedTotalSize: 0},
 {alert: &types.Alert{Alert: alert2}, existing: false, expectedCount: 1, expectedTotalSize: alert2Size},
@@ -183,6 +193,25 @@
 testLimiter(t, &mockAlertManagerLimits{maxAlertsSizeBytes: alert2Size}, ops)
 }
 
+func TestAlertsLimiterWithSizeLimitAndAnnotationUpdate(t *testing.T) {
+ alert2WithMoreAnnotations := alert2
+ alert2WithMoreAnnotations.Annotations = model.LabelSet{"job": "test", "cluster": "prod", "new": "super-long-annotation"}
+ alert2WithMoreAnnotationsSize := alertSize(alert2WithMoreAnnotations)
+
+ // Updating an existing alert with a larger annotation that goes over the size limit fails.
+ testLimiter(t, &mockAlertManagerLimits{maxAlertsSizeBytes: alert2Size}, []callbackOp{
+ {alert: &types.Alert{Alert: alert2}, existing: false, expectedCount: 1, expectedTotalSize: alert2Size},
+ {alert: &types.Alert{Alert: alert2WithMoreAnnotations}, existing: true, expectedInsertError: fmt.Errorf(errAlertsTooBig, alert2Size), expectedCount: 1, expectedTotalSize: alert2Size},
+ })
+
+ // Updating an existing alert with larger annotations that still fit within the limit works fine.
+ testLimiter(t, &mockAlertManagerLimits{maxAlertsSizeBytes: alert2WithMoreAnnotationsSize}, []callbackOp{
+ {alert: &types.Alert{Alert: alert2}, existing: false, expectedCount: 1, expectedTotalSize: alert2Size},
+ {alert: &types.Alert{Alert: alert2WithMoreAnnotations}, existing: true, expectedCount: 1, expectedTotalSize: alert2WithMoreAnnotationsSize},
+ {alert: &types.Alert{Alert: alert2}, existing: true, expectedCount: 1, expectedTotalSize: alert2Size},
+ })
+}
+
 // testLimiter sends sequence of alerts to limiter, and checks if limiter updated reacted correctly.
 func testLimiter(t *testing.T, limits Limits, ops []callbackOp) {
 reg := prometheus.NewPedanticRegistry()
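
The accounting that PATCH 9/9 introduces for updated alerts (only the size delta of an existing fingerprint is charged against the limit) can be exercised in isolation. Below is a minimal standalone sketch of that bookkeeping, not the patch itself: it drops the mutex, the Prometheus counter, and the per-tenant Limits lookup, and the names miniLimiter, preStore and postStore are illustrative rather than taken from the code above.

package main

import "fmt"

// miniLimiter mirrors the bookkeeping of alertsLimiter in this patch series,
// stripped of the mutex, metrics and Alertmanager types (illustrative only).
type miniLimiter struct {
	countLimit int // 0 = unlimited, like -alertmanager.max-alerts-count
	sizeLimit  int // 0 = unlimited, like -alertmanager.max-alerts-size-bytes

	sizes     map[string]int // tracked size per alert fingerprint
	count     int
	totalSize int
}

func newMiniLimiter(countLimit, sizeLimit int) *miniLimiter {
	return &miniLimiter{countLimit: countLimit, sizeLimit: sizeLimit, sizes: map[string]int{}}
}

// preStore decides whether an insert or update may proceed. For updates
// (existing == true) only the size delta for that fingerprint counts against
// the limit, which is the behaviour PATCH 9/9 adds.
func (l *miniLimiter) preStore(fp string, newSize int, existing bool) error {
	if !existing && l.countLimit > 0 && l.count+1 > l.countLimit {
		return fmt.Errorf("too many alerts, limit: %d", l.countLimit)
	}

	sizeDiff := newSize
	if existing {
		sizeDiff -= l.sizes[fp]
	}
	if l.sizeLimit > 0 && l.totalSize+sizeDiff > l.sizeLimit {
		return fmt.Errorf("alerts too big, total size limit: %d bytes", l.sizeLimit)
	}
	return nil
}

// postStore records an accepted alert, replacing any size previously tracked
// for the same fingerprint.
func (l *miniLimiter) postStore(fp string, newSize int) {
	if old, ok := l.sizes[fp]; ok {
		l.totalSize -= old
	} else {
		l.count++
	}
	l.sizes[fp] = newSize
	l.totalSize += newSize
}

func main() {
	l := newMiniLimiter(0, 100) // size limit only

	fmt.Println(l.preStore("fp1", 60, false)) // <nil>: new alert fits
	l.postStore("fp1", 60)

	fmt.Println(l.preStore("fp1", 150, true)) // error: update grows the total past 100
	fmt.Println(l.preStore("fp1", 90, true))  // <nil>: delta of +30 still fits
}

Under these assumptions, the last two calls in main illustrate why TestAlertsLimiterWithSizeLimitAndAnnotationUpdate expects an update that grows past the size limit to be rejected while an update that stays within it succeeds.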