Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@
* [CHANGE] Store-gateway: enabled attributes in-memory cache by default. New default configuration is `-blocks-storage.bucket-store.chunks-cache.attributes-in-memory-max-items=50000`. #1727
* [CHANGE] Compactor: Removed the metric `cortex_compactor_garbage_collected_blocks_total` since it duplicates `cortex_compactor_blocks_marked_for_deletion_total`. #1728
* [CHANGE] All: Logs that used the `org_id` label now use the `user` label. #1634 #1758
* [CHANGE] Alertmanager: the following metrics are not exported for a given `user` and `integration` when the metric value is zero: #1783
* `cortex_alertmanager_notifications_total`
* `cortex_alertmanager_notifications_failed_total`
* `cortex_alertmanager_notification_requests_total`
* `cortex_alertmanager_notification_requests_failed_total`
* `cortex_alertmanager_notification_rate_limited_total`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tangential question: What's our policy on the cortex_ prefix? I take it we don't want to break existing dashboards but is there a plan to eventually deprecate them?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We discussed it, and we won't rename it until we can provide a smooth upgrade path to users.

* [CHANGE] Removed the following metrics exposed by the Mimir hash rings: #1791
* `cortex_member_ring_tokens_owned`
* `cortex_member_ring_tokens_to_own`
Expand Down
10 changes: 5 additions & 5 deletions pkg/alertmanager/alertmanager_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -296,10 +296,10 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfCountersPerUser(out, m.alertsReceived, "alertmanager_alerts_received_total")
data.SendSumOfCountersPerUser(out, m.alertsInvalid, "alertmanager_alerts_invalid_total")

data.SendSumOfCountersPerUserWithLabels(out, m.numNotifications, "alertmanager_notifications_total", "integration")
data.SendSumOfCountersPerUserWithLabels(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", "integration")
data.SendSumOfCountersPerUserWithLabels(out, m.numNotificationRequestsTotal, "alertmanager_notification_requests_total", "integration")
data.SendSumOfCountersPerUserWithLabels(out, m.numNotificationRequestsFailedTotal, "alertmanager_notification_requests_failed_total", "integration")
data.SendSumOfCountersPerUserWithLabelsAndOptions(out, m.numNotifications, "alertmanager_notifications_total", []string{"integration"}, util.SkipZeroValueMetrics)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At first sight, "AndOptions" looked like premature generalization to me, and I was going to suggest generalizing only once there are at least three variants.

But there are already 6! :)

SendSumOfCounters
SendSumOfCountersPerUser
SendSumOfCountersPerUserWithLabels
SendSumOfCountersPerUserWithLabelsAndOptions
SendSumOfCountersWithLabels

Perhaps we should only have SendSumOfCounters and variants should really be "options"?

(But that's for another PR)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking into it.

data.SendSumOfCountersPerUserWithLabelsAndOptions(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", []string{"integration"}, util.SkipZeroValueMetrics)
data.SendSumOfCountersPerUserWithLabelsAndOptions(out, m.numNotificationRequestsTotal, "alertmanager_notification_requests_total", []string{"integration"}, util.SkipZeroValueMetrics)
data.SendSumOfCountersPerUserWithLabelsAndOptions(out, m.numNotificationRequestsFailedTotal, "alertmanager_notification_requests_failed_total", []string{"integration"}, util.SkipZeroValueMetrics)
data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds")
data.SendSumOfGaugesPerUserWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state")

Expand Down Expand Up @@ -334,7 +334,7 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfCounters(out, m.persistTotal, "alertmanager_state_persist_total")
data.SendSumOfCounters(out, m.persistFailed, "alertmanager_state_persist_failed_total")

data.SendSumOfCountersPerUserWithLabels(out, m.notificationRateLimited, "alertmanager_notification_rate_limited_total", "integration")
data.SendSumOfCountersPerUserWithLabelsAndOptions(out, m.notificationRateLimited, "alertmanager_notification_rate_limited_total", []string{"integration"}, util.SkipZeroValueMetrics)
data.SendSumOfCountersPerUser(out, m.dispatcherAggregationGroupsLimitReached, "alertmanager_dispatcher_aggregation_group_limit_reached_total")
data.SendSumOfCountersPerUser(out, m.insertAlertFailures, "alertmanager_alerts_insert_limited_total")
data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsCount, "alertmanager_alerts_limiter_current_alerts")
Expand Down
32 changes: 0 additions & 32 deletions pkg/alertmanager/alertmanager_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,6 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
cortex_alertmanager_notification_latency_seconds_count 27
# HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications.
# TYPE cortex_alertmanager_notifications_failed_total counter
cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0
cortex_alertmanager_notifications_failed_total{integration="email",user="user2"} 0
cortex_alertmanager_notifications_failed_total{integration="email",user="user3"} 0
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user3"} 500
Expand All @@ -139,9 +136,6 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
cortex_alertmanager_notifications_failed_total{integration="sns",user="user3"} 800
# HELP cortex_alertmanager_notification_requests_total The total number of attempted notification requests.
# TYPE cortex_alertmanager_notification_requests_total counter
cortex_alertmanager_notification_requests_total{integration="email",user="user1"} 0
cortex_alertmanager_notification_requests_total{integration="email",user="user2"} 0
cortex_alertmanager_notification_requests_total{integration="email",user="user3"} 0
cortex_alertmanager_notification_requests_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notification_requests_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notification_requests_total{integration="opsgenie",user="user3"} 500
Expand All @@ -168,9 +162,6 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
cortex_alertmanager_notification_requests_total{integration="sns",user="user3"} 800
# HELP cortex_alertmanager_notification_requests_failed_total The total number of failed notification requests.
# TYPE cortex_alertmanager_notification_requests_failed_total counter
cortex_alertmanager_notification_requests_failed_total{integration="email",user="user1"} 0
cortex_alertmanager_notification_requests_failed_total{integration="email",user="user2"} 0
cortex_alertmanager_notification_requests_failed_total{integration="email",user="user3"} 0
cortex_alertmanager_notification_requests_failed_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notification_requests_failed_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notification_requests_failed_total{integration="opsgenie",user="user3"} 500
Expand All @@ -197,9 +188,6 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
cortex_alertmanager_notification_requests_failed_total{integration="sns",user="user3"} 800
# HELP cortex_alertmanager_notifications_total The total number of attempted notifications.
# TYPE cortex_alertmanager_notifications_total counter
cortex_alertmanager_notifications_total{integration="email",user="user1"} 0
cortex_alertmanager_notifications_total{integration="email",user="user2"} 0
cortex_alertmanager_notifications_total{integration="email",user="user3"} 0
cortex_alertmanager_notifications_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notifications_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notifications_total{integration="opsgenie",user="user3"} 500
Expand Down Expand Up @@ -411,9 +399,6 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {

# HELP cortex_alertmanager_notification_requests_failed_total The total number of failed notification requests.
# TYPE cortex_alertmanager_notification_requests_failed_total counter
cortex_alertmanager_notification_requests_failed_total{integration="email",user="user1"} 0
cortex_alertmanager_notification_requests_failed_total{integration="email",user="user2"} 0
cortex_alertmanager_notification_requests_failed_total{integration="email",user="user3"} 0
cortex_alertmanager_notification_requests_failed_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notification_requests_failed_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notification_requests_failed_total{integration="opsgenie",user="user3"} 500
Expand Down Expand Up @@ -441,9 +426,6 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {

# HELP cortex_alertmanager_notification_requests_total The total number of attempted notification requests.
# TYPE cortex_alertmanager_notification_requests_total counter
cortex_alertmanager_notification_requests_total{integration="email",user="user1"} 0
cortex_alertmanager_notification_requests_total{integration="email",user="user2"} 0
cortex_alertmanager_notification_requests_total{integration="email",user="user3"} 0
cortex_alertmanager_notification_requests_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notification_requests_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notification_requests_total{integration="opsgenie",user="user3"} 500
Expand Down Expand Up @@ -471,9 +453,6 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {

# HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications.
# TYPE cortex_alertmanager_notifications_failed_total counter
cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0
cortex_alertmanager_notifications_failed_total{integration="email",user="user2"} 0
cortex_alertmanager_notifications_failed_total{integration="email",user="user3"} 0
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user3"} 500
Expand Down Expand Up @@ -501,9 +480,6 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {

# HELP cortex_alertmanager_notifications_total The total number of attempted notifications.
# TYPE cortex_alertmanager_notifications_total counter
cortex_alertmanager_notifications_total{integration="email",user="user1"} 0
cortex_alertmanager_notifications_total{integration="email",user="user2"} 0
cortex_alertmanager_notifications_total{integration="email",user="user3"} 0
cortex_alertmanager_notifications_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notifications_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notifications_total{integration="opsgenie",user="user3"} 500
Expand Down Expand Up @@ -705,8 +681,6 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {

# HELP cortex_alertmanager_notification_requests_failed_total The total number of failed notification requests.
# TYPE cortex_alertmanager_notification_requests_failed_total counter
cortex_alertmanager_notification_requests_failed_total{integration="email",user="user1"} 0
cortex_alertmanager_notification_requests_failed_total{integration="email",user="user2"} 0
cortex_alertmanager_notification_requests_failed_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notification_requests_failed_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notification_requests_failed_total{integration="pagerduty",user="user1"} 1
Expand All @@ -726,8 +700,6 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {

# HELP cortex_alertmanager_notification_requests_total The total number of attempted notification requests.
# TYPE cortex_alertmanager_notification_requests_total counter
cortex_alertmanager_notification_requests_total{integration="email",user="user1"} 0
cortex_alertmanager_notification_requests_total{integration="email",user="user2"} 0
cortex_alertmanager_notification_requests_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notification_requests_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notification_requests_total{integration="pagerduty",user="user1"} 1
Expand All @@ -747,8 +719,6 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {

# HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications.
# TYPE cortex_alertmanager_notifications_failed_total counter
cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0
cortex_alertmanager_notifications_failed_total{integration="email",user="user2"} 0
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user1"} 1
Expand All @@ -768,8 +738,6 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {

# HELP cortex_alertmanager_notifications_total The total number of attempted notifications.
# TYPE cortex_alertmanager_notifications_total counter
cortex_alertmanager_notifications_total{integration="email",user="user1"} 0
cortex_alertmanager_notifications_total{integration="email",user="user2"} 0
cortex_alertmanager_notifications_total{integration="opsgenie",user="user1"} 5
cortex_alertmanager_notifications_total{integration="opsgenie",user="user2"} 50
cortex_alertmanager_notifications_total{integration="pagerduty",user="user1"} 1
Expand Down
47 changes: 40 additions & 7 deletions pkg/util/metrics_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,16 @@ func (mfm MetricFamilyMap) SumSummariesTo(name string, output *SummaryData) {
}
}

func (mfm MetricFamilyMap) sumOfSingleValuesWithLabels(metric string, labelNames []string, extractFn func(*dto.Metric) float64, aggregateFn func(labelsKey string, labelValues []string, value float64)) {
func (mfm MetricFamilyMap) sumOfSingleValuesWithLabels(metric string, labelNames []string, extractFn func(*dto.Metric) float64, aggregateFn func(labelsKey string, labelValues []string, value float64), skipZeroValue bool) {
metricsPerLabelValue := getMetricsWithLabelNames(mfm[metric], labelNames)

for key, mlv := range metricsPerLabelValue {
for _, m := range mlv.metrics {
val := extractFn(m)
if skipZeroValue && val == 0 {
continue
}

aggregateFn(key, mlv.labelValues, val)
}
}
Expand All @@ -155,7 +159,7 @@ func (d MetricFamiliesPerUser) SendSumOfCounters(out chan<- prometheus.Metric, d
}

func (d MetricFamiliesPerUser) SendSumOfCountersWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, counter string, labelNames ...string) {
d.sumOfSingleValuesWithLabels(counter, counterValue, labelNames).WriteToMetricChannel(out, desc, prometheus.CounterValue)
d.sumOfSingleValuesWithLabels(counter, counterValue, labelNames, false).WriteToMetricChannel(out, desc, prometheus.CounterValue)
}

func (d MetricFamiliesPerUser) SendSumOfCountersPerUser(out chan<- prometheus.Metric, desc *prometheus.Desc, counter string) {
Expand All @@ -165,13 +169,20 @@ func (d MetricFamiliesPerUser) SendSumOfCountersPerUser(out chan<- prometheus.Me
// SendSumOfCountersPerUserWithLabels provides metrics with the provided label names on a per-user basis. This function assumes that `user` is the
// first label on the provided metric Desc
func (d MetricFamiliesPerUser) SendSumOfCountersPerUserWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, metric string, labelNames ...string) {
d.SendSumOfCountersPerUserWithLabelsAndOptions(out, desc, metric, labelNames)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💯 on not having to do a shotgun surgery ⭐

}

// SendSumOfCountersPerUserWithLabelsAndOptions provides metrics with the provided label names on a per-user basis,
// applying the given options (e.g. SkipZeroValueMetrics). This function assumes that `user` is the first label on the provided metric Desc.
func (d MetricFamiliesPerUser) SendSumOfCountersPerUserWithLabelsAndOptions(out chan<- prometheus.Metric, desc *prometheus.Desc, metric string, labelNames []string, options ...MetricOption) {
for _, userEntry := range d {
if userEntry.user == "" {
continue
}

result := singleValueWithLabelsMap{}
userEntry.metrics.sumOfSingleValuesWithLabels(metric, labelNames, counterValue, result.aggregateFn)
opts := applyMetricOptions(options...)
userEntry.metrics.sumOfSingleValuesWithLabels(metric, labelNames, counterValue, result.aggregateFn, opts.skipZeroValueMetrics)
result.prependUserLabelValue(userEntry.user)
result.WriteToMetricChannel(out, desc, prometheus.CounterValue)
}
Expand All @@ -190,7 +201,7 @@ func (d MetricFamiliesPerUser) SendSumOfGauges(out chan<- prometheus.Metric, des
}

func (d MetricFamiliesPerUser) SendSumOfGaugesWithLabels(out chan<- prometheus.Metric, desc *prometheus.Desc, gauge string, labelNames ...string) {
d.sumOfSingleValuesWithLabels(gauge, gaugeValue, labelNames).WriteToMetricChannel(out, desc, prometheus.GaugeValue)
d.sumOfSingleValuesWithLabels(gauge, gaugeValue, labelNames, false).WriteToMetricChannel(out, desc, prometheus.GaugeValue)
}

func (d MetricFamiliesPerUser) SendSumOfGaugesPerUser(out chan<- prometheus.Metric, desc *prometheus.Desc, gauge string) {
Expand All @@ -206,16 +217,16 @@ func (d MetricFamiliesPerUser) SendSumOfGaugesPerUserWithLabels(out chan<- prome
}

result := singleValueWithLabelsMap{}
userEntry.metrics.sumOfSingleValuesWithLabels(metric, labelNames, gaugeValue, result.aggregateFn)
userEntry.metrics.sumOfSingleValuesWithLabels(metric, labelNames, gaugeValue, result.aggregateFn, false)
result.prependUserLabelValue(userEntry.user)
result.WriteToMetricChannel(out, desc, prometheus.GaugeValue)
}
}

func (d MetricFamiliesPerUser) sumOfSingleValuesWithLabels(metric string, fn func(*dto.Metric) float64, labelNames []string) singleValueWithLabelsMap {
func (d MetricFamiliesPerUser) sumOfSingleValuesWithLabels(metric string, fn func(*dto.Metric) float64, labelNames []string, skipZeroValue bool) singleValueWithLabelsMap {
result := singleValueWithLabelsMap{}
for _, userEntry := range d {
userEntry.metrics.sumOfSingleValuesWithLabels(metric, labelNames, fn, result.aggregateFn)
userEntry.metrics.sumOfSingleValuesWithLabels(metric, labelNames, fn, result.aggregateFn, skipZeroValue)
}
return result
}
Expand Down Expand Up @@ -810,3 +821,25 @@ type CollectorVec interface {
prometheus.Collector
Delete(labels prometheus.Labels) bool
}

// MetricOption defines a functional-style option for metrics aggregation.
// An option receives a pointer to the metricOptions being assembled and
// mutates it in place.
type MetricOption func(options *metricOptions)

// SkipZeroValueMetrics is a MetricOption that turns on skipping of zero value
// metrics during aggregation. It unconditionally sets the flag to true; to
// keep zero value metrics, simply do not pass this option.
func SkipZeroValueMetrics(options *metricOptions) {
options.skipZeroValueMetrics = true
}

// applyMetricOptions returns a metricOptions with all the input options
// applied, in the order given, on top of a zero-valued metricOptions.
//
// Nil options are ignored, so a caller that assembles its option list
// conditionally may pass a nil entry without triggering a nil function call
// panic. When no options are given, the result has every setting at its zero
// value.
func applyMetricOptions(options ...MetricOption) *metricOptions {
	actual := &metricOptions{}
	for _, option := range options {
		// Calling a nil function value panics; treat it as "no option".
		if option == nil {
			continue
		}
		option(actual)
	}

	return actual
}

// metricOptions holds the settings assembled from the MetricOption values
// passed to a metrics aggregation call. Its zero value leaves every setting
// disabled.
type metricOptions struct {
// skipZeroValueMetrics, when true, asks the aggregation to leave out metrics whose value is zero.
skipZeroValueMetrics bool
}
Loading