Skip to content

Commit 0afffa8

Browse files
authored
Fix prometheus histogram rate overflows (#17753)
Fix some overflows on Prometheus histogram rate calculations. They could be caused by: * New buckets added to existing histograms on runtime, this happens at least with CockroachDB (see #17736). * Buckets with bigger upper limits have lower counters. This is wrong and has been only reproduced this on tests, but handling it just in case to avoid losing other data if this happens with some service. Rate calculation methods return now also a boolean to be able to differenciate if a zero value is caused because it was the first call, or because it the rate is actually zero.
1 parent 15ad134 commit 0afffa8

File tree

6 files changed

+450
-24
lines changed

6 files changed

+450
-24
lines changed

CHANGELOG.next.asciidoc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,8 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
157157
- Fix "ID" event generator of Google Cloud module {issue}17160[17160] {pull}17608[17608]
158158
- Add privileged option for Auditbeat in Openshift {pull}17637[17637]
159159
- Fix storage metricset to allow config without region/zone. {issue}17623[17623] {pull}17624[17624]
160-
- Add a switch to the driver definition on SQL module to use pretty names {pull}17378[17378]
160+
- Add a switch to the driver definition on SQL module to use pretty names. {pull}17378[17378]
161+
- Fix overflow on Prometheus rates when new buckets are added on the go. {pull}17753[17753]
161162

162163
*Packetbeat*
163164

x-pack/metricbeat/module/prometheus/collector/counter.go

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,14 @@ type CounterCache interface {
2222
Stop()
2323

2424
// RateUint64 returns, for a given counter name, the difference between the given value
25-
// and the value that was given in a previous call. It will return 0 on the first call
26-
RateUint64(counterName string, value uint64) uint64
25+
// and the value that was given in a previous call, and true if a previous value existed.
26+
// It will return 0 and false on the first call.
27+
RateUint64(counterName string, value uint64) (uint64, bool)
2728

2829
// RateFloat64 returns, for a given counter name, the difference between the given value
29-
// and the value that was given in a previous call. It will return 0.0 on the first call
30-
RateFloat64(counterName string, value float64) float64
30+
// and the value that was given in a previous call, and true if a previous value existed.
31+
// It will return 0 and false on the first call.
32+
RateFloat64(counterName string, value float64) (float64, bool)
3133
}
3234

3335
type counterCache struct {
@@ -47,35 +49,37 @@ func NewCounterCache(timeout time.Duration) CounterCache {
4749
}
4850

4951
// RateUint64 returns, for a given counter name, the difference between the given value
50-
// and the value that was given in a previous call. It will return 0 on the first call
51-
func (c *counterCache) RateUint64(counterName string, value uint64) uint64 {
52+
// and the value that was given in a previous call, and true if a previous value existed.
53+
// It will return 0 and false on the first call.
54+
func (c *counterCache) RateUint64(counterName string, value uint64) (uint64, bool) {
5255
prev := c.ints.PutWithTimeout(counterName, value, c.timeout)
5356
if prev != nil {
5457
if prev.(uint64) > value {
5558
// counter reset
56-
return 0
59+
return 0, true
5760
}
58-
return value - prev.(uint64)
61+
return value - prev.(uint64), true
5962
}
6063

6164
// first put for this value, return rate of 0
62-
return 0
65+
return 0, false
6366
}
6467

6568
// RateFloat64 returns, for a given counter name, the difference between the given value
66-
// and the value that was given in a previous call. It will return 0.0 on the first call
67-
func (c *counterCache) RateFloat64(counterName string, value float64) float64 {
69+
// and the value that was given in a previous call, and true if a previous value existed.
70+
// It will return 0 and false on the first call.
71+
func (c *counterCache) RateFloat64(counterName string, value float64) (float64, bool) {
6872
prev := c.floats.PutWithTimeout(counterName, value, c.timeout)
6973
if prev != nil {
7074
if prev.(float64) > value {
7175
// counter reset
72-
return 0
76+
return 0, true
7377
}
74-
return value - prev.(float64)
78+
return value - prev.(float64), true
7579
}
7680

7781
// first put for this value, return rate of 0
78-
return 0
82+
return 0, false
7983
}
8084

8185
// Start the cache cleanup worker. It mus be called once before start using

x-pack/metricbeat/module/prometheus/collector/counter_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,13 @@ func Test_CounterCache(t *testing.T) {
5050
t.Run(tt.name, func(t *testing.T) {
5151
for i, val := range tt.valuesUint64 {
5252
want := tt.expectedUin64[i]
53-
if got := tt.counterCache.RateUint64(tt.counterName, val); got != want {
53+
if got, _ := tt.counterCache.RateUint64(tt.counterName, val); got != want {
5454
t.Errorf("counterCache.RateUint64() = %v, want %v", got, want)
5555
}
5656
}
5757
for i, val := range tt.valuesFloat64 {
5858
want := tt.expectedFloat64[i]
59-
if got := tt.counterCache.RateFloat64(tt.counterName, val); got != want {
59+
if got, _ := tt.counterCache.RateFloat64(tt.counterName, val); got != want {
6060
t.Errorf("counterCache.RateFloat64() = %v, want %v", got, want)
6161
}
6262
}

x-pack/metricbeat/module/prometheus/collector/data.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ func (g *typedGenerator) rateCounterUint64(name string, labels common.MapStr, va
171171
}
172172

173173
if g.rateCounters {
174-
d["rate"] = g.counterCache.RateUint64(name+labels.String(), value)
174+
d["rate"], _ = g.counterCache.RateUint64(name+labels.String(), value)
175175
}
176176

177177
return d
@@ -184,7 +184,7 @@ func (g *typedGenerator) rateCounterFloat64(name string, labels common.MapStr, v
184184
}
185185

186186
if g.rateCounters {
187-
d["rate"] = g.counterCache.RateFloat64(name+labels.String(), value)
187+
d["rate"], _ = g.counterCache.RateFloat64(name+labels.String(), value)
188188
}
189189

190190
return d

x-pack/metricbeat/module/prometheus/collector/histogram.go

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ func promHistogramToES(cc CounterCache, name string, labels common.MapStr, histo
3333

3434
// calculate centroids and rated counts
3535
var lastUpper, prevUpper float64
36-
var sumCount uint64
36+
var sumCount, prevCount uint64
3737
for _, bucket := range histogram.GetBucket() {
3838
// Ignore non-numbers
3939
if bucket.GetCumulativeCount() == uint64(math.NaN()) || bucket.GetCumulativeCount() == uint64(math.Inf(0)) {
@@ -50,10 +50,25 @@ func promHistogramToES(cc CounterCache, name string, labels common.MapStr, histo
5050
lastUpper = bucket.GetUpperBound()
5151
}
5252

53-
// take count for this period (rate) + deacumulate
54-
count := cc.RateUint64(name+labels.String()+fmt.Sprintf("%f", bucket.GetUpperBound()), bucket.GetCumulativeCount()) - sumCount
55-
counts = append(counts, count)
56-
sumCount += count
53+
// Take count for this period (rate)
54+
countRate, found := cc.RateUint64(name+labels.String()+fmt.Sprintf("%f", bucket.GetUpperBound()), bucket.GetCumulativeCount())
55+
56+
switch {
57+
case !found:
58+
// This is a new bucket, consider it zero by now, but still increase the
59+
// sum to don't deviate following buckets that are not new.
60+
counts = append(counts, 0)
61+
sumCount += bucket.GetCumulativeCount() - prevCount
62+
case countRate < sumCount:
63+
// This should never happen, this means something is wrong in the
64+
// prometheus response. Handle it to avoid overflowing when deaccumulating.
65+
counts = append(counts, 0)
66+
default:
67+
// Store the deaccumulated count.
68+
counts = append(counts, countRate-sumCount)
69+
sumCount = countRate
70+
}
71+
prevCount = bucket.GetCumulativeCount()
5772
}
5873

5974
res := common.MapStr{

0 commit comments

Comments
 (0)