Skip to content

Commit 73dfd1a

Browse files
authored
Cleaning up stale ingester metrics (#5930)
* Cleaning up stale ingester metrics Signed-off-by: alanprot <[email protected]> * changelog Signed-off-by: alanprot <[email protected]> --------- Signed-off-by: alanprot <[email protected]>
1 parent c11dc24 commit 73dfd1a

File tree

3 files changed

+105
-1
lines changed

3 files changed

+105
-1
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* [ENHANCEMENT] Query Frontend/Querier: Returns `warnings` on prometheus query responses. #5916
66
* [ENHANCEMENT] Ingester: Allow configuring the `-blocks-storage.tsdb.head-compaction-interval` flag up to 30 min and add a jitter to the first head compaction. #5919 #5928
77
* [ENHANCEMENT] Distributor: Added `max_inflight_push_requests` config to ingester client to protect the distributor from being OOMKilled. #5917
8+
* [ENHANCEMENT] Distributor/Querier: Clean stale per-ingester metrics after ingester restarts. #5930
89
* [CHANGE] Upgrade Dockerfile Node version from 14x to 18x. #5906
910
* [BUGFIX] Configsdb: Fix endline issue in db password. #5920
1011

pkg/distributor/distributor.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ const (
6565

6666
instanceIngestionRateTickInterval = time.Second
6767

68+
clearStaleIngesterMetricsInterval = time.Minute
69+
6870
// mergeSlicesParallelism is a constant of how many goroutines we should use to merge slices, and
6971
// it was based on empirical observation: See BenchmarkMergeSlicesParallel
7072
mergeSlicesParallelism = 8
@@ -398,6 +400,9 @@ func (d *Distributor) running(ctx context.Context) error {
398400
ingestionRateTicker := time.NewTicker(instanceIngestionRateTickInterval)
399401
defer ingestionRateTicker.Stop()
400402

403+
staleIngesterMetricTicker := time.NewTicker(clearStaleIngesterMetricsInterval)
404+
defer staleIngesterMetricTicker.Stop()
405+
401406
for {
402407
select {
403408
case <-ctx.Done():
@@ -406,6 +411,9 @@ func (d *Distributor) running(ctx context.Context) error {
406411
case <-ingestionRateTicker.C:
407412
d.ingestionRate.Tick()
408413

414+
case <-staleIngesterMetricTicker.C:
415+
d.cleanStaleIngesterMetrics()
416+
409417
case err := <-d.subservicesWatcher.Chan():
410418
return errors.Wrap(err, "distributor subservice failed")
411419
}
@@ -701,6 +709,41 @@ func (d *Distributor) Push(ctx context.Context, req *cortexpb.WriteRequest) (*co
701709
return &cortexpb.WriteResponse{}, firstPartialErr
702710
}
703711

712+
func (d *Distributor) cleanStaleIngesterMetrics() {
713+
healthy, unhealthy, err := d.ingestersRing.GetAllInstanceDescs(ring.WriteNoExtend)
714+
if err != nil {
715+
level.Warn(d.log).Log("msg", "error cleaning metrics: GetAllInstanceDescs", "err", err)
716+
return
717+
}
718+
719+
ipsMap := map[string]struct{}{}
720+
721+
for _, ing := range append(healthy, unhealthy...) {
722+
ipsMap[ing.Addr] = struct{}{}
723+
}
724+
725+
ingesterMetrics := []*prometheus.CounterVec{d.ingesterAppends, d.ingesterAppendFailures, d.ingesterQueries, d.ingesterQueryFailures}
726+
727+
for _, m := range ingesterMetrics {
728+
metrics, err := util.GetLabels(m, make(map[string]string))
729+
730+
if err != nil {
731+
level.Warn(d.log).Log("msg", "error cleaning metrics: GetLabels", "err", err)
732+
return
733+
}
734+
735+
for _, lbls := range metrics {
736+
if _, ok := ipsMap[lbls.Get("ingester")]; !ok {
737+
err := util.DeleteMatchingLabels(m, map[string]string{"ingester": lbls.Get("ingester")})
738+
if err != nil {
739+
level.Warn(d.log).Log("msg", "error cleaning metrics: DeleteMatchingLabels", "err", err)
740+
return
741+
}
742+
}
743+
}
744+
}
745+
}
746+
704747
func (d *Distributor) doBatch(ctx context.Context, req *cortexpb.WriteRequest, subRing ring.ReadRing, keys []uint32, initialMetadataIndex int, validatedMetadata []*cortexpb.MetricMetadata, validatedTimeseries []cortexpb.PreallocTimeseries, userID string) error {
705748
span, _ := opentracing.StartSpanFromContext(ctx, "doBatch")
706749
defer span.Finish()

pkg/distributor/distributor_test.go

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,8 +318,10 @@ func TestDistributor_Push(t *testing.T) {
318318

319319
func TestDistributor_MetricsCleanup(t *testing.T) {
320320
t.Parallel()
321-
dists, _, regs, _ := prepare(t, prepConfig{
321+
dists, _, regs, r := prepare(t, prepConfig{
322322
numDistributors: 1,
323+
numIngesters: 2,
324+
happyIngesters: 2,
323325
})
324326
d := dists[0]
325327
reg := regs[0]
@@ -334,6 +336,10 @@ func TestDistributor_MetricsCleanup(t *testing.T) {
334336
"cortex_distributor_metadata_in_total",
335337
"cortex_distributor_non_ha_samples_received_total",
336338
"cortex_distributor_latest_seen_sample_timestamp_seconds",
339+
"cortex_distributor_ingester_append_failures_total",
340+
"cortex_distributor_ingester_appends_total",
341+
"cortex_distributor_ingester_query_failures_total",
342+
"cortex_distributor_ingester_queries_total",
337343
}
338344

339345
d.receivedSamples.WithLabelValues("userA").Add(5)
@@ -349,6 +355,16 @@ func TestDistributor_MetricsCleanup(t *testing.T) {
349355
d.dedupedSamples.WithLabelValues("userA", "cluster1").Inc() // We cannot clean this metric
350356
d.latestSeenSampleTimestampPerUser.WithLabelValues("userA").Set(1111)
351357

358+
h, _, _ := r.GetAllInstanceDescs(ring.WriteNoExtend)
359+
d.ingesterAppends.WithLabelValues(h[0].Addr, typeMetadata).Inc()
360+
d.ingesterAppendFailures.WithLabelValues(h[0].Addr, typeMetadata, "2xx").Inc()
361+
d.ingesterAppends.WithLabelValues(h[1].Addr, typeMetadata).Inc()
362+
d.ingesterAppendFailures.WithLabelValues(h[1].Addr, typeMetadata, "2xx").Inc()
363+
d.ingesterQueries.WithLabelValues(h[0].Addr).Inc()
364+
d.ingesterQueries.WithLabelValues(h[1].Addr).Inc()
365+
d.ingesterQueryFailures.WithLabelValues(h[0].Addr).Inc()
366+
d.ingesterQueryFailures.WithLabelValues(h[1].Addr).Inc()
367+
352368
require.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
353369
# HELP cortex_distributor_deduped_samples_total The total number of deduplicated samples.
354370
# TYPE cortex_distributor_deduped_samples_total counter
@@ -388,10 +404,41 @@ func TestDistributor_MetricsCleanup(t *testing.T) {
388404
# HELP cortex_distributor_exemplars_in_total The total number of exemplars that have come in to the distributor, including rejected or deduped exemplars.
389405
# TYPE cortex_distributor_exemplars_in_total counter
390406
cortex_distributor_exemplars_in_total{user="userA"} 5
407+
408+
# HELP cortex_distributor_ingester_append_failures_total The total number of failed batch appends sent to ingesters.
409+
# TYPE cortex_distributor_ingester_append_failures_total counter
410+
cortex_distributor_ingester_append_failures_total{ingester="0",status="2xx",type="metadata"} 1
411+
cortex_distributor_ingester_append_failures_total{ingester="1",status="2xx",type="metadata"} 1
412+
# HELP cortex_distributor_ingester_appends_total The total number of batch appends sent to ingesters.
413+
# TYPE cortex_distributor_ingester_appends_total counter
414+
cortex_distributor_ingester_appends_total{ingester="0",type="metadata"} 1
415+
cortex_distributor_ingester_appends_total{ingester="1",type="metadata"} 1
416+
# HELP cortex_distributor_ingester_queries_total The total number of queries sent to ingesters.
417+
# TYPE cortex_distributor_ingester_queries_total counter
418+
cortex_distributor_ingester_queries_total{ingester="0"} 1
419+
cortex_distributor_ingester_queries_total{ingester="1"} 1
420+
# HELP cortex_distributor_ingester_query_failures_total The total number of failed queries sent to ingesters.
421+
# TYPE cortex_distributor_ingester_query_failures_total counter
422+
cortex_distributor_ingester_query_failures_total{ingester="0"} 1
423+
cortex_distributor_ingester_query_failures_total{ingester="1"} 1
391424
`), metrics...))
392425

393426
d.cleanupInactiveUser("userA")
394427

428+
err := r.KVClient.CAS(context.Background(), ingester.RingKey, func(in interface{}) (interface{}, bool, error) {
429+
r := in.(*ring.Desc)
430+
delete(r.Ingesters, "0")
431+
return in, true, nil
432+
})
433+
434+
test.Poll(t, time.Second, true, func() interface{} {
435+
ings, _, _ := r.GetAllInstanceDescs(ring.Write)
436+
return len(ings) == 1
437+
})
438+
439+
require.NoError(t, err)
440+
d.cleanStaleIngesterMetrics()
441+
395442
require.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
396443
# HELP cortex_distributor_deduped_samples_total The total number of deduplicated samples.
397444
# TYPE cortex_distributor_deduped_samples_total counter
@@ -422,6 +469,19 @@ func TestDistributor_MetricsCleanup(t *testing.T) {
422469
423470
# HELP cortex_distributor_exemplars_in_total The total number of exemplars that have come in to the distributor, including rejected or deduped exemplars.
424471
# TYPE cortex_distributor_exemplars_in_total counter
472+
473+
# HELP cortex_distributor_ingester_append_failures_total The total number of failed batch appends sent to ingesters.
474+
# TYPE cortex_distributor_ingester_append_failures_total counter
475+
cortex_distributor_ingester_append_failures_total{ingester="1",status="2xx",type="metadata"} 1
476+
# HELP cortex_distributor_ingester_appends_total The total number of batch appends sent to ingesters.
477+
# TYPE cortex_distributor_ingester_appends_total counter
478+
cortex_distributor_ingester_appends_total{ingester="1",type="metadata"} 1
479+
# HELP cortex_distributor_ingester_queries_total The total number of queries sent to ingesters.
480+
# TYPE cortex_distributor_ingester_queries_total counter
481+
cortex_distributor_ingester_queries_total{ingester="1"} 1
482+
# HELP cortex_distributor_ingester_query_failures_total The total number of failed queries sent to ingesters.
483+
# TYPE cortex_distributor_ingester_query_failures_total counter
484+
cortex_distributor_ingester_query_failures_total{ingester="1"} 1
425485
`), metrics...))
426486
}
427487

0 commit comments

Comments
 (0)