Skip to content

Commit 6f2be96

Browse files
[r335] Expose per-user metrics for unknown series references during WAL/WBL replay (#10982)
* Expose per-user metrics for unknown series references during WAL/WBL replay (#10981) * Expose per-user metrics for unknown series references during WAL/WBL replay * Update CHANGELOG (cherry picked from commit 30ed07e) * Trigger CI --------- Co-authored-by: Patryk Prus <[email protected]> Co-authored-by: Patryk Prus <[email protected]>
1 parent 5b8a447 commit 6f2be96

File tree

3 files changed

+60
-4
lines changed

3 files changed

+60
-4
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* [ENHANCEMENT] Ingester: Add support for exporting native histogram cost attribution metrics (`cortex_ingester_attributed_active_native_histogram_series` and `cortex_ingester_attributed_active_native_histogram_buckets`) with labels specified by customers to a custom Prometheus registry. #10892
1010
* [ENHANCEMENT] Store-gateway: Download sparse headers uploaded by compactors. Compactors have to be configured with `-compactor.upload-sparse-index-headers=true` option. #10879
1111
* [ENHANCEMENT] Compactor: Upload block index file and multiple segment files concurrently. Concurrency scales linearly with block size up to `-compactor.max-per-block-upload-concurrency`. #10947
12+
* [ENHANCEMENT] Ingester: Add per-user `cortex_ingester_tsdb_wal_replay_unknown_refs_total` and `cortex_ingester_tsdb_wbl_replay_unknown_refs_total` metrics to track unknown series references during WAL/WBL replay. #10981
1213
* [BUGFIX] OTLP: Fix response body and Content-Type header to align with spec. #10852
1314
* [BUGFIX] Compactor: fix issue where block becomes permanently stuck when the Compactor's block cleanup job partially deletes a block. #10888
1415
* [BUGFIX] Storage: fix intermittent failures in S3 upload retries. #10952

pkg/ingester/metrics.go

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,9 @@ type tsdbMetrics struct {
524524
memSeriesCreatedTotal *prometheus.Desc
525525
memSeriesRemovedTotal *prometheus.Desc
526526

527+
tsdbWalReplayUnknownRefsTotal *prometheus.Desc
528+
tsdbWblReplayUnknownRefsTotal *prometheus.Desc
529+
527530
headPostingsForMatchersCacheMetrics *tsdb.PostingsForMatchersCacheMetrics
528531
blockPostingsForMatchersCacheMetrics *tsdb.PostingsForMatchersCacheMetrics
529532

@@ -707,6 +710,15 @@ func newTSDBMetrics(r prometheus.Registerer, logger log.Logger) *tsdbMetrics {
707710
"The total number of series that were removed per user.",
708711
[]string{"user"}, nil),
709712

713+
tsdbWalReplayUnknownRefsTotal: prometheus.NewDesc(
714+
"cortex_ingester_tsdb_wal_replay_unknown_refs_total",
715+
"Total number of unknown series references encountered during WAL replay.",
716+
[]string{"user", "type"}, nil),
717+
tsdbWblReplayUnknownRefsTotal: prometheus.NewDesc(
718+
"cortex_ingester_tsdb_wbl_replay_unknown_refs_total",
719+
"Total number of unknown series references encountered during WBL replay.",
720+
[]string{"user", "type"}, nil),
721+
710722
headPostingsForMatchersCacheMetrics: tsdb.NewPostingsForMatchersCacheMetrics(prometheus.WrapRegistererWithPrefix("cortex_ingester_tsdb_head_", r)),
711723
blockPostingsForMatchersCacheMetrics: tsdb.NewPostingsForMatchersCacheMetrics(prometheus.WrapRegistererWithPrefix("cortex_ingester_tsdb_block_", r)),
712724
}
@@ -762,6 +774,9 @@ func (sm *tsdbMetrics) Describe(out chan<- *prometheus.Desc) {
762774
out <- sm.memSeries
763775
out <- sm.memSeriesCreatedTotal
764776
out <- sm.memSeriesRemovedTotal
777+
778+
out <- sm.tsdbWalReplayUnknownRefsTotal
779+
out <- sm.tsdbWblReplayUnknownRefsTotal
765780
}
766781

767782
func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) {
@@ -804,12 +819,12 @@ func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) {
804819
data.SendSumOfGaugesPerTenant(out, sm.tsdbExemplarSeriesInStorage, "prometheus_tsdb_exemplar_series_with_exemplars_in_storage")
805820
data.SendSumOfGaugesPerTenant(out, sm.tsdbExemplarLastTs, "prometheus_tsdb_exemplar_last_exemplars_timestamp_seconds")
806821
data.SendSumOfCounters(out, sm.tsdbExemplarsOutOfOrder, "prometheus_tsdb_exemplar_out_of_order_exemplars_total")
807-
808822
data.SendSumOfCountersPerTenant(out, sm.tsdbOOOAppendedSamples, "prometheus_tsdb_head_out_of_order_samples_appended_total")
809-
810823
data.SendSumOfGauges(out, sm.memSeries, "prometheus_tsdb_head_series")
811824
data.SendSumOfCountersPerTenant(out, sm.memSeriesCreatedTotal, "prometheus_tsdb_head_series_created_total")
812825
data.SendSumOfCountersPerTenant(out, sm.memSeriesRemovedTotal, "prometheus_tsdb_head_series_removed_total")
826+
data.SendSumOfCountersPerTenant(out, sm.tsdbWalReplayUnknownRefsTotal, "prometheus_tsdb_wal_replay_unknown_refs_total", dskit_metrics.WithLabels("type"))
827+
data.SendSumOfCountersPerTenant(out, sm.tsdbWblReplayUnknownRefsTotal, "prometheus_tsdb_wbl_replay_unknown_refs_total", dskit_metrics.WithLabels("type"))
813828
}
814829

815830
func (sm *tsdbMetrics) setRegistryForUser(userID string, registry *prometheus.Registry) {

pkg/ingester/metrics_test.go

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ func TestTSDBMetrics(t *testing.T) {
217217
# HELP cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total Total number of out-of-order exemplar ingestion failed attempts.
218218
# TYPE cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total counter
219219
cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total 9
220-
220+
221221
# HELP cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage Number of TSDB series with exemplars currently in storage.
222222
# TYPE cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage gauge
223223
cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{user="user1"} 1
@@ -289,6 +289,21 @@ func TestTSDBMetrics(t *testing.T) {
289289
cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="max-items-reached"} 0
290290
cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="ttl-expired"} 0
291291
cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="unknown"} 0
292+
293+
# HELP cortex_ingester_tsdb_wal_replay_unknown_refs_total Total number of unknown series references encountered during WAL replay.
294+
# TYPE cortex_ingester_tsdb_wal_replay_unknown_refs_total counter
295+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="series", user="user1"} 12345
296+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="samples", user="user1"} 24690
297+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="series", user="user2"} 85787
298+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="samples", user="user2"} 171574
299+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="series", user="user3"} 999
300+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="samples", user="user3"} 1998
301+
302+
# HELP cortex_ingester_tsdb_wbl_replay_unknown_refs_total Total number of unknown series references encountered during WBL replay.
303+
# TYPE cortex_ingester_tsdb_wbl_replay_unknown_refs_total counter
304+
cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="exemplars", user="user1"} 12345
305+
cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="exemplars", user="user2"} 85787
306+
cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="exemplars", user="user3"} 999
292307
`))
293308
require.NoError(t, err)
294309
}
@@ -488,7 +503,7 @@ func TestTSDBMetricsWithRemoval(t *testing.T) {
488503
# HELP cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total Total number of out-of-order exemplar ingestion failed attempts.
489504
# TYPE cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total counter
490505
cortex_ingester_tsdb_exemplar_out_of_order_exemplars_total 9
491-
506+
492507
# HELP cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage Number of TSDB series with exemplars currently in storage.
493508
# TYPE cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage gauge
494509
cortex_ingester_tsdb_exemplar_series_with_exemplars_in_storage{user="user1"} 1
@@ -557,6 +572,18 @@ func TestTSDBMetricsWithRemoval(t *testing.T) {
557572
cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="max-items-reached"} 0
558573
cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="ttl-expired"} 0
559574
cortex_ingester_tsdb_block_postings_for_matchers_cache_evictions_total{reason="unknown"} 0
575+
576+
# HELP cortex_ingester_tsdb_wal_replay_unknown_refs_total Total number of unknown series references encountered during WAL replay.
577+
# TYPE cortex_ingester_tsdb_wal_replay_unknown_refs_total counter
578+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="series", user="user1"} 12345
579+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="samples", user="user1"} 24690
580+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="series", user="user2"} 85787
581+
cortex_ingester_tsdb_wal_replay_unknown_refs_total{type="samples", user="user2"} 171574
582+
583+
# HELP cortex_ingester_tsdb_wbl_replay_unknown_refs_total Total number of unknown series references encountered during WBL replay.
584+
# TYPE cortex_ingester_tsdb_wbl_replay_unknown_refs_total counter
585+
cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="exemplars", user="user1"} 12345
586+
cortex_ingester_tsdb_wbl_replay_unknown_refs_total{type="exemplars", user="user2"} 85787
560587
`))
561588
require.NoError(t, err)
562589
}
@@ -815,5 +842,18 @@ func populateTSDBMetrics(base float64) *prometheus.Registry {
815842
})
816843
chunksMmappedTotal.Add(30 * base)
817844

845+
tsdbWalReplayUnknownRefsTotal := promauto.With(r).NewCounterVec(prometheus.CounterOpts{
846+
Name: "prometheus_tsdb_wal_replay_unknown_refs_total",
847+
Help: "Total number of unknown series references encountered during WAL replay.",
848+
}, []string{"type"})
849+
tsdbWalReplayUnknownRefsTotal.WithLabelValues("series").Add(base)
850+
tsdbWalReplayUnknownRefsTotal.WithLabelValues("samples").Add(base * 2)
851+
852+
tsdbWblReplayUnknownRefsTotal := promauto.With(r).NewCounterVec(prometheus.CounterOpts{
853+
Name: "prometheus_tsdb_wbl_replay_unknown_refs_total",
854+
Help: "Total number of unknown series references encountered during WBL replay.",
855+
}, []string{"type"})
856+
tsdbWblReplayUnknownRefsTotal.WithLabelValues("exemplars").Add(base)
857+
818858
return r
819859
}

0 commit comments

Comments
 (0)