6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,12 @@

## master / unreleased

* [ENHANCEMENT] Blocks storage ingester: exported more TSDB-related metrics. #3412
- `cortex_ingester_tsdb_wal_corruptions_total`
- `cortex_ingester_tsdb_head_truncations_failed_total`
- `cortex_ingester_tsdb_head_truncations_total`
- `cortex_ingester_tsdb_head_gc_duration_seconds`

## 1.5.0 in progress

* [CHANGE] Blocks storage: update the default HTTP configuration values for the S3 client to the upstream Thanos default values. #3244
8 changes: 4 additions & 4 deletions pkg/ingester/ingester_v2_test.go
@@ -1860,12 +1860,12 @@ func TestIngester_flushing(t *testing.T) {
i.FlushHandler(httptest.NewRecorder(), httptest.NewRequest("POST", "/flush", nil))

// Flush handler only triggers compactions, but doesn't wait for them to finish. Let's wait for a moment, and then verify.
- test.Poll(t, 1*time.Second, true, func() interface{} {
+ test.Poll(t, 5*time.Second, uint64(0), func() interface{} {
Inline comment from the PR author:
This test was flaky again on my local machine, so I had to:

  1. Increase the polling timeout from 1s to 5s
  2. Compare against the actual number, so we can see what the value of NumSeries() is when it fails

db := i.getTSDB(userID)
if db == nil {
return false
}
- return db.Head().NumSeries() == 0
+ return db.Head().NumSeries()
})

// The above waiting only ensures compaction, waiting another second to register the Sync call.
@@ -1900,12 +1900,12 @@ func TestIngester_flushing(t *testing.T) {
i.FlushHandler(httptest.NewRecorder(), httptest.NewRequest("POST", "/flush", nil))

// Flush handler only triggers compactions, but doesn't wait for them to finish. Let's wait for a moment, and then verify.
- test.Poll(t, 1*time.Second, true, func() interface{} {
+ test.Poll(t, 5*time.Second, uint64(0), func() interface{} {
db := i.getTSDB(userID)
if db == nil {
return false
}
- return db.Head().NumSeries() == 0
+ return db.Head().NumSeries()
})

// The above waiting only ensures compaction, waiting another second to register the Sync call.
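To make the author's reasoning above concrete: polling for a concrete expected value (`uint64(0)`) instead of a boolean means a timeout reports the actual series count rather than just `false`. The following is only an illustrative sketch of such a polling helper, not the Cortex `test.Poll` implementation; the helper name, the sleep interval, and the failure message are assumptions.

```go
// Illustrative sketch only (not the Cortex test.Poll implementation): a
// simplified polling helper showing why comparing against a concrete value
// gives a more useful failure message than comparing a boolean.
package sketch

import (
	"reflect"
	"testing"
	"time"
)

// pollForValue re-evaluates have() until it equals want or the timeout expires.
// On timeout it fails with both values, so a flaky run reports something like
// "expected 0, got 3" instead of "expected true, got false".
func pollForValue(t *testing.T, timeout time.Duration, want interface{}, have func() interface{}) {
	t.Helper()

	deadline := time.Now().Add(timeout)
	var got interface{}
	for time.Now().Before(deadline) {
		got = have()
		if reflect.DeepEqual(want, got) {
			return
		}
		time.Sleep(100 * time.Millisecond)
	}
	t.Fatalf("expected %v, got %v", want, got)
}
```

Used as in the diff above, e.g. `pollForValue(t, 5*time.Second, uint64(0), func() interface{} { return db.Head().NumSeries() })`, a failing run would include the lingering series count in its output.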
60 changes: 44 additions & 16 deletions pkg/ingester/metrics.go
@@ -232,10 +232,14 @@ type tsdbMetrics struct {
tsdbFsyncDuration *prometheus.Desc
tsdbPageFlushes *prometheus.Desc
tsdbPageCompletions *prometheus.Desc
- tsdbTruncateFail *prometheus.Desc
- tsdbTruncateTotal *prometheus.Desc
- tsdbTruncateDuration *prometheus.Desc
- tsdbWritesFailed *prometheus.Desc
+ tsdbWALTruncateFail *prometheus.Desc
+ tsdbWALTruncateTotal *prometheus.Desc
+ tsdbWALTruncateDuration *prometheus.Desc
+ tsdbWALCorruptionsTotal *prometheus.Desc
+ tsdbWALWritesFailed *prometheus.Desc
+ tsdbHeadTruncateFail *prometheus.Desc
+ tsdbHeadTruncateTotal *prometheus.Desc
+ tsdbHeadGcDuration *prometheus.Desc
tsdbActiveAppenders *prometheus.Desc
tsdbSeriesNotFound *prometheus.Desc
tsdbChunks *prometheus.Desc
@@ -296,22 +300,38 @@ func newTSDBMetrics(r prometheus.Registerer) *tsdbMetrics {
"cortex_ingester_tsdb_wal_completed_pages_total",
"Total number of TSDB WAL completed pages.",
nil, nil),
- tsdbTruncateFail: prometheus.NewDesc(
+ tsdbWALTruncateFail: prometheus.NewDesc(
"cortex_ingester_tsdb_wal_truncations_failed_total",
"Total number of TSDB WAL truncations that failed.",
nil, nil),
- tsdbTruncateTotal: prometheus.NewDesc(
+ tsdbWALTruncateTotal: prometheus.NewDesc(
"cortex_ingester_tsdb_wal_truncations_total",
"Total number of TSDB WAL truncations attempted.",
nil, nil),
- tsdbTruncateDuration: prometheus.NewDesc(
+ tsdbWALTruncateDuration: prometheus.NewDesc(
"cortex_ingester_tsdb_wal_truncate_duration_seconds",
"Duration of TSDB WAL truncation.",
nil, nil),
- tsdbWritesFailed: prometheus.NewDesc(
+ tsdbWALCorruptionsTotal: prometheus.NewDesc(
+ "cortex_ingester_tsdb_wal_corruptions_total",
+ "Total number of TSDB WAL corruptions.",
+ nil, nil),
+ tsdbWALWritesFailed: prometheus.NewDesc(
"cortex_ingester_tsdb_wal_writes_failed_total",
"Total number of TSDB WAL writes that failed.",
nil, nil),
+ tsdbHeadTruncateFail: prometheus.NewDesc(
+ "cortex_ingester_tsdb_head_truncations_failed_total",
+ "Total number of TSDB head truncations that failed.",
+ nil, nil),
+ tsdbHeadTruncateTotal: prometheus.NewDesc(
+ "cortex_ingester_tsdb_head_truncations_total",
+ "Total number of TSDB head truncations attempted.",
+ nil, nil),
+ tsdbHeadGcDuration: prometheus.NewDesc(
+ "cortex_ingester_tsdb_head_gc_duration_seconds",
+ "Runtime of garbage collection in the TSDB head.",
+ nil, nil),
tsdbActiveAppenders: prometheus.NewDesc(
"cortex_ingester_tsdb_head_active_appenders",
"Number of currently active TSDB appender transactions.",
@@ -374,10 +394,14 @@ func (sm *tsdbMetrics) Describe(out chan<- *prometheus.Desc) {
out <- sm.tsdbFsyncDuration
out <- sm.tsdbPageFlushes
out <- sm.tsdbPageCompletions
- out <- sm.tsdbTruncateFail
- out <- sm.tsdbTruncateTotal
- out <- sm.tsdbTruncateDuration
- out <- sm.tsdbWritesFailed
+ out <- sm.tsdbWALTruncateFail
+ out <- sm.tsdbWALTruncateTotal
+ out <- sm.tsdbWALTruncateDuration
+ out <- sm.tsdbWALCorruptionsTotal
+ out <- sm.tsdbWALWritesFailed
+ out <- sm.tsdbHeadTruncateFail
+ out <- sm.tsdbHeadTruncateTotal
+ out <- sm.tsdbHeadGcDuration
out <- sm.tsdbActiveAppenders
out <- sm.tsdbSeriesNotFound
out <- sm.tsdbChunks
@@ -408,10 +432,14 @@ func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfSummaries(out, sm.tsdbFsyncDuration, "prometheus_tsdb_wal_fsync_duration_seconds")
data.SendSumOfCounters(out, sm.tsdbPageFlushes, "prometheus_tsdb_wal_page_flushes_total")
data.SendSumOfCounters(out, sm.tsdbPageCompletions, "prometheus_tsdb_wal_completed_pages_total")
- data.SendSumOfCounters(out, sm.tsdbTruncateFail, "prometheus_tsdb_wal_truncations_failed_total")
- data.SendSumOfCounters(out, sm.tsdbTruncateTotal, "prometheus_tsdb_wal_truncations_total")
- data.SendSumOfSummaries(out, sm.tsdbTruncateDuration, "prometheus_tsdb_wal_truncate_duration_seconds")
- data.SendSumOfCounters(out, sm.tsdbWritesFailed, "prometheus_tsdb_wal_writes_failed_total")
+ data.SendSumOfCounters(out, sm.tsdbWALTruncateFail, "prometheus_tsdb_wal_truncations_failed_total")
+ data.SendSumOfCounters(out, sm.tsdbWALTruncateTotal, "prometheus_tsdb_wal_truncations_total")
+ data.SendSumOfSummaries(out, sm.tsdbWALTruncateDuration, "prometheus_tsdb_wal_truncate_duration_seconds")
+ data.SendSumOfCounters(out, sm.tsdbWALCorruptionsTotal, "prometheus_tsdb_wal_corruptions_total")
+ data.SendSumOfCounters(out, sm.tsdbWALWritesFailed, "prometheus_tsdb_wal_writes_failed_total")
+ data.SendSumOfCounters(out, sm.tsdbHeadTruncateFail, "prometheus_tsdb_head_truncations_failed_total")
+ data.SendSumOfCounters(out, sm.tsdbHeadTruncateTotal, "prometheus_tsdb_head_truncations_total")
+ data.SendSumOfSummaries(out, sm.tsdbHeadGcDuration, "prometheus_tsdb_head_gc_duration_seconds")
data.SendSumOfGauges(out, sm.tsdbActiveAppenders, "prometheus_tsdb_head_active_appenders")
data.SendSumOfCounters(out, sm.tsdbSeriesNotFound, "prometheus_tsdb_head_series_not_found_total")
data.SendSumOfGauges(out, sm.tsdbChunks, "prometheus_tsdb_head_chunks")
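The Collect method shown above re-publishes per-user TSDB metrics under the cortex_ingester_tsdb_* names by summing the underlying prometheus_tsdb_* series across all user registries (via the SendSumOfCounters/SendSumOfSummaries helpers). As a rough, self-contained sketch of that aggregation idea using plain client_golang — not the Cortex helpers, and with illustrative names throughout:

```go
// Minimal sketch of summing one counter family across several per-user
// registries and re-exporting the total under a new descriptor. This is an
// illustration of the idea, not the Cortex SendSumOfCounters helper.
package sketch

import "github.com/prometheus/client_golang/prometheus"

// sumCounterAcross gathers every registry and adds up all samples of the named
// counter family (across all label combinations).
func sumCounterAcross(regs []*prometheus.Registry, name string) float64 {
	total := 0.0
	for _, reg := range regs {
		families, err := reg.Gather()
		if err != nil {
			continue // skip registries that fail to gather
		}
		for _, mf := range families {
			if mf.GetName() != name {
				continue
			}
			for _, m := range mf.GetMetric() {
				total += m.GetCounter().GetValue()
			}
		}
	}
	return total
}

// exportSummed emits the summed value under an ingester-level descriptor, as a
// custom Collect method would.
func exportSummed(out chan<- prometheus.Metric, desc *prometheus.Desc, regs []*prometheus.Registry) {
	total := sumCounterAcross(regs, "prometheus_tsdb_wal_corruptions_total")
	out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, total)
}
```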
41 changes: 41 additions & 0 deletions pkg/ingester/metrics_test.go
@@ -84,10 +84,27 @@ func TestTSDBMetrics(t *testing.T) {
# TYPE cortex_ingester_tsdb_wal_truncations_total counter
cortex_ingester_tsdb_wal_truncations_total 1387834

# HELP cortex_ingester_tsdb_wal_corruptions_total Total number of TSDB WAL corruptions.
# TYPE cortex_ingester_tsdb_wal_corruptions_total counter
cortex_ingester_tsdb_wal_corruptions_total 2.676537e+06

# HELP cortex_ingester_tsdb_wal_writes_failed_total Total number of TSDB WAL writes that failed.
# TYPE cortex_ingester_tsdb_wal_writes_failed_total counter
cortex_ingester_tsdb_wal_writes_failed_total 1486965

# HELP cortex_ingester_tsdb_head_truncations_failed_total Total number of TSDB head truncations that failed.
# TYPE cortex_ingester_tsdb_head_truncations_failed_total counter
cortex_ingester_tsdb_head_truncations_failed_total 2.775668e+06

# HELP cortex_ingester_tsdb_head_truncations_total Total number of TSDB head truncations attempted.
# TYPE cortex_ingester_tsdb_head_truncations_total counter
cortex_ingester_tsdb_head_truncations_total 2.874799e+06

# HELP cortex_ingester_tsdb_head_gc_duration_seconds Runtime of garbage collection in the TSDB head.
# TYPE cortex_ingester_tsdb_head_gc_duration_seconds summary
cortex_ingester_tsdb_head_gc_duration_seconds_sum 9
cortex_ingester_tsdb_head_gc_duration_seconds_count 3

# HELP cortex_ingester_tsdb_checkpoint_deletions_failed_total Total number of TSDB checkpoint deletions that failed.
# TYPE cortex_ingester_tsdb_checkpoint_deletions_failed_total counter
cortex_ingester_tsdb_checkpoint_deletions_failed_total 1586096
@@ -309,5 +326,29 @@ func populateTSDBMetrics(base float64) *prometheus.Registry {
})
mmapChunkCorruptionTotal.Add(26 * base)

walCorruptionsTotal := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_corruptions_total",
Help: "Total number of WAL corruptions.",
})
walCorruptionsTotal.Add(27 * base)

headTruncateFail := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_head_truncations_failed_total",
Help: "Total number of head truncations that failed.",
})
headTruncateFail.Add(28 * base)

headTruncateTotal := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_head_truncations_total",
Help: "Total number of head truncations attempted.",
})
headTruncateTotal.Add(29 * base)

gcDuration := promauto.With(r).NewSummary(prometheus.SummaryOpts{
Name: "prometheus_tsdb_head_gc_duration_seconds",
Help: "Runtime of garbage collection in the head block.",
})
gcDuration.Observe(3)

return r
}
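For readers cross-checking the expected output in TestTSDBMetrics against populateTSDBMetrics: each expected counter value is its per-registry multiplier times the sum of the base values the test passes in. The individual bases are not shown in this diff, but the expected output implies their sum is 2676537 / 27 = 99131 and that three registries are populated (the gc-duration summary has a count of 3 with one Observe(3) per registry). A quick arithmetic check under those assumptions:

```go
// Sanity check of the expected values in TestTSDBMetrics above. The base sum
// (99131) and the registry count (3) are inferred from the expected output,
// not taken from code shown in this diff.
package main

import "fmt"

func main() {
	const baseSum = 99131 // implied by 2676537 / 27

	fmt.Println(27*baseSum == 2676537) // cortex_ingester_tsdb_wal_corruptions_total
	fmt.Println(28*baseSum == 2775668) // cortex_ingester_tsdb_head_truncations_failed_total
	fmt.Println(29*baseSum == 2874799) // cortex_ingester_tsdb_head_truncations_total
	// gcDuration.Observe(3) once per registry: three registries give
	// cortex_ingester_tsdb_head_gc_duration_seconds_sum 9 and _count 3.
}
```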