
Commit 0f688a5

Added metrics to track on-going compaction
Signed-off-by: Marco Pracucci <[email protected]>
1 parent ac36a60 commit 0f688a5

File tree

2 files changed: 45 additions & 6 deletions

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -39,6 +39,11 @@
   * - `process_memory_map_areas`
   * - `process_memory_map_areas_limit`
 * [ENHANCEMENT] Ruler: Expose gRPC client options. #3523
+* [ENHANCEMENT] Compactor: added metrics to track on-going compaction. #3535
+  * `cortex_compactor_tenants_discovered`
+  * `cortex_compactor_tenants_skipped`
+  * `cortex_compactor_tenants_processing_succeeded`
+  * `cortex_compactor_tenants_processing_failed`
 * [BUGFIX] Blocks storage ingester: fixed some cases leading to a TSDB WAL corruption after a partial write to disk. #3423
 * [BUGFIX] Blocks storage: Fix the race between ingestion and `/flush` call resulting in overlapping blocks. #3422
 * [BUGFIX] Querier: fixed `-querier.max-query-into-future` which wasn't correctly enforced on range queries. #3452
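
For context (not part of this commit): the four new series are plain gauges, so once the compactor is being scraped they can be read with an ordinary instant query. Below is a minimal sketch using the client_golang API client; the Prometheus server address and the bare metric selector are assumptions for illustration only.

```go
// Sketch only: reads one of the new per-run gauges from a Prometheus server
// that scrapes the compactor. The address below is an assumption.
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	client, err := api.NewClient(api.Config{Address: "http://localhost:9090"}) // assumed Prometheus address
	if err != nil {
		log.Fatal(err)
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Instant query for one of the new gauges: it reads 0 while the compactor
	// is idle and the discovered tenant count while a compaction run is active.
	result, warnings, err := v1.NewAPI(client).Query(ctx, "cortex_compactor_tenants_discovered", time.Now())
	if err != nil {
		log.Fatal(err)
	}
	if len(warnings) > 0 {
		log.Println("warnings:", warnings)
	}
	fmt.Println(result)
}
```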

pkg/compactor/compactor.go

Lines changed: 40 additions & 6 deletions
@@ -138,12 +138,16 @@ type Compactor struct {
     ringSubservicesWatcher *services.FailureWatcher

     // Metrics.
-    compactionRunsStarted     prometheus.Counter
-    compactionRunsCompleted   prometheus.Counter
-    compactionRunsFailed      prometheus.Counter
-    compactionRunsLastSuccess prometheus.Gauge
-    blocksMarkedForDeletion   prometheus.Counter
-    garbageCollectedBlocks    prometheus.Counter
+    compactionRunsStarted          prometheus.Counter
+    compactionRunsCompleted        prometheus.Counter
+    compactionRunsFailed           prometheus.Counter
+    compactionRunsLastSuccess      prometheus.Gauge
+    compactionRunDiscoveredTenants prometheus.Gauge
+    compactionRunSkippedTenants    prometheus.Gauge
+    compactionRunSucceededTenants  prometheus.Gauge
+    compactionRunFailedTenants     prometheus.Gauge
+    blocksMarkedForDeletion        prometheus.Counter
+    garbageCollectedBlocks         prometheus.Counter

     // TSDB syncer metrics
     syncerMetrics *syncerMetrics
@@ -206,6 +210,22 @@ func newCompactor(
             Name: "cortex_compactor_last_successful_run_timestamp_seconds",
             Help: "Unix timestamp of the last successful compaction run.",
         }),
+        compactionRunDiscoveredTenants: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
+            Name: "cortex_compactor_tenants_discovered",
+            Help: "Number of tenants discovered during the current compaction run. Reset to 0 when compactor is idle.",
+        }),
+        compactionRunSkippedTenants: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
+            Name: "cortex_compactor_tenants_skipped",
+            Help: "Number of tenants skipped during the current compaction run. Reset to 0 when compactor is idle.",
+        }),
+        compactionRunSucceededTenants: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
+            Name: "cortex_compactor_tenants_processing_succeeded",
+            Help: "Number of tenants successfully processed during the current compaction run. Reset to 0 when compactor is idle.",
+        }),
+        compactionRunFailedTenants: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
+            Name: "cortex_compactor_tenants_processing_failed",
+            Help: "Number of tenants failed processing during the current compaction run. Reset to 0 when compactor is idle.",
+        }),
         blocksMarkedForDeletion: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
             Name: "cortex_compactor_blocks_marked_for_deletion_total",
             Help: "Total number of blocks marked for deletion in compactor.",
@@ -377,13 +397,23 @@ func (c *Compactor) compactUsersWithRetries(ctx context.Context) {
 }

 func (c *Compactor) compactUsers(ctx context.Context) error {
+    // Reset progress metrics once done.
+    defer func() {
+        c.compactionRunDiscoveredTenants.Set(0)
+        c.compactionRunSkippedTenants.Set(0)
+        c.compactionRunSucceededTenants.Set(0)
+        c.compactionRunFailedTenants.Set(0)
+    }()
+
     level.Info(c.logger).Log("msg", "discovering users from bucket")
     users, err := c.discoverUsers(ctx)
     if err != nil {
         level.Error(c.logger).Log("msg", "failed to discover users from bucket", "err", err)
         return errors.Wrap(err, "failed to discover users from bucket")
     }
+
     level.Info(c.logger).Log("msg", "discovered users from bucket", "users", len(users))
+    c.compactionRunDiscoveredTenants.Set(float64(len(users)))

     // When starting multiple compactor replicas nearly at the same time, running in a cluster with
     // a large number of tenants, we may end up in a situation where the 1st user is compacted by
@@ -403,21 +433,25 @@ func (c *Compactor) compactUsers(ctx context.Context) error {

         // Ensure the user ID belongs to our shard.
         if owned, err := c.ownUser(userID); err != nil {
+            c.compactionRunSkippedTenants.Inc()
             level.Warn(c.logger).Log("msg", "unable to check if user is owned by this shard", "user", userID, "err", err)
             continue
         } else if !owned {
+            c.compactionRunSkippedTenants.Inc()
             level.Debug(c.logger).Log("msg", "skipping user because not owned by this shard", "user", userID)
             continue
         }

         level.Info(c.logger).Log("msg", "starting compaction of user blocks", "user", userID)

         if err = c.compactUser(ctx, userID); err != nil {
+            c.compactionRunFailedTenants.Inc()
             level.Error(c.logger).Log("msg", "failed to compact user blocks", "user", userID, "err", err)
             errs.Add(errors.Wrapf(err, "failed to compact user blocks (user: %s)", userID))
             continue
         }

+        c.compactionRunSucceededTenants.Inc()
         level.Info(c.logger).Log("msg", "successfully compacted user blocks", "user", userID)
     }
