From 48f83b1cf959101aec74f79b714422f4f36b9cd3 Mon Sep 17 00:00:00 2001 From: rosstimothy <39066650+rosstimothy@users.noreply.github.com> Date: Fri, 16 May 2025 16:53:58 -0400 Subject: [PATCH] Expose the cache health as a prometheus metric (#54776) Adds two new gauges to track cache health. - `teleport_cache_health`: labeled by component, it reflects whether the cache is healthy and populated. A value of 1 means healthy; a value of 0 means unhealthy. - `teleport_cache_last_reset_seconds`: labeled by component, it reflects the unix time, in seconds, of the most recent cache reset. --- lib/cache/cache.go | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/lib/cache/cache.go b/lib/cache/cache.go index faac49a237119..a9fcd541078d7 100644 --- a/lib/cache/cache.go +++ b/lib/cache/cache.go @@ -80,7 +80,25 @@ var ( []string{teleport.TagCacheComponent}, ) - cacheCollectors = []prometheus.Collector{cacheEventsReceived, cacheStaleEventsReceived} + cacheHealth = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: teleport.MetricNamespace, + Subsystem: "cache", + Name: "health", + Help: "Whether the cache for a particular Teleport service is healthy.", + }, + []string{teleport.TagCacheComponent}, + ) + + cacheLastReset = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: teleport.MetricNamespace, + Subsystem: "cache", + Name: "last_reset_seconds", + Help: "The unix time in seconds that the last cache reset was performed.", + }, + []string{teleport.TagCacheComponent}, + ) ) // highVolumeResources is the set of cached resources that tend to produce high @@ -574,6 +592,12 @@ func (c *Cache) setInitError(err error) { c.initErr = err close(c.initC) }) + + if err == nil { + cacheHealth.WithLabelValues(c.Component).Set(1.0) + } else { + cacheHealth.WithLabelValues(c.Component).Set(0.0) + } } // setReadStatus updates Cache.ok, which determines whether the @@ -863,7 +887,12 @@ const ( // New creates a new instance of Cache func 
New(config Config) (*Cache, error) { - if err := metrics.RegisterPrometheusCollectors(cacheCollectors...); err != nil { + if err := metrics.RegisterPrometheusCollectors( + cacheEventsReceived, + cacheStaleEventsReceived, + cacheHealth, + cacheLastReset, + ); err != nil { return nil, trace.Wrap(err) } if err := config.CheckAndSetDefaults(); err != nil { @@ -1247,6 +1276,7 @@ func (c *Cache) notify(ctx context.Context, event Event) { // we assume that this cache will eventually end up in a correct state // potentially lagging behind the state of the database. func (c *Cache) fetchAndWatch(ctx context.Context, retry retryutils.Retry, timer *time.Timer) error { + cacheLastReset.WithLabelValues(c.Component).SetToCurrentTime() requestKinds := c.watchKinds() watcher, err := c.Events.NewWatcher(c.ctx, types.Watch{ Name: c.Component,