From 651cab3537c461ece9e19053f3a74b47f81334a8 Mon Sep 17 00:00:00 2001 From: Tim Ross Date: Tue, 13 May 2025 16:55:49 -0400 Subject: [PATCH] Expose the cache health as a Prometheus metric Adds two new gauges to track cache health. - `teleport_cache_health`: labeled by component, it reflects whether the cache is healthy and populated. A value of 1 means healthy; a value of 0 means unhealthy. - `teleport_cache_last_reset_seconds`: labeled by component, it reflects the Unix time, in seconds, at which the cache was last reset. --- lib/cache/cache.go | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/lib/cache/cache.go b/lib/cache/cache.go index 2568714123fcc..fafb42b8dad5e 100644 --- a/lib/cache/cache.go +++ b/lib/cache/cache.go @@ -72,7 +72,25 @@ var ( []string{teleport.TagCacheComponent}, ) - cacheCollectors = []prometheus.Collector{cacheEventsReceived, cacheStaleEventsReceived} + cacheHealth = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: teleport.MetricNamespace, + Subsystem: "cache", + Name: "health", + Help: "Whether the cache for a particular Teleport service is healthy.", + }, + []string{teleport.TagCacheComponent}, + ) + + cacheLastReset = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: teleport.MetricNamespace, + Subsystem: "cache", + Name: "last_reset_seconds", + Help: "The unix time in seconds that the last cache reset was performed.", + }, + []string{teleport.TagCacheComponent}, + ) ) // highVolumeResources is the set of cached resources that tend to produce high @@ -506,6 +524,12 @@ func (c *Cache) setInitError(err error) { c.initErr = err close(c.initC) }) + + if err == nil { + cacheHealth.WithLabelValues(c.Component).Set(1.0) + } else { + cacheHealth.WithLabelValues(c.Component).Set(0.0) + } } // setReadStatus updates Cache.ok, which determines whether the @@ -858,7 +882,12 @@ const ( // New creates a new instance of Cache func New(config Config) (*Cache, error) { - if err := 
metrics.RegisterPrometheusCollectors(cacheCollectors...); err != nil { + if err := metrics.RegisterPrometheusCollectors( + cacheEventsReceived, + cacheStaleEventsReceived, + cacheHealth, + cacheLastReset, + ); err != nil { return nil, trace.Wrap(err) } @@ -1143,6 +1172,7 @@ func (c *Cache) notify(ctx context.Context, event Event) { // we assume that this cache will eventually end up in a correct state // potentially lagging behind the state of the database. func (c *Cache) fetchAndWatch(ctx context.Context, retry retryutils.Retry, timer *time.Timer) error { + cacheLastReset.WithLabelValues(c.Component).SetToCurrentTime() requestKinds := c.Config.Watches watcher, err := c.Events.NewWatcher(c.ctx, types.Watch{ Name: c.Component,