From ca38bc407f1b8acb142f6204862113fc33cbc997 Mon Sep 17 00:00:00 2001 From: Tim Buckley Date: Fri, 21 Feb 2025 19:47:27 -0700 Subject: [PATCH 1/2] Machine ID: Add Prometheus metrics for loop tasks This adds a number of Prometheus metrics to help track success, failure, and timing for loop iterations. The loop helper is used across tbot services, so these metrics universally cover identity and output renewals, among other tasks. Also, renames `service_heatbeat.go`, which was misspelled. --- lib/tbot/loop.go | 44 ++++++++++++++++++- ...rvice_heatbeat.go => service_heartbeat.go} | 0 lib/tbot/tbot.go | 9 +++- 3 files changed, 50 insertions(+), 3 deletions(-) rename lib/tbot/{service_heatbeat.go => service_heartbeat.go} (100%) diff --git a/lib/tbot/loop.go b/lib/tbot/loop.go index aa3b2098302cb..e4c94e3790d66 100644 --- a/lib/tbot/loop.go +++ b/lib/tbot/loop.go @@ -26,10 +26,40 @@ import ( "github.com/gravitational/trace" "github.com/jonboulle/clockwork" + "github.com/prometheus/client_golang/prometheus" "github.com/gravitational/teleport/api/utils/retryutils" ) +var ( + loopIterationsCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "tbot_task_iterations", + Help: "Number of task iteration attempts, not counting retries", + }, []string{"name"}, + ) + loopIterationsSuccessCounter = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "tbot_task_iterations_successful", + Help: "Histogram of task iterations that ultimately succeeded, bucketed by number of retries before success", + Buckets: []float64{0, 1, 2, 3, 4, 5}, + }, []string{"name"}, + ) + loopIterationsFailureCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "tbot_task_iterations_failed", + Help: "Number of task iterations that ultimately failed, not counting retries", + }, []string{"name"}, + ) + loopIterationTime = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "tbot_task_iteration_duration_seconds", + Help: "Time between beginning and ultimate end of one task iteration regardless of outcome, including all retries", + Buckets: prometheus.ExponentialBuckets(0.1, 1.75, 6), + }, []string{"name"}, + ) +) + type runOnIntervalConfig struct { name string f func(ctx context.Context) error @@ -49,8 +79,6 @@ type runOnIntervalConfig struct { // runOnInterval runs a function on a given interval, with retries and jitter. // // TODO(noah): Emit Prometheus metrics for: -// - Success/Failure of attempts -// - Time taken to execute attempt // - Time of next attempt func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { switch { @@ -87,6 +115,9 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { } firstRun = false + loopIterationsCounter.WithLabelValues(cfg.name).Inc() + startTime := time.Now() + var err error for attempt := 1; attempt <= cfg.retryLimit; attempt++ { log.InfoContext( @@ -97,6 +128,7 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { ) err = cfg.f(ctx) if err == nil { + loopIterationsSuccessCounter.WithLabelValues(cfg.name).Observe(float64(attempt - 1)) break } @@ -114,12 +146,20 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { ) select { case <-ctx.Done(): + // Note: will discard metric update for this loop. It + // probably won't be collected if we're shutting down, + // anyway. return nil case <-cfg.clock.After(backoffTime): } } } + + loopIterationTime.WithLabelValues(cfg.name).Observe(time.Since(startTime).Seconds()) + if err != nil { + loopIterationsFailureCounter.WithLabelValues(cfg.name).Inc() + if cfg.exitOnRetryExhausted { log.ErrorContext( ctx, diff --git a/lib/tbot/service_heatbeat.go b/lib/tbot/service_heartbeat.go similarity index 100% rename from lib/tbot/service_heatbeat.go rename to lib/tbot/service_heartbeat.go diff --git a/lib/tbot/tbot.go b/lib/tbot/tbot.go index aa728db038724..8d29b85d647e3 100644 --- a/lib/tbot/tbot.go +++ b/lib/tbot/tbot.go @@ -132,7 +132,14 @@ func (b *Bot) Run(ctx context.Context) (err error) { defer func() { apitracing.EndSpan(span, err) }() startedAt := time.Now() - if err := metrics.RegisterPrometheusCollectors(clientMetrics); err != nil { + if err := metrics.RegisterPrometheusCollectors( + metrics.BuildCollector(), + clientMetrics, + loopIterationsCounter, + loopIterationsSuccessCounter, + loopIterationsFailureCounter, + loopIterationTime, + ); err != nil { return trace.Wrap(err) } From 65ef262425bd4da2cce85888563d3ef67810b97f Mon Sep 17 00:00:00 2001 From: Tim Buckley Date: Mon, 24 Feb 2025 17:39:06 -0700 Subject: [PATCH 2/2] Include service name as a label; rename metrics for conventions --- lib/tbot/loop.go | 25 ++++++++++++------------ lib/tbot/service_application_output.go | 1 + lib/tbot/service_bot_identity.go | 3 ++- lib/tbot/service_client_credential.go | 1 + lib/tbot/service_database_output.go | 1 + lib/tbot/service_heartbeat.go | 1 + lib/tbot/service_identity_output.go | 1 + lib/tbot/service_kubernetes_output.go | 1 + lib/tbot/service_kubernetes_v2_output.go | 1 + lib/tbot/service_ssh_host_output.go | 1 + lib/tbot/service_ssh_multiplexer.go | 3 ++- 11 files changed, 25 insertions(+), 14 deletions(-) diff --git a/lib/tbot/loop.go b/lib/tbot/loop.go index e4c94e3790d66..a5a9f7776909a 100644 --- a/lib/tbot/loop.go +++ b/lib/tbot/loop.go @@ -34,36 +34,37 @@ import ( var ( loopIterationsCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: "tbot_task_iterations", + Name: "tbot_task_iterations_total", Help: "Number of task iteration attempts, not counting retries", - }, []string{"name"}, + }, []string{"service", "name"}, ) loopIterationsSuccessCounter = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "tbot_task_iterations_successful", Help: "Histogram of task iterations that ultimately succeeded, bucketed by number of retries before success", Buckets: []float64{0, 1, 2, 3, 4, 5}, - }, []string{"name"}, + }, []string{"service", "name"}, ) loopIterationsFailureCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "tbot_task_iterations_failed", Help: "Number of task iterations that ultimately failed, not counting retries", - }, []string{"name"}, + }, []string{"service", "name"}, ) loopIterationTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "tbot_task_iteration_duration_seconds", Help: "Time between beginning and ultimate end of one task iteration regardless of outcome, including all retries", Buckets: prometheus.ExponentialBuckets(0.1, 1.75, 6), - }, []string{"name"}, + }, []string{"service", "name"}, ) ) type runOnIntervalConfig struct { - name string - f func(ctx context.Context) error - clock clockwork.Clock + service string + name string + f func(ctx context.Context) error + clock clockwork.Clock // reloadCh allows the task to be triggered immediately, ideal for handling // CA rotations or a manual signal from a user. // reloadCh can be nil, in which case, the task will only run on the @@ -115,7 +116,7 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { } firstRun = false - loopIterationsCounter.WithLabelValues(cfg.name).Inc() + loopIterationsCounter.WithLabelValues(cfg.service, cfg.name).Inc() startTime := time.Now() var err error @@ -128,7 +129,7 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { ) err = cfg.f(ctx) if err == nil { - loopIterationsSuccessCounter.WithLabelValues(cfg.name).Observe(float64(attempt - 1)) + loopIterationsSuccessCounter.WithLabelValues(cfg.service, cfg.name).Observe(float64(attempt - 1)) break } @@ -155,10 +156,10 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { } } - loopIterationTime.WithLabelValues(cfg.name).Observe(time.Since(startTime).Seconds()) + loopIterationTime.WithLabelValues(cfg.service, cfg.name).Observe(time.Since(startTime).Seconds()) if err != nil { - loopIterationsFailureCounter.WithLabelValues(cfg.name).Inc() + loopIterationsFailureCounter.WithLabelValues(cfg.service, cfg.name).Inc() if cfg.exitOnRetryExhausted { log.ErrorContext( diff --git a/lib/tbot/service_application_output.go b/lib/tbot/service_application_output.go index c07758549d6dc..4339f302da765 100644 --- a/lib/tbot/service_application_output.go +++ b/lib/tbot/service_application_output.go @@ -61,6 +61,7 @@ func (s *ApplicationOutputService) Run(ctx context.Context) error { defer unsubscribe() err := runOnInterval(ctx, runOnIntervalConfig{ + service: s.String(), name: "output-renewal", f: s.generate, interval: cmp.Or(s.cfg.CredentialLifetime, s.botCfg.CredentialLifetime).RenewalInterval, diff --git a/lib/tbot/service_bot_identity.go b/lib/tbot/service_bot_identity.go index 0fa5f07a0b938..c9a635084b61f 100644 --- a/lib/tbot/service_bot_identity.go +++ b/lib/tbot/service_bot_identity.go @@ -267,7 +267,8 @@ func (s *identityService) Run(ctx context.Context) error { ) err := runOnInterval(ctx, runOnIntervalConfig{ - name: "bot-identity-renewal", + service: s.String(), + name: "bot-identity-renewal", f: func(ctx context.Context) error { return s.renew(ctx, storageDestination) }, diff --git a/lib/tbot/service_client_credential.go b/lib/tbot/service_client_credential.go index f44f168535329..837705395dbdc 100644 --- a/lib/tbot/service_client_credential.go +++ b/lib/tbot/service_client_credential.go @@ -55,6 +55,7 @@ func (s *ClientCredentialOutputService) Run(ctx context.Context) error { defer unsubscribe() err := runOnInterval(ctx, runOnIntervalConfig{ + service: s.String(), name: "output-renewal", f: s.generate, interval: s.botCfg.CredentialLifetime.RenewalInterval, diff --git a/lib/tbot/service_database_output.go b/lib/tbot/service_database_output.go index b5f4af3e03125..ccceecdf082e9 100644 --- a/lib/tbot/service_database_output.go +++ b/lib/tbot/service_database_output.go @@ -61,6 +61,7 @@ func (s *DatabaseOutputService) Run(ctx context.Context) error { defer unsubscribe() err := runOnInterval(ctx, runOnIntervalConfig{ + service: s.String(), name: "output-renewal", f: s.generate, interval: cmp.Or(s.cfg.CredentialLifetime, s.botCfg.CredentialLifetime).RenewalInterval, diff --git a/lib/tbot/service_heartbeat.go b/lib/tbot/service_heartbeat.go index 02f976cc6a0f8..5edecbe6b1e38 100644 --- a/lib/tbot/service_heartbeat.go +++ b/lib/tbot/service_heartbeat.go @@ -95,6 +95,7 @@ func (s *heartbeatService) OneShot(ctx context.Context) error { func (s *heartbeatService) Run(ctx context.Context) error { isStartup := true err := runOnInterval(ctx, runOnIntervalConfig{ + service: s.String(), name: "submit-heartbeat", log: s.log, interval: s.interval, diff --git a/lib/tbot/service_identity_output.go b/lib/tbot/service_identity_output.go index e716bb62ebe29..a2aedcdbf3e80 100644 --- a/lib/tbot/service_identity_output.go +++ b/lib/tbot/service_identity_output.go @@ -73,6 +73,7 @@ func (s *IdentityOutputService) Run(ctx context.Context) error { defer unsubscribe() err := runOnInterval(ctx, runOnIntervalConfig{ + service: s.String(), name: "output-renewal", f: s.generate, interval: cmp.Or(s.cfg.CredentialLifetime, s.botCfg.CredentialLifetime).RenewalInterval, diff --git a/lib/tbot/service_kubernetes_output.go b/lib/tbot/service_kubernetes_output.go index 10dee53cac269..b413e5e8cdf4d 100644 --- a/lib/tbot/service_kubernetes_output.go +++ b/lib/tbot/service_kubernetes_output.go @@ -78,6 +78,7 @@ func (s *KubernetesOutputService) Run(ctx context.Context) error { defer unsubscribe() err := runOnInterval(ctx, runOnIntervalConfig{ + service: s.String(), name: "output-renewal", f: s.generate, interval: cmp.Or(s.cfg.CredentialLifetime, s.botCfg.CredentialLifetime).RenewalInterval, diff --git a/lib/tbot/service_kubernetes_v2_output.go b/lib/tbot/service_kubernetes_v2_output.go index 54563e459529c..b527379229243 100644 --- a/lib/tbot/service_kubernetes_v2_output.go +++ b/lib/tbot/service_kubernetes_v2_output.go @@ -77,6 +77,7 @@ func (s *KubernetesV2OutputService) Run(ctx context.Context) error { defer unsubscribe() return trace.Wrap(runOnInterval(ctx, runOnIntervalConfig{ + service: s.String(), name: "output-renewal", f: s.generate, interval: cmp.Or(s.cfg.CredentialLifetime, s.botCfg.CredentialLifetime).RenewalInterval, diff --git a/lib/tbot/service_ssh_host_output.go b/lib/tbot/service_ssh_host_output.go index 5dc0139d3cf58..9239373efd614 100644 --- a/lib/tbot/service_ssh_host_output.go +++ b/lib/tbot/service_ssh_host_output.go @@ -64,6 +64,7 @@ func (s *SSHHostOutputService) Run(ctx context.Context) error { defer unsubscribe() err := runOnInterval(ctx, runOnIntervalConfig{ + service: s.String(), name: "output-renewal", f: s.generate, interval: cmp.Or(s.cfg.CredentialLifetime, s.botCfg.CredentialLifetime).RenewalInterval, diff --git a/lib/tbot/service_ssh_multiplexer.go b/lib/tbot/service_ssh_multiplexer.go index 1c60efc3aeb70..1d8dd14a67084 100644 --- a/lib/tbot/service_ssh_multiplexer.go +++ b/lib/tbot/service_ssh_multiplexer.go @@ -391,7 +391,8 @@ func (s *SSHMultiplexerService) identityRenewalLoop( reloadCh, unsubscribe := s.reloadBroadcaster.subscribe() defer unsubscribe() err := runOnInterval(ctx, runOnIntervalConfig{ - name: "identity-renewal", + service: s.String(), + name: "identity-renewal", f: func(ctx context.Context) error { id, err := s.generateIdentity(ctx) if err != nil {