diff --git a/lib/tbot/loop.go b/lib/tbot/loop.go index e03694b858703..0851aa22256a0 100644 --- a/lib/tbot/loop.go +++ b/lib/tbot/loop.go @@ -26,14 +26,45 @@ import ( "github.com/gravitational/trace" "github.com/jonboulle/clockwork" + "github.com/prometheus/client_golang/prometheus" "github.com/gravitational/teleport/api/utils/retryutils" ) +var ( + loopIterationsCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "tbot_task_iterations_total", + Help: "Number of task iteration attempts, not counting retries", + }, []string{"service", "name"}, + ) + loopIterationsSuccessCounter = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "tbot_task_iterations_successful", + Help: "Histogram of task iterations that ultimately succeeded, bucketed by number of retries before success", + Buckets: []float64{0, 1, 2, 3, 4, 5}, + }, []string{"service", "name"}, + ) + loopIterationsFailureCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "tbot_task_iterations_failed", + Help: "Number of task iterations that ultimately failed, not counting retries", + }, []string{"service", "name"}, + ) + loopIterationTime = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "tbot_task_iteration_duration_seconds", + Help: "Time between beginning and ultimate end of one task iteration regardless of outcome, including all retries", + Buckets: prometheus.ExponentialBuckets(0.1, 1.75, 6), + }, []string{"service", "name"}, + ) +) + type runOnIntervalConfig struct { - name string - f func(ctx context.Context) error - clock clockwork.Clock + service string + name string + f func(ctx context.Context) error + clock clockwork.Clock // reloadCh allows the task to be triggered immediately, ideal for handling // CA rotations or a manual signal from a user. // reloadCh can be nil, in which case, the task will only run on the @@ -49,8 +80,6 @@ type runOnIntervalConfig struct { // runOnInterval runs a function on a given interval, with retries and jitter. // // TODO(noah): Emit Prometheus metrics for: -// - Success/Failure of attempts -// - Time taken to execute attempt // - Time of next attempt func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { switch { @@ -87,6 +116,9 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { } firstRun = false + loopIterationsCounter.WithLabelValues(cfg.service, cfg.name).Inc() + startTime := time.Now() + var err error for attempt := 1; attempt <= cfg.retryLimit; attempt++ { log.InfoContext( @@ -97,6 +129,7 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { ) err = cfg.f(ctx) if err == nil { + loopIterationsSuccessCounter.WithLabelValues(cfg.service, cfg.name).Observe(float64(attempt - 1)) break } @@ -114,12 +147,20 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { ) select { case <-ctx.Done(): + // Note: will discard metric update for this loop. It + // probably won't be collected if we're shutting down, + // anyway. return nil case <-cfg.clock.After(backoffTime): } } } + + loopIterationTime.WithLabelValues(cfg.service, cfg.name).Observe(time.Since(startTime).Seconds()) + if err != nil { + loopIterationsFailureCounter.WithLabelValues(cfg.service, cfg.name).Inc() + if cfg.exitOnRetryExhausted { log.ErrorContext( ctx, diff --git a/lib/tbot/service_application_output.go b/lib/tbot/service_application_output.go index b3b5b9de96a34..dca0959dc6c2c 100644 --- a/lib/tbot/service_application_output.go +++ b/lib/tbot/service_application_output.go @@ -60,6 +60,7 @@ func (s *ApplicationOutputService) Run(ctx context.Context) error { defer unsubscribe() err := runOnInterval(ctx, runOnIntervalConfig{ + service: s.String(), name: "output-renewal", f: s.generate, interval: s.botCfg.RenewalInterval, diff --git a/lib/tbot/service_bot_identity.go b/lib/tbot/service_bot_identity.go index 362a8120ab536..a2cd77ca017e7 100644 --- a/lib/tbot/service_bot_identity.go +++ b/lib/tbot/service_bot_identity.go @@ -264,7 +264,8 @@ func (s *identityService) Run(ctx context.Context) error { ) err := runOnInterval(ctx, runOnIntervalConfig{ - name: "bot-identity-renewal", + service: s.String(), + name: "bot-identity-renewal", f: func(ctx context.Context) error { return s.renew(ctx, storageDestination) }, diff --git a/lib/tbot/service_client_credential.go b/lib/tbot/service_client_credential.go index 03583874bb4cf..174154fa59758 100644 --- a/lib/tbot/service_client_credential.go +++ b/lib/tbot/service_client_credential.go @@ -55,6 +55,7 @@ func (s *ClientCredentialOutputService) Run(ctx context.Context) error { defer unsubscribe() err := runOnInterval(ctx, runOnIntervalConfig{ + service: s.String(), name: "output-renewal", f: s.generate, interval: s.botCfg.RenewalInterval, diff --git a/lib/tbot/service_database_output.go b/lib/tbot/service_database_output.go index f65d9e9ec1b9f..2dbbed362471e 100644 --- a/lib/tbot/service_database_output.go +++ b/lib/tbot/service_database_output.go @@ -60,6 +60,7 @@ func (s *DatabaseOutputService) Run(ctx context.Context) error { defer unsubscribe() err := runOnInterval(ctx, runOnIntervalConfig{ + service: s.String(), name: "output-renewal", f: s.generate, interval: s.botCfg.RenewalInterval, diff --git a/lib/tbot/service_heatbeat.go b/lib/tbot/service_heartbeat.go similarity index 99% rename from lib/tbot/service_heatbeat.go rename to lib/tbot/service_heartbeat.go index 02f976cc6a0f8..5edecbe6b1e38 100644 --- a/lib/tbot/service_heatbeat.go +++ b/lib/tbot/service_heartbeat.go @@ -95,6 +95,7 @@ func (s *heartbeatService) OneShot(ctx context.Context) error { func (s *heartbeatService) Run(ctx context.Context) error { isStartup := true err := runOnInterval(ctx, runOnIntervalConfig{ + service: s.String(), name: "submit-heartbeat", log: s.log, interval: s.interval, diff --git a/lib/tbot/service_identity_output.go b/lib/tbot/service_identity_output.go index 2460584901b65..032b51e8b2160 100644 --- a/lib/tbot/service_identity_output.go +++ b/lib/tbot/service_identity_output.go @@ -82,6 +82,7 @@ func (s *IdentityOutputService) Run(ctx context.Context) error { defer unsubscribe() err := runOnInterval(ctx, runOnIntervalConfig{ + service: s.String(), name: "output-renewal", f: s.generate, interval: s.botCfg.RenewalInterval, diff --git a/lib/tbot/service_kubernetes_output.go b/lib/tbot/service_kubernetes_output.go index 2277e292d4538..56801f4ff77af 100644 --- a/lib/tbot/service_kubernetes_output.go +++ b/lib/tbot/service_kubernetes_output.go @@ -77,6 +77,7 @@ func (s *KubernetesOutputService) Run(ctx context.Context) error { defer unsubscribe() err := runOnInterval(ctx, runOnIntervalConfig{ + service: s.String(), name: "output-renewal", f: s.generate, interval: s.botCfg.RenewalInterval, diff --git a/lib/tbot/service_ssh_host_output.go b/lib/tbot/service_ssh_host_output.go index 116a96268d988..a6fbae9314325 100644 --- a/lib/tbot/service_ssh_host_output.go +++ b/lib/tbot/service_ssh_host_output.go @@ -61,6 +61,7 @@ func (s *SSHHostOutputService) Run(ctx context.Context) error { defer unsubscribe() err := runOnInterval(ctx, runOnIntervalConfig{ + service: s.String(), name: "output-renewal", f: s.generate, interval: s.botCfg.RenewalInterval, diff --git a/lib/tbot/service_ssh_multiplexer.go b/lib/tbot/service_ssh_multiplexer.go index 725f0dceb5349..505b2d1df4ac7 100644 --- a/lib/tbot/service_ssh_multiplexer.go +++ b/lib/tbot/service_ssh_multiplexer.go @@ -391,7 +391,8 @@ func (s *SSHMultiplexerService) identityRenewalLoop( reloadCh, unsubscribe := s.reloadBroadcaster.subscribe() defer unsubscribe() err := runOnInterval(ctx, runOnIntervalConfig{ - name: "identity-renewal", + service: s.String(), + name: "identity-renewal", f: func(ctx context.Context) error { id, err := s.generateIdentity(ctx) if err != nil { diff --git a/lib/tbot/tbot.go b/lib/tbot/tbot.go index 2e5271dab82a0..98afe962e725d 100644 --- a/lib/tbot/tbot.go +++ b/lib/tbot/tbot.go @@ -133,7 +133,14 @@ func (b *Bot) Run(ctx context.Context) (err error) { defer func() { apitracing.EndSpan(span, err) }() startedAt := time.Now() - if err := metrics.RegisterPrometheusCollectors(clientMetrics); err != nil { + if err := metrics.RegisterPrometheusCollectors( + metrics.BuildCollector(), + clientMetrics, + loopIterationsCounter, + loopIterationsSuccessCounter, + loopIterationsFailureCounter, + loopIterationTime, + ); err != nil { return trace.Wrap(err) }