Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 46 additions & 5 deletions lib/tbot/loop.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,45 @@ import (

"github.com/gravitational/trace"
"github.com/jonboulle/clockwork"
"github.com/prometheus/client_golang/prometheus"

"github.com/gravitational/teleport/api/utils/retryutils"
)

// Prometheus metrics emitted by runOnInterval. Every metric carries two
// labels, "service" and "name", which runOnInterval fills from the
// runOnIntervalConfig fields of the same names.
var (
// loopIterationsCounter is incremented once at the start of each task
// iteration, before the retry loop runs, so retries of the same iteration
// are not counted again.
loopIterationsCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "tbot_task_iterations_total",
Help: "Number of task iteration attempts, not counting retries",
}, []string{"service", "name"},
)
// loopIterationsSuccessCounter records, for each iteration that eventually
// succeeds, how many retries were needed (0 means the first attempt
// succeeded). Despite the "Counter" suffix in the variable name, this is a
// histogram; its observation count equals the number of successful
// iterations.
loopIterationsSuccessCounter = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "tbot_task_iterations_successful",
Help: "Histogram of task iterations that ultimately succeeded, bucketed by number of retries before success",
// One bucket per retry count from 0 through 5; larger counts fall into
// the implicit +Inf bucket.
Buckets: []float64{0, 1, 2, 3, 4, 5},
}, []string{"service", "name"},
)
// loopIterationsFailureCounter is incremented once for each iteration that
// still has an error after the retry loop has finished.
loopIterationsFailureCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "tbot_task_iterations_failed",
Help: "Number of task iterations that ultimately failed, not counting retries",
}, []string{"service", "name"},
)
// loopIterationTime observes the wall-clock duration, in seconds, of a whole
// iteration (all attempts plus the backoff waits between them), whether it
// succeeded or failed. No observation is made when the context is cancelled
// during a backoff wait, because runOnInterval returns before reaching it.
loopIterationTime = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "tbot_task_iteration_duration_seconds",
Help: "Time between beginning and ultimate end of one task iteration regardless of outcome, including all retries",
// Six buckets starting at 0.1s, each 1.75x the previous one, so the
// largest finite upper bound is roughly 1.64s.
// NOTE(review): iterations that include retry backoff will presumably
// land in the +Inf bucket - confirm this range is wide enough.
Buckets: prometheus.ExponentialBuckets(0.1, 1.75, 6),
}, []string{"service", "name"},
)
)

type runOnIntervalConfig struct {
name string
f func(ctx context.Context) error
clock clockwork.Clock
service string
name string
f func(ctx context.Context) error
clock clockwork.Clock
// reloadCh allows the task to be triggered immediately, ideal for handling
// CA rotations or a manual signal from a user.
// reloadCh can be nil, in which case, the task will only run on the
Expand All @@ -49,8 +80,6 @@ type runOnIntervalConfig struct {
// runOnInterval runs a function on a given interval, with retries and jitter.
//
// TODO(noah): Emit Prometheus metrics for:
// - Success/Failure of attempts
// - Time taken to execute attempt
// - Time of next attempt
func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error {
switch {
Expand Down Expand Up @@ -87,6 +116,9 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error {
}
firstRun = false

loopIterationsCounter.WithLabelValues(cfg.service, cfg.name).Inc()
startTime := time.Now()

var err error
for attempt := 1; attempt <= cfg.retryLimit; attempt++ {
log.InfoContext(
Expand All @@ -97,6 +129,7 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error {
)
err = cfg.f(ctx)
if err == nil {
loopIterationsSuccessCounter.WithLabelValues(cfg.service, cfg.name).Observe(float64(attempt - 1))
break
}

Expand All @@ -114,12 +147,20 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error {
)
select {
case <-ctx.Done():
// Note: returning here discards the metric updates for the
// in-progress iteration. They most likely would not be
// collected anyway, since we are shutting down.
return nil
case <-cfg.clock.After(backoffTime):
}
}
}

loopIterationTime.WithLabelValues(cfg.service, cfg.name).Observe(time.Since(startTime).Seconds())

if err != nil {
loopIterationsFailureCounter.WithLabelValues(cfg.service, cfg.name).Inc()

if cfg.exitOnRetryExhausted {
log.ErrorContext(
ctx,
Expand Down
1 change: 1 addition & 0 deletions lib/tbot/service_application_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ func (s *ApplicationOutputService) Run(ctx context.Context) error {
defer unsubscribe()

err := runOnInterval(ctx, runOnIntervalConfig{
service: s.String(),
name: "output-renewal",
f: s.generate,
interval: s.botCfg.RenewalInterval,
Expand Down
3 changes: 2 additions & 1 deletion lib/tbot/service_bot_identity.go
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,8 @@ func (s *identityService) Run(ctx context.Context) error {
)

err := runOnInterval(ctx, runOnIntervalConfig{
name: "bot-identity-renewal",
service: s.String(),
name: "bot-identity-renewal",
f: func(ctx context.Context) error {
return s.renew(ctx, storageDestination)
},
Expand Down
1 change: 1 addition & 0 deletions lib/tbot/service_client_credential.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ func (s *ClientCredentialOutputService) Run(ctx context.Context) error {
defer unsubscribe()

err := runOnInterval(ctx, runOnIntervalConfig{
service: s.String(),
name: "output-renewal",
f: s.generate,
interval: s.botCfg.RenewalInterval,
Expand Down
1 change: 1 addition & 0 deletions lib/tbot/service_database_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ func (s *DatabaseOutputService) Run(ctx context.Context) error {
defer unsubscribe()

err := runOnInterval(ctx, runOnIntervalConfig{
service: s.String(),
name: "output-renewal",
f: s.generate,
interval: s.botCfg.RenewalInterval,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ func (s *heartbeatService) OneShot(ctx context.Context) error {
func (s *heartbeatService) Run(ctx context.Context) error {
isStartup := true
err := runOnInterval(ctx, runOnIntervalConfig{
service: s.String(),
name: "submit-heartbeat",
log: s.log,
interval: s.interval,
Expand Down
1 change: 1 addition & 0 deletions lib/tbot/service_identity_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ func (s *IdentityOutputService) Run(ctx context.Context) error {
defer unsubscribe()

err := runOnInterval(ctx, runOnIntervalConfig{
service: s.String(),
name: "output-renewal",
f: s.generate,
interval: s.botCfg.RenewalInterval,
Expand Down
1 change: 1 addition & 0 deletions lib/tbot/service_kubernetes_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ func (s *KubernetesOutputService) Run(ctx context.Context) error {
defer unsubscribe()

err := runOnInterval(ctx, runOnIntervalConfig{
service: s.String(),
name: "output-renewal",
f: s.generate,
interval: s.botCfg.RenewalInterval,
Expand Down
1 change: 1 addition & 0 deletions lib/tbot/service_ssh_host_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ func (s *SSHHostOutputService) Run(ctx context.Context) error {
defer unsubscribe()

err := runOnInterval(ctx, runOnIntervalConfig{
service: s.String(),
name: "output-renewal",
f: s.generate,
interval: s.botCfg.RenewalInterval,
Expand Down
3 changes: 2 additions & 1 deletion lib/tbot/service_ssh_multiplexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,8 @@ func (s *SSHMultiplexerService) identityRenewalLoop(
reloadCh, unsubscribe := s.reloadBroadcaster.subscribe()
defer unsubscribe()
err := runOnInterval(ctx, runOnIntervalConfig{
name: "identity-renewal",
service: s.String(),
name: "identity-renewal",
f: func(ctx context.Context) error {
id, err := s.generateIdentity(ctx)
if err != nil {
Expand Down
9 changes: 8 additions & 1 deletion lib/tbot/tbot.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,14 @@ func (b *Bot) Run(ctx context.Context) (err error) {
defer func() { apitracing.EndSpan(span, err) }()
startedAt := time.Now()

if err := metrics.RegisterPrometheusCollectors(clientMetrics); err != nil {
if err := metrics.RegisterPrometheusCollectors(
metrics.BuildCollector(),
clientMetrics,
loopIterationsCounter,
loopIterationsSuccessCounter,
loopIterationsFailureCounter,
loopIterationTime,
); err != nil {
return trace.Wrap(err)
}

Expand Down
Loading