-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Machine ID: Add Prometheus metrics for loop tasks #52410
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,14 +26,45 @@ import ( | |
|
|
||
| "github.com/gravitational/trace" | ||
| "github.com/jonboulle/clockwork" | ||
| "github.com/prometheus/client_golang/prometheus" | ||
|
|
||
| "github.com/gravitational/teleport/api/utils/retryutils" | ||
| ) | ||
|
|
||
| var ( | ||
| loopIterationsCounter = prometheus.NewCounterVec( | ||
| prometheus.CounterOpts{ | ||
| Name: "tbot_task_iterations_total", | ||
| Help: "Number of task iteration attempts, not counting retries", | ||
| }, []string{"service", "name"}, | ||
| ) | ||
| loopIterationsSuccessCounter = prometheus.NewHistogramVec( | ||
| prometheus.HistogramOpts{ | ||
| Name: "tbot_task_iterations_successful", | ||
| Help: "Histogram of task iterations that ultimately succeeded, bucketed by number of retries before success", | ||
| Buckets: []float64{0, 1, 2, 3, 4, 5}, | ||
| }, []string{"service", "name"}, | ||
| ) | ||
| loopIterationsFailureCounter = prometheus.NewCounterVec( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tend to see this grouped with
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think since we're tracking successful iterations via a histogram, the I've at least renamed
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I figured that the histogram could also have a "state" label - but this is fine how it is if you'd rather proceed as is. |
||
| prometheus.CounterOpts{ | ||
| Name: "tbot_task_iterations_failed", | ||
| Help: "Number of task iterations that ultimately failed, not counting retries", | ||
| }, []string{"service", "name"}, | ||
| ) | ||
| loopIterationTime = prometheus.NewHistogramVec( | ||
| prometheus.HistogramOpts{ | ||
| Name: "tbot_task_iteration_duration_seconds", | ||
| Help: "Time between beginning and ultimate end of one task iteration regardless of outcome, including all retries", | ||
| Buckets: prometheus.ExponentialBuckets(0.1, 1.75, 6), | ||
| }, []string{"service", "name"}, | ||
| ) | ||
| ) | ||
|
|
||
| type runOnIntervalConfig struct { | ||
| name string | ||
| f func(ctx context.Context) error | ||
| clock clockwork.Clock | ||
| service string | ||
| name string | ||
| f func(ctx context.Context) error | ||
| clock clockwork.Clock | ||
| // reloadCh allows the task to be triggered immediately, ideal for handling | ||
| // CA rotations or a manual signal from a user. | ||
| // reloadCh can be nil, in which case, the task will only run on the | ||
|
|
@@ -49,8 +80,6 @@ type runOnIntervalConfig struct { | |
| // runOnInterval runs a function on a given interval, with retries and jitter. | ||
| // | ||
| // TODO(noah): Emit Prometheus metrics for: | ||
| // - Success/Failure of attempts | ||
| // - Time taken to execute attempt | ||
| // - Time of next attempt | ||
| func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { | ||
| switch { | ||
|
|
@@ -87,6 +116,9 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { | |
| } | ||
| firstRun = false | ||
|
|
||
| loopIterationsCounter.WithLabelValues(cfg.service, cfg.name).Inc() | ||
| startTime := time.Now() | ||
|
|
||
| var err error | ||
| for attempt := 1; attempt <= cfg.retryLimit; attempt++ { | ||
| log.InfoContext( | ||
|
|
@@ -97,6 +129,7 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { | |
| ) | ||
| err = cfg.f(ctx) | ||
| if err == nil { | ||
| loopIterationsSuccessCounter.WithLabelValues(cfg.service, cfg.name).Observe(float64(attempt - 1)) | ||
| break | ||
| } | ||
|
|
||
|
|
@@ -114,12 +147,20 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error { | |
| ) | ||
| select { | ||
| case <-ctx.Done(): | ||
| // Note: will discard metric update for this loop. It | ||
| // probably won't be collected if we're shutting down, | ||
| // anyway. | ||
| return nil | ||
| case <-cfg.clock.After(backoffTime): | ||
| } | ||
| } | ||
| } | ||
|
|
||
| loopIterationTime.WithLabelValues(cfg.service, cfg.name).Observe(time.Since(startTime).Seconds()) | ||
|
|
||
| if err != nil { | ||
| loopIterationsFailureCounter.WithLabelValues(cfg.service, cfg.name).Inc() | ||
|
|
||
| if cfg.exitOnRetryExhausted { | ||
| log.ErrorContext( | ||
| ctx, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One issue here is that all of our output services use
output-renewalas their task name, so they'll be grouped together. Do we want to make that more specific? I'd suggest either appending something more specific to the name (e.g.output-renewal/application) or adding a subtype field + prometheus label.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah - I think we can probably give these all more specific names I think. Perhaps we just leverage the service name first?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good call, I've plumbed through "service" as an additional label. The stringer has a mild caveat of including filepaths in the label value sometimes, though, so I'm tempted to replace
.String()withconfig.FooServiceTypeconstants? The cardinality isn't likely to be a huge issue and keeps individual outputs separate ... but it feels gross.