Commit 284874d

gabemontero authored and tekton-robot committed
Add taskrun/pipelinerun gauge metrics around resolving respective tasks/pipelines
This commit adds new experimental gauge metrics that count the number of TaskRuns waiting for resolution of the Tasks they reference, the number of PipelineRuns waiting on Pipeline resolution, and the number of PipelineRuns waiting on Task resolution for their underlying TaskRuns.
1 parent b8f7390 commit 284874d
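The shape of the change, condensed into a sketch: each periodic recorder walks the runs returned by its lister and, for any run that is not yet done, inspects the reason on its Succeeded condition to decide whether it is waiting on remote resolution. The committed code lives in the pkg/pipelinerunmetrics and pkg/taskrunmetrics hunks below; the package and helper name here are invented for illustration.

```go
// Package sketch condenses the counting logic this commit adds; it is not the
// committed code (see pkg/pipelinerunmetrics/metrics.go below for the real version).
package sketch

import (
	corev1 "k8s.io/api/core/v1"
	"knative.dev/pkg/apis"

	v1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1"
)

// countPipelineRunsWaitingOnResolution is a hypothetical helper: it buckets the
// not-yet-done PipelineRuns by the reason on their Succeeded condition.
func countPipelineRunsWaitingOnResolution(prs []*v1.PipelineRun) (waitingOnPipeline, waitingOnTask int) {
	for _, pr := range prs {
		if pr.IsDone() {
			continue
		}
		cond := pr.Status.GetCondition(apis.ConditionSucceeded)
		if cond == nil || cond.Status != corev1.ConditionUnknown {
			continue
		}
		switch cond.Reason {
		case v1.PipelineRunReasonResolvingPipelineRef.String():
			waitingOnPipeline++ // blocked resolving its pipelineRef
		case v1.TaskRunReasonResolvingTaskRef:
			waitingOnTask++ // blocked resolving taskRefs of its child TaskRuns
		}
	}
	return waitingOnPipeline, waitingOnTask
}
```

The gauges are then set to these counts on each reporting tick, so they reflect the latest snapshot rather than a cumulative total.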

5 files changed (+195, -39 lines)

docs/metrics.md

Lines changed: 15 additions & 12 deletions
@@ -11,19 +11,22 @@ The following pipeline metrics are available at `controller-service` on port `90
 
 We expose several kinds of exporters, including Prometheus, Google Stackdriver, and many others. You can set them up using [observability configuration](../config/config-observability.yaml).
 
-| Name | Type | Labels/Tags | Status |
-| ---------- | ----------- | ----------- | ----------- |
-| `tekton_pipelines_controller_pipelinerun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `namespace`=&lt;pipelinerun-namespace&gt; | experimental |
+| Name | Type | Labels/Tags | Status |
+|-----------------------------------------------------------------------------------------| ----------- | ----------- | ----------- |
+| `tekton_pipelines_controller_pipelinerun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `namespace`=&lt;pipelinerun-namespace&gt; | experimental |
 | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt;| experimental |
-| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; | experimental |
-| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt; | experimental |
-| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; | experimental |
-| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental |
-| `tekton_pipelines_controller_taskruns_pod_latency_milliseconds` | Gauge | `namespace`=&lt;taskruns-namespace&gt; <br> `pod`= &lt; taskrun_pod_name&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> | experimental |
-| `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental |
+| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; | experimental |
+| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | experimental |
+| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt; | experimental |
+| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; | experimental |
+| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental |
+| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental |
+| `tekton_pipelines_controller_running_taskruns_waiting_on_task_resolution_count` | Gauge | | experimental |
+| `tekton_pipelines_controller_running_pipelineruns_waiting_on_pipeline_resolution_count` | Gauge | | experimental |
+| `tekton_pipelines_controller_running_pipelineruns_waiting_on_task_resolution_count` | Gauge | | experimental |
+| `tekton_pipelines_controller_taskruns_pod_latency_milliseconds` | Gauge | `namespace`=&lt;taskruns-namespace&gt; <br> `pod`= &lt; taskrun_pod_name&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> | experimental |
+| `tekton_pipelines_controller_client_latency_[bucket, sum, count]` | Histogram | | experimental |
 
 The Labels/Tag marked as "*" are optional. And there's a choice between Histogram and LastValue(Gauge) for pipelinerun and taskrun duration metrics.

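To eyeball the new gauges against a controller running with this change, one option is to scrape the controller's metrics endpoint and grep for the new names from the table above. The sketch below assumes the endpoint has been made reachable at localhost:9090 (for example via a kubectl port-forward); the URL, service name, and port are assumptions about your cluster, not part of this commit.

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

func main() {
	// Assumption: the controller's metrics port was forwarded locally, e.g.
	//   kubectl -n tekton-pipelines port-forward svc/tekton-pipelines-controller 9090:9090
	// Adjust the URL to match your environment.
	resp, err := http.Get("http://localhost:9090/metrics")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}

	// The three gauge names introduced by this commit.
	for _, name := range []string{
		"tekton_pipelines_controller_running_taskruns_waiting_on_task_resolution_count",
		"tekton_pipelines_controller_running_pipelineruns_waiting_on_pipeline_resolution_count",
		"tekton_pipelines_controller_running_pipelineruns_waiting_on_task_resolution_count",
	} {
		fmt.Printf("%s exposed: %v\n", name, strings.Contains(string(body), name))
	}
}
```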
pkg/pipelinerunmetrics/metrics.go

Lines changed: 37 additions & 1 deletion
@@ -59,6 +59,16 @@ var (
 		"Number of pipelineruns executing currently",
 		stats.UnitDimensionless)
 	runningPRsCountView *view.View
+
+	runningPRsWaitingOnPipelineResolutionCount = stats.Float64("running_pipelineruns_waiting_on_pipeline_resolution_count",
+		"Number of pipelineruns executing currently that are waiting on resolution requests for their pipeline references.",
+		stats.UnitDimensionless)
+	runningPRsWaitingOnPipelineResolutionCountView *view.View
+
+	runningPRsWaitingOnTaskResolutionCount = stats.Float64("running_pipelineruns_waiting_on_task_resolution_count",
+		"Number of pipelineruns executing currently that are waiting on resolution requests for the task references of their taskrun children.",
+		stats.UnitDimensionless)
+	runningPRsWaitingOnTaskResolutionCountView *view.View
 )
 
 const (
@@ -161,16 +171,28 @@ func viewRegister(cfg *config.Metrics) error {
 		Measure:     runningPRsCount,
 		Aggregation: view.LastValue(),
 	}
+	runningPRsWaitingOnPipelineResolutionCountView = &view.View{
+		Description: runningPRsWaitingOnPipelineResolutionCount.Description(),
+		Measure:     runningPRsWaitingOnPipelineResolutionCount,
+		Aggregation: view.LastValue(),
+	}
+	runningPRsWaitingOnTaskResolutionCountView = &view.View{
+		Description: runningPRsWaitingOnTaskResolutionCount.Description(),
+		Measure:     runningPRsWaitingOnTaskResolutionCount,
+		Aggregation: view.LastValue(),
+	}
 
 	return view.Register(
 		prDurationView,
 		prCountView,
 		runningPRsCountView,
+		runningPRsWaitingOnPipelineResolutionCountView,
+		runningPRsWaitingOnTaskResolutionCountView,
 	)
 }
 
 func viewUnregister() {
-	view.Unregister(prDurationView, prCountView, runningPRsCountView)
+	view.Unregister(prDurationView, prCountView, runningPRsCountView, runningPRsWaitingOnPipelineResolutionCountView, runningPRsWaitingOnTaskResolutionCountView)
 }
 
 // MetricsOnStore returns a function that checks if metrics are configured for a config.Store, and registers it if so
@@ -273,9 +295,21 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
 	}
 
 	var runningPRs int
+	var trsWaitResolvingTaskRef int
+	var prsWaitResolvingPipelineRef int
+
 	for _, pr := range prs {
 		if !pr.IsDone() {
 			runningPRs++
+			succeedCondition := pr.Status.GetCondition(apis.ConditionSucceeded)
+			if succeedCondition != nil && succeedCondition.Status == corev1.ConditionUnknown {
+				switch succeedCondition.Reason {
+				case v1.TaskRunReasonResolvingTaskRef:
+					trsWaitResolvingTaskRef++
+				case v1.PipelineRunReasonResolvingPipelineRef.String():
+					prsWaitResolvingPipelineRef++
+				}
+			}
 		}
 	}
 
@@ -284,6 +318,8 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
 		return err
 	}
 	metrics.Record(ctx, runningPRsCount.M(float64(runningPRs)))
+	metrics.Record(ctx, runningPRsWaitingOnPipelineResolutionCount.M(float64(prsWaitResolvingPipelineRef)))
+	metrics.Record(ctx, runningPRsWaitingOnTaskResolutionCount.M(float64(trsWaitResolvingTaskRef)))
 
 	return nil
 }

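Each new gauge above follows the same OpenCensus pattern already used in this file: a stats.Float64 measure, a view over it aggregated with view.LastValue() so that only the most recent recorded value is kept (gauge semantics), and a record call once the count is known. Below is a minimal standalone sketch of that pattern using plain OpenCensus; the controller itself records through knative.dev/pkg/metrics and whatever exporter is configured via config-observability.

```go
package main

import (
	"context"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
)

// The measure is the raw float64 data point the recorder emits each tick.
var waitingOnPipelineResolution = stats.Float64(
	"running_pipelineruns_waiting_on_pipeline_resolution_count",
	"Number of pipelineruns executing currently that are waiting on resolution requests for their pipeline references.",
	stats.UnitDimensionless)

// The view decides how the measure is aggregated for export; LastValue keeps
// only the latest recorded value, which is what makes this behave as a gauge.
var waitingOnPipelineResolutionView = &view.View{
	Description: waitingOnPipelineResolution.Description(),
	Measure:     waitingOnPipelineResolution,
	Aggregation: view.LastValue(),
}

func main() {
	if err := view.Register(waitingOnPipelineResolutionView); err != nil {
		log.Fatalf("registering view: %v", err)
	}
	defer view.Unregister(waitingOnPipelineResolutionView)

	// After counting the waiting PipelineRuns, record the snapshot; a registered
	// exporter (Prometheus, Stackdriver, ...) surfaces it from here.
	stats.Record(context.Background(), waitingOnPipelineResolution.M(float64(3)))
}
```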
pkg/pipelinerunmetrics/metrics_test.go

Lines changed: 83 additions & 1 deletion
@@ -375,8 +375,90 @@ func TestRecordRunningPipelineRunsCount(t *testing.T) {
 	metricstest.CheckLastValueData(t, "running_pipelineruns_count", map[string]string{}, 1)
 }
 
+func TestRecordRunningPipelineRunsResolutionWaitCounts(t *testing.T) {
+	multiplier := 3
+	for _, tc := range []struct {
+		status      corev1.ConditionStatus
+		reason      string
+		prWaitCount float64
+		trWaitCount float64
+	}{
+		{
+			status: corev1.ConditionTrue,
+			reason: "",
+		},
+		{
+			status: corev1.ConditionTrue,
+			reason: v1.PipelineRunReasonResolvingPipelineRef.String(),
+		},
+		{
+			status: corev1.ConditionTrue,
+			reason: v1.TaskRunReasonResolvingTaskRef,
+		},
+		{
+			status: corev1.ConditionFalse,
+			reason: "",
+		},
+		{
+			status: corev1.ConditionFalse,
+			reason: v1.PipelineRunReasonResolvingPipelineRef.String(),
+		},
+		{
+			status: corev1.ConditionFalse,
+			reason: v1.TaskRunReasonResolvingTaskRef,
+		},
+		{
+			status: corev1.ConditionUnknown,
+			reason: "",
+		},
+		{
+			status:      corev1.ConditionUnknown,
+			reason:      v1.PipelineRunReasonResolvingPipelineRef.String(),
+			prWaitCount: 3,
+		},
+		{
+			status:      corev1.ConditionUnknown,
+			reason:      v1.TaskRunReasonResolvingTaskRef,
+			trWaitCount: 3,
+		},
+	} {
+		unregisterMetrics()
+		ctx, _ := ttesting.SetupFakeContext(t)
+		informer := fakepipelineruninformer.Get(ctx)
+		for i := 0; i < multiplier; i++ {
+			pr := &v1.PipelineRun{
+				ObjectMeta: metav1.ObjectMeta{Name: names.SimpleNameGenerator.RestrictLengthWithRandomSuffix("pipelinerun-")},
+				Status: v1.PipelineRunStatus{
+					Status: duckv1.Status{
+						Conditions: duckv1.Conditions{{
+							Type:   apis.ConditionSucceeded,
+							Status: tc.status,
+							Reason: tc.reason,
+						}},
+					},
+				},
+			}
+			if err := informer.Informer().GetIndexer().Add(pr); err != nil {
+				t.Fatalf("Adding TaskRun to informer: %v", err)
+			}
+		}
+
+		ctx = getConfigContext()
+		metrics, err := NewRecorder(ctx)
+		if err != nil {
+			t.Fatalf("NewRecorder: %v", err)
+		}
+
+		if err := metrics.RunningPipelineRuns(informer.Lister()); err != nil {
+			t.Errorf("RunningTaskRuns: %v", err)
+		}
+		metricstest.CheckLastValueData(t, "running_pipelineruns_waiting_on_pipeline_resolution_count", map[string]string{}, tc.prWaitCount)
+		metricstest.CheckLastValueData(t, "running_pipelineruns_waiting_on_task_resolution_count", map[string]string{}, tc.trWaitCount)
+	}
+}
+
 func unregisterMetrics() {
-	metricstest.Unregister("pipelinerun_duration_seconds", "pipelinerun_count", "running_pipelineruns_count")
+	metricstest.Unregister("pipelinerun_duration_seconds", "pipelinerun_count", "running_pipelineruns_waiting_on_pipeline_resolution_count", "running_pipelineruns_waiting_on_task_resolution_count", "running_pipelineruns_count")
 
 	// Allow the recorder singleton to be recreated.
 	once = sync.Once{}

pkg/taskrunmetrics/metrics.go

Lines changed: 25 additions & 8 deletions
@@ -52,13 +52,14 @@ var (
 	statusTag = tag.MustNewKey("status")
 	podTag    = tag.MustNewKey("pod")
 
-	trDurationView                      *view.View
-	prTRDurationView                    *view.View
-	trCountView                         *view.View
-	runningTRsCountView                 *view.View
-	runningTRsThrottledByQuotaCountView *view.View
-	runningTRsThrottledByNodeCountView  *view.View
-	podLatencyView                      *view.View
+	trDurationView                             *view.View
+	prTRDurationView                           *view.View
+	trCountView                                *view.View
+	runningTRsCountView                        *view.View
+	runningTRsThrottledByQuotaCountView        *view.View
+	runningTRsThrottledByNodeCountView         *view.View
+	runningTRsWaitingOnTaskResolutionCountView *view.View
+	podLatencyView                             *view.View
 
 	trDuration = stats.Float64(
 		"taskrun_duration_seconds",
@@ -86,6 +87,10 @@ var (
 		"Number of taskruns executing currently, but whose underlying Pods or Containers are suspended by k8s because of Node level constraints. Such suspensions can occur as part of initial scheduling of the Pod, or scheduling of any of the subsequent Container(s) in the Pod after the first Container is started",
 		stats.UnitDimensionless)
 
+	runningTRsWaitingOnTaskResolutionCount = stats.Float64("running_taskruns_waiting_on_task_resolution_count",
+		"Number of taskruns executing currently that are waiting on resolution requests for their task references.",
+		stats.UnitDimensionless)
+
 	podLatency = stats.Float64("taskruns_pod_latency_milliseconds",
 		"scheduling latency for the taskruns pods",
 		stats.UnitMilliseconds)
@@ -219,6 +224,11 @@ func viewRegister(cfg *config.Metrics) error {
 		Measure:     runningTRsThrottledByNodeCount,
 		Aggregation: view.LastValue(),
 	}
+	runningTRsWaitingOnTaskResolutionCountView = &view.View{
+		Description: runningTRsWaitingOnTaskResolutionCount.Description(),
+		Measure:     runningTRsWaitingOnTaskResolutionCount,
+		Aggregation: view.LastValue(),
+	}
 	podLatencyView = &view.View{
 		Description: podLatency.Description(),
 		Measure:     podLatency,
@@ -232,6 +242,7 @@ func viewRegister(cfg *config.Metrics) error {
 		runningTRsCountView,
 		runningTRsThrottledByQuotaCountView,
 		runningTRsThrottledByNodeCountView,
+		runningTRsWaitingOnTaskResolutionCountView,
 		podLatencyView,
 	)
 }
@@ -244,6 +255,7 @@ func viewUnregister() {
 		runningTRsCountView,
 		runningTRsThrottledByQuotaCountView,
 		runningTRsThrottledByNodeCountView,
+		runningTRsWaitingOnTaskResolutionCountView,
 		podLatencyView,
 	)
 }
@@ -358,6 +370,7 @@ func (r *Recorder) RunningTaskRuns(ctx context.Context, lister listers.TaskRunLi
 	var runningTrs int
 	var trsThrottledByQuota int
 	var trsThrottledByNode int
+	var trsWaitResolvingTaskRef int
 	for _, pr := range trs {
 		if pr.IsDone() {
 			continue
@@ -370,6 +383,8 @@ func (r *Recorder) RunningTaskRuns(ctx context.Context, lister listers.TaskRunLi
 			trsThrottledByQuota++
 		case pod.ReasonExceededNodeResources:
 			trsThrottledByNode++
+		case v1.TaskRunReasonResolvingTaskRef:
+			trsWaitResolvingTaskRef++
 		}
 	}
 
@@ -381,6 +396,7 @@ func (r *Recorder) RunningTaskRuns(ctx context.Context, lister listers.TaskRunLi
 	metrics.Record(ctx, runningTRsCount.M(float64(runningTrs)))
 	metrics.Record(ctx, runningTRsThrottledByNodeCount.M(float64(trsThrottledByNode)))
 	metrics.Record(ctx, runningTRsThrottledByQuotaCount.M(float64(trsThrottledByQuota)))
+	metrics.Record(ctx, runningTRsWaitingOnTaskResolutionCount.M(float64(trsWaitResolvingTaskRef)))
 
 	return nil
 }
@@ -400,7 +416,8 @@ func (r *Recorder) ReportRunningTaskRuns(ctx context.Context, lister listers.Tas
 			return
 
 		case <-delay.C:
-			// Every 30s surface a metric for the number of running tasks, as well as those running tasks that are currently throttled by k8s.
+			// Every 30s surface a metric for the number of running tasks, as well as those running tasks that are currently throttled by k8s,
+			// and those running tasks waiting on task reference resolution
			if err := r.RunningTaskRuns(ctx, lister); err != nil {
 				logger.Warnf("Failed to log the metrics : %v", err)
 			}

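As the updated comment above notes, the new taskrun gauge rides on the existing reporting loop: ReportRunningTaskRuns wakes every 30 seconds, re-counts via r.RunningTaskRuns, and records the gauges, so the metric refreshes on that cadence rather than on every reconcile. The sketch below mirrors only the shape of that loop; the function name, the ticker, and the callback are illustrative stand-ins, not the controller code (which waits on an injected timer, delay.C).

```go
package main

import (
	"context"
	"log"
	"time"
)

// reportEvery is a hypothetical stand-in for the controller's reporting loop:
// wake on a timer, re-count, record the gauges, and stop when the context ends.
func reportEvery(ctx context.Context, interval time.Duration, report func(context.Context) error) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return // controller shutting down
		case <-ticker.C:
			if err := report(ctx); err != nil {
				log.Printf("Failed to log the metrics : %v", err)
			}
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 65*time.Second)
	defer cancel()
	reportEvery(ctx, 30*time.Second, func(context.Context) error {
		// In the controller this would be r.RunningTaskRuns(ctx, lister).
		log.Println("recounting running TaskRuns and recording gauges")
		return nil
	})
}
```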