Skip to content

Commit 99f8b26

Browse files
authored
Make queued time metric reset when a job gets requeued (#3875)
* Make queued time metric reset when a job gets requeued Currently the queue duration metric always uses the jobs created time to determine how long it has been in the queue When jobs gets requeued, the time in state should start from when it was requeued rather than when the job was created - As it has only been in the queue since its last run terminated This was producing large "jumps" in queued time as jobs were tried for 15 minutes, requeued and immediately reporting 15 minutes in the queue - despite only being requeued a few seconds ago It is reasonable we may want to know when these retries are occurring / time they are costing, but I think they should be separate metrics, i.e: - Retry count - Total time queued? Total time attempting to start? Something along this line Signed-off-by: JamesMurkin <[email protected]> * Include actual logic change Signed-off-by: JamesMurkin <[email protected]> --------- Signed-off-by: JamesMurkin <[email protected]>
1 parent a2d3a7f commit 99f8b26

File tree

2 files changed

+51
-1
lines changed

2 files changed

+51
-1
lines changed

internal/scheduler/metrics.go

+8-1
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,14 @@ func (c *MetricsCollector) updateQueueMetrics(ctx *armadacontext.Context) ([]pro
200200
continue
201201
}
202202
recorder = qs.queuedJobRecorder
203-
timeInState = currentTime.Sub(time.Unix(0, job.Created()))
203+
queuedTime := time.Unix(0, job.Created())
204+
if job.HasRuns() {
205+
terminationTimeOfLatestRun := job.LatestRun().TerminatedTime()
206+
if terminationTimeOfLatestRun != nil && terminationTimeOfLatestRun.After(queuedTime) {
207+
queuedTime = *terminationTimeOfLatestRun
208+
}
209+
}
210+
timeInState = currentTime.Sub(queuedTime)
204211
queuedJobsCount[job.Queue()]++
205212
schedulingKeysByQueue[job.Queue()][job.SchedulingKey()] = true
206213
} else {

internal/scheduler/metrics_test.go

+43
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
package scheduler
22

33
import (
4+
"fmt"
45
"testing"
56
"time"
67

78
"github.com/golang/mock/gomock"
9+
"github.com/google/uuid"
810
"github.com/prometheus/client_golang/prometheus"
911
"github.com/stretchr/testify/assert"
1012
"github.com/stretchr/testify/require"
@@ -31,6 +33,17 @@ func TestMetricsCollector_TestCollect_QueueMetrics(t *testing.T) {
3133
runningJobs[i] = testfixtures.TestRunningJobDbJob(startTime)
3234
}
3335

36+
// Run that has been returned
37+
runStartTime := testfixtures.BaseTime.Add(-time.Duration(400) * time.Second).UnixNano()
38+
runTerminatedTime := testfixtures.BaseTime.Add(-time.Duration(200) * time.Second)
39+
run := jobdb.MinimalRun(uuid.New(), runStartTime)
40+
run = run.WithFailed(true)
41+
run = run.WithReturned(true)
42+
run = run.WithTerminatedTime(&runTerminatedTime)
43+
44+
jobCreationTime := testfixtures.BaseTime.Add(-time.Duration(500) * time.Second).UnixNano()
45+
jobWithTerminatedRun := testfixtures.TestQueuedJobDbJob().WithCreated(jobCreationTime).WithUpdatedRun(run)
46+
3447
tests := map[string]struct {
3548
initialJobs []*jobdb.Job
3649
defaultPool string
@@ -63,6 +76,33 @@ func TestMetricsCollector_TestCollect_QueueMetrics(t *testing.T) {
6376
commonmetrics.NewCountQueueResources(3, testfixtures.TestPool, testfixtures.TestDefaultPriorityClass, testfixtures.TestQueue, "memory"),
6477
},
6578
},
79+
"queued metrics for requeued job": {
80+
// This job was been requeued and has a terminated run
81+
// The queue duration stats should count from the time the last run finished instead of job creation time
82+
initialJobs: []*jobdb.Job{jobWithTerminatedRun},
83+
queues: []*api.Queue{testfixtures.MakeTestQueue()},
84+
defaultPool: testfixtures.TestPool,
85+
expected: []prometheus.Metric{
86+
commonmetrics.NewQueueSizeMetric(1.0, testfixtures.TestQueue),
87+
commonmetrics.NewQueueDistinctSchedulingKeyMetric(1.0, testfixtures.TestQueue),
88+
commonmetrics.NewQueueDuration(1, 200,
89+
map[float64]uint64{60: 0, 600: 1, 1800: 1, 3600: 1, 10800: 1, 43200: 1, 86400: 1, 172800: 1, 604800: 1},
90+
testfixtures.TestPool, testfixtures.TestDefaultPriorityClass, testfixtures.TestQueue),
91+
commonmetrics.NewMinQueueDuration(200, testfixtures.TestPool, testfixtures.TestDefaultPriorityClass, testfixtures.TestQueue),
92+
commonmetrics.NewMaxQueueDuration(200, testfixtures.TestPool, testfixtures.TestDefaultPriorityClass, testfixtures.TestQueue),
93+
commonmetrics.NewMedianQueueDuration(200, testfixtures.TestPool, testfixtures.TestDefaultPriorityClass, testfixtures.TestQueue),
94+
commonmetrics.NewQueueResources(1, testfixtures.TestPool, testfixtures.TestDefaultPriorityClass, testfixtures.TestQueue, "cpu"),
95+
commonmetrics.NewMinQueueResources(1, testfixtures.TestPool, testfixtures.TestDefaultPriorityClass, testfixtures.TestQueue, "cpu"),
96+
commonmetrics.NewMaxQueueResources(1, testfixtures.TestPool, testfixtures.TestDefaultPriorityClass, testfixtures.TestQueue, "cpu"),
97+
commonmetrics.NewMedianQueueResources(1, testfixtures.TestPool, testfixtures.TestDefaultPriorityClass, testfixtures.TestQueue, "cpu"),
98+
commonmetrics.NewCountQueueResources(1, testfixtures.TestPool, testfixtures.TestDefaultPriorityClass, testfixtures.TestQueue, "cpu"),
99+
commonmetrics.NewQueueResources(gb, testfixtures.TestPool, testfixtures.TestDefaultPriorityClass, testfixtures.TestQueue, "memory"),
100+
commonmetrics.NewMinQueueResources(gb, testfixtures.TestPool, testfixtures.TestDefaultPriorityClass, testfixtures.TestQueue, "memory"),
101+
commonmetrics.NewMaxQueueResources(gb, testfixtures.TestPool, testfixtures.TestDefaultPriorityClass, testfixtures.TestQueue, "memory"),
102+
commonmetrics.NewMedianQueueResources(gb, testfixtures.TestPool, testfixtures.TestDefaultPriorityClass, testfixtures.TestQueue, "memory"),
103+
commonmetrics.NewCountQueueResources(1, testfixtures.TestPool, testfixtures.TestDefaultPriorityClass, testfixtures.TestQueue, "memory"),
104+
},
105+
},
66106
"running metrics": {
67107
initialJobs: runningJobs,
68108
queues: []*api.Queue{testfixtures.MakeTestQueue()},
@@ -128,6 +168,9 @@ func TestMetricsCollector_TestCollect_QueueMetrics(t *testing.T) {
128168
for i := 0; i < len(tc.expected); i++ {
129169
a1 := actual[i]
130170
e1 := tc.expected[i]
171+
if !assert.Equal(t, e1, a1) {
172+
fmt.Println("here")
173+
}
131174
require.Equal(t, e1, a1)
132175
}
133176
})

0 commit comments

Comments
 (0)