Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

State Transition Metrics Per Pool #3842

Merged
merged 2 commits into from
Jul 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions internal/scheduler/jobdb/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -730,6 +730,16 @@ func (job *Job) LatestRun() *JobRun {
return job.activeRun
}

// ResolvedPools reports the pools that currently apply to the job:
//   - for a job that is still queued, or that has no active run, the job's own
//     pools (the pools it is capable of running on), returned as stored and
//     not copied;
//   - for a non-queued job with an active run, a one-element slice holding the
//     pool of that run (the pool the job has been leased to).
func (job *Job) ResolvedPools() []string {
	// Guard clause: anything that is queued or lacks an active run falls back
	// to the job-level pools.
	if job.queued || job.activeRun == nil {
		return job.pools
	}
	return []string{job.activeRun.Pool()}
}

// RunById returns the Run corresponding to the provided run id or nil if no such Run exists.
func (job *Job) RunById(id uuid.UUID) *JobRun {
return job.runsById[id]
Expand Down
17 changes: 17 additions & 0 deletions internal/scheduler/jobdb/job_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -377,3 +377,20 @@ func TestJobSchedulingInfoFieldsInitialised(t *testing.T) {
assert.NotNil(t, updatedJob.NodeSelector())
assert.NotNil(t, updatedJob.Annotations())
}

// TestJob_TestResolvedPools exercises ResolvedPools on three fixtures derived
// from baseJob: the unmodified job, the job with pools set, and that job marked
// as not queued with a new run attached.
func TestJob_TestResolvedPools(t *testing.T) {
	// All fixtures are built before any assertion runs.
	unpooled := baseJob
	pooled := baseJob.WithPools([]string{"testPool"})
	notQueued := pooled.WithQueued(false)
	withRun := notQueued.WithNewRun("testExecutor", "testNode", "testNode", "testPool2", 1)

	// No pools set on the job: expected to equal an empty slice.
	assert.Equal(t, []string{}, unpooled.ResolvedPools())

	// Pools set on the job and no run attached: the job's pools are reported.
	assert.Equal(t, []string{"testPool"}, pooled.ResolvedPools())

	// Not queued and carrying a run: only the run's pool is reported.
	assert.Equal(t, []string{"testPool2"}, withRun.ResolvedPools())
}
20 changes: 9 additions & 11 deletions internal/scheduler/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ func (m *Metrics) UpdateLeased(jctx *schedulercontext.JobSchedulingContext) erro
labels = append(labels, leased)
labels = append(labels, "") // No category for leased.
labels = append(labels, "") // No subCategory for leased.
labels = appendLabelsFromJobSchedulingContext(labels, jctx)
labels = appendLabelsFromJob(labels, jctx.Job)

return m.updateMetrics(labels, job, duration)
}
Expand Down Expand Up @@ -391,16 +391,14 @@ func (m *Metrics) indexOfFirstMatchingRegexFromErrorMessage(message string) (int

// appendLabelsFromJob appends the job-derived label values to labels and
// returns the extended slice. Three values are appended, in this order:
// the job's queue, the executor name of the job's latest run, and the job's
// pool. This lines up with the trailing "queue", "cluster", "pool" label names
// declared for the counter vectors in this file.
func appendLabelsFromJob(labels []string, job *jobdb.Job) []string {
	executor := executorNameFromRun(job.LatestRun())

	// ResolvedPools may return zero or several pools, but a label takes a
	// single value: use the first pool and fall back to "" when there is none.
	pool := ""
	if pools := job.ResolvedPools(); len(pools) > 0 {
		pool = pools[0]
	}

	return append(labels, job.Queue(), executor, pool)
}

// appendLabelsFromJobSchedulingContext appends the label values for the job
// held by jctx. It delegates to appendLabelsFromJob so that both helpers always
// emit the same number of values in the same order.
func appendLabelsFromJobSchedulingContext(labels []string, jctx *schedulercontext.JobSchedulingContext) []string {
	return appendLabelsFromJob(labels, jctx.Job)
}

Expand Down Expand Up @@ -497,7 +495,7 @@ func (m *Metrics) counterVectorsFromResource(resource v1.ResourceName) (*prometh
Name: name,
Help: resource.String() + "resource counter.",
},
[]string{"state", "category", "subCategory", "queue", "cluster"},
[]string{"state", "category", "subCategory", "queue", "cluster", "pool"},
)
m.resourceCounters[resource] = c
}
Expand All @@ -513,7 +511,7 @@ func (m *Metrics) counterVectorsFromResource(resource v1.ResourceName) (*prometh
Name: name,
Help: resource.String() + "-second resource counter.",
},
[]string{"priorState", "state", "category", "subCategory", "queue", "cluster"},
[]string{"priorState", "state", "category", "subCategory", "queue", "cluster", "pool"},
)
m.resourceCounters[resourceSeconds] = cSeconds
}
Expand Down