Skip to content

Commit

Permalink
Fix Error categorisation logic (#4054)
Browse files Browse the repository at this point in the history
Signed-off-by: Chris Martin <[email protected]>
  • Loading branch information
d80tb7 authored Nov 25, 2024
1 parent 2db6372 commit 39ea0d3
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 2 deletions.
5 changes: 3 additions & 2 deletions internal/scheduler/metrics/state_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ func newJobStateMetrics(errorRegexes []*regexp.Regexp, trackedResourceNames []v1
)
jobErrorsByNode := prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: prefix + "error_classification_by_node",
Name: prefix + "job_error_classification_by_node",
Help: "Failed jobs ey error classification at the node level",
},
[]string{nodeLabel, poolLabel, clusterLabel, errorCategoryLabel, errorSubcategoryLabel},
Expand Down Expand Up @@ -188,7 +188,8 @@ func (m *jobStateMetrics) ReportStateTransitions(
m.completedRunDurations.WithLabelValues(job.Queue(), run.Pool()).Observe(duration)
jobRunError := jobRunErrorsByRunId[run.Id()]
category, subCategory := m.failedCategoryAndSubCategoryFromJob(jobRunError)
m.jobErrorsByQueue.WithLabelValues(job.Queue(), run.Executor(), category, subCategory).Inc()
m.jobErrorsByQueue.WithLabelValues(job.Queue(), run.Pool(), category, subCategory).Inc()
m.jobErrorsByNode.WithLabelValues(run.NodeName(), run.Pool(), run.Executor(), category, subCategory).Inc()
}
if jst.Succeeded {
duration, priorState := stateDuration(job, run, run.TerminatedTime())
Expand Down
40 changes: 40 additions & 0 deletions internal/scheduler/metrics/state_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
v1 "k8s.io/api/core/v1"

"github.com/armadaproject/armada/internal/scheduler/jobdb"
Expand Down Expand Up @@ -343,6 +344,45 @@ func TestReportJobStateTransitions(t *testing.T) {
}
}

func TestCategoriseErrors(t *testing.T) {
run := baseRun.
WithExecutor(testCluster).
WithNodeName(testNode).
WithPool(testPool)

job := baseJob.WithUpdatedRun(run)

r, err := regexp.Compile("generic pod error")
require.NoError(t, err)

jobRunErrorsByRunId := map[string]*armadaevents.Error{
run.Id(): {
Terminal: true,
Reason: &armadaevents.Error_PodError{
PodError: &armadaevents.PodError{
Message: "generic pod error",
},
},
},
}

jsts := []jobdb.JobStateTransitions{
{
Job: job,
Failed: true,
},
}

metrics := newJobStateMetrics([]*regexp.Regexp{r}, []v1.ResourceName{"cpu"}, 12*time.Hour)
metrics.ReportStateTransitions(jsts, jobRunErrorsByRunId)

actualjobErrorsByQueue := testutil.ToFloat64(metrics.jobErrorsByQueue.WithLabelValues(testQueue, testPool, "podError", "generic pod error"))
assert.InDelta(t, 1, actualjobErrorsByQueue, epsilon)

actualjobErrorsByNode := testutil.ToFloat64(metrics.jobErrorsByNode.WithLabelValues(testNode, testPool, testCluster, "podError", "generic pod error"))
assert.InDelta(t, 1, actualjobErrorsByNode, epsilon)
}

func TestReset(t *testing.T) {
byQueueLabels := []string{testQueue, testPool, "running", "pending"}
byNodeLabels := []string{testNode, testPool, testCluster, "running", "pending"}
Expand Down

0 comments on commit 39ea0d3

Please sign in to comment.