From 6e9f87c196de742c47f04a36c67772482e9b2383 Mon Sep 17 00:00:00 2001 From: robertdavidsmith <34475852+robertdavidsmith@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:21:37 +0000 Subject: [PATCH] Scheduler: better diagnostics for unknown preemption reason (#4055) * Scheduler: better diagnostics for unknown preemption reason Signed-off-by: Robert Smith * fix Signed-off-by: Robert Smith --------- Signed-off-by: Robert Smith --- internal/scheduler/internaltypes/node.go | 82 +++++++++++++------ internal/scheduler/internaltypes/node_test.go | 10 +++ .../testfixtures/testfixtures.go | 24 ++++++ internal/scheduler/nodedb/nodedb_test.go | 5 +- internal/scheduler/nodedb/nodeidindex_test.go | 1 + .../scheduler/nodedb/nodematching_test.go | 2 + internal/scheduler/scheduling/context/job.go | 19 +++-- .../scheduler/scheduling/context/job_test.go | 17 ++-- internal/scheduler/scheduling/eviction.go | 2 +- .../preempting_queue_scheduler_test.go | 1 + .../scheduling/preemption_description.go | 4 +- .../scheduling/preemption_description_test.go | 35 ++++---- 12 files changed, 145 insertions(+), 57 deletions(-) create mode 100644 internal/scheduler/internaltypes/testfixtures/testfixtures.go diff --git a/internal/scheduler/internaltypes/node.go b/internal/scheduler/internaltypes/node.go index d91ed8f52c0..c1280577a6d 100644 --- a/internal/scheduler/internaltypes/node.go +++ b/internal/scheduler/internaltypes/node.go @@ -1,6 +1,7 @@ package internaltypes import ( + "fmt" "math" "github.com/pkg/errors" @@ -44,6 +45,8 @@ type Node struct { // Total space allocatable on this node totalResources ResourceList + unallocatableResources map[int32]ResourceList + // This field is set when inserting the Node into a NodeDb. Keys [][]byte @@ -92,6 +95,11 @@ func FromSchedulerObjectsNode(node *schedulerobjects.Node, } allocatableByPriority[EvictedPriority] = allocatableByPriority[minimumPriority] + unallocatableResources := map[int32]ResourceList{} + for p, u := range node.UnallocatableResources { + unallocatableResources[p] = resourceListFactory.FromJobResourceListIgnoreUnknown(u.Resources) + } + return CreateNode( node.Id, nodeType, @@ -102,6 +110,7 @@ func FromSchedulerObjectsNode(node *schedulerobjects.Node, taints, labels, resourceListFactory.FromNodeProto(totalResources.Resources), + unallocatableResources, allocatableByPriority, map[string]ResourceList{}, map[string]ResourceList{}, @@ -119,6 +128,7 @@ func CreateNode( taints []v1.Taint, labels map[string]string, totalResources ResourceList, + unallocatableResources map[int32]ResourceList, allocatableByPriority map[int32]ResourceList, allocatedByQueue map[string]ResourceList, allocatedByJobId map[string]ResourceList, @@ -126,20 +136,21 @@ func CreateNode( keys [][]byte, ) *Node { return &Node{ - id: id, - nodeType: nodeType, - index: index, - executor: executor, - name: name, - pool: pool, - taints: koTaint.DeepCopyTaints(taints), - labels: deepCopyLabels(labels), - totalResources: totalResources, - AllocatableByPriority: maps.Clone(allocatableByPriority), - AllocatedByQueue: maps.Clone(allocatedByQueue), - AllocatedByJobId: maps.Clone(allocatedByJobId), - EvictedJobRunIds: evictedJobRunIds, - Keys: keys, + id: id, + nodeType: nodeType, + index: index, + executor: executor, + name: name, + pool: pool, + taints: koTaint.DeepCopyTaints(taints), + labels: deepCopyLabels(labels), + totalResources: totalResources, + unallocatableResources: maps.Clone(unallocatableResources), + AllocatableByPriority: maps.Clone(allocatableByPriority), + AllocatedByQueue: maps.Clone(allocatedByQueue), + AllocatedByJobId: maps.Clone(allocatedByJobId), + EvictedJobRunIds: evictedJobRunIds, + Keys: keys, } } @@ -204,18 +215,23 @@ func (node *Node) GetTotalResources() ResourceList { return node.totalResources } +func (node *Node) GetUnallocatableResources() map[int32]ResourceList { + return maps.Clone(node.unallocatableResources) +} + func (node *Node) DeepCopyNilKeys() *Node { return &Node{ // private fields are immutable so a shallow copy is fine - id: node.id, - index: node.index, - executor: node.executor, - name: node.name, - pool: node.pool, - nodeType: node.nodeType, - taints: node.taints, - labels: node.labels, - totalResources: node.totalResources, + id: node.id, + index: node.index, + executor: node.executor, + name: node.name, + pool: node.pool, + nodeType: node.nodeType, + taints: node.taints, + labels: node.labels, + totalResources: node.totalResources, + unallocatableResources: node.unallocatableResources, // keys set to nil Keys: nil, @@ -228,6 +244,26 @@ func (node *Node) DeepCopyNilKeys() *Node { } } +func (node *Node) SummaryString() string { + if node == nil { + return "" + } + + result := fmt.Sprintf("Id: %s\n", node.id) + result += fmt.Sprintf("Index: %d\n", node.index) + result += fmt.Sprintf("Executor: %s\n", node.executor) + result += fmt.Sprintf("Name: %s\n", node.name) + result += fmt.Sprintf("Pool: %s\n", node.pool) + result += fmt.Sprintf("TotalResources: %s\n", node.totalResources.String()) + result += fmt.Sprintf("Labels: %v\n", node.labels) + result += fmt.Sprintf("Taints: %v\n", node.taints) + for p, u := range node.unallocatableResources { + result += fmt.Sprintf("Unallocatable at %d: %s\n", p, u.String()) + } + + return result +} + func deepCopyLabels(labels map[string]string) map[string]string { result := make(map[string]string, len(labels)) for k, v := range labels { diff --git a/internal/scheduler/internaltypes/node_test.go b/internal/scheduler/internaltypes/node_test.go index 58a8d6c446c..15c9320f16a 100644 --- a/internal/scheduler/internaltypes/node_test.go +++ b/internal/scheduler/internaltypes/node_test.go @@ -40,6 +40,14 @@ func TestNode(t *testing.T) { "memory": resource.MustParse("32Gi"), }, ) + unallocatableResources := map[int32]ResourceList{ + 1: resourceListFactory.FromJobResourceListIgnoreUnknown( + map[string]resource.Quantity{ + "cpu": resource.MustParse("8"), + "memory": resource.MustParse("16Gi"), + }, + ), + } allocatableByPriority := map[int32]ResourceList{ 1: resourceListFactory.FromNodeProto( map[string]resource.Quantity{ @@ -103,6 +111,7 @@ func TestNode(t *testing.T) { taints, labels, totalResources, + unallocatableResources, allocatableByPriority, allocatedByQueue, allocatedByJobId, @@ -119,6 +128,7 @@ func TestNode(t *testing.T) { assert.Equal(t, taints, node.GetTaints()) assert.Equal(t, labels, node.GetLabels()) assert.Equal(t, totalResources, node.GetTotalResources()) + assert.Equal(t, unallocatableResources, node.GetUnallocatableResources()) assert.Equal(t, allocatableByPriority, node.AllocatableByPriority) assert.Equal(t, allocatedByQueue, node.AllocatedByQueue) assert.Equal(t, allocatedByJobId, node.AllocatedByJobId) diff --git a/internal/scheduler/internaltypes/testfixtures/testfixtures.go b/internal/scheduler/internaltypes/testfixtures/testfixtures.go new file mode 100644 index 00000000000..9c8414a747d --- /dev/null +++ b/internal/scheduler/internaltypes/testfixtures/testfixtures.go @@ -0,0 +1,24 @@ +package testfixtures + +import ( + "github.com/armadaproject/armada/internal/scheduler/internaltypes" +) + +func TestSimpleNode(id string) *internaltypes.Node { + return internaltypes.CreateNode( + id, + nil, + 0, + "", + "", + "", + nil, + nil, + internaltypes.ResourceList{}, + nil, + nil, + nil, + nil, + nil, + nil) +} diff --git a/internal/scheduler/nodedb/nodedb_test.go b/internal/scheduler/nodedb/nodedb_test.go index 3ace44245c4..d72f8d5cbd5 100644 --- a/internal/scheduler/nodedb/nodedb_test.go +++ b/internal/scheduler/nodedb/nodedb_test.go @@ -14,6 +14,7 @@ import ( "github.com/armadaproject/armada/internal/common/util" schedulerconfig "github.com/armadaproject/armada/internal/scheduler/configuration" "github.com/armadaproject/armada/internal/scheduler/internaltypes" + ittestfixtures "github.com/armadaproject/armada/internal/scheduler/internaltypes/testfixtures" "github.com/armadaproject/armada/internal/scheduler/jobdb" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" "github.com/armadaproject/armada/internal/scheduler/scheduling/context" @@ -81,7 +82,7 @@ func TestSelectNodeForPod_NodeIdLabel_Success(t *testing.T) { jctxs := context.JobSchedulingContextsFromJobs(jobs) for _, jctx := range jctxs { txn := db.Txn(false) - jctx.SetAssignedNodeId(nodeId) + jctx.SetAssignedNode(ittestfixtures.TestSimpleNode(nodeId)) node, err := db.SelectNodeForJobWithTxn(txn, jctx) txn.Abort() require.NoError(t, err) @@ -106,7 +107,7 @@ func TestSelectNodeForPod_NodeIdLabel_Failure(t *testing.T) { jctxs := context.JobSchedulingContextsFromJobs(jobs) for _, jctx := range jctxs { txn := db.Txn(false) - jctx.SetAssignedNodeId("non-existent node") + jctx.SetAssignedNode(ittestfixtures.TestSimpleNode("non-existent node")) node, err := db.SelectNodeForJobWithTxn(txn, jctx) txn.Abort() if !assert.NoError(t, err) { diff --git a/internal/scheduler/nodedb/nodeidindex_test.go b/internal/scheduler/nodedb/nodeidindex_test.go index 024e19c130f..a36fb577633 100644 --- a/internal/scheduler/nodedb/nodeidindex_test.go +++ b/internal/scheduler/nodedb/nodeidindex_test.go @@ -56,6 +56,7 @@ func makeTestNode(id string) *internaltypes.Node { []v1.Taint{}, map[string]string{}, internaltypes.ResourceList{}, + nil, map[int32]internaltypes.ResourceList{}, map[string]internaltypes.ResourceList{}, map[string]internaltypes.ResourceList{}, diff --git a/internal/scheduler/nodedb/nodematching_test.go b/internal/scheduler/nodedb/nodematching_test.go index 9b1bdcfacff..0decf8d9fe4 100644 --- a/internal/scheduler/nodedb/nodematching_test.go +++ b/internal/scheduler/nodedb/nodematching_test.go @@ -662,6 +662,7 @@ func makeTestNodeTaintsLabels(taints []v1.Taint, labels map[string]string) *inte taints, labels, internaltypes.ResourceList{}, + nil, map[int32]internaltypes.ResourceList{}, map[string]internaltypes.ResourceList{}, map[string]internaltypes.ResourceList{}, @@ -685,6 +686,7 @@ func makeTestNodeResources(t *testing.T, allocatableByPriority map[int32]interna []v1.Taint{}, map[string]string{}, totalResources, + nil, allocatableByPriority, map[string]internaltypes.ResourceList{}, map[string]internaltypes.ResourceList{}, diff --git a/internal/scheduler/scheduling/context/job.go b/internal/scheduler/scheduling/context/job.go index e92167c3275..e9ad86ed71a 100644 --- a/internal/scheduler/scheduling/context/job.go +++ b/internal/scheduler/scheduling/context/job.go @@ -59,7 +59,7 @@ type JobSchedulingContext struct { GangInfo // This is the node the pod is assigned to. // This is only set for evicted jobs and is set alongside adding an additionalNodeSelector for the node - AssignedNodeId string + AssignedNode *internaltypes.Node // Id of job that preempted this pod PreemptingJobId string // Description of the cause of preemption @@ -109,14 +109,21 @@ func (jctx *JobSchedulingContext) Fail(unschedulableReason string) { } } +func (jctx *JobSchedulingContext) GetAssignedNode() *internaltypes.Node { + return jctx.AssignedNode +} + func (jctx *JobSchedulingContext) GetAssignedNodeId() string { - return jctx.AssignedNodeId + if jctx.AssignedNode == nil { + return "" + } + return jctx.AssignedNode.GetId() } -func (jctx *JobSchedulingContext) SetAssignedNodeId(assignedNodeId string) { - if assignedNodeId != "" { - jctx.AssignedNodeId = assignedNodeId - jctx.AddNodeSelector(schedulerconfig.NodeIdLabel, assignedNodeId) +func (jctx *JobSchedulingContext) SetAssignedNode(assignedNode *internaltypes.Node) { + if assignedNode != nil { + jctx.AssignedNode = assignedNode + jctx.AddNodeSelector(schedulerconfig.NodeIdLabel, assignedNode.GetId()) } } diff --git a/internal/scheduler/scheduling/context/job_test.go b/internal/scheduler/scheduling/context/job_test.go index 2d0ef0b0773..42999746039 100644 --- a/internal/scheduler/scheduling/context/job_test.go +++ b/internal/scheduler/scheduling/context/job_test.go @@ -6,21 +6,26 @@ import ( "github.com/stretchr/testify/assert" "github.com/armadaproject/armada/internal/scheduler/configuration" + ittestfixtures "github.com/armadaproject/armada/internal/scheduler/internaltypes/testfixtures" "github.com/armadaproject/armada/internal/scheduler/testfixtures" ) -func TestJobSchedulingContext_SetAssignedNodeId(t *testing.T) { +func TestJobSchedulingContext_SetAssignedNode(t *testing.T) { jctx := &JobSchedulingContext{} - assert.Equal(t, "", jctx.GetAssignedNodeId()) + assert.Nil(t, jctx.GetAssignedNode()) + assert.Empty(t, jctx.GetAssignedNodeId()) assert.Empty(t, jctx.AdditionalNodeSelectors) - // Will not add a node selector if input is empty - jctx.SetAssignedNodeId("") - assert.Equal(t, "", jctx.GetAssignedNodeId()) + // Will not add a node selector if input is nil + jctx.SetAssignedNode(nil) + assert.Nil(t, jctx.GetAssignedNode()) + assert.Empty(t, jctx.GetAssignedNodeId()) assert.Empty(t, jctx.AdditionalNodeSelectors) - jctx.SetAssignedNodeId("node1") + n := ittestfixtures.TestSimpleNode("node1") + jctx.SetAssignedNode(n) + assert.Equal(t, n, jctx.GetAssignedNode()) assert.Equal(t, "node1", jctx.GetAssignedNodeId()) assert.Len(t, jctx.AdditionalNodeSelectors, 1) assert.Equal(t, map[string]string{configuration.NodeIdLabel: "node1"}, jctx.AdditionalNodeSelectors) diff --git a/internal/scheduler/scheduling/eviction.go b/internal/scheduler/scheduling/eviction.go index eefbda16f2b..cdee7881bc9 100644 --- a/internal/scheduler/scheduling/eviction.go +++ b/internal/scheduler/scheduling/eviction.go @@ -192,7 +192,7 @@ func (evi *Evictor) Evict(ctx *armadacontext.Context, nodeDbTxn *memdb.Txn) (*Ev // TODO(albin): We can remove the checkOnlyDynamicRequirements flag in the nodeDb now that we've added the tolerations. jctx := schedulercontext.JobSchedulingContextFromJob(job) jctx.IsEvicted = true - jctx.SetAssignedNodeId(node.GetId()) + jctx.SetAssignedNode(node) evictedJctxsByJobId[job.Id()] = jctx jctx.AdditionalTolerations = append(jctx.AdditionalTolerations, node.GetTolerationsForTaints()...) diff --git a/internal/scheduler/scheduling/preempting_queue_scheduler_test.go b/internal/scheduler/scheduling/preempting_queue_scheduler_test.go index f8fdbac4ce8..8aae7c57230 100644 --- a/internal/scheduler/scheduling/preempting_queue_scheduler_test.go +++ b/internal/scheduler/scheduling/preempting_queue_scheduler_test.go @@ -2507,6 +2507,7 @@ func testNodeWithTaints(node *internaltypes.Node, taints []v1.Taint) *internalty taints, node.GetLabels(), node.GetTotalResources(), + nil, node.AllocatableByPriority, node.AllocatedByQueue, node.AllocatedByJobId, diff --git a/internal/scheduler/scheduling/preemption_description.go b/internal/scheduler/scheduling/preemption_description.go index 3ae3e595e38..781cfcfe80f 100644 --- a/internal/scheduler/scheduling/preemption_description.go +++ b/internal/scheduler/scheduling/preemption_description.go @@ -10,7 +10,7 @@ import ( ) const ( - unknownPreemptionCause = "Preempted by scheduler due to the job failing to reschedule - possibly node resource changed causing this job to be unschedulable" + unknownPreemptionCause = "Preempted by scheduler due to the job failing to reschedule - possibly node resource changed causing this job to be unschedulable\nNode Summary:\n%s" unknownGangPreemptionCause = "Preempted by scheduler due to the job failing to reschedule - possibly another job in the gang was preempted or the node resource changed causing this job to be unschedulable" fairSharePreemptionTemplate = "Preempted by scheduler using fair share preemption - preempting job %s" urgencyPreemptionTemplate = "Preempted by scheduler using urgency preemption - preempting job %s" @@ -45,7 +45,7 @@ func PopulatePreemptionDescriptions(preemptedJobs []*context.JobSchedulingContex if isGang { preemptedJctx.PreemptionDescription = fmt.Sprintf(unknownGangPreemptionCause) } else { - preemptedJctx.PreemptionDescription = fmt.Sprintf(unknownPreemptionCause) + preemptedJctx.PreemptionDescription = fmt.Sprintf(unknownPreemptionCause, preemptedJctx.GetAssignedNode().SummaryString()) } } else if len(potentialPreemptingJobs) == 1 { preemptedJctx.PreemptionDescription = fmt.Sprintf(urgencyPreemptionTemplate, potentialPreemptingJobs[0].JobId) diff --git a/internal/scheduler/scheduling/preemption_description_test.go b/internal/scheduler/scheduling/preemption_description_test.go index 0d70f3dad0a..952f3d75441 100644 --- a/internal/scheduler/scheduling/preemption_description_test.go +++ b/internal/scheduler/scheduling/preemption_description_test.go @@ -7,6 +7,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + ittestfixtures "github.com/armadaproject/armada/internal/scheduler/internaltypes/testfixtures" "github.com/armadaproject/armada/internal/scheduler/jobdb" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" "github.com/armadaproject/armada/internal/scheduler/scheduling/context" @@ -38,61 +39,61 @@ func TestPopulatePreemptionDescriptions(t *testing.T) { }{ "unknown cause - basic job": { preemptedJobContext: &context.JobSchedulingContext{ - JobId: "job-1", - AssignedNodeId: "node-3", - Job: makeJob(t, "job-1", false), + JobId: "job-1", + AssignedNode: ittestfixtures.TestSimpleNode("node-3"), + Job: makeJob(t, "job-1", false), }, expectedPreemptedJobContext: &context.JobSchedulingContext{ JobId: "job-1", - AssignedNodeId: "node-3", + AssignedNode: ittestfixtures.TestSimpleNode("node-3"), Job: makeJob(t, "job-1", false), - PreemptionDescription: unknownPreemptionCause, + PreemptionDescription: fmt.Sprintf(unknownPreemptionCause, ittestfixtures.TestSimpleNode("node-3").SummaryString()), }, }, "unknown cause - gang job": { preemptedJobContext: &context.JobSchedulingContext{ - JobId: "job-1", - AssignedNodeId: "node-3", - Job: makeJob(t, "job-1", true), + JobId: "job-1", + AssignedNode: ittestfixtures.TestSimpleNode("node-3"), + Job: makeJob(t, "job-1", true), }, expectedPreemptedJobContext: &context.JobSchedulingContext{ JobId: "job-1", - AssignedNodeId: "node-3", + AssignedNode: ittestfixtures.TestSimpleNode("node-3"), Job: makeJob(t, "job-1", true), PreemptionDescription: unknownGangPreemptionCause, }, }, "urgency preemption - single preempting job": { preemptedJobContext: &context.JobSchedulingContext{ - JobId: "job-1", - AssignedNodeId: "node-1", + JobId: "job-1", + AssignedNode: ittestfixtures.TestSimpleNode("node-1"), }, expectedPreemptedJobContext: &context.JobSchedulingContext{ JobId: "job-1", - AssignedNodeId: "node-1", + AssignedNode: ittestfixtures.TestSimpleNode("node-1"), PreemptionDescription: fmt.Sprintf(urgencyPreemptionTemplate, "job-2"), }, }, "urgency preemption - multiple preempting jobs": { preemptedJobContext: &context.JobSchedulingContext{ - JobId: "job-1", - AssignedNodeId: "node-2", + JobId: "job-1", + AssignedNode: ittestfixtures.TestSimpleNode("node-2"), }, expectedPreemptedJobContext: &context.JobSchedulingContext{ JobId: "job-1", - AssignedNodeId: "node-2", + AssignedNode: ittestfixtures.TestSimpleNode("node-2"), PreemptionDescription: fmt.Sprintf(urgencyPreemptionMultiJobTemplate, "job-3,job-4"), }, }, "fairshare": { preemptedJobContext: &context.JobSchedulingContext{ JobId: "job-1", - AssignedNodeId: "node-4", + AssignedNode: ittestfixtures.TestSimpleNode("node-4"), PreemptingJobId: "job-7", }, expectedPreemptedJobContext: &context.JobSchedulingContext{ JobId: "job-1", - AssignedNodeId: "node-4", + AssignedNode: ittestfixtures.TestSimpleNode("node-4"), PreemptingJobId: "job-7", PreemptionDescription: fmt.Sprintf(fairSharePreemptionTemplate, "job-7"), },