diff --git a/pkg/scheduler/flavorassigner/flavorassigner.go b/pkg/scheduler/flavorassigner/flavorassigner.go index cd727d225fa..9a3cb22fcfa 100644 --- a/pkg/scheduler/flavorassigner/flavorassigner.go +++ b/pkg/scheduler/flavorassigner/flavorassigner.go @@ -602,8 +602,8 @@ func flavorSelector(spec *corev1.PodSpec, allowedKeys sets.Set[string]) nodeaffi // could help), it returns a Status with reasons. func (a *FlavorAssigner) fitsResourceQuota(log logr.Logger, fr resources.FlavorResource, val int64, rQuota cache.ResourceQuota) (granularMode, bool, *Status) { var status Status - var borrow bool + borrow := a.cq.BorrowingWith(fr, val) && a.cq.HasParent() available := a.cq.Available(fr) maxCapacity := a.cq.PotentialAvailable(fr) @@ -616,7 +616,7 @@ func (a *FlavorAssigner) fitsResourceQuota(log logr.Logger, fr resources.FlavorR // Fit if val <= available { - return fit, a.cq.ResourceNode.Usage[fr]+val > rQuota.Nominal, nil + return fit, borrow, nil } // Check if preemption is possible @@ -628,7 +628,6 @@ func (a *FlavorAssigner) fitsResourceQuota(log logr.Logger, fr resources.FlavorR } } else if a.canPreemptWhileBorrowing() { mode = preempt - borrow = true } status.append(fmt.Sprintf("insufficient unused quota for %s in flavor %s, %s more needed", diff --git a/pkg/scheduler/flavorassigner/flavorassigner_test.go b/pkg/scheduler/flavorassigner/flavorassigner_test.go index eb60f73a312..6c72a61a33b 100644 --- a/pkg/scheduler/flavorassigner/flavorassigner_test.go +++ b/pkg/scheduler/flavorassigner/flavorassigner_test.go @@ -372,6 +372,7 @@ func TestAssignFlavors(t *testing.T) { }, Count: 1, }}, + Borrowing: true, Usage: resources.FlavorResourceQuantities{ {Flavor: "two", Resource: corev1.ResourceCPU}: 3_000, {Flavor: "two", Resource: corev1.ResourceMemory}: 10 * utiltesting.Mi, @@ -885,6 +886,7 @@ func TestAssignFlavors(t *testing.T) { }, Count: 1, }}, + Borrowing: true, Usage: resources.FlavorResourceQuantities{ {Flavor: "one", Resource: corev1.ResourceCPU}: 2_000, }, @@ 
-966,6 +968,7 @@ func TestAssignFlavors(t *testing.T) { }, Count: 1, }}, + Borrowing: true, Usage: resources.FlavorResourceQuantities{ {Flavor: "one", Resource: corev1.ResourceCPU}: 2_000, }, @@ -1816,6 +1819,7 @@ func TestAssignFlavors(t *testing.T) { }, Count: 1, }}, + Borrowing: true, Usage: resources.FlavorResourceQuantities{ {Flavor: "one", Resource: corev1.ResourceCPU}: 9_000, {Flavor: "one", Resource: corev1.ResourcePods}: 1, diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index d32bfc08abe..bbd5380ef82 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -816,6 +816,13 @@ func TestSchedule(t *testing.T) { }, }, "multiple CQs need preemption": { + // legacy mode considers eng-alpha/pending to + // be left in queue, rather than inadmissible, + // due to legacy overlapping preemption skip + // logic. Since we are deleting legacy code in + // 0.10, we will disable this test for it, + // rather than duplicate it. + multiplePreemptions: MultiplePremptions, additionalClusterQueues: []kueue.ClusterQueue{ *utiltesting.MakeClusterQueue("other-alpha"). Cohort("other"). @@ -2602,6 +2609,111 @@ func TestSchedule(t *testing.T) { "eng-beta/b1": *utiltesting.MakeAdmission("other-beta").Assignment("gpu", "spot", "5").Obj(), }, }, + "workload requiring reclaimation prioritized over wl in another full cq": { + // Also see #3405. + // + // CQ2 is lending out capacity to its + // Cohort. It has a pending workload, WL2, + // that fits within nominal capacity, and a + // reclaim policy set to any. + + // CQ1 is using half of its capacity, and is + // also lending out remaining capacity. + + // CQ3 has no capacity of its own, and is + // borrowing 10 nominal capacity. 
+ + // With pending workloads WL1 and WL2 queued + // in CQ1 and CQ2 respectively, we want to + // make sure that WL2 is processed first, + // so that its preemption calculations are not + // invalidated by CQ1's WL1, which won't fit + // into its nominal capacity given the + // admitted Admitted-Workload-1. + + // As WL1 has an earlier creation timestamp + // than WL2, there was a bug where it would + // process first, reserving capacity which + // invalidated WL2's preemption calculations, + // blocking it indefinitely from reclaiming + // its nominal capacity. + // + // We don't test legacy mode as it classifies + // inadmissible/left differently, and we will + // delete that logic shortly. + multiplePreemptions: MultiplePremptions, + additionalClusterQueues: []kueue.ClusterQueue{ + *utiltesting.MakeClusterQueue("CQ1"). + Cohort("other"). + ResourceGroup( + utiltesting.MakeFlavorQuotas("on-demand").Resource("gpu", "10").FlavorQuotas, + ). + Obj(), + *utiltesting.MakeClusterQueue("CQ2"). + Cohort("other"). + Preemption(kueue.ClusterQueuePreemption{ + ReclaimWithinCohort: kueue.PreemptionPolicyAny, + }). + ResourceGroup( + utiltesting.MakeFlavorQuotas("on-demand").Resource("gpu", "10").FlavorQuotas, + ). + Obj(), + *utiltesting.MakeClusterQueue("CQ3"). + Cohort("other"). + ResourceGroup( + utiltesting.MakeFlavorQuotas("on-demand").Resource("gpu", "0").FlavorQuotas, + ). + Obj(), + }, + additionalLocalQueues: []kueue.LocalQueue{ + *utiltesting.MakeLocalQueue("lq", "eng-alpha").ClusterQueue("CQ1").Obj(), + *utiltesting.MakeLocalQueue("lq", "eng-beta").ClusterQueue("CQ2").Obj(), + *utiltesting.MakeLocalQueue("lq", "eng-gamma").ClusterQueue("CQ3").Obj(), + }, + workloads: []kueue.Workload{ + *utiltesting.MakeWorkload("Admitted-Workload-1", "eng-alpha"). + Queue("lq"). + Request("gpu", "5"). + SimpleReserveQuota("CQ1", "on-demand", now). + Obj(), + *utiltesting.MakeWorkload("WL1", "eng-alpha"). + Creation(now). + Queue("lq"). + Request("gpu", "10"). 
+ Obj(), + *utiltesting.MakeWorkload("WL2", "eng-beta"). + Creation(now.Add(time.Second)). + Queue("lq"). + Request("gpu", "10"). + Obj(), + *utiltesting.MakeWorkload("Admitted-Workload-2", "eng-gamma"). + Queue("lq"). + Priority(0). + Request("gpu", "5"). + SimpleReserveQuota("CQ3", "on-demand", now). + Obj(), + *utiltesting.MakeWorkload("Admitted-Workload-3", "eng-gamma"). + Queue("lq"). + Priority(1). + Request("gpu", "5"). + SimpleReserveQuota("CQ3", "on-demand", now). + Obj(), + }, + wantPreempted: sets.Set[string]( + sets.NewString("eng-gamma/Admitted-Workload-2"), + ), + wantLeft: map[string][]string{ + "CQ2": {"eng-beta/WL2"}, + }, + wantInadmissibleLeft: map[string][]string{ + "CQ1": {"eng-alpha/WL1"}, + }, + wantAssignments: map[string]kueue.Admission{ + "eng-alpha/Admitted-Workload-1": *utiltesting.MakeAdmission("CQ1").Assignment("gpu", "on-demand", "5").Obj(), + "eng-gamma/Admitted-Workload-2": *utiltesting.MakeAdmission("CQ3").Assignment("gpu", "on-demand", "5").Obj(), + "eng-gamma/Admitted-Workload-3": *utiltesting.MakeAdmission("CQ3").Assignment("gpu", "on-demand", "5").Obj(), + }, + }, } for name, tc := range cases { diff --git a/test/integration/scheduler/scheduler_test.go b/test/integration/scheduler/scheduler_test.go index 36264e9163f..37761566584 100644 --- a/test/integration/scheduler/scheduler_test.go +++ b/test/integration/scheduler/scheduler_test.go @@ -1801,9 +1801,10 @@ var _ = ginkgo.Describe("Scheduler", func() { g.Expect(k8sClient.Update(ctx, &cq)).Should(gomega.Succeed()) }, util.Timeout, util.Interval).Should(gomega.Succeed()) - // pendingWl exceed nominal+borrowing quota and cannot preempt due to low priority. + // pendingWl exceeds nominal+borrowing quota and cannot preempt as priority-based + // preemption within CQ is disabled. pendingWl := testing.MakeWorkload("pending-wl", matchingNS.Name).Queue(strictFIFOLocalQueue.
- Name).Request(corev1.ResourceCPU, "3").Priority(9).Obj() + Name).Request(corev1.ResourceCPU, "3").Priority(99).Obj() gomega.Expect(k8sClient.Create(ctx, pendingWl)).Should(gomega.Succeed()) // borrowingWL can borrow shared resources, so it should be scheduled even if workloads