
Commit 6653ef6

KEP-5007 DRA Device Binding Conditions: Add dra integration test
1 parent 9e82c13 commit 6653ef6

File tree

1 file changed: +238 −7 lines changed

test/integration/dra/dra_test.go

Lines changed: 238 additions & 7 deletions
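Before the diff itself, a quick orientation: the new test exercises the three KEP-5007 fields on resourceapi.Device. Below is a minimal sketch of a device that opts into binding conditions; it is not taken from the diff, the condition names are hypothetical, and ptr is the same helper the test already uses via ptr.To:

device := resourceapi.Device{
	Name: "gpu-0", // hypothetical device name
	// Conditions the scheduler waits for on the allocated claim's device status
	// before it lets pod binding proceed.
	BindingConditions: []string{"dra.example.com/attached"},
	// Conditions that signal the attach step failed; the scheduler then clears
	// the device status and retries scheduling (the test below drives that path).
	BindingFailureConditions: []string{"dra.example.com/attach-failed"},
	// Marks the device as one whose binding step ties the allocation to the
	// selected node; see KEP-5007 for the exact semantics.
	BindsToNode: ptr.To(true),
}

The behavior sketched in the comments follows the test's own flow: the scheduler prefers devices without binding conditions, waits on the conditions when it does pick such a device, and reschedules when a failure condition is reported.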
@@ -50,6 +50,7 @@ import (
 	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	resourceapiac "k8s.io/client-go/applyconfigurations/resource/v1"
 	"k8s.io/client-go/informers"
+	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/tools/cache"
 	"k8s.io/component-base/featuregate"
 	featuregatetesting "k8s.io/component-base/featuregate/testing"
@@ -244,9 +245,10 @@ func TestDRA(t *testing.T) {
 				tCtx.Run("PrioritizedList", func(tCtx ktesting.TContext) { testPrioritizedList(tCtx, false) })
 				tCtx.Run("Pod", func(tCtx ktesting.TContext) { testPod(tCtx, true) })
 				tCtx.Run("PublishResourceSlices", func(tCtx ktesting.TContext) {
-					testPublishResourceSlices(tCtx, true, features.DRADeviceTaints, features.DRAPartitionableDevices)
+					testPublishResourceSlices(tCtx, true, features.DRADeviceTaints, features.DRAPartitionableDevices, features.DRADeviceBindingConditions)
 				})
 				tCtx.Run("ResourceClaimDeviceStatus", func(tCtx ktesting.TContext) { testResourceClaimDeviceStatus(tCtx, false) })
+				tCtx.Run("DeviceBindingConditions", func(tCtx ktesting.TContext) { testDeviceBindingConditions(tCtx, false) })
 			},
 		},
 		"v1beta1": {
@@ -257,7 +259,7 @@ func TestDRA(t *testing.T) {
 			features: map[featuregate.Feature]bool{features.DynamicResourceAllocation: true},
 			f: func(tCtx ktesting.TContext) {
 				tCtx.Run("PublishResourceSlices", func(tCtx ktesting.TContext) {
-					testPublishResourceSlices(tCtx, false, features.DRADeviceTaints, features.DRAPartitionableDevices)
+					testPublishResourceSlices(tCtx, false, features.DRADeviceTaints, features.DRAPartitionableDevices, features.DRADeviceBindingConditions)
 				})
 			},
 		},
@@ -269,7 +271,7 @@ func TestDRA(t *testing.T) {
 			features: map[featuregate.Feature]bool{features.DynamicResourceAllocation: true},
 			f: func(tCtx ktesting.TContext) {
 				tCtx.Run("PublishResourceSlices", func(tCtx ktesting.TContext) {
-					testPublishResourceSlices(tCtx, false, features.DRADeviceTaints, features.DRAPartitionableDevices)
+					testPublishResourceSlices(tCtx, false, features.DRADeviceTaints, features.DRAPartitionableDevices, features.DRADeviceBindingConditions)
 				})
 			},
 		},
@@ -283,15 +285,18 @@ func TestDRA(t *testing.T) {
 				// Additional DRA feature gates go here,
 				// in alphabetical order,
 				// as needed by tests for them.
-				features.DRAAdminAccess:          true,
-				features.DRADeviceTaints:         true,
-				features.DRAPartitionableDevices: true,
-				features.DRAPrioritizedList:      true,
+				features.DRAAdminAccess:               true,
+				features.DRADeviceBindingConditions:   true,
+				features.DRADeviceTaints:              true,
+				features.DRAPartitionableDevices:      true,
+				features.DRAPrioritizedList:           true,
+				features.DRAResourceClaimDeviceStatus: true,
 			},
 			f: func(tCtx ktesting.TContext) {
 				tCtx.Run("AdminAccess", func(tCtx ktesting.TContext) { testAdminAccess(tCtx, true) })
 				tCtx.Run("Convert", testConvert)
 				tCtx.Run("ControllerManagerMetrics", testControllerManagerMetrics)
+				tCtx.Run("DeviceBindingConditions", func(tCtx ktesting.TContext) { testDeviceBindingConditions(tCtx, true) })
 				tCtx.Run("PrioritizedList", func(tCtx ktesting.TContext) { testPrioritizedList(tCtx, true) })
 				tCtx.Run("PublishResourceSlices", func(tCtx ktesting.TContext) { testPublishResourceSlices(tCtx, true) })
 				tCtx.Run("ResourceClaimDeviceStatus", func(tCtx ktesting.TContext) { testResourceClaimDeviceStatus(tCtx, true) })
@@ -729,6 +734,18 @@ func testPublishResourceSlices(tCtx ktesting.TContext, haveLatestAPI bool, disab
 						},
 					}},
 				},
+				{
+					Name: "device-binding-conditions",
+					BindingConditions: []string{
+						"condition-1",
+						"condition-2",
+					},
+					BindingFailureConditions: []string{
+						"failure-condition-1",
+						"failure-condition-2",
+					},
+					BindsToNode: ptr.To(true),
+				},
 			},
 		},
 	},
@@ -766,6 +783,14 @@ func testPublishResourceSlices(tCtx ktesting.TContext, haveLatestAPI bool, disab
 				expectedSliceSpecs[i].Devices[e].ConsumesCounters = nil
 			}
 		}
+	case features.DRADeviceBindingConditions:
+		for i := range expectedSliceSpecs {
+			for e := range expectedSliceSpecs[i].Devices {
+				expectedSliceSpecs[i].Devices[e].BindingConditions = nil
+				expectedSliceSpecs[i].Devices[e].BindingFailureConditions = nil
+				expectedSliceSpecs[i].Devices[e].BindsToNode = nil
+			}
+		}
 	default:
 		tCtx.Fatalf("faulty test, case for %s missing", disabled)
 	}
@@ -813,6 +838,9 @@ func testPublishResourceSlices(tCtx ktesting.TContext, haveLatestAPI bool, disab
 			}
 			return expected
 		}()...),
+		"BindingConditions":        gomega.Equal(device.BindingConditions),
+		"BindingFailureConditions": gomega.Equal(device.BindingFailureConditions),
+		"BindsToNode":              gomega.Equal(device.BindsToNode),
 	}))
 }
 return expected
@@ -1400,3 +1428,206 @@ func matchPointer[T any](p *T) gtypes.GomegaMatcher {
 	}
 	return gstruct.PointTo(gomega.Equal(*p))
 }
+
+// testDeviceBindingConditions tests scheduling with mixed devices: one with BindingConditions, one without.
+// It verifies that the scheduler prioritizes the device without BindingConditions for the first pod.
+// The second pod then uses the device with BindingConditions. The test checks that the scheduler retries
+// after an initial binding failure of the second pod, ensuring successful scheduling after rescheduling.
+func testDeviceBindingConditions(tCtx ktesting.TContext, enabled bool) {
+	namespace := createTestNamespace(tCtx, nil)
+	class, driverName := createTestClass(tCtx, namespace)
+
+	nodeName := "worker-0"
+	poolWithBinding := nodeName + "-with-binding"
+	poolWithoutBinding := nodeName + "-without-binding"
+	bindingCondition := "attached"
+	failureCondition := "failed"
+	startScheduler(tCtx)
+
+	slice := &resourceapi.ResourceSlice{
+		ObjectMeta: metav1.ObjectMeta{
+			GenerateName: namespace + "-",
+		},
+		Spec: resourceapi.ResourceSliceSpec{
+			NodeName: &nodeName,
+			Pool: resourceapi.ResourcePool{
+				Name:               poolWithBinding,
+				ResourceSliceCount: 1,
+			},
+			Driver: driverName,
+			Devices: []resourceapi.Device{
+				{
+					Name:                     "with-binding",
+					BindingConditions:        []string{bindingCondition},
+					BindingFailureConditions: []string{failureCondition},
+				},
+			},
+		},
+	}
+	slice, err := tCtx.Client().ResourceV1().ResourceSlices().Create(tCtx, slice, metav1.CreateOptions{})
+	tCtx.ExpectNoError(err, "create slice")
+
+	haveBindingConditionFields := len(slice.Spec.Devices[0].BindingConditions) > 0 || len(slice.Spec.Devices[0].BindingFailureConditions) > 0
+	if !enabled {
+		if haveBindingConditionFields {
+			tCtx.Fatalf("Expected device binding condition fields to get dropped, got instead:\n%s", format.Object(slice, 1))
+		}
+		return
+	}
+	if !haveBindingConditionFields {
+		tCtx.Fatalf("Expected device binding condition fields to be stored, got instead:\n%s", format.Object(slice, 1))
+	}
+
+	sliceWithoutBinding := &resourceapi.ResourceSlice{
+		ObjectMeta: metav1.ObjectMeta{
+			GenerateName: namespace + "-without-binding-",
+		},
+		Spec: resourceapi.ResourceSliceSpec{
+			NodeName: &nodeName,
+			Pool: resourceapi.ResourcePool{
+				Name:               poolWithoutBinding,
+				ResourceSliceCount: 1,
+			},
+			Driver: driverName,
+			Devices: []resourceapi.Device{
+				{
+					Name: "without-binding",
+				},
+			},
+		},
+	}
+	_, err = tCtx.Client().ResourceV1().ResourceSlices().Create(tCtx, sliceWithoutBinding, metav1.CreateOptions{})
+	tCtx.ExpectNoError(err, "create slice without binding conditions")
+
+	// Schedule first pod and wait for the scheduler to reach the binding phase, which marks the claim as allocated.
+	start := time.Now()
+	claim1 := createClaim(tCtx, namespace, "-a", class, claim)
+	pod := createPod(tCtx, namespace, "-a", claim1, podWithClaimName)
+	ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) *resourceapi.ResourceClaim {
+		c, err := tCtx.Client().ResourceV1().ResourceClaims(namespace).Get(tCtx, claim1.Name, metav1.GetOptions{})
+		tCtx.ExpectNoError(err)
+		claim1 = c
+		return claim1
+	}).WithTimeout(10*time.Second).WithPolling(time.Second).Should(gomega.HaveField("Status.Allocation", gomega.Not(gomega.BeNil())), "Claim should have been allocated.")
+	end := time.Now()
+	gomega.NewWithT(tCtx).Expect(claim1).To(gomega.HaveField("Status.Allocation", gstruct.PointTo(gstruct.MatchFields(gstruct.IgnoreExtras, gstruct.Fields{
+		"Devices": gomega.Equal(resourceapi.DeviceAllocationResult{
+			Results: []resourceapi.DeviceRequestAllocationResult{{
+				Request: claim1.Spec.Devices.Requests[0].Name,
+				Driver:  driverName,
+				Pool:    poolWithoutBinding,
+				Device:  "without-binding",
+			}}}),
+		// NodeSelector intentionally not checked - that's covered elsewhere.
+		"AllocationTimestamp": gomega.HaveField("Time", gomega.And(
+			gomega.BeTemporally(">=", start.Truncate(time.Second) /* may get rounded down during round-tripping */),
+			gomega.BeTemporally("<=", end),
+		)),
+	}))), "first allocated claim")
+
+	err = waitForPodScheduled(tCtx, tCtx.Client(), namespace, pod.Name)
+	tCtx.ExpectNoError(err, "first pod scheduled")
+
+	// Second pod should get the device with binding conditions.
+	claim2 := createClaim(tCtx, namespace, "-b", class, claim)
+	pod = createPod(tCtx, namespace, "-b", claim2, podWithClaimName)
+	ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) *resourceapi.ResourceClaim {
+		c, err := tCtx.Client().ResourceV1().ResourceClaims(namespace).Get(tCtx, claim2.Name, metav1.GetOptions{})
+		tCtx.ExpectNoError(err)
+		claim2 = c
+		return claim2
+	}).WithTimeout(10*time.Second).WithPolling(time.Second).Should(gomega.HaveField("Status.Allocation", gomega.Not(gomega.BeNil())), "Claim should have been allocated.")
+	end = time.Now()
+	gomega.NewWithT(tCtx).Expect(claim2).To(gomega.HaveField("Status.Allocation", gstruct.PointTo(gstruct.MatchFields(gstruct.IgnoreExtras, gstruct.Fields{
+		"Devices": gomega.Equal(resourceapi.DeviceAllocationResult{
+			Results: []resourceapi.DeviceRequestAllocationResult{{
+				Request:                  claim2.Spec.Devices.Requests[0].Name,
+				Driver:                   driverName,
+				Pool:                     poolWithBinding,
+				Device:                   "with-binding",
+				BindingConditions:        []string{bindingCondition},
+				BindingFailureConditions: []string{failureCondition},
+			}}}),
+		// NodeSelector intentionally not checked - that's covered elsewhere.
+		"AllocationTimestamp": gomega.HaveField("Time", gomega.And(
+			gomega.BeTemporally(">=", start.Truncate(time.Second) /* may get rounded down during round-tripping */),
+			gomega.BeTemporally("<=", end),
+		)),
+	}))), "second allocated claim")
+
+	// fail the binding condition for the second claim, so that it gets scheduled later.
+	claim2.Status.Devices = []resourceapi.AllocatedDeviceStatus{{
+		Driver: driverName,
+		Pool:   poolWithBinding,
+		Device: "with-binding",
+		Conditions: []metav1.Condition{{
+			Type:               failureCondition,
+			Status:             metav1.ConditionTrue,
+			ObservedGeneration: claim2.Generation,
+			LastTransitionTime: metav1.Now(),
+			Reason:             "Testing",
+			Message:            "The test has seen the allocation and is failing the binding.",
+		}},
+	}}
+
+	claim2, err = tCtx.Client().ResourceV1().ResourceClaims(namespace).UpdateStatus(tCtx, claim2, metav1.UpdateOptions{})
+	tCtx.ExpectNoError(err, "add binding failure condition to second claim")
+
+	// Wait until the claim.status.Devices[0].Conditions become nil again after rescheduling.
+	setConditionsFlag := false
+	ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) *resourceapi.ResourceClaim {
+		c, err := tCtx.Client().ResourceV1().ResourceClaims(namespace).Get(tCtx, claim2.Name, metav1.GetOptions{})
+		tCtx.ExpectNoError(err, "get claim")
+		claim2 = c
+		if claim2.Status.Devices != nil && len(claim2.Status.Devices[0].Conditions) != 0 {
+			setConditionsFlag = true
+		}
+		if setConditionsFlag && len(claim2.Status.Devices) == 0 {
+			// The scheduler has retried and removed the conditions.
+			// This is the expected state. Finish waiting.
+			return nil
+		}
+		return claim2
+	}).WithTimeout(30*time.Second).WithPolling(time.Second).Should(gomega.BeNil(), "claim should not have any condition")
+
+	// Allow the scheduler to proceed.
+	claim2.Status.Devices = []resourceapi.AllocatedDeviceStatus{{
+		Driver: driverName,
+		Pool:   poolWithBinding,
+		Device: "with-binding",
+		Conditions: []metav1.Condition{{
+			Type:               bindingCondition,
+			Status:             metav1.ConditionTrue,
+			ObservedGeneration: claim2.Generation,
+			LastTransitionTime: metav1.Now(),
+			Reason:             "Testing",
+			Message:            "The test has seen the allocation.",
+		}},
+	}}
+
+	claim2, err = tCtx.Client().ResourceV1().ResourceClaims(namespace).UpdateStatus(tCtx, claim2, metav1.UpdateOptions{})
+	tCtx.ExpectNoError(err, "add binding condition to second claim")
+	err = waitForPodScheduled(tCtx, tCtx.Client(), namespace, pod.Name)
+	tCtx.ExpectNoError(err, "second pod scheduled")
+}
+
+func waitForPodScheduled(ctx context.Context, client kubernetes.Interface, namespace, podName string) error {
+	timeout := time.After(60 * time.Second)
+	tick := time.Tick(1 * time.Second)
+	for {
+		select {
+		case <-timeout:
+			return fmt.Errorf("timed out waiting for pod %s/%s to be scheduled", namespace, podName)
+		case <-tick:
+			pod, err := client.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{})
+			if err != nil {
+				continue
+			}
+			for _, cond := range pod.Status.Conditions {
+				if cond.Type == v1.PodScheduled && cond.Status == v1.ConditionTrue {
+					return nil
+				}
+			}
+		}
+	}
+}
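In the test, the conditions on claim.Status.Devices are set by hand; in a real cluster a DRA driver or an external binding controller would report them. The following is a rough sketch of such an update, assuming the same resource.k8s.io/v1 client used in the test and the condition helpers from k8s.io/apimachinery; the package, function name, and reason string are made up for illustration:

package example

import (
	"context"

	resourceapi "k8s.io/api/resource/v1" // assumed import path for the v1 types used in the test
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// markDeviceBound reports a binding condition as true on the allocated device's
// status entry and pushes it with UpdateStatus, which is the same API call the
// test above uses when it sets the conditions manually.
func markDeviceBound(ctx context.Context, client kubernetes.Interface, claim *resourceapi.ResourceClaim, driver, pool, device, conditionType string) (*resourceapi.ResourceClaim, error) {
	// Find the AllocatedDeviceStatus entry for this device, creating it if needed.
	var status *resourceapi.AllocatedDeviceStatus
	for i := range claim.Status.Devices {
		d := &claim.Status.Devices[i]
		if d.Driver == driver && d.Pool == pool && d.Device == device {
			status = d
			break
		}
	}
	if status == nil {
		claim.Status.Devices = append(claim.Status.Devices, resourceapi.AllocatedDeviceStatus{
			Driver: driver,
			Pool:   pool,
			Device: device,
		})
		status = &claim.Status.Devices[len(claim.Status.Devices)-1]
	}

	// SetStatusCondition fills in LastTransitionTime and replaces any existing
	// condition of the same type.
	apimeta.SetStatusCondition(&status.Conditions, metav1.Condition{
		Type:               conditionType,
		Status:             metav1.ConditionTrue,
		ObservedGeneration: claim.Generation,
		Reason:             "DeviceAttached", // hypothetical reason
		Message:            "binding condition satisfied",
	})

	return client.ResourceV1().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
}

A real controller would instead report one of the device's BindingFailureConditions when attachment fails, which is exactly the path the test forces in order to verify that the scheduler clears the device status and reschedules.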
