From d458e3f1b301e1a2a9b963aef1d78c69e5238290 Mon Sep 17 00:00:00 2001 From: Pawel Kepka Date: Wed, 10 Dec 2025 20:54:10 +0000 Subject: [PATCH 01/25] Topology-aware workload scheduling KEP --- .../README.md | 991 ++++++++++++++++++ .../kep.yaml | 43 + 2 files changed, 1034 insertions(+) create mode 100644 keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md create mode 100644 keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md new file mode 100644 index 000000000000..b697985daf73 --- /dev/null +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -0,0 +1,991 @@ +# KEP-5732: Topology-aware workload scheduling + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1: AI Training in a Single Rack](#story-1-ai-training-in-a-single-rack) + - [Story 2: Workload using Interconnected DRA Devices](#story-2-workload-using-interconnected-dra-devices) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Workload API Changes](#workload-api-changes) + - [Scheduling Framework Extensions](#scheduling-framework-extensions) + - [1. Data Structures](#1-data-structures) + - [2. 
New Plugin Interfaces](#2-new-plugin-interfaces) + - [Scheduling Algorithm Phases](#scheduling-algorithm-phases) + - [Phase 1: Candidate Placement Generation](#phase-1-candidate-placement-generation) + - [Phase 2: Pod-Level Filtering and Feasibility Check](#phase-2-pod-level-filtering-and-feasibility-check) + - [Phase 3: Placement Scoring and Selection](#phase-3-placement-scoring-and-selection) + - [Scheduler Plugins](#scheduler-plugins) + - [Potential Future Extensions (Beta Candidates)](#potential-future-extensions-beta-candidates) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Alpha](#alpha) + - [Beta](#beta) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + - [Pod Inter-Affinities](#pod-inter-affinities) + - [Standalone Schedulers (e.g., Volcano)](#standalone-schedulers-eg-volcano) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + +Items marked with (R) are required *prior to targeting to a milestone / release*. 
+ +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) KEP approvers have approved the KEP status as `implementable` +- [ ] (R) Design details are appropriately documented +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [ ] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [ ] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) within one minor version of promotion to GA +- [ ] (R) Production readiness review completed +- [ ] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + +This KEP describes the architectural design and implementation details for +integrating a Topology-Aware and DRA-Aware workload scheduling algorithm into +the Kubernetes kube-scheduler to address the complex placement requirements of +modern, high-performance distributed applications. 
+ +The proposed algorithm fundamentally alters the scheduling lifecycle for gang +scheduled workloads. Instead of evaluating pods individually against the cluster +state - a process prone to fragmentation and deadlocks - the new mechanism +generates "Placements". These Placements represent candidate domains (sets of +nodes or DRA resources) where the entire workload is theoretically feasible. The +scheduler then simulates the placement of the full group of pods within these +domains, utilizing existing filtering and scoring logic to ensure high-fidelity +decisions before committing resources. + +This design introduces specific extensions to the Kubernetes Workload API to +support `TopologyConstraints` and `DRAConstraints`, defines new interfaces +within the Scheduling Framework (`PlacementGenerator`, `PlacementState`, +`PlacementScorer`), and details the algorithmic flow required to schedule Pod +Groups while maintaining compatibility with the scheduler's existing ecosystem. + +## Motivation + +Distributed workloads, particularly those driving the current AI/ML era, often +require high-bandwidth and low-latency communication between multiple pods to +function efficiently. While the [KEP-4671: Workload API] makes the first step +towards managing these applications as cohesive units, it primarily establishes +the API structure. For workloads sensitive to inter-pod communication, simply +grouping pods is insufficient; their physical placement within the cluster's +network topology is a decisive factor in their performance. + +In this KEP, we propose an algorithm for topology-aware and DRA-aware scheduling +that operates directly within the Kubernetes kube-scheduler. The core objective +is to ensure that pods belonging to a Workload are co-located within optimal +topological domains - such as specific racks or blocks - or are bound to shared +Dynamic Resource Allocation (DRA) devices that require cohesive management. 
+Without this level of precision, workloads may be fragmented across disparate +network domains, drastically degrading performance and wasting the potential of +expensive hardware. + +Given the economics of high-performance accelerators and network infrastructure, +maximizing application performance and resource utilization is a primary goal +for users. Achieving this requires intelligent placement decisions that +understand the physical constraints of the cluster. However, the default +scheduler's current pod-centric logic lacks the native mechanisms to efficiently +resolve these complex group-level constraints during the scheduling cycle. + +Topology-aware scheduling is not a new concept and is currently addressed by +external admission control systems like Kueue or alternative schedulers like +Volcano. However, relying on external admission controllers decouples the +topology decision from the scheduler's core logic, while alternative schedulers +introduce operational complexity. We believe that embedding topology and DRA +awareness deeply into the kube-scheduler is critical enough to warrant +standardization. This integration allows the algorithm to leverage the full +fidelity of the scheduler's existing pod-level filtering and scoring plugins, +ensuring highly accurate feasibility checks and placement outcomes without the +need for external dependencies. + +### Goals + +- To enhance kube-scheduler to perform topology-aware and DRA-aware scheduling + for multi-pod workloads, as defined by the Workload API (KEP-4671). +- To optimize the placement of distributed workloads by co-locating pods based + on network topology and DRA resource availability. +- To introduce new extension points and phases within the Kubernetes scheduler + framework to support the concept of "Placements" (candidate sets of nodes + and DRA resources). +- To define the required changes to the Workload API (KEP-4671) to support + scheduling constraints. 
+- To leverage the scheduler's existing pod-level filtering and scoring logic + within the evaluation of each Placement. +- To provide a flexible framework extensible by plugins for various topology + sources (e.g., node labels) and resource types (e.g., DRA). + +### Non-Goals + +- To replace the functionality of external workload queueing and admission + control systems like Kueue. This proposal focuses on the in-scheduler + placement decision for a single Workload at a time. +- To implement Workload-level queueing, fairness, or resource quotas within + kube-scheduler. +- To handle all aspects of the workload lifecycle management beyond + scheduling. +- To implement Workload-level preemption logic. +- To integrate with cluster autoscaling mechanisms in this phase. +- To support complex multi-PodSet dependency resolution with backtracking or + parallel processing in the initial version. +- To automatically discover network topology; the mechanisms rely on topology + information being present (e.g., via node labels or DRA ResourceSlices). + +## Proposal + +This proposal introduces an API to define constraints on a PodGroup (a +collection of pods within a Workload) requiring it to be scheduled onto a +specific subset of nodes or resources. + +We support two fundamental types of constraints: + +1. **Topology Constraint (Node Label Co-location)**: Ensures all pods in a + PodGroup are placed onto nodes sharing a common topological characteristic + (e.g., same rack), defined by a specific node label. + +2. **DRA Constraint (Shared Dynamic Resource Allocation)**: Ensures all pods in a + PodGroup bind to a single DRA claim fulfilled from a single, shared, + co-located resource (e.g., interconnected network interfaces or + accelerators). + +The scheduler is extended to interpret these constraints and find a "Placement" +(a subset of nodes and DRA resources) that satisfies them. 
### User Stories (Optional)

#### Story 1: AI Training in a Single Rack

As a data scientist, I want to run a distributed training job where all pods
need to be located in the same server rack to minimize latency. I define a
`TopologyConstraint` on the Workload's PodGroup specifying the rack topology
label. The scheduler identifies a rack with sufficient capacity and schedules
all pods there at once.

#### Story 2: Workload using Interconnected DRA Devices

As a cluster administrator, I want to schedule a workload that requires a set of
specialized accelerators that are physically interconnected. I use a
`DRAConstraint` targeting a specific `ResourceClaimTemplate`. The scheduler
finds a set of DRA resources (ResourceSlice) that are co-located and binds the
workload's pods to them.

### Notes/Constraints/Caveats (Optional)

### Risks and Mitigations

- **Scheduling Latency:** Evaluating multiple placements involves running
  filter/score plugins multiple times.

  - **Mitigation:** Implement pre-filtering optimizations to reject infeasible
    placements early based on aggregate resource availability.

- **Complexity of Pod Group Scheduling:** Scheduling heterogeneous Pod Groups
  can be complex.

  - **Mitigation:** The initial version supports sequential processing of pods
    within a PodGroup, avoiding complex backtracking or parallel processing
    in the alpha release.

## Design Details

### Workload API Changes

The Workload API (KEP-4671) will be extended to allow specifying group-level
scheduling constraints. An optional `SchedulingConstraints` field is added to
the `PodGroup` spec.

```go
// PodGroup (definition from KEP-4671, with additions)
type PodGroup struct {
	Name *string

	// SchedulingConstraints defines group-level scheduling requirements,
	// including topology and DRA colocation.
+ SchedulingConstraints *PodGroupSchedulingConstraints +} + +// PodGroupSchedulingConstraints holds the scheduling constraints for the PodGroup. +type PodGroupSchedulingConstraints struct { + // TopologyConstraints specifies desired topological placements for all pods + // within this PodGroup. + TopologyConstraints []TopologyConstraint + + // DRAConstraints specifies constraints on how Dynamic Resources are allocated + // across the PodGroup. + DRAConstraints []DRAConstraint +} + +// TopologyConstraint describes a desired topological colocation for all pods in the PodGroup. +type TopologyConstraint struct { + // Level specifies the key of the node label representing the topology domain. + // All pods within the PodGroup must be colocated within the same domain instance. + // Examples: "topology.kubernetes.io/rack" + Level string +} + +// DRAConstraint provides constraints on how specific DRA claims across the group should +// be fulfilled. +type DRAConstraint struct { + // ResourceClaimName specifies the name of a specific ResourceClaim + // within the PodGroup's pods that this constraint applies to. + ResourceClaimName *string + + // ResourceClaimTemplateName specifies the name of a ResourceClaimTemplate. + // This applies to all ResourceClaim instances generated from this template. + ResourceClaimTemplateName *string +} +``` + +Note: For the initial alpha scope, only a single TopologyConstraint or +DRAConstraint will be supported. + +### Scheduling Framework Extensions + +The scheduler framework requires new plugin interfaces to handle "Placements". A +Placement represents a candidate domain (nodes and resources) for a PodGroup. + +#### 1. Data Structures + +```go +// PodGroupInfo holds information about a specific PodGroup within a Workload, +// including a reference to the Workload, the PodGroup's name, and its replica index. +// This struct is designed to be extensible with more fields in the future. 
+type PodGroupInfo struct { + // WorkloadRef is a reference to the parent Workload object. + WorkloadRef *workloadv1alpha1.Workload + + // PodGroupName is the name of the PodGroup. + PodGroupName string + + // PodGroupReplicaIndex is the index of the PodGroup replica, as defined in KEP-4671. + // This is relevant for PodGroups that have more than one replica. + PodGroupReplicaIndex int + + // -- Add other fields below for future extensions -- +} + +// PodSetInfo holds information about a specific PodSet within a PodGroup, +// primarily the list of Pods. +// This struct is designed to be extensible with more fields in the future. +type PodSetInfo struct { + // Pods is a list of Pod objects belonging to this PodSet. + Pods []*corev1.Pod + + // -- Add other fields below for future extensions -- +} + +// Placement represents a candidate domain for scheduling a PodSet. +// It defines a set of nodes and/or proposed Dynamic Resource Allocation (DRA) +// resource bindings necessary to satisfy the PodSet's requirements within that domain. +type Placement struct { + // NodeAffinity specifies the node constraints for this Placement. + // For Topology this is derived from topology labels (e.g., all nodes with label + // 'topology-rack: rack-1'). + // For DRA, this Affinity would be constructed based on nodeSelector from + // DRA's AllocationResult from DRAAllocations. + // All pods within the PodSet, when being evaluated against this Placement, + // are restricted to the nodes matching this NodeAffinity. + NodeAffinity *corev1.NodeAffinity + + // DRAAllocations details the proposed DRA resource assignments for + // the ResourceClaims made by the PodSet. This field is primarily used + // by DRA-aware plugins. + DRAAllocations []DraClaimAllocation +} + +// DraClaimAllocation maps a specific ResourceClaim name to a set of proposed +// device allocations. These allocations are tentative and used by the scheduler's +// AssumePlacement phase to simulate resource commitment. 
type DraClaimAllocation struct {
	// ResourceClaimName is the name of the ResourceClaim within the PodSet's context
	// that these allocations are intended to satisfy.
	ResourceClaimName string

	// Allocation contains a DRA AllocationResult structure, specifying devices
	// from ResourceSlices that are proposed to fulfill the ResourceClaim.
	// The scheduler will use this information in AssumePlacement to temporarily
	// consider these devices as allocated.
	Allocation dra.AllocationResult
}
```

#### 2. New Plugin Interfaces

**PlacementGenerator:** Generates candidate placements based on constraints.

```go
// PlacementGenerator is an interface for plugins that generate candidate Placements.
type PlacementGenerator interface {
	Name() string

	// GeneratePlacements generates a list of potential Placements for the given PodGroup and PodSet.
	// Each Placement represents a candidate set of resources (e.g., nodes matching a selector)
	// and potential DRA allocations where the PodSet might be scheduled.
	GeneratePlacements(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, parentPlacements []*Placement) ([]*Placement, *framework.Status)
}
```

**PlacementState:** Manages state changes (simulating binding) during
feasibility checks.

```go
// PlacementState is an interface for plugins that manage state changes
// when a Placement is being considered.
type PlacementState interface {
	Name() string

	// AssumePlacement temporarily configures the scheduling context to evaluate the feasibility
	// of the given Placement for the PodGroup and PodSet.
	AssumePlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, placement *Placement) *framework.Status

	// RevertPlacement reverts the temporary scheduling context changes made by AssumePlacement.
+ // This should be called after the evaluation of a Placement is complete to restore + // the scheduler's state and allow other Placements to be considered. + RevertPlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, placement *Placement) *framework.Status +} +``` + +**PlacementScorer:** Scores feasible placements to select the best one. + +```go +// PodSetAssignment represents the assignment of pods to nodes within a PodSet for a specific Placement. +type PodSetAssignment struct { + // PodToNodeMap maps a Pod name (string) to a Node name (string). + PodToNodeMap map[string]string +} + +// PlacementScorer is an interface for plugins that score feasible Placements. +type PlacementScorer interface { + Name() string + + // ScorePlacement calculates a score for a given Placement. This function is called in Phase 3 + // (Placement Scoring and Selection) only for Placements that have been deemed feasible + // for all pods in the PodSet during Phase 2. The PodSetAssignment indicates the + // node assigned to each pod within this Placement. The returned score is a float64, + // with higher scores generally indicating more preferable Placements. + // Plugins can implement various scoring strategies, such as bin packing to minimize + // resource fragmentation. + ScorePlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, placement *Placement, podsAssignment *PodSetAssignment) (float64, *framework.Status) +} +``` + +### Scheduling Algorithm Phases + +The algorithm proceeds in three main phases for a given Workload/PodGroup. + +#### Phase 1: Candidate Placement Generation + +- **Input:** PodGroupInfo and PodSetInfo. + +- **Action:** Iterate over distinct values of the topology label (TAS) or + available ResourceSlices (DRA). + +- **Output:** A list of Placement objects. + +- Example: If the label is rack, placements are generated for rack-1, rack-2, + etc. 
#### Phase 2: Pod-Level Filtering and Feasibility Check

- **Action:** For each generated Placement:

  1. Call `AssumePlacement` (binds context to the specific node selector/DRA
     resources).

  2. Iterate through every pod in the PodSet.

  3. Run standard Pod-level Filter and Score.

  4. Use internal logic to simulate placing the pod on a node.

  5. If all pods fit, the Placement is marked Feasible.

  6. Call `RevertPlacement`.

- **Potential Optimization:** Pre-filtering can check aggregate resources
  before running the full simulation.

- **Heterogeneous PodGroup Handling:** Sequential Processing will be used
  initially. Pods are processed sequentially; if any fail, the placement is
  rejected.

#### Phase 3: Placement Scoring and Selection

- **Action:** Call `ScorePlacement` for all feasible placements.

- **Selection:** Select the Placement with the highest score.

- **Binding:** Proceed to bind pods to the assigned nodes and resources.

### Scheduler Plugins

**TopologyPlacementPlugin (New)** Implements `PlacementGenerator`. Generates
Placements based on distinct values of the designated node label (TAS).

**DRAPlugin (Extension)** Extended to implement `PlacementGenerator` and
`PlacementState`.

- **Generator:** Returns Placements derived from available ResourceSlices
  satisfying shared claims.

- **State:** Temporarily assigns AllocationResults to ResourceClaims during
  the Assume phase.

**PlacementBinPackingPlugin (New)** Implements `PlacementScorer`. Scores
Placements to maximize utilization (tightest fit) and minimize fragmentation.

### Potential Future Extensions (Beta Candidates)

The following features are out of scope for the initial Alpha implementation but
are considered for future releases (post-1.36):

1. **Prioritized Placement Scheduling:** Allowing a set of preferred placements
   with fallbacks (e.g., prefer Rack, fallback to Block).
This would introduce + a Rank field to the Placement struct. + +2. **Optional/Preferred Scheduling Constraints:** Constraints that serve purely + as scoring mechanisms without hard requirements. + +3. **Multi-level Scheduling Constraints:** Handling nested constraints (e.g., + Block -> Rack). This would involve iterative placement generation and a + Parent field in the Placement struct. + +4. **Pod Group Replicas Support:** Optimizing scheduling for identical + PodGroups (replicas) by scheduling the maximum feasible number of replicas + within a single placement pass. + +5. **Explicit Topology Definition:** Using a Custom Resource (NodeTopology) to + define and alias topology levels, removing the need for users to know exact + node label keys. + +### Test Plan + +[ ] I/we understand the owners of the involved components may require updates to +existing tests to make this code solid enough prior to committing the changes +necessary to implement this enhancement. + +#### Prerequisite testing updates + +#### Unit tests + +- PlacementGenerator: Test generation of placements for various topology + labels and DRA ResourceSlices. + +- PlacementState: Verify AssumePlacement and RevertPlacement correctly modify + and restore the CycleState. + +- Algorithm Logic: Test the sequential processing of Placements and the + selection logic based on scores. + +- DRA Integration: specific tests for DRAConstraint resolution. + +#### Integration tests + +- Topology Awareness: Verify that pods with TopologyConstraint are correctly + co-located on nodes sharing the label. + +- DRA Awareness: Verify that pods with DRAConstraint are bound to shared + ResourceSlices. + +- Infeasibility: Verify that Workloads remain pending if no Placement + satisfies the constraints. + +#### e2e tests + +- End-to-End Workload Scheduling: Submit a Workload with TopologyConstraint + (e.g., Rack) and verify all pods land on the same rack. 
+ +- DRA Co-location: Submit a Workload requiring shared DRA devices and verify + correct allocation and placement. + +### Graduation Criteria + +#### Alpha + +- Feature implemented behind a feature flag. +- PodGroupSchedulingConstraints API defined. +- Basic topology (Node Label) and DRA constraints working. +- Initial unit and integration tests. + +#### Beta + +- Support for "Potential Future Extensions" (Prioritized placement, etc.) + evaluated. +- Scalability tests on large clusters with high placement counts. +- Comprehensive e2e testing. + +### Upgrade / Downgrade Strategy + +This KEP is additive and can safely fallback to the original behavior on +downgrade. + +When a user upgrades the cluster to the version which supports topology-aware +workload scheduling: + +- they can enable scheduling plugins implementing new Scheduling Framework + interfaces in kube-scheduler config +- they can start using the new API to create Workload objects with + `schedulingConstraints` field +- scheduler will use enabled plugins to generate placements for Workload and + check their feasibility + +When user downgrades the cluster to the version that no longer supports +topology-aware workload scheduling: + +- the `schedulingConstraints` field can no longer be set on the Workloads + (the already set fields continue to be set though) +- scheduler will revert to the original behavior of scheduling pods belonging + to a gang, without considering different potential placements. + +### Version Skew Strategy + +The feature is limited to the control plane, so the version skew with nodes +(kubelets) doesn't matter. + +For the API changes, the old version of components (in particular +kube-apiserver) may not handle those. Thus, users should not set those fields +before confirming all control-plane instances were upgraded to the version +supporting those. 
+ +For the topology-aware workload scheduling itself, this is purely kube-scheduler +in-memory feature, so the skew doesn't matter (as there is always only a single +kube-scheduler instance being a leader). + +## Production Readiness Review Questionnaire + + + +### Feature Enablement and Rollback + + + +###### How can this feature be enabled / disabled in a live cluster? + + + +- [ ] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: + - Components depending on the feature gate: +- [ ] Other + - Describe the mechanism: + - Will enabling / disabling the feature require downtime of the control + plane? + - Will enabling / disabling the feature require downtime or reprovisioning + of a node? + +###### Does enabling the feature change any default behavior? + + + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? + + + +###### What happens if we reenable the feature if it was previously rolled back? + +###### Are there any tests for feature enablement/disablement? + + + +### Rollout, Upgrade and Rollback Planning + + + +###### How can a rollout or rollback fail? Can it impact already running workloads? + + + +###### What specific metrics should inform a rollback? + + + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? + + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + + + +### Monitoring Requirements + + + +###### How can an operator determine if the feature is in use by workloads? + + + +###### How can someone using this feature know that it is working for their instance? + + + +- [ ] Events + - Event Reason: +- [ ] API .status + - Condition name: + - Other field: +- [ ] Other (treat as last resort) + - Details: + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? 
+ + + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + + + +- [ ] Metrics + - Metric name: + - [Optional] Aggregation method: + - Components exposing the metric: +- [ ] Other (treat as last resort) + - Details: + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + + + +### Scalability + + + +###### Will enabling / using this feature result in any new API calls? + + + +###### Will enabling / using this feature result in introducing new API types? + + + +###### Will enabling / using this feature result in any new calls to the cloud provider? + + + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? + + + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + + + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + + + +###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? + + + +### Troubleshooting + + + +###### How does this feature react if the API server and/or etcd is unavailable? + +###### What are other known failure modes? + + + +###### What steps should be taken if SLOs are not being met to determine the problem? + +## Implementation History + + + +## Drawbacks + +- **Complexity:** This proposal adds significant logic to the kube-scheduler + framework, specifically the "Placement" abstraction and the simulation loop + (Phase 2). + +- **Performance:** Generating and simulating a large number of Placements + (e.g., every rack in a massive cluster) could be computationally expensive. 
+ + - **Mitigation:** Pre-filtering of Placements will be implemented to discard + clearly infeasible Placements (insufficient total resources) before the + expensive pod-level simulation. + +## Alternatives + +### Pod Inter-Affinities + +Currently, users may attempt to simulate gang scheduling using podAffinity (to +co-locate pods) or podAntiAffinity. + +- **Pros:** Native to Kubernetes, no new CRDs. +- **Cons:** Affinity is evaluated per-Pod at the time of that Pod's + scheduling. It does not look ahead. This means that the scheduler might + place the first Pod on a node that satisfies its immediate affinity needs + but prevents the rest of the group from scheduling (e.g., locking a topology + domain that is too small for the rest of the group). + +### Standalone Schedulers (e.g., Volcano) + +Users can run a secondary scheduler like Volcano or Yunikorn. + +- **Pros:** Feature-rich, mature for batch workloads. +- **Cons:** Operationally complex (two schedulers), race conditions when + sharing cluster resources, and lack of integration with standard Kubernetes + features like common admission controllers or newer features like DRA + (initially). + +## Infrastructure Needed (Optional) + + diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml new file mode 100644 index 000000000000..1a54a85aa9ab --- /dev/null +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml @@ -0,0 +1,43 @@ +title: Topology-aware workload scheduling +kep-number: 5732 +authors: + - "@44past4" +owning-sig: sig-scheduling +participating-sigs: +status: provisional +creation-date: 2025-12-10 +reviewers: + - +approvers: + - + +see-also: + - "/keps/sig-scheduling/4671-gang-scheduling" + +# The target maturity stage in the current dev cycle for this KEP. 
+# If the purpose of this KEP is to deprecate a user-visible feature +# and a Deprecated feature gates are added, they should be deprecated|disabled|removed. +stage: alpha + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.36" + +# The milestone at which this feature was, or is targeted to be, at each stage. +milestone: + alpha: "v1.36" + beta: "v1.37" + stable: "v1.39" + +# The following PRR answers are required at alpha release +# List the feature gate name and the components for which it must be enabled +feature-gates: + - name: TopologyAwareWorkloadScheduling + components: + - kube-apiserver + - kube-scheduler +disable-supported: true + +# The following PRR answers are required at beta release +metrics: From 493cc9a5e9b96f952196be943aead139b7ae1c31 Mon Sep 17 00:00:00 2001 From: Pawel Kepka Date: Thu, 11 Dec 2025 07:25:39 +0000 Subject: [PATCH 02/25] Fixed Toc --- .../README.md | 93 +++++++++---------- 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index b697985daf73..756e9f232b0b 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -1,53 +1,52 @@ # KEP-5732: Topology-aware workload scheduling - -- [Release Signoff Checklist](#release-signoff-checklist) -- [Summary](#summary) -- [Motivation](#motivation) - - [Goals](#goals) - - [Non-Goals](#non-goals) -- [Proposal](#proposal) - - [User Stories (Optional)](#user-stories-optional) - - [Story 1: AI Training in a Single Rack](#story-1-ai-training-in-a-single-rack) - - [Story 2: Workload using Interconnected DRA Devices](#story-2-workload-using-interconnected-dra-devices) - - [Notes/Constraints/Caveats 
(Optional)](#notesconstraintscaveats-optional) - - [Risks and Mitigations](#risks-and-mitigations) -- [Design Details](#design-details) - - [Workload API Changes](#workload-api-changes) - - [Scheduling Framework Extensions](#scheduling-framework-extensions) - - [1. Data Structures](#1-data-structures) - - [2. New Plugin Interfaces](#2-new-plugin-interfaces) - - [Scheduling Algorithm Phases](#scheduling-algorithm-phases) - - [Phase 1: Candidate Placement Generation](#phase-1-candidate-placement-generation) - - [Phase 2: Pod-Level Filtering and Feasibility Check](#phase-2-pod-level-filtering-and-feasibility-check) - - [Phase 3: Placement Scoring and Selection](#phase-3-placement-scoring-and-selection) - - [Scheduler Plugins](#scheduler-plugins) - - [Potential Future Extensions (Beta Candidates)](#potential-future-extensions-beta-candidates) - - [Test Plan](#test-plan) - - [Prerequisite testing updates](#prerequisite-testing-updates) - - [Unit tests](#unit-tests) - - [Integration tests](#integration-tests) - - [e2e tests](#e2e-tests) - - [Graduation Criteria](#graduation-criteria) - - [Alpha](#alpha) - - [Beta](#beta) - - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) - - [Version Skew Strategy](#version-skew-strategy) -- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) - - [Feature Enablement and Rollback](#feature-enablement-and-rollback) - - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) - - [Monitoring Requirements](#monitoring-requirements) - - [Dependencies](#dependencies) - - [Scalability](#scalability) - - [Troubleshooting](#troubleshooting) -- [Implementation History](#implementation-history) -- [Drawbacks](#drawbacks) -- [Alternatives](#alternatives) - - [Pod Inter-Affinities](#pod-inter-affinities) - - [Standalone Schedulers (e.g., Volcano)](#standalone-schedulers-eg-volcano) -- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) - +- [Release Signoff 
Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1: AI Training in a Single Rack](#story-1-ai-training-in-a-single-rack) + - [Story 2: Workload using Interconnected DRA Devices](#story-2-workload-using-interconnected-dra-devices) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Workload API Changes](#workload-api-changes) + - [Scheduling Framework Extensions](#scheduling-framework-extensions) + - [1. Data Structures](#1-data-structures) + - [2. New Plugin Interfaces](#2-new-plugin-interfaces) + - [Scheduling Algorithm Phases](#scheduling-algorithm-phases) + - [Phase 1: Candidate Placement Generation](#phase-1-candidate-placement-generation) + - [Phase 2: Pod-Level Filtering and Feasibility Check](#phase-2-pod-level-filtering-and-feasibility-check) + - [Phase 3: Placement Scoring and Selection](#phase-3-placement-scoring-and-selection) + - [Scheduler Plugins](#scheduler-plugins) + - [Potential Future Extensions (Beta Candidates)](#potential-future-extensions-beta-candidates) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Alpha](#alpha) + - [Beta](#beta) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - 
[Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + - [Pod Inter-Affinities](#pod-inter-affinities) + - [Standalone Schedulers (e.g., Volcano)](#standalone-schedulers-eg-volcano) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + ## Release Signoff Checklist From 52fa7c962416a86c58823d5f599aa6c3528ae73d Mon Sep 17 00:00:00 2001 From: Pawel Kepka Date: Thu, 11 Dec 2025 11:09:31 +0000 Subject: [PATCH 03/25] Added KEP reviewers and approvers --- .../5732-topology-aware-workload-scheduling/kep.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml index 1a54a85aa9ab..912641c56da0 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml @@ -7,9 +7,13 @@ participating-sigs: status: provisional creation-date: 2025-12-10 reviewers: - - + - sanposhiho + - dom4ha + - macsko + - wojtek-t approvers: - - + - sanposhiho + - dom4ha see-also: - "/keps/sig-scheduling/4671-gang-scheduling" From 3dbf9b4369f75a8a77562f02bf15952f529d09ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 16 Dec 2025 14:25:51 +0100 Subject: [PATCH 04/25] Initial batch of fixes after reviews --- .../README.md | 61 +++++++++++-------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 756e9f232b0b..529cdd7db0d7 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -255,6 
+255,7 @@ type PodGroupSchedulingConstraints struct {
 type TopologyConstraint struct {
 	// Level specifies the key of the node label representing the topology domain.
 	// All pods within the PodGroup must be colocated within the same domain instance.
+	// Different replicas of the PodGroup can land on different domain instances.
 	// Examples: "topology.kubernetes.io/rack"
 	Level string
 }
@@ -297,11 +298,15 @@ type PodGroupInfo struct {
 	// This is relevant for PodGroups that have more than one replica.
 	PodGroupReplicaIndex int
 
+	// PodSets is a list of PodSet objects within this PodGroup.
+	PodSets []*PodSetInfo
+
 	// -- Add other fields below for future extensions --
 }
 
 // PodSetInfo holds information about a specific PodSet within a PodGroup,
 // primarily the list of Pods.
+// Pods within a PodSet must be homogeneous (using the semantic defined in KEP-5598).
 // This struct is designed to be extensible with more fields in the future.
 type PodSetInfo struct {
 	// Pods is a list of Pod objects belonging to this PodSet.
@@ -310,21 +315,23 @@ type PodSetInfo struct {
 
 	// -- Add other fields below for future extensions --
 }
 
-// Placement represents a candidate domain for scheduling a PodSet.
+// Placement represents a candidate domain for scheduling a PodGroup.
 // It defines a set of nodes and/or proposed Dynamic Resource Allocation (DRA)
-// resource bindings necessary to satisfy the PodSet's requirements within that domain.
+// resource bindings necessary to satisfy the PodGroup's requirements within that domain.
+// Placement is valid only in the context of a given PodGroup for a single cycle of
+// workload scheduling.
 type Placement struct {
 	// NodeAffinity specifies the node constraints for this Placement.
 	// For Topology this is derived from topology labels (e.g., all nodes with label
 	// 'topology-rack: rack-1').
 	// For DRA, this Affinity would be constructed based on nodeSelector from
 	// DRA's AllocationResult from DRAAllocations. 
- // All pods within the PodSet, when being evaluated against this Placement, + // All pods within the PodGroup, when being evaluated against this Placement, // are restricted to the nodes matching this NodeAffinity. NodeAffinity *corev1.NodeAffinity // DRAAllocations details the proposed DRA resource assignments for - // the ResourceClaims made by the PodSet. This field is primarily used + // the ResourceClaims made by the PodGroup. This field is primarily used // by DRA-aware plugins. DRAAllocations []DraClaimAllocation } @@ -333,8 +340,8 @@ type Placement struct { // device allocations. These allocations are tentative and used by the scheduler's // AssumePlacement phase to simulate resource commitment. type DraClaimAllocation struct { - // ResourceClaimName is the name of the ResourceClaim within the PodSet's context - // that these allocations are intended to satisfy. + // ResourceClaimName is the name of the ResourceClaim within the PodGroup's + // context that these allocations are intended to satisfy. ResourceClaimName string // Allocation contains DRA AllocationResult structures, specifying devices @@ -354,10 +361,10 @@ type DraClaimAllocation struct { type PlacementGenerator interface { Name() string - // GeneratePlacements generates a list of potential Placements for the given PodGroup and PodSet. + // GeneratePlacements generates a list of potential Placements for the given PodGroup. // Each Placement represents a candidate set of resources (e.g., nodes matching a selector) - // and potential DRA allocations where the PodSet might be scheduled. - GeneratePlacements(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, parentPlacements []*Placement) ([]*Placement, *framework.Status) + // and potential DRA allocations where the PodGroup might be scheduled. 
+ GeneratePlacements(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, parentPlacements []*Placement) ([]*Placement, *framework.Status) } ``` @@ -371,21 +378,21 @@ type PlacementState interface { Name() string // AssumePlacement temporarily configures the scheduling context to evaluate the feasibility - // of the given Placement for the PodGroup and PodSet. - AssumePlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, placement *Placement) *framework.Status + // of the given Placement for the PodGroup. + AssumePlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, placement *Placement) *framework.Status // RevertPlacement reverts the temporary scheduling context changes made by AssumePlacement. // This should be called after the evaluation of a Placement is complete to restore // the scheduler's state and allow other Placements to be considered. - RevertPlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, placement *Placement) *framework.Status + RevertPlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, placement *Placement) *framework.Status } ``` **PlacementScorer:** Scores feasible placements to select the best one. ```go -// PodSetAssignment represents the assignment of pods to nodes within a PodSet for a specific Placement. -type PodSetAssignment struct { +// PodGroupAssignment represents the assignment of pods to nodes within a PodGroup for a specific Placement. +type PodGroupAssignment struct { // PodToNodeMap maps a Pod name (string) to a Node name (string). PodToNodeMap map[string]string } @@ -396,12 +403,12 @@ type PlacementScorer interface { // ScorePlacement calculates a score for a given Placement. This function is called in Phase 3 // (Placement Scoring and Selection) only for Placements that have been deemed feasible - // for all pods in the PodSet during Phase 2. 
The PodSetAssignment indicates the + // for all pods in the PodGroup during Phase 2. The PodGroupAssignment indicates the // node assigned to each pod within this Placement. The returned score is a float64, // with higher scores generally indicating more preferable Placements. // Plugins can implement various scoring strategies, such as bin packing to minimize // resource fragmentation. - ScorePlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, placement *Placement, podsAssignment *PodSetAssignment) (float64, *framework.Status) + ScorePlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, placement *Placement, podsAssignment *PodGroupAssignment) (float64, *framework.Status) } ``` @@ -411,7 +418,7 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. #### Phase 1: Candidate Placement Generation -- **Input:** PodGroupInfo and PodSetInfo. +- **Input:** PodGroupInfo. - **Action:** Iterate over distinct values of the topology label (TAS) or available ResourceSlices (DRA). @@ -428,7 +435,7 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. 1. Call `AssumePlacement` (binds context to the specific node selector/DRA resources). - 2. Iterate through every pod in the PodSet. + 2. Iterate through every pod in the PodGroup. 3. Run standard Pod-level Filter and Score. @@ -439,9 +446,9 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. 6. Call `RevertPlacement`. - **Potential Optimization:** Pre-filtering can check aggregate resources - before running the full simulation. + requested by PodGroup Pods before running the full simulation. -- **Heterogeneous PodGroup Handling**: Sequential Processing will be used +- **Heterogeneous PodGroup Handling**: Sequential processing will be used initially. Pods are processed sequentially; if any fail, the placement is rejected. 
@@ -451,7 +458,9 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. - **Selection:** Select the Placement with the highest score. -- **Binding:** Proceed to bind pods to the assigned nodes and resources. +- **Binding:** Proceed to bind pods to the assigned nodes and resources using + pod-by-pod scheduling logic with each pod prebound to the selected node + by seting `nominatedNodeName` value. ### Scheduler Plugins @@ -470,10 +479,10 @@ Placements based on distinct values of the designated node label (TAS) . **PlacementBinPackingPlugin (New)** Implements `PlacementScorer`. Scores Placements to maximize utilization (tightest fit) and minimize fragmentation. -### Potential Future Extensions (Beta Candidates) +### Potential Future Extensions -The following features are out of scope for the initial Alpha implementation but -are considered for future releases (post-1.36): +The following features are out of scope for this KEP but are considered for +future separate KEPs improving and extending the proposed functionality: 1. **Prioritized Placement Scheduling:** Allowing a set of preferred placements with fallbacks (e.g., prefer Rack, fallback to Block). This would introduce @@ -494,6 +503,10 @@ are considered for future releases (post-1.36): define and alias topology levels, removing the need for users to know exact node label keys. +6. **Feasible Placements Limit:** Adding an option to provide a limit on the + number of feasible Placements which need to be found before moving to + Phase 3: Placement Scoring and Selection. 
+ ### Test Plan [ ] I/we understand the owners of the involved components may require updates to From 03b0c3277ef570b2fa7675cc31941a3eaba97b72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Wed, 17 Dec 2025 00:27:11 +0100 Subject: [PATCH 05/25] Move DRA constraints support to beta --- .../README.md | 89 +++++++++++-------- 1 file changed, 50 insertions(+), 39 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 529cdd7db0d7..4ce902754af0 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -102,11 +102,11 @@ Groups while maintaining compatibility with the scheduler's existing ecosystem. Distributed workloads, particularly those driving the current AI/ML era, often require high-bandwidth and low-latency communication between multiple pods to -function efficiently. While the [KEP-4671: Workload API] makes the first step -towards managing these applications as cohesive units, it primarily establishes -the API structure. For workloads sensitive to inter-pod communication, simply -grouping pods is insufficient; their physical placement within the cluster's -network topology is a decisive factor in their performance. +function efficiently. While the [KEP-4671: Workload API](https://kep.k8s.io/4671) +makes the first step towards managing these applications as cohesive units, it +primarily establishes the API structure. For workloads sensitive to inter-pod +communication, simply grouping pods is insufficient; their physical placement +within the cluster's network topology is a decisive factor in their performance. In this KEP, we propose an algorithm for topology-aware and DRA-aware scheduling that operates directly within the Kubernetes kube-scheduler. 
The core objective @@ -137,15 +137,16 @@ need for external dependencies. ### Goals -- To enhance kube-scheduler to perform topology-aware and DRA-aware scheduling - for multi-pod workloads, as defined by the Workload API (KEP-4671). +- To enhance kube-scheduler to be able to perform topology-aware and DRA-aware + scheduling for multi-pod workloads, as defined by the Workload API + ([KEP-4671](https://kep.k8s.io/4671)). - To optimize the placement of distributed workloads by co-locating pods based on network topology and DRA resource availability. - To introduce new extension points and phases within the Kubernetes scheduler framework to support the concept of "Placements" (candidate sets of nodes and DRA resources). - To define the required changes to the Workload API (KEP-4671) to support - scheduling constraints. + Topology scheduling constraints. - To leverage the scheduler's existing pod-level filtering and scoring logic within the evaluation of each Placement. - To provide a flexible framework extensible by plugins for various topology @@ -153,6 +154,10 @@ need for external dependencies. ### Non-Goals +- To define the required changes to the Workload API (KEP-4671) to support + ResourceClaims for DRA-aware workload scheduling. These changes will be + proposed in a separate KEP: + [KEP-5729: DRA: ResourceClaim Support for Workloads](https://github.com/kubernetes/enhancements/pull/5736) - To replace the functionality of external workload queueing and admission control systems like Kueue. This proposal focuses on the in-scheduler placement decision for a single Workload at a time. @@ -245,10 +250,6 @@ type PodGroupSchedulingConstraints struct { // TopologyConstraints specifies desired topological placements for all pods // within this PodGroup. TopologyConstraints []TopologyConstraint - - // DRAConstraints specifies constraints on how Dynamic Resources are allocated - // across the PodGroup. 
- DRAConstraints []DRAConstraint } // TopologyConstraint describes a desired topological colocation for all pods in the PodGroup. @@ -259,22 +260,15 @@ type TopologyConstraint struct { // Examples: "topology.kubernetes.io/rack" Level string } - -// DRAConstraint provides constraints on how specific DRA claims across the group should -// be fulfilled. -type DRAConstraint struct { - // ResourceClaimName specifies the name of a specific ResourceClaim - // within the PodGroup's pods that this constraint applies to. - ResourceClaimName *string - - // ResourceClaimTemplateName specifies the name of a ResourceClaimTemplate. - // This applies to all ResourceClaim instances generated from this template. - ResourceClaimTemplateName *string -} ``` -Note: For the initial alpha scope, only a single TopologyConstraint or -DRAConstraint will be supported. +The Workload API changes for DRA-aware scheduling, including the definition of +DRA constraints, are out of scope for the alpha version of this KEP. These changes +will be defined in a separate KEP: +[KEP-5729: DRA: ResourceClaim Support for Workloads](https://github.com/kubernetes/enhancements/pull/5736). + +Note: For the initial alpha scope, only a single TopologyConstraint will be +supported. ### Scheduling Framework Extensions @@ -421,7 +415,7 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. - **Input:** PodGroupInfo. - **Action:** Iterate over distinct values of the topology label (TAS) or - available ResourceSlices (DRA). + available Devices (DRA). - **Output:** A list of Placement objects. @@ -465,19 +459,36 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. ### Scheduler Plugins **TopologyPlacementPlugin (New)** Implements `PlacementGenerator`. Generates -Placements based on distinct values of the designated node label (TAS) . +Placements based on distinct values of the designated node label (TAS). 
-**DRAPlugin (Extension)** Extended to implement `PlacementGenerator` and -`PlacementState`. +**PlacementBinPackingPlugin (New)** Implements `PlacementScorer`. Scores +Placements to maximize utilization (tightest fit) and minimize fragmentation. -- **Generator:** Returns Placements derived from available ResourceSlices - satisfying shared claims. +**DRATestPlugin (New)** Implements `PlacementGenerator` and `PlacementState` +and is used only for testing the algorithm's support for DRA-aware scheduling. -- **State:** Temporarily assigns AllocationResults to ResourceClaims during - the Assume phase. +- **Generator:** Returns Placements derived from available Devices satisfying + claims shared by all Pods within a PodGroup. -**PlacementBinPackingPlugin (New)** Implements `PlacementScorer`. Scores -Placements to maximize utilization (tightest fit) and minimize fragmentation. +- **State:** Temporarily assigns AllocationResults to Devices during the + Assume phase. + +### Beta Extensions + +The beta version of this KEP will introduce full support for DRA-aware workload +scheduling. This enhancement will enable the scheduler to consider DRA claims +defined by users when making placement decisions, ensuring that workloads are +placed on nodes that can satisfy their resource requirements. This will be +achieved by using the API to be defined in +[KEP-5729: DRA: ResourceClaim Support for Workloads](https://github.com/kubernetes/enhancements/pull/5736). + +The implementation will build upon the extension points introduced in the +alpha version of this feature and the `DRATestPlugin` implementation. +Specifically, the `DRAPlugin` will be enhanced to generate placements based +on the ResourceClaim objects associated with the PodGroup. The plugin will +interact with the DRA framework to ensure that the selected placement can +satisfy the resource requirements of the workload, as expressed in its +ResourceClaim. 
### Potential Future Extensions @@ -526,15 +537,15 @@ necessary to implement this enhancement. - Algorithm Logic: Test the sequential processing of Placements and the selection logic based on scores. -- DRA Integration: specific tests for DRAConstraint resolution. +- DRA Integration: specific tests for DRATestPlugin plugin. #### Integration tests - Topology Awareness: Verify that pods with TopologyConstraint are correctly co-located on nodes sharing the label. -- DRA Awareness: Verify that pods with DRAConstraint are bound to shared - ResourceSlices. +- DRA Awareness: Verify that pods with shared ResourceClaims are bound to shared + Devices. - Infeasibility: Verify that Workloads remain pending if no Placement satisfies the constraints. From 2e35c86a27419f9b9b4cf4e3876948534c3de02f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Wed, 17 Dec 2025 00:45:48 +0100 Subject: [PATCH 06/25] Fix TOC --- .../5732-topology-aware-workload-scheduling/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 4ce902754af0..738e64ba39fc 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -22,7 +22,8 @@ - [Phase 2: Pod-Level Filtering and Feasibility Check](#phase-2-pod-level-filtering-and-feasibility-check) - [Phase 3: Placement Scoring and Selection](#phase-3-placement-scoring-and-selection) - [Scheduler Plugins](#scheduler-plugins) - - [Potential Future Extensions (Beta Candidates)](#potential-future-extensions-beta-candidates) + - [Beta Extensions](#beta-extensions) + - [Potential Future Extensions](#potential-future-extensions) - [Test Plan](#test-plan) - [Prerequisite testing updates](#prerequisite-testing-updates) - [Unit tests](#unit-tests) From 
1d62a849571e92747f7a0ea902805747a16e48d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Mon, 22 Dec 2025 10:58:33 +0100 Subject: [PATCH 07/25] Smaller fixed based on review feedback. --- .../5732-topology-aware-workload-scheduling/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 738e64ba39fc..78b47c959dc6 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -455,7 +455,7 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. - **Binding:** Proceed to bind pods to the assigned nodes and resources using pod-by-pod scheduling logic with each pod prebound to the selected node - by seting `nominatedNodeName` value. + by setting `nominatedNodeName` value. ### Scheduler Plugins @@ -507,7 +507,7 @@ future separate KEPs improving and extending the proposed functionality: Block -> Rack). This would involve iterative placement generation and a Parent field in the Placement struct. -4. **Pod Group Replicas Support:** Optimizing scheduling for identical +4. **Pod Group Replicas Optimization:** Optimizing scheduling for identical PodGroups (replicas) by scheduling the maximum feasible number of replicas within a single placement pass. @@ -565,11 +565,12 @@ necessary to implement this enhancement. - Feature implemented behind a feature flag. - PodGroupSchedulingConstraints API defined. -- Basic topology (Node Label) and DRA constraints working. +- Basic topology (Node Label) working. - Initial unit and integration tests. #### Beta +- DRA constraints working. - Support for "Potential Future Extensions" (Prioritized placement, etc.) evaluated. - Scalability tests on large clusters with high placement counts. 
From c8cf6578faebd932f5315caa528d5cee5315edc9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?=
Date: Mon, 22 Dec 2025 11:22:16 +0100
Subject: [PATCH 08/25] Update Explicit Topology Definition description.

---
 .../5732-topology-aware-workload-scheduling/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md
index 78b47c959dc6..2664e398f385 100644
--- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md
+++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md
@@ -513,7 +513,7 @@ future separate KEPs improving and extending the proposed functionality:
 5. **Explicit Topology Definition:** Using a Custom Resource (NodeTopology) to
    define and alias topology levels, removing the need for users to know exact
-   node label keys.
+   node label keys and opening additional optimization and validation options.
 
 6. **Feasible Placements Limit:** Adding an option to provide a limit on the
    number of feasible Placements which need to be found before moving to
    Phase 3: Placement Scoring and Selection.

From 0173704bfab9f4e7039f872224343b83cd6a5cea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?=
Date: Thu, 8 Jan 2026 10:34:00 +0100
Subject: [PATCH 09/25] Added Plugin suffix to PlacementGenerator, PlacementState and PlacementScorer

---
 .../README.md | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md
index 2664e398f385..6df7f5454af9 100644
--- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md
+++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md
@@ -95,8 
This design introduces specific extensions to the Kubernetes Workload API to support `TopologyConstraints` and `DRAConstraints`, defines new interfaces -within the Scheduling Framework (`PlacementGenerator`, `PlacementState`, -`PlacementScorer`), and details the algorithmic flow required to schedule Pod +within the Scheduling Framework (`PlacementGeneratorPlugin`, `PlacementStatePlugin`, +`PlacementScorerPlugin`), and details the algorithmic flow required to schedule Pod Groups while maintaining compatibility with the scheduler's existing ecosystem. ## Motivation @@ -349,11 +349,11 @@ type DraClaimAllocation struct { #### 2. New Plugin Interfaces -**PlacementGenerator:** Generates candidate placements based on constraints. +**PlacementGeneratorPlugin:** Generates candidate placements based on constraints. ```go -// PlacementGenerator is an interface for plugins that generate candidate Placements. -type PlacementGenerator interface { +// PlacementGeneratorPlugin is an interface for plugins that generate candidate Placements. +type PlacementGeneratorPlugin interface { Name() string // GeneratePlacements generates a list of potential Placements for the given PodGroup. @@ -363,13 +363,13 @@ type PlacementGenerator interface { } ``` -**PlacementState:** Manages state changes (simulating binding) during +**PlacementStatePlugin:** Manages state changes (simulating binding) during feasibility checks. ```go -// PlacementState is an interface for plugins that manage state changes +// PlacementStatePlugin is an interface for plugins that manage state changes // when a Placement is being considered. -type PlacementState interface { +type PlacementStatePlugin interface { Name() string // AssumePlacement temporarily configures the scheduling context to evaluate the feasibility @@ -383,7 +383,7 @@ type PlacementState interface { } ``` -**PlacementScorer:** Scores feasible placements to select the best one. 
+**PlacementScorerPlugin:** Scores feasible placements to select the best one. ```go // PodGroupAssignment represents the assignment of pods to nodes within a PodGroup for a specific Placement. @@ -392,8 +392,8 @@ type PodGroupAssignment struct { PodToNodeMap map[string]string } -// PlacementScorer is an interface for plugins that score feasible Placements. -type PlacementScorer interface { +// PlacementScorerPlugin is an interface for plugins that score feasible Placements. +type PlacementScorerPlugin interface { Name() string // ScorePlacement calculates a score for a given Placement. This function is called in Phase 3 @@ -459,13 +459,13 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. ### Scheduler Plugins -**TopologyPlacementPlugin (New)** Implements `PlacementGenerator`. Generates +**TopologyPlacementPlugin (New)** Implements `PlacementGeneratorPlugin`. Generates Placements based on distinct values of the designated node label (TAS). -**PlacementBinPackingPlugin (New)** Implements `PlacementScorer`. Scores +**PlacementBinPackingPlugin (New)** Implements `PlacementScorerPlugin`. Scores Placements to maximize utilization (tightest fit) and minimize fragmentation. -**DRATestPlugin (New)** Implements `PlacementGenerator` and `PlacementState` +**DRATestPlugin (New)** Implements `PlacementGeneratorPlugin` and `PlacementStatePlugin` and is used only for testing the algorithm's support for DRA-aware scheduling. - **Generator:** Returns Placements derived from available Devices satisfying @@ -529,10 +529,10 @@ necessary to implement this enhancement. #### Unit tests -- PlacementGenerator: Test generation of placements for various topology +- PlacementGeneratorPlugin: Test generation of placements for various topology labels and DRA ResourceSlices. -- PlacementState: Verify AssumePlacement and RevertPlacement correctly modify +- PlacementStatePlugin: Verify AssumePlacement and RevertPlacement correctly modify and restore the CycleState. 
- Algorithm Logic: Test the sequential processing of Placements and the From f59db7c6bcaee9c5c608905887c9e93c1cc71ff4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Fri, 16 Jan 2026 14:42:17 +0100 Subject: [PATCH 10/25] Updating README.md based on the comments - Added requirement to PlacementGeneratorPlugin to implement EnqueueExtensions - Added information about PlacementGeneratorPlugins to be called after PreFilter scheduling phase. - Changed NodeAffinity to NodeSelector in Placement struc --- .../README.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 6df7f5454af9..11f47255e8a2 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -316,14 +316,14 @@ type PodSetInfo struct { // Placement is valid only in the context of a given PodGroup for a single cycle of // workload scheduling. type Placement struct { - // NodeAffinity specifies the node constraints for this Placement. + // NodeSelector specifies the node constraints for this Placement. // For Topology this is derived from topology labels (e.g., all nodes with label // 'topology-rack: rack-1'). - // For DRA, this Affinity would be constructed based on nodeSelector from + // For DRA, this selector would be constructed based on nodeSelector from // DRA's AllocationResult from DRAAllocations. // All pods within the PodGroup, when being evaluated against this Placement, - // are restricted to the nodes matching this NodeAffinity. - NodeAffinity *corev1.NodeAffinity + // are restricted to the nodes matching this NodeSelector. + NodeSelector *corev1.NodeSelector // DRAAllocations details the proposed DRA resource assignments for // the ResourceClaims made by the PodGroup. 
This field is primarily used @@ -353,6 +353,8 @@ type DraClaimAllocation struct { ```go // PlacementGeneratorPlugin is an interface for plugins that generate candidate Placements. +// Plugins implemeting PlacementGeneratorPlugin interface should also implement +// EnqueueExtensions interface. type PlacementGeneratorPlugin interface { Name() string @@ -420,6 +422,9 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. - **Output:** A list of Placement objects. +- Placement generation is executed after PreFilter giving PlacementGeneratorPlugins + a chance to get the list of nodes in the cluster. + - Example: If the label is rack, placements are generated for rack-1, rack-2, etc. From 045349e8b7f690ce5dd42d137933d0c1b4f052df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 10:00:35 +0100 Subject: [PATCH 11/25] Add prod readiness file --- keps/prod-readiness/sig-scheduling/5732.yaml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 keps/prod-readiness/sig-scheduling/5732.yaml diff --git a/keps/prod-readiness/sig-scheduling/5732.yaml b/keps/prod-readiness/sig-scheduling/5732.yaml new file mode 100644 index 000000000000..2afe6ab584e8 --- /dev/null +++ b/keps/prod-readiness/sig-scheduling/5732.yaml @@ -0,0 +1,3 @@ +kep-number: 5732 +alpha: + approver: "@wojtek-t" From f576836d479f1a548809c0dc1936bcf437493c9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 10:27:19 +0100 Subject: [PATCH 12/25] Production Readiness Review Questionnaire --- .../README.md | 162 ++++-------------- 1 file changed, 34 insertions(+), 128 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 11f47255e8a2..8095bf650417 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ 
b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -620,49 +620,13 @@ kube-scheduler instance being a leader). ## Production Readiness Review Questionnaire - - ### Feature Enablement and Rollback - - ###### How can this feature be enabled / disabled in a live cluster? - - -- [ ] Feature gate (also fill in values in `kep.yaml`) - - Feature gate name: - - Components depending on the feature gate: +- [X] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: TopologyAwareWorkloadScheduling + - Components depending on the feature gate: kube-apiserver, kube-scheduler - [ ] Other - Describe the mechanism: - Will enabling / disabling the feature require downtime of the control @@ -672,40 +636,30 @@ well as the [existing list] of feature gates. ###### Does enabling the feature change any default behavior? - +No - even with a feature enabled scheduler by default will use existing scheduling +algorithm to scheudle worklaods. Only when workload will have an explicit topology +constraint set an alternative algorithm will be used. ###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? - +The new API changes can also be disabled by disabling the feature gate in kube-apiserver. +However that doesn't result in clearing the new fields for workloads that already have +them set in the storage. ###### What happens if we reenable the feature if it was previously rolled back? +The feature starts working again. + ###### Are there any tests for feature enablement/disablement? - +The scheduler algorithm changes are purely in-memory and doesn't require any dedicated +enablement/disablement tests - the logic will be covered by regular feature tests. + +For the newly introduced API fields, dedicated enablement/disablement tests at the +kube-apiserver registry layer will be added in Alpha. 
### Rollout, Upgrade and Rollback Planning @@ -844,91 +798,43 @@ and creating new ones, as well as about cluster-level services (e.g. DNS): ### Scalability - - ###### Will enabling / using this feature result in any new API calls? - +No. ###### Will enabling / using this feature result in introducing new API types? - +No. ###### Will enabling / using this feature result in any new calls to the cloud provider? - +No. ###### Will enabling / using this feature result in increasing size or count of the existing API objects? - +Using this feature will require setting topology constraint on Workload object. +The related increase in size of the Workload object should however be negligible. ###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? - +We will measure the exact impact using performance benchmarks and scalability tests and +update the section based on the results. The complexity of scheudling of a single worklaod +is O(#pods * #nodes), which is comparable to the algorithm not using topology constraints, +so the benchmarks are primarily to validate the potential inefficiencies of the implementation. ###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? - +For large clusters and fine grained toplogy constraints we may observe some increase in CPU +and RAM usage for kube-scheduler. The exact scale of this increase will be confirmed by +scalability tests. ###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? - +No. 
### Troubleshooting From 6b5b3156ca76d47dc9ebb85487b96d9490435eb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 11:08:43 +0100 Subject: [PATCH 13/25] Fixed spelling errors --- .../5732-topology-aware-workload-scheduling/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 8095bf650417..0ea97e3eb83e 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -518,7 +518,7 @@ future separate KEPs improving and extending the proposed functionality: 5. **Explicit Topology Definition:** Using a Custom Resource (NodeTopology) to define and alias topology levels, removing the need for users to know exact - node label keys and opening addtional optimization and validation options. + node label keys and opening additional optimization and validation options. 6. **Feasible Placements Limit:** Adding an option to provide a limit on the number of feasible Placements which need to be found before moving to @@ -822,7 +822,7 @@ latency / Pod Startup SLO may potentially increase especially for large clusters fine grained topology constraints. We will measure the exact impact using performance benchmarks and scalability tests and -update the section based on the results. The complexity of scheudling of a single worklaod +update the section based on the results. The complexity of scheuduling of a single worklaod is O(#pods * #nodes), which is comparable to the algorithm not using topology constraints, so the benchmarks are primarily to validate the potential inefficiencies of the implementation. 
From 290490f22a83e3456be4ad0eed34b6a64af164d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 20:16:53 +0100 Subject: [PATCH 14/25] Update kep.yaml --- .../5732-topology-aware-workload-scheduling/kep.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml index 912641c56da0..0253ec6a85c9 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml @@ -38,6 +38,10 @@ milestone: # List the feature gate name and the components for which it must be enabled feature-gates: - name: TopologyAwareWorkloadScheduling + components: + - kube-apiserver + - kube-scheduler + - name: WorkloadBasicPolicyDesiredCount components: - kube-apiserver - kube-scheduler From 0ffef879c47faee98128d622a549be3cdf9ffd1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 20:31:03 +0100 Subject: [PATCH 15/25] Extend KEP with desiredCount. --- .../README.md | 44 ++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 0ea97e3eb83e..56c1b6131080 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -271,6 +271,39 @@ will be defined in a separate KEP: Note: For the initial alpha scope, only a single TopologyConstraint will be supported. +#### Basic Policy Extension + +In the first alpha version of the Workload API, the `Basic` policy was a no-op. +We propose extending the `Basic` policy to accept a `desiredCount` field. 
+This feature will be gated behind a separate feature gate +(`WorkloadBasicPolicyDesiredCount`) to decouple it from the core Gang Scheduling +and Topology Aware Scheduling features. + +```go +// BasicSchedulingPolicy indicates that standard Kubernetes +// scheduling behavior should be used. +type BasicSchedulingPolicy struct { + // DesiredCount is the expected number of pods that will belong to this + // PodGroup. This field is a hint to the scheduler to help it make better + // placement decisions for the group as a whole. + // + // Unlike gang's minCount, this field does not block scheduling. If the number + // of available pods is less than desiredCount, the scheduler can still attempt + // to schedule the available pods, but will optimistically try to select a + // placement that can accommodate the future pods. + // + // +optional + DesiredCount *int32 +} +``` + +This field allows users to express their "true" workloads more easily and enables +the scheduler to optimize the placement of such pod groups by taking the desired state +into account. Ideally, the scheduler should prefer placements that can accommodate +the full `desiredCount`, even if not all pods are created yet. When `desiredCount` +is specified, the scheduler can delay scheduling the first Pod it sees for a short +amount of time in order to wait for more Pods to be observed. + ### Scheduling Framework Extensions The scheduler framework requires new plugin interfaces to handle "Placements". A @@ -470,6 +503,9 @@ Placements based on distinct values of the designated node label (TAS). **PlacementBinPackingPlugin (New)** Implements `PlacementScorerPlugin`. Scores Placements to maximize utilization (tightest fit) and minimize fragmentation. +**PlacementPodCountScorerPlugin (New)** Implements `PlacementScorerPlugin`. Scores +Placements based on the number of pods fiting into each Placement. 
+ **DRATestPlugin (New)** Implements `PlacementGeneratorPlugin` and `PlacementStatePlugin` and is used only for testing the algorithm's support for DRA-aware scheduling. @@ -626,7 +662,13 @@ kube-scheduler instance being a leader). - [X] Feature gate (also fill in values in `kep.yaml`) - Feature gate name: TopologyAwareWorkloadScheduling - - Components depending on the feature gate: kube-apiserver, kube-scheduler + - Components depending on the feature gate: + - kube-apiserver + - kube-scheduler + - Feature gate name: WorkloadBasicPolicyDesiredCount + - Components depending on the feature gate: + - kube-apiserver + - kube-scheduler - [ ] Other - Describe the mechanism: - Will enabling / disabling the feature require downtime of the control From 02f70fac6e8b331156b3509f252602d5a26559ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 20:59:50 +0100 Subject: [PATCH 16/25] Address comments from dom4ha --- .../README.md | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 56c1b6131080..c5dba9a890f7 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -84,10 +84,11 @@ integrating a Topology-Aware and DRA-Aware workload scheduling algorithm into the Kubernetes kube-scheduler to address the complex placement requirements of modern, high-performance distributed applications. -The proposed algorithm fundamentally alters the scheduling lifecycle for gang -scheduled workloads. Instead of evaluating pods individually against the cluster -state - a process prone to fragmentation and deadlocks - the new mechanism -generates "Placements". 
These Placements represent candidate domains (sets of +The proposed topology algorithm leverages the workload-oriented scheduling +lifecycle introduced in KEP-4671, rather than fundamentally altering the scheduling +loop itself. It extends this foundation by enabling the evaluation of scheduling +options within specific "Placements" (subsets of the cluster). These Placements +represent candidate domains (sets of nodes or DRA resources) where the entire workload is theoretically feasible. The scheduler then simulates the placement of the full group of pods within these domains, utilizing existing filtering and scoring logic to ensure high-fidelity @@ -468,15 +469,11 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. 1. Call `AssumePlacement` (binds context to the specific node selector/DRA resources). - 2. Iterate through every pod in the PodGroup. + 2. Run default workload scheduling algorithm with the given context. - 3. Run standard Pod-level Filter and Score. + 3. If all pods fit, the Placement is marked Feasible. - 4. Use internal logic to simulate placing the pod on a node. - - 5. If all pods fit, the Placement is marked Feasible. - - 6. Call `RevertPlacement`. + 4. Call `RevertPlacement`. - **Potential Optimization:** Pre-filtering can check aggregate resources requested by PodGroup Pods before running the full simulation. 
From 85fdeded08cb087331138d8729ef9e6621834659 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 21:15:02 +0100 Subject: [PATCH 17/25] Update README.md --- .../5732-topology-aware-workload-scheduling/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index c5dba9a890f7..ac9e94696db9 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -186,7 +186,7 @@ We support two fundamental types of constraints: PodGroup are placed onto nodes sharing a common topological characteristic (e.g., same rack), defined by a specific node label. -2. **DRA Constraint (Shared Dynamic Resource Allocation)**: Ensures all pods in a +2. **raint (Shared Dynamic Resource Allocation)**: Ensures all pods in a PodGroup bind to a single DRA claim fulfilled from a single, shared, co-located resource (e.g., interconnected network interfaces or accelerators). @@ -608,7 +608,6 @@ necessary to implement this enhancement. #### Beta -- DRA constraints working. - Support for "Potential Future Extensions" (Prioritized placement, etc.) evaluated. - Scalability tests on large clusters with high placement counts. 
From 1bd676dfaebbe05329c4e6fec9e83cd9b3e20131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 21:16:44 +0100 Subject: [PATCH 18/25] Update README.md --- .../5732-topology-aware-workload-scheduling/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index ac9e94696db9..4ce3eebfd05e 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -186,7 +186,7 @@ We support two fundamental types of constraints: PodGroup are placed onto nodes sharing a common topological characteristic (e.g., same rack), defined by a specific node label. -2. **raint (Shared Dynamic Resource Allocation)**: Ensures all pods in a +2. **DRA Constraint (Shared Dynamic Resource Allocation)**: Ensures all pods in a PodGroup bind to a single DRA claim fulfilled from a single, shared, co-located resource (e.g., interconnected network interfaces or accelerators). 
From e3c67b1a7e5dabd1936d9f94a2421890d3ec2958 Mon Sep 17 00:00:00 2001 From: Pawel Kepka Date: Wed, 4 Feb 2026 10:51:56 +0000 Subject: [PATCH 19/25] Fixed Toc --- .../5732-topology-aware-workload-scheduling/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 4ce3eebfd05e..e75ab95c44d5 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -14,6 +14,7 @@ - [Risks and Mitigations](#risks-and-mitigations) - [Design Details](#design-details) - [Workload API Changes](#workload-api-changes) + - [Basic Policy Extension](#basic-policy-extension) - [Scheduling Framework Extensions](#scheduling-framework-extensions) - [1. Data Structures](#1-data-structures) - [2. New Plugin Interfaces](#2-new-plugin-interfaces) From 178e03952565fb7887e958d5fed7dde71dc95f3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Thu, 5 Feb 2026 09:55:47 +0100 Subject: [PATCH 20/25] Add desiredCount to Gang policy --- .../README.md | 47 +++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index e75ab95c44d5..5ef9f77aea61 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -14,7 +14,7 @@ - [Risks and Mitigations](#risks-and-mitigations) - [Design Details](#design-details) - [Workload API Changes](#workload-api-changes) - - [Basic Policy Extension](#basic-policy-extension) + - [Basic and Gang Policy Extension](#basic-and-gang-policy-extension) - [Scheduling Framework Extensions](#scheduling-framework-extensions) - [1. 
Data Structures](#1-data-structures) - [2. New Plugin Interfaces](#2-new-plugin-interfaces) @@ -273,14 +273,22 @@ will be defined in a separate KEP: Note: For the initial alpha scope, only a single TopologyConstraint will be supported. -#### Basic Policy Extension +#### Basic and Gang Policy Extension In the first alpha version of the Workload API, the `Basic` policy was a no-op. -We propose extending the `Basic` policy to accept a `desiredCount` field. +We propose extending the `Basic` and `Gang` policies to accept a `desiredCount` +field. This field serves as a scheduler hint to improve placement decisions +without imposing hard scheduling constraints. + This feature will be gated behind a separate feature gate (`WorkloadBasicPolicyDesiredCount`) to decouple it from the core Gang Scheduling and Topology Aware Scheduling features. +**1. Basic Policy Update** + +We introduce `desiredCount` to the `Basic` policy to allow users to signal the +expected group size for optimization purposes. + ```go // BasicSchedulingPolicy indicates that standard Kubernetes // scheduling behavior should be used. @@ -299,7 +307,38 @@ type BasicSchedulingPolicy struct { } ``` -This field allows users to express their "true" workloads more easily and enables +**2. Gang Policy Update** + +We similarly extend the `Gang` policy. While `minCount` provides a hard constraint +for admission, `desiredCount` provides a soft target for placement optimization. + +```go +// GangSchedulingPolicy defines the parameters for gang scheduling. +type GangSchedulingPolicy struct { + // MinCount is the minimum number of pods that must be schedulable or scheduled + // at the same time for the scheduler to admit the entire group. + // It must be a positive integer. + // + // +required + MinCount int32 + + // DesiredCount is the expected number of pods that will belong to this + // PodGroup. This field is a hint to the scheduler to help it make better + // placement decisions for the group as a whole. 
+ // + // Unlike gang's minCount, this field does not block scheduling. If the number + // of available pods is less than desiredCount but at least minCount, the scheduler + // can still attempt to schedule the available pods, but will optimistically try + // to select a placement that can accommodate the future pods. + // + // When provided desiredCount must be greater or equal to minCount. + // + // +optional + DesiredCount *int32 +} +``` + +Those fields allow users to express their "true" workloads more easily and enables the scheduler to optimize the placement of such pod groups by taking the desired state into account. Ideally, the scheduler should prefer placements that can accommodate the full `desiredCount`, even if not all pods are created yet. When `desiredCount` From f133ab46b62d19e23ff66206f84f5947f7f060f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Thu, 5 Feb 2026 12:34:14 +0100 Subject: [PATCH 21/25] Added cluster autoscaling support as requirement for beta --- .../5732-topology-aware-workload-scheduling/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 5ef9f77aea61..b09f59ff3e91 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -652,6 +652,7 @@ necessary to implement this enhancement. evaluated. - Scalability tests on large clusters with high placement counts. - Comprehensive e2e testing. +- Cluster autoscaling compomnents are aware of workload topology constraints. 
### Upgrade / Downgrade Strategy From d157ceaf3ae08f9e67d4d18bd0254e39ea0ba8b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Thu, 5 Feb 2026 12:39:22 +0100 Subject: [PATCH 22/25] Fix phrasing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Dominik Marciński --- .../5732-topology-aware-workload-scheduling/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index b09f59ff3e91..13d1ffd7c3c3 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -192,8 +192,7 @@ We support two fundamental types of constraints: co-located resource (e.g., interconnected network interfaces or accelerators). -The scheduler is extended to interpret these constraints and find a "Placement" -(a subset of nodes and DRA resources) that satisfies them. +The scheduler is extended to interpret these new PodGroup level scheduling constraints and similarly to scheduling pods on nodes (available scheduling options), find a "Placement" for this PodGroup among the feasible options (subsets of nodes and DRA resources) that satisfies them. 
### User Stories (Optional) From 4bf21d2dea2ef7305d12dd8e38aa47a640b55848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Thu, 5 Feb 2026 12:40:22 +0100 Subject: [PATCH 23/25] Fix phrasing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Dominik Marciński --- .../5732-topology-aware-workload-scheduling/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 13d1ffd7c3c3..b15db07f089c 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -217,7 +217,7 @@ workload's pods to them. ### Risks and Mitigations - **Scheduling Latency:** Evaluating multiple placements involves running - filter/score plugins multiple times. + filter/score plugins multiple times (multiple attempts to schedule a PodGroup considering all topology options). - **Mitigation:** Implement pre-filtering optimizations to reject infeasible placements early based on aggregate resource availability. From 3f6fffb0996ba2ce165daf7e14af8ceec9e2a902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Thu, 5 Feb 2026 12:49:26 +0100 Subject: [PATCH 24/25] Updates from review. 
--- .../5732-topology-aware-workload-scheduling/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index b15db07f089c..2dee3d6b43ae 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -243,7 +243,7 @@ type PodGroup struct { Name *string // SchedulingConstraints defines group-level scheduling requirements, - // including topology and DRA colocation. + // including topology. SchedulingConstraints *PodGroupSchedulingConstraints } @@ -280,7 +280,7 @@ field. This field serves as a scheduler hint to improve placement decisions without imposing hard scheduling constraints. This feature will be gated behind a separate feature gate -(`WorkloadBasicPolicyDesiredCount`) to decouple it from the core Gang Scheduling +(`PodGroupDesiredCount`) to decouple it from the core Gang Scheduling and Topology Aware Scheduling features. **1. Basic Policy Update** @@ -701,7 +701,7 @@ kube-scheduler instance being a leader). - Components depending on the feature gate: - kube-apiserver - kube-scheduler - - Feature gate name: WorkloadBasicPolicyDesiredCount + - Feature gate name: PodGroupDesiredCount - Components depending on the feature gate: - kube-apiserver - kube-scheduler From 7113f4be83cc285476cb1bb4bfce9c1447c68e6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Thu, 5 Feb 2026 12:49:52 +0100 Subject: [PATCH 25/25] Updates from review. 
--- .../5732-topology-aware-workload-scheduling/kep.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml index 0253ec6a85c9..99bc28c58277 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml @@ -41,7 +41,7 @@ feature-gates: components: - kube-apiserver - kube-scheduler - - name: WorkloadBasicPolicyDesiredCount + - name: PodGroupDesiredCount components: - kube-apiserver - kube-scheduler