From d458e3f1b301e1a2a9b963aef1d78c69e5238290 Mon Sep 17 00:00:00 2001 From: Pawel Kepka Date: Wed, 10 Dec 2025 20:54:10 +0000 Subject: [PATCH 01/25] Topology-aware workload scheduling KEP --- .../README.md | 991 ++++++++++++++++++ .../kep.yaml | 43 + 2 files changed, 1034 insertions(+) create mode 100644 keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md create mode 100644 keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md new file mode 100644 index 000000000000..b697985daf73 --- /dev/null +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -0,0 +1,991 @@ +# KEP-5732: Topology-aware workload scheduling + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1: AI Training in a Single Rack](#story-1-ai-training-in-a-single-rack) + - [Story 2: Workload using Interconnected DRA Devices](#story-2-workload-using-interconnected-dra-devices) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Workload API Changes](#workload-api-changes) + - [Scheduling Framework Extensions](#scheduling-framework-extensions) + - [1. Data Structures](#1-data-structures) + - [2. 
New Plugin Interfaces](#2-new-plugin-interfaces) + - [Scheduling Algorithm Phases](#scheduling-algorithm-phases) + - [Phase 1: Candidate Placement Generation](#phase-1-candidate-placement-generation) + - [Phase 2: Pod-Level Filtering and Feasibility Check](#phase-2-pod-level-filtering-and-feasibility-check) + - [Phase 3: Placement Scoring and Selection](#phase-3-placement-scoring-and-selection) + - [Scheduler Plugins](#scheduler-plugins) + - [Potential Future Extensions (Beta Candidates)](#potential-future-extensions-beta-candidates) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Alpha](#alpha) + - [Beta](#beta) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + - [Pod Inter-Affinities](#pod-inter-affinities) + - [Standalone Schedulers (e.g., Volcano)](#standalone-schedulers-eg-volcano) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + +Items marked with (R) are required *prior to targeting to a milestone / release*. 
+ +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) KEP approvers have approved the KEP status as `implementable` +- [ ] (R) Design details are appropriately documented +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [ ] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [ ] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) within one minor version of promotion to GA +- [ ] (R) Production readiness review completed +- [ ] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + +This KEP describes the architectural design and implementation details for +integrating a Topology-Aware and DRA-Aware workload scheduling algorithm into +the Kubernetes kube-scheduler to address the complex placement requirements of +modern, high-performance distributed applications. 
+ +The proposed algorithm fundamentally alters the scheduling lifecycle for gang +scheduled workloads. Instead of evaluating pods individually against the cluster +state - a process prone to fragmentation and deadlocks - the new mechanism +generates "Placements". These Placements represent candidate domains (sets of +nodes or DRA resources) where the entire workload is theoretically feasible. The +scheduler then simulates the placement of the full group of pods within these +domains, utilizing existing filtering and scoring logic to ensure high-fidelity +decisions before committing resources. + +This design introduces specific extensions to the Kubernetes Workload API to +support `TopologyConstraints` and `DRAConstraints`, defines new interfaces +within the Scheduling Framework (`PlacementGenerator`, `PlacementState`, +`PlacementScorer`), and details the algorithmic flow required to schedule Pod +Groups while maintaining compatibility with the scheduler's existing ecosystem. + +## Motivation + +Distributed workloads, particularly those driving the current AI/ML era, often +require high-bandwidth and low-latency communication between multiple pods to +function efficiently. While the [KEP-4671: Workload API] makes the first step +towards managing these applications as cohesive units, it primarily establishes +the API structure. For workloads sensitive to inter-pod communication, simply +grouping pods is insufficient; their physical placement within the cluster's +network topology is a decisive factor in their performance. + +In this KEP, we propose an algorithm for topology-aware and DRA-aware scheduling +that operates directly within the Kubernetes kube-scheduler. The core objective +is to ensure that pods belonging to a Workload are co-located within optimal +topological domains - such as specific racks or blocks - or are bound to shared +Dynamic Resource Allocation (DRA) devices that require cohesive management. 
+Without this level of precision, workloads may be fragmented across disparate +network domains, drastically degrading performance and wasting the potential of +expensive hardware. + +Given the economics of high-performance accelerators and network infrastructure, +maximizing application performance and resource utilization is a primary goal +for users. Achieving this requires intelligent placement decisions that +understand the physical constraints of the cluster. However, the default +scheduler's current pod-centric logic lacks the native mechanisms to efficiently +resolve these complex group-level constraints during the scheduling cycle. + +Topology-aware scheduling is not a new concept and is currently addressed by +external admission control systems like Kueue or alternative schedulers like +Volcano. However, relying on external admission controllers decouples the +topology decision from the scheduler's core logic, while alternative schedulers +introduce operational complexity. We believe that embedding topology and DRA +awareness deeply into the kube-scheduler is critical enough to warrant +standardization. This integration allows the algorithm to leverage the full +fidelity of the scheduler's existing pod-level filtering and scoring plugins, +ensuring highly accurate feasibility checks and placement outcomes without the +need for external dependencies. + +### Goals + +- To enhance kube-scheduler to perform topology-aware and DRA-aware scheduling + for multi-pod workloads, as defined by the Workload API (KEP-4671). +- To optimize the placement of distributed workloads by co-locating pods based + on network topology and DRA resource availability. +- To introduce new extension points and phases within the Kubernetes scheduler + framework to support the concept of "Placements" (candidate sets of nodes + and DRA resources). +- To define the required changes to the Workload API (KEP-4671) to support + scheduling constraints. 
+- To leverage the scheduler's existing pod-level filtering and scoring logic + within the evaluation of each Placement. +- To provide a flexible framework extensible by plugins for various topology + sources (e.g., node labels) and resource types (e.g., DRA). + +### Non-Goals + +- To replace the functionality of external workload queueing and admission + control systems like Kueue. This proposal focuses on the in-scheduler + placement decision for a single Workload at a time. +- To implement Workload-level queueing, fairness, or resource quotas within + kube-scheduler. +- To handle all aspects of the workload lifecycle management beyond + scheduling. +- To implement Workload-level preemption logic. +- To integrate with cluster autoscaling mechanisms in this phase. +- To support complex multi-PodSet dependency resolution with backtracking or + parallel processing in the initial version. +- To automatically discover network topology; the mechanisms rely on topology + information being present (e.g., via node labels or DRA ResourceSlices). + +## Proposal + +This proposal introduces an API to define constraints on a PodGroup (a +collection of pods within a Workload) requiring it to be scheduled onto a +specific subset of nodes or resources. + +We support two fundamental types of constraints: + +1. **Topology Constraint (Node Label Co-location)**: Ensures all pods in a + PodGroup are placed onto nodes sharing a common topological characteristic + (e.g., same rack), defined by a specific node label. + +2. **DRA Constraint (Shared Dynamic Resource Allocation)**: Ensures all pods in a + PodGroup bind to a single DRA claim fulfilled from a single, shared, + co-located resource (e.g., interconnected network interfaces or + accelerators). + +The scheduler is extended to interpret these constraints and find a "Placement" +(a subset of nodes and DRA resources) that satisfies them. 
### User Stories (Optional)

#### Story 1: AI Training in a Single Rack

As a data scientist, I want to run a distributed training job where all pods
need to be located in the same server rack to minimize latency. I define a
`TopologyConstraint` on the Workload's PodGroup specifying the rack topology
label. The scheduler identifies a rack with sufficient capacity and schedules
all pods there at once.

#### Story 2: Workload using Interconnected DRA Devices

As a cluster administrator, I want to schedule a workload that requires a set of
specialized accelerators that are physically interconnected. I use a
`DRAConstraint` targeting a specific `ResourceClaimTemplate`. The scheduler
finds a set of DRA resources (ResourceSlice) that are co-located and binds the
workload's pods to them.

### Notes/Constraints/Caveats (Optional)

### Risks and Mitigations

- **Scheduling Latency:** Evaluating multiple placements involves running
  filter/score plugins multiple times.

  - **Mitigation:** Implement pre-filtering optimizations to reject infeasible
    placements early based on aggregate resource availability.

- **Complexity of Pod Group Scheduling:** Scheduling heterogeneous Pod Groups
  can be complex.

  - **Mitigation:** The initial version supports sequential processing of pods
    within a PodGroup, avoiding complex backtracking or parallel processing
    in the alpha release.

## Design Details

### Workload API Changes

The Workload API (KEP-4671) will be extended to allow specifying group-level
scheduling constraints. An optional `SchedulingConstraints` field is added to
the `PodGroup` spec.

```go
// PodGroup (definition from KEP-4671, with additions)
type PodGroup struct {
	Name *string

	// SchedulingConstraints defines group-level scheduling requirements,
	// including topology and DRA colocation.
+ SchedulingConstraints *PodGroupSchedulingConstraints +} + +// PodGroupSchedulingConstraints holds the scheduling constraints for the PodGroup. +type PodGroupSchedulingConstraints struct { + // TopologyConstraints specifies desired topological placements for all pods + // within this PodGroup. + TopologyConstraints []TopologyConstraint + + // DRAConstraints specifies constraints on how Dynamic Resources are allocated + // across the PodGroup. + DRAConstraints []DRAConstraint +} + +// TopologyConstraint describes a desired topological colocation for all pods in the PodGroup. +type TopologyConstraint struct { + // Level specifies the key of the node label representing the topology domain. + // All pods within the PodGroup must be colocated within the same domain instance. + // Examples: "topology.kubernetes.io/rack" + Level string +} + +// DRAConstraint provides constraints on how specific DRA claims across the group should +// be fulfilled. +type DRAConstraint struct { + // ResourceClaimName specifies the name of a specific ResourceClaim + // within the PodGroup's pods that this constraint applies to. + ResourceClaimName *string + + // ResourceClaimTemplateName specifies the name of a ResourceClaimTemplate. + // This applies to all ResourceClaim instances generated from this template. + ResourceClaimTemplateName *string +} +``` + +Note: For the initial alpha scope, only a single TopologyConstraint or +DRAConstraint will be supported. + +### Scheduling Framework Extensions + +The scheduler framework requires new plugin interfaces to handle "Placements". A +Placement represents a candidate domain (nodes and resources) for a PodGroup. + +#### 1. Data Structures + +```go +// PodGroupInfo holds information about a specific PodGroup within a Workload, +// including a reference to the Workload, the PodGroup's name, and its replica index. +// This struct is designed to be extensible with more fields in the future. 
+type PodGroupInfo struct { + // WorkloadRef is a reference to the parent Workload object. + WorkloadRef *workloadv1alpha1.Workload + + // PodGroupName is the name of the PodGroup. + PodGroupName string + + // PodGroupReplicaIndex is the index of the PodGroup replica, as defined in KEP-4671. + // This is relevant for PodGroups that have more than one replica. + PodGroupReplicaIndex int + + // -- Add other fields below for future extensions -- +} + +// PodSetInfo holds information about a specific PodSet within a PodGroup, +// primarily the list of Pods. +// This struct is designed to be extensible with more fields in the future. +type PodSetInfo struct { + // Pods is a list of Pod objects belonging to this PodSet. + Pods []*corev1.Pod + + // -- Add other fields below for future extensions -- +} + +// Placement represents a candidate domain for scheduling a PodSet. +// It defines a set of nodes and/or proposed Dynamic Resource Allocation (DRA) +// resource bindings necessary to satisfy the PodSet's requirements within that domain. +type Placement struct { + // NodeAffinity specifies the node constraints for this Placement. + // For Topology this is derived from topology labels (e.g., all nodes with label + // 'topology-rack: rack-1'). + // For DRA, this Affinity would be constructed based on nodeSelector from + // DRA's AllocationResult from DRAAllocations. + // All pods within the PodSet, when being evaluated against this Placement, + // are restricted to the nodes matching this NodeAffinity. + NodeAffinity *corev1.NodeAffinity + + // DRAAllocations details the proposed DRA resource assignments for + // the ResourceClaims made by the PodSet. This field is primarily used + // by DRA-aware plugins. + DRAAllocations []DraClaimAllocation +} + +// DraClaimAllocation maps a specific ResourceClaim name to a set of proposed +// device allocations. These allocations are tentative and used by the scheduler's +// AssumePlacement phase to simulate resource commitment. 
type DraClaimAllocation struct {
	// ResourceClaimName is the name of the ResourceClaim within the PodSet's context
	// that these allocations are intended to satisfy.
	ResourceClaimName string

	// Allocation contains a DRA AllocationResult structure, specifying devices
	// from ResourceSlices that are proposed to fulfill the ResourceClaim.
	// The scheduler will use this information in AssumePlacement to temporarily
	// consider these devices as allocated.
	Allocation dra.AllocationResult
}
```

#### 2. New Plugin Interfaces

**PlacementGenerator:** Generates candidate placements based on constraints.

```go
// PlacementGenerator is an interface for plugins that generate candidate Placements.
type PlacementGenerator interface {
	Name() string

	// GeneratePlacements generates a list of potential Placements for the given PodGroup and PodSet.
	// Each Placement represents a candidate set of resources (e.g., nodes matching a selector)
	// and potential DRA allocations where the PodSet might be scheduled.
	GeneratePlacements(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, parentPlacements []*Placement) ([]*Placement, *framework.Status)
}
```

**PlacementState:** Manages state changes (simulating binding) during
feasibility checks.

```go
// PlacementState is an interface for plugins that manage state changes
// when a Placement is being considered.
type PlacementState interface {
	Name() string

	// AssumePlacement temporarily configures the scheduling context to evaluate the feasibility
	// of the given Placement for the PodGroup and PodSet.
	AssumePlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, placement *Placement) *framework.Status

	// RevertPlacement reverts the temporary scheduling context changes made by AssumePlacement.
+ // This should be called after the evaluation of a Placement is complete to restore + // the scheduler's state and allow other Placements to be considered. + RevertPlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, placement *Placement) *framework.Status +} +``` + +**PlacementScorer:** Scores feasible placements to select the best one. + +```go +// PodSetAssignment represents the assignment of pods to nodes within a PodSet for a specific Placement. +type PodSetAssignment struct { + // PodToNodeMap maps a Pod name (string) to a Node name (string). + PodToNodeMap map[string]string +} + +// PlacementScorer is an interface for plugins that score feasible Placements. +type PlacementScorer interface { + Name() string + + // ScorePlacement calculates a score for a given Placement. This function is called in Phase 3 + // (Placement Scoring and Selection) only for Placements that have been deemed feasible + // for all pods in the PodSet during Phase 2. The PodSetAssignment indicates the + // node assigned to each pod within this Placement. The returned score is a float64, + // with higher scores generally indicating more preferable Placements. + // Plugins can implement various scoring strategies, such as bin packing to minimize + // resource fragmentation. + ScorePlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, placement *Placement, podsAssignment *PodSetAssignment) (float64, *framework.Status) +} +``` + +### Scheduling Algorithm Phases + +The algorithm proceeds in three main phases for a given Workload/PodGroup. + +#### Phase 1: Candidate Placement Generation + +- **Input:** PodGroupInfo and PodSetInfo. + +- **Action:** Iterate over distinct values of the topology label (TAS) or + available ResourceSlices (DRA). + +- **Output:** A list of Placement objects. + +- Example: If the label is rack, placements are generated for rack-1, rack-2, + etc. 
#### Phase 2: Pod-Level Filtering and Feasibility Check

- **Action:** For each generated Placement:

  1. Call `AssumePlacement` (binds context to the specific node selector/DRA
     resources).

  2. Iterate through every pod in the PodSet.

  3. Run standard Pod-level Filter and Score.

  4. Use internal logic to simulate placing the pod on a node.

  5. If all pods fit, the Placement is marked Feasible.

  6. Call `RevertPlacement`.

- **Potential Optimization:** Pre-filtering can check aggregate resources
  before running the full simulation.

- **Heterogeneous PodGroup Handling:** Sequential Processing will be used
  initially. Pods are processed sequentially; if any fail, the placement is
  rejected.

#### Phase 3: Placement Scoring and Selection

- **Action:** Call `ScorePlacement` for all feasible placements.

- **Selection:** Select the Placement with the highest score.

- **Binding:** Proceed to bind pods to the assigned nodes and resources.

### Scheduler Plugins

**TopologyPlacementPlugin (New)** Implements `PlacementGenerator`. Generates
Placements based on distinct values of the designated node label (TAS).

**DRAPlugin (Extension)** Extended to implement `PlacementGenerator` and
`PlacementState`.

- **Generator:** Returns Placements derived from available ResourceSlices
  satisfying shared claims.

- **State:** Temporarily assigns AllocationResults to ResourceClaims during
  the Assume phase.

**PlacementBinPackingPlugin (New)** Implements `PlacementScorer`. Scores
Placements to maximize utilization (tightest fit) and minimize fragmentation.

### Potential Future Extensions (Beta Candidates)

The following features are out of scope for the initial Alpha implementation but
are considered for future releases (post-1.36):

1. **Prioritized Placement Scheduling:** Allowing a set of preferred placements
   with fallbacks (e.g., prefer Rack, fallback to Block).
This would introduce + a Rank field to the Placement struct. + +2. **Optional/Preferred Scheduling Constraints:** Constraints that serve purely + as scoring mechanisms without hard requirements. + +3. **Multi-level Scheduling Constraints:** Handling nested constraints (e.g., + Block -> Rack). This would involve iterative placement generation and a + Parent field in the Placement struct. + +4. **Pod Group Replicas Support:** Optimizing scheduling for identical + PodGroups (replicas) by scheduling the maximum feasible number of replicas + within a single placement pass. + +5. **Explicit Topology Definition:** Using a Custom Resource (NodeTopology) to + define and alias topology levels, removing the need for users to know exact + node label keys. + +### Test Plan + +[ ] I/we understand the owners of the involved components may require updates to +existing tests to make this code solid enough prior to committing the changes +necessary to implement this enhancement. + +#### Prerequisite testing updates + +#### Unit tests + +- PlacementGenerator: Test generation of placements for various topology + labels and DRA ResourceSlices. + +- PlacementState: Verify AssumePlacement and RevertPlacement correctly modify + and restore the CycleState. + +- Algorithm Logic: Test the sequential processing of Placements and the + selection logic based on scores. + +- DRA Integration: specific tests for DRAConstraint resolution. + +#### Integration tests + +- Topology Awareness: Verify that pods with TopologyConstraint are correctly + co-located on nodes sharing the label. + +- DRA Awareness: Verify that pods with DRAConstraint are bound to shared + ResourceSlices. + +- Infeasibility: Verify that Workloads remain pending if no Placement + satisfies the constraints. + +#### e2e tests + +- End-to-End Workload Scheduling: Submit a Workload with TopologyConstraint + (e.g., Rack) and verify all pods land on the same rack. 
+ +- DRA Co-location: Submit a Workload requiring shared DRA devices and verify + correct allocation and placement. + +### Graduation Criteria + +#### Alpha + +- Feature implemented behind a feature flag. +- PodGroupSchedulingConstraints API defined. +- Basic topology (Node Label) and DRA constraints working. +- Initial unit and integration tests. + +#### Beta + +- Support for "Potential Future Extensions" (Prioritized placement, etc.) + evaluated. +- Scalability tests on large clusters with high placement counts. +- Comprehensive e2e testing. + +### Upgrade / Downgrade Strategy + +This KEP is additive and can safely fallback to the original behavior on +downgrade. + +When a user upgrades the cluster to the version which supports topology-aware +workload scheduling: + +- they can enable scheduling plugins implementing new Scheduling Framework + interfaces in kube-scheduler config +- they can start using the new API to create Workload objects with + `schedulingConstraints` field +- scheduler will use enabled plugins to generate placements for Workload and + check their feasibility + +When user downgrades the cluster to the version that no longer supports +topology-aware workload scheduling: + +- the `schedulingConstraints` field can no longer be set on the Workloads + (the already set fields continue to be set though) +- scheduler will revert to the original behavior of scheduling pods belonging + to a gang, without considering different potential placements. + +### Version Skew Strategy + +The feature is limited to the control plane, so the version skew with nodes +(kubelets) doesn't matter. + +For the API changes, the old version of components (in particular +kube-apiserver) may not handle those. Thus, users should not set those fields +before confirming all control-plane instances were upgraded to the version +supporting those. 
+ +For the topology-aware workload scheduling itself, this is purely kube-scheduler +in-memory feature, so the skew doesn't matter (as there is always only a single +kube-scheduler instance being a leader). + +## Production Readiness Review Questionnaire + + + +### Feature Enablement and Rollback + + + +###### How can this feature be enabled / disabled in a live cluster? + + + +- [ ] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: + - Components depending on the feature gate: +- [ ] Other + - Describe the mechanism: + - Will enabling / disabling the feature require downtime of the control + plane? + - Will enabling / disabling the feature require downtime or reprovisioning + of a node? + +###### Does enabling the feature change any default behavior? + + + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? + + + +###### What happens if we reenable the feature if it was previously rolled back? + +###### Are there any tests for feature enablement/disablement? + + + +### Rollout, Upgrade and Rollback Planning + + + +###### How can a rollout or rollback fail? Can it impact already running workloads? + + + +###### What specific metrics should inform a rollback? + + + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? + + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + + + +### Monitoring Requirements + + + +###### How can an operator determine if the feature is in use by workloads? + + + +###### How can someone using this feature know that it is working for their instance? + + + +- [ ] Events + - Event Reason: +- [ ] API .status + - Condition name: + - Other field: +- [ ] Other (treat as last resort) + - Details: + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? 
+ + + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + + + +- [ ] Metrics + - Metric name: + - [Optional] Aggregation method: + - Components exposing the metric: +- [ ] Other (treat as last resort) + - Details: + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + + + +### Scalability + + + +###### Will enabling / using this feature result in any new API calls? + + + +###### Will enabling / using this feature result in introducing new API types? + + + +###### Will enabling / using this feature result in any new calls to the cloud provider? + + + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? + + + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + + + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + + + +###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? + + + +### Troubleshooting + + + +###### How does this feature react if the API server and/or etcd is unavailable? + +###### What are other known failure modes? + + + +###### What steps should be taken if SLOs are not being met to determine the problem? + +## Implementation History + + + +## Drawbacks + +- **Complexity:** This proposal adds significant logic to the kube-scheduler + framework, specifically the "Placement" abstraction and the simulation loop + (Phase 2). + +- **Performance:** Generating and simulating a large number of Placements + (e.g., every rack in a massive cluster) could be computationally expensive. 
+ + - **Mitigation:** Pre-filtering of Placements will be implemented to discard + clearly infeasible Placements (insufficient total resources) before the + expensive pod-level simulation. + +## Alternatives + +### Pod Inter-Affinities + +Currently, users may attempt to simulate gang scheduling using podAffinity (to +co-locate pods) or podAntiAffinity. + +- **Pros:** Native to Kubernetes, no new CRDs. +- **Cons:** Affinity is evaluated per-Pod at the time of that Pod's + scheduling. It does not look ahead. This means that the scheduler might + place the first Pod on a node that satisfies its immediate affinity needs + but prevents the rest of the group from scheduling (e.g., locking a topology + domain that is too small for the rest of the group). + +### Standalone Schedulers (e.g., Volcano) + +Users can run a secondary scheduler like Volcano or Yunikorn. + +- **Pros:** Feature-rich, mature for batch workloads. +- **Cons:** Operationally complex (two schedulers), race conditions when + sharing cluster resources, and lack of integration with standard Kubernetes + features like common admission controllers or newer features like DRA + (initially). + +## Infrastructure Needed (Optional) + + diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml new file mode 100644 index 000000000000..1a54a85aa9ab --- /dev/null +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml @@ -0,0 +1,43 @@ +title: Topology-aware workload scheduling +kep-number: 5732 +authors: + - "@44past4" +owning-sig: sig-scheduling +participating-sigs: +status: provisional +creation-date: 2025-12-10 +reviewers: + - +approvers: + - + +see-also: + - "/keps/sig-scheduling/4671-gang-scheduling" + +# The target maturity stage in the current dev cycle for this KEP. 
+# If the purpose of this KEP is to deprecate a user-visible feature +# and a Deprecated feature gates are added, they should be deprecated|disabled|removed. +stage: alpha + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.36" + +# The milestone at which this feature was, or is targeted to be, at each stage. +milestone: + alpha: "v1.36" + beta: "v1.37" + stable: "v1.39" + +# The following PRR answers are required at alpha release +# List the feature gate name and the components for which it must be enabled +feature-gates: + - name: TopologyAwareWorkloadScheduling + components: + - kube-apiserver + - kube-scheduler +disable-supported: true + +# The following PRR answers are required at beta release +metrics: From 493cc9a5e9b96f952196be943aead139b7ae1c31 Mon Sep 17 00:00:00 2001 From: Pawel Kepka Date: Thu, 11 Dec 2025 07:25:39 +0000 Subject: [PATCH 02/25] Fixed Toc --- .../README.md | 93 +++++++++---------- 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index b697985daf73..756e9f232b0b 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -1,53 +1,52 @@ # KEP-5732: Topology-aware workload scheduling - -- [Release Signoff Checklist](#release-signoff-checklist) -- [Summary](#summary) -- [Motivation](#motivation) - - [Goals](#goals) - - [Non-Goals](#non-goals) -- [Proposal](#proposal) - - [User Stories (Optional)](#user-stories-optional) - - [Story 1: AI Training in a Single Rack](#story-1-ai-training-in-a-single-rack) - - [Story 2: Workload using Interconnected DRA Devices](#story-2-workload-using-interconnected-dra-devices) - - [Notes/Constraints/Caveats 
(Optional)](#notesconstraintscaveats-optional) - - [Risks and Mitigations](#risks-and-mitigations) -- [Design Details](#design-details) - - [Workload API Changes](#workload-api-changes) - - [Scheduling Framework Extensions](#scheduling-framework-extensions) - - [1. Data Structures](#1-data-structures) - - [2. New Plugin Interfaces](#2-new-plugin-interfaces) - - [Scheduling Algorithm Phases](#scheduling-algorithm-phases) - - [Phase 1: Candidate Placement Generation](#phase-1-candidate-placement-generation) - - [Phase 2: Pod-Level Filtering and Feasibility Check](#phase-2-pod-level-filtering-and-feasibility-check) - - [Phase 3: Placement Scoring and Selection](#phase-3-placement-scoring-and-selection) - - [Scheduler Plugins](#scheduler-plugins) - - [Potential Future Extensions (Beta Candidates)](#potential-future-extensions-beta-candidates) - - [Test Plan](#test-plan) - - [Prerequisite testing updates](#prerequisite-testing-updates) - - [Unit tests](#unit-tests) - - [Integration tests](#integration-tests) - - [e2e tests](#e2e-tests) - - [Graduation Criteria](#graduation-criteria) - - [Alpha](#alpha) - - [Beta](#beta) - - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) - - [Version Skew Strategy](#version-skew-strategy) -- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) - - [Feature Enablement and Rollback](#feature-enablement-and-rollback) - - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) - - [Monitoring Requirements](#monitoring-requirements) - - [Dependencies](#dependencies) - - [Scalability](#scalability) - - [Troubleshooting](#troubleshooting) -- [Implementation History](#implementation-history) -- [Drawbacks](#drawbacks) -- [Alternatives](#alternatives) - - [Pod Inter-Affinities](#pod-inter-affinities) - - [Standalone Schedulers (e.g., Volcano)](#standalone-schedulers-eg-volcano) -- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) - +- [Release Signoff 
Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1: AI Training in a Single Rack](#story-1-ai-training-in-a-single-rack) + - [Story 2: Workload using Interconnected DRA Devices](#story-2-workload-using-interconnected-dra-devices) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Workload API Changes](#workload-api-changes) + - [Scheduling Framework Extensions](#scheduling-framework-extensions) + - [1. Data Structures](#1-data-structures) + - [2. New Plugin Interfaces](#2-new-plugin-interfaces) + - [Scheduling Algorithm Phases](#scheduling-algorithm-phases) + - [Phase 1: Candidate Placement Generation](#phase-1-candidate-placement-generation) + - [Phase 2: Pod-Level Filtering and Feasibility Check](#phase-2-pod-level-filtering-and-feasibility-check) + - [Phase 3: Placement Scoring and Selection](#phase-3-placement-scoring-and-selection) + - [Scheduler Plugins](#scheduler-plugins) + - [Potential Future Extensions (Beta Candidates)](#potential-future-extensions-beta-candidates) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Alpha](#alpha) + - [Beta](#beta) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - 
[Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + - [Pod Inter-Affinities](#pod-inter-affinities) + - [Standalone Schedulers (e.g., Volcano)](#standalone-schedulers-eg-volcano) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + ## Release Signoff Checklist From 52fa7c962416a86c58823d5f599aa6c3528ae73d Mon Sep 17 00:00:00 2001 From: Pawel Kepka Date: Thu, 11 Dec 2025 11:09:31 +0000 Subject: [PATCH 03/25] Added KEP reviewers and approvers --- .../5732-topology-aware-workload-scheduling/kep.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml index 1a54a85aa9ab..912641c56da0 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml @@ -7,9 +7,13 @@ participating-sigs: status: provisional creation-date: 2025-12-10 reviewers: - - + - sanposhiho + - dom4ha + - macsko + - wojtek-t approvers: - - + - sanposhiho + - dom4ha see-also: - "/keps/sig-scheduling/4671-gang-scheduling" From 3dbf9b4369f75a8a77562f02bf15952f529d09ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 16 Dec 2025 14:25:51 +0100 Subject: [PATCH 04/25] Initial batch of fixes after reviews --- .../README.md | 61 +++++++++++-------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 756e9f232b0b..529cdd7db0d7 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -255,6 
+255,7 @@ type PodGroupSchedulingConstraints struct {
 type TopologyConstraint struct {
 	// Level specifies the key of the node label representing the topology domain.
 	// All pods within the PodGroup must be colocated within the same domain instance.
+	// Different replicas of the PodGroup can land on different domain instances.
 	// Examples: "topology.kubernetes.io/rack"
 	Level string
 }
@@ -297,11 +298,15 @@ type PodGroupInfo struct {
 	// This is relevant for PodGroups that have more than one replica.
 	PodGroupReplicaIndex int
 
+	// PodSets is a list of PodSet objects within this PodGroup.
+	PodSets []*PodSetInfo
+
 	// -- Add other fields below for future extensions --
 }
 
 // PodSetInfo holds information about a specific PodSet within a PodGroup,
 // primarily the list of Pods.
+// Pods within a PodSet must be homogeneous (using the semantic defined in KEP-5598).
 // This struct is designed to be extensible with more fields in the future.
 type PodSetInfo struct {
 	// Pods is a list of Pod objects belonging to this PodSet.
@@ -310,21 +315,23 @@ type PodSetInfo struct {
 
 	// -- Add other fields below for future extensions --
 }
 
-// Placement represents a candidate domain for scheduling a PodSet.
+// Placement represents a candidate domain for scheduling a PodGroup.
 // It defines a set of nodes and/or proposed Dynamic Resource Allocation (DRA)
-// resource bindings necessary to satisfy the PodSet's requirements within that domain.
+// resource bindings necessary to satisfy the PodGroup's requirements within that domain.
+// Placement is valid only in the context of a given PodGroup for a single cycle of
+// workload scheduling.
 type Placement struct {
 	// NodeAffinity specifies the node constraints for this Placement.
 	// For Topology this is derived from topology labels (e.g., all nodes with label
 	// 'topology-rack: rack-1').
 	// For DRA, this Affinity would be constructed based on nodeSelector from
 	// DRA's AllocationResult from DRAAllocations. 
- // All pods within the PodSet, when being evaluated against this Placement, + // All pods within the PodGroup, when being evaluated against this Placement, // are restricted to the nodes matching this NodeAffinity. NodeAffinity *corev1.NodeAffinity // DRAAllocations details the proposed DRA resource assignments for - // the ResourceClaims made by the PodSet. This field is primarily used + // the ResourceClaims made by the PodGroup. This field is primarily used // by DRA-aware plugins. DRAAllocations []DraClaimAllocation } @@ -333,8 +340,8 @@ type Placement struct { // device allocations. These allocations are tentative and used by the scheduler's // AssumePlacement phase to simulate resource commitment. type DraClaimAllocation struct { - // ResourceClaimName is the name of the ResourceClaim within the PodSet's context - // that these allocations are intended to satisfy. + // ResourceClaimName is the name of the ResourceClaim within the PodGroup's + // context that these allocations are intended to satisfy. ResourceClaimName string // Allocation contains DRA AllocationResult structures, specifying devices @@ -354,10 +361,10 @@ type DraClaimAllocation struct { type PlacementGenerator interface { Name() string - // GeneratePlacements generates a list of potential Placements for the given PodGroup and PodSet. + // GeneratePlacements generates a list of potential Placements for the given PodGroup. // Each Placement represents a candidate set of resources (e.g., nodes matching a selector) - // and potential DRA allocations where the PodSet might be scheduled. - GeneratePlacements(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, parentPlacements []*Placement) ([]*Placement, *framework.Status) + // and potential DRA allocations where the PodGroup might be scheduled. 
+ GeneratePlacements(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, parentPlacements []*Placement) ([]*Placement, *framework.Status) } ``` @@ -371,21 +378,21 @@ type PlacementState interface { Name() string // AssumePlacement temporarily configures the scheduling context to evaluate the feasibility - // of the given Placement for the PodGroup and PodSet. - AssumePlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, placement *Placement) *framework.Status + // of the given Placement for the PodGroup. + AssumePlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, placement *Placement) *framework.Status // RevertPlacement reverts the temporary scheduling context changes made by AssumePlacement. // This should be called after the evaluation of a Placement is complete to restore // the scheduler's state and allow other Placements to be considered. - RevertPlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, placement *Placement) *framework.Status + RevertPlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, placement *Placement) *framework.Status } ``` **PlacementScorer:** Scores feasible placements to select the best one. ```go -// PodSetAssignment represents the assignment of pods to nodes within a PodSet for a specific Placement. -type PodSetAssignment struct { +// PodGroupAssignment represents the assignment of pods to nodes within a PodGroup for a specific Placement. +type PodGroupAssignment struct { // PodToNodeMap maps a Pod name (string) to a Node name (string). PodToNodeMap map[string]string } @@ -396,12 +403,12 @@ type PlacementScorer interface { // ScorePlacement calculates a score for a given Placement. This function is called in Phase 3 // (Placement Scoring and Selection) only for Placements that have been deemed feasible - // for all pods in the PodSet during Phase 2. 
The PodSetAssignment indicates the + // for all pods in the PodGroup during Phase 2. The PodGroupAssignment indicates the // node assigned to each pod within this Placement. The returned score is a float64, // with higher scores generally indicating more preferable Placements. // Plugins can implement various scoring strategies, such as bin packing to minimize // resource fragmentation. - ScorePlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, podSet *PodSetInfo, placement *Placement, podsAssignment *PodSetAssignment) (float64, *framework.Status) + ScorePlacement(ctx context.Context, state *framework.CycleState, podGroup *PodGroupInfo, placement *Placement, podsAssignment *PodGroupAssignment) (float64, *framework.Status) } ``` @@ -411,7 +418,7 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. #### Phase 1: Candidate Placement Generation -- **Input:** PodGroupInfo and PodSetInfo. +- **Input:** PodGroupInfo. - **Action:** Iterate over distinct values of the topology label (TAS) or available ResourceSlices (DRA). @@ -428,7 +435,7 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. 1. Call `AssumePlacement` (binds context to the specific node selector/DRA resources). - 2. Iterate through every pod in the PodSet. + 2. Iterate through every pod in the PodGroup. 3. Run standard Pod-level Filter and Score. @@ -439,9 +446,9 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. 6. Call `RevertPlacement`. - **Potential Optimization:** Pre-filtering can check aggregate resources - before running the full simulation. + requested by PodGroup Pods before running the full simulation. -- **Heterogeneous PodGroup Handling**: Sequential Processing will be used +- **Heterogeneous PodGroup Handling**: Sequential processing will be used initially. Pods are processed sequentially; if any fail, the placement is rejected. 
@@ -451,7 +458,9 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. - **Selection:** Select the Placement with the highest score. -- **Binding:** Proceed to bind pods to the assigned nodes and resources. +- **Binding:** Proceed to bind pods to the assigned nodes and resources using + pod-by-pod scheduling logic with each pod prebound to the selected node + by seting `nominatedNodeName` value. ### Scheduler Plugins @@ -470,10 +479,10 @@ Placements based on distinct values of the designated node label (TAS) . **PlacementBinPackingPlugin (New)** Implements `PlacementScorer`. Scores Placements to maximize utilization (tightest fit) and minimize fragmentation. -### Potential Future Extensions (Beta Candidates) +### Potential Future Extensions -The following features are out of scope for the initial Alpha implementation but -are considered for future releases (post-1.36): +The following features are out of scope for this KEP but are considered for +future separate KEPs improving and extending the proposed functionality: 1. **Prioritized Placement Scheduling:** Allowing a set of preferred placements with fallbacks (e.g., prefer Rack, fallback to Block). This would introduce @@ -494,6 +503,10 @@ are considered for future releases (post-1.36): define and alias topology levels, removing the need for users to know exact node label keys. +6. **Feasible Placements Limit:** Adding an option to provide a limit on the + number of feasible Placements which need to be found before moving to + Phase 3: Placement Scoring and Selection. 
+ ### Test Plan [ ] I/we understand the owners of the involved components may require updates to From 03b0c3277ef570b2fa7675cc31941a3eaba97b72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Wed, 17 Dec 2025 00:27:11 +0100 Subject: [PATCH 05/25] Move DRA constraints support to beta --- .../README.md | 89 +++++++++++-------- 1 file changed, 50 insertions(+), 39 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 529cdd7db0d7..4ce902754af0 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -102,11 +102,11 @@ Groups while maintaining compatibility with the scheduler's existing ecosystem. Distributed workloads, particularly those driving the current AI/ML era, often require high-bandwidth and low-latency communication between multiple pods to -function efficiently. While the [KEP-4671: Workload API] makes the first step -towards managing these applications as cohesive units, it primarily establishes -the API structure. For workloads sensitive to inter-pod communication, simply -grouping pods is insufficient; their physical placement within the cluster's -network topology is a decisive factor in their performance. +function efficiently. While the [KEP-4671: Workload API](https://kep.k8s.io/4671) +makes the first step towards managing these applications as cohesive units, it +primarily establishes the API structure. For workloads sensitive to inter-pod +communication, simply grouping pods is insufficient; their physical placement +within the cluster's network topology is a decisive factor in their performance. In this KEP, we propose an algorithm for topology-aware and DRA-aware scheduling that operates directly within the Kubernetes kube-scheduler. 
The core objective @@ -137,15 +137,16 @@ need for external dependencies. ### Goals -- To enhance kube-scheduler to perform topology-aware and DRA-aware scheduling - for multi-pod workloads, as defined by the Workload API (KEP-4671). +- To enhance kube-scheduler to be able to perform topology-aware and DRA-aware + scheduling for multi-pod workloads, as defined by the Workload API + ([KEP-4671](https://kep.k8s.io/4671)). - To optimize the placement of distributed workloads by co-locating pods based on network topology and DRA resource availability. - To introduce new extension points and phases within the Kubernetes scheduler framework to support the concept of "Placements" (candidate sets of nodes and DRA resources). - To define the required changes to the Workload API (KEP-4671) to support - scheduling constraints. + Topology scheduling constraints. - To leverage the scheduler's existing pod-level filtering and scoring logic within the evaluation of each Placement. - To provide a flexible framework extensible by plugins for various topology @@ -153,6 +154,10 @@ need for external dependencies. ### Non-Goals +- To define the required changes to the Workload API (KEP-4671) to support + ResourceClaims for DRA-aware workload scheduling. These changes will be + proposed in a separate KEP: + [KEP-5729: DRA: ResourceClaim Support for Workloads](https://github.com/kubernetes/enhancements/pull/5736) - To replace the functionality of external workload queueing and admission control systems like Kueue. This proposal focuses on the in-scheduler placement decision for a single Workload at a time. @@ -245,10 +250,6 @@ type PodGroupSchedulingConstraints struct { // TopologyConstraints specifies desired topological placements for all pods // within this PodGroup. TopologyConstraints []TopologyConstraint - - // DRAConstraints specifies constraints on how Dynamic Resources are allocated - // across the PodGroup. 
- DRAConstraints []DRAConstraint } // TopologyConstraint describes a desired topological colocation for all pods in the PodGroup. @@ -259,22 +260,15 @@ type TopologyConstraint struct { // Examples: "topology.kubernetes.io/rack" Level string } - -// DRAConstraint provides constraints on how specific DRA claims across the group should -// be fulfilled. -type DRAConstraint struct { - // ResourceClaimName specifies the name of a specific ResourceClaim - // within the PodGroup's pods that this constraint applies to. - ResourceClaimName *string - - // ResourceClaimTemplateName specifies the name of a ResourceClaimTemplate. - // This applies to all ResourceClaim instances generated from this template. - ResourceClaimTemplateName *string -} ``` -Note: For the initial alpha scope, only a single TopologyConstraint or -DRAConstraint will be supported. +The Workload API changes for DRA-aware scheduling, including the definition of +DRA constraints, are out of scope for the alpha version of this KEP. These changes +will be defined in a separate KEP: +[KEP-5729: DRA: ResourceClaim Support for Workloads](https://github.com/kubernetes/enhancements/pull/5736). + +Note: For the initial alpha scope, only a single TopologyConstraint will be +supported. ### Scheduling Framework Extensions @@ -421,7 +415,7 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. - **Input:** PodGroupInfo. - **Action:** Iterate over distinct values of the topology label (TAS) or - available ResourceSlices (DRA). + available Devices (DRA). - **Output:** A list of Placement objects. @@ -465,19 +459,36 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. ### Scheduler Plugins **TopologyPlacementPlugin (New)** Implements `PlacementGenerator`. Generates -Placements based on distinct values of the designated node label (TAS) . +Placements based on distinct values of the designated node label (TAS). 
-**DRAPlugin (Extension)** Extended to implement `PlacementGenerator` and -`PlacementState`. +**PlacementBinPackingPlugin (New)** Implements `PlacementScorer`. Scores +Placements to maximize utilization (tightest fit) and minimize fragmentation. -- **Generator:** Returns Placements derived from available ResourceSlices - satisfying shared claims. +**DRATestPlugin (New)** Implements `PlacementGenerator` and `PlacementState` +and is used only for testing the algorithm's support for DRA-aware scheduling. -- **State:** Temporarily assigns AllocationResults to ResourceClaims during - the Assume phase. +- **Generator:** Returns Placements derived from available Devices satisfying + claims shared by all Pods within a PodGroup. -**PlacementBinPackingPlugin (New)** Implements `PlacementScorer`. Scores -Placements to maximize utilization (tightest fit) and minimize fragmentation. +- **State:** Temporarily assigns AllocationResults to Devices during the + Assume phase. + +### Beta Extensions + +The beta version of this KEP will introduce full support for DRA-aware workload +scheduling. This enhancement will enable the scheduler to consider DRA claims +defined by users when making placement decisions, ensuring that workloads are +placed on nodes that can satisfy their resource requirements. This will be +achieved by using the API to be defined in +[KEP-5729: DRA: ResourceClaim Support for Workloads](https://github.com/kubernetes/enhancements/pull/5736). + +The implementation will build upon the extension points introduced in the +alpha version of this feature and the `DRATestPlugin` implementation. +Specifically, the `DRAPlugin` will be enhanced to generate placements based +on the ResourceClaim objects associated with the PodGroup. The plugin will +interact with the DRA framework to ensure that the selected placement can +satisfy the resource requirements of the workload, as expressed in its +ResourceClaim. 
### Potential Future Extensions @@ -526,15 +537,15 @@ necessary to implement this enhancement. - Algorithm Logic: Test the sequential processing of Placements and the selection logic based on scores. -- DRA Integration: specific tests for DRAConstraint resolution. +- DRA Integration: specific tests for DRATestPlugin plugin. #### Integration tests - Topology Awareness: Verify that pods with TopologyConstraint are correctly co-located on nodes sharing the label. -- DRA Awareness: Verify that pods with DRAConstraint are bound to shared - ResourceSlices. +- DRA Awareness: Verify that pods with shared ResourceClaims are bound to shared + Devices. - Infeasibility: Verify that Workloads remain pending if no Placement satisfies the constraints. From 2e35c86a27419f9b9b4cf4e3876948534c3de02f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Wed, 17 Dec 2025 00:45:48 +0100 Subject: [PATCH 06/25] Fix TOC --- .../5732-topology-aware-workload-scheduling/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 4ce902754af0..738e64ba39fc 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -22,7 +22,8 @@ - [Phase 2: Pod-Level Filtering and Feasibility Check](#phase-2-pod-level-filtering-and-feasibility-check) - [Phase 3: Placement Scoring and Selection](#phase-3-placement-scoring-and-selection) - [Scheduler Plugins](#scheduler-plugins) - - [Potential Future Extensions (Beta Candidates)](#potential-future-extensions-beta-candidates) + - [Beta Extensions](#beta-extensions) + - [Potential Future Extensions](#potential-future-extensions) - [Test Plan](#test-plan) - [Prerequisite testing updates](#prerequisite-testing-updates) - [Unit tests](#unit-tests) From 
1d62a849571e92747f7a0ea902805747a16e48d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Mon, 22 Dec 2025 10:58:33 +0100 Subject: [PATCH 07/25] Smaller fixed based on review feedback. --- .../5732-topology-aware-workload-scheduling/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 738e64ba39fc..78b47c959dc6 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -455,7 +455,7 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. - **Binding:** Proceed to bind pods to the assigned nodes and resources using pod-by-pod scheduling logic with each pod prebound to the selected node - by seting `nominatedNodeName` value. + by setting `nominatedNodeName` value. ### Scheduler Plugins @@ -507,7 +507,7 @@ future separate KEPs improving and extending the proposed functionality: Block -> Rack). This would involve iterative placement generation and a Parent field in the Placement struct. -4. **Pod Group Replicas Support:** Optimizing scheduling for identical +4. **Pod Group Replicas Optimization:** Optimizing scheduling for identical PodGroups (replicas) by scheduling the maximum feasible number of replicas within a single placement pass. @@ -565,11 +565,12 @@ necessary to implement this enhancement. - Feature implemented behind a feature flag. - PodGroupSchedulingConstraints API defined. -- Basic topology (Node Label) and DRA constraints working. +- Basic topology (Node Label) working. - Initial unit and integration tests. #### Beta +- DRA constraints working. - Support for "Potential Future Extensions" (Prioritized placement, etc.) evaluated. - Scalability tests on large clusters with high placement counts. 
From c8cf6578faebd932f5315caa528d5cee5315edc9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?=
Date: Mon, 22 Dec 2025 11:22:16 +0100
Subject: [PATCH 08/25] Update Explicit Topology Definition description.

---
 .../5732-topology-aware-workload-scheduling/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md
index 78b47c959dc6..2664e398f385 100644
--- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md
+++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md
@@ -513,7 +513,7 @@ future separate KEPs improving and extending the proposed functionality:
 5. **Explicit Topology Definition:** Using a Custom Resource (NodeTopology) to
    define and alias topology levels, removing the need for users to know exact
-   node label keys.
+   node label keys and opening additional optimization and validation options.
 
 6. **Feasible Placements Limit:** Adding an option to provide a limit on the
    number of feasible Placements which need to be found before moving to
    Phase 3: Placement Scoring and Selection.

From 0173704bfab9f4e7039f872224343b83cd6a5cea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?=
Date: Thu, 8 Jan 2026 10:34:00 +0100
Subject: [PATCH 09/25] Added Plugin suffix to PlacementGenerator, PlacementState and PlacementScorer

---
 .../README.md | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md
index 2664e398f385..6df7f5454af9 100644
--- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md
+++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md
@@ -95,8 
This design introduces specific extensions to the Kubernetes Workload API to support `TopologyConstraints` and `DRAConstraints`, defines new interfaces -within the Scheduling Framework (`PlacementGenerator`, `PlacementState`, -`PlacementScorer`), and details the algorithmic flow required to schedule Pod +within the Scheduling Framework (`PlacementGeneratorPlugin`, `PlacementStatePlugin`, +`PlacementScorerPlugin`), and details the algorithmic flow required to schedule Pod Groups while maintaining compatibility with the scheduler's existing ecosystem. ## Motivation @@ -349,11 +349,11 @@ type DraClaimAllocation struct { #### 2. New Plugin Interfaces -**PlacementGenerator:** Generates candidate placements based on constraints. +**PlacementGeneratorPlugin:** Generates candidate placements based on constraints. ```go -// PlacementGenerator is an interface for plugins that generate candidate Placements. -type PlacementGenerator interface { +// PlacementGeneratorPlugin is an interface for plugins that generate candidate Placements. +type PlacementGeneratorPlugin interface { Name() string // GeneratePlacements generates a list of potential Placements for the given PodGroup. @@ -363,13 +363,13 @@ type PlacementGenerator interface { } ``` -**PlacementState:** Manages state changes (simulating binding) during +**PlacementStatePlugin:** Manages state changes (simulating binding) during feasibility checks. ```go -// PlacementState is an interface for plugins that manage state changes +// PlacementStatePlugin is an interface for plugins that manage state changes // when a Placement is being considered. -type PlacementState interface { +type PlacementStatePlugin interface { Name() string // AssumePlacement temporarily configures the scheduling context to evaluate the feasibility @@ -383,7 +383,7 @@ type PlacementState interface { } ``` -**PlacementScorer:** Scores feasible placements to select the best one. 
+**PlacementScorerPlugin:** Scores feasible placements to select the best one. ```go // PodGroupAssignment represents the assignment of pods to nodes within a PodGroup for a specific Placement. @@ -392,8 +392,8 @@ type PodGroupAssignment struct { PodToNodeMap map[string]string } -// PlacementScorer is an interface for plugins that score feasible Placements. -type PlacementScorer interface { +// PlacementScorerPlugin is an interface for plugins that score feasible Placements. +type PlacementScorerPlugin interface { Name() string // ScorePlacement calculates a score for a given Placement. This function is called in Phase 3 @@ -459,13 +459,13 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. ### Scheduler Plugins -**TopologyPlacementPlugin (New)** Implements `PlacementGenerator`. Generates +**TopologyPlacementPlugin (New)** Implements `PlacementGeneratorPlugin`. Generates Placements based on distinct values of the designated node label (TAS). -**PlacementBinPackingPlugin (New)** Implements `PlacementScorer`. Scores +**PlacementBinPackingPlugin (New)** Implements `PlacementScorerPlugin`. Scores Placements to maximize utilization (tightest fit) and minimize fragmentation. -**DRATestPlugin (New)** Implements `PlacementGenerator` and `PlacementState` +**DRATestPlugin (New)** Implements `PlacementGeneratorPlugin` and `PlacementStatePlugin` and is used only for testing the algorithm's support for DRA-aware scheduling. - **Generator:** Returns Placements derived from available Devices satisfying @@ -529,10 +529,10 @@ necessary to implement this enhancement. #### Unit tests -- PlacementGenerator: Test generation of placements for various topology +- PlacementGeneratorPlugin: Test generation of placements for various topology labels and DRA ResourceSlices. -- PlacementState: Verify AssumePlacement and RevertPlacement correctly modify +- PlacementStatePlugin: Verify AssumePlacement and RevertPlacement correctly modify and restore the CycleState. 
- Algorithm Logic: Test the sequential processing of Placements and the From f59db7c6bcaee9c5c608905887c9e93c1cc71ff4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Fri, 16 Jan 2026 14:42:17 +0100 Subject: [PATCH 10/25] Updating README.md based on the comments - Added requirement to PlacementGeneratorPlugin to implement EnqueueExtensions - Added information about PlacementGeneratorPlugins to be called after PreFilter scheduling phase. - Changed NodeAffinity to NodeSelector in Placement struc --- .../README.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 6df7f5454af9..11f47255e8a2 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -316,14 +316,14 @@ type PodSetInfo struct { // Placement is valid only in the context of a given PodGroup for a single cycle of // workload scheduling. type Placement struct { - // NodeAffinity specifies the node constraints for this Placement. + // NodeSelector specifies the node constraints for this Placement. // For Topology this is derived from topology labels (e.g., all nodes with label // 'topology-rack: rack-1'). - // For DRA, this Affinity would be constructed based on nodeSelector from + // For DRA, this selector would be constructed based on nodeSelector from // DRA's AllocationResult from DRAAllocations. // All pods within the PodGroup, when being evaluated against this Placement, - // are restricted to the nodes matching this NodeAffinity. - NodeAffinity *corev1.NodeAffinity + // are restricted to the nodes matching this NodeSelector. + NodeSelector *corev1.NodeSelector // DRAAllocations details the proposed DRA resource assignments for // the ResourceClaims made by the PodGroup. 
This field is primarily used @@ -353,6 +353,8 @@ type DraClaimAllocation struct { ```go // PlacementGeneratorPlugin is an interface for plugins that generate candidate Placements. +// Plugins implemeting PlacementGeneratorPlugin interface should also implement +// EnqueueExtensions interface. type PlacementGeneratorPlugin interface { Name() string @@ -420,6 +422,9 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. - **Output:** A list of Placement objects. +- Placement generation is executed after PreFilter giving PlacementGeneratorPlugins + a chance to get the list of nodes in the cluster. + - Example: If the label is rack, placements are generated for rack-1, rack-2, etc. From 045349e8b7f690ce5dd42d137933d0c1b4f052df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 10:00:35 +0100 Subject: [PATCH 11/25] Add prod readiness file --- keps/prod-readiness/sig-scheduling/5732.yaml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 keps/prod-readiness/sig-scheduling/5732.yaml diff --git a/keps/prod-readiness/sig-scheduling/5732.yaml b/keps/prod-readiness/sig-scheduling/5732.yaml new file mode 100644 index 000000000000..2afe6ab584e8 --- /dev/null +++ b/keps/prod-readiness/sig-scheduling/5732.yaml @@ -0,0 +1,3 @@ +kep-number: 5732 +alpha: + approver: "@wojtek-t" From f576836d479f1a548809c0dc1936bcf437493c9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 10:27:19 +0100 Subject: [PATCH 12/25] Production Readiness Review Questionnaire --- .../README.md | 162 ++++-------------- 1 file changed, 34 insertions(+), 128 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 11f47255e8a2..8095bf650417 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ 
b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -620,49 +620,13 @@ kube-scheduler instance being a leader). ## Production Readiness Review Questionnaire - - ### Feature Enablement and Rollback - - ###### How can this feature be enabled / disabled in a live cluster? - - -- [ ] Feature gate (also fill in values in `kep.yaml`) - - Feature gate name: - - Components depending on the feature gate: +- [X] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: TopologyAwareWorkloadScheduling + - Components depending on the feature gate: kube-apiserver, kube-scheduler - [ ] Other - Describe the mechanism: - Will enabling / disabling the feature require downtime of the control @@ -672,40 +636,30 @@ well as the [existing list] of feature gates. ###### Does enabling the feature change any default behavior? - +No - even with a feature enabled scheduler by default will use existing scheduling +algorithm to scheudle worklaods. Only when workload will have an explicit topology +constraint set an alternative algorithm will be used. ###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? - +The new API changes can also be disabled by disabling the feature gate in kube-apiserver. +However that doesn't result in clearing the new fields for workloads that already have +them set in the storage. ###### What happens if we reenable the feature if it was previously rolled back? +The feature starts working again. + ###### Are there any tests for feature enablement/disablement? - +The scheduler algorithm changes are purely in-memory and doesn't require any dedicated +enablement/disablement tests - the logic will be covered by regular feature tests. + +For the newly introduced API fields, dedicated enablement/disablement tests at the +kube-apiserver registry layer will be added in Alpha. 
### Rollout, Upgrade and Rollback Planning @@ -844,91 +798,43 @@ and creating new ones, as well as about cluster-level services (e.g. DNS): ### Scalability - - ###### Will enabling / using this feature result in any new API calls? - +No. ###### Will enabling / using this feature result in introducing new API types? - +No. ###### Will enabling / using this feature result in any new calls to the cloud provider? - +No. ###### Will enabling / using this feature result in increasing size or count of the existing API objects? - +Using this feature will require setting topology constraint on Workload object. +The related increase in size of the Workload object should however be negligible. ###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? - +We will measure the exact impact using performance benchmarks and scalability tests and +update the section based on the results. The complexity of scheudling of a single worklaod +is O(#pods * #nodes), which is comparable to the algorithm not using topology constraints, +so the benchmarks are primarily to validate the potential inefficiencies of the implementation. ###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? - +For large clusters and fine grained toplogy constraints we may observe some increase in CPU +and RAM usage for kube-scheduler. The exact scale of this increase will be confirmed by +scalability tests. ###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? - +No. 
### Troubleshooting From 6b5b3156ca76d47dc9ebb85487b96d9490435eb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 11:08:43 +0100 Subject: [PATCH 13/25] Fixed spelling errors --- .../5732-topology-aware-workload-scheduling/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 8095bf650417..0ea97e3eb83e 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -518,7 +518,7 @@ future separate KEPs improving and extending the proposed functionality: 5. **Explicit Topology Definition:** Using a Custom Resource (NodeTopology) to define and alias topology levels, removing the need for users to know exact - node label keys and opening addtional optimization and validation options. + node label keys and opening additional optimization and validation options. 6. **Feasible Placements Limit:** Adding an option to provide a limit on the number of feasible Placements which need to be found before moving to @@ -822,7 +822,7 @@ latency / Pod Startup SLO may potentially increase especially for large clusters fine grained topology constraints. We will measure the exact impact using performance benchmarks and scalability tests and -update the section based on the results. The complexity of scheudling of a single worklaod +update the section based on the results. The complexity of scheuduling of a single worklaod is O(#pods * #nodes), which is comparable to the algorithm not using topology constraints, so the benchmarks are primarily to validate the potential inefficiencies of the implementation. 
From 290490f22a83e3456be4ad0eed34b6a64af164d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 20:16:53 +0100 Subject: [PATCH 14/25] Update kep.yaml --- .../5732-topology-aware-workload-scheduling/kep.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml index 912641c56da0..0253ec6a85c9 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml @@ -38,6 +38,10 @@ milestone: # List the feature gate name and the components for which it must be enabled feature-gates: - name: TopologyAwareWorkloadScheduling + components: + - kube-apiserver + - kube-scheduler + - name: WorkloadBasicPolicyDesiredCount components: - kube-apiserver - kube-scheduler From 0ffef879c47faee98128d622a549be3cdf9ffd1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 20:31:03 +0100 Subject: [PATCH 15/25] Extend KEP with desiredCount. --- .../README.md | 44 ++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 0ea97e3eb83e..56c1b6131080 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -271,6 +271,39 @@ will be defined in a separate KEP: Note: For the initial alpha scope, only a single TopologyConstraint will be supported. +#### Basic Policy Extension + +In the first alpha version of the Workload API, the `Basic` policy was a no-op. +We propose extending the `Basic` policy to accept a `desiredCount` field. 
+This feature will be gated behind a separate feature gate +(`WorkloadBasicPolicyDesiredCount`) to decouple it from the core Gang Scheduling +and Topology Aware Scheduling features. + +```go +// BasicSchedulingPolicy indicates that standard Kubernetes +// scheduling behavior should be used. +type BasicSchedulingPolicy struct { + // DesiredCount is the expected number of pods that will belong to this + // PodGroup. This field is a hint to the scheduler to help it make better + // placement decisions for the group as a whole. + // + // Unlike gang's minCount, this field does not block scheduling. If the number + // of available pods is less than desiredCount, the scheduler can still attempt + // to schedule the available pods, but will optimistically try to select a + // placement that can accommodate the future pods. + // + // +optional + DesiredCount *int32 +} +``` + +This field allows users to express their "true" workloads more easily and enables +the scheduler to optimize the placement of such pod groups by taking the desired state +into account. Ideally, the scheduler should prefer placements that can accommodate +the full `desiredCount`, even if not all pods are created yet. When `desiredCount` +is specified, the scheduler can delay scheduling the first Pod it sees for a short +amount of time in order to wait for more Pods to be observed. + ### Scheduling Framework Extensions The scheduler framework requires new plugin interfaces to handle "Placements". A @@ -470,6 +503,9 @@ Placements based on distinct values of the designated node label (TAS). **PlacementBinPackingPlugin (New)** Implements `PlacementScorerPlugin`. Scores Placements to maximize utilization (tightest fit) and minimize fragmentation. +**PlacementPodCountScorerPlugin (New)** Implements `PlacementScorerPlugin`. Scores +Placements based on the number of pods fiting into each Placement. 
+ **DRATestPlugin (New)** Implements `PlacementGeneratorPlugin` and `PlacementStatePlugin` and is used only for testing the algorithm's support for DRA-aware scheduling. @@ -626,7 +662,13 @@ kube-scheduler instance being a leader). - [X] Feature gate (also fill in values in `kep.yaml`) - Feature gate name: TopologyAwareWorkloadScheduling - - Components depending on the feature gate: kube-apiserver, kube-scheduler + - Components depending on the feature gate: + - kube-apiserver + - kube-scheduler + - Feature gate name: WorkloadBasicPolicyDesiredCount + - Components depending on the feature gate: + - kube-apiserver + - kube-scheduler - [ ] Other - Describe the mechanism: - Will enabling / disabling the feature require downtime of the control From 02f70fac6e8b331156b3509f252602d5a26559ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 20:59:50 +0100 Subject: [PATCH 16/25] Address comments from dom4ha --- .../README.md | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 56c1b6131080..c5dba9a890f7 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -84,10 +84,11 @@ integrating a Topology-Aware and DRA-Aware workload scheduling algorithm into the Kubernetes kube-scheduler to address the complex placement requirements of modern, high-performance distributed applications. -The proposed algorithm fundamentally alters the scheduling lifecycle for gang -scheduled workloads. Instead of evaluating pods individually against the cluster -state - a process prone to fragmentation and deadlocks - the new mechanism -generates "Placements". 
These Placements represent candidate domains (sets of +The proposed topology algorithm leverages the workload-oriented scheduling +lifecycle introduced in KEP-4671, rather than fundamentally altering the scheduling +loop itself. It extends this foundation by enabling the evaluation of scheduling +options within specific "Placements" (subsets of the cluster). These Placements +represent candidate domains (sets of nodes or DRA resources) where the entire workload is theoretically feasible. The scheduler then simulates the placement of the full group of pods within these domains, utilizing existing filtering and scoring logic to ensure high-fidelity @@ -468,15 +469,11 @@ The algorithm proceeds in three main phases for a given Workload/PodGroup. 1. Call `AssumePlacement` (binds context to the specific node selector/DRA resources). - 2. Iterate through every pod in the PodGroup. + 2. Run default workload scheduling algorithm with the given context. - 3. Run standard Pod-level Filter and Score. + 3. If all pods fit, the Placement is marked Feasible. - 4. Use internal logic to simulate placing the pod on a node. - - 5. If all pods fit, the Placement is marked Feasible. - - 6. Call `RevertPlacement`. + 4. Call `RevertPlacement`. - **Potential Optimization:** Pre-filtering can check aggregate resources requested by PodGroup Pods before running the full simulation. 
From 85fdeded08cb087331138d8729ef9e6621834659 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 21:15:02 +0100 Subject: [PATCH 17/25] Update README.md --- .../5732-topology-aware-workload-scheduling/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index c5dba9a890f7..ac9e94696db9 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -186,7 +186,7 @@ We support two fundamental types of constraints: PodGroup are placed onto nodes sharing a common topological characteristic (e.g., same rack), defined by a specific node label. -2. **DRA Constraint (Shared Dynamic Resource Allocation)**: Ensures all pods in a +2. **raint (Shared Dynamic Resource Allocation)**: Ensures all pods in a PodGroup bind to a single DRA claim fulfilled from a single, shared, co-located resource (e.g., interconnected network interfaces or accelerators). @@ -608,7 +608,6 @@ necessary to implement this enhancement. #### Beta -- DRA constraints working. - Support for "Potential Future Extensions" (Prioritized placement, etc.) evaluated. - Scalability tests on large clusters with high placement counts. 
From 1bd676dfaebbe05329c4e6fec9e83cd9b3e20131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Tue, 3 Feb 2026 21:16:44 +0100 Subject: [PATCH 18/25] Update README.md --- .../5732-topology-aware-workload-scheduling/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index ac9e94696db9..4ce3eebfd05e 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -186,7 +186,7 @@ We support two fundamental types of constraints: PodGroup are placed onto nodes sharing a common topological characteristic (e.g., same rack), defined by a specific node label. -2. **raint (Shared Dynamic Resource Allocation)**: Ensures all pods in a +2. **DRA Constraint (Shared Dynamic Resource Allocation)**: Ensures all pods in a PodGroup bind to a single DRA claim fulfilled from a single, shared, co-located resource (e.g., interconnected network interfaces or accelerators). 
From e3c67b1a7e5dabd1936d9f94a2421890d3ec2958 Mon Sep 17 00:00:00 2001 From: Pawel Kepka Date: Wed, 4 Feb 2026 10:51:56 +0000 Subject: [PATCH 19/25] Fixed Toc --- .../5732-topology-aware-workload-scheduling/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 4ce3eebfd05e..e75ab95c44d5 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -14,6 +14,7 @@ - [Risks and Mitigations](#risks-and-mitigations) - [Design Details](#design-details) - [Workload API Changes](#workload-api-changes) + - [Basic Policy Extension](#basic-policy-extension) - [Scheduling Framework Extensions](#scheduling-framework-extensions) - [1. Data Structures](#1-data-structures) - [2. New Plugin Interfaces](#2-new-plugin-interfaces) From 178e03952565fb7887e958d5fed7dde71dc95f3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Thu, 5 Feb 2026 09:55:47 +0100 Subject: [PATCH 20/25] Add desiredCount to Gang policy --- .../README.md | 47 +++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index e75ab95c44d5..5ef9f77aea61 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -14,7 +14,7 @@ - [Risks and Mitigations](#risks-and-mitigations) - [Design Details](#design-details) - [Workload API Changes](#workload-api-changes) - - [Basic Policy Extension](#basic-policy-extension) + - [Basic and Gang Policy Extension](#basic-and-gang-policy-extension) - [Scheduling Framework Extensions](#scheduling-framework-extensions) - [1. 
Data Structures](#1-data-structures) - [2. New Plugin Interfaces](#2-new-plugin-interfaces) @@ -273,14 +273,22 @@ will be defined in a separate KEP: Note: For the initial alpha scope, only a single TopologyConstraint will be supported. -#### Basic Policy Extension +#### Basic and Gang Policy Extension In the first alpha version of the Workload API, the `Basic` policy was a no-op. -We propose extending the `Basic` policy to accept a `desiredCount` field. +We propose extending the `Basic` and `Gang` policies to accept a `desiredCount` +field. This field serves as a scheduler hint to improve placement decisions +without imposing hard scheduling constraints. + This feature will be gated behind a separate feature gate (`WorkloadBasicPolicyDesiredCount`) to decouple it from the core Gang Scheduling and Topology Aware Scheduling features. +**1. Basic Policy Update** + +We introduce `desiredCount` to the `Basic` policy to allow users to signal the +expected group size for optimization purposes. + ```go // BasicSchedulingPolicy indicates that standard Kubernetes // scheduling behavior should be used. @@ -299,7 +307,38 @@ type BasicSchedulingPolicy struct { } ``` -This field allows users to express their "true" workloads more easily and enables +**2. Gang Policy Update** + +We similarly extend the `Gang` policy. While `minCount` provides a hard constraint +for admission, `desiredCount` provides a soft target for placement optimization. + +```go +// GangSchedulingPolicy defines the parameters for gang scheduling. +type GangSchedulingPolicy struct { + // MinCount is the minimum number of pods that must be schedulable or scheduled + // at the same time for the scheduler to admit the entire group. + // It must be a positive integer. + // + // +required + MinCount int32 + + // DesiredCount is the expected number of pods that will belong to this + // PodGroup. This field is a hint to the scheduler to help it make better + // placement decisions for the group as a whole. 
+ // + // Unlike gang's minCount, this field does not block scheduling. If the number + // of available pods is less than desiredCount but at least minCount, the scheduler + // can still attempt to schedule the available pods, but will optimistically try + // to select a placement that can accommodate the future pods. + // + // When provided desiredCount must be greater or equal to minCount. + // + // +optional + DesiredCount *int32 +} +``` + +Those fields allow users to express their "true" workloads more easily and enables the scheduler to optimize the placement of such pod groups by taking the desired state into account. Ideally, the scheduler should prefer placements that can accommodate the full `desiredCount`, even if not all pods are created yet. When `desiredCount` From f133ab46b62d19e23ff66206f84f5947f7f060f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Thu, 5 Feb 2026 12:34:14 +0100 Subject: [PATCH 21/25] Added cluster autoscaling support as requirement for beta --- .../5732-topology-aware-workload-scheduling/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 5ef9f77aea61..b09f59ff3e91 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -652,6 +652,7 @@ necessary to implement this enhancement. evaluated. - Scalability tests on large clusters with high placement counts. - Comprehensive e2e testing. +- Cluster autoscaling compomnents are aware of workload topology constraints. 
### Upgrade / Downgrade Strategy From d157ceaf3ae08f9e67d4d18bd0254e39ea0ba8b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Thu, 5 Feb 2026 12:39:22 +0100 Subject: [PATCH 22/25] Fix phrasing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Dominik Marciński --- .../5732-topology-aware-workload-scheduling/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index b09f59ff3e91..13d1ffd7c3c3 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -192,8 +192,7 @@ We support two fundamental types of constraints: co-located resource (e.g., interconnected network interfaces or accelerators). -The scheduler is extended to interpret these constraints and find a "Placement" -(a subset of nodes and DRA resources) that satisfies them. +The scheduler is extended to interpret these new PodGroup level scheduling constraints and similarly to scheduling pods on nodes (available scheduling options), find a "Placement" for this PodGroup among the feasible options (subsets of nodes and DRA resources) that satisfies them. 
### User Stories (Optional) From 4bf21d2dea2ef7305d12dd8e38aa47a640b55848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Thu, 5 Feb 2026 12:40:22 +0100 Subject: [PATCH 23/25] Fix phrasing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Dominik Marciński --- .../5732-topology-aware-workload-scheduling/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index 13d1ffd7c3c3..b15db07f089c 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -217,7 +217,7 @@ workload's pods to them. ### Risks and Mitigations - **Scheduling Latency:** Evaluating multiple placements involves running - filter/score plugins multiple times. + filter/score plugins multiple times (multiple attempts to schedule a PodGroup considering all topology options). - **Mitigation:** Implement pre-filtering optimizations to reject infeasible placements early based on aggregate resource availability. From 3f6fffb0996ba2ce165daf7e14af8ceec9e2a902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Thu, 5 Feb 2026 12:49:26 +0100 Subject: [PATCH 24/25] Updates from review. 
--- .../5732-topology-aware-workload-scheduling/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md index b15db07f089c..2dee3d6b43ae 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/README.md @@ -243,7 +243,7 @@ type PodGroup struct { Name *string // SchedulingConstraints defines group-level scheduling requirements, - // including topology and DRA colocation. + // including topology. SchedulingConstraints *PodGroupSchedulingConstraints } @@ -280,7 +280,7 @@ field. This field serves as a scheduler hint to improve placement decisions without imposing hard scheduling constraints. This feature will be gated behind a separate feature gate -(`WorkloadBasicPolicyDesiredCount`) to decouple it from the core Gang Scheduling +(`PodGroupDesiredCount`) to decouple it from the core Gang Scheduling and Topology Aware Scheduling features. **1. Basic Policy Update** @@ -701,7 +701,7 @@ kube-scheduler instance being a leader). - Components depending on the feature gate: - kube-apiserver - kube-scheduler - - Feature gate name: WorkloadBasicPolicyDesiredCount + - Feature gate name: PodGroupDesiredCount - Components depending on the feature gate: - kube-apiserver - kube-scheduler From 7113f4be83cc285476cb1bb4bfce9c1447c68e6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99pka?= Date: Thu, 5 Feb 2026 12:49:52 +0100 Subject: [PATCH 25/25] Updates from review. 
--- .../5732-topology-aware-workload-scheduling/kep.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml index 0253ec6a85c9..99bc28c58277 100644 --- a/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml +++ b/keps/sig-scheduling/5732-topology-aware-workload-scheduling/kep.yaml @@ -41,7 +41,7 @@ feature-gates: components: - kube-apiserver - kube-scheduler - - name: WorkloadBasicPolicyDesiredCount + - name: PodGroupDesiredCount components: - kube-apiserver - kube-scheduler