From eb2eda936297c0255eaf36600d1cfd78990bfc96 Mon Sep 17 00:00:00 2001
From: Swati Sehgal
Date: Fri, 31 Jan 2025 18:04:39 +0000
Subject: [PATCH 1/4] node: KEP-2902: Updates for Beta graduation

Signed-off-by: Swati Sehgal
---
 keps/prod-readiness/sig-node/2902.yaml |  2 ++
 .../README.md                          |  9 ++++++---
 .../kep.yaml                           | 19 ++++++++++++++-----
 3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/keps/prod-readiness/sig-node/2902.yaml b/keps/prod-readiness/sig-node/2902.yaml
index 7ee7c25a5e3..2c78d0a538c 100644
--- a/keps/prod-readiness/sig-node/2902.yaml
+++ b/keps/prod-readiness/sig-node/2902.yaml
@@ -1,3 +1,5 @@
 kep-number: 2902
 alpha:
   approver: "@johnbelamaric"
+beta:
+  approver: "@johnbelamaric"
diff --git a/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md b/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md
index b81c1c657ea..1831aaf6ee0 100644
--- a/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md
+++ b/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md
@@ -75,7 +75,7 @@ When enabled, this will trigger the `CPUManager` to evenly distribute CPUs acros
 ### Risks and Mitigations
 
 The risks of adding this new feature are quite low.
-It is isolated to a specific policy option within the `CPUManager`, and is protected both by the option itself, as well as the `CPUManagerPolicyAlphaOptions` feature gate (which is disabled by default).
+It is isolated to a specific policy option within the `CPUManager`, and is protected both by the option itself and by the `CPUManagerPolicyBetaOptions` feature gate (which is enabled by default).
 
 | Risk                                              | Impact | Mitigation |
 | -------------------------------------------------| -------| ---------- |
@@ -149,7 +149,9 @@ No changes needed
 ###### How can this feature be enabled / disabled in a live cluster?
 
 - [X] Feature gate (also fill in values in `kep.yaml`)
+  - Feature gate name: `CPUManagerPolicyOptions`
   - Feature gate name: `CPUManagerPolicyAlphaOptions`
+  - Feature gate name: `CPUManagerPolicyBetaOptions`
   - Components depending on the feature gate: `kubelet`
 - [X] Change the kubelet configuration to set a `CPUManager` policy of `static` and a `CPUManager` policy option of `distribute-cpus-across-numa`
 - Will enabling / disabling the feature require downtime of the control
@@ -161,14 +163,14 @@ No changes needed
 ###### Does enabling the feature change any default behavior?
 
 No. In order to trigger any of the new logic, three things have to be true:
-1. The `CPUManagerPolicyAlphaOptions` feature gate must be enabled
+1. The `CPUManagerPolicyBetaOptions` feature gate must be enabled
 1. The `static` `CPUManager` policy must be selected
 1. The new `distribute-cpus-across-numa` policy option must be selected
 
 ###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)?
 
 Yes, the feature can be disabled by either:
-1. Disabling the `CPUManagerPolicyAlphaOptions` feature gate
+1. Disabling the `CPUManagerPolicyBetaOptions` feature gate
 1. Switching the `CPUManager` policy to `none`
 1. Removing `distribute-cpus-across-numa` from the list of `CPUManager` policy options
@@ -255,3 +257,4 @@ No, the algorithm will run on a single `goroutine` with minimal memory requireme
 - 2021-08-30: Updates to fill out more sections, answer PRR questions
 - 2021-09-08: Change feature gate from `CPUManagerPolicyOptions` to `CPUManagerPolicyExperimentalOptions`
 - 2021-10-11: Change feature gate from `CPUManagerPolicyExperimentalOptions` to `CPUManagerPolicyAlphaOptions`
+- 2025-01-30: KEP update for Beta graduation of the policy option
diff --git a/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/kep.yaml b/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/kep.yaml
index a2d0979b348..159afd99761 100644
--- a/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/kep.yaml
+++ b/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/kep.yaml
@@ -2,36 +2,45 @@ title: CPUManager Policy Option to Distribute CPUs Across NUMA Nodes Instead of
 kep-number: 2902
 authors:
   - "@klueska"
+  - "@swatisehgal" # For Beta graduation
 owning-sig: sig-node
 participating-sigs: []
 status: implementable
 creation-date: "2021-08-26"
+last-updated: "2025-01-31"
 reviewers:
-  - "@fromani"
+  - "@ffromani"
 approvers:
   - "@sig-node-tech-leads"
 see-also:
   - "keps/sig-node/2625-cpumanager-policies-thread-placement"
+  - "keps/sig-node/3545-improved-multi-numa-alignment/"
+  - "keps/sig-node/4176-cpumanager-spread-cpus-preferred-policy/"
+  - "keps/sig-node/4540-strict-cpu-reservation"
+  - "keps/sig-node/4622-topologymanager-max-allowable-numa-nodes/"
+  - "keps/sig-node/4800-cpumanager-split-uncorecache/"
 replaces: []
 
 # The target maturity stage in the current dev cycle for this KEP.
-stage: alpha
+stage: beta
 
 # The most recent milestone for which work toward delivery of this KEP has been
 # done. This can be the current (upcoming) milestone, if it is being actively
 # worked on.
-latest-milestone: "v1.23"
+latest-milestone: "v1.33"
 
 # The milestone at which this feature was, or is targeted to be, at each stage.
 milestone:
   alpha: "v1.23"
-  beta: "v1.24"
-  stable: "v1.25"
+  beta: "v1.33"
+  stable: "v1.35"
 
 # The following PRR answers are required at alpha release
 # List the feature gate name and the components for which it must be enabled
 feature-gates:
+  - name: "CPUManagerPolicyOptions"
   - name: "CPUManagerPolicyAlphaOptions"
+  - name: "CPUManagerPolicyBetaOptions"
     components:
       - kubelet
 disable-supported: true

From 8f8608f1bc123dc3d663e6f84c5b55ccc81adcae Mon Sep 17 00:00:00 2001
From: Swati Sehgal
Date: Wed, 5 Feb 2025 11:21:50 +0000
Subject: [PATCH 2/4] node: KEP-2902: Update to the latest KEP template

Signed-off-by: Swati Sehgal
---
 .../README.md | 67 +++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md b/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md
index 1831aaf6ee0..cd6d324e738 100644
--- a/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md
+++ b/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md
@@ -10,6 +10,10 @@
   - [Risks and Mitigations](#risks-and-mitigations)
 - [Design Details](#design-details)
   - [Test Plan](#test-plan)
+    - [Prerequisite testing updates](#prerequisite-testing-updates)
+    - [Unit tests](#unit-tests)
+    - [Integration tests](#integration-tests)
+    - [e2e tests](#e2e-tests)
   - [Graduation Criteria](#graduation-criteria)
     - [Alpha](#alpha)
     - [Beta](#beta)
@@ -18,8 +22,11 @@
   - [Version Skew Strategy](#version-skew-strategy)
 - [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire)
   - [Feature Enablement and Rollback](#feature-enablement-and-rollback)
+  - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning)
   - [Monitoring Requirements](#monitoring-requirements)
+  - [Dependencies](#dependencies)
   - [Scalability](#scalability)
+  - [Troubleshooting](#troubleshooting)
 - [Implementation History](#implementation-history)
@@ -116,6 +123,28 @@ NOTE: The striping operation after all CPUs have been evenly distributed will be
 
 We will extend both the unit test suite and the E2E test suite to cover the new policy option described in this KEP.
 
+[x] I/we understand the owners of the involved components may require updates to
+existing tests to make this code solid enough prior to committing the changes necessary
+to implement this enhancement.
+
+##### Prerequisite testing updates
+
+##### Unit tests
+
+- `k8s.io/kubernetes/pkg/kubelet/cm/cpumanager`: `20250205` - 85.5% of statements
+
+##### Integration tests
+
+Not Applicable as Kubelet features don't have integration tests.
+
+##### e2e tests
+
+Currently no e2e tests are present for this particular policy option. E2E tests will be added as part of Beta graduation.
+
+The plan is to add e2e tests to cover the basic flows for the cases below:
+1. `distribute-cpus-across-numa` option is enabled: The test will ensure that the allocated CPUs are distributed across NUMA nodes according to the policy.
+1. `distribute-cpus-across-numa` option is disabled: The test will verify that the allocated CPUs are packed according to the default behavior.
+
 ### Graduation Criteria
 
 #### Alpha
@@ -184,6 +213,25 @@ No changes. Existing container will not see their allocation changed. New contai
 
 - A specific e2e test will demonstrate that the default behaviour is preserved when the feature gate is disabled, or when the feature is not used (2 separate tests)
 
+### Rollout, Upgrade and Rollback Planning
+
+###### How can a rollout or rollback fail? Can it impact already running workloads?
+
+- A rollout or rollback can fail if the feature gate or the policy option is misconfigured, in which case the kubelet fails to start.
+
+###### What specific metrics should inform a rollback?
+
+Not Applicable.
+
+###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?
+
+Not Applicable. This policy option only affects pods that meet certain conditions and are scheduled after the upgrade. Running pods will be unaffected
+by any change.
+
+###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.?
+
+No
+
 ### Monitoring Requirements
 
 ###### How can an operator determine if the feature is in use by workloads?
@@ -221,6 +269,12 @@ None
 
 This feature is `linux` specific, and requires a version of CRI that includes the `LinuxContainerResources.CpusetCpus` field. This has been available since `v1alpha2`.
 
+### Dependencies
+
+###### Does this feature depend on any specific services running in the cluster?
+
+No
+
 ### Scalability
 
 ###### Will enabling / using this feature result in any new API calls?
@@ -251,6 +305,18 @@ This delay should be minimal.
 
 No, the algorithm will run on a single `goroutine` with minimal memory requirements.
 
+### Troubleshooting
+
+###### How does this feature react if the API server and/or etcd is unavailable?
+
+No impact. The behavior of the feature does not change when API Server and/or etcd is unavailable since the feature is node local.
+
+###### What are other known failure modes?
+
+No known failure modes.
+
+###### What steps should be taken if SLOs are not being met to determine the problem?
+
 ## Implementation History
 
 - 2021-08-26: Initial KEP created
@@ -258,3 +324,4 @@ No, the algorithm will run on a single `goroutine` with minimal memory requireme
 - 2021-09-08: Change feature gate from `CPUManagerPolicyOptions` to `CPUManagerPolicyExperimentalOptions`
 - 2021-10-11: Change feature gate from `CPUManagerPolicyExperimentalOptions` to `CPUManagerPolicyAlphaOptions`
 - 2025-01-30: KEP update for Beta graduation of the policy option
+- 2025-02-05: KEP update to the latest template

From 0dea90496f010681111292a629ea95d94ce326ca Mon Sep 17 00:00:00 2001
From: Swati Sehgal
Date: Wed, 5 Feb 2025 14:30:46 +0000
Subject: [PATCH 3/4] node: KEP-2902: Capture plans of metric addition for Beta graduation

Signed-off-by: Swati Sehgal
---
 .../README.md | 10 ++++++++--
 .../kep.yaml  |  3 ++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md b/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md
index cd6d324e738..8af43b2277c 100644
--- a/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md
+++ b/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md
@@ -221,7 +221,8 @@ No changes. Existing container will not see their allocation changed. New contai
 
 ###### What specific metrics should inform a rollback?
 
-Not Applicable.
+As part of graduation of this feature, we plan to add a `cpu_manager_numa_allocation_spread` metric that shows how CPUs are distributed across NUMA nodes.
+An unexpected distribution reported by this metric can indicate that a rollback is needed.
 
 ###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?
@@ -238,6 +239,8 @@ No
 
 Inspect the kubelet configuration of a node -- check for the presence of the feature gate and usage of the new policy option.
 
+In addition to that, we can check the metric `cpu_manager_numa_allocation_spread` to determine how allocated CPUs are spread across NUMA nodes.
+
 ###### How can someone using this feature know that it is working for their instance?
 
 In order to verify this feature is working, one should:
@@ -251,6 +254,8 @@ To verify the list of CPUs allocated to the container, one can either:
 - `exec` into uthe container and run `taskset -cp 1` (assuming this command is available in the container).
 - Call the `GetCPUS()` method of the `CPUProvider` interface in the `kubelet`'s [podresources API](https://pkg.go.dev/k8s.io/kubernetes/pkg/kubelet/apis/podresources#CPUsProvider).
 
+Also, we can check `cpu_manager_numa_allocation_spread` metric.
+
 ###### What are the reasonable SLOs (Service Level Objectives) for the enhancement?
 
 There are no specific SLOs for this feature.
@@ -262,7 +267,8 @@ None
 
 ###### Are there any missing metrics that would be useful to have to improve observability of this feature?
 
-None
+Yes, as part of graduation of this feature to Beta, we plan to add a `cpu_manager_numa_allocation_spread` metric
+to provide data on how the CPUs are distributed across NUMA nodes.
 
 ###### Does this feature depend on any specific services running in the cluster?
diff --git a/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/kep.yaml b/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/kep.yaml
index 159afd99761..190a5f0c06d 100644
--- a/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/kep.yaml
+++ b/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/kep.yaml
@@ -46,4 +46,5 @@ feature-gates:
 disable-supported: true
 
 # The following PRR answers are required at beta release
-metrics: []
+metrics:
+  - cpu_manager_numa_allocation_spread

From 8d095f45f881fc778d2a25c14cd16df90a8f56f9 Mon Sep 17 00:00:00 2001
From: Swati Sehgal
Date: Fri, 7 Feb 2025 15:47:39 +0000
Subject: [PATCH 4/4] node: KEP-2902: Address review comments

Signed-off-by: Swati Sehgal
---
 .../README.md | 40 +++++++++++++++++--
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md b/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md
index 8af43b2277c..60956b7f271 100644
--- a/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md
+++ b/keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/README.md
@@ -9,6 +9,7 @@
 - [Proposal](#proposal)
   - [Risks and Mitigations](#risks-and-mitigations)
 - [Design Details](#design-details)
+  - [Compatibility with full-pcpus-only policy option](#compatibility-with-full-pcpus-only-policy-option)
   - [Test Plan](#test-plan)
     - [Prerequisite testing updates](#prerequisite-testing-updates)
     - [Unit tests](#unit-tests)
@@ -119,6 +120,12 @@ If none of the above conditions can be met, resort back to a best-effort fit of
 
 NOTE: The striping operation after all CPUs have been evenly distributed will be performed such that the overall disribution of CPUs across those NUMA nodes remains as balanced as possible.
 
+### Compatibility with `full-pcpus-only` policy option
+
+| Compatibility | alpha | beta | GA |
+| --- | --- | --- | --- |
+| full-pcpus-only | x | x | x |
+
 ### Test Plan
 
 We will extend both the unit test suite and the E2E test suite to cover the new policy option described in this KEP.
@@ -135,7 +142,7 @@ to implement this enhancement.
 
 ##### Integration tests
 
-Not Applicable as Kubelet features don't have integration tests.
+Not Applicable as Kubelet features don't have integration tests. We use a mix of `e2e_node` and `e2e` tests.
 
 ##### e2e tests
 
@@ -144,6 +151,7 @@ Currently no e2e tests are present for this particular policy option. E2E tests
 The plan is to add e2e tests to cover the basic flows for the cases below:
 1. `distribute-cpus-across-numa` option is enabled: The test will ensure that the allocated CPUs are distributed across NUMA nodes according to the policy.
 1. `distribute-cpus-across-numa` option is disabled: The test will verify that the allocated CPUs are packed according to the default behavior.
+1. Test how this option interacts with the `full-pcpus-only` policy option (with that option both enabled and disabled).
 
 ### Graduation Criteria
@@ -254,7 +262,30 @@ To verify the list of CPUs allocated to the container, one can either:
 - `exec` into uthe container and run `taskset -cp 1` (assuming this command is available in the container).
 - Call the `GetCPUS()` method of the `CPUProvider` interface in the `kubelet`'s [podresources API](https://pkg.go.dev/k8s.io/kubernetes/pkg/kubelet/apis/podresources#CPUsProvider).
 
-Also, we can check `cpu_manager_numa_allocation_spread` metric.
+Also, we can check the `cpu_manager_numa_allocation_spread` metric. We plan to add this metric to track how CPUs are distributed across NUMA nodes,
+with a label/bucket per NUMA node (numa_node=0, numa_node=1, ..., numa_node=N).
+
+With packed allocation (the default, with the option off), the distribution should be concentrated on a single NUMA node (numa_node=0 in the example below),
+with a small tail spilling onto the next NUMA node (and possibly further) in cases of severe fragmentation. Users can compare this spread metric with the `container_aligned_compute_resources_count` metric to determine
+if they are getting aligned packed allocation or just packed allocation due to implementation details.
+
+For example, if a node has 2 NUMA nodes and a pod requests 8 CPUs (with no other pods requesting exclusive CPUs on the node), the metric would look like this:
+
+cpu_manager_numa_allocation_spread{numa_node="0"} = 8
+cpu_manager_numa_allocation_spread{numa_node="1"} = 0
+
+
+When the option is enabled, we would expect a more even distribution of CPUs across NUMA nodes, with no sharp peaks as seen with packed allocation.
+Users can also check the `container_aligned_compute_resources_count` metric to assess resource alignment and system behavior.
+
+In this case, the metric would show:
+cpu_manager_numa_allocation_spread{numa_node="0"} = 4
+cpu_manager_numa_allocation_spread{numa_node="1"} = 4
+
+
+Note: This example is simplified to clearly highlight the difference between the two cases. Existing pods may slightly skew the counts, but the general
+trend of peaks and troughs will still provide a good indication of CPU distribution across NUMA nodes, allowing users to determine if the policy option
+is enabled or not.
 
 ###### What are the reasonable SLOs (Service Level Objectives) for the enhancement?
@@ -319,7 +350,10 @@ No impact. The behavior of the feature does not change when API Server and/or et
 
 ###### What are other known failure modes?
 
-No known failure modes.
+Because of the existing distribution of CPU resources across NUMA nodes, a distributed allocation might not be possible, e.g. if all the available CPUs
+are present on the same NUMA node.
+
+In that case, we resort back to a best-effort fit of packing CPUs into NUMA nodes wherever they can fit.
 
 ###### What steps should be taken if SLOs are not being met to determine the problem?
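
For reference, the following minimal `KubeletConfiguration` sketch shows how the three conditions listed in the PRR answers (Beta feature gate, `static` policy, `distribute-cpus-across-numa` option) fit together. It is illustrative only and not part of the patches above; the `reservedSystemCPUs` value is a placeholder added here because the `static` policy requires some explicit CPU reservation.

```yaml
# Illustrative kubelet configuration enabling the distribute-cpus-across-numa option.
# Values such as reservedSystemCPUs are placeholders, not recommendations.
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
featureGates:
  CPUManagerPolicyBetaOptions: true    # beta policy options gate, set explicitly for clarity
cpuManagerPolicy: static               # the option only applies to the static CPUManager policy
cpuManagerPolicyOptions:
  distribute-cpus-across-numa: "true"  # evenly distribute exclusive CPUs across NUMA nodes
reservedSystemCPUs: "0,1"              # example reservation; the static policy needs one
```

With this configuration, containers that are granted exclusive CPUs (Guaranteed pods with integer CPU requests) should have their CPUs spread across NUMA nodes rather than packed, which can be verified with the `taskset`/podresources checks and the `cpu_manager_numa_allocation_spread` metric described in the Monitoring Requirements section.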