From 65f46682b72186d7a297915045d6cd079dae8e0c Mon Sep 17 00:00:00 2001 From: Paco Xu Date: Mon, 3 Apr 2023 16:57:45 +0800 Subject: [PATCH] add feature gate for using --- .../sig-cluster-lifecycle/3929.yaml | 3 - .../3929-no-cri-socket-annotation/README.md | 460 +++--------------- .../3929-no-cri-socket-annotation/kep.yaml | 2 +- 3 files changed, 74 insertions(+), 391 deletions(-) delete mode 100644 keps/prod-readiness/sig-cluster-lifecycle/3929.yaml diff --git a/keps/prod-readiness/sig-cluster-lifecycle/3929.yaml b/keps/prod-readiness/sig-cluster-lifecycle/3929.yaml deleted file mode 100644 index f9a687a80079..000000000000 --- a/keps/prod-readiness/sig-cluster-lifecycle/3929.yaml +++ /dev/null @@ -1,3 +0,0 @@ -kep-number: 3929 -alpha: - approver: "@" diff --git a/keps/sig-cluster-lifecycle/kubeadm/3929-no-cri-socket-annotation/README.md b/keps/sig-cluster-lifecycle/kubeadm/3929-no-cri-socket-annotation/README.md index c813da06c813..1aca8af2a064 100644 --- a/keps/sig-cluster-lifecycle/kubeadm/3929-no-cri-socket-annotation/README.md +++ b/keps/sig-cluster-lifecycle/kubeadm/3929-no-cri-socket-annotation/README.md @@ -77,37 +77,73 @@ tags, and then generate with `hack/update-toc.sh`. 
--> -- [Release Signoff Checklist](#release-signoff-checklist) -- [Summary](#summary) -- [Motivation](#motivation) - - [Goals](#goals) - - [Non-Goals](#non-goals) -- [Proposal](#proposal) - - [User Stories (Optional)](#user-stories-optional) - - [Story 1](#story-1) - - [Story 2](#story-2) - - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) - - [Risks and Mitigations](#risks-and-mitigations) -- [Design Details](#design-details) - - [Test Plan](#test-plan) - - [Prerequisite testing updates](#prerequisite-testing-updates) - - [Unit tests](#unit-tests) - - [Integration tests](#integration-tests) - - [e2e tests](#e2e-tests) - - [Graduation Criteria](#graduation-criteria) - - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) - - [Version Skew Strategy](#version-skew-strategy) -- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) - - [Feature Enablement and Rollback](#feature-enablement-and-rollback) - - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) - - [Monitoring Requirements](#monitoring-requirements) - - [Dependencies](#dependencies) - - [Scalability](#scalability) - - [Troubleshooting](#troubleshooting) -- [Implementation History](#implementation-history) -- [Drawbacks](#drawbacks) -- [Alternatives](#alternatives) -- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) +- [3929: Remove CRI Socket Annotation from Node Object](#3929-remove-cri-socket-annotation-from-node-object) + - [Release Signoff Checklist](#release-signoff-checklist) + - [Summary](#summary) + - [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) + - [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1](#story-1) + - [Story 2](#story-2) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) + - [Design Details](#design-details) + - [init: upload a 
global kubelet configuration with cri socket](#init-upload-a-global-kubelet-configuration-with-cri-socket) + - [join: can override it using --config](#join-can-override-it-using---config) + - [upgrade: re-download global one, but should use local kubelet configuration firstly](#upgrade-re-download-global-one-but-should-use-local-kubelet-configuration-firstly) + - [Proposal 1: respect a list of configuration in local kubelet configuration, and in v1.27, CRI socket is the only one](#proposal-1-respect-a-list-of-configuration-in-local-kubelet-configuration-and-in-v127-cri-socket-is-the-only-one) + - [Proposal 2: introduce a `/var/lib/kubelet/kubeadm-config.yaml` to maintain node specific configuration](#proposal-2-introduce-a-varlibkubeletkubeadm-configyaml-to-maintain-node-specific-configuration) + - [old version handling](#old-version-handling) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Alpha](#alpha) + - [Beta](#beta) + - [GA](#ga) + - [Deprecation](#deprecation) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) + - [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [How can this feature be enabled / disabled in a live cluster?](#how-can-this-feature-be-enabled--disabled-in-a-live-cluster) + - [Does enabling the feature change any default behavior?](#does-enabling-the-feature-change-any-default-behavior) + - [Can the feature be disabled once it has been enabled (i.e. 
can we roll back the enablement)?](#can-the-feature-be-disabled-once-it-has-been-enabled-ie-can-we-roll-back-the-enablement) + - [What happens if we reenable the feature if it was previously rolled back?](#what-happens-if-we-reenable-the-feature-if-it-was-previously-rolled-back) + - [Are there any tests for feature enablement/disablement?](#are-there-any-tests-for-feature-enablementdisablement) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [How can a rollout or rollback fail? Can it impact already running workloads?](#how-can-a-rollout-or-rollback-fail-can-it-impact-already-running-workloads) + - [What specific metrics should inform a rollback?](#what-specific-metrics-should-inform-a-rollback) + - [Were upgrade and rollback tested? Was the upgrade-\>downgrade-\>upgrade path tested?](#were-upgrade-and-rollback-tested-was-the-upgrade-downgrade-upgrade-path-tested) + - [Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.?](#is-the-rollout-accompanied-by-any-deprecations-andor-removals-of-features-apis-fields-of-api-types-flags-etc) + - [Monitoring Requirements](#monitoring-requirements) + - [How can an operator determine if the feature is in use by workloads?](#how-can-an-operator-determine-if-the-feature-is-in-use-by-workloads) + - [How can someone using this feature know that it is working for their instance?](#how-can-someone-using-this-feature-know-that-it-is-working-for-their-instance) + - [What are the reasonable SLOs (Service Level Objectives) for the enhancement?](#what-are-the-reasonable-slos-service-level-objectives-for-the-enhancement) + - [What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service?](#what-are-the-slis-service-level-indicators-an-operator-can-use-to-determine-the-health-of-the-service) + - [Are there any missing metrics that would be useful to have to improve observability of this 
feature?](#are-there-any-missing-metrics-that-would-be-useful-to-have-to-improve-observability-of-this-feature) + - [Dependencies](#dependencies) + - [Does this feature depend on any specific services running in the cluster?](#does-this-feature-depend-on-any-specific-services-running-in-the-cluster) + - [Scalability](#scalability) + - [Will enabling / using this feature result in any new API calls?](#will-enabling--using-this-feature-result-in-any-new-api-calls) + - [Will enabling / using this feature result in introducing new API types?](#will-enabling--using-this-feature-result-in-introducing-new-api-types) + - [Will enabling / using this feature result in any new calls to the cloud provider?](#will-enabling--using-this-feature-result-in-any-new-calls-to-the-cloud-provider) + - [Will enabling / using this feature result in increasing size or count of the existing API objects?](#will-enabling--using-this-feature-result-in-increasing-size-or-count-of-the-existing-api-objects) + - [Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs?](#will-enabling--using-this-feature-result-in-increasing-time-taken-by-any-operations-covered-by-existing-slisslos) + - [Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) 
in any components?](#will-enabling--using-this-feature-result-in-non-negligible-increase-of-resource-usage-cpu-ram-disk-io--in-any-components) + - [Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)?](#can-enabling--using-this-feature-result-in-resource-exhaustion-of-some-node-resources-pids-sockets-inodes-etc) + - [Troubleshooting](#troubleshooting) + - [How does this feature react if the API server and/or etcd is unavailable?](#how-does-this-feature-react-if-the-api-server-andor-etcd-is-unavailable) + - [What are other known failure modes?](#what-are-other-known-failure-modes) + - [What steps should be taken if SLOs are not being met to determine the problem?](#what-steps-should-be-taken-if-slos-are-not-being-met-to-determine-the-problem) + - [Implementation History](#implementation-history) + - [Drawbacks](#drawbacks) + - [Alternatives](#alternatives) + - [Infrastructure Needed (Optional)](#infrastructure-needed-optional) ## Release Signoff Checklist @@ -221,7 +257,7 @@ cri socket in kubelet configuration. ### Proposal 1: respect a list of configuration in local kubelet configuration, and in v1.27, CRI socket is the only one -During `kubeadm ugprade`, kubeadm will read the local kubelet configuration in `/var/lib/kubelet/config.yaml`. +During `kubeadm upgrade`, kubeadm will read the local kubelet configuration in `/var/lib/kubelet/config.yaml`. kubeadm also download the kubelet configuration from configmap and replace the `containerRuntimeEndpoint` and `imageServiceEndpoint`(This maybe empty and I prefer to respect it as well) with the local configuration. @@ -239,6 +275,11 @@ It is similar to `/var/lib/kubelet/kubeadm-flags.env`. 
KUBELET_KUBEADM_ARGS="--container-runtime-endpoint=unix:///var/run/containerd/containerd.sock --pod-infra-container-image=k8s.m.daocloud.io/pause:3.9" ``` +We may introduce a feature gate "KubeadmNodeSpecificConfig" to enable the use of the `/var/lib/kubelet/kubeadm-config.yaml` here. + +- If the feature gate is disabled, use the cri socket annotation directly. +- If the feature gate is enabled, `/var/lib/kubelet/kubeadm-config.yaml` will be created and the cri socket will be maintained in it. + [To be discussed] Another proposal is using a strategy like `--patch`. A file like `/var/lib/kubelet/kubeadm-config.patch` or a `kubelet.yaml`/`config.ayml` file under `/var/lib/kubelet/patch/`. (This should be removed if we make a decision). @@ -252,17 +293,6 @@ For old version cluster upgradation with the annotation, we will not touch the a ### Test Plan - - [x] I/we understand the owners of the involved components may require updates to existing tests to make this code solid enough prior to committing the changes necessary to implement this enhancement. @@ -274,58 +304,16 @@ Install/Join/Upgrade test in - ##### Unit tests - - - - - ``: `` - `` ##### Integration tests - - - : ##### e2e tests - - - : ### Graduation Criteria @@ -363,61 +351,12 @@ See above. ### Version Skew Strategy - - ## Production Readiness Review Questionnaire - - ### Feature Enablement and Rollback - - ###### How can this feature be enabled / disabled in a live cluster? - - - [ ] Feature gate (also fill in values in `kep.yaml`) - Feature gate name: No - Components depending on the feature gate: @@ -430,108 +369,28 @@ well as the [existing list] of feature gates. ###### Does enabling the feature change any default behavior? - - ###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? - - ###### What happens if we reenable the feature if it was previously rolled back? ###### Are there any tests for feature enablement/disablement? 
- - ### Rollout, Upgrade and Rollback Planning - - ###### How can a rollout or rollback fail? Can it impact already running workloads? - - ###### What specific metrics should inform a rollback? - - ###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? - - ###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? - - ### Monitoring Requirements - - ###### How can an operator determine if the feature is in use by workloads? - - ###### How can someone using this feature know that it is working for their instance? - - - [ ] Events - Event Reason: - [ ] API .status @@ -542,215 +401,42 @@ Recall that end users cannot usually observe component logs or access metrics. ###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? - - ###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? - - -- [ ] Metrics - - Metric name: - - [Optional] Aggregation method: - - Components exposing the metric: -- [ ] Other (treat as last resort) - - Details: - ###### Are there any missing metrics that would be useful to have to improve observability of this feature? - - ### Dependencies - - ###### Does this feature depend on any specific services running in the cluster? - - ### Scalability - - ###### Will enabling / using this feature result in any new API calls? - - ###### Will enabling / using this feature result in introducing new API types? - - ###### Will enabling / using this feature result in any new calls to the cloud provider? - - ###### Will enabling / using this feature result in increasing size or count of the existing API objects? - - ###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? - - ###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? 
- - ###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? - - ### Troubleshooting - - ###### How does this feature react if the API server and/or etcd is unavailable? ###### What are other known failure modes? - - ###### What steps should be taken if SLOs are not being met to determine the problem? ## Implementation History - - ## Drawbacks - - ## Alternatives - - ## Infrastructure Needed (Optional) - - diff --git a/keps/sig-cluster-lifecycle/kubeadm/3929-no-cri-socket-annotation/kep.yaml b/keps/sig-cluster-lifecycle/kubeadm/3929-no-cri-socket-annotation/kep.yaml index 403320c03385..d271c447c932 100644 --- a/keps/sig-cluster-lifecycle/kubeadm/3929-no-cri-socket-annotation/kep.yaml +++ b/keps/sig-cluster-lifecycle/kubeadm/3929-no-cri-socket-annotation/kep.yaml @@ -7,7 +7,7 @@ participating-sigs: - sig-cluster-lifecycle status: provisional creation-date: 2023-03-30 -last-updated: 2022-03-30 +last-updated: 2023-04-03 reviewers: - "@neolit123" approvers: