From d0d0e216511ce9f2f055a07b6e89548177bceae2 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Mon, 17 May 2021 20:39:47 +0200 Subject: [PATCH] dynamic resource allocation: initial KEP draft Discussion of some earlier revisions of this KEP happened in https://github.com/pohly/enhancements/pull/1. --- keps/prod-readiness/sig-node/3063.yaml | 6 + .../3063-dynamic-resource-allocation/Makefile | 20 + .../README.md | 1477 +++++++++++++++++ .../components.puml | 40 + .../components.svg | 75 + .../3063-dynamic-resource-allocation/kep.yaml | 45 + 6 files changed, 1663 insertions(+) create mode 100644 keps/prod-readiness/sig-node/3063.yaml create mode 100644 keps/sig-node/3063-dynamic-resource-allocation/Makefile create mode 100644 keps/sig-node/3063-dynamic-resource-allocation/README.md create mode 100644 keps/sig-node/3063-dynamic-resource-allocation/components.puml create mode 100644 keps/sig-node/3063-dynamic-resource-allocation/components.svg create mode 100644 keps/sig-node/3063-dynamic-resource-allocation/kep.yaml diff --git a/keps/prod-readiness/sig-node/3063.yaml b/keps/prod-readiness/sig-node/3063.yaml new file mode 100644 index 00000000000..1f4d0df09aa --- /dev/null +++ b/keps/prod-readiness/sig-node/3063.yaml @@ -0,0 +1,6 @@ +# The KEP must have an approver from the +# "prod-readiness-approvers" group +# of http://git.k8s.io/enhancements/OWNERS_ALIASES +kep-number: 3063 +alpha: + approver: "TBD" diff --git a/keps/sig-node/3063-dynamic-resource-allocation/Makefile b/keps/sig-node/3063-dynamic-resource-allocation/Makefile new file mode 100644 index 00000000000..7da29b85831 --- /dev/null +++ b/keps/sig-node/3063-dynamic-resource-allocation/Makefile @@ -0,0 +1,20 @@ +IMAGES += components.svg + +all: $(IMAGES) +clean: + rm -f $(IMAGES) + +# We use the http://plantuml.com/plantuml server to generate +# images. That way nothing needs to be installed besides Go. +DOC_PLANTUML_GO = $(shell go env GOPATH)/bin/plantuml-go + +%.png: %.puml $(DOC_PLANTUML_GO) + $(DOC_PLANTUML_GO) -format png $< + +%.svg: %.puml $(DOC_PLANTUML_GO) + $(DOC_PLANTUML_GO) -format svg $< + +# Builds the binary in GOPATH/bin. Changing into / first avoids +# modifying the project's go.mod file. 
+$(DOC_PLANTUML_GO): + cd / && go get github.com/acarlson99/plantuml-go diff --git a/keps/sig-node/3063-dynamic-resource-allocation/README.md b/keps/sig-node/3063-dynamic-resource-allocation/README.md new file mode 100644 index 00000000000..ad6af919500 --- /dev/null +++ b/keps/sig-node/3063-dynamic-resource-allocation/README.md @@ -0,0 +1,1477 @@ + +# KEP-NNNN: Dynamic resource allocation + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1](#story-1) + - [Story 2](#story-2) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Implementation](#implementation) + - [API](#api) + - [Communication between kubelet and resource node plugin](#communication-between-kubelet-and-resource-node-plugin) + - [NodePrepareResource](#) + - [NodePrepareResource Errors](#nodeprepareresource-errors) + - [NodeUnprepareResource](#-1) + - [NodeUnprepareResource Errors](#nodeunprepareresource-errors) + - [Implementing a plugin for node resources](#implementing-a-plugin-for-node-resources) + - [Test Plan](#test-plan) + - [Graduation Criteria](#graduation-criteria) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + - [ResourceClaimTemplate](#resourceclaimtemplate) + - [Reusing volume support as-is](#reusing-volume-support-as-is) + - [Extend volume support](#extend-volume-support) + - [Extend Device Plugins](#extend-device-plugins) + - [Webhooks instead of ResourceClaim updates](#webhooks-instead-of-resourceclaim-updates) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. 
+ +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) KEP approvers have approved the KEP status as `implementable` +- [ ] (R) Design details are appropriately documented +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) +- [ ] (R) Graduation criteria is in place +- [ ] (R) Production readiness review completed +- [ ] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + + + +Dynamic resource allocation introduces an alternative to the existing [device +manager +API](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/resource-management/device-plugin.md) +for third-party hardware vendors. Both are expected to co-exist, with vendors +choosing the API that better suits their needs on a case-by-case basis. Because +the new API is going to be implemented independently of the existing device +manager, there's little risk of breaking stable APIs. + +The new API is inspired by the existing [volume provisioning support with CSI](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/storage/container-storage-interface.md#provisioning-and-deleting) and uses similar +concepts. The goal is to let users request resources with parameters that can +be different depending on what kind of resource gets requested. Resource +allocations can be ephemeral (specified in a Pod spec, allocated and destroyed +together with the Pod) and persistent (lifecycle managed separately from a Pod, +with an allocated resource used for multiple different Pods). + +Several core Kubernetes components must be modified (see the +[implementation](#implementation) section for details): +- kube-apiserver (new API) +- kube-controller-manager (new controller) +- kube-scheduler (new builtin plugin) +- kubelet (new third-party plugin kind) + +Resources are managed by third-party plugins that communicate with central +Kubernetes components, in particular the kube-scheduler, by updating +objects stored in the kube-apiserver. kube-scheduler only needs to be modified +once to support dynamic resource allocation. Then multiple plugins from +different vendors can be installed at the same time without making further +changes to the scheduler. + +Communication between the kubelet and the local part of the plugin is +handled through local Unix domain sockets and the plugin registration +mechanism, using a new plugin type and a new gRPC interface. +The container runtime uses the +[Container Device Interface +(CDI)](https://github.com/container-orchestrated-devices/container-device-interface) +to expose the resources. + +## Motivation + + + +Originally, Kubernetes and its scheduler only tracked CPU and RAM as +resources for containers. Later, support for storage and discrete, +countable per-node extended resources was added. 
The device plugin +interface then made such local resources available to containers. But +for many newer devices, this approach and the Kubernetes API for +requesting these custom resources is too limited. This KEP addresses +limitations of the current approach for the following use cases: + +- *Device initialization*: When starting a workload I’d like to have + the device reconfigured or reprogrammed during orchestration. For + security reasons workloads should not be able to reconfigure devices + directly. + + *Limitation*: Currently, it’s impossible to specify the desired + device properties that are required for reconfiguring devices. + +- *Device cleanup*: When my workload is finished, I would like to have + a mechanism for cleanup of the device, that will ensure that device + does not contain traces/parameters/data from previous workloads and + appropriate power state/shutdown. + + *Limitation*: Post-stop actions are not supported. + +- *Partial allocation*: When deploying a container I’d like to be able + to use part of the shareable device inside a container and other + containers should be able to use other free resources on the same + device. + + *Limitation*: Current implementation of the device plugin doesn’t + allow one to allocate part of the device because parameters are too limited + and Kubernetes doesn't have enough information about the extended + resources on a node to decide whether they can be shared. + +- *Optional allocation*: When deploying a workload I’d like to specify + soft(optional) device requirements. If a device exists and it’s + allocatable it will be allocated. If not - the workload will be run on + a node without a device. GPU and crypto-offload engines are + examples of this kind of device. If they’re not available, workloads + can still run by falling back to using only the CPU for the same + task. + + *Limitation*: Optional allocation is supported neither by the device + plugins nor by current Pod resource declaration. + +- *Support Over the Fabric devices*: When deploying a container, I’d + like to utilize devices available over the Fabric (PCIe, CXL, + Network, special links, etc). + + *Limitation*: Device Plugins framework supports only local devices. + +- *Access to the container image*: When deploying a container that + needs access to a GPU device, I would like my container to + gracefully fail to start (rather than SIGSEGV at runtime) if a + minimum driver requirement is not met on the host. + + *Limitation*: GPU driver requirements are currently stored as labels + on the container image, and device plugins do not have access to the + container image. + +Because this KEP enables the usage of +[CDI](https://github.com/container-orchestrated-devices/container-device-interface/#readme) +in Kubernetes, it also addresses those problems that are handled by +CDI: + +- *Perform container runtime specific operations*: When deploying a container + that needs access to a device, I would like to be able to reuse the + same pod spec, irrespective of the underlying container runtime in + use (e.g. kata vs. runc). + + *Limitation*: At present, there is no way to perform + runtime-specific operations that may need to occur as a result of + injecting a device into a container (device plugins are runtime + agnostic). A good example is supporting GPU passthrough + virtualization on kata vs. runc. 
+ +- *Access to the plugin container*: When deploying a device plugin, I + would like to ensure that all of the operations that need to occur + as part of “injecting” a device into a container, also occur for the + “plugin container” itself. + + *Limitation*: At present, there is a chicken-and-egg problem to + supporting this because the device plugin is the sole dictator of + which containers will have these operations performed on them. + +### Goals + + + +* More flexibility: + * Arbitrary, resource-specific setup and cleanup actions + * Over-the-fabric resources + * Custom matching of resource requests with available resources, + including handling of optional resource requests +* User-friendly API for describing resource requests +* Allow resource management plugins that can be developed and deployed + separately from Kubernetes and are independent of specific container + runtimes. + +### Non-Goals + + + +* Extend the model that kube-scheduler has about + resources. Instead, it will need information from the plugin for + each resource request to determine where a Pod using the resource + might run. The [Representing Compute Resources in Kubernetes + proposal](https://docs.google.com/document/d/1666PPUs4Lz56TqKygcy6mXkNazde-vwA7q4e5H92sUc/edit#) + had some ideas what information the scheduler might need (“supports + overcommit”, “fractional”), but ultimately any choice regarding that + will only work for certain kinds of resources. + +* Standardize how to describe available resources. Only allocated + resources are visible through the APIs defined below. How to + describe available resources is plugin specific because it depends + on the kind of resource which attributes might be relevant. Plugins + should use and document their individual approach for this (for + example, defining a CRD and publishing through that). + +* Provide an abstraction layer for resource requests, i.e., something like a + “I want some kind of GPU”. Users will need to know about specific + resource plugins and which parameters they support. Portability of + workloads could be added on top of this proposal by introducing the + selection of a resource implementation through labels and + standardizing those labels and the associated parameters. The + [Resource Class + Proposal](https://docs.google.com/document/d/1qKiIVs9AMh2Ua5thhtvWqOqW0MSle_RV3lfriO1Aj6U/edit#heading=h.jzfmfdca34kj) + included such an approach. + +## Proposal + + + +The proposal is that a plugin handles all operations that are specific +to the resources managed by that plugin. This includes operations at +the control plane level (tracking where in the cluster resources are +available, helping with pod scheduling decisions, allocating resources +when requested) as well as the node level (preparing container +startup). Such a plugin can be implemented in arbitrary programming +languages as long as it supports the resource allocation protocol and +gRPC interfaces defined in this KEP. An utility package with Go +support code will be made available to simplify the development of +such a plugin, but using it will not be required and its API is not +part of this KEP. + +Three new API object types get added in a new API group: +- ResourcePlugin, not namespaced, with a description of the plugin. +- ResourceClass, not namespaced, with privileged parameters for + multiple resource instances of a certain kind. All these instances + are provided by the same resource plugin, which is identified by a + field in the class. 
+- ResourceClaim, namespaced, with parameters provided by a normal user + that describes a resource instance that needs to be allocated. A + ResourceClaim contains the usual meta data, a spec and a status. The + spec identifies the plugin that handles the resource via a class + name. + +To support arbitrarily complex parameters, both ResourceClass and +ResourceClaim contain one field which holds a +runtime.RawExtension. Validation can be handled by plugins through an +admission controller (if desired) or later at runtime when the +parameters are passed to the plugin. + +The ResourceClaim spec is read-only once created. The ResourceClaim +status is reserved for system usage and holds the current state of the +resource. The status must not get lost. This is departing from how +Kubernetes traditionally handled status, but something that more +recently [became more +acceptable](https://github.com/kubernetes/enhancements/pull/2537). Kube-scheduler +and plugin communicate by modifying that status. The status is also +how Kubernetes tracks that a plugin has allocated the resource and on +which nodes the resource is available. + +This approach is an intentional simplification compared to the PV/PVC +model for volumes because we don't need to deal with two objects when +allocating resources and therefore don't need something like the +volume binding controller. If desired, a resource plugin can implement +support for manually allocated (“static provisioning” in the context +of volumes) and/or existing resources by reacting to ResourceClaims by +using those resources to satisfy a claim. + +Allocation of a resource happens either immediately (“immediate +allocation”) or when a Pod needs the resource (“delayed allocation”), +depending on a flag in the ResourceClaim spec. Pods reference resource +claims in a new PodSpec.Resources list. Each resource in that list +then can be made available to one or more containers in that Pod. To +support ephemeral resources, an entry in the new PodSpec.Resources +list can also be a ResourceClaimTemplate. When a Pod gets created, +such a template will be used to create a normal ResourceClaim with the +Pod as owner, and then the normal allocation of the resource takes +place. + +For immediate allocation, scheduling Pods is simple because the +resource is already allocated and determines the nodes on which the +Pod may run. For delayed allocation, a node is selected tentatively +and plugin(s) try to allocate their resources for that node. If that +succeeds, the Pod can start to run. If it fails, the scheduler must +determine whether some other node fits the requirements and if so, +request allocation again. If no node fits because some resources were +already allocated for a node and are only usable there, then those +resources must be released and then get allocated elsewhere. + +The resources allocated for a ResourceClaim can be shared by multiple +containers in a pod. Depending on the capabilities defined in the +ResourceClaim by the plugin, a ResourceClaim can be used exclusively +by one pod at a time, by a certain maximum number of pods, or an +unlimited number of pods. + +### User Stories (Optional) + + + +#### Story 1 + +Partial allocation. 
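+
+For illustration, the GPU vendor's resource plugin deployment could include a
+ResourceClass such as the one below. This is only a sketch based on the API
+outlined in the [API](#api) section; the plugin name and the vendor-specific
+parameters shown here are illustrative assumptions, not part of this KEP:
+
+```
+apiVersion: cdi.k8s.io/v1alpha1
+kind: ResourceClass
+metadata:
+  name: gpu
+pluginName: gpu.vendor.example.com
+parameters:
+  # privileged, admin-controlled defaults for all claims using this class
+  shareable: true
+```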
+
+A Pod requests a GPU resource with 2Gb of memory by referencing the `gpu` class:
+```
+apiVersion: v1
+kind: Pod
+metadata:
+  name: device-consumer
+spec:
+  containers:
+  - name: my-ubuntu
+    image: ubuntu
+    command: ["/bin/program"]
+    podResources:
+    - name: gpu_2Gb
+    resources:
+      requests:
+        memory: "64Mi"
+        cpu: "250m"
+      limits:
+        memory: "128Mi"
+        cpu: "500m"
+  resources:
+  - name: gpu_2Gb
+    template:
+      resourceClassName: "gpu"
+      parameters:
+        memory: "2Gb"
+```
+
+This request triggers partial resource allocation on a node that has
+a GPU device with 4Gb of memory, for the duration of the execution of
+this Pod.
+
+If some other Pod uses 1Gb of that device, then 1Gb of free GPU memory
+will be left on that device after the resource is allocated. It can be
+used by another Pod requesting up to 1Gb of GPU memory.
+
+#### Story 2
+
+### Notes/Constraints/Caveats (Optional)
+
+Scheduling is likely to be slower when many Pods request the new
+resource types, both because scheduling such a Pod involves more
+round-trips through the API server for ResourceClaimStatus updates and
+because scheduling one Pod may affect other Pods in ways that cannot
+be anticipated by the kube-scheduler. When many Pods compete for
+limited resources, multiple attempts may be needed before a suitable
+node is found.
+
+The hardware that is expected to need this more flexible allocation
+approach is going to be used by only a small subset of the pods in the
+cluster, and those pods are likely to run for extended periods of time,
+so this is not a major concern.
+
+### Risks and Mitigations
+
+## Design Details
+
+### Implementation
+
+![components](./components.svg)
+
+Several components must be implemented or modified in Kubernetes:
+- The new API must be added to kube-apiserver.
+- A new controller in kube-controller-manager which generates
+  ResourceClaims from Pod ResourceClaimTemplates, similar to
+  https://github.com/kubernetes/kubernetes/tree/master/pkg/controller/volume/ephemeral
+- A kube-scheduler plugin must detect Pods which reference a
+  ResourceClaim (directly or through a template) and ensure that the
+  resource is allocated before the Pod gets scheduled, similar to
+  https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/volume/scheduling/scheduler_binder.go
+- Kubelet must be extended to retrieve information from ResourceClaims
+  and then invoke local resource plugin methods. It must pass information about
+  the additional resources to the container runtime. It must detect
+  whether the container runtime has the necessary support and
+  advertise that in the cluster via node labels to simplify deployment
+  of plugins and Pod scheduling.
+
+For a resource plugin the following components are needed:
+- Some utility library similar to
+  https://github.com/kubernetes-sigs/sig-storage-lib-external-provisioner
+  and the code in driver-registrar.
+- *Resource controller*: a central component which handles resource allocation
+  by watching and modifying ResourceClaims.
+- *Resource node plugin*: a component which cooperates with kubelet to prepare
+  the usage of the resource on a node.
+
+The utility library will be developed outside of Kubernetes and does not have
+to be used by plugins, therefore it is not described further in this KEP.
+
+```
+<<[UNRESOLVED @pohly]>>
+All of the changes in Kubernetes need to be specified in a lot more detail.
+
+A flow diagram showing the state transitions of a ResourceClaim needs to be added.
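+
+As a rough textual placeholder for that diagram, the transitions implied by the
+ResourceClaimPhase values proposed in the API section below are:
+
+  new claim (delayed allocation)   -> WaitingForPod -> Pending (scheduler sees a Pod)
+  new claim (immediate allocation) -> Pending
+  Pending   -> Allocated  (plugin allocated the resource)
+  Pending   -> Failed     (allocation attempt failed; the plugin retries)
+  Allocated -> Reallocate (scheduler: Pod cannot run where the resource is) -> Pending
+  claim deleted -> Freed  (plugin freed the resource and removed its finalizer)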
+ +Upgrade and downgrade scenarios should already be considered for v1alpha1 to ensure +that whatever changes will be needed are in place before going to v1beta1 where +downgrades have to be supported. + +All of that will be added once there is consensus to move ahead with this proposal. +<<[/UNRESOLVED]>> +``` + +### API + +ResourceClaim, ResourceClass and ResourcePlugin are new built-in types +in a new `cdi.k8s.io/v1alpha1` API group. This was chosen instead of +using CRDs because core Kubernetes components must interact with them +and installing of CRDs as part of cluster creation is an unsolved +problem. + +The PodSpec gets extended. + +``` +type ResourceClass struct { + // Resource plugins have a unique name in reverse domain order (acme.example.com). + PluginName string + // Parameters holds arbitrary values that will be available to the plugin + // when allocating a resource that uses this class. The plugin will + // be able to distinguish between parameters stored here and + // and those stored in ResourceClaimSpec. These parameters + // here can only be set by cluster administrators. + Parameters runtime.RawExtension +} + +type ResourcePlugin struct { + // The name of the object is the unique plugin name. + ObjectMeta + + // Features contains a list of features supported by the plugin. + // New features may be added over time and must be ignored + // by code that does not know about them. + Features []ResourcePluginFeature +} + +type ResourcePluginFeature struct { + // Name is one of the pre-defined names for a feature. + Name ResourcePluginFeatureName + // Parameters might provide additional information about how + // the plugin supports the feature. Boolean features have + // no parameters, merely listing them indicates support. + Parameters runtime.RawExtension +} + +type ResourceClaim struct { + // The plugin must set a finalizer here before it attempts to + // allocate the resource. It removes the finalizer again when + // a) the allocation attempt has definitely failed or b) when + // the allocated resource was freed. This ensures that + // resources are not leaked. + ObjectMeta + // Spec describes the desired attributes of a resource that then + // needs to be allocated. It can only be set once when creating + // the ResourceClaim. + Spec ResourceClaimSpec + // Status describes whether the resource is available and with + // which attributes. + Status ResourceClaimStatus +} + +type ResourceClaimSpec struct { + // ResourceClassName references the plugin and additional + // parameters via the name of a ResourceClass that was + // created as part of the plugin deployment. + ResourceClassName string + + // Parameters holds arbitrary values that will be available to the plugin + // when allocating a resource for the claim. + Parameters runtime.RawExtension + + // Allocation can start immediately or when a Pod wants to use + // the resource. Waiting for a Pod is the default. + AllocationMode AllocationMode +} + +type AllocationMode string + +const ( + AllocationModeImmediate AllocationMode = “Immediate” + AllocationModeDelayed AllocationMode = “Delayed” +} + +type ResourceClaimStatus struct { + // Explains what the current status of the claim is and + // determines which component needs to do something. + Phase ResourceClaimPhase + + // When allocation is delayed, the scheduler must set + // the node for which it wants the resource to be allocated + // before the plugin proceeds with allocation. + // For immediate allocation, the scheduler will not set + // this field. 
The plugin controller component may then + // set it to trigger allocation on a specific node if the + // resources are local to nodes. + SelectedNode string + + // When allocation is delayed, and the scheduler needs to + // decide on which node a Pod should run, it must first + // ask the plugin on which nodes the resource might be + // made available. To trigger that check, the scheduler must + // provide the names of nodes which might be suitable + // for the Pod. + PotentialNodes []string + + // A change of the node candidates triggers a check + // on which nodes the resource could be made available. + // This can change, so the plugin must refresh + // this information periodically until a node gets + // selected by the scheduler. + SuitableNodes []string + + // An allocated resource is available on nodes that match this + // selector. If nil, the resource is available everywhere. + AvailableOnNodes *corev1.NodeSelector + + // Arbitrary data returned by the plugin after a successful allocation. + // This data is passed to the plugin for all operations involving + // the allocated resource. This is opaque for Kubernetes. + // Plugin documentation may explain to users how to interpret + // this data if needed. + Attributes map[string]string + + // UsersLimit determines how many entities are allowed to use this resource + // at the same time. The default is 1. -1 enables the usage by an unlimited number + // of users. Individual containers in a pod are not counted as users, only the Pod + // is. + UserLimit int + + // UsedBy indicates which entities are currently using the resource. + // Usually those are Pods. Only Pods listed as users can be scheduled, + // all others must wait. Updated by kube-scheduler as part of Pod scheduling + // (TBD – a separate controller might also work). + UsedBy []metav1.OwnerReference +} + +type ResourceClaimPhase string + +const ( + // The claim is waiting for a Pod. This the default for + // a new claim with delayed allocation. Once the scheduler + // sees a Pod which needs the claim, it changes the status + // to “pending”. + ResourceClaimWaitingForPod = “WaitingForPod” + + // The claim is waiting for allocation by the plugin. This is the default + // for a new claim with immediate allocation. + ResourceClaimPending ResourceClaimPhase = “Pending” + + // Set by the plugin once the resource has been successfully + // allocated. The scheduler waits for all resources used by + // a Pod to be in this phase. + ResourceClaimAllocated ResourceClaimPhase = “Allocated” + + // It can happen that a resource got allocated for a Pod and + // then the Pod cannot run on the nodes where the allocated + // resource is available. The scheduler detects this and + // then sets the “reallocate” phase to tell the plugin that it must + // free the resource. The plugin does that and moves it + // back to pending. + ResourceClaimReallocate ResourceClaimPhase = “Reallocate” + + // Set by the plugin once a resource allocation attempt + // failed. The plugin will retry the allocation. + ResourceClaimFailed ResourceClaimPhase = “Failed” + + // Deleting the ResourceClaim triggers freeing the resource. + // Because of the plugin’s finalizer, such a claim then + // continues to exist with a DeletionTimeStamp. + // Once the plugin has successfully freed the resource, + // it sets this status and removes the finalizer. Usually + // the claim object will then be removed quickly, but + // additional finalizers might also keep it around longer. 
+ ResourceClaimFreed ResourceClaimPhase = “Freed” +) + +type PodSpec { + ... + Resources []PodResource + ... +} + +type Container { + ... + // The entries are the names of resources in PodSpec.Resources + // that are used by the container. + PodResources []string + ... +} + +type PodResource struct { + // A name under which this resource can be referenced by the containers. + Name string + + // The resource is independent of the Pod and defined by + // a separate ResourceClaim in the same namespace as + // the Pod. Either this or Template must be set, but not both. + ResourceClaimName *string + + // Will be used to create a stand-alone ResourceClaim to allocate the resource. + // The pod in which this PodResource is embedded will be the + // owner of the ResourceClaim, i.e. the ResourceClaim will be deleted together with the + // pod. The name of the ResourceClaim will be `-` where + // `` is the name PodResource.Name + // Pod validation will reject the pod if the concatenated name + // is not valid for a ResourceClaim (for example, too long). + // + // An existing ResourceClaim with that name that is not owned by the pod + // will *not* be used for the pod to avoid using an unrelated + // resource by mistake. Starting the pod is then blocked until + // the unrelated ResourceClaim is removed. If such a pre-created ResourceClaim is + // meant to be used by the pod, the ResourceClaim has to be updated with an + // owner reference to the pod once the pod exists. Normally + // this should not be necessary, but it may be useful when + // manually reconstructing a broken cluster. + // + // This field is read-only and no changes will be made by Kubernetes + // to the ResourceClaim after it has been created. + Template *ResourceClaimTemplate +} + +type ResourceClaimTemplate struct { + // May contain labels and annotations that will be copied into the ResourceClaim + // when creating it. No other fields are allowed and will be rejected during + // validation. + metav1.ObjectMeta + + // The specification for the ResourceClaim. The entire content is + // copied unchanged into the ResourceClaim that gets created from this + // template. The same fields as in a ResourceClaim + // are also valid here. + Spec ResourceClaimSpec +} +``` + +### Communication between kubelet and resource node plugin + +This gRPC interface is provided by the resource node plugin and invoked by +kubelet. It is inspired by +[CSI](https://github.com/container-storage-interface/spec/blob/master/spec.md), +with “volume” replaced by “resource” and volume specific parts removed. + +``` +<<[UNRESOLVED @pohly]>> +Do plugin operations need secrets? They are currently not part of the proposed Kubernetes API. +<<[/UNRESOLVED]>> +``` + +#### `NodePrepareResource` + +This RPC is called by kubelet when a Pod that wants to use the +specified resource is scheduled on a node. The Plugin SHALL assume +that this RPC will be executed on the node where the resource will be +used. The Plugin SHALL return device name and kind for allocated +device[s]. + +The Plugin SHALL create or update json file[s] in CDI format for each +allocated device. These files SHALL be used by runtime to update +runtime configuration before creating containers that use the +device[s]. + +This operation SHALL do as little work as possible as it’s called +after a pod is scheduled to a node. All potentially failing operations +SHALL be done during allocation phase. + +This operation MUST be idempotent. 
If the resource corresponding to
+the `resource_id` has already been prepared, the Plugin MUST reply `0
+OK`.
+
+If this RPC failed, or kubelet does not know if it failed or not, it
+MAY choose to call `NodePrepareResource` again, or choose to call
+`NodeUnprepareResource`.
+
+After a successful call, kubelet MUST pass the device names and kinds to
+the runtime through the CRI protocol.
+
+```
+<<[UNRESOLVED @bart0sh]>>
+The CRI protocol may need to be extended for
+this purpose, e.g. a device id could be added to the CRI Device structure.
+<<[/UNRESOLVED]>>
+```
+
+```protobuf
+message NodePrepareResourceRequest {
+  // The UID of the ResourceClaim. This field is REQUIRED.
+  string resource_id = 1;
+}
+
+message NodePrepareResourceResponse {
+  // These are the additional devices that kubelet must
+  // make available via the container runtime. A resource
+  // may have zero or more devices.
+  repeated CDIDevice devices = 1;
+}
+
+message CDIDevice {
+  // Kind is the string that together with the name identifies a device
+  // (https://github.com/container-orchestrated-devices/container-device-interface/blob/master/SPEC.md#kind).
+  string kind = 1;
+  // Name is the name that within its kind uniquely identifies a
+  // device (https://github.com/container-orchestrated-devices/container-device-interface/blob/master/SPEC.md#cdi-devices).
+  string name = 2;
+}
+```
+
+##### NodePrepareResource Errors
+
+If the plugin is unable to complete the NodePrepareResource call
+successfully, it MUST return a non-ok gRPC code in the gRPC status.
+If the conditions defined below are encountered, the plugin MUST
+return the specified gRPC error code. Kubelet MUST implement the
+specified error recovery behavior when it encounters the gRPC error
+code.
+
+| Condition | gRPC Code | Description | Recovery Behavior |
+|-----------|-----------|-------------|-------------------|
+| Resource does not exist | 5 NOT_FOUND | Indicates that a resource corresponding to the specified `resource_id` does not exist. | Caller MUST verify that the `resource_id` is correct and that the resource is accessible and has not been deleted before retrying with exponential back off. |
+
+#### `NodeUnprepareResource`
+
+A Node Plugin MUST implement this RPC call. This RPC is the reverse
+operation of `NodePrepareResource`. It MUST undo the work done by
+the corresponding `NodePrepareResource`. This RPC SHALL be called by
+kubelet at least once for each successful `NodePrepareResource`. The
+Plugin SHALL assume that this RPC will be executed on the node where
+the resource is being used.
+
+This RPC is called by kubelet when the Pod using the resource is being
+deleted.
+
+This operation MUST be idempotent. If this RPC failed, or kubelet does
+not know if it failed or not, it can choose to call
+`NodeUnprepareResource` again.
+
+```protobuf
+message NodeUnprepareResourceRequest {
+  // The UID of the ResourceClaim. This field is REQUIRED.
+  string resource_id = 1;
+}
+
+message NodeUnprepareResourceResponse {
+  // Intentionally empty.
+}
+```
+
+##### NodeUnprepareResource Errors
+
+If the plugin is unable to complete the NodeUnprepareResource call
+successfully, it MUST return a non-ok gRPC code in the gRPC status.
+If the conditions defined below are encountered, the plugin MUST
+return the specified gRPC error code. Kubelet MUST implement the
+specified error recovery behavior when it encounters the gRPC error
+code.
+ +| Condition | gRPC Code | Description | Recovery Behavior | +|-----------|-----------|-------------|-------------------| +| Resource does not exist | 5 NOT_FOUND | Indicates that a resource corresponding to the specified `resource_id` does not exist. | Caller MUST verify that the `resource_id` is correct and that the resource is accessible and has not been deleted before retrying with exponential back off. | + + +#### Implementing a plugin for node resources + +The proposal depends on a central controller plugin. Implementing that +part poses an additional challenge for plugins that so far only ran +locally on a node because they now need to establish a secure +communication path between node and controller. + +How plugins implement that is up to the developer. This section +outlines a possible solution. If there is sufficient demand, common +code for this solution could be made available as a reusable Go +module. + +- Each plugin defines a CRD which describes how much resources are + available per node and how much is currently allocated. +- RBAC rules ensure that only the plugin can modify objects of that + type. The objects can and should be namespaced, which makes it + possible to add automatic cleanup via owner references (similar to + CSIStorageCapacity). +- The node plugin publishes information about the local state via a + CRD object named after the node. Plugin developers can document + those CRDs and then users can query the cluster state by listing + those objects. +- The controller plugin watches those objects and resource claims. It + can keep track of claims that are in the process of being allocated + and consider that when determining where another claim might get + allocated. For delayed allocation, the controller plugin informs the + scheduler by updating the ResourceClaimStatus.SuitableNodes field + which then sets the selected node field. For immediate allocation, + the controller plugin itself sets the selected node field. +- In both cases, the node plugin waits for a ResourceClaim assigned to + its own node and tries to allocate the resource. If that fails, it + can unset the selected node field to trigger another allocation + attempt elsewhere. + +### Test Plan + + + +### Graduation Criteria + + + +### Upgrade / Downgrade Strategy + + + +### Version Skew Strategy + + + +## Production Readiness Review Questionnaire + + + +### Feature Enablement and Rollback + + + +###### How can this feature be enabled / disabled in a live cluster? + + + +- [ ] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: + - Components depending on the feature gate: +- [ ] Other + - Describe the mechanism: + - Will enabling / disabling the feature require downtime of the control + plane? + - Will enabling / disabling the feature require downtime or reprovisioning + of a node? (Do not assume `Dynamic Kubelet Config` feature is enabled). + +###### Does enabling the feature change any default behavior? + + + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? + + + +###### What happens if we reenable the feature if it was previously rolled back? + +###### Are there any tests for feature enablement/disablement? + + + +### Rollout, Upgrade and Rollback Planning + + + +###### How can a rollout fail? Can it impact already running workloads? + + + +###### What specific metrics should inform a rollback? + + + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? 
+ + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + + + +### Monitoring Requirements + + + +###### How can an operator determine if the feature is in use by workloads? + + + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + + + +- [ ] Metrics + - Metric name: + - [Optional] Aggregation method: + - Components exposing the metric: +- [ ] Other (treat as last resort) + - Details: + +###### What are the reasonable SLOs (Service Level Objectives) for the above SLIs? + + + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + + + +### Scalability + + + +###### Will enabling / using this feature result in any new API calls? + + + +###### Will enabling / using this feature result in introducing new API types? + + + +###### Will enabling / using this feature result in any new calls to the cloud provider? + + + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? + + + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + + + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + + + +### Troubleshooting + + + +###### How does this feature react if the API server and/or etcd is unavailable? + +###### What are other known failure modes? + + + +###### What steps should be taken if SLOs are not being met to determine the problem? + +## Implementation History + + + +## Drawbacks + + + +## Alternatives + + + +### ResourceClaimTemplate + +Instead of creating a ResourceClaim from an embedded template, the +PodStatus could be extended to hold the same information as a +ResourceClaimStatus. Every component which works with that information +then needs permission and extra code to work with PodStatus. Creating +an extra object seems simpler. + +### Reusing volume support as-is + +ResourceClaims are similar to PersistentVolumeClaims and also a lot of +the associated logic is similar. An [early +prototype](https://github.com/intel/proof-of-concept-cdi) used a +custom CSI driver to manage resources. + +The user experience with that approach is poor because per-resource +parameters must be stored in annotations of a PVC due to the lack of +custom per-PVC parameters. Passing annotations as additional parameters was [proposed +before](https://github.com/kubernetes-csi/external-provisioner/issues/86) +but were essentially [rejected by +SIG-Storage](https://github.com/kubernetes-csi/external-provisioner/issues/86#issuecomment-465836185) +because allowing apps to set custom parameters would make apps +non-portable. + +The current volume support also has open issues that affect the +“volume as resource” approach: Multiple different Pods on a node are +allowed to use the same +volume. https://github.com/kubernetes/enhancements/pull/2489 will +address that, but is still work in progress. Recovery from a bad node +selection during delayed binding may get stuck when a Pod has multiple +volumes because volumes are not getting deleted after a partial +provisioning. A proposal to fix that needs further work +(https://github.com/kubernetes/enhancements/pull/1703). 
Each “fake”
+CSI driver would have to implement and install a scheduler extender
+because storage capacity tracking only considers volume size as the
+criterion for selecting nodes, which is not applicable for custom
+resources.
+
+### Extend volume support
+
+The StorageClass and PersistentVolumeClaim structs could be extended
+to allow custom parameters. Together with an extension of the CSI
+standard, that would address the main objection against the previous
+alternative.
+
+However, SIG-Storage and the CSI community would have to agree to this
+kind of reuse and accept that some of the code maintained by them
+becomes more complex because of these new use cases.
+
+### Extend Device Plugins
+
+The Device Plugins API could be extended to implement some of the
+requirements mentioned in the “Motivation” section of this
+document. There were certain attempts to do that, for example an attempt
+to [add a ‘Deallocate’ API call](https://github.com/kubernetes/enhancements/pull/1949) and to [pass pod annotations to the 'Allocate' API call](https://github.com/kubernetes/kubernetes/pull/61775).
+
+However, most of the requirements couldn’t be satisfied with this
+approach as they would require major incompatible changes in the
+Device Plugins API. For example, partial and optional resource
+allocation couldn’t be done without changing the way resources are
+currently declared at the Pod and Device Plugin level.
+
+Extending the Device Plugins API to use the [Container Device Interface](https://github.com/container-orchestrated-devices/container-device-interface)
+would help address some of the requirements, but not all of them.
+
+It should also be taken into account that the Device Plugins API is
+beta. Introducing incompatible changes to it may not be accepted by
+the Kubernetes community.
+
+### Webhooks instead of ResourceClaim updates
+
+In the current design, the scheduler and the resource controller communicate by
+updating fields in a ResourceClaim. This has several advantages compared to an
+approach where kube-scheduler retrieves information from the resource controller
+via HTTP:
+* No need for a new webhook API.
+* Simpler deployment of the resource controller because all it needs are
+  credentials to communicate with the apiserver.
+* Current status can be checked by querying the ResourceClaim.
+
+The downside is higher load on the apiserver and an increase in the size of
+ResourceClaim objects.
+
+## Infrastructure Needed (Optional)
+
diff --git a/keps/sig-node/3063-dynamic-resource-allocation/components.puml b/keps/sig-node/3063-dynamic-resource-allocation/components.puml
new file mode 100644
index 00000000000..610d1174836
--- /dev/null
+++ b/keps/sig-node/3063-dynamic-resource-allocation/components.puml
@@ -0,0 +1,40 @@
+@startuml
+skinparam componentStyle rectangle
+
+cloud "3rd party\ncluster add-on" as 3rdparty {
+  component "resource controller" as vendorcontroller
+  component "resource node plugin" as vendornodeplugin
+}
+
+component Kubernetes {
+  component apiserver {
+    file Pod
+    file "..."
as otherapi + file ResourceClaim + } + component scheduler { + component "resource plugin" as k8sresourceplugin + } + component "controller-manager" as controllermanager { + component "resource claim controller" as k8sresourceclaimcontroller + } + component kubelet { + component "plugin manager" as pluginmanager + component "resource manager" as resourcemanager + } +} + +vendorcontroller -[hidden]> vendornodeplugin +Pod -[hidden]> otherapi +otherapi -[hidden]> ResourceClaim + +Pod -u-> k8sresourceclaimcontroller: read resource template\nfrom Pod spec +ResourceClaim <-u- k8sresourceclaimcontroller: create + +Pod <--> scheduler +ResourceClaim <--> k8sresourceplugin + +ResourceClaim <-> vendorcontroller +pluginmanager <-> vendornodeplugin +resourcemanager <-> vendornodeplugin +@enduml diff --git a/keps/sig-node/3063-dynamic-resource-allocation/components.svg b/keps/sig-node/3063-dynamic-resource-allocation/components.svg new file mode 100644 index 00000000000..79c069ff6bc --- /dev/null +++ b/keps/sig-node/3063-dynamic-resource-allocation/components.svg @@ -0,0 +1,75 @@ +3rd partycluster add-onKubernetesapiserverschedulercontroller-managerkubeletresource controllerresource node pluginPod...ResourceClaimresource pluginresource claim controllerplugin managerresource managerread resource templatefrom Pod speccreate \ No newline at end of file diff --git a/keps/sig-node/3063-dynamic-resource-allocation/kep.yaml b/keps/sig-node/3063-dynamic-resource-allocation/kep.yaml new file mode 100644 index 00000000000..fd6ac3769ab --- /dev/null +++ b/keps/sig-node/3063-dynamic-resource-allocation/kep.yaml @@ -0,0 +1,45 @@ +title: dynamic resource allocation +kep-number: 3063 +authors: + - "@pohly" +owning-sig: sig-node +participating-sigs: + - sig-scheduling +status: provisional +creation-date: 2021-05-17 +reviewers: + - TBD + - "@alice.doe" +approvers: + - TBD + - "@oscar.doe" + +see-also: +replaces: + +# The target maturity stage in the current dev cycle for this KEP. +stage: alpha + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.24" + +# The milestone at which this feature was, or is targeted to be, at each stage. +milestone: + alpha: "v1.24" + beta: "v1.26" + stable: "v1.28" + +feature-gates: + - name: DynamicResourceAllocation + components: + - kube-apiserver + - kube-controller-manager + - kube-scheduler + - kubelet +disable-supported: true + +# The following PRR answers are required at beta release +metrics: + - my_feature_metric