From d0d0e216511ce9f2f055a07b6e89548177bceae2 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Mon, 17 May 2021 20:39:47 +0200 Subject: [PATCH] dynamic resource allocation: initial KEP draft Discussion of some earlier revisions of this KEP happened in https://github.com/pohly/enhancements/pull/1. --- keps/prod-readiness/sig-node/3063.yaml | 6 + .../3063-dynamic-resource-allocation/Makefile | 20 + .../README.md | 1477 +++++++++++++++++ .../components.puml | 40 + .../components.svg | 75 + .../3063-dynamic-resource-allocation/kep.yaml | 45 + 6 files changed, 1663 insertions(+) create mode 100644 keps/prod-readiness/sig-node/3063.yaml create mode 100644 keps/sig-node/3063-dynamic-resource-allocation/Makefile create mode 100644 keps/sig-node/3063-dynamic-resource-allocation/README.md create mode 100644 keps/sig-node/3063-dynamic-resource-allocation/components.puml create mode 100644 keps/sig-node/3063-dynamic-resource-allocation/components.svg create mode 100644 keps/sig-node/3063-dynamic-resource-allocation/kep.yaml diff --git a/keps/prod-readiness/sig-node/3063.yaml b/keps/prod-readiness/sig-node/3063.yaml new file mode 100644 index 00000000000..1f4d0df09aa --- /dev/null +++ b/keps/prod-readiness/sig-node/3063.yaml @@ -0,0 +1,6 @@ +# The KEP must have an approver from the +# "prod-readiness-approvers" group +# of http://git.k8s.io/enhancements/OWNERS_ALIASES +kep-number: 3063 +alpha: + approver: "TBD" diff --git a/keps/sig-node/3063-dynamic-resource-allocation/Makefile b/keps/sig-node/3063-dynamic-resource-allocation/Makefile new file mode 100644 index 00000000000..7da29b85831 --- /dev/null +++ b/keps/sig-node/3063-dynamic-resource-allocation/Makefile @@ -0,0 +1,20 @@ +IMAGES += components.svg + +all: $(IMAGES) +clean: + rm -f $(IMAGES) + +# We use the http://plantuml.com/plantuml server to generate +# images. That way nothing needs to be installed besides Go. +DOC_PLANTUML_GO = $(shell go env GOPATH)/bin/plantuml-go + +%.png: %.puml $(DOC_PLANTUML_GO) + $(DOC_PLANTUML_GO) -format png $< + +%.svg: %.puml $(DOC_PLANTUML_GO) + $(DOC_PLANTUML_GO) -format svg $< + +# Builds the binary in GOPATH/bin. Changing into / first avoids +# modifying the project's go.mod file. 
+$(DOC_PLANTUML_GO): + cd / && go get github.com/acarlson99/plantuml-go diff --git a/keps/sig-node/3063-dynamic-resource-allocation/README.md b/keps/sig-node/3063-dynamic-resource-allocation/README.md new file mode 100644 index 00000000000..ad6af919500 --- /dev/null +++ b/keps/sig-node/3063-dynamic-resource-allocation/README.md @@ -0,0 +1,1477 @@ + +# KEP-NNNN: Dynamic resource allocation + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1](#story-1) + - [Story 2](#story-2) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Implementation](#implementation) + - [API](#api) + - [Communication between kubelet and resource node plugin](#communication-between-kubelet-and-resource-node-plugin) + - [NodePrepareResource](#) + - [NodePrepareResource Errors](#nodeprepareresource-errors) + - [NodeUnprepareResource](#-1) + - [NodeUnprepareResource Errors](#nodeunprepareresource-errors) + - [Implementing a plugin for node resources](#implementing-a-plugin-for-node-resources) + - [Test Plan](#test-plan) + - [Graduation Criteria](#graduation-criteria) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + - [ResourceClaimTemplate](#resourceclaimtemplate) + - [Reusing volume support as-is](#reusing-volume-support-as-is) + - [Extend volume support](#extend-volume-support) + - [Extend Device Plugins](#extend-device-plugins) + - [Webhooks instead of ResourceClaim updates](#webhooks-instead-of-resourceclaim-updates) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. 
+ +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) KEP approvers have approved the KEP status as `implementable` +- [ ] (R) Design details are appropriately documented +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) +- [ ] (R) Graduation criteria is in place +- [ ] (R) Production readiness review completed +- [ ] (R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + + + +Dynamic resource allocation introduces an alternative to the existing [device +manager +API](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/resource-management/device-plugin.md) +for third-party hardware vendors. Both are expected to co-exist, with vendors +choosing the API that better suits their needs on a case-by-case basis. Because +the new API is going to be implemented independently of the existing device +manager, there's little risk of breaking stable APIs. + +The new API is inspired by the existing [volume provisioning support with CSI](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/storage/container-storage-interface.md#provisioning-and-deleting) and uses similar +concepts. The goal is to let users request resources with parameters that can +be different depending on what kind of resource gets requested. Resource +allocations can be ephemeral (specified in a Pod spec, allocated and destroyed +together with the Pod) and persistent (lifecycle managed separately from a Pod, +with an allocated resource used for multiple different Pods). + +Several core Kubernetes components must be modified (see the +[implementation](#implementation) section for details): +- kube-apiserver (new API) +- kube-controller-manager (new controller) +- kube-scheduler (new builtin plugin) +- kubelet (new third-party plugin kind) + +Resources are managed by third-party plugins that communicate with central +Kubernetes components, in particular the kube-scheduler, by updating +objects stored in the kube-apiserver. kube-scheduler only needs to be modified +once to support dynamic resource allocation. Then multiple plugins from +different vendors can be installed at the same time without making further +changes to the scheduler. + +Communication between the kubelet and the local part of the plugin is +handled through local Unix domain sockets and the plugin registration +mechanism, using a new plugin type and a new gRPC interface. +The container runtime uses the +[Container Device Interface +(CDI)](https://github.com/container-orchestrated-devices/container-device-interface) +to expose the resources. + +## Motivation + + + +Originally, Kubernetes and its scheduler only tracked CPU and RAM as +resources for containers. Later, support for storage and discrete, +countable per-node extended resources was added. 
The device plugin +interface then made such local resources available to containers. But +for many newer devices, this approach and the Kubernetes API for +requesting these custom resources is too limited. This KEP addresses +limitations of the current approach for the following use cases: + +- *Device initialization*: When starting a workload I’d like to have + the device reconfigured or reprogrammed during orchestration. For + security reasons workloads should not be able to reconfigure devices + directly. + + *Limitation*: Currently, it’s impossible to specify the desired + device properties that are required for reconfiguring devices. + +- *Device cleanup*: When my workload is finished, I would like to have + a mechanism for cleanup of the device, that will ensure that device + does not contain traces/parameters/data from previous workloads and + appropriate power state/shutdown. + + *Limitation*: Post-stop actions are not supported. + +- *Partial allocation*: When deploying a container I’d like to be able + to use part of the shareable device inside a container and other + containers should be able to use other free resources on the same + device. + + *Limitation*: Current implementation of the device plugin doesn’t + allow one to allocate part of the device because parameters are too limited + and Kubernetes doesn't have enough information about the extended + resources on a node to decide whether they can be shared. + +- *Optional allocation*: When deploying a workload I’d like to specify + soft(optional) device requirements. If a device exists and it’s + allocatable it will be allocated. If not - the workload will be run on + a node without a device. GPU and crypto-offload engines are + examples of this kind of device. If they’re not available, workloads + can still run by falling back to using only the CPU for the same + task. + + *Limitation*: Optional allocation is supported neither by the device + plugins nor by current Pod resource declaration. + +- *Support Over the Fabric devices*: When deploying a container, I’d + like to utilize devices available over the Fabric (PCIe, CXL, + Network, special links, etc). + + *Limitation*: Device Plugins framework supports only local devices. + +- *Access to the container image*: When deploying a container that + needs access to a GPU device, I would like my container to + gracefully fail to start (rather than SIGSEGV at runtime) if a + minimum driver requirement is not met on the host. + + *Limitation*: GPU driver requirements are currently stored as labels + on the container image, and device plugins do not have access to the + container image. + +Because this KEP enables the usage of +[CDI](https://github.com/container-orchestrated-devices/container-device-interface/#readme) +in Kubernetes, it also addresses those problems that are handled by +CDI: + +- *Perform container runtime specific operations*: When deploying a container + that needs access to a device, I would like to be able to reuse the + same pod spec, irrespective of the underlying container runtime in + use (e.g. kata vs. runc). + + *Limitation*: At present, there is no way to perform + runtime-specific operations that may need to occur as a result of + injecting a device into a container (device plugins are runtime + agnostic). A good example is supporting GPU passthrough + virtualization on kata vs. runc. 
+ +- *Access to the plugin container*: When deploying a device plugin, I + would like to ensure that all of the operations that need to occur + as part of “injecting” a device into a container, also occur for the + “plugin container” itself. + + *Limitation*: At present, there is a chicken-and-egg problem to + supporting this because the device plugin is the sole dictator of + which containers will have these operations performed on them. + +### Goals + + + +* More flexibility: + * Arbitrary, resource-specific setup and cleanup actions + * Over-the-fabric resources + * Custom matching of resource requests with available resources, + including handling of optional resource requests +* User-friendly API for describing resource requests +* Allow resource management plugins that can be developed and deployed + separately from Kubernetes and are independent of specific container + runtimes. + +### Non-Goals + + + +* Extend the model that kube-scheduler has about + resources. Instead, it will need information from the plugin for + each resource request to determine where a Pod using the resource + might run. The [Representing Compute Resources in Kubernetes + proposal](https://docs.google.com/document/d/1666PPUs4Lz56TqKygcy6mXkNazde-vwA7q4e5H92sUc/edit#) + had some ideas what information the scheduler might need (“supports + overcommit”, “fractional”), but ultimately any choice regarding that + will only work for certain kinds of resources. + +* Standardize how to describe available resources. Only allocated + resources are visible through the APIs defined below. How to + describe available resources is plugin specific because it depends + on the kind of resource which attributes might be relevant. Plugins + should use and document their individual approach for this (for + example, defining a CRD and publishing through that). + +* Provide an abstraction layer for resource requests, i.e., something like a + “I want some kind of GPU”. Users will need to know about specific + resource plugins and which parameters they support. Portability of + workloads could be added on top of this proposal by introducing the + selection of a resource implementation through labels and + standardizing those labels and the associated parameters. The + [Resource Class + Proposal](https://docs.google.com/document/d/1qKiIVs9AMh2Ua5thhtvWqOqW0MSle_RV3lfriO1Aj6U/edit#heading=h.jzfmfdca34kj) + included such an approach. + +## Proposal + + + +The proposal is that a plugin handles all operations that are specific +to the resources managed by that plugin. This includes operations at +the control plane level (tracking where in the cluster resources are +available, helping with pod scheduling decisions, allocating resources +when requested) as well as the node level (preparing container +startup). Such a plugin can be implemented in arbitrary programming +languages as long as it supports the resource allocation protocol and +gRPC interfaces defined in this KEP. An utility package with Go +support code will be made available to simplify the development of +such a plugin, but using it will not be required and its API is not +part of this KEP. + +Three new API object types get added in a new API group: +- ResourcePlugin, not namespaced, with a description of the plugin. +- ResourceClass, not namespaced, with privileged parameters for + multiple resource instances of a certain kind. All these instances + are provided by the same resource plugin, which is identified by a + field in the class. 
+- ResourceClaim, namespaced, with parameters provided by a normal user + that describes a resource instance that needs to be allocated. A + ResourceClaim contains the usual meta data, a spec and a status. The + spec identifies the plugin that handles the resource via a class + name. + +To support arbitrarily complex parameters, both ResourceClass and +ResourceClaim contain one field which holds a +runtime.RawExtension. Validation can be handled by plugins through an +admission controller (if desired) or later at runtime when the +parameters are passed to the plugin. + +The ResourceClaim spec is read-only once created. The ResourceClaim +status is reserved for system usage and holds the current state of the +resource. The status must not get lost. This is departing from how +Kubernetes traditionally handled status, but something that more +recently [became more +acceptable](https://github.com/kubernetes/enhancements/pull/2537). Kube-scheduler +and plugin communicate by modifying that status. The status is also +how Kubernetes tracks that a plugin has allocated the resource and on +which nodes the resource is available. + +This approach is an intentional simplification compared to the PV/PVC +model for volumes because we don't need to deal with two objects when +allocating resources and therefore don't need something like the +volume binding controller. If desired, a resource plugin can implement +support for manually allocated (“static provisioning” in the context +of volumes) and/or existing resources by reacting to ResourceClaims by +using those resources to satisfy a claim. + +Allocation of a resource happens either immediately (“immediate +allocation”) or when a Pod needs the resource (“delayed allocation”), +depending on a flag in the ResourceClaim spec. Pods reference resource +claims in a new PodSpec.Resources list. Each resource in that list +then can be made available to one or more containers in that Pod. To +support ephemeral resources, an entry in the new PodSpec.Resources +list can also be a ResourceClaimTemplate. When a Pod gets created, +such a template will be used to create a normal ResourceClaim with the +Pod as owner, and then the normal allocation of the resource takes +place. + +For immediate allocation, scheduling Pods is simple because the +resource is already allocated and determines the nodes on which the +Pod may run. For delayed allocation, a node is selected tentatively +and plugin(s) try to allocate their resources for that node. If that +succeeds, the Pod can start to run. If it fails, the scheduler must +determine whether some other node fits the requirements and if so, +request allocation again. If no node fits because some resources were +already allocated for a node and are only usable there, then those +resources must be released and then get allocated elsewhere. + +The resources allocated for a ResourceClaim can be shared by multiple +containers in a pod. Depending on the capabilities defined in the +ResourceClaim by the plugin, a ResourceClaim can be used exclusively +by one pod at a time, by a certain maximum number of pods, or an +unlimited number of pods. + +### User Stories (Optional) + + + +#### Story 1 + +Partial allocation. 
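+
+For illustration, the GPU vendor's resource plugin deployment could include a
+ResourceClass such as the one below. This is only a sketch based on the API
+outlined in the [API](#api) section; the plugin name and the vendor-specific
+parameters shown here are illustrative assumptions, not part of this KEP:
+
+```
+apiVersion: cdi.k8s.io/v1alpha1
+kind: ResourceClass
+metadata:
+  name: gpu
+pluginName: gpu.vendor.example.com
+parameters:
+  # privileged, admin-controlled defaults for all claims using this class
+  shareable: true
+```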
+
+A Pod requests a GPU resource with 2Gb of memory by referencing the `gpu` class:
+```
+apiVersion: v1
+kind: Pod
+metadata:
+  name: device-consumer
+spec:
+  containers:
+  - name: my-ubuntu
+    image: ubuntu
+    command: ["/bin/program"]
+    podResources:
+    - name: gpu_2Gb
+    resources:
+      requests:
+        memory: "64Mi"
+        cpu: "250m"
+      limits:
+        memory: "128Mi"
+        cpu: "500m"
+  resources:
+  - name: gpu_2Gb
+    template:
+      resourceClassName: "gpu"
+      parameters:
+        memory: "2Gb"
+```
+
+This request triggers partial resource allocation on a node that has
+a GPU device with 4Gb of memory, for the duration of the execution of
+this Pod.
+
+If some other Pod uses 1Gb of that device, then 1Gb of free GPU memory
+will be left on that device after the resource is allocated. It can be
+used by another Pod requesting up to 1Gb of GPU memory.
+
+#### Story 2
+
+### Notes/Constraints/Caveats (Optional)
+
+Scheduling is likely to be slower when many Pods request the new
+resource types, both because scheduling such a Pod involves more
+round-trips through the API server for ResourceClaimStatus updates and
+because scheduling one Pod may affect other Pods in ways that cannot
+be anticipated by the kube-scheduler. When many Pods compete for
+limited resources, multiple attempts may be needed before a suitable
+node is found.
+
+The hardware that is expected to need this more flexible allocation
+approach is going to be used by only a small subset of the pods in the
+cluster, and those pods are likely to run for extended periods of time,
+so this is not a major concern.
+
+### Risks and Mitigations
+
+## Design Details
+
+### Implementation
+
+![components](./components.svg)
+
+Several components must be implemented or modified in Kubernetes:
+- The new API must be added to kube-apiserver.
+- A new controller in kube-controller-manager which generates
+  ResourceClaims from Pod ResourceClaimTemplates, similar to
+  https://github.com/kubernetes/kubernetes/tree/master/pkg/controller/volume/ephemeral
+- A kube-scheduler plugin must detect Pods which reference a
+  ResourceClaim (directly or through a template) and ensure that the
+  resource is allocated before the Pod gets scheduled, similar to
+  https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/volume/scheduling/scheduler_binder.go
+- Kubelet must be extended to retrieve information from ResourceClaims
+  and then invoke local resource plugin methods. It must pass information about
+  the additional resources to the container runtime. It must detect
+  whether the container runtime has the necessary support and
+  advertise that in the cluster via node labels to simplify deployment
+  of plugins and Pod scheduling.
+
+For a resource plugin the following components are needed:
+- Some utility library similar to
+  https://github.com/kubernetes-sigs/sig-storage-lib-external-provisioner
+  and the code in driver-registrar.
+- *Resource controller*: a central component which handles resource allocation
+  by watching and modifying ResourceClaims.
+- *Resource node plugin*: a component which cooperates with kubelet to prepare
+  the usage of the resource on a node.
+
+The utility library will be developed outside of Kubernetes and does not have
+to be used by plugins, therefore it is not described further in this KEP.
+
+```
+<<[UNRESOLVED @pohly]>>
+All of the changes in Kubernetes need to be specified in a lot more detail.
+
+A flow diagram showing the state transitions of a ResourceClaim needs to be added.
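+
+As a rough textual placeholder for that diagram, the transitions implied by the
+ResourceClaimPhase values proposed in the API section below are:
+
+  new claim (delayed allocation)   -> WaitingForPod -> Pending (scheduler sees a Pod)
+  new claim (immediate allocation) -> Pending
+  Pending   -> Allocated  (plugin allocated the resource)
+  Pending   -> Failed     (allocation attempt failed; the plugin retries)
+  Allocated -> Reallocate (scheduler: Pod cannot run where the resource is) -> Pending
+  claim deleted -> Freed  (plugin freed the resource and removed its finalizer)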
+ +Upgrade and downgrade scenarios should already be considered for v1alpha1 to ensure +that whatever changes will be needed are in place before going to v1beta1 where +downgrades have to be supported. + +All of that will be added once there is consensus to move ahead with this proposal. +<<[/UNRESOLVED]>> +``` + +### API + +ResourceClaim, ResourceClass and ResourcePlugin are new built-in types +in a new `cdi.k8s.io/v1alpha1` API group. This was chosen instead of +using CRDs because core Kubernetes components must interact with them +and installing of CRDs as part of cluster creation is an unsolved +problem. + +The PodSpec gets extended. + +``` +type ResourceClass struct { + // Resource plugins have a unique name in reverse domain order (acme.example.com). + PluginName string + // Parameters holds arbitrary values that will be available to the plugin + // when allocating a resource that uses this class. The plugin will + // be able to distinguish between parameters stored here and + // and those stored in ResourceClaimSpec. These parameters + // here can only be set by cluster administrators. + Parameters runtime.RawExtension +} + +type ResourcePlugin struct { + // The name of the object is the unique plugin name. + ObjectMeta + + // Features contains a list of features supported by the plugin. + // New features may be added over time and must be ignored + // by code that does not know about them. + Features []ResourcePluginFeature +} + +type ResourcePluginFeature struct { + // Name is one of the pre-defined names for a feature. + Name ResourcePluginFeatureName + // Parameters might provide additional information about how + // the plugin supports the feature. Boolean features have + // no parameters, merely listing them indicates support. + Parameters runtime.RawExtension +} + +type ResourceClaim struct { + // The plugin must set a finalizer here before it attempts to + // allocate the resource. It removes the finalizer again when + // a) the allocation attempt has definitely failed or b) when + // the allocated resource was freed. This ensures that + // resources are not leaked. + ObjectMeta + // Spec describes the desired attributes of a resource that then + // needs to be allocated. It can only be set once when creating + // the ResourceClaim. + Spec ResourceClaimSpec + // Status describes whether the resource is available and with + // which attributes. + Status ResourceClaimStatus +} + +type ResourceClaimSpec struct { + // ResourceClassName references the plugin and additional + // parameters via the name of a ResourceClass that was + // created as part of the plugin deployment. + ResourceClassName string + + // Parameters holds arbitrary values that will be available to the plugin + // when allocating a resource for the claim. + Parameters runtime.RawExtension + + // Allocation can start immediately or when a Pod wants to use + // the resource. Waiting for a Pod is the default. + AllocationMode AllocationMode +} + +type AllocationMode string + +const ( + AllocationModeImmediate AllocationMode = “Immediate” + AllocationModeDelayed AllocationMode = “Delayed” +} + +type ResourceClaimStatus struct { + // Explains what the current status of the claim is and + // determines which component needs to do something. + Phase ResourceClaimPhase + + // When allocation is delayed, the scheduler must set + // the node for which it wants the resource to be allocated + // before the plugin proceeds with allocation. + // For immediate allocation, the scheduler will not set + // this field. 
The plugin controller component may then + // set it to trigger allocation on a specific node if the + // resources are local to nodes. + SelectedNode string + + // When allocation is delayed, and the scheduler needs to + // decide on which node a Pod should run, it must first + // ask the plugin on which nodes the resource might be + // made available. To trigger that check, the scheduler must + // provide the names of nodes which might be suitable + // for the Pod. + PotentialNodes []string + + // A change of the node candidates triggers a check + // on which nodes the resource could be made available. + // This can change, so the plugin must refresh + // this information periodically until a node gets + // selected by the scheduler. + SuitableNodes []string + + // An allocated resource is available on nodes that match this + // selector. If nil, the resource is available everywhere. + AvailableOnNodes *corev1.NodeSelector + + // Arbitrary data returned by the plugin after a successful allocation. + // This data is passed to the plugin for all operations involving + // the allocated resource. This is opaque for Kubernetes. + // Plugin documentation may explain to users how to interpret + // this data if needed. + Attributes map[string]string + + // UsersLimit determines how many entities are allowed to use this resource + // at the same time. The default is 1. -1 enables the usage by an unlimited number + // of users. Individual containers in a pod are not counted as users, only the Pod + // is. + UserLimit int + + // UsedBy indicates which entities are currently using the resource. + // Usually those are Pods. Only Pods listed as users can be scheduled, + // all others must wait. Updated by kube-scheduler as part of Pod scheduling + // (TBD – a separate controller might also work). + UsedBy []metav1.OwnerReference +} + +type ResourceClaimPhase string + +const ( + // The claim is waiting for a Pod. This the default for + // a new claim with delayed allocation. Once the scheduler + // sees a Pod which needs the claim, it changes the status + // to “pending”. + ResourceClaimWaitingForPod = “WaitingForPod” + + // The claim is waiting for allocation by the plugin. This is the default + // for a new claim with immediate allocation. + ResourceClaimPending ResourceClaimPhase = “Pending” + + // Set by the plugin once the resource has been successfully + // allocated. The scheduler waits for all resources used by + // a Pod to be in this phase. + ResourceClaimAllocated ResourceClaimPhase = “Allocated” + + // It can happen that a resource got allocated for a Pod and + // then the Pod cannot run on the nodes where the allocated + // resource is available. The scheduler detects this and + // then sets the “reallocate” phase to tell the plugin that it must + // free the resource. The plugin does that and moves it + // back to pending. + ResourceClaimReallocate ResourceClaimPhase = “Reallocate” + + // Set by the plugin once a resource allocation attempt + // failed. The plugin will retry the allocation. + ResourceClaimFailed ResourceClaimPhase = “Failed” + + // Deleting the ResourceClaim triggers freeing the resource. + // Because of the plugin’s finalizer, such a claim then + // continues to exist with a DeletionTimeStamp. + // Once the plugin has successfully freed the resource, + // it sets this status and removes the finalizer. Usually + // the claim object will then be removed quickly, but + // additional finalizers might also keep it around longer. 
+ ResourceClaimFreed ResourceClaimPhase = “Freed” +) + +type PodSpec { + ... + Resources []PodResource + ... +} + +type Container { + ... + // The entries are the names of resources in PodSpec.Resources + // that are used by the container. + PodResources []string + ... +} + +type PodResource struct { + // A name under which this resource can be referenced by the containers. + Name string + + // The resource is independent of the Pod and defined by + // a separate ResourceClaim in the same namespace as + // the Pod. Either this or Template must be set, but not both. + ResourceClaimName *string + + // Will be used to create a stand-alone ResourceClaim to allocate the resource. + // The pod in which this PodResource is embedded will be the + // owner of the ResourceClaim, i.e. the ResourceClaim will be deleted together with the + // pod. The name of the ResourceClaim will be `-` where + // `` is the name PodResource.Name + // Pod validation will reject the pod if the concatenated name + // is not valid for a ResourceClaim (for example, too long). + // + // An existing ResourceClaim with that name that is not owned by the pod + // will *not* be used for the pod to avoid using an unrelated + // resource by mistake. Starting the pod is then blocked until + // the unrelated ResourceClaim is removed. If such a pre-created ResourceClaim is + // meant to be used by the pod, the ResourceClaim has to be updated with an + // owner reference to the pod once the pod exists. Normally + // this should not be necessary, but it may be useful when + // manually reconstructing a broken cluster. + // + // This field is read-only and no changes will be made by Kubernetes + // to the ResourceClaim after it has been created. + Template *ResourceClaimTemplate +} + +type ResourceClaimTemplate struct { + // May contain labels and annotations that will be copied into the ResourceClaim + // when creating it. No other fields are allowed and will be rejected during + // validation. + metav1.ObjectMeta + + // The specification for the ResourceClaim. The entire content is + // copied unchanged into the ResourceClaim that gets created from this + // template. The same fields as in a ResourceClaim + // are also valid here. + Spec ResourceClaimSpec +} +``` + +### Communication between kubelet and resource node plugin + +This gRPC interface is provided by the resource node plugin and invoked by +kubelet. It is inspired by +[CSI](https://github.com/container-storage-interface/spec/blob/master/spec.md), +with “volume” replaced by “resource” and volume specific parts removed. + +``` +<<[UNRESOLVED @pohly]>> +Do plugin operations need secrets? They are currently not part of the proposed Kubernetes API. +<<[/UNRESOLVED]>> +``` + +#### `NodePrepareResource` + +This RPC is called by kubelet when a Pod that wants to use the +specified resource is scheduled on a node. The Plugin SHALL assume +that this RPC will be executed on the node where the resource will be +used. The Plugin SHALL return device name and kind for allocated +device[s]. + +The Plugin SHALL create or update json file[s] in CDI format for each +allocated device. These files SHALL be used by runtime to update +runtime configuration before creating containers that use the +device[s]. + +This operation SHALL do as little work as possible as it’s called +after a pod is scheduled to a node. All potentially failing operations +SHALL be done during allocation phase. + +This operation MUST be idempotent. 
If the resource corresponding to
+the `resource_id` has already been prepared, the Plugin MUST reply `0
+OK`.
+
+If this RPC failed, or kubelet does not know if it failed or not, it
+MAY choose to call `NodePrepareResource` again, or choose to call
+`NodeUnprepareResource`.
+
+After a successful call, kubelet MUST pass the device names and kinds to
+the runtime through the CRI protocol.
+
+```
+<<[UNRESOLVED @bart0sh]>>
+The CRI protocol may need to be extended for
+this purpose, e.g. a device id could be added to the CRI Device structure.
+<<[/UNRESOLVED]>>
+```
+
+```protobuf
+message NodePrepareResourceRequest {
+  // The UID of the ResourceClaim. This field is REQUIRED.
+  string resource_id = 1;
+}
+
+message NodePrepareResourceResponse {
+  // These are the additional devices that kubelet must
+  // make available via the container runtime. A resource
+  // may have zero or more devices.
+  repeated CDIDevice devices = 1;
+}
+
+message CDIDevice {
+  // Kind is the string that together with the name identifies a device
+  // (https://github.com/container-orchestrated-devices/container-device-interface/blob/master/SPEC.md#kind).
+  string kind = 1;
+  // Name is the name that within its kind uniquely identifies a
+  // device (https://github.com/container-orchestrated-devices/container-device-interface/blob/master/SPEC.md#cdi-devices).
+  string name = 2;
+}
+```
+
+##### NodePrepareResource Errors
+
+If the plugin is unable to complete the NodePrepareResource call
+successfully, it MUST return a non-ok gRPC code in the gRPC status.
+If the conditions defined below are encountered, the plugin MUST
+return the specified gRPC error code. Kubelet MUST implement the
+specified error recovery behavior when it encounters the gRPC error
+code.
+
+| Condition | gRPC Code | Description | Recovery Behavior |
+|-----------|-----------|-------------|-------------------|
+| Resource does not exist | 5 NOT_FOUND | Indicates that a resource corresponding to the specified `resource_id` does not exist. | Caller MUST verify that the `resource_id` is correct and that the resource is accessible and has not been deleted before retrying with exponential back off. |
+
+#### `NodeUnprepareResource`
+
+A Node Plugin MUST implement this RPC call. This RPC is the reverse
+operation of `NodePrepareResource`. It MUST undo the work done by
+the corresponding `NodePrepareResource`. This RPC SHALL be called by
+kubelet at least once for each successful `NodePrepareResource`. The
+Plugin SHALL assume that this RPC will be executed on the node where
+the resource is being used.
+
+This RPC is called by kubelet when the Pod using the resource is being
+deleted.
+
+This operation MUST be idempotent. If this RPC failed, or kubelet does
+not know if it failed or not, it can choose to call
+`NodeUnprepareResource` again.
+
+```protobuf
+message NodeUnprepareResourceRequest {
+  // The UID of the ResourceClaim. This field is REQUIRED.
+  string resource_id = 1;
+}
+
+message NodeUnprepareResourceResponse {
+  // Intentionally empty.
+}
+```
+
+##### NodeUnprepareResource Errors
+
+If the plugin is unable to complete the NodeUnprepareResource call
+successfully, it MUST return a non-ok gRPC code in the gRPC status.
+If the conditions defined below are encountered, the plugin MUST
+return the specified gRPC error code. Kubelet MUST implement the
+specified error recovery behavior when it encounters the gRPC error
+code.
+ +| Condition | gRPC Code | Description | Recovery Behavior | +|-----------|-----------|-------------|-------------------| +| Resource does not exist | 5 NOT_FOUND | Indicates that a resource corresponding to the specified `resource_id` does not exist. | Caller MUST verify that the `resource_id` is correct and that the resource is accessible and has not been deleted before retrying with exponential back off. | + + +#### Implementing a plugin for node resources + +The proposal depends on a central controller plugin. Implementing that +part poses an additional challenge for plugins that so far only ran +locally on a node because they now need to establish a secure +communication path between node and controller. + +How plugins implement that is up to the developer. This section +outlines a possible solution. If there is sufficient demand, common +code for this solution could be made available as a reusable Go +module. + +- Each plugin defines a CRD which describes how much resources are + available per node and how much is currently allocated. +- RBAC rules ensure that only the plugin can modify objects of that + type. The objects can and should be namespaced, which makes it + possible to add automatic cleanup via owner references (similar to + CSIStorageCapacity). +- The node plugin publishes information about the local state via a + CRD object named after the node. Plugin developers can document + those CRDs and then users can query the cluster state by listing + those objects. +- The controller plugin watches those objects and resource claims. It + can keep track of claims that are in the process of being allocated + and consider that when determining where another claim might get + allocated. For delayed allocation, the controller plugin informs the + scheduler by updating the ResourceClaimStatus.SuitableNodes field + which then sets the selected node field. For immediate allocation, + the controller plugin itself sets the selected node field. +- In both cases, the node plugin waits for a ResourceClaim assigned to + its own node and tries to allocate the resource. If that fails, it + can unset the selected node field to trigger another allocation + attempt elsewhere. + +### Test Plan + + + +### Graduation Criteria + + + +### Upgrade / Downgrade Strategy + + + +### Version Skew Strategy + + + +## Production Readiness Review Questionnaire + + + +### Feature Enablement and Rollback + + + +###### How can this feature be enabled / disabled in a live cluster? + + + +- [ ] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: + - Components depending on the feature gate: +- [ ] Other + - Describe the mechanism: + - Will enabling / disabling the feature require downtime of the control + plane? + - Will enabling / disabling the feature require downtime or reprovisioning + of a node? (Do not assume `Dynamic Kubelet Config` feature is enabled). + +###### Does enabling the feature change any default behavior? + + + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? + + + +###### What happens if we reenable the feature if it was previously rolled back? + +###### Are there any tests for feature enablement/disablement? + + + +### Rollout, Upgrade and Rollback Planning + + + +###### How can a rollout fail? Can it impact already running workloads? + + + +###### What specific metrics should inform a rollback? + + + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? 
+ + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + + + +### Monitoring Requirements + + + +###### How can an operator determine if the feature is in use by workloads? + + + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + + + +- [ ] Metrics + - Metric name: + - [Optional] Aggregation method: + - Components exposing the metric: +- [ ] Other (treat as last resort) + - Details: + +###### What are the reasonable SLOs (Service Level Objectives) for the above SLIs? + + + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + + + +### Scalability + + + +###### Will enabling / using this feature result in any new API calls? + + + +###### Will enabling / using this feature result in introducing new API types? + + + +###### Will enabling / using this feature result in any new calls to the cloud provider? + + + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? + + + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + + + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + + + +### Troubleshooting + + + +###### How does this feature react if the API server and/or etcd is unavailable? + +###### What are other known failure modes? + + + +###### What steps should be taken if SLOs are not being met to determine the problem? + +## Implementation History + + + +## Drawbacks + + + +## Alternatives + + + +### ResourceClaimTemplate + +Instead of creating a ResourceClaim from an embedded template, the +PodStatus could be extended to hold the same information as a +ResourceClaimStatus. Every component which works with that information +then needs permission and extra code to work with PodStatus. Creating +an extra object seems simpler. + +### Reusing volume support as-is + +ResourceClaims are similar to PersistentVolumeClaims and also a lot of +the associated logic is similar. An [early +prototype](https://github.com/intel/proof-of-concept-cdi) used a +custom CSI driver to manage resources. + +The user experience with that approach is poor because per-resource +parameters must be stored in annotations of a PVC due to the lack of +custom per-PVC parameters. Passing annotations as additional parameters was [proposed +before](https://github.com/kubernetes-csi/external-provisioner/issues/86) +but were essentially [rejected by +SIG-Storage](https://github.com/kubernetes-csi/external-provisioner/issues/86#issuecomment-465836185) +because allowing apps to set custom parameters would make apps +non-portable. + +The current volume support also has open issues that affect the +“volume as resource” approach: Multiple different Pods on a node are +allowed to use the same +volume. https://github.com/kubernetes/enhancements/pull/2489 will +address that, but is still work in progress. Recovery from a bad node +selection during delayed binding may get stuck when a Pod has multiple +volumes because volumes are not getting deleted after a partial +provisioning. A proposal to fix that needs further work +(https://github.com/kubernetes/enhancements/pull/1703). 
Each “fake”
+CSI driver would have to implement and install a scheduler extender
+because storage capacity tracking only considers volume size as the
+criterion for selecting nodes, which is not applicable for custom
+resources.
+
+### Extend volume support
+
+The StorageClass and PersistentVolumeClaim structs could be extended
+to allow custom parameters. Together with an extension of the CSI
+standard, that would address the main objection against the previous
+alternative.
+
+However, SIG-Storage and the CSI community would have to agree to this
+kind of reuse and accept that some of the code maintained by them
+becomes more complex because of these new use cases.
+
+### Extend Device Plugins
+
+The Device Plugins API could be extended to implement some of the
+requirements mentioned in the “Motivation” section of this
+document. There were certain attempts to do that, for example an attempt
+to [add a ‘Deallocate’ API call](https://github.com/kubernetes/enhancements/pull/1949) and to [pass pod annotations to the 'Allocate' API call](https://github.com/kubernetes/kubernetes/pull/61775).
+
+However, most of the requirements couldn’t be satisfied with this
+approach as they would require major incompatible changes in the
+Device Plugins API. For example, partial and optional resource
+allocation couldn’t be done without changing the way resources are
+currently declared at the Pod and Device Plugin level.
+
+Extending the Device Plugins API to use the [Container Device Interface](https://github.com/container-orchestrated-devices/container-device-interface)
+would help address some of the requirements, but not all of them.
+
+It should also be taken into account that the Device Plugins API is
+beta. Introducing incompatible changes to it may not be accepted by
+the Kubernetes community.
+
+### Webhooks instead of ResourceClaim updates
+
+In the current design, the scheduler and the resource controller communicate by
+updating fields in a ResourceClaim. This has several advantages compared to an
+approach where kube-scheduler retrieves information from the resource controller
+via HTTP:
+* No need for a new webhook API.
+* Simpler deployment of the resource controller because all it needs are
+  credentials to communicate with the apiserver.
+* Current status can be checked by querying the ResourceClaim.
+
+The downside is higher load on the apiserver and an increase in the size of
+ResourceClaim objects.
+
+## Infrastructure Needed (Optional)
+
diff --git a/keps/sig-node/3063-dynamic-resource-allocation/components.puml b/keps/sig-node/3063-dynamic-resource-allocation/components.puml
new file mode 100644
index 00000000000..610d1174836
--- /dev/null
+++ b/keps/sig-node/3063-dynamic-resource-allocation/components.puml
@@ -0,0 +1,40 @@
+@startuml
+skinparam componentStyle rectangle
+
+cloud "3rd party\ncluster add-on" as 3rdparty {
+  component "resource controller" as vendorcontroller
+  component "resource node plugin" as vendornodeplugin
+}
+
+component Kubernetes {
+  component apiserver {
+    file Pod
+    file "..."
as otherapi + file ResourceClaim + } + component scheduler { + component "resource plugin" as k8sresourceplugin + } + component "controller-manager" as controllermanager { + component "resource claim controller" as k8sresourceclaimcontroller + } + component kubelet { + component "plugin manager" as pluginmanager + component "resource manager" as resourcemanager + } +} + +vendorcontroller -[hidden]> vendornodeplugin +Pod -[hidden]> otherapi +otherapi -[hidden]> ResourceClaim + +Pod -u-> k8sresourceclaimcontroller: read resource template\nfrom Pod spec +ResourceClaim <-u- k8sresourceclaimcontroller: create + +Pod <--> scheduler +ResourceClaim <--> k8sresourceplugin + +ResourceClaim <-> vendorcontroller +pluginmanager <-> vendornodeplugin +resourcemanager <-> vendornodeplugin +@enduml diff --git a/keps/sig-node/3063-dynamic-resource-allocation/components.svg b/keps/sig-node/3063-dynamic-resource-allocation/components.svg new file mode 100644 index 00000000000..79c069ff6bc --- /dev/null +++ b/keps/sig-node/3063-dynamic-resource-allocation/components.svg @@ -0,0 +1,75 @@ +3rd partycluster add-onKubernetesapiserverschedulercontroller-managerkubeletresource controllerresource node pluginPod...ResourceClaimresource pluginresource claim controllerplugin managerresource managerread resource templatefrom Pod speccreate \ No newline at end of file diff --git a/keps/sig-node/3063-dynamic-resource-allocation/kep.yaml b/keps/sig-node/3063-dynamic-resource-allocation/kep.yaml new file mode 100644 index 00000000000..fd6ac3769ab --- /dev/null +++ b/keps/sig-node/3063-dynamic-resource-allocation/kep.yaml @@ -0,0 +1,45 @@ +title: dynamic resource allocation +kep-number: 3063 +authors: + - "@pohly" +owning-sig: sig-node +participating-sigs: + - sig-scheduling +status: provisional +creation-date: 2021-05-17 +reviewers: + - TBD + - "@alice.doe" +approvers: + - TBD + - "@oscar.doe" + +see-also: +replaces: + +# The target maturity stage in the current dev cycle for this KEP. +stage: alpha + +# The most recent milestone for which work toward delivery of this KEP has been +# done. This can be the current (upcoming) milestone, if it is being actively +# worked on. +latest-milestone: "v1.24" + +# The milestone at which this feature was, or is targeted to be, at each stage. +milestone: + alpha: "v1.24" + beta: "v1.26" + stable: "v1.28" + +feature-gates: + - name: DynamicResourceAllocation + components: + - kube-apiserver + - kube-controller-manager + - kube-scheduler + - kubelet +disable-supported: true + +# The following PRR answers are required at beta release +metrics: + - my_feature_metric