diff --git a/.github/workflows/lib-build.yaml b/.github/workflows/lib-build.yaml index 27971dc6b..1cf26ec31 100644 --- a/.github/workflows/lib-build.yaml +++ b/.github/workflows/lib-build.yaml @@ -26,7 +26,6 @@ jobs: - intel-dsa-plugin - intel-iaa-plugin - intel-idxd-config-initcontainer - - intel-xpumanager-sidecar # # Demo images - crypto-perf diff --git a/.github/workflows/lib-publish.yaml b/.github/workflows/lib-publish.yaml index e665052bb..95e0c0d8c 100644 --- a/.github/workflows/lib-publish.yaml +++ b/.github/workflows/lib-publish.yaml @@ -56,7 +56,6 @@ jobs: - intel-dsa-plugin - intel-iaa-plugin - intel-idxd-config-initcontainer - - intel-xpumanager-sidecar steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5 diff --git a/.trivyignore.yaml b/.trivyignore.yaml index 7433e0d66..06c95300f 100644 --- a/.trivyignore.yaml +++ b/.trivyignore.yaml @@ -19,7 +19,6 @@ misconfigurations: - id: AVD-KSV-0047 statement: gpu plugin in kubelet mode requires "nodes/proxy" resource access paths: - - gpu_plugin/overlays/fractional_resources/gpu-manager-role.yaml - operator/rbac/gpu_manager_role.yaml - operator/rbac/role.yaml @@ -27,9 +26,6 @@ misconfigurations: statement: These are false detections for not setting "readOnlyFilesystem" paths: - fpga_plugin/overlays/region/mode-region.yaml - - gpu_plugin/overlays/fractional_resources/add-mounts.yaml - - gpu_plugin/overlays/fractional_resources/add-args.yaml - - gpu_plugin/overlays/fractional_resources/gpu-manager-role.yaml - gpu_plugin/overlays/monitoring_shared-dev_nfd/add-args.yaml - gpu_plugin/overlays/nfd_labeled_nodes/add-args.yaml - iaa_plugin/overlays/iaa_initcontainer/iaa_initcontainer.yaml diff --git a/Makefile b/Makefile index cf2cf5e53..2978ea443 100644 --- a/Makefile +++ b/Makefile @@ -177,7 +177,7 @@ endif dockerlib = build/docker/lib dockertemplates = build/docker/templates -images = $(shell basename -s .Dockerfile.in -a 
$(dockertemplates)/*.Dockerfile.in | grep -v -e dlb -e fpga -e kerneldrv) +images = $(shell basename -s .Dockerfile.in -a $(dockertemplates)/*.Dockerfile.in | grep -v -e dlb -e fpga -e xpumanager-sidecar) dockerfiles = $(shell basename -s .in -a $(dockertemplates)/*.Dockerfile.in | xargs -I"{}" echo build/docker/{}) test-image-base-layer: diff --git a/README.md b/README.md index 31ddfe29b..579797325 100644 --- a/README.md +++ b/README.md @@ -196,12 +196,6 @@ The [Device plugins operator README](cmd/operator/README.md) gives the installat The [Device plugins Operator for OpenShift](https://github.com/intel/intel-technology-enabling-for-openshift) gives the installation and usage details for the operator available on [Red Hat OpenShift Container Platform](https://catalog.redhat.com/software/operators/detail/61e9f2d7b9cdd99018fc5736). -## XeLink XPU Manager Sidecar - -To support interconnected GPUs in Kubernetes, XeLink sidecar is needed. - -The [XeLink XPU Manager sidecar README](cmd/xpumanager_sidecar/README.md) gives information how the sidecar functions and how to use it. - ## Intel GPU Level-Zero sidecar Sidecar uses Level-Zero API to provide additional GPU information for the GPU plugin that it cannot get through sysfs interfaces. diff --git a/build/docker/intel-qat-plugin-kerneldrv.Dockerfile b/build/docker/intel-qat-plugin-kerneldrv.Dockerfile deleted file mode 100644 index 7db3a68be..000000000 --- a/build/docker/intel-qat-plugin-kerneldrv.Dockerfile +++ /dev/null @@ -1,72 +0,0 @@ -## This is a generated file, do not edit directly. Edit build/docker/templates/intel-qat-plugin-kerneldrv.Dockerfile.in instead. -## -## Copyright 2022 Intel Corporation. All Rights Reserved. -## -## Licensed under the Apache License, Version 2.0 (the "License"); -## you may not use this file except in compliance with the License. 
-## You may obtain a copy of the License at -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -## See the License for the specific language governing permissions and -## limitations under the License. -### -## FINAL_BASE can be used to configure the base image of the final image. -## -## This is used in two ways: -## 1) make BUILDER= -## 2) docker build ... -f .Dockerfile -## -## The project default is 1) which sets FINAL_BASE=gcr.io/distroless/static -## (see build-image.sh). -## 2) and the default FINAL_BASE is primarily used to build Redhat Certified Openshift Operator container images that must be UBI based. -## The RedHat build tool does not allow additional image build parameters. -ARG FINAL_BASE=registry.access.redhat.com/ubi9-micro:latest -### -## -## GOLANG_BASE can be used to make the build reproducible by choosing an -## image by its hash: -## GOLANG_BASE=golang@sha256:9d64369fd3c633df71d7465d67d43f63bb31192193e671742fa1c26ebc3a6210 -## -## This is used on release branches before tagging a stable version. -## The main branch defaults to using the latest Golang base image. -ARG GOLANG_BASE=golang:1.24-bookworm -### -FROM ${GOLANG_BASE} AS builder -ARG DIR=/intel-device-plugins-for-kubernetes -ARG GO111MODULE=on -ARG LDFLAGS="all=-w -s" -ARG GOFLAGS="-trimpath" -ARG GCFLAGS="all=-spectre=all -N -l" -ARG ASMFLAGS="all=-spectre=all" -ARG GOLICENSES_VERSION -ARG EP=/usr/local/bin/intel_sgx_device_plugin -ARG CMD=qat_plugin -WORKDIR $DIR -COPY . . 
-ARG QAT_DRIVER_RELEASE="qat1.7.l.4.14.0-00031" -ARG QAT_DRIVER_SHA256="a68dfaea4308e0bb5f350b7528f1a076a0c6ba3ec577d60d99dc42c49307b76e" -SHELL ["/bin/bash", "-o", "pipefail", "-c"] -RUN mkdir -p /usr/src/qat && cd /usr/src/qat && wget -q https://downloadmirror.intel.com/30178/eng/$QAT_DRIVER_RELEASE.tar.gz && echo "$QAT_DRIVER_SHA256 $QAT_DRIVER_RELEASE.tar.gz" | sha256sum -c - && tar xf *.tar.gz && cd /usr/src/qat/quickassist/utilities/adf_ctl && LDFLAGS= make KERNEL_SOURCE_DIR=/usr/src/qat/quickassist/qat && install -D adf_ctl /install_root/usr/local/bin/adf_ctl -RUN (cd cmd/$CMD && GOFLAGS=${GOFLAGS} GO111MODULE=${GO111MODULE} CGO_ENABLED=1 go install -gcflags="${GCFLAGS}" -asmflags="${ASMFLAGS}" -ldflags="${LDFLAGS}" -tags kerneldrv) -RUN chmod a+x /go/bin/$CMD && install -D /go/bin/$CMD /install_root/usr/local/bin/intel_qat_device_plugin -RUN install -D ${DIR}/LICENSE /install_root/licenses/intel-device-plugins-for-kubernetes/LICENSE \ - && if [ ! -d "licenses/$CMD" ] ; then \ - GO111MODULE=on GOROOT=$(go env GOROOT) go run github.com/google/go-licenses@${GOLICENSES_VERSION} save "./cmd/$CMD" \ - --save_path /install_root/licenses/$CMD/go-licenses ; \ - else mkdir -p /install_root/licenses/$CMD/go-licenses/ && cd licenses/$CMD && cp -r * /install_root/licenses/$CMD/go-licenses/ ; fi && \ - echo "Verifying installed licenses" && test -e /install_root/licenses/$CMD/go-licenses -FROM debian:unstable-slim -LABEL vendor='Intel®' -LABEL org.opencontainers.image.source='https://github.com/intel/intel-device-plugins-for-kubernetes' -LABEL maintainer="Intel®" -LABEL version='devel' -LABEL release='1' -LABEL name='intel-qat-plugin-kerneldrv' -LABEL summary='Intel® QAT device plugin kerneldrv for Kubernetes' -COPY --from=builder /install_root / -ENV PATH=/usr/local/bin -ENTRYPOINT ["/usr/local/bin/intel_qat_device_plugin"] diff --git a/build/docker/templates/intel-qat-plugin-kerneldrv.Dockerfile.in b/build/docker/templates/intel-qat-plugin-kerneldrv.Dockerfile.in 
deleted file mode 100644 index dac5efdcd..000000000 --- a/build/docker/templates/intel-qat-plugin-kerneldrv.Dockerfile.in +++ /dev/null @@ -1,43 +0,0 @@ -#include "final_base.docker" -#include "golang_base.docker" - -FROM ${GOLANG_BASE} AS builder - -#include "default_args.docker" - -#define _ENTRYPOINT_ /usr/local/bin/intel_sgx_device_plugin -ARG EP=_ENTRYPOINT_ -ARG CMD=qat_plugin - -WORKDIR $DIR -COPY . . - -ARG QAT_DRIVER_RELEASE="qat1.7.l.4.14.0-00031" -ARG QAT_DRIVER_SHA256="a68dfaea4308e0bb5f350b7528f1a076a0c6ba3ec577d60d99dc42c49307b76e" - -SHELL ["/bin/bash", "-o", "pipefail", "-c"] -RUN mkdir -p /usr/src/qat \ - && cd /usr/src/qat \ - && wget -q https://downloadmirror.intel.com/30178/eng/$QAT_DRIVER_RELEASE.tar.gz \ - && echo "$QAT_DRIVER_SHA256 $QAT_DRIVER_RELEASE.tar.gz" | sha256sum -c - \ - && tar xf *.tar.gz \ - && cd /usr/src/qat/quickassist/utilities/adf_ctl \ - && LDFLAGS= make KERNEL_SOURCE_DIR=/usr/src/qat/quickassist/qat \ - && install -D adf_ctl /install_root/usr/local/bin/adf_ctl -RUN (cd cmd/$CMD && GOFLAGS=${GOFLAGS} GO111MODULE=${GO111MODULE} CGO_ENABLED=1 go install -gcflags="${GCFLAGS}" -asmflags="${ASMFLAGS}" -ldflags="${LDFLAGS}" -tags kerneldrv) -RUN chmod a+x /go/bin/$CMD \ - && install -D /go/bin/$CMD /install_root/usr/local/bin/intel_qat_device_plugin - -#include "default_licenses.docker" - - -FROM debian:unstable-slim - -#include "default_labels.docker" - -LABEL name='intel-qat-plugin-kerneldrv' -LABEL summary='Intel® QAT device plugin kerneldrv for Kubernetes' - -COPY --from=builder /install_root / -ENV PATH=/usr/local/bin -ENTRYPOINT ["/usr/local/bin/intel_qat_device_plugin"] diff --git a/cmd/gpu_plugin/README.md b/cmd/gpu_plugin/README.md index b5d4b8dd7..e86a90452 100644 --- a/cmd/gpu_plugin/README.md +++ b/cmd/gpu_plugin/README.md @@ -47,8 +47,6 @@ Intel GPU plugin may register four node resources to the Kubernetes cluster: | gpu.intel.com/xe | GPU instance running new `xe` KMD | | gpu.intel.com/xe_monitoring | Monitoring 
resource for the new `xe` KMD devices | -While GPU plugin basic operations support nodes having both (`i915` and `xe`) KMDs on the same node, its resource management (=GAS) does not, for that node needs to have only one of the KMDs present. - For workloads on different KMDs, see [KMD and UMD](#kmd-and-umd). ## Modes and Configuration Options @@ -56,11 +54,10 @@ For workloads on different KMDs, see [KMD and UMD](#kmd-and-umd). | Flag | Argument | Default | Meaning | |:---- |:-------- |:------- |:------- | | -enable-monitoring | - | disabled | Enable '*_monitoring' resource that provides access to all Intel GPU devices on the node, [see use](./monitoring.md) | -| -resource-manager | - | disabled | Deprecated. Enable fractional resource management, [see use](./fractional.md) | | -health-management | - | disabled | Enable health management by requesting data from oneAPI/Level-Zero interface. Requires [GPU Level-Zero](../gpu_levelzero/) sidecar. See [health management](#health-management) | | -wsl | - | disabled | Adapt plugin to run in the WSL environment. Requires [GPU Level-Zero](../gpu_levelzero/) sidecar. | | -shared-dev-num | int | 1 | Number of containers that can share the same GPU device | -| -allocation-policy | string | none | 3 possible values: balanced, packed, none. For shared-dev-num > 1: _balanced_ mode spreads workloads among GPU devices, _packed_ mode fills one GPU fully before moving to next, and _none_ selects first available device from kubelet. Default is _none_. Allocation policy does not have an effect when resource manager is enabled. | +| -allocation-policy | string | none | 3 possible values: balanced, packed, none. For shared-dev-num > 1: _balanced_ mode spreads workloads among GPU devices, _packed_ mode fills one GPU fully before moving to next, and _none_ selects first available device from kubelet. Default is _none_. | The plugin also accepts a number of other arguments (common to all plugins) related to logging. 
Please use the -h option to see the complete list of logging related options. @@ -75,9 +72,6 @@ Intel GPU-plugin supports a few different operation modes. Depending on the work |:---- |:-------- |:------- |:------- | | shared-dev-num == 1 | No, 1 container per GPU | Workloads using all GPU capacity, e.g. AI training | Yes | | shared-dev-num > 1 | Yes, >1 containers per GPU | (Batch) workloads using only part of GPU resources, e.g. inference, media transcode/analytics, or CPU bound GPU workloads | No | -| shared-dev-num > 1 && resource-management | Depends on resource requests | Any. For requirements and usage, see [fractional resource management](./fractional.md) | Yes. 1000 millicores = exclusive GPU usage. See note below. | - -> **Note**: Exclusive GPU usage with >=1000 millicores requires that also *all other GPU containers* specify (non-zero) millicores resource usage. ## Installing driver and firmware for Intel GPUs @@ -122,10 +116,6 @@ $ kubectl apply -k 'https://github.com/intel/intel-device-plugins-for-kubernetes GPU plugin can be installed with the Intel Device Plugin Operator. It allows configuring GPU plugin's parameters without kustomizing the deployment files. The general installation is described in the [install documentation](../operator/README.md#installation). For configuring the GPU Custom Resource (CR), see the [configuration options](#modes-and-configuration-options) and [operation modes](#operation-modes-for-different-workload-types). -### Install alongside with GPU Aware Scheduling (deprecated) - -GPU plugin can be installed alongside with GPU Aware Scheduling (GAS). It allows scheduling Pods which e.g. request only partial use of a GPU. The installation is described in [fractional resources](./fractional.md) page. 
- ### Verify Plugin Installation You can verify that the plugin has been installed on the expected nodes by searching for the relevant @@ -212,9 +202,9 @@ Furthermore, the deployments `securityContext` must be configured with appropria More info: https://kubernetes.io/blog/2021/11/09/non-root-containers-and-devices/ -### Labels created by GPU plugin +### Labels created for Intel GPUs via NFD -If installed with NFD and started with resource-management, plugin will export a set of labels for the node. For detailed info, see [labeling documentation](./labels.md). +When NFD's NodeFeatureRules for Intel GPUs are installed, nodes are labeled with a variety of GPU-specific labels. For detailed info, see [labeling documentation](./labels.md). ### SR-IOV use with the plugin diff --git a/cmd/gpu_plugin/device_props.go b/cmd/gpu_plugin/device_props.go index e6daf2f28..b89a02fa4 100644 --- a/cmd/gpu_plugin/device_props.go +++ b/cmd/gpu_plugin/device_props.go @@ -15,35 +15,22 @@ package main import ( - "slices" - - "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler" "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils" "k8s.io/klog/v2" ) type DeviceProperties struct { currentDriver string - drmDrivers map[string]bool - tileCounts []uint64 isPfWithVfs bool } -type invalidTileCountErr struct { - error -} - func newDeviceProperties() *DeviceProperties { - return &DeviceProperties{ - drmDrivers: make(map[string]bool), - } + return &DeviceProperties{} } func (d *DeviceProperties) fetch(cardPath string) { d.isPfWithVfs = pluginutils.IsSriovPFwithVFs(cardPath) - d.tileCounts = append(d.tileCounts, labeler.GetTileCount(cardPath)) - driverName, err := pluginutils.ReadDeviceDriver(cardPath) if err != nil { klog.Warningf("card (%s) doesn't have driver, using default: %s", cardPath, deviceTypeDefault) @@ -52,11 +39,6 @@ func (d *DeviceProperties) fetch(cardPath string) { } d.currentDriver = driverName - d.drmDrivers[d.currentDriver] = true 
-} - -func (d *DeviceProperties) drmDriverCount() int { - return len(d.drmDrivers) } func (d *DeviceProperties) driver() string { @@ -66,20 +48,3 @@ func (d *DeviceProperties) driver() string { func (d *DeviceProperties) monitorResource() string { return d.currentDriver + monitorSuffix } - -func (d *DeviceProperties) maxTileCount() (uint64, error) { - if len(d.tileCounts) == 0 { - return 0, invalidTileCountErr{} - } - - minCount := slices.Min(d.tileCounts) - maxCount := slices.Max(d.tileCounts) - - if minCount != maxCount { - klog.Warningf("Node's GPUs are heterogenous (min: %d, max: %d tiles)", minCount, maxCount) - - return 0, invalidTileCountErr{} - } - - return maxCount, nil -} diff --git a/cmd/gpu_plugin/fractional.md b/cmd/gpu_plugin/fractional.md deleted file mode 100644 index 974fdaea2..000000000 --- a/cmd/gpu_plugin/fractional.md +++ /dev/null @@ -1,66 +0,0 @@ -# GPU plugin with GPU Aware Scheduling (deprecated) - -This is a deprecated feature. In Kubernetes v1.32+, [DRA GPU resource driver](https://github.com/intel/intel-resource-drivers-for-kubernetes/blob/main/doc/gpu/README.md) can be used for fractional GPU resource allocating instead. - -Installing the GPU plugin with [GPU Aware Scheduling](https://github.com/intel/platform-aware-scheduling/tree/master/gpu-aware-scheduling) (GAS) enables containers to request partial (fractional) GPU resources. For example, a Pod's container can request GPU's millicores or memory and use only a fraction of the GPU. The remaining resources could be leveraged by another container. - -> *NOTE*: For this use case to work properly, all GPUs in a given node should provide equal amount of resources -i.e. heterogenous GPU nodes are not supported. - -> *NOTE*: Resource values are used only for scheduling workloads to nodes, not for limiting their GPU usage on the nodes. Container requesting 50% of the GPU's resources is not restricted by the kernel driver or firmware from using more than 50% of the resources. 
A container requesting 1% of the GPU could use 100% of it. - -## Install GPU Aware Scheduling - -GAS' installation is described in its [README](https://github.com/intel/platform-aware-scheduling/tree/master/gpu-aware-scheduling#usage-with-nfd-and-the-gpu-plugin). - -## Install GPU plugin with fractional resources - -### With yaml deployments - -The GPU Plugin DaemonSet needs additional RBAC-permissions and access to the kubelet podresources -gRPC service to function. All the required changes are gathered in the `fractional_resources` -overlay. Install GPU plugin by running: - -```bash -# Start NFD - if your cluster doesn't have NFD installed yet -$ kubectl apply -k 'https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd?ref=' - -# Create NodeFeatureRules for detecting GPUs on nodes -$ kubectl apply -k 'https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd/overlays/node-feature-rules?ref=' - -# Create GPU plugin daemonset -$ kubectl apply -k 'https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/gpu_plugin/overlays/fractional_resources?ref=' -``` - -> **NOTE:** The yaml deployment above does not support deployment to non-default namespace. The ClusterRoleBinding object has a hardcoded namespace and does not respect the target namespace. If you would like to deploy to a custom namespace, you will need to either modify the [yaml file](../../deployments/gpu_plugin/overlays/fractional_resources/gpu-manager-rolebinding.yaml) or deploy using the Operator. - -### With Device Plugin Operator - -Install the Device Plugin Operator according to the [install](../operator/README.md#installation) instructions. When applying the [GPU plugin Custom Resource](../../deployments/operator/samples/deviceplugin_v1_gpudeviceplugin.yaml) (CR), set `resourceManager` option to `true`. The Operator will install all the required RBAC objects and service accounts. 
- -``` -spec: - resourceManager: true -``` - -## Details about fractional resources - -Use of fractional GPU resources requires that the cluster has node extended resources with the name prefix `gpu.intel.com/`. Those are automatically created by GPU plugin with the help of the NFD. When fractional resources are enabled, the plugin lets GAS do card selection decisions based on resource availability and the amount of extended resources requested in the [pod spec](https://github.com/intel/platform-aware-scheduling/blob/master/gpu-aware-scheduling/docs/usage.md#pods). - -GAS then annotates the pod objects with unique increasing numeric timestamps in the annotation `gas-ts` and container card selections in `gas-container-cards` annotation. The latter has container separator '`|`' and card separator '`,`'. Example for a pod with two containers and both containers getting two cards: `gas-container-cards:card0,card1|card2,card3`. - -Enabling the fractional resource support in the plugin without running GAS in the cluster will only slow down GPU-deployments, so do not enable this feature unnecessarily. - -## Tile level access and Level Zero workloads - -Level Zero library supports targeting different tiles on a GPU. If the host is equipped with multi-tile GPU devices, and the container requests both `gpu.intel.com/i915` and `gpu.intel.com/tiles` resources, GPU plugin (with GAS) adds an [affinity mask](https://spec.oneapi.io/level-zero/latest/core/PROG.html#affinity-mask) to the container. By default the mask is in "FLAT" [device hierarchy](https://spec.oneapi.io/level-zero/latest/core/PROG.html#device-hierarchy) format. With the affinity mask, two Level Zero workloads can share a two tile GPU so that workloads use one tile each. - -If a multi-tile workload is intended to work in "COMPOSITE" hierarchy mode, the container spec environment should include hierarchy mode variable (ZE_FLAT_DEVICE_HIERARCHY) with "COMPOSITE" value. 
GPU plugin will then adapt the affinity mask from the default "FLAT" to "COMPOSITE" format. - -If the GPU is a single tile device, GPU plugin does not set the affinity mask. Only exposing GPU devices is enough in that case. - -### Details about tile resources - -GAS makes the GPU and tile selection based on the Pod's resource specification. The selection is passed to GPU plugin via the Pod's annotation. - -Tiles targeted for containers are specified to Pod via `gas-container-tiles` annotation where the the annotation value describes a set of card and tile combinations. For example in a two container pod, the annotation could be `gas-container-tiles:card0:gt0+gt1|card1:gt1,card2:gt0`. Similarly to `gas-container-cards`, the container details are split via `|`. In the example above, the first container gets tiles 0 and 1 from card 0, and the second container gets tile 1 from card 1 and tile 0 from card 2. diff --git a/cmd/gpu_plugin/gpu_plugin.go b/cmd/gpu_plugin/gpu_plugin.go index c3b6ed73b..3f92da730 100644 --- a/cmd/gpu_plugin/gpu_plugin.go +++ b/cmd/gpu_plugin/gpu_plugin.go @@ -33,8 +33,6 @@ import ( pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" "github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice" - "github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm" - "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler" gpulevelzero "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero" dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin" cdispec "tags.cncf.io/container-device-interface/specs-go" @@ -63,11 +61,10 @@ const ( monitorSuffix = "_monitoring" monitorID = "all" + levelzeroAffinityMaskEnvVar = "ZE_AFFINITY_MASK" + // Period of device scans. scanPeriod = 5 * time.Second - - // Labeler's max update interval, 5min. 
- labelerMaxInterval = 5 * 60 * time.Second ) type cliOptions struct { @@ -75,15 +72,10 @@ type cliOptions struct { sharedDevNum int temperatureLimit int enableMonitoring bool - resourceManagement bool wslScan bool healthManagement bool } -type rmWithMultipleDriversErr struct { - error -} - type preferredAllocationPolicyFunc func(*pluginapi.ContainerPreferredAllocationRequest) []string // nonePolicy is used for allocating GPU devices randomly, while trying @@ -283,7 +275,6 @@ type devicePlugin struct { scanDone chan bool scanResources chan bool - resMan rm.ResourceManager levelzeroService levelzeroservice.LevelzeroService sysfsDir string @@ -314,20 +305,6 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi healthStatuses: make(map[string]string), } - if options.resourceManagement { - var err error - - dp.resMan, err = rm.NewResourceManager(monitorID, - []string{ - namespace + "/" + deviceTypeI915, - namespace + "/" + deviceTypeXe, - }) - if err != nil { - klog.Errorf("Failed to create resource manager: %+v", err) - return nil - } - } - switch options.preferredAllocationPolicy { case "balanced": dp.policy = balancedPolicy @@ -418,10 +395,6 @@ func (dp *devicePlugin) healthStatusForCard(cardPath string) string { // Implement the PreferredAllocator interface. 
func (dp *devicePlugin) GetPreferredAllocation(rqt *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) { - if dp.resMan != nil { - return dp.resMan.GetPreferredFractionalAllocation(rqt) - } - response := &pluginapi.PreferredAllocationResponse{} for _, req := range rqt.ContainerRequests { @@ -488,7 +461,7 @@ func (dp *devicePlugin) wslGpuScan(notifier dpapi.Notifier) error { for _, index := range indices { envs := map[string]string{ - rm.LevelzeroAffinityMaskEnvVar: strconv.Itoa(int(index)), + levelzeroAffinityMaskEnvVar: strconv.Itoa(int(index)), } deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, envs, nil, nil) @@ -525,10 +498,6 @@ func (dp *devicePlugin) sysFsGpuScan(notifier dpapi.Notifier) error { for { devTree, err := dp.scan() if err != nil { - if errors.Is(err, rmWithMultipleDriversErr{}) { - return err - } - klog.Warning("Failed to scan: ", err) } @@ -548,7 +517,7 @@ func (dp *devicePlugin) sysFsGpuScan(notifier dpapi.Notifier) error { notifier.Notify(devTree) // Trigger resource scan if it's enabled. 
- if dp.resMan != nil && countChanged { + if countChanged { dp.scanResources <- true } @@ -689,7 +658,6 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) { monitor := make(map[string][]pluginapi.DeviceSpec, 0) devTree := dpapi.NewDeviceTree() - rmDevInfos := rm.NewDeviceInfoMap() devProps := newDeviceProperties() for _, f := range dp.filterOutInvalidCards(files) { @@ -717,8 +685,6 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) { for i := 0; i < dp.options.sharedDevNum; i++ { devID := fmt.Sprintf("%s-%d", name, i) devTree.AddDevice(devProps.driver(), devID, deviceInfo) - - rmDevInfos[devID] = rm.NewDeviceInfo(devSpecs, mounts, nil) } if dp.options.enableMonitoring { @@ -737,30 +703,10 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) { } } - if dp.resMan != nil { - if devProps.drmDriverCount() <= 1 { - dp.resMan.SetDevInfos(rmDevInfos) - - if tileCount, err := devProps.maxTileCount(); err == nil { - dp.resMan.SetTileCountPerCard(tileCount) - } - } else { - klog.Warning("Plugin with RM doesn't support multiple DRM drivers:", devProps.drmDrivers) - - err := rmWithMultipleDriversErr{} - - return nil, err - } - } - return devTree, nil } func (dp *devicePlugin) Allocate(request *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { - if dp.resMan != nil { - return dp.resMan.CreateFractionalResourceResponse(request) - } - return nil, &dpapi.UseDefaultMethodError{} } @@ -772,7 +718,6 @@ func main() { flag.StringVar(&prefix, "prefix", "", "Prefix for devfs & sysfs paths") flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable '*_monitoring' (= all GPUs) resource") - flag.BoolVar(&opts.resourceManagement, "resource-manager", false, "fractional GPU resource management") flag.BoolVar(&opts.healthManagement, "health-management", false, "enable GPU health management") flag.BoolVar(&opts.wslScan, "wsl", false, "scan for / use WSL devices") flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of 
containers sharing the same GPU device") @@ -785,11 +730,6 @@ func main() { os.Exit(1) } - if opts.sharedDevNum == 1 && opts.resourceManagement { - klog.Error("Trying to use fractional resources with shared-dev-num 1 is pointless") - os.Exit(1) - } - var str = opts.preferredAllocationPolicy if !(str == "balanced" || str == "packed" || str == "none") { klog.Error("invalid value for preferredAllocationPolicy, the valid values: balanced, packed, none") @@ -803,12 +743,6 @@ func main() { if plugin.options.wslScan { klog.Info("WSL mode requested") - if plugin.options.resourceManagement { - klog.Error("Resource management is not supported within WSL. Please disable resource management.") - - os.Exit(1) - } - if plugin.options.enableMonitoring { klog.Error("Monitoring is not supported within WSL. Please disable monitoring.") @@ -828,20 +762,6 @@ func main() { go plugin.levelzeroService.Run(true) } - if plugin.options.resourceManagement { - // Start labeler to export labels file for NFD. - nfdFeatureFile := path.Join(nfdFeatureDir, resourceFilename) - - klog.V(2).Infof("NFD feature file location: %s", nfdFeatureFile) - - // Labeler catches OS signals and calls os.Exit() after receiving any. 
- go labeler.Run(prefix+sysfsDrmDirectory, nfdFeatureFile, - labelerMaxInterval, plugin.scanResources, plugin.levelzeroService, func() { - // Exit the whole app when labeler exits - os.Exit(0) - }) - } - manager := dpapi.NewManager(namespace, plugin) manager.Run() } diff --git a/cmd/gpu_plugin/gpu_plugin_test.go b/cmd/gpu_plugin/gpu_plugin_test.go index aadf96849..b3b305314 100644 --- a/cmd/gpu_plugin/gpu_plugin_test.go +++ b/cmd/gpu_plugin/gpu_plugin_test.go @@ -28,7 +28,6 @@ import ( "k8s.io/utils/strings/slices" "github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice" - "github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm" dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin" cdispec "tags.cncf.io/container-device-interface/specs-go" ) @@ -58,23 +57,6 @@ func (n *mockNotifier) Notify(newDeviceTree dpapi.DeviceTree) { n.scanDone <- true } -type mockResourceManager struct { - tileCount uint64 -} - -func (m *mockResourceManager) CreateFractionalResourceResponse(*v1beta1.AllocateRequest) (*v1beta1.AllocateResponse, error) { - return &v1beta1.AllocateResponse{}, &dpapi.UseDefaultMethodError{} -} -func (m *mockResourceManager) SetDevInfos(rm.DeviceInfoMap) {} - -func (m *mockResourceManager) GetPreferredFractionalAllocation(*v1beta1.PreferredAllocationRequest) (*v1beta1.PreferredAllocationResponse, error) { - return &v1beta1.PreferredAllocationResponse{}, &dpapi.UseDefaultMethodError{} -} - -func (m *mockResourceManager) SetTileCountPerCard(count uint64) { - m.tileCount = count -} - type mockL0Service struct { indices []uint32 memSize uint64 @@ -203,13 +185,9 @@ func createTestFiles(root string, tc TestCaseDetails) (string, string, error) { } func TestNewDevicePlugin(t *testing.T) { - if newDevicePlugin("", "", cliOptions{sharedDevNum: 2, resourceManagement: false}) == nil { + if newDevicePlugin("", "", cliOptions{sharedDevNum: 2}) == nil { t.Error("Failed to create plugin") } - - if 
newDevicePlugin("", "", cliOptions{sharedDevNum: 2, resourceManagement: true}) != nil { - t.Error("Unexpectedly managed to create resource management enabled plugin inside unit tests") - } } func TestGetPreferredAllocation(t *testing.T) { @@ -240,7 +218,7 @@ func TestGetPreferredAllocation(t *testing.T) { }, } - plugin := newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "none"}) + plugin := newDevicePlugin("", "", cliOptions{sharedDevNum: 5, preferredAllocationPolicy: "none"}) response, _ := plugin.GetPreferredAllocation(rqt) sort.Strings(response.ContainerResponses[0].DeviceIDs) @@ -249,28 +227,28 @@ func TestGetPreferredAllocation(t *testing.T) { t.Error("Unexpected return value for none preferred allocation", response.ContainerResponses[0].DeviceIDs) } - plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "balanced"}) + plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, preferredAllocationPolicy: "balanced"}) response, _ = plugin.GetPreferredAllocation(rqt) if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-0", "card1-0", "card2-0", "card3-0"}) { t.Error("Unexpected return value for balanced preferred allocation", response.ContainerResponses[0].DeviceIDs) } - plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "packed"}) + plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, preferredAllocationPolicy: "packed"}) response, _ = plugin.GetPreferredAllocation(rqt) if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-0", "card0-1", "card0-2", "card0-3"}) { t.Error("Unexpected return value for packed preferred allocation", response.ContainerResponses[0].DeviceIDs) } - plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "none"}) + plugin = 
newDevicePlugin("", "", cliOptions{sharedDevNum: 5, preferredAllocationPolicy: "none"}) response, _ = plugin.GetPreferredAllocation(rqtErr) if response != nil { t.Error("Fail to handle the input error that req.AllocationSize is greater than len(req.AvailableDeviceIDs).") } - plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "none"}) + plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, preferredAllocationPolicy: "none"}) response, _ = plugin.GetPreferredAllocation(rqtNotEnough) sort.Strings(response.ContainerResponses[0].DeviceIDs) @@ -282,20 +260,12 @@ func TestGetPreferredAllocation(t *testing.T) { } func TestAllocate(t *testing.T) { - plugin := newDevicePlugin("", "", cliOptions{sharedDevNum: 2, resourceManagement: false}) + plugin := newDevicePlugin("", "", cliOptions{sharedDevNum: 2}) _, err := plugin.Allocate(&v1beta1.AllocateRequest{}) if _, ok := err.(*dpapi.UseDefaultMethodError); !ok { t.Errorf("Unexpected return value: %+v", err) } - - // mock the rm - plugin.resMan = &mockResourceManager{} - - _, err = plugin.Allocate(&v1beta1.AllocateRequest{}) - if _, ok := err.(*dpapi.UseDefaultMethodError); !ok { - t.Errorf("Unexpected return value: %+v", err) - } } func TestScan(t *testing.T) { @@ -689,137 +659,6 @@ func TestScanWsl(t *testing.T) { } } -func TestScanFails(t *testing.T) { - tc := TestCaseDetails{ - name: "xe and i915 devices with rm will fail", - sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64", "card1/device/drm/card1"}, - sysfsfiles: map[string][]byte{ - "card0/device/vendor": []byte("0x8086"), - "card1/device/vendor": []byte("0x8086"), - }, - symlinkfiles: map[string]string{ - "card0/device/driver": "drivers/xe", - "card1/device/driver": "drivers/i915", - }, - devfsdirs: []string{ - "card0", - "card1", - }, - } - - t.Run(tc.name, func(t *testing.T) { - root, err := os.MkdirTemp("", "test_new_device_plugin") - if err != nil { - t.Fatalf("Can't 
create temporary directory: %+v", err) - } - // dirs/files need to be removed for the next test - defer os.RemoveAll(root) - - sysfs, devfs, err := createTestFiles(root, tc) - if err != nil { - t.Errorf("Unexpected error: %+v", err) - } - - plugin := newDevicePlugin(sysfs, devfs, tc.options) - - plugin.resMan = &mockResourceManager{} - - notifier := &mockNotifier{ - scanDone: plugin.scanDone, - } - - err = plugin.Scan(notifier) - if err == nil { - t.Error("Unexpected nil error") - } - }) -} - -func TestScanWithRmAndTiles(t *testing.T) { - tcs := []TestCaseDetails{ - { - name: "two tile xe devices with rm enabled - homogeneous", - sysfsdirs: []string{ - "card0/device/drm/card0", - "card1/device/drm/card1", - "card0/device/tile0/gt0", - "card0/device/tile1/gt1", - "card1/device/tile0/gt0", - "card1/device/tile1/gt1", - }, - sysfsfiles: map[string][]byte{ - "card0/device/vendor": []byte("0x8086"), - "card1/device/vendor": []byte("0x8086"), - }, - symlinkfiles: map[string]string{ - "card0/device/driver": "drivers/xe", - "card1/device/driver": "drivers/xe", - }, - devfsdirs: []string{ - "card0", - "card1", - }, - }, - { - name: "2 & 1 tile xe devices with rm enabled - heterogeneous", - sysfsdirs: []string{ - "card0/device/drm/card0", - "card1/device/drm/card1", - "card0/device/tile0/gt0", - "card0/device/tile1/gt1", - "card1/device/tile0/gt0", - }, - sysfsfiles: map[string][]byte{ - "card0/device/vendor": []byte("0x8086"), - "card1/device/vendor": []byte("0x8086"), - }, - symlinkfiles: map[string]string{ - "card0/device/driver": "drivers/xe", - "card1/device/driver": "drivers/xe", - }, - devfsdirs: []string{ - "card0", - "card1", - }, - }, - } - - expectedTileCounts := []uint64{2, 0} - - for i, tc := range tcs { - t.Run(tc.name, func(t *testing.T) { - root, err := os.MkdirTemp("", "test_new_device_plugin") - if err != nil { - t.Fatalf("Can't create temporary directory: %+v", err) - } - // dirs/files need to be removed for the next test - defer os.RemoveAll(root) - - 
sysfs, devfs, err := createTestFiles(root, tc) - if err != nil { - t.Errorf("Unexpected error: %+v", err) - } - - plugin := newDevicePlugin(sysfs, devfs, tc.options) - - rm := &mockResourceManager{} - plugin.resMan = rm - - notifier := &mockNotifier{ - scanDone: plugin.scanDone, - } - - err = plugin.Scan(notifier) - if err != nil { - t.Error("Unexpected error") - } - if rm.tileCount != expectedTileCounts[i] { - t.Error("Unexpected tilecount for RM") - } - }) - } -} - // Would be nice to combine these with the overall Scan unit tests. func createBypathTestFiles(t *testing.T, card, root, linkFile string, bypathFiles []string) (string, string) { drmPath := path.Join(root, "sys/class/drm/", card) diff --git a/cmd/gpu_plugin/labels.md b/cmd/gpu_plugin/labels.md index 3d3abbe94..7996040ba 100644 --- a/cmd/gpu_plugin/labels.md +++ b/cmd/gpu_plugin/labels.md @@ -1,8 +1,4 @@ -# Labels - -GPU labels originate from two main sources: NFD rules and GPU plugin (& NFD hook). - -## NFD rules +# Labels from NFD rules NFD rule is a method to instruct NFD to add certain label(s) to node based on the devices detected on it. There is a generic rule to identify all Intel GPUs. It will add labels for each PCI device type. For example, a Tigerlake iGPU (PCI Id 0x9a49) will show up as: @@ -29,59 +25,3 @@ Current covered platforms/devices are: Flex 140, Flex 170, Max 1100 and Max 1550 To identify other GPUs, see the graphics processor table [here](https://dgpu-docs.intel.com/devices/hardware-table.html#graphics-processor-table). -## GPU Plugin and NFD hook - -In GPU plugin, these labels are only applied when [Resource Management](README.md#fractional-resources-details) is enabled. With the NFD hook, labels are created regardless of how GPU plugin is configured. - -Numeric labels are converted into extended resources for the node (with NFD) and other labels are used directly by [GPU Aware Scheduling (GAS)](https://github.com/intel/platform-aware-scheduling/tree/master/gpu-aware-scheduling). 
Extended resources should only be used with GAS as Kubernetes scheduler doesn't properly handle resource allocations with multiple GPUs. - -### Default labels - -Following labels are created by default. - -name | type | description| ------|------|------| -|`gpu.intel.com/millicores`| number | node GPU count * 1000. -|`gpu.intel.com/memory.max`| number | sum of detected [GPU memory amounts](#gpu-memory) in bytes OR environment variable value * GPU count -|`gpu.intel.com/cards`| string | list of card names separated by '`.`'. The names match host `card*`-folders under `/sys/class/drm/`. Deprecated, use `gpu-numbers`. -|`gpu.intel.com/gpu-numbers`| string | list of numbers separated by '`.`'. The numbers correspond to device file numbers for the primary nodes of given GPUs in kernel DRI subsystem, listed as `/dev/dri/card` in devfs, and `/sys/class/drm/card` in sysfs. -|`gpu.intel.com/tiles`| number | sum of all detected GPU tiles in the system. -|`gpu.intel.com/numa-gpu-map`| string | list of numa node to gpu mappings. - -If the value of the `gpu-numbers` label would not fit into the 63 character length limit, you will also get labels `gpu-numbers2`, -`gpu-numbers3`... until all the gpu numbers have been labeled. - -The tile count `gpu.intel.com/tiles` describes the total amount of tiles on the system. System is expected to be homogeneous, and thus the number of tiles per GPU can be calculated by dividing the tile count with GPU count. - -The `numa-gpu-map` label is a list of numa to gpu mapping items separated by `_`. Each list item has a numa node id combined with a list of gpu indices. e.g. 0-1.2.3 would mean: numa node 0 has gpus 1, 2 and 3. More complex example would be: 0-0.1_1-3.4 where numa node 0 would have gpus 0 and 1, and numa node 1 would have gpus 3 and 4. As with `gpu-numbers`, this label will be extended to multiple labels if the length of the value exceeds the max label length. 
- -### PCI-groups (optional) - -GPUs which share the same PCI paths under `/sys/devices/pci*` can be grouped into a label. GPU nums are separated by '`.`' and -groups are separated by '`_`'. The label is created only if environment variable named `GPU_PCI_GROUPING_LEVEL` has a value greater -than zero. GPUs are considered to belong to the same group, if as many identical folder names are found for the GPUs, as is the value -of the environment variable. Counting starts from the folder name which starts with `pci`. - -For example, the SG1 card has 4 GPUs, which end up sharing pci-folder names under `/sys/devices`. With a `GPU_PCI_GROUPING_LEVEL` -of 3, a node with two such SG1 cards could produce a `pci-groups` label with a value of `0.1.2.3_4.5.6.7`. - -name | type | description| ------|------|------| -|`gpu.intel.com/pci-groups`| string | list of pci-groups separated by '`_`'. GPU numbers in the groups are separated by '`.`'. The numbers correspond to device file numbers for the primary nodes of given GPUs in kernel DRI subsystem, listed as `/dev/dri/card` in devfs, and `/sys/class/drm/card` in sysfs. - -If the value of the `pci-groups` label would not fit into the 63 character length limit, you will also get labels `pci-groups2`, -`pci-groups3`... until all the PCI groups have been labeled. - -### Limitations - -For the above to work as intended, GPUs on the same node must be identical in their capabilities. - -### GPU memory - -GPU memory amount is read from sysfs `gt/gt*` files and turned into a label. -There are two supported environment variables named `GPU_MEMORY_OVERRIDE` and -`GPU_MEMORY_RESERVED`. Both are supposed to hold numeric byte amounts. For systems with -older kernel drivers or GPUs which do not support reading the GPU memory -amount, the `GPU_MEMORY_OVERRIDE` environment variable value is turned into a GPU -memory amount label instead of a read value. `GPU_MEMORY_RESERVED` value will be -scoped out from the GPU memory amount found from sysfs. 
diff --git a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go b/cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go deleted file mode 100644 index 6eae4c7a6..000000000 --- a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go +++ /dev/null @@ -1,985 +0,0 @@ -// Copyright 2021-2023 Intel Corporation. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package rm - -import ( - "context" - "crypto/rand" - "crypto/tls" - "crypto/x509" - "encoding/json" - "io" - "math/big" - "net" - "net/http" - "os" - "sort" - "strconv" - "strings" - "sync" - "time" - - dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin" - "github.com/pkg/errors" - "google.golang.org/grpc" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/fields" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" - "k8s.io/klog/v2" - pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" - podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1" - "k8s.io/kubernetes/pkg/kubelet/apis/podresources" - sslices "k8s.io/utils/strings/slices" -) - -const ( - gasTSAnnotation = "gas-ts" - gasCardAnnotation = "gas-container-cards" - gasTileAnnotation = "gas-container-tiles" - - LevelzeroAffinityMaskEnvVar = "ZE_AFFINITY_MASK" - levelzeroHierarchyEnvVar = "ZE_FLAT_DEVICE_HIERARCHY" - - hierarchyModeComposite = "COMPOSITE" - hierarchyModeFlat = "FLAT" - hierarchyModeCombined = "COMBINED" - - grpcAddress = 
"unix:///var/lib/kubelet/pod-resources/kubelet.sock" - grpcBufferSize = 4 * 1024 * 1024 - grpcTimeout = 5 * time.Second - - kubeletAPITimeout = 5 * time.Second - kubeletAPIMaxRetries = 5 - kubeletHTTPSCertPath = "/var/lib/kubelet/pki/kubelet.crt" - // This is detected incorrectly as credentials - //nolint:gosec - serviceAccountTokenPath = "/var/run/secrets/kubernetes.io/serviceaccount/token" -) - -// Errors. -type retryErr struct{} -type zeroPendingErr struct{} - -func (e *retryErr) Error() string { - return "things didn't work out, but perhaps a retry will help" -} -func (e *zeroPendingErr) Error() string { - return "there are no pending pods anymore in this node" -} - -type podCandidate struct { - pod *v1.Pod - name string - allocatedContainerCount int - allocationTargetNum int -} - -// DeviceInfo is a subset of deviceplugin.DeviceInfo -// It's a lighter version of the full DeviceInfo as it is used -// to store fractional devices. -type DeviceInfo struct { - envs map[string]string - nodes []pluginapi.DeviceSpec - mounts []pluginapi.Mount -} - -type getClientFunc func(string, time.Duration, int) (podresourcesv1.PodResourcesListerClient, *grpc.ClientConn, error) - -// ResourceManager interface for the fractional resource handling. 
-type ResourceManager interface { - CreateFractionalResourceResponse(*pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) - GetPreferredFractionalAllocation(*pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) - SetDevInfos(DeviceInfoMap) - SetTileCountPerCard(count uint64) -} - -type containerAssignments struct { - deviceIds map[string]bool - tileEnv string -} - -type podAssignmentDetails struct { - containers []containerAssignments -} - -type resourceManager struct { - clientset kubernetes.Interface - deviceInfos DeviceInfoMap - prGetClientFunc getClientFunc - assignments map[string]podAssignmentDetails // pod name -> assignment details - nodeName string - hostIP string - skipID string - fullResourceNames []string - retryTimeout time.Duration - cleanupInterval time.Duration - mutex sync.RWMutex // for devTree updates during scan - cleanupMutex sync.RWMutex // for assignment details during cleanup - useKubelet bool - tileCountPerCard uint64 -} - -// NewDeviceInfo creates a new DeviceInfo. -func NewDeviceInfo(nodes []pluginapi.DeviceSpec, mounts []pluginapi.Mount, envs map[string]string) *DeviceInfo { - return &DeviceInfo{ - nodes: nodes, - mounts: mounts, - envs: envs, - } -} - -// DeviceInfoMap is a map of device infos. deviceId -> *DeviceInfo. -type DeviceInfoMap map[string]*DeviceInfo - -// NewDeviceInfoMap creates a new DeviceInfoMap. -func NewDeviceInfoMap() DeviceInfoMap { - return DeviceInfoMap{} -} - -// NewResourceManager creates a new resource manager. 
-func NewResourceManager(skipID string, fullResourceNames []string) (ResourceManager, error) { - clientset, err := getClientset() - - if err != nil { - return nil, errors.Wrap(err, "couldn't get clientset") - } - - rm := resourceManager{ - nodeName: os.Getenv("NODE_NAME"), - hostIP: os.Getenv("HOST_IP"), - clientset: clientset, - skipID: skipID, - fullResourceNames: fullResourceNames, - prGetClientFunc: podresources.GetV1Client, - assignments: make(map[string]podAssignmentDetails), - retryTimeout: 1 * time.Second, - cleanupInterval: 20 * time.Minute, - useKubelet: true, - } - - klog.Info("GPU device plugin resource manager enabled") - - // Try listing Pods once to detect if Kubelet API works - _, err = rm.listPodsFromKubelet() - - if err != nil { - klog.V(2).Info("Not using Kubelet API") - - rm.useKubelet = false - } else { - klog.V(2).Info("Using Kubelet API") - } - - go func() { - getRandDuration := func() time.Duration { - cleanupIntervalSeconds := int(rm.cleanupInterval.Seconds()) - - n, _ := rand.Int(rand.Reader, big.NewInt(int64(cleanupIntervalSeconds))) - - return rm.cleanupInterval/2 + time.Duration(n.Int64())*time.Second - } - - ticker := time.NewTicker(getRandDuration()) - - for range ticker.C { - klog.V(4).Info("Running cleanup") - - ticker.Reset(getRandDuration()) - - // Gather both running and pending pods. It might happen that - // cleanup is triggered between GetPreferredAllocation and Allocate - // and it would remove the assignment data for the soon-to-be allocated pod - running := rm.listPodsOnNodeWithStates([]string{string(v1.PodRunning), string(v1.PodPending)}) - - func() { - rm.cleanupMutex.Lock() - defer rm.cleanupMutex.Unlock() - - for podName := range rm.assignments { - if _, found := running[podName]; !found { - klog.V(4).Info("Removing from assignments: ", podName) - delete(rm.assignments, podName) - } - } - }() - - klog.V(4).Info("Cleanup done") - } - }() - - return &rm, nil -} - -// Generate a unique key for Pod. 
-func getPodKey(pod *v1.Pod) string { - return pod.Namespace + "&" + pod.Name -} - -// Generate a unique key for PodResources. -func getPodResourceKey(res *podresourcesv1.PodResources) string { - return res.Namespace + "&" + res.Name -} - -func (rm *resourceManager) listPodsFromAPIServer() (*v1.PodList, error) { - selector, err := fields.ParseSelector("spec.nodeName=" + rm.nodeName) - - if err != nil { - return &v1.PodList{}, err - } - - klog.V(4).Info("Requesting pods from API server") - - podList, err := rm.clientset.CoreV1().Pods(v1.NamespaceAll).List(context.Background(), metav1.ListOptions{ - FieldSelector: selector.String(), - }) - - if err != nil { - klog.Error("pod listing failed:", err) - - if err != nil { - return &v1.PodList{}, err - } - } - - return podList, nil -} - -// +kubebuilder:rbac:groups="",resources=nodes/proxy,verbs=list;get - -func (rm *resourceManager) listPodsFromKubelet() (*v1.PodList, error) { - var podList v1.PodList - - token, err := os.ReadFile(serviceAccountTokenPath) - if err != nil { - klog.Warning("Failed to read token for kubelet API access: ", err) - - return &podList, err - } - - kubeletCert, err := os.ReadFile(kubeletHTTPSCertPath) - if err != nil { - klog.Warning("Failed to read kubelet cert: ", err) - - return &podList, err - } - - certPool := x509.NewCertPool() - certPool.AppendCertsFromPEM(kubeletCert) - - // There isn't an official documentation for the kubelet API. 
There is a blog post: - // https://www.deepnetwork.com/blog/2020/01/13/kubelet-api.html - // And a tool to work with the API: - // https://github.com/cyberark/kubeletctl - - kubeletURL := "https://" + rm.hostIP + ":10250/pods" - - req, err := http.NewRequestWithContext(context.Background(), "GET", kubeletURL, nil) - if err != nil { - klog.Warning("Failure creating new request: ", err) - - return &podList, err - } - - req.Header.Set("Authorization", "Bearer "+string(token)) - - tr := &http.Transport{ - TLSClientConfig: &tls.Config{ - MinVersion: tls.VersionTLS12, - RootCAs: certPool, - ServerName: rm.nodeName, - }, - } - client := &http.Client{ - Timeout: kubeletAPITimeout, - Transport: tr, - } - - klog.V(4).Infof("Requesting pods from kubelet (%s)", kubeletURL) - - resp, err := (*client).Do(req) - if err != nil { - klog.Warning("Failed to read pods from kubelet API: ", err) - - return &podList, err - } - - body, err := io.ReadAll(resp.Body) - if err != nil { - klog.Warning("Failed to read http response body: ", err) - - return &podList, err - } - - resp.Body.Close() - - err = json.Unmarshal(body, &podList) - if err != nil { - klog.Warning("Failed to unmarshal PodList from response: ", err) - - return &podList, err - } - - return &podList, nil -} - -func (rm *resourceManager) listPods() (*v1.PodList, error) { - // Try to use kubelet API as long as it provides listings within retries - if rm.useKubelet { - var neterr net.Error - - for i := 0; i < kubeletAPIMaxRetries; i++ { - if podList, err := rm.listPodsFromKubelet(); err == nil { - return podList, nil - } else if errors.As(err, &neterr) && neterr.Timeout() { - continue - } - - // If error is non-timeout, break to stop using kubelet API - break - } - - klog.Warning("Stopping Kubelet API use due to error/timeout") - - rm.useKubelet = false - } - - return rm.listPodsFromAPIServer() -} - -func (rm *resourceManager) listPodsOnNodeWithStates(states []string) map[string]*v1.Pod { - pods := make(map[string]*v1.Pod) - - 
podList, err := rm.listPods() - if err != nil { - klog.Error("pod listing failed:", err) - - return pods - } - - for i := range podList.Items { - phase := string(podList.Items[i].Status.Phase) - if sslices.Contains(states, phase) { - key := getPodKey(&podList.Items[i]) - pods[key] = &podList.Items[i] - } - } - - return pods -} - -// CreateFractionalResourceResponse returns allocate response with the details -// assigned in GetPreferredFractionalAllocation -// This intentionally only logs errors and returns with the UseDefaultMethodError, -// in case any errors are hit. This is to avoid clusters filling up with unexpected admission errors. -func (rm *resourceManager) CreateFractionalResourceResponse(request *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { - if !isAllocateRequestOk(request, rm.skipID) { - // it is better to leave allocated gpu devices as is and return - return nil, &dpapi.UseDefaultMethodError{} - } - - klog.V(4).Info("Proposed device ids: ", request.ContainerRequests[0].DevicesIDs) - - podCandidate, err := rm.findAllocationPodCandidate() - if errors.Is(err, &retryErr{}) { - klog.Warning("retrying POD resolving after sleeping") - time.Sleep(rm.retryTimeout) - - podCandidate, err = rm.findAllocationPodCandidate() - } - - if err != nil { - if !errors.Is(err, &zeroPendingErr{}) { - klog.Error("allocation candidate not found, perhaps the GPU scheduler extender is not called, err:", err) - } - // it is better to leave allocated gpu devices as is and return - return nil, &dpapi.UseDefaultMethodError{} - } - - pod := podCandidate.pod - - rm.cleanupMutex.Lock() - - assignment, found := rm.assignments[getPodKey(pod)] - if !found { - rm.cleanupMutex.Unlock() - klog.Error("couldn't find allocation info from assignments:", getPodKey(pod)) - - return nil, &dpapi.UseDefaultMethodError{} - } - - containerIndex := podCandidate.allocatedContainerCount - - affinityMask := assignment.containers[containerIndex].tileEnv - getPrefDevices := 
assignment.containers[containerIndex].deviceIds - - rm.cleanupMutex.Unlock() - - devIds := request.ContainerRequests[0].DevicesIDs - - // Check if all the preferred devices were also used - if len(devIds) != len(getPrefDevices) { - klog.Warningf("Allocate called with odd number of device IDs: %d vs %d", len(devIds), len(getPrefDevices)) - } - - for _, devID := range devIds { - if _, found := getPrefDevices[devID]; !found { - klog.Warningf("Not preferred device used in Allocate: %s (%v)", devID, getPrefDevices) - } - } - - klog.V(4).Info("Allocate affinity mask: ", affinityMask) - klog.V(4).Info("Allocate device ids: ", devIds) - - return rm.createAllocateResponse(devIds, affinityMask) -} - -func (rm *resourceManager) GetPreferredFractionalAllocation(request *pluginapi.PreferredAllocationRequest) ( - *pluginapi.PreferredAllocationResponse, error) { - if !isPreferredAllocationRequestOk(request, rm.skipID) { - // it is better to leave allocated gpu devices as is and return - return &pluginapi.PreferredAllocationResponse{}, nil - } - - klog.V(4).Info("GetPreferredAllocation request: ", request) - - podCandidate, err := rm.findAllocationPodCandidate() - if errors.Is(err, &retryErr{}) { - klog.Warning("retrying POD resolving after sleeping") - time.Sleep(rm.retryTimeout) - - podCandidate, err = rm.findAllocationPodCandidate() - } - - if err != nil { - if !errors.Is(err, &zeroPendingErr{}) { - klog.Error("allocation candidate not found, perhaps the GPU scheduler extender is not called, err:", err) - } - - // Return empty response as returning an error causes - // the pod to be labeled as UnexpectedAdmissionError - return &pluginapi.PreferredAllocationResponse{}, nil - } - - pod := podCandidate.pod - containerIndex := podCandidate.allocatedContainerCount - cards := containerCards(pod, containerIndex) - affinityMask := containerTileAffinityMask(pod, containerIndex, int(rm.tileCountPerCard)) - podKey := getPodKey(pod) - - creq := request.ContainerRequests[0] - - 
klog.V(4).Info("Get preferred fractional allocation: ", - podKey, creq.AllocationSize, creq.MustIncludeDeviceIDs, creq.AvailableDeviceIDs) - - deviceIds := selectDeviceIDsForContainer( - int(creq.AllocationSize), cards, creq.AvailableDeviceIDs, creq.MustIncludeDeviceIDs) - - // Map container assignment details per pod name - - rm.cleanupMutex.Lock() - - assignments, found := rm.assignments[podKey] - - if !found { - assignments.containers = make([]containerAssignments, podCandidate.allocationTargetNum) - } - - assignments.containers[containerIndex].tileEnv = affinityMask - // Store device ids so we can double check the ones in Allocate - assignments.containers[containerIndex].deviceIds = make(map[string]bool) - for _, devID := range deviceIds { - assignments.containers[containerIndex].deviceIds[devID] = true - } - - rm.assignments[podKey] = assignments - - rm.cleanupMutex.Unlock() - - klog.V(4).Info("Selected devices for container: ", deviceIds) - - response := pluginapi.PreferredAllocationResponse{ - ContainerResponses: []*pluginapi.ContainerPreferredAllocationResponse{ - {DeviceIDs: deviceIds}, - }, - } - - return &response, nil -} - -// selectDeviceIDsForContainer selects suitable device ids from deviceIds and mustHaveDeviceIds -// the selection is guided by the cards list. 
-func selectDeviceIDsForContainer(requestedCount int, cards, deviceIds, mustHaveDeviceIds []string) []string { - getBaseCard := func(deviceId string) string { - return strings.Split(deviceId, "-")[0] - } - - if requestedCount < len(cards) { - klog.Warningf("Requested count is less than card count: %d vs %d.", requestedCount, len(cards)) - cards = cards[0:requestedCount] - } - - if requestedCount > len(cards) { - klog.Warningf("Requested count is higher than card count: %d vs %d.", requestedCount, len(cards)) - } - - // map of cardX -> device id list - available := map[string][]string{} - // Keep the last used index so we can pick the next one - availableIndex := map[string]int{} - - // Place must have IDs first so they get used - for _, devID := range mustHaveDeviceIds { - baseCard := getBaseCard(devID) - available[baseCard] = append(available[baseCard], devID) - } - - for _, devID := range deviceIds { - baseCard := getBaseCard(devID) - available[baseCard] = append(available[baseCard], devID) - } - - selected := []string{} - - for _, card := range cards { - indexNow := availableIndex[card] - - availableDevices, found := available[card] - if !found { - klog.Warningf("card %s is not found from known devices: %v", card, available) - continue - } - - if indexNow < len(availableDevices) { - selected = append(selected, availableDevices[indexNow]) - indexNow++ - availableIndex[card] = indexNow - } - } - - return selected -} - -func isAllocateRequestOk(rqt *pluginapi.AllocateRequest, skipID string) bool { - // so far kubelet calls allocate for each container separately. If that changes, we need to refine our logic. 
- if len(rqt.ContainerRequests) != 1 { - klog.Warning("multi-container allocation request not supported") - return false - } - - crqt := rqt.ContainerRequests[0] - for _, id := range crqt.DevicesIDs { - if id == skipID { - return false // intentionally not printing anything, this request is skipped - } - } - - return true -} - -func isPreferredAllocationRequestOk(rqt *pluginapi.PreferredAllocationRequest, skipID string) bool { - // so far kubelet calls allocate for each container separately. If that changes, we need to refine our logic. - if len(rqt.ContainerRequests) != 1 { - klog.Warning("multi-container allocation request not supported") - return false - } - - crqt := rqt.ContainerRequests[0] - for _, id := range crqt.AvailableDeviceIDs { - if id == skipID { - return false // intentionally not printing anything, this request is skipped - } - } - - return true -} - -// findAllocationPodCandidate tries to find the best allocation candidate pod, which must be: -// -// -pending for this node -// -using GPU resources in its spec -// -is found via grpc service with unallocated GPU devices -// -// returns: -// -// -the candidate pod struct pointer and no error, or -// -errRetry if unsuccessful, but there is perhaps hope of trying again with better luck -// -errZeroPending if no pending pods exist anymore (which is fine) -// -any grpc communication errors -func (rm *resourceManager) findAllocationPodCandidate() (*podCandidate, error) { - // get map of pending pods for this node - pendingPods, err := rm.getNodePendingGPUPods() - if err != nil { - return nil, err - } - - candidates, err := rm.findAllocationPodCandidates(pendingPods) - if err != nil { - return nil, err - } - - numCandidates := len(candidates) - switch numCandidates { - case 0: - // fine, this typically happens when deployment is deleted before PODs start - klog.V(4).Info("zero pending pods") - return nil, &zeroPendingErr{} - case 1: - // perfect, only one option - klog.V(4).Info("only one pending pod") - - 
if _, ok := candidates[0].pod.Annotations[gasCardAnnotation]; !ok { - klog.Warningf("Pending POD annotations from scheduler not yet visible for pod %q", candidates[0].pod.Name) - return nil, &retryErr{} - } - - return &candidates[0], nil - - default: // > 1 candidates, not good, need to pick the best - // look for scheduler timestamps and sort by them - klog.V(4).Infof("%v pods pending, picking oldest", numCandidates) - - timestampedCandidates := []podCandidate{} - - for _, candidate := range candidates { - if _, ok := pendingPods[candidate.name].Annotations[gasTSAnnotation]; ok { - timestampedCandidates = append(timestampedCandidates, candidate) - } - } - - // .name here refers to a namespace+name combination - sort.Slice(timestampedCandidates, - func(i, j int) bool { - return pendingPods[timestampedCandidates[i].name].Annotations[gasTSAnnotation] < - pendingPods[timestampedCandidates[j].name].Annotations[gasTSAnnotation] - }) - - if len(timestampedCandidates) == 0 { - klog.Warning("Pending POD annotations from scheduler not yet visible") - return nil, &retryErr{} - } - - return &timestampedCandidates[0], nil - } -} - -// +kubebuilder:rbac:groups="",resources=pods,verbs=list - -// getNodePendingGPUPods returns a map of pod names -> pods that are pending and use the gpu. -func (rm *resourceManager) getNodePendingGPUPods() (map[string]*v1.Pod, error) { - pendingPods := rm.listPodsOnNodeWithStates([]string{string(v1.PodPending)}) - - for podName, pod := range pendingPods { - if numGPUUsingContainers(pod, rm.fullResourceNames) == 0 { - delete(pendingPods, podName) - } - } - - return pendingPods, nil -} - -// findAllocationPodCandidates returns a slice of all potential allocation candidate pods. -// This goes through the PODs listed in the podresources grpc service and finds those among pending -// pods which don't have all GPU devices allocated.
-func (rm *resourceManager) findAllocationPodCandidates(pendingPods map[string]*v1.Pod) ([]podCandidate, error) { - resListerClient, clientConn, err := rm.prGetClientFunc(grpcAddress, grpcTimeout, grpcBufferSize) - if err != nil { - return nil, errors.Wrap(err, "Could not get a grpc client for reading plugin resources") - } - - defer clientConn.Close() - - ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) - defer cancel() - - resp, err := resListerClient.List(ctx, &podresourcesv1.ListPodResourcesRequest{}) - if err != nil { - return nil, errors.Wrap(err, "Could not read plugin resources via grpc") - } - - candidates := []podCandidate{} - - for _, podRes := range resp.PodResources { - // count allocated gpu-using containers - numContainersAllocated := 0 - - for _, cont := range podRes.Containers { - for _, dev := range cont.Devices { - if sslices.Contains(rm.fullResourceNames, dev.ResourceName) { - numContainersAllocated++ - break - } - } - } - - key := getPodResourceKey(podRes) - - if pod, pending := pendingPods[key]; pending { - allocationTargetNum := numGPUUsingContainers(pod, rm.fullResourceNames) - if numContainersAllocated < allocationTargetNum { - candidate := podCandidate{ - pod: pod, - name: key, - allocatedContainerCount: numContainersAllocated, - allocationTargetNum: allocationTargetNum, - } - candidates = append(candidates, candidate) - } - } - } - - return candidates, nil -} - -func (rm *resourceManager) SetDevInfos(deviceInfos DeviceInfoMap) { - rm.mutex.Lock() - defer rm.mutex.Unlock() - rm.deviceInfos = deviceInfos -} - -func (rm *resourceManager) SetTileCountPerCard(count uint64) { - rm.mutex.Lock() - defer rm.mutex.Unlock() - rm.tileCountPerCard = count -} - -func (rm *resourceManager) createAllocateResponse(deviceIds []string, tileAffinityMask string) (*pluginapi.AllocateResponse, error) { - rm.mutex.Lock() - defer rm.mutex.Unlock() - - allocateResponse := pluginapi.AllocateResponse{} - cresp := 
pluginapi.ContainerAllocateResponse{} - - for _, devID := range deviceIds { - dev, ok := rm.deviceInfos[devID] - if !ok { - klog.Warningf("No device info for %q, using default allocation method devices", devID) - return nil, &dpapi.UseDefaultMethodError{} - } - - // add new devices - nodes := dev.nodes - for i := range nodes { - cresp.Devices = append(cresp.Devices, &nodes[i]) - } - - // add new mounts - mounts := dev.mounts - for i := range mounts { - cresp.Mounts = append(cresp.Mounts, &mounts[i]) - } - - for key, value := range dev.envs { - if cresp.Envs == nil { - cresp.Envs = make(map[string]string) - } - - cresp.Envs[key] = value - } - } - - if tileAffinityMask != "" { - if cresp.Envs == nil { - cresp.Envs = make(map[string]string) - } - - cresp.Envs[LevelzeroAffinityMaskEnvVar] = tileAffinityMask - } - - allocateResponse.ContainerResponses = append(allocateResponse.ContainerResponses, &cresp) - - return &allocateResponse, nil -} - -func numGPUUsingContainers(pod *v1.Pod, fullResourceNames []string) int { - num := 0 - - for _, container := range pod.Spec.Containers { - for reqName, quantity := range container.Resources.Requests { - resourceName := reqName.String() - if sslices.Contains(fullResourceNames, resourceName) { - value, _ := quantity.AsInt64() - if value > 0 { - num++ - break - } - } - } - } - - return num -} - -// containerCards returns the cards to use for a single container. -// gpuUsingContainerIndex 0 == first gpu-using container in the pod. 
-func containerCards(pod *v1.Pod, gpuUsingContainerIndex int) []string { - fullAnnotation := pod.Annotations[gasCardAnnotation] - cardLists := strings.Split(fullAnnotation, "|") - klog.V(3).Infof("%s:%v", fullAnnotation, cardLists) - - i := 0 - - for _, cardList := range cardLists { - cards := strings.Split(cardList, ",") - if len(cards) > 0 && len(cardList) > 0 { - if gpuUsingContainerIndex == i { - klog.V(3).Infof("Cards for container nr %v in pod %v are %v", gpuUsingContainerIndex, getPodKey(pod), cards) - return cards - } - - i++ - } - } - - klog.Warningf("couldn't find cards for gpu using container index %v", gpuUsingContainerIndex) - - return nil -} - -// Guesses level zero hierarchy mode for the container. Defaults to the new "flat" mode -// if no mode is set in the container's env variables. -func guessLevelzeroHierarchyMode(pod *v1.Pod, containerIndex int) string { - klog.V(4).Infof("Checking pod %s envs", pod.Name) - - if containerIndex < len(pod.Spec.Containers) { - c := pod.Spec.Containers[containerIndex] - - if c.Env != nil { - for _, env := range c.Env { - if env.Name == levelzeroHierarchyEnvVar { - switch env.Value { - // Check that the value is valid. 
- case hierarchyModeComposite: - fallthrough - case hierarchyModeFlat: - fallthrough - case hierarchyModeCombined: - klog.V(4).Infof("Returning %s hierarchy", env.Value) - return env.Value - } - - break - } - } - } - } - - klog.V(4).Infof("Returning default %s hierarchy", hierarchyModeFlat) - - return hierarchyModeFlat -} - -func convertTileInfoToEnvMask(tileInfo string, tilesPerCard int, hierarchyMode string) string { - cards := strings.Split(tileInfo, ",") - - tileIndices := make([]string, len(cards)) - - for i, cardTileCombos := range cards { - cardTileSplit := strings.Split(cardTileCombos, ":") - if len(cardTileSplit) != 2 { - klog.Warningf("invalid card tile combo string (%v)", cardTileCombos) - return "" - } - - tiles := strings.Split(cardTileSplit[1], "+") - - var maskItems []string - - for _, tile := range tiles { - if !strings.HasPrefix(tile, "gt") { - klog.Warningf("invalid tile syntax (%v)", tile) - return "" - } - - tileNoStr := strings.TrimPrefix(tile, "gt") - tileNo, err := strconv.ParseInt(tileNoStr, 10, 16) - - if err != nil { - klog.Warningf("invalid tile syntax (%v)", tile) - return "" - } - - maskItem := "" - if hierarchyMode == hierarchyModeComposite { - maskItem = - strconv.FormatInt(int64(i), 10) + "." + - strconv.FormatInt(tileNo, 10) - } else { - // This handles both FLAT and COMBINED hierarchy. - devIndex := i*tilesPerCard + int(tileNo) - - maskItem = strconv.FormatInt(int64(devIndex), 10) - } - - maskItems = append(maskItems, maskItem) - } - - tileIndices[i] = strings.Join(maskItems, ",") - } - - return strings.Join(tileIndices, ",") -} - -// containerTiles returns the tile indices to use for a single container. -// Indices should be passed to level zero env variable to guide execution -// gpuUsingContainerIndex 0 == first gpu-using container in the pod. -// The affinity mask is not needed for 1-tile GPUs. With 1-tile GPUs normal -// GPU exposing is enough to limit container's access to targeted devices. 
-// annotation example: -// gas-container-tiles=card0:gt0+gt1,card1:gt0|card2:gt1+gt2||card0:gt3. -func containerTileAffinityMask(pod *v1.Pod, gpuUsingContainerIndex, tilesPerCard int) string { - fullAnnotation := pod.Annotations[gasTileAnnotation] - onlyDividers := strings.Count(fullAnnotation, "|") == len(fullAnnotation) - - if fullAnnotation == "" || onlyDividers || tilesPerCard <= 1 { - return "" - } - - tileLists := strings.Split(fullAnnotation, "|") - klog.Infof("%s:%v", fullAnnotation, tileLists) - - i := 0 - - for containerIndex, containerTileInfo := range tileLists { - if len(containerTileInfo) == 0 { - continue - } - - if i == gpuUsingContainerIndex { - return convertTileInfoToEnvMask(containerTileInfo, tilesPerCard, guessLevelzeroHierarchyMode(pod, containerIndex)) - } - - i++ - } - - klog.Warningf("couldn't find tile info for gpu using container index %v", gpuUsingContainerIndex) - - return "" -} - -func getClientset() (*kubernetes.Clientset, error) { - config, err := rest.InClusterConfig() - if err != nil { - return nil, err - } - - clientset, err := kubernetes.NewForConfig(config) - if err != nil { - return nil, err - } - - return clientset, nil -} diff --git a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go b/cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go deleted file mode 100644 index 840c51dfa..000000000 --- a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go +++ /dev/null @@ -1,1001 +0,0 @@ -// Copyright 2021-2023 Intel Corporation. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package rm - -import ( - "context" - "fmt" - "os" - "testing" - "time" - - "google.golang.org/grpc" - "google.golang.org/grpc/credentials/insecure" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" - "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" - podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1" -) - -type mockPodResources struct { - pods []v1.Pod -} - -func (w *mockPodResources) List(ctx context.Context, - in *podresourcesv1.ListPodResourcesRequest, - opts ...grpc.CallOption) (*podresourcesv1.ListPodResourcesResponse, error) { - resp := podresourcesv1.ListPodResourcesResponse{} - for _, pod := range w.pods { - resp.PodResources = append(resp.PodResources, &podresourcesv1.PodResources{ - Name: pod.ObjectMeta.Name, Namespace: pod.ObjectMeta.Namespace, Containers: []*podresourcesv1.ContainerResources{{}}, - }) - } - - return &resp, nil -} -func (w *mockPodResources) GetAllocatableResources(ctx context.Context, - in *podresourcesv1.AllocatableResourcesRequest, - opts ...grpc.CallOption) (*podresourcesv1.AllocatableResourcesResponse, error) { - return nil, nil -} - -func (w *mockPodResources) Get(ctx context.Context, - in *podresourcesv1.GetPodResourcesRequest, - opts ...grpc.CallOption) (*podresourcesv1.GetPodResourcesResponse, error) { - return nil, nil -} - -func newMockResourceManager(pods []v1.Pod) ResourceManager { - client, err := grpc.NewClient("fake", grpc.WithTransportCredentials(insecure.NewCredentials())) - if err != nil { - fmt.Fprintf(os.Stderr, "failed to create client: %v\n", err) - - os.Exit(1) - } - - mc := fake.NewClientset() - - for _, p := range pods { - _, err = mc.CoreV1().Pods(p.Namespace).Create(context.Background(), &p, metav1.CreateOptions{}) - if err != nil { - fmt.Printf("failed to Create Pod: %v\n", err) - } - } - - rm := 
resourceManager{ - clientset: mc, - nodeName: "TestNode", - prGetClientFunc: func(string, time.Duration, int) (podresourcesv1.PodResourcesListerClient, *grpc.ClientConn, error) { - return &mockPodResources{pods: pods}, client, nil - }, - skipID: "all", - fullResourceNames: []string{"gpu.intel.com/i915", "gpu.intel.com/xe"}, - assignments: make(map[string]podAssignmentDetails), - retryTimeout: 1 * time.Millisecond, - useKubelet: false, - } - - deviceInfoMap := NewDeviceInfoMap() - deviceInfoMap["card0-0"] = NewDeviceInfo([]v1beta1.DeviceSpec{ - { - ContainerPath: "containerpath", - HostPath: "hostpath", - Permissions: "rw", - }, - }, - []v1beta1.Mount{{}}, - map[string]string{"more": "coverage"}) - deviceInfoMap["card1-0"] = NewDeviceInfo([]v1beta1.DeviceSpec{{}}, nil, nil) - deviceInfoMap["card2-0"] = NewDeviceInfo([]v1beta1.DeviceSpec{{}}, nil, nil) - rm.SetDevInfos(deviceInfoMap) - - return &rm -} - -type preferredTestCase struct { - name string - pods []v1.Pod - containerRequests []*v1beta1.ContainerPreferredAllocationRequest - expectDevices []string - expectedContainerLen int -} - -type testCase struct { - name string - pods []v1.Pod - prefContainerRequests []*v1beta1.ContainerPreferredAllocationRequest - containerRequests []*v1beta1.ContainerAllocateRequest - prefExpectErr bool - expectErr bool -} - -func TestNewResourceManager(t *testing.T) { - // normal clientset is unavailable inside the unit tests - _, err := NewResourceManager("foo", []string{"bar"}) - - if err == nil { - t.Errorf("unexpected success") - } -} - -func TestGetPreferredFractionalAllocation(t *testing.T) { - properTestPod := v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{gasCardAnnotation: "card0"}, - Name: "TestPod", - Namespace: "neimspeis", - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "gpu.intel.com/i915": resource.MustParse("1"), - }, - }, - }, - }, - }, - Status: v1.PodStatus{ - 
Phase: v1.PodPending, - }, - } - - gpuLessTestPod := v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "TestPodLessGpu", - Namespace: "neimspeis", - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "gpu.less.com/i915": resource.MustParse("1"), - }, - }, - }, - }, - }, - } - - properTestPodMultiGpu := *properTestPod.DeepCopy() - properTestPodMultiGpu.ObjectMeta.Annotations[gasCardAnnotation] = "card0,card1" - - properTestPodMultiGpu2 := *properTestPod.DeepCopy() - properTestPodMultiGpu2.ObjectMeta.Annotations[gasCardAnnotation] = "card0,card1,card0" - - monitoringPod := *properTestPod.DeepCopy() - delete(monitoringPod.Spec.Containers[0].Resources.Requests, "gpu.intel.com/i915") - monitoringPod.Spec.Containers[0].Resources.Requests["gpu.intel.com/i915_monitoring"] = resource.MustParse("1") - - allContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ - {AvailableDeviceIDs: []string{"all"}, - AllocationSize: 1}, - } - - properPrefContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ - {AvailableDeviceIDs: []string{"card0-0", "card0-1", "card1-0", "card1-1"}, - AllocationSize: 1}, - } - - outofRangePrefContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ - {AvailableDeviceIDs: []string{"card6-0", "card5-1"}, - AllocationSize: 1}, - } - - mustHaveContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ - {AvailableDeviceIDs: []string{"card0-0", "card0-1", "card1-0", "card1-1"}, - MustIncludeDeviceIDs: []string{"card0-2"}, - AllocationSize: 2}, - } - - properPrefContainerRequests3 := []*v1beta1.ContainerPreferredAllocationRequest{ - {AvailableDeviceIDs: []string{"card0-0", "card0-1", "card1-0", "card1-1"}, - AllocationSize: 3}, - } - - testCases := []preferredTestCase{ - { - name: "Wrong number of container requests should result in empty response", - pods: []v1.Pod{properTestPod}, - containerRequests: nil, - expectedContainerLen: 0, - }, - 
{ - name: "Proper number of containers with good devices", - pods: []v1.Pod{properTestPod}, - containerRequests: properPrefContainerRequests, - expectDevices: []string{"card0-0"}, - expectedContainerLen: 1, - }, - { - name: "Inconsistent devices vs. gas' annotated ones", - pods: []v1.Pod{properTestPod}, - containerRequests: outofRangePrefContainerRequests, - expectDevices: []string{}, - expectedContainerLen: 1, - }, - { - name: "Preferred allocation is with must have device ids", - pods: []v1.Pod{properTestPodMultiGpu}, - containerRequests: mustHaveContainerRequests, - expectDevices: []string{"card0-2", "card1-0"}, - expectedContainerLen: 1, - }, - { - name: "Duplicate card requesting pod", - pods: []v1.Pod{properTestPodMultiGpu2}, - containerRequests: properPrefContainerRequests3, - expectDevices: []string{"card0-0", "card1-0", "card0-1"}, - expectedContainerLen: 1, - }, - { - name: "Allocation size is larger than cards assigned", - pods: []v1.Pod{properTestPodMultiGpu}, - containerRequests: properPrefContainerRequests3, - expectDevices: []string{"card0-0", "card1-0"}, - expectedContainerLen: 1, - }, - { - name: "Monitoring pod is being allocated", - pods: []v1.Pod{monitoringPod}, - containerRequests: allContainerRequests, - expectDevices: []string{}, - expectedContainerLen: 0, - }, - { - name: "Two pods with one without GPU", - pods: []v1.Pod{properTestPod, gpuLessTestPod}, - containerRequests: properPrefContainerRequests, - expectDevices: []string{"card0-0"}, - expectedContainerLen: 1, - }, - } - - for _, tCase := range testCases { - rm := newMockResourceManager(tCase.pods) - resp, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ - ContainerRequests: tCase.containerRequests, - }) - - if perr != nil { - t.Errorf("test %v unexpected failure, err:%v", tCase.name, perr) - } - - if perr == nil { - // check response - expectTruef(len(resp.ContainerResponses) == tCase.expectedContainerLen, t, tCase.name, "wrong number of container 
responses, expected 1") - - if len(tCase.expectDevices) > 0 { - expectTruef(len(resp.ContainerResponses[0].DeviceIDs) == len(tCase.expectDevices), t, tCase.name, - "wrong number of device ids: %d (%v)", len(resp.ContainerResponses[0].DeviceIDs), resp.ContainerResponses[0].DeviceIDs) - - for i, expecteDevice := range tCase.expectDevices { - expectTruef(resp.ContainerResponses[0].DeviceIDs[i] == expecteDevice, t, tCase.name, - "wrong device id selected: %s", resp.ContainerResponses[0].DeviceIDs[i]) - } - } - } - } -} - -func TestCreateFractionalResourceResponse(t *testing.T) { - properTestPod := v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{gasCardAnnotation: "card0"}, - Name: "TestPod", - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "gpu.intel.com/i915": resource.MustParse("1"), - }, - }, - }, - }, - }, - Status: v1.PodStatus{ - Phase: v1.PodPending, - }, - } - unAnnotatedTestPod := *properTestPod.DeepCopy() - unAnnotatedTestPod.ObjectMeta.Annotations = nil - properTestPod2 := *properTestPod.DeepCopy() - properTestPod2.ObjectMeta.Name = "TestPod2" - - timeStampedProperTestPod := *properTestPod.DeepCopy() - timeStampedProperTestPod.ObjectMeta.Annotations[gasTSAnnotation] = "2" - - timeStampedProperTestPod2 := *properTestPod2.DeepCopy() - timeStampedProperTestPod2.ObjectMeta.Annotations[gasTSAnnotation] = "1" - - properContainerRequests := []*v1beta1.ContainerAllocateRequest{{DevicesIDs: []string{"card0-0"}}} - - properPrefContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ - {AvailableDeviceIDs: []string{"card0-0", "card0-1", "card1-0", "card1-1"}, - AllocationSize: 1}, - } - - testCases := []testCase{ - { - name: "Wrong number of container requests should fail", - pods: []v1.Pod{properTestPod}, - prefContainerRequests: properPrefContainerRequests, - prefExpectErr: false, - containerRequests: []*v1beta1.ContainerAllocateRequest{}, - expectErr: 
true, - }, - { - name: "Request for monitor resource should fail", - pods: []v1.Pod{properTestPod}, - prefContainerRequests: properPrefContainerRequests, - prefExpectErr: true, - containerRequests: []*v1beta1.ContainerAllocateRequest{{DevicesIDs: []string{"all"}}}, - expectErr: true, - }, - { - name: "Zero pending pods should fail", - pods: []v1.Pod{}, - prefContainerRequests: properPrefContainerRequests, - prefExpectErr: true, - containerRequests: properContainerRequests, - expectErr: true, - }, - { - name: "Single pending pod without annotations should fail", - pods: []v1.Pod{unAnnotatedTestPod}, - prefContainerRequests: properPrefContainerRequests, - prefExpectErr: true, - containerRequests: properContainerRequests, - expectErr: true, - }, - { - name: "Single pending pod should succeed", - pods: []v1.Pod{properTestPod}, - prefContainerRequests: properPrefContainerRequests, - prefExpectErr: true, - containerRequests: properContainerRequests, - expectErr: false, - }, - { - name: "Two pending pods without timestamps should fail", - pods: []v1.Pod{properTestPod, properTestPod2}, - prefContainerRequests: properPrefContainerRequests, - prefExpectErr: true, - containerRequests: properContainerRequests, - expectErr: true, - }, - { - name: "Two pending pods with timestamps should reduce to one candidate and succeed", - pods: []v1.Pod{timeStampedProperTestPod, timeStampedProperTestPod2}, - prefContainerRequests: properPrefContainerRequests, - prefExpectErr: true, - containerRequests: properContainerRequests, - expectErr: false, - }, - } - - for _, tCase := range testCases { - rm := newMockResourceManager(tCase.pods) - rm.SetTileCountPerCard(uint64(1)) - - _, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ - ContainerRequests: tCase.prefContainerRequests, - }) - - if (perr != nil) && !tCase.prefExpectErr { - t.Errorf("test %v unexpected failure, err:%v", tCase.name, perr) - } - - resp, err := 
rm.CreateFractionalResourceResponse(&v1beta1.AllocateRequest{ - ContainerRequests: tCase.containerRequests, - }) - - if (err != nil) && !tCase.expectErr { - t.Errorf("test %v unexpected failure, err:%v", tCase.name, err) - } - - if err == nil { - if tCase.expectErr { - t.Errorf("test %v unexpected success", tCase.name) - } else { - // check response - expectTruef(len(resp.ContainerResponses) == 1, t, tCase.name, "wrong number of container responses, expected 1") - expectTruef(len(resp.ContainerResponses[0].Envs) == 1, t, tCase.name, "wrong number of env variables in container response, expected 1") - expectTruef(resp.ContainerResponses[0].Envs["more"] == "coverage", t, tCase.name, "env not set for container response") - expectTruef(len(resp.ContainerResponses[0].Devices) == 1, t, tCase.name, "wrong number of devices, expected 1") - expectTruef(resp.ContainerResponses[0].Devices[0].HostPath == "hostpath", t, tCase.name, "HostPath not set for device") - expectTruef(resp.ContainerResponses[0].Devices[0].ContainerPath == "containerpath", t, tCase.name, "ContainerPath not set for device") - expectTruef(resp.ContainerResponses[0].Devices[0].Permissions == "rw", t, tCase.name, "permissions not set for device") - } - } - } -} - -func TestCreateFractionalResourceResponseWithOneCardTwoTiles(t *testing.T) { - properTestPod := v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - gasCardAnnotation: "card0", - gasTileAnnotation: "card0:gt0+gt1"}, - Name: "TestPod", - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "gpu.intel.com/i915": resource.MustParse("1"), - }, - }, - Env: []v1.EnvVar{ - { - Name: levelzeroHierarchyEnvVar, - Value: hierarchyModeComposite, - }, - }, - }, - }, - }, - Status: v1.PodStatus{ - Phase: v1.PodPending, - }, - } - - properPrefContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ - {AvailableDeviceIDs: []string{"card0-0", "card0-1", 
"card1-0", "card1-1"}, - AllocationSize: 1}, - } - - properContainerRequests := []*v1beta1.ContainerAllocateRequest{{DevicesIDs: []string{"card0-0"}}} - - tCase := testCase{ - name: "Single pending pod with two tiles should succeed", - pods: []v1.Pod{properTestPod}, - prefContainerRequests: properPrefContainerRequests, - prefExpectErr: false, - containerRequests: properContainerRequests, - expectErr: false, - } - - rm := newMockResourceManager(tCase.pods) - rm.SetTileCountPerCard(uint64(2)) - - _, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ - ContainerRequests: tCase.prefContainerRequests, - }) - - if (perr != nil) && !tCase.prefExpectErr { - t.Errorf("test %v unexpected failure, err:%v", tCase.name, perr) - } - - resp, err := rm.CreateFractionalResourceResponse(&v1beta1.AllocateRequest{ - ContainerRequests: tCase.containerRequests, - }) - - if (err != nil) && !tCase.expectErr { - t.Errorf("test %v unexpected failure, err:%v", tCase.name, err) - } - - // check response - expectTruef(len(resp.ContainerResponses) == 1, t, tCase.name, "wrong number of container responses, expected 1") - expectTruef(len(resp.ContainerResponses[0].Envs) == 2, t, tCase.name, "wrong number of env variables in container response, expected 2") - expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set") - expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] == "0.0,0.1", t, tCase.name, "l0 affinity mask is incorrect") - expectTruef(len(resp.ContainerResponses[0].Devices) == 1, t, tCase.name, "wrong number of devices, expected 1") -} - -func TestCreateFractionalResourceResponseWithTwoCardsOneTile(t *testing.T) { - properTestPod := v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - gasCardAnnotation: "card1,card2", - gasTileAnnotation: "card1:gt3,card2:gt4"}, - Name: "TestPod", - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Resources: 
v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "gpu.intel.com/i915": resource.MustParse("2"), - }, - }, - Env: []v1.EnvVar{ - { - Name: levelzeroHierarchyEnvVar, - Value: hierarchyModeComposite, - }, - }, - }, - }, - }, - Status: v1.PodStatus{ - Phase: v1.PodPending, - }, - } - - properPrefContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ - {AvailableDeviceIDs: []string{"card0-0", "card0-1", "card1-0", "card1-1"}, - AllocationSize: 1}, - } - - properContainerRequests := []*v1beta1.ContainerAllocateRequest{{DevicesIDs: []string{"card1-0", "card2-0"}}} - - tCase := testCase{ - name: "Single pending pod with two cards and one tile each should succeed", - pods: []v1.Pod{properTestPod}, - prefContainerRequests: properPrefContainerRequests, - prefExpectErr: false, - containerRequests: properContainerRequests, - expectErr: false, - } - - rm := newMockResourceManager(tCase.pods) - rm.SetTileCountPerCard(uint64(5)) - - _, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ - ContainerRequests: tCase.prefContainerRequests, - }) - - if (perr != nil) && !tCase.prefExpectErr { - t.Errorf("test %v unexpected failure, err:%v", tCase.name, perr) - } - - resp, err := rm.CreateFractionalResourceResponse(&v1beta1.AllocateRequest{ - ContainerRequests: tCase.containerRequests, - }) - - if (err != nil) && !tCase.expectErr { - t.Errorf("test %v unexpected failure, err:%v", tCase.name, err) - } - - if err == nil { - if tCase.expectErr { - t.Errorf("test %v unexpected success", tCase.name) - } else { - // check response - expectTruef(len(resp.ContainerResponses) == 1, t, tCase.name, "wrong number of container responses, expected 1") - expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set") - expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] == "0.3,1.4", t, tCase.name, "l0 affinity mask is incorrect: ") - 
expectTruef(len(resp.ContainerResponses[0].Devices) == 2, t, tCase.name, "wrong number of devices, expected 2") - } - } -} - -func TestCreateFractionalResourceResponseWithThreeCardsTwoTiles(t *testing.T) { - properTestPod := v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - gasCardAnnotation: "card0,card1,card2", - gasTileAnnotation: "card0:gt0+gt1,card1:gt2+gt3,card2:gt3+gt4"}, - Name: "TestPod", - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "gpu.intel.com/i915": resource.MustParse("3"), - }, - }, - Env: []v1.EnvVar{ - { - Name: levelzeroHierarchyEnvVar, - Value: hierarchyModeComposite, - }, - }, - }, - }, - }, - Status: v1.PodStatus{ - Phase: v1.PodPending, - }, - } - - properPrefContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ - {AvailableDeviceIDs: []string{"card0-0", "card0-1", "card1-0", "card1-1", "card2-0", "card2-1"}, - AllocationSize: 1}, - } - - properContainerRequests := []*v1beta1.ContainerAllocateRequest{{DevicesIDs: []string{"card0-0", "card1-0", "card2-0"}}} - - tCase := testCase{ - name: "Single pending pod with three cards and two tiles each should succeed", - pods: []v1.Pod{properTestPod}, - prefContainerRequests: properPrefContainerRequests, - prefExpectErr: false, - containerRequests: properContainerRequests, - expectErr: false, - } - - rm := newMockResourceManager(tCase.pods) - rm.SetTileCountPerCard(uint64(5)) - - _, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ - ContainerRequests: tCase.prefContainerRequests, - }) - - if (perr != nil) && !tCase.prefExpectErr { - t.Errorf("test %v unexpected failure, err:%v", tCase.name, perr) - } - - resp, err := rm.CreateFractionalResourceResponse(&v1beta1.AllocateRequest{ - ContainerRequests: tCase.containerRequests, - }) - - if (err != nil) && !tCase.expectErr { - t.Errorf("test %v unexpected failure, err:%v", tCase.name, err) - } - - if err 
== nil { - if tCase.expectErr { - t.Errorf("test %v unexpected success", tCase.name) - } else { - // check response - expectTruef(len(resp.ContainerResponses) == 1, t, tCase.name, "wrong number of container responses, expected 1") - expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set") - expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] == "0.0,0.1,1.2,1.3,2.3,2.4", t, tCase.name, "l0 affinity mask is incorrect: ") - expectTruef(len(resp.ContainerResponses[0].Devices) == 3, t, tCase.name, "wrong number of devices, expected 3") - } - } -} - -func TestCreateFractionalResourceResponseWithMultipleContainersTileEach(t *testing.T) { - properTestPod := v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - gasCardAnnotation: "card1|card2", - gasTileAnnotation: "card1:gt1|card2:gt0"}, - Name: "TestPod", - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "gpu.intel.com/i915": resource.MustParse("1"), - }, - }, - Env: []v1.EnvVar{ - { - Name: levelzeroHierarchyEnvVar, - Value: hierarchyModeComposite, - }, - }, - }, - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - "gpu.intel.com/i915": resource.MustParse("1"), - }, - }, - Env: []v1.EnvVar{ - { - Name: levelzeroHierarchyEnvVar, - Value: hierarchyModeComposite, - }, - }, - }, - }, - }, - Status: v1.PodStatus{ - Phase: v1.PodPending, - }, - } - - properPrefContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ - {AvailableDeviceIDs: []string{"card0-0", "card0-1", "card1-0", "card1-1", "card2-0", "card2-1"}, - AllocationSize: 1}, - } - _ = properPrefContainerRequests - - properContainerRequests := []*v1beta1.ContainerAllocateRequest{ - {DevicesIDs: []string{"card1-0"}}, - {DevicesIDs: []string{"card2-0"}}, - } - - tCase := testCase{ - name: "Single pending pod with two containers with one tile each should 
FAIL", - pods: []v1.Pod{properTestPod}, - prefContainerRequests: properPrefContainerRequests, - prefExpectErr: false, - containerRequests: properContainerRequests, - expectErr: true, - } - - rm := newMockResourceManager(tCase.pods) - rm.SetTileCountPerCard(uint64(2)) - - _, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ - ContainerRequests: properPrefContainerRequests, - }) - - if (perr != nil) && !tCase.prefExpectErr { - t.Errorf("test %v unexpected failure, err:%v", tCase.name, perr) - } - - _, err := rm.CreateFractionalResourceResponse(&v1beta1.AllocateRequest{ - ContainerRequests: tCase.containerRequests, - }) - - if (err != nil) && !tCase.expectErr { - t.Errorf("test %v unexpected failure, err:%v", tCase.name, err) - } - - if err == nil { - if tCase.expectErr { - t.Errorf("test %v unexpected success", tCase.name) - } - } -} - -func TestTileAnnotationParsing(t *testing.T) { - type parseTest struct { - line string - result string - hierarchys []string - index int - tilesPerCard int - } - - parseTests := []parseTest{ - { - line: "card1:gt1", - index: 0, - result: "0.1", - hierarchys: []string{"COMPOSITE"}, - tilesPerCard: 2, - }, - { - line: "card1:gt0", - index: 0, - result: "", - hierarchys: []string{"COMPOSITE"}, - tilesPerCard: 1, - }, - { - line: "card1:gt1+gt2", - index: 0, - result: "0.1,0.2", - hierarchys: []string{"COMPOSITE"}, - tilesPerCard: 3, - }, - // Invalid hierarchy defaults to FLAT - { - line: "card1:gt1+gt2,card2:gt0", - index: 0, - result: "1,2,3", - hierarchys: []string{"FOOBAR"}, - tilesPerCard: 3, - }, - { - line: "card1:gt1+gt2,card2:gt0", - index: 0, - result: "1,2,3", - hierarchys: []string{"FLAT"}, - tilesPerCard: 3, - }, - { - line: "||card1:gt1+gt2,card2:gt0", - index: 0, - result: "1,2,3", - hierarchys: []string{"", "", "FLAT"}, - tilesPerCard: 3, - }, - { - line: "||||card1:gt3,card5:gt1", - index: 0, - result: "3,9", - hierarchys: []string{"", "", "", "", "FLAT"}, - tilesPerCard: 8, - }, - { - 
line: "card1:gt1+gt2,card2:gt1", - index: 0, - result: "1,2,4", - hierarchys: []string{"COMBINED"}, - tilesPerCard: 3, - }, - { - line: "card1:gt1,card2:gt1", - index: 0, - result: "1,3", - hierarchys: []string{"COMBINED"}, - tilesPerCard: 2, - }, - { - line: "card1:gt1", - index: 1, - result: "", - }, - { - line: "card1:gt1|card2:gt4", - index: 1, - result: "4", - tilesPerCard: 5, - }, - { - line: "card1:gt1|card2:gt4,card3:gt2", - index: 1, - result: "0.4,1.2", - hierarchys: []string{"COMPOSITE", "COMPOSITE"}, - tilesPerCard: 5, - }, - { - line: "card1:gt1|card2:gt4,card3:gt2|card5:gt0", - index: 2, - result: "0.0", - hierarchys: []string{"COMPOSITE", "COMPOSITE", "COMPOSITE"}, - tilesPerCard: 5, - }, - { - line: "||card5:gt0,card6:gt4||", - index: 0, - result: "0.0,1.4", - hierarchys: []string{"", "", "COMPOSITE"}, - tilesPerCard: 5, - }, - { - line: "||card5:gt0,card6:gt4||", - index: 1, - result: "", - }, - { - line: "||card5:gt0,card:6:gt4||", - index: 0, - result: "", - }, - { - line: "||card5:gt0,card6:gt+gt+gt||", - index: 0, - result: "", - }, - { - line: "card1:gtX", - index: 0, - result: "", - }, - { - line: "card1:64X", - index: 0, - result: "", - }, - { - line: "|", - index: 0, - result: "", - }, - { - line: "card1:gt1||card2:gt4,card3:gt2", - index: 1, - result: "0.4,1.2", - hierarchys: []string{"", "", "COMPOSITE"}, - tilesPerCard: 6, - }, - { - line: "|||card2:gt7", - index: 0, - result: "0.7", - hierarchys: []string{"", "", "", "COMPOSITE"}, - tilesPerCard: 8, - }, - { - line: "card5", - index: 0, - result: "", - }, - } - - for testIndex, pt := range parseTests { - pod := v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - gasTileAnnotation: pt.line}, - }, - } - - if pt.hierarchys != nil { - // Create enough containers - pod.Spec.Containers = make([]v1.Container, 10) - - for i := range pod.Spec.Containers { - if i < len(pt.hierarchys) { - pod.Spec.Containers[i].Env = []v1.EnvVar{ - { - Name: levelzeroHierarchyEnvVar, - 
Value: pt.hierarchys[i], - }, - } - } - } - } - - ret := containerTileAffinityMask(&pod, pt.index, max(1, pt.tilesPerCard)) - - expectTruef(ret == pt.result, t, pt.line, "resulting mask is wrong (test index=%d). correct: %v, got: %v", testIndex, pt.result, ret) - } -} - -func TestSelectDeviceIDsForContainerDoubleCards(t *testing.T) { - cards := []string{ - "card0", - "card1", - } - - deviceIds := []string{ - "card0-0", - "card0-1", - "card0-2", - "card1-0", - "card1-1", - "card1-2", - } - - selected := selectDeviceIDsForContainer(2, cards, deviceIds, []string{}) - if len(selected) != 2 { - t.Errorf("Not the correct amount of devices were selected") - } - - correctDevices := map[string]bool{ - "card0-0": false, - "card1-0": false, - } - - for _, selected := range selected { - correctDevices[selected] = true - } - - for dev, used := range correctDevices { - if !used { - t.Errorf("correct device was not used: %s", dev) - } - } -} - -func TestSelectDeviceIDsForContainerSingleCard(t *testing.T) { - cards := []string{ - "card2", - } - - deviceIds := []string{ - "card0-0", - "card0-1", - "card1-0", - "card2-0", - "card2-1", - } - - selected := selectDeviceIDsForContainer(1, cards, deviceIds, []string{}) - if len(selected) != 1 { - t.Errorf("Not the correct amount of devices were selected") - } - - if selected[0] != "card2-0" { - t.Errorf("First selection is wrong: %s vs %s", selected[0], "card2-0") - } -} - -func expectTruef(predicate bool, t *testing.T, testName, format string, args ...interface{}) { - if !predicate { - t.Helper() - t.Errorf(fmt.Sprintf("in test %q ", testName)+format, args...) 
- } -} diff --git a/cmd/qat_plugin/README.md b/cmd/qat_plugin/README.md index db13ea013..611e9f39e 100644 --- a/cmd/qat_plugin/README.md +++ b/cmd/qat_plugin/README.md @@ -43,7 +43,6 @@ The QAT plugin can take a number of command line arguments, summarised in the fo | -dpdk-driver | string | DPDK Device driver for configuring the QAT device (default: `vfio-pci`) | | -kernel-vf-drivers | string | Comma separated list of the QuickAssist VFs to search and use in the system. Devices supported: DH895xCC, C62x, C3xxx, 4xxx/401xx/402xx, 420xx, C4xxx and D15xx (default: `4xxxvf,420xxvf`) | | -max-num-devices | int | maximum number of QAT devices to be provided to the QuickAssist device plugin (default: `64`) | -| -mode | string | Deprecated: plugin mode which can be either `dpdk` or `kernel` (default: `dpdk`).| | -allocation-policy | string | 2 possible values: balanced and packed. Balanced mode spreads allocated QAT VF resources balanced among QAT PF devices, and packed mode packs one QAT PF device full of QAT VF resources before allocating resources from the next QAT PF. (There is no default.) | The plugin also accepts a number of other arguments related to logging. Please use the `-h` option to see @@ -59,15 +58,6 @@ For more details on the `-dpdk-driver` choice, see For more details on the available options to the `-kernel-vf-drivers` option, see the list of vf drivers available in the [Linux Kernel](https://github.com/torvalds/linux/tree/master/drivers/crypto/intel/qat). -If the `-mode` parameter is set to `kernel`, no other parameter documented above are valid, -except the `klog` logging related parameters. -`kernel` mode implements resource allocation based on system configured [logical instances][7] and -it does not guarantee full device isolation between containers. Therefore, it's not recommended. - -> **Note**: `-mode` parameter is deprecated and it is also not made available as an option to -> the operator based deployment. 
Furthermore, `kernel` mode is excluded by default from all builds (including those hosted on the Docker hub), -> by default. See the [Build the plugin image](#build-the-plugin-image) section for more details. - ## Installation The below sections cover how to obtain, build and install this component. diff --git a/cmd/qat_plugin/kerneldrv/kerneldrv.go b/cmd/qat_plugin/kerneldrv/kerneldrv.go deleted file mode 100644 index 2bc1bc11d..000000000 --- a/cmd/qat_plugin/kerneldrv/kerneldrv.go +++ /dev/null @@ -1,388 +0,0 @@ -// Copyright 2018 Intel Corporation. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//go:build kerneldrv -// +build kerneldrv - -// Package kerneldrv populates a device tree with QAT devices using the kernel QAT driver. 
-package kerneldrv - -import ( - "fmt" - "os" - "path/filepath" - "regexp" - "strings" - "time" - - "github.com/go-ini/ini" - "github.com/pkg/errors" - - "k8s.io/klog/v2" - pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" - utilsexec "k8s.io/utils/exec" - - dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin" -) - -var ( - adfCtlRegex = regexp.MustCompile(`type: (?P[[:print:]]+), .* inst_id: (?P[0-9]+), .* bsf: ([0-9a-f]{4}:)?(?P[0-9a-f]{2}:[0-9a-f]{2}\.[0-9a-f]), .* state: (?P[[:alpha:]]+)$`) -) - -type endpoint struct { - id string - processes int -} - -type section struct { - endpoints []endpoint - cryptoEngines int - compressionEngines int - pinned bool -} - -type device struct { - id string - devtype string - bsf string -} - -type driverConfig map[string]section - -func newDeviceSpec(devPath string) pluginapi.DeviceSpec { - return pluginapi.DeviceSpec{ - HostPath: devPath, - ContainerPath: devPath, - Permissions: "rw", - } -} - -func getDevTree(sysfs string, qatDevs []device, config map[string]section) (dpapi.DeviceTree, error) { - devTree := dpapi.NewDeviceTree() - - devs := []pluginapi.DeviceSpec{ - newDeviceSpec("/dev/qat_adf_ctl"), - newDeviceSpec("/dev/qat_dev_processes"), - newDeviceSpec("/dev/usdm_drv"), - } - - for _, qatDev := range qatDevs { - uiodevs, err := getUIODevices(sysfs, qatDev.devtype, qatDev.bsf) - if err != nil { - return nil, err - } - - for _, uiodev := range uiodevs { - devs = append(devs, newDeviceSpec(filepath.Join("/dev/", uiodev))) - } - } - - uniqID := 0 - - for sname, svalue := range config { - devType := fmt.Sprintf("cy%d_dc%d", svalue.cryptoEngines, svalue.compressionEngines) - - for _, ep := range svalue.endpoints { - for i := 0; i < ep.processes; i++ { - envs := map[string]string{ - fmt.Sprintf("QAT_SECTION_NAME_%s_%d", devType, uniqID): sname, - // This env variable may get overridden if a container requests more than one QAT process. 
- // But we keep this code since the majority of pod workloads run only one QAT process. - // The rest should use QAT_SECTION_NAME_XXX variables. - "QAT_SECTION_NAME": sname, - } - deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devs, nil, envs, nil, nil) - devTree.AddDevice(devType, fmt.Sprintf("%s_%s_%d", sname, ep.id, i), deviceInfo) - - uniqID++ - } - - if !svalue.pinned { - break - } - } - } - - return devTree, nil -} - -// DevicePlugin represents QAT plugin exploiting kernel driver. -type DevicePlugin struct { - execer utilsexec.Interface - configDir string -} - -// NewDevicePlugin returns new instance of kernel based QAT plugin. -func NewDevicePlugin() *DevicePlugin { - return newDevicePlugin("/etc", utilsexec.New()) -} - -func newDevicePlugin(configDir string, execer utilsexec.Interface) *DevicePlugin { - return &DevicePlugin{ - execer: execer, - configDir: configDir, - } -} - -func (dp *DevicePlugin) getOnlineDevices(iommuOn bool) ([]device, error) { - outputBytes, err := dp.execer.Command("adf_ctl", "status").CombinedOutput() - if err != nil { - return nil, errors.Wrapf(err, "Can't get driver status") - } - - devices := []device{} - - // QAT Gen4 devices should not be used with "-mode kernel" - devicesDenyList := map[string]struct{}{ - "4xxx": {}, - "4xxxvf": {}, - } - - for _, line := range strings.Split(string(outputBytes[:]), "\n") { - matches := adfCtlRegex.FindStringSubmatch(line) - if len(matches) != 6 { - continue - } - - // Ignore devices which are down. - if matches[5] != "up" { - continue - } - - // Ignore devices which are on the denylist. 
- if _, ok := devicesDenyList[matches[1]]; ok { - klog.Warning("skip denylisted device ", matches[1]) - continue - } - - // "Cannot use PF with IOMMU enabled" - if iommuOn && !strings.HasSuffix(matches[1], "vf") { - continue - } - - devices = append(devices, device{ - id: fmt.Sprintf("dev%s", matches[2]), - devtype: matches[1], - bsf: fmt.Sprintf("%s%s", matches[3], matches[4]), - }) - klog.V(4).Info("New online device", devices[len(devices)-1]) - } - - return devices, nil -} - -func getUIODeviceListPath(sysfs, devtype, bsf string) string { - return filepath.Join(sysfs, "bus", "pci", "drivers", devtype, bsf, "uio") -} - -func getUIODevices(sysfs, devtype, bsf string) ([]string, error) { - sysfsDir := getUIODeviceListPath(sysfs, devtype, bsf) - klog.V(4).Info("Path to uio devices:", sysfsDir) - - devFiles, err := os.ReadDir(sysfsDir) - if err != nil { - return nil, errors.Wrapf(err, "Can't read %s", sysfsDir) - } - - if len(devFiles) == 0 { - klog.Warning("no uio devices listed in", sysfsDir) - } - - devices := []string{} - for _, devFile := range devFiles { - devices = append(devices, devFile.Name()) - } - - return devices, nil -} - -func (dp *DevicePlugin) parseConfigs(devices []device) (map[string]section, error) { - devNum := 0 - drvConfig := make(driverConfig) - - for _, dev := range devices { - // Parse the configuration. 
- config, err := ini.Load(filepath.Join(dp.configDir, fmt.Sprintf("%s_%s.conf", dev.devtype, dev.id))) - if err != nil { - return nil, errors.Wrap(err, "failed to parse device config") - } - - devNum++ - - for _, section := range config.Sections() { - if section.Name() == "GENERAL" || section.Name() == "KERNEL" || section.Name() == "KERNEL_QAT" || section.Name() == ini.DefaultSection { - continue - } - - klog.V(4).Info(section.Name()) - - if err := drvConfig.update(dev.id, section); err != nil { - return nil, err - } - } - } - - // check if the number of sections with LimitDevAccess=1 is equal to the number of endpoints - for sname, svalue := range drvConfig { - if svalue.pinned && len(svalue.endpoints) != devNum { - return nil, errors.Errorf("Section [%s] must be defined for all QAT devices since it contains LimitDevAccess=1", sname) - } - } - - return drvConfig, nil -} - -func (drvConfig driverConfig) update(devID string, iniSection *ini.Section) error { - numProcesses, err := iniSection.Key("NumProcesses").Int() - if err != nil { - return errors.Wrapf(err, "Can't parse NumProcesses in %s", iniSection.Name()) - } - - cryptoEngines, err := iniSection.Key("NumberCyInstances").Int() - if err != nil { - return errors.Wrapf(err, "Can't parse NumberCyInstances in %s", iniSection.Name()) - } - - compressionEngines, err := iniSection.Key("NumberDcInstances").Int() - if err != nil { - return errors.Wrapf(err, "Can't parse NumberDcInstances in %s", iniSection.Name()) - } - - pinned := false - - if limitDevAccessKey, err := iniSection.GetKey("LimitDevAccess"); err == nil { - limitDevAccess, err := limitDevAccessKey.Bool() - if err != nil { - return errors.Wrapf(err, "Can't parse LimitDevAccess in %s", iniSection.Name()) - } - - if limitDevAccess { - pinned = true - } - } - - if old, ok := drvConfig[iniSection.Name()]; ok { - // first check the sections are consistent across endpoints - if old.pinned != pinned { - return errors.Errorf("Value of LimitDevAccess must be 
consistent across all devices in %s", iniSection.Name()) - } - - if !pinned && old.endpoints[0].processes != numProcesses { - return errors.Errorf("For not pinned section \"%s\" NumProcesses must be equal for all devices", iniSection.Name()) - } - - if old.cryptoEngines != cryptoEngines || old.compressionEngines != compressionEngines { - return errors.Errorf("NumberCyInstances and NumberDcInstances must be consistent across all devices in %s", iniSection.Name()) - } - - // then add a new endpoint to the section - old.endpoints = append(old.endpoints, endpoint{ - id: devID, - processes: numProcesses, - }) - drvConfig[iniSection.Name()] = old - } else { - drvConfig[iniSection.Name()] = section{ - endpoints: []endpoint{ - { - id: devID, - processes: numProcesses, - }, - }, - cryptoEngines: cryptoEngines, - compressionEngines: compressionEngines, - pinned: pinned, - } - } - - return nil -} - -func getIOMMUStatus() (bool, error) { - iommus, err := os.ReadDir("/sys/class/iommu/") - if err != nil { - return false, errors.Wrapf(err, "Unable to read IOMMU status") - } - - if len(iommus) > 0 { - return true, nil - } - - return false, nil -} - -// Scan implements Scanner interface for kernel based QAT plugin. -func (dp *DevicePlugin) Scan(notifier dpapi.Notifier) error { - for { - iommuOn, err := getIOMMUStatus() - if err != nil { - return err - } - - devices, err := dp.getOnlineDevices(iommuOn) - if err != nil { - return err - } - - driverConfig, err := dp.parseConfigs(devices) - if err != nil { - return err - } - - devTree, err := getDevTree("/sys", devices, driverConfig) - if err != nil { - return err - } - - notifier.Notify(devTree) - - time.Sleep(5 * time.Second) - } -} - -// PostAllocate implements PostAllocator interface for kernel based QAT plugin. 
-func (dp *DevicePlugin) PostAllocate(response *pluginapi.AllocateResponse) error { - for _, containerResponse := range response.GetContainerResponses() { - envsToDelete := []string{} - envsToAdd := make(map[string]string) - counter := 0 - - for key, value := range containerResponse.Envs { - if !strings.HasPrefix(key, "QAT_SECTION_NAME_") { - continue - } - - parts := strings.Split(key, "_") - if len(parts) != 6 { - return errors.Errorf("Wrong format of env variable name %s", key) - } - - prefix := strings.Join(parts[0:5], "_") - - envsToDelete = append(envsToDelete, key) - envsToAdd[fmt.Sprintf("%s_%d", prefix, counter)] = value - counter++ - } - - for _, key := range envsToDelete { - delete(containerResponse.Envs, key) - } - - for key, value := range envsToAdd { - containerResponse.Envs[key] = value - } - } - - return nil -} diff --git a/cmd/qat_plugin/kerneldrv/kerneldrv_test.go b/cmd/qat_plugin/kerneldrv/kerneldrv_test.go deleted file mode 100644 index 9e2781af9..000000000 --- a/cmd/qat_plugin/kerneldrv/kerneldrv_test.go +++ /dev/null @@ -1,415 +0,0 @@ -// Copyright 2018 Intel Corporation. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -//go:build kerneldrv -// +build kerneldrv - -package kerneldrv - -import ( - "errors" - "flag" - "fmt" - "os" - "path" - "path/filepath" - "reflect" - "sort" - "testing" - "time" - - "k8s.io/klog/v2" - pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" - "k8s.io/utils/exec" - fakeexec "k8s.io/utils/exec/testing" -) - -var errFake = errors.New("fake error") - -const ( - adfCtlOutput = `Checking status of all devices. -There is 3 QAT acceleration device(s) in the system: - qat_dev0 - type: c6xx, inst_id: 0, node_id: 0, bsf: 0000:3b:00.0, #accel: 5 #engines: 10 state: up - qat_dev1 - type: c6xx, inst_id: 1, node_id: 0, bsf: 0000:3d:00.0, #accel: 5 #engines: 10 state: up - qat_dev2 - type: c6xx, inst_id: 2, node_id: 3, bsf: 0000:d8:00.0, #accel: 5 #engines: 10 state: up -` - adfCtlOutputOneDown = `Checking status of all devices. -There is 3 QAT acceleration device(s) in the system: - qat_dev0 - type: c6xx, inst_id: 0, node_id: 0, bsf: 3b:00.0, #accel: 5 #engines: 10 state: up - qat_dev1 - type: c6xx, inst_id: 1, node_id: 0, bsf: 3d:00.0, #accel: 5 #engines: 10 state: down - qat_dev2 - type: c6xx, inst_id: 2, node_id: 3, bsf: d8:00.0, #accel: 5 #engines: 10 state: up -` - adfCtlOutputVf = `Checking status of all devices. 
-There is 7 QAT acceleration device(s) in the system: - qat_dev0 - type: c6xx, inst_id: 0, node_id: 0, bsf: 0000:3b:00.0, #accel: 5 #engines: 10 state: up - qat_dev1 - type: c6xx, inst_id: 1, node_id: 0, bsf: 0000:3b:00.0, #accel: 5 #engines: 10 state: up - qat_dev2 - type: c6xx, inst_id: 2, node_id: 3, bsf: 0000:3b:00.0, #accel: 5 #engines: 10 state: up - qat_dev3 - type: c6xxvf, inst_id: 0, node_id: 0, bsf: 0000:3b:01.0, #accel: 1 #engines: 1 state: up - qat_dev4 - type: c6xxvf, inst_id: 1, node_id: 0, bsf: 0000:3b:01.1, #accel: 1 #engines: 1 state: up - qat_dev5 - type: c6xxvf, inst_id: 2, node_id: 0, bsf: 0000:3b:01.2, #accel: 1 #engines: 1 state: up - qat_dev6 - type: c6xxvf, inst_id: 3, node_id: 0, bsf: 0000:3b:01.3, #accel: 1 #engines: 1 state: up -` - adfCtlOutputDenyListDevices = `Checking status of all devices. -There is 7 QAT acceleration device(s) in the system: - qat_dev0 - type: 4xxx, inst_id: 0, node_id: 0, bsf: 0000:3b:00.0, #accel: 5 #engines: 10 state: up - qat_dev1 - type: 4xxx, inst_id: 1, node_id: 0, bsf: 0000:3c:00.0, #accel: 5 #engines: 10 state: up - qat_dev2 - type: 4xxx, inst_id: 2, node_id: 3, bsf: 0000:3d:00.0, #accel: 5 #engines: 10 state: up - qat_dev3 - type: 4xxxvf, inst_id: 0, node_id: 0, bsf: 0000:3b:01.0, #accel: 1 #engines: 1 state: up - qat_dev4 - type: 4xxxvf, inst_id: 1, node_id: 0, bsf: 0000:3b:01.1, #accel: 1 #engines: 1 state: up - qat_dev5 - type: 4xxxvf, inst_id: 2, node_id: 0, bsf: 0000:3b:01.2, #accel: 1 #engines: 1 state: up - qat_dev6 - type: 4xxxvf, inst_id: 3, node_id: 0, bsf: 0000:3b:01.3, #accel: 1 #engines: 1 state: up -` -) - -func init() { - _ = flag.Set("v", "4") //Enable debug output -} - -func TestGetOnlineDevices(t *testing.T) { - tcases := []struct { - name string - adfCtlError error - adfCtlOutput string - expectedDevNum int - expectedErr bool - iommuOn bool - }{ - { - name: "all is good", - adfCtlOutput: adfCtlOutput, - expectedDevNum: 3, - iommuOn: false, - }, - { - name: "one device is down", - 
adfCtlOutput: adfCtlOutputOneDown, - expectedDevNum: 2, - iommuOn: false, - }, - { - name: "virtual functions enabled", - adfCtlOutput: adfCtlOutputVf, - expectedDevNum: 4, - iommuOn: true, - }, - { - name: "denylisted devices", - adfCtlOutput: adfCtlOutputDenyListDevices, - expectedDevNum: 0, - iommuOn: true, - }, - { - name: "adf_ctl fails to run", - adfCtlError: errFake, - expectedErr: true, - }, - } - for _, tt := range tcases { - t.Run(tt.name, func(t *testing.T) { - fcmd := fakeexec.FakeCmd{ - CombinedOutputScript: []fakeexec.FakeAction{ - func() ([]byte, []byte, error) { - return []byte(tt.adfCtlOutput), []byte{}, tt.adfCtlError - }, - }, - } - execer := fakeexec.FakeExec{ - CommandScript: []fakeexec.FakeCommandAction{ - func(cmd string, args ...string) exec.Cmd { - return fakeexec.InitFakeCmd(&fcmd, cmd, args...) - }, - }, - } - dp := &DevicePlugin{ - execer: &execer, - } - devices, err := dp.getOnlineDevices(tt.iommuOn) - if tt.expectedErr && err == nil { - t.Error("Expected error hasn't been triggered") - } - if !tt.expectedErr && err != nil { - t.Errorf("Unexpected error: %+v", err) - } - if len(devices) != tt.expectedDevNum { - t.Errorf("Wrong number of device detected: %d instead of %d", len(devices), tt.expectedDevNum) - } - }) - } -} - -func TestGetUIODevices(t *testing.T) { - tcases := []struct { - name string - devType string - bsf string - uiodevs []string - expectedErr bool - }{ - { - name: "can't read sysfs", - devType: "faketype", - expectedErr: true, - }, - { - name: "all is good", - devType: "c6xx", - uiodevs: []string{"uio0", "uio1"}, - bsf: "da:00.0", - }, - } - for tnum, tt := range tcases { - t.Run(tt.name, func(t *testing.T) { - var err error - tmpdir := fmt.Sprintf("/tmp/qatplugin-getUIODevices-%d-%d", time.Now().Unix(), tnum) - sysfs := filepath.Join(tmpdir, "sys") - - for _, uiodev := range tt.uiodevs { - err = os.MkdirAll(filepath.Join(getUIODeviceListPath(sysfs, tt.devType, tt.bsf), uiodev), 0750) - if err != nil { - t.Fatal(err) - 
} - } - devs, err := getUIODevices(sysfs, tt.devType, tt.bsf) - if tt.expectedErr && err == nil { - t.Error("Expected error hasn't been triggered") - } - if !tt.expectedErr && err != nil { - t.Errorf("Unexpected error: %+v", err) - } - sort.Strings(tt.uiodevs) - sort.Strings(devs) - if tt.uiodevs != nil && !reflect.DeepEqual(devs, tt.uiodevs) { - t.Error("Unexpected devices: ", devs) - } - - err = os.RemoveAll(tmpdir) - if err != nil { - t.Fatal(err) - } - }) - } -} - -func TestParseConfigs(t *testing.T) { - qatdevs := []device{ - { - id: "dev0", - devtype: "c6xx", - }, - { - id: "dev1", - devtype: "c6xx", - }, - { - id: "dev2", - devtype: "c6xx", - }, - } - tcases := []struct { - name string - testData string - expectedErr bool - }{ - { - name: "All is good", - testData: "all_is_good", - }, - { - name: "Missing section with LinitDevAccess=1", - testData: "missing_pinned_section", - expectedErr: true, - }, - { - name: "Can't parse NumProcesses", - testData: "cant_parse_num_processes", - expectedErr: true, - }, - { - name: "Inconsistent LimitDevAccess", - testData: "inconsistent_limitdev", - expectedErr: true, - }, - } - - for _, tt := range tcases { - dp := &DevicePlugin{ - configDir: "./test_data/" + tt.testData, - } - - _, err := dp.parseConfigs(qatdevs) - if tt.expectedErr && err == nil { - t.Errorf("Test case '%s': expected error hasn't been triggered", tt.name) - } - - if !tt.expectedErr && err != nil { - t.Errorf("Test case '%s': Unexpected error: %+v", tt.name, err) - } - } -} - -func TestGetDevTree(t *testing.T) { - tmpdir := fmt.Sprintf("/tmp/qatplugin-getDevTree-%d", time.Now().Unix()) - tcases := []struct { - name string - sysfs string - config map[string]section - uiodevs map[string][]string - qatdevs []device - expectedErr bool - }{ - { - name: "All is good", - sysfs: "sys", - uiodevs: map[string][]string{ - "da:00.0": {"uio4", "uio5"}, - }, - qatdevs: []device{ - { - id: "dev0", - devtype: "c6xx", - bsf: "da:00.0", - }, - }, - config: 
map[string]section{ - "TESTSHIM": { - endpoints: []endpoint{ - { - id: "dev0", - processes: 2, - }, - }, - }, - "TESTSHIM2": { - endpoints: []endpoint{ - { - id: "dev0", - processes: 2, - }, - }, - }, - "TESTPINNED": { - endpoints: []endpoint{ - { - id: "dev0", - processes: 2, - }, - }, - pinned: true, - }, - }, - }, - { - name: "Wrong devfs", - sysfs: "wrongdev", - qatdevs: []device{ - { - id: "dev0", - devtype: "c6xx", - bsf: "da:00.0", - }, - }, - expectedErr: true, - }, - } - - for _, tt := range tcases { - t.Run(tt.name, func(t *testing.T) { - var err error - - sysfs := filepath.Join(tmpdir, "sys") - err = os.MkdirAll(sysfs, 0750) - if err != nil { - t.Fatal(err) - } - - for _, qatdev := range tt.qatdevs { - for _, uiodev := range tt.uiodevs[qatdev.bsf] { - err = os.MkdirAll(filepath.Join(getUIODeviceListPath(sysfs, qatdev.devtype, qatdev.bsf), uiodev), 0750) - if err != nil { - t.Fatal(err) - } - } - } - - _, err = getDevTree(path.Join(tmpdir, tt.sysfs), tt.qatdevs, tt.config) - if tt.expectedErr && err == nil { - t.Errorf("Test case '%s': expected error hasn't been triggered", tt.name) - } - if !tt.expectedErr && err != nil { - t.Errorf("Test case '%s': Unexpected error: %+v", tt.name, err) - } - - err = os.RemoveAll(tmpdir) - if err != nil { - t.Fatal(err) - } - }) - } -} - -func TestPostAllocate(t *testing.T) { - tcases := []struct { - name string - envs map[string]string - expectedEnvs []string - expectedErr bool - }{ - { - name: "All is good", - envs: map[string]string{ - "SOMEVAR": "some value", - "QAT_SECTION_NAME_cy1_dc0_15": "TESTSHIM", - "QAT_SECTION_NAME_cy1_dc0_32": "TESTSHIM2", - }, - expectedEnvs: []string{ - "SOMEVAR", - "QAT_SECTION_NAME_cy1_dc0_0", - "QAT_SECTION_NAME_cy1_dc0_1", - }, - }, - { - name: "Wrong env variable name format", - envs: map[string]string{ - "QAT_SECTION_NAME_JUSTWRONG": "some value", - }, - expectedErr: true, - }, - } - for _, tc := range tcases { - response := new(pluginapi.AllocateResponse) - cresp := 
new(pluginapi.ContainerAllocateResponse) - cresp.Envs = tc.envs - response.ContainerResponses = append(response.ContainerResponses, cresp) - - dp := &DevicePlugin{} - - err := dp.PostAllocate(response) - - for _, key := range tc.expectedEnvs { - if _, ok := cresp.Envs[key]; !ok { - t.Errorf("Test case '%s': expcted env variable '%s' is missing", tc.name, key) - } - } - - if tc.expectedErr && err == nil { - t.Errorf("Test case '%s': expected error hasn't been triggered", tc.name) - } - - if !tc.expectedErr && err != nil { - t.Errorf("Test case '%s': Unexpected error: %+v", tc.name, err) - } - - klog.V(4).Info(response) - } -} diff --git a/cmd/qat_plugin/kerneldrv/stub.go b/cmd/qat_plugin/kerneldrv/stub.go deleted file mode 100644 index 6824efd1a..000000000 --- a/cmd/qat_plugin/kerneldrv/stub.go +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2019 Intel Corporation. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//go:build !kerneldrv -// +build !kerneldrv - -package kerneldrv - -import ( - "os" - - dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin" - "k8s.io/klog/v2" -) - -// NewDevicePlugin creates a non-functional stub for kernel mode device plugins. -func NewDevicePlugin() dpapi.Scanner { - klog.Errorf("kernel mode is not supported in this build. Use 'kerneldrv' build tag to have this mode enabled. 
Exiting...") - os.Exit(1) - - return nil -} diff --git a/cmd/qat_plugin/kerneldrv/test_data/all_is_good/c6xx_dev0.conf b/cmd/qat_plugin/kerneldrv/test_data/all_is_good/c6xx_dev0.conf deleted file mode 100644 index e63b5950e..000000000 --- a/cmd/qat_plugin/kerneldrv/test_data/all_is_good/c6xx_dev0.conf +++ /dev/null @@ -1,205 +0,0 @@ -################################################################ -# This file is provided under a dual BSD/GPLv2 license. When using or -# redistributing this file, you may do so under either license. -# -# GPL LICENSE SUMMARY -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of version 2 of the GNU General Public License as -# published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. -# The full GNU General Public License is included in this distribution -# in the file called LICENSE.GPL. -# -# Contact Information: -# Intel Corporation -# -# BSD LICENSE -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# -# version: QAT1.7.L.4.2.0-00012 -################################################################ -[GENERAL] -ServicesEnabled = cy;dc - -ConfigVersion = 2 - -#Default values for number of concurrent requests*/ -CyNumConcurrentSymRequests = 512 -CyNumConcurrentAsymRequests = 64 - -#Statistics, valid values: 1,0 -statsGeneral = 1 -statsDh = 1 -statsDrbg = 1 -statsDsa = 1 -statsEcc = 1 -statsKeyGen = 1 -statsDc = 1 -statsLn = 1 -statsPrime = 1 -statsRsa = 1 -statsSym = 1 -KptEnabled = 0 - -# Disable public key crypto and prime number -# services by specifying a value of 1 (default is 0) -PkeServiceDisabled = 0 - -# Specify size of intermediate buffers for which to -# allocate on-chip buffers. Legal values are 32 and -# 64 (default is 64). 
Specify 32 to optimize for -# compressing buffers <=32KB in size. -DcIntermediateBufferSizeInKB = 64 - -# This flag is to enable device auto reset on heartbeat error -AutoResetOnError = 0 - -############################################## -# Kernel Instances Section -############################################## -[KERNEL] -NumberCyInstances = 1 -NumberDcInstances = 1 - -# Crypto - Kernel instance #0 -Cy0Name = "IPSec0" -Cy0IsPolled = 0 -Cy0CoreAffinity = 0 - -# Data Compression - Kernel instance #0 -Dc0Name = "IPComp0" -Dc0IsPolled = 0 -Dc0CoreAffinity = 0 - -############################################## -# User Process Instance Section -############################################## -[SSL] -NumberCyInstances = 6 -NumberDcInstances = 2 -NumProcesses = 1 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "SSL0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 1 - -# Crypto - User instance #1 -Cy1Name = "SSL1" -Cy1IsPolled = 1 -# List of core affinities -Cy1CoreAffinity = 2 - -# Crypto - User instance #2 -Cy2Name = "SSL2" -Cy2IsPolled = 1 -# List of core affinities -Cy2CoreAffinity = 3 - -# Crypto - User instance #3 -Cy3Name = "SSL3" -Cy3IsPolled = 1 -# List of core affinities -Cy3CoreAffinity = 4 - -# Crypto - User instance #4 -Cy4Name = "SSL4" -Cy4IsPolled = 1 -# List of core affinities -Cy4CoreAffinity = 5 - -# Crypto - User instance #5 -Cy5Name = "SSL5" -Cy5IsPolled = 1 -# List of core affinities -Cy5CoreAffinity = 6 - - -# Data Compression - User instance #0 -Dc0Name = "Dc0" -Dc0IsPolled = 1 -# List of core affinities -Dc0CoreAffinity = 1 - -# Data Compression - User instance #1 -Dc1Name = "Dc1" -Dc1IsPolled = 1 -# List of core affinities -Dc1CoreAffinity = 2 - -[SHIM] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 2 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 - -[SHIM2] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses 
= 2 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 - -[PINNED] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 4 -LimitDevAccess = 1 -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 diff --git a/cmd/qat_plugin/kerneldrv/test_data/all_is_good/c6xx_dev1.conf b/cmd/qat_plugin/kerneldrv/test_data/all_is_good/c6xx_dev1.conf deleted file mode 100644 index df92e1d28..000000000 --- a/cmd/qat_plugin/kerneldrv/test_data/all_is_good/c6xx_dev1.conf +++ /dev/null @@ -1,193 +0,0 @@ -################################################################ -# This file is provided under a dual BSD/GPLv2 license. When using or -# redistributing this file, you may do so under either license. -# -# GPL LICENSE SUMMARY -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of version 2 of the GNU General Public License as -# published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. -# The full GNU General Public License is included in this distribution -# in the file called LICENSE.GPL. -# -# Contact Information: -# Intel Corporation -# -# BSD LICENSE -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# -# -# version: QAT1.7.L.4.2.0-00012 -################################################################ -[GENERAL] -ServicesEnabled = cy;dc - -ConfigVersion = 2 - -#Default values for number of concurrent requests*/ -CyNumConcurrentSymRequests = 512 -CyNumConcurrentAsymRequests = 64 - -#Statistics, valid values: 1,0 -statsGeneral = 1 -statsDh = 1 -statsDrbg = 1 -statsDsa = 1 -statsEcc = 1 -statsKeyGen = 1 -statsDc = 1 -statsLn = 1 -statsPrime = 1 -statsRsa = 1 -statsSym = 1 -KptEnabled = 0 - -# Disable public key crypto and prime number -# services by specifying a value of 1 (default is 0) -PkeServiceDisabled = 0 - -# Specify size of intermediate buffers for which to -# allocate on-chip buffers. Legal values are 32 and -# 64 (default is 64). Specify 32 to optimize for -# compressing buffers <=32KB in size. -DcIntermediateBufferSizeInKB = 64 - -# This flag is to enable device auto reset on heartbeat error -AutoResetOnError = 0 - -############################################## -# Kernel Instances Section -############################################## -[KERNEL] -NumberCyInstances = 1 -NumberDcInstances = 1 - -# Crypto - Kernel instance #0 -Cy0Name = "IPSec0" -Cy0IsPolled = 0 -Cy0CoreAffinity = 0 - -# Data Compression - Kernel instance #0 -Dc0Name = "IPComp0" -Dc0IsPolled = 0 -Dc0CoreAffinity = 0 - -############################################## -# User Process Instance Section -############################################## -[SSL] -NumberCyInstances = 6 -NumberDcInstances = 2 -NumProcesses = 1 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "SSL0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 9 - -# Crypto - User instance #1 -Cy1Name = "SSL1" -Cy1IsPolled = 1 -# List of core affinities -Cy1CoreAffinity = 10 - -# Crypto - User instance #2 -Cy2Name = "SSL2" -Cy2IsPolled = 1 -# List of core affinities -Cy2CoreAffinity = 11 - -# Crypto - User instance #3 -Cy3Name = "SSL3" -Cy3IsPolled = 1 -# List of core affinities -Cy3CoreAffinity = 12 - 
-# Crypto - User instance #4 -Cy4Name = "SSL4" -Cy4IsPolled = 1 -# List of core affinities -Cy4CoreAffinity = 13 - -# Crypto - User instance #5 -Cy5Name = "SSL5" -Cy5IsPolled = 1 -# List of core affinities -Cy5CoreAffinity = 14 - - -# Data Compression - User instance #0 -Dc0Name = "Dc0" -Dc0IsPolled = 1 -# List of core affinities -Dc0CoreAffinity = 9 - -# Data Compression - User instance #1 -Dc1Name = "Dc1" -Dc1IsPolled = 1 -# List of core affinities -Dc1CoreAffinity = 10 - -[SHIM] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 2 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 - -[PINNED] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 4 -LimitDevAccess = 1 -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 diff --git a/cmd/qat_plugin/kerneldrv/test_data/all_is_good/c6xx_dev2.conf b/cmd/qat_plugin/kerneldrv/test_data/all_is_good/c6xx_dev2.conf deleted file mode 100644 index d0779e5c1..000000000 --- a/cmd/qat_plugin/kerneldrv/test_data/all_is_good/c6xx_dev2.conf +++ /dev/null @@ -1,193 +0,0 @@ -################################################################ -# This file is provided under a dual BSD/GPLv2 license. When using or -# redistributing this file, you may do so under either license. -# -# GPL LICENSE SUMMARY -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of version 2 of the GNU General Public License as -# published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. 
-# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. -# The full GNU General Public License is included in this distribution -# in the file called LICENSE.GPL. -# -# Contact Information: -# Intel Corporation -# -# BSD LICENSE -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# -# -# version: QAT1.7.L.4.2.0-00012 -################################################################ -[GENERAL] -ServicesEnabled = cy;dc - -ConfigVersion = 2 - -#Default values for number of concurrent requests*/ -CyNumConcurrentSymRequests = 512 -CyNumConcurrentAsymRequests = 64 - -#Statistics, valid values: 1,0 -statsGeneral = 1 -statsDh = 1 -statsDrbg = 1 -statsDsa = 1 -statsEcc = 1 -statsKeyGen = 1 -statsDc = 1 -statsLn = 1 -statsPrime = 1 -statsRsa = 1 -statsSym = 1 -KptEnabled = 0 - -# Disable public key crypto and prime number -# services by specifying a value of 1 (default is 0) -PkeServiceDisabled = 0 - -# Specify size of intermediate buffers for which to -# allocate on-chip buffers. Legal values are 32 and -# 64 (default is 64). Specify 32 to optimize for -# compressing buffers <=32KB in size. -DcIntermediateBufferSizeInKB = 64 - -# This flag is to enable device auto reset on heartbeat error -AutoResetOnError = 0 - -############################################## -# Kernel Instances Section -############################################## -[KERNEL] -NumberCyInstances = 1 -NumberDcInstances = 1 - -# Crypto - Kernel instance #0 -Cy0Name = "IPSec0" -Cy0IsPolled = 0 -Cy0CoreAffinity = 0 - -# Data Compression - Kernel instance #0 -Dc0Name = "IPComp0" -Dc0IsPolled = 0 -Dc0CoreAffinity = 0 - -############################################## -# User Process Instance Section -############################################## -[SSL] -NumberCyInstances = 6 -NumberDcInstances = 2 -NumProcesses = 1 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "SSL0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 17 - -# Crypto - User instance #1 -Cy1Name = "SSL1" -Cy1IsPolled = 1 -# List of core affinities -Cy1CoreAffinity = 18 - -# Crypto - User instance #2 -Cy2Name = "SSL2" -Cy2IsPolled = 1 -# List of core affinities -Cy2CoreAffinity = 19 - -# Crypto - User instance #3 -Cy3Name = "SSL3" -Cy3IsPolled = 1 -# List of core affinities -Cy3CoreAffinity = 20 - 
-# Crypto - User instance #4 -Cy4Name = "SSL4" -Cy4IsPolled = 1 -# List of core affinities -Cy4CoreAffinity = 21 - -# Crypto - User instance #5 -Cy5Name = "SSL5" -Cy5IsPolled = 1 -# List of core affinities -Cy5CoreAffinity = 22 - - -# Data Compression - User instance #0 -Dc0Name = "Dc0" -Dc0IsPolled = 1 -# List of core affinities -Dc0CoreAffinity = 17 - -# Data Compression - User instance #1 -Dc1Name = "Dc1" -Dc1IsPolled = 1 -# List of core affinities -Dc1CoreAffinity = 18 - -[SHIM] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 2 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 - -[PINNED] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 4 -LimitDevAccess = 1 -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 diff --git a/cmd/qat_plugin/kerneldrv/test_data/cant_parse_num_processes/c6xx_dev0.conf b/cmd/qat_plugin/kerneldrv/test_data/cant_parse_num_processes/c6xx_dev0.conf deleted file mode 100644 index a33d7a6b5..000000000 --- a/cmd/qat_plugin/kerneldrv/test_data/cant_parse_num_processes/c6xx_dev0.conf +++ /dev/null @@ -1,5 +0,0 @@ -[SSL] -NumberCyInstances = 6 -NumberDcInstances = 2 -NumProcesses = this is error -LimitDevAccess = 0 diff --git a/cmd/qat_plugin/kerneldrv/test_data/inconsistent_limitdev/c6xx_dev0.conf b/cmd/qat_plugin/kerneldrv/test_data/inconsistent_limitdev/c6xx_dev0.conf deleted file mode 100644 index 39de58160..000000000 --- a/cmd/qat_plugin/kerneldrv/test_data/inconsistent_limitdev/c6xx_dev0.conf +++ /dev/null @@ -1,205 +0,0 @@ -################################################################ -# This file is provided under a dual BSD/GPLv2 license. When using or -# redistributing this file, you may do so under either license. -# -# GPL LICENSE SUMMARY -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. 
-# -# This program is free software; you can redistribute it and/or modify -# it under the terms of version 2 of the GNU General Public License as -# published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. -# The full GNU General Public License is included in this distribution -# in the file called LICENSE.GPL. -# -# Contact Information: -# Intel Corporation -# -# BSD LICENSE -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# -# version: QAT1.7.L.4.2.0-00012 -################################################################ -[GENERAL] -ServicesEnabled = cy;dc - -ConfigVersion = 2 - -#Default values for number of concurrent requests*/ -CyNumConcurrentSymRequests = 512 -CyNumConcurrentAsymRequests = 64 - -#Statistics, valid values: 1,0 -statsGeneral = 1 -statsDh = 1 -statsDrbg = 1 -statsDsa = 1 -statsEcc = 1 -statsKeyGen = 1 -statsDc = 1 -statsLn = 1 -statsPrime = 1 -statsRsa = 1 -statsSym = 1 -KptEnabled = 0 - -# Disable public key crypto and prime number -# services by specifying a value of 1 (default is 0) -PkeServiceDisabled = 0 - -# Specify size of intermediate buffers for which to -# allocate on-chip buffers. Legal values are 32 and -# 64 (default is 64). Specify 32 to optimize for -# compressing buffers <=32KB in size. 
-DcIntermediateBufferSizeInKB = 64 - -# This flag is to enable device auto reset on heartbeat error -AutoResetOnError = 0 - -############################################## -# Kernel Instances Section -############################################## -[KERNEL] -NumberCyInstances = 1 -NumberDcInstances = 1 - -# Crypto - Kernel instance #0 -Cy0Name = "IPSec0" -Cy0IsPolled = 0 -Cy0CoreAffinity = 0 - -# Data Compression - Kernel instance #0 -Dc0Name = "IPComp0" -Dc0IsPolled = 0 -Dc0CoreAffinity = 0 - -############################################## -# User Process Instance Section -############################################## -[SSL] -NumberCyInstances = 6 -NumberDcInstances = 2 -NumProcesses = 1 -LimitDevAccess = 1 - -# Crypto - User instance #0 -Cy0Name = "SSL0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 1 - -# Crypto - User instance #1 -Cy1Name = "SSL1" -Cy1IsPolled = 1 -# List of core affinities -Cy1CoreAffinity = 2 - -# Crypto - User instance #2 -Cy2Name = "SSL2" -Cy2IsPolled = 1 -# List of core affinities -Cy2CoreAffinity = 3 - -# Crypto - User instance #3 -Cy3Name = "SSL3" -Cy3IsPolled = 1 -# List of core affinities -Cy3CoreAffinity = 4 - -# Crypto - User instance #4 -Cy4Name = "SSL4" -Cy4IsPolled = 1 -# List of core affinities -Cy4CoreAffinity = 5 - -# Crypto - User instance #5 -Cy5Name = "SSL5" -Cy5IsPolled = 1 -# List of core affinities -Cy5CoreAffinity = 6 - - -# Data Compression - User instance #0 -Dc0Name = "Dc0" -Dc0IsPolled = 1 -# List of core affinities -Dc0CoreAffinity = 1 - -# Data Compression - User instance #1 -Dc1Name = "Dc1" -Dc1IsPolled = 1 -# List of core affinities -Dc1CoreAffinity = 2 - -[SHIM] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 2 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 - -[SHIM2] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 2 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = 
"UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 - -[PINNED] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 4 -LimitDevAccess = 1 -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 diff --git a/cmd/qat_plugin/kerneldrv/test_data/inconsistent_limitdev/c6xx_dev1.conf b/cmd/qat_plugin/kerneldrv/test_data/inconsistent_limitdev/c6xx_dev1.conf deleted file mode 100644 index df92e1d28..000000000 --- a/cmd/qat_plugin/kerneldrv/test_data/inconsistent_limitdev/c6xx_dev1.conf +++ /dev/null @@ -1,193 +0,0 @@ -################################################################ -# This file is provided under a dual BSD/GPLv2 license. When using or -# redistributing this file, you may do so under either license. -# -# GPL LICENSE SUMMARY -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of version 2 of the GNU General Public License as -# published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. -# The full GNU General Public License is included in this distribution -# in the file called LICENSE.GPL. -# -# Contact Information: -# Intel Corporation -# -# BSD LICENSE -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# -# -# version: QAT1.7.L.4.2.0-00012 -################################################################ -[GENERAL] -ServicesEnabled = cy;dc - -ConfigVersion = 2 - -#Default values for number of concurrent requests*/ -CyNumConcurrentSymRequests = 512 -CyNumConcurrentAsymRequests = 64 - -#Statistics, valid values: 1,0 -statsGeneral = 1 -statsDh = 1 -statsDrbg = 1 -statsDsa = 1 -statsEcc = 1 -statsKeyGen = 1 -statsDc = 1 -statsLn = 1 -statsPrime = 1 -statsRsa = 1 -statsSym = 1 -KptEnabled = 0 - -# Disable public key crypto and prime number -# services by specifying a value of 1 (default is 0) -PkeServiceDisabled = 0 - -# Specify size of intermediate buffers for which to -# allocate on-chip buffers. Legal values are 32 and -# 64 (default is 64). Specify 32 to optimize for -# compressing buffers <=32KB in size. -DcIntermediateBufferSizeInKB = 64 - -# This flag is to enable device auto reset on heartbeat error -AutoResetOnError = 0 - -############################################## -# Kernel Instances Section -############################################## -[KERNEL] -NumberCyInstances = 1 -NumberDcInstances = 1 - -# Crypto - Kernel instance #0 -Cy0Name = "IPSec0" -Cy0IsPolled = 0 -Cy0CoreAffinity = 0 - -# Data Compression - Kernel instance #0 -Dc0Name = "IPComp0" -Dc0IsPolled = 0 -Dc0CoreAffinity = 0 - -############################################## -# User Process Instance Section -############################################## -[SSL] -NumberCyInstances = 6 -NumberDcInstances = 2 -NumProcesses = 1 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "SSL0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 9 - -# Crypto - User instance #1 -Cy1Name = "SSL1" -Cy1IsPolled = 1 -# List of core affinities -Cy1CoreAffinity = 10 - -# Crypto - User instance #2 -Cy2Name = "SSL2" -Cy2IsPolled = 1 -# List of core affinities -Cy2CoreAffinity = 11 - -# Crypto - User instance #3 -Cy3Name = "SSL3" -Cy3IsPolled = 1 -# List of core affinities -Cy3CoreAffinity = 12 - 
-# Crypto - User instance #4 -Cy4Name = "SSL4" -Cy4IsPolled = 1 -# List of core affinities -Cy4CoreAffinity = 13 - -# Crypto - User instance #5 -Cy5Name = "SSL5" -Cy5IsPolled = 1 -# List of core affinities -Cy5CoreAffinity = 14 - - -# Data Compression - User instance #0 -Dc0Name = "Dc0" -Dc0IsPolled = 1 -# List of core affinities -Dc0CoreAffinity = 9 - -# Data Compression - User instance #1 -Dc1Name = "Dc1" -Dc1IsPolled = 1 -# List of core affinities -Dc1CoreAffinity = 10 - -[SHIM] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 2 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 - -[PINNED] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 4 -LimitDevAccess = 1 -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 diff --git a/cmd/qat_plugin/kerneldrv/test_data/inconsistent_limitdev/c6xx_dev2.conf b/cmd/qat_plugin/kerneldrv/test_data/inconsistent_limitdev/c6xx_dev2.conf deleted file mode 100644 index d0779e5c1..000000000 --- a/cmd/qat_plugin/kerneldrv/test_data/inconsistent_limitdev/c6xx_dev2.conf +++ /dev/null @@ -1,193 +0,0 @@ -################################################################ -# This file is provided under a dual BSD/GPLv2 license. When using or -# redistributing this file, you may do so under either license. -# -# GPL LICENSE SUMMARY -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of version 2 of the GNU General Public License as -# published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. 
-# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. -# The full GNU General Public License is included in this distribution -# in the file called LICENSE.GPL. -# -# Contact Information: -# Intel Corporation -# -# BSD LICENSE -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# -# -# version: QAT1.7.L.4.2.0-00012 -################################################################ -[GENERAL] -ServicesEnabled = cy;dc - -ConfigVersion = 2 - -#Default values for number of concurrent requests*/ -CyNumConcurrentSymRequests = 512 -CyNumConcurrentAsymRequests = 64 - -#Statistics, valid values: 1,0 -statsGeneral = 1 -statsDh = 1 -statsDrbg = 1 -statsDsa = 1 -statsEcc = 1 -statsKeyGen = 1 -statsDc = 1 -statsLn = 1 -statsPrime = 1 -statsRsa = 1 -statsSym = 1 -KptEnabled = 0 - -# Disable public key crypto and prime number -# services by specifying a value of 1 (default is 0) -PkeServiceDisabled = 0 - -# Specify size of intermediate buffers for which to -# allocate on-chip buffers. Legal values are 32 and -# 64 (default is 64). Specify 32 to optimize for -# compressing buffers <=32KB in size. -DcIntermediateBufferSizeInKB = 64 - -# This flag is to enable device auto reset on heartbeat error -AutoResetOnError = 0 - -############################################## -# Kernel Instances Section -############################################## -[KERNEL] -NumberCyInstances = 1 -NumberDcInstances = 1 - -# Crypto - Kernel instance #0 -Cy0Name = "IPSec0" -Cy0IsPolled = 0 -Cy0CoreAffinity = 0 - -# Data Compression - Kernel instance #0 -Dc0Name = "IPComp0" -Dc0IsPolled = 0 -Dc0CoreAffinity = 0 - -############################################## -# User Process Instance Section -############################################## -[SSL] -NumberCyInstances = 6 -NumberDcInstances = 2 -NumProcesses = 1 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "SSL0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 17 - -# Crypto - User instance #1 -Cy1Name = "SSL1" -Cy1IsPolled = 1 -# List of core affinities -Cy1CoreAffinity = 18 - -# Crypto - User instance #2 -Cy2Name = "SSL2" -Cy2IsPolled = 1 -# List of core affinities -Cy2CoreAffinity = 19 - -# Crypto - User instance #3 -Cy3Name = "SSL3" -Cy3IsPolled = 1 -# List of core affinities -Cy3CoreAffinity = 20 - 
-# Crypto - User instance #4 -Cy4Name = "SSL4" -Cy4IsPolled = 1 -# List of core affinities -Cy4CoreAffinity = 21 - -# Crypto - User instance #5 -Cy5Name = "SSL5" -Cy5IsPolled = 1 -# List of core affinities -Cy5CoreAffinity = 22 - - -# Data Compression - User instance #0 -Dc0Name = "Dc0" -Dc0IsPolled = 1 -# List of core affinities -Dc0CoreAffinity = 17 - -# Data Compression - User instance #1 -Dc1Name = "Dc1" -Dc1IsPolled = 1 -# List of core affinities -Dc1CoreAffinity = 18 - -[SHIM] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 2 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 - -[PINNED] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 4 -LimitDevAccess = 1 -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 diff --git a/cmd/qat_plugin/kerneldrv/test_data/missing_pinned_section/c6xx_dev0.conf b/cmd/qat_plugin/kerneldrv/test_data/missing_pinned_section/c6xx_dev0.conf deleted file mode 100644 index 3047702b2..000000000 --- a/cmd/qat_plugin/kerneldrv/test_data/missing_pinned_section/c6xx_dev0.conf +++ /dev/null @@ -1,195 +0,0 @@ -################################################################ -# This file is provided under a dual BSD/GPLv2 license. When using or -# redistributing this file, you may do so under either license. -# -# GPL LICENSE SUMMARY -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of version 2 of the GNU General Public License as -# published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. 
-# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. -# The full GNU General Public License is included in this distribution -# in the file called LICENSE.GPL. -# -# Contact Information: -# Intel Corporation -# -# BSD LICENSE -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# -# -# version: QAT1.7.L.4.2.0-00012 -################################################################ -[GENERAL] -ServicesEnabled = cy;dc - -ConfigVersion = 2 - -#Default values for number of concurrent requests*/ -CyNumConcurrentSymRequests = 512 -CyNumConcurrentAsymRequests = 64 - -#Statistics, valid values: 1,0 -statsGeneral = 1 -statsDh = 1 -statsDrbg = 1 -statsDsa = 1 -statsEcc = 1 -statsKeyGen = 1 -statsDc = 1 -statsLn = 1 -statsPrime = 1 -statsRsa = 1 -statsSym = 1 -KptEnabled = 0 - -# Disable public key crypto and prime number -# services by specifying a value of 1 (default is 0) -PkeServiceDisabled = 0 - -# Specify size of intermediate buffers for which to -# allocate on-chip buffers. Legal values are 32 and -# 64 (default is 64). Specify 32 to optimize for -# compressing buffers <=32KB in size. -DcIntermediateBufferSizeInKB = 64 - -# This flag is to enable device auto reset on heartbeat error -AutoResetOnError = 0 - -############################################## -# Kernel Instances Section -############################################## -[KERNEL] -NumberCyInstances = 1 -NumberDcInstances = 1 - -# Crypto - Kernel instance #0 -Cy0Name = "IPSec0" -Cy0IsPolled = 0 -Cy0CoreAffinity = 0 - -# Data Compression - Kernel instance #0 -Dc0Name = "IPComp0" -Dc0IsPolled = 0 -Dc0CoreAffinity = 0 - -############################################## -# User Process Instance Section -############################################## -[SSL] -NumberCyInstances = 6 -NumberDcInstances = 2 -NumProcesses = 1 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "SSL0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 1 - -# Crypto - User instance #1 -Cy1Name = "SSL1" -Cy1IsPolled = 1 -# List of core affinities -Cy1CoreAffinity = 2 - -# Crypto - User instance #2 -Cy2Name = "SSL2" -Cy2IsPolled = 1 -# List of core affinities -Cy2CoreAffinity = 3 - -# Crypto - User instance #3 -Cy3Name = "SSL3" -Cy3IsPolled = 1 -# List of core affinities -Cy3CoreAffinity = 4 - -# 
Crypto - User instance #4 -Cy4Name = "SSL4" -Cy4IsPolled = 1 -# List of core affinities -Cy4CoreAffinity = 5 - -# Crypto - User instance #5 -Cy5Name = "SSL5" -Cy5IsPolled = 1 -# List of core affinities -Cy5CoreAffinity = 6 - - -# Data Compression - User instance #0 -Dc0Name = "Dc0" -Dc0IsPolled = 1 -# List of core affinities -Dc0CoreAffinity = 1 - -# Data Compression - User instance #1 -Dc1Name = "Dc1" -Dc1IsPolled = 1 -# List of core affinities -Dc1CoreAffinity = 2 - -[SHIM] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 2 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 - -[SHIM2] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 2 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 - diff --git a/cmd/qat_plugin/kerneldrv/test_data/missing_pinned_section/c6xx_dev1.conf b/cmd/qat_plugin/kerneldrv/test_data/missing_pinned_section/c6xx_dev1.conf deleted file mode 100644 index df92e1d28..000000000 --- a/cmd/qat_plugin/kerneldrv/test_data/missing_pinned_section/c6xx_dev1.conf +++ /dev/null @@ -1,193 +0,0 @@ -################################################################ -# This file is provided under a dual BSD/GPLv2 license. When using or -# redistributing this file, you may do so under either license. -# -# GPL LICENSE SUMMARY -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of version 2 of the GNU General Public License as -# published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. 
-# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. -# The full GNU General Public License is included in this distribution -# in the file called LICENSE.GPL. -# -# Contact Information: -# Intel Corporation -# -# BSD LICENSE -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# -# -# version: QAT1.7.L.4.2.0-00012 -################################################################ -[GENERAL] -ServicesEnabled = cy;dc - -ConfigVersion = 2 - -#Default values for number of concurrent requests*/ -CyNumConcurrentSymRequests = 512 -CyNumConcurrentAsymRequests = 64 - -#Statistics, valid values: 1,0 -statsGeneral = 1 -statsDh = 1 -statsDrbg = 1 -statsDsa = 1 -statsEcc = 1 -statsKeyGen = 1 -statsDc = 1 -statsLn = 1 -statsPrime = 1 -statsRsa = 1 -statsSym = 1 -KptEnabled = 0 - -# Disable public key crypto and prime number -# services by specifying a value of 1 (default is 0) -PkeServiceDisabled = 0 - -# Specify size of intermediate buffers for which to -# allocate on-chip buffers. Legal values are 32 and -# 64 (default is 64). Specify 32 to optimize for -# compressing buffers <=32KB in size. -DcIntermediateBufferSizeInKB = 64 - -# This flag is to enable device auto reset on heartbeat error -AutoResetOnError = 0 - -############################################## -# Kernel Instances Section -############################################## -[KERNEL] -NumberCyInstances = 1 -NumberDcInstances = 1 - -# Crypto - Kernel instance #0 -Cy0Name = "IPSec0" -Cy0IsPolled = 0 -Cy0CoreAffinity = 0 - -# Data Compression - Kernel instance #0 -Dc0Name = "IPComp0" -Dc0IsPolled = 0 -Dc0CoreAffinity = 0 - -############################################## -# User Process Instance Section -############################################## -[SSL] -NumberCyInstances = 6 -NumberDcInstances = 2 -NumProcesses = 1 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "SSL0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 9 - -# Crypto - User instance #1 -Cy1Name = "SSL1" -Cy1IsPolled = 1 -# List of core affinities -Cy1CoreAffinity = 10 - -# Crypto - User instance #2 -Cy2Name = "SSL2" -Cy2IsPolled = 1 -# List of core affinities -Cy2CoreAffinity = 11 - -# Crypto - User instance #3 -Cy3Name = "SSL3" -Cy3IsPolled = 1 -# List of core affinities -Cy3CoreAffinity = 12 - 
-# Crypto - User instance #4 -Cy4Name = "SSL4" -Cy4IsPolled = 1 -# List of core affinities -Cy4CoreAffinity = 13 - -# Crypto - User instance #5 -Cy5Name = "SSL5" -Cy5IsPolled = 1 -# List of core affinities -Cy5CoreAffinity = 14 - - -# Data Compression - User instance #0 -Dc0Name = "Dc0" -Dc0IsPolled = 1 -# List of core affinities -Dc0CoreAffinity = 9 - -# Data Compression - User instance #1 -Dc1Name = "Dc1" -Dc1IsPolled = 1 -# List of core affinities -Dc1CoreAffinity = 10 - -[SHIM] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 2 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 - -[PINNED] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 4 -LimitDevAccess = 1 -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 diff --git a/cmd/qat_plugin/kerneldrv/test_data/missing_pinned_section/c6xx_dev2.conf b/cmd/qat_plugin/kerneldrv/test_data/missing_pinned_section/c6xx_dev2.conf deleted file mode 100644 index d0779e5c1..000000000 --- a/cmd/qat_plugin/kerneldrv/test_data/missing_pinned_section/c6xx_dev2.conf +++ /dev/null @@ -1,193 +0,0 @@ -################################################################ -# This file is provided under a dual BSD/GPLv2 license. When using or -# redistributing this file, you may do so under either license. -# -# GPL LICENSE SUMMARY -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of version 2 of the GNU General Public License as -# published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. 
-# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. -# The full GNU General Public License is included in this distribution -# in the file called LICENSE.GPL. -# -# Contact Information: -# Intel Corporation -# -# BSD LICENSE -# -# Copyright(c) 2007-2018 Intel Corporation. All rights reserved. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# -# -# version: QAT1.7.L.4.2.0-00012 -################################################################ -[GENERAL] -ServicesEnabled = cy;dc - -ConfigVersion = 2 - -#Default values for number of concurrent requests*/ -CyNumConcurrentSymRequests = 512 -CyNumConcurrentAsymRequests = 64 - -#Statistics, valid values: 1,0 -statsGeneral = 1 -statsDh = 1 -statsDrbg = 1 -statsDsa = 1 -statsEcc = 1 -statsKeyGen = 1 -statsDc = 1 -statsLn = 1 -statsPrime = 1 -statsRsa = 1 -statsSym = 1 -KptEnabled = 0 - -# Disable public key crypto and prime number -# services by specifying a value of 1 (default is 0) -PkeServiceDisabled = 0 - -# Specify size of intermediate buffers for which to -# allocate on-chip buffers. Legal values are 32 and -# 64 (default is 64). Specify 32 to optimize for -# compressing buffers <=32KB in size. -DcIntermediateBufferSizeInKB = 64 - -# This flag is to enable device auto reset on heartbeat error -AutoResetOnError = 0 - -############################################## -# Kernel Instances Section -############################################## -[KERNEL] -NumberCyInstances = 1 -NumberDcInstances = 1 - -# Crypto - Kernel instance #0 -Cy0Name = "IPSec0" -Cy0IsPolled = 0 -Cy0CoreAffinity = 0 - -# Data Compression - Kernel instance #0 -Dc0Name = "IPComp0" -Dc0IsPolled = 0 -Dc0CoreAffinity = 0 - -############################################## -# User Process Instance Section -############################################## -[SSL] -NumberCyInstances = 6 -NumberDcInstances = 2 -NumProcesses = 1 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "SSL0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 17 - -# Crypto - User instance #1 -Cy1Name = "SSL1" -Cy1IsPolled = 1 -# List of core affinities -Cy1CoreAffinity = 18 - -# Crypto - User instance #2 -Cy2Name = "SSL2" -Cy2IsPolled = 1 -# List of core affinities -Cy2CoreAffinity = 19 - -# Crypto - User instance #3 -Cy3Name = "SSL3" -Cy3IsPolled = 1 -# List of core affinities -Cy3CoreAffinity = 20 - 
-# Crypto - User instance #4 -Cy4Name = "SSL4" -Cy4IsPolled = 1 -# List of core affinities -Cy4CoreAffinity = 21 - -# Crypto - User instance #5 -Cy5Name = "SSL5" -Cy5IsPolled = 1 -# List of core affinities -Cy5CoreAffinity = 22 - - -# Data Compression - User instance #0 -Dc0Name = "Dc0" -Dc0IsPolled = 1 -# List of core affinities -Dc0CoreAffinity = 17 - -# Data Compression - User instance #1 -Dc1Name = "Dc1" -Dc1IsPolled = 1 -# List of core affinities -Dc1CoreAffinity = 18 - -[SHIM] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 2 -LimitDevAccess = 0 - -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 - -[PINNED] -NumberCyInstances = 1 -NumberDcInstances = 0 -NumProcesses = 4 -LimitDevAccess = 1 -# Crypto - User instance #0 -Cy0Name = "UserCY0" -Cy0IsPolled = 1 -# List of core affinities -Cy0CoreAffinity = 0 diff --git a/cmd/qat_plugin/qat_plugin.go b/cmd/qat_plugin/qat_plugin.go index c93ef39c6..f60f45583 100644 --- a/cmd/qat_plugin/qat_plugin.go +++ b/cmd/qat_plugin/qat_plugin.go @@ -19,10 +19,7 @@ import ( "fmt" "os" - "github.com/pkg/errors" - "github.com/intel/intel-device-plugins-for-kubernetes/cmd/qat_plugin/dpdkdrv" - "github.com/intel/intel-device-plugins-for-kubernetes/cmd/qat_plugin/kerneldrv" "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin" "k8s.io/klog/v2" ) @@ -37,29 +34,19 @@ func main() { err error ) - mode := flag.String("mode", "dpdk", "plugin mode which can be either dpdk (default) or kernel") - dpdkDriver := flag.String("dpdk-driver", "vfio-pci", "DPDK Device driver for configuring the QAT device") kernelVfDrivers := flag.String("kernel-vf-drivers", "4xxxvf,420xxvf", "Comma separated VF Device Driver of the QuickAssist Devices in the system. 
Devices supported: DH895xCC, C62x, C3xxx, C4xxx, 4xxx, 420xxx, and D15xx") preferredAllocationPolicy := flag.String("allocation-policy", "", "Modes of allocating QAT devices: balanced and packed") maxNumDevices := flag.Int("max-num-devices", 64, "maximum number of QAT devices to be provided to the QuickAssist device plugin") flag.Parse() - switch *mode { - case "dpdk": - plugin, err = dpdkdrv.NewDevicePlugin(*maxNumDevices, *kernelVfDrivers, *dpdkDriver, *preferredAllocationPolicy) - case "kernel": - plugin = kerneldrv.NewDevicePlugin() - default: - err = errors.Errorf("Unknown mode: %s", *mode) - } - + plugin, err = dpdkdrv.NewDevicePlugin(*maxNumDevices, *kernelVfDrivers, *dpdkDriver, *preferredAllocationPolicy) if err != nil { fmt.Println(err.Error()) os.Exit(1) } - klog.V(1).Infof("QAT device plugin started in '%s' mode", *mode) + klog.V(1).Infof("QAT device plugin started") manager := deviceplugin.NewManager(namespace, plugin) diff --git a/cmd/xpumanager_sidecar/README.md b/cmd/xpumanager_sidecar/README.md index b5ecf7aef..6d3d33e49 100644 --- a/cmd/xpumanager_sidecar/README.md +++ b/cmd/xpumanager_sidecar/README.md @@ -1,136 +1,3 @@ # XeLink sidecar for Intel XPU Manager -Table of Contents - -* [Introduction](#introduction) -* [Modes and Configuration Options](#modes-and-configuration-options) -* [Installation](#installation) - * [Install XPU Manager with the Sidecar](#install-xpu-manager-with-the-sidecar) - * [Install Sidecar to an Existing XPU Manager](#install-sidecar-to-an-existing-xpu-manager) -* [Verify Sidecar Functionality](#verify-sidecar-functionality) -* [Use HTTPS with XPU Manager](#use-https-with-xpu-manager) - -## Introduction - -Intel GPUs can be interconnected via an XeLink. In some workloads it is beneficial to use GPUs that are XeLinked together for optimal performance. XeLink information is provided by [Intel XPU Manager](https://www.github.com/intel/xpumanager) via its metrics API. 
Xelink sidecar retrieves the information from XPU Manager and stores it on the node under ```/etc/kubernetes/node-feature-discovery/features.d/``` as a feature label file. [NFD](https://github.com/kubernetes-sigs/node-feature-discovery) reads this file and converts it to Kubernetes node labels. These labels are then used by [GAS](https://github.com/intel/platform-aware-scheduling/tree/master/gpu-aware-scheduling) to make [scheduling decisions](https://github.com/intel/platform-aware-scheduling/blob/master/gpu-aware-scheduling/docs/usage.md#multi-gpu-allocation-with-xe-link-connections) for Pods. - -## Modes and Configuration Options - -| Flag | Argument | Default | Meaning | -|:---- |:-------- |:------- |:------- | -| -lane-count | int | 4 | Minimum lane count for an XeLink interconnect to be accepted | -| -interval | int | 10 | Interval for XeLink topology fetching and label writing (seconds, >= 1) | -| -startup-delay | int | 10 | Startup delay before the first topology fetching (seconds, >= 0) | -| -label-namespace | string | gpu.intel.com | Namespace or prefix for the labels. i.e. **gpu.intel.com**/xe-links | -| -allow-subdeviceless-links | bool | false | Include xelinks also for devices that do not have subdevices | -| -cert | string | "" | Use HTTPS and verify server's endpoint | - -The sidecar also accepts a number of other arguments. Please use the -h option to see the complete list of options. - -## Installation - -The following sections detail how to obtain, deploy and test the XPU Manager XeLink sidecar. - -### Pre-built Images - -[Pre-built images](https://hub.docker.com/r/intel/intel-xpumanager-sidecar) -of this component are available on the Docker hub. These images are automatically built and uploaded -to the hub from the latest main branch of this repository. 
- -Release tagged images of the components are also available on the Docker hub, tagged with their -release version numbers in the format `x.y.z`, corresponding to the branches and releases in this -repository. - -Note: Replace `` with the desired [release tag](https://github.com/intel/intel-device-plugins-for-kubernetes/tags) or `main` to get `devel` images. - -See [the development guide](../../DEVEL.md) for details if you want to deploy a customized version of the plugin. - -#### Install XPU Manager with the Sidecar - -Install XPU Manager daemonset with the XeLink sidecar - -```bash -$ kubectl apply -k 'https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/xpumanager_sidecar/overlays/http?ref=' -``` - -Please see XPU Manager Kubernetes files for additional info on [installation](https://github.com/intel/xpumanager/tree/master/deployment/kubernetes). - -#### Install Sidecar to an Existing XPU Manager - -Use patch to add sidecar into the XPU Manager daemonset. - -```bash -$ kubectl patch daemonsets.apps intel-xpumanager --patch-file 'https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/xpumanager_sidecar/overlays/http/xpumanager.yaml?ref=' -``` - -NOTE: The sidecar patch will remove other resources from the XPU Manager container. If your XPU Manager daemonset is using, for example, the smarter device manager resources, those will be removed. - -### Verify Sidecar Functionality - -You can verify the sidecar's functionality by checking node's xe-links labels: - -```bash -$ kubectl get nodes -A -o=jsonpath="{range .items[*]}{.metadata.name},{.metadata.labels.gpu\.intel\.com\/xe-links}{'\n'}{end}" -master,0.0-1.0_0.1-1.1 -``` - -### Use HTTPS with XPU Manager - -There is an alternative deployment that uses HTTPS instead of HTTP. The reference deployment requires `cert-manager` to provide a certificate for TLS. 
To deploy: - -```bash -$ kubectl apply -k 'https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/xpumanager_sidecar/overlays/cert-manager?ref=' -``` - -The deployment requests a certificate and key from `cert-manager`. They are then provided to the gunicorn container as secrets and are used in the HTTPS interface. The sidecar container uses the same certificate to verify the server. - -> *NOTE*: The HTTPS deployment uses self-signed certificates. For production use, the certificates should be properly set up. - -
-Enabling HTTPS manually - -If one doesn't want to use `cert-manager`, the same can be achieved manually by creating certificates with openssl and then adding it to the deployment. The steps are roughly: -1) Create a certificate with [openssl](https://www.linode.com/docs/guides/create-a-self-signed-tls-certificate/) -1) Create a secret from the [certificate & key](https://kubernetes.io/docs/reference/kubectl/generated/kubectl_create/kubectl_create_secret_tls/). -1) Change the deployment: - -* Add certificate and key to gunicorn container: -``` - - command: - - gunicorn -... - - --certfile=/certs/tls.crt - - --keyfile=/certs/tls.key -... - - xpum_rest_main:main() -``` - -* Add secret mounting to the Pod: -``` - containers: - - name: python-exporter - volumeMounts: - - mountPath: /certs - name: certs - readOnly: true - volumes: - - name: certs - secret: - defaultMode: 420 - secretName: xpum-server-cert - ``` - -* Add use-https and cert to sidecar -``` - name: xelink-sidecar - volumeMounts: - - mountPath: /certs - name: certs - readOnly: true - args: -... - - --cert=/certs/tls.crt -... -``` - -
+Use of XeLink sidecar is deprecated as GAS has been deprecated. The sources are left for future use. diff --git a/deployments/gpu_plugin/overlays/fractional_resources/add-args.yaml b/deployments/gpu_plugin/overlays/fractional_resources/add-args.yaml deleted file mode 100644 index 033f5ff00..000000000 --- a/deployments/gpu_plugin/overlays/fractional_resources/add-args.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: intel-gpu-plugin -spec: - template: - spec: - containers: - - name: intel-gpu-plugin - args: - - "-shared-dev-num=300" - - "-resource-manager" - - "-enable-monitoring" diff --git a/deployments/gpu_plugin/overlays/fractional_resources/add-mounts.yaml b/deployments/gpu_plugin/overlays/fractional_resources/add-mounts.yaml deleted file mode 100644 index 797bdb82f..000000000 --- a/deployments/gpu_plugin/overlays/fractional_resources/add-mounts.yaml +++ /dev/null @@ -1,34 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: intel-gpu-plugin -spec: - template: - spec: - containers: - - name: intel-gpu-plugin - volumeMounts: - - name: kubeletcrt - mountPath: /var/lib/kubelet/pki/kubelet.crt - - mountPath: /etc/kubernetes/node-feature-discovery/features.d/ - name: nfd-features - - mountPath: /sys/devices - name: sysfsdevices - readOnly: true - - name: podresources - mountPath: /var/lib/kubelet/pod-resources - volumes: - - name: kubeletcrt - hostPath: - path: /var/lib/kubelet/pki/kubelet.crt - type: FileOrCreate - - name: sysfsdevices - hostPath: - path: /sys/devices - - name: nfd-features - hostPath: - path: /etc/kubernetes/node-feature-discovery/features.d/ - type: DirectoryOrCreate - - name: podresources - hostPath: - path: /var/lib/kubelet/pod-resources diff --git a/deployments/gpu_plugin/overlays/fractional_resources/add-nodeselector-intel-gpu.yaml b/deployments/gpu_plugin/overlays/fractional_resources/add-nodeselector-intel-gpu.yaml deleted file mode 100644 index 5f0fe703a..000000000 --- 
a/deployments/gpu_plugin/overlays/fractional_resources/add-nodeselector-intel-gpu.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: intel-gpu-plugin -spec: - template: - spec: - nodeSelector: - intel.feature.node.kubernetes.io/gpu: "true" diff --git a/deployments/gpu_plugin/overlays/fractional_resources/add-serviceaccount.yaml b/deployments/gpu_plugin/overlays/fractional_resources/add-serviceaccount.yaml deleted file mode 100644 index 396363a2e..000000000 --- a/deployments/gpu_plugin/overlays/fractional_resources/add-serviceaccount.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: intel-gpu-plugin -spec: - template: - spec: - serviceAccountName: gpu-manager-sa diff --git a/deployments/gpu_plugin/overlays/fractional_resources/gpu-manager-role.yaml b/deployments/gpu_plugin/overlays/fractional_resources/gpu-manager-role.yaml deleted file mode 100644 index 61db88233..000000000 --- a/deployments/gpu_plugin/overlays/fractional_resources/gpu-manager-role.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: gpu-manager-role -rules: -- apiGroups: [""] - resources: ["pods", "nodes/proxy"] - verbs: ["list", "get"] diff --git a/deployments/gpu_plugin/overlays/fractional_resources/gpu-manager-rolebinding.yaml b/deployments/gpu_plugin/overlays/fractional_resources/gpu-manager-rolebinding.yaml deleted file mode 100644 index 444fc67b3..000000000 --- a/deployments/gpu_plugin/overlays/fractional_resources/gpu-manager-rolebinding.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: gpu-manager-rolebinding -subjects: -- kind: ServiceAccount - name: gpu-manager-sa - namespace: default -roleRef: - kind: ClusterRole - name: gpu-manager-role - apiGroup: rbac.authorization.k8s.io diff --git a/deployments/gpu_plugin/overlays/fractional_resources/gpu-manager-sa.yaml 
b/deployments/gpu_plugin/overlays/fractional_resources/gpu-manager-sa.yaml deleted file mode 100644 index 76b459d05..000000000 --- a/deployments/gpu_plugin/overlays/fractional_resources/gpu-manager-sa.yaml +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: gpu-manager-sa diff --git a/deployments/gpu_plugin/overlays/fractional_resources/kustomization.yaml b/deployments/gpu_plugin/overlays/fractional_resources/kustomization.yaml deleted file mode 100644 index d5e347974..000000000 --- a/deployments/gpu_plugin/overlays/fractional_resources/kustomization.yaml +++ /dev/null @@ -1,18 +0,0 @@ -resources: - - ../../base - - gpu-manager-rolebinding.yaml - - gpu-manager-role.yaml - - gpu-manager-sa.yaml -patches: - - path: add-serviceaccount.yaml - target: - kind: DaemonSet - - path: add-mounts.yaml - target: - kind: DaemonSet - - path: add-args.yaml - target: - kind: DaemonSet - - path: add-nodeselector-intel-gpu.yaml - target: - kind: DaemonSet diff --git a/deployments/nfd/overlays/node-feature-rules/platform-labeling-rules.yaml b/deployments/nfd/overlays/node-feature-rules/platform-labeling-rules.yaml index 602cd8619..6f1932f86 100644 --- a/deployments/nfd/overlays/node-feature-rules/platform-labeling-rules.yaml +++ b/deployments/nfd/overlays/node-feature-rules/platform-labeling-rules.yaml @@ -4,17 +4,6 @@ metadata: name: intel-gpu-platform-labeling spec: rules: - - extendedResources: - gpu.intel.com/millicores: "@local.label.gpu.intel.com/millicores" - gpu.intel.com/memory.max: "@local.label.gpu.intel.com/memory.max" - gpu.intel.com/tiles: "@local.label.gpu.intel.com/tiles" - matchFeatures: - - feature: local.label - matchExpressions: - gpu.intel.com/millicores: {op: Exists} - gpu.intel.com/memory.max: {op: Exists} - gpu.intel.com/tiles: {op: Exists} - name: intel.gpu.fractionalresources # generic rule for older and upcoming devices - labelsTemplate: | {{ range .pci.device }}gpu.intel.com/device-id.{{ .class }}-{{ .device 
}}.present=true diff --git a/deployments/operator/crd/bases/deviceplugin.intel.com_gpudeviceplugins.yaml b/deployments/operator/crd/bases/deviceplugin.intel.com_gpudeviceplugins.yaml index bb44a8359..813abc6f4 100644 --- a/deployments/operator/crd/bases/deviceplugin.intel.com_gpudeviceplugins.yaml +++ b/deployments/operator/crd/bases/deviceplugin.intel.com_gpudeviceplugins.yaml @@ -81,16 +81,11 @@ spec: description: |- PreferredAllocationPolicy sets the mode of allocating GPU devices on a node. See documentation for detailed description of the policies. Only valid when SharedDevNum > 1 is set. - Not applicable with ResourceManager. enum: - balanced - packed - none type: string - resourceManager: - description: ResourceManager handles the fractional resource management - for multi-GPU nodes. Enable only for clusters with GPU Aware Scheduling. - type: boolean sharedDevNum: description: SharedDevNum is a number of containers that can share the same GPU device. diff --git a/deployments/operator/rbac/gpu_manager_role.yaml b/deployments/operator/rbac/gpu_manager_role.yaml deleted file mode 100644 index 59ba5271c..000000000 --- a/deployments/operator/rbac/gpu_manager_role.yaml +++ /dev/null @@ -1,19 +0,0 @@ ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: gpu-manager-role -rules: -- apiGroups: - - "" - resources: - - nodes/proxy - verbs: - - get - - list -- apiGroups: - - "" - resources: - - pods - verbs: - - list diff --git a/deployments/operator/rbac/kustomization.yaml b/deployments/operator/rbac/kustomization.yaml index ddec56bf2..679dd5cb6 100644 --- a/deployments/operator/rbac/kustomization.yaml +++ b/deployments/operator/rbac/kustomization.yaml @@ -3,7 +3,6 @@ resources: - role_binding.yaml - leader_election_role.yaml - leader_election_role_binding.yaml -- gpu_manager_role.yaml # The following RBAC configurations are used to protect # the metrics endpoint with authn/authz. 
These configurations # ensure that only authorized users and service accounts diff --git a/deployments/operator/rbac/role.yaml b/deployments/operator/rbac/role.yaml index f1dfb8ac3..b0ede6c58 100644 --- a/deployments/operator/rbac/role.yaml +++ b/deployments/operator/rbac/role.yaml @@ -4,13 +4,6 @@ kind: ClusterRole metadata: name: manager-role rules: -- apiGroups: - - "" - resources: - - nodes/proxy - verbs: - - get - - list - apiGroups: - "" resources: @@ -19,16 +12,6 @@ rules: - get - list - watch -- apiGroups: - - "" - resources: - - serviceaccounts - verbs: - - create - - delete - - get - - list - - watch - apiGroups: - apps resources: @@ -109,16 +92,6 @@ rules: - get - list - watch -- apiGroups: - - rbac.authorization.k8s.io - resources: - - clusterrolebindings - verbs: - - create - - delete - - get - - list - - watch - apiGroups: - security.openshift.io resources: diff --git a/pkg/apis/deviceplugin/v1/gpudeviceplugin_types.go b/pkg/apis/deviceplugin/v1/gpudeviceplugin_types.go index 2856a5863..01eb4d27f 100644 --- a/pkg/apis/deviceplugin/v1/gpudeviceplugin_types.go +++ b/pkg/apis/deviceplugin/v1/gpudeviceplugin_types.go @@ -36,7 +36,6 @@ type GpuDevicePluginSpec struct { // PreferredAllocationPolicy sets the mode of allocating GPU devices on a node. // See documentation for detailed description of the policies. Only valid when SharedDevNum > 1 is set. - // Not applicable with ResourceManager. // +kubebuilder:validation:Enum=balanced;packed;none PreferredAllocationPolicy string `json:"preferredAllocationPolicy,omitempty"` @@ -51,9 +50,6 @@ type GpuDevicePluginSpec struct { // +kubebuilder:validation:Minimum=0 LogLevel int `json:"logLevel,omitempty"` - // ResourceManager handles the fractional resource management for multi-GPU nodes. Enable only for clusters with GPU Aware Scheduling. 
- ResourceManager bool `json:"resourceManager,omitempty"` - // EnableMonitoring enables the monitoring resource ('i915_monitoring') // which gives access to all GPU devices on given node. Typically used with Intel XPU-Manager. EnableMonitoring bool `json:"enableMonitoring,omitempty"` diff --git a/pkg/apis/deviceplugin/v1/gpudeviceplugin_webhook.go b/pkg/apis/deviceplugin/v1/gpudeviceplugin_webhook.go index a886fbe02..64007dc12 100644 --- a/pkg/apis/deviceplugin/v1/gpudeviceplugin_webhook.go +++ b/pkg/apis/deviceplugin/v1/gpudeviceplugin_webhook.go @@ -15,22 +15,15 @@ package v1 import ( - "context" "fmt" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - logf "sigs.k8s.io/controller-runtime/pkg/log" "github.com/intel/intel-device-plugins-for-kubernetes/pkg/controllers" ) -var cli client.Client - // SetupWebhookWithManager sets up a webhook for GpuDevicePlugin custom resources. func (r *GpuDevicePlugin) SetupWebhookWithManager(mgr ctrl.Manager) error { - cli = mgr.GetClient() - return ctrl.NewWebhookManagedBy(mgr). For(r). 
WithDefaulter(&commonDevicePluginDefaulter{ @@ -46,42 +39,10 @@ func (r *GpuDevicePlugin) SetupWebhookWithManager(mgr ctrl.Manager) error { // +kubebuilder:webhook:path=/mutate-deviceplugin-intel-com-v1-gpudeviceplugin,mutating=true,failurePolicy=fail,groups=deviceplugin.intel.com,resources=gpudeviceplugins,verbs=create;update,versions=v1,name=mgpudeviceplugin.kb.io,sideEffects=None,admissionReviewVersions=v1 // +kubebuilder:webhook:verbs=create;update,path=/validate-deviceplugin-intel-com-v1-gpudeviceplugin,mutating=false,failurePolicy=fail,groups=deviceplugin.intel.com,resources=gpudeviceplugins,versions=v1,name=vgpudeviceplugin.kb.io,sideEffects=None,admissionReviewVersions=v1 -func (r *GpuDevicePlugin) crossCheckResourceManagement(ctx context.Context) bool { - log := logf.FromContext(ctx) - gpuCrs := GpuDevicePluginList{} - - if err := cli.List(ctx, &gpuCrs); err != nil { - log.Info("unable to list GPU CRs") - - return false - } - - for _, cr := range gpuCrs.Items { - // Ignore itself. 
- if cr.Name == r.Name { - continue - } - - if cr.Spec.ResourceManager != r.Spec.ResourceManager { - return false - } - } - - return true -} - -func (r *GpuDevicePlugin) validatePlugin(ctx context.Context, ref *commonDevicePluginValidator) error { +func (r *GpuDevicePlugin) validatePlugin(ref *commonDevicePluginValidator) error { if r.Spec.SharedDevNum == 1 && r.Spec.PreferredAllocationPolicy != "none" { return fmt.Errorf("%w: PreferredAllocationPolicy is valid only when setting sharedDevNum > 1", errValidation) } - if r.Spec.SharedDevNum == 1 && r.Spec.ResourceManager { - return fmt.Errorf("%w: resourceManager is valid only when setting sharedDevNum > 1", errValidation) - } - - if !r.crossCheckResourceManagement(ctx) { - return fmt.Errorf("%w: All GPU CRs must be with or without resource management", errValidation) - } - return validatePluginImage(r.Spec.Image, ref.expectedImage, &ref.expectedVersion) } diff --git a/pkg/apis/deviceplugin/v1/webhook_common.go b/pkg/apis/deviceplugin/v1/webhook_common.go index e175f11e2..27324c975 100644 --- a/pkg/apis/deviceplugin/v1/webhook_common.go +++ b/pkg/apis/deviceplugin/v1/webhook_common.go @@ -103,7 +103,7 @@ func (r *commonDevicePluginValidator) ValidateCreate(ctx context.Context, obj ru case *DsaDevicePlugin: return nil, v.validatePlugin(r) case *GpuDevicePlugin: - return nil, v.validatePlugin(ctx, r) + return nil, v.validatePlugin(r) case *FpgaDevicePlugin: return nil, v.validatePlugin(r) case *IaaDevicePlugin: @@ -118,16 +118,16 @@ func (r *commonDevicePluginValidator) ValidateCreate(ctx context.Context, obj ru } // ValidateUpdate implements admission.CustomValidator so a webhook will be registered for the type. 
-func (r *commonDevicePluginValidator) ValidateUpdate(ctx context.Context, oldObj runtime.Object, newObj runtime.Object) (admission.Warnings, error) { +func (r *commonDevicePluginValidator) ValidateUpdate(ctx context.Context, _ runtime.Object, newObj runtime.Object) (admission.Warnings, error) { logf.FromContext(ctx).Info("validate update") - switch v := oldObj.(type) { + switch v := newObj.(type) { case *DlbDevicePlugin: return nil, v.validatePlugin(r) case *DsaDevicePlugin: return nil, v.validatePlugin(r) case *GpuDevicePlugin: - return nil, v.validatePlugin(ctx, r) + return nil, v.validatePlugin(r) case *FpgaDevicePlugin: return nil, v.validatePlugin(r) case *IaaDevicePlugin: @@ -137,7 +137,7 @@ func (r *commonDevicePluginValidator) ValidateUpdate(ctx context.Context, oldObj case *SgxDevicePlugin: return nil, v.validatePlugin(r) default: - return nil, fmt.Errorf("%w: expected an xDevicePlugin object but got %T", errObjType, oldObj) + return nil, fmt.Errorf("%w: expected an xDevicePlugin object but got %T", errObjType, newObj) } } diff --git a/pkg/controllers/dlb/controller.go b/pkg/controllers/dlb/controller.go index 27b6461b7..f2cd3036d 100644 --- a/pkg/controllers/dlb/controller.go +++ b/pkg/controllers/dlb/controller.go @@ -57,7 +57,6 @@ func SetupReconciler(mgr ctrl.Manager, args controllers.ControllerOptions) error } type controller struct { - controllers.DefaultServiceAccountFactory scheme *runtime.Scheme args controllers.ControllerOptions } diff --git a/pkg/controllers/dsa/controller.go b/pkg/controllers/dsa/controller.go index 19f6b61f7..c4f4e294b 100644 --- a/pkg/controllers/dsa/controller.go +++ b/pkg/controllers/dsa/controller.go @@ -61,7 +61,6 @@ func SetupReconciler(mgr ctrl.Manager, args controllers.ControllerOptions) error } type controller struct { - controllers.DefaultServiceAccountFactory scheme *runtime.Scheme args controllers.ControllerOptions } diff --git a/pkg/controllers/fpga/controller.go b/pkg/controllers/fpga/controller.go index 
3a96032e2..996c9df3d 100644 --- a/pkg/controllers/fpga/controller.go +++ b/pkg/controllers/fpga/controller.go @@ -57,7 +57,6 @@ func SetupReconciler(mgr ctrl.Manager, args controllers.ControllerOptions) error } type controller struct { - controllers.DefaultServiceAccountFactory scheme *runtime.Scheme args controllers.ControllerOptions } diff --git a/pkg/controllers/gpu/controller.go b/pkg/controllers/gpu/controller.go index 09e52675b..810299a36 100644 --- a/pkg/controllers/gpu/controller.go +++ b/pkg/controllers/gpu/controller.go @@ -23,8 +23,6 @@ import ( apps "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" - rbacv1 "k8s.io/api/rbac/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/reference" ctrl "sigs.k8s.io/controller-runtime" @@ -37,9 +35,7 @@ import ( ) const ( - ownerKey = ".metadata.controller.gpu" - serviceAccountName = "gpu-manager-sa" - roleBindingName = "gpu-manager-rolebinding" + ownerKey = ".metadata.controller.gpu" ) var defaultNodeSelector = deployments.GPUPluginDaemonSet().Spec.Template.Spec.NodeSelector @@ -76,56 +72,6 @@ func (c *controller) Upgrade(ctx context.Context, obj client.Object) bool { return controllers.UpgradeImages(ctx, &dp.Spec.Image, &dp.Spec.InitImage) } -func (c *controller) NewSharedServiceAccount() *v1.ServiceAccount { - return &v1.ServiceAccount{ - ObjectMeta: metav1.ObjectMeta{ - Name: serviceAccountName, - Namespace: c.args.Namespace, - }, - } -} - -func (c *controller) NewSharedClusterRoleBinding() *rbacv1.ClusterRoleBinding { - return &rbacv1.ClusterRoleBinding{ - ObjectMeta: metav1.ObjectMeta{ - Name: roleBindingName, - Namespace: c.args.Namespace, - }, - Subjects: []rbacv1.Subject{ - { - Kind: "ServiceAccount", - Name: serviceAccountName, - Namespace: c.args.Namespace, - }, - }, - RoleRef: rbacv1.RoleRef{ - Kind: "ClusterRole", - Name: "inteldeviceplugins-gpu-manager-role", - APIGroup: "rbac.authorization.k8s.io", - }, - } -} - -func (c *controller) 
PluginMayRequireSharedObjects() bool { - return true -} - -func (c *controller) PluginRequiresSharedObjects(ctx context.Context, client client.Client) bool { - var list devicepluginv1.GpuDevicePluginList - - if err := client.List(ctx, &list); err != nil { - return false - } - - for _, cr := range list.Items { - if cr.Spec.ResourceManager { - return true - } - } - - return false -} - func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet { devicePlugin := rawObj.(*devicepluginv1.GpuDevicePlugin) @@ -157,36 +103,9 @@ func (c *controller) NewDaemonSet(rawObj client.Object) *apps.DaemonSet { setInitContainer(&daemonSet.Spec.Template.Spec, devicePlugin.Spec.InitImage) } - // add service account if resource manager is enabled - if devicePlugin.Spec.ResourceManager { - daemonSet.Spec.Template.Spec.ServiceAccountName = serviceAccountName - addVolumeIfMissing(&daemonSet.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", v1.HostPathDirectory) - addVolumeMountIfMissing(&daemonSet.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", false) - addVolumeIfMissing(&daemonSet.Spec.Template.Spec, "kubeletcrt", "/var/lib/kubelet/pki/kubelet.crt", v1.HostPathFileOrCreate) - addVolumeMountIfMissing(&daemonSet.Spec.Template.Spec, "kubeletcrt", "/var/lib/kubelet/pki/kubelet.crt", true) - addVolumeIfMissing(&daemonSet.Spec.Template.Spec, "nfd-features", "/etc/kubernetes/node-feature-discovery/features.d/", v1.HostPathDirectory) - addVolumeMountIfMissing(&daemonSet.Spec.Template.Spec, "nfd-features", "/etc/kubernetes/node-feature-discovery/features.d/", false) - addVolumeIfMissing(&daemonSet.Spec.Template.Spec, "sysfsdevices", "/sys/devices", v1.HostPathDirectory) - addVolumeMountIfMissing(&daemonSet.Spec.Template.Spec, "sysfsdevices", "/sys/devices", true) - } - return daemonSet } -func addVolumeMountIfMissing(spec *v1.PodSpec, name, mountPath string, readOnly bool) { - for _, mount := range spec.Containers[0].VolumeMounts { - if 
mount.Name == name { - return - } - } - - spec.Containers[0].VolumeMounts = append(spec.Containers[0].VolumeMounts, v1.VolumeMount{ - Name: name, - MountPath: mountPath, - ReadOnly: readOnly, - }) -} - func addVolumeIfMissing(spec *v1.PodSpec, name, path string, hpType v1.HostPathType) { for _, vol := range spec.Volumes { if vol.Name == name { @@ -240,18 +159,6 @@ func removeVolume(volumes []v1.Volume, name string) []v1.Volume { return newVolumes } -func removeVolumeMount(volumeMounts []v1.VolumeMount, name string) []v1.VolumeMount { - newVolumeMounts := []v1.VolumeMount{} - - for _, volume := range volumeMounts { - if volume.Name != name { - newVolumeMounts = append(newVolumeMounts, volume) - } - } - - return newVolumeMounts -} - func processInitContainer(ds *apps.DaemonSet, dp *devicepluginv1.GpuDevicePlugin) bool { initContainers := ds.Spec.Template.Spec.InitContainers @@ -311,51 +218,6 @@ func (c *controller) UpdateDaemonSet(rawObj client.Object, ds *apps.DaemonSet) ( updated = true } - hadRM := strings.Contains(oldArgString, "-resource-manager") - - // Add volumes if they do not exist, or remove them when - // labels are not requested anymore. 
- if !hadRM && dp.Spec.ResourceManager { - addVolumeIfMissing(&ds.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", v1.HostPathDirectory) - addVolumeMountIfMissing(&ds.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", false) - addVolumeIfMissing(&ds.Spec.Template.Spec, "kubeletcrt", "/var/lib/kubelet/pki/kubelet.crt", v1.HostPathFileOrCreate) - addVolumeMountIfMissing(&ds.Spec.Template.Spec, "kubeletcrt", "/var/lib/kubelet/pki/kubelet.crt", true) - addVolumeIfMissing(&ds.Spec.Template.Spec, "nfd-features", "/etc/kubernetes/node-feature-discovery/features.d/", v1.HostPathDirectory) - addVolumeMountIfMissing(&ds.Spec.Template.Spec, "nfd-features", "/etc/kubernetes/node-feature-discovery/features.d/", false) - addVolumeIfMissing(&ds.Spec.Template.Spec, "sysfsdevices", "/sys/devices", v1.HostPathDirectory) - addVolumeMountIfMissing(&ds.Spec.Template.Spec, "sysfsdevices", "/sys/devices", true) - } else if hadRM && !dp.Spec.ResourceManager { - volMounts := &ds.Spec.Template.Spec.Containers[0].VolumeMounts - *volMounts = removeVolumeMount(*volMounts, "nfd-features") - *volMounts = removeVolumeMount(*volMounts, "sysfsdevices") - *volMounts = removeVolumeMount(*volMounts, "kubeletcrt") - *volMounts = removeVolumeMount(*volMounts, "podresources") - - volumes := &ds.Spec.Template.Spec.Volumes - *volumes = removeVolume(*volumes, "nfd-features") - *volumes = removeVolume(*volumes, "sysfsdevices") - *volumes = removeVolume(*volumes, "kubeletcrt") - *volumes = removeVolume(*volumes, "podresources") - } - - newServiceAccountName := "default" - if dp.Spec.ResourceManager { - newServiceAccountName = serviceAccountName - } - - if ds.Spec.Template.Spec.ServiceAccountName != newServiceAccountName { - ds.Spec.Template.Spec.ServiceAccountName = newServiceAccountName - if dp.Spec.ResourceManager { - addVolumeIfMissing(&ds.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", v1.HostPathDirectory) - 
addVolumeMountIfMissing(&ds.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", false) - } else { - ds.Spec.Template.Spec.Volumes = removeVolume(ds.Spec.Template.Spec.Volumes, "podresources") - ds.Spec.Template.Spec.Containers[0].VolumeMounts = removeVolumeMount(ds.Spec.Template.Spec.Containers[0].VolumeMounts, "podresources") - } - - updated = true - } - if controllers.HasTolerationsChanged(ds.Spec.Template.Spec.Tolerations, dp.Spec.Tolerations) { ds.Spec.Template.Spec.Tolerations = dp.Spec.Tolerations updated = true @@ -409,10 +271,6 @@ func getPodArgs(gdp *devicepluginv1.GpuDevicePlugin) []string { args = append(args, "-shared-dev-num", "1") } - if gdp.Spec.ResourceManager { - args = append(args, "-resource-manager") - } - if gdp.Spec.PreferredAllocationPolicy != "" { args = append(args, "-allocation-policy", gdp.Spec.PreferredAllocationPolicy) } else { diff --git a/pkg/controllers/gpu/controller_test.go b/pkg/controllers/gpu/controller_test.go index aad31697b..1a5f6abe1 100644 --- a/pkg/controllers/gpu/controller_test.go +++ b/pkg/controllers/gpu/controller_test.go @@ -17,7 +17,6 @@ package gpu import ( "reflect" - "strings" "testing" apps "k8s.io/api/apps/v1" @@ -183,20 +182,6 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet }, } - // add service account if resource manager is enabled - if devicePlugin.Spec.ResourceManager { - daemonSet.Spec.Template.Spec.ServiceAccountName = serviceAccountName - - addVolumeIfMissing(&daemonSet.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", v1.HostPathDirectory) - addVolumeMountIfMissing(&daemonSet.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", false) - addVolumeIfMissing(&daemonSet.Spec.Template.Spec, "kubeletcrt", "/var/lib/kubelet/pki/kubelet.crt", v1.HostPathFileOrCreate) - addVolumeMountIfMissing(&daemonSet.Spec.Template.Spec, "kubeletcrt", "/var/lib/kubelet/pki/kubelet.crt", true) - 
addVolumeIfMissing(&daemonSet.Spec.Template.Spec, "nfd-features", "/etc/kubernetes/node-feature-discovery/features.d/", v1.HostPathDirectory) - addVolumeMountIfMissing(&daemonSet.Spec.Template.Spec, "nfd-features", "/etc/kubernetes/node-feature-discovery/features.d/", false) - addVolumeIfMissing(&daemonSet.Spec.Template.Spec, "sysfsdevices", "/sys/devices", v1.HostPathDirectory) - addVolumeMountIfMissing(&daemonSet.Spec.Template.Spec, "sysfsdevices", "/sys/devices", true) - } - if len(c.args.ImagePullSecretName) > 0 { daemonSet.Spec.Template.Spec.ImagePullSecrets = []v1.LocalObjectReference{ {Name: c.args.ImagePullSecretName}, @@ -209,37 +194,6 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet func (c *controller) updateDaemonSetExpected(rawObj client.Object, ds *apps.DaemonSet) { dp := rawObj.(*devicepluginv1.GpuDevicePlugin) - argString := strings.Join(ds.Spec.Template.Spec.Containers[0].Args, " ") - - hadRM := strings.Contains(argString, "-resource-manager") - - if !hadRM && dp.Spec.ResourceManager { - ds.Spec.Template.Spec.ServiceAccountName = serviceAccountName - - addVolumeIfMissing(&ds.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", v1.HostPathDirectory) - addVolumeMountIfMissing(&ds.Spec.Template.Spec, "podresources", "/var/lib/kubelet/pod-resources", false) - addVolumeIfMissing(&ds.Spec.Template.Spec, "kubeletcrt", "/var/lib/kubelet/pki/kubelet.crt", v1.HostPathFileOrCreate) - addVolumeMountIfMissing(&ds.Spec.Template.Spec, "kubeletcrt", "/var/lib/kubelet/pki/kubelet.crt", true) - addVolumeIfMissing(&ds.Spec.Template.Spec, "nfd-features", "/etc/kubernetes/node-feature-discovery/features.d/", v1.HostPathDirectory) - addVolumeMountIfMissing(&ds.Spec.Template.Spec, "nfd-features", "/etc/kubernetes/node-feature-discovery/features.d/", false) - addVolumeIfMissing(&ds.Spec.Template.Spec, "sysfsdevices", "/sys/devices", v1.HostPathDirectory) - addVolumeMountIfMissing(&ds.Spec.Template.Spec, "sysfsdevices", 
"/sys/devices", true) - } else if hadRM && !dp.Spec.ResourceManager { - ds.Spec.Template.Spec.ServiceAccountName = "default" - - volMounts := &ds.Spec.Template.Spec.Containers[0].VolumeMounts - *volMounts = removeVolumeMount(*volMounts, "nfd-features") - *volMounts = removeVolumeMount(*volMounts, "sysfsdevices") - *volMounts = removeVolumeMount(*volMounts, "kubeletcrt") - *volMounts = removeVolumeMount(*volMounts, "podresources") - - volumes := &ds.Spec.Template.Spec.Volumes - *volumes = removeVolume(*volumes, "nfd-features") - *volumes = removeVolume(*volumes, "sysfsdevices") - *volumes = removeVolume(*volumes, "kubeletcrt") - *volumes = removeVolume(*volumes, "podresources") - } - ds.Spec.Template.Spec.Containers[0].Args = getPodArgs(dp) } @@ -248,15 +202,9 @@ func (c *controller) updateDaemonSetExpected(rawObj client.Object, ds *apps.Daem func TestNewDamonSetGPU(t *testing.T) { tcases := []struct { name string - rm bool }{ { - "plugin with resource manager", - true, - }, - { - "plugin without resource manager", - false, + "plugin as is", }, } @@ -266,7 +214,6 @@ func TestNewDamonSetGPU(t *testing.T) { plugin := &devicepluginv1.GpuDevicePlugin{} plugin.Name = "new-gpu-cr-testing" - plugin.Spec.ResourceManager = tc.rm t.Run(tc.name, func(t *testing.T) { expected := c.newDaemonSetExpected(plugin) @@ -300,15 +247,11 @@ func TestNewDamonSetGPUWithSecret(t *testing.T) { func TestUpdateDamonSetGPU(t *testing.T) { tcases := []struct { name string - rmInitially bool + sharedCount int }{ { - "plugin without rm and then with rm", - false, - }, - { - "plugin with rm and then without rm", - true, + "shared dev num as 5", + 5, }, } @@ -318,12 +261,12 @@ func TestUpdateDamonSetGPU(t *testing.T) { before := &devicepluginv1.GpuDevicePlugin{} before.Name = "update-gpu-cr-testing" - before.Spec.ResourceManager = tc.rmInitially + before.Spec.SharedDevNum = 1 after := &devicepluginv1.GpuDevicePlugin{} after.Name = "update-gpu-cr-testing" - after.Spec.ResourceManager = 
!tc.rmInitially + after.Spec.SharedDevNum = tc.sharedCount t.Run(tc.name, func(t *testing.T) { expected := c.newDaemonSetExpected(before) diff --git a/pkg/controllers/iaa/controller.go b/pkg/controllers/iaa/controller.go index 0fe996789..123393e8f 100644 --- a/pkg/controllers/iaa/controller.go +++ b/pkg/controllers/iaa/controller.go @@ -59,7 +59,6 @@ func SetupReconciler(mgr ctrl.Manager, args controllers.ControllerOptions) error } type controller struct { - controllers.DefaultServiceAccountFactory scheme *runtime.Scheme args controllers.ControllerOptions } diff --git a/pkg/controllers/qat/controller.go b/pkg/controllers/qat/controller.go index bcc45a957..1bfc2d0c9 100644 --- a/pkg/controllers/qat/controller.go +++ b/pkg/controllers/qat/controller.go @@ -61,7 +61,6 @@ func SetupReconciler(mgr ctrl.Manager, args controllers.ControllerOptions) error } type controller struct { - controllers.DefaultServiceAccountFactory scheme *runtime.Scheme args controllers.ControllerOptions } diff --git a/pkg/controllers/reconciler.go b/pkg/controllers/reconciler.go index ded2b7f0b..844344bac 100644 --- a/pkg/controllers/reconciler.go +++ b/pkg/controllers/reconciler.go @@ -26,7 +26,6 @@ import ( "github.com/google/go-cmp/cmp" apps "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" - rbacv1 "k8s.io/api/rbac/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -41,52 +40,14 @@ var ( ImageMinVersion = versionutil.MustParseSemantic("0.32.0") ) -const ( - sharedObjectsNone = iota - sharedObjectsMayUse - sharedObjectsUsed -) - // +kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;list;watch;create;update;patch;delete -// +kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;list;watch;create;delete -// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings,verbs=get;list;watch;create;delete // +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch -// 
+kubebuilder:rbac:groups="",resources=nodes/proxy,verbs=get;list // +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=create // +kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=use // +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,resourceNames=d1c7b6d5.intel.com,verbs=get;update -// SharedObjectsFactory provides functions for creating service account and cluster rule binding objects. -// Note that the rbac Role can be generated from kubebuilder:rbac comment (some examples above), -// which is the reason why this interface does not yet have a NewRole function. -type SharedObjectsFactory interface { - // Indicates if plugin will ever require shared objects. Not all plugins do. - PluginMayRequireSharedObjects() bool - // Indicates if plugin currently require shared objects. - PluginRequiresSharedObjects(ctx context.Context, client client.Client) bool - NewSharedServiceAccount() *v1.ServiceAccount - NewSharedClusterRoleBinding() *rbacv1.ClusterRoleBinding -} - -// DefaultServiceAccountFactory is an empty ServiceAccountFactory. "default" will be used for the service account then. -type DefaultServiceAccountFactory struct{} - -func (d *DefaultServiceAccountFactory) NewSharedServiceAccount() *v1.ServiceAccount { - return nil -} -func (d *DefaultServiceAccountFactory) NewSharedClusterRoleBinding() *rbacv1.ClusterRoleBinding { - return nil -} -func (d *DefaultServiceAccountFactory) PluginMayRequireSharedObjects() bool { - return false -} -func (d *DefaultServiceAccountFactory) PluginRequiresSharedObjects(ctx context.Context, client client.Client) bool { - return false -} - // DevicePluginController provides functionality for manipulating actual device plugin CRD objects. 
type DevicePluginController interface { - SharedObjectsFactory CreateEmptyObject() (devicePlugin client.Object) NewDaemonSet(devicePlugin client.Object) *apps.DaemonSet UpdateDaemonSet(client.Object, *apps.DaemonSet) (updated bool) @@ -138,27 +99,6 @@ func (r *reconciler) fetchObjects(ctx context.Context, req ctrl.Request, log log return &childDaemonSets, nil } -// createSharedObjects creates required objects for Reconcile. -func (r *reconciler) createSharedObjects(ctx context.Context, log logr.Logger) (result ctrl.Result, err error) { - // Since ServiceAccount and ClusterRoleBinding are can be shared by many, - // it's not owned by the CR. 'SetControllerReference' in the create daemonset function. - sa := r.controller.NewSharedServiceAccount() - - if err := r.Create(ctx, sa); client.IgnoreAlreadyExists(err) != nil { - log.Error(err, "unable to create shared ServiceAccount") - return result, err - } - - rb := r.controller.NewSharedClusterRoleBinding() - - if err := r.Create(ctx, rb); client.IgnoreAlreadyExists(err) != nil { - log.Error(err, "unable to create shared ClusterRoleBinding") - return ctrl.Result{}, err - } - - return result, nil -} - func UpgradeImages(ctx context.Context, image *string, initimage *string) (upgrade bool) { for _, s := range []*string{image, initimage} { if s == nil { @@ -202,25 +142,6 @@ func upgradeDevicePluginImages(ctx context.Context, r *reconciler, devicePlugin } } -// determinateSharedObjectReqs Determinates if the installed plugins require shared objects. -// The result is one of three: no, may use and uses currently. -func (r *reconciler) determinateSharedObjectReqs(ctx context.Context, req ctrl.Request) int { - ret := sharedObjectsNone - - if !r.controller.PluginMayRequireSharedObjects() { - return ret - } - - ret = sharedObjectsMayUse - - // Decide from the untyped objects the need to have shared objects. 
- if r.controller.PluginRequiresSharedObjects(ctx, r.Client) { - ret = sharedObjectsUsed - } - - return ret -} - // Reconcile reconciles a device plugin object. func (r *reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := log.FromContext(ctx) @@ -230,20 +151,10 @@ func (r *reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu return ctrl.Result{}, err2 } - sharedObjectsNeed := r.determinateSharedObjectReqs(ctx, req) devicePlugin := r.controller.CreateEmptyObject() if err := r.Get(ctx, req.NamespacedName, devicePlugin); err != nil { - // Delete shared objects if they are not needed anymore. - r.maybeDeleteSharedObjects(ctx, sharedObjectsNeed, log) - - return r.maybeDeleteDaemonSets(ctx, err, childDaemonSets.Items, log) - } - - if sharedObjectsNeed == sharedObjectsUsed { - if result, err := r.createSharedObjects(ctx, log); err != nil { - return result, err - } + return ctrl.Result{}, err } // Upgrade device plugin object's image, initImage etc. @@ -295,11 +206,6 @@ func (r *reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu } } - // Drop redundant daemon sets, role bindings and service accounts, if any. - r.maybeDeleteRedundantDaemonSets(ctx, childDaemonSets.Items, log) - // Delete shared objects if they are not needed anymore. - r.maybeDeleteSharedObjects(ctx, sharedObjectsNeed, log) - return ctrl.Result{}, nil } @@ -324,7 +230,7 @@ func indexDaemonSets(ctx context.Context, mgr ctrl.Manager, apiGVString, pluginK }) } -func indexPods(ctx context.Context, mgr ctrl.Manager, apiGVString, pluginKind, ownerKey string) error { +func indexPods(ctx context.Context, mgr ctrl.Manager, _, _, ownerKey string) error { return mgr.GetFieldIndexer().IndexField(ctx, &v1.Pod{}, ownerKey, func(rawObj client.Object) []string { // grab the Pod object, extract the owner... 
@@ -388,57 +294,3 @@ func (r *reconciler) createDaemonSet(ctx context.Context, dp client.Object, log return ctrl.Result{}, nil } - -func (r *reconciler) maybeDeleteDaemonSets(ctx context.Context, err error, daemonSets []apps.DaemonSet, log logr.Logger) (ctrl.Result, error) { - if apierrors.IsNotFound(err) { - for i := range daemonSets { - if err = r.Delete(ctx, &daemonSets[i], client.PropagationPolicy(metav1.DeletePropagationBackground)); client.IgnoreNotFound(err) != nil { - log.Error(err, "unable to delete DaemonSet", "DaemonSet", daemonSets[i]) - return ctrl.Result{}, err - } - } - - log.V(1).Info("deleted DaemonSets owned by deleted custom device plugin object") - - return ctrl.Result{}, nil - } - - log.Error(err, "unable to fetch custom device plugin object") - - return ctrl.Result{}, err -} - -func (r *reconciler) maybeDeleteRedundantDaemonSets(ctx context.Context, dsets []apps.DaemonSet, log logr.Logger) { - count := len(dsets) - if count > 1 { - log.V(0).Info("there are redundant DaemonSets", "redundantDS", count-1) - - redundantSets := dsets[1:] - for i := range redundantSets { - if err := r.Delete(ctx, &redundantSets[i], client.PropagationPolicy(metav1.DeletePropagationBackground)); client.IgnoreNotFound(err) != nil { - log.Error(err, "unable to delete redundant DaemonSet", "DaemonSet", redundantSets[i]) - } else { - log.V(1).Info("deleted redundant DaemonSet", "DaemonSet", redundantSets[i]) - } - } - } -} - -func (r *reconciler) maybeDeleteSharedObjects(ctx context.Context, sharedObjectsNeed int, log logr.Logger) { - // Delete shared objects only if plugin may use but is not currently using any. 
- if sharedObjectsNeed != sharedObjectsMayUse { - return - } - - sa := r.controller.NewSharedServiceAccount() - - if err := r.Delete(ctx, sa, client.PropagationPolicy(metav1.DeletePropagationBackground)); client.IgnoreNotFound(err) != nil { - log.Error(err, "unable to delete redundant shared ServiceAccount", "ServiceAccount", sa) - } - - crb := r.controller.NewSharedClusterRoleBinding() - - if err := r.Delete(ctx, crb, client.PropagationPolicy(metav1.DeletePropagationBackground)); client.IgnoreNotFound(err) != nil { - log.Error(err, "unable to delete redundant shared ClusterRoleBinding", "ClusterRoleBinding", crb) - } -} diff --git a/pkg/controllers/reconciler_test.go b/pkg/controllers/reconciler_test.go index 7dc176634..906817469 100644 --- a/pkg/controllers/reconciler_test.go +++ b/pkg/controllers/reconciler_test.go @@ -20,8 +20,367 @@ import ( "testing" v1 "k8s.io/api/core/v1" + + "errors" + + apps "k8s.io/api/apps/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" ) +type mockPlugin struct { + client.Object +} + +func (m *mockPlugin) GetNamespace() string { + return "default" +} + +func (m *mockPlugin) GetObjectKind() schema.ObjectKind { + return &metav1.TypeMeta{ + Kind: "MockPlugin", + APIVersion: "v1", + } +} + +func (m *mockPlugin) GetName() string { + return "mock" +} + +func (m *mockPlugin) GetUID() types.UID { + return "mock-uid" +} + +type mockController struct { + statusErr error + updated bool + upgrade bool +} + +func (m *mockController) CreateEmptyObject() client.Object { + return &mockPlugin{} +} + +func (m *mockController) NewDaemonSet(rawObj client.Object) *apps.DaemonSet { + return &apps.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mock-ds", + Namespace: "default", + }, + Spec: apps.DaemonSetSpec{ + Selector: &metav1.LabelSelector{ + 
MatchLabels: map[string]string{"app": "mock"}, + }, + Template: v1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "mock"}, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "mock", + Image: "intel/intel-mock-plugin:latest", + }, + }, + }, + }, + }, + } +} + +func (m *mockController) UpdateDaemonSet(rawObj client.Object, ds *apps.DaemonSet) (updated bool) { + return m.updated +} + +func (m *mockController) UpdateStatus(rawObj client.Object, ds *apps.DaemonSet, messages []string) (updated bool, err error) { + if m.statusErr != nil { + return false, m.statusErr + } + + return true, nil +} + +func (m *mockController) Upgrade(ctx context.Context, obj client.Object) bool { + return m.upgrade +} + +type fakeStatusWriter struct{} + +func (f *fakeStatusWriter) Create(ctx context.Context, obj client.Object, obj2 client.Object, opts ...client.SubResourceCreateOption) error { + return nil +} +func (f *fakeStatusWriter) Update(ctx context.Context, obj client.Object, opts ...client.SubResourceUpdateOption) error { + return nil +} +func (f *fakeStatusWriter) Patch(ctx context.Context, obj client.Object, patch client.Patch, opts ...client.SubResourcePatchOption) error { + return nil +} +func (f *fakeStatusWriter) Status() client.StatusWriter { + return f +} + +type fakeClient struct { + client.StatusWriter + client.Client + getErr error + listErr error + updateErr error + createErr error + statusErr error + ds []*apps.DaemonSet + pods []*v1.Pod + createCalled bool +} + +func (f *fakeClient) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error { + if f.getErr != nil { + return f.getErr + } + + return nil +} +func (f *fakeClient) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error { + if f.listErr != nil { + return f.listErr + } + switch l := list.(type) { + case *apps.DaemonSetList: + l.Items = []apps.DaemonSet{} + + for _, ds := range f.ds { + l.Items 
= append(l.Items, *ds) + } + case *v1.PodList: + l.Items = []v1.Pod{} + + for _, pod := range f.pods { + l.Items = append(l.Items, *pod) + } + } + return nil +} +func (f *fakeClient) Update(ctx context.Context, obj client.Object, opts ...client.UpdateOption) error { + return f.updateErr +} +func (f *fakeClient) Create(ctx context.Context, obj client.Object, opts ...client.CreateOption) error { + f.createCalled = true + return f.createErr +} +func (f *fakeClient) Patch(ctx context.Context, obj client.Object, patch client.Patch, opts ...client.PatchOption) error { + return nil +} +func (f *fakeClient) Delete(ctx context.Context, obj client.Object, opts ...client.DeleteOption) error { + return nil +} +func (f *fakeClient) UpdateStatus(ctx context.Context, obj client.Object, opts ...client.UpdateOption) error { + return f.statusErr +} +func (f *fakeClient) Status() client.StatusWriter { + return f.StatusWriter +} +func (f *fakeClient) Scheme() *runtime.Scheme { + s := runtime.NewScheme() + s.AddKnownTypeWithName(schema.GroupVersionKind{ + Group: "deviceplugin.intel.com", + Version: "v1", + Kind: "MockPlugin", + }, &mockPlugin{}) + + return s +} + +func fillDaemonSets() []*apps.DaemonSet { + return []*apps.DaemonSet{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mock-ds", + Namespace: "default", + }, + Spec: apps.DaemonSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "mock"}, + }, + Template: v1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "mock"}, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "mock", + Image: "intel/intel-mock-plugin:latest", + }, + }, + }, + }, + }, + }, + } +} + +func fillPods() []*v1.Pod { + return []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "mock-pod", + Namespace: "default", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "DaemonSet", + Name: "mock-ds", + UID: "mock-uid", + }, + }, + }, + Spec: v1.PodSpec{ + 
NodeName: "node1", + Containers: []v1.Container{ + { + Name: "mock", + Image: "intel/intel-mock-plugin:latest", + }, + }, + }, + }, + } +} + +func TestReconciler_Reconcile_CreateDaemonSet(t *testing.T) { + controller := &mockController{} + c := &fakeClient{ + StatusWriter: &fakeStatusWriter{}, + ds: []*apps.DaemonSet{}, + } + r := &reconciler{ + controller: controller, + Client: c, + scheme: c.Scheme(), + pluginKind: "MockPlugin", + ownerKey: "owner", + } + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: "mock", Namespace: "default"}} + + res, err := r.Reconcile(context.Background(), req) + if err != nil { + t.Errorf("expected no error, got %v", err) + } + if res != (ctrl.Result{}) { + t.Errorf("expected empty result, got %v", res) + } + if c.createCalled == false { + t.Error("expected create to be called, but it was not") + } +} + +func TestReconciler_Reconcile_UpdateDaemonSetAndStatus(t *testing.T) { + controller := &mockController{ + updated: true, + upgrade: false, + } + c := &fakeClient{ + StatusWriter: &fakeStatusWriter{}, + ds: fillDaemonSets(), + pods: fillPods(), + } + r := &reconciler{ + controller: controller, + Client: c, + scheme: c.Scheme(), + pluginKind: "MockPlugin", + ownerKey: "owner", + } + req := ctrl.Request{NamespacedName: client.ObjectKey{Name: "mock", Namespace: "default"}} + + res, err := r.Reconcile(context.Background(), req) + if err != nil { + t.Errorf("expected no error, got %v", err) + } + if res != (ctrl.Result{}) { + t.Errorf("expected empty result, got %v", res) + } +} + +type getError struct { + error +} +type listError struct { + error +} +type updatestatusError struct { + error +} + +func TestReconciler_Reconcile_GetError(t *testing.T) { + controller := &mockController{} + c := &fakeClient{ + getErr: getError{}, + StatusWriter: &fakeStatusWriter{}, + } + r := &reconciler{ + controller: controller, + Client: c, + scheme: c.Scheme(), + pluginKind: "MockPlugin", + ownerKey: "owner", + } + req := 
ctrl.Request{NamespacedName: client.ObjectKey{Name: "mock", Namespace: "default"}} + + _, err := r.Reconcile(context.Background(), req) + if err == nil || errors.Is(err, c.getErr) == false { + t.Errorf("expected get error, got %v", err) + } +} + +func TestReconciler_Reconcile_ListError(t *testing.T) { + controller := &mockController{} + c := &fakeClient{ + listErr: listError{}, + StatusWriter: &fakeStatusWriter{}, + } + r := &reconciler{ + controller: controller, + Client: c, + scheme: c.Scheme(), + pluginKind: "MockPlugin", + ownerKey: "owner", + } + req := ctrl.Request{NamespacedName: client.ObjectKey{Name: "mock", Namespace: "default"}} + + _, err := r.Reconcile(context.Background(), req) + if err == nil || errors.Is(err, c.listErr) == false { + t.Errorf("expected list error, got %v", err) + } +} + +func TestReconciler_Reconcile_UpdateStatusError(t *testing.T) { + controller := &mockController{ + statusErr: updatestatusError{}, + } + c := &fakeClient{ + StatusWriter: &fakeStatusWriter{}, + ds: fillDaemonSets(), + pods: fillPods(), + } + r := &reconciler{ + controller: controller, + Client: c, + scheme: c.Scheme(), + pluginKind: "MockPlugin", + ownerKey: "owner", + } + req := ctrl.Request{NamespacedName: client.ObjectKey{Name: "mock", Namespace: "default"}} + + _, err := r.Reconcile(context.Background(), req) + if err == nil || errors.Is(err, controller.statusErr) == false { + t.Errorf("expected status update error, got %v", err) + } +} + func TestUpgrade(test *testing.T) { image := "intel/intel-dsa-plugin" initimage := "intel/intel-idxd-config-initcontainer" diff --git a/pkg/controllers/sgx/controller.go b/pkg/controllers/sgx/controller.go index 2804ceac4..d92395870 100644 --- a/pkg/controllers/sgx/controller.go +++ b/pkg/controllers/sgx/controller.go @@ -57,7 +57,6 @@ func SetupReconciler(mgr ctrl.Manager, args controllers.ControllerOptions) error } type controller struct { - controllers.DefaultServiceAccountFactory scheme *runtime.Scheme args 
controllers.ControllerOptions } diff --git a/test/e2e/gpu/gpu.go b/test/e2e/gpu/gpu.go index ae5b3fabc..0d8b6b9c9 100644 --- a/test/e2e/gpu/gpu.go +++ b/test/e2e/gpu/gpu.go @@ -38,7 +38,6 @@ import ( const ( kustomizationYaml = "deployments/gpu_plugin/kustomization.yaml" monitoringYaml = "deployments/gpu_plugin/overlays/monitoring_shared-dev_nfd/kustomization.yaml" - rmEnabledYaml = "deployments/gpu_plugin/overlays/fractional_resources/kustomization.yaml" healthMgmtYaml = "deployments/gpu_plugin/overlays/health/kustomization.yaml" nfdRulesYaml = "deployments/nfd/overlays/node-feature-rules/kustomization.yaml" containerName = "testcontainer" @@ -89,16 +88,6 @@ func describe() { framework.Failf("unable to locate %q: %v", monitoringYaml, errFailedToLocateRepoFile) } - nfdRulesPath, errFailedToLocateRepoFile := utils.LocateRepoFile(nfdRulesYaml) - if errFailedToLocateRepoFile != nil { - framework.Failf("unable to locate %q: %v", nfdRulesYaml, errFailedToLocateRepoFile) - } - - resourceManagerPath, errFailedToLocateRepoFile := utils.LocateRepoFile(rmEnabledYaml) - if errFailedToLocateRepoFile != nil { - framework.Failf("unable to locate %q: %v", rmEnabledYaml, errFailedToLocateRepoFile) - } - healthMgmtPath, errFailedToLocateRepoFile := utils.LocateRepoFile(healthMgmtYaml) if errFailedToLocateRepoFile != nil { framework.Failf("unable to locate %q: %v", healthMgmtYaml, errFailedToLocateRepoFile) @@ -206,27 +195,6 @@ func describe() { }) }) - ginkgo.Context("When [Deployment:resourceManager] deployment is applied [Resource:i915]", func() { - ginkgo.It("check if i915 resources is available", func(ctx context.Context) { - e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(nfdRulesPath)) - - createPluginAndVerifyExistence(f, ctx, resourceManagerPath, "gpu.intel.com/i915") - - // To speed up extended resource detection, let's restart NFD worker - e2ekubectl.RunKubectlOrDie("node-feature-discovery", "rollout", "restart", "daemonset", "nfd-worker") - - 
ginkgo.By("checking if the millicores resource is allocatable") - if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/millicores", 30*time.Second, utils.WaitForPositiveResource); err != nil { - framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err) - } - - ginkgo.By("checking if the tiles resource is allocatable") - if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/tiles", 30*time.Second, utils.WaitForPositiveResource); err != nil { - framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err) - } - }) - }) - ginkgo.It("run a small workload on the GPU [App:tensorflow]", func(ctx context.Context) { createPluginAndVerifyExistence(f, ctx, vanillaPath, "gpu.intel.com/i915") diff --git a/test/e2e/qat/qatplugin_kernel.go b/test/e2e/qat/qatplugin_kernel.go deleted file mode 100644 index 3a53206a5..000000000 --- a/test/e2e/qat/qatplugin_kernel.go +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright 2020 Intel Corporation. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package qat - -import ( - "context" - "time" - - "github.com/intel/intel-device-plugins-for-kubernetes/test/e2e/utils" - "github.com/onsi/ginkgo/v2" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/kubernetes/test/e2e/framework" - e2edebug "k8s.io/kubernetes/test/e2e/framework/debug" - e2ekubectl "k8s.io/kubernetes/test/e2e/framework/kubectl" - e2epod "k8s.io/kubernetes/test/e2e/framework/pod" - imageutils "k8s.io/kubernetes/test/utils/image" - admissionapi "k8s.io/pod-security-admission/api" -) - -const ( - qatPluginKernelYaml = "deployments/qat_plugin/base/intel-qat-kernel-plugin.yaml" -) - -func init() { - ginkgo.Describe("QAT plugin in kernel mode [Device:qat] [Mode:kernel]", describeQatKernelPlugin) -} - -func describeQatKernelPlugin() { - f := framework.NewDefaultFramework("qatpluginkernel") - f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged - - yamlPath, errFailedToLocateRepoFile := utils.LocateRepoFile(qatPluginKernelYaml) - if errFailedToLocateRepoFile != nil { - framework.Failf("unable to locate %q: %v", qatPluginKernelYaml, errFailedToLocateRepoFile) - } - - var dpPodName string - - ginkgo.BeforeEach(func(ctx context.Context) { - ginkgo.By("deploying QAT plugin in kernel mode") - e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "create", "-f", yamlPath) - - ginkgo.By("waiting for QAT plugin's availability") - podList, err := e2epod.WaitForPodsWithLabelRunningReady(ctx, f.ClientSet, f.Namespace.Name, - labels.Set{"app": "intel-qat-kernel-plugin"}.AsSelector(), 1 /* one replica */, 100*time.Second) - if err != nil { - e2edebug.DumpAllNamespaceInfo(ctx, f.ClientSet, f.Namespace.Name) - e2ekubectl.LogFailedContainers(ctx, f.ClientSet, f.Namespace.Name, framework.Logf) - framework.Failf("unable to wait for all pods to be running and ready: %v", err) - } - dpPodName = podList.Items[0].Name - - ginkgo.By("checking QAT plugin's 
securityContext") - if err = utils.TestPodsFileSystemInfo(podList.Items); err != nil { - framework.Failf("container filesystem info checks failed: %v", err) - } - }) - - ginkgo.AfterEach(func(ctx context.Context) { - ginkgo.By("undeploying QAT plugin") - e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "delete", "-f", yamlPath) - if err := e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, dpPodName, f.Namespace.Name, 30*time.Second); err != nil { - framework.Failf("failed to terminate pod: %v", err) - } - }) - - ginkgo.Context("When QAT resources are available [Resource:cy1_dc0]", func() { - ginkgo.BeforeEach(func(ctx context.Context) { - ginkgo.By("checking if the resource is allocatable") - if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "qat.intel.com/cy1_dc0", 30*time.Second, utils.WaitForPositiveResource); err != nil { - framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err) - } - }) - - ginkgo.It("deploys a pod requesting QAT resources [App:busybox]", func(ctx context.Context) { - ginkgo.By("submitting a pod requesting QAT resources") - podSpec := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: "qatplugin-tester"}, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Args: []string{"-c", "echo mode"}, - Name: "testcontainer", - Image: imageutils.GetE2EImage(imageutils.BusyBox), - Command: []string{"/bin/sh"}, - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{"qat.intel.com/cy1_dc0": resource.MustParse("1")}, - Limits: v1.ResourceList{"qat.intel.com/cy1_dc0": resource.MustParse("1")}, - }, - }, - }, - RestartPolicy: v1.RestartPolicyNever, - }, - } - pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Create(ctx, - podSpec, metav1.CreateOptions{}) - framework.ExpectNoError(err, "pod Create API error") - - ginkgo.By("waiting the pod to finish successfully") - e2epod.NewPodClient(f).WaitForFinish(ctx, pod.ObjectMeta.Name, 60*time.Second) - }) - - ginkgo.When("there is no app to run 
[App:noapp]", func() { - ginkgo.It("does nothing", func() {}) - }) - }) -}