diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 9e31dd4c5..00c863fbd 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -14,6 +14,7 @@ /cmd/gpu_fakedev/ @tkatila @uniemimu @eero-t /cmd/gpu_plugin/ @tkatila @bart0sh @uniemimu /cmd/gpu_nfdhook/ @tkatila @bart0sh @uniemimu +/cmd/gpu_levelzero/ @tkatila @eero-t @uniemimu /cmd/qat_plugin/ @hj-johannes-lee @mythi /cmd/sgx_plugin/ @hj-johannes-lee @mythi /cmd/dsa_plugin/ @hj-johannes-lee @ozhuraki @mythi diff --git a/.github/workflows/lib-build.yaml b/.github/workflows/lib-build.yaml index c28c76644..7b2f6e964 100644 --- a/.github/workflows/lib-build.yaml +++ b/.github/workflows/lib-build.yaml @@ -18,6 +18,7 @@ jobs: - intel-gpu-fakedev - intel-gpu-initcontainer - intel-gpu-plugin + - intel-gpu-levelzero - intel-fpga-plugin - intel-qat-initcontainer - intel-qat-plugin diff --git a/.github/workflows/lib-e2e.yaml b/.github/workflows/lib-e2e.yaml index 659a5271c..87b90782f 100644 --- a/.github/workflows/lib-e2e.yaml +++ b/.github/workflows/lib-e2e.yaml @@ -55,6 +55,7 @@ jobs: - intel-iaa-plugin - crypto-perf - intel-gpu-plugin + - intel-gpu-levelzero - intel-sgx-plugin - intel-sgx-initcontainer - intel-sgx-admissionwebhook diff --git a/.github/workflows/lib-publish.yaml b/.github/workflows/lib-publish.yaml index 234f43db6..1b744a9ab 100644 --- a/.github/workflows/lib-publish.yaml +++ b/.github/workflows/lib-publish.yaml @@ -11,7 +11,7 @@ on: required: false type: string env: - no_base_check: "['intel-qat-plugin-kerneldrv', 'intel-idxd-config-initcontainer', 'crypto-perf', 'opae-nlb-demo']" + no_base_check: "['intel-qat-plugin-kerneldrv', 'intel-idxd-config-initcontainer', 'crypto-perf', 'opae-nlb-demo', 'intel-gpu-levelzero']" permissions: contents: read @@ -48,6 +48,7 @@ jobs: - intel-fpga-initcontainer - intel-gpu-initcontainer - intel-gpu-plugin + #- intel-gpu-levelzero - intel-fpga-plugin - intel-qat-initcontainer - intel-qat-plugin diff --git a/.github/workflows/lib-validate.yaml 
b/.github/workflows/lib-validate.yaml index 851534761..8c97dce3a 100644 --- a/.github/workflows/lib-validate.yaml +++ b/.github/workflows/lib-validate.yaml @@ -39,6 +39,10 @@ jobs: with: go-version-file: go.mod check-latest: true + - name: install levelzero dev + run: | + sudo apt-get update + sudo apt-get install -y libze1 libze-dev - name: golangci-lint uses: golangci/golangci-lint-action@aaa42aa0628b4ae2578232a66b541047968fac86 # v6 with: @@ -53,11 +57,17 @@ jobs: with: go-version-file: go.mod check-latest: true + - name: install levelzero dev + run: | + sudo apt-get update + sudo apt-get install -y libze1 libze-dev - name: Check Dockerfiles run: make check-dockerfiles - run: make go-mod-tidy - run: make BUILDTAGS=kerneldrv - run: make test BUILDTAGS=kerneldrv + env: + UNITTEST: 1 - run: make check-github-actions #- name: Codecov report # run: bash <(curl -s https://codecov.io/bash) diff --git a/.gitignore b/.gitignore index 721de6e6f..81ac5ee10 100644 --- a/.gitignore +++ b/.gitignore @@ -23,7 +23,6 @@ cmd/operator/operator deployments/fpga_admissionwebhook/base/intel-fpga-webhook-certs-secret -*.h *.gbs.* *.aocx *.aocx.* diff --git a/README.md b/README.md index d01ce74d3..40b4066c2 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ Table of Contents * [IAA device plugin](#iaa-device-plugin) * [Device Plugins Operator](#device-plugins-operator) * [XeLink XPU Manager sidecar](#xelink-xpu-manager-sidecar) +* [Intel GPU Level-Zero sidecar](#intel-gpu-level-zero-sidecar) * [Demos](#demos) * [Workload Authors](#workload-authors) * [Developers](#developers) @@ -201,6 +202,12 @@ To support interconnected GPUs in Kubernetes, XeLink sidecar is needed. The [XeLink XPU Manager sidecar README](cmd/xpumanager_sidecar/README.md) gives information how the sidecar functions and how to use it. +## Intel GPU Level-Zero sidecar + +The sidecar uses the Level-Zero API to provide the GPU plugin with additional GPU information that is not available through sysfs interfaces. 
+ +See [Intel GPU Level-Zero sidecar README](cmd/gpu_levelzero/README.md) for more details. + ## Demos The [demo subdirectory](demo/readme.md) contains a number of demonstrations for diff --git a/build/docker/build-image.sh b/build/docker/build-image.sh index b77f5ca49..be52ebf64 100755 --- a/build/docker/build-image.sh +++ b/build/docker/build-image.sh @@ -29,7 +29,12 @@ if [ -d $(dirname $0)/../../vendor ] ; then BUILD_ARGS="${BUILD_ARGS} --build-arg DIR=/go/src/github.com/intel/intel-device-plugins-for-kubernetes --build-arg GO111MODULE=off" fi -BUILD_ARGS="${BUILD_ARGS} --build-arg FINAL_BASE=gcr.io/distroless/static" +BUILD_ARGS="${BUILD_ARGS} \ + --build-arg FINAL_BASE=gcr.io/distroless/static \ + --build-arg BUILD_BASE=golang:1.23-bookworm \ + --build-arg FINAL_BASE_DYN=debian:unstable-slim \ + --build-arg ROCKYLINUX=0" + if [ -z "${BUILDER}" -o "${BUILDER}" = 'docker' -o "${BUILDER}" = 'podman' ] ; then ${BUILDER} build --pull -t ${IMG}:${TAG} ${BUILD_ARGS} -f ${DOCKERFILE} . elif [ "${BUILDER}" = 'buildah' ] ; then diff --git a/build/docker/intel-gpu-levelzero.Dockerfile b/build/docker/intel-gpu-levelzero.Dockerfile new file mode 100644 index 000000000..09740816b --- /dev/null +++ b/build/docker/intel-gpu-levelzero.Dockerfile @@ -0,0 +1,91 @@ +## This is a generated file, do not edit directly. Edit build/docker/templates/intel-gpu-levelzero.Dockerfile.in instead. +## +## Copyright 2022 Intel Corporation. All Rights Reserved. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+## See the License for the specific language governing permissions and +## limitations under the License. +### +ARG CMD=gpu_levelzero +ARG ROCKYLINUX=1 +## FINAL_BASE_DYN can be used to configure the base image of the final image. +## The project default is 1) which sets FINAL_BASE_DYN=gcr.io/distroless/cc-debian12 +## (see build-image.sh). +## 2) and the default FINAL_BASE is primarily used to build Redhat Certified Openshift Operator container images that must be UBI based. +## The RedHat build tool does not allow additional image build parameters. +ARG BUILD_BASE=rockylinux:9 +ARG FINAL_BASE_DYN=registry.access.redhat.com/ubi9/ubi-minimal:9.3 +### +FROM ${BUILD_BASE} as builder +ARG DIR=/intel-device-plugins-for-kubernetes +ENV CGO_CFLAGS="-pipe -fno-plt" +ENV CGO_LDFLAGS="-fstack-protector-strong -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now,-z,noexecstack,-z,defs,-s,-w" +ENV CGOFLAGS="-trimpath -mod=readonly -buildmode=pie" +ENV GCFLAGS="all=-spectre=all -N -l" +ENV ASMFLAGS="all=-spectre=all" +ENV LDFLAGS="all=-linkmode=external -s -w" +ARG GOLICENSES_VERSION +ARG CMD +ARG ROCKYLINUX +ARG CGO_VERSION=1.23 +RUN mkdir /runtime +RUN if [ $ROCKYLINUX -eq 0 ]; then \ + apt-get update && apt-get install --no-install-recommends -y wget libc6-dev ca-certificates ocl-icd-libopencl1 && \ + cd /runtime && \ + wget -q https://github.com/intel/compute-runtime/releases/download/24.26.30049.6/intel-level-zero-gpu_1.3.30049.6_amd64.deb && \ + wget -q https://github.com/intel/compute-runtime/releases/download/24.26.30049.6/intel-opencl-icd_24.26.30049.6_amd64.deb && \ + wget -q https://github.com/intel/compute-runtime/releases/download/24.26.30049.6/libigdgmm12_22.3.20_amd64.deb && \ + wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.17.6/level-zero-devel_1.17.6+u22.04_amd64.deb && \ + wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.17.6/level-zero_1.17.6+u22.04_amd64.deb && \ + dpkg 
--ignore-depends=intel-igc-core,intel-igc-opencl -i *.deb && \ + rm -rf /var/lib/apt/lists/\*; \ + else \ + source /etc/os-release && dnf install -y gcc jq wget 'dnf-command(config-manager)' && \ + dnf config-manager --add-repo https://repositories.intel.com/gpu/rhel/${VERSION_ID}/lts/2350/unified/intel-gpu-${VERSION_ID}.repo && \ + dnf install -y intel-opencl level-zero level-zero-devel intel-level-zero-gpu intel-gmmlib intel-ocloc && \ + dnf clean all && \ + LATEST_GO=$(curl --no-progress-meter https://go.dev/dl/?mode=json | jq ".[] | select(.version | startswith(\"go${CGO_VERSION}\")).version" | tr -d "\"") && \ + wget -q https://go.dev/dl/$LATEST_GO.linux-amd64.tar.gz -O - | tar -xz -C /usr/local && \ + cp -a /etc/OpenCL /usr/lib64/libocloc.so /usr/lib64/libze_intel_gpu.* /usr/lib64/libze_loader.* /usr/lib64/libigdgmm.* /runtime/ && \ + mkdir /runtime/licenses/ && cd /usr/share/licenses/ && cp -a level-zero intel-gmmlib intel-level-zero-gpu intel-ocloc /runtime/licenses/; \ + fi +ARG EP=/usr/local/bin/intel_gpu_levelzero +ARG CMD +WORKDIR ${DIR} +COPY . . +RUN export PATH=$PATH:/usr/local/go/bin/ && cd cmd/${CMD} && \ + GO111MODULE=on CGO_ENABLED=1 go install $CGOFLAGS --gcflags="$GCFLAGS" --asmflags="$ASMFLAGS" --ldflags="$LDFLAGS" +RUN [ $ROCKYLINUX -eq 0 ] && install -D /go/bin/${CMD} /install_root${EP} || install -D /root/go/bin/${CMD} /install_root${EP} +RUN install -D ${DIR}/LICENSE /install_root/licenses/intel-device-plugins-for-kubernetes/LICENSE \ + && if [ ! 
-d "licenses/$CMD" ] ; then \ + GO111MODULE=on go run github.com/google/go-licenses@${GOLICENSES_VERSION} save "./cmd/$CMD" \ + --save_path /install_root/licenses/$CMD/go-licenses ; \ + else mkdir -p /install_root/licenses/$CMD/go-licenses/ && cd licenses/$CMD && cp -r * /install_root/licenses/$CMD/go-licenses/ ; fi +FROM ${FINAL_BASE_DYN} +ARG CMD +ARG ROCKYLINUX +COPY --from=builder /runtime /runtime +RUN if [ $ROCKYLINUX -eq 0 ]; then \ + apt-get update && apt-get install --no-install-recommends -y ocl-icd-libopencl1 && \ + rm /runtime/level-zero-devel_*.deb && \ + cd /runtime && dpkg --ignore-depends=intel-igc-core,intel-igc-opencl -i *.deb && rm -rf /runtime && \ + rm "/lib/x86_64-linux-gnu/libze_validation"* && rm "/lib/x86_64-linux-gnu/libze_tracing_layer"*; \ + else \ + cp -a /runtime//*.so* /usr/lib64/ && cp -a /runtime/OpenCL /etc/ && cp -a /runtime/licenses/* /usr/share/licenses/; \ + fi +COPY --from=builder /install_root / +ENTRYPOINT ["/usr/local/bin/intel_gpu_levelzero"] +LABEL vendor='Intel®' +LABEL version='devel' +LABEL release='1' +LABEL name='intel-gpu-levelzero' +LABEL summary='Intel® GPU levelzero for Kubernetes' +LABEL description='The GPU levelzero container provides access to Levelzero API for the Intel GPU plugin' diff --git a/build/docker/templates/intel-gpu-levelzero.Dockerfile.in b/build/docker/templates/intel-gpu-levelzero.Dockerfile.in new file mode 100644 index 000000000..829233331 --- /dev/null +++ b/build/docker/templates/intel-gpu-levelzero.Dockerfile.in @@ -0,0 +1,87 @@ +#define _ENTRYPOINT_ /usr/local/bin/intel_gpu_levelzero + +ARG CMD=gpu_levelzero +ARG ROCKYLINUX=1 + +## FINAL_BASE_DYN can be used to configure the base image of the final image. +## The project default is 1) which sets FINAL_BASE_DYN=gcr.io/distroless/cc-debian12 +## (see build-image.sh). +## 2) and the default FINAL_BASE is primarily used to build Redhat Certified Openshift Operator container images that must be UBI based. 
+## The RedHat build tool does not allow additional image build parameters. +ARG BUILD_BASE=rockylinux:9 +ARG FINAL_BASE_DYN=registry.access.redhat.com/ubi9/ubi-minimal:9.3 +### + +FROM ${BUILD_BASE} as builder + +ARG DIR=/intel-device-plugins-for-kubernetes + +ENV CGO_CFLAGS="-pipe -fno-plt" +ENV CGO_LDFLAGS="-fstack-protector-strong -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now,-z,noexecstack,-z,defs,-s,-w" +ENV CGOFLAGS="-trimpath -mod=readonly -buildmode=pie" +ENV GCFLAGS="all=-spectre=all -N -l" +ENV ASMFLAGS="all=-spectre=all" +ENV LDFLAGS="all=-linkmode=external -s -w" + +ARG GOLICENSES_VERSION +ARG CMD +ARG ROCKYLINUX +ARG CGO_VERSION=1.23 + +RUN mkdir /runtime + +RUN if [ $ROCKYLINUX -eq 0 ]; then \N + apt-get update && apt-get install --no-install-recommends -y wget libc6-dev ca-certificates ocl-icd-libopencl1 && \N + cd /runtime && \N + wget -q https://github.com/intel/compute-runtime/releases/download/24.26.30049.6/intel-level-zero-gpu_1.3.30049.6_amd64.deb && \N + wget -q https://github.com/intel/compute-runtime/releases/download/24.26.30049.6/intel-opencl-icd_24.26.30049.6_amd64.deb && \N + wget -q https://github.com/intel/compute-runtime/releases/download/24.26.30049.6/libigdgmm12_22.3.20_amd64.deb && \N + wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.17.6/level-zero-devel_1.17.6+u22.04_amd64.deb && \N + wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.17.6/level-zero_1.17.6+u22.04_amd64.deb && \N + dpkg --ignore-depends=intel-igc-core,intel-igc-opencl -i *.deb && \N + rm -rf /var/lib/apt/lists/\*; \N + else \N + source /etc/os-release && dnf install -y gcc jq wget 'dnf-command(config-manager)' && \N + dnf config-manager --add-repo https://repositories.intel.com/gpu/rhel/${VERSION_ID}/lts/2350/unified/intel-gpu-${VERSION_ID}.repo && \N + dnf install -y intel-opencl level-zero level-zero-devel intel-level-zero-gpu intel-gmmlib intel-ocloc && \N + dnf clean all && \N + LATEST_GO=$(curl 
--no-progress-meter https://go.dev/dl/?mode=json | jq ".[] | select(.version | startswith(\"go${CGO_VERSION}\")).version" | tr -d "\"") && \N + wget -q https://go.dev/dl/$LATEST_GO.linux-amd64.tar.gz -O - | tar -xz -C /usr/local && \N + cp -a /etc/OpenCL /usr/lib64/libocloc.so /usr/lib64/libze_intel_gpu.* /usr/lib64/libze_loader.* /usr/lib64/libigdgmm.* /runtime/ && \N + mkdir /runtime/licenses/ && cd /usr/share/licenses/ && cp -a level-zero intel-gmmlib intel-level-zero-gpu intel-ocloc /runtime/licenses/; \N + fi + +ARG EP=_ENTRYPOINT_ +ARG CMD + +WORKDIR ${DIR} +COPY . . + +RUN export PATH=$PATH:/usr/local/go/bin/ && cd cmd/${CMD} && \N + GO111MODULE=on CGO_ENABLED=1 go install $CGOFLAGS --gcflags="$GCFLAGS" --asmflags="$ASMFLAGS" --ldflags="$LDFLAGS" +RUN [ $ROCKYLINUX -eq 0 ] && install -D /go/bin/${CMD} /install_root${EP} || install -D /root/go/bin/${CMD} /install_root${EP} + +#include "default_licenses.docker" + +FROM ${FINAL_BASE_DYN} + +ARG CMD +ARG ROCKYLINUX + +COPY --from=builder /runtime /runtime + +RUN if [ $ROCKYLINUX -eq 0 ]; then \N + apt-get update && apt-get install --no-install-recommends -y ocl-icd-libopencl1 && \N + rm /runtime/level-zero-devel_*.deb && \N + cd /runtime && dpkg --ignore-depends=intel-igc-core,intel-igc-opencl -i *.deb && rm -rf /runtime && \N + rm "/lib/x86_64-linux-gnu/libze_validation"* && rm "/lib/x86_64-linux-gnu/libze_tracing_layer"*; \N + else \N + cp -a /runtime//*.so* /usr/lib64/ && cp -a /runtime/OpenCL /etc/ && cp -a /runtime/licenses/* /usr/share/licenses/; \N + fi + +#include "default_end.docker" +#include "default_labels.docker" + +LABEL name='intel-gpu-levelzero' +LABEL summary='Intel® GPU levelzero for Kubernetes' +LABEL description='The GPU levelzero container provides access to Levelzero API for the Intel GPU plugin' diff --git a/cmd/gpu_levelzero/README.md b/cmd/gpu_levelzero/README.md new file mode 100644 index 000000000..871a4a223 --- /dev/null +++ b/cmd/gpu_levelzero/README.md @@ -0,0 +1,38 @@ +# Intel GPU 
Level-Zero sidecar
+
+Table of Contents
+
+* [Introduction](#introduction)
+* [Install](#install)
+
+## Introduction
+
+Intel GPU Level-Zero sidecar is an extension for the Intel GPU plugin to query additional GPU details from the oneAPI/Level-Zero API. As the Level-Zero is a C/C++ API, it is preferred to keep the original GPU plugin as-is and add the additional functionality via the Level-Zero sidecar. The GPU plugin can be configured to use the Level-Zero sidecar with an overlay, see [install](#install).
+
+Intel GPU plugin and the Level-Zero sidecar communicate via gRPC on a local socket visible only to the containers.
+
+> **NOTE**: Intel Device Plugin Operator doesn't yet support enabling Level-Zero sidecar in the GPU CR object.
+
+## Modes and Configuration Options
+
+| Flag | Argument | Default | Meaning |
+|:---- |:-------- |:------- |:------- |
+| -socket | unix socket path | /var/lib/levelzero/server.sock | Unix socket path which the server registers itself into. |
+| -wsl | - | disabled | Adapt sidecar to run in the WSL environment. |
+| -v | verbosity | 1 | Log verbosity |
+
+## Install
+
+Installing the sidecar along with the GPU plugin happens via two possible overlays: [health](../../deployments/gpu_plugin/overlays/health/) and [wsl](../../deployments/gpu_plugin/overlays/wsl/).
+
+Health overlay adds the sidecar to the base GPU plugin deployment and configures GPU plugin to retrieve device health indicators from the Level-Zero API:
+
+```bash
+$ kubectl apply -k deployments/gpu_plugin/overlays/health
+```
+
+WSL layer enables Intel GPU detection with WSL (Windows Subsystem for Linux) Kubernetes clusters. It also leverages the Level-Zero sidecar:
+
+```bash
+$ kubectl apply -k deployments/gpu_plugin/overlays/wsl
+```
diff --git a/cmd/gpu_levelzero/main.go b/cmd/gpu_levelzero/main.go
new file mode 100644
index 000000000..7ff2d30ef
--- /dev/null
+++ b/cmd/gpu_levelzero/main.go
@@ -0,0 +1,285 @@
+// Copyright 2024 Intel Corporation. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+// #cgo CFLAGS: "-I/usr/include/level_zero" "-Wall" "-Wextra" "-O2"
+// #cgo LDFLAGS: "-lze_loader"
+// #include <stdlib.h>
+// #include "ze.h"
+import "C"
+
+import (
+	"context"
+	"flag"
+	"net"
+	"os"
+	"strconv"
+	"unsafe"
+
+	levelzero "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero"
+	"google.golang.org/grpc"
+	"k8s.io/klog/v2"
+)
+
+// statusDescriptionSize is the size of the buffer handed to ze_status_to_string.
+const statusDescriptionSize = 64
+
+// server implements the Levelzero gRPC service on top of the cgo helpers in ze.h.
+type server struct {
+	levelzero.UnimplementedLevelzeroServer
+}
+
+// retrieveStatusDescription converts a Level-Zero status code into the text
+// produced by the C helper ze_status_to_string.
+func retrieveStatusDescription(code uint32) string {
+	b := make([]byte, statusDescriptionSize)
+
+	written := C.ze_status_to_string(C.uint32_t(code), (*C.char)(unsafe.Pointer(&b[0])), C.uint32_t(len(b)))
+	if written <= 0 {
+		return "failed to retrieve description"
+	}
+
+	// The helper returns snprintf's result, which exceeds the buffer size when
+	// the text was truncated. Read up to the NUL terminator instead of slicing
+	// by that value; snprintf always terminates what it stores.
+	return C.GoString((*C.char)(unsafe.Pointer(&b[0])))
+}
+
+// newError builds the Error message attached to every response. It is never
+// nil; a zero Errorcode means that the Level-Zero call succeeded.
+func newError(code uint32) *levelzero.Error {
+	e := &levelzero.Error{}
+	if code != 0 {
+		e.Errorcode = code
+		e.Description = retrieveStatusDescription(code)
+	}
+
+	return e
+}
+
+// lastFailure returns the last non-zero status code in codes, or 0 when all
+// of them are zero.
+func lastFailure(codes ...uint32) uint32 {
+	var failure uint32
+
+	for _, code := range codes {
+		if code != 0 {
+			failure = code
+		}
+	}
+
+	return failure
+}
+
+// GetDeviceHealth reports memory and bus health for the device at the given
+// PCI BDF address. Level-Zero failures are reported through the Error field of
+// the response; the returned Go error is always nil.
+func (s *server) GetDeviceHealth(c context.Context, deviceid *levelzero.DeviceId) (*levelzero.DeviceHealth, error) {
+	klog.V(3).Infof("Retrieve device health for %s", deviceid.BdfAddress)
+
+	// C.CString allocates with malloc, so the string must be freed explicitly.
+	cBdfAddress := C.CString(deviceid.BdfAddress)
+	defer C.free(unsafe.Pointer(cBdfAddress))
+
+	// Each read gets its own status so that a failure is logged only once and
+	// is not attributed to the next read.
+	var memErr, busErr uint32
+
+	memHealth := bool(C.zes_device_memory_is_healthy(cBdfAddress, (*C.uint32_t)(unsafe.Pointer(&memErr))))
+	if memErr != 0 {
+		klog.Warningf("device memory health read returned an error: 0x%X", memErr)
+	}
+
+	busHealth := bool(C.zes_device_bus_is_healthy(cBdfAddress, (*C.uint32_t)(unsafe.Pointer(&busErr))))
+	if busErr != 0 {
+		klog.Warningf("device bus health read returned an error: 0x%X", busErr)
+	}
+
+	errorVal := lastFailure(memErr, busErr)
+	if errorVal == 0 {
+		klog.V(3).Infof("Health for %s: Memory=%t, Bus=%t", deviceid.BdfAddress, memHealth, busHealth)
+	}
+
+	health := &levelzero.DeviceHealth{
+		BusOk:    busHealth,
+		MemoryOk: memHealth,
+		SocOk:    true, // Placeholder, not available.
+		Error:    newError(errorVal),
+	}
+
+	return health, nil
+}
+
+// readMaxTemperature calls zes_device_temp_max for one named sensor of the
+// device and returns the reported value together with the status code.
+func readMaxTemperature(cBdfAddress *C.char, sensor string) (float64, uint32) {
+	var errorVal uint32
+
+	cSensor := C.CString(sensor)
+	defer C.free(unsafe.Pointer(cSensor))
+
+	temp := C.zes_device_temp_max(cBdfAddress, cSensor, (*C.uint32_t)(unsafe.Pointer(&errorVal)))
+
+	return float64(temp), errorVal
+}
+
+// GetDeviceTemperature reports the "global", "gpu" and "memory" sensor
+// readings for the device at the given PCI BDF address. Level-Zero failures
+// are reported through the Error field; the returned Go error is always nil.
+func (s *server) GetDeviceTemperature(c context.Context, deviceid *levelzero.DeviceId) (*levelzero.DeviceTemperature, error) {
+	klog.V(3).Infof("Retrieve device temperature for %s", deviceid.BdfAddress)
+
+	cBdfAddress := C.CString(deviceid.BdfAddress)
+	defer C.free(unsafe.Pointer(cBdfAddress))
+
+	globalTemp, globalErr := readMaxTemperature(cBdfAddress, "global")
+	if globalErr != 0 {
+		klog.Warningf("global temperature read returned an error: 0x%X", globalErr)
+	}
+
+	gpuTemp, gpuErr := readMaxTemperature(cBdfAddress, "gpu")
+	if gpuErr != 0 {
+		klog.Warningf("gpu temperature read returned an error: 0x%X", gpuErr)
+	}
+
+	memTemp, memErr := readMaxTemperature(cBdfAddress, "memory")
+	if memErr != 0 {
+		klog.Warningf("memory temperature read returned an error: 0x%X", memErr)
+	}
+
+	errorVal := lastFailure(globalErr, gpuErr, memErr)
+	if errorVal == 0 {
+		klog.V(3).Infof("Temperatures for %s: Memory=%.1fC, GPU=%.1fC, Global=%.1fC", deviceid.BdfAddress, memTemp, gpuTemp, globalTemp)
+	}
+
+	temps := &levelzero.DeviceTemperature{
+		Global: globalTemp,
+		Gpu:    gpuTemp,
+		Memory: memTemp,
+		Error:  newError(errorVal),
+	}
+
+	return temps, nil
+}
+
+// GetIntelIndices returns the Level-Zero device indices that belong to Intel
+// devices. At most 8 indices are returned. Level-Zero failures are reported
+// through the Error field; the returned Go error is always nil.
+func (s *server) GetIntelIndices(c context.Context, m *levelzero.GetIntelIndicesMessage) (*levelzero.DeviceIndices, error) {
+	klog.V(3).Infof("Retrieve Intel indices")
+
+	var errorVal uint32
+
+	indices := make([]uint32, 8)
+
+	// TODO: Move to zes_ version when crash in WSL env is fixed:
+	// https://github.com/intel/compute-runtime/issues/721
+	count := int(C.ze_intel_device_indices((*C.uint32_t)(&indices[0]), C.uint32_t(len(indices)), (*C.uint32_t)(unsafe.Pointer(&errorVal))))
+	if errorVal != 0 {
+		klog.Warningf("Intel device indices read returned an error: 0x%X", errorVal)
+	}
+
+	// Guard the slice bounds against an unexpected count from the C side.
+	if count < 0 {
+		count = 0
+	} else if count > len(indices) {
+		count = len(indices)
+	}
+
+	ret := &levelzero.DeviceIndices{
+		Indices: indices[:count],
+		Error:   newError(errorVal),
+	}
+
+	return ret, nil
+}
+
+// GetDeviceMemoryAmount reports the value returned by zes_device_memory_amount
+// for the device at the given PCI BDF address. Level-Zero failures are
+// reported through the Error field; the returned Go error is always nil.
+func (s *server) GetDeviceMemoryAmount(c context.Context, deviceid *levelzero.DeviceId) (*levelzero.DeviceMemoryAmount, error) {
+	klog.V(3).Infof("Retrieve device memory amount for %s", deviceid.BdfAddress)
+
+	var errorVal uint32
+
+	cBdfAddress := C.CString(deviceid.BdfAddress)
+	defer C.free(unsafe.Pointer(cBdfAddress))
+
+	memSize := C.zes_device_memory_amount(cBdfAddress, (*C.uint32_t)(unsafe.Pointer(&errorVal)))
+	if errorVal != 0 {
+		klog.Warningf("device memory amount read returned an error: 0x%X", errorVal)
+	}
+
+	ret := &levelzero.DeviceMemoryAmount{
+		MemorySize: uint64(memSize),
+		Error:      newError(errorVal),
+	}
+
+	return ret, nil
+}
+
+// main parses the flags, initializes Level-Zero and serves the Levelzero gRPC
+// service on a Unix socket until the server fails.
+func main() {
+	klog.InitFlags(nil)
+
+	socketPath := flag.String("socket", levelzero.DefaultUnixSocketPath, "Unix socket path to listen on")
+	wslEnv := flag.Bool("wsl", false, "Running in WSL environment")
+
+	flag.Parse()
+
+	// Delete possible previous socket file; a missing file is the normal case.
+	if err := os.Remove(*socketPath); err != nil && !os.IsNotExist(err) {
+		klog.Warningf("failed to remove previous socket file %s: %v", *socketPath, err)
+	}
+
+	// Forward the value of the "v" flag to the C side logger.
+	verbosity := int64(0)
+
+	if f := flag.Lookup("v"); f != nil {
+		if v, err := strconv.ParseInt(f.Value.String(), 10, 16); err == nil {
+			verbosity = v
+		}
+	}
+
+	lis, err := net.Listen("unix", *socketPath)
+	if err != nil {
+		klog.Fatalf("failed to listen: %v", err)
+	}
+
+	// TODO: Drop "ze_try_initialize" when crash in WSL env is fixed:
+	// https://github.com/intel/compute-runtime/issues/721
+	if *wslEnv {
+		if !bool(C.ze_try_initialize()) {
+			klog.Fatal("Ze Init try failed, cannot continue")
+		}
+	} else {
+		if !bool(C.zes_try_initialize()) {
+			klog.Fatal("Zes Init try failed, cannot continue")
+		}
+	}
+
+	C.zes_set_verbosity(C.int(verbosity))
+
+	s := grpc.NewServer()
+
+	levelzero.RegisterLevelzeroServer(s, &server{})
+
+	klog.Infof("server listening at %v", lis.Addr())
+
+	if err := s.Serve(lis); err != nil {
+		klog.Fatalf("failed to serve: %v", err)
+	}
+}
diff --git a/cmd/gpu_levelzero/main_test.go b/cmd/gpu_levelzero/main_test.go
new file mode 100644
index 000000000..8f5c7cfe0
--- /dev/null
+++ b/cmd/gpu_levelzero/main_test.go
@@ -0,0 +1,88 @@
+// Copyright 2024 Intel Corporation. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"context"
+	"testing"
+
+	levelzero "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero"
+)
+
+// TestErrorConversion checks the descriptions of two known status codes.
+func TestErrorConversion(t *testing.T) {
+	t.Run("Known conversion(s)", func(t *testing.T) {
+		desc := retrieveStatusDescription(0)
+		if desc != "success (0x0)" {
+			t.Fatal("couldn't convert 0 to success: ", desc)
+		}
+
+		desc = retrieveStatusDescription(1879048193) // device lost
+		if desc != "device lost (0x70000001)" {
+			t.Fatal("couldn't convert 0x70000001 to device lost: ", desc)
+		}
+	})
+}
+
+// TestCallingMethods smoke-tests every gRPC handler of the server.
+func TestCallingMethods(t *testing.T) {
+	s := server{}
+
+	// As we cannot control the testing environment, we can't really check the return values for any sane values.
+
+	t.Run("Call get indices", func(t *testing.T) {
+		indices, err := s.GetIntelIndices(context.Background(), &levelzero.GetIntelIndicesMessage{})
+
+		if err != nil {
+			t.Fatal("Received an error: ", err)
+		}
+		if len(indices.Indices) == 0 {
+			t.Log("No indices received")
+		}
+	})
+
+	t.Run("Call get health", func(t *testing.T) {
+		health, err := s.GetDeviceHealth(context.Background(), &levelzero.DeviceId{BdfAddress: "0000:00:01.0"})
+
+		if err != nil {
+			t.Fatal("Received an error: ", err)
+		}
+		if health.MemoryOk {
+			t.Log("Memory is ok")
+		}
+	})
+
+	t.Run("Call get temperature", func(t *testing.T) {
+		temps, err := s.GetDeviceTemperature(context.Background(), &levelzero.DeviceId{BdfAddress: "0000:00:01.0"})
+
+		if err != nil {
+			t.Fatal("Received an error: ", err)
+		}
+		if temps.Global > -999.0 {
+			t.Log("Global temperature received")
+		}
+	})
+
+	t.Run("Call get memory", func(t *testing.T) {
+		amount, err := s.GetDeviceMemoryAmount(context.Background(), &levelzero.DeviceId{BdfAddress: "0000:00:01.0"})
+
+		if err != nil {
+			t.Fatal("Received an error: ", err)
+		}
+		if amount.MemorySize > 0 {
+			t.Log("Received some memory")
+		}
+	})
+}
diff --git a/cmd/gpu_levelzero/ze.c b/cmd/gpu_levelzero/ze.c
new file mode 100644
index 000000000..cf738f2f1
--- /dev/null
+++ b/cmd/gpu_levelzero/ze.c
@@ -0,0 +1,193 @@
+// Copyright 2024 Intel Corporation. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <ze_api.h>
+
+#include "ze.h"
+
+/// @brief Render a Level-Zero status code as "<description> (0x<code>)"
+/// @param error Status code to describe
+/// @param out Buffer that receives the NUL-terminated text
+/// @param out_size Size of the buffer in bytes
+/// @return Number of characters stored without the terminator, negative on failure
+int ze_status_to_string(const uint32_t error, char* out, uint32_t out_size)
+{
+    const char* description;
+
+    if (out == NULL || out_size == 0) {
+        return -1;
+    }
+
+    switch (error) {
+    case ZE_RESULT_SUCCESS:
+        description = "success"; break;
+    case ZE_RESULT_NOT_READY:
+        description = "not ready"; break;
+    case ZE_RESULT_ERROR_DEVICE_LOST:
+        description = "device lost"; break;
+    case ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET:
+        description = "device requires reset"; break;
+    case ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE:
+        description = "device in low power state"; break;
+    case ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS:
+        description = "insufficient permissions"; break;
+    case ZE_RESULT_ERROR_NOT_AVAILABLE:
+        description = "not available"; break;
+    case ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE:
+        description = "dependency unavailable"; break;
+    case ZE_RESULT_ERROR_UNINITIALIZED:
+        description = "uninitialized"; break;
+    case ZE_RESULT_ERROR_UNSUPPORTED_VERSION:
+        description = "unsupported version"; break;
+    case ZE_RESULT_ERROR_UNSUPPORTED_FEATURE:
+        description = "unsupported feature"; break;
+    case ZE_RESULT_ERROR_INVALID_ARGUMENT:
+        description = "invalid argument"; break;
+    case ZE_RESULT_ERROR_INVALID_NULL_POINTER:
+        description = "invalid null pointer"; break;
+    case ZE_RESULT_ERROR_INVALID_NULL_HANDLE:
+        description = "invalid null handle"; break;
+    case ZE_RESULT_ERROR_UNKNOWN:
+        description = "unknown"; break;
+    default:
+        description = "not known"; break;
+    }
+
+    int len = snprintf(out, out_size, "%s (0x%X)", description, error);
+
+    // snprintf returns the untruncated length; report what was really stored.
+    if (len > 0 && (uint32_t)len >= out_size) {
+        len = (int)(out_size - 1);
+    }
+
+    return len;
+}
+
+/// @brief Initialize Level-Zero for GPUs and fetch the first driver handle
+/// @return Driver handle, or 0 on failure
+static ze_driver_handle_t initialize_ze(void)
+{
+    ze_result_t res = zeInit(ZE_INIT_FLAG_GPU_ONLY);
+    if (res != ZE_RESULT_SUCCESS) {
+        fprintf(stderr, "zeInit failed: 0x%X \n", res);
+
+        return 0;
+    }
+
+    uint32_t count = 0;
+
+    if (zeDriverGet(&count, NULL) != ZE_RESULT_SUCCESS || count == 0) {
+        fprintf(stderr, "zeDriverGet failed or no drivers\n");
+
+        return 0;
+    }
+
+    count = 1;
+
+    ze_driver_handle_t handle;
+    if (zeDriverGet(&count, &handle) != ZE_RESULT_SUCCESS) {
+        fprintf(stderr, "zeDriverGet failed\n");
+
+        return 0;
+    }
+
+    return handle;
+}
+
+/// @brief Try to initialize the Level-Zero core API
+/// @return true on success; always false when UNITTEST is set in the environment
+bool ze_try_initialize(void)
+{
+    if (getenv("UNITTEST") != NULL) {
+        return false;
+    }
+
+    return zeInit(0) == ZE_RESULT_SUCCESS;
+}
+
+/// @brief Retrieve indices for Intel levelzero devices
+/// @param indices Pointer to an array to store indices
+/// @param indices_size Size of the array
+/// @param error Set to a Level-Zero status code when the retrieval fails
+/// @return Number of indices stored
+int ze_intel_device_indices(uint32_t* indices, uint32_t indices_size, uint32_t *error)
+{
+    if (getenv("UNITTEST") != NULL) {
+        return 0;
+    }
+
+    if (indices == NULL || 0 == indices_size) {
+        *error = ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+
+        return 0;
+    }
+
+    ze_driver_handle_t handle = initialize_ze();
+
+    if (handle == 0) {
+        *error = ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+
+        return 0;
+    }
+
+    uint32_t count = 0;
+
+    ze_result_t res = zeDeviceGet(handle, &count, NULL);
+    if (res != ZE_RESULT_SUCCESS) {
+        *error = res;
+
+        return 0;
+    }
+
+    if (count == 0) {
+        *error = ZE_RESULT_ERROR_DEVICE_LOST;
+
+        return 0;
+    }
+
+    ze_device_handle_t dev_handle[count];
+
+    res = zeDeviceGet(handle, &count, dev_handle);
+    if (res != ZE_RESULT_SUCCESS) {
+        *error = res;
+
+        return 0;
+    }
+
+    uint32_t intel_device_count = 0;
+
+    // Scan every device, but never store more indices than the array can hold.
+    for (uint32_t i = 0; i < count && intel_device_count < indices_size; ++i) {
+        ze_device_properties_t dev_prop;
+        memset(&dev_prop, 0, sizeof(ze_device_properties_t));
+        // Level-Zero expects the structure type to be set by the caller.
+        dev_prop.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+
+        res = zeDeviceGetProperties(dev_handle[i], &dev_prop);
+        if (res != ZE_RESULT_SUCCESS) {
+            continue;
+        }
+
+        if (dev_prop.vendorId == VENDOR_ID_INTEL) {
+            indices[intel_device_count] = i;
+            intel_device_count++;
+        }
+    }
+
+    return (int)intel_device_count;
+}
diff --git a/cmd/gpu_levelzero/ze.h b/cmd/gpu_levelzero/ze.h
new file mode 100644
index 000000000..0d0155752
--- /dev/null
+++ b/cmd/gpu_levelzero/ze.h
@@ -0,0 +1,33 @@
+// Copyright 2024 Intel Corporation. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once + +#include +#include + +#define VENDOR_ID_INTEL 0x8086 + +void zes_set_verbosity(const int level); + +bool ze_try_initialize(void); +bool zes_try_initialize(void); + +int ze_status_to_string(const uint32_t error, char* out, uint32_t out_size); + +int ze_intel_device_indices(uint32_t* indices, uint32_t indices_size, uint32_t* error); +uint64_t zes_device_memory_amount(char* bdf_address, uint32_t* error); +bool zes_device_memory_is_healthy(char* bdf_address, uint32_t* error); +bool zes_device_bus_is_healthy(char* bdf_address, uint32_t* error); +double zes_device_temp_max(char* bdf_address, char* sensor, uint32_t* error); diff --git a/cmd/gpu_levelzero/zes.c b/cmd/gpu_levelzero/zes.c new file mode 100644 index 000000000..feeaa1c38 --- /dev/null +++ b/cmd/gpu_levelzero/zes.c @@ -0,0 +1,438 @@ +// Copyright 2024 Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include + +#include "ze.h" + +#define MAX_BDF_BUFSIZE 32 + +struct device_info { + char bdf[MAX_BDF_BUFSIZE]; +}; + +zes_device_handle_t* zes_handles = NULL; +struct device_info* bdf_addresses = NULL; +uint32_t zes_handles_count = 0; + +static bool device_enumerated = false; + +typedef enum { + LOG_ERROR = 1, + LOG_WARNING, + LOG_INFO, + LOG_DEBUG +} log_level_t; + +static log_level_t verbosity_level = LOG_ERROR; + +static void print_log(log_level_t level, char* fmt, ...) 
__attribute__ ((format (printf, 2, 3))); + +static void print_log(log_level_t level, char* fmt, ...) +{ + if (verbosity_level >= level) { + va_list args; + + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); + } +} + +void zes_set_verbosity(const int level) +{ + verbosity_level = level; + + fprintf(stderr, "C set verbosity level: %d\n", verbosity_level); +} + +bool zes_try_initialize(void) +{ + if (getenv("UNITTEST") != NULL) { + return false; + } + + return zesInit(0) == ZE_RESULT_SUCCESS; +} + +static ze_result_t enumerate_zes_devices(void) +{ + ze_result_t res = zesInit(0); + if (res != ZE_RESULT_SUCCESS) { + return res; + } + + uint32_t count = 0; + + res = zesDriverGet(&count, NULL); + if (res != ZE_RESULT_SUCCESS) { + return res; + } + + if (count == 0) { + return ZE_RESULT_ERROR_NOT_AVAILABLE; + } + + if (count > 1) { + print_log(LOG_WARNING, "more than one zes driver detected, using first one\n"); + } + + count = 1; + + zes_driver_handle_t handle; + res = zesDriverGet(&count, &handle); + if (res != ZE_RESULT_SUCCESS) { + return res; + } + + count = 0; + res = zesDeviceGet(handle, &count, NULL); + if (res != ZE_RESULT_SUCCESS) { + return res; + } + + if (count == 0) { + return ZE_RESULT_ERROR_NOT_AVAILABLE; + } + + zes_handles = calloc(count, sizeof(zes_device_handle_t)); + + res = zesDeviceGet(handle, &count, zes_handles); + if (res != ZE_RESULT_SUCCESS) { + free(zes_handles); + + return res; + } + + zes_handles_count = count; + + bdf_addresses = (struct device_info*) calloc(count,sizeof(struct device_info)); + if (bdf_addresses == NULL) { + free(zes_handles); + + return ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + // Iterate over the devices and store their info into the cache array + for (uint32_t i = 0; i < count; ++i) { + zes_device_handle_t dev_h = zes_handles[i]; + + zes_pci_properties_t pci_props; + if (zesDevicePciGetProperties(dev_h, &pci_props) != ZE_RESULT_SUCCESS) { + continue; + } + + zes_pci_address_t* addr = 
&pci_props.address; + + snprintf(bdf_addresses[i].bdf, sizeof(bdf_addresses[i].bdf), + "%04x:%02x:%02x.%x", + addr->domain, addr->bus, addr->device, addr->function + ); + } + + device_enumerated = true; + + return res; +} + +static zes_device_handle_t retrieve_handle_for_bdf(char* bdf_address) +{ + zes_device_handle_t handle = 0; + + for (uint32_t i = 0; i < zes_handles_count; ++i) { + struct device_info* di = &bdf_addresses[i]; + + if (strncmp(bdf_address, di->bdf, sizeof(di->bdf)) == 0) { + handle = zes_handles[i]; + break; + } + } + + return handle; +} + +static bool is_integrated(zes_device_handle_t handle) +{ + ze_result_t res = ZE_RESULT_SUCCESS; + + zes_device_ext_properties_t ext = { + .stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES, + }; + zes_device_properties_t props = { + .stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES, + .pNext = &ext, + }; + + if (res = zesDeviceGetProperties(handle, &props), res == ZE_RESULT_SUCCESS) { + if (ext.flags & ZES_DEVICE_PROPERTY_FLAG_INTEGRATED) { + return true; + } + } + + return false; +} + +/// @brief Retrieves memory amount for a specific device with bdf address +/// @param bdf_address +/// @return memory amount for the device +uint64_t zes_device_memory_amount(char* bdf_address, uint32_t* error) +{ + if (getenv("UNITTEST") != NULL) { + return 0; + } + + print_log(LOG_DEBUG, "Retrieve memory size for %s\n", bdf_address); + + ze_result_t res = ZE_RESULT_SUCCESS; + + if (!device_enumerated) { + res = enumerate_zes_devices(); + if (res != ZE_RESULT_SUCCESS) { + *error = res; + + return 0; + } + } + + zes_device_handle_t handle = retrieve_handle_for_bdf(bdf_address); + if (handle == 0) { + *error = ZE_RESULT_ERROR_UNKNOWN; + + return 0; + } + + // Levelzero does not provide memory details for integrated + if (is_integrated(handle)) { + print_log(LOG_DEBUG, "Device is integrated => no memory\n"); + + return 0; + } + + uint32_t modcount = 0; + uint64_t memory_size = 0; + if (!zesDeviceEnumMemoryModules(handle, &modcount, 
NULL) == ZE_RESULT_SUCCESS && modcount > 0) { + zes_mem_handle_t memhandles[modcount]; + + if (zesDeviceEnumMemoryModules(handle, &modcount, memhandles) == ZE_RESULT_SUCCESS) { + for (uint32_t mod_index = 0; mod_index < modcount; ++mod_index) { + zes_mem_state_t mem_state; + + if (zesMemoryGetState(memhandles[mod_index], &mem_state) == ZE_RESULT_SUCCESS) { + memory_size += mem_state.size; + } + } + } + } + + print_log(LOG_DEBUG, "> Memory size: %ld\n", memory_size); + + return memory_size; +} + +/// @brief Retrieve device memory's health status +/// @param bdf_address +/// @return true for good, false for bad +bool zes_device_memory_is_healthy(char* bdf_address, uint32_t* error) +{ + if (getenv("UNITTEST") != NULL) { + return false; + } + + print_log(LOG_DEBUG, "Fetching memory health for %s\n", bdf_address); + + if (!device_enumerated) { + ze_result_t res = enumerate_zes_devices(); + if (res != ZE_RESULT_SUCCESS) { + *error = res; + + return true; + } + } + + zes_device_handle_t handle = retrieve_handle_for_bdf(bdf_address); + if (handle == 0) { + *error = ZE_RESULT_ERROR_UNKNOWN; + + return true; + } + + // Levelzero does not provide memory details for integrated + if (is_integrated(handle)) { + return true; + } + + uint32_t modcount = 0; + if (zesDeviceEnumMemoryModules(handle, &modcount, NULL) == ZE_RESULT_SUCCESS && modcount > 0) { + zes_mem_handle_t memhandles[modcount]; + + if (zesDeviceEnumMemoryModules(handle, &modcount, memhandles) == ZE_RESULT_SUCCESS) { + for (uint32_t mod_index = 0; mod_index < modcount; ++mod_index) { + zes_mem_state_t mem_state; + + if (zesMemoryGetState(memhandles[mod_index], &mem_state) == ZE_RESULT_SUCCESS) { + if (mem_state.health >= ZES_MEM_HEALTH_CRITICAL) { + print_log(LOG_DEBUG, "> Health: Critical\n"); + + return false; + } + } + } + } + } + + print_log(LOG_DEBUG, "> Health: OK\n"); + + return true; +} + +/// @brief Retrieve device bus' health status +/// @param bdf_address +/// @return true for good, false for bad +bool 
zes_device_bus_is_healthy(char* bdf_address, uint32_t* error) +{ + if (getenv("UNITTEST") != NULL) { + return false; + } + + print_log(LOG_DEBUG, "Fetching bus health for %s\n", bdf_address); + + if (!device_enumerated) { + ze_result_t res = enumerate_zes_devices(); + if (res != ZE_RESULT_SUCCESS) { + *error = res; + + return true; + } + } + + zes_device_handle_t handle = retrieve_handle_for_bdf(bdf_address); + if (handle == 0) { + *error = ZE_RESULT_ERROR_UNKNOWN; + + return true; + } + + zes_pci_state_t pci_state; + memset(&pci_state, 0, sizeof(pci_state)); + + ze_result_t res = zesDevicePciGetState(handle, &pci_state); + if (res == ZE_RESULT_SUCCESS) { + if (pci_state.qualityIssues & ZES_PCI_LINK_QUAL_ISSUE_FLAG_SPEED) { + print_log(LOG_DEBUG, "> Health: Critical\n"); + + return false; + } + } else if (res != ZE_RESULT_ERROR_UNSUPPORTED_FEATURE) { + *error = res; + } + + print_log(LOG_DEBUG, "> Health: OK\n"); + + return true; +} + +/// @brief Retrieve device's temperatur for a sensor +/// @param bdf_address - bdf address +/// @param sensor - name of the sensor: global, gpu or memory +/// @return temperature for the sensor +double zes_device_temp_max(char* bdf_address, char* sensor, uint32_t* error) +{ + if (getenv("UNITTEST") != NULL) { + return -999.0; + } + + uint32_t requestedType = 0; + if (!strncmp("global", sensor, 6)) { + requestedType = ZES_TEMP_SENSORS_GLOBAL; + } else if (!strncmp("gpu", sensor, 3)) { + requestedType = ZES_TEMP_SENSORS_GPU; + } else if (!strncmp("memory", sensor, 6)) { + requestedType = ZES_TEMP_SENSORS_MEMORY; + } else { + *error = ZE_RESULT_ERROR_INVALID_ARGUMENT; + + return -999.0; + } + + print_log(LOG_DEBUG, "Fetch %s temperature for %s\n", sensor, bdf_address); + + if (!device_enumerated) { + ze_result_t res = enumerate_zes_devices(); + if (res != ZE_RESULT_SUCCESS) { + *error = res; + + return -999.0; + } + } + + zes_device_handle_t handle = retrieve_handle_for_bdf(bdf_address); + if (handle == 0) { + *error = 
ZE_RESULT_ERROR_UNKNOWN; + + return -999.0; + } + + uint32_t count = 0; + ze_result_t res = zesDeviceEnumTemperatureSensors(handle, &count, NULL); + if (res != ZE_RESULT_SUCCESS || count == 0) { + *error = res; + + return -999.0; + } + + zes_temp_handle_t tempHandles[count]; + res = zesDeviceEnumTemperatureSensors(handle, &count, tempHandles); + if (res != ZE_RESULT_SUCCESS) { + *error = res; + + return -999.0; + } + + for (uint32_t i = 0; i < count; ++i) { + zes_temp_properties_t props; + + res = zesTemperatureGetProperties(tempHandles[i], &props); + if (res != ZE_RESULT_SUCCESS) { + *error = res; + + return -999.0; + } + + if (props.type != requestedType) { + continue; + } + + double tempCelsius = 0.0; + res = zesTemperatureGetState(tempHandles[i], &tempCelsius); + if (res != ZE_RESULT_SUCCESS) { + *error = res; + + return -999.0; + } + + print_log(LOG_DEBUG, "> Temperature: %.1f\n", tempCelsius); + + return tempCelsius; + } + + *error = ZE_RESULT_ERROR_NOT_AVAILABLE; + + return -999.0; +} diff --git a/cmd/gpu_plugin/README.md b/cmd/gpu_plugin/README.md index c59ca0259..5a19ceedc 100644 --- a/cmd/gpu_plugin/README.md +++ b/cmd/gpu_plugin/README.md @@ -18,6 +18,7 @@ Table of Contents * [SR-IOV use with the plugin](#sr-iov-use-with-the-plugin) * [CDI support](#cdi-support) * [KMD and UMD](#kmd-and-umd) + * [Health management](#health-management) * [Issues with media workloads on multi-GPU setups](#issues-with-media-workloads-on-multi-gpu-setups) * [Workaround for QSV and VA-API](#workaround-for-qsv-and-va-api) @@ -56,6 +57,8 @@ For workloads on different KMDs, see [KMD and UMD](#kmd-and-umd). 
|:---- |:-------- |:------- |:------- | | -enable-monitoring | - | disabled | Enable '*_monitoring' resource that provides access to all Intel GPU devices on the node, [see use](./monitoring.md) | | -resource-manager | - | disabled | Enable fractional resource management, [see use](./fractional.md) | +| -health-management | - | disabled | Enable health management by requesting data from oneAPI/Level-Zero interface. Requires [GPU Level-Zero](../gpu_levelzero/) sidecar. See [health management](#health-management) | +| -wsl | - | disabled | Adapt plugin to run in the WSL environment. Requires [GPU Level-Zero](../gpu_levelzero/) sidecar. | | -shared-dev-num | int | 1 | Number of containers that can share the same GPU device | | -allocation-policy | string | none | 3 possible values: balanced, packed, none. For shared-dev-num > 1: _balanced_ mode spreads workloads among GPU devices, _packed_ mode fills one GPU fully before moving to next, and _none_ selects first available device from kubelet. Default is _none_. Allocation policy does not have an effect when resource manager is enabled. | @@ -257,6 +260,14 @@ Creating a workload that would support all the different KMDs is not currently p | Media | Default | [ENABLE_PRODUCTION_KMD](https://github.com/intel/media-driver/blob/a66b076e83876fbfa9c9ab633ad9c5517f8d74fd/CMakeLists.txt#L58) | [ENABLE_XE_KMD](https://github.com/intel/media-driver/blob/a66b076e83876fbfa9c9ab633ad9c5517f8d74fd/media_driver/cmake/linux/media_feature_flags_linux.cmake#L187-L190) | Xe with upstream or backport i915, not all three. | | Graphics | Default | Unknown | [intel-xe-kmd](https://gitlab.freedesktop.org/mesa/mesa/-/blob/e9169881dbd1f72eab65a68c2b8e7643f74489b7/meson_options.txt#L708) | i915 and xe KMDs can be supported at the same time. | +### Health management + +Kubernetes Device Plugin API allows passing device's healthiness to Kubelet. By default GPU plugin reports all devices to be `Healthy`. 
If health management is enabled, GPU plugin retrieves health related data from oneAPI/Level-Zero interface via [GPU levelzero](../gpu_levelzero/). Depending on the data received, GPU plugin will report device to be `Unhealthy` if: +1) Direct health indicators report issues: [memory](https://spec.oneapi.io/level-zero/latest/sysman/api.html#zes-mem-health-t) & [pci](https://spec.oneapi.io/level-zero/latest/sysman/api.html#zes-pci-link-status-t) +1) Device temperature is over the limit + +Temperature limit can be provided via the command line argument, default is 100C. + ### Issues with media workloads on multi-GPU setups OneVPL media API, 3D and compute APIs provide device discovery diff --git a/cmd/gpu_plugin/gpu_plugin.go b/cmd/gpu_plugin/gpu_plugin.go index f048b13d5..c3b6ed73b 100644 --- a/cmd/gpu_plugin/gpu_plugin.go +++ b/cmd/gpu_plugin/gpu_plugin.go @@ -23,6 +23,7 @@ import ( "path/filepath" "regexp" "sort" + "strconv" "strings" "time" @@ -31,8 +32,10 @@ import ( "k8s.io/klog/v2" pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" + "github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice" "github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm" "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler" + gpulevelzero "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero" dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin" cdispec "tags.cncf.io/container-device-interface/specs-go" ) @@ -40,6 +43,8 @@ import ( const ( sysfsDrmDirectory = "/sys/class/drm" devfsDriDirectory = "/dev/dri" + wslDxgPath = "/dev/dxg" + wslLibPath = "/usr/lib/wsl" nfdFeatureDir = "/etc/kubernetes/node-feature-discovery/features.d" resourceFilename = "intel-gpu-resources.txt" gpuDeviceRE = `^card[0-9]+$` @@ -51,6 +56,7 @@ const ( namespace = "gpu.intel.com" deviceTypeI915 = "i915" deviceTypeXe = "xe" + deviceTypeDxg = "dxg" deviceTypeDefault = deviceTypeI915 // telemetry 
resource settings. @@ -67,8 +73,11 @@ const ( type cliOptions struct { preferredAllocationPolicy string sharedDevNum int + temperatureLimit int enableMonitoring bool resourceManagement bool + wslScan bool + healthManagement bool } type rmWithMultipleDriversErr struct { @@ -209,14 +218,14 @@ func (dp *devicePlugin) pciAddressForCard(cardPath, cardName string) (string, er return "", err } - // Fetches the pci address for a drm card by reading the + // Fetches the PCI address for a drm card by reading the // symbolic link that the /sys/class/drm/cardX points to. // ../../devices/pci0000:00/0000:00:02.0/drm/card // -------------------------^^^^^^^^^^^^---------. pciAddress := filepath.Base(strings.TrimSuffix(linkPath, filepath.Join("drm", cardName))) if !dp.pciAddressReg.MatchString(pciAddress) { - klog.Warningf("Invalid pci address for %s: %s", cardPath, pciAddress) + klog.Warningf("Invalid PCI address for %s: %s", cardPath, pciAddress) return "", os.ErrInvalid } @@ -274,11 +283,13 @@ type devicePlugin struct { scanDone chan bool scanResources chan bool - resMan rm.ResourceManager + resMan rm.ResourceManager + levelzeroService levelzeroservice.LevelzeroService - sysfsDir string - devfsDir string - bypathDir string + sysfsDir string + devfsDir string + bypathDir string + healthStatuses map[string]string // Note: If restarting the plugin with a new policy, the allocations for existing pods remain with old policy. 
policy preferredAllocationPolicyFunc @@ -300,6 +311,7 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi scanDone: make(chan bool, 1), // buffered as we may send to it before Scan starts receiving from it bypathFound: true, scanResources: make(chan bool, 1), + healthStatuses: make(map[string]string), } if options.resourceManagement { @@ -325,15 +337,85 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi dp.policy = nonePolicy } - if _, err := os.ReadDir(dp.bypathDir); err != nil { - klog.Warningf("failed to read by-path dir: %+v", err) + if !options.wslScan { + if _, err := os.ReadDir(dp.bypathDir); err != nil { + klog.Warningf("failed to read by-path dir: %+v", err) - dp.bypathFound = false + dp.bypathFound = false + } } return dp } +func logHealthStatusChange(card, newStatus string, statuses map[string]string) { + prevState, found := statuses[card] + if !found { + klog.V(2).Infof("%s: new => %s", card, newStatus) + + statuses[card] = newStatus + } else if prevState != newStatus { + klog.V(2).Infof("%s: %s => %s", card, prevState, newStatus) + + statuses[card] = newStatus + } +} + +func (dp *devicePlugin) healthStatusForCard(cardPath string) string { + if dp.levelzeroService == nil { + return pluginapi.Healthy + } + + link, err := os.Readlink(filepath.Join(cardPath, "device")) + if err != nil { + klog.Warning("couldn't read device link for", cardPath) + + return pluginapi.Healthy + } + + health := pluginapi.Healthy + + // Check status changes after the function exits + defer func() { logHealthStatusChange(cardPath, health, dp.healthStatuses) }() + + bdfAddr := filepath.Base(link) + + dh, err := dp.levelzeroService.GetDeviceHealth(bdfAddr) + if err != nil { + klog.Warningf("Device health retrieval failed: %v", err) + + return health + } + + // Direct Health indicators + klog.V(4).Infof("Health indicators: Memory=%t, Bus=%t, SoC=%t", dh.Memory, dh.Bus, dh.SoC) + + if !dh.Memory || !dh.Bus || !dh.SoC { 
+ health = pluginapi.Unhealthy + + return health + } + + dt, err := dp.levelzeroService.GetDeviceTemperature(bdfAddr) + // In case of any errors, return the current health status + if err != nil { + klog.Warningf("Device temperature retrieval failed: %v", err) + + return health + } + + limit := float64(dp.options.temperatureLimit) + + // Temperatures for different areas + klog.V(4).Infof("Temperatures: Memory=%.1fC, GPU=%.1fC, Global=%.1fC", dh.MemoryTemperature, dh.GPUTemperature, dh.GlobalTemperature) + + if dt.GPU > limit || dt.Global > limit || dt.Memory > limit { + health = pluginapi.Unhealthy + } + + return health +} + // Implement the PreferredAllocator interface. func (dp *devicePlugin) GetPreferredAllocation(rqt *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) { if dp.resMan != nil { @@ -369,6 +451,68 @@ func (dp *devicePlugin) GetPreferredAllocation(rqt *pluginapi.PreferredAllocatio } func (dp *devicePlugin) Scan(notifier dpapi.Notifier) error { + if dp.options.wslScan { + return dp.wslGpuScan(notifier) + } else { + return dp.sysFsGpuScan(notifier) + } +} + +func (dp *devicePlugin) wslGpuScan(notifier dpapi.Notifier) error { + defer dp.scanTicker.Stop() + + klog.V(1).Infof("GPU (%s) resource share count = %d", deviceTypeDxg, dp.options.sharedDevNum) + + devSpecs := []pluginapi.DeviceSpec{ + { + HostPath: wslDxgPath, + ContainerPath: wslDxgPath, + Permissions: "rw", + }, + } + + mounts := []pluginapi.Mount{ + { + ContainerPath: wslLibPath, + HostPath: wslLibPath, + ReadOnly: true, + }, + } + + for { + indices, err := dp.levelzeroService.GetIntelIndices() + if err == nil { + klog.V(4).Info("Intel Level-Zero indices: ", indices) + + devTree := dpapi.NewDeviceTree() + + for _, index := range indices { + envs := map[string]string{ + rm.LevelzeroAffinityMaskEnvVar: strconv.Itoa(int(index)), + } + + deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, envs, nil, nil) + + for i := 0; i < 
dp.options.sharedDevNum; i++ { + devID := fmt.Sprintf("card%d-%d", index, i) + devTree.AddDevice(deviceTypeDxg, devID, deviceInfo) + } + } + + notifier.Notify(devTree) + } else { + klog.Warning("Failed to get Intel indices from Level-Zero") + } + + select { + case <-dp.scanDone: + return nil + case <-dp.scanTicker.C: + } + } +} + +func (dp *devicePlugin) sysFsGpuScan(notifier dpapi.Notifier) error { defer dp.scanTicker.Stop() klog.V(1).Infof("GPU (%s/%s) resource share count = %d", deviceTypeI915, deviceTypeXe, dp.options.sharedDevNum) @@ -566,7 +710,9 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) { mounts, cdiDevices := dp.createMountsAndCDIDevices(cardPath, name, devSpecs) - deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, nil, nil, cdiDevices) + health := dp.healthStatusForCard(cardPath) + + deviceInfo := dpapi.NewDeviceInfo(health, devSpecs, mounts, nil, nil, cdiDevices) for i := 0; i < dp.options.sharedDevNum; i++ { devID := fmt.Sprintf("%s-%d", name, i) @@ -627,7 +773,10 @@ func main() { flag.StringVar(&prefix, "prefix", "", "Prefix for devfs & sysfs paths") flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable '*_monitoring' (= all GPUs) resource") flag.BoolVar(&opts.resourceManagement, "resource-manager", false, "fractional GPU resource management") + flag.BoolVar(&opts.healthManagement, "health-management", false, "enable GPU health management") + flag.BoolVar(&opts.wslScan, "wsl", false, "scan for / use WSL devices") flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device") + flag.IntVar(&opts.temperatureLimit, "temp-limit", 100, "temperature limit at which device is marked unhealthy") flag.StringVar(&opts.preferredAllocationPolicy, "allocation-policy", "none", "modes of allocating GPU devices: balanced, packed and none") flag.Parse() @@ -651,6 +800,34 @@ func main() { plugin := newDevicePlugin(prefix+sysfsDrmDirectory, 
prefix+devfsDriDirectory, opts) + if plugin.options.wslScan { + klog.Info("WSL mode requested") + + if plugin.options.resourceManagement { + klog.Error("Resource management is not supported within WSL. Please disable resource management.") + + os.Exit(1) + } + + if plugin.options.enableMonitoring { + klog.Error("Monitoring is not supported within WSL. Please disable monitoring.") + + os.Exit(1) + } + + if plugin.options.healthManagement { + klog.Error("Health management is not supported within WSL. Please disable health management.") + + os.Exit(1) + } + } + + if plugin.options.healthManagement || plugin.options.wslScan { + plugin.levelzeroService = levelzeroservice.NewLevelzero(gpulevelzero.DefaultUnixSocketPath) + + go plugin.levelzeroService.Run(true) + } + if plugin.options.resourceManagement { // Start labeler to export labels file for NFD. nfdFeatureFile := path.Join(nfdFeatureDir, resourceFilename) @@ -659,7 +836,10 @@ func main() { // Labeler catches OS signals and calls os.Exit() after receiving any. 
go labeler.Run(prefix+sysfsDrmDirectory, nfdFeatureFile, - labelerMaxInterval, plugin.scanResources) + labelerMaxInterval, plugin.scanResources, plugin.levelzeroService, func() { + // Exit the whole app when labeler exits + os.Exit(0) + }) } manager := dpapi.NewManager(namespace, plugin) diff --git a/cmd/gpu_plugin/gpu_plugin_test.go b/cmd/gpu_plugin/gpu_plugin_test.go index 447a87e53..492e2ee69 100644 --- a/cmd/gpu_plugin/gpu_plugin_test.go +++ b/cmd/gpu_plugin/gpu_plugin_test.go @@ -27,6 +27,7 @@ import ( "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" "k8s.io/utils/strings/slices" + "github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice" "github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm" dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin" cdispec "tags.cncf.io/container-device-interface/specs-go" @@ -41,6 +42,7 @@ type mockNotifier struct { scanDone chan bool i915Count int xeCount int + dxgCount int i915monitorCount int xeMonitorCount int } @@ -50,6 +52,7 @@ func (n *mockNotifier) Notify(newDeviceTree dpapi.DeviceTree) { n.xeCount = len(newDeviceTree[deviceTypeXe]) n.xeMonitorCount = len(newDeviceTree[deviceTypeXe+monitorSuffix]) n.i915Count = len(newDeviceTree[deviceTypeI915]) + n.dxgCount = len(newDeviceTree[deviceTypeDxg]) n.i915monitorCount = len(newDeviceTree[deviceTypeDefault+monitorSuffix]) n.scanDone <- true @@ -72,18 +75,63 @@ func (m *mockResourceManager) SetTileCountPerCard(count uint64) { m.tileCount = count } +type mockL0Service struct { + indices []uint32 + memSize uint64 + healthy bool + fail bool +} + +func (m *mockL0Service) Run(keep bool) { +} +func (m *mockL0Service) Stop() { +} +func (m *mockL0Service) GetIntelIndices() ([]uint32, error) { + if m.fail { + return m.indices, errors.Errorf("error, error") + } + + return m.indices, nil +} +func (m *mockL0Service) GetDeviceHealth(bdfAddress string) (levelzeroservice.DeviceHealth, error) { + if m.fail { + return 
levelzeroservice.DeviceHealth{}, errors.Errorf("error, error") + } + + return levelzeroservice.DeviceHealth{Memory: m.healthy, Bus: m.healthy, SoC: m.healthy}, nil +} +func (m *mockL0Service) GetDeviceTemperature(bdfAddress string) (levelzeroservice.DeviceTemperature, error) { + if m.fail { + return levelzeroservice.DeviceTemperature{}, errors.Errorf("error, error") + } + + return levelzeroservice.DeviceTemperature{Global: 35.0, GPU: 35.0, Memory: 35.0}, nil +} +func (m *mockL0Service) GetDeviceMemoryAmount(bdfAddress string) (uint64, error) { + if m.fail { + return m.memSize, errors.Errorf("error, error") + } + + return m.memSize, nil +} + type TestCaseDetails struct { - name string + // possible mock l0 service + l0mock levelzeroservice.LevelzeroService // test-case environment - sysfsdirs []string + pciAddresses map[string]string sysfsfiles map[string][]byte symlinkfiles map[string]string + name string + sysfsdirs []string devfsdirs []string // how plugin should interpret it options cliOptions // what the result should be (i915) expectedI915Devs int expectedI915Monitors int + // what the result should be (dxg) + expectedDxgDevs int // what the result should be (xe) expectedXeDevs int expectedXeMonitors int @@ -99,6 +147,33 @@ func createTestFiles(root string, tc TestCaseDetails) (string, string, error) { } } + if err := os.MkdirAll(sysfs, 0750); err != nil { + return "", "", errors.Wrap(err, "Failed to create fake base sysfs directory") + } + + if len(tc.pciAddresses) > 0 { + if err := os.MkdirAll(filepath.Join(sysfs, ".devices"), 0750); err != nil { + return "", "", errors.Wrap(err, "Failed to create fake PCI address base") + } + + for pci, card := range tc.pciAddresses { + fullPci := filepath.Join(sysfs, ".devices", pci) + cardPath := filepath.Join(sysfs, card) + + if err := os.MkdirAll(fullPci, 0750); err != nil { + return "", "", errors.Wrap(err, "Failed to create fake PCI address entry") + } + + if err := os.MkdirAll(cardPath, 0750); err != nil { + return 
"", "", errors.Wrap(err, "Failed to create fake card entry") + } + + if err := os.Symlink(fullPci, filepath.Join(sysfs, card, "device")); err != nil { + return "", "", errors.Wrap(err, "Failed to create fake PCI address symlinks") + } + } + } + for _, sysfsdir := range tc.sysfsdirs { if err := os.MkdirAll(path.Join(sysfs, sysfsdir), 0750); err != nil { return "", "", errors.Wrap(err, "Failed to create fake device directory") @@ -444,6 +519,176 @@ func TestScan(t *testing.T) { } } +func TestScanWithHealth(t *testing.T) { + tcases := []TestCaseDetails{ + { + name: "one device with no symlink", + sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"}, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + }, + devfsdirs: []string{ + "card0", + "by-path/pci-0000:00:00.0-card", + "by-path/pci-0000:00:00.0-render", + }, + expectedI915Devs: 1, + }, + { + name: "one device with proper symlink", + pciAddresses: map[string]string{"0000:00:00.0": "card0"}, + sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"}, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + }, + devfsdirs: []string{ + "card0", + "by-path/pci-0000:00:00.0-card", + "by-path/pci-0000:00:00.0-render", + }, + expectedI915Devs: 1, + l0mock: &mockL0Service{ + healthy: true, + }, + }, + { + name: "one unhealthy device with proper symlink", + pciAddresses: map[string]string{"0000:00:00.0": "card0"}, + sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"}, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + }, + devfsdirs: []string{ + "card0", + "by-path/pci-0000:00:00.0-card", + "by-path/pci-0000:00:00.0-render", + }, + expectedI915Devs: 1, + l0mock: &mockL0Service{ + healthy: false, + }, + }, + { + name: "one device with proper symlink returns error", + pciAddresses: map[string]string{"0000:00:00.0": "card0"}, + sysfsdirs: []string{"card0/device/drm/card0", 
"card0/device/drm/controlD64"}, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + }, + devfsdirs: []string{ + "card0", + "by-path/pci-0000:00:00.0-card", + "by-path/pci-0000:00:00.0-render", + }, + expectedI915Devs: 1, + l0mock: &mockL0Service{ + fail: true, + }, + }, + } + + for _, tc := range tcases { + if tc.options.sharedDevNum == 0 { + tc.options.sharedDevNum = 1 + } + + t.Run(tc.name, func(t *testing.T) { + root, err := os.MkdirTemp("", "test_new_device_plugin") + if err != nil { + t.Fatalf("can't create temporary directory: %+v", err) + } + + // dirs/files need to be removed for the next test + defer os.RemoveAll(root) + + sysfs, devfs, err := createTestFiles(root, tc) + if err != nil { + t.Errorf("unexpected error: %+v", err) + } + + plugin := newDevicePlugin(sysfs, devfs, tc.options) + + plugin.levelzeroService = tc.l0mock + + notifier := &mockNotifier{ + scanDone: plugin.scanDone, + } + + err = plugin.Scan(notifier) + // Scans in GPU plugin never fail + if err != nil { + t.Errorf("unexpected error: %+v", err) + } + if tc.expectedI915Devs != notifier.i915Count { + t.Errorf("Expected %d, discovered %d devices (i915)", + tc.expectedI915Devs, notifier.i915Count) + } + if tc.expectedI915Monitors != notifier.i915monitorCount { + t.Errorf("Expected %d, discovered %d monitors (i915)", + tc.expectedI915Monitors, notifier.i915monitorCount) + } + }) + } +} + +func TestScanWsl(t *testing.T) { + tcases := []TestCaseDetails{ + { + name: "one wsl device", + expectedDxgDevs: 1, + l0mock: &mockL0Service{ + indices: []uint32{0}, + }, + }, + { + name: "four wsl device", + expectedDxgDevs: 4, + l0mock: &mockL0Service{ + indices: []uint32{0, 1, 2, 3}, + }, + }, + } + + for _, tc := range tcases { + if tc.options.sharedDevNum == 0 { + tc.options.sharedDevNum = 1 + } + + t.Run(tc.name, func(t *testing.T) { + root, err := os.MkdirTemp("", "test_new_device_plugin") + if err != nil { + t.Fatalf("can't create temporary directory: %+v", err) + } + + // 
dirs/files need to be removed for the next test + defer os.RemoveAll(root) + + sysfs, devfs, err := createTestFiles(root, tc) + if err != nil { + t.Errorf("unexpected error: %+v", err) + } + + plugin := newDevicePlugin(sysfs, devfs, tc.options) + plugin.options.wslScan = true + plugin.levelzeroService = tc.l0mock + + notifier := &mockNotifier{ + scanDone: plugin.scanDone, + } + + err = plugin.Scan(notifier) + // Scans in GPU plugin never fail + if err != nil { + t.Errorf("unexpected error: %+v", err) + } + if tc.expectedDxgDevs != notifier.dxgCount { + t.Errorf("Expected %d, discovered %d devices (dxg)", + tc.expectedI915Devs, notifier.i915Count) + } + }) + } +} + func TestScanFails(t *testing.T) { tc := TestCaseDetails{ name: "xe and i915 devices with rm will fail", @@ -582,11 +827,11 @@ func createBypathTestFiles(t *testing.T, card, root, linkFile string, bypathFile byPath := path.Join(root, "by-path") if linkFile != "" { - if err := os.MkdirAll(filepath.Dir(devPath), os.ModePerm); err != nil { + if err := os.MkdirAll(filepath.Dir(devPath), 0700); err != nil { t.Fatal("Couldn't create test dev dir", err) } - if err := os.MkdirAll(filepath.Dir(drmPath), os.ModePerm); err != nil { + if err := os.MkdirAll(filepath.Dir(drmPath), 0700); err != nil { t.Fatal("Couldn't create test drm dir", err) } @@ -595,12 +840,12 @@ func createBypathTestFiles(t *testing.T, card, root, linkFile string, bypathFile } if err := os.Symlink(devPath, drmPath); err != nil { - t.Fatal("Couldn't create symlink between pci path and sysfs drm path") + t.Fatal("Couldn't create symlink between PCI path and sysfs drm path") } } if len(bypathFiles) > 0 { - if err := os.MkdirAll(byPath, os.ModePerm); err != nil { + if err := os.MkdirAll(byPath, 0700); err != nil { t.Fatal("Mkdir failed:", byPath) } @@ -641,7 +886,7 @@ func TestBypath(t *testing.T) { 0, }, { - "invalid pci address", + "invalid PCI address", "00.10.2/00.334.302/0.0.1.00/000:ff:05.1/drm/" + cardName, []string{"pci-0000:0f:05.0-card", 
"pci-0000:0f:05.0-render"}, false, diff --git a/cmd/gpu_plugin/labels.md b/cmd/gpu_plugin/labels.md index 19a00c66e..3d3abbe94 100644 --- a/cmd/gpu_plugin/labels.md +++ b/cmd/gpu_plugin/labels.md @@ -57,7 +57,7 @@ The `numa-gpu-map` label is a list of numa to gpu mapping items separated by `_` ### PCI-groups (optional) -GPUs which share the same pci paths under `/sys/devices/pci*` can be grouped into a label. GPU nums are separated by '`.`' and +GPUs which share the same PCI paths under `/sys/devices/pci*` can be grouped into a label. GPU nums are separated by '`.`' and groups are separated by '`_`'. The label is created only if environment variable named `GPU_PCI_GROUPING_LEVEL` has a value greater than zero. GPUs are considered to belong to the same group, if as many identical folder names are found for the GPUs, as is the value of the environment variable. Counting starts from the folder name which starts with `pci`. @@ -70,7 +70,7 @@ name | type | description| |`gpu.intel.com/pci-groups`| string | list of pci-groups separated by '`_`'. GPU numbers in the groups are separated by '`.`'. The numbers correspond to device file numbers for the primary nodes of given GPUs in kernel DRI subsystem, listed as `/dev/dri/card` in devfs, and `/sys/class/drm/card` in sysfs. If the value of the `pci-groups` label would not fit into the 63 character length limit, you will also get labels `pci-groups2`, -`pci-groups3`... until all the pci groups have been labeled. +`pci-groups3`... until all the PCI groups have been labeled. ### Limitations diff --git a/cmd/gpu_plugin/levelzeroservice/levelzero_service.go b/cmd/gpu_plugin/levelzeroservice/levelzero_service.go new file mode 100644 index 000000000..ddcae25b5 --- /dev/null +++ b/cmd/gpu_plugin/levelzeroservice/levelzero_service.go @@ -0,0 +1,208 @@ +// Copyright 2024 Intel Corporation. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package levelzeroservice implements a gRPC client for the Level-Zero
+// service, which is reached over a unix domain socket.
+package levelzeroservice
+
+import (
+	"context"
+	"errors"
+	"sync"
+
+	lz "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/connectivity"
+	"google.golang.org/grpc/credentials/insecure"
+	"k8s.io/klog/v2"
+)
+
+// LevelzeroService is the client-side view of the Level-Zero gRPC service.
+type LevelzeroService interface {
+	// Run connects to the service and blocks until the connection is ready.
+	// With a true argument it keeps following the connection state instead
+	// of returning once the client has been created.
+	Run(bool)
+	// GetIntelIndices returns the device indices reported by the service.
+	GetIntelIndices() ([]uint32, error)
+	// GetDeviceHealth returns the health flags of the device with the
+	// given PCI address (for example "0000:00:00.1").
+	GetDeviceHealth(bdfAddress string) (DeviceHealth, error)
+	// GetDeviceTemperature returns the temperatures of the device with the
+	// given PCI address.
+	GetDeviceTemperature(bdfAddress string) (DeviceTemperature, error)
+	// GetDeviceMemoryAmount returns the memory size the service reports for
+	// the device with the given PCI address.
+	GetDeviceMemoryAmount(bdfAddress string) (uint64, error)
+}
+
+// DeviceHealth holds the health status of a single device.
+//
+// GetDeviceHealth in this package fills in only Memory, Bus and SoC; the
+// temperature fields are left at their zero value.
+type DeviceHealth struct {
+	Memory            bool
+	Bus               bool
+	SoC               bool
+	GlobalTemperature float64
+	GPUTemperature    float64
+	MemoryTemperature float64
+}
+
+// DeviceTemperature holds the temperature readings of a single device.
+type DeviceTemperature struct {
+	Global float64
+	GPU    float64
+	Memory float64
+}
+
+// clientNotReadyErr is returned by the getters until Run has created the
+// gRPC client.
+type clientNotReadyErr struct{}
+
+func (e *clientNotReadyErr) Error() string {
+	return "client is not (yet) ready"
+}
+
+// errEmptyResponse is returned when a call succeeds but carries no response
+// message, so that callers never mistake zero values for real data.
+var errEmptyResponse = errors.New("level-zero service returned an empty response")
+
+// NewLevelzero returns a LevelzeroService that will connect to the given
+// unix socket path. No connection is made until Run is called.
+func NewLevelzero(socket string) LevelzeroService {
+	return &levelzero{
+		socketPath: socket,
+		ctx:        context.Background(),
+	}
+}
+
+// levelzero is the gRPC backed implementation of LevelzeroService.
+type levelzero struct {
+	client     lz.LevelzeroClient
+	ctx        context.Context
+	conn       *grpc.ClientConn
+	socketPath string
+	// mu guards client and conn: Run(true) never returns, so it has to run
+	// in its own goroutine while the getters are called from others.
+	mu sync.RWMutex
+}
+
+// Run creates the gRPC connection and waits for it to become ready, at which
+// point the service client is published for the getters. When keep is false
+// Run returns right after that; when keep is true it goes on following
+// connection state changes and returns only if the context is done.
+func (l *levelzero) Run(keep bool) {
+	url := "unix://" + l.socketPath
+
+	klog.V(3).Info("Starting Level-Zero client. Connecting to: ", url)
+
+	conn, err := grpc.NewClient(url, grpc.WithTransportCredentials(insecure.NewCredentials()))
+	if err != nil {
+		// Errorf instead of Error: Error would glue the message and the
+		// error text together without a separator.
+		klog.Errorf("Failed to connect to socket: %v", err)
+
+		return
+	}
+
+	l.mu.Lock()
+	l.conn = conn
+	l.mu.Unlock()
+
+	for {
+		state := conn.GetState()
+		if state == connectivity.Idle {
+			// An idle channel does not connect until asked to.
+			conn.Connect()
+		}
+
+		if state == connectivity.Ready {
+			klog.V(2).Info("Connection ready")
+
+			l.mu.Lock()
+			l.client = lz.NewLevelzeroClient(conn)
+			l.mu.Unlock()
+
+			if !keep {
+				return
+			}
+		}
+
+		// Blocks until the state differs from the one sampled above. A false
+		// result means the context is done; stop instead of spinning.
+		if !conn.WaitForStateChange(l.ctx, state) {
+			return
+		}
+	}
+}
+
+// readyClient returns the gRPC client, or clientNotReadyErr when Run has not
+// yet seen the connection in ready state.
+func (l *levelzero) readyClient() (lz.LevelzeroClient, error) {
+	l.mu.RLock()
+	defer l.mu.RUnlock()
+
+	if l.client == nil {
+		return nil, &clientNotReadyErr{}
+	}
+
+	return l.client, nil
+}
+
+// warnOnInternalError logs the error embedded in a response message, if any.
+// Such errors are only reported; the response data is still handed to the
+// caller.
+func warnOnInternalError(request string, e *lz.Error) {
+	if e != nil && e.Errorcode != 0 {
+		klog.Warningf("%s request returned internal error: 0x%X (%s)", request, e.Errorcode, e.Description)
+	}
+}
+
+// GetIntelIndices returns the device indices reported by the service. An
+// empty slice is returned together with any error.
+func (l *levelzero) GetIntelIndices() ([]uint32, error) {
+	cli, err := l.readyClient()
+	if err != nil {
+		return []uint32{}, err
+	}
+
+	indices, err := cli.GetIntelIndices(l.ctx, &lz.GetIntelIndicesMessage{})
+	if err != nil {
+		return []uint32{}, err
+	}
+
+	if indices == nil {
+		return []uint32{}, errEmptyResponse
+	}
+
+	warnOnInternalError("indices", indices.Error)
+
+	return indices.Indices, nil
+}
+
+// GetDeviceHealth returns the memory, bus and SoC health flags of the device
+// with the given PCI address.
+func (l *levelzero) GetDeviceHealth(bdfAddress string) (DeviceHealth, error) {
+	cli, err := l.readyClient()
+	if err != nil {
+		return DeviceHealth{}, err
+	}
+
+	health, err := cli.GetDeviceHealth(l.ctx, &lz.DeviceId{BdfAddress: bdfAddress})
+	if err != nil {
+		return DeviceHealth{}, err
+	}
+
+	if health == nil {
+		return DeviceHealth{}, errEmptyResponse
+	}
+
+	warnOnInternalError("health", health.Error)
+
+	return DeviceHealth{
+		Memory: health.MemoryOk,
+		Bus:    health.BusOk,
+		SoC:    health.SocOk,
+	}, nil
+}
+
+// GetDeviceTemperature returns the global, GPU and memory temperatures of the
+// device with the given PCI address.
+func (l *levelzero) GetDeviceTemperature(bdfAddress string) (DeviceTemperature, error) {
+	cli, err := l.readyClient()
+	if err != nil {
+		return DeviceTemperature{}, err
+	}
+
+	temps, err := cli.GetDeviceTemperature(l.ctx, &lz.DeviceId{BdfAddress: bdfAddress})
+	if err != nil {
+		return DeviceTemperature{}, err
+	}
+
+	if temps == nil {
+		return DeviceTemperature{}, errEmptyResponse
+	}
+
+	warnOnInternalError("temperature", temps.Error)
+
+	return DeviceTemperature{
+		Global: temps.Global,
+		GPU:    temps.Gpu,
+		Memory: temps.Memory,
+	}, nil
+}
+
+// GetDeviceMemoryAmount returns the memory size the service reports for the
+// device with the given PCI address.
+func (l *levelzero) GetDeviceMemoryAmount(bdfAddress string) (uint64, error) {
+	cli, err := l.readyClient()
+	if err != nil {
+		return 0, err
+	}
+
+	memSize, err := cli.GetDeviceMemoryAmount(l.ctx, &lz.DeviceId{BdfAddress: bdfAddress})
+	if err != nil {
+		return 0, err
+	}
+
+	if memSize == nil {
+		return 0, errEmptyResponse
+	}
+
+	warnOnInternalError("memory", memSize.Error)
+
+	return memSize.MemorySize, nil
+}
diff --git a/cmd/gpu_plugin/levelzeroservice/levelzero_service_test.go b/cmd/gpu_plugin/levelzeroservice/levelzero_service_test.go
new file mode 100644
index 000000000..471a4c291
--- /dev/null
+++ b/cmd/gpu_plugin/levelzeroservice/levelzero_service_test.go
@@ -0,0 +1,307 @@
+// Copyright 2024 Intel Corporation. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package levelzeroservice
+
+import (
+	"context"
+	"errors"
+	"flag"
+	"log"
+	"net"
+	"os"
+	"path/filepath"
+	"testing"
+
+	lz "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero"
+	"google.golang.org/grpc"
+)
+
+// Failure modes that mockServer can simulate.
+const (
+	// NoError makes every request succeed.
+	NoError = iota
+	// InternalError makes requests succeed at the gRPC level but carry an
+	// lz.Error inside the response message.
+	InternalError
+	// ExternalError makes requests fail with a gRPC-level error.
+	ExternalError
+)
+
+func init() {
+	_ = flag.Set("v", "4") //Enable debug output
+}
+
+// mockServer is an in-process Level-Zero gRPC server whose replies are
+// selected by failRequest.
+type mockServer struct {
+	lz.UnimplementedLevelzeroServer
+	failRequest int
+}
+
+// serve starts the mock server on the given unix socket path and returns a
+// function that stops it. Callers should invoke the returned function when
+// the test ends so the listener and the serving goroutine are released.
+func (m *mockServer) serve(socketPath string) (stop func()) {
+	lis, err := net.Listen("unix", socketPath)
+	if err != nil {
+		log.Fatalf("failed to listen: %v", err)
+	}
+
+	s := grpc.NewServer()
+
+	lz.RegisterLevelzeroServer(s, m)
+
+	go func() {
+		// Serve returns nil after Stop; ErrServerStopped only means Stop won
+		// the race against Serve, which is not a test failure.
+		if err := s.Serve(lis); err != nil && !errors.Is(err, grpc.ErrServerStopped) {
+			log.Fatalf("failed to serve: %v", err)
+		}
+	}()
+
+	return s.Stop
+}
+
+// GetDeviceHealth reports a healthy device, an unhealthy memory flag with an
+// embedded error, or a call failure, depending on failRequest.
+func (m *mockServer) GetDeviceHealth(c context.Context, deviceid *lz.DeviceId) (*lz.DeviceHealth, error) {
+	if m.failRequest == ExternalError {
+		return nil, os.ErrInvalid
+	}
+
+	health := &lz.DeviceHealth{
+		BusOk:    true,
+		MemoryOk: true,
+		Error:    nil,
+	}
+
+	if m.failRequest == InternalError {
+		health.MemoryOk = false
+		health.Error = &lz.Error{
+			Description: "error error",
+			Errorcode:   99,
+		}
+	}
+
+	return health, nil
+}
+
+// GetDeviceTemperature reports 35.0 for every sensor, -999.0 with an embedded
+// error, or a call failure, depending on failRequest.
+func (m *mockServer) GetDeviceTemperature(c context.Context, deviceid *lz.DeviceId) (*lz.DeviceTemperature, error) {
+	if m.failRequest == ExternalError {
+		return nil, os.ErrInvalid
+	}
+
+	temps := &lz.DeviceTemperature{
+		Global: 35.0,
+		Gpu:    35.0,
+		Memory: 35.0,
+		Error:  nil,
+	}
+
+	if m.failRequest == InternalError {
+		temps.Global = -999.0
+		temps.Gpu = -999.0
+		temps.Memory = -999.0
+		temps.Error = &lz.Error{
+			Description: "error error",
+			Errorcode:   99,
+		}
+	}
+
+	return temps, nil
+}
+
+// GetIntelIndices reports a single index, no indices with an embedded error,
+// or a call failure, depending on failRequest.
+func (m *mockServer) GetIntelIndices(c context.Context, msg *lz.GetIntelIndicesMessage) (*lz.DeviceIndices, error) {
+	if m.failRequest == ExternalError {
+		return nil, os.ErrInvalid
+	}
+
+	ret := lz.DeviceIndices{
+		Indices: []uint32{0},
+		Error:   nil,
+	}
+
+	if m.failRequest == InternalError {
+		ret.Indices = []uint32{}
+		ret.Error = &lz.Error{
+			Description: "error error",
+			Errorcode:   99,
+		}
+	}
+
+	return &ret, nil
+}
+
+// GetDeviceMemoryAmount reports a size of 1000, a size of 0 with an embedded
+// error, or a call failure, depending on failRequest.
+func (m *mockServer) GetDeviceMemoryAmount(c context.Context, deviceid *lz.DeviceId) (*lz.DeviceMemoryAmount, error) {
+	if m.failRequest == ExternalError {
+		return nil, os.ErrInvalid
+	}
+
+	ret := lz.DeviceMemoryAmount{
+		MemorySize: 1000,
+		Error:      nil,
+	}
+
+	if m.failRequest == InternalError {
+		ret.MemorySize = 0
+		ret.Error = &lz.Error{
+			Description: "error error",
+			Errorcode:   99,
+		}
+	}
+
+	return &ret, nil
+}
+
+type testcase struct {
+	name string
+	fail int
+}
+
+var tcases = []testcase{
+	{
+		name: "normal flow",
+		fail: NoError,
+	},
+	{
+		name: "fail flow - internal",
+		fail: InternalError,
+	},
+	{
+		name: "fail flow - external",
+		fail: ExternalError,
+	},
+}
+
+// newConnectedClient starts a mock server with the given failure mode on a
+// socket in a fresh temporary directory and returns a client that is already
+// connected to it. The server is stopped and the directory removed when the
+// test finishes.
+func newConnectedClient(t *testing.T, fail int) LevelzeroService {
+	t.Helper()
+
+	d, err := os.MkdirTemp("", "testinglevelzero*")
+	if err != nil {
+		t.Fatal("failed to create tmp directory")
+	}
+
+	t.Cleanup(func() { _ = os.RemoveAll(d) })
+
+	sockPath := filepath.Join(d, "server.sock")
+
+	mock := &mockServer{
+		failRequest: fail,
+	}
+
+	// Cleanups run last-in first-out: the server stops before its socket
+	// directory is removed.
+	t.Cleanup(mock.serve(sockPath))
+
+	n := NewLevelzero(sockPath)
+
+	n.Run(false)
+
+	return n
+}
+
+func TestGetDeviceHealth(t *testing.T) {
+	for _, tc := range tcases {
+		t.Run(tc.name, func(t *testing.T) {
+			n := newConnectedClient(t, tc.fail)
+
+			dh, err := n.GetDeviceHealth("0000:00:00.1")
+
+			if tc.fail == NoError && err != nil {
+				t.Error("GetDeviceHealth returned an error:", err)
+			}
+
+			if tc.fail == NoError && (!dh.Memory || !dh.Bus) {
+				t.Error("Call to device health returned unhealthy", dh, tc.fail)
+			}
+
+			if tc.fail == ExternalError && err == nil {
+				t.Error("GetDeviceHealth returned nil and expected error")
+			}
+		})
+	}
+}
+
+func TestGetDeviceTemperature(t *testing.T) {
+	for _, tc := range tcases {
+		t.Run(tc.name, func(t *testing.T) {
+			n := newConnectedClient(t, tc.fail)
+
+			temps, err := n.GetDeviceTemperature("0000:00:00.1")
+
+			if tc.fail == NoError && err != nil {
+				t.Error("GetDeviceTemperature returned an error:", err)
+			}
+
+			if tc.fail == NoError && (temps.Global != 35.0 || temps.GPU != 35.0 || temps.Memory != 35.0) {
+				t.Error("Wrong temperatures received", temps)
+			}
+
+			if tc.fail == ExternalError && err == nil {
+				t.Error("GetDeviceTemperature returned nil and expected error")
+			}
+		})
+	}
+}
+
+func TestGetIndices(t *testing.T) {
+	for _, tc := range tcases {
+		t.Run(tc.name, func(t *testing.T) {
+			n := newConnectedClient(t, tc.fail)
+
+			indices, err := n.GetIntelIndices()
+
+			if tc.fail == NoError && err != nil {
+				t.Error("GetIntelIndices returned error:", err)
+			}
+
+			if tc.fail == ExternalError && err == nil {
+				t.Error("GetIntelIndices returned nil and expected error")
+			}
+
+			if tc.fail == NoError && len(indices) != 1 {
+				t.Error("Wrong number of indices received", indices)
+			}
+			if tc.fail != NoError && len(indices) != 0 {
+				t.Error("Wrong number of indices received", indices)
+			}
+		})
+	}
+}
+
+func TestGetMemoryAmount(t *testing.T) {
+	for _, tc := range tcases {
+		t.Run(tc.name, func(t *testing.T) {
+			n := newConnectedClient(t, tc.fail)
+
+			memSize, err := n.GetDeviceMemoryAmount("0000:11:22.3")
+
+			if tc.fail == NoError && err != nil {
+				t.Error("TestGetMemoryAmount returned error:", err)
+			}
+
+			if tc.fail == ExternalError && err == nil {
+				t.Error("TestGetMemoryAmount returned nil and expected error")
+			}
+
+			if tc.fail == NoError && memSize != 1000 {
+				t.Error("Wrong mem size received", memSize)
+			}
+		})
+	}
+}
+
+// TestAccessBeforeReady checks that every getter fails while Run has not yet
+// been called on the client.
+func TestAccessBeforeReady(t *testing.T) {
+	n := NewLevelzero("/tmp/foobar.sock")
+
+	_, err := n.GetDeviceMemoryAmount("")
+	if err == nil {
+		t.Error("Got non-error for memory amount, expected error")
+	}
+
+	_, err = n.GetDeviceHealth("")
+	if err == nil {
+		t.Error("Got non-error for health, expected error")
+	}
+
+	_, err = n.GetDeviceTemperature("")
+	if err == nil {
+		t.Error("Got non-error for temperature, expected error")
+	}
+
+	_, err = n.GetIntelIndices()
+	if err == nil {
+		t.Error("Got non-error for indices, expected error")
+	}
+}
diff --git a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go b/cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go
index 95c7c2a84..6eae4c7a6 100644
--- 
a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go +++ b/cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go @@ -51,8 +51,8 @@ const ( gasCardAnnotation = "gas-container-cards" gasTileAnnotation = "gas-container-tiles" - levelZeroAffinityMaskEnvVar = "ZE_AFFINITY_MASK" - levelZeroHierarchyEnvVar = "ZE_FLAT_DEVICE_HIERARCHY" + LevelzeroAffinityMaskEnvVar = "ZE_AFFINITY_MASK" + levelzeroHierarchyEnvVar = "ZE_FLAT_DEVICE_HIERARCHY" hierarchyModeComposite = "COMPOSITE" hierarchyModeFlat = "FLAT" @@ -796,7 +796,7 @@ func (rm *resourceManager) createAllocateResponse(deviceIds []string, tileAffini cresp.Envs = make(map[string]string) } - cresp.Envs[levelZeroAffinityMaskEnvVar] = tileAffinityMask + cresp.Envs[LevelzeroAffinityMaskEnvVar] = tileAffinityMask } allocateResponse.ContainerResponses = append(allocateResponse.ContainerResponses, &cresp) @@ -851,7 +851,7 @@ func containerCards(pod *v1.Pod, gpuUsingContainerIndex int) []string { // Guesses level zero hierarchy mode for the container. Defaults to the new "flat" mode // if no mode is set in the container's env variables. -func guessLevelZeroHierarchyMode(pod *v1.Pod, containerIndex int) string { +func guessLevelzeroHierarchyMode(pod *v1.Pod, containerIndex int) string { klog.V(4).Infof("Checking pod %s envs", pod.Name) if containerIndex < len(pod.Spec.Containers) { @@ -859,7 +859,7 @@ func guessLevelZeroHierarchyMode(pod *v1.Pod, containerIndex int) string { if c.Env != nil { for _, env := range c.Env { - if env.Name == levelZeroHierarchyEnvVar { + if env.Name == levelzeroHierarchyEnvVar { switch env.Value { // Check that the value is valid. 
case hierarchyModeComposite: @@ -959,7 +959,7 @@ func containerTileAffinityMask(pod *v1.Pod, gpuUsingContainerIndex, tilesPerCard } if i == gpuUsingContainerIndex { - return convertTileInfoToEnvMask(containerTileInfo, tilesPerCard, guessLevelZeroHierarchyMode(pod, containerIndex)) + return convertTileInfoToEnvMask(containerTileInfo, tilesPerCard, guessLevelzeroHierarchyMode(pod, containerIndex)) } i++ diff --git a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go b/cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go index d2c1a7a65..ddd83d6ff 100644 --- a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go +++ b/cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go @@ -472,7 +472,7 @@ func TestCreateFractionalResourceResponseWithOneCardTwoTiles(t *testing.T) { }, Env: []v1.EnvVar{ { - Name: levelZeroHierarchyEnvVar, + Name: levelzeroHierarchyEnvVar, Value: hierarchyModeComposite, }, }, @@ -522,8 +522,8 @@ func TestCreateFractionalResourceResponseWithOneCardTwoTiles(t *testing.T) { // check response expectTruef(len(resp.ContainerResponses) == 1, t, tCase.name, "wrong number of container responses, expected 1") expectTruef(len(resp.ContainerResponses[0].Envs) == 2, t, tCase.name, "wrong number of env variables in container response, expected 2") - expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set") - expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] == "0.0,0.1", t, tCase.name, "l0 affinity mask is incorrect") + expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set") + expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] == "0.0,0.1", t, tCase.name, "l0 affinity mask is incorrect") expectTruef(len(resp.ContainerResponses[0].Devices) == 1, t, tCase.name, "wrong number of devices, expected 1") } @@ -545,7 +545,7 @@ func TestCreateFractionalResourceResponseWithTwoCardsOneTile(t *testing.T) { 
}, Env: []v1.EnvVar{ { - Name: levelZeroHierarchyEnvVar, + Name: levelzeroHierarchyEnvVar, Value: hierarchyModeComposite, }, }, @@ -598,8 +598,8 @@ func TestCreateFractionalResourceResponseWithTwoCardsOneTile(t *testing.T) { } else { // check response expectTruef(len(resp.ContainerResponses) == 1, t, tCase.name, "wrong number of container responses, expected 1") - expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set") - expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] == "0.3,1.4", t, tCase.name, "l0 affinity mask is incorrect: ") + expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set") + expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] == "0.3,1.4", t, tCase.name, "l0 affinity mask is incorrect: ") expectTruef(len(resp.ContainerResponses[0].Devices) == 2, t, tCase.name, "wrong number of devices, expected 2") } } @@ -623,7 +623,7 @@ func TestCreateFractionalResourceResponseWithThreeCardsTwoTiles(t *testing.T) { }, Env: []v1.EnvVar{ { - Name: levelZeroHierarchyEnvVar, + Name: levelzeroHierarchyEnvVar, Value: hierarchyModeComposite, }, }, @@ -676,8 +676,8 @@ func TestCreateFractionalResourceResponseWithThreeCardsTwoTiles(t *testing.T) { } else { // check response expectTruef(len(resp.ContainerResponses) == 1, t, tCase.name, "wrong number of container responses, expected 1") - expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set") - expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] == "0.0,0.1,1.2,1.3,2.3,2.4", t, tCase.name, "l0 affinity mask is incorrect: ") + expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set") + expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] == "0.0,0.1,1.2,1.3,2.3,2.4", t, tCase.name, "l0 affinity 
mask is incorrect: ") expectTruef(len(resp.ContainerResponses[0].Devices) == 3, t, tCase.name, "wrong number of devices, expected 3") } } @@ -701,7 +701,7 @@ func TestCreateFractionalResourceResponseWithMultipleContainersTileEach(t *testi }, Env: []v1.EnvVar{ { - Name: levelZeroHierarchyEnvVar, + Name: levelzeroHierarchyEnvVar, Value: hierarchyModeComposite, }, }, @@ -714,7 +714,7 @@ func TestCreateFractionalResourceResponseWithMultipleContainersTileEach(t *testi }, Env: []v1.EnvVar{ { - Name: levelZeroHierarchyEnvVar, + Name: levelzeroHierarchyEnvVar, Value: hierarchyModeComposite, }, }, @@ -945,7 +945,7 @@ func TestTileAnnotationParsing(t *testing.T) { if i < len(pt.hierarchys) { pod.Spec.Containers[i].Env = []v1.EnvVar{ { - Name: levelZeroHierarchyEnvVar, + Name: levelzeroHierarchyEnvVar, Value: pt.hierarchys[i], }, } diff --git a/cmd/internal/labeler/labeler.go b/cmd/internal/labeler/labeler.go index 0d2fdc19f..b4f18f491 100644 --- a/cmd/internal/labeler/labeler.go +++ b/cmd/internal/labeler/labeler.go @@ -29,6 +29,7 @@ import ( "syscall" "time" + "github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice" "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils" "github.com/pkg/errors" "k8s.io/klog/v2" @@ -60,6 +61,8 @@ type labeler struct { controlDeviceReg *regexp.Regexp labels labelMap + levelzero levelzeroservice.LevelzeroService + sysfsDRMDir string labelsChanged bool } @@ -163,7 +166,7 @@ func fallback() uint64 { return getEnvVarNumber(memoryOverrideEnv) } -func GetMemoryAmount(sysfsDrmDir, gpuName string, numTiles uint64) uint64 { +func legacyFallback(sysfsDrmDir, gpuName string, numTiles uint64) uint64 { reserved := getEnvVarNumber(memoryReservedEnv) filePath := filepath.Join(sysfsDrmDir, gpuName, "lmem_total_bytes") @@ -183,6 +186,26 @@ func GetMemoryAmount(sysfsDrmDir, gpuName string, numTiles uint64) uint64 { return totalPerTile*numTiles - reserved } +func (l *labeler) GetMemoryAmount(sysfsDrmDir, 
gpuName string, numTiles uint64) uint64 { + link, err := os.Readlink(filepath.Join(sysfsDrmDir, gpuName, "device")) + if err != nil { + return legacyFallback(sysfsDrmDir, gpuName, numTiles) + } + + amount := uint64(0) + + if l.levelzero != nil { + amount, err = l.levelzero.GetDeviceMemoryAmount(filepath.Base(link)) + if amount == 0 || err != nil { + return legacyFallback(sysfsDrmDir, gpuName, numTiles) + } + } else { + return legacyFallback(sysfsDrmDir, gpuName, numTiles) + } + + return amount +} + // GetTileCount reads the tile count. func GetTileCount(cardPath string) (numTiles uint64) { files := []string{} @@ -251,7 +274,7 @@ func (lm labelMap) addSplittableString(labelBase, fullValue string) { } } -// this returns pci groups label value, groups separated by "_", gpus separated by ".". +// this returns PCI groups label value, groups separated by "_", gpus separated by ".". // Example for two groups with 4 gpus: "0.1.2.3_4.5.6.7". func (l *labeler) createPCIGroupLabel(gpuNumList []string) string { pciGroups := map[string][]string{} @@ -317,7 +340,7 @@ func (l *labeler) createLabels() error { numTiles := GetTileCount(filepath.Join(l.sysfsDRMDir, gpuName)) tileCount += int(numTiles) - memoryAmount := GetMemoryAmount(l.sysfsDRMDir, gpuName, numTiles) + memoryAmount := l.GetMemoryAmount(l.sysfsDRMDir, gpuName, numTiles) gpuNumList = append(gpuNumList, gpuName[4:]) // get numa node of the GPU @@ -446,9 +469,11 @@ func CreateAndPrintLabels(sysfsDRMDir string) { // Gathers node's GPU labels on channel trigger or timeout, and write them to a file. // The created label file is deleted on exit (process dying). 
-func Run(sysfsDrmDir, nfdFeatureFile string, updateInterval time.Duration, scanResources chan bool) { +func Run(sysfsDrmDir, nfdFeatureFile string, updateInterval time.Duration, scanResources chan bool, levelzero levelzeroservice.LevelzeroService, exitFunc func()) { l := newLabeler(sysfsDrmDir) + l.levelzero = levelzero + interruptChan := make(chan os.Signal, 1) signal.Notify(interruptChan, syscall.SIGTERM, syscall.SIGINT, syscall.SIGHUP, syscall.SIGQUIT) @@ -499,6 +524,6 @@ Loop: klog.V(1).Info("Stopping GPU labeler") - // Close the whole application - os.Exit(0) + // Call exitFunc that might exit the app + exitFunc() } diff --git a/cmd/internal/labeler/labeler_test.go b/cmd/internal/labeler/labeler_test.go index 7725307e3..deca2df78 100644 --- a/cmd/internal/labeler/labeler_test.go +++ b/cmd/internal/labeler/labeler_test.go @@ -17,10 +17,15 @@ package labeler import ( "os" "path" + "path/filepath" "reflect" "strconv" + "syscall" "testing" + "time" + "github.com/fsnotify/fsnotify" + "github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice" "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils" ) @@ -28,6 +33,30 @@ const ( sysfsDirectory = "/sys/" ) +type mockL0Service struct { + memSize uint64 + fail bool +} + +func (m *mockL0Service) Run(bool) { +} +func (m *mockL0Service) GetIntelIndices() ([]uint32, error) { + return nil, nil +} +func (m *mockL0Service) GetDeviceHealth(bdfAddress string) (levelzeroservice.DeviceHealth, error) { + return levelzeroservice.DeviceHealth{}, nil +} +func (m *mockL0Service) GetDeviceTemperature(bdfAddress string) (levelzeroservice.DeviceTemperature, error) { + return levelzeroservice.DeviceTemperature{}, nil +} +func (m *mockL0Service) GetDeviceMemoryAmount(bdfAddress string) (uint64, error) { + if m.fail { + return m.memSize, os.ErrInvalid + } + + return m.memSize, nil +} + type testcase struct { capabilityFile map[string][]byte expectedRetval error @@ -579,7 +608,7 @@ func 
getTestCases() []testcase { } } -func (tc *testcase) createFiles(t *testing.T, sysfs, root string) { +func (tc *testcase) createFiles(t *testing.T, sysfs string) { for _, sysfsdir := range tc.sysfsdirs { if err := os.MkdirAll(path.Join(sysfs, sysfsdir), 0750); err != nil { t.Fatalf("Failed to create fake sysfs directory: %+v", err) @@ -645,7 +674,7 @@ func TestLabeling(t *testing.T) { } sysfs := path.Join(subroot, "pci0000:00/0000:00:1b.4", sysfsDirectory) - tc.createFiles(t, sysfs, subroot) + tc.createFiles(t, sysfs) os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10)) os.Setenv(memoryReservedEnv, strconv.FormatUint(tc.memoryReserved, 10)) @@ -663,3 +692,176 @@ func TestLabeling(t *testing.T) { }) } } + +func TestCreateAndRun(t *testing.T) { + root, err := os.MkdirTemp("", "test_new_device_plugin") + if err != nil { + t.Fatalf("can't create temporary directory: %+v", err) + } + + defer os.RemoveAll(root) + + tc := getTestCases()[0] + + subroot, err := os.MkdirTemp(root, "tc") + if err != nil { + t.Fatalf("can't create temporary subroot directory: %+v", err) + } + + t.Run("CreateAndPrintLabels", func(t *testing.T) { + err := os.MkdirAll(path.Join(subroot, "0"), 0750) + if err != nil { + t.Fatalf("couldn't create dir: %s", err.Error()) + } + sysfs := path.Join(subroot, "pci0000:00/0000:00:1b.4", sysfsDirectory) + + tc.createFiles(t, sysfs) + + CreateAndPrintLabels(sysfs) + }) + + waitForFileOp := func(directory, file string, eventType fsnotify.Op, duration time.Duration) bool { + watcher, err := fsnotify.NewWatcher() + if err != nil { + t.Fatal(err) + } + defer watcher.Close() + + if err := watcher.Add(directory); err != nil { + t.Fatal(err) + } + + timer := time.NewTimer(duration) + + for { + select { + case event := <-watcher.Events: + if filepath.Base(event.Name) == file && event.Has(eventType) { + return true + } + case <-timer.C: + return false + } + } + } + + t.Run("Run", func(t *testing.T) { + err := os.MkdirAll(path.Join(subroot, "0"), 
0750) + if err != nil { + t.Fatalf("couldn't create dir: %s", err.Error()) + } + sysfs := path.Join(subroot, "pci0000:00/0000:00:1b.4", sysfsDirectory) + + tc.createFiles(t, sysfs) + + c := make(chan bool, 1) + + nfdLabelBase := "nfd-labelfile.txt" + nfdLabelFile := filepath.Join(root, nfdLabelBase) + + go Run(sysfs, nfdLabelFile, time.Millisecond, c, nil, func() {}) + + // Wait for the labeling timeout to trigger + if !waitForFileOp(root, nfdLabelBase, fsnotify.Create, time.Second*2) { + t.Error("Run didn't create label file") + } + + err = syscall.Kill(syscall.Getpid(), syscall.SIGHUP) + if err != nil { + t.Error("Calling Kill failed") + } + + // Wait for the labeling timeout to trigger + if !waitForFileOp(root, nfdLabelBase, fsnotify.Remove, time.Second*2) { + t.Error("Run didn't remove label file") + } + }) +} + +func TestL0ServiceUse(t *testing.T) { + root, err := os.MkdirTemp("", "test_new_device_plugin") + if err != nil { + t.Fatalf("can't create temporary directory: %+v", err) + } + + defer os.RemoveAll(root) + + pciAddr := path.Join(root, "sys", ".devices", "0000:00:01.0") + cardPath := path.Join(root, "sys", "card0") + + err = os.MkdirAll(pciAddr, 0750) + if err != nil { + t.Fatalf("couldn't create PCI dir: %s", err.Error()) + } + + err = os.MkdirAll(cardPath, 0750) + if err != nil { + t.Fatalf("couldn't create card dir: %s", err.Error()) + } + + err = os.Symlink(pciAddr, filepath.Join(cardPath, "device")) + if err != nil { + t.Fatalf("couldn't create symlink: %s", err.Error()) + } + + err = os.WriteFile(filepath.Join(root, "sys/card0/device/vendor"), []byte("0x8086"), 0600) + if err != nil { + t.Fatalf("couldn't write vendor file: %s", err.Error()) + } + + err = os.MkdirAll(filepath.Join(root, "sys/card0/device/drm"), 0600) + if err != nil { + t.Fatalf("couldn't create card drm dir: %s", err.Error()) + } + + t.Run("fetch memory from l0 service", func(t *testing.T) { + labeler := newLabeler(filepath.Join(root, "sys")) + labeler.levelzero = &mockL0Service{ 
+ memSize: 12345678, + } + err = labeler.createLabels() + + if err != nil { + t.Errorf("labeler didn't work with l0 service") + } + + if labeler.labels["gpu.intel.com/memory.max"] != "12345678" { + t.Errorf("labeler didn't get memory amount from l0 service: %v", labeler.labels) + } + }) + + t.Run("memory fetch from l0 fails", func(t *testing.T) { + labeler := newLabeler(filepath.Join(root, "sys")) + labeler.levelzero = &mockL0Service{ + memSize: 0, + fail: true, + } + + os.Setenv(memoryOverrideEnv, "87654321") + err = labeler.createLabels() + + if err != nil { + t.Errorf("labeler didn't work with l0 service") + } + + if labeler.labels["gpu.intel.com/memory.max"] != "87654321" { + t.Errorf("labeler got an invalid memory amount: %v", labeler.labels) + } + }) + + t.Run("memory fetch with nil l0 service", func(t *testing.T) { + labeler := newLabeler(filepath.Join(root, "sys")) + labeler.levelzero = nil + + os.Setenv(memoryOverrideEnv, "87654321") + err = labeler.createLabels() + + if err != nil { + t.Errorf("labeler didn't work with l0 service") + } + + if labeler.labels["gpu.intel.com/memory.max"] != "87654321" { + t.Errorf("labeler got an invalid memory amount: %v", labeler.labels) + } + }) +} diff --git a/cmd/internal/levelzero/README.md b/cmd/internal/levelzero/README.md new file mode 100644 index 000000000..b59a98619 --- /dev/null +++ b/cmd/internal/levelzero/README.md @@ -0,0 +1,10 @@ +To update the Go gRPC/protobuf files, use the following `protoc` command line: + +``` +protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative levelzero.proto +# To fix bad package name +sed -i -e 's/gpu_levelzero/gpulevelzero/' levelzero.pb.go levelzero_grpc.pb.go +``` + +> *Note*: Running `protoc` erases the copyright header and changes the Go package name from "gpulevelzero" to "gpu_levelzero". After regeneration, the header needs to be added back and the package name restored (the `sed` command above does the latter). 
+ diff --git a/cmd/internal/levelzero/levelzero.go b/cmd/internal/levelzero/levelzero.go new file mode 100644 index 000000000..a99dad8b5 --- /dev/null +++ b/cmd/internal/levelzero/levelzero.go @@ -0,0 +1,19 @@ +// Copyright 2024 Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gpulevelzero + +const ( + DefaultUnixSocketPath = "/var/lib/levelzero/server.sock" +) diff --git a/cmd/internal/levelzero/levelzero.pb.go b/cmd/internal/levelzero/levelzero.pb.go new file mode 100644 index 000000000..8a8a0a826 --- /dev/null +++ b/cmd/internal/levelzero/levelzero.pb.go @@ -0,0 +1,637 @@ +// Copyright 2024 Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by protoc-gen-go. DO NOT EDIT. 
+// versions: +// protoc-gen-go v1.28.1 +// protoc v3.12.4 +// source: levelzero.proto + +package gpulevelzero + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type GetIntelIndicesMessage struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields +} + +func (x *GetIntelIndicesMessage) Reset() { + *x = GetIntelIndicesMessage{} + if protoimpl.UnsafeEnabled { + mi := &file_levelzero_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetIntelIndicesMessage) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetIntelIndicesMessage) ProtoMessage() {} + +func (x *GetIntelIndicesMessage) ProtoReflect() protoreflect.Message { + mi := &file_levelzero_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetIntelIndicesMessage.ProtoReflect.Descriptor instead. 
+func (*GetIntelIndicesMessage) Descriptor() ([]byte, []int) { // Legacy accessor: gzipped file descriptor plus this message's index (0). + return file_levelzero_proto_rawDescGZIP(), []int{0} +} + +type DeviceId struct { // Request message naming one device; input of GetDeviceHealth, GetDeviceTemperature and GetDeviceMemoryAmount. + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + BdfAddress string `protobuf:"bytes,1,opt,name=bdfAddress,proto3" json:"bdfAddress,omitempty"` // NOTE(review): presumably the PCI bus:device.function address of the GPU; confirm with the server side. +} + +func (x *DeviceId) Reset() { + *x = DeviceId{} + if protoimpl.UnsafeEnabled { + mi := &file_levelzero_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *DeviceId) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*DeviceId) ProtoMessage() {} + +func (x *DeviceId) ProtoReflect() protoreflect.Message { + mi := &file_levelzero_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use DeviceId.ProtoReflect.Descriptor instead.
+func (*DeviceId) Descriptor() ([]byte, []int) { + return file_levelzero_proto_rawDescGZIP(), []int{1} +} + +func (x *DeviceId) GetBdfAddress() string { // Nil-safe: returns "" when x is nil. + if x != nil { + return x.BdfAddress + } + return "" +} + +type DeviceHealth struct { // Response of GetDeviceHealth: three boolean status flags and an optional Error. + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + MemoryOk bool `protobuf:"varint,1,opt,name=memory_ok,json=memoryOk,proto3" json:"memory_ok,omitempty"` + BusOk bool `protobuf:"varint,2,opt,name=bus_ok,json=busOk,proto3" json:"bus_ok,omitempty"` + SocOk bool `protobuf:"varint,3,opt,name=soc_ok,json=socOk,proto3" json:"soc_ok,omitempty"` + Error *Error `protobuf:"bytes,42,opt,name=error,proto3" json:"error,omitempty"` // Field number 42, as in the other response messages. +} + +func (x *DeviceHealth) Reset() { + *x = DeviceHealth{} + if protoimpl.UnsafeEnabled { + mi := &file_levelzero_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *DeviceHealth) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*DeviceHealth) ProtoMessage() {} + +func (x *DeviceHealth) ProtoReflect() protoreflect.Message { + mi := &file_levelzero_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use DeviceHealth.ProtoReflect.Descriptor instead.
+func (*DeviceHealth) Descriptor() ([]byte, []int) { + return file_levelzero_proto_rawDescGZIP(), []int{2} +} + +func (x *DeviceHealth) GetMemoryOk() bool { + if x != nil { + return x.MemoryOk + } + return false +} + +func (x *DeviceHealth) GetBusOk() bool { + if x != nil { + return x.BusOk + } + return false +} + +func (x *DeviceHealth) GetSocOk() bool { + if x != nil { + return x.SocOk + } + return false +} + +func (x *DeviceHealth) GetError() *Error { // Nil-safe: returns nil when x is nil. + if x != nil { + return x.Error + } + return nil +} + +type DeviceTemperature struct { // Response of GetDeviceTemperature: three float64 readings and an optional Error. NOTE(review): unit is not stated here, presumably degrees Celsius; confirm. + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Global float64 `protobuf:"fixed64,1,opt,name=global,proto3" json:"global,omitempty"` + Gpu float64 `protobuf:"fixed64,2,opt,name=gpu,proto3" json:"gpu,omitempty"` + Memory float64 `protobuf:"fixed64,3,opt,name=memory,proto3" json:"memory,omitempty"` + Error *Error `protobuf:"bytes,42,opt,name=error,proto3" json:"error,omitempty"` +} + +func (x *DeviceTemperature) Reset() { + *x = DeviceTemperature{} + if protoimpl.UnsafeEnabled { + mi := &file_levelzero_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *DeviceTemperature) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*DeviceTemperature) ProtoMessage() {} + +func (x *DeviceTemperature) ProtoReflect() protoreflect.Message { + mi := &file_levelzero_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use DeviceTemperature.ProtoReflect.Descriptor instead.
+func (*DeviceTemperature) Descriptor() ([]byte, []int) { + return file_levelzero_proto_rawDescGZIP(), []int{3} +} + +func (x *DeviceTemperature) GetGlobal() float64 { + if x != nil { + return x.Global + } + return 0 +} + +func (x *DeviceTemperature) GetGpu() float64 { + if x != nil { + return x.Gpu + } + return 0 +} + +func (x *DeviceTemperature) GetMemory() float64 { + if x != nil { + return x.Memory + } + return 0 +} + +func (x *DeviceTemperature) GetError() *Error { + if x != nil { + return x.Error + } + return nil +} + +type DeviceIndices struct { // Response of GetIntelIndices: a packed repeated uint32 list and an optional Error. NOTE(review): what the indices refer to is not visible here; confirm with the server side. + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Indices []uint32 `protobuf:"varint,1,rep,packed,name=indices,proto3" json:"indices,omitempty"` + Error *Error `protobuf:"bytes,42,opt,name=error,proto3" json:"error,omitempty"` +} + +func (x *DeviceIndices) Reset() { + *x = DeviceIndices{} + if protoimpl.UnsafeEnabled { + mi := &file_levelzero_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *DeviceIndices) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*DeviceIndices) ProtoMessage() {} + +func (x *DeviceIndices) ProtoReflect() protoreflect.Message { + mi := &file_levelzero_proto_msgTypes[4] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use DeviceIndices.ProtoReflect.Descriptor instead.
+func (*DeviceIndices) Descriptor() ([]byte, []int) { + return file_levelzero_proto_rawDescGZIP(), []int{4} +} + +func (x *DeviceIndices) GetIndices() []uint32 { // Nil-safe: returns nil when x is nil. + if x != nil { + return x.Indices + } + return nil +} + +func (x *DeviceIndices) GetError() *Error { + if x != nil { + return x.Error + } + return nil +} + +type DeviceMemoryAmount struct { // Response of GetDeviceMemoryAmount: a uint64 size and an optional Error. NOTE(review): unit presumably bytes; confirm. + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + MemorySize uint64 `protobuf:"varint,1,opt,name=memory_size,json=memorySize,proto3" json:"memory_size,omitempty"` + Error *Error `protobuf:"bytes,42,opt,name=error,proto3" json:"error,omitempty"` +} + +func (x *DeviceMemoryAmount) Reset() { + *x = DeviceMemoryAmount{} + if protoimpl.UnsafeEnabled { + mi := &file_levelzero_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *DeviceMemoryAmount) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*DeviceMemoryAmount) ProtoMessage() {} + +func (x *DeviceMemoryAmount) ProtoReflect() protoreflect.Message { + mi := &file_levelzero_proto_msgTypes[5] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use DeviceMemoryAmount.ProtoReflect.Descriptor instead.
+func (*DeviceMemoryAmount) Descriptor() ([]byte, []int) { + return file_levelzero_proto_rawDescGZIP(), []int{5} +} + +func (x *DeviceMemoryAmount) GetMemorySize() uint64 { // Nil-safe: returns 0 when x is nil. + if x != nil { + return x.MemorySize + } + return 0 +} + +func (x *DeviceMemoryAmount) GetError() *Error { + if x != nil { + return x.Error + } + return nil +} + +type Error struct { // Error detail carried as field 42 of every response message: free-text description and numeric code. + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Description string `protobuf:"bytes,1,opt,name=description,proto3" json:"description,omitempty"` + Errorcode uint32 `protobuf:"varint,2,opt,name=errorcode,proto3" json:"errorcode,omitempty"` // NOTE(review): the value space is not defined in this file; presumably a Level-Zero result code, confirm. +} + +func (x *Error) Reset() { + *x = Error{} + if protoimpl.UnsafeEnabled { + mi := &file_levelzero_proto_msgTypes[6] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *Error) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*Error) ProtoMessage() {} + +func (x *Error) ProtoReflect() protoreflect.Message { + mi := &file_levelzero_proto_msgTypes[6] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use Error.ProtoReflect.Descriptor instead.
+func (*Error) Descriptor() ([]byte, []int) { + return file_levelzero_proto_rawDescGZIP(), []int{6} +} + +func (x *Error) GetDescription() string { + if x != nil { + return x.Description + } + return "" +} + +func (x *Error) GetErrorcode() uint32 { + if x != nil { + return x.Errorcode + } + return 0 +} + +var File_levelzero_proto protoreflect.FileDescriptor + +var file_levelzero_proto_rawDesc = []byte{ // Serialized descriptor of levelzero.proto; handed to protoimpl.DescBuilder.RawDescriptor in file_levelzero_proto_init. + 0x0a, 0x0f, 0x6c, 0x65, 0x76, 0x65, 0x6c, 0x7a, 0x65, 0x72, 0x6f, 0x2e, 0x70, 0x72, 0x6f, 0x74, + 0x6f, 0x22, 0x18, 0x0a, 0x16, 0x47, 0x65, 0x74, 0x49, 0x6e, 0x74, 0x65, 0x6c, 0x49, 0x6e, 0x64, + 0x69, 0x63, 0x65, 0x73, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2a, 0x0a, 0x08, 0x44, + 0x65, 0x76, 0x69, 0x63, 0x65, 0x49, 0x64, 0x12, 0x1e, 0x0a, 0x0a, 0x62, 0x64, 0x66, 0x41, 0x64, + 0x64, 0x72, 0x65, 0x73, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x62, 0x64, 0x66, + 0x41, 0x64, 0x64, 0x72, 0x65, 0x73, 0x73, 0x22, 0x77, 0x0a, 0x0c, 0x44, 0x65, 0x76, 0x69, 0x63, + 0x65, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x12, 0x1b, 0x0a, 0x09, 0x6d, 0x65, 0x6d, 0x6f, 0x72, + 0x79, 0x5f, 0x6f, 0x6b, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x08, 0x6d, 0x65, 0x6d, 0x6f, + 0x72, 0x79, 0x4f, 0x6b, 0x12, 0x15, 0x0a, 0x06, 0x62, 0x75, 0x73, 0x5f, 0x6f, 0x6b, 0x18, 0x02, + 0x20, 0x01, 0x28, 0x08, 0x52, 0x05, 0x62, 0x75, 0x73, 0x4f, 0x6b, 0x12, 0x15, 0x0a, 0x06, 0x73, + 0x6f, 0x63, 0x5f, 0x6f, 0x6b, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x52, 0x05, 0x73, 0x6f, 0x63, + 0x4f, 0x6b, 0x12, 0x1c, 0x0a, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x18, 0x2a, 0x20, 0x01, 0x28, + 0x0b, 0x32, 0x06, 0x2e, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x52, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, + 0x22, 0x73, 0x0a, 0x11, 0x44, 0x65, 0x76, 0x69, 0x63, 0x65, 0x54, 0x65, 0x6d, 0x70, 0x65, 0x72, + 0x61, 0x74, 0x75, 0x72, 0x65, 0x12, 0x16, 0x0a, 0x06, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x18, + 0x01, 0x20, 0x01, 0x28, 0x01, 0x52, 0x06, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x12, 0x10, 0x0a, + 0x03, 0x67, 0x70, 0x75,
0x18, 0x02, 0x20, 0x01, 0x28, 0x01, 0x52, 0x03, 0x67, 0x70, 0x75, 0x12, + 0x16, 0x0a, 0x06, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x18, 0x03, 0x20, 0x01, 0x28, 0x01, 0x52, + 0x06, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x12, 0x1c, 0x0a, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, + 0x18, 0x2a, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x06, 0x2e, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x52, 0x05, + 0x65, 0x72, 0x72, 0x6f, 0x72, 0x22, 0x47, 0x0a, 0x0d, 0x44, 0x65, 0x76, 0x69, 0x63, 0x65, 0x49, + 0x6e, 0x64, 0x69, 0x63, 0x65, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x69, 0x6e, 0x64, 0x69, 0x63, 0x65, + 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0d, 0x52, 0x07, 0x69, 0x6e, 0x64, 0x69, 0x63, 0x65, 0x73, + 0x12, 0x1c, 0x0a, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x18, 0x2a, 0x20, 0x01, 0x28, 0x0b, 0x32, + 0x06, 0x2e, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x52, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x22, 0x53, + 0x0a, 0x12, 0x44, 0x65, 0x76, 0x69, 0x63, 0x65, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x41, 0x6d, + 0x6f, 0x75, 0x6e, 0x74, 0x12, 0x1f, 0x0a, 0x0b, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x5f, 0x73, + 0x69, 0x7a, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0a, 0x6d, 0x65, 0x6d, 0x6f, 0x72, + 0x79, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x1c, 0x0a, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x18, 0x2a, + 0x20, 0x01, 0x28, 0x0b, 0x32, 0x06, 0x2e, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x52, 0x05, 0x65, 0x72, + 0x72, 0x6f, 0x72, 0x22, 0x47, 0x0a, 0x05, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x12, 0x20, 0x0a, 0x0b, + 0x64, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x0b, 0x64, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x1c, + 0x0a, 0x09, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x63, 0x6f, 0x64, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, + 0x0d, 0x52, 0x09, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x63, 0x6f, 0x64, 0x65, 0x32, 0xec, 0x01, 0x0a, + 0x09, 0x4c, 0x65, 0x76, 0x65, 0x6c, 0x7a, 0x65, 0x72, 0x6f, 0x12, 0x2d, 0x0a, 0x0f, 0x47, 0x65, + 0x74, 0x44, 0x65, 0x76, 0x69, 0x63, 0x65, 0x48, 0x65, 0x61,
0x6c, 0x74, 0x68, 0x12, 0x09, 0x2e, + 0x44, 0x65, 0x76, 0x69, 0x63, 0x65, 0x49, 0x64, 0x1a, 0x0d, 0x2e, 0x44, 0x65, 0x76, 0x69, 0x63, + 0x65, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x22, 0x00, 0x12, 0x37, 0x0a, 0x14, 0x47, 0x65, 0x74, + 0x44, 0x65, 0x76, 0x69, 0x63, 0x65, 0x54, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, + 0x65, 0x12, 0x09, 0x2e, 0x44, 0x65, 0x76, 0x69, 0x63, 0x65, 0x49, 0x64, 0x1a, 0x12, 0x2e, 0x44, + 0x65, 0x76, 0x69, 0x63, 0x65, 0x54, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, + 0x22, 0x00, 0x12, 0x3c, 0x0a, 0x0f, 0x47, 0x65, 0x74, 0x49, 0x6e, 0x74, 0x65, 0x6c, 0x49, 0x6e, + 0x64, 0x69, 0x63, 0x65, 0x73, 0x12, 0x17, 0x2e, 0x47, 0x65, 0x74, 0x49, 0x6e, 0x74, 0x65, 0x6c, + 0x49, 0x6e, 0x64, 0x69, 0x63, 0x65, 0x73, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x1a, 0x0e, + 0x2e, 0x44, 0x65, 0x76, 0x69, 0x63, 0x65, 0x49, 0x6e, 0x64, 0x69, 0x63, 0x65, 0x73, 0x22, 0x00, + 0x12, 0x39, 0x0a, 0x15, 0x47, 0x65, 0x74, 0x44, 0x65, 0x76, 0x69, 0x63, 0x65, 0x4d, 0x65, 0x6d, + 0x6f, 0x72, 0x79, 0x41, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x12, 0x09, 0x2e, 0x44, 0x65, 0x76, 0x69, + 0x63, 0x65, 0x49, 0x64, 0x1a, 0x13, 0x2e, 0x44, 0x65, 0x76, 0x69, 0x63, 0x65, 0x4d, 0x65, 0x6d, + 0x6f, 0x72, 0x79, 0x41, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x22, 0x00, 0x42, 0x0f, 0x5a, 0x0d, 0x67, + 0x70, 0x75, 0x2e, 0x6c, 0x65, 0x76, 0x65, 0x6c, 0x7a, 0x65, 0x72, 0x6f, 0x62, 0x06, 0x70, 0x72, + 0x6f, 0x74, 0x6f, 0x33, +} + +var ( + file_levelzero_proto_rawDescOnce sync.Once + file_levelzero_proto_rawDescData = file_levelzero_proto_rawDesc +) + +func file_levelzero_proto_rawDescGZIP() []byte { // Compresses the descriptor once via sync.Once and returns the cached result on every call. + file_levelzero_proto_rawDescOnce.Do(func() { + file_levelzero_proto_rawDescData = protoimpl.X.CompressGZIP(file_levelzero_proto_rawDescData) + }) + return file_levelzero_proto_rawDescData +} + +var file_levelzero_proto_msgTypes = make([]protoimpl.MessageInfo, 7) +var file_levelzero_proto_goTypes = []interface{}{ + (*GetIntelIndicesMessage)(nil), // 0: GetIntelIndicesMessage +
(*DeviceId)(nil), // 1: DeviceId + (*DeviceHealth)(nil), // 2: DeviceHealth + (*DeviceTemperature)(nil), // 3: DeviceTemperature + (*DeviceIndices)(nil), // 4: DeviceIndices + (*DeviceMemoryAmount)(nil), // 5: DeviceMemoryAmount + (*Error)(nil), // 6: Error +} +var file_levelzero_proto_depIdxs = []int32{ + 6, // 0: DeviceHealth.error:type_name -> Error + 6, // 1: DeviceTemperature.error:type_name -> Error + 6, // 2: DeviceIndices.error:type_name -> Error + 6, // 3: DeviceMemoryAmount.error:type_name -> Error + 1, // 4: Levelzero.GetDeviceHealth:input_type -> DeviceId + 1, // 5: Levelzero.GetDeviceTemperature:input_type -> DeviceId + 0, // 6: Levelzero.GetIntelIndices:input_type -> GetIntelIndicesMessage + 1, // 7: Levelzero.GetDeviceMemoryAmount:input_type -> DeviceId + 2, // 8: Levelzero.GetDeviceHealth:output_type -> DeviceHealth + 3, // 9: Levelzero.GetDeviceTemperature:output_type -> DeviceTemperature + 4, // 10: Levelzero.GetIntelIndices:output_type -> DeviceIndices + 5, // 11: Levelzero.GetDeviceMemoryAmount:output_type -> DeviceMemoryAmount + 8, // [8:12] is the sub-list for method output_type + 4, // [4:8] is the sub-list for method input_type + 4, // [4:4] is the sub-list for extension type_name + 4, // [4:4] is the sub-list for extension extendee + 0, // [0:4] is the sub-list for field type_name +} + +func init() { file_levelzero_proto_init() } +func file_levelzero_proto_init() { // Returns early when File_levelzero_proto is already set; otherwise builds the file, then drops the raw tables. + if File_levelzero_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { // Without unsafe, install per-message Exporter funcs returning pointers to state, sizeCache and unknownFields. + file_levelzero_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetIntelIndicesMessage); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_levelzero_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*DeviceId); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + }
+ } + file_levelzero_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*DeviceHealth); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_levelzero_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*DeviceTemperature); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_levelzero_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*DeviceIndices); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_levelzero_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*DeviceMemoryAmount); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_levelzero_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*Error); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_levelzero_proto_rawDesc, + NumEnums: 0, + NumMessages: 7, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_levelzero_proto_goTypes, + DependencyIndexes: file_levelzero_proto_depIdxs, + MessageInfos: file_levelzero_proto_msgTypes, + }.Build() + File_levelzero_proto = out.File + file_levelzero_proto_rawDesc = nil + file_levelzero_proto_goTypes = nil + file_levelzero_proto_depIdxs = nil +} diff --git a/cmd/internal/levelzero/levelzero.proto b/cmd/internal/levelzero/levelzero.proto new file mode 100644 index 000000000..e62b12efe --- /dev/null +++
b/cmd/internal/levelzero/levelzero.proto @@ -0,0 +1,45 @@ +syntax = "proto3"; + +option go_package = "gpu.levelzero"; // NOTE(review): the committed .pb.go files declare package gpulevelzero; confirm how that name is obtained from this value when regenerating. + +service Levelzero { // Four unary RPCs; all but GetIntelIndices take a DeviceId. + rpc GetDeviceHealth(DeviceId) returns (DeviceHealth) {} + rpc GetDeviceTemperature(DeviceId) returns (DeviceTemperature) {} + rpc GetIntelIndices(GetIntelIndicesMessage) returns (DeviceIndices) {} + rpc GetDeviceMemoryAmount(DeviceId) returns (DeviceMemoryAmount) {} +} + +message GetIntelIndicesMessage {} // Empty request of GetIntelIndices. + +message DeviceId { + string bdfAddress = 1; // NOTE(review): presumably the PCI bus:device.function address of the GPU; confirm. +} + +message DeviceHealth { // Response of GetDeviceHealth. + bool memory_ok = 1; + bool bus_ok = 2; + bool soc_ok = 3; + Error error = 42; // Every response message carries its Error at field number 42. +} + +message DeviceTemperature { // Response of GetDeviceTemperature. NOTE(review): unit is not stated, presumably degrees Celsius; confirm. + double global = 1; + double gpu = 2; + double memory = 3; + Error error = 42; +} + +message DeviceIndices { // Response of GetIntelIndices. + repeated uint32 indices = 1; + Error error = 42; +} + +message DeviceMemoryAmount { // Response of GetDeviceMemoryAmount. + uint64 memory_size = 1; // NOTE(review): unit presumably bytes; confirm. + Error error = 42; +} + +message Error { // Error detail embedded in every response message. + string description = 1; + uint32 errorcode = 2; +} diff --git a/cmd/internal/levelzero/levelzero_grpc.pb.go b/cmd/internal/levelzero/levelzero_grpc.pb.go new file mode 100644 index 000000000..35229c88a --- /dev/null +++ b/cmd/internal/levelzero/levelzero_grpc.pb.go @@ -0,0 +1,227 @@ +// Copyright 2024 Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by protoc-gen-go-grpc. DO NOT EDIT.
+// versions: +// - protoc-gen-go-grpc v1.2.0 +// - protoc v3.12.4 +// source: levelzero.proto + +package gpulevelzero + +import ( + context "context" + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.32.0 or later. +const _ = grpc.SupportPackageIsVersion7 + +// LevelzeroClient is the client API for Levelzero service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. +type LevelzeroClient interface { + GetDeviceHealth(ctx context.Context, in *DeviceId, opts ...grpc.CallOption) (*DeviceHealth, error) + GetDeviceTemperature(ctx context.Context, in *DeviceId, opts ...grpc.CallOption) (*DeviceTemperature, error) + GetIntelIndices(ctx context.Context, in *GetIntelIndicesMessage, opts ...grpc.CallOption) (*DeviceIndices, error) + GetDeviceMemoryAmount(ctx context.Context, in *DeviceId, opts ...grpc.CallOption) (*DeviceMemoryAmount, error) +} + +type levelzeroClient struct { + cc grpc.ClientConnInterface +} + +func NewLevelzeroClient(cc grpc.ClientConnInterface) LevelzeroClient { // Wraps cc; each client method performs one unary cc.Invoke on the matching /Levelzero/ method path. + return &levelzeroClient{cc} +} + +func (c *levelzeroClient) GetDeviceHealth(ctx context.Context, in *DeviceId, opts ...grpc.CallOption) (*DeviceHealth, error) { // Returns (nil, err) when Invoke fails, otherwise the decoded reply. + out := new(DeviceHealth) + err := c.cc.Invoke(ctx, "/Levelzero/GetDeviceHealth", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *levelzeroClient) GetDeviceTemperature(ctx context.Context, in *DeviceId, opts ...grpc.CallOption) (*DeviceTemperature, error) { + out := new(DeviceTemperature) + err := c.cc.Invoke(ctx, "/Levelzero/GetDeviceTemperature", in, out, opts...)
+ if err != nil { + return nil, err + } + return out, nil +} + +func (c *levelzeroClient) GetIntelIndices(ctx context.Context, in *GetIntelIndicesMessage, opts ...grpc.CallOption) (*DeviceIndices, error) { + out := new(DeviceIndices) + err := c.cc.Invoke(ctx, "/Levelzero/GetIntelIndices", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *levelzeroClient) GetDeviceMemoryAmount(ctx context.Context, in *DeviceId, opts ...grpc.CallOption) (*DeviceMemoryAmount, error) { + out := new(DeviceMemoryAmount) + err := c.cc.Invoke(ctx, "/Levelzero/GetDeviceMemoryAmount", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +// LevelzeroServer is the server API for Levelzero service. +// All implementations must embed UnimplementedLevelzeroServer +// for forward compatibility +type LevelzeroServer interface { + GetDeviceHealth(context.Context, *DeviceId) (*DeviceHealth, error) + GetDeviceTemperature(context.Context, *DeviceId) (*DeviceTemperature, error) + GetIntelIndices(context.Context, *GetIntelIndicesMessage) (*DeviceIndices, error) + GetDeviceMemoryAmount(context.Context, *DeviceId) (*DeviceMemoryAmount, error) + mustEmbedUnimplementedLevelzeroServer() // Unexported, so implementations outside this package can only satisfy the interface by embedding. +} + +// UnimplementedLevelzeroServer must be embedded to have forward compatible implementations.
+type UnimplementedLevelzeroServer struct { // Stub whose four RPC methods all return a codes.Unimplemented status error. +} + +func (UnimplementedLevelzeroServer) GetDeviceHealth(context.Context, *DeviceId) (*DeviceHealth, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetDeviceHealth not implemented") +} +func (UnimplementedLevelzeroServer) GetDeviceTemperature(context.Context, *DeviceId) (*DeviceTemperature, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetDeviceTemperature not implemented") +} +func (UnimplementedLevelzeroServer) GetIntelIndices(context.Context, *GetIntelIndicesMessage) (*DeviceIndices, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetIntelIndices not implemented") +} +func (UnimplementedLevelzeroServer) GetDeviceMemoryAmount(context.Context, *DeviceId) (*DeviceMemoryAmount, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetDeviceMemoryAmount not implemented") +} +func (UnimplementedLevelzeroServer) mustEmbedUnimplementedLevelzeroServer() {} // Empty marker method that satisfies the unexported LevelzeroServer requirement. + +// UnsafeLevelzeroServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to LevelzeroServer will +// result in compilation errors.
+type UnsafeLevelzeroServer interface { + mustEmbedUnimplementedLevelzeroServer() +} + +func RegisterLevelzeroServer(s grpc.ServiceRegistrar, srv LevelzeroServer) { // Registers srv on s under Levelzero_ServiceDesc. + s.RegisterService(&Levelzero_ServiceDesc, srv) +} + +func _Levelzero_GetDeviceHealth_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { // Decodes a DeviceId, then calls srv.GetDeviceHealth directly or through the interceptor when one is set. + in := new(DeviceId) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(LevelzeroServer).GetDeviceHealth(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/Levelzero/GetDeviceHealth", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(LevelzeroServer).GetDeviceHealth(ctx, req.(*DeviceId)) + } + return interceptor(ctx, in, info, handler) +} + +func _Levelzero_GetDeviceTemperature_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(DeviceId) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(LevelzeroServer).GetDeviceTemperature(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/Levelzero/GetDeviceTemperature", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(LevelzeroServer).GetDeviceTemperature(ctx, req.(*DeviceId)) + } + return interceptor(ctx, in, info, handler) +} + +func _Levelzero_GetIntelIndices_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetIntelIndicesMessage) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(LevelzeroServer).GetIntelIndices(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/Levelzero/GetIntelIndices", + } + handler := func(ctx context.Context, req
interface{}) (interface{}, error) { + return srv.(LevelzeroServer).GetIntelIndices(ctx, req.(*GetIntelIndicesMessage)) + } + return interceptor(ctx, in, info, handler) +} + +func _Levelzero_GetDeviceMemoryAmount_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(DeviceId) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(LevelzeroServer).GetDeviceMemoryAmount(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/Levelzero/GetDeviceMemoryAmount", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(LevelzeroServer).GetDeviceMemoryAmount(ctx, req.(*DeviceId)) + } + return interceptor(ctx, in, info, handler) +} + +// Levelzero_ServiceDesc is the grpc.ServiceDesc for Levelzero service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var Levelzero_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "Levelzero", + HandlerType: (*LevelzeroServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "GetDeviceHealth", + Handler: _Levelzero_GetDeviceHealth_Handler, + }, + { + MethodName: "GetDeviceTemperature", + Handler: _Levelzero_GetDeviceTemperature_Handler, + }, + { + MethodName: "GetIntelIndices", + Handler: _Levelzero_GetIntelIndices_Handler, + }, + { + MethodName: "GetDeviceMemoryAmount", + Handler: _Levelzero_GetDeviceMemoryAmount_Handler, + }, + }, + Streams: []grpc.StreamDesc{}, // No streaming RPCs are declared. + Metadata: "levelzero.proto", +} diff --git a/deployments/gpu_plugin/overlays/health/args.yaml b/deployments/gpu_plugin/overlays/health/args.yaml new file mode 100644 index 000000000..2b90b66b3 --- /dev/null +++ b/deployments/gpu_plugin/overlays/health/args.yaml @@ -0,0 +1,4 @@ +- op: add + path: /spec/template/spec/containers/0/args # NOTE(review): JSON Patch add on an existing key replaces its value; confirm container 0 has no args in the base DaemonSet. + value: + - "-health-management" diff --git
a/deployments/gpu_plugin/overlays/health/kustomization.yaml b/deployments/gpu_plugin/overlays/health/kustomization.yaml new file mode 100644 index 000000000..b6e3b2125 --- /dev/null +++ b/deployments/gpu_plugin/overlays/health/kustomization.yaml @@ -0,0 +1,6 @@ +resources: + - ../levelzero # builds on the levelzero overlay, so the sidecar and socket volume are included +patches: + - path: args.yaml + target: + kind: DaemonSet diff --git a/deployments/gpu_plugin/overlays/levelzero/kustomization.yaml b/deployments/gpu_plugin/overlays/levelzero/kustomization.yaml new file mode 100644 index 000000000..ccdc5fab7 --- /dev/null +++ b/deployments/gpu_plugin/overlays/levelzero/kustomization.yaml @@ -0,0 +1,9 @@ +resources: + - ../../base +patches: + - path: l0-mounts.yaml + target: + kind: DaemonSet + - path: levelzero.yaml + target: + kind: DaemonSet diff --git a/deployments/gpu_plugin/overlays/levelzero/l0-mounts.yaml b/deployments/gpu_plugin/overlays/levelzero/l0-mounts.yaml new file mode 100644 index 000000000..2757d1d08 --- /dev/null +++ b/deployments/gpu_plugin/overlays/levelzero/l0-mounts.yaml @@ -0,0 +1,10 @@ +- op: add + path: /spec/template/spec/containers/0/volumeMounts/- + value: + name: levelzerosocket + mountPath: /var/lib/levelzero +- op: add + path: /spec/template/spec/volumes/- + value: + name: levelzerosocket + emptyDir: {} # pod-local volume mounted at /var/lib/levelzero in container 0 and in the intel-gpu-levelzero container diff --git a/deployments/gpu_plugin/overlays/levelzero/levelzero.yaml b/deployments/gpu_plugin/overlays/levelzero/levelzero.yaml new file mode 100644 index 000000000..baa30f18d --- /dev/null +++ b/deployments/gpu_plugin/overlays/levelzero/levelzero.yaml @@ -0,0 +1,17 @@ +- op: add + path: /spec/template/spec/containers/- + value: + name: intel-gpu-levelzero + image: intel/intel-gpu-levelzero:devel + imagePullPolicy: IfNotPresent + args: + - "-v=2" + securityContext: + readOnlyRootFilesystem: true + privileged: true # NOTE(review): a privileged container is normally granted all capabilities, so the drop list below may be ineffective; confirm + capabilities: + drop: + - ALL + volumeMounts: + - name: levelzerosocket + mountPath: /var/lib/levelzero diff --git a/deployments/gpu_plugin/overlays/wsl/kustomization.yaml
b/deployments/gpu_plugin/overlays/wsl/kustomization.yaml new file mode 100644 index 000000000..a4d7f3b4e --- /dev/null +++ b/deployments/gpu_plugin/overlays/wsl/kustomization.yaml @@ -0,0 +1,9 @@ +resources: + - ../levelzero +patches: + - path: wsl_mounts.yaml + target: + kind: DaemonSet + - path: wsl_args.yaml + target: + kind: DaemonSet diff --git a/deployments/gpu_plugin/overlays/wsl/wsl_args.yaml b/deployments/gpu_plugin/overlays/wsl/wsl_args.yaml new file mode 100644 index 000000000..2d1d356a8 --- /dev/null +++ b/deployments/gpu_plugin/overlays/wsl/wsl_args.yaml @@ -0,0 +1,8 @@ +- op: add + path: /spec/template/spec/containers/0/args # NOTE(review): add on an existing key replaces it; confirm the base defines no args for container 0 + value: + - "-wsl" +- op: add + path: /spec/template/spec/containers/1/args/- # NOTE(review): assumes the levelzero sidecar, appended via containers/- in the levelzero overlay, ends up at index 1 + value: + "-wsl" diff --git a/deployments/gpu_plugin/overlays/wsl/wsl_mounts.yaml b/deployments/gpu_plugin/overlays/wsl/wsl_mounts.yaml new file mode 100644 index 000000000..6ee9ef00e --- /dev/null +++ b/deployments/gpu_plugin/overlays/wsl/wsl_mounts.yaml @@ -0,0 +1,24 @@ +- op: add + path: /spec/template/spec/containers/1/volumeMounts/- + value: + name: wsllib + mountPath: /usr/lib/wsl +- op: add + path: /spec/template/spec/containers/0/volumeMounts/- + value: + name: devdxg + mountPath: /dev/dxg +- op: add + path: /spec/template/spec/volumes/- + value: + name: wsllib + hostPath: + path: /usr/lib/wsl + type: DirectoryOrCreate +- op: add + path: /spec/template/spec/volumes/- + value: + name: devdxg + hostPath: + path: /dev/dxg + type: CharDevice diff --git a/go.mod b/go.mod index d6bb30102..a81730ae7 100644 --- a/go.mod +++ b/go.mod @@ -16,6 +16,7 @@ require ( golang.org/x/sys v0.25.0 golang.org/x/text v0.18.0 google.golang.org/grpc v1.66.2 + google.golang.org/protobuf v1.34.2 gopkg.in/yaml.v2 v2.4.0 k8s.io/api v0.31.0 k8s.io/apimachinery v0.31.0 @@ -115,7 +116,6 @@ require ( google.golang.org/genproto v0.0.0-20240227224415-6ceb2ff114de // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240604185151-ef581f913117 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 // indirect - google.golang.org/protobuf v1.34.2 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect diff --git a/test/e2e/gpu/gpu.go b/test/e2e/gpu/gpu.go index e53a3dbc5..ae5b3fabc 100644 --- a/test/e2e/gpu/gpu.go +++ b/test/e2e/gpu/gpu.go @@ -38,7 +38,8 @@ import ( const ( kustomizationYaml = "deployments/gpu_plugin/kustomization.yaml" monitoringYaml = "deployments/gpu_plugin/overlays/monitoring_shared-dev_nfd/kustomization.yaml" - rmEnabledYaml = "deployments/gpu_plugin/overlays/fractional_resources//kustomization.yaml" + rmEnabledYaml = "deployments/gpu_plugin/overlays/fractional_resources/kustomization.yaml" + healthMgmtYaml = "deployments/gpu_plugin/overlays/health/kustomization.yaml" nfdRulesYaml = "deployments/nfd/overlays/node-feature-rules/kustomization.yaml" containerName = "testcontainer" tfKustomizationYaml = "deployments/gpu_tensorflow_test/kustomization.yaml" @@ -98,6 +99,11 @@ func describe() { framework.Failf("unable to locate %q: %v", rmEnabledYaml, errFailedToLocateRepoFile) } + healthMgmtPath, errFailedToLocateRepoFile := utils.LocateRepoFile(healthMgmtYaml) + if errFailedToLocateRepoFile != nil { + framework.Failf("unable to locate %q: %v", healthMgmtYaml, errFailedToLocateRepoFile) + } + ginkgo.Context("When GPU plugin is deployed [Resource:i915]", func() { ginkgo.AfterEach(func(ctx context.Context) { framework.Logf("Removing gpu-plugin manually") @@ -194,6 +200,12 @@ func describe() { }) }) + ginkgo.Context("When [Deployment:healthManagement] deployment is applied [Resource:i915]", func() { + ginkgo.It("check if i915 resources is available", func(ctx context.Context) { + createPluginAndVerifyExistence(f, ctx, healthMgmtPath, "gpu.intel.com/i915") + }) + }) + ginkgo.Context("When [Deployment:resourceManager] deployment is applied [Resource:i915]", func() { 
ginkgo.It("check if i915 resources is available", func(ctx context.Context) { e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(nfdRulesPath))