Add configurable runtimeClassName for gpu-agent (#55)

* Add configurable runtimeClassName for gpu-agent * Fix helm chart syntax for runtimeClassName * fix: gpu-agent custom runtime class --------- Co-authored-by: Wojtek Czekalski <[email protected]>
nebuly-ai · Apr 21, 2024 · 80f7b39 · 80f7b39
1 parent b96e507
commit 80f7b39
Show file tree

Hide file tree

Showing 4 changed files with 7 additions and 0 deletions.
diff --git a/docs/en/docs/helm-charts/nos/README.md b/docs/en/docs/helm-charts/nos/README.md
@@ -35,6 +35,7 @@ The open-source platform for running AI workloads on k8s in an optimized way, bo
 | gpuPartitioner.gpuAgent.logLevel | int | `0` | The log level of the GPU Agent. Zero corresponds to `info`, while values greater than or equal to 1 correspond to higher debug levels. **Must be >= 0**. |
 | gpuPartitioner.gpuAgent.reportConfigIntervalSeconds | int | `10` | Interval at which the mig-agent will report to k8s status of the GPUs of the Node |
 | gpuPartitioner.gpuAgent.resources | object | `{"limits":{"cpu":"100m","memory":"128Mi"}}` | Sets the resource requests and limits of the GPU Agent container. |
+| gpuPartitioner.gpuAgent.runtimeClassName | string | `nil` | The name of the Kubernetes RuntimeClass to use for the GPU Agent Pod. When unset, `runtimeClassName` is omitted from the Pod spec. |
 | gpuPartitioner.gpuAgent.tolerations | list | `[{"effect":"NoSchedule","key":"kubernetes.azure.com/scalesetpriority","operator":"Equal","value":"spot"}]` | Sets the tolerations of the GPU Agent Pod. |
 | gpuPartitioner.image.pullPolicy | string | `"IfNotPresent"` | Sets the GPU Partitioner Docker image pull policy. |
 | gpuPartitioner.image.repository | string | `"ghcr.io/nebuly-ai/nos-gpu-partitioner"` | Sets the GPU Partitioner Docker image. |

diff --git a/helm-charts/nos/README.md b/helm-charts/nos/README.md
@@ -35,6 +35,7 @@ The open-source platform for running AI workloads on k8s in an optimized way, bo
 | gpuPartitioner.gpuAgent.logLevel | int | `0` | The log level of the GPU Agent. Zero corresponds to `info`, while values greater than or equal to 1 correspond to higher debug levels. **Must be >= 0**. |
 | gpuPartitioner.gpuAgent.reportConfigIntervalSeconds | int | `10` | Interval at which the mig-agent will report to k8s status of the GPUs of the Node |
 | gpuPartitioner.gpuAgent.resources | object | `{"limits":{"cpu":"100m","memory":"128Mi"}}` | Sets the resource requests and limits of the GPU Agent container. |
+| gpuPartitioner.gpuAgent.runtimeClassName | string | `nil` | The name of the Kubernetes RuntimeClass to use for the GPU Agent Pod. When unset, `runtimeClassName` is omitted from the Pod spec. |
 | gpuPartitioner.gpuAgent.tolerations | list | `[{"effect":"NoSchedule","key":"kubernetes.azure.com/scalesetpriority","operator":"Equal","value":"spot"}]` | Sets the tolerations of the GPU Agent Pod. |
 | gpuPartitioner.image.pullPolicy | string | `"IfNotPresent"` | Sets the GPU Partitioner Docker image pull policy. |
 | gpuPartitioner.image.repository | string | `"ghcr.io/nebuly-ai/nos-gpu-partitioner"` | Sets the GPU Partitioner Docker image. |

diff --git a/helm-charts/nos/templates/gpu-partitioner/daemonset_gpu-agent.yaml b/helm-charts/nos/templates/gpu-partitioner/daemonset_gpu-agent.yaml
@@ -22,6 +22,9 @@ spec:
         nos.nebuly.com/gpu-partitioning: mps
       priorityClassName: system-node-critical
       terminationGracePeriodSeconds: 20
+      {{- /* Optional Pod-level RuntimeClass: the field is omitted when the value is unset; quoted so the rendered value is always a YAML string. */}}
+      {{- if .Values.gpuPartitioner.gpuAgent.runtimeClassName }}
+      runtimeClassName: {{ .Values.gpuPartitioner.gpuAgent.runtimeClassName | quote }}
+      {{- end }}
       containers:
         - image: "{{ .Values.gpuPartitioner.gpuAgent.image.repository }}:{{ .Values.gpuPartitioner.gpuAgent.image.tag | default .Chart.AppVersion }}"
           name: {{ include "gpuAgent.fullname" . }}

diff --git a/helm-charts/nos/values.yaml b/helm-charts/nos/values.yaml
@@ -232,6 +232,8 @@ gpuPartitioner:
     # Zero corresponds to `info`, while values greater than or equal to 1 correspond to higher debug levels.
     # **Must be >= 0**.
     logLevel: 0
+    # -- The name of the Kubernetes RuntimeClass to use for the GPU Agent Pod. When unset, `runtimeClassName` is omitted from the Pod spec.
+    runtimeClassName:
     image:
       # -- Sets the GPU Agent Docker image.
       repository: ghcr.io/nebuly-ai/nos-gpu-agent