Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -28,7 +28,11 @@ repos:
args: ['--maxkb=1000']
- id: check-case-conflict
- id: check-yaml
exclude: ^mkdocs\.yml$
exclude: ^(mkdocs\.yml|cluster_configs/kubernetes/.*\.ya?ml|scripts/k8s-tests/(manifests|image-build)/.*\.ya?ml)$
- id: check-yaml
name: check-yaml-k8s-multidoc
files: ^(cluster_configs/kubernetes/.*\.ya?ml|scripts/k8s-tests/(manifests|image-build)/.*\.ya?ml)$
args: ['--allow-multiple-documents']
- id: detect-private-key
- id: end-of-file-fixer
exclude: docs/|\.txt$|\.patch$|test$
Expand All @@ -45,6 +49,11 @@ repos:

- repo: local
hooks:
- id: validate-k8s-manifests
name: Validate Kubernetes manifests (kubeconform/kubectl)
entry: scripts/k8s-tests/validate_k8s_manifests.sh
language: system
files: ^(cluster_configs/kubernetes/.*\.ya?ml|scripts/k8s-tests/(manifests|image-build)/.*\.ya?ml)$
- id: check-signoff
name: Check Signed-off-by
entry: bash -c 'if ! grep -q "Signed-off-by:" "$1"; then echo "❌ Commit message must be signed off. Use git commit -s to add it automatically."; exit 1; fi' --
Expand Down
190 changes: 190 additions & 0 deletions cluster_configs/example-kubernetes.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

executor: kubernetes

# Kubernetes cluster configuration
# kubeconfig: ~/.kube/config # Optional: path to kubeconfig file (uses in-cluster config if omitted)
# context: my-cluster-context # Optional: kubectl context to use
namespace: nemo-skills # Kubernetes namespace for jobs

# Container images
containers:
  # Map logical names to actual images
  # These can be referenced by name in job specs
  vllm: nvcr.io/nvidia/vllm:latest
  nemo-skills: nvcr.io/nvidia/nemo-skills:latest
  sglang: lmsysorg/sglang:latest
  trtllm: nvcr.io/nvidia/tensorrt-llm:latest
  sandbox: nvcr.io/nvidia/nemo-skills-sandbox:latest

# Image pull secrets for private registries
# image_pull_secrets:
#   - nvcr-secret
#   - private-registry-secret

# Service account for pods (should have appropriate RBAC permissions)
service_account: nemo-skills-sa

# Resource pools for node scheduling
# These map to node selectors and tolerations for GPU/CPU workloads
resource_pools:
  # GPU pool for inference/training workloads
  gpu-a100:
    node_selector:
      nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
    tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule

  # GPU pool for H100 nodes
  gpu-h100:
    node_selector:
      nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3
    tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule

  # CPU pool for data processing workloads
  cpu:
    node_selector:
      node-type: cpu-worker
    tolerations: []

# PVC-based storage configuration
# These volumes are mounted to all containers in jobs
storage:
  models:
    pvc_name: nemo-models-pvc
    mount_path: /models
  data:
    pvc_name: nemo-data-pvc
    mount_path: /data
  results:
    pvc_name: nemo-results-pvc
    mount_path: /results

# Job timeout settings
default_timeout: "6h"
timeouts:
  inference: "2h"
  training: "24h"
  data-processing: "12h"

# Environment variables for all containers
env_vars:
  - HF_HOME=/models/hf-cache
  - NCCL_DEBUG=INFO
  # - CUDA_VISIBLE_DEVICES=all

# RDMA/InfiniBand for multi-node GPU jobs (opt-in)
# When enabled, multi-node jobs request RDMA shared devices so NCCL
# can use IB/RoCE instead of falling back to TCP/Socket.
# Requires NVIDIA Network Operator installed on the cluster.
# rdma:
#   enabled: true
#   resource_name: nvidia.com/rdma_shared_device  # default
#   resource_count: 1  # default (number of RDMA devices per container)

# Pod scheduling for multi-node jobs
# Spreads pods across different nodes to ensure multi-node training
# uses distinct physical nodes (not all pods on the same node).
# scheduling:
#   spread_across_nodes: true  # default: true for multi-node
#   topology_key: kubernetes.io/hostname  # default (spread per node)

# DNS readiness check for multi-node jobs (enabled by default for multi-node)
# An init container waits for MASTER_ADDR DNS to resolve before training starts.
# dns_check:
#   enabled: true  # default: true for multi-node jobs
#   image: busybox:1.36  # default (lightweight image for nslookup)
#   timeout_seconds: 300  # default (5 minutes)

# RBAC preflight checks for multi-node jobs (enabled by default)
# Verifies create/delete/get/list permissions on Services before submit.
# rbac_preflight: true

# Image pull policy (default: IfNotPresent)
# Set to "Never" when using locally-loaded images via ctr import
# image_pull_policy: Never

# Fallback backend if Kubernetes is unavailable
# Useful for development/testing when K8s cluster is not accessible
# fallback_executor: local

# =============================================================================
# Setup Instructions
# =============================================================================
#
# 1. Create the namespace:
#    kubectl create namespace nemo-skills
#
# 2. Create a service account with appropriate permissions:
#    kubectl create serviceaccount nemo-skills-sa -n nemo-skills
#
# 3. Create RBAC role for job management:
#    kubectl apply -f - <<EOF
#    apiVersion: rbac.authorization.k8s.io/v1
#    kind: Role
#    metadata:
#      name: nemo-skills-role
#      namespace: nemo-skills
#    rules:
#      - apiGroups: ["batch"]
#        resources: ["jobs"]
#        verbs: ["create", "delete", "get", "list", "watch"]
#      - apiGroups: [""]
#        resources: ["pods", "pods/log"]
#        verbs: ["get", "list", "watch"]
#      - apiGroups: [""]
#        resources: ["services"]
#        verbs: ["create", "delete", "get", "list"]  # Required for multi-node headless services
#    EOF
#
# 4. Bind the role to the service account:
#    kubectl create rolebinding nemo-skills-binding \
#      --role=nemo-skills-role \
#      --serviceaccount=nemo-skills:nemo-skills-sa \
#      -n nemo-skills
#
# 5. Create PVCs for storage (example for models):
#    kubectl apply -f - <<EOF
#    apiVersion: v1
#    kind: PersistentVolumeClaim
#    metadata:
#      name: nemo-models-pvc
#      namespace: nemo-skills
#    spec:
#      accessModes:
#        - ReadWriteMany
#      resources:
#        requests:
#          storage: 500Gi
#      storageClassName: your-storage-class
#    EOF
#
# 6. Create image pull secrets for private registries:
#    kubectl create secret docker-registry nvcr-secret \
#      --docker-server=nvcr.io \
#      --docker-username='$oauthtoken' \
#      --docker-password=<NGC_API_KEY> \
#      -n nemo-skills
#
# 7. Ensure GPU Operator is installed for GPU scheduling:
#    https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html
#
# 8. For multi-node GPU jobs with RDMA/InfiniBand, ensure Network Operator is installed:
#    https://docs.nvidia.com/networking/display/cokan10/network+operator
107 changes: 107 additions & 0 deletions cluster_configs/kubernetes/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# NeMo-Skills Kubernetes Setup

This directory contains Kubernetes manifests for setting up NeMo-Skills on a Kubernetes cluster.

## Prerequisites

- Kubernetes cluster 1.24+ (multi-node Indexed Jobs + Headless Service pattern)
- [NVIDIA GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html) installed
- Storage class that supports ReadWriteMany (RWX) access mode
- `kubectl` configured to access your cluster

## Quick Start

```bash
# 1. Create namespace and RBAC
kubectl apply -f rbac.yaml

# 2. Create image pull secret for NGC (if using NVIDIA containers)
kubectl create secret docker-registry nvcr-secret \
  --namespace=nemo-skills \
  --docker-server=nvcr.io \
  --docker-username='$oauthtoken' \
  --docker-password=YOUR_NGC_API_KEY

# 3. Update storage.yaml with your storage class, then apply
# Edit storage.yaml first: replace REPLACE_WITH_YOUR_STORAGE_CLASS
kubectl apply -f storage.yaml

# 4. Copy and customize the cluster config
cp ../example-kubernetes.yaml ../my-cluster.yaml
# Edit my-cluster.yaml with your settings
```

## Files

| File | Description |
|------|-------------|
| `rbac.yaml` | ServiceAccount, Role, and RoleBinding for jobs/pods/services management |
| `storage.yaml` | PVC templates for models, data, and results |
| `image-pull-secret.yaml` | Instructions for creating image pull secrets |

## Verification

```bash
# Check namespace and service account
kubectl get namespace nemo-skills
kubectl get serviceaccount -n nemo-skills

# Check RBAC
kubectl auth can-i create jobs --as=system:serviceaccount:nemo-skills:nemo-skills-sa -n nemo-skills
kubectl auth can-i create services --as=system:serviceaccount:nemo-skills:nemo-skills-sa -n nemo-skills
kubectl auth can-i delete services --as=system:serviceaccount:nemo-skills:nemo-skills-sa -n nemo-skills

# Check PVCs
kubectl get pvc -n nemo-skills

# Check GPU nodes
kubectl get nodes -l nvidia.com/gpu.present=true
```

## Troubleshooting

### Jobs stuck in Pending
```bash
kubectl describe job <job-name> -n nemo-skills
kubectl describe pod -l job-name=<job-name> -n nemo-skills
```

### Image pull errors
```bash
kubectl get events -n nemo-skills --field-selector reason=Failed
```

### GPU not available
```bash
# Check GPU operator
kubectl get pods -n gpu-operator

# Check node GPU resources
kubectl describe node <node-name> | grep nvidia.com/gpu
```

## Multi-Tenant Setup

For multiple teams, create separate namespaces with resource quotas:

```bash
# Create team namespace
kubectl create namespace team-alpha

# Apply RBAC (update namespace in rbac.yaml)
sed 's/nemo-skills/team-alpha/g' rbac.yaml | kubectl apply -f -

# Add resource quota
kubectl apply -f - <<EOF
apiVersion: v1
kind: ResourceQuota
metadata:
  name: team-alpha-quota
  namespace: team-alpha
spec:
  hard:
    requests.nvidia.com/gpu: "16"
    requests.memory: "128Gi"
    pods: "20"
EOF
```
Loading