Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -28,7 +28,11 @@ repos:
args: ['--maxkb=1000']
- id: check-case-conflict
- id: check-yaml
exclude: ^mkdocs\.yml$
exclude: ^(mkdocs\.yml|cluster_configs/kubernetes/.*\.ya?ml|scripts/k8s-tests/(manifests|image-build)/.*\.ya?ml)$
- id: check-yaml
name: check-yaml-k8s-multidoc
files: ^(cluster_configs/kubernetes/.*\.ya?ml|scripts/k8s-tests/(manifests|image-build)/.*\.ya?ml)$
args: ['--allow-multiple-documents']
- id: detect-private-key
- id: end-of-file-fixer
exclude: docs/|\.txt$|\.patch$|test$
Expand All @@ -45,6 +49,11 @@ repos:

- repo: local
hooks:
- id: validate-k8s-manifests
name: Validate Kubernetes manifests (kubeconform/kubectl)
entry: scripts/k8s-tests/validate_k8s_manifests.sh
language: system
files: ^(cluster_configs/kubernetes/.*\.ya?ml|scripts/k8s-tests/(manifests|image-build)/.*\.ya?ml)$
- id: check-signoff
name: Check Signed-off-by
entry: bash -c 'if ! grep -q "Signed-off-by:" "$1"; then echo "❌ Commit message must be signed off. Use git commit -s to add it automatically."; exit 1; fi' --
Expand Down
190 changes: 190 additions & 0 deletions cluster_configs/example-kubernetes.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

executor: kubernetes

# Kubernetes cluster configuration
# kubeconfig: ~/.kube/config # Optional: path to kubeconfig file (uses in-cluster config if omitted)
# context: my-cluster-context # Optional: kubectl context to use
namespace: nemo-skills # Kubernetes namespace for jobs

# Container images
containers:
  # Map logical names to actual images
  # These can be referenced by name in job specs
  vllm: nvcr.io/nvidia/vllm:latest
  nemo-skills: nvcr.io/nvidia/nemo-skills:latest
  sglang: lmsysorg/sglang:latest
  trtllm: nvcr.io/nvidia/tensorrt-llm:latest
  sandbox: nvcr.io/nvidia/nemo-skills-sandbox:latest

# Image pull secrets for private registries
# image_pull_secrets:
#   - nvcr-secret
#   - private-registry-secret

# Service account for pods (should have appropriate RBAC permissions)
service_account: nemo-skills-sa

# Resource pools for node scheduling
# These map to node selectors and tolerations for GPU/CPU workloads
resource_pools:
  # GPU pool for inference/training workloads
  gpu-a100:
    node_selector:
      nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
    tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule

  # GPU pool for H100 nodes
  gpu-h100:
    node_selector:
      nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3
    tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule

  # CPU pool for data processing workloads
  cpu:
    node_selector:
      node-type: cpu-worker
    tolerations: []

# PVC-based storage configuration
# These volumes are mounted to all containers in jobs
storage:
  models:
    pvc_name: nemo-models-pvc
    mount_path: /models
  data:
    pvc_name: nemo-data-pvc
    mount_path: /data
  results:
    pvc_name: nemo-results-pvc
    mount_path: /results

# Job timeout settings
default_timeout: "6h"
timeouts:
  inference: "2h"
  training: "24h"
  data-processing: "12h"

# Environment variables for all containers
env_vars:
  - HF_HOME=/models/hf-cache
  - NCCL_DEBUG=INFO
  # - CUDA_VISIBLE_DEVICES=all

# RDMA/InfiniBand for multi-node GPU jobs (opt-in)
# When enabled, multi-node jobs request RDMA shared devices so NCCL
# can use IB/RoCE instead of falling back to TCP/Socket.
# Requires NVIDIA Network Operator installed on the cluster.
# rdma:
#   enabled: true
#   resource_name: nvidia.com/rdma_shared_device  # default
#   resource_count: 1  # default (number of RDMA devices per container)

# Pod scheduling for multi-node jobs
# Spreads pods across different nodes to ensure multi-node training
# uses distinct physical nodes (not all pods on the same node).
# scheduling:
#   spread_across_nodes: true  # default: true for multi-node
#   topology_key: kubernetes.io/hostname  # default (spread per node)

# DNS readiness check for multi-node jobs (enabled by default for multi-node)
# An init container waits for MASTER_ADDR DNS to resolve before training starts.
# dns_check:
#   enabled: true  # default: true for multi-node jobs
#   image: busybox:1.36  # default (lightweight image for nslookup)
#   timeout_seconds: 300  # default (5 minutes)

# RBAC preflight checks for multi-node jobs (enabled by default)
# Verifies create/delete/get/list permissions on Services before submit.
# rbac_preflight: true

# Image pull policy (default: IfNotPresent)
# Set to "Never" when using locally-loaded images via ctr import
# image_pull_policy: Never

# Fallback backend if Kubernetes is unavailable
# Useful for development/testing when K8s cluster is not accessible
# fallback_executor: local

# =============================================================================
# Setup Instructions
# =============================================================================
#
# 1. Create the namespace:
#    kubectl create namespace nemo-skills
#
# 2. Create a service account with appropriate permissions:
#    kubectl create serviceaccount nemo-skills-sa -n nemo-skills
#
# 3. Create RBAC role for job management:
#    kubectl apply -f - <<EOF
#    apiVersion: rbac.authorization.k8s.io/v1
#    kind: Role
#    metadata:
#      name: nemo-skills-role
#      namespace: nemo-skills
#    rules:
#      - apiGroups: ["batch"]
#        resources: ["jobs"]
#        verbs: ["create", "delete", "get", "list", "watch"]
#      - apiGroups: [""]
#        resources: ["pods", "pods/log"]
#        verbs: ["get", "list", "watch"]
#      - apiGroups: [""]
#        resources: ["services"]
#        verbs: ["create", "delete", "get", "list"]  # Required for multi-node headless services
#    EOF
#
# 4. Bind the role to the service account:
#    kubectl create rolebinding nemo-skills-binding \
#      --role=nemo-skills-role \
#      --serviceaccount=nemo-skills:nemo-skills-sa \
#      -n nemo-skills
#
# 5. Create PVCs for storage (example for models):
#    kubectl apply -f - <<EOF
#    apiVersion: v1
#    kind: PersistentVolumeClaim
#    metadata:
#      name: nemo-models-pvc
#      namespace: nemo-skills
#    spec:
#      accessModes:
#        - ReadWriteMany
#      resources:
#        requests:
#          storage: 500Gi
#      storageClassName: your-storage-class
#    EOF
#
# 6. Create image pull secrets for private registries:
#    kubectl create secret docker-registry nvcr-secret \
#      --docker-server=nvcr.io \
#      --docker-username='$oauthtoken' \
#      --docker-password=<NGC_API_KEY> \
#      -n nemo-skills
#
# 7. Ensure GPU Operator is installed for GPU scheduling:
#    https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html
#
# 8. For multi-node GPU jobs with RDMA/InfiniBand, ensure Network Operator is installed:
#    https://docs.nvidia.com/networking/display/cokan10/network+operator
107 changes: 107 additions & 0 deletions cluster_configs/kubernetes/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# NeMo-Skills Kubernetes Setup

This directory contains Kubernetes manifests for setting up NeMo-Skills on a Kubernetes cluster.

## Prerequisites

- Kubernetes cluster 1.24+ (multi-node Indexed Jobs + Headless Service pattern)
- [NVIDIA GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html) installed
- Storage class that supports ReadWriteMany (RWX) access mode
- `kubectl` configured to access your cluster

## Quick Start

```bash
# 1. Create namespace and RBAC
kubectl apply -f rbac.yaml

# 2. Create image pull secret for NGC (if using NVIDIA containers)
kubectl create secret docker-registry nvcr-secret \
  --namespace=nemo-skills \
  --docker-server=nvcr.io \
  --docker-username='$oauthtoken' \
  --docker-password=YOUR_NGC_API_KEY

# 3. Update storage.yaml with your storage class, then apply
# Edit storage.yaml first: replace REPLACE_WITH_YOUR_STORAGE_CLASS
kubectl apply -f storage.yaml

# 4. Copy and customize the cluster config
cp ../example-kubernetes.yaml ../my-cluster.yaml
# Edit my-cluster.yaml with your settings
```

## Files

| File | Description |
|------|-------------|
| `rbac.yaml` | ServiceAccount, Role, and RoleBinding for jobs/pods/services management |
| `storage.yaml` | PVC templates for models, data, and results |
| `image-pull-secret.yaml` | Instructions for creating image pull secrets |

## Verification

```bash
# Check namespace and service account
kubectl get namespace nemo-skills
kubectl get serviceaccount -n nemo-skills

# Check RBAC
kubectl auth can-i create jobs --as=system:serviceaccount:nemo-skills:nemo-skills-sa -n nemo-skills
kubectl auth can-i create services --as=system:serviceaccount:nemo-skills:nemo-skills-sa -n nemo-skills
kubectl auth can-i delete services --as=system:serviceaccount:nemo-skills:nemo-skills-sa -n nemo-skills

# Check PVCs
kubectl get pvc -n nemo-skills

# Check GPU nodes
kubectl get nodes -l nvidia.com/gpu.present=true
```

## Troubleshooting

### Jobs stuck in Pending
```bash
kubectl describe job <job-name> -n nemo-skills
kubectl describe pod -l job-name=<job-name> -n nemo-skills
```

### Image pull errors
```bash
kubectl get events -n nemo-skills --field-selector reason=Failed
```

### GPU not available
```bash
# Check GPU operator
kubectl get pods -n gpu-operator

# Check node GPU resources
kubectl describe node <node-name> | grep nvidia.com/gpu
```

## Multi-Tenant Setup

For multiple teams, create separate namespaces with resource quotas:

```bash
# Create team namespace
kubectl create namespace team-alpha

# Apply RBAC (update namespace in rbac.yaml)
sed 's/nemo-skills/team-alpha/g' rbac.yaml | kubectl apply -f -

# Add resource quota
kubectl apply -f - <<EOF
apiVersion: v1
kind: ResourceQuota
metadata:
  name: team-alpha-quota
  namespace: team-alpha
spec:
  hard:
    requests.nvidia.com/gpu: "16"
    requests.memory: "128Gi"
    pods: "20"
EOF
```
Loading