diff --git a/.claude/clusters.yaml.example b/.claude/clusters.yaml.example new file mode 100644 index 00000000000..5bf4182e5c2 --- /dev/null +++ b/.claude/clusters.yaml.example @@ -0,0 +1,18 @@ +# ModelOpt Remote Cluster Configuration +# Copy to ~/.config/modelopt/clusters.yaml (user-level, recommended) +# or .claude/clusters.yaml (project-level, can be committed). + +clusters: + # GPU workstation or SLURM login node + my-cluster: + login_node: cluster-login.example.com + user: myusername + ssh_key: ~/.ssh/id_rsa + # ssh_proxy: "socat - PROXY:localhost:%h:%p,proxyport=3128" # optional + workspace: /path/to/remote/workdir + gpu_type: H100 # used for quantization format recommendation + # slurm: + # default_account: my_account + # default_partition: batch_short + +default_cluster: my-cluster diff --git a/.claude/skills/common/remote_exec.sh b/.claude/skills/common/remote_exec.sh new file mode 100644 index 00000000000..e5a1bc2b242 --- /dev/null +++ b/.claude/skills/common/remote_exec.sh @@ -0,0 +1,492 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# remote_exec.sh — Remote execution utility for ModelOpt agent skills
#
# Usage:
#   source .claude/skills/common/remote_exec.sh
#   remote_load_cluster <cluster_name>                  # or: remote_load_cluster (uses default)
#   remote_check_ssh
#   remote_detect_env                                   # detect SLURM vs Docker vs bare metal
#   remote_run "command"
#   remote_sync_to <local_path> [remote_subdir]
#   remote_sync_from <remote_subdir> <local_path>
#   remote_submit_job <script_path>                     # SLURM only
#   remote_poll_job <job_id>                            # SLURM only
#   remote_wait_job <job_id> [interval=30]              # SLURM only
#   remote_docker_run <container_or_image> "<command>"  # Docker only
#   remote_tail_log <log_path> [lines=50]
#
# After remote_load_cluster, these env vars are set:
#   REMOTE_HOST, REMOTE_USER, REMOTE_SSH_KEY, REMOTE_SSH_PROXY,
#   REMOTE_WORKSPACE, REMOTE_GPU_TYPE, REMOTE_ENV_TYPE,
#   REMOTE_CONTAINER_IMAGE, REMOTE_SLURM_ACCOUNT, REMOTE_SLURM_PARTITION
#
# NOTE: this file is meant to be sourced, so the `set` below also changes the
# options of the calling shell. Every function here is written so that its own
# error paths still run under errexit/pipefail.

set -euo pipefail

# ── Helpers ──────────────────────────────────────────────────────────────────

_remote_config_file() {
  # Print the path of the clusters.yaml to use, or an empty line if none exists.
  # Precedence: user-level (~/.config/modelopt) > nearest project-level
  # .claude/clusters.yaml found by walking up from $PWD.
  local user_config="${HOME}/.config/modelopt/clusters.yaml"
  local project_config=""
  local dir="$PWD"
  # The extra guards stop the walk if dirname ever degenerates to "" or ".".
  while [[ -n "$dir" && "$dir" != "/" && "$dir" != "." ]]; do
    if [[ -f "$dir/.claude/clusters.yaml" ]]; then
      project_config="$dir/.claude/clusters.yaml"
      break
    fi
    dir="$(dirname "$dir")"
  done

  if [[ -f "$user_config" ]]; then
    echo "$user_config"
  elif [[ -n "$project_config" ]]; then
    echo "$project_config"
  else
    echo ""
  fi
}

_parse_yaml_value() {
  # Simple YAML value extractor: _parse_yaml_value <file> <dotted.key.path>
  # Handles simple scalar values only (not arrays/nested objects).
  # Prints the value, or nothing when the key is missing, the value is null,
  # or the file cannot be parsed (best-effort: errors are swallowed).
  # The file name and key path are passed as argv rather than spliced into the
  # Python source, so quotes or backslashes in either cannot break the script.
  local file="$1" path="$2"
  python3 -c '
import sys
import yaml

with open(sys.argv[1]) as f:
    data = yaml.safe_load(f)
for key in sys.argv[2].split("."):
    if isinstance(data, dict) and key in data:
        data = data[key]
    else:
        sys.exit(0)
if data is not None:
    print(data)
' "$file" "$path" 2>/dev/null || true
}

_remote_quote_path() {
  # Render a remote path as one shell word for the remote bash.
  # A leading "~" or "~/" is mapped to "$HOME" so it still expands remotely;
  # everything else is quoted with printf %q (spaces, globs, quotes are safe).
  local p="$1"
  case "$p" in
    "~") printf '%s' '"$HOME"' ;;
    "~/"*) printf '"$HOME"/%q' "${p#"~/"}" ;;
    *) printf '%q' "$p" ;;
  esac
}

_ssh_control_path() {
  # Return the path for the SSH ControlMaster socket.
  # Use a per-host socket so multiple clusters don't collide.
  # Try multiple writable locations (sandbox may restrict /tmp).
  local tmpdir="" candidate
  for candidate in "${TMPDIR:-}" /tmp/claude-*/ssh-ctl /tmp; do
    if [[ -n "$candidate" && -d "$candidate" && -w "$candidate" ]]; then
      tmpdir="$candidate"
      break
    fi
  done
  # Fallback: create in home dir
  tmpdir="${tmpdir:-$HOME/.cache/ssh-ctl}"
  mkdir -p "$tmpdir" 2>/dev/null || true
  # Use short name to avoid Unix socket path length limit (108 chars).
  # md5sum is not installed everywhere (e.g. macOS ships `md5`); without a
  # fallback the pipeline would fail and abort the caller under pipefail.
  local ident="${REMOTE_USER}@${REMOTE_HOST}" host_hash
  if command -v md5sum >/dev/null 2>&1; then
    host_hash=$(printf '%s\n' "$ident" | md5sum | cut -c1-12)
  elif command -v md5 >/dev/null 2>&1; then
    host_hash=$(printf '%s\n' "$ident" | md5 | cut -c1-12)
  else
    host_hash=$(printf '%s\n' "$ident" | cksum | cut -d' ' -f1)
  fi
  echo "${tmpdir}/ctl-${host_hash}"
}

_ssh_base_opts() {
  # Build SSH options (without the ssh command itself or user@host).
  # The result is a single string that callers either `eval` or hand to
  # `rsync -e`; values are wrapped in single quotes, which both understand.
  # Limitation: a value that itself contains a single quote is not supported.
  local opts="-o BatchMode=yes -o ConnectTimeout=15 -o StrictHostKeyChecking=accept-new"
  # ControlMaster multiplexing: reuse a single persistent SSH connection
  local ctl_path
  ctl_path="$(_ssh_control_path)"
  opts+=" -o ControlPath='${ctl_path}'"
  # ControlMaster=auto: reuse existing master if available, otherwise start a new one
  opts+=" -o ControlMaster=auto"
  if [[ -n "${REMOTE_SSH_KEY:-}" ]]; then
    # Quoted so a key path containing spaces survives word splitting.
    opts+=" -i '${REMOTE_SSH_KEY}'"
  fi
  if [[ -n "${REMOTE_SSH_PROXY:-}" ]]; then
    opts+=" -o ProxyCommand='${REMOTE_SSH_PROXY}'"
  fi
  echo "$opts"
}

_ssh_base_cmd() {
  # Build the full SSH command (as a string intended for `eval`).
  echo "ssh $(_ssh_base_opts) ${REMOTE_USER}@${REMOTE_HOST}"
}

# ── Session Management ───────────────────────────────────────────────────────

remote_start_session() {
  # Start a persistent SSH ControlMaster connection in the background.
  # All subsequent remote_run / remote_sync_* / scp calls reuse this connection.
  # Call this once after remote_load_cluster + remote_check_ssh.
  # Returns 0 when a master is (already) running, otherwise ssh's exit status.
  local ctl_path
  ctl_path="$(_ssh_control_path)"

  # If a master is already running, skip
  if ssh -o ControlPath="$ctl_path" -O check "${REMOTE_USER}@${REMOTE_HOST}" 2>/dev/null; then
    echo "SSH session already active (reusing existing connection)."
    return 0
  fi

  echo "Starting persistent SSH session to ${REMOTE_USER}@${REMOTE_HOST}..."
  local opts="-o BatchMode=yes -o ConnectTimeout=15 -o StrictHostKeyChecking=accept-new"
  opts+=" -o ControlMaster=yes -o ControlPath='${ctl_path}' -o ControlPersist=3600"
  if [[ -n "${REMOTE_SSH_KEY:-}" ]]; then
    opts+=" -i '${REMOTE_SSH_KEY}'"
  fi
  if [[ -n "${REMOTE_SSH_PROXY:-}" ]]; then
    opts+=" -o ProxyCommand='${REMOTE_SSH_PROXY}'"
  fi

  # Start the master in the background (-f -N: go background, no command).
  # Capture the status with `|| rc=$?`: a bare failing command would trip
  # errexit before the warning below could be printed.
  local rc=0
  eval "ssh $opts -f -N ${REMOTE_USER}@${REMOTE_HOST}" 2>&1 || rc=$?
  if (( rc == 0 )); then
    echo "SSH session established. All commands will reuse this connection."
    # Register cleanup trap (NOTE: this replaces any EXIT trap the caller set).
    trap 'remote_stop_session 2>/dev/null' EXIT
  else
    echo "WARNING: Failed to start persistent SSH session (rc=$rc). Commands will use individual connections." >&2
  fi
  return "$rc"
}

remote_stop_session() {
  # Gracefully close the persistent SSH connection (no-op if no socket exists).
  local ctl_path
  ctl_path="$(_ssh_control_path)"
  if [[ -S "$ctl_path" ]]; then
    ssh -o ControlPath="$ctl_path" -O exit "${REMOTE_USER}@${REMOTE_HOST}" 2>/dev/null || true
    echo "SSH session closed."
  fi
}

# ── Core Functions ───────────────────────────────────────────────────────────

remote_load_cluster() {
  # Load cluster config by name. If no name given, use default_cluster.
  # Exports the REMOTE_* variables listed in the file header.
  # Returns 1 when no config file, no cluster name, or no login_node is found.
  # NOTE: keys are looked up by dotted path, so a cluster name containing "."
  # cannot be resolved.
  local cluster_name="${1:-}"
  local config_file
  config_file="$(_remote_config_file)"

  if [[ -z "$config_file" ]]; then
    echo "ERROR: No clusters.yaml found. Provide cluster info interactively or create one." >&2
    echo "  User config:    ~/.config/modelopt/clusters.yaml" >&2
    echo "  Project config: .claude/clusters.yaml" >&2
    return 1
  fi

  # _parse_yaml_value swallows every error, so a missing PyYAML would otherwise
  # surface as a misleading "no default_cluster" / "no login_node" message.
  if ! python3 -c 'import yaml' >/dev/null 2>&1; then
    echo "ERROR: python3 with PyYAML is required to read $config_file." >&2
    return 1
  fi

  # Get default cluster if none specified
  if [[ -z "$cluster_name" ]]; then
    cluster_name="$(_parse_yaml_value "$config_file" "default_cluster")"
    if [[ -z "$cluster_name" ]]; then
      echo "ERROR: No cluster name specified and no default_cluster in config." >&2
      return 1
    fi
  fi

  # Parse cluster config
  local base="clusters.${cluster_name}"
  REMOTE_HOST="$(_parse_yaml_value "$config_file" "${base}.login_node")"
  REMOTE_USER="$(_parse_yaml_value "$config_file" "${base}.user")"
  REMOTE_SSH_KEY="$(_parse_yaml_value "$config_file" "${base}.ssh_key")"
  REMOTE_SSH_PROXY="$(_parse_yaml_value "$config_file" "${base}.ssh_proxy")"
  REMOTE_WORKSPACE="$(_parse_yaml_value "$config_file" "${base}.workspace")"
  REMOTE_GPU_TYPE="$(_parse_yaml_value "$config_file" "${base}.gpu_type")"
  REMOTE_CONTAINER_IMAGE="$(_parse_yaml_value "$config_file" "${base}.container_image")"
  REMOTE_ENV_TYPE="$(_parse_yaml_value "$config_file" "${base}.env_type")"

  # SLURM-specific
  REMOTE_SLURM_ACCOUNT="$(_parse_yaml_value "$config_file" "${base}.slurm.default_account")"
  REMOTE_SLURM_PARTITION="$(_parse_yaml_value "$config_file" "${base}.slurm.default_partition")"

  # Expand ~ in ssh_key
  if [[ "${REMOTE_SSH_KEY:-}" == "~/"* ]]; then
    REMOTE_SSH_KEY="${HOME}/${REMOTE_SSH_KEY#\~/}"
  fi

  # Validate required fields
  if [[ -z "$REMOTE_HOST" ]]; then
    echo "ERROR: Cluster '$cluster_name' has no login_node defined." >&2
    return 1
  fi

  # Default user to current user. $USER may be unset (containers, cron), which
  # would be fatal under `set -u`, so fall back to `id -un`.
  REMOTE_USER="${REMOTE_USER:-${USER:-$(id -un)}}"

  export REMOTE_HOST REMOTE_USER REMOTE_SSH_KEY REMOTE_SSH_PROXY
  export REMOTE_WORKSPACE REMOTE_GPU_TYPE REMOTE_CONTAINER_IMAGE
  export REMOTE_ENV_TYPE REMOTE_SLURM_ACCOUNT REMOTE_SLURM_PARTITION

  echo "Loaded cluster: $cluster_name (${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_WORKSPACE})"
}

remote_check_ssh() {
  # Validate SSH connectivity and start a persistent session.
  # After this call, all remote_run / remote_sync_* commands reuse one connection.
  # Returns 0 if either the persistent session or a one-off connection works.
  echo "Checking SSH connectivity to ${REMOTE_USER}@${REMOTE_HOST}..."
  # Start persistent session (also validates connectivity)
  if remote_start_session 2>&1; then
    return 0
  fi
  # Fallback: try a one-off connection
  local result=""
  if result=$(eval "$(_ssh_base_cmd)" '"echo SSH_OK"' 2>&1) && [[ "$result" == *SSH_OK* ]]; then
    echo "SSH connection OK (no persistent session — commands will be slower)."
    return 0
  fi
  echo "ERROR: SSH connection failed:" >&2
  printf '%s\n' "$result" >&2
  return 1
}

remote_detect_env() {
  # Auto-detect remote environment: slurm, docker, or bare.
  # Sets and exports REMOTE_ENV_TYPE and prints the discovered GPU info.
  # A value already set in the config (other than "auto") is kept as-is.
  # NOTE: the Docker probe runs `docker run ... nvidia/cuda`, which may pull
  # the image on first use.
  if [[ -n "${REMOTE_ENV_TYPE:-}" && "$REMOTE_ENV_TYPE" != "auto" ]]; then
    echo "Environment type: $REMOTE_ENV_TYPE (from config)"
    return 0
  fi

  echo "Detecting remote environment..."
  local info="" rc=0
  info=$(remote_run "
    echo ENV_DETECT_START;
    # Check SLURM
    if command -v sbatch &>/dev/null; then
      echo 'HAS_SLURM=yes';
      sacctmgr show associations user=\$USER format=account%30,partition%20,cluster%20 -n 2>/dev/null | head -20;
      echo 'SLURM_PARTITIONS_START';
      sinfo -o '%P %a %l %D %G' 2>/dev/null | head -30;
      echo 'SLURM_PARTITIONS_END';
    else
      echo 'HAS_SLURM=no';
    fi;
    # Check Docker
    if command -v docker &>/dev/null; then
      echo 'HAS_DOCKER=yes';
      # Check if docker can access GPUs
      docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null && echo 'DOCKER_GPU=yes' || echo 'DOCKER_GPU=no';
    else
      echo 'HAS_DOCKER=no';
    fi;
    # Check bare metal GPU
    if command -v nvidia-smi &>/dev/null; then
      echo 'HAS_BARE_GPU=yes';
      nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null;
    else
      echo 'HAS_BARE_GPU=no';
    fi;
    echo ENV_DETECT_END;
  " 2>&1) || rc=$?

  printf '%s\n' "$info"
  if (( rc != 0 )); then
    # Keep going: the partial output may still identify the environment.
    echo "WARNING: environment probe exited with status $rc; detection may be incomplete." >&2
  fi

  # Priority: SLURM > Docker > bare metal. Pattern matching avoids the
  # `echo | grep -q` pipelines, which can fail spuriously under pipefail.
  case "$info" in
    *"HAS_SLURM=yes"*) REMOTE_ENV_TYPE="slurm" ;;
    *"HAS_DOCKER=yes"*) REMOTE_ENV_TYPE="docker" ;;
    *"HAS_BARE_GPU=yes"*) REMOTE_ENV_TYPE="bare" ;;
    *) REMOTE_ENV_TYPE="unknown" ;;
  esac

  export REMOTE_ENV_TYPE
  echo "Detected environment: $REMOTE_ENV_TYPE"
}

remote_run() {
  # Run a command on the remote machine.
  # Usage: remote_run "command"
  # The command runs in REMOTE_WORKSPACE (or the remote $HOME when unset).
  # Uses base64 encoding to avoid all quoting/escaping issues.
  # Retries up to 3 times on SSH connection failures.
  # Prints the combined stdout+stderr once the command finishes and returns
  # the remote exit status.
  # NOTE: status 255 is treated as an SSH connection failure, so a remote
  # command that itself exits 255 is retried as well.
  local cmd="$1"
  local ws_word='"$HOME"'
  if [[ -n "${REMOTE_WORKSPACE:-}" ]]; then
    ws_word="$(_remote_quote_path "$REMOTE_WORKSPACE")"
  fi
  local full_cmd="cd ${ws_word} && ${cmd}"
  local encoded
  # `base64 -w0` is GNU-only; stripping newlines is the portable equivalent.
  encoded=$(printf '%s' "$full_cmd" | base64 | tr -d '\n')

  local attempt=0 max_attempts=3 result="" rc=0
  while (( attempt < max_attempts )); do
    rc=0
    result=$(eval "$(_ssh_base_cmd)" "'echo $encoded | base64 -d | bash'" 2>&1) || rc=$?
    if (( rc != 255 )); then
      # rc=255 is SSH connection failure; anything else is the remote command's exit code
      printf '%s\n' "$result"
      return "$rc"
    fi
    attempt=$((attempt + 1))
    if (( attempt < max_attempts )); then
      echo "SSH connection failed (attempt $attempt/$max_attempts), retrying in 10s..." >&2
      sleep 10
    fi
  done
  printf '%s\n' "$result"
  return "$rc"
}

remote_sync_to() {
  # Sync local path to remote workspace.
  # Usage: remote_sync_to <local_path> [remote_subdir]
  # The contents of <local_path> are copied into REMOTE_WORKSPACE/<remote_subdir>.
  local local_path="$1"
  local remote_subdir="${2:-}"
  local remote_dest="${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_WORKSPACE}/${remote_subdir}"

  # Build argv as an array instead of an eval'd string so that quotes or
  # spaces in the paths cannot break (or inject into) the command line.
  local -a rsync_args=(-avz --progress)
  # Add default excludes
  local pattern
  for pattern in .git __pycache__ "*.pyc" .claude node_modules "*.egg-info"; do
    rsync_args+=("--exclude=${pattern}")
  done
  # Reuse the shared SSH options (including ControlMaster)
  rsync_args+=(-e "ssh $(_ssh_base_opts)")

  echo "Syncing ${local_path} → ${remote_dest} ..."
  rsync "${rsync_args[@]}" "${local_path}/" "${remote_dest}"
}

remote_sync_from() {
  # Sync from remote to local.
  # Usage: remote_sync_from <remote_subdir> <local_path>
  # <local_path> is created if it does not exist.
  local remote_subdir="$1"
  local local_path="$2"
  local remote_src="${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_WORKSPACE}/${remote_subdir}"

  mkdir -p "$local_path"
  echo "Fetching ${remote_src} → ${local_path} ..."
  rsync -avz --progress -e "ssh $(_ssh_base_opts)" "${remote_src}/" "${local_path}/"
}

# ── SLURM Functions ──────────────────────────────────────────────────────────

remote_submit_job() {
  # Submit a SLURM job script that's already on the remote machine.
  # Usage: remote_submit_job <script_path>
  # Returns: job ID on stdout; status 1 (with sbatch's output on stderr) when
  # no job ID could be obtained.
  local script_path="$1"
  local output="" rc=0 jobid=""
  # `|| rc=$?` keeps errexit from aborting before the error is reported.
  output=$(remote_run "sbatch $(printf '%q' "$script_path")" 2>&1) || rc=$?
  if [[ "$output" =~ Submitted\ batch\ job\ ([0-9]+) ]]; then
    jobid="${BASH_REMATCH[1]}"
  elif (( rc == 0 )); then
    # Non-standard sbatch wrapper: fall back to the last number printed.
    jobid=$(grep -o '[0-9]\+' <<<"$output" | tail -1) || jobid=""
  fi
  if [[ -z "$jobid" ]]; then
    echo "ERROR: Failed to submit job:" >&2
    printf '%s\n' "$output" >&2
    return 1
  fi
  echo "$jobid"
}

remote_poll_job() {
  # Check SLURM job state.
  # Usage: remote_poll_job <job_id>
  # Returns: PENDING, RUNNING, COMPLETED, FAILED, TIMEOUT, CANCELLED, etc.,
  # or UNKNOWN when neither squeue nor sacct yields a state.
  local jobid="$1"
  local qjob state=""
  qjob=$(printf '%q' "$jobid")
  # squeue fails once the job has left the queue; `|| true` keeps that from
  # aborting the caller under errexit/pipefail.
  state=$(remote_run "squeue -j $qjob -h -o %T 2>/dev/null" 2>&1 \
    | awk 'NF { last = $1 } END { if (last != "") print last }') || true
  if [[ ! "$state" =~ ^[A-Z_]+$ ]]; then
    # Job no longer in queue (or the output was an error message) — check sacct.
    # -P prints the untruncated state; the default 10-column field would give
    # "CANCELLED+" / "OUT_OF_ME+". The first word drops "by <uid>".
    state=$(remote_run "sacct -j $qjob --format=State -n -X -P 2>/dev/null" 2>&1 \
      | awk 'NF && !seen { print $1; seen = 1 }') || true
  fi
  # Anything that does not look like a SLURM state (e.g. ssh errors) is UNKNOWN.
  [[ "$state" =~ ^[A-Z_]+$ ]] || state=""
  echo "${state:-UNKNOWN}"
}

remote_wait_job() {
  # Wait for a SLURM job to complete.
  # Usage: remote_wait_job <job_id> [poll_interval_seconds=30]
  # Returns 0 on COMPLETED, 1 on a failed terminal state or when the state
  # stays UNKNOWN for several consecutive polls.
  local jobid="$1"
  local interval="${2:-30}"
  # A job can be briefly invisible to both squeue and sacct (accounting lag),
  # so tolerate a few UNKNOWN polls before giving up.
  local state unknown_polls=0 max_unknown_polls=3
  echo "Waiting for job $jobid (polling every ${interval}s)..."
  while true; do
    state=$(remote_poll_job "$jobid") || state="UNKNOWN"
    echo "$(date '+%H:%M:%S') Job $jobid: $state"
    case "$state" in
      COMPLETED)
        echo "Job $jobid completed successfully."
        return 0
        ;;
      # PREEMPTED is deliberately not listed: a preempted job may be requeued.
      FAILED|TIMEOUT|CANCELLED|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE)
        echo "ERROR: Job $jobid ended with state: $state" >&2
        remote_job_result "$jobid" || true
        return 1
        ;;
      UNKNOWN)
        unknown_polls=$((unknown_polls + 1))
        if (( unknown_polls >= max_unknown_polls )); then
          echo "WARNING: Could not determine job state. Checking sacct..." >&2
          remote_job_result "$jobid" || true
          return 1
        fi
        ;;
      *)
        unknown_polls=0
        ;;
    esac
    sleep "$interval"
  done
}

remote_job_result() {
  # Get job result details from sacct.
  # Usage: remote_job_result <job_id>
  local jobid="$1"
  remote_run "sacct -j $(printf '%q' "$jobid") --format=JobID,State,ExitCode,Elapsed,MaxRSS -n 2>/dev/null"
}

# ── Docker Functions ─────────────────────────────────────────────────────────

remote_docker_run() {
  # Run a command inside a Docker container on the remote machine.
  # Usage: remote_docker_run <container_or_image> "<command>"
  # If container_or_image matches a running container name, uses docker exec.
  # Otherwise, uses docker run with the given image, mounting REMOTE_WORKSPACE.
  local container_or_image="$1"
  local cmd="$2"
  # %q-quote both values for the remote bash so embedded quotes or spaces in
  # the command cannot terminate the `bash -c` argument early.
  local q_name q_cmd
  q_name=$(printf '%q' "$container_or_image")
  q_cmd=$(printf '%q' "$cmd")

  # Check if it's a running container. -F: the name is a literal, not a regex.
  # Comparing the output (rather than testing for non-empty) keeps an ssh
  # error message from being mistaken for a running container.
  local is_running=""
  is_running=$(remote_run "docker ps --format '{{.Names}}' | grep -Fx -- $q_name 2>/dev/null" 2>&1) || true

  if [[ "$is_running" == "$container_or_image" ]]; then
    echo "Executing in running container: $container_or_image"
    remote_run "docker exec $q_name bash -c $q_cmd"
  else
    if [[ -z "${REMOTE_WORKSPACE:-}" ]]; then
      echo "ERROR: REMOTE_WORKSPACE is not set; cannot mount a workspace into the container." >&2
      return 1
    fi
    local ws
    ws="$(_remote_quote_path "$REMOTE_WORKSPACE")"
    echo "Running in new container: $container_or_image"
    remote_run "docker run --rm --gpus all -v ${ws}:${ws} -w ${ws} $q_name bash -c $q_cmd"
  fi
}

# ── Log Functions ────────────────────────────────────────────────────────────

remote_tail_log() {
  # Tail a log file on the remote machine.
  # Usage: remote_tail_log <log_path> [num_lines=50]
  # Prints "Log file not found: <log_path>" when the file cannot be read.
  local log_path="$1"
  local lines="${2:-50}"
  if [[ ! "$lines" =~ ^[0-9]+$ ]]; then
    echo "ERROR: num_lines must be a non-negative integer, got '$lines'." >&2
    return 2
  fi
  local q_path
  q_path=$(printf '%q' "$log_path")
  remote_run "tail -n $lines $q_path 2>/dev/null || printf '%s\n' 'Log file not found: '$q_path"
}

# ── Workspace Functions ──────────────────────────────────────────────────────

remote_ensure_workspace() {
  # Create the remote workspace directory if it doesn't exist.
  if [[ -z "${REMOTE_WORKSPACE:-}" ]]; then
    echo "ERROR: REMOTE_WORKSPACE is not set." >&2
    return 1
  fi
  local ws
  ws="$(_remote_quote_path "$REMOTE_WORKSPACE")"
  # remote_run always does `cd <workspace>` first, which fails while the
  # directory is still missing. Run mkdir from the remote $HOME instead; the
  # subshell keeps the cleared variable from leaking into the caller.
  (
    REMOTE_WORKSPACE=""
    remote_run "mkdir -p $ws"
  )
  echo "Remote workspace ready: ${REMOTE_WORKSPACE}"
}

remote_workspace_info() {
  # Print useful info about the remote workspace (disk usage and a listing).
  local ws
  ws="$(_remote_quote_path "${REMOTE_WORKSPACE:-}")"
  remote_run "
    printf '%s\n' '=== Workspace: '${ws}' ===';
    echo '--- Disk usage ---';
    du -sh ${ws} 2>/dev/null || echo 'N/A';
    echo '--- Contents ---';
    ls -la ${ws}/ 2>/dev/null | head -20;
  "
}
diff --git a/.claude/skills/common/workspace-management.md b/.claude/skills/common/workspace-management.md new file mode 100644 index 00000000000..b7adb34ac41 --- /dev/null +++ b/.claude/skills/common/workspace-management.md @@ -0,0 +1,84 @@ +# Workspace Management + +When running via the Slack bot (or any multi-user environment), each user has a **workspace root** containing model-specific workspaces. Each workspace is a copy of the Model-Optimizer repo where the agent can freely modify code. + +## Environment Variables + +The bot sets these env vars before launching Claude: + +- `MODELOPT_WORKSPACE_ROOT` — user's workspace root (e.g., `/data/modelopt/users/U123/jobs/`) +- `MODELOPT_REPO_DIR` — path to the shared upstream repo (read-only source for copies) + +If these are not set, you are running locally — skip workspace management. 
+ +## When to Create vs Reuse a Workspace + +**Before starting any task**, check for an existing workspace that matches: + +```bash +# List existing workspaces +ls "$MODELOPT_WORKSPACE_ROOT/" 2>/dev/null +``` + +**Reuse** an existing workspace when: +- The task involves the same model (e.g., deploying a model you just quantized) +- The task needs output from a previous step (e.g., eval needs the PTQ checkpoint) +- The user says "deploy the model I just quantized" or similar + +**Create a new workspace** when: +- This is a new model not seen before +- The user explicitly asks for a fresh start +- The existing workspace's code modifications are incompatible (rare) + +## Creating a New Workspace + +Name workspaces by model/purpose, not timestamps: + +```bash +# Good names +qwen3-0.6b +llama-3.1-8b-fp8 +deepseek-v3-nvfp4 + +# Bad names (don't use) +ptq-20260318-143022 +job-001 +``` + +To create: + +```bash +rsync -a --quiet \ + --exclude .git --exclude __pycache__ --exclude '*.pyc' \ + --exclude node_modules --exclude '*.egg-info' --exclude '*.sqsh' \ + "$MODELOPT_REPO_DIR/" "$MODELOPT_WORKSPACE_ROOT//" +``` + +Then `cd` into the new workspace and continue with the task. + +## Injecting Cluster Config + +If `.claude/clusters.yaml` exists in the current workspace, it was injected by the bot. 
When creating a new workspace, copy it over: + +```bash +cp "$MODELOPT_WORKSPACE_ROOT/default/.claude/clusters.yaml" \ + "$MODELOPT_WORKSPACE_ROOT//.claude/clusters.yaml" 2>/dev/null +``` + +## Example Flow + +``` +User: "quantize Qwen3-0.6B with nvfp4" +Agent: ls $MODELOPT_WORKSPACE_ROOT/ → empty or no "qwen3-0.6b" + → create workspace "qwen3-0.6b" + → run PTQ, output to qwen3-0.6b/output/ + +User: "deploy the model I just quantized" +Agent: ls $MODELOPT_WORKSPACE_ROOT/ → sees "qwen3-0.6b" + → reuse workspace, find checkpoint at qwen3-0.6b/output/ + → deploy from there + +User: "now quantize Llama-3.1-8B with fp8" +Agent: ls $MODELOPT_WORKSPACE_ROOT/ → sees "qwen3-0.6b", no llama + → create workspace "llama-3.1-8b-fp8" +``` diff --git a/.claude/skills/deployment/SKILL.md b/.claude/skills/deployment/SKILL.md new file mode 100644 index 00000000000..90871526af9 --- /dev/null +++ b/.claude/skills/deployment/SKILL.md @@ -0,0 +1,262 @@ +--- +name: deployment +description: Serve a quantized or unquantized LLM checkpoint as an OpenAI-compatible API endpoint using vLLM, SGLang, or TRT-LLM. Use when user says "deploy model", "serve model", "start vLLM server", "launch SGLang", "TRT-LLM deploy", "AutoDeploy", "benchmark throughput", "serve checkpoint", or needs an inference endpoint from a HuggingFace or ModelOpt-quantized checkpoint. +license: Apache-2.0 +--- + +# Deployment Skill + +Serve a model checkpoint as an OpenAI-compatible inference endpoint. Supports vLLM, SGLang, and TRT-LLM (including AutoDeploy). + +## Quick Start + +Use the deploy script for the fastest path. 
It auto-detects quantization format from the checkpoint: + +```bash +# Start vLLM server with a ModelOpt checkpoint +scripts/deploy.sh start --model ./qwen3-0.6b-fp8 + +# Start with SGLang and tensor parallelism +scripts/deploy.sh start --model ./llama-70b-nvfp4 --framework sglang --tp 4 + +# Start from HuggingFace hub +scripts/deploy.sh start --model nvidia/Llama-3.1-8B-Instruct-FP8 + +# Test the API +scripts/deploy.sh test + +# Check status +scripts/deploy.sh status + +# Stop +scripts/deploy.sh stop +``` + +The script handles: GPU detection, quantization flag auto-detection (FP8 vs FP4), server lifecycle (start/stop/restart/status), health check polling, and API testing. + +## Decision Flow + +### 0. Check workspace (multi-user / Slack bot) + +If `MODELOPT_WORKSPACE_ROOT` is set, read `skills/common/workspace-management.md`. Before creating a new workspace, check for existing ones — especially if deploying a checkpoint from a prior PTQ run: + +```bash +ls "$MODELOPT_WORKSPACE_ROOT/" 2>/dev/null +``` + +If the user says "deploy the model I just quantized" or references a previous PTQ, find the matching workspace and `cd` into it. The checkpoint should be in that workspace's output directory. + +### 1. Identify the checkpoint + +Determine what the user wants to deploy: + +- **Local quantized checkpoint** (from ptq skill or manual export): look for `hf_quant_config.json` in the directory +- **HuggingFace model hub** (e.g., `nvidia/Llama-3.1-8B-Instruct-FP8`): use directly +- **Unquantized model**: deploy as-is (BF16) or suggest quantizing first with the ptq skill + +Check the quantization format if applicable: + +```bash +cat /hf_quant_config.json 2>/dev/null || echo "No quant config — unquantized or legacy format" +``` + +### 2. 
Choose the framework + +If the user hasn't specified a framework, recommend based on this priority: + +| Situation | Recommended | Why | +|-----------|-------------|-----| +| General use | **vLLM** | Widest ecosystem, easy setup, OpenAI-compatible | +| Best SGLang model support | **SGLang** | Strong DeepSeek/Llama 4 support | +| Maximum optimization | **TRT-LLM** | Best throughput via engine compilation | +| Mixed-precision / AutoQuant | **TRT-LLM AutoDeploy** | Only option for AutoQuant checkpoints | + +Check the support matrix in `references/support-matrix.md` to confirm the model + format + framework combination is supported. + +### 3. Check the environment + +**GPU availability:** + +```bash +python -c "import torch; [print(f'GPU {i}: {torch.cuda.get_device_name(i)}') for i in range(torch.cuda.device_count())] if torch.cuda.is_available() else print('no-gpu')" +``` + +**Framework installed?** + +```bash +# vLLM +python -c "import vllm; print(f'vLLM {vllm.__version__}')" 2>/dev/null || echo "vLLM not installed" + +# SGLang +python -c "import sglang; print(f'SGLang {sglang.__version__}')" 2>/dev/null || echo "SGLang not installed" + +# TRT-LLM +python -c "import tensorrt_llm; print(f'TRT-LLM {tensorrt_llm.__version__}')" 2>/dev/null || echo "TRT-LLM not installed" +``` + +If the framework is not installed, consult `references/setup.md` for installation instructions. + +**GPU memory estimate:** + +- BF16 model: `num_params × 2 bytes` (e.g., 8B model ≈ 16 GB) +- FP8 model: `num_params × 1 byte` (e.g., 8B model ≈ 8 GB) +- FP4 model: `num_params × 0.5 bytes` (e.g., 8B model ≈ 4 GB) +- Add ~2-4 GB for KV cache and framework overhead + +If the model exceeds single GPU memory, use tensor parallelism (`-tp `). + +### 4. 
Deploy + +Read the framework-specific reference for detailed instructions: + +| Framework | Reference file | +|-----------|---------------| +| vLLM | `references/vllm.md` | +| SGLang | `references/sglang.md` | +| TRT-LLM | `references/trtllm.md` | + +**Quick-start commands** (for common cases): + +#### vLLM + +```bash +# Serve as OpenAI-compatible endpoint +python -m vllm.entrypoints.openai.api_server \ + --model \ + --quantization modelopt \ + --tensor-parallel-size \ + --host 0.0.0.0 --port 8000 +``` + +For NVFP4 checkpoints, use `--quantization modelopt_fp4`. + +#### SGLang + +```bash +python -m sglang.launch_server \ + --model-path \ + --quantization modelopt \ + --tp \ + --host 0.0.0.0 --port 8000 +``` + +#### TRT-LLM (direct) + +```python +from tensorrt_llm import LLM, SamplingParams +llm = LLM(model="") +outputs = llm.generate(["Hello, my name is"], SamplingParams(temperature=0.8, top_p=0.95)) +``` + +#### TRT-LLM AutoDeploy + +For AutoQuant or mixed-precision checkpoints, see `references/trtllm.md`. + +### 5. Verify the deployment + +After the server starts, verify it's healthy: + +```bash +# Health check +curl -s http://localhost:8000/health + +# List models +curl -s http://localhost:8000/v1/models | python -m json.tool + +# Test generation +curl -s http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "", + "prompt": "The capital of France is", + "max_tokens": 32 + }' | python -m json.tool +``` + +All checks must pass before reporting success to the user. + +### 6. Benchmark (optional) + +If the user wants throughput/latency numbers, run a quick benchmark: + +```bash +# vLLM benchmark +python -m vllm.entrypoints.openai.api_server ... & # if not already running + +python -m vllm.benchmark_serving \ + --model \ + --port 8000 \ + --num-prompts 100 \ + --request-rate 10 +``` + +Report: throughput (tok/s), latency p50/p99, time to first token (TTFT). + +### 7. 
Remote deployment (SSH/SLURM) + +If a cluster config exists (`~/.config/modelopt/clusters.yaml` or `.claude/clusters.yaml`), or the user mentions running on a remote machine: + +1. **Source remote utilities:** + + ```bash + source .claude/skills/common/remote_exec.sh + remote_load_cluster + remote_check_ssh + remote_detect_env + ``` + +2. **Sync the checkpoint** (if it was produced locally): + + ```bash + remote_sync_to checkpoints/ + ``` + +3. **Deploy based on remote environment:** + + - **SLURM** — write a job script that starts the server inside a container, then submit: + + ```bash + srun --container-image="" \ + --container-mounts=":" \ + python -m vllm.entrypoints.openai.api_server \ + --model \ + --quantization modelopt \ + --host 0.0.0.0 --port 8000 + ``` + + Use `remote_submit_job` and `remote_poll_job` to manage the job. The server runs on the allocated node — get its hostname from `squeue -j $JOBID -o %N`. + + - **Bare metal / Docker** — use `remote_run` to start the server directly: + + ```bash + remote_run "nohup python -m vllm.entrypoints.openai.api_server --model --port 8000 > deploy.log 2>&1 &" + ``` + +4. **Verify remotely:** + + ```bash + remote_run "curl -s http://localhost:8000/health" + remote_run "curl -s http://localhost:8000/v1/models" + ``` + +5. **Report the endpoint** — include the remote hostname and port so the user can connect (e.g., `http://:8000`). For SLURM, note that the port is only reachable from within the cluster network. + +For NEL-managed deployment (evaluation with self-deployment), use the evaluation skill instead — NEL handles SLURM container deployment, health checks, and teardown automatically. 
+ +## Error Handling + +| Error | Cause | Fix | +|-------|-------|-----| +| `CUDA out of memory` | Model too large for GPU(s) | Increase `--tensor-parallel-size` or use a smaller model | +| `quantization="modelopt" not recognized` | vLLM/SGLang version too old | Upgrade: vLLM >= 0.10.1, SGLang >= 0.4.10 | +| `hf_quant_config.json not found` | Not a ModelOpt-exported checkpoint | Re-export with `export_hf_checkpoint()`, or remove `--quantization` flag | +| `Connection refused` on health check | Server still starting | Wait 30-60s for large models; check logs for errors | +| `modelopt_fp4 not supported` | Framework doesn't support FP4 for this model | Check support matrix in `references/support-matrix.md` | + +## Success Criteria + +1. Server process is running and healthy (`/health` returns 200) +2. Model is listed at `/v1/models` +3. Test generation produces coherent output +4. Server URL and port are reported to the user +5. If benchmarking was requested, throughput/latency numbers are reported diff --git a/.claude/skills/deployment/references/setup.md b/.claude/skills/deployment/references/setup.md new file mode 100644 index 00000000000..4209f08647b --- /dev/null +++ b/.claude/skills/deployment/references/setup.md @@ -0,0 +1,85 @@ +# Deployment Environment Setup + +## Framework Installation + +### vLLM + +```bash +pip install vllm +``` + +Minimum version: 0.10.1 + +### SGLang + +```bash +pip install "sglang[all]" +``` + +Minimum version: 0.4.10 + +### TRT-LLM + +TRT-LLM is best installed via NVIDIA container: + +```bash +docker pull nvcr.io/nvidia/tensorrt-llm/release: +``` + +Or via pip (requires CUDA toolkit): + +```bash +pip install tensorrt-llm +``` + +Minimum version: 0.17.0 + +## SLURM Deployment + +For SLURM clusters, deploy inside a container. 
Container flags MUST be on the `srun` line: + +```bash +#!/bin/bash +#SBATCH --job-name=deploy +#SBATCH --account= +#SBATCH --partition= +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --gpus-per-node= +#SBATCH --time=04:00:00 +#SBATCH --output=deploy_%j.log + +srun \ + --container-image="" \ + --container-mounts=":" \ + --container-workdir="" \ + --no-container-mount-home \ + bash -c "python -m vllm.entrypoints.openai.api_server \ + --model \ + --quantization modelopt \ + --tensor-parallel-size \ + --host 0.0.0.0 --port 8000" +``` + +To access the server from outside the SLURM node, note the allocated hostname: + +```bash +squeue -u $USER -o "%j %N %S" # Get the node name +# Then SSH tunnel or use the node's hostname directly +``` + +## Docker Deployment + +### vLLM with ModelOpt + +A Dockerfile is available at `examples/vllm_serve/Dockerfile`: + +```bash +docker build -f examples/vllm_serve/Dockerfile -t vllm-modelopt . + +docker run --gpus all -p 8000:8000 vllm-modelopt \ + python -m vllm.entrypoints.openai.api_server \ + --model \ + --quantization modelopt \ + --host 0.0.0.0 --port 8000 +``` diff --git a/.claude/skills/deployment/references/sglang.md b/.claude/skills/deployment/references/sglang.md new file mode 100644 index 00000000000..62d5c57b591 --- /dev/null +++ b/.claude/skills/deployment/references/sglang.md @@ -0,0 +1,81 @@ +# SGLang Deployment Reference + +## Requirements + +- SGLang >= 0.4.10 +- `pip install sglang[all]` + +## Server Deployment + +### As OpenAI-compatible server + +```bash +python -m sglang.launch_server \ + --model-path \ + --quantization modelopt \ + --tp \ + --host 0.0.0.0 --port 8000 +``` + +For NVFP4 checkpoints, use `--quantization modelopt_fp4`. 
+ +### As Python API + +```python +import sglang as sgl + +llm = sgl.Engine(model_path="", quantization="modelopt") +# For FP4: quantization="modelopt_fp4" + +sampling_params = {"temperature": 0.8, "top_p": 0.95} +outputs = llm.generate(["Hello, my name is"], sampling_params) + +for output in outputs: + print(f"Generated: {output['text']}") +``` + +### From HuggingFace Hub + +```python +import sglang as sgl + +llm = sgl.Engine(model_path="nvidia/Llama-3.1-8B-Instruct-FP8", quantization="modelopt") +outputs = llm.generate(["What is AI?"], {"temperature": 0.8}) +``` + +## Speculative Decoding + +SGLang supports speculative decoding with EAGLE and EAGLE3 models: + +```bash +python -m sglang.launch_server \ + --model-path \ + --speculative-algorithm EAGLE \ + --speculative-draft-model-path \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 4 \ + --tp \ + --host 0.0.0.0 --port 8000 +``` + +Reference: `examples/specdec_bench/specdec_bench/models/sglang.py` + +## Key SGLang Flags + +| Flag | Description | +|------|-------------| +| `--model-path` | Path to checkpoint or HF model ID | +| `--quantization` | `modelopt` (FP8) or `modelopt_fp4` (FP4) | +| `--tp` | Tensor parallelism size | +| `--ep` | Expert parallelism (for MoE models) | +| `--enable-torch-compile` | Enable torch.compile for better perf | +| `--cuda-graph-max-bs` | Max batch size for CUDA graphs | +| `--attention-backend` | `flashinfer` (default) or `triton` | + +## Common Issues + +| Issue | Fix | +|-------|-----| +| `quantization="modelopt"` not recognized | Upgrade SGLang to >= 0.4.10 | +| DeepSeek FP4 not working | Check support matrix — SGLang FP4 support varies by model | +| OOM on startup | Increase `--tp` or reduce `--max-total-tokens` | diff --git a/.claude/skills/deployment/references/support-matrix.md b/.claude/skills/deployment/references/support-matrix.md new file mode 100644 index 00000000000..8d0a6715375 --- /dev/null +++ b/.claude/skills/deployment/references/support-matrix.md @@ -0,0 
+1,58 @@ +# Deployment Support Matrix + +## Unified HF Checkpoint — Framework Compatibility + +| Model | Quant Format | TRT-LLM | vLLM | SGLang | +|-------|-------------|---------|------|--------| +| Llama 3.x | FP8 | yes | yes | yes | +| Llama 3.x | FP4 | yes | yes | yes | +| Llama 4 | FP8 | yes | — | yes | +| Llama 4 | FP4 | yes | — | — | +| DeepSeek R1 | FP8 | yes | yes | yes | +| DeepSeek R1 | FP4 | yes | yes | yes | +| DeepSeek V3 | FP8 | yes | yes | yes | +| DeepSeek V3 | FP4 | yes | yes | yes | +| Qwen 3 | FP8 | yes | yes | yes | +| Qwen 3 | FP4 | yes | yes | — | +| Qwen 3 MoE | FP8 | yes | yes | yes | +| Qwen 3 MoE | FP4 | yes | — | — | +| Qwen 2.5 | FP8 | yes | yes | yes | +| Qwen 2.5 | FP4 | yes | yes | — | +| QwQ-32B | FP8 | yes | yes | yes | +| QwQ-32B | FP4 | yes | yes | — | +| Mixtral 8x7B | FP8 | yes | yes | yes | +| Mixtral 8x7B | FP4 | yes | — | — | + +## Supported Quantization Formats + +| Format | Description | +|--------|-------------| +| FP8 | 8-bit floating point (E4M3) | +| FP8_PB | 8-bit floating point with per-block scaling | +| NVFP4 | NVIDIA 4-bit floating point | +| NVFP4_AWQ | NVIDIA 4-bit floating point with AWQ optimization | +| INT4_AWQ | 4-bit integer with AWQ (TRT-LLM only) | +| W4A8_AWQ | 4-bit weights, 8-bit activations with AWQ (TRT-LLM only) | + +## Minimum Framework Versions + +| Framework | Minimum Version | +|-----------|----------------| +| TensorRT-LLM | v0.17.0 | +| vLLM | v0.10.1 | +| SGLang | v0.4.10 | + +## Quantization Flag by Framework + +| Framework | FP8 flag | FP4 flag | +|-----------|----------|----------| +| vLLM | `quantization="modelopt"` | `quantization="modelopt_fp4"` | +| SGLang | `quantization="modelopt"` | `quantization="modelopt_fp4"` | +| TRT-LLM | auto-detected from checkpoint | auto-detected from checkpoint | + +## Notes + +- **NVFP4 inference requires Blackwell GPUs** (B100, B200, GB200). Hopper can run FP4 calibration but not inference. 
+- INT4_AWQ and W4A8_AWQ are only supported by TRT-LLM (not vLLM or SGLang). +- Other models/formats may work but are not officially validated. +- Source: `examples/llm_ptq/README.md` and `docs/source/deployment/3_unified_hf.rst` diff --git a/.claude/skills/deployment/references/trtllm.md b/.claude/skills/deployment/references/trtllm.md new file mode 100644 index 00000000000..5725bed3bf7 --- /dev/null +++ b/.claude/skills/deployment/references/trtllm.md @@ -0,0 +1,109 @@ +# TRT-LLM Deployment Reference + +## Requirements + +- TensorRT-LLM >= 0.17.0 +- Typically installed via NVIDIA container: `nvcr.io/nvidia/tensorrt-llm/release:` +- Or: `pip install tensorrt-llm` + +## Direct LLM API (recommended for unified HF checkpoints) + +### Python API + +```python +from tensorrt_llm import LLM, SamplingParams + +llm = LLM(model="") +# Quantization format is auto-detected from hf_quant_config.json + +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +outputs = llm.generate(["Hello, my name is"], sampling_params) + +for output in outputs: + print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}") +``` + +### From HuggingFace Hub + +```python +from tensorrt_llm import LLM + +llm = LLM(model="nvidia/Llama-3.1-8B-Instruct-FP8") +print(llm.generate(["What is AI?"])) +``` + +### With tensor parallelism + +```python +from tensorrt_llm import LLM + +llm = LLM(model="", tensor_parallel_size=4) +``` + +## AutoDeploy (for AutoQuant / mixed-precision) + +AutoDeploy automates graph transformations for optimized inference. Required for AutoQuant checkpoints. 
+ +### End-to-end script + +```bash +# Quantize and deploy in one step +./examples/llm_autodeploy/scripts/run_auto_quant_and_deploy.sh \ + --hf_ckpt \ + --save_quantized_ckpt \ + --quant fp8,nvfp4 \ + --effective_bits 4.5 +``` + +Parameters: + +- `--hf_ckpt`: Path to unquantized HuggingFace checkpoint +- `--save_quantized_ckpt`: Output path for quantized checkpoint +- `--quant`: Quantization formats (e.g., `fp8,nvfp4`) +- `--effective_bits`: Target precision (higher = more accuracy for sensitive layers) +- `--world_size`: Number of GPUs for tensor parallelism +- `--calib_batch_size`: Calibration batch size (reduce if OOM, default 8) + +### AutoDeploy API server + +```python +# examples/llm_autodeploy/api_server.py provides a FastAPI server +# with OpenAI-compatible endpoints using AutoDeploy +``` + +### Test AutoDeploy + +```bash +python examples/llm_autodeploy/api_client.py --prompt "What is AI?" "What is golf?" +``` + +### Notes + +- NVFP4 in AutoDeploy requires Blackwell GPUs +- For Hopper: remove `nvfp4` from `--quant` and set `--effective_bits` above 8.0 +- AutoDeploy supports CUDA graphs, torch compile backends, and KV cache optimization + +## Legacy TRT-LLM Checkpoint (deprecated) + +The legacy export path using `export_tensorrt_llm_checkpoint()` is deprecated. Use the unified HF checkpoint format with `export_hf_checkpoint()` instead. + +If you encounter a legacy checkpoint (no `hf_quant_config.json`, has `rank*.safetensors` pattern), it needs the TRT-LLM build API to create an engine before deployment. See `docs/source/deployment/1_tensorrt_llm.rst`. 
+ +## Evaluation with TRT-LLM + +```python +# examples/llm_eval/lm_eval_tensorrt_llm.py +# Runs lm_evaluation_harness benchmarks with TRT-LLM +python examples/llm_eval/lm_eval_tensorrt_llm.py \ + --model_path \ + --tasks gsm8k,mmlu +``` + +## Common Issues + +| Issue | Fix | +|-------|-----| +| `No module named tensorrt_llm` | Install via container or pip | +| NVFP4 inference fails on Hopper | NVFP4 requires Blackwell GPUs for inference | +| Slow first inference | Engine compilation happens on first run; subsequent runs are cached | +| OOM during engine build | Reduce `--max_batch_size` or increase TP | diff --git a/.claude/skills/deployment/references/vllm.md b/.claude/skills/deployment/references/vllm.md new file mode 100644 index 00000000000..89e06bde424 --- /dev/null +++ b/.claude/skills/deployment/references/vllm.md @@ -0,0 +1,91 @@ +# vLLM Deployment Reference + +## Requirements + +- vLLM >= 0.10.1 +- `pip install vllm` + +## Realquant Deployment (recommended) + +Realquant uses dedicated quantized kernels for maximum performance. This is the default path for ModelOpt-exported checkpoints. + +### As OpenAI-compatible server + +```bash +python -m vllm.entrypoints.openai.api_server \ + --model \ + --quantization modelopt \ + --tensor-parallel-size \ + --host 0.0.0.0 --port 8000 \ + --served-model-name +``` + +For NVFP4 checkpoints, use `--quantization modelopt_fp4`. 
+ +### As Python API + +```python +from vllm import LLM, SamplingParams + +llm = LLM(model="", quantization="modelopt") +# For FP4: quantization="modelopt_fp4" + +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +outputs = llm.generate(["Hello, my name is"], sampling_params) + +for output in outputs: + print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}") +``` + +### From HuggingFace Hub + +```python +from vllm import LLM, SamplingParams + +llm = LLM(model="nvidia/Llama-3.1-8B-Instruct-FP8", quantization="modelopt") +outputs = llm.generate(["What is AI?"], SamplingParams(temperature=0.8)) +``` + +## Fakequant Deployment (research) + +Fakequant is 2-5x slower than realquant but doesn't require dedicated kernel support. Useful for research and testing new quantization schemes. + +Reference: `examples/vllm_serve/` + +```bash +# Environment variables for configuration +export QUANT_CFG=NVFP4_DEFAULT_CFG # Quantization format +export QUANT_CALIB_SIZE=512 # Calibration samples +export QUANT_DATASET=cnn_dailymail # Calibration dataset + +python examples/vllm_serve/vllm_serve_fakequant.py \ + -tp --host 0.0.0.0 --port 8000 +``` + +## Benchmarking + +```bash +# Start server first, then benchmark +python -m vllm.benchmark_serving \ + --model \ + --port 8000 \ + --num-prompts 100 \ + --request-rate 10 +``` + +Or use lm_eval for accuracy: + +```bash +lm_eval --model local-completions \ + --tasks gsm8k \ + --model_args model=,base_url=http://localhost:8000/v1/completions,num_concurrent=1,max_retries=3,tokenized_requests=False,batch_size=128 +``` + +## Common Issues + +| Issue | Fix | +|-------|-----| +| `quantization="modelopt"` not recognized | Upgrade vLLM to >= 0.10.1 | +| OOM on startup | Increase `--tensor-parallel-size` or reduce `--max-model-len` | +| AWQ checkpoints not loading | AWQ is not supported in vLLM via modelopt path; use FP8 or NVFP4 | +| Mixed precision not working | Not supported for fakequant | diff --git 
a/.claude/skills/deployment/scripts/deploy.sh b/.claude/skills/deployment/scripts/deploy.sh new file mode 100755 index 00000000000..a56d5e92eb6 --- /dev/null +++ b/.claude/skills/deployment/scripts/deploy.sh @@ -0,0 +1,447 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ModelOpt Deployment Script +# Deploy quantized or unquantized models via vLLM, SGLang, or TRT-LLM +# Supports ModelOpt FP8/FP4 checkpoints with automatic quantization flag detection + +set -e + +# Default configuration +MODEL="" +PORT=8000 +HOST="0.0.0.0" +FRAMEWORK="vllm" +TP_SIZE=1 +VRAM=0.9 +MAX_WAIT=300 # 5 min for large models +QUANTIZATION="" # auto-detected from checkpoint + +# Paths +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LOG_DIR="${LOG_DIR:-/tmp/modelopt-deploy}" +LOG_FILE="$LOG_DIR/server.log" +PID_FILE="$LOG_DIR/server.pid" +META_FILE="$LOG_DIR/server.meta" # persists model/framework/port for status + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +log_info() { printf "${BLUE}[INFO]${NC} %s\n" "$1"; } +log_success() { printf "${GREEN}[OK]${NC} %s\n" "$1"; } +log_warn() { printf "${YELLOW}[WARN]${NC} %s\n" "$1"; } +log_error() { printf "${RED}[ERROR]${NC} %s\n" "$1"; } + +usage() { + cat < [OPTIONS] + +Commands: + start - Start the inference server + stop - 
Stop the inference server + test - Test the API endpoint + status - Show server status + restart - Restart the server + detect - Detect checkpoint format (without starting) + +Options: + --model PATH Model path or HF model ID (required for start) + --framework FRAMEWORK vllm, sglang, or trtllm (default: vllm) + --port PORT Server port (default: 8000) + --tp SIZE Tensor parallel size (default: 1) + --quantization QUANT Force quantization flag (modelopt, modelopt_fp4, or none) + --gpu-memory-utilization GPU memory utilization 0.0-1.0 (default: 0.9) + --log-dir DIR Log directory (default: /tmp/modelopt-deploy) + +Examples: + $0 start --model ./qwen3-0.6b-fp8 + $0 start --model ./llama-70b-nvfp4 --framework sglang --tp 4 + $0 start --model nvidia/Llama-3.1-8B-Instruct-FP8 --framework vllm + $0 test --port 8000 + $0 stop +EOF + exit 1 +} + +# ─── Checkpoint Detection ─────────────────────────────────────────── +# Echoes the quantization flag for the model in $1: "modelopt", "modelopt_fp4" or "none". NOTE(review): callers capture this with $(...), so any log_info output that reaches stdout is captured too. +detect_quantization() { + local model_path="$1" + + # Skip detection for HF model IDs (no local path): anything that is not an existing directory is classified by its name only + if [[ ! -d "$model_path" ]]; then + log_info "Model is a HF ID, checking if quantization flag is needed..." 
+ # HF hub models with FP8/FP4 in name likely need modelopt flag + if echo "$model_path" | grep -qi "fp8"; then + echo "modelopt" + elif echo "$model_path" | grep -qi "fp4\|nvfp4"; then + echo "modelopt_fp4" + else + echo "none" + fi + return + fi + + # Local checkpoint: check hf_quant_config.json + local quant_config="$model_path/hf_quant_config.json" + if [[ -f "$quant_config" ]]; then + log_info "Found hf_quant_config.json" + + # Check for FP4/NVFP4: a quant_algo containing "fp4" (any case) maps to modelopt_fp4; every other value, including empty or unreadable, maps to modelopt. NOTE(review): AWQ algos also land in the modelopt branch; confirm that is intended. + if python3 -c " +import json, sys +with open(sys.argv[1]) as f: + cfg = json.load(f) +quant_algo = cfg.get('quantization', {}).get('quant_algo', '') +print(quant_algo) +" "$quant_config" 2>/dev/null | grep -qi "fp4"; then + echo "modelopt_fp4" + else + echo "modelopt" + fi + else + log_info "No hf_quant_config.json found — treating as unquantized" + echo "none" + fi +} +# Logs the GPU count and first GPU name, then echoes the count ("0" when nvidia-smi is not on PATH). +detect_gpu() { + if command -v nvidia-smi &>/dev/null; then + local gpu_count + gpu_count=$(nvidia-smi -L 2>/dev/null | wc -l) + local gpu_name + gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1) + log_info "GPUs: ${gpu_count}x ${gpu_name}" + echo "$gpu_count" + else + log_error "No NVIDIA GPU detected (nvidia-smi not found)" + echo "0" + fi +} + +# ─── Server Management ────────────────────────────────────────────── +# Returns 0 when PID_FILE exists and ps reports the PID stored in it as alive, 1 otherwise. +is_server_running() { + if [[ -f "$PID_FILE" ]]; then + local pid + pid=$(cat "$PID_FILE") + if ps -p "$pid" >/dev/null 2>&1; then + return 0 + fi + fi + return 1 +} + +start_server() { + if [[ -z "$MODEL" ]]; then + log_error "--model is required" + usage + fi + + if is_server_running; then + log_warn "Server already running (PID: $(cat "$PID_FILE"))" + return 0 + fi + + mkdir -p "$LOG_DIR" + + # Auto-detect quantization if not forced + if [[ -z "$QUANTIZATION" ]]; then + QUANTIZATION=$(detect_quantization "$MODEL") + fi + log_info "Quantization: $QUANTIZATION" + + # Save metadata for status command + cat >"$META_FILE" <"$LOG_FILE" 2>&1 & + echo $! 
>"$PID_FILE" + log_success "vLLM started (PID: $(cat "$PID_FILE"))" +} + +start_sglang() { + log_info "Starting SGLang server..." + + local -a cmd=(python3 -m sglang.launch_server + --model-path "$MODEL" + --host "$HOST" --port "$PORT" + --tp "$TP_SIZE") + + if [[ "$QUANTIZATION" != "none" ]]; then + cmd+=(--quantization "$QUANTIZATION") + fi + + log_info "Command: ${cmd[*]}" + nohup "${cmd[@]}" >"$LOG_FILE" 2>&1 & + echo $! >"$PID_FILE" + log_success "SGLang started (PID: $(cat "$PID_FILE"))" +} + +start_trtllm() { + log_info "Starting TRT-LLM server..." + log_info "TRT-LLM uses the Python API directly (no OpenAI server built-in)" + log_info "For OpenAI-compatible serving, use AutoDeploy:" + + cat < \\ + --quant fp8,nvfp4 \\ + --effective_bits 4.5 + +# Option 2: Python API +python3 -c " +from tensorrt_llm import LLM, SamplingParams +llm = LLM(model='$MODEL') +print(llm.generate(['Hello, my name is'], SamplingParams(temperature=0.8))) +" +TRTEOF + + log_warn "TRT-LLM server mode not yet automated in this script." + log_warn "Use vLLM or SGLang for OpenAI-compatible serving of ModelOpt checkpoints." + exit 1 +} +# Polls /health every 5s until it answers with a success status, the server process dies, or MAX_WAIT seconds pass; exits 1 on the last two. +wait_for_server() { + log_info "Waiting for server at http://localhost:$PORT ..." + local elapsed=0 + while [[ $elapsed -lt $MAX_WAIT ]]; do + if curl -sf --max-time 5 "http://localhost:$PORT/health" >/dev/null 2>&1; then + log_success "Server is ready! (${elapsed}s)" + return 0 + fi + + # Check if process died + if ! is_server_running; then + log_error "Server process died. Check logs: $LOG_FILE" + tail -20 "$LOG_FILE" 2>/dev/null + exit 1 + fi + + sleep 5 + elapsed=$((elapsed + 5)) + printf "." + done + + echo "" + log_error "Server not ready after ${MAX_WAIT}s. Check logs: $LOG_FILE" + tail -20 "$LOG_FILE" 2>/dev/null + exit 1 +} + +stop_server() { + if ! is_server_running; then + log_warn "Server is not running" + return 0 + fi + + local pid + pid=$(cat "$PID_FILE") + log_info "Stopping server (PID: $pid)..." 
+ + # Kill the process group to catch child processes (vLLM/SGLang may fork) + kill -- -"$pid" 2>/dev/null || kill "$pid" 2>/dev/null || true + + # Wait for graceful shutdown + for i in {1..15}; do + if ! ps -p "$pid" >/dev/null 2>&1; then + rm -f "$PID_FILE" "$META_FILE" + log_success "Server stopped" + return 0 + fi + sleep 1 + done + + # Force kill + log_warn "Force killing..." + kill -9 -- -"$pid" 2>/dev/null || kill -9 "$pid" 2>/dev/null || true + rm -f "$PID_FILE" "$META_FILE" + log_success "Server stopped (forced)" +} +# Smoke-tests the server on $PORT: /health, then /v1/models, then one /v1/completions request; exits 1 on any failure. +test_api() { + log_info "Testing API at http://localhost:$PORT ..." + + # Health check (-f makes an HTTP error status count as a failure) + if ! curl -sf "http://localhost:$PORT/health" >/dev/null 2>&1; then + log_error "Server not responding at port $PORT" + exit 1 + fi + log_success "Health check passed" + + # List models + log_info "Available models:" + curl -s "http://localhost:$PORT/v1/models" | python3 -m json.tool 2>/dev/null || true + + # Test completion + log_info "Sending test request..." + local model_id + model_id=$(curl -s "http://localhost:$PORT/v1/models" | python3 -c " +import sys, json +data = json.load(sys.stdin) +print(data['data'][0]['id']) +" 2>/dev/null) || true + # '|| true' above stops set -e from aborting before this check can report the failure + if [[ -z "$model_id" ]]; then + log_error "Could not determine model ID from /v1/models endpoint" + exit 1 + fi + + local response + response=$(curl -s "http://localhost:$PORT/v1/completions" \ + -H "Content-Type: application/json" \ + -d "{ + \"model\": \"$model_id\", + \"prompt\": \"The capital of France is\", + \"max_tokens\": 32, + \"temperature\": 0.7 + }") + + echo "$response" | python3 -m json.tool 2>/dev/null || echo "$response" + + local text + text=$(echo "$response" | python3 -c " +import sys, json +data = json.load(sys.stdin) +print(data['choices'][0]['text']) +" 2>/dev/null) || true + # an unparsable response leaves text empty and is reported by the else branch below + if [[ -n "$text" ]]; then + log_success "API test passed!" 
+ printf "${GREEN}Response:${NC} %s\n" "$text" + else + log_error "No valid response from API" + exit 1 + fi +} +# Prints whether the server recorded in PID_FILE is alive, plus framework/model/endpoint and the last 5 log lines. +show_status() { + echo "=== ModelOpt Deployment Status ===" + echo "" + if is_server_running; then + local pid + pid=$(cat "$PID_FILE") + log_success "Server running (PID: $pid)" + + # Read saved metadata if available (the file is sourced as shell code; presumably it sets FRAMEWORK/MODEL/PORT, TODO confirm) + if [[ -f "$META_FILE" ]]; then + source "$META_FILE" + fi + + echo " Framework: ${FRAMEWORK:-unknown}" + echo " Model: ${MODEL:-unknown}" + echo " Endpoint: http://localhost:${PORT:-8000}" + echo " Logs: $LOG_FILE" + echo "" + if [[ -f "$LOG_FILE" ]]; then + echo "Recent logs:" + tail -5 "$LOG_FILE" + fi + else + log_warn "Server is not running" + echo " Start with: $0 start --model " + fi +} + +# ─── Argument Parsing ──────────────────────────────────────────────── + +COMMAND="" +while [[ $# -gt 0 ]]; do + case $1 in + --model) MODEL="$2"; shift 2 ;; + --framework) FRAMEWORK="$2"; shift 2 ;; + --port) PORT="$2"; shift 2 ;; + --tp) TP_SIZE="$2"; shift 2 ;; + --quantization) QUANTIZATION="$2"; shift 2 ;; + --gpu-memory-utilization) VRAM="$2"; shift 2 ;; + --log-dir) LOG_DIR="$2"; LOG_FILE="$LOG_DIR/server.log"; PID_FILE="$LOG_DIR/server.pid"; META_FILE="$LOG_DIR/server.meta"; shift 2 ;; + start|stop|test|status|restart|detect) + COMMAND="$1"; shift ;; + *) + log_error "Unknown option: $1" + usage ;; + esac +done + +if [[ -z "$COMMAND" ]]; then + usage +fi + +# Execute. NOTE(review): restart runs stop_server (which removes META_FILE) and then start_server (which requires MODEL), so restart appears to need --model again; confirm. +case "$COMMAND" in + start) start_server ;; + stop) stop_server ;; + test) test_api ;; + status) show_status ;; + restart) stop_server; sleep 2; start_server ;; + detect) + if [[ -z "$MODEL" ]]; then + log_error "--model is required for detect" + exit 1 + fi + quant=$(detect_quantization "$MODEL") + echo "Detected quantization: $quant" + ;; + *) usage ;; +esac diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md new file mode 100644 index 00000000000..e5268c4d2ef --- /dev/null +++ b/.claude/skills/evaluation/SKILL.md @@ -0,0 +1,378 @@ +--- 
+name: evaluation +description: Evaluate accuracy of quantized or unquantized LLMs using NeMo Evaluator Launcher (NEL). Use when user says "evaluate model", "benchmark accuracy", "run MMLU", "evaluate quantized model", "accuracy drop", "run nel", or needs to measure how quantization affects model quality. Handles model deployment, config generation, and evaluation execution. +license: Apache-2.0 +# Based on nel-assistant skill from NeMo Evaluator Launcher (commit f1fa073) +# https://github.com/NVIDIA-NeMo/Evaluator/tree/f1fa073/packages/nemo-evaluator-launcher/.claude/skills/nel-assistant +# Modifications: renamed to evaluation, added workspace management (Step 0), +# auto-detect ModelOpt quantization format, quantization-aware benchmark defaults. +--- + +## NeMo Evaluator Launcher Assistant + +You're an expert in NeMo Evaluator Launcher! Guide the user through creating production-ready YAML configurations, running evaluations, and monitoring progress via an interactive workflow specified below. + +### Workspace (multi-user / Slack bot) + +If `MODELOPT_WORKSPACE_ROOT` is set, read `skills/common/workspace-management.md`. Check for existing workspaces — especially if evaluating a model from a prior PTQ or deployment step. Reuse the existing workspace so you have access to the quantized checkpoint and any code modifications. + +### Workflow + +```text +Config Generation Progress: +- [ ] Step 0: Check workspace (if MODELOPT_WORKSPACE_ROOT is set) +- [ ] Step 1: Check if nel is installed +- [ ] Step 2: Build the base config file +- [ ] Step 3: Configure model path and parameters +- [ ] Step 4: Fill in remaining missing values +- [ ] Step 5: Confirm tasks (iterative) +- [ ] Step 6: Advanced - Multi-node (Data Parallel) +- [ ] Step 7: Advanced - Interceptors +- [ ] Step 8: Run the evaluation +``` + +**Step 1: Check if nel is installed** + +Test that `nel` is installed with `nel --version`. + +If not, instruct the user to `pip install nemo-evaluator-launcher`. 
+ +**Step 2: Build the base config file** + +Prompt the user with "I'll ask you 5 questions to build the base config we'll adjust in the next steps". Guide the user through the 5 questions using AskUserQuestion: + +1. Execution: + - Local + - SLURM +2. Deployment: + - None (External) + - vLLM + - SGLang + - NIM + - TRT-LLM +3. Auto-export: + - None (auto-export disabled) + - MLflow + - wandb +4. Model type + - Base + - Chat + - Reasoning +5. Benchmarks: + Allow for multiple choices in this question. + 1. Standard LLM Benchmarks (like MMLU, IFEval, GSM8K, ...) + 2. Code Evaluation (like HumanEval, MBPP, and LiveCodeBench) + 3. Math & Reasoning (like AIME, GPQA, MATH-500, ...) + 4. Safety & Security (like Garak and Safety Harness) + 5. Multilingual (like MMATH, Global MMLU, MMLU-Prox) + +DON'T ALLOW FOR ANY OTHER OPTIONS, only the ones listed above under each category (Execution, Deployment, Auto-export, Model type, Benchmarks). YOU HAVE TO GATHER THE ANSWERS for the 5 questions before you can build the base config. + +When you have all the answers, run the script to build the base config: + +```bash +nel skills build-config --execution --deployment --model_type --benchmarks [--export ] [--output ] +``` + +Where `--output` depends on what the user provides: + +- Omit: Uses current directory with auto-generated filename +- Directory: Writes to that directory with auto-generated filename +- File path (*.yaml): Writes to that specific file + +It never overwrites existing files. + +**Step 3: Configure model path and parameters** + +Ask for model path. 
Determine type: + +- Checkpoint path (starts with `/` or `./`) → set `deployment.checkpoint_path: ` and `deployment.hf_model_handle: null` +- HF handle (e.g., `org/model-name`) → set `deployment.hf_model_handle: ` and `deployment.checkpoint_path: null` + +**Auto-detect ModelOpt quantization format** (checkpoint paths only): + +Check for `hf_quant_config.json` in the checkpoint directory: + +```bash +cat /hf_quant_config.json 2>/dev/null +``` + +If found, read `quantization.quant_algo` and set the correct vLLM/SGLang quantization flag in `deployment.extra_args`: + +| `quant_algo` | Flag to add | +|-------------|-------------| +| `FP8` | `--quantization modelopt` | +| `W4A8_AWQ` | `--quantization modelopt` | +| `NVFP4`, `NVFP4_AWQ` | `--quantization modelopt_fp4` | + +If no `hf_quant_config.json`, the checkpoint is unquantized — no flag needed. + +**Quantization-aware benchmark defaults:** + +When a quantized checkpoint is detected, recommend benchmarks sensitive to quantization accuracy loss: + +- **Always include**: MMLU (general knowledge, most affected by quantization) +- **Recommended**: GSM8K (math reasoning — sensitive to precision loss), ARC-Challenge (reasoning) +- **Good to add**: HumanEval (code generation — catches subtle degradation), Winogrande (commonsense) +- **Less useful for quant comparison**: IFEval (instruction following — rarely affected by quantization) + +Present these recommendations to the user and ask which to include. If the user already specified benchmarks, keep their choice but mention any accuracy-sensitive benchmarks they may have missed. + +Use WebSearch to find model card (HuggingFace, build.nvidia.com). Read it carefully, the FULL text, the devil is in the details. 
Extract ALL relevant configurations: + +- Sampling params (`temperature`, `top_p`) +- Context length (`deployment.extra_args: "--max-model-len "`) +- TP/DP settings (to set them appropriately, AskUserQuestion on how many GPUs the model will be deployed) +- Reasoning config (if applicable): + - reasoning on/off: use either: + - `adapter_config.custom_system_prompt` (like `/think`, `/no_think`) and no `adapter_config.params_to_add` (leave `params_to_add` unrelated to reasoning untouched) + - `adapter_config.params_to_add` for payload modifier (like `"chat_template_kwargs": {"enable_thinking": true/false}`) and no `adapter_config.custom_system_prompt` and `adapter_config.use_system_prompt: false` (leave `custom_system_prompt` and `use_system_prompt` unrelated to reasoning untouched). + - reasoning effort/budget (if it's configurable, AskUserQuestion what reasoning effort they want) + - higher `max_new_tokens` + - etc. +- Deployment-specific `extra_args` for vLLM/SGLang (look for the vLLM/SGLang deployment command) +- Deployment-specific vLLM/SGLang versions (by default we use latest docker images, but you can control it with `deployment.image` e.g. vLLM above `vllm/vllm-openai:v0.11.0` stopped supporting `rope-scaling` arg used by Qwen models) +- ARM64 / non-standard GPU compatibility: The default `vllm/vllm-openai` image only supports common GPU architectures. 
For ARM64 platforms or GPUs with non-standard compute capabilities (e.g., NVIDIA GB10 with sm_121), use NGC vLLM images instead: + - Example: `deployment.image: nvcr.io/nvidia/vllm:26.01-py3` + - AskUserQuestion about their GPU architecture if the model card doesn't specify deployment constraints +- Any preparation requirements (e.g., downloading reasoning parsers, custom plugins): + - If the model card mentions downloading files (like reasoning parsers, custom plugins) before deployment, add `deployment.pre_cmd` with the download command + - Use `curl` instead of `wget` as it's more widely available in Docker containers + - Example: `pre_cmd: curl -L -o reasoning_parser.py https://huggingface.co/.../reasoning_parser.py` + - When using `pip install` in `pre_cmd`, always use `--no-cache-dir` to avoid cross-device link errors in Docker containers (the pip cache and temp directories may be on different filesystems) + - Example: `pre_cmd: pip3 install --no-cache-dir flash-attn --no-build-isolation` +- Any other model-specific requirements + +Remember to check `evaluation.nemo_evaluator_config` and `evaluation.tasks.*.nemo_evaluator_config` overrides too for parameters to adjust (e.g. disabling reasoning)! + +Present findings, explain each setting, ask user to confirm or adjust. If no model card found, ask user directly for the above configurations. + +**Step 4: Fill in remaining missing values** + +- Find all remaining `???` missing values in the config. +- Ask the user only for values that couldn't be auto-discovered from the model card (e.g., SLURM hostname, account, output directory, MLflow/wandb tracking URI). Don't propose any defaults here. Let the user give you the values in plain text. +- Ask the user if they want to change any other defaults e.g. execution partition or walltime (if running on SLURM) or add MLflow/wandb tags (if auto-export enabled). + +**Step 5: Confirm tasks (iterative)** + +Show tasks in the current config. 
Loop until the user confirms the task list is final: + +1. Tell the user: "Run `nel ls tasks` to see all available tasks". +2. Ask if they want to add/remove tasks or add/remove/modify task-specific parameter overrides. + To add per-task `nemo_evaluator_config` as specified by the user, e.g.: + + ```yaml + tasks: + - name: + nemo_evaluator_config: + config: + params: + temperature: + max_new_tokens: + ... + ``` + +3. Apply changes. +4. Show updated list and ask: "Is the task list final, or do you want to make more changes?" + +**Known Issues** + +- NeMo-Skills workaround (self-deployment only): If using `nemo_skills.*` tasks with self-deployment (vLLM/SGLang/NIM), add at top level: + + ```yaml + target: + api_endpoint: + api_key_name: DUMMY_API_KEY + ``` + + For the None (External) deployment the `api_key_name` should be already defined. The `DUMMY_API_KEY` export is handled in Step 8. + +**Step 6: Advanced - Multi-node** + +There are two multi-node patterns. Ask the user which applies: + +**Pattern A: Multi-instance (independent instances with HAProxy)** + +Only if model >120B parameters or user wants more throughput. Explain: "Each node runs an independent deployment instance. HAProxy load-balances requests across all instances." + +```yaml +execution: + num_nodes: 4 # Total nodes + num_instances: 4 # 4 independent instances → HAProxy auto-enabled +``` + +**Pattern B: Multi-node single instance (Ray TP/PP across nodes)** + +When a single model is too large for one node and needs pipeline parallelism across nodes. 
Use `vllm_ray` deployment config: + +```yaml +defaults: + - deployment: vllm_ray # Built-in Ray cluster setup (replaces manual pre_cmd) + +execution: + num_nodes: 2 # Single instance spanning 2 nodes + +deployment: + tensor_parallel_size: 8 + pipeline_parallel_size: 2 +``` + +**Pattern A+B combined: Multi-instance with multi-node instances** + +For very large models needing both cross-node parallelism AND multiple instances: + +```yaml +defaults: + - deployment: vllm_ray + +execution: + num_nodes: 4 # Total nodes + num_instances: 2 # 2 instances of 2 nodes each → HAProxy auto-enabled + +deployment: + tensor_parallel_size: 8 + pipeline_parallel_size: 2 +``` + +**Common Confusions** + +- **`num_instances`** controls independent deployment instances with HAProxy. **`data_parallel_size`** controls DP replicas *within* a single instance. +- Global data parallelism is `num_instances x data_parallel_size` (e.g., 2 instances x 8 DP each = 16 replicas). +- With multi-instance, `parallelism` in task config is the total concurrent requests across all instances, not per-instance. +- `num_nodes` must be divisible by `num_instances`. + +**Step 7: Advanced - Interceptors** + +- Tell the user they should see: . +- DON'T provide any general information about what interceptors typically do in API frameworks without reading the docs. If the user asks about interceptors, only then read the webpage to provide precise information. +- If the user asks you to configure some interceptor, then read the webpage of this interceptor and configure it according to the `--overrides` syntax but put the values in the YAML config under `evaluation.nemo_evaluator_config.config.target.api_endpoint.adapter_config` (NOT under `target.api_endpoint.adapter_config`) instead of using CLI overrides. + By defining `interceptors` list you'd override the full chain of interceptors which can have unintended consequences like disabling default interceptors. 
That's why use the fields specified in the `CLI Configuration` section after the `--overrides` keyword to configure interceptors in the YAML config. + +**Documentation Errata** + +- The docs may show incorrect parameter names for logging. Use `max_logged_requests` and `max_logged_responses` (NOT `max_saved_*` or `max_*`). + +**Step 8: Run the evaluation** + +Print the following commands to the user. Propose to execute them in order to confirm the config works as expected before the full run. + +**Important**: Export required environment variables based on your config. If any tokens or keys are missing (e.g. `HF_TOKEN`, `NGC_API_KEY`, `api_key_name` from the config), ask the user to put them in a `.env` file in the project root so you can run `set -a && source .env && set +a` (or equivalent) before executing `nel run` commands. + +```bash +# If using pre_cmd or post_cmd: +export NEMO_EVALUATOR_TRUST_PRE_CMD=1 + +# If using nemo_skills.* tasks with self-deployment: +export DUMMY_API_KEY=dummy +``` + +1. **Dry-run** (validates config without running): + + ```bash + nel run --config --dry-run + ``` + +2. **Test with limited samples** (quick validation run): + + ```bash + nel run --config -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=10 + ``` + +3. **Re-run a single task** (useful for debugging or re-testing after config changes): + + ```bash + nel run --config -t + ``` + + Combine with `-o` for limited samples: `nel run --config -t -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=10` + +4. **Full evaluation** (production run): + + ```bash + nel run --config + ``` + +After the dry-run, check the output from `nel` for any problems with the config. If there are no problems, propose to first execute the test run with limited samples and then execute the full evaluation. If there are problems, resolve them before executing the full evaluation. + +**Monitoring Progress** + +After job submission, you can monitor progress using: + +1. 
**Check job status:** + + ```bash + nel status + nel info + ``` + +2. **Stream logs** (Local execution only): + + ```bash + nel logs + ``` + + Note: `nel logs` is not supported for SLURM execution. + +3. **Inspect logs via SSH** (SLURM workaround): + + When `nel logs` is unavailable (SLURM), use SSH to inspect logs directly: + + First, get log locations: + + ```bash + nel info --logs + ``` + + Then, use SSH to view logs: + + **Check server deployment logs:** + + ```bash + ssh @ "tail -100 --logs`>/server--*.log" + ``` + + Shows vLLM server startup, model loading, and deployment errors (e.g., missing wget/curl). + + **Check evaluation client logs:** + + ```bash + ssh @ "tail -100 --logs`>/client-.log" + ``` + + Shows evaluation progress, task execution, and results. + + **Check SLURM scheduler logs:** + + ```bash + ssh @ "tail -100 --logs`>/slurm-.log" + ``` + + Shows job scheduling, health checks, and overall execution flow. + + **Search for errors:** + + ```bash + ssh @ "grep -i 'error\|warning\|failed' --logs`>/*.log" + ``` + +--- + +Direct users with issues to: + +- **GitHub Issues:** +- **GitHub Discussions:** + +Now, copy this checklist and track your progress: + +```text +Config Generation Progress: +- [ ] Step 0: Check workspace (if multi-user) +- [ ] Step 1: Check if nel is installed +- [ ] Step 2: Build the base config file +- [ ] Step 3: Configure model path and parameters +- [ ] Step 4: Fill in remaining missing values +- [ ] Step 5: Confirm tasks (iterative) +- [ ] Step 6: Advanced - Multi-node (Data Parallel) +- [ ] Step 7: Advanced - Interceptors +- [ ] Step 8: Run the evaluation +``` diff --git a/.claude/skills/evaluation/evals/nemotron3-nano-bf16-reasoning.json b/.claude/skills/evaluation/evals/nemotron3-nano-bf16-reasoning.json new file mode 100644 index 00000000000..6fb32570eb3 --- /dev/null +++ b/.claude/skills/evaluation/evals/nemotron3-nano-bf16-reasoning.json @@ -0,0 +1,26 @@ +{ + "skills": ["nel-assistant"], + "query": "Help me evaluate 
Nemotron 3 Nano BF16 from NVIDIA", + "files": [], + "expected_behavior": [ + "Verifies nel is installed by running 'nel --version'", + "Asks all 5 base config questions (execution, deployment, auto-export, model type, benchmarks) before generating the config", + "Runs 'nel skills build-config' with correct flags matching user answers: --execution slurm --deployment vllm --model-type reasoning --benchmarks standard code math_reasoning --export mlflow", + "Searches the web for the model card on HuggingFace and extracts model-specific settings", + "Sets correct HF handle: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "Sets reasoning sampling params from model card: temperature=1.0, top_p=1.0", + "Configures reasoning toggle via params_to_add with chat_template_kwargs.enable_thinking (not via system prompt)", + "Disables reasoning for IFEval task using enable_thinking: false with use_system_prompt: false", + "Adds deployment.pre_cmd using curl (not wget) to download nano_v3_reasoning_parser.py from HuggingFace", + "Adds vLLM extra_args including --trust-remote-code, --reasoning-parser-plugin, --reasoning-parser nano_v3, --max-num-seqs 8", + "Pins vLLM image to v0.12.0 or later as required by model card", + "Adds target.api_endpoint.api_key_name: DUMMY_API_KEY for nemo_skills tasks with self-deployment", + "Fills in all ??? 
placeholders after asking the user for SLURM hostname, account, output_dir, MLflow tracking_uri, and experiment_name", + "Applies user-requested SLURM customizations: partition batch_short, walltime 00:20:00, MLflow tag scenario: demo", + "Presents task list and waits for user confirmation before proceeding", + "Configures request and response logging interceptors under evaluation.nemo_evaluator_config.config.target.api_endpoint.adapter_config using correct field names (max_logged_requests/max_logged_responses, not max_saved_*)", + "Handles dry-run failure for missing HF_TOKEN_FOR_GPQA_DIAMOND by offering to fix the config", + "Successfully submits test run with limit_samples=10 after dry-run passes", + "Provides monitoring commands (nel status, nel info --logs) and inspects server logs via SSH when asked" + ] +} diff --git a/.claude/skills/modelopt/SKILL.md b/.claude/skills/modelopt/SKILL.md new file mode 100644 index 00000000000..da9cb9ea208 --- /dev/null +++ b/.claude/skills/modelopt/SKILL.md @@ -0,0 +1,90 @@ +--- +name: modelopt +description: End-to-end model optimization pipeline that chains quantization with deployment or evaluation. Use when user says "optimize model end-to-end", "quantize and deploy", "quantize and serve", "quantize and evaluate", "quantize and benchmark accuracy", "full optimization loop", "run the full pipeline", "optimize and test accuracy", "find best quantization recipe", or wants to go from a pretrained model to a deployed or accuracy-verified quantized model. Do NOT use for individual tasks like only quantizing (use ptq), only deploying (use deployment), or only evaluating (use evaluation). +license: Apache-2.0 +--- + +# ModelOpt Optimizer — Pipeline Orchestrator + +Orchestrates optimization pipelines by chaining skills. Supports two modes: + +1. **PTQ + Deploy** — quantize then serve as an API endpoint +2. 
**PTQ + Evaluate** — quantize then benchmark accuracy (evaluation handles deployment internally) + +This skill delegates to sub-skills. **Do not duplicate their logic — invoke them.** + +## Workspace Management + +If `MODELOPT_WORKSPACE_ROOT` is set (multi-user / Slack bot), read `skills/common/workspace-management.md` first. **All sub-skills in the pipeline must run in the same workspace** so they share the checkpoint and any code modifications. Create or reuse a workspace named after the model (e.g., `qwen3-0.6b`, `llama-3.1-8b-fp8`) before invoking any sub-skill. + +## Pipeline Selection + +Determine which pipeline the user needs: + +| User says | Pipeline | +|-----------|----------| +| "quantize and deploy", "quantize and serve" | PTQ + Deploy | +| "quantize and evaluate", "optimize end-to-end", "find best recipe" | PTQ + Evaluate | + +If the user only wants quantization without deploy/eval, the `ptq` skill handles it directly — this skill should not be used. + +If unclear, ask: **"After quantization, do you want to (a) deploy the model as a server, (b) evaluate accuracy, or (c) just get the checkpoint?"** If they answer (c), hand off to the `ptq` skill. + +## Step 1: Gather Info + +Collect from the user (skip what's already provided): + +1. **Model path** — local path or HuggingFace model ID (save this for baseline comparison in Step 4) +2. **Quantization format** — e.g., fp8, nvfp4, int4_awq (or "recommend one") +3. **Execution target** — local GPU or remote cluster. Check for `~/.config/modelopt/clusters.yaml` or `.claude/clusters.yaml`. If found, ask which cluster to use. Both sub-skills support remote execution via `remote_exec.sh`. +4. **GPU IDs** — which GPUs to use (default: `0`; skip if remote — sub-skills handle GPU allocation via SLURM) +5. For Deploy pipeline: **Deployment framework** — vLLM, SGLang, or TRT-LLM (default: vLLM) +6. 
For Evaluate pipeline: **Evaluation tasks** — default: `mmlu` + +## Step 2: Quantize + +**Invoke the `ptq` skill.** It handles environment detection, model compatibility, format selection, job submission, and checkpoint verification. + +Input: model path, quantization format, export path, GPU IDs. +Output: quantized checkpoint at export path. + +## Step 3: Deploy or Evaluate + +### PTQ + Deploy + +**Invoke the `deployment` skill.** It starts an inference server with the quantized checkpoint. + +Input: checkpoint path, framework, GPU IDs, port. +Output: running server at `http://localhost:`. + +### PTQ + Evaluate + +**Invoke the `evaluation` skill.** It handles deploying the quantized model, configuring NEL evaluation, running benchmarks, and collecting results. + +Input: quantized checkpoint path, evaluation tasks. +Output: accuracy scores per task. + +## Step 4: Baseline Comparison (PTQ + Evaluate only) + +After evaluation completes, ask: **"Would you like to compare against the unquantized baseline?"** + +If yes: +1. Run the evaluation skill again with the **original model path** (from Step 1) and the same benchmark tasks +2. Present a side-by-side comparison table: + +```text +| Benchmark | BF16 (baseline) | FP8 (quantized) | Delta | +|-----------|-----------------|-----------------|-------| +| MMLU | 67.3% | 65.2% | -2.1% | +| GSM8K | 54.1% | 52.8% | -1.3% | +``` + +3. Flag any benchmark with >2% accuracy drop — suggest trying a lighter quantization format + +## Step 5: Present Results and Iterate + +Show results and ask: **"Are you satisfied with these results?"** + +- **Yes** — Done. Report final model path and summary. +- **No** — Propose a different recipe (lighter or heavier quantization), loop to Step 2. +- **Quit** — Report partial results. Clean up any running servers. 
diff --git a/.claude/skills/ptq/SKILL.md b/.claude/skills/ptq/SKILL.md new file mode 100644 index 00000000000..43e8eaa43f6 --- /dev/null +++ b/.claude/skills/ptq/SKILL.md @@ -0,0 +1,266 @@ +--- +name: ptq +description: This skill should be used when the user asks to "quantize a model", "run PTQ", "post-training quantization", "NVFP4 quantization", "FP8 quantization", "INT8 quantization", "INT4 AWQ", "quantize LLM", "quantize MoE", "quantize VLM", or needs to produce a quantized HuggingFace or TensorRT-LLM checkpoint from a pretrained model using ModelOpt. +--- + +# ModelOpt Post-Training Quantization + +Produce a quantized checkpoint from a pretrained HuggingFace model using NVIDIA Model Optimizer. The output is ready for TensorRT-LLM deployment or HuggingFace-compatible inference. + +## Decision Process + +### 0. Check the execution environment + +Do this first — the environment determines how to run the job and which formats are viable. + +**Multi-user / Slack bot mode?** + +If `MODELOPT_WORKSPACE_ROOT` is set, you are running in a multi-user environment. Read `skills/common/workspace-management.md` and check for an existing workspace for this model before proceeding. If you create or switch to a model-specific workspace, all subsequent steps run there. + +**Is this a remote execution?** + +Check if a remote cluster config exists or the user mentioned running on a remote machine: + +```bash +cat ~/.config/modelopt/clusters.yaml 2>/dev/null || cat .claude/clusters.yaml 2>/dev/null +``` + +**Case A — config found, or user says "run on [cluster]" / "run remotely" / "use SSH":** +Switch to remote execution mode — read `references/remote-execution.md` now. All subsequent steps apply whether local or remote. + +**Case B — no config, user hasn't mentioned a cluster:** +Skip remote mode and proceed with local execution below. + +**Case C — no config, but user clearly wants remote (e.g. 
"run on the cluster", "use SSH", mentions a hostname):** +Ask the user for the following info, then create `~/.config/modelopt/clusters.yaml` before proceeding: + +```text +I need a few details to set up the remote cluster. Please provide: +1. Login node hostname (e.g. cluster-login.example.com) +2. SSH username +3. SSH key path (default: ~/.ssh/id_rsa) — press Enter to use default +4. Remote working directory (e.g. /lustre/fs1/username/modelopt or ~/modelopt) +5. Cluster name/alias for future reference (e.g. "selene", "cw-dfw") +``` + +Once you have the answers, write `~/.config/modelopt/clusters.yaml`: + +```yaml +clusters: + : + login_node: + user: + ssh_key: + workspace: + +default_cluster: +``` + +Then read `references/remote-execution.md` and continue. + +**Is this a SLURM cluster?** + +```bash +which srun squeue sbatch 2>/dev/null | head -1 +``` + +If any of those exist, you're on SLURM. Query accounts and partitions: + +```bash +# Get user's accounts and cluster +sacctmgr show associations user=$USER format=account%30,partition%20,cluster%20 -n 2>/dev/null + +# List partitions with time limits +sinfo -o "%P %a %l %G" 2>/dev/null | grep -v "^PARTITION" +``` + +- If the user has **one account**: use it automatically. +- If the user has **multiple accounts**: show them and ask which to use. Default to the account whose name most closely matches the project or working directory. +- For partition, use the default (marked with `*` in `sinfo` output). Report the choice. + +**If not SLURM, check for a local GPU:** + +```bash +python -c "import torch; [print(f'GPU {i}: {torch.cuda.get_device_name(i)}') for i in range(torch.cuda.device_count())] if torch.cuda.is_available() else print('no-gpu')" +``` + +| Result | Action | +|--------|--------| +| SLURM detected | Proceed — GPU will be allocated via `srun`. Infer GPU type from `sinfo` node features. | +| Local GPU found | Proceed — report the GPU model(s) to the user. 
| +| Neither found | **Stop and report**: "No GPU found and this doesn't appear to be a SLURM cluster. PTQ calibration requires a CUDA GPU. Please confirm the target environment." | + +The GPU model feeds directly into format recommendation in the next step. + +### 1. Is the model architecture supported? + +**Read `examples/llm_ptq/README.md` first.** It is the authoritative reference for this workflow and contains information that isn't duplicated here: the full support matrix, correct CLI flag names, accuracy guidance, and hardware requirements. Key sections to check: + +- Support matrix (~line 100) — which architectures and formats are supported +- Correct flags `--pyt_ckpt_path` / `--export_path` (~line 149) +- Accuracy note: prefer `nvfp4_mlp_only` or `nvfp4_omlp_only` for NVFP4 (~line 131) +- Blackwell GPU requirement for NVFP4 inference (~line 126, footnote 5) + +After reading the README, check `modelopt/torch/export/model_utils.py` for `MODEL_NAME_TO_TYPE`. If the model's class name substring-matches a key in that dict, it is supported. + +**Supported** → Use the existing `examples/llm_ptq/hf_ptq.py` script directly. No custom code needed. + +**Unsupported** → **Read `references/unsupported-models.md` now.** It covers model source investigation, FP8 detection, patch assessment, weight name verification, and all implementation patterns. + +### 2. Choose the quantization format + +If the user has not specified a format, **recommend one based on the GPU detected above**: + +| GPU generation | Memory priority | Accuracy priority | +|----------------|-----------------|-------------------| +| **Blackwell** (B100, B200, GB200) | `nvfp4_mlp_only` | `nvfp4_awq_lite` | +| **Hopper** (H100, H200) or older | `int4_awq` | `fp8` | + +Tell the user which GPU was detected and which format you are recommending, and why. + +> **If the user explicitly requests `nvfp4` on a Hopper GPU**: proceed — H100/H200 can *calibrate* NVFP4 checkpoints fine. 
Just note: "NVFP4 inference requires Blackwell GPUs; this checkpoint will be calibrated on H100 but must be deployed on Blackwell." + +For reference, all available configs are in `modelopt/torch/quantization/config.py`: + +| Format | Config | Notes | +|--------|--------|-------| +| NVFP4 MLP-only | `NVFP4_MLP_ONLY_CFG` | Recommended for Blackwell; best accuracy/throughput tradeoff | +| NVFP4 MLP weight-only | `NVFP4_MLP_WEIGHT_ONLY_CFG` | Quantize MLP weights only (no activations) | +| NVFP4 all layers | `NVFP4_DEFAULT_CFG` | May reduce accuracy; see README | +| NVFP4 + AWQ calibration | `NVFP4_AWQ_LITE_CFG` | Best NVFP4 accuracy, slower calibration | +| FP8 per-tensor | `FP8_DEFAULT_CFG` | Accuracy-first for Hopper | +| INT4 weight-only | `INT4_AWQ_CFG` | Memory-first for Hopper/older | +| INT8 + SmoothQuant | `INT8_SMOOTHQUANT_CFG` | Older GPUs, activation quantization | + +> **NVFP4 requires Blackwell GPUs** for inference. H100 can run NVFP4 calibration but not inference. + +For MLP-only quantization (skipping attention), use configs with `MLP_ONLY` in the name, or create a custom config by disabling `*self_attn*`. + +### 3. Set up the environment + +- **SLURM**: Read `references/slurm-setup.md` — it has container setup, account/partition selection, the job script template, smoke-test strategy, and monitoring instructions. +- **Local GPU**: Check if Docker is available first — it's the cleanest isolation: + - **Docker available**: use the TRT-LLM NGC container (version from `examples/llm_ptq/README.md`): + + ```bash + docker run --gpus all -v : -v : \ + nvcr.io/nvidia/tensorrt-llm/release: bash -c "pip install --no-build-isolation -e [hf] --quiet && python ..." 
+ ``` + + - **No Docker**: set up a virtual environment with conda (preferred) or venv: + + ```bash + # conda + conda create -n modelopt python=3.10 -y && conda activate modelopt + # or venv + python -m venv modelopt-env && source modelopt-env/bin/activate + + pip install --no-build-isolation nvidia-modelopt[hf] + ``` + +**GPU memory**: Estimate `num_params × 2 bytes` for BF16. Use `device_map="auto"` for multi-GPU. If the model exceeds single-node memory, see the FSDP2 section in `references/slurm-setup.md`. + +### 4. Write and run + +**The goal is a quantized checkpoint on disk — not a script handed to the user.** Write the script, run it (or submit it), follow the logs, fix errors, and rerun until the export directory contains `.safetensors` shards and a `config.json`. + +#### Supported models + +```bash +python examples/llm_ptq/hf_ptq.py \ + --pyt_ckpt_path \ + --qformat \ + --export_fmt hf \ + --calib_size 512 \ + --export_path +``` + +Always pass `--export_fmt hf` explicitly — older versions of the script default to `tensorrt_llm` which produces TRT-LLM format instead of a HuggingFace checkpoint. + +Run `python examples/llm_ptq/hf_ptq.py --help` to see all options. + +#### Unsupported models + +Write a custom script following `references/unsupported-models.md`. Core steps: + +1. Load model (dequantize FP8 if needed) +2. Register monkey-patched modules via `mtq.register()` +3. Create calibration dataloader +4. Call `mtq.quantize(model, config, forward_loop)` +5. Export with `export_hf_checkpoint(model, export_dir)` + +#### Local GPU — run and monitor + +```bash +nohup python ptq_script.py ... > 2>&1 & +tail -f +``` + +PTQ-specific failure modes to check via `mtq.print_quant_summary()`: + +- **Quantizers not enabled**: wildcard missed modules — check `*gate*` vs `*mlp.gate*` +- **FP8 tensors still present after dequant**: missed a non-standard param name — inspect `model.named_parameters()` for `float8_e4m3fn` dtypes + +### 5. 
Verify the output checkpoint + +Once the job succeeds, confirm the export is valid: + +```bash +# Check export directory has model shards and config +ls -lh / +# Expect: config.json, tokenizer files, model-*.safetensors + +# Verify no unexpected FP8 tensors remain +python -c " +from safetensors import safe_open +import glob, os +for f in sorted(glob.glob('/model*.safetensors'))[:1]: + with safe_open(f, framework='pt') as sf: + for k in list(sf.keys())[:5]: + t = sf.get_tensor(k) + print(k, t.dtype, t.shape) +" +``` + +Report the output path and checkpoint size to the user. + +## Key API Rules + +These are non-obvious requirements that cause hard-to-debug failures: + +- **`mtq.register()` requires `_setup` method**: Any class registered with `mtq.register(original_cls=X, quantized_cls=Y)` MUST define a method named exactly `_setup()`. Not `_init_quantizers`, not `setup` — exactly `_setup`. Also, the `__init__` must call `self._setup()` — if you forget this, TensorQuantizers are never instantiated and quantization silently does nothing. + +- **Call `mto.enable_huggingface_checkpointing()` before quantization**: Required for HF checkpoint export to work. + +- **Wildcard pattern `*gate*` is dangerously broad**: It matches both MoE router gates AND any quantizer with "gate" in the name (e.g., `gate_up_weight_quantizer`). Use `*mlp.gate*` or `*router*` to target router gates specifically. Always verify with `mtq.print_quant_summary()`. + +- **VLMs need `AutoModel`**: Vision-Language Models (e.g., `Mistral3ForConditionalGeneration`, `Mllama`) are NOT registered under `AutoModelForCausalLM`. Use `AutoModel.from_pretrained()`. + +- **FP8 checkpoints need the config class**: When loading an FP8-quantized checkpoint with `dequantize=True`, pass `FineGrainedFP8Config(dequantize=True)` — not a plain dict. HF validates the config type matches. 
+ +- **Quantizer naming convention**: Custom `TensorQuantizer` modules must end with `_input_quantizer` or `_weight_quantizer` for ModelOpt's wildcard matching. + +- **Do not modify ModelOpt core source**: All custom code (monkey-patching, `mtq.register()` wrappers, dequantization helpers) must live in your own script or under `examples/`. Never edit files under `modelopt/torch/` unless there is no easy way to patch from outside — and if you must, note it explicitly so it can be upstreamed. + +## Additional Resources + +### Reference Files + +- **`references/unsupported-models.md`** — Patterns for extending ModelOpt to new architectures: MoE expert quantization, VLM language model extraction, FP8 dequantization, calibration routing +- **`references/slurm-setup.md`** — SLURM job script template, container/enroot setup, partition selection, smoke-test strategy, monitoring, multi-node FSDP2 +- **`references/remote-execution.md`** — **Read this when running PTQ on a remote machine/cluster via SSH.** Covers cluster config, persistent SSH sessions, SLURM container jobs, the two-script pattern, and troubleshooting. +- **`skills/common/workspace-management.md`** — **Read this when `MODELOPT_WORKSPACE_ROOT` is set (Slack bot / multi-user).** Covers when to create vs reuse workspaces, naming conventions, and cross-task workspace sharing (PTQ → deploy → eval). 
+ +### ModelOpt Examples + +- **`examples/llm_ptq/README.md`** ← **read this first** — support matrix, correct flag names, accuracy guidance, hardware requirements +- **`examples/llm_ptq/hf_ptq.py`** — Main PTQ script for supported models +- **`examples/llm_ptq/multinode_ptq.py`** — Multi-node PTQ with FSDP2 +- **`examples/deepseek/ptq.py`** — Custom PTQ for DeepSeek MoE (reference for MoE monkey-patching) + +### Source Code + +- **`modelopt/torch/quantization/config.py`** — All quantization configs and format definitions +- **`modelopt/torch/export/model_utils.py`** — `MODEL_NAME_TO_TYPE` (supported architectures), `get_model_type()`, `is_multimodal_model()` +- **`modelopt/torch/quantization/conversion.py`** — `mtq.register()` implementation (see `_setup` requirement) +- **`modelopt/torch/utils/dataset_utils.py`** — `get_dataset_dataloader()`, `get_supported_datasets()` diff --git a/.claude/skills/ptq/references/remote-execution.md b/.claude/skills/ptq/references/remote-execution.md new file mode 100644 index 00000000000..5b276c1820b --- /dev/null +++ b/.claude/skills/ptq/references/remote-execution.md @@ -0,0 +1,149 @@ +# Remote Execution + +Read this when Claude Code runs on a different machine than the target GPU cluster/workstation. This covers SSH connectivity, cluster config, persistent sessions, and remote command execution. For SLURM-specific details (job scripts, containers, partitions, monitoring), see `slurm-setup.md`. + +--- + +## 1. Cluster Config + +Config locations (checked in order, first found wins): + +1. `~/.config/modelopt/clusters.yaml` — user-level (not committed, recommended) +2. `.claude/clusters.yaml` — project-level (can be committed for shared defaults) +3. 
Interactive input — if neither file exists, ask the user (see SKILL.md Step 0) and write `~/.config/modelopt/clusters.yaml` before proceeding + +```yaml +clusters: + my-cluster: + login_node: cluster-login.example.com # SSH hostname or SSH config alias + user: username # SSH user + ssh_key: ~/.ssh/id_rsa # (optional) SSH key path + ssh_proxy: "socat - PROXY:localhost:%h:%p,proxyport=3128" # (optional) proxy + workspace: /absolute/path/to/workdir # Remote working directory + gpu_type: H100 # For quant format recommendation + slurm: # (optional) pre-fill SLURM defaults + default_account: my_account + default_partition: batch_short + +default_cluster: my-cluster +``` + +See `.claude/clusters.yaml.example` for a fully annotated example with multiple cluster types. + +--- + +## 2. Connect and Establish Persistent Session + +```bash +source .claude/skills/common/remote_exec.sh +remote_load_cluster # or omit name to use default_cluster +remote_check_ssh # validates connectivity + starts persistent session +``` + +`remote_check_ssh` starts an SSH **ControlMaster** connection. All subsequent `remote_run` / `remote_sync_*` / SCP calls reuse this single connection: + +- ~180ms per command (vs 5-15s per new connection) +- Eliminates flaky proxy timeouts +- Auto-cleaned up when the shell exits + +--- + +## 3. Detect Remote Environment + +```bash +remote_detect_env +``` + +Auto-discovers whether the remote has SLURM, Docker, or bare-metal GPUs. Sets `REMOTE_ENV_TYPE` to `slurm`, `docker`, `bare`, or `unknown`. + +After detection, proceed with the environment-specific setup: + +- **SLURM** → read `slurm-setup.md`, but prefix all commands with `remote_run` +- **Docker** → use `remote_docker_run ""` +- **Bare metal** → use `remote_run` directly + +--- + +## 4. 
Running Commands Remotely + +### Single commands + +```bash +remote_run "nvidia-smi" +remote_run "python --version" +remote_run "sbatch /path/to/job.sh" +``` + +`remote_run` uses base64 encoding internally, so special characters (`%`, `$`, quotes) work without escaping. It retries up to 3 times on SSH failures. + +### Syncing files + +```bash +# Local → remote +remote_sync_to /local/path remote_subdir + +# Remote → local +remote_sync_from remote_subdir /local/path +``` + +Both use rsync over the persistent SSH session with default excludes (`.git`, `__pycache__`, etc.). + +### SCP (alternative to rsync) + +SCP also reuses the persistent session automatically via ControlMaster: + +```bash +scp /local/script.sh ${REMOTE_USER}@${REMOTE_HOST}:/remote/path/ +``` + +--- + +## 5. The Two-Script Pattern + +When submitting SLURM jobs remotely, write **two files** locally to avoid shell escaping issues: + +1. **SLURM wrapper** (e.g., `ptq_slurm.sh`) — `#SBATCH` directives + `srun` with container +2. **Inner runner** (e.g., `ptq_run.sh`) — the actual work (runs inside the container) + +Then upload both and submit: + +```bash +remote_sync_to /local/scripts/ scripts/ +JOBID=$(remote_run "sbatch /remote/path/scripts/ptq_slurm.sh" | grep -o '[0-9]\+' | tail -1) +``` + +For the SLURM wrapper template and container flags, see `slurm-setup.md`. + +--- + +## 6. Verifying Results Remotely + +```bash +remote_run "ls -lh /" +remote_run "cat /hf_quant_config.json" +``` + +Or fetch results to local: + +```bash +remote_sync_from /local/output/ +``` + +--- + +## 7. 
Troubleshooting + +| Problem | Cause | Fix | +| ------- | ----- | --- | +| `Connection timed out during banner exchange` | Proxy/login node overloaded | `remote_run` retries 3x automatically; use persistent session to avoid | +| SSH proxy completely unreachable (`Network is unreachable`) | VPN/proxy host is down or not running on this machine | Check if VPN is connected; verify `socat`/proxy service is running locally; try direct SSH by temporarily removing `ssh_proxy` from config | +| `unix_listener: cannot bind to path ... Read-only file system` | SSH ControlMaster socket in non-writable `/tmp` | `remote_exec.sh` auto-finds writable dir; ensure `TMPDIR` or `/tmp/claude-*` exists | +| `cd: /home/user/~/path: No such file or directory` | `~` not expanding on remote | Use absolute paths in `workspace` config, not `~/...` | +| Login nodes resolve home dirs differently | Symlinked home dirs vary by node | Use absolute lustre/NFS paths (e.g., `/lustre/fs1/...`) in job scripts | +| `#!` becomes `#\!` in scripts | Shell environment mangles shebang | Fix with `sed -i 's\|^#\\\\!\|#!\| script.sh'` after writing | + +## Reference Files + +- **`skills/common/remote_exec.sh`** — Full utility library (session, run, sync, SLURM, Docker helpers) +- **`.claude/clusters.yaml`** — Active cluster configuration +- **`.claude/clusters.yaml.example`** — Annotated example config diff --git a/.claude/skills/ptq/references/slurm-setup.md b/.claude/skills/ptq/references/slurm-setup.md new file mode 100644 index 00000000000..610eb32d3de --- /dev/null +++ b/.claude/skills/ptq/references/slurm-setup.md @@ -0,0 +1,136 @@ +# SLURM Environment Setup for ModelOpt PTQ + +Read this file when running on a SLURM cluster. It covers container setup, job submission, smoke-test strategy, and monitoring. + +--- + +## 1. 
Container + +Get the recommended image version from `examples/llm_ptq/README.md`, then look for a `.sqsh` file in the workspace and common sibling directories: + +```bash +ls *.sqsh ../*.sqsh ~/containers/*.sqsh 2>/dev/null +``` + +If you find a `.sqsh` but aren't sure of its version, check it: + +```bash +srun --container-image= --ntasks=1 bash -c \ + "pip show tensorrt-llm 2>/dev/null | grep Version || cat /VERSION 2>/dev/null || echo unknown" +``` + +If no `.sqsh` exists, import it with enroot. Set writable cache paths first — the default `/raid/containers` is often not writable: + +```bash +export ENROOT_CACHE_PATH=/path/to/writable/enroot-cache +export ENROOT_DATA_PATH=/path/to/writable/enroot-data +export TMPDIR=/path/to/writable/tmp +mkdir -p "$ENROOT_CACHE_PATH" "$ENROOT_DATA_PATH" "$TMPDIR" + +enroot import --output /path/to/container.sqsh \ + docker://nvcr.io#nvidia/tensorrt-llm/release: +``` + +--- + +## 2. Account and Partition + +```bash +# Accounts available to you +sacctmgr show associations user=$USER format=account%30,cluster%20 -n 2>/dev/null + +# GPU partitions and their time/node limits (exclude CPU-only) +sinfo -o "%P %a %l %D %G" 2>/dev/null | grep -v "null\|CPU\|cpu" +``` + +- One account → use it automatically +- Multiple accounts → show them to the user and ask which to use +- Partition → use the default (marked `*`); report the choice + +--- + +## 3. Job Script Template + +**Critical**: container flags (`--container-image`, `--container-mounts`) MUST be on the `srun` line — they do NOT work as `#SBATCH` directives. + +**GPU count**: estimate based on model size. Rough guide: 1 GPU per ~20B params in BF16 (e.g., 0.6B → 1 GPU, 70B → 4 GPUs, 405B → 8 GPUs). `hf_ptq.py` uses `device_map="auto"` so it fills GPUs automatically — request only as many as needed. 
+ +```bash +#!/bin/bash +#SBATCH --job-name=ptq +#SBATCH --account= +#SBATCH --partition= +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --gpus-per-node= # 1 for small models (<20B), 4-8 for large models +#SBATCH --time= +#SBATCH --output=/ptq_%j.log + +srun \ + --container-image="" \ + --container-mounts=":" \ + --container-workdir="" \ + --no-container-mount-home \ + bash -c "pip install -e [hf] --quiet && python ..." +``` + +Submit and capture the job ID: + +```bash +mkdir -p +JOBID=$(sbatch