diff --git a/.claude/clusters.yaml.example b/.claude/clusters.yaml.example
new file mode 100644
index 00000000000..5bf4182e5c2
--- /dev/null
+++ b/.claude/clusters.yaml.example
@@ -0,0 +1,18 @@
+# ModelOpt Remote Cluster Configuration
+# Copy to ~/.config/modelopt/clusters.yaml (user-level, recommended; takes precedence if both exist)
+# or .claude/clusters.yaml (project-level, can be committed).
+
+clusters:
+  # GPU workstation or SLURM login node
+  my-cluster:
+    login_node: cluster-login.example.com
+    user: myusername
+    ssh_key: ~/.ssh/id_rsa
+    # ssh_proxy: "socat - PROXY:localhost:%h:%p,proxyport=3128"  # optional
+    workspace: /path/to/remote/workdir
+    gpu_type: H100   # used for quantization format recommendation
+    # slurm:
+    #   default_account: my_account
+    #   default_partition: batch_short
+
+default_cluster: my-cluster
diff --git a/.claude/skills/common/remote_exec.sh b/.claude/skills/common/remote_exec.sh
new file mode 100644
index 00000000000..e5a1bc2b242
--- /dev/null
+++ b/.claude/skills/common/remote_exec.sh
@@ -0,0 +1,492 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# remote_exec.sh — Remote execution utility for ModelOpt agent skills
+#
+# Usage:
+#   source .claude/skills/common/remote_exec.sh
+#   remote_load_cluster <cluster_name>     # or: remote_load_cluster (uses default)
+#   remote_check_ssh                        # also starts the persistent session (remote_start_session)
+#   remote_detect_env                       # detect SLURM vs Docker vs bare metal
+#   remote_run "command"
+#   remote_sync_to <local_path> [remote_subdir]
+#   remote_sync_from <remote_subdir> <local_path>
+#   remote_submit_job <job_script>          # SLURM only
+#   remote_poll_job <job_id>                # SLURM only
+#   remote_wait_job <job_id> [interval=30]  # SLURM only
+#   remote_docker_run <container_or_image> "<command>"  # Docker only
+#   remote_tail_log <remote_log_path> [lines=50]
+#
+# After remote_load_cluster, these env vars are set:
+#   REMOTE_HOST, REMOTE_USER, REMOTE_SSH_KEY, REMOTE_SSH_PROXY,
+#   REMOTE_WORKSPACE, REMOTE_GPU_TYPE, REMOTE_ENV_TYPE,
+#   REMOTE_CONTAINER_IMAGE, REMOTE_SLURM_ACCOUNT, REMOTE_SLURM_PARTITION
+
+set -euo pipefail  # NOTE(review): this file is sourced (see Usage), so these options also apply to the calling shell
+
+# ── Helpers ──────────────────────────────────────────────────────────────────
+
+_remote_config_file() {
+    # Print the clusters.yaml to use, or an empty line when there is none.
+    # Precedence: ~/.config/modelopt/clusters.yaml, then the nearest
+    # .claude/clusters.yaml found in $PWD or one of its ancestors.
+    local home_cfg="${HOME}/.config/modelopt/clusters.yaml"
+    local repo_cfg="" cursor="$PWD"
+
+    # Climb towards the filesystem root; "/" itself is never inspected.
+    until [[ "$cursor" == "/" ]]; do
+        if [[ -f "${cursor}/.claude/clusters.yaml" ]]; then
+            repo_cfg="${cursor}/.claude/clusters.yaml"
+            break
+        fi
+        cursor="$(dirname "$cursor")"
+    done
+
+    if [[ -f "$home_cfg" ]]; then
+        echo "$home_cfg"
+        return
+    fi
+    echo "$repo_cfg"
+}
+
+_parse_yaml_value() {
+    # Print one scalar from a YAML file: _parse_yaml_value <file> <dot.path>
+    # Prints nothing if the path is missing, the value is null, or parsing fails
+    # (needs python3 + PyYAML). Arguments are passed via argv instead of being
+    # spliced into the Python source, so quotes in them cannot break or inject code.
+    local file="$1" path="$2"
+    python3 -c '
+import sys, yaml
+with open(sys.argv[1]) as f:
+    data = yaml.safe_load(f)
+for k in sys.argv[2].split("."):
+    if not isinstance(data, dict) or k not in data:
+        sys.exit(0)
+    data = data[k]
+if data is not None:
+    print(data)
+' "$file" "$path" 2>/dev/null || true
+}
+
+_ssh_control_path() {
+    # Print the per-host ControlMaster socket path (reads REMOTE_USER / REMOTE_HOST).
+    # Directory: first existing, writable one of $TMPDIR, /tmp/claude-*/ssh-ctl, /tmp
+    # (a sandbox may restrict /tmp); otherwise ~/.cache/ssh-ctl, created on demand.
+    local tmpdir="" candidate host_hash
+    for candidate in "${TMPDIR:-}" /tmp/claude-*/ssh-ctl /tmp; do
+        if [[ -n "$candidate" && -d "$candidate" && -w "$candidate" ]]; then
+            tmpdir="$candidate"
+            break
+        fi
+    done
+    tmpdir="${tmpdir:-$HOME/.cache/ssh-ctl}"
+    mkdir -p "$tmpdir" 2>/dev/null || true
+    # A short 12-char hash keeps the path under the Unix socket limit (108 chars).
+    # md5sum is absent on stock macOS/BSD: fall back to POSIX cksum there. `tr` keeps
+    # only hex digits so both tools yield a clean token (the md5sum result is unchanged).
+    host_hash=$(echo "${REMOTE_USER}@${REMOTE_HOST}" | { md5sum 2>/dev/null || cksum; } | tr -cd '0-9a-f' | cut -c1-12)
+    echo "${tmpdir}/ctl-${host_hash}"
+}
+
+_ssh_base_opts() {
+    # Print the shared ssh option string (no `ssh`, no user@host). The result is
+    # re-parsed by `eval` / `rsync -e`, so values that may hold spaces are wrapped
+    # in single quotes (a literal single quote inside a value is not supported).
+    local ctl_path opts="-o BatchMode=yes -o ConnectTimeout=15 -o StrictHostKeyChecking=accept-new"
+    ctl_path="$(_ssh_control_path)"
+    # ControlMaster=auto: reuse the existing master if available, otherwise start a new one
+    opts+=" -o ControlPath='${ctl_path}' -o ControlMaster=auto"
+    if [[ -n "${REMOTE_SSH_KEY:-}" ]]; then
+        # Quoted so a key path containing spaces stays a single argument
+        opts+=" -i '${REMOTE_SSH_KEY}'"
+    fi
+    if [[ -n "${REMOTE_SSH_PROXY:-}" ]]; then
+        opts+=" -o ProxyCommand='${REMOTE_SSH_PROXY}'"
+    fi
+    echo "$opts"
+}
+
+_ssh_base_cmd() {
+    # Print the complete ssh invocation prefix: shared options plus user@host.
+    printf '%s\n' "ssh $(_ssh_base_opts) ${REMOTE_USER}@${REMOTE_HOST}"
+}
+
+# ── Session Management ───────────────────────────────────────────────────────
+
+remote_start_session() {
+    # Start a persistent SSH ControlMaster connection in the background.
+    # All subsequent remote_run / remote_sync_* / scp calls reuse this connection.
+    # Call this once after remote_load_cluster (remote_check_ssh calls it as well).
+    # Returns: 0 if a master is running or was started, otherwise ssh's exit status.
+    local ctl_path rc=0
+    ctl_path="$(_ssh_control_path)"
+
+    if ssh -o ControlPath="$ctl_path" -O check "${REMOTE_USER}@${REMOTE_HOST}" 2>/dev/null; then
+        echo "SSH session already active (reusing existing connection)."
+        return 0
+    fi
+
+    echo "Starting persistent SSH session to ${REMOTE_USER}@${REMOTE_HOST}..."
+    local opts="-o BatchMode=yes -o ConnectTimeout=15 -o StrictHostKeyChecking=accept-new"
+    opts+=" -o ControlMaster=yes -o ControlPath='${ctl_path}' -o ControlPersist=3600"
+    if [[ -n "${REMOTE_SSH_KEY:-}" ]]; then
+        opts+=" -i '${REMOTE_SSH_KEY}'"   # quoted: the key path may contain spaces
+    fi
+    if [[ -n "${REMOTE_SSH_PROXY:-}" ]]; then
+        opts+=" -o ProxyCommand='${REMOTE_SSH_PROXY}'"
+    fi
+
+    # Start the master in the background (-f -N: go background, no command).
+    # `|| rc=$?` keeps `set -e` from aborting the shell before the warning below.
+    eval "ssh $opts -f -N ${REMOTE_USER}@${REMOTE_HOST}" 2>&1 || rc=$?
+    if (( rc == 0 )); then
+        echo "SSH session established. All commands will reuse this connection."
+        # NOTE: this replaces any EXIT trap the sourcing shell already had.
+        trap 'remote_stop_session 2>/dev/null' EXIT
+    else
+        echo "WARNING: Failed to start persistent SSH session (rc=$rc). Commands will use individual connections." >&2
+    fi
+    return $rc
+}
+
+remote_stop_session() {
+    # Gracefully close the persistent SSH connection
+    local ctl_path
+    ctl_path="$(_ssh_control_path)"
+    if [[ -S "$ctl_path" ]]; then
+        ssh -o ControlPath="$ctl_path" -O exit "${REMOTE_USER}@${REMOTE_HOST}" 2>/dev/null || true
+        echo "SSH session closed."
+    fi
+}
+
+# ── Core Functions ───────────────────────────────────────────────────────────
+
+remote_load_cluster() {
+    # Load cluster config by name. If no name given, use default_cluster.
+    # Exports the REMOTE_* variables; returns 1 if config, cluster name or login_node is missing.
+    local cluster_name="${1:-}"
+    local config_file
+    config_file="$(_remote_config_file)"
+
+    if [[ -z "$config_file" ]]; then
+        echo "ERROR: No clusters.yaml found. Provide cluster info interactively or create one." >&2
+        echo "  User config:    ~/.config/modelopt/clusters.yaml" >&2
+        echo "  Project config: .claude/clusters.yaml" >&2
+        return 1
+    fi
+
+    # Get default cluster if none specified
+    if [[ -z "$cluster_name" ]]; then
+        cluster_name="$(_parse_yaml_value "$config_file" "default_cluster")"
+        if [[ -z "$cluster_name" ]]; then
+            echo "ERROR: No cluster name specified and no default_cluster in config." >&2
+            return 1
+        fi
+    fi
+
+    # Parse cluster config (missing keys yield empty strings)
+    REMOTE_HOST="$(_parse_yaml_value "$config_file" "clusters.${cluster_name}.login_node")"
+    REMOTE_USER="$(_parse_yaml_value "$config_file" "clusters.${cluster_name}.user")"
+    REMOTE_SSH_KEY="$(_parse_yaml_value "$config_file" "clusters.${cluster_name}.ssh_key")"
+    REMOTE_SSH_PROXY="$(_parse_yaml_value "$config_file" "clusters.${cluster_name}.ssh_proxy")"
+    REMOTE_WORKSPACE="$(_parse_yaml_value "$config_file" "clusters.${cluster_name}.workspace")"
+    REMOTE_GPU_TYPE="$(_parse_yaml_value "$config_file" "clusters.${cluster_name}.gpu_type")"
+    REMOTE_CONTAINER_IMAGE="$(_parse_yaml_value "$config_file" "clusters.${cluster_name}.container_image")"
+    REMOTE_ENV_TYPE="$(_parse_yaml_value "$config_file" "clusters.${cluster_name}.env_type")"
+    # SLURM-specific
+    REMOTE_SLURM_ACCOUNT="$(_parse_yaml_value "$config_file" "clusters.${cluster_name}.slurm.default_account")"
+    REMOTE_SLURM_PARTITION="$(_parse_yaml_value "$config_file" "clusters.${cluster_name}.slurm.default_partition")"
+
+    # Expand ~ in ssh_key
+    if [[ "${REMOTE_SSH_KEY:-}" == "~/"* ]]; then
+        REMOTE_SSH_KEY="${HOME}/${REMOTE_SSH_KEY#\~/}"
+    fi
+
+    # Validate required fields
+    if [[ -z "$REMOTE_HOST" ]]; then
+        echo "ERROR: Cluster '$cluster_name' has no login_node defined." >&2
+        return 1
+    fi
+
+    # Default to the local user; $USER can be unset (containers, cron) and would trip `set -u`
+    REMOTE_USER="${REMOTE_USER:-${USER:-$(id -un)}}"
+
+    export REMOTE_HOST REMOTE_USER REMOTE_SSH_KEY REMOTE_SSH_PROXY
+    export REMOTE_WORKSPACE REMOTE_GPU_TYPE REMOTE_CONTAINER_IMAGE
+    export REMOTE_ENV_TYPE REMOTE_SLURM_ACCOUNT REMOTE_SLURM_PARTITION
+
+    echo "Loaded cluster: $cluster_name (${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_WORKSPACE})"
+}
+
+remote_check_ssh() {
+    # Verify that the cluster is reachable over SSH.
+    # Preferred path: bring up the persistent ControlMaster session, which
+    # proves connectivity by itself. If that fails, probe with a single
+    # one-off ssh command. Returns 0 when either works, 1 otherwise.
+    local probe_out
+    echo "Checking SSH connectivity to ${REMOTE_USER}@${REMOTE_HOST}..."
+    remote_start_session 2>&1 && return 0
+
+    # One-off probe: the remote side must echo the marker back to us.
+    if probe_out=$(eval "$(_ssh_base_cmd)" '"echo SSH_OK"' 2>&1) \
+        && grep -q "SSH_OK" <<<"$probe_out"; then
+        echo "SSH connection OK (no persistent session — commands will be slower)."
+        return 0
+    fi
+
+    echo "ERROR: SSH connection failed:" >&2
+    echo "$probe_out" >&2
+    return 1
+}
+
+remote_detect_env() {
+    # Auto-detect remote environment: slurm, docker, or bare
+    # Sets REMOTE_ENV_TYPE and discovers GPU info
+    if [[ -n "${REMOTE_ENV_TYPE:-}" && "$REMOTE_ENV_TYPE" != "auto" ]]; then
+        echo "Environment type: $REMOTE_ENV_TYPE (from config)"
+        return 0
+    fi
+
+    echo "Detecting remote environment..."
+    local info
+    info=$(remote_run "
+        echo ENV_DETECT_START;
+        # Check SLURM
+        if command -v sbatch &>/dev/null; then
+            echo 'HAS_SLURM=yes';
+            sacctmgr show associations user=\$USER format=account%30,partition%20,cluster%20 -n 2>/dev/null | head -20;
+            echo 'SLURM_PARTITIONS_START';
+            sinfo -o '%P %a %l %D %G' 2>/dev/null | head -30;
+            echo 'SLURM_PARTITIONS_END';
+        else
+            echo 'HAS_SLURM=no';
+        fi;
+        # Check Docker
+        if command -v docker &>/dev/null; then
+            echo 'HAS_DOCKER=yes';
+            # Check if docker can access GPUs
+            docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null && echo 'DOCKER_GPU=yes' || echo 'DOCKER_GPU=no';
+        else
+            echo 'HAS_DOCKER=no';
+        fi;
+        # Check bare metal GPU
+        if command -v nvidia-smi &>/dev/null; then
+            echo 'HAS_BARE_GPU=yes';
+            nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null;
+        else
+            echo 'HAS_BARE_GPU=no';
+        fi;
+        echo ENV_DETECT_END;
+    " 2>&1)
+
+    echo "$info"
+
+    if echo "$info" | grep -q "HAS_SLURM=yes"; then
+        REMOTE_ENV_TYPE="slurm"
+    elif echo "$info" | grep -q "HAS_DOCKER=yes"; then
+        REMOTE_ENV_TYPE="docker"
+    elif echo "$info" | grep -q "HAS_BARE_GPU=yes"; then
+        REMOTE_ENV_TYPE="bare"
+    else
+        REMOTE_ENV_TYPE="unknown"
+    fi
+
+    export REMOTE_ENV_TYPE
+    echo "Detected environment: $REMOTE_ENV_TYPE"
+}
+
+remote_run() {
+    # Run a command on the remote machine (from REMOTE_WORKSPACE, or $HOME if unset).
+    # Usage: remote_run "command"
+    # Ships the command base64-encoded to avoid quoting issues; prints the combined output
+    # and returns the remote status. ssh status 255 (connection failure) is retried, 3 tries max.
+    local cmd="$1"
+    local ws="${REMOTE_WORKSPACE:-\$HOME}"
+    local full_cmd="cd $ws && $cmd" encoded
+    # `base64 -w0` is GNU-only; strip the line wrapping portably instead (macOS/BSD).
+    encoded=$(printf '%s' "$full_cmd" | base64 | tr -d '\n')
+
+    local attempt=0 max_attempts=3 result rc
+    while (( attempt < max_attempts )); do
+        result=$(eval "$(_ssh_base_cmd)" "'echo $encoded | base64 -d | bash'" 2>&1) && rc=$? || rc=$?
+        if (( rc != 255 )); then
+            # rc=255 is SSH connection failure; anything else is the remote command's exit code
+            echo "$result"
+            return $rc
+        fi
+        attempt=$((attempt + 1))
+        if (( attempt < max_attempts )); then
+            echo "SSH connection failed (attempt $attempt/$max_attempts), retrying in 10s..." >&2
+            sleep 10
+        fi
+    done
+    echo "$result"
+    return $rc
+}
+
+remote_sync_to() {
+    # Sync local path to remote workspace
+    # Usage: remote_sync_to <local_path> [remote_subdir]
+    local local_path="$1"
+    local remote_subdir="${2:-}"
+    local remote_dest="${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_WORKSPACE}/${remote_subdir}"
+
+    local rsync_cmd="rsync -avz --progress"
+    # Add default excludes
+    for excl in .git __pycache__ "*.pyc" .claude node_modules "*.egg-info"; do
+        rsync_cmd+=" --exclude='$excl'"
+    done
+    # Reuse the shared SSH options (including ControlMaster)
+    rsync_cmd+=" -e \"ssh $(_ssh_base_opts)\""
+    rsync_cmd+=" '${local_path}/' '${remote_dest}'"
+
+    echo "Syncing ${local_path} → ${remote_dest} ..."
+    eval "$rsync_cmd"
+}
+
+remote_sync_from() {
+    # Fetch the contents of a remote workspace subdirectory into a local directory.
+    # Usage: remote_sync_from <remote_subdir> <local_path>
+    local remote_subdir="$1"
+    local local_path="$2"
+    local remote_src="${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_WORKSPACE}/${remote_subdir}"
+    mkdir -p "$local_path"
+    echo "Fetching ${remote_src} → ${local_path} ..."
+    # Invoked directly instead of through eval so paths are never re-parsed as shell code.
+    rsync -avz --progress -e "ssh $(_ssh_base_opts)" "${remote_src}/" "${local_path}/"
+}
+
+# ── SLURM Functions ──────────────────────────────────────────────────────────
+
+remote_submit_job() {
+    # Submit a SLURM job script that's already on the remote machine
+    # Usage: remote_submit_job <remote_script_path>
+    # Returns: job ID on stdout
+    local script_path="$1"
+    local output
+    output=$(remote_run "sbatch '$script_path'" 2>&1)
+    local jobid
+    jobid=$(echo "$output" | grep -o '[0-9]\+' | tail -1)
+    if [[ -z "$jobid" ]]; then
+        echo "ERROR: Failed to submit job:" >&2
+        echo "$output" >&2
+        return 1
+    fi
+    echo "$jobid"
+}
+
+remote_poll_job() {
+    # Check SLURM job state
+    # Usage: remote_poll_job <job_id>
+    # Prints: PENDING, RUNNING, COMPLETED, FAILED, TIMEOUT, CANCELLED, etc., or UNKNOWN.
+    local jobid="$1" state
+    # `|| true`: squeue fails / prints nothing once the job has left the queue; under
+    # `set -e -o pipefail` that must fall through to sacct instead of aborting.
+    state=$(remote_run "squeue -j $jobid -h -o %T 2>/dev/null" 2>&1 | grep -v "^$" | tail -1) || true
+    if [[ -z "$state" ]]; then
+        state=$(remote_run "sacct -j $jobid --format=State -n -X 2>/dev/null" 2>&1 | awk '{print $1}' | head -1) || true
+    fi
+    echo "${state:-UNKNOWN}"
+}
+
+remote_wait_job() {
+    # Wait for a SLURM job to complete
+    # Usage: remote_wait_job <job_id> [poll_interval_seconds=30]
+    # Returns: 0 when the job COMPLETED, 1 on a failure state or an UNKNOWN state.
+    local jobid="$1" interval="${2:-30}" state
+    echo "Waiting for job $jobid (polling every ${interval}s)..."
+    while true; do
+        state=$(remote_poll_job "$jobid")
+        echo "$(date '+%H:%M:%S') Job $jobid: $state"
+        case "$state" in
+            COMPLETED)
+                echo "Job $jobid completed successfully."
+                return 0
+                ;;
+            # Globs: sacct cuts State to 10 chars, e.g. "CANCELLED+" / "OUT_OF_ME+"
+            FAILED|TIMEOUT|CANCELLED*|OUT_OF_ME*|NODE_FAIL|BOOT_FAIL|DEADLINE)
+                echo "ERROR: Job $jobid ended with state: $state" >&2
+                remote_job_result "$jobid"
+                return 1
+                ;;
+            UNKNOWN)
+                echo "WARNING: Could not determine job state. Checking sacct..." >&2
+                remote_job_result "$jobid"
+                return 1
+                ;;
+        esac
+        sleep "$interval"
+    done
+}
+
+remote_job_result() {
+    # Print the sacct rows (JobID, State, ExitCode, Elapsed, MaxRSS) recorded for a job.
+    # Usage: remote_job_result <job_id>
+    local job="$1"
+    remote_run "sacct -j $job --format=JobID,State,ExitCode,Elapsed,MaxRSS -n 2>/dev/null"
+}
+
+# ── Docker Functions ─────────────────────────────────────────────────────────
+
+remote_docker_run() {
+    # Run a command inside a Docker container on the remote machine
+    # Usage: remote_docker_run <container_or_image> "<command>"
+    # If container_or_image matches a running container name, uses docker exec.
+    # Otherwise, uses docker run with the given image.
+    local container_or_image="$1"
+    local cmd="$2" quoted_cmd is_running
+    # %q re-quotes the command for the remote shell, so single quotes inside it survive
+    printf -v quoted_cmd '%q' "$cmd"
+    # -F -x: compare names literally and whole-line (dots are not regex wildcards)
+    is_running=$(remote_run "docker ps --format '{{.Names}}' | grep -Fx '$container_or_image' 2>/dev/null" 2>&1 || true)
+
+    if grep -Fxq -- "$container_or_image" <<<"$is_running"; then
+        echo "Executing in running container: $container_or_image"
+        remote_run "docker exec $container_or_image bash -c $quoted_cmd"
+    else
+        echo "Running in new container: $container_or_image"
+        remote_run "docker run --rm --gpus all -v ${REMOTE_WORKSPACE}:${REMOTE_WORKSPACE} -w ${REMOTE_WORKSPACE} $container_or_image bash -c $quoted_cmd"
+    fi
+}
+
+# ── Log Functions ────────────────────────────────────────────────────────────
+
+remote_tail_log() {
+    # Show the last lines of a remote log file; prints a "not found" notice if tail fails.
+    # Usage: remote_tail_log <remote_log_path> [num_lines=50]
+    local log_path="$1" lines="${2:-50}"
+    local tail_cmd="tail -n $lines '$log_path' 2>/dev/null || echo 'Log file not found: $log_path'"
+    remote_run "$tail_cmd"
+}
+
+# ── Workspace Functions ──────────────────────────────────────────────────────
+
+remote_ensure_workspace() {
+    # Create the workspace. remote_run would first `cd` into it (and fail), so blank it for this call.
+    local ws="${REMOTE_WORKSPACE:?REMOTE_WORKSPACE is not set}"
+    REMOTE_WORKSPACE="" remote_run "mkdir -p '${ws}'" && echo "Remote workspace ready: ${ws}"
+}
+
+remote_workspace_info() {
+    # Print the workspace path, its total disk usage (du -sh) and the first 20 lines of `ls -la`
+    remote_run "
+        echo '=== Workspace: ${REMOTE_WORKSPACE} ===';
+        echo '--- Disk usage ---';
+        du -sh '${REMOTE_WORKSPACE}' 2>/dev/null || echo 'N/A';
+        echo '--- Contents ---';
+        ls -la '${REMOTE_WORKSPACE}/' 2>/dev/null | head -20;
+    "
+}
diff --git a/.claude/skills/common/workspace-management.md b/.claude/skills/common/workspace-management.md
new file mode 100644
index 00000000000..b7adb34ac41
--- /dev/null
+++ b/.claude/skills/common/workspace-management.md
@@ -0,0 +1,84 @@
+# Workspace Management
+
+When running via the Slack bot (or any multi-user environment), each user has a **workspace root** containing model-specific workspaces. Each workspace is a copy of the Model-Optimizer repo where the agent can freely modify code.
+
+## Environment Variables
+
+The bot sets these env vars before launching Claude:
+
+- `MODELOPT_WORKSPACE_ROOT` — user's workspace root (e.g., `/data/modelopt/users/U123/jobs/`)
+- `MODELOPT_REPO_DIR` — path to the shared upstream repo (read-only source for copies)
+
+If these are not set, you are running locally — skip workspace management.
+
+## When to Create vs Reuse a Workspace
+
+**Before starting any task**, check for an existing workspace that matches:
+
+```bash
+# List existing workspaces
+ls "$MODELOPT_WORKSPACE_ROOT/" 2>/dev/null
+```
+
+**Reuse** an existing workspace when:
+- The task involves the same model (e.g., deploying a model you just quantized)
+- The task needs output from a previous step (e.g., eval needs the PTQ checkpoint)
+- The user says "deploy the model I just quantized" or similar
+
+**Create a new workspace** when:
+- This is a new model not seen before
+- The user explicitly asks for a fresh start
+- The existing workspace's code modifications are incompatible (rare)
+
+## Creating a New Workspace
+
+Name workspaces by model/purpose, not timestamps:
+
+```bash
+# Good names
+qwen3-0.6b
+llama-3.1-8b-fp8
+deepseek-v3-nvfp4
+
+# Bad names (don't use)
+ptq-20260318-143022
+job-001
+```
+
+To create:
+
+```bash
+rsync -a --quiet \
+    --exclude .git --exclude __pycache__ --exclude '*.pyc' \
+    --exclude node_modules --exclude '*.egg-info' --exclude '*.sqsh' \
+    "$MODELOPT_REPO_DIR/" "$MODELOPT_WORKSPACE_ROOT/<name>/"
+```
+
+Then `cd` into the new workspace and continue with the task.
+
+## Injecting Cluster Config
+
+If `.claude/clusters.yaml` exists in the current workspace, it was injected by the bot. When creating a new workspace, copy it over:
+
+```bash
+cp "$MODELOPT_WORKSPACE_ROOT/default/.claude/clusters.yaml" \
+   "$MODELOPT_WORKSPACE_ROOT/<new-name>/.claude/clusters.yaml" 2>/dev/null
+```
+
+## Example Flow
+
+```
+User: "quantize Qwen3-0.6B with nvfp4"
+Agent: ls $MODELOPT_WORKSPACE_ROOT/  → empty or no "qwen3-0.6b"
+       → create workspace "qwen3-0.6b"
+       → run PTQ, output to qwen3-0.6b/output/
+
+User: "deploy the model I just quantized"
+Agent: ls $MODELOPT_WORKSPACE_ROOT/  → sees "qwen3-0.6b"
+       → reuse workspace, find checkpoint at qwen3-0.6b/output/
+       → deploy from there
+
+User: "now quantize Llama-3.1-8B with fp8"
+Agent: ls $MODELOPT_WORKSPACE_ROOT/  → sees "qwen3-0.6b", no llama
+       → create workspace "llama-3.1-8b-fp8"
+```
diff --git a/.claude/skills/deployment/SKILL.md b/.claude/skills/deployment/SKILL.md
new file mode 100644
index 00000000000..90871526af9
--- /dev/null
+++ b/.claude/skills/deployment/SKILL.md
@@ -0,0 +1,262 @@
+---
+name: deployment
+description: Serve a quantized or unquantized LLM checkpoint as an OpenAI-compatible API endpoint using vLLM, SGLang, or TRT-LLM. Use when user says "deploy model", "serve model", "start vLLM server", "launch SGLang", "TRT-LLM deploy", "AutoDeploy", "benchmark throughput", "serve checkpoint", or needs an inference endpoint from a HuggingFace or ModelOpt-quantized checkpoint.
+license: Apache-2.0
+---
+
+# Deployment Skill
+
+Serve a model checkpoint as an OpenAI-compatible inference endpoint. Supports vLLM, SGLang, and TRT-LLM (including AutoDeploy).
+
+## Quick Start
+
+Use the deploy script for the fastest path. It auto-detects quantization format from the checkpoint:
+
+```bash
+# Start vLLM server with a ModelOpt checkpoint
+scripts/deploy.sh start --model ./qwen3-0.6b-fp8
+
+# Start with SGLang and tensor parallelism
+scripts/deploy.sh start --model ./llama-70b-nvfp4 --framework sglang --tp 4
+
+# Start from HuggingFace hub
+scripts/deploy.sh start --model nvidia/Llama-3.1-8B-Instruct-FP8
+
+# Test the API
+scripts/deploy.sh test
+
+# Check status
+scripts/deploy.sh status
+
+# Stop
+scripts/deploy.sh stop
+```
+
+The script handles: GPU detection, quantization flag auto-detection (FP8 vs FP4), server lifecycle (start/stop/restart/status), health check polling, and API testing.
+
+## Decision Flow
+
+### 0. Check workspace (multi-user / Slack bot)
+
+If `MODELOPT_WORKSPACE_ROOT` is set, read `skills/common/workspace-management.md`. Before creating a new workspace, check for existing ones — especially if deploying a checkpoint from a prior PTQ run:
+
+```bash
+ls "$MODELOPT_WORKSPACE_ROOT/" 2>/dev/null
+```
+
+If the user says "deploy the model I just quantized" or references a previous PTQ, find the matching workspace and `cd` into it. The checkpoint should be in that workspace's output directory.
+
+### 1. Identify the checkpoint
+
+Determine what the user wants to deploy:
+
+- **Local quantized checkpoint** (from ptq skill or manual export): look for `hf_quant_config.json` in the directory
+- **HuggingFace model hub** (e.g., `nvidia/Llama-3.1-8B-Instruct-FP8`): use directly
+- **Unquantized model**: deploy as-is (BF16) or suggest quantizing first with the ptq skill
+
+Check the quantization format if applicable:
+
+```bash
+cat <checkpoint_path>/hf_quant_config.json 2>/dev/null || echo "No quant config — unquantized or legacy format"
+```
+
+### 2. Choose the framework
+
+If the user hasn't specified a framework, recommend based on this priority:
+
+| Situation | Recommended | Why |
+|-----------|-------------|-----|
+| General use | **vLLM** | Widest ecosystem, easy setup, OpenAI-compatible |
+| DeepSeek / Llama 4 models | **SGLang** | Strong DeepSeek/Llama 4 support |
+| Maximum optimization | **TRT-LLM** | Best throughput via engine compilation |
+| Mixed-precision / AutoQuant | **TRT-LLM AutoDeploy** | Only option for AutoQuant checkpoints |
+
+Check the support matrix in `references/support-matrix.md` to confirm the model + format + framework combination is supported.
+
+### 3. Check the environment
+
+**GPU availability:**
+
+```bash
+python -c "import torch; [print(f'GPU {i}: {torch.cuda.get_device_name(i)}') for i in range(torch.cuda.device_count())] if torch.cuda.is_available() else print('no-gpu')"
+```
+
+**Framework installed?**
+
+```bash
+# vLLM
+python -c "import vllm; print(f'vLLM {vllm.__version__}')" 2>/dev/null || echo "vLLM not installed"
+
+# SGLang
+python -c "import sglang; print(f'SGLang {sglang.__version__}')" 2>/dev/null || echo "SGLang not installed"
+
+# TRT-LLM
+python -c "import tensorrt_llm; print(f'TRT-LLM {tensorrt_llm.__version__}')" 2>/dev/null || echo "TRT-LLM not installed"
+```
+
+If the framework is not installed, consult `references/setup.md` for installation instructions.
+
+**GPU memory estimate:**
+
+- BF16 model: `num_params × 2 bytes` (e.g., 8B model ≈ 16 GB)
+- FP8 model: `num_params × 1 byte` (e.g., 8B model ≈ 8 GB)
+- FP4 model: `num_params × 0.5 bytes` (e.g., 8B model ≈ 4 GB)
+- Add ~2-4 GB for KV cache and framework overhead
+
+If the model exceeds single GPU memory, use tensor parallelism (`--tensor-parallel-size <num_gpus>` for vLLM, `--tp <num_gpus>` for SGLang).
+
+### 4. Deploy
+
+Read the framework-specific reference for detailed instructions:
+
+| Framework | Reference file |
+|-----------|---------------|
+| vLLM | `references/vllm.md` |
+| SGLang | `references/sglang.md` |
+| TRT-LLM | `references/trtllm.md` |
+
+**Quick-start commands** (for common cases):
+
+#### vLLM
+
+```bash
+# Serve as OpenAI-compatible endpoint
+python -m vllm.entrypoints.openai.api_server \
+    --model <checkpoint_path> \
+    --quantization modelopt \
+    --tensor-parallel-size <num_gpus> \
+    --host 0.0.0.0 --port 8000
+```
+
+For NVFP4 checkpoints, use `--quantization modelopt_fp4`.
+
+#### SGLang
+
+```bash
+python -m sglang.launch_server \
+    --model-path <checkpoint_path> \
+    --quantization modelopt \
+    --tp <num_gpus> \
+    --host 0.0.0.0 --port 8000
+```
+
+#### TRT-LLM (direct)
+
+```python
+from tensorrt_llm import LLM, SamplingParams
+llm = LLM(model="<checkpoint_path>")
+outputs = llm.generate(["Hello, my name is"], SamplingParams(temperature=0.8, top_p=0.95))
+```
+
+#### TRT-LLM AutoDeploy
+
+For AutoQuant or mixed-precision checkpoints, see `references/trtllm.md`.
+
+### 5. Verify the deployment
+
+After the server starts, verify it's healthy:
+
+```bash
+# Health check
+curl -s http://localhost:8000/health
+
+# List models
+curl -s http://localhost:8000/v1/models | python -m json.tool
+
+# Test generation
+curl -s http://localhost:8000/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "<model_name>",
+        "prompt": "The capital of France is",
+        "max_tokens": 32
+    }' | python -m json.tool
+```
+
+All checks must pass before reporting success to the user.
+
+### 6. Benchmark (optional)
+
+If the user wants throughput/latency numbers, run a quick benchmark:
+
+```bash
+# vLLM benchmark
+python -m vllm.entrypoints.openai.api_server ... &  # if not already running
+
+vllm bench serve \
+    --model <model_name> \
+    --port 8000 \
+    --num-prompts 100 \
+    --request-rate 10
+```
+
+Report: throughput (tok/s), latency p50/p99, time to first token (TTFT).
+
+### 7. Remote deployment (SSH/SLURM)
+
+If a cluster config exists (`~/.config/modelopt/clusters.yaml` or `.claude/clusters.yaml`), or the user mentions running on a remote machine:
+
+1. **Source remote utilities:**
+
+   ```bash
+   source .claude/skills/common/remote_exec.sh
+   remote_load_cluster
+   remote_check_ssh
+   remote_detect_env
+   ```
+
+2. **Sync the checkpoint** (if it was produced locally):
+
+   ```bash
+   remote_sync_to <local_checkpoint_path> checkpoints/
+   ```
+
+3. **Deploy based on remote environment:**
+
+   - **SLURM** — write a job script that starts the server inside a container, then submit:
+
+     ```bash
+     srun --container-image="<container.sqsh>" \
+         --container-mounts="<data_root>:<data_root>" \
+         python -m vllm.entrypoints.openai.api_server \
+             --model <remote_checkpoint_path> \
+             --quantization modelopt \
+             --host 0.0.0.0 --port 8000
+     ```
+
+     Use `remote_submit_job` and `remote_poll_job` to manage the job. The server runs on the allocated node — get its hostname from `squeue -j $JOBID -h -o %N` (`-h` suppresses the header line).
+
+   - **Bare metal / Docker** — use `remote_run` to start the server directly:
+
+     ```bash
+     remote_run "nohup python -m vllm.entrypoints.openai.api_server --model <path> --port 8000 > deploy.log 2>&1 &"
+     ```
+
+4. **Verify remotely:**
+
+   ```bash
+   remote_run "curl -s http://localhost:8000/health"
+   remote_run "curl -s http://localhost:8000/v1/models"
+   ```
+
+5. **Report the endpoint** — include the remote hostname and port so the user can connect (e.g., `http://<node_hostname>:8000`). For SLURM, note that the port is only reachable from within the cluster network.
+
+For NEL-managed deployment (evaluation with self-deployment), use the evaluation skill instead — NEL handles SLURM container deployment, health checks, and teardown automatically.
+
+## Error Handling
+
+| Error | Cause | Fix |
+|-------|-------|-----|
+| `CUDA out of memory` | Model too large for GPU(s) | Increase `--tensor-parallel-size` or use a smaller model |
+| `quantization="modelopt" not recognized` | vLLM/SGLang version too old | Upgrade: vLLM >= 0.10.1, SGLang >= 0.4.10 |
+| `hf_quant_config.json not found` | Not a ModelOpt-exported checkpoint | Re-export with `export_hf_checkpoint()`, or remove `--quantization` flag |
+| `Connection refused` on health check | Server still starting | Wait 30-60s for large models; check logs for errors |
+| `modelopt_fp4 not supported` | Framework doesn't support FP4 for this model | Check support matrix in `references/support-matrix.md` |
+
+## Success Criteria
+
+1. Server process is running and healthy (`/health` returns 200)
+2. Model is listed at `/v1/models`
+3. Test generation produces coherent output
+4. Server URL and port are reported to the user
+5. If benchmarking was requested, throughput/latency numbers are reported
diff --git a/.claude/skills/deployment/references/setup.md b/.claude/skills/deployment/references/setup.md
new file mode 100644
index 00000000000..4209f08647b
--- /dev/null
+++ b/.claude/skills/deployment/references/setup.md
@@ -0,0 +1,85 @@
+# Deployment Environment Setup
+
+## Framework Installation
+
+### vLLM
+
+```bash
+pip install vllm
+```
+
+Minimum version: 0.10.1
+
+### SGLang
+
+```bash
+pip install "sglang[all]"
+```
+
+Minimum version: 0.4.10
+
+### TRT-LLM
+
+TRT-LLM is best installed via the NVIDIA container:
+
+```bash
+docker pull nvcr.io/nvidia/tensorrt-llm/release:<version>
+```
+
+Or via pip (requires CUDA toolkit):
+
+```bash
+pip install tensorrt-llm
+```
+
+Minimum version: 0.17.0
+
+## SLURM Deployment
+
+For SLURM clusters, deploy inside a container. Container flags MUST be on the `srun` line:
+
+```bash
+#!/bin/bash
+#SBATCH --job-name=deploy
+#SBATCH --account=<account>
+#SBATCH --partition=<partition>
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --gpus-per-node=<num_gpus>
+#SBATCH --time=04:00:00
+#SBATCH --output=deploy_%j.log
+
+srun \
+    --container-image="<path/to/container.sqsh>" \
+    --container-mounts="<data_root>:<data_root>" \
+    --container-workdir="<workdir>" \
+    --no-container-mount-home \
+    bash -c "python -m vllm.entrypoints.openai.api_server \
+        --model <checkpoint_path> \
+        --quantization modelopt \
+        --tensor-parallel-size <num_gpus> \
+        --host 0.0.0.0 --port 8000"
+```
+
+To access the server from outside the SLURM node, note the allocated hostname:
+
+```bash
+squeue -u $USER -o "%j %N %S"  # Get the node name
+# Then SSH tunnel or use the node's hostname directly
+```
+
+## Docker Deployment
+
+### vLLM with ModelOpt
+
+A Dockerfile is available at `examples/vllm_serve/Dockerfile`:
+
+```bash
+docker build -f examples/vllm_serve/Dockerfile -t vllm-modelopt .
+
+docker run --gpus all -p 8000:8000 vllm-modelopt \
+    python -m vllm.entrypoints.openai.api_server \
+        --model <checkpoint_path> \
+        --quantization modelopt \
+        --host 0.0.0.0 --port 8000
+```
diff --git a/.claude/skills/deployment/references/sglang.md b/.claude/skills/deployment/references/sglang.md
new file mode 100644
index 00000000000..62d5c57b591
--- /dev/null
+++ b/.claude/skills/deployment/references/sglang.md
@@ -0,0 +1,81 @@
+# SGLang Deployment Reference
+
+## Requirements
+
+- SGLang >= 0.4.10
+- `pip install "sglang[all]"` (quote the extra so shells such as zsh do not treat the brackets as a glob)
+
+## Server Deployment
+
+### As OpenAI-compatible server
+
+```bash
+python -m sglang.launch_server \
+    --model-path <checkpoint_path> \
+    --quantization modelopt \
+    --tp <num_gpus> \
+    --host 0.0.0.0 --port 8000
+```
+
+For NVFP4 checkpoints, use `--quantization modelopt_fp4`.
+
+### As Python API
+
+```python
+import sglang as sgl
+
+llm = sgl.Engine(model_path="<checkpoint_path>", quantization="modelopt")
+# For FP4: quantization="modelopt_fp4"
+
+sampling_params = {"temperature": 0.8, "top_p": 0.95}
+outputs = llm.generate(["Hello, my name is"], sampling_params)
+
+for output in outputs:
+    print(f"Generated: {output['text']}")
+```
+
+### From HuggingFace Hub
+
+```python
+import sglang as sgl
+
+llm = sgl.Engine(model_path="nvidia/Llama-3.1-8B-Instruct-FP8", quantization="modelopt")
+outputs = llm.generate(["What is AI?"], {"temperature": 0.8})
+```
+
+## Speculative Decoding
+
+SGLang supports speculative decoding with EAGLE and EAGLE3 models:
+
+```bash
+python -m sglang.launch_server \
+    --model-path <target_model> \
+    --speculative-algorithm EAGLE \
+    --speculative-draft-model-path <draft_model> \
+    --speculative-num-steps 3 \
+    --speculative-eagle-topk 4 \
+    --tp <num_gpus> \
+    --host 0.0.0.0 --port 8000
+```
+
+Reference: `examples/specdec_bench/specdec_bench/models/sglang.py`
+
+## Key SGLang Flags
+
+| Flag | Description |
+|------|-------------|
+| `--model-path` | Path to checkpoint or HF model ID |
+| `--quantization` | `modelopt` (FP8) or `modelopt_fp4` (FP4) |
+| `--tp` | Tensor parallelism size |
+| `--ep` | Expert parallelism (for MoE models) |
+| `--enable-torch-compile` | Enable torch.compile for better perf |
+| `--cuda-graph-max-bs` | Max batch size for CUDA graphs |
+| `--attention-backend` | `flashinfer` (default) or `triton` |
+
+## Common Issues
+
+| Issue | Fix |
+|-------|-----|
+| `quantization="modelopt"` not recognized | Upgrade SGLang to >= 0.4.10 |
+| DeepSeek FP4 not working | Check support matrix — SGLang FP4 support varies by model |
+| OOM on startup | Increase `--tp` or reduce `--max-total-tokens` |
diff --git a/.claude/skills/deployment/references/support-matrix.md b/.claude/skills/deployment/references/support-matrix.md
new file mode 100644
index 00000000000..8d0a6715375
--- /dev/null
+++ b/.claude/skills/deployment/references/support-matrix.md
@@ -0,0 +1,58 @@
+# Deployment Support Matrix
+
+## Unified HF Checkpoint — Framework Compatibility
+
+| Model | Quant Format | TRT-LLM | vLLM | SGLang |
+|-------|-------------|---------|------|--------|
+| Llama 3.x | FP8 | yes | yes | yes |
+| Llama 3.x | FP4 | yes | yes | yes |
+| Llama 4 | FP8 | yes | — | yes |
+| Llama 4 | FP4 | yes | — | — |
+| DeepSeek R1 | FP8 | yes | yes | yes |
+| DeepSeek R1 | FP4 | yes | yes | yes |
+| DeepSeek V3 | FP8 | yes | yes | yes |
+| DeepSeek V3 | FP4 | yes | yes | yes |
+| Qwen 3 | FP8 | yes | yes | yes |
+| Qwen 3 | FP4 | yes | yes | — |
+| Qwen 3 MoE | FP8 | yes | yes | yes |
+| Qwen 3 MoE | FP4 | yes | — | — |
+| Qwen 2.5 | FP8 | yes | yes | yes |
+| Qwen 2.5 | FP4 | yes | yes | — |
+| QwQ-32B | FP8 | yes | yes | yes |
+| QwQ-32B | FP4 | yes | yes | — |
+| Mixtral 8x7B | FP8 | yes | yes | yes |
+| Mixtral 8x7B | FP4 | yes | — | — |
+
+## Supported Quantization Formats
+
+| Format | Description |
+|--------|-------------|
+| FP8 | 8-bit floating point (E4M3) |
+| FP8_PB | 8-bit floating point with per-block scaling |
+| NVFP4 | NVIDIA 4-bit floating point |
+| NVFP4_AWQ | NVIDIA 4-bit floating point with AWQ optimization |
+| INT4_AWQ | 4-bit integer with AWQ (TRT-LLM only) |
+| W4A8_AWQ | 4-bit weights, 8-bit activations with AWQ (TRT-LLM only) |
+
+## Minimum Framework Versions
+
+| Framework | Minimum Version |
+|-----------|----------------|
+| TensorRT-LLM | v0.17.0 |
+| vLLM | v0.10.1 |
+| SGLang | v0.4.10 |
+
+## Quantization Flag by Framework
+
+| Framework | FP8 flag | FP4 flag |
+|-----------|----------|----------|
+| vLLM | `quantization="modelopt"` | `quantization="modelopt_fp4"` |
+| SGLang | `quantization="modelopt"` | `quantization="modelopt_fp4"` |
+| TRT-LLM | auto-detected from checkpoint | auto-detected from checkpoint |
+
+## Notes
+
+- **NVFP4 inference requires Blackwell GPUs** (B100, B200, GB200). Hopper can run FP4 calibration but not inference.
+- INT4_AWQ and W4A8_AWQ are only supported by TRT-LLM (not vLLM or SGLang).
+- Other models/formats may work but are not officially validated.
+- Source: `examples/llm_ptq/README.md` and `docs/source/deployment/3_unified_hf.rst`
diff --git a/.claude/skills/deployment/references/trtllm.md b/.claude/skills/deployment/references/trtllm.md
new file mode 100644
index 00000000000..5725bed3bf7
--- /dev/null
+++ b/.claude/skills/deployment/references/trtllm.md
@@ -0,0 +1,109 @@
+# TRT-LLM Deployment Reference
+
+## Requirements
+
+- TensorRT-LLM >= 0.17.0
+- Typically installed via NVIDIA container: `nvcr.io/nvidia/tensorrt-llm/release:<version>`
+- Or: `pip install tensorrt-llm`
+
+## Direct LLM API (recommended for unified HF checkpoints)
+
+### Python API
+
+```python
+from tensorrt_llm import LLM, SamplingParams
+
+llm = LLM(model="<checkpoint_path>")
+# Quantization format is auto-detected from hf_quant_config.json
+
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+outputs = llm.generate(["Hello, my name is"], sampling_params)
+
+for output in outputs:
+    print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")
+```
+
+### From HuggingFace Hub
+
+```python
+from tensorrt_llm import LLM
+
+llm = LLM(model="nvidia/Llama-3.1-8B-Instruct-FP8")
+print(llm.generate(["What is AI?"]))
+```
+
+### With tensor parallelism
+
+```python
+from tensorrt_llm import LLM
+
+llm = LLM(model="<checkpoint_path>", tensor_parallel_size=4)
+```
+
+## AutoDeploy (for AutoQuant / mixed-precision)
+
+AutoDeploy automates graph transformations for optimized inference. Required for AutoQuant checkpoints.
+
+### End-to-end script
+
+```bash
+# Quantize and deploy in one step
+./examples/llm_autodeploy/scripts/run_auto_quant_and_deploy.sh \
+    --hf_ckpt <model_path> \
+    --save_quantized_ckpt <output_path> \
+    --quant fp8,nvfp4 \
+    --effective_bits 4.5
+```
+
+Parameters:
+
+- `--hf_ckpt`: Path to unquantized HuggingFace checkpoint
+- `--save_quantized_ckpt`: Output path for quantized checkpoint
+- `--quant`: Quantization formats (e.g., `fp8,nvfp4`)
+- `--effective_bits`: Target precision (higher = more accuracy for sensitive layers)
+- `--world_size`: Number of GPUs for tensor parallelism
+- `--calib_batch_size`: Calibration batch size (reduce if OOM, default 8)
+
+### AutoDeploy API server
+
+```python
+# examples/llm_autodeploy/api_server.py provides a FastAPI server
+# with OpenAI-compatible endpoints using AutoDeploy
+```
+
+### Test AutoDeploy
+
+```bash
+python examples/llm_autodeploy/api_client.py --prompt "What is AI?" "What is golf?"
+```
+
+### Notes
+
+- NVFP4 in AutoDeploy requires Blackwell GPUs
+- For Hopper: remove `nvfp4` from `--quant` and set `--effective_bits` above 8.0
+- AutoDeploy supports CUDA graphs, torch compile backends, and KV cache optimization
+
+## Legacy TRT-LLM Checkpoint (deprecated)
+
+The legacy export path using `export_tensorrt_llm_checkpoint()` is deprecated. Use the unified HF checkpoint format with `export_hf_checkpoint()` instead.
+
+If you encounter a legacy checkpoint (no `hf_quant_config.json`, has `rank*.safetensors` pattern), it needs the TRT-LLM build API to create an engine before deployment. See `docs/source/deployment/1_tensorrt_llm.rst`.
+
+## Evaluation with TRT-LLM
+
+```bash
+# examples/llm_eval/lm_eval_tensorrt_llm.py
+# Runs lm_evaluation_harness benchmarks with TRT-LLM
+python examples/llm_eval/lm_eval_tensorrt_llm.py \
+    --model_path <checkpoint_path> \
+    --tasks gsm8k,mmlu
+```
+
+## Common Issues
+
+| Issue | Fix |
+|-------|-----|
+| `No module named tensorrt_llm` | Install via container or pip |
+| NVFP4 inference fails on Hopper | NVFP4 requires Blackwell GPUs for inference |
+| Slow first inference | Engine compilation happens on first run; subsequent runs are cached |
+| OOM during engine build | Reduce `--max_batch_size` or increase TP |
diff --git a/.claude/skills/deployment/references/vllm.md b/.claude/skills/deployment/references/vllm.md
new file mode 100644
index 00000000000..89e06bde424
--- /dev/null
+++ b/.claude/skills/deployment/references/vllm.md
@@ -0,0 +1,91 @@
+# vLLM Deployment Reference
+
+## Requirements
+
+- vLLM >= 0.10.1
+- `pip install vllm`
+
+## Realquant Deployment (recommended)
+
+Realquant uses dedicated quantized kernels for maximum performance. This is the default path for ModelOpt-exported checkpoints.
+
+### As OpenAI-compatible server
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+    --model <checkpoint_path> \
+    --quantization modelopt \
+    --tensor-parallel-size <num_gpus> \
+    --host 0.0.0.0 --port 8000 \
+    --served-model-name <model_name>
+```
+
+For NVFP4 checkpoints, use `--quantization modelopt_fp4`.
+
+### As Python API
+
+```python
+from vllm import LLM, SamplingParams
+
+llm = LLM(model="<checkpoint_path>", quantization="modelopt")
+# For FP4: quantization="modelopt_fp4"
+
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+outputs = llm.generate(["Hello, my name is"], sampling_params)
+
+for output in outputs:
+    print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")
+```
+
+### From HuggingFace Hub
+
+```python
+from vllm import LLM, SamplingParams
+
+llm = LLM(model="nvidia/Llama-3.1-8B-Instruct-FP8", quantization="modelopt")
+outputs = llm.generate(["What is AI?"], SamplingParams(temperature=0.8))
+```
+
+## Fakequant Deployment (research)
+
+Fakequant is 2-5x slower than realquant but doesn't require dedicated kernel support. Useful for research and testing new quantization schemes.
+
+Reference: `examples/vllm_serve/`
+
+```bash
+# Environment variables for configuration
+export QUANT_CFG=NVFP4_DEFAULT_CFG    # Quantization format
+export QUANT_CALIB_SIZE=512            # Calibration samples
+export QUANT_DATASET=cnn_dailymail     # Calibration dataset
+
+python examples/vllm_serve/vllm_serve_fakequant.py <model_path> \
+    -tp <num_gpus> --host 0.0.0.0 --port 8000
+```
+
+## Benchmarking
+
+```bash
+# Start server first, then benchmark with the vLLM benchmark CLI
+vllm bench serve \
+    --model <model_name> \
+    --port 8000 \
+    --num-prompts 100 \
+    --request-rate 10
+```
+
+Or use lm_eval for accuracy:
+
+```bash
+lm_eval --model local-completions \
+    --tasks gsm8k \
+    --model_args model=<model_name>,base_url=http://localhost:8000/v1/completions,num_concurrent=1,max_retries=3,tokenized_requests=False,batch_size=128
+```
+
+## Common Issues
+
+| Issue | Fix |
+|-------|-----|
+| `quantization="modelopt"` not recognized | Upgrade vLLM to >= 0.10.1 |
+| OOM on startup | Increase `--tensor-parallel-size` or reduce `--max-model-len` |
+| AWQ checkpoints not loading | AWQ is not supported in vLLM via modelopt path; use FP8 or NVFP4 |
+| Mixed precision not working | Not supported for fakequant |
diff --git a/.claude/skills/deployment/scripts/deploy.sh b/.claude/skills/deployment/scripts/deploy.sh
new file mode 100755
index 00000000000..a56d5e92eb6
--- /dev/null
+++ b/.claude/skills/deployment/scripts/deploy.sh
@@ -0,0 +1,447 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ModelOpt Deployment Script
+# Deploy quantized or unquantized models via vLLM, SGLang, or TRT-LLM
+# Supports ModelOpt FP8/FP4 checkpoints with automatic quantization flag detection
+
+set -e
+
+# Default configuration
+MODEL=""
+PORT=8000
+HOST="0.0.0.0"
+FRAMEWORK="vllm"
+TP_SIZE=1
+VRAM=0.9
+MAX_WAIT=300  # 5 min for large models
+QUANTIZATION=""  # auto-detected from checkpoint
+
+# Paths
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+LOG_DIR="${LOG_DIR:-/tmp/modelopt-deploy}"
+LOG_FILE="$LOG_DIR/server.log"
+PID_FILE="$LOG_DIR/server.pid"
+META_FILE="$LOG_DIR/server.meta"  # persists model/framework/port for status
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+log_info()    { printf "${BLUE}[INFO]${NC} %s\n" "$1"; }
+log_success() { printf "${GREEN}[OK]${NC} %s\n" "$1"; }
+log_warn()    { printf "${YELLOW}[WARN]${NC} %s\n" "$1"; }
+log_error()   { printf "${RED}[ERROR]${NC} %s\n" "$1"; }
+
+# Print the command/option summary to stdout and terminate the script.
+# Always exits with status 1, so control never returns to the caller.
+# The heredoc delimiter is unquoted, so $0 expands to the invoked script name.
+usage() {
+    cat <<EOF
+Usage: $0 <command> [OPTIONS]
+
+Commands:
+  start    - Start the inference server
+  stop     - Stop the inference server
+  test     - Test the API endpoint
+  status   - Show server status
+  restart  - Restart the server
+  detect   - Detect checkpoint format (without starting)
+
+Options:
+  --model PATH              Model path or HF model ID (required for start)
+  --framework FRAMEWORK     vllm, sglang, or trtllm (default: vllm)
+  --port PORT               Server port (default: 8000)
+  --tp SIZE                 Tensor parallel size (default: 1)
+  --quantization QUANT      Force quantization flag (modelopt, modelopt_fp4, or none)
+  --gpu-memory-utilization  GPU memory utilization 0.0-1.0 (default: 0.9)
+  --log-dir DIR             Log directory (default: /tmp/modelopt-deploy)
+
+Examples:
+  $0 start --model ./qwen3-0.6b-fp8
+  $0 start --model ./llama-70b-nvfp4 --framework sglang --tp 4
+  $0 start --model nvidia/Llama-3.1-8B-Instruct-FP8 --framework vllm
+  $0 test --port 8000
+  $0 stop
+EOF
+    exit 1
+}
+
+# ─── Checkpoint Detection ───────────────────────────────────────────
+
+# Decide which --quantization value a checkpoint needs.
+#
+# Arguments: $1 - local checkpoint directory, or a HF model ID
+# Outputs:   exactly one line on stdout: "modelopt", "modelopt_fp4" or "none".
+#            Progress messages are sent to stderr; callers capture this
+#            function with $(...), so anything else written to stdout would
+#            end up inside the returned value.
+detect_quantization() {
+    local model_path="$1"
+
+    # Not a local directory: treat it as a HF model ID and guess from the name
+    if [[ ! -d "$model_path" ]]; then
+        log_info "Model is a HF ID, checking if quantization flag is needed..." >&2
+        # HF hub models with FP8/FP4 in name likely need modelopt flag
+        if grep -qi "fp8" <<<"$model_path"; then
+            echo "modelopt"
+        elif grep -qi "fp4" <<<"$model_path"; then  # also matches "nvfp4"
+            echo "modelopt_fp4"
+        else
+            echo "none"
+        fi
+        return
+    fi
+
+    # Local checkpoint: check hf_quant_config.json
+    local quant_config="$model_path/hf_quant_config.json"
+    if [[ ! -f "$quant_config" ]]; then
+        log_info "No hf_quant_config.json found — treating as unquantized" >&2
+        echo "none"
+        return
+    fi
+    log_info "Found hf_quant_config.json" >&2
+
+    # A quant_algo containing "fp4" (case-insensitive) selects the FP4 flag.
+    # Any other value, or a config python3 cannot parse, falls back to "modelopt".
+    if python3 -c "
+import json, sys
+with open(sys.argv[1]) as f:
+    cfg = json.load(f)
+quant_algo = cfg.get('quantization', {}).get('quant_algo', '')
+print(quant_algo)
+" "$quant_config" 2>/dev/null | grep -qi "fp4"; then
+        echo "modelopt_fp4"
+    else
+        echo "modelopt"
+    fi
+}
+
+detect_gpu() {
+    if command -v nvidia-smi &>/dev/null; then
+        local gpu_count
+        gpu_count=$(nvidia-smi -L 2>/dev/null | wc -l)
+        local gpu_name
+        gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1)
+        log_info "GPUs: ${gpu_count}x ${gpu_name}"
+        echo "$gpu_count"
+    else
+        log_error "No NVIDIA GPU detected (nvidia-smi not found)"
+        echo "0"
+    fi
+}
+
+# ─── Server Management ──────────────────────────────────────────────
+
+is_server_running() {
+    if [[ -f "$PID_FILE" ]]; then
+        local pid
+        pid=$(cat "$PID_FILE")
+        if ps -p "$pid" >/dev/null 2>&1; then
+            return 0
+        fi
+    fi
+    return 1
+}
+
+start_server() {
+    if [[ -z "$MODEL" ]]; then
+        log_error "--model is required"
+        usage
+    fi
+
+    if is_server_running; then
+        log_warn "Server already running (PID: $(cat "$PID_FILE"))"
+        return 0
+    fi
+
+    mkdir -p "$LOG_DIR"
+
+    # Auto-detect quantization if not forced
+    if [[ -z "$QUANTIZATION" ]]; then
+        QUANTIZATION=$(detect_quantization "$MODEL")
+    fi
+    log_info "Quantization: $QUANTIZATION"
+
+    # Save metadata for status command
+    cat >"$META_FILE" <<METAEOF
+FRAMEWORK=$FRAMEWORK
+MODEL=$MODEL
+PORT=$PORT
+QUANTIZATION=$QUANTIZATION
+TP_SIZE=$TP_SIZE
+METAEOF
+
+    # Build and run the command
+    case "$FRAMEWORK" in
+        vllm)
+            start_vllm
+            ;;
+        sglang)
+            start_sglang
+            ;;
+        trtllm)
+            start_trtllm
+            ;;
+        *)
+            log_error "Unknown framework: $FRAMEWORK (use vllm, sglang, or trtllm)"
+            exit 1
+            ;;
+    esac
+
+    # Wait for server readiness
+    wait_for_server
+}
+
+# Start the vLLM OpenAI-compatible server in the background.
+#
+# Globals: reads MODEL, HOST, PORT, TP_SIZE, VRAM, QUANTIZATION, LOG_FILE;
+#          writes the server PID to PID_FILE.
+start_vllm() {
+    log_info "Starting vLLM server..."
+
+    local -a cmd=(python3 -m vllm.entrypoints.openai.api_server
+        --model "$MODEL"
+        --host "$HOST" --port "$PORT"
+        --tensor-parallel-size "$TP_SIZE"
+        --gpu-memory-utilization "$VRAM")
+
+    # Only pass --quantization when there is a real value to pass
+    if [[ -n "$QUANTIZATION" && "$QUANTIZATION" != "none" ]]; then
+        cmd+=(--quantization "$QUANTIZATION")
+    fi
+
+    # stop_server signals the process group whose ID equals the saved PID.
+    # Without job control a background job stays in this script's group, so
+    # run it through setsid (when installed) to make the server its own
+    # session/group leader. With job control active (set -m) the job is already
+    # a group leader and setsid would fork, which would invalidate $!.
+    local -a launcher=(nohup)
+    if [[ $- != *m* ]] && command -v setsid >/dev/null 2>&1; then
+        launcher+=(setsid)
+    fi
+
+    log_info "Command: ${cmd[*]}"
+    "${launcher[@]}" "${cmd[@]}" >"$LOG_FILE" 2>&1 &
+    echo $! >"$PID_FILE"
+    log_success "vLLM started (PID: $(cat "$PID_FILE"))"
+}
+
+# Start the SGLang server in the background.
+#
+# Globals: reads MODEL, HOST, PORT, TP_SIZE, QUANTIZATION, LOG_FILE;
+#          writes the server PID to PID_FILE.
+start_sglang() {
+    log_info "Starting SGLang server..."
+
+    local -a cmd=(python3 -m sglang.launch_server
+        --model-path "$MODEL"
+        --host "$HOST" --port "$PORT"
+        --tp "$TP_SIZE")
+
+    # Only pass --quantization when there is a real value to pass
+    if [[ -n "$QUANTIZATION" && "$QUANTIZATION" != "none" ]]; then
+        cmd+=(--quantization "$QUANTIZATION")
+    fi
+
+    # stop_server signals the process group whose ID equals the saved PID.
+    # Without job control a background job stays in this script's group, so
+    # run it through setsid (when installed) to make the server its own
+    # session/group leader. With job control active (set -m) the job is already
+    # a group leader and setsid would fork, which would invalidate $!.
+    local -a launcher=(nohup)
+    if [[ $- != *m* ]] && command -v setsid >/dev/null 2>&1; then
+        launcher+=(setsid)
+    fi
+
+    log_info "Command: ${cmd[*]}"
+    "${launcher[@]}" "${cmd[@]}" >"$LOG_FILE" 2>&1 &
+    echo $! >"$PID_FILE"
+    log_success "SGLang started (PID: $(cat "$PID_FILE"))"
+}
+
+# TRT-LLM is not launched by this script: print manual instructions for the
+# current MODEL and terminate with status 1. No PID file is written.
+start_trtllm() {
+    log_info "Starting TRT-LLM server..."
+    log_info "TRT-LLM uses the Python API directly (no OpenAI server built-in)"
+    log_info "For OpenAI-compatible serving, use AutoDeploy:"
+
+    # Unquoted heredoc: $MODEL is expanded, and each "\\" prints one literal
+    # backslash so the emitted command keeps its line continuations.
+    cat <<TRTEOF
+
+# Option 1: AutoDeploy (recommended)
+./examples/llm_autodeploy/scripts/run_auto_quant_and_deploy.sh \\
+    --hf_ckpt $MODEL \\
+    --save_quantized_ckpt <output_path> \\
+    --quant fp8,nvfp4 \\
+    --effective_bits 4.5
+
+# Option 2: Python API
+python3 -c "
+from tensorrt_llm import LLM, SamplingParams
+llm = LLM(model='$MODEL')
+print(llm.generate(['Hello, my name is'], SamplingParams(temperature=0.8)))
+"
+TRTEOF
+
+    log_warn "TRT-LLM server mode not yet automated in this script."
+    log_warn "Use vLLM or SGLang for OpenAI-compatible serving of ModelOpt checkpoints."
+    # Exits the whole script, so start_server never reaches wait_for_server
+    exit 1
+}
+
+wait_for_server() {
+    log_info "Waiting for server at http://localhost:$PORT ..."
+    local elapsed=0
+    while [[ $elapsed -lt $MAX_WAIT ]]; do
+        if curl -s "http://localhost:$PORT/health" >/dev/null 2>&1; then
+            log_success "Server is ready! (${elapsed}s)"
+            return 0
+        fi
+
+        # Check if process died
+        if ! is_server_running; then
+            log_error "Server process died. Check logs: $LOG_FILE"
+            tail -20 "$LOG_FILE" 2>/dev/null
+            exit 1
+        fi
+
+        sleep 5
+        elapsed=$((elapsed + 5))
+        printf "."
+    done
+
+    echo ""
+    log_error "Server not ready after ${MAX_WAIT}s. Check logs: $LOG_FILE"
+    tail -20 "$LOG_FILE" 2>/dev/null
+    exit 1
+}
+
+# Stop the server recorded in PID_FILE: SIGTERM first, SIGKILL after 15s.
+#
+# Globals: reads PID_FILE, META_FILE and removes both once the server is gone.
+# Returns: 0 when the server was stopped or was not running.
+# Exits:   1 when PID_FILE holds something that must not be signalled.
+stop_server() {
+    if ! is_server_running; then
+        log_warn "Server is not running"
+        # Drop stale state left behind by a server that died on its own
+        rm -f "$PID_FILE" "$META_FILE"
+        return 0
+    fi
+
+    local pid
+    pid=$(cat "$PID_FILE")
+
+    # The PID is negated below to address a process group. A value of 1 would
+    # become "kill -- -1" (signal every process we may signal), and a
+    # non-numeric value is meaningless, so refuse both.
+    if [[ ! "$pid" =~ ^[0-9]+$ ]] || ((10#$pid <= 1)); then
+        log_error "Refusing to signal invalid PID '$pid' read from $PID_FILE"
+        exit 1
+    fi
+    log_info "Stopping server (PID: $pid)..."
+
+    # Kill the process group to catch child processes (vLLM/SGLang may fork);
+    # fall back to the single process when no such group exists
+    kill -- -"$pid" 2>/dev/null || kill "$pid" 2>/dev/null || true
+
+    # Wait for graceful shutdown
+    local attempt
+    for attempt in {1..15}; do
+        if ! ps -p "$pid" >/dev/null 2>&1; then
+            rm -f "$PID_FILE" "$META_FILE"
+            log_success "Server stopped"
+            return 0
+        fi
+        sleep 1
+    done
+
+    # Force kill
+    log_warn "Force killing..."
+    kill -9 -- -"$pid" 2>/dev/null || kill -9 "$pid" 2>/dev/null || true
+    rm -f "$PID_FILE" "$META_FILE"
+    log_success "Server stopped (forced)"
+}
+
+# Smoke-test the running server: health check, model listing and one short
+# completion request.
+#
+# Globals: reads PORT.
+# Exits:   1 when the server does not answer, no model ID can be read from
+#          /v1/models, or the completion response carries no text.
+test_api() {
+    local base_url="http://localhost:$PORT"
+    log_info "Testing API at $base_url ..."
+
+    # Health check (-f: an HTTP error status counts as "not responding")
+    if ! curl -sf "$base_url/health" >/dev/null 2>&1; then
+        log_error "Server not responding at port $PORT"
+        exit 1
+    fi
+    log_success "Health check passed"
+
+    # List models. Every "|| var=" fallback below exists because, under
+    # set -e, a failing $(...) assignment would otherwise terminate the script
+    # before the explicit error message is printed.
+    local models_json
+    models_json=$(curl -s "$base_url/v1/models") || models_json=""
+    log_info "Available models:"
+    python3 -m json.tool <<<"$models_json" 2>/dev/null || true
+
+    # Test completion
+    log_info "Sending test request..."
+    local model_id
+    model_id=$(python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+print(data['data'][0]['id'])
+" <<<"$models_json" 2>/dev/null) || model_id=""
+
+    if [[ -z "$model_id" ]]; then
+        log_error "Could not determine model ID from /v1/models endpoint"
+        exit 1
+    fi
+
+    # Build the request body with json.dumps so a model ID containing quotes
+    # or backslashes still yields valid JSON
+    local payload
+    payload=$(python3 -c '
+import json, sys
+print(json.dumps({
+    "model": sys.argv[1],
+    "prompt": "The capital of France is",
+    "max_tokens": 32,
+    "temperature": 0.7,
+}))
+' "$model_id")
+
+    local response
+    response=$(curl -s "$base_url/v1/completions" \
+        -H "Content-Type: application/json" \
+        -d "$payload") || response=""
+
+    # Pretty-print when the response is JSON, otherwise show it raw
+    python3 -m json.tool <<<"$response" 2>/dev/null || echo "$response"
+
+    local text
+    text=$(python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+print(data['choices'][0]['text'])
+" <<<"$response" 2>/dev/null) || text=""
+
+    if [[ -n "$text" ]]; then
+        log_success "API test passed!"
+        printf "${GREEN}Response:${NC} %s\n" "$text"
+    else
+        log_error "No valid response from API"
+        exit 1
+    fi
+}
+
+# Print whether the server is running and, if so, the framework, model,
+# endpoint and the last five log lines.
+#
+# Globals: reads PID_FILE, META_FILE, LOG_FILE, and FRAMEWORK/MODEL/PORT as
+#          fallbacks when META_FILE is missing.
+show_status() {
+    echo "=== ModelOpt Deployment Status ==="
+    echo ""
+    if is_server_running; then
+        local pid
+        pid=$(cat "$PID_FILE")
+        log_success "Server running (PID: $pid)"
+
+        # Read saved metadata if available. The file holds unquoted KEY=VALUE
+        # lines, so it is parsed as data rather than sourced: sourcing would
+        # execute a value containing spaces or shell metacharacters.
+        local framework="$FRAMEWORK" model="$MODEL" port="$PORT"
+        if [[ -f "$META_FILE" ]]; then
+            local key value
+            # Split on the first "=" only; later "=" characters stay in value
+            while IFS='=' read -r key value || [[ -n "$key" ]]; do
+                case "$key" in
+                    FRAMEWORK) framework="$value" ;;
+                    MODEL)     model="$value" ;;
+                    PORT)      port="$value" ;;
+                esac
+            done <"$META_FILE"
+        fi
+
+        echo "  Framework:    ${framework:-unknown}"
+        echo "  Model:        ${model:-unknown}"
+        echo "  Endpoint:     http://localhost:${port:-8000}"
+        echo "  Logs:         $LOG_FILE"
+        echo ""
+        if [[ -f "$LOG_FILE" ]]; then
+            echo "Recent logs:"
+            tail -5 "$LOG_FILE"
+        fi
+    else
+        log_warn "Server is not running"
+        echo "  Start with: $0 start --model <path>"
+    fi
+}
+
+# ─── Argument Parsing ────────────────────────────────────────────────
+
+COMMAND=""
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)               MODEL="$2"; shift 2 ;;
+        --framework)           FRAMEWORK="$2"; shift 2 ;;
+        --port)                PORT="$2"; shift 2 ;;
+        --tp)                  TP_SIZE="$2"; shift 2 ;;
+        --quantization)        QUANTIZATION="$2"; shift 2 ;;
+        --gpu-memory-utilization) VRAM="$2"; shift 2 ;;
+        --log-dir)             LOG_DIR="$2"; LOG_FILE="$LOG_DIR/server.log"; PID_FILE="$LOG_DIR/server.pid"; META_FILE="$LOG_DIR/server.meta"; shift 2 ;;
+        start|stop|test|status|restart|detect)
+            COMMAND="$1"; shift ;;
+        *)
+            log_error "Unknown option: $1"
+            usage ;;
+    esac
+done
+
+if [[ -z "$COMMAND" ]]; then
+    usage
+fi
+
+# Execute
+case "$COMMAND" in
+    start)   start_server ;;
+    stop)    stop_server ;;
+    test)    test_api ;;
+    status)  show_status ;;
+    restart) stop_server; sleep 2; start_server ;;
+    detect)
+        if [[ -z "$MODEL" ]]; then
+            log_error "--model is required for detect"
+            exit 1
+        fi
+        quant=$(detect_quantization "$MODEL")
+        echo "Detected quantization: $quant"
+        ;;
+    *)       usage ;;
+esac
diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
new file mode 100644
index 00000000000..e5268c4d2ef
--- /dev/null
+++ b/.claude/skills/evaluation/SKILL.md
@@ -0,0 +1,378 @@
+---
+name: evaluation
+description: Evaluate accuracy of quantized or unquantized LLMs using NeMo Evaluator Launcher (NEL). Use when user says "evaluate model", "benchmark accuracy", "run MMLU", "evaluate quantized model", "accuracy drop", "run nel", or needs to measure how quantization affects model quality. Handles model deployment, config generation, and evaluation execution.
+license: Apache-2.0
+# Based on nel-assistant skill from NeMo Evaluator Launcher (commit f1fa073)
+# https://github.com/NVIDIA-NeMo/Evaluator/tree/f1fa073/packages/nemo-evaluator-launcher/.claude/skills/nel-assistant
+# Modifications: renamed to evaluation, added workspace management (Step 0),
+# auto-detect ModelOpt quantization format, quantization-aware benchmark defaults.
+---
+
+## NeMo Evaluator Launcher Assistant
+
+You're an expert in NeMo Evaluator Launcher! Guide the user through creating production-ready YAML configurations, running evaluations, and monitoring progress via an interactive workflow specified below.
+
+### Workspace (multi-user / Slack bot)
+
+If `MODELOPT_WORKSPACE_ROOT` is set, read `skills/common/workspace-management.md`. Check for existing workspaces — especially if evaluating a model from a prior PTQ or deployment step. Reuse the existing workspace so you have access to the quantized checkpoint and any code modifications.
+
+### Workflow
+
+```text
+Config Generation Progress:
+- [ ] Step 0: Check workspace (if MODELOPT_WORKSPACE_ROOT is set)
+- [ ] Step 1: Check if nel is installed
+- [ ] Step 2: Build the base config file
+- [ ] Step 3: Configure model path and parameters
+- [ ] Step 4: Fill in remaining missing values
+- [ ] Step 5: Confirm tasks (iterative)
+- [ ] Step 6: Advanced - Multi-node
+- [ ] Step 7: Advanced - Interceptors
+- [ ] Step 8: Run the evaluation
+```
+
+**Step 1: Check if nel is installed**
+
+Test that `nel` is installed with `nel --version`.
+
+If not, instruct the user to `pip install nemo-evaluator-launcher`.
+
+**Step 2: Build the base config file**
+
+Prompt the user with "I'll ask you 5 questions to build the base config we'll adjust in the next steps". Guide the user through the 5 questions using AskUserQuestion:
+
+1. Execution:
+  - Local
+  - SLURM
+2. Deployment:
+  - None (External)
+  - vLLM
+  - SGLang
+  - NIM
+  - TRT-LLM
+3. Auto-export:
+  - None (auto-export disabled)
+  - MLflow
+  - wandb
+4. Model type:
+  - Base
+  - Chat
+  - Reasoning
+5. Benchmarks:
+  Allow for multiple choices in this question.
+  1. Standard LLM Benchmarks (like MMLU, IFEval, GSM8K, ...)
+  2. Code Evaluation (like HumanEval, MBPP, and LiveCodeBench)
+  3. Math & Reasoning (like AIME, GPQA, MATH-500, ...)
+  4. Safety & Security (like Garak and Safety Harness)
+  5. Multilingual (like MMATH, Global MMLU, MMLU-Prox)
+
+DON'T ALLOW FOR ANY OTHER OPTIONS, only the ones listed above under each category (Execution, Deployment, Auto-export, Model type, Benchmarks). YOU HAVE TO GATHER THE ANSWERS for the 5 questions before you can build the base config.
+
+When you have all the answers, run the script to build the base config:
+
+```bash
+nel skills build-config --execution <local|slurm> --deployment <none|vllm|sglang|nim|trtllm> --model_type <base|chat|reasoning> --benchmarks <standard|code|math_reasoning|safety|multilingual> [--export <none|mlflow|wandb>] [--output <OUTPUT>]
+```
+
+Where `--output` depends on what the user provides:
+
+- Omit: Uses current directory with auto-generated filename
+- Directory: Writes to that directory with auto-generated filename
+- File path (*.yaml): Writes to that specific file
+
+It never overwrites existing files.
+
+**Step 3: Configure model path and parameters**
+
+Ask for model path. Determine type:
+
+- Checkpoint path (starts with `/` or `./`) → set `deployment.checkpoint_path: <path>` and `deployment.hf_model_handle: null`
+- HF handle (e.g., `org/model-name`) → set `deployment.hf_model_handle: <handle>` and `deployment.checkpoint_path: null`
+
+**Auto-detect ModelOpt quantization format** (checkpoint paths only):
+
+Check for `hf_quant_config.json` in the checkpoint directory:
+
+```bash
+cat <checkpoint_path>/hf_quant_config.json 2>/dev/null
+```
+
+If found, read `quantization.quant_algo` and set the correct vLLM/SGLang quantization flag in `deployment.extra_args`:
+
+| `quant_algo` | Flag to add |
+|-------------|-------------|
+| `FP8` | `--quantization modelopt` |
+| `W4A8_AWQ` | `--quantization modelopt` |
+| `NVFP4`, `NVFP4_AWQ` | `--quantization modelopt_fp4` |
+
+If no `hf_quant_config.json`, the checkpoint is unquantized — no flag needed.
+
+**Quantization-aware benchmark defaults:**
+
+When a quantized checkpoint is detected, recommend benchmarks sensitive to quantization accuracy loss:
+
+- **Always include**: MMLU (general knowledge, most affected by quantization)
+- **Recommended**: GSM8K (math reasoning — sensitive to precision loss), ARC-Challenge (reasoning)
+- **Good to add**: HumanEval (code generation — catches subtle degradation), Winogrande (commonsense)
+- **Less useful for quant comparison**: IFEval (instruction following — rarely affected by quantization)
+
+Present these recommendations to the user and ask which to include. If the user already specified benchmarks, keep their choice but mention any accuracy-sensitive benchmarks they may have missed.
+
+Use WebSearch to find the model card (HuggingFace, build.nvidia.com). Read the FULL text carefully — the devil is in the details. Extract ALL relevant configurations:
+
+- Sampling params (`temperature`, `top_p`)
+- Context length (`deployment.extra_args: "--max-model-len <value>"`)
+- TP/DP settings (to set them appropriately, use AskUserQuestion to ask how many GPUs the model will be deployed on)
+- Reasoning config (if applicable):
+  - reasoning on/off: use either:
+    - `adapter_config.custom_system_prompt` (like `/think`, `/no_think`) and no `adapter_config.params_to_add` (leave `params_to_add` unrelated to reasoning untouched)
+    - `adapter_config.params_to_add` for payload modifier (like `"chat_template_kwargs": {"enable_thinking": true/false}`) and no `adapter_config.custom_system_prompt` and `adapter_config.use_system_prompt: false` (leave `custom_system_prompt` and `use_system_prompt` unrelated to reasoning untouched).
+  - reasoning effort/budget (if it's configurable, AskUserQuestion what reasoning effort they want)
+  - higher `max_new_tokens`
+  - etc.
+- Deployment-specific `extra_args` for vLLM/SGLang (look for the vLLM/SGLang deployment command)
+- Deployment-specific vLLM/SGLang versions (by default we use latest docker images, but you can control it with `deployment.image` e.g. vLLM above `vllm/vllm-openai:v0.11.0` stopped supporting `rope-scaling` arg used by Qwen models)
+- ARM64 / non-standard GPU compatibility: The default `vllm/vllm-openai` image only supports common GPU architectures. For ARM64 platforms or GPUs with non-standard compute capabilities (e.g., NVIDIA GB10 with sm_121), use NGC vLLM images instead:
+  - Example: `deployment.image: nvcr.io/nvidia/vllm:26.01-py3`
+  - AskUserQuestion about their GPU architecture if the model card doesn't specify deployment constraints
+- Any preparation requirements (e.g., downloading reasoning parsers, custom plugins):
+  - If the model card mentions downloading files (like reasoning parsers, custom plugins) before deployment, add `deployment.pre_cmd` with the download command
+  - Use `curl` instead of `wget` as it's more widely available in Docker containers
+  - Example: `pre_cmd: curl -L -o reasoning_parser.py https://huggingface.co/.../reasoning_parser.py`
+  - When using `pip install` in `pre_cmd`, always use `--no-cache-dir` to avoid cross-device link errors in Docker containers (the pip cache and temp directories may be on different filesystems)
+  - Example: `pre_cmd: pip3 install --no-cache-dir flash-attn --no-build-isolation`
+- Any other model-specific requirements
+
+Remember to check `evaluation.nemo_evaluator_config` and `evaluation.tasks.*.nemo_evaluator_config` overrides too for parameters to adjust (e.g. disabling reasoning)!
+
+Present findings, explain each setting, ask user to confirm or adjust. If no model card found, ask user directly for the above configurations.
+
+**Step 4: Fill in remaining missing values**
+
+- Find all remaining `???` missing values in the config.
+- Ask the user only for values that couldn't be auto-discovered from the model card (e.g., SLURM hostname, account, output directory, MLflow/wandb tracking URI). Don't propose any defaults here. Let the user give you the values in plain text.
+- Ask the user if they want to change any other defaults e.g. execution partition or walltime (if running on SLURM) or add MLflow/wandb tags (if auto-export enabled).
+
+**Step 5: Confirm tasks (iterative)**
+
+Show tasks in the current config. Loop until the user confirms the task list is final:
+
+1. Tell the user: "Run `nel ls tasks` to see all available tasks".
+2. Ask if they want to add/remove tasks or add/remove/modify task-specific parameter overrides.
+   To add per-task `nemo_evaluator_config` as specified by the user, e.g.:
+
+   ```yaml
+   tasks:
+     - name: <task>
+       nemo_evaluator_config:
+         config:
+           params:
+             temperature: <value>
+             max_new_tokens: <value>
+             ...
+   ```
+
+3. Apply changes.
+4. Show updated list and ask: "Is the task list final, or do you want to make more changes?"
+
+**Known Issues**
+
+- NeMo-Skills workaround (self-deployment only): If using `nemo_skills.*` tasks with self-deployment (vLLM/SGLang/NIM), add at top level:
+
+  ```yaml
+  target:
+    api_endpoint:
+      api_key_name: DUMMY_API_KEY
+  ```
+
+  For the None (External) deployment the `api_key_name` should be already defined. The `DUMMY_API_KEY` export is handled in Step 8.
+
+**Step 6: Advanced - Multi-node**
+
+There are two multi-node patterns. Ask the user which applies:
+
+**Pattern A: Multi-instance (independent instances with HAProxy)**
+
+Use only if the model has >120B parameters or the user wants more throughput. Explain: "Each node runs an independent deployment instance. HAProxy load-balances requests across all instances."
+
+```yaml
+execution:
+    num_nodes: 4       # Total nodes
+    num_instances: 4   # 4 independent instances → HAProxy auto-enabled
+```
+
+**Pattern B: Multi-node single instance (Ray TP/PP across nodes)**
+
+Use this when a single model is too large for one node and needs pipeline parallelism across nodes. Use the `vllm_ray` deployment config:
+
+```yaml
+defaults:
+  - deployment: vllm_ray   # Built-in Ray cluster setup (replaces manual pre_cmd)
+
+execution:
+    num_nodes: 2           # Single instance spanning 2 nodes
+
+deployment:
+    tensor_parallel_size: 8
+    pipeline_parallel_size: 2
+```
+
+**Pattern A+B combined: Multi-instance with multi-node instances**
+
+For very large models needing both cross-node parallelism AND multiple instances:
+
+```yaml
+defaults:
+  - deployment: vllm_ray
+
+execution:
+    num_nodes: 4       # Total nodes
+    num_instances: 2   # 2 instances of 2 nodes each → HAProxy auto-enabled
+
+deployment:
+    tensor_parallel_size: 8
+    pipeline_parallel_size: 2
+```
+
+**Common Confusions**
+
+- **`num_instances`** controls independent deployment instances with HAProxy. **`data_parallel_size`** controls DP replicas *within* a single instance.
+- Global data parallelism is `num_instances x data_parallel_size` (e.g., 2 instances x 8 DP each = 16 replicas).
+- With multi-instance, `parallelism` in task config is the total concurrent requests across all instances, not per-instance.
+- `num_nodes` must be divisible by `num_instances`.
+
+**Step 7: Advanced - Interceptors**
+
+- Tell the user to see: <https://docs.nvidia.com/nemo/evaluator/latest/libraries/nemo-evaluator/interceptors/index.html>.
+- DON'T provide any general information about what interceptors typically do in API frameworks without reading the docs. If the user asks about interceptors, only then read the webpage to provide precise information.
+- If the user asks you to configure some interceptor, then read the webpage of this interceptor and configure it according to the `--overrides` syntax but put the values in the YAML config under `evaluation.nemo_evaluator_config.config.target.api_endpoint.adapter_config` (NOT under `target.api_endpoint.adapter_config`) instead of using CLI overrides.
+  Defining the `interceptors` list would override the full chain of interceptors, which can have unintended consequences such as disabling default interceptors. Instead, configure interceptors in the YAML config using the fields listed in the `CLI Configuration` section after the `--overrides` keyword.
+
+**Documentation Errata**
+
+- The docs may show incorrect parameter names for logging. Use `max_logged_requests` and `max_logged_responses` (NOT `max_saved_*` or `max_*`).
+
+**Step 8: Run the evaluation**
+
+Print the following commands to the user. Propose to execute them in order to confirm the config works as expected before the full run.
+
+**Important**: Export required environment variables based on your config. If any tokens or keys are missing (e.g. `HF_TOKEN`, `NGC_API_KEY`, `api_key_name` from the config), ask the user to put them in a `.env` file in the project root so you can run `set -a && source .env && set +a` (or equivalent) before executing `nel run` commands.
+
+```bash
+# If using pre_cmd or post_cmd:
+export NEMO_EVALUATOR_TRUST_PRE_CMD=1
+
+# If using nemo_skills.* tasks with self-deployment:
+export DUMMY_API_KEY=dummy
+```
+
+1. **Dry-run** (validates config without running):
+
+   ```bash
+   nel run --config <config_path> --dry-run
+   ```
+
+2. **Test with limited samples** (quick validation run):
+
+   ```bash
+   nel run --config <config_path> -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=10
+   ```
+
+3. **Re-run a single task** (useful for debugging or re-testing after config changes):
+
+   ```bash
+   nel run --config <config_path> -t <task_name>
+   ```
+
+   Combine with `-o` for limited samples: `nel run --config <config_path> -t <task_name> -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=10`
+
+4. **Full evaluation** (production run):
+
+   ```bash
+   nel run --config <config_path>
+   ```
+
+After the dry-run, check the output from `nel` for any problems with the config. If there are no problems, propose to first execute the test run with limited samples and then execute the full evaluation. If there are problems, resolve them before executing the full evaluation.
+
+**Monitoring Progress**
+
+After job submission, you can monitor progress using:
+
+1. **Check job status:**
+
+   ```bash
+   nel status <invocation_id>
+   nel info <invocation_id>
+   ```
+
+2. **Stream logs** (Local execution only):
+
+   ```bash
+   nel logs <invocation_id>
+   ```
+
+   Note: `nel logs` is not supported for SLURM execution.
+
+3. **Inspect logs via SSH** (SLURM workaround):
+
+   When `nel logs` is unavailable (SLURM), use SSH to inspect logs directly:
+
+   First, get log locations:
+
+   ```bash
+   nel info <invocation_id> --logs
+   ```
+
+   Then, use SSH to view logs:
+
+   **Check server deployment logs:**
+
+   ```bash
+   ssh <username>@<hostname> "tail -100 <log path from `nel info <invocation_id> --logs`>/server-<slurm_job_id>-*.log"
+   ```
+
+   Shows vLLM server startup, model loading, and deployment errors (e.g., missing wget/curl).
+
+   **Check evaluation client logs:**
+
+   ```bash
+   ssh <username>@<hostname> "tail -100 <log path from `nel info <invocation_id> --logs`>/client-<slurm_job_id>.log"
+   ```
+
+   Shows evaluation progress, task execution, and results.
+
+   **Check SLURM scheduler logs:**
+
+   ```bash
+   ssh <username>@<hostname> "tail -100 <log path from `nel info <invocation_id> --logs`>/slurm-<slurm_job_id>.log"
+   ```
+
+   Shows job scheduling, health checks, and overall execution flow.
+
+   **Search for errors:**
+
+   ```bash
+   ssh <username>@<hostname> "grep -i 'error\|warning\|failed' <log path from `nel info <invocation_id> --logs`>/*.log"
+   ```
+
+---
+
+Direct users with issues to:
+
+- **GitHub Issues:** <https://github.com/NVIDIA-NeMo/Evaluator/issues>
+- **GitHub Discussions:** <https://github.com/NVIDIA-NeMo/Evaluator/discussions>
+
+Now, copy this checklist and track your progress:
+
+```text
+Config Generation Progress:
+- [ ] Step 0: Check workspace (if MODELOPT_WORKSPACE_ROOT is set)
+- [ ] Step 1: Check if nel is installed
+- [ ] Step 2: Build the base config file
+- [ ] Step 3: Configure model path and parameters
+- [ ] Step 4: Fill in remaining missing values
+- [ ] Step 5: Confirm tasks (iterative)
+- [ ] Step 6: Advanced - Multi-node
+- [ ] Step 7: Advanced - Interceptors
+- [ ] Step 8: Run the evaluation
+```
diff --git a/.claude/skills/evaluation/evals/nemotron3-nano-bf16-reasoning.json b/.claude/skills/evaluation/evals/nemotron3-nano-bf16-reasoning.json
new file mode 100644
index 00000000000..6fb32570eb3
--- /dev/null
+++ b/.claude/skills/evaluation/evals/nemotron3-nano-bf16-reasoning.json
@@ -0,0 +1,26 @@
+{
+  "skills": ["evaluation"],
+  "query": "Help me evaluate Nemotron 3 Nano BF16 from NVIDIA",
+  "files": [],
+  "expected_behavior": [
+    "Verifies nel is installed by running 'nel --version'",
+    "Asks all 5 base config questions (execution, deployment, auto-export, model type, benchmarks) before generating the config",
+    "Runs 'nel skills build-config' with correct flags matching user answers: --execution slurm --deployment vllm --model-type reasoning --benchmarks standard code math_reasoning --export mlflow",
+    "Searches the web for the model card on HuggingFace and extracts model-specific settings",
+    "Sets correct HF handle: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
+    "Sets reasoning sampling params from model card: temperature=1.0, top_p=1.0",
+    "Configures reasoning toggle via params_to_add with chat_template_kwargs.enable_thinking (not via system prompt)",
+    "Disables reasoning for IFEval task using enable_thinking: false with use_system_prompt: false",
+    "Adds deployment.pre_cmd using curl (not wget) to download nano_v3_reasoning_parser.py from HuggingFace",
+    "Adds vLLM extra_args including --trust-remote-code, --reasoning-parser-plugin, --reasoning-parser nano_v3, --max-num-seqs 8",
+    "Pins vLLM image to v0.12.0 or later as required by model card",
+    "Adds target.api_endpoint.api_key_name: DUMMY_API_KEY for nemo_skills tasks with self-deployment",
+    "Fills in all ??? placeholders after asking the user for SLURM hostname, account, output_dir, MLflow tracking_uri, and experiment_name",
+    "Applies user-requested SLURM customizations: partition batch_short, walltime 00:20:00, MLflow tag scenario: demo",
+    "Presents task list and waits for user confirmation before proceeding",
+    "Configures request and response logging interceptors under evaluation.nemo_evaluator_config.config.target.api_endpoint.adapter_config using correct field names (max_logged_requests/max_logged_responses, not max_saved_*)",
+    "Handles dry-run failure for missing HF_TOKEN_FOR_GPQA_DIAMOND by offering to fix the config",
+    "Successfully submits test run with limit_samples=10 after dry-run passes",
+    "Provides monitoring commands (nel status, nel info --logs) and inspects server logs via SSH when asked"
+  ]
+}
diff --git a/.claude/skills/modelopt/SKILL.md b/.claude/skills/modelopt/SKILL.md
new file mode 100644
index 00000000000..da9cb9ea208
--- /dev/null
+++ b/.claude/skills/modelopt/SKILL.md
@@ -0,0 +1,90 @@
+---
+name: modelopt
+description: End-to-end model optimization pipeline that chains quantization with deployment or evaluation. Use when user says "optimize model end-to-end", "quantize and deploy", "quantize and serve", "quantize and evaluate", "quantize and benchmark accuracy", "full optimization loop", "run the full pipeline", "optimize and test accuracy", "find best quantization recipe", or wants to go from a pretrained model to a deployed or accuracy-verified quantized model. Do NOT use for individual tasks like only quantizing (use ptq), only deploying (use deployment), or only evaluating (use evaluation).
+license: Apache-2.0
+---
+
+# ModelOpt Optimizer — Pipeline Orchestrator
+
+Orchestrates optimization pipelines by chaining skills. Supports two modes:
+
+1. **PTQ + Deploy** — quantize then serve as an API endpoint
+2. **PTQ + Evaluate** — quantize then benchmark accuracy (evaluation handles deployment internally)
+
+This skill delegates to sub-skills. **Do not duplicate their logic — invoke them.**
+
+## Workspace Management
+
+If `MODELOPT_WORKSPACE_ROOT` is set (multi-user / Slack bot), read `skills/common/workspace-management.md` first. **All sub-skills in the pipeline must run in the same workspace** so they share the checkpoint and any code modifications. Create or reuse a workspace named after the model (e.g., `qwen3-0.6b`, `llama-3.1-8b-fp8`) before invoking any sub-skill.
+
+## Pipeline Selection
+
+Determine which pipeline the user needs:
+
+| User says | Pipeline |
+|-----------|----------|
+| "quantize and deploy", "quantize and serve" | PTQ + Deploy |
+| "quantize and evaluate", "optimize end-to-end", "find best recipe" | PTQ + Evaluate |
+
+If the user only wants quantization without deploy/eval, the `ptq` skill handles it directly — this skill should not be used.
+
+If unclear, ask: **"After quantization, do you want to (a) deploy the model as a server, (b) evaluate accuracy, or (c) just get the checkpoint?"** If they answer (c), hand off to the `ptq` skill.
+
+## Step 1: Gather Info
+
+Collect from the user (skip what's already provided):
+
+1. **Model path** — local path or HuggingFace model ID (save this for baseline comparison in Step 4)
+2. **Quantization format** — e.g., fp8, nvfp4, int4_awq (or "recommend one")
+3. **Execution target** — local GPU or remote cluster. Check for `~/.config/modelopt/clusters.yaml` or `.claude/clusters.yaml`. If found, ask which cluster to use. Both sub-skills support remote execution via `remote_exec.sh`.
+4. **GPU IDs** — which GPUs to use (default: `0`; skip if remote — sub-skills handle GPU allocation via SLURM)
+5. For Deploy pipeline: **Deployment framework** — vLLM, SGLang, or TRT-LLM (default: vLLM)
+6. For Evaluate pipeline: **Evaluation tasks** — default: `mmlu`
+
+## Step 2: Quantize
+
+**Invoke the `ptq` skill.** It handles environment detection, model compatibility, format selection, job submission, and checkpoint verification.
+
+Input: model path, quantization format, export path, GPU IDs.
+Output: quantized checkpoint at export path.
+
+## Step 3: Deploy or Evaluate
+
+### PTQ + Deploy
+
+**Invoke the `deployment` skill.** It starts an inference server with the quantized checkpoint.
+
+Input: checkpoint path, framework, GPU IDs, port.
+Output: running server at `http://localhost:<port>`.
+
+### PTQ + Evaluate
+
+**Invoke the `evaluation` skill.** It handles deploying the quantized model, configuring NEL evaluation, running benchmarks, and collecting results.
+
+Input: quantized checkpoint path, evaluation tasks.
+Output: accuracy scores per task.
+
+## Step 4: Baseline Comparison (PTQ + Evaluate only)
+
+After evaluation completes, ask: **"Would you like to compare against the unquantized baseline?"**
+
+If yes:
+1. Run the evaluation skill again with the **original model path** (from Step 1) and the same benchmark tasks
+2. Present a side-by-side comparison table:
+
+```text
+| Benchmark | BF16 (baseline) | FP8 (quantized) | Delta |
+|-----------|-----------------|-----------------|-------|
+| MMLU      | 67.3%           | 65.2%           | -2.1% |
+| GSM8K     | 54.1%           | 52.8%           | -1.3% |
+```
+
+3. Flag any benchmark with an accuracy drop of more than 2 percentage points — suggest trying a lighter quantization format
+
+## Step 5: Present Results and Iterate
+
+Show results and ask: **"Are you satisfied with these results?"**
+
+- **Yes** — Done. Report final model path and summary.
+- **No** — Propose a different recipe (lighter or heavier quantization), loop to Step 2.
+- **Quit** — Report partial results. Clean up any running servers.
diff --git a/.claude/skills/ptq/SKILL.md b/.claude/skills/ptq/SKILL.md
new file mode 100644
index 00000000000..43e8eaa43f6
--- /dev/null
+++ b/.claude/skills/ptq/SKILL.md
@@ -0,0 +1,266 @@
+---
+name: ptq
+description: This skill should be used when the user asks to "quantize a model", "run PTQ", "post-training quantization", "NVFP4 quantization", "FP8 quantization", "INT8 quantization", "INT4 AWQ", "quantize LLM", "quantize MoE", "quantize VLM", or needs to produce a quantized HuggingFace or TensorRT-LLM checkpoint from a pretrained model using ModelOpt.
+---
+
+# ModelOpt Post-Training Quantization
+
+Produce a quantized checkpoint from a pretrained HuggingFace model using NVIDIA Model Optimizer. The output is ready for TensorRT-LLM deployment or HuggingFace-compatible inference.
+
+## Decision Process
+
+### 0. Check the execution environment
+
+Do this first — the environment determines how to run the job and which formats are viable.
+
+**Multi-user / Slack bot mode?**
+
+If `MODELOPT_WORKSPACE_ROOT` is set, you are running in a multi-user environment. Read `skills/common/workspace-management.md` and check for an existing workspace for this model before proceeding. If you create or switch to a model-specific workspace, all subsequent steps run there.
+
+**Is this a remote execution?**
+
+Check if a remote cluster config exists or the user mentioned running on a remote machine:
+
+```bash
+cat ~/.config/modelopt/clusters.yaml 2>/dev/null || cat .claude/clusters.yaml 2>/dev/null
+```
+
+**Case A — config found (including when the user says "run on [cluster]" / "run remotely" / "use SSH"):**
+Switch to remote execution mode — read `references/remote-execution.md` now. All subsequent steps apply whether local or remote.
+
+**Case B — no config, user hasn't mentioned a cluster:**
+Skip remote mode and proceed with local execution below.
+
+**Case C — no config, but user clearly wants remote (e.g. "run on the cluster", "use SSH", mentions a hostname):**
+Ask the user for the following info, then create `~/.config/modelopt/clusters.yaml` before proceeding:
+
+```text
+I need a few details to set up the remote cluster. Please provide:
+1. Login node hostname (e.g. cluster-login.example.com)
+2. SSH username
+3. SSH key path (default: ~/.ssh/id_rsa) — press Enter to use default
+4. Remote working directory (e.g. /lustre/fs1/username/modelopt or ~/modelopt)
+5. Cluster name/alias for future reference (e.g. "selene", "cw-dfw")
+```
+
+Once you have the answers, write `~/.config/modelopt/clusters.yaml`:
+
+```yaml
+clusters:
+  <alias>:
+    login_node: <hostname>
+    user: <username>
+    ssh_key: <ssh_key_path>
+    workspace: <remote_workdir>
+
+default_cluster: <alias>
+```
+
+Then read `references/remote-execution.md` and continue.
+
+**Is this a SLURM cluster?**
+
+```bash
+which srun squeue sbatch 2>/dev/null | head -1
+```
+
+If any of those exist, you're on SLURM. Query accounts and partitions:
+
+```bash
+# Get user's accounts and cluster
+sacctmgr show associations user=$USER format=account%30,partition%20,cluster%20 -n 2>/dev/null
+
+# List partitions with time limits
+sinfo -o "%P %a %l %G" 2>/dev/null | grep -v "^PARTITION"
+```
+
+- If the user has **one account**: use it automatically.
+- If the user has **multiple accounts**: show them and ask which to use. Default to the account whose name most closely matches the project or working directory.
+- For partition, use the default (marked with `*` in `sinfo` output). Report the choice.
+
+**If not SLURM, check for a local GPU:**
+
+```bash
+python -c "import torch; [print(f'GPU {i}: {torch.cuda.get_device_name(i)}') for i in range(torch.cuda.device_count())] if torch.cuda.is_available() else print('no-gpu')"
+```
+
+| Result | Action |
+|--------|--------|
+| SLURM detected | Proceed — GPU will be allocated via `srun`. Infer GPU type from `sinfo` node features. |
+| Local GPU found | Proceed — report the GPU model(s) to the user. |
+| Neither found | **Stop and report**: "No GPU found and this doesn't appear to be a SLURM cluster. PTQ calibration requires a CUDA GPU. Please confirm the target environment." |
+
+The GPU model feeds directly into format recommendation in the next step.
+
+### 1. Is the model architecture supported?
+
+**Read `examples/llm_ptq/README.md` first.** It is the authoritative reference for this workflow and contains information that isn't duplicated here: the full support matrix, correct CLI flag names, accuracy guidance, and hardware requirements. Key sections to check:
+
+- Support matrix (~line 100) — which architectures and formats are supported
+- Correct flags `--pyt_ckpt_path` / `--export_path` (~line 149)
+- Accuracy note: prefer `nvfp4_mlp_only` or `nvfp4_omlp_only` for NVFP4 (~line 131)
+- Blackwell GPU requirement for NVFP4 inference (~line 126, footnote 5)
+
+After reading the README, check `modelopt/torch/export/model_utils.py` for `MODEL_NAME_TO_TYPE`. If the model's class name substring-matches a key in that dict, it is supported.
+
+**Supported** → Use the existing `examples/llm_ptq/hf_ptq.py` script directly. No custom code needed.
+
+**Unsupported** → **Read `references/unsupported-models.md` now.** It covers model source investigation, FP8 detection, patch assessment, weight name verification, and all implementation patterns.
+
+### 2. Choose the quantization format
+
+If the user has not specified a format, **recommend one based on the GPU detected above**:
+
+| GPU generation | Memory priority | Accuracy priority |
+|----------------|-----------------|-------------------|
+| **Blackwell** (B100, B200, GB200) | `nvfp4_mlp_only` | `nvfp4_awq_lite` |
+| **Hopper** (H100, H200) or older | `int4_awq` | `fp8` |
+
+Tell the user which GPU was detected and which format you are recommending, and why.
+
+> **If the user explicitly requests `nvfp4` on a Hopper GPU**: proceed — H100/H200 can *calibrate* NVFP4 checkpoints fine. Just note: "NVFP4 inference requires Blackwell GPUs; this checkpoint will be calibrated on H100 but must be deployed on Blackwell."
+
+For reference, all available configs are in `modelopt/torch/quantization/config.py`:
+
+| Format | Config | Notes |
+|--------|--------|-------|
+| NVFP4 MLP-only | `NVFP4_MLP_ONLY_CFG` | Recommended for Blackwell; best accuracy/throughput tradeoff |
+| NVFP4 MLP weight-only | `NVFP4_MLP_WEIGHT_ONLY_CFG` | Quantize MLP weights only (no activations) |
+| NVFP4 all layers | `NVFP4_DEFAULT_CFG` | May reduce accuracy; see README |
+| NVFP4 + AWQ calibration | `NVFP4_AWQ_LITE_CFG` | Best NVFP4 accuracy, slower calibration |
+| FP8 per-tensor | `FP8_DEFAULT_CFG` | Accuracy-first for Hopper |
+| INT4 weight-only | `INT4_AWQ_CFG` | Memory-first for Hopper/older |
+| INT8 + SmoothQuant | `INT8_SMOOTHQUANT_CFG` | Older GPUs, activation quantization |
+
+> **NVFP4 requires Blackwell GPUs** for inference. H100 can run NVFP4 calibration but not inference.
+
+For MLP-only quantization (skipping attention), use configs with `MLP_ONLY` in the name, or create a custom config by disabling `*self_attn*`.
+
+### 3. Set up the environment
+
+- **SLURM**: Read `references/slurm-setup.md` — it has container setup, account/partition selection, the job script template, smoke-test strategy, and monitoring instructions.
+- **Local GPU**: Check if Docker is available first — it's the cleanest isolation:
+  - **Docker available**: use the TRT-LLM NGC container (version from `examples/llm_ptq/README.md`):
+
+    ```bash
+    docker run --gpus all -v <model_path>:<model_path> -v <output_path>:<output_path> \
+        nvcr.io/nvidia/tensorrt-llm/release:<version> bash -c "pip install --no-build-isolation -e <modelopt_path>[hf] --quiet && python <ptq_script.py> ..."
+    ```
+
+  - **No Docker**: set up a virtual environment with conda (preferred) or venv:
+
+    ```bash
+    # conda
+    conda create -n modelopt python=3.10 -y && conda activate modelopt
+    # or venv
+    python -m venv modelopt-env && source modelopt-env/bin/activate
+
+    pip install --no-build-isolation nvidia-modelopt[hf]
+    ```
+
+**GPU memory**: Estimate `num_params × 2 bytes` for BF16. Use `device_map="auto"` for multi-GPU. If the model exceeds single-node memory, see the FSDP2 section in `references/slurm-setup.md`.
+
+### 4. Write and run
+
+**The goal is a quantized checkpoint on disk — not a script handed to the user.** Write the script, run it (or submit it), follow the logs, fix errors, and rerun until the export directory contains `.safetensors` shards and a `config.json`.
+
+#### Supported models
+
+```bash
+python examples/llm_ptq/hf_ptq.py \
+    --pyt_ckpt_path <model_path> \
+    --qformat <format_name> \
+    --export_fmt hf \
+    --calib_size 512 \
+    --export_path <output_path>
+```
+
+Always pass `--export_fmt hf` explicitly — older versions of the script default to `tensorrt_llm` which produces TRT-LLM format instead of a HuggingFace checkpoint.
+
+Run `python examples/llm_ptq/hf_ptq.py --help` to see all options.
+
+#### Unsupported models
+
+Write a custom script following `references/unsupported-models.md`. Core steps:
+
+1. Load model (dequantize FP8 if needed)
+2. Register monkey-patched modules via `mtq.register()`
+3. Create calibration dataloader
+4. Call `mtq.quantize(model, config, forward_loop)`
+5. Export with `export_hf_checkpoint(model, export_dir)`
+
+#### Local GPU — run and monitor
+
+```bash
+nohup python ptq_script.py ... > <log_file> 2>&1 &
+tail -f <log_file>
+```
+
+PTQ-specific failure modes to check via `mtq.print_quant_summary()`:
+
+- **Quantizers not enabled**: wildcard missed modules — check `*gate*` vs `*mlp.gate*`
+- **FP8 tensors still present after dequant**: missed a non-standard param name — inspect `model.named_parameters()` for `float8_e4m3fn` dtypes
+
+### 5. Verify the output checkpoint
+
+Once the job succeeds, confirm the export is valid:
+
+```bash
+# Check export directory has model shards and config
+ls -lh <output_path>/
+# Expect: config.json, tokenizer files, model-*.safetensors
+
+# Spot-check dtypes and shapes of the first few tensors in the first shard
+python -c "
+from safetensors import safe_open
+import glob, os
+for f in sorted(glob.glob('<output_path>/model*.safetensors'))[:1]:
+    with safe_open(f, framework='pt') as sf:
+        for k in list(sf.keys())[:5]:
+            t = sf.get_tensor(k)
+            print(k, t.dtype, t.shape)
+"
+```
+
+Report the output path and checkpoint size to the user.
+
+## Key API Rules
+
+These are non-obvious requirements that cause hard-to-debug failures:
+
+- **`mtq.register()` requires `_setup` method**: Any class registered with `mtq.register(original_cls=X, quantized_cls=Y)` MUST define a method named exactly `_setup()`. Not `_init_quantizers`, not `setup` — exactly `_setup`. Also, the `__init__` must call `self._setup()` — if you forget this, TensorQuantizers are never instantiated and quantization silently does nothing.
+
+- **Call `mto.enable_huggingface_checkpointing()` before quantization**: Required for HF checkpoint export to work.
+
+- **Wildcard pattern `*gate*` is dangerously broad**: It matches both MoE router gates AND any quantizer with "gate" in the name (e.g., `gate_up_weight_quantizer`). Use `*mlp.gate*` or `*router*` to target router gates specifically. Always verify with `mtq.print_quant_summary()`.
+
+- **VLMs need `AutoModel`**: Vision-Language Models (e.g., `Mistral3ForConditionalGeneration`, `Mllama`) are NOT registered under `AutoModelForCausalLM`. Use `AutoModel.from_pretrained()`.
+
+- **FP8 checkpoints need the config class**: When loading an FP8-quantized checkpoint with `dequantize=True`, pass `FineGrainedFP8Config(dequantize=True)` — not a plain dict. HF validates the config type matches.
+
+- **Quantizer naming convention**: Custom `TensorQuantizer` modules must end with `_input_quantizer` or `_weight_quantizer` for ModelOpt's wildcard matching.
+
+- **Do not modify ModelOpt core source**: All custom code (monkey-patching, `mtq.register()` wrappers, dequantization helpers) must live in your own script or under `examples/`. Never edit files under `modelopt/torch/` unless there is no easy way to patch from outside — and if you must, note it explicitly so it can be upstreamed.
+
+## Additional Resources
+
+### Reference Files
+
+- **`references/unsupported-models.md`** — Patterns for extending ModelOpt to new architectures: MoE expert quantization, VLM language model extraction, FP8 dequantization, calibration routing
+- **`references/slurm-setup.md`** — SLURM job script template, container/enroot setup, partition selection, smoke-test strategy, monitoring, multi-node FSDP2
+- **`references/remote-execution.md`** — **Read this when running PTQ on a remote machine/cluster via SSH.** Covers cluster config, persistent SSH sessions, SLURM container jobs, the two-script pattern, and troubleshooting.
+- **`skills/common/workspace-management.md`** — **Read this when `MODELOPT_WORKSPACE_ROOT` is set (Slack bot / multi-user).** Covers when to create vs reuse workspaces, naming conventions, and cross-task workspace sharing (PTQ → deploy → eval).
+
+### ModelOpt Examples
+
+- **`examples/llm_ptq/README.md`** ← **read this first** — support matrix, correct flag names, accuracy guidance, hardware requirements
+- **`examples/llm_ptq/hf_ptq.py`** — Main PTQ script for supported models
+- **`examples/llm_ptq/multinode_ptq.py`** — Multi-node PTQ with FSDP2
+- **`examples/deepseek/ptq.py`** — Custom PTQ for DeepSeek MoE (reference for MoE monkey-patching)
+
+### Source Code
+
+- **`modelopt/torch/quantization/config.py`** — All quantization configs and format definitions
+- **`modelopt/torch/export/model_utils.py`** — `MODEL_NAME_TO_TYPE` (supported architectures), `get_model_type()`, `is_multimodal_model()`
+- **`modelopt/torch/quantization/conversion.py`** — `mtq.register()` implementation (see `_setup` requirement)
+- **`modelopt/torch/utils/dataset_utils.py`** — `get_dataset_dataloader()`, `get_supported_datasets()`
diff --git a/.claude/skills/ptq/references/remote-execution.md b/.claude/skills/ptq/references/remote-execution.md
new file mode 100644
index 00000000000..5b276c1820b
--- /dev/null
+++ b/.claude/skills/ptq/references/remote-execution.md
@@ -0,0 +1,149 @@
+# Remote Execution
+
+Read this when Claude Code runs on a different machine than the target GPU cluster/workstation. This covers SSH connectivity, cluster config, persistent sessions, and remote command execution. For SLURM-specific details (job scripts, containers, partitions, monitoring), see `slurm-setup.md`.
+
+---
+
+## 1. Cluster Config
+
+Config locations (checked in order, first found wins):
+
+1. `~/.config/modelopt/clusters.yaml` — user-level (not committed, recommended)
+2. `.claude/clusters.yaml` — project-level (can be committed for shared defaults)
+3. Interactive input — if neither file exists, ask the user (see SKILL.md Step 0) and write `~/.config/modelopt/clusters.yaml` before proceeding
+
+```yaml
+clusters:
+  my-cluster:
+    login_node: cluster-login.example.com   # SSH hostname or SSH config alias
+    user: username                           # SSH user
+    ssh_key: ~/.ssh/id_rsa                   # (optional) SSH key path
+    ssh_proxy: "socat - PROXY:localhost:%h:%p,proxyport=3128"  # (optional) proxy
+    workspace: /absolute/path/to/workdir     # Remote working directory
+    gpu_type: H100                           # For quant format recommendation
+    slurm:                                   # (optional) pre-fill SLURM defaults
+      default_account: my_account
+      default_partition: batch_short
+
+default_cluster: my-cluster
+```
+
+See `.claude/clusters.yaml.example` for an annotated example config you can copy and edit.
+
+---
+
+## 2. Connect and Establish Persistent Session
+
+```bash
+source .claude/skills/common/remote_exec.sh
+remote_load_cluster <cluster_name>    # or omit name to use default_cluster
+remote_check_ssh                      # validates connectivity + starts persistent session
+```
+
+`remote_check_ssh` starts an SSH **ControlMaster** connection. All subsequent `remote_run` / `remote_sync_*` / SCP calls reuse this single connection:
+
+- ~180ms per command (vs 5-15s per new connection)
+- Eliminates flaky proxy timeouts
+- Auto-cleaned up when the shell exits
+
+---
+
+## 3. Detect Remote Environment
+
+```bash
+remote_detect_env
+```
+
+Auto-discovers whether the remote has SLURM, Docker, or bare-metal GPUs. Sets `REMOTE_ENV_TYPE` to `slurm`, `docker`, `bare`, or `unknown`.
+
+After detection, proceed with the environment-specific setup:
+
+- **SLURM** → read `slurm-setup.md`, but prefix all commands with `remote_run`
+- **Docker** → use `remote_docker_run <container> "<command>"`
+- **Bare metal** → use `remote_run` directly
+
+---
+
+## 4. Running Commands Remotely
+
+### Single commands
+
+```bash
+remote_run "nvidia-smi"
+remote_run "python --version"
+remote_run "sbatch /path/to/job.sh"
+```
+
+`remote_run` uses base64 encoding internally, so special characters (`%`, `$`, quotes) work without escaping. It retries up to 3 times on SSH failures.
+
+### Syncing files
+
+```bash
+# Local → remote
+remote_sync_to /local/path remote_subdir
+
+# Remote → local
+remote_sync_from remote_subdir /local/path
+```
+
+Both use rsync over the persistent SSH session with default excludes (`.git`, `__pycache__`, etc.).
+
+### SCP (alternative to rsync)
+
+SCP also reuses the persistent session automatically via ControlMaster:
+
+```bash
+scp /local/script.sh ${REMOTE_USER}@${REMOTE_HOST}:/remote/path/
+```
+
+---
+
+## 5. The Two-Script Pattern
+
+When submitting SLURM jobs remotely, write **two files** locally to avoid shell escaping issues:
+
+1. **SLURM wrapper** (e.g., `ptq_slurm.sh`) — `#SBATCH` directives + `srun` with container
+2. **Inner runner** (e.g., `ptq_run.sh`) — the actual work (runs inside the container)
+
+Then upload both and submit:
+
+```bash
+remote_sync_to /local/scripts/ scripts/
+JOBID=$(remote_run "sbatch /remote/path/scripts/ptq_slurm.sh" | grep -o '[0-9]\+' | tail -1)
+```
+
+For the SLURM wrapper template and container flags, see `slurm-setup.md`.
+
+---
+
+## 6. Verifying Results Remotely
+
+```bash
+remote_run "ls -lh <output_path>/"
+remote_run "cat <output_path>/hf_quant_config.json"
+```
+
+Or fetch results to local:
+
+```bash
+remote_sync_from <remote_output_subdir> /local/output/
+```
+
+---
+
+## 7. Troubleshooting
+
+| Problem | Cause | Fix |
+| ------- | ----- | --- |
+| `Connection timed out during banner exchange` | Proxy/login node overloaded | `remote_run` retries 3x automatically; use persistent session to avoid |
+| SSH proxy completely unreachable (`Network is unreachable`) | VPN/proxy host is down or not running on this machine | Check if VPN is connected; verify `socat`/proxy service is running locally; try direct SSH by temporarily removing `ssh_proxy` from config |
+| `unix_listener: cannot bind to path ... Read-only file system` | SSH ControlMaster socket in non-writable `/tmp` | `remote_exec.sh` auto-finds writable dir; ensure `TMPDIR` or `/tmp/claude-*` exists |
+| `cd: /home/user/~/path: No such file or directory` | `~` not expanding on remote | Use absolute paths in `workspace` config, not `~/...` |
+| Login nodes resolve home dirs differently | Symlinked home dirs vary by node | Use absolute lustre/NFS paths (e.g., `/lustre/fs1/...`) in job scripts |
+| `#!` becomes `#\!` in scripts | Shell environment mangles shebang | Fix with `sed -i 's\|^#\\!\|#!\|' script.sh` after writing |
+
+## Reference Files
+
+- **`.claude/skills/common/remote_exec.sh`** — Full utility library (session, run, sync, SLURM, Docker helpers)
+- **`.claude/clusters.yaml`** — Active cluster configuration
+- **`.claude/clusters.yaml.example`** — Annotated example config
diff --git a/.claude/skills/ptq/references/slurm-setup.md b/.claude/skills/ptq/references/slurm-setup.md
new file mode 100644
index 00000000000..610eb32d3de
--- /dev/null
+++ b/.claude/skills/ptq/references/slurm-setup.md
@@ -0,0 +1,136 @@
+# SLURM Environment Setup for ModelOpt PTQ
+
+Read this file when running on a SLURM cluster. It covers container setup, job submission, smoke-test strategy, and monitoring.
+
+---
+
+## 1. Container
+
+Get the recommended image version from `examples/llm_ptq/README.md`, then look for a `.sqsh` file in the workspace and common sibling directories:
+
+```bash
+ls *.sqsh ../*.sqsh ~/containers/*.sqsh 2>/dev/null
+```
+
+If you find a `.sqsh` but aren't sure of its version, check it:
+
+```bash
+srun --container-image=<path/to/container.sqsh> --ntasks=1 bash -c \
+    "pip show tensorrt-llm 2>/dev/null | grep Version || cat /VERSION 2>/dev/null || echo unknown"
+```
+
+If no `.sqsh` exists, import it with enroot. Set writable cache paths first — the default `/raid/containers` is often not writable:
+
+```bash
+export ENROOT_CACHE_PATH=/path/to/writable/enroot-cache
+export ENROOT_DATA_PATH=/path/to/writable/enroot-data
+export TMPDIR=/path/to/writable/tmp
+mkdir -p "$ENROOT_CACHE_PATH" "$ENROOT_DATA_PATH" "$TMPDIR"
+
+enroot import --output /path/to/container.sqsh \
+    docker://nvcr.io#nvidia/tensorrt-llm/release:<version>
+```
+
+---
+
+## 2. Account and Partition
+
+```bash
+# Accounts available to you
+sacctmgr show associations user=$USER format=account%30,cluster%20 -n 2>/dev/null
+
+# GPU partitions and their time/node limits (exclude CPU-only)
+sinfo -o "%P %a %l %D %G" 2>/dev/null | grep -v "null\|CPU\|cpu"
+```
+
+- One account → use it automatically
+- Multiple accounts → show them to the user and ask which to use
+- Partition → use the default (marked `*`); report the choice
+
+---
+
+## 3. Job Script Template
+
+**Critical**: container flags (`--container-image`, `--container-mounts`) MUST be on the `srun` line — they do NOT work as `#SBATCH` directives.
+
+**GPU count**: estimate based on model size. Rough guide: 1 GPU per ~20B params in BF16 (e.g., 0.6B → 1 GPU, 70B → 4 GPUs, 160B → 8 GPUs); models that exceed a single node (roughly 200B+) need the multi-node setup in Section 6. `hf_ptq.py` uses `device_map="auto"` so it fills GPUs automatically — request only as many as needed.
+
+```bash
+#!/bin/bash
+#SBATCH --job-name=ptq
+#SBATCH --account=<account>
+#SBATCH --partition=<partition>
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --gpus-per-node=<N>   # 1 for small models (<20B), 4-8 for large models
+#SBATCH --time=<HH:MM:SS>
+#SBATCH --output=<log_dir>/ptq_%j.log
+
+srun \
+    --container-image="<path/to/container.sqsh>" \
+    --container-mounts="<data_root>:<data_root>" \
+    --container-workdir="<workdir>" \
+    --no-container-mount-home \
+    bash -c "pip install -e <modelopt_path>[hf] --quiet && python <ptq_script.py> ..."
+```
+
+Submit and capture the job ID:
+
+```bash
+mkdir -p <log_dir>
+JOBID=$(sbatch <script>.sh | awk '{print $4}')
+echo "Submitted job $JOBID"
+```
+
+---
+
+## 4. Smoke Test First (Always)
+
+Before the full calibration run, submit a smoke test with `--calib_size 4` and `--time=00:30:00`. This catches script errors cheaply before using GPU quota on a real run.
+
+Use a comma-separated partition list — SLURM picks whichever allocates first. Shorter/interactive partitions queue faster:
+
+```bash
+#SBATCH --partition=interactive,batch_short,batch_block1
+#SBATCH --time=00:30:00
+```
+
+Note: interactive/short partitions may cap node count. If the smoke test needs multiple nodes, include a multi-node-capable partition as the last fallback.
+
+Only submit the full calibration job after the smoke test exits cleanly.
+
+---
+
+## 5. Monitor Until Completion
+
+After submitting the final job, do not stop — the goal is a finished checkpoint, not a submitted job. Poll with sleep until done:
+
+```bash
+while squeue -j $JOBID -h 2>/dev/null | grep -q .; do
+    echo "$(date): job $JOBID still running..."; sleep 60
+done
+echo "Job $JOBID finished"
+sacct -j $JOBID --format=JobID,State,ExitCode,Elapsed
+```
+
+**IMPORTANT**: Always use `sleep`-based polling (as above) rather than `CronCreate` or background tasks. This keeps output in the current session so the user can see progress. The sleep loop will wait as long as needed — even hours — until the job completes or fails.
+
+Once the job ends, tail the last 50 lines of the log and verify the export directory before reporting success.
+
+---
+
+## 6. Multi-node PTQ (FSDP2)
+
+For models too large for a single node (rough guide: 200B+ params), use `examples/llm_ptq/multinode_ptq.py` with FSDP2.
+
+Edit `examples/llm_ptq/fsdp2.yaml`:
+
+- `num_machines` and `num_processes` → match SLURM allocation
+- `fsdp_transformer_layer_cls_to_wrap` → model's decoder layer class name
+
+```bash
+accelerate launch --config_file fsdp2.yaml multinode_ptq.py \
+    --pyt_ckpt_path <model> --qformat <format> --export_path <output>
+```
+
+Set `--nodes=N` and `--ntasks-per-node=8` in the SLURM script accordingly.
diff --git a/.claude/skills/ptq/references/unsupported-models.md b/.claude/skills/ptq/references/unsupported-models.md
new file mode 100644
index 00000000000..d728d544587
--- /dev/null
+++ b/.claude/skills/ptq/references/unsupported-models.md
@@ -0,0 +1,295 @@
+# Extending ModelOpt PTQ to Unsupported Models
+
+When a model architecture is not in `MODEL_NAME_TO_TYPE` (in `modelopt/torch/export/model_utils.py`), follow the investigation steps below before writing any custom code.
+
+## Step A — Locate the model source
+
+**Is it a HuggingFace checkpoint?** Check for `config.json`. If present, try loading:
+
+```bash
+python -c "
+from transformers import AutoConfig
+cfg = AutoConfig.from_pretrained('<ckpt_path>')
+print(type(cfg).__name__)
+"
+```
+
+- **Succeeds** → transformers knows the architecture. Find the source file:
+
+  ```bash
+  python -c "
+  import inspect
+  from transformers import AutoConfig, AutoModel
+  cfg = AutoConfig.from_pretrained('<ckpt_path>')
+  cls = AutoModel._model_mapping.get(type(cfg), None)
+  src = inspect.getfile(cls) if cls is not None else 'not found'
+  print(src)
+  "
+  ```
+
+  Read the modeling file and proceed to Step B.
+
+- **Raises `ValueError` / `OSError` (unknown architecture)** → not in the installed transformers. Determine why:
+
+  1. **Search the working directory** for the class — a local fork or custom modeling file may already be present. If found, add its path to `sys.path`.
+
+  2. **Check the transformers `main` branch** (not yet released):
+
+     ```bash
+     git clone --depth 1 https://github.com/huggingface/transformers.git /tmp/transformers-main --quiet
+     grep -r "class <ArchName>" /tmp/transformers-main/src/transformers/models/
+     ```
+
+     - **Found** → install from that clone: `pip install /tmp/transformers-main --quiet`, then re-run `AutoConfig.from_pretrained()`.
+     - **Not found** → ask the user: *"The checkpoint uses `<ArchName>` which isn't in released or main-branch transformers. Do you have a private fork or custom modeling code?"*
+
+- **No `config.json`** → not a standard HF checkpoint. List the directory for README or `.py` files. If nothing useful, ask the user for the modeling code.
+
+## Step B — Is the checkpoint already FP8-quantized?
+
+Check `config.json` for `"quantization_config"` or scan weight files for `*_scale_inv*` tensors. If found, the model must be dequantized before re-quantizing. HuggingFace's `WeightConverter` only handles standard `weight` / `weight_scale_inv` names and will silently miss non-standard parameter names (e.g., 3D expert tensors in MoE layers). See **Pattern 5** below.
+
+## Step C — Determine what custom patches are needed
+
+Read the model source to identify how weights are stored. **If all linear layers are plain `nn.Linear`, no custom code is needed** — ModelOpt quantizes them automatically.
+
+**For HuggingFace models**, check `modelopt/torch/quantization/plugins/huggingface.py` first — it already registers patches for common non-standard modules (`Llama4TextExperts`, `FP8Linear`, `FalconLinear`, `Conv1D`, `Qwen3_5MoeExperts`, etc.). If your model's non-standard class is already registered there, no extra code is needed.
+
+Custom patches are required when:
+- **Fused/batched expert weights** — experts stored as a single parameter (e.g., 3D `[num_experts, in, out]`) rather than separate `nn.Linear` modules → Pattern 1 + 2
+- **Self-defined weight parameters** (`nn.Parameter` used directly instead of `nn.Linear`) — common in non-HF or research models → Pattern 1 + 3
+- **VLM structure** (vision encoder that should be excluded) → Pattern 4
+- **FP8 checkpoint** that needs dequantization before re-quantizing → Pattern 5
+
+## Step D — Check weight names against ModelOpt's config patterns
+
+Scan actual parameter names in the checkpoint and compare them against the wildcard patterns in the chosen quant config (`modelopt/torch/quantization/config.py`). If a module has a weight with a non-standard name (e.g., `gate_up_proj` instead of `gate_proj`/`up_proj`, or `experts.w1` instead of `experts.*.w1`), the wildcard will silently miss it.
+
+```python
+import json
+idx = json.load(open('<ckpt_path>/model.safetensors.index.json'))
+import re
+names = set(re.sub(r'\.\d+\.', '.N.', k) for k in idx['weight_map'])
+for n in sorted(names): print(n)
+```
+
+Compare against the `enable`/`disable` patterns in the config. Add custom overrides using Pattern 6 if needed. Always verify with `mtq.print_quant_summary(model)` after quantization.
+
+## Pattern 1: Custom Module with TensorQuantizer
+
+For modules that use raw `nn.Parameter` + `F.linear()` instead of `nn.Linear`, inject `TensorQuantizer` modules and apply them in the forward pass.
+
+```python
+from modelopt.torch.quantization.nn import TensorQuantizer
+
+class QuantCustomModule(OriginalModule):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._setup()
+
+    def _setup(self):
+        # One pair per projection
+        self.proj_a_input_quantizer = TensorQuantizer()
+        self.proj_a_weight_quantizer = TensorQuantizer()
+        self.proj_b_input_quantizer = TensorQuantizer()
+        self.proj_b_weight_quantizer = TensorQuantizer()
+
+    def forward(self, x, ...):
+        # Apply quantizers around F.linear calls
+        q_x = self.proj_a_input_quantizer(x)
+        q_w = self.proj_a_weight_quantizer(self.weight_a)
+        out = F.linear(q_x, q_w)
+        # ... continue with proj_b ...
+```
+
+**Rules:**
+
+- Method MUST be named `_setup` (ModelOpt's `mtq.register()` asserts this)
+- Quantizer names MUST end with `_input_quantizer` or `_weight_quantizer` for wildcard matching
+- The `__init__` must call `super().__init__()` then `self._setup()`
+
+## Pattern 2: MoE Calibration Wrapper
+
+MoE models route tokens to a subset of experts (top-k). During calibration, experts that receive no tokens won't have their quantization scales calibrated. Fix this with a wrapper that temporarily routes all tokens to all experts:
+
+```python
+class CalibMoE(OriginalMoE):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._setup()
+
+    def _setup(self):
+        self._original_top_k = self.top_k
+
+    def forward(self, hidden_states):
+        # First pass: all experts get calibration data
+        self.top_k = self.num_experts
+        super().forward(hidden_states)
+        # Second pass: normal routing for actual output
+        self.top_k = self._original_top_k
+        return super().forward(hidden_states)
+```
+
+Adjust attribute names (`top_k`, `num_experts`, `topk_group`, `n_group`, etc.) to match the model's implementation. Read the model's MoE source code to find the correct names.
+
+## Pattern 3: Registering with ModelOpt
+
+Register all custom classes BEFORE calling `mtq.quantize()`:
+
+```python
+import modelopt.torch.quantization as mtq
+
+mtq.register(original_cls=OriginalModule, quantized_cls=QuantCustomModule)
+mtq.register(original_cls=OriginalMoE, quantized_cls=CalibMoE)
+```
+
+`mtq.register()` tells ModelOpt to replace all instances of `original_cls` with `quantized_cls` during quantization. The replacement class must be a subclass of the original.
+
+## Pattern 4: VLM Language Model Extraction
+
+For multimodal models, only quantize the language model backbone:
+
+```python
+from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
+
+if is_multimodal_model(model):
+    lineage = get_language_model_from_vl(model)
+    language_model = lineage[-1]
+
+    # Disable quantization for non-language modules
+    disabled_cfg = {"quant_cfg": {"default": {"enable": False}}, "algorithm": "max"}
+    memo = set(lineage)
+    for ancestor in lineage[:-1]:
+        for _, child in ancestor.named_children():
+            if child not in memo:
+                mtq.quantize(child, disabled_cfg, forward_loop=None)
+                memo.add(child)
+
+    # Now quantize only language_model
+    language_model = mtq.quantize(language_model, quant_cfg, forward_loop=forward_loop)
+```
+
+Also add safety overrides to the config:
+
+```python
+quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}
+quant_cfg["quant_cfg"]["*multi_modal_projector*"] = {"enable": False}
+```
+
+## Pattern 5: FP8 Checkpoint Dequantization
+
+### Standard nn.Linear weights
+
+HuggingFace handles these automatically with `dequantize=True`:
+
+```python
+from transformers.utils.quantization_config import FineGrainedFP8Config
+
+model = AutoModel.from_pretrained(
+    model_path,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=FineGrainedFP8Config(dequantize=True),
+)
+```
+
+### Non-standard parameter names (e.g., 3D expert weights)
+
+HF's `WeightConverter` uses source patterns `["weight$", "weight_scale_inv", "activation_scale"]`. Parameters with names like `gate_up_proj`, `down_proj`, `w1`, `w2`, `w3` won't match these patterns and will remain in FP8 after loading. Dequantize them manually:
+
+```python
+def dequantize_fp8_params(model, param_names=("gate_up_proj", "down_proj")):
+    """Dequantize remaining FP8 parameters that HF's WeightConverter missed."""
+    count = 0
+    for name, module in model.named_modules():
+        for param_name in param_names:
+            param = getattr(module, param_name, None)
+            if not isinstance(param, torch.nn.Parameter) or param.dtype != torch.float8_e4m3fn:
+                continue
+            scale = getattr(module, f"{param_name}_scale_inv", None)
+            if scale is None:
+                param.data = param.data.to(torch.bfloat16)
+            elif scale.dim() == 1:
+                # One scale per leading index (e.g., per expert), broadcast over the last two dims
+                param.data = param.data.to(torch.bfloat16) * scale.data[:, None, None].to(torch.bfloat16)
+            elif scale.dim() == 3:
+                # Per-block scale: reshape, broadcast, multiply
+                w = param.data
+                s = scale.data
+                block_m = w.shape[-2] // s.shape[-2]
+                block_n = w.shape[-1] // s.shape[-1]
+                reshaped = w.to(torch.bfloat16).reshape(-1, s.shape[-2], block_m, s.shape[-1], block_n)
+                scaled = reshaped * s.to(torch.bfloat16).unsqueeze(-1).unsqueeze(2)
+                param.data = scaled.reshape(w.shape)
+            else:
+                param.data = param.data.to(torch.bfloat16)
+            count += 1
+    if count:
+        print(f"Dequantized {count} FP8 parameters to BF16.")
+```
+
+Adapt `param_names` to match the model's actual parameter naming convention. Inspect the model's `modeling_*.py` and `config.json` to find the right names.
+
+## Pattern 6: Custom Quantization Config
+
+When stock configs don't match the model's module naming:
+
+```python
+import copy
+import modelopt.torch.quantization as mtq
+
+# Start from a stock config
+cfg = copy.deepcopy(mtq.NVFP4_MLP_ONLY_CFG)
+
+# Add patterns for custom module names
+cfg["quant_cfg"]["*custom_experts*weight_quantizer"] = {
+    "num_bits": (2, 1),
+    "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)},
+    "enable": True,
+}
+cfg["quant_cfg"]["*custom_experts*input_quantizer"] = {
+    "num_bits": (2, 1),
+    "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)},
+    "enable": True,
+}
+
+# Verify wildcards target the right modules
+# After quantization, always run:
+mtq.print_quant_summary(model)
+```
+
+## General Custom PTQ Script Structure
+
+```python
+import modelopt.torch.opt as mto
+import modelopt.torch.quantization as mtq
+from modelopt.torch.export import export_hf_checkpoint
+
+mto.enable_huggingface_checkpointing()
+
+# 1. Load model (with FP8 dequant if needed)
+model = load_and_dequantize(model_path)
+
+# 2. Register monkey-patched modules
+mtq.register(original_cls=..., quantized_cls=...)
+
+# 3. Calibrate and quantize
+dataloader = get_dataset_dataloader(dataset_name=["cnn_dailymail"], tokenizer=tokenizer, ...)
+def forward_loop(model):
+    for batch in dataloader:
+        model(**batch)
+
+model = mtq.quantize(model, quant_cfg, forward_loop=forward_loop)
+mtq.print_quant_summary(model)
+
+# 4. Export
+export_hf_checkpoint(model, export_dir=output_path)
+tokenizer.save_pretrained(output_path)
+```
+
+## Debugging Tips
+
+- **Smoke test first**: Run with `--calib_size 4` to verify the pipeline end-to-end before full calibration
+- **Check quantizer summary**: `mtq.print_quant_summary(model)` shows which quantizers are enabled/disabled
+- **Inspect dtypes**: After loading, iterate `model.named_parameters()` and check for unexpected FP8 tensors
+- **Watch for silent disabling**: A misconfigured wildcard pattern can silently disable quantizers — always verify the summary
diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml
index 4c5a690145f..45bd2eb6834 100644
--- a/.markdownlint-cli2.yaml
+++ b/.markdownlint-cli2.yaml
@@ -5,3 +5,8 @@ config:
   MD033: false # no-inline-html
   MD041: false # first-line-heading
   MD059: false # no-hard-tabs
+  MD029: false # ol-prefix - allow 1. 2. 3. style numbered lists
+  MD032: false # blanks-around-lists - don't force blank lines around lists
+  MD036: false # no-emphasis-as-heading - allow **bold** as section markers
+  MD005: false # list-indent - allow flexible list item indentation
+  MD007: false # ul-indent - allow unindented sub-lists under numbered lists
diff --git a/slack-bot/.env.example b/slack-bot/.env.example
new file mode 100644
index 00000000000..0de8788eaa4
--- /dev/null
+++ b/slack-bot/.env.example
@@ -0,0 +1,23 @@
+# Slack tokens (required)
+SLACK_BOT_TOKEN=xoxb-your-bot-token
+SLACK_APP_TOKEN=xapp-your-app-token
+
+# Shared/default Anthropic API key (used when users choose "shared team key")
+ANTHROPIC_API_KEY=sk-ant-your-default-key
+
+# Path to the shared Model-Optimizer repo (default: parent of slack-bot/)
+# REPO_DIR=/opt/Model-Optimizer
+
+# Data directory for user state, keys, job dirs (default: ~/.local/share/modelopt)
+# DATA_DIR=~/.local/share/modelopt
+
+# Master key for encrypting stored API keys (32-byte hex)
+# If not set, a random key is generated on first run (dev only)
+# KEY_STORE_SECRET=
+
+# Session/workspace settings
+# CLAUDE_IDLE_TIMEOUT=7200           # Kill Claude if idle for this many seconds (default: 7200)
+# MAX_WORKSPACES_PER_USER=20        # Max workspaces per user before oldest is removed (default: 20)
+# WORKSPACE_MAX_AGE=2592000         # Max workspace age in seconds (default: 30 days)
+# SESSION_MAX_AGE_DAYS=30           # Auto-cleanup sessions older than this (default: 30)
+# CLEANUP_INTERVAL_HOURS=6          # Run auto-cleanup every N hours (default: 6)
diff --git a/slack-bot/.gitignore b/slack-bot/.gitignore
new file mode 100644
index 00000000000..0c54f604fc1
--- /dev/null
+++ b/slack-bot/.gitignore
@@ -0,0 +1,2 @@
+data/
+.env
diff --git a/slack-bot/README.md b/slack-bot/README.md
new file mode 100644
index 00000000000..aa14dcc9a3f
--- /dev/null
+++ b/slack-bot/README.md
@@ -0,0 +1,118 @@
+# ModelOpt Slack Bot
+
+Centralized Slack bot for ModelOpt agent skills (PTQ, deployment, evaluation). Shared bot with per-user authentication, isolated job directories, and remote cluster support.
+
+## Architecture
+
+```
+Slack                      Bot Server                         Per-Job Execution
+┌──────────┐  event      ┌──────────────────┐               ┌──────────────────┐
+│ User A   │ ──────────> │  Slack Bot       │  fresh copy   │ Job dir (User A) │
+│ @modelopt│             │  (slack-bolt)    │ ────────────> │ Model-Optimizer/ │
+│ quantize │ <────────── │                  │  claude --cwd │ .claude/skills/  │
+│ Qwen3... │  response   │  ┌────────────┐  │               │ clusters.yaml    │
+└──────────┘             │  │ UserStore  │  │               └──────────────────┘
+                         │  │ JobManager │  │
+┌──────────┐             │  │ KeyStore   │  │               ┌──────────────────┐
+│ User B   │ ──────────> │  └────────────┘  │  fresh copy   │ Job dir (User B) │
+│ @modelopt│             │                  │ ────────────> │ Model-Optimizer/ │
+│ deploy   │ <────────── │                  │               │ .claude/skills/  │
+└──────────┘             └──────────────────┘               └──────────────────┘
+```
+
+**Key design:**
+- Single shared upstream repo (read-only)
+- Each job gets a fresh copy (no `.git`) — agent can freely modify code
+- User's `clusters.yaml` is injected into each job copy
+- Claude CLI runs with user's own auth credentials
+
+## Setup
+
+### 1. Create Slack App
+
+1. Go to [api.slack.com/apps](https://api.slack.com/apps) and create a new app
+2. Enable **Socket Mode** (Settings > Socket Mode > Enable)
+3. Generate an **App-Level Token** with `connections:write` scope → `SLACK_APP_TOKEN` (xapp-...)
+4. Add **Bot Token Scopes** (OAuth & Permissions):
+   - `app_mentions:read`
+   - `chat:write`
+   - `commands`
+   - `files:write`
+   - `im:history`
+   - `im:read`
+   - `im:write`
+5. **Subscribe to Events** (Event Subscriptions):
+   - `app_mention`
+   - `message.im`
+6. **Slash Commands**: Create `/modelopt` command
+7. Install the app to your workspace
+8. Copy the **Bot User OAuth Token** → `SLACK_BOT_TOKEN` (xoxb-...)
+
+### 2. Install Dependencies
+
+```bash
+cd slack-bot
+pip install -r requirements.txt
+```
+
+### 3. Configure
+
+```bash
+cp .env.example .env
+# Edit .env with your tokens and settings
+```
+
+### 4. Create Data Directory
+
+```bash
+mkdir -p ~/.local/share/modelopt   # default DATA_DIR; use your own path if you set DATA_DIR in .env
+```
+
+### 5. Run
+
+```bash
+set -a; source .env; set +a   # export the variables so `python bot.py` can read them
+python bot.py
+```
+
+## User Onboarding
+
+First-time users are guided through setup automatically:
+
+1. **Auth choice**: shared team key, or log in with your own Anthropic Console account
+2. **Cluster config** (optional): interactive setup of remote SLURM cluster
+
+## Commands
+
+| Command | Description |
+|---------|-------------|
+| `@modelopt <prompt>` | Run a ModelOpt task |
+| `/modelopt setup` | Onboard (auth + cluster config) |
+| `/modelopt set-key <key>` | Set own API key (DM only) |
+| `/modelopt add-cluster` | Configure a remote cluster |
+| `/modelopt clusters` | List configured clusters |
+| `/modelopt jobs` | List recent jobs |
+| `/modelopt cleanup` | Remove old job directories |
+| `/modelopt status` | Show your current status |
+| `/modelopt help` | Show available commands |
+
+## Data Layout
+
+```
+$DATA_DIR/                           ← default: ~/.local/share/modelopt
+  keys/                              ← encrypted key store
+  users/<slack_uid>/
+    auth.json                        ← auth method
+    clusters.yaml                    ← user's cluster config
+    jobs/
+      ptq-20260318-143022/           ← fresh repo copy per job
+      ptq-20260318-160511/
+```
+
+## Examples
+
+```
+@modelopt quantize Qwen3-0.6B with nvfp4
+@modelopt quantize and evaluate Llama-3.1-8B with fp8 on cw-dfw
+@modelopt deploy ./my-checkpoint with vLLM
+```
diff --git a/slack-bot/auth_session.py b/slack-bot/auth_session.py
new file mode 100644
index 00000000000..399afc7adbc
--- /dev/null
+++ b/slack-bot/auth_session.py
@@ -0,0 +1,307 @@
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Interactive Claude session for authentication.
+
+Spawns a temporary interactive `claude` session via pexpect to handle the
+/login flow (theme selection → Console login → paste code). After auth
+succeeds, the session is killed and subsequent requests use --print mode.
+
+The credentials are stored in a per-user CLAUDE_CONFIG_DIR.
+"""
+
+import asyncio
+import contextlib
+import logging
+import os
+import re
+import shutil
+import tempfile
+
+logger = logging.getLogger(__name__)
+
+
+class AuthSession:
+    """Manages a temporary interactive Claude session for login."""
+
+    def __init__(self, user_id: str, data_dir: str):
+        """Create a private temp config dir for the user; no process is spawned yet."""
+        self.user_id = user_id
+        self._child = None  # set before mkdtemp() so close()/__del__ work even if it raises
+        self._url = None
+        self._data_dir = data_dir  # stored, but not read by any method of this class
+        self.config_dir = tempfile.mkdtemp(prefix=f"claude-auth-{user_id}-")
+
+    async def start_and_get_url(self) -> tuple[str | None, int | None]:
+        """Start interactive session, navigate to Console login, return OAuth URL and port.
+
+        Returns (url, local_port) — the URL the user opens after setting up
+        an SSH tunnel to local_port (None when no listening port was found).
+        Returns (None, None) on failure. Raises FileNotFoundError without `claude`.
+        """
+        import pexpect
+
+        claude_bin = shutil.which("claude")
+        if not claude_bin:
+            raise FileNotFoundError("`claude` CLI not found in PATH")
+
+        env = os.environ.copy()
+        env["CLAUDE_CONFIG_DIR"] = self.config_dir
+
+        self._child = pexpect.spawn(
+            claude_bin,
+            args=["--no-chrome"],  # argv list, so a path containing spaces is not re-split
+            timeout=30,
+            env=env,
+            encoding="utf-8",
+            dimensions=(50, 200),
+        )
+
+        def _navigate_to_login():
+            import time
+
+            assert self._child is not None
+            # Wait for theme picker
+            time.sleep(4)
+            with contextlib.suppress(Exception):
+                self._child.read_nonblocking(16384, timeout=3)
+
+            # Select default theme (press Enter)
+            self._child.send("\r")
+            time.sleep(3)
+            with contextlib.suppress(Exception):
+                self._child.read_nonblocking(16384, timeout=3)
+
+            # At login menu — select option 2 (Console account): down arrow, then Enter
+            self._child.send("\x1b[B")  # down arrow
+            time.sleep(0.5)
+            self._child.send("\r")  # enter
+            time.sleep(5)
+
+            # Read output in a loop until we get the complete URL.
+            # The URL gets line-wrapped by the PTY (\r\n mid-URL), so we
+            # must strip all whitespace before extracting.
+            buf = ""
+            for _ in range(15):
+                try:
+                    chunk = self._child.read_nonblocking(32768, timeout=2)
+                    buf += chunk
+                except Exception:
+                    pass  # nothing new to read this round; keep what is buffered
+                # Strip ANSI escapes, then remove all \r\n to rejoin wrapped lines
+                clean = re.sub(r"\x1b\[[0-9;]*[a-zA-Z]", "", buf)
+                clean = re.sub(r"[\r\n]+", "", clean)
+                urls = re.findall(
+                    r"https://platform\.claude\.com/oauth/authorize[^\s'\"<>]+",
+                    clean,
+                )
+                if urls and "&state=" in urls[0]:
+                    url = urls[0]
+                    # State is always 43 chars (base64url of 32 bytes, no padding).
+                    # Truncate after &state=<43 chars> to remove any trailing text.
+                    m = re.search(r"&state=", url)
+                    if m:
+                        url = url[: m.end() + 43]
+                    return url
+                time.sleep(1)
+
+            return None
+
+        url = self._url = await asyncio.to_thread(_navigate_to_login)
+
+        # Find the local port the CLI is listening on
+        port = None
+        if url and self._child:
+            try:
+                import subprocess as _sp  # nosec B404
+
+                pid = self._child.pid
+                ss = _sp.run(  # nosec B603 B607
+                    ["ss", "-tlnp"], capture_output=True, text=True
+                ).stdout
+                for line in ss.split("\n"):
+                    if f"pid={pid}," in line:  # exact token, not a substring of a port/other pid
+                        m = re.search(r":(\d+)\s", line)
+                        if m:
+                            port = int(m.group(1))
+                            break
+            except Exception:
+                logger.debug("Could not determine listener port via ss", exc_info=True)
+
+        if url:
+            logger.info("Auth URL captured for %s (port=%s)", self.user_id, port)
+        else:
+            logger.error("Failed to capture auth URL for %s", self.user_id)
+
+        return url, port
+
+    async def wait_for_auth(self, timeout: int = 300) -> bool:
+        """Wait for auth to complete (`.claude.json` gains a `primaryApiKey` entry).
+
+        The user completes the browser auth with an SSH tunnel. The CLI's
+        localhost listener receives the callback and writes credentials.
+
+        Returns True if the key appears within `timeout` seconds, otherwise False.
+        """
+        import json
+        from pathlib import Path
+
+        config_file = Path(self.config_dir) / ".claude.json"
+
+        def _poll():
+            import time
+
+            start = time.time()
+            while time.time() - start < timeout:
+                if config_file.exists():
+                    try:
+                        data = json.loads(config_file.read_text())
+                        if data.get("primaryApiKey"):
+                            return True
+                    except Exception:
+                        pass
+                # Also check if process exited
+                if self._child and not self._child.isalive():
+                    # Check one more time
+                    if config_file.exists():
+                        try:
+                            data = json.loads(config_file.read_text())
+                            if data.get("primaryApiKey"):
+                                return True
+                        except Exception:
+                            pass
+                    return False
+                time.sleep(2)
+            return False
+
+        return await asyncio.to_thread(_poll)
+
+    async def submit_code(self, code: str) -> bool:
+        """Paste the auth code into the interactive session.
+
+        Sends characters one-by-one (Ink TUI uses raw mode and can't handle
+        bulk sendline), then presses Enter to submit.
+
+        Returns True if login succeeded, False on a reported failure, dead session or timeout.
+        """
+        if not self._child or not self._child.isalive():
+            logger.error("Auth session not alive for %s", self.user_id)
+            return False
+
+        def _submit():
+            import json
+            import time
+            from pathlib import Path
+
+            # Send code char-by-char (Ink raw mode requires this)
+            for ch in code:
+                self._child.send(ch)
+                time.sleep(0.02)
+            time.sleep(0.3)
+            self._child.send("\r")  # Enter to submit
+
+            config_file = Path(self.config_dir) / ".claude.json"
+
+            # Poll 15 times (at least 30s): check .claude.json and process output
+            for _ in range(15):
+                time.sleep(2)
+
+                # Check if API key was written
+                if config_file.exists():
+                    try:
+                        data = json.loads(config_file.read_text())
+                        if data.get("primaryApiKey"):
+                            logger.info("API key found in config for %s", self.user_id)
+                            return True
+                    except Exception:
+                        pass
+
+                # Read any output (non-blocking)
+                try:
+                    buf = self._child.read_nonblocking(16384, timeout=1)
+                    # Strip ALL ANSI escape sequences
+                    clean = re.sub(r"\x1b\[[\?0-9;]*[a-zA-Z]", "", buf)
+                    clean = re.sub(r"\x1b[><=()][0-9]*", "", clean)
+                    clean = re.sub(r"[\r\n\s]+", " ", clean).strip()
+                    # Filter out char echoes (single chars or masked *)
+                    words = [w for w in clean.split() if len(w) > 5 and not w.startswith("*")]
+                    meaningful = " ".join(words)
+                    if meaningful:
+                        logger.info("Auth output for %s: %s", self.user_id, meaningful[:200])
+                    if "successful" in meaningful.lower() or "logged" in meaningful.lower():
+                        # Press Enter to continue past the success prompt
+                        logger.info("Login successful for %s, pressing Enter", self.user_id)
+                        time.sleep(1)
+                        self._child.send("\r")
+                        time.sleep(3)
+                        # Check config now
+                        if config_file.exists():
+                            try:
+                                data = json.loads(config_file.read_text())
+                                if data.get("primaryApiKey"):
+                                    return True
+                            except Exception:
+                                pass
+                        # Give more time after Enter
+                        time.sleep(5)
+                        if config_file.exists():
+                            try:
+                                data = json.loads(config_file.read_text())
+                                if data.get("primaryApiKey"):
+                                    return True
+                            except Exception:
+                                pass
+                        # Even without a key in .claude.json, the CLI reported success
+                        return True
+                    # Use `clean`: the len > 5 filter drops "oauth", "error" and "login"
+                    if "oauth error" in clean.lower() or "login failed" in clean.lower():
+                        logger.error("Auth failed for %s: %s", self.user_id, clean[:200])
+                        return False
+                except Exception:
+                    pass
+
+                # Process died: do one final config check, then give up
+                if not self._child.isalive():
+                    if config_file.exists():
+                        try:
+                            data = json.loads(config_file.read_text())
+                            if data.get("primaryApiKey"):
+                                return True
+                        except Exception:
+                            pass
+                    return False
+
+            return False
+
+        return await asyncio.to_thread(_submit)
+
+    def get_config_dir(self) -> str:
+        """Return the temporary config directory path."""
+        return self.config_dir
+
+    def close(self):
+        """Kill the interactive session."""
+        if self._child:
+            with contextlib.suppress(Exception):
+                self._child.close(force=True)
+            self._child = None
+
+    def __del__(self):
+        self.close()
diff --git a/slack-bot/bot.py b/slack-bot/bot.py
new file mode 100644
index 00000000000..408f10d7e35
--- /dev/null
+++ b/slack-bot/bot.py
@@ -0,0 +1,1083 @@
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""ModelOpt Slack Bot — centralized bot with per-user sessions.
+
+Architecture:
+    - Single shared Model-Optimizer repo (upstream source)
+    - Per-user workspaces (agent decides when to create/reuse)
+    - User's cluster config injected into workspaces
+    - Claude CLI runs with user's auth credentials
+
+Auth options (per user):
+    1. Shared team key (no setup needed)
+    2. Anthropic Console account (interactive login via PTY)
+
+Usage:
+    @modelopt <prompt>           — run a prompt
+    /modelopt setup              — onboard (auth + optional cluster config)
+    /modelopt add-cluster        — interactive cluster setup
+    /modelopt clusters           — list configured clusters
+    /modelopt workspaces         — list your workspaces
+    /modelopt cleanup            — remove old workspaces
+    /modelopt status             — show session info
+    /modelopt help               — show commands
+"""
+
+import asyncio
+import logging
+import os
+import re
+import uuid
+from pathlib import Path
+
+from job_manager import WorkspaceManager
+from key_store import KeyStore
+from session_manager import run_claude_streaming
+from slack_bolt.adapter.socket_mode.async_handler import AsyncSocketModeHandler
+from slack_bolt.async_app import AsyncApp
+from user_store import UserStore
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+logger = logging.getLogger(__name__)
+
+# ─── Configuration ───────────────────────────────────────────────────
+
+SLACK_BOT_TOKEN = os.environ["SLACK_BOT_TOKEN"]
+SLACK_APP_TOKEN = os.environ["SLACK_APP_TOKEN"]
+
+REPO_DIR = os.environ.get(
+    "REPO_DIR",
+    str(Path(__file__).resolve().parent.parent),
+)
+
+DATA_DIR = os.environ.get(
+    "DATA_DIR", os.path.join(os.path.expanduser("~"), ".local", "share", "modelopt")
+)
+
+MAX_SLACK_LENGTH = 3900
+
+# ─── Initialize Components ───────────────────────────────────────────
+
+app = AsyncApp(token=SLACK_BOT_TOKEN)
+
+key_store = KeyStore(data_dir=DATA_DIR)
+user_store = UserStore(data_dir=DATA_DIR, key_store=key_store)
+workspace_mgr = WorkspaceManager(repo_dir=REPO_DIR)
+
+# Onboarding state machines
+onboarding_state: dict[str, str] = {}
+cluster_setup_state: dict[str, dict] = {}
+
+# Store last full response per user for /modelopt logs
+_last_response: dict[str, str] = {}
+
+# Pending credential imports awaiting user confirmation
+_pending_cred_import: dict[str, dict] = {}
+
+# Per-session locks to prevent concurrent Claude processes on the same session
+_session_locks: dict[str, asyncio.Lock] = {}
+
+# Keep strong references to background tasks to prevent GC
+_background_tasks: set = set()
+
+# ─── Helpers ─────────────────────────────────────────────────────────
+
+
+def strip_bot_mention(text: str) -> str:
+    """Remove every `<@ID>` user mention (not only a leading one) and trim whitespace."""
+    return re.sub(r"<@[A-Z0-9]+>\s*", "", text).strip()
+
+
+def truncate(text: str, limit: int = MAX_SLACK_LENGTH) -> str:
+    """Cut text to `limit` characters and append a truncation notice when it was longer."""
+    if len(text) > limit:
+        return f"{text[:limit]}\n\n... (truncated, full output in job dir)"
+    return text
+
+
+async def send_long_response(say, text: str, thread_ts: str, channel: str):
+    """Post `text` in the thread; if it exceeds MAX_SLACK_LENGTH also upload it as a file."""
+    if len(text) <= MAX_SLACK_LENGTH:
+        await say(text=text, thread_ts=thread_ts)
+        return
+
+    # Too long for one message: post a truncated preview, then upload the untruncated text.
+    preview = truncate(text) + "\n\n_Full output uploaded as file._"
+    await say(text=preview, thread_ts=thread_ts)
+    await app.client.files_upload_v2(
+        channel=channel,
+        content=text,
+        filename="claude_response.md",
+        title="Full Claude Response",
+        thread_ts=thread_ts,
+    )
+
+
+def is_dm(event: dict) -> bool:
+    """Return True if the event is a direct message."""
+    return event.get("channel_type") == "im"
+
+
+# ─── Onboarding ─────────────────────────────────────────────────────
+
+WELCOME_MSG = """Welcome to *ModelOpt Bot*!
+
+I need to set you up first. How would you like to authenticate with Claude?
+
+*1️⃣* Use shared team key (no setup needed)
+*2️⃣* Log in with your own Anthropic Console account
+
+Reply with `1` or `2`."""
+
+HELP_MSG = """*ModelOpt Bot Commands*
+
+*Run prompts:*
+• `@modelopt <prompt>` — run a ModelOpt task (PTQ, deploy, eval)
+
+*Setup:*
+• `/modelopt setup` — onboard (auth + cluster config)
+• `/modelopt add-cluster` — configure a remote cluster
+• `/modelopt clusters` — list your configured clusters
+• `/modelopt set-env KEY=VALUE` — set personal env var (DM only, e.g. `HF_TOKEN`, `NGC_API_KEY`)
+• `/modelopt env` — list your env vars
+• `/modelopt unset-env KEY` — remove an env var
+
+*Workspaces & Logs:*
+• `/modelopt workspaces` — list your workspaces
+• `/modelopt logs` — upload full output of last task as a file
+• `/modelopt cleanup` — remove old workspaces
+• `/modelopt status` — show your current status
+
+*Examples:*
+```
+@modelopt quantize Qwen3-0.6B with nvfp4
+@modelopt quantize and evaluate Llama-3.1-8B with fp8
+@modelopt deploy ./my-checkpoint with vLLM
+```"""
+
+# ─── Auth via interactive Claude session ─────────────────────────────
+
+_auth_sessions: dict = {}  # {user_id: AuthSession}
+
+
+async def _start_interactive_login(user_id: str, say, thread_ts: str | None):
+    """Start interactive Claude session, navigate to Console login, send URL to user."""
+    from auth_session import AuthSession
+
+    try:
+        session = AuthSession(user_id, DATA_DIR)
+        await say(text="Starting login... this takes a few seconds.", thread_ts=thread_ts)
+        url, port = await session.start_and_get_url()  # port is not used below
+
+        if url:
+            _auth_sessions[user_id] = session  # popped again when the user pastes the code
+            onboarding_state[user_id] = "awaiting_auth_code"  # next message is read as the code
+            await say(
+                text=(
+                    "Open this link in your browser to sign in with your Anthropic Console account:\n\n"
+                    f"{url}\n\n"
+                    "After signing in, you'll see a code on the page. Paste that code back here."
+                ),
+                thread_ts=thread_ts,
+            )
+        else:
+            session.close()
+            onboarding_state.pop(user_id, None)
+            await say(
+                text="Could not start login flow. Try `/modelopt setup` again.",
+                thread_ts=thread_ts,
+            )
+    except Exception as e:
+        logger.error("Interactive login error for %s: %s", user_id, e)
+        onboarding_state.pop(user_id, None)
+        await say(text=f"Login error: {e}\nTry `/modelopt setup` again.", thread_ts=thread_ts)
+
+
+# ─── Onboarding Response Handler ────────────────────────────────────
+
+
+async def _finish_onboarding(say, thread_ts):
+    """Send the final onboarding message."""
+    await say(
+        text=(
+            "You're all set! Try: `@modelopt quantize Qwen3-0.6B with nvfp4`\n\n"
+            "Use `/modelopt help` for all commands."
+        ),
+        thread_ts=thread_ts,
+    )
+
+
+async def _start_creds_import(user_id, say, thread_ts):
+    """Ask user if they want to import credentials from their home dir."""
+    onboarding_state[user_id] = "awaiting_creds_choice"
+    await say(
+        text=(
+            "Would you like to import credentials (HF token, NGC key, etc.)?\n\n"
+            "*1️⃣* I have a user account on this server — scan my home dir\n"
+            "*2️⃣* Skip for now (set later with `/modelopt set-env`)\n\n"
+            "Reply with `1` or `2`."
+        ),
+        thread_ts=thread_ts,
+    )
+
+
+async def handle_onboarding_response(event, say):
+    """Advance the onboarding / cluster-setup conversation by one user message.
+
+    Dispatches on ``onboarding_state[user_id]`` (auth choice, auth code, reconfig
+    choice, cluster choice, creds choice, username, creds confirm). If no such state
+    is set but the user is in ``cluster_setup_state``, the message is delegated to
+    ``handle_cluster_setup_response``.
+
+    Returns True when an onboarding state handled the message, the cluster handler's
+    result when delegated, and False when no flow is active for the user.
+    """
+    user_id = event["user"]
+    text = event.get("text", "").strip()
+    thread_ts = event.get("thread_ts", event["ts"])
+    state = onboarding_state.get(user_id)
+
+    if state == "awaiting_auth_choice":
+        if text == "1":
+            user_store.setup_shared_key(user_id)
+            del onboarding_state[user_id]
+            await say(
+                text=(
+                    "Using shared team key. No setup needed!\n\n"
+                    "Would you like to configure a remote cluster? Reply `yes` or `no`."
+                ),
+                thread_ts=thread_ts,
+            )
+            onboarding_state[user_id] = "awaiting_cluster_choice"
+        elif text == "2":
+            await _start_interactive_login(user_id, say, thread_ts)
+        else:
+            await say(text="Please reply with `1` or `2`.", thread_ts=thread_ts)
+        return True
+
+    if state == "awaiting_auth_code":
+        # User pasted the code from the browser callback page
+        code = text.strip()
+        session = _auth_sessions.pop(user_id, None)
+        if not session:
+            onboarding_state.pop(user_id, None)
+            await say(
+                text="Login session expired. Try `/modelopt setup` again.", thread_ts=thread_ts
+            )
+            return True
+
+        try:
+            await say(text="Verifying code...", thread_ts=thread_ts)
+            success = await session.submit_code(code)
+
+            if success:
+                user_store.setup_login_auth(user_id, session.get_config_dir())
+                onboarding_state.pop(user_id, None)
+                session.close()
+                await say(
+                    text=(
+                        "Logged in successfully!\n\n"
+                        "Would you like to configure a remote cluster? Reply `yes` or `no`."
+                    ),
+                    thread_ts=thread_ts,
+                )
+                onboarding_state[user_id] = "awaiting_cluster_choice"
+            else:
+                onboarding_state.pop(user_id, None)
+                session.close()
+                await say(
+                    text="Login failed. The code may be invalid or expired.\nTry `/modelopt setup` again.",
+                    thread_ts=thread_ts,
+                )
+        except Exception as e:
+            logger.error("Auth code error for %s: %s", user_id, e)
+            onboarding_state.pop(user_id, None)
+            session.close()
+            await say(text=f"Login error: {e}\nTry `/modelopt setup` again.", thread_ts=thread_ts)
+        return True
+
+    if state == "awaiting_reconfig_choice":
+        del onboarding_state[user_id]
+        if text == "1":
+            # Re-authenticate
+            onboarding_state[user_id] = "awaiting_auth_choice"
+            await say(text=WELCOME_MSG, thread_ts=thread_ts)
+        elif text == "2":
+            await start_cluster_setup(user_id, say, thread_ts)
+        elif text == "3":
+            await _start_creds_import(user_id, say, thread_ts)
+        else:
+            await say(
+                text="Keeping current setup. Use `/modelopt help` for commands.",
+                thread_ts=thread_ts,
+            )
+        return True
+
+    if state == "awaiting_cluster_choice":
+        del onboarding_state[user_id]
+        if text.lower() in ("yes", "y"):
+            await start_cluster_setup(user_id, say, thread_ts, from_onboarding=True)
+        else:
+            await _start_creds_import(user_id, say, thread_ts)
+        return True
+
+    if state == "awaiting_creds_choice":
+        del onboarding_state[user_id]
+        if text == "1":
+            onboarding_state[user_id] = "awaiting_username"
+            await say(
+                text="What is your username on this server?",
+                thread_ts=thread_ts,
+            )
+        else:
+            await _finish_onboarding(say, thread_ts)
+        return True
+
+    if state == "awaiting_username":
+        del onboarding_state[user_id]
+        username = text.strip()  # NOTE(review): self-reported; not tied to the Slack user here
+        home_dir = user_store.resolve_home_dir(username)
+        if not home_dir:
+            await say(
+                text=(
+                    f"Could not find home directory for `{username}`.\n"
+                    "You can set tokens manually with `/modelopt set-env HF_TOKEN=...`"
+                ),
+                thread_ts=thread_ts,
+            )
+            await _finish_onboarding(say, thread_ts)
+            return True
+
+        creds = user_store.scan_local_credentials(home_dir)
+        if not creds:
+            await say(
+                text=(
+                    f"No credentials found in `{home_dir}`.\n"
+                    "You can set tokens manually with `/modelopt set-env HF_TOKEN=...`"
+                ),
+                thread_ts=thread_ts,
+            )
+            await _finish_onboarding(say, thread_ts)
+            return True
+
+        # Show what was found
+        lines = ["*Found credentials:*"]
+        for key, val in creds.items():
+            if key.startswith("_"):
+                if key == "_DOCKER_CONFIG":
+                    lines.append(f"• Docker config: `{val}`")
+            else:
+                masked = val[:6] + "..." if len(val) > 6 else val
+                lines.append(f"• `{key}` = `{masked}`")
+
+        # Store found creds temporarily for confirmation
+        _pending_cred_import[user_id] = creds
+        onboarding_state[user_id] = "awaiting_creds_confirm"
+        await say(
+            text="\n".join(lines) + "\n\nImport these? Reply `yes` or `no`.",
+            thread_ts=thread_ts,
+        )
+        return True
+
+    if state == "awaiting_creds_confirm":
+        del onboarding_state[user_id]
+        creds = _pending_cred_import.pop(user_id, {})
+        if text.lower() in ("yes", "y") and creds:
+            imported = user_store.import_credentials(user_id, creds)
+            await say(
+                text=f"Imported: {', '.join(f'`{k}`' for k in imported)}",
+                thread_ts=thread_ts,
+            )
+        else:
+            await say(
+                text="Skipped. Use `/modelopt set-env` to add tokens later.",
+                thread_ts=thread_ts,
+            )
+        await _finish_onboarding(say, thread_ts)
+        return True
+
+    # Handle cluster setup flow
+    if user_id in cluster_setup_state:
+        return await handle_cluster_setup_response(user_id, text, say, thread_ts)
+
+    return False
+
+
+# ─── Cluster Setup ───────────────────────────────────────────────────
+
+
+async def start_cluster_setup(user_id, say, thread_ts, *, from_onboarding=False):
+    """Begin interactive cluster configuration.
+
+    Shows existing clusters and offers to add/update/remove; otherwise asks for a name.
+    """
+    existing_yaml = user_store.read_clusters_yaml(user_id)
+    if existing_yaml:
+        try:
+            import yaml
+
+            existing = yaml.safe_load(existing_yaml) or {}
+            clusters = existing.get("clusters", {})
+            default = existing.get("default_cluster", "")
+            names = list(clusters.keys())
+            cluster_list = "\n".join(
+                f"• `{n}`" + (" _(default)_" if n == default else "") for n in names
+            )
+            cluster_setup_state[user_id] = {
+                "step": "action",
+                "existing": existing,
+                "from_onboarding": from_onboarding,
+            }
+            await say(
+                text=(
+                    f"You have {len(names)} cluster(s) configured:\n{cluster_list}\n\n"
+                    "*1️⃣* Add a new cluster\n"
+                    "*2️⃣* Update an existing cluster\n"
+                    "*3️⃣* Remove a cluster\n"
+                    "*4️⃣* Done — keep current config\n\n"
+                    "Reply with `1`, `2`, `3`, or `4`."
+                ),
+                thread_ts=thread_ts,
+            )
+            return
+        except Exception:
+            logger.exception("Cluster menu failed for %s; using fresh setup", user_id)
+
+    cluster_setup_state[user_id] = {"step": "name", "from_onboarding": from_onboarding}
+    await say(
+        text=(
+            "Let's set up a remote cluster.\n\n*Step 1/6:* What would you like to call this"
+            " cluster? (e.g., `cw-dfw`, `selene`, `my-workstation`)"
+        ),
+        thread_ts=thread_ts,
+    )
+
+
+async def handle_cluster_setup_response(user_id, text, say, thread_ts):
+    """Advance the cluster setup dialog one step (add/update/remove); always returns True."""
+    import yaml as _yaml
+
+    state = cluster_setup_state[user_id]
+    step = state["step"]
+
+    # ── Action selection (existing clusters) ──
+    if step == "action":
+        existing = state.get("existing", {})
+        clusters = existing.get("clusters", {})
+        if text == "1":
+            state["step"] = "name"
+            await say(
+                text="*Step 1/6:* Name for the new cluster?"
+                " (e.g., `cw-dfw`, `selene`, `my-workstation`)",
+                thread_ts=thread_ts,
+            )
+        elif text == "2":
+            names = list(clusters.keys())
+            state["step"] = "pick_update"
+            await say(
+                text="Which cluster to update?\n"
+                + "\n".join(f"• `{n}`" for n in names)
+                + "\n\nReply with the cluster name.",
+                thread_ts=thread_ts,
+            )
+        elif text == "3":
+            names = list(clusters.keys())
+            state["step"] = "pick_remove"
+            await say(
+                text="Which cluster to remove?\n"
+                + "\n".join(f"• `{n}`" for n in names)
+                + "\n\nReply with the cluster name.",
+                thread_ts=thread_ts,
+            )
+        else:
+            del cluster_setup_state[user_id]
+            await say(text="Keeping current cluster config.", thread_ts=thread_ts)
+        return True
+
+    if step == "pick_update":
+        name = text.strip()
+        existing = state.get("existing", {})
+        if name not in existing.get("clusters", {}):
+            await say(text=f"Cluster `{name}` not found. Try again.", thread_ts=thread_ts)
+            return True
+        state["name"] = name
+        state["step"] = "login_node"
+        old = existing["clusters"][name]
+        await say(
+            text=f"Updating *{name}*. Type `skip` to keep current value.\n\n"
+            f"*Step 2/6:* Login node? (current: `{old.get('login_node', '?')}`)",
+            thread_ts=thread_ts,
+        )
+        return True
+
+    if step == "pick_remove":
+        name = text.strip()
+        existing = state.get("existing", {})
+        clusters = existing.get("clusters", {})
+        if name not in clusters:
+            await say(text=f"Cluster `{name}` not found. Try again.", thread_ts=thread_ts)
+            return True
+        del clusters[name]
+        if existing.get("default_cluster") == name:
+            existing["default_cluster"] = next(iter(clusters), "")  # "" when none remain
+        del cluster_setup_state[user_id]
+        yaml_content = _yaml.dump(existing, default_flow_style=False)
+        user_store.save_clusters_yaml(user_id, yaml_content)
+        await say(
+            text=f"Removed cluster `{name}`.\n\n```{yaml_content}```",
+            thread_ts=thread_ts,
+        )
+        return True
+
+    # ── Shared steps for add/update ──
+    existing = state.get("existing", {})
+    old = existing.get("clusters", {}).get(state.get("name", ""), {})
+
+    if step == "name":
+        state["name"] = text.strip().replace(" ", "-")
+        state["step"] = "login_node"
+        await say(
+            text=f"Cluster alias: *{state['name']}*\n\n"
+            "*Step 2/6:* Login node hostname?"
+            " (e.g., `cluster-login.example.com`)",
+            thread_ts=thread_ts,
+        )
+    elif step == "login_node":
+        val = text.strip()
+        state["login_node"] = old.get("login_node", "") if val.lower() == "skip" else val
+        state["step"] = "user"
+        cur = old.get("user", "")
+        await say(
+            text="*Step 3/6:* SSH username?"
+            + (f" (current: `{cur}`, type `skip` to keep)" if cur else ""),
+            thread_ts=thread_ts,
+        )
+    elif step == "user":
+        val = text.strip()
+        state["user"] = old.get("user") if val.lower() == "skip" else (val or None)
+        state["step"] = "ssh_key"
+        cur = old.get("ssh_key", "")
+        await say(
+            text="*Step 4/6:* SSH private key path? Must be an absolute path"
+            " (e.g., `/home/username/.ssh/id_rsa`)."
+            + (f" (current: `{cur}`, type `skip` to keep)" if cur else "")
+            + " Type `skip` to use SSH default.",
+            thread_ts=thread_ts,
+        )
+    elif step == "ssh_key":
+        val = text.strip()
+        if val.lower() == "skip":
+            state["ssh_key"] = old.get("ssh_key")
+        else:
+            state["ssh_key"] = val
+        state["step"] = "workspace"
+        cur = old.get("workspace", "")
+        await say(
+            text="*Step 5/6:* Remote working directory?"
+            + (f" (current: `{cur}`, type `skip` to keep)" if cur else ""),
+            thread_ts=thread_ts,
+        )
+    elif step == "workspace":
+        val = text.strip()
+        state["workspace"] = old.get("workspace", "") if val.lower() == "skip" else val
+        state["step"] = "gpu_type"
+        cur = old.get("gpu_type", "")
+        await say(
+            text="*Step 6/6:* GPU type?"
+            + (f" (current: `{cur}`, type `skip` to keep)" if cur else "")
+            + " Type `skip` if unknown.",
+            thread_ts=thread_ts,
+        )
+    elif step == "gpu_type":
+        gpu_text = text.strip()
+        if gpu_text.lower() == "skip":
+            gpu = old.get("gpu_type")
+        else:
+            gpu = gpu_text or old.get("gpu_type")
+        del cluster_setup_state[user_id]
+
+        name = state["name"]
+        new_cluster = {**old, "login_node": state["login_node"]}  # keep keys not asked about
+        if state.get("user"):
+            new_cluster["user"] = state["user"]
+        if state.get("ssh_key"):
+            new_cluster["ssh_key"] = state["ssh_key"]
+        new_cluster["workspace"] = state["workspace"]
+        if gpu:
+            new_cluster["gpu_type"] = gpu
+
+        # Merge into existing config
+        config = existing if existing else {}
+        if "clusters" not in config:
+            config["clusters"] = {}
+        config["clusters"][name] = new_cluster
+        if not config.get("default_cluster"):
+            config["default_cluster"] = name
+
+        yaml_content = _yaml.dump(config, default_flow_style=False)
+        user_store.save_clusters_yaml(user_id, yaml_content)
+        await say(
+            text=f"Cluster *{name}* configured!\n\n```{yaml_content}```",
+            thread_ts=thread_ts,
+        )
+        # Only chain to credentials import during initial onboarding
+        if state.get("from_onboarding"):
+            await _start_creds_import(user_id, say, thread_ts)
+
+    return True
+
+
+# ─── Slash Command: /modelopt ────────────────────────────────────────
+
+
+@app.command("/modelopt")
+async def handle_slash_command(ack, command, say, respond):
+    """Dispatch a /modelopt subcommand; any unrecognized text is run as a prompt."""
+    await ack()
+    user_id = command["user_id"]
+    text = command.get("text", "").strip()
+    channel = command["channel_id"]
+
+    parts = text.split(maxsplit=1)
+    subcmd = parts[0].lower() if parts else ""
+    args = parts[1] if len(parts) > 1 else ""
+
+    if subcmd == "setup":
+        if user_store.is_registered(user_id):
+            # User already set up — show current config and offer options
+            info = user_store.user_info(user_id)
+            env_vars = user_store.get_env_vars(user_id)
+            has_clusters = user_store.has_clusters(user_id)
+
+            lines = ["You're already set up! Current config:\n"]
+            lines.append(f"• *Auth:* {info['auth_method'] if info else 'unknown'}")
+            lines.append(f"• *Clusters:* {'configured' if has_clusters else 'none'}")
+            env_list = ", ".join(f"`{k}`" for k in env_vars) if env_vars else "none"
+            lines.append(f"• *Env vars:* {env_list}")
+
+            lines.append("\nWhat would you like to do?\n")
+            lines.append("*1️⃣* Re-authenticate (new Claude login)")
+            lines.append("*2️⃣* Reconfigure cluster")
+            lines.append("*3️⃣* Import/update credentials (HF, NGC, etc.)")
+            lines.append("*4️⃣* Keep current setup — nothing to change")
+            lines.append("\nReply with `1`, `2`, `3`, or `4`.")
+
+            onboarding_state[user_id] = "awaiting_reconfig_choice"
+            await respond(text="\n".join(lines))
+        else:
+            onboarding_state[user_id] = "awaiting_auth_choice"
+            await respond(text=WELCOME_MSG)
+
+    elif subcmd == "add-cluster":
+        await start_cluster_setup(user_id, respond, None)
+
+    elif subcmd == "clusters":
+        clusters_yaml = user_store.read_clusters_yaml(user_id)
+        if clusters_yaml:
+            await respond(text=f"Your cluster config:\n```{clusters_yaml}```")
+        else:
+            await respond(text="No clusters configured. Use `/modelopt add-cluster` to set one up.")
+
+    elif subcmd == "set-env":
+        if command.get("channel_name") != "directmessage":
+            await respond(text=":warning: Use this command in a DM with me (contains secrets).")
+            return
+        key, sep, value = args.partition("=")
+        if not sep or not key.strip():  # also rejects "=value" (empty variable name)
+            await respond(
+                text=(
+                    "Usage: `/modelopt set-env HF_TOKEN=hf_abc123...`\n\n"
+                    "Common variables: `HF_TOKEN`, `NGC_API_KEY`, `DOCKER_TOKEN`"
+                )
+            )
+            return
+        user_store.set_env_var(user_id, key.strip(), value.strip())
+        await respond(text=f"`{key.strip()}` saved.")
+
+    elif subcmd == "env":
+        env_vars = user_store.get_env_vars(user_id)
+        if env_vars:
+            lines = [f"• `{k}` = `{v}`" for k, v in env_vars.items()]
+            await respond(
+                text="*Your env vars* (values masked):\n"
+                + "\n".join(lines)
+                + "\n\nUse `/modelopt set-env KEY=VALUE` to add/update, `/modelopt unset-env KEY` to remove."
+            )
+        else:
+            await respond(
+                text="No personal env vars set.\n\nUse `/modelopt set-env HF_TOKEN=hf_abc...` to add one."
+            )
+
+    elif subcmd == "unset-env":
+        if not args:
+            await respond(text="Usage: `/modelopt unset-env HF_TOKEN`")
+            return
+        if user_store.remove_env_var(user_id, args.strip()):
+            await respond(text=f"`{args.strip()}` removed.")
+        else:
+            await respond(text=f"`{args.strip()}` not found.")
+
+    elif subcmd == "workspaces":
+        if not user_store.is_registered(user_id):
+            await respond(text="Not registered yet. Use `/modelopt setup` first.")
+            return
+        ws_root = user_store.jobs_dir(user_id)
+        workspaces = workspace_mgr.list_workspaces(ws_root)
+        if not workspaces:
+            await respond(
+                text="No workspaces yet. They'll be created when you run your first task."
+            )
+            return
+        lines = ["*Your workspaces:*"]
+        for w in workspaces[:15]:
+            lines.append(f"• `{w['name']}` — {w['size_mb']}MB (modified {w['modified']})")
+        await respond(text="\n".join(lines))
+
+    elif subcmd == "cleanup":
+        if not user_store.is_registered(user_id):
+            await respond(text="Not registered yet.")
+            return
+        ws_root = user_store.jobs_dir(user_id)
+        removed = await workspace_mgr.cleanup_old(ws_root)
+        await respond(text=f"Cleaned up {removed} old workspace(s).")
+
+    elif subcmd == "status":
+        info = user_store.user_info(user_id)
+        if info:
+            ws_root = user_store.jobs_dir(user_id)
+            workspaces = workspace_mgr.list_workspaces(ws_root)
+            clusters_str = "configured" if info["has_clusters"] else "none"
+            msg = (
+                f"*Auth:* {info['auth_method']}\n"
+                f"*Clusters:* {clusters_str}\n"
+                f"*Workspaces:* {len(workspaces)}"
+            )
+            await respond(text=msg)
+        else:
+            await respond(text="Not registered yet. Use `/modelopt setup` first.")
+
+    elif subcmd in ("help", ""):
+        await respond(text=HELP_MSG)
+
+    elif subcmd == "logs":
+        last = _last_response.get(user_id)
+        if not last:
+            await respond(text="No recent task output. Run a task first.")
+            return
+        await app.client.files_upload_v2(
+            channel=channel,
+            content=last,
+            filename="modelopt_task_log.md",
+            title="Last Task Output",
+        )
+
+    elif not user_store.is_registered(user_id):  # same gate as DMs and mentions
+        onboarding_state[user_id] = "awaiting_auth_choice"
+        await respond(text=WELCOME_MSG)
+    else:
+        # Treat as a prompt
+        await respond(text="Processing...")
+        await _run_job(user_id, text, say_func=respond, channel=channel, thread_ts=None)
+
+
+# ─── Event Handlers ──────────────────────────────────────────────────
+
+
+@app.event("app_mention")
+async def handle_mention(event, say):
+    """Answer an @-mention: ask for input, start onboarding, or run the request."""
+    sender = event["user"]
+    prompt = strip_bot_mention(event.get("text", ""))
+    channel_id = event["channel"]
+    # Reply in the existing thread, or under the mentioning message itself.
+    reply_ts = event.get("thread_ts", event["ts"])
+
+    if not prompt:
+        await say(
+            text="How can I help? Try: `@modelopt quantize Qwen3-0.6B with nvfp4`",
+            thread_ts=reply_ts,
+        )
+    elif not user_store.is_registered(sender):
+        # Unknown user: record that we are waiting for their auth choice.
+        onboarding_state[sender] = "awaiting_auth_choice"
+        await say(text=WELCOME_MSG, thread_ts=reply_ts)
+    else:
+        # Registered user with an actual request.
+        await _run_job(sender, prompt, say_func=say, channel=channel_id, thread_ts=reply_ts)
+
+
+@app.event("message")
+async def handle_dm(event, say):
+    """Route a human-authored DM to the setup flows, the welcome message, or a job run."""
+    # Skip bot traffic, message subtypes, and anything outside a 1:1 DM.
+    if event.get("bot_id") or event.get("subtype") or event.get("channel_type") != "im":
+        return
+
+    sender = event.get("user")
+    if not sender:
+        return
+    body = event.get("text", "").strip()
+    if not body:
+        return
+
+    reply_ts = event.get("thread_ts", event["ts"])
+    channel_id = event["channel"]
+
+    # A user who is mid-onboarding or mid-cluster-setup gets first look at the message.
+    mid_setup = sender in onboarding_state or sender in cluster_setup_state
+    if mid_setup and await handle_onboarding_response(event, say):
+        return  # the setup flow consumed this message
+
+    if user_store.is_registered(sender):
+        await _run_job(sender, body, say_func=say, channel=channel_id, thread_ts=reply_ts)
+        return
+
+    # First contact from an unregistered user: start onboarding.
+    onboarding_state[sender] = "awaiting_auth_choice"
+    await say(text=WELCOME_MSG, thread_ts=reply_ts)
+
+
+# ─── Core Job Execution ─────────────────────────────────────────────
+
+
+async def _run_job(user_id: str, prompt: str, say_func, channel: str, thread_ts: str | None):
+    """Prepare the user's default workspace, then run Claude under a per-session lock."""
+    # Slack replies only carry thread_ts when there is a thread to reply into.
+    thread_kwargs = {"thread_ts": thread_ts} if thread_ts else {}
+    clusters_yaml = user_store.read_clusters_yaml(user_id)
+    ws_root = user_store.jobs_dir(user_id)
+
+    try:
+        workspace = await workspace_mgr.ensure_default_workspace(ws_root, clusters_yaml)
+    except Exception as e:
+        logger.error("Failed to set up workspace for user %s: %s", user_id, e)
+        await say_func(text=f":x: Failed to set up workspace: {e}", **thread_kwargs)
+        return
+
+    # Environment from the user store, plus the paths the agent is told about.
+    env = user_store.get_claude_env(user_id)
+    env["MODELOPT_WORKSPACE_ROOT"] = str(ws_root)
+    env["MODELOPT_REPO_DIR"] = str(workspace_mgr.repo_dir)
+
+    bot_context = (
+        f"You are running via the ModelOpt Slack bot in --print mode. "
+        f"Workspace root: {ws_root} (contains per-model workspaces). "
+        f"Upstream repo: {workspace_mgr.repo_dir} (read-only, use for fresh copies). "
+        f"Read skills/common/workspace-management.md before creating workspaces. "
+        f"Check existing workspaces with: ls $MODELOPT_WORKSPACE_ROOT/ "
+        f"IMPORTANT RESTRICTIONS: "
+        f"1. You are running in --print mode. CronCreate, background tasks, and Agent "
+        f"subagents are NOT available. They will silently fail. "
+        f"To monitor SLURM jobs, use sleep-based polling in a bash loop "
+        f"(e.g., while squeue -j JOBID | grep -q RUNNING; do sleep 60; done). "
+        f"2. SAFETY: You are running unattended — no human can approve actions. "
+        f"NEVER run destructive commands (rm -rf /, kill -9, fdisk, mkfs, etc.). "
+        f"NEVER modify files outside your workspace ({ws_root}) or the user's "
+        f"remote home directory. "
+        f"Do NOT modify the upstream repo ({workspace_mgr.repo_dir}). "
+        f"Do NOT modify system files, global configs, or other users' data. "
+        f"If a task seems risky or ambiguous, output a warning instead of proceeding."
+    )
+
+    # The session id is derived from (user, thread): messages in one thread share
+    # context, while a new top-level message (its own ts is the key) starts fresh.
+    # Without a thread_ts the literal 'ephemeral' is used as the thread part.
+    session_key = f"modelopt-slack-{user_id}-{thread_ts or 'ephemeral'}"
+    session_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, session_key))
+
+    # One lock per session so two Claude runs never use the same session at once.
+    session_lock = _session_locks.get(session_id)
+    if session_lock is None:
+        session_lock = _session_locks[session_id] = asyncio.Lock()
+
+    # Let the user know they are queued behind an in-flight request.
+    if session_lock.locked():
+        await say_func(
+            text=":hourglass: Previous request still running. Waiting for it to finish...",
+            **thread_kwargs,
+        )
+
+    async with session_lock:
+        logger.info("Session %s acquired for user %s", session_id[:8], user_id)
+        return await _run_job_inner(
+            user_id,
+            prompt,
+            say_func,
+            channel,
+            thread_ts,
+            workspace,
+            env,
+            bot_context,
+            session_id,
+        )
+
+
+async def _run_job_inner(
+    user_id,
+    prompt,
+    say_func,
+    channel,
+    thread_ts,
+    workspace,
+    env,
+    bot_context,
+    session_id,
+):
+    """Run Claude (called under session lock).
+
+    Accumulates the streamed text, saves it for /modelopt logs, and posts it to Slack.
+    """
+
+    async def _send_delayed_notice():
+        # Delayed "working on it" — only shown if the response takes > 5 seconds
+        await asyncio.sleep(5)
+        if thread_ts:
+            await say_func(
+                text=(
+                    ":rocket: Working on it — this may take a while."
+                    " I'll let you know when it's done."
+                ),
+                thread_ts=thread_ts,
+            )
+
+    notice_task = asyncio.create_task(_send_delayed_notice())
+
+    # Stream internally to keep idle detection alive. Only send final result to Slack.
+    full_response = ""
+    try:
+        async for chunk in run_claude_streaming(
+            prompt=prompt,
+            cwd=workspace,
+            env=env,
+            session_id=session_id,
+            system_prompt_extra=bot_context,
+        ):
+            full_response += chunk.text
+            if chunk.is_final:
+                break
+
+    except Exception as e:
+        full_response += f"\n\n:x: Failed: {e}"
+        logger.error("Request failed for user %s: %s", user_id, e)
+    finally:
+        notice_task.cancel()
+
+    logger.info("Session %s done for user %s", session_id[:8], user_id)
+
+    # Send final response
+    if not full_response.strip():
+        full_response = "No response from Claude."
+
+    # Save for /modelopt logs
+    _last_response[user_id] = full_response
+
+    kwargs = {"thread_ts": thread_ts} if thread_ts else {}
+    if channel and thread_ts and len(full_response) > MAX_SLACK_LENGTH:
+        await send_long_response(say_func, full_response, thread_ts, channel)
+    else:
+        await say_func(text=truncate(full_response), **kwargs)
+
+
+# ─── Auto Cleanup ────────────────────────────────────────────────────
+
+SESSION_MAX_AGE_DAYS = int(os.environ.get("SESSION_MAX_AGE_DAYS", "30"))
+CLEANUP_INTERVAL_HOURS = int(os.environ.get("CLEANUP_INTERVAL_HOURS", "6"))
+
+
+async def _auto_cleanup_loop():
+    """Periodically clean up old sessions and workspaces.
+
+    Runs every CLEANUP_INTERVAL_HOURS; a failed pass is logged and the loop continues.
+    """
+    import shutil
+    import time
+
+    while True:
+        await asyncio.sleep(CLEANUP_INTERVAL_HOURS * 3600)
+        try:
+            cutoff = time.time() - SESSION_MAX_AGE_DAYS * 86400
+            total_removed = 0
+
+            for uid in user_store.list_users():
+                # Clean old Claude sessions
+                sessions_dir = Path(user_store.get_claude_config_dir(uid)) / "projects"
+                if sessions_dir.exists():
+                    for entry in sessions_dir.iterdir():
+                        if entry.is_dir() and entry.stat().st_mtime < cutoff:
+                            # rmtree blocks; run it off the event loop
+                            await asyncio.to_thread(shutil.rmtree, entry, ignore_errors=True)
+                            total_removed += 1
+
+                # Clean workspaces not modified for SESSION_MAX_AGE_DAYS
+                total_removed += await workspace_mgr.cleanup_old(
+                    user_store.jobs_dir(uid), max_age_days=SESSION_MAX_AGE_DAYS
+                )
+
+            if total_removed:
+                logger.info("Auto-cleanup: removed %d old sessions/workspaces", total_removed)
+        except Exception as e:
+            logger.error("Auto-cleanup error: %s", e)
+
+
+# ─── Main ────────────────────────────────────────────────────────────
+
+
+async def main():
+    """Log startup diagnostics, start background cleanup, and run the Socket Mode handler."""
+    logger.info("Starting ModelOpt Slack Bot...")
+    logger.info("Repo dir: %s", REPO_DIR)
+    logger.info("Data dir: %s", DATA_DIR)
+
+    repo_root = Path(REPO_DIR)
+    if not repo_root.exists():
+        logger.error("Repo dir not found: %s", REPO_DIR)
+        return
+
+    skills_root = repo_root / ".claude" / "skills"
+    if skills_root.exists():
+        found = [p.name for p in skills_root.iterdir() if p.is_dir() and (p / "SKILL.md").exists()]
+        logger.info("Found skills: %s", ", ".join(found))
+
+    import shutil
+
+    claude_bin = shutil.which("claude")
+    if not claude_bin:
+        logger.error("Claude CLI not found in PATH — bot will not work")
+    else:
+        logger.info("Claude CLI: %s", claude_bin)
+
+    logger.info("Registered users: %d", len(user_store.list_users()))
+    logger.info(
+        "Auto-cleanup: every %dh, sessions older than %dd",
+        CLEANUP_INTERVAL_HOURS,
+        SESSION_MAX_AGE_DAYS,
+    )
+
+    # Hold a strong reference in _background_tasks so the task is not garbage-collected.
+    cleanup = asyncio.create_task(_auto_cleanup_loop())
+    _background_tasks.add(cleanup)
+    cleanup.add_done_callback(_background_tasks.discard)
+
+    await AsyncSocketModeHandler(app, SLACK_APP_TOKEN).start_async()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/slack-bot/job_manager.py b/slack-bot/job_manager.py
new file mode 100644
index 00000000000..63fecd78a4c
--- /dev/null
+++ b/slack-bot/job_manager.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""User workspace management.
+
+Each user gets a workspace root directory. Inside it, the agent creates
+model/task-specific subdirectories (e.g., qwen3-0.6b/, llama-3.1-8b-fp8/).
+Each subdirectory is a copy of the Model-Optimizer repo (no .git) that the
+agent can freely modify.
+
+The agent decides when to reuse an existing workspace vs create a fresh copy.
+This module provides the copy utility and cleanup logic; the actual decision
+is driven by skill instructions (see skills/common/workspace-management.md).
+
+Layout:
+    <workspace_root>/
+        qwen3-0.6b/                  ← agent-created, reused across PTQ/deploy/eval
+            .claude/skills/...
+            .claude/clusters.yaml    ← injected from user config
+            examples/...
+            output/                  ← PTQ output checkpoint
+        llama-3.1-8b-fp8/           ← different model, separate workspace
+"""
+
+import asyncio
+import logging
+import os
+import shutil
+import time
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+# Max age for workspaces before auto-cleanup (seconds). 0 = never.
+WORKSPACE_MAX_AGE = int(os.environ.get("WORKSPACE_MAX_AGE", str(30 * 24 * 3600)))  # 30 days
+
+# Max workspaces per user. 0 = unlimited.
+MAX_WORKSPACES_PER_USER = int(os.environ.get("MAX_WORKSPACES_PER_USER", "20"))
+
+# Rsync excludes when copying the repo
+COPY_EXCLUDES = [
+    ".git",
+    "__pycache__",
+    "*.pyc",
+    "*.pyo",
+    "node_modules",
+    "*.egg-info",
+    ".tox",
+    ".pytest_cache",
+    "dist",
+    "build",
+    "*.sqsh",
+    ".claude/clusters.yaml",  # per-user, never copy from repo
+]
+
+
+class WorkspaceManager:
+    """Manages per-user workspace roots and repo copies."""
+
+    def __init__(self, repo_dir: str | Path):
+        """Initialize the workspace manager.
+
+        Args:
+            repo_dir: Path to the shared Model-Optimizer repo (upstream, read-only).
+        """
+        self._repo_dir = Path(repo_dir)
+        if not self._repo_dir.exists():
+            raise FileNotFoundError(f"Repo dir not found: {self._repo_dir}")
+
+    @property
+    def repo_dir(self) -> Path:
+        """Return the upstream repo directory."""
+        return self._repo_dir
+
+    async def create_workspace(
+        self,
+        workspace_root: Path,
+        name: str,
+        clusters_yaml: str | None = None,
+    ) -> Path:
+        """Create a workspace (fresh repo copy) for a model/task, or reuse an existing one.
+
+        Args:
+            workspace_root: User's workspace root directory
+            name: Workspace name (e.g., "qwen3-0.6b", "llama-3.1-8b-fp8")
+            clusters_yaml: User's cluster config to inject
+
+        Returns:
+            Path to the created workspace directory.
+        """
+        workspace_root.mkdir(parents=True, exist_ok=True)
+
+        dest = workspace_root / name
+        if dest.exists():
+            logger.info("Workspace %s already exists, skipping copy", dest)
+        else:
+            # Enforce the limit only when a directory is actually being added;
+            # otherwise reusing a workspace could evict another one (or itself).
+            if MAX_WORKSPACES_PER_USER > 0:
+                await self._enforce_limit(workspace_root)
+            await self._copy_repo(dest)
+
+        # Remove any repo-level clusters.yaml (don't leak other users' config)
+        claude_dir = dest / ".claude"
+        repo_clusters = claude_dir / "clusters.yaml"
+        if repo_clusters.exists():
+            repo_clusters.unlink()
+
+        # Inject user's own cluster config if they have one
+        if clusters_yaml:
+            claude_dir.mkdir(exist_ok=True)
+            repo_clusters.write_text(clusters_yaml, encoding="utf-8")
+
+        return dest
+
+    async def ensure_default_workspace(
+        self,
+        workspace_root: Path,
+        clusters_yaml: str | None = None,
+    ) -> Path:
+        """Ensure there's at least one workspace (named 'default') for the user.
+
+        Used when the agent starts — the agent can then create model-specific
+        workspaces from within the session using the copy utility.
+        """
+        return await self.create_workspace(workspace_root, "default", clusters_yaml)
+
+    def list_workspaces(self, workspace_root: Path) -> list[dict]:
+        """Describe each workspace directory under workspace_root, sorted by path.
+
+        Each item has "name", "path", "modified" (local time, minute precision) and
+        "size_mb". Returns an empty list when workspace_root does not exist.
+        """
+        if not workspace_root.exists():
+            return []
+        stamp = "%Y-%m-%d %H:%M"
+        return [
+            {
+                "name": ws.name,
+                "path": str(ws),
+                "modified": time.strftime(stamp, time.localtime(ws.stat().st_mtime)),
+                "size_mb": self._dir_size_mb(ws),
+            }
+            for ws in sorted(workspace_root.iterdir())
+            if ws.is_dir()
+        ]
+
+    async def cleanup_old(self, workspace_root: Path, max_age_days: int | None = None) -> int:
+        """Remove workspaces older than max_age_days (0 = never); return the count removed."""
+        max_age_secs = max_age_days * 86400 if max_age_days is not None else WORKSPACE_MAX_AGE
+        if max_age_secs <= 0 or not workspace_root.exists():
+            return 0
+        cutoff = time.time() - max_age_secs
+        removed = 0
+        for entry in sorted(workspace_root.iterdir()):
+            if entry.is_dir() and entry.stat().st_mtime < cutoff:
+                logger.info("Cleaning up old workspace: %s", entry)
+                await asyncio.to_thread(shutil.rmtree, entry, ignore_errors=True)
+                removed += 1
+        return removed
+
+    async def copy_repo_to(self, dest: Path) -> None:
+        """Public interface for the agent to request a fresh repo copy."""
+        await self._copy_repo(dest)
+
+    # ── Internal ─────────────────────────────────────────────────────
+
+    async def _copy_repo(self, dest: Path) -> None:
+        """Rsync the upstream repo (minus COPY_EXCLUDES) into dest.
+
+        If the copy fails, a dest created by this call is removed so it is not reused.
+        """
+        existed = dest.exists()
+        dest.mkdir(parents=True, exist_ok=True)
+        cmd = ["rsync", "-a", "--quiet"]
+        for excl in COPY_EXCLUDES:
+            cmd += ["--exclude", excl]
+        cmd += [f"{self._repo_dir}/", f"{dest}/"]
+        try:
+            proc = await asyncio.create_subprocess_exec(
+                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+            )
+            _, stderr = await proc.communicate()
+            if proc.returncode != 0:
+                raise RuntimeError(f"Failed to copy repo: {stderr.decode(errors='replace')}")
+        except Exception:
+            if not existed:
+                await asyncio.to_thread(shutil.rmtree, dest, ignore_errors=True)
+            raise
+        logger.info("Copied repo to %s", dest)
+
+    async def _enforce_limit(self, workspace_root: Path) -> None:
+        """Evict oldest-mtime workspaces until the count is below MAX_WORKSPACES_PER_USER."""
+        if not workspace_root.exists():
+            return
+        candidates = [p for p in workspace_root.iterdir() if p.is_dir()]
+        candidates.sort(key=lambda p: p.stat().st_mtime)
+        # candidates is now oldest-first, so index 0 is always the next to go.
+        while len(candidates) >= MAX_WORKSPACES_PER_USER:
+            victim = candidates.pop(0)
+            logger.info("Removing oldest workspace to enforce limit: %s", victim)
+            await asyncio.to_thread(shutil.rmtree, victim, ignore_errors=True)
+
+    @staticmethod
+    def _dir_size_mb(path: Path) -> float:
+        """Return the combined size of all files under path in MiB, rounded to 0.1."""
+        size_bytes = 0
+        try:
+            for item in path.rglob("*"):
+                size_bytes += item.stat().st_size if item.is_file() else 0
+        except OSError:
+            pass  # best effort: keep whatever was summed before the error
+        return round(size_bytes / (1024 * 1024), 1)
diff --git a/slack-bot/key_store.py b/slack-bot/key_store.py
new file mode 100644
index 00000000000..9f328060e4b
--- /dev/null
+++ b/slack-bot/key_store.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
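+"""Encrypted per-user API key storage.
+
+Keys are AES-256-GCM encrypted at rest using a server-side master key.
+Each user's key is stored in a separate file under DATA_DIR/keys/<name>.enc,
+where <name> is the first 16 hex characters of SHA-256(user_id).
+
+If the `cryptography` package is not installed, keys are only XOR-obfuscated
+(record version 0). That fallback is NOT secure and is meant for dev setups only.
+
+The master key is read from the KEY_STORE_SECRET environment variable (32 bytes,
+encoded as hex or base64). If not set, a random key is generated and written to
+DATA_DIR/.master_key on first use (suitable for single-server dev setups, NOT for
+production).
+"""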
+
+import base64
+import hashlib
+import json
+import logging
+import os
+import secrets
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+class KeyStore:
+    """Manages encrypted per-user Anthropic API keys.
+
+    Each record is a small JSON file under ``<data_dir>/keys``. Version 1 records
+    are AES-GCM encrypted with the master key (needs the ``cryptography``
+    package); version 0 records are XOR-obfuscated only and are written when that
+    package is missing.
+    """
+
+    # Key lengths (in bytes) that AES-GCM accepts; generated keys are 32 bytes.
+    _AES_KEY_SIZES = (16, 24, 32)
+
+    def __init__(self, data_dir: str | Path):
+        """Initialize the key store with the given data directory.
+
+        Creates ``<data_dir>/keys`` if needed and loads (or generates) the master key.
+
+        Raises:
+            ValueError: If the configured master key cannot be decoded or has an
+                unsupported length.
+        """
+        self._data_dir = Path(data_dir)
+        self._keys_dir = self._data_dir / "keys"
+        self._keys_dir.mkdir(parents=True, exist_ok=True)
+        self._master_key = self._load_master_key()
+
+    # ── Public API ───────────────────────────────────────────────────
+
+    def store_key(self, user_id: str, api_key: str) -> None:
+        """Encrypt and store an API key for a Slack user.
+
+        Raises:
+            ValueError: If ``api_key`` does not start with ``sk-ant-``.
+        """
+        if not api_key.startswith("sk-ant-"):
+            raise ValueError("Invalid Anthropic API key format (expected sk-ant-...)")
+        encrypted = self._encrypt(api_key, user_id)
+        self._write_private(self._key_path(user_id), json.dumps(encrypted))
+        logger.info("Stored API key for user %s", user_id)
+
+    def get_key(self, user_id: str) -> str | None:
+        """Retrieve and decrypt the API key for a user.
+
+        Returns None if no key is stored or if the record cannot be read or
+        decrypted (the failure is logged).
+        """
+        path = self._key_path(user_id)
+        if not path.exists():
+            return None
+        try:
+            data = json.loads(path.read_text(encoding="utf-8"))
+            return self._decrypt(data, user_id)
+        except Exception:
+            logger.exception("Failed to decrypt key for user %s", user_id)
+            return None
+
+    def remove_key(self, user_id: str) -> bool:
+        """Remove stored key. Returns True if key existed."""
+        path = self._key_path(user_id)
+        if path.exists():
+            path.unlink()
+            logger.info("Removed API key for user %s", user_id)
+            return True
+        return False
+
+    def has_key(self, user_id: str) -> bool:
+        """Return True if an encrypted key exists for this user."""
+        return self._key_path(user_id).exists()
+
+    # ── Internal ─────────────────────────────────────────────────────
+
+    @staticmethod
+    def _write_private(path: Path, text: str) -> None:
+        """Write ``text`` to ``path`` so that only the owner can read it.
+
+        The file is created with mode 0600 up front (instead of write-then-chmod)
+        so the secret is never briefly readable under a permissive umask.
+        """
+        fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
+        with os.fdopen(fd, "w", encoding="utf-8") as handle:
+            handle.write(text)
+        # O_CREAT's mode is ignored for pre-existing files; tighten those too.
+        path.chmod(0o600)
+
+    def _key_path(self, user_id: str) -> Path:
+        """Return the record path for ``user_id``."""
+        # Use a hash of user_id for the filename to avoid path-injection
+        safe_name = hashlib.sha256(user_id.encode()).hexdigest()[:16]
+        return self._keys_dir / f"{safe_name}.enc"
+
+    def _check_master_key(self, key: bytes, source: str) -> bytes:
+        """Return ``key`` unchanged, or raise ValueError if its length is unusable."""
+        if len(key) not in self._AES_KEY_SIZES:
+            raise ValueError(
+                f"Master key from {source} must be 16, 24 or 32 bytes, got {len(key)}"
+            )
+        return key
+
+    def _load_master_key(self) -> bytes:
+        """Load or generate the master encryption key.
+
+        Order: KEY_STORE_SECRET (hex, else base64), then ``<data_dir>/.master_key``,
+        else a fresh random 32-byte key that is persisted to that file.
+        """
+        env_key = os.environ.get("KEY_STORE_SECRET", "").strip()
+        if env_key:
+            # Accept hex (64 chars) or base64 (44 chars)
+            try:
+                key = bytes.fromhex(env_key)
+            except ValueError:
+                key = base64.b64decode(env_key)
+            return self._check_master_key(key, "KEY_STORE_SECRET")
+
+        # Dev fallback: file-based key
+        key_file = self._data_dir / ".master_key"
+        if key_file.exists():
+            stored = bytes.fromhex(key_file.read_text().strip())
+            return self._check_master_key(stored, str(key_file))
+
+        key = secrets.token_bytes(32)
+        self._write_private(key_file, key.hex())
+        logger.warning(
+            "Generated master key at %s. Set KEY_STORE_SECRET env var in production.",
+            key_file,
+        )
+        return key
+
+    def _encrypt(self, plaintext: str, aad: str) -> dict:
+        """Encrypt ``plaintext`` with AES-GCM, binding it to ``aad``.
+
+        Returns a version 1 dict with base64 ``nonce`` and ``ct`` (the value
+        returned by ``AESGCM.encrypt``). Without ``cryptography`` a version 0
+        XOR-obfuscated record is returned instead.
+        """
+        try:
+            from cryptography.hazmat.primitives.ciphers.aead import AESGCM
+        except ImportError:
+            # Fallback: XOR-based obfuscation (NOT secure, but functional for dev)
+            logger.warning(
+                "cryptography package not installed; storing key with insecure XOR obfuscation"
+            )
+            return self._encrypt_fallback(plaintext, aad)
+
+        nonce = secrets.token_bytes(12)
+        aesgcm = AESGCM(self._master_key)
+        ct = aesgcm.encrypt(nonce, plaintext.encode(), aad.encode())
+        return {
+            "v": 1,
+            "nonce": base64.b64encode(nonce).decode(),
+            "ct": base64.b64encode(ct).decode(),
+        }
+
+    def _decrypt(self, data: dict, aad: str) -> str:
+        """Decrypt a stored record according to its ``v`` field.
+
+        Raises:
+            RuntimeError: For a version 1 record when ``cryptography`` is missing.
+                Such a record is AES-GCM ciphertext and must not be run through
+                the XOR fallback, which would only produce garbage.
+            ValueError: For an unknown record version.
+        """
+        version = data.get("v")
+        if version == 1:
+            try:
+                from cryptography.hazmat.primitives.ciphers.aead import AESGCM
+            except ImportError as exc:
+                raise RuntimeError(
+                    "cryptography package is required to decrypt AES-GCM (v1) records"
+                ) from exc
+            aesgcm = AESGCM(self._master_key)
+            nonce = base64.b64decode(data["nonce"])
+            ct = base64.b64decode(data["ct"])
+            return aesgcm.decrypt(nonce, ct, aad.encode()).decode()
+        if version == 0:
+            return self._decrypt_fallback(data, aad)
+        raise ValueError(f"Unknown key store version: {version}")
+
+    # Dev-only fallback when cryptography package isn't installed
+    def _xor_with_keystream(self, data: bytes, aad: str) -> bytes:
+        """XOR ``data`` with a repeating SHA-256(master_key + aad) pad (symmetric)."""
+        key_material = hashlib.sha256(self._master_key + aad.encode()).digest()
+        pad = key_material * (len(data) // 32 + 1)
+        return bytes(a ^ b for a, b in zip(data, pad))
+
+    def _encrypt_fallback(self, plaintext: str, aad: str) -> dict:
+        """Return a version 0 (XOR-obfuscated, NOT secure) record for ``plaintext``."""
+        encrypted = self._xor_with_keystream(plaintext.encode(), aad)
+        return {"v": 0, "ct": base64.b64encode(encrypted).decode()}
+
+    def _decrypt_fallback(self, data: dict, aad: str) -> str:
+        """Recover the plaintext of a version 0 record."""
+        encrypted = base64.b64decode(data["ct"])
+        return self._xor_with_keystream(encrypted, aad).decode()
diff --git a/slack-bot/requirements.txt b/slack-bot/requirements.txt
new file mode 100644
index 00000000000..9d3b673aab4
--- /dev/null
+++ b/slack-bot/requirements.txt
@@ -0,0 +1,3 @@
+cryptography>=41.0  # optional: for AES-256-GCM key encryption (falls back to XOR without it)
+pyyaml>=6.0
+slack-bolt>=1.18.0
diff --git a/slack-bot/session_manager.py b/slack-bot/session_manager.py
new file mode 100644
index 00000000000..f69a1a64093
--- /dev/null
+++ b/slack-bot/session_manager.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Claude Code subprocess runner for jobs.
+
+Runs Claude CLI in a job's working directory with the user's auth credentials.
+Uses streaming output so progress is visible in real time.
+"""
+
+import asyncio
+import contextlib
+import json
+import logging
+import os
+import shutil
+from dataclasses import dataclass
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+# If no output for this long, assume the process is stuck.
+# Must be generous — agent may sleep/poll for long-running SLURM jobs.
+IDLE_TIMEOUT = int(os.environ.get("CLAUDE_IDLE_TIMEOUT", "7200"))  # 2 hours
+
+# Track sessions that have been created (--session-id used once).
+# Subsequent calls use --resume to avoid "session already in use" error.
+_known_sessions: set[str] = set()
+
+
+@dataclass
+class StreamChunk:
+    """A chunk of Claude's streamed output."""
+
+    # Text to show for this chunk (empty on the final chunk of a successful run).
+    text: str
+    # True on the last chunk emitted for a run.
+    is_final: bool = False
+    # True when the chunk reports a failure instead of normal output.
+    is_error: bool = False
+
+
+def _parse_stream_line(raw: bytes) -> str:
+    """Turn one raw stdout line into displayable text ('' if there is nothing to show).
+
+    JSON objects are rendered through ``_extract_text_from_event``; anything
+    else (plain text, or JSON that is not an object) is passed through as-is.
+    """
+    line = raw.decode(errors="replace").strip()
+    if not line:
+        return ""
+    try:
+        event = json.loads(line)
+    except json.JSONDecodeError:
+        return line
+    if not isinstance(event, dict):
+        return line
+    return _extract_text_from_event(event)
+
+
+async def run_claude_streaming(
+    prompt: str,
+    cwd: Path,
+    env: dict[str, str],
+    session_id: str | None = None,
+    idle_timeout: int = IDLE_TIMEOUT,
+    system_prompt_extra: str | None = None,
+):
+    """Run Claude CLI with streaming output.
+
+    Yields StreamChunk objects as Claude produces output. The last chunk always
+    has ``is_final=True``; failures (CLI missing, start-up error, idle timeout,
+    read error, non-zero exit) are reported as a final chunk with ``is_error=True``.
+    No total timeout — only kills if idle for idle_timeout seconds.
+
+    Args:
+        prompt: Prompt passed to the CLI.
+        cwd: Working directory for the subprocess.
+        env: Full environment for the subprocess.
+        session_id: Optional session id forwarded to ``_build_cmd``.
+        idle_timeout: Seconds without stdout output before the process is killed.
+        system_prompt_extra: Optional text forwarded to ``_build_cmd``.
+    """
+    logger.info("Running claude (streaming) in %s: %.80s...", cwd, prompt)
+
+    try:
+        # _build_cmd raises FileNotFoundError when the CLI is missing; report that
+        # through the stream like any other start-up failure.
+        cmd = _build_cmd(prompt, session_id, streaming=True, system_prompt_extra=system_prompt_extra)
+        proc = await asyncio.create_subprocess_exec(
+            *cmd,
+            cwd=str(cwd),
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+            env=env,
+        )
+    except Exception as e:
+        yield StreamChunk(text=f"Error starting Claude: {e}", is_final=True, is_error=True)
+        return
+
+    assert proc.stdout is not None and proc.stderr is not None  # guaranteed by PIPE
+
+    # Drain stderr concurrently; otherwise a chatty child can fill the pipe and
+    # block forever while we only read stdout.
+    stderr_task = asyncio.create_task(proc.stderr.read())
+
+    async def _stop() -> None:
+        """Kill the child if it is still running, reap it, stop the stderr reader."""
+        if proc.returncode is None:
+            with contextlib.suppress(Exception):
+                proc.kill()
+            with contextlib.suppress(Exception):
+                # Bounded wait so a stuck pipe cannot hang the cleanup.
+                await asyncio.wait_for(proc.wait(), timeout=5)
+        if not stderr_task.done():
+            stderr_task.cancel()
+
+    pending = b""  # bytes of the current, not yet newline-terminated line
+    stderr = b""
+    try:
+        while True:
+            try:
+                chunk = await asyncio.wait_for(proc.stdout.read(4096), timeout=idle_timeout)
+            except asyncio.TimeoutError:
+                # No output for idle_timeout — kill process
+                logger.error("Claude idle for %ds, killing", idle_timeout)
+                await _stop()
+                yield StreamChunk(
+                    text=f"\n\nNo output for {idle_timeout // 60}m — process appears stuck. Killed.",
+                    is_final=True,
+                    is_error=True,
+                )
+                return
+
+            if not chunk:
+                break  # EOF — process finished
+
+            # Split on raw bytes and decode per line, so a multi-byte character
+            # that straddles two reads is never decoded in halves.
+            pending += chunk
+            *complete, pending = pending.split(b"\n")
+            for raw_line in complete:
+                text = _parse_stream_line(raw_line)
+                if text:
+                    yield StreamChunk(text=text)
+
+        # Output may end without a trailing newline; do not drop that last line.
+        tail = _parse_stream_line(pending)
+        if tail:
+            yield StreamChunk(text=tail)
+
+        await proc.wait()
+        stderr = await stderr_task
+    except Exception as e:
+        logger.error("Stream read error: %s", e)
+        await _stop()
+        yield StreamChunk(
+            text=f"\nStream error: {e}",
+            is_final=True,
+            is_error=True,
+        )
+        return
+    finally:
+        # Also covers the consumer closing the generator early.
+        await _stop()
+
+    if proc.returncode != 0:
+        stderr_text = stderr.decode(errors="replace")[:500]
+        yield StreamChunk(
+            text=f"\nClaude CLI error (exit {proc.returncode}): {stderr_text}",
+            is_final=True,
+            is_error=True,
+        )
+    else:
+        yield StreamChunk(text="", is_final=True)
+
+
+def _build_cmd(
+    prompt: str,
+    session_id: str | None = None,
+    streaming: bool = False,
+    system_prompt_extra: str | None = None,
+) -> list[str]:
+    """Assemble the argv list used to launch the ``claude`` CLI.
+
+    A ``session_id`` seen for the first time is passed as ``--session-id`` and
+    remembered in ``_known_sessions``; later calls pass it as ``--resume``.
+
+    Raises:
+        FileNotFoundError: If no ``claude`` executable is found on PATH.
+    """
+    executable = shutil.which("claude")
+    if not executable:
+        raise FileNotFoundError("`claude` CLI not found in PATH")
+
+    argv = [
+        executable,
+        "--print",
+        "--dangerously-skip-permissions",
+        "--disallowed-tools", "CronCreate,CronDelete,CronList,Agent",
+    ]
+
+    if system_prompt_extra:
+        argv += ["--append-system-prompt", system_prompt_extra]
+
+    if streaming:
+        argv += ["--output-format", "stream-json", "--verbose"]
+
+    if session_id:
+        if session_id not in _known_sessions:
+            # First use of this id: create the session and remember it.
+            _known_sessions.add(session_id)
+            argv += ["--session-id", session_id]
+        else:
+            argv += ["--resume", session_id]
+
+    argv += ["-p", prompt]
+    return argv
+
+
+def _extract_text_from_event(event: dict) -> str:
+    """Extract displayable text from a stream-json event.
+
+    Only ``assistant`` events produce text: their text blocks are concatenated,
+    and Bash/Read/Edit/Write tool calls are rendered as short activity markers.
+    Every other event type (including ``result``) yields an empty string.
+
+    Malformed input (non-dict event or message, string content, non-dict blocks,
+    a text block without ``text``) is tolerated rather than raising, because an
+    exception here would abort the caller's whole stream.
+    """
+    if not isinstance(event, dict) or event.get("type") != "assistant":
+        return ""
+    message = event.get("message")
+    if not isinstance(message, dict):
+        return ""
+
+    content = message.get("content", [])
+    if isinstance(content, str):
+        # Plain-string content: show it as-is.
+        return content
+    if not isinstance(content, list):
+        return ""
+
+    file_tool_verbs = {"Read": "Reading", "Edit": "Editing", "Write": "Writing"}
+    parts = []
+    for block in content:
+        if not isinstance(block, dict):
+            continue
+        block_type = block.get("type")
+        if block_type == "text":
+            text = block.get("text")
+            if isinstance(text, str):
+                parts.append(text)
+        elif block_type == "tool_use":
+            # Show tool usage so user sees activity during long waits
+            tool = block.get("name", "")
+            tool_input = block.get("input")
+            if not isinstance(tool_input, dict):
+                tool_input = {}
+            if tool == "Bash":
+                cmd = tool_input.get("command", "")
+                if cmd:
+                    # Truncate long commands
+                    cmd_short = cmd[:120] + "..." if len(cmd) > 120 else cmd
+                    parts.append(f"\n`$ {cmd_short}`\n")
+            elif tool in file_tool_verbs:
+                path = tool_input.get("file_path", "")
+                parts.append(f"\n_{file_tool_verbs[tool]} {path}_\n")
+    return "".join(parts)
diff --git a/slack-bot/user_store.py b/slack-bot/user_store.py
new file mode 100644
index 00000000000..2011f3b7e61
--- /dev/null
+++ b/slack-bot/user_store.py
@@ -0,0 +1,383 @@
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Per-user data management: auth, cluster config, onboarding state.
+
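+Directory layout per user:
+    <data_dir>/users/<slack_uid>/
+        auth.json          — auth method (plus config_dir for login auth)
+        env                — personal env vars, one KEY=value per line
+        claude-config/     — credentials copied from `claude auth login`
+        clusters.yaml      — SSH/cluster configs
+        jobs/              — per-job working directories
+
+User-provided API keys are not stored here; they are kept encrypted by KeyStore.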
+"""
+
+import json
+import logging
+import os
+from enum import Enum
+from pathlib import Path
+
+from key_store import KeyStore
+
+logger = logging.getLogger(__name__)
+
+
+class AuthMethod(str, Enum):
+    """Authentication method choices for a user.
+
+    Members are also ``str`` instances; the member value is what ends up in
+    ``auth.json`` under the ``method`` key.
+    """
+
+    SHARED_KEY = "shared_key"  # Use the server's default ANTHROPIC_API_KEY
+    OWN_KEY = "own_key"  # User provided their own sk-ant-... key
+    LOGIN = "login"  # User authenticated via `claude auth login` (headless browser flow)
+
+
+class UserStore:
+    """Manages per-user data: auth credentials, cluster configs, onboarding state.
+
+    Per-user files live under ``<data_dir>/users/<user_id>/``; model download
+    caches are shared by all users under ``<data_dir>/shared-cache``.
+    """
+
+    def __init__(self, data_dir: str | Path, key_store: KeyStore):
+        """Initialize the user store with the given data directory and key store."""
+        self._data_dir = Path(data_dir)
+        self._users_dir = self._data_dir / "users"
+        self._users_dir.mkdir(parents=True, exist_ok=True)
+        self._key_store = key_store
+
+        # Shared cache for large model downloads (all users share this)
+        self._shared_cache = self._data_dir / "shared-cache"
+        self._shared_cache.mkdir(parents=True, exist_ok=True)
+        (self._shared_cache / "huggingface").mkdir(exist_ok=True)
+        (self._shared_cache / "torch").mkdir(exist_ok=True)
+
+    # ── User Directory ───────────────────────────────────────────────
+
+    def user_dir(self, user_id: str) -> Path:
+        """Return the base directory for a user's data.
+
+        Raises:
+            ValueError: If ``user_id`` is empty or could escape the users
+                directory (``.``, ``..``, or contains a path separator / NUL).
+        """
+        if user_id in ("", ".", "..") or any(ch in user_id for ch in ("/", "\\", "\0")):
+            raise ValueError(f"Invalid user id: {user_id!r}")
+        return self._users_dir / user_id
+
+    def jobs_dir(self, user_id: str) -> Path:
+        """Return (and create) the jobs/workspace root directory for a user."""
+        d = self.user_dir(user_id) / "jobs"
+        d.mkdir(parents=True, exist_ok=True)
+        return d
+
+    def is_registered(self, user_id: str) -> bool:
+        """Return True if the user has completed onboarding (auth.json exists)."""
+        return (self.user_dir(user_id) / "auth.json").exists()
+
+    # ── Auth ─────────────────────────────────────────────────────────
+
+    def setup_shared_key(self, user_id: str) -> None:
+        """Register user with the shared/default API key."""
+        self._ensure_user_dir(user_id)
+        self._write_auth(user_id, {"method": AuthMethod.SHARED_KEY})
+        logger.info("User %s registered with shared key", user_id)
+
+    def setup_own_key(self, user_id: str, api_key: str) -> None:
+        """Register user with their own Anthropic API key (stored via the key store)."""
+        self._ensure_user_dir(user_id)
+        self._key_store.store_key(user_id, api_key)
+        self._write_auth(user_id, {"method": AuthMethod.OWN_KEY})
+        logger.info("User %s registered with own API key", user_id)
+
+    def setup_login_auth(self, user_id: str, config_dir: str) -> None:
+        """Register user who authenticated via `claude auth login`.
+
+        The config_dir contains the .credentials.json from the login flow.
+        Its contents are copied to ``<user_dir>/claude-config`` and that path is
+        stored so we can set CLAUDE_CONFIG_DIR when running Claude.
+        """
+        import shutil
+
+        self._ensure_user_dir(user_id)
+        user_auth_dir = self.user_dir(user_id) / "claude-config"
+        # If the login flow already wrote into the persistent dir, there is nothing
+        # to copy. Deleting it first would destroy the credentials we are about
+        # to register.
+        already_in_place = (
+            user_auth_dir.exists() and Path(config_dir).resolve() == user_auth_dir.resolve()
+        )
+        if not already_in_place:
+            if user_auth_dir.exists():
+                shutil.rmtree(user_auth_dir)
+            shutil.copytree(config_dir, str(user_auth_dir))
+        self._write_auth(user_id, {"method": AuthMethod.LOGIN, "config_dir": str(user_auth_dir)})
+        logger.info("User %s registered with claude login auth", user_id)
+
+    def get_auth_method(self, user_id: str) -> AuthMethod | None:
+        """Return the auth method for this user, or None if not registered."""
+        auth = self._read_auth(user_id)
+        if auth is None:
+            return None
+        return AuthMethod(auth["method"])
+
+    def get_api_key(self, user_id: str) -> str | None:
+        """Get the API key to use for this user's Claude session.
+
+        Returns None for unregistered users and for login-based auth.
+        """
+        auth = self._read_auth(user_id)
+        if auth is None:
+            return None
+
+        method = AuthMethod(auth["method"])
+        if method == AuthMethod.SHARED_KEY:
+            # Use server's default key
+            return os.environ.get("ANTHROPIC_API_KEY")
+        if method == AuthMethod.OWN_KEY:
+            return self._key_store.get_key(user_id)
+        return None
+
+    def get_claude_env(self, user_id: str) -> dict[str, str]:
+        """Build environment variables for this user's Claude subprocess.
+
+        Loads:
+        1. System env
+        2. Shared cache paths (HF_HOME, TORCH_HOME — shared across users)
+        3. User's personal env file (HF_TOKEN, NGC credentials, etc.)
+        4. Claude auth credentials
+
+        Note: user env vars can override shared cache paths if needed.
+        """
+        env = os.environ.copy()
+
+        # Set shared cache dirs (large model downloads shared across users)
+        shared_cache = str(self._shared_cache)
+        env.setdefault("HF_HOME", f"{shared_cache}/huggingface")
+        env.setdefault("TORCH_HOME", f"{shared_cache}/torch")
+        env.setdefault("TRANSFORMERS_CACHE", f"{shared_cache}/huggingface/hub")
+
+        # Load user's personal env vars (can override cache paths)
+        env.update(self._read_env_file(user_id))
+
+        # Apply Claude auth
+        auth = self._read_auth(user_id)
+        if auth is None:
+            return env
+
+        method = AuthMethod(auth["method"])
+        if method == AuthMethod.SHARED_KEY:
+            # ANTHROPIC_API_KEY already in env (server default)
+            pass
+        elif method == AuthMethod.OWN_KEY:
+            key = self._key_store.get_key(user_id)
+            if key:
+                env["ANTHROPIC_API_KEY"] = key
+            else:
+                # Whatever ANTHROPIC_API_KEY the environment already carries stays in effect.
+                logger.warning("No stored API key could be loaded for user %s", user_id)
+        elif method == AuthMethod.LOGIN:
+            # Point Claude CLI at the user's stored credentials
+            # NOTE(review): an inherited ANTHROPIC_API_KEY is left in env here;
+            # confirm the CLI prefers the CLAUDE_CONFIG_DIR credentials over it.
+            config_dir = auth.get("config_dir", "")
+            if config_dir:
+                env["CLAUDE_CONFIG_DIR"] = config_dir
+
+        return env
+
+    def set_env_var(self, user_id: str, key: str, value: str):
+        """Set a personal env var for this user.
+
+        The env file is rewritten with entries sorted by name and is only
+        readable by the owner, since it typically holds tokens.
+
+        Raises:
+            ValueError: If ``key`` is empty, starts with ``#``, contains ``=``, or
+                if key/value contain a line break (which would inject extra
+                entries into the line-based env file).
+        """
+        entry = f"{key}={value}"
+        if (
+            not key.strip()
+            or "=" in key
+            or key.strip().startswith("#")
+            or entry.splitlines() != [entry]
+        ):
+            raise ValueError(f"Invalid env var name or value for {key!r}")
+
+        # The user may not have been onboarded yet (e.g. credential import).
+        self._ensure_user_dir(user_id)
+        existing = self._read_env_file(user_id)
+        existing[key] = value
+        body = "\n".join(f"{k}={v}" for k, v in sorted(existing.items())) + "\n"
+        self._write_private(self.user_dir(user_id) / "env", body)
+
+    def get_env_vars(self, user_id: str) -> dict[str, str]:
+        """List user's personal env vars (values masked).
+
+        Values longer than 4 characters are shown as their first 4 characters
+        plus ``...``; shorter values are returned unchanged.
+        """
+        masked = {}
+        for name, value in self._read_env_file(user_id).items():
+            masked[name] = (value[:4] + "...") if len(value) > 4 else value
+        return masked
+
+    def remove_env_var(self, user_id: str, key: str) -> bool:
+        """Remove a personal env var. Returns True if an entry was removed."""
+        env_file = self.user_dir(user_id) / "env"
+        if not env_file.exists():
+            return False
+        kept = []
+        removed = False
+        for line in env_file.read_text().splitlines():
+            stripped = line.strip()
+            is_entry = "=" in stripped and not stripped.startswith("#")
+            # Compare the parsed name, so "KEY = value" matches like it does on read.
+            if is_entry and stripped.partition("=")[0].strip() == key:
+                removed = True
+            else:
+                kept.append(line)
+        if removed:
+            self._write_private(env_file, ("\n".join(kept) + "\n") if kept else "")
+        return removed
+
+    def get_claude_config_dir(self, user_id: str) -> str:
+        """Return the path to this user's Claude config directory.
+
+        Uses the ``config_dir`` recorded in auth.json when present, otherwise
+        the default ``<user_dir>/claude-config``.
+        """
+        auth = self._read_auth(user_id)
+        if auth and auth.get("config_dir"):
+            return auth["config_dir"]
+        return str(self.user_dir(user_id) / "claude-config")
+
+    def remove_auth(self, user_id: str) -> bool:
+        """Remove user's auth credentials. Returns True if auth.json existed.
+
+        NOTE(review): the ``claude-config`` directory created by login auth is
+        left on disk; confirm whether it should be deleted here as well.
+        """
+        self._key_store.remove_key(user_id)
+        auth_file = self.user_dir(user_id) / "auth.json"
+        if auth_file.exists():
+            auth_file.unlink()
+            return True
+        return False
+
+    # ── Cluster Config ───────────────────────────────────────────────
+
+    def get_clusters_yaml_path(self, user_id: str) -> Path:
+        """Return the path to the user's cluster config file."""
+        return self.user_dir(user_id) / "clusters.yaml"
+
+    def has_clusters(self, user_id: str) -> bool:
+        """Return True if the user has a cluster config file."""
+        return self.get_clusters_yaml_path(user_id).exists()
+
+    def save_clusters_yaml(self, user_id: str, content: str) -> None:
+        """Write cluster config for a user."""
+        self._ensure_user_dir(user_id)
+        path = self.get_clusters_yaml_path(user_id)
+        path.write_text(content, encoding="utf-8")
+        logger.info("Saved cluster config for user %s", user_id)
+
+    def read_clusters_yaml(self, user_id: str) -> str | None:
+        """Read and return the user's cluster config, or None if not set."""
+        path = self.get_clusters_yaml_path(user_id)
+        if path.exists():
+            return path.read_text(encoding="utf-8")
+        return None
+
+    # ── User Info ────────────────────────────────────────────────────
+
+    def user_info(self, user_id: str) -> dict | None:
+        """Get summary info about a user, or None if the user is not registered."""
+        if not self.is_registered(user_id):
+            return None
+        auth = self._read_auth(user_id)
+        jobs_path = self.user_dir(user_id) / "jobs"
+        # Counts every entry directly under jobs/.
+        job_count = len(list(jobs_path.iterdir())) if jobs_path.exists() else 0
+        return {
+            "user_id": user_id,
+            "auth_method": auth.get("method", "unknown") if auth else "unknown",
+            "has_clusters": self.has_clusters(user_id),
+            "job_count": job_count,
+        }
+
+    def list_users(self) -> list[str]:
+        """List all registered user IDs (directories that contain auth.json)."""
+        if not self._users_dir.exists():
+            return []
+        return [
+            d.name for d in self._users_dir.iterdir() if d.is_dir() and (d / "auth.json").exists()
+        ]
+
+    # ── Credential Import ──────────────────────────────────────────
+
+    def scan_local_credentials(self, home_dir: str) -> dict[str, str]:
+        """Scan a local home directory for known credentials.
+
+        Returns dict of {ENV_VAR_NAME: value} for found credentials.
+        Keys starting with ``_`` are metadata (e.g. ``_DOCKER_CONFIG`` holds a
+        path, not a secret). Only reads, never modifies.
+        """
+        found: dict[str, str] = {}
+        home = Path(home_dir)
+        if not home.is_dir():
+            return found
+
+        # HuggingFace token
+        for hf_path in [
+            home / ".cache" / "huggingface" / "token",
+            home / ".huggingface" / "token",
+        ]:
+            if hf_path.exists():
+                token = hf_path.read_text(encoding="utf-8").strip()
+                if token:
+                    found["HF_TOKEN"] = token
+                    break
+
+        # NGC API key
+        ngc_config = home / ".ngc" / "config"
+        if ngc_config.exists():
+            for line in ngc_config.read_text(encoding="utf-8").splitlines():
+                if line.strip().startswith("apikey"):
+                    _, _, val = line.partition("=")
+                    val = val.strip()
+                    if val:
+                        found["NGC_API_KEY"] = val
+                        break
+
+        # Docker config (for registry auth — just note it exists, don't extract)
+        docker_config = home / ".docker" / "config.json"
+        if docker_config.exists():
+            found["_DOCKER_CONFIG"] = str(docker_config)
+
+        return found
+
+    def import_credentials(self, user_id: str, creds: dict[str, str]) -> list[str]:
+        """Import scanned credentials into user's env file.
+
+        Returns list of imported variable names.
+        """
+        imported = []
+        for key, value in creds.items():
+            if key.startswith("_"):
+                continue  # Skip metadata entries like _DOCKER_CONFIG
+            self.set_env_var(user_id, key, value)
+            imported.append(key)
+        return imported
+
+    def resolve_home_dir(self, username: str) -> str | None:
+        """Resolve a local username to their home directory, or None if not found."""
+        import pwd
+
+        try:
+            pw = pwd.getpwnam(username)
+            if Path(pw.pw_dir).is_dir():
+                return pw.pw_dir
+        except KeyError:
+            pass
+
+        # Fallback: check common paths
+        for prefix in ["/home", "/home/scratch." + username]:
+            candidate = Path(prefix) / username
+            if candidate.is_dir():
+                return str(candidate)
+        return None
+
+    # ── Internal ─────────────────────────────────────────────────────
+
+    @staticmethod
+    def _write_private(path: Path, text: str, encoding: str | None = None) -> None:
+        """Write ``text`` to ``path`` with owner-only (0600) permissions.
+
+        The file is created with mode 0600 up front so secrets are never
+        briefly readable under a permissive umask.
+        """
+        fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
+        with os.fdopen(fd, "w", encoding=encoding) as handle:
+            handle.write(text)
+        # O_CREAT's mode is ignored for pre-existing files; tighten those too.
+        path.chmod(0o600)
+
+    def _read_env_file(self, user_id: str) -> dict[str, str]:
+        """Parse the user's env file into {name: value}.
+
+        Blank lines, ``#`` comments and lines without ``=`` are ignored; names
+        and values are stripped of surrounding whitespace.
+        """
+        env_file = self.user_dir(user_id) / "env"
+        parsed: dict[str, str] = {}
+        if not env_file.exists():
+            return parsed
+        for line in env_file.read_text().splitlines():
+            line = line.strip()
+            if not line or line.startswith("#") or "=" not in line:
+                continue
+            name, _, value = line.partition("=")
+            parsed[name.strip()] = value.strip()
+        return parsed
+
+    def _ensure_user_dir(self, user_id: str):
+        """Create the user's directory and its jobs/ sub-directory if missing."""
+        d = self.user_dir(user_id)
+        d.mkdir(parents=True, exist_ok=True)
+        (d / "jobs").mkdir(exist_ok=True)
+
+    def _write_auth(self, user_id: str, auth: dict):
+        """Persist ``auth`` as auth.json with owner-only permissions."""
+        path = self.user_dir(user_id) / "auth.json"
+        self._write_private(path, json.dumps(auth, indent=2), encoding="utf-8")
+
+    def _read_auth(self, user_id: str) -> dict | None:
+        """Return the parsed auth.json for the user, or None if it does not exist."""
+        path = self.user_dir(user_id) / "auth.json"
+        if not path.exists():
+            return None
+        return json.loads(path.read_text(encoding="utf-8"))