1 change: 1 addition & 0 deletions .gitignore
@@ -76,6 +76,7 @@ cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.cpp
.devcontainer/.env
/examples/layer_wise_benchmarks/autotuner_cache/
/examples/layer_wise_benchmarks/profiles/

# User config files
67 changes: 50 additions & 17 deletions examples/layer_wise_benchmarks/README.md
@@ -15,6 +15,9 @@ pip install -e ../..
**Step 3:** In the container, run benchmarks and generate profiles:

```bash
# Set autotune cache path
export TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache

# Run DeepSeek-R1 NVFP4
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml
@@ -24,16 +27,16 @@ NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM

# Run DeepSeek-V3.2-Exp with 32k context length
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --seq-len-kv-cache 32769
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM --seq-len-kv-cache 32769

# Run with attention TP
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --no-enable-attention-dp
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --no-enable-attention-dp

# Run with attention TP and TRTLLMGen
NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified
NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified
NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM
NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM

# Run with MTP3
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --batch-size 32 --seq-len-q 4
@@ -48,7 +51,7 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WID
# Scale TEP=16 to 4 GPUs: reduce the number of attention heads and experts
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp

# Run Qwen3-Next (balanced routing is not implemented)
# Run Qwen3-Next
NP=2 ./mpi_launch.sh ./run.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --batch-size 4
NP=2 ./mpi_launch.sh ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --batch-size 512

@@ -69,42 +72,61 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedExperts
### Run with Slurm

> Tips:
> 1. If you have a running Slurm job, please skip step 1 and go straight to step 2 and 3.
> 2. Further, if you have installed `tensorrt_llm` in the Slurm job, you can also skip step 2 and run step 3 with `export CONTAINER_NAME=aaa` specified. If you don't know the container name, run `export CONTAINER_NAME=$(SLURM_JOB_ID=$SLURM_JOB_ID ./slurm_query_container_name.sh)` to get it.
> 1. If you have a running Slurm job, you can set environment variable `export SLURM_JOB_ID=aaa` and skip step 1.
> 2. Further, if you have installed `tensorrt_llm` in the Slurm job, you can also skip step 2. Just run step 3 with `export CONTAINER_NAME=aaa` specified. If you don't know the container name, run `export CONTAINER_NAME=$(./slurm_query_container_name.sh)` to get it.
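
A minimal sketch of the reuse workflow described in these tips (the job ID is a placeholder):

```bash
# Skip step 1 and step 2 by reusing an existing job and container
export SLURM_JOB_ID=123456
export CONTAINER_NAME=$(./slurm_query_container_name.sh)
```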

**Step 1:** On the controller node, allocate one or multiple nodes, and record the `SLURM_JOB_ID`:
**Step 1:** On the controller node, allocate one or multiple nodes, and export the `SLURM_JOB_ID`:

```bash
SLURM_JOB_ID=$(NODES=4 TIME=02:00:00 ./slurm_alloc.sh)
export SLURM_JOB_ID=$(NODES=4 TIME=02:00:00 ./slurm_alloc.sh)
```

Please fill in the variables in `./slurm_alloc.sh`.

**Step 2:** Start a container and install `tensorrt_llm`. Run the following command on the controller node:

```bash
SLURM_JOB_ID=$SLURM_JOB_ID ./slurm_init_containers.sh
./slurm_init_containers.sh
```

It uses the image recorded in `../../jenkins/current_image_tags.properties`. The image is downloaded to `../../enroot/` only once.
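
To see which image that resolves to before launching, you can inspect the properties file; this grep is only a sketch, and the exact key names are an assumption:

```bash
# List the recorded image tags (key names may differ)
grep -i image ../../jenkins/current_image_tags.properties
```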

> Tips: If you want to change the image, there is no need to reallocate the Slurm job. Just start another container by running step 2 with `export CONTAINER_NAME=aaa`; step 3 will then run in the container specified by the `CONTAINER_NAME` environment variable.

**(Optional) Get an interactive shell**

```bash
NODES=1 NP=1 ./slurm_launch.sh --overlap --pty middleware/exclude_slurm_envs bash
```

The `--overlap` option allows this shell to share the node with other jobs. The `exclude_slurm_envs` middleware clears inherited Slurm and MPI environment variables so that nested MPI processes can be spawned from within the Slurm job.
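
As a quick sanity check (a sketch, not part of the provided scripts), you can confirm that the wrapper strips the inherited variables:

```bash
# Should print nothing: the wrapper unsets all SLURM_/MPI_/PMIX_ variables
middleware/exclude_slurm_envs env | grep -E "SLURM_|SLURMD_|slurm_|MPI_|PMIX_" || true
```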

You may compile C++ extensions in the interactive shell:

```bash
cd ../..
export CCACHE_DIR=$(realpath cpp/.ccache)
python3 scripts/build_wheel.py --cuda_architectures native --no-venv --skip_building_wheel -G Ninja --use_ccache --clean
```

**Step 3:** Run benchmarks to generate profiles. Execute the following commands on the controller node, where `NODES` ≤ the number of allocated nodes:

```bash
# Set autotune cache path
export TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache

# Run DeepSeek-R1 NVFP4 with wide ep: uses MNNVL A2A if applicable
SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP
NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP

# Run with TRTLLMGen
SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend TRTLLM
NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend TRTLLM

# Run with DeepEPLowLatency
SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP
NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP

# You can run 4-GPU and 8-GPU tasks without reallocate the slurm job
SLURM_JOB_ID=$SLURM_JOB_ID NODES=1 NP=4 ./slurm_launch.sh ./run.sh config_ctx.yaml
SLURM_JOB_ID=$SLURM_JOB_ID NODES=2 NP=8 ./slurm_launch.sh ./run.sh config_gtx.yaml
# You can run 4-GPU and 8-GPU tasks without reallocating the Slurm job
NODES=1 NP=4 ./slurm_launch.sh ./run.sh config_ctx.yaml
NODES=2 NP=8 ./slurm_launch.sh ./run.sh config_gen.yaml
```

### Batched run
@@ -131,9 +153,11 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WID
Run the following command in the container:

```bash
# Parse the profile at the default directory
python3 parse.py --world-size 4

# Specify the location of the .nsys-rep file
# Specify the file path
python3 parse.py --file-path profiles/report_np4_rank0.nsys-rep
python3 parse.py --profile-dir ./profiles --world-size 4 --rank 0

# Parse a specific module. The module must appear exactly once in each run.
@@ -145,6 +169,15 @@ You will receive three reports, each containing kernel timing statistics grouped
2. A CSV report at `profiles/report_np4_rank0.csv`
3. An HTML report at `profiles/report_np4_rank0.html`

## Developer utilities

1. Reduce startup time when debugging a model
    1. Disable the autotuner: add the `--no-enable-autotuner` option
    2. Disable the nsys profile: set the `PROFILE=0` environment variable
2. Capture more information
    1. Enable GPU metrics: set the `GPU_METRICS=1` environment variable
    2. Enable backtraces: set the `BACKTRACE=1` environment variable
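
A minimal sketch combining these switches, following the `mpi_launch.sh` invocation style shown above (whether each variable must be forwarded to the ranks with `-x` is an assumption that depends on your launcher):

```bash
# Fast debug iteration: skip autotuning and skip the nsys profile
NP=4 ./mpi_launch.sh -x PROFILE=0 ./run.sh config_gen.yaml --no-enable-autotuner

# Heavier capture: GPU metrics plus backtraces
NP=4 ./mpi_launch.sh -x GPU_METRICS=1 -x BACKTRACE=1 ./run.sh config_gen.yaml
```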

## Troubleshooting

1. Error `fp8 blockscale gemm only support Hopper` on Blackwell.
8 changes: 8 additions & 0 deletions examples/layer_wise_benchmarks/middleware/exclude_slurm_envs
@@ -0,0 +1,8 @@
#!/bin/bash

set -euo pipefail

# Clear slurm envs
unset $(env | awk -F'=' '{print $1}' | (grep -E "SLURM_|SLURMD_|slurm_|MPI_|PMIX_" || true))

"$@"
10 changes: 7 additions & 3 deletions examples/layer_wise_benchmarks/mpi_launch.sh
@@ -3,8 +3,12 @@
set -euo pipefail

# Clear slurm envs
unset $(env | grep -i slurm | awk -F'=' '{print $1}')
unset $(env | grep MPI | awk -F'=' '{print $1}')
unset $(env | awk -F'=' '{print $1}' | (grep -E "SLURM_|SLURMD_|slurm_|MPI_|PMIX_" || true))

# Forward the autotuner cache path to every rank when it is set
extra_args=
if [ -v TLLM_AUTOTUNER_CACHE_PATH ]; then
    extra_args+="-x TLLM_AUTOTUNER_CACHE_PATH"
fi

set -x
mpirun --allow-run-as-root --np ${NP} "$@"
mpirun --allow-run-as-root --np ${NP} $extra_args "$@"
55 changes: 46 additions & 9 deletions examples/layer_wise_benchmarks/parse.py
@@ -5,6 +5,7 @@
import re
import sqlite3
import subprocess
import sys
from collections import defaultdict
from pathlib import Path

@@ -14,8 +15,9 @@

# Parse cmdline
parser = argparse.ArgumentParser()
parser.add_argument("--file-path", type=str)
parser.add_argument("--profile-dir", type=str, default="profiles")
parser.add_argument("--world-size", "--np", type=int, required=True)
parser.add_argument("--world-size", "--np", type=int)
parser.add_argument("--rank", type=int, default=0)
parser.add_argument("--warmup-times", type=int)
parser.add_argument("--module", type=str)
@@ -27,6 +29,8 @@
)
parser.set_defaults(error_on_unknown_kernel=False)
args = parser.parse_args()
if (args.file_path is None) == (args.world_size is None):
    parser.error("Please specify exactly one of --file-path and --world-size.")
print(args)


@@ -89,11 +93,20 @@ def shortest_common_supersequence(a, b):
    return res


profile_dir = Path(args.profile_dir)
nsys_rep_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.nsys-rep"
sqlite_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.sqlite"
csv_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.csv"
html_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.html"
# Resolve the .nsys-rep path, then derive the .sqlite/.csv/.html paths from it
if args.file_path is not None:
    nsys_rep_file_path = Path(args.file_path)
    if not nsys_rep_file_path.name.endswith(".nsys-rep"):
        raise ValueError("Expect a .nsys-rep file")
else:
    profile_dir = Path(args.profile_dir)
    nsys_rep_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.nsys-rep"
sqlite_file_path = nsys_rep_file_path.parent / (
    nsys_rep_file_path.name[: -len(".nsys-rep")] + ".sqlite"
)
csv_file_path = nsys_rep_file_path.parent / (nsys_rep_file_path.name[: -len(".nsys-rep")] + ".csv")
html_file_path = nsys_rep_file_path.parent / (
    nsys_rep_file_path.name[: -len(".nsys-rep")] + ".html"
)
lazy_convert_sqlite(nsys_rep_file_path, sqlite_file_path)

conn = sqlite3.connect(f"file:{sqlite_file_path}?mode=ro", uri=True)
@@ -196,7 +209,8 @@ def shortest_common_supersequence(a, b):
R.start AS runtime_start, R.end AS runtime_end,
CGE2.start AS capture_start, CGE2.end AS capture_end
FROM ({unified_subquery}) AS unified
JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.correlationId = R.correlationId
JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.graphNodeId IS NOT NULL AND
unified.correlationId = R.correlationId
LEFT JOIN CUDA_GRAPH_NODE_EVENTS AS CGE1 ON unified.graphNodeId = CGE1.graphNodeId AND
CGE1.originalGraphNodeId IS NOT NULL
LEFT JOIN CUDA_GRAPH_NODE_EVENTS AS CGE2 ON CGE1.originalGraphNodeId = CGE2.graphNodeId"""
@@ -318,6 +332,7 @@ def shortest_common_supersequence(a, b):
("routingInitExpertCounts", "routingInitExpertCounts"),
("routingIndicesCluster", "routingIndicesClusterKernel"),
("routingIndicesCoop", "routingIndicesCoopKernel"),
("router_gemm", "router_gemm_kernel"),
("bmm_4_44_32", "bmm_E2m1_E2m1E2m1_Fp32_t"),
("finalize", "finalize::finalizeKernel"),
("bmm_16_44_32", "bmm_Bfloat16_E2m1E2m1_Fp32_"),
@@ -381,7 +396,7 @@ def parse_kernel_name(demangledName):
if all(keyword in name for keyword in src):
return dst
if name not in warned_names:
print(f"Unknown kernel name: {name}")
print(f"Unknown kernel name: {name}", file=sys.stderr)
warned_names.add(name)
if args.error_on_unknown_kernel:
raise NotImplementedError(f"Unknown kernel name: {name}")
@@ -491,11 +506,33 @@ def parse_kernel_name(demangledName):
for row in csv_data:
csv_writer.writerow(row)
js_header_config = [{"name": problem["text"]} for problem in problem_set]
# Build nested column headers: group by b=/q=/past= only for parameters
# that sweep more than one value in this run
js_header_config = []
for problem in problem_set:
    innermost_children = js_header_config
    for k, msg_prefix in [
        ("batch_size", "b="),
        ("seq_len_q", "q="),
        ("seq_len_kv_cache", "past="),
    ]:
        if len(run_args[k + "_list"]) > 1:
            if len(innermost_children) == 0 or problem["spec"][k] != innermost_children[-1][k]:
                innermost_children.append(
                    {
                        "name": msg_prefix + str(problem["spec"][k]),
                        "children": [],
                        k: problem["spec"][k],
                    }
                )
            innermost_children = innermost_children[-1]["children"]
    innermost_children.append({"name": problem["text"]})
loader = jinja2.FileSystemLoader(Path(__file__).parent)
template = jinja2.Environment(loader=loader).get_template("template.html")
with html_file_path.open("w") as f:
    configText = (
        "Run:\n" + json.dumps(run_args, indent=4) + "\n\nParse:\n" + json.dumps(args.__dict__)
        "Run:\n"
        + json.dumps(run_args, indent=4)
        + "\n\nParse:\n"
        + json.dumps(args.__dict__, indent=4)
    )
    f.write(template.render(headerConfig=js_header_config, rawData=js_data, configText=configText))
