38 changes: 23 additions & 15 deletions examples/layer_wise_benchmarks/README.md
@@ -15,28 +15,29 @@ pip install -e ../..
**Step 3:** In the container, run benchmarks and generate profiles:

```bash
# Set autotune cache path
export TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache

# Run DeepSeek-R1 NVFP4
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml

# Run with weights loaded. Requires local model directory
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model "$LLM_MODELS_ROOT/DeepSeek-R1/DeepSeek-R1-0528-FP4-v2" --load-format AUTO
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model "$LLM_MODELS_ROOT/DeepSeek-R1/DeepSeek-R1-0528-FP4-v2" --load-format AUTO

# Run DeepSeek-V3.2-Exp
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM --moe-backend-for-prefill DEEPGEMM

# Run DeepSeek-V3.2-Exp with 32k context length
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM --seq-len-kv-cache 32769
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM --moe-backend-for-prefill DEEPGEMM --seq-len-kv-cache 32769

# Run with attention TP
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --no-enable-attention-dp
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --no-enable-attention-dp

# Run with attention TP and TRTLLMGen
NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM
NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM

# Run with MTP3
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --batch-size 32 --seq-len-q 4
@@ -51,9 +52,13 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WID
# Scale TEP=16 to 4 GPUs: reduce the number of attention heads and experts
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp

# Run Nemotron-3-Nano
NP=1 ./mpi_launch.sh ./run.sh config_ctx.yaml --model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 --layer-indices 4,5,6 --mamba-ssm-cache-dtype float16
NP=1 ./mpi_launch.sh ./run.sh config_gen.yaml --model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 --layer-indices 4,5,6 --mamba-ssm-cache-dtype float16

# Run Qwen3-Next
NP=2 ./mpi_launch.sh ./run.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --batch-size 4
NP=2 ./mpi_launch.sh ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --batch-size 512
NP=2 ./mpi_launch.sh ./run.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --mamba-ssm-cache-dtype float16 --batch-size 4
NP=2 ./mpi_launch.sh ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --mamba-ssm-cache-dtype float16 --batch-size 512

# Run with DeepEP A2A
NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_ctx.yaml --moe-backend WIDEEP
@@ -112,14 +117,11 @@ python3 scripts/build_wheel.py --cuda_architectures native --no-venv --skip_buil
**Step 3:** Run benchmarks to generate profiles. Run the following commands on the controller node, where `NODES` ≤ the number of allocated nodes:

```bash
# Set autotune cache path
export TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache

# Run DeepSeek-R1 NVFP4 with wide ep: uses MNNVL A2A if applicable
NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP

# Run with TRTLLMGen
NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend TRTLLM
NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend TRTLLM

# Run with DeepEPLowLatency
NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP
@@ -172,7 +174,9 @@ You will receive three reports, each containing kernel timing statistics grouped
## Developer utilities

1. Reduce startup time when debugging a model (see the example command after this list)
1. Disable autotuner: add `--no-enable-autotuner` option
1. Set autotuner cache or disable autotuner
1. Set the autotuner cache: set the `TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache` environment variable. Use the cache at your own risk; you may need to delete it if `NP` or the code changes
2. Disable autotuner: add `--no-enable-autotuner` option
2. Disable nsys profile: set `PROFILE=0` environment variable
2. Capture more information
1. Enable GPU metrics: set `GPU_METRICS=1` environment variable
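
For example, a quick debug iteration might combine these switches as sketched below. Whether `TLLM_AUTOTUNER_CACHE_PATH` and `PROFILE` need to be forwarded to the ranks (for example with `-x`, as shown earlier) depends on the launcher scripts, so treat this as a sketch rather than a verified recipe.

```bash
# Sketch: reuse the autotuner cache and skip nsys profiling while iterating
export TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache
PROFILE=0 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml

# Alternatively, skip autotuning entirely
PROFILE=0 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --no-enable-autotuner
```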
@@ -182,4 +186,8 @@

1. Error `fp8 blockscale gemm only support Hopper` on Blackwell.

The default MoE backend "CUTLASS" does not support FP8 weights. Please choose the same MoE backend as your end-to-end config. A typical choice is adding `--moe-backend DEEPGEMM`, `--moe-backend TRTLLM`, or `--moe-backend WIDEEP` option.
The default MoE backend "CUTLASS" does not support FP8 weights. Please choose the same MoE backend as your end-to-end config. A typical choice is to add the `--moe-backend DEEPGEMM` (or `TRTLLM`, `WIDEEP`) and `--moe-backend-for-prefill DEEPGEMM` (or `WIDEEP`) options.
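
For example, to mirror an end-to-end config that uses DeepGemm, the benchmark command would carry the same backend flags as the DeepSeek-V3.2 commands above (a sketch; adjust the model and other options to match your setup):

```bash
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --moe-backend DEEPGEMM --moe-backend-for-prefill DEEPGEMM
```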

2. Error `huggingface_hub.errors.HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2/resolve/main/config.json`.

Please use a local model through the `--model` option, or follow Hugging Face's instructions: "We had to rate limit your IP. To continue using our service, create a HF account or login to your existing account, and make sure you pass a HF_TOKEN if you're using the API."
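
Both workarounds are sketched below. The local path is the example used earlier in this README, the token value is a placeholder, and forwarding `HF_TOKEN` to the ranks with `-x` is an assumption based on how other environment variables are passed above.

```bash
# Workaround 1: use a local model directory instead of downloading from the Hub
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model "$LLM_MODELS_ROOT/DeepSeek-R1/DeepSeek-R1-0528-FP4-v2" --load-format AUTO

# Workaround 2: authenticate with a Hugging Face token (placeholder value)
export HF_TOKEN=hf_xxxxxxxxxxxxxxxx
NP=4 ./mpi_launch.sh -x HF_TOKEN ./run.sh config_ctx.yaml
```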
168 changes: 50 additions & 118 deletions examples/layer_wise_benchmarks/parse.py
@@ -6,7 +6,6 @@
import sqlite3
import subprocess
import sys
from collections import defaultdict
from pathlib import Path

import jinja2
@@ -139,7 +138,7 @@ def shortest_common_supersequence(a, b):
"runs": [],
"runs_end": [],
"ranges": [],
"range_in_module": [],
"kernel_count_per_range": [],
}
)

@@ -161,28 +160,7 @@ def shortest_common_supersequence(a, b):
problem_set[problem_id]["runs_end"].append(end)
else:
problem_set[problem_id]["ranges"].append((start, end, text))

# Determine whether each range is the first range that matches `args.module`,
# and store the result in `problem["range_in_module"]`
for problem in problem_set:
if args.module is not None:
problem["range_in_module"] = [False] * len(problem["ranges"])
run_ids = [bisect.bisect(problem["runs"], start) - 1 for start, _, _ in problem["ranges"]]
run2ranges = defaultdict(list)
for i, run_id in enumerate(run_ids):
run2ranges[run_id].append(i)
for run_id, ranges in run2ranges.items():
ranges = sorted(ranges, key=lambda i: problem["ranges"][i][0])
num_matches = 0
for range_id in ranges:
if problem["ranges"][range_id][2] == args.module:
problem["range_in_module"][range_id] = True
num_matches += 1
if num_matches != 1:
raise ValueError(
f'Module "{args.module}" appears {num_matches} times'
f' in "{problem["text"]}"\'s {run_id + 1}-th run'
)
problem_set[problem_id]["kernel_count_per_range"].append(0)

query = """SELECT name FROM sqlite_master WHERE type = ?"""
df = pd.read_sql_query(query, conn, params=("table",))
@@ -228,19 +206,17 @@ def shortest_common_supersequence(a, b):
problem_id = bisect.bisect(problem_start, start) - 1
problem = problem_set[problem_id]
run_id = bisect.bisect(problem["runs"], runtime_start) - 1
if (
run_id == -1
or run_id == len(problem["runs"])
or runtime_start >= problem["runs_end"][run_id]
):
run_id = -1
if run_id == -1 or runtime_start >= problem["runs_end"][run_id]:
continue
ranges = [
i
for i, (range_start, range_end, text) in enumerate(problem["ranges"])
if capture_start >= range_start and capture_end <= range_end
]
if args.module is None or any(problem["range_in_module"][i] for i in ranges):
range_names = [problem["ranges"][i][2] for i in ranges]
for range_id in ranges:
problem["kernel_count_per_range"][range_id] += 1
range_names = [problem["ranges"][i][2] for i in ranges]
if args.module is None or args.module in range_names:
kernel_list.append(
(
problem_id,
Expand All @@ -262,6 +238,22 @@ def shortest_common_supersequence(a, b):

conn.close()

# Check ambiguous modules
if args.module:
for problem in problem_set:
num_matches_per_run = [0] * (len(problem["runs"]) + 1)
for (range_start, _, text), kernel_count in zip(
problem["ranges"], problem["kernel_count_per_range"]
):
if text == args.module and kernel_count > 0:
num_matches_per_run[bisect.bisect(problem["runs"], range_start)] += 1
for run_id_plus_one, num_matches in enumerate(num_matches_per_run):
if num_matches > 1:
raise ValueError(
f'Module is ambiguous: "{args.module}" appears {num_matches} times'
f' in "{problem["text"]}"\'s {run_id_plus_one}-th run'
)

kernel_list.sort(key=lambda t: (t[6], t[8]))
kernels = [[[] for _ in problem["runs"]] for problem in problem_set]
for (
@@ -276,8 +268,7 @@
capture_start,
capture_end,
) in kernel_list:
if run_id != -1:
kernels[problem_id][run_id].append((demangledName, start, end, ranges))
kernels[problem_id][run_id].append((demangledName, start, end, ranges))
for problem_id in range(len(kernels)):
required_seq = [demangledName for demangledName, _, _, _ in kernels[problem_id][0]]
for run_id in range(len(kernels[problem_id])):
@@ -287,86 +278,8 @@

parser_keywords = [
("cuBLASGemm", "nvjet"),
("splitKreduce", "splitKreduce_kernel"),
("fusedAGemm", "fused_a_gemm_kernel"),
("RMSNorm", "RMSNormKernel"),
("torchCat", "CatArrayBatchedCopy"),
("applyMLARope", "applyMLARope"),
("fmhaSm100f", "fmhaSm100fKernel_Qkv"),
("fmhaReduction", "fmhaReductionKernel"),
("quant", "quantize_with_block_size"),
("AllGather", "ncclDevKernel_AllGather_"),
("ReduceScatter", "ncclDevKernel_ReduceScatter_"),
("allreduce_oneshot", "allreduce_fusion_kernel_oneshot_lamport"),
("allreduce_twoshot", "allreduce_fusion_kernel_twoshot_sync"),
("expandInput", "expandInputRowsKernel"),
("computeStrides", "computeStridesTmaWarpSpecializedKernel"),
("cutlassGroupGemm", "cutlass::device_kernel<cutlass::gemm::kernel::GemmUniversal"),
("doActivation", "doActivationKernel"),
("cutlassGemm", "GemmUniversal"),
("deepseek_v3_topk", "deepseek_v3_topk_kernel"),
("CountAndIndice", "computeCountAndIndiceDevice"),
("Cumsum", "computeCumsumDevice"),
("moveIndice", "moveIndiceDevice"),
("moeAllToAll", "moeAllToAllKernel"),
("moeA2APrepareDispatch", "moe_comm::moeA2APrepareDispatchKernel"),
("moeA2ADispatch", "moe_comm::moeA2ADispatchKernel"),
("moeA2ASanitizeExpertIds", "moe_comm::moeA2ASanitizeExpertIdsKernel"),
("moeA2APrepareCombine", "moe_comm::moeA2APrepareCombineKernel"),
("moeA2ACombine", "moe_comm::moeA2ACombineKernel"),
("memsetExpertIds", "memsetExpertIdsDevice"),
("blockSum", "blockExpertPrefixSumKernel"),
("globalSum", "globalExpertPrefixSumKernel"),
("globalSumLarge", "globalExpertPrefixSumLargeKernel"),
("mergePrefix", "mergeExpertPrefixSumKernel"),
("fusedBuildExpertMaps", "fusedBuildExpertMapsSortFirstTokenKernel"),
("swiglu", "silu_and_mul_kernel"),
("torchAdd", "CUDAFunctor_add"),
("torchFill", "at::native::FillFunctor"),
("triton_fused_add_sum", "triton_red_fused_add_sum_0"),
("torchCopy", "at::native::bfloat16_copy_kernel_cuda"),
("torchDistribution", "distribution_elementwise_grid_stride_kernel"),
("torchArange", "at::native::arange_cuda_out"),
("torchDirectCopy", "at::native::direct_copy_kernel_cuda"),
("torchBitonicSort", "at::native::bitonicSortKVInPlace"),
("routingInitExpertCounts", "routingInitExpertCounts"),
("routingIndicesCluster", "routingIndicesClusterKernel"),
("routingIndicesCoop", "routingIndicesCoopKernel"),
("router_gemm", "router_gemm_kernel"),
("bmm_4_44_32", "bmm_E2m1_E2m1E2m1_Fp32_t"),
("finalize", "finalize::finalizeKernel"),
("bmm_16_44_32", "bmm_Bfloat16_E2m1E2m1_Fp32_"),
("deep_gemm_gemm", "deep_gemm::sm100_fp8_gemm_1d1d_impl<"),
("per_token_quant", "_per_token_quant_and_transform_kernel"),
("triton_fused_layer_norm", "triton_per_fused__to_copy_native_layer_norm_0"),
("flashinferRoPE", "flashinfer::BatchQKApplyRotaryPosIdsCosSinCacheHeadParallelismKernel<"),
("flashinferRoPE", "flashinfer::BatchQKApplyRotaryPosIdsCosSinCacheKernel<"),
("fp8_blockscale_gemm", "tensorrt_llm::kernels::fp8_blockscale_gemm"),
("triton_fused_mul_squeeze", "triton_poi_fused_mul_squeeze_0"),
("indexerKCacheScatter", "tensorrt_llm::kernels::indexerKCacheScatterUnifiedKernel"),
("deep_gemm_mqa_logits", "deep_gemm::sm100_fp8_paged_mqa_logits<"),
("topKPerRowDecode", "tensorrt_llm::kernels::topKPerRowDecode<"),
("torchAdd<int>", "at::native::CUDAFunctorOnSelf_add"),
("convert_req_index", "_convert_req_index_to_global_index_kernel_with_stride_factor"),
("preprocess_after_permute", "_preprocess_after_permute_kernel"),
("masked_index_copy_quant", "_masked_index_copy_group_quant_fp8"),
("swiglu_quant", "_silu_and_mul_post_quant_kernel"),
("masked_index_gather", "masked_index_gather_kernel"),
("finalizeMoeRouting", "tensorrt_llm::kernels::cutlass_kernels::finalizeMoeRoutingKernel<"),
("fused_qkvzba_split", "fused_qkvzba_split_reshape_cat_kernel"),
("causal_conv1d_update", "tensorrt_llm::kernels::causal_conv1d::causal_conv1d_update_kernel<"),
("fused_delta_rule_update", "fused_sigmoid_gating_delta_rule_update_kernel"),
("layer_norm_fwd_1pass", "_layer_norm_fwd_1pass_kernel"),
("torchGatherTopK", "at::native::sbtopk::gatherTopK<"),
("softmax_warp_forward", "softmax_warp_forward<"),
("torchSigmoid", "at::native::sigmoid_kernel_cuda"),
("torchMul", "at::native::binary_internal::MulFunctor<"),
("computeSeqAndPaddingOffsets", "tensorrt_llm::kernels::computeSeqAndPaddingOffsets<"),
("applyBiasRopeUpdateKVCache", "tensorrt_llm::kernels::applyBiasRopeUpdateKVCacheV2<"),
("routingIndicesHistogramScores", "routingRenormalize::routingIndicesHistogramScoresKernel<"),
("routingIndicesHistogram", "routingIndicesHistogramKernel<"),
("routingIndicesOffsets", "routingIndicesOffsetsKernel<"),
("torchReduceSum", ["at::native::reduce_kernel<", "at::native::sum_functor<"]),
("CuteDSLMoePermute", "cute_dsl::moePermuteKernel"),
(
"CuteDSLGemm",
@@ -380,6 +293,19 @@
"CuteDSLGroupedGemmFinalize",
["cute_dsl_kernels", "blockscaled_contiguous_grouped_gemm_finalize_fusion"],
),
("torchAdd", "at::native::CUDAFunctorOnSelf_add"),
("torchAdd", "CUDAFunctor_add"),
("torchClamp", "at::native::<unnamed>::launch_clamp_scalar("),
("torchCompare", "at::native::<unnamed>::CompareFunctor<"),
("torchCopy", "at::native::bfloat16_copy_kernel_cuda"),
("torchCopy", "at::native::direct_copy_kernel_cuda("),
("torchFill", "at::native::FillFunctor"),
("torchIndexPut", "at::native::index_put_kernel_impl<"),
("torchMul", "at::native::binary_internal::MulFunctor<"),
("torchPow", "at::native::<unnamed>::pow_tensor_scalar_kernel_impl<"),
("torchReduceSum", ["at::native::reduce_kernel<", "at::native::sum_functor<"]),
("torchSigmoid", "at::native::sigmoid_kernel_cuda"),
("torchWhere", "at::native::<unnamed>::where_kernel_impl("),
]
warned_names = set()

Expand All @@ -395,15 +321,19 @@ def parse_kernel_name(demangledName):
src = [src]
if all(keyword in name for keyword in src):
return dst
if name not in warned_names:
print(f"Unknown kernel name: {name}", file=sys.stderr)
warned_names.add(name)
if args.error_on_unknown_kernel:
raise NotImplementedError(f"Unknown kernel name: {name}")
if re.search(r"at::native::.*elementwise_kernel<", name):
if name not in warned_names:
print(f"Not parsed torch kernel name: {name}", file=sys.stderr)
warned_names.add(name)
assert "!unnamed!" not in name
name = name.replace("<unnamed>", "!unnamed!")
if "<" in name:
name = name[: name.index("<")]
if "(" in name:
name = name[: name.index("(")]
if "::" in name:
name = name[name.rindex("::") + 2 :]
name = name.replace("!unnamed!", "<unnamed>")
return name


@@ -438,6 +368,8 @@ def parse_kernel_name(demangledName):
converted_seq.append((("Space",), np.mean(space_list[warmup_times:]).tolist()))
converted_seq.append((("Total",), sum(t for _, t in converted_seq)))
converted_seqs.append(converted_seq)
if args.error_on_unknown_kernel and warned_names:
raise ValueError("Unknown kernel names encountered")

merged_title = []
for converted_seq in converted_seqs:
@@ -459,7 +391,7 @@ def parse_kernel_name(demangledName):
for problem in problem_set:
print(
f'- "{problem["text"]}" {len(problem["runs"])} runs'
f" Ranges: [{', '.join(text for _, _, text in problem['ranges'])}]"
f" Ranges: [{', '.join(text for _, end, text in problem['ranges'] if end <= problem['runs_end'][0])}]"
)

stack = []