38 changes: 23 additions & 15 deletions examples/layer_wise_benchmarks/README.md
@@ -15,28 +15,29 @@ pip install -e ../..
**Step 3:** In the container, run benchmarks and generate profiles:

```bash
# Set autotune cache path
export TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache

# Run DeepSeek-R1 NVFP4
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml

# Run with weights loaded. Requires local model directory
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model "$LLM_MODELS_ROOT/DeepSeek-R1/DeepSeek-R1-0528-FP4-v2" --load-format AUTO
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model "$LLM_MODELS_ROOT/DeepSeek-R1/DeepSeek-R1-0528-FP4-v2" --load-format AUTO

# Run DeepSeek-V3.2-Exp
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM --moe-backend-for-prefill DEEPGEMM

# Run DeepSeek-V3.2-Exp with 32k context length
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM --seq-len-kv-cache 32769
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM --moe-backend-for-prefill DEEPGEMM --seq-len-kv-cache 32769

# Run with attention TP
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --no-enable-attention-dp
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --no-enable-attention-dp

# Run with attention TP and TRTLLMGen
NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM
NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM

# Run with MTP3
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --batch-size 32 --seq-len-q 4
@@ -51,9 +52,13 @@ NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WID
# Scale TEP=16 to 4 GPUs: reduce the number of attention heads and experts
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp

# Run Nemotron-3-Nano
NP=1 ./mpi_launch.sh ./run.sh config_ctx.yaml --model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 --layer-indices 4,5,6 --mamba-ssm-cache-dtype float16
NP=1 ./mpi_launch.sh ./run.sh config_gen.yaml --model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 --layer-indices 4,5,6 --mamba-ssm-cache-dtype float16

# Run Qwen3-Next
NP=2 ./mpi_launch.sh ./run.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --batch-size 4
NP=2 ./mpi_launch.sh ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --batch-size 512
NP=2 ./mpi_launch.sh ./run.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --mamba-ssm-cache-dtype float16 --batch-size 4
NP=2 ./mpi_launch.sh ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --mamba-ssm-cache-dtype float16 --batch-size 512

# Run with DeepEP A2A
NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_ctx.yaml --moe-backend WIDEEP
@@ -112,14 +117,11 @@ python3 scripts/build_wheel.py --cuda_architectures native --no-venv --skip_buil
**Step 3:** Run benchmarks to generate profiles. Run the following commands on the controller node, where `NODES` ≤ the number of allocated nodes:

```bash
# Set autotune cache path
export TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache

# Run DeepSeek-R1 NVFP4 with wide ep: uses MNNVL A2A if applicable
NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP

# Run with TRTLLMGen
NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend TRTLLM
NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend TRTLLM

# Run with DeepEPLowLatency
NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP
@@ -172,7 +174,9 @@ You will receive three reports, each containing kernel timing statistics grouped
## Developer utilities

1. Reduce startup time when debugging a model (see the example command after this list)
1. Disable autotuner: add `--no-enable-autotuner` option
1. Set autotuner cache or disable autotuner
1. Set the autotuner cache: set the `TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache` environment variable. Use the cache at your own risk; you may need to delete it if `NP` or the code changes
2. Disable autotuner: add `--no-enable-autotuner` option
2. Disable nsys profile: set `PROFILE=0` environment variable
2. Capture more information
1. Enable GPU metrics: set `GPU_METRICS=1` environment variable
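
For example, a quick debug iteration might combine these switches as sketched below. Whether `TLLM_AUTOTUNER_CACHE_PATH` and `PROFILE` need to be forwarded to the ranks (for example with `-x`, as shown earlier) depends on the launcher scripts, so treat this as a sketch rather than a verified recipe.

```bash
# Sketch: reuse the autotuner cache and skip nsys profiling while iterating
export TLLM_AUTOTUNER_CACHE_PATH=autotuner_cache/cache
PROFILE=0 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml

# Alternatively, skip autotuning entirely
PROFILE=0 NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --no-enable-autotuner
```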
@@ -182,4 +186,8 @@

1. Error `fp8 blockscale gemm only support Hopper` on Blackwell.

The default MoE backend "CUTLASS" does not support FP8 weights. Please choose the same MoE backend as your end-to-end config. A typical choice is adding `--moe-backend DEEPGEMM`, `--moe-backend TRTLLM`, or `--moe-backend WIDEEP` option.
The default MoE backend "CUTLASS" does not support FP8 weights. Please choose the same MoE backend as your end-to-end config. A typical choice is to add the `--moe-backend DEEPGEMM` (or `TRTLLM`, `WIDEEP`) and `--moe-backend-for-prefill DEEPGEMM` (or `WIDEEP`) options.
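
For example, to mirror an end-to-end config that uses DeepGemm, the benchmark command would carry the same backend flags as the DeepSeek-V3.2 commands above (a sketch; adjust the model and other options to match your setup):

```bash
NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --moe-backend DEEPGEMM --moe-backend-for-prefill DEEPGEMM
```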

2. Error `huggingface_hub.errors.HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2/resolve/main/config.json`.

Please use a local model through the `--model` option, or follow Hugging Face's instructions: "We had to rate limit your IP. To continue using our service, create a HF account or login to your existing account, and make sure you pass a HF_TOKEN if you're using the API."
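
Both workarounds are sketched below. The local path is the example used earlier in this README, the token value is a placeholder, and forwarding `HF_TOKEN` to the ranks with `-x` is an assumption based on how other environment variables are passed above.

```bash
# Workaround 1: use a local model directory instead of downloading from the Hub
NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model "$LLM_MODELS_ROOT/DeepSeek-R1/DeepSeek-R1-0528-FP4-v2" --load-format AUTO

# Workaround 2: authenticate with a Hugging Face token (placeholder value)
export HF_TOKEN=hf_xxxxxxxxxxxxxxxx
NP=4 ./mpi_launch.sh -x HF_TOKEN ./run.sh config_ctx.yaml
```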
168 changes: 50 additions & 118 deletions examples/layer_wise_benchmarks/parse.py
@@ -6,7 +6,6 @@
import sqlite3
import subprocess
import sys
from collections import defaultdict
from pathlib import Path

import jinja2
@@ -139,7 +138,7 @@ def shortest_common_supersequence(a, b):
"runs": [],
"runs_end": [],
"ranges": [],
"range_in_module": [],
"kernel_count_per_range": [],
}
)

@@ -161,28 +160,7 @@ def shortest_common_supersequence(a, b):
problem_set[problem_id]["runs_end"].append(end)
else:
problem_set[problem_id]["ranges"].append((start, end, text))

# Determine whether each range is the first range that matches `args.module`,
# and store the result in `problem["range_in_module"]`
for problem in problem_set:
if args.module is not None:
problem["range_in_module"] = [False] * len(problem["ranges"])
run_ids = [bisect.bisect(problem["runs"], start) - 1 for start, _, _ in problem["ranges"]]
run2ranges = defaultdict(list)
for i, run_id in enumerate(run_ids):
run2ranges[run_id].append(i)
for run_id, ranges in run2ranges.items():
ranges = sorted(ranges, key=lambda i: problem["ranges"][i][0])
num_matches = 0
for range_id in ranges:
if problem["ranges"][range_id][2] == args.module:
problem["range_in_module"][range_id] = True
num_matches += 1
if num_matches != 1:
raise ValueError(
f'Module "{args.module}" appears {num_matches} times'
f' in "{problem["text"]}"\'s {run_id + 1}-th run'
)
problem_set[problem_id]["kernel_count_per_range"].append(0)

query = """SELECT name FROM sqlite_master WHERE type = ?"""
df = pd.read_sql_query(query, conn, params=("table",))
@@ -228,19 +206,17 @@ def shortest_common_supersequence(a, b):
problem_id = bisect.bisect(problem_start, start) - 1
problem = problem_set[problem_id]
run_id = bisect.bisect(problem["runs"], runtime_start) - 1
if (
run_id == -1
or run_id == len(problem["runs"])
or runtime_start >= problem["runs_end"][run_id]
):
run_id = -1
if run_id == -1 or runtime_start >= problem["runs_end"][run_id]:
continue
ranges = [
i
for i, (range_start, range_end, text) in enumerate(problem["ranges"])
if capture_start >= range_start and capture_end <= range_end
]
if args.module is None or any(problem["range_in_module"][i] for i in ranges):
range_names = [problem["ranges"][i][2] for i in ranges]
for range_id in ranges:
problem["kernel_count_per_range"][range_id] += 1
range_names = [problem["ranges"][i][2] for i in ranges]
if args.module is None or args.module in range_names:
kernel_list.append(
(
problem_id,
Expand All @@ -262,6 +238,22 @@ def shortest_common_supersequence(a, b):

conn.close()

# Check ambiguous modules
if args.module:
for problem in problem_set:
num_matches_per_run = [0] * (len(problem["runs"]) + 1)
for (range_start, _, text), kernel_count in zip(
problem["ranges"], problem["kernel_count_per_range"]
):
if text == args.module and kernel_count > 0:
num_matches_per_run[bisect.bisect(problem["runs"], range_start)] += 1
for run_id_plus_one, num_matches in enumerate(num_matches_per_run):
if num_matches > 1:
raise ValueError(
f'Module is ambiguous: "{args.module}" appears {num_matches} times'
f' in "{problem["text"]}"\'s {run_id_plus_one}-th run'
)

kernel_list.sort(key=lambda t: (t[6], t[8]))
kernels = [[[] for _ in problem["runs"]] for problem in problem_set]
for (
@@ -276,8 +268,7 @@
capture_start,
capture_end,
) in kernel_list:
if run_id != -1:
kernels[problem_id][run_id].append((demangledName, start, end, ranges))
kernels[problem_id][run_id].append((demangledName, start, end, ranges))
for problem_id in range(len(kernels)):
required_seq = [demangledName for demangledName, _, _, _ in kernels[problem_id][0]]
for run_id in range(len(kernels[problem_id])):
@@ -287,86 +278,8 @@

parser_keywords = [
("cuBLASGemm", "nvjet"),
("splitKreduce", "splitKreduce_kernel"),
("fusedAGemm", "fused_a_gemm_kernel"),
("RMSNorm", "RMSNormKernel"),
("torchCat", "CatArrayBatchedCopy"),
("applyMLARope", "applyMLARope"),
("fmhaSm100f", "fmhaSm100fKernel_Qkv"),
("fmhaReduction", "fmhaReductionKernel"),
("quant", "quantize_with_block_size"),
("AllGather", "ncclDevKernel_AllGather_"),
("ReduceScatter", "ncclDevKernel_ReduceScatter_"),
("allreduce_oneshot", "allreduce_fusion_kernel_oneshot_lamport"),
("allreduce_twoshot", "allreduce_fusion_kernel_twoshot_sync"),
("expandInput", "expandInputRowsKernel"),
("computeStrides", "computeStridesTmaWarpSpecializedKernel"),
("cutlassGroupGemm", "cutlass::device_kernel<cutlass::gemm::kernel::GemmUniversal"),
("doActivation", "doActivationKernel"),
("cutlassGemm", "GemmUniversal"),
("deepseek_v3_topk", "deepseek_v3_topk_kernel"),
("CountAndIndice", "computeCountAndIndiceDevice"),
("Cumsum", "computeCumsumDevice"),
("moveIndice", "moveIndiceDevice"),
("moeAllToAll", "moeAllToAllKernel"),
("moeA2APrepareDispatch", "moe_comm::moeA2APrepareDispatchKernel"),
("moeA2ADispatch", "moe_comm::moeA2ADispatchKernel"),
("moeA2ASanitizeExpertIds", "moe_comm::moeA2ASanitizeExpertIdsKernel"),
("moeA2APrepareCombine", "moe_comm::moeA2APrepareCombineKernel"),
("moeA2ACombine", "moe_comm::moeA2ACombineKernel"),
("memsetExpertIds", "memsetExpertIdsDevice"),
("blockSum", "blockExpertPrefixSumKernel"),
("globalSum", "globalExpertPrefixSumKernel"),
("globalSumLarge", "globalExpertPrefixSumLargeKernel"),
("mergePrefix", "mergeExpertPrefixSumKernel"),
("fusedBuildExpertMaps", "fusedBuildExpertMapsSortFirstTokenKernel"),
("swiglu", "silu_and_mul_kernel"),
("torchAdd", "CUDAFunctor_add"),
("torchFill", "at::native::FillFunctor"),
("triton_fused_add_sum", "triton_red_fused_add_sum_0"),
("torchCopy", "at::native::bfloat16_copy_kernel_cuda"),
("torchDistribution", "distribution_elementwise_grid_stride_kernel"),
("torchArange", "at::native::arange_cuda_out"),
("torchDirectCopy", "at::native::direct_copy_kernel_cuda"),
("torchBitonicSort", "at::native::bitonicSortKVInPlace"),
("routingInitExpertCounts", "routingInitExpertCounts"),
("routingIndicesCluster", "routingIndicesClusterKernel"),
("routingIndicesCoop", "routingIndicesCoopKernel"),
("router_gemm", "router_gemm_kernel"),
("bmm_4_44_32", "bmm_E2m1_E2m1E2m1_Fp32_t"),
("finalize", "finalize::finalizeKernel"),
("bmm_16_44_32", "bmm_Bfloat16_E2m1E2m1_Fp32_"),
("deep_gemm_gemm", "deep_gemm::sm100_fp8_gemm_1d1d_impl<"),
("per_token_quant", "_per_token_quant_and_transform_kernel"),
("triton_fused_layer_norm", "triton_per_fused__to_copy_native_layer_norm_0"),
("flashinferRoPE", "flashinfer::BatchQKApplyRotaryPosIdsCosSinCacheHeadParallelismKernel<"),
("flashinferRoPE", "flashinfer::BatchQKApplyRotaryPosIdsCosSinCacheKernel<"),
("fp8_blockscale_gemm", "tensorrt_llm::kernels::fp8_blockscale_gemm"),
("triton_fused_mul_squeeze", "triton_poi_fused_mul_squeeze_0"),
("indexerKCacheScatter", "tensorrt_llm::kernels::indexerKCacheScatterUnifiedKernel"),
("deep_gemm_mqa_logits", "deep_gemm::sm100_fp8_paged_mqa_logits<"),
("topKPerRowDecode", "tensorrt_llm::kernels::topKPerRowDecode<"),
("torchAdd<int>", "at::native::CUDAFunctorOnSelf_add"),
("convert_req_index", "_convert_req_index_to_global_index_kernel_with_stride_factor"),
("preprocess_after_permute", "_preprocess_after_permute_kernel"),
("masked_index_copy_quant", "_masked_index_copy_group_quant_fp8"),
("swiglu_quant", "_silu_and_mul_post_quant_kernel"),
("masked_index_gather", "masked_index_gather_kernel"),
("finalizeMoeRouting", "tensorrt_llm::kernels::cutlass_kernels::finalizeMoeRoutingKernel<"),
("fused_qkvzba_split", "fused_qkvzba_split_reshape_cat_kernel"),
("causal_conv1d_update", "tensorrt_llm::kernels::causal_conv1d::causal_conv1d_update_kernel<"),
("fused_delta_rule_update", "fused_sigmoid_gating_delta_rule_update_kernel"),
("layer_norm_fwd_1pass", "_layer_norm_fwd_1pass_kernel"),
("torchGatherTopK", "at::native::sbtopk::gatherTopK<"),
("softmax_warp_forward", "softmax_warp_forward<"),
("torchSigmoid", "at::native::sigmoid_kernel_cuda"),
("torchMul", "at::native::binary_internal::MulFunctor<"),
("computeSeqAndPaddingOffsets", "tensorrt_llm::kernels::computeSeqAndPaddingOffsets<"),
("applyBiasRopeUpdateKVCache", "tensorrt_llm::kernels::applyBiasRopeUpdateKVCacheV2<"),
("routingIndicesHistogramScores", "routingRenormalize::routingIndicesHistogramScoresKernel<"),
("routingIndicesHistogram", "routingIndicesHistogramKernel<"),
("routingIndicesOffsets", "routingIndicesOffsetsKernel<"),
("torchReduceSum", ["at::native::reduce_kernel<", "at::native::sum_functor<"]),
("CuteDSLMoePermute", "cute_dsl::moePermuteKernel"),
(
"CuteDSLGemm",
@@ -380,6 +293,19 @@
"CuteDSLGroupedGemmFinalize",
["cute_dsl_kernels", "blockscaled_contiguous_grouped_gemm_finalize_fusion"],
),
("torchAdd", "at::native::CUDAFunctorOnSelf_add"),
("torchAdd", "CUDAFunctor_add"),
("torchClamp", "at::native::<unnamed>::launch_clamp_scalar("),
("torchCompare", "at::native::<unnamed>::CompareFunctor<"),
("torchCopy", "at::native::bfloat16_copy_kernel_cuda"),
("torchCopy", "at::native::direct_copy_kernel_cuda("),
("torchFill", "at::native::FillFunctor"),
("torchIndexPut", "at::native::index_put_kernel_impl<"),
("torchMul", "at::native::binary_internal::MulFunctor<"),
("torchPow", "at::native::<unnamed>::pow_tensor_scalar_kernel_impl<"),
("torchReduceSum", ["at::native::reduce_kernel<", "at::native::sum_functor<"]),
("torchSigmoid", "at::native::sigmoid_kernel_cuda"),
("torchWhere", "at::native::<unnamed>::where_kernel_impl("),
]
warned_names = set()

Expand All @@ -395,15 +321,19 @@ def parse_kernel_name(demangledName):
src = [src]
if all(keyword in name for keyword in src):
return dst
if name not in warned_names:
print(f"Unknown kernel name: {name}", file=sys.stderr)
warned_names.add(name)
if args.error_on_unknown_kernel:
raise NotImplementedError(f"Unknown kernel name: {name}")
if re.search(r"at::native::.*elementwise_kernel<", name):
if name not in warned_names:
print(f"Not parsed torch kernel name: {name}", file=sys.stderr)
warned_names.add(name)
assert "!unnamed!" not in name
name = name.replace("<unnamed>", "!unnamed!")
if "<" in name:
name = name[: name.index("<")]
if "(" in name:
name = name[: name.index("(")]
if "::" in name:
name = name[name.rindex("::") + 2 :]
name = name.replace("!unnamed!", "<unnamed>")
return name


@@ -438,6 +368,8 @@ def parse_kernel_name(demangledName):
converted_seq.append((("Space",), np.mean(space_list[warmup_times:]).tolist()))
converted_seq.append((("Total",), sum(t for _, t in converted_seq)))
converted_seqs.append(converted_seq)
if args.error_on_unknown_kernel and warned_names:
raise ValueError("Unknown kernel names encountered")

merged_title = []
for converted_seq in converted_seqs:
@@ -459,7 +391,7 @@ def parse_kernel_name(demangledName):
for problem in problem_set:
print(
f'- "{problem["text"]}" {len(problem["runs"])} runs'
f" Ranges: [{', '.join(text for _, _, text in problem['ranges'])}]"
f" Ranges: [{', '.join(text for _, end, text in problem['ranges'] if end <= problem['runs_end'][0])}]"
)

stack = []