diff --git a/.gitignore b/.gitignore index e2e8a6e58ef..78d8da20e47 100644 --- a/.gitignore +++ b/.gitignore @@ -73,6 +73,7 @@ cpp/include/tensorrt_llm/executor/version.h cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/ cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h .devcontainer/.env +/examples/layer_wise_benchmarks/profiles/ # User config files CMakeUserPresets.json diff --git a/examples/layer_wise_benchmarks/README.md b/examples/layer_wise_benchmarks/README.md index 6cb324cd126..e9db0f24ac7 100644 --- a/examples/layer_wise_benchmarks/README.md +++ b/examples/layer_wise_benchmarks/README.md @@ -2,7 +2,7 @@ ## Generate profiles -### Run with MPI +### Run with OpenMPI **Step 1:** Start a container using Docker, Enroot or others. Please refer to `../../jenkins/current_image_tags.properties` for the Docker image URI. @@ -16,50 +16,61 @@ pip install -e ../.. ```bash # Run DeepSeek-R1 NVFP4 -NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml # Run DeepSeek-V3.2-Exp -NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM # Run DeepSeek-V3.2-Exp with 32k context length -NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769 -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --seq-len-kv-cache 32769 +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --seq-len-kv-cache 32769 # Run with attention TP -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp -NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --no-enable-attention-dp +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --no-enable-attention-dp +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --no-enable-attention-dp # Run with attention TP and TRTLLMGen -NP=4 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM -NP=4 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified +NP=4 ./mpi_launch.sh -x TRTLLM_ENABLE_PDL=1 ./run.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified # Run with MTP3 -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --batch-size 32 --seq-len-q 4 +NP=4 
./mpi_launch.sh ./run.sh config_gen.yaml --batch-size 32 --seq-len-q 4 # Run 4 layers -NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --layer-indices 5,6,7,8 -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --layer-indices 5,6,7,8 +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --layer-indices 5,6,7,8 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --layer-indices 5,6,7,8 # Scale DEP=16 to 4 GPUs: reduce the number of experts, uses MNNVL A2A if applicable -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP # Scale TEP=16 to 4 GPUs: reduce the number of attention heads and experts -NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp # Run Qwen3-Next (balanced routing is not implemented) -NP=2 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified -NP=2 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified +NP=2 ./mpi_launch.sh ./run.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --batch-size 4 +NP=2 ./mpi_launch.sh ./run.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --batch-size 512 # Run with DeepEP A2A -NP=4 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./mpi_launch.sh ./run_single.sh config_ctx.yaml --moe-backend WIDEEP -NP=4 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./mpi_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP +NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_ctx.yaml --moe-backend WIDEEP +NP=4 ./mpi_launch.sh -x TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./run.sh config_gen.yaml --moe-backend WIDEEP + +# Run with imbalanced ranks: except for activating all experts, a% of the tokens are sent to the 1st rank +# Note: if balance ratio is 0, ignore activating all experts +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --balance-method ImbalancedRanks --balance-ratio 0.5 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedRanks --balance-ratio 0.5 + +# Run with imbalanced experts and balanced ranks: except for activating all experts, a% of the tokens are sent to the front experts on each rank +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --balance-method ImbalancedExperts --balance-ratio 0.5 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --balance-method ImbalancedExperts --balance-ratio 0.5 ``` ### Run with Slurm -> Tips: If you have a running job with environment installed, please skip step 1 and 2 and go straight to step 3. In this case, your job must be run with `--container-name aaa`, and if the container name is not "layer_wise_benchmarks" please `export CONTAINER_NAME=aaa`. +> Tips: +> 1. If you have a running Slurm job, please skip step 1 and go straight to step 2 and 3. +> 2. Further, if you have installed `tensorrt_llm` in the Slurm job, you can also skip step 2 and run step 3 with `export CONTAINER_NAME=aaa` specified. If you don't know the container name, run `export CONTAINER_NAME=$(SLURM_JOB_ID=$SLURM_JOB_ID ./slurm_query_container_name.sh)` to get it. 
**Step 1:** On the controller node, allocate one or multiple nodes, and record the `SLURM_JOB_ID`: @@ -77,26 +88,61 @@ SLURM_JOB_ID=$SLURM_JOB_ID ./slurm_init_containers.sh It uses the image recorded in `../../jenkins/current_image_tags.properties`. The image will be downloaded to `../../enroot/` only once. +> Tips: If you want to change the image, there is no need to reallocate the Slurm job. Just start another container by running step 2 with `export CONTAINER_NAME=aaa`; step 3 will then run in the container specified by the `CONTAINER_NAME` env. + **Step 3:** Run benchmarks to generate profiles. Run the following command on the controller node, where `NODES` ≤ the number of allocated nodes: ```bash # Run DeepSeek-R1 NVFP4 with wide ep: uses MNNVL A2A if applicable -SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP +SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP -# Run with attention TP and TRTLLMGen -SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM +# Run with TRTLLMGen +SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend TRTLLM # Run with DeepEPLowLatency -SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP +SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run.sh config_gen.yaml --moe-backend WIDEEP # You can run 4-GPU and 8-GPU tasks without reallocating the Slurm job -SLURM_JOB_ID=$SLURM_JOB_ID NODES=1 NP=4 ./slurm_launch.sh ./run_single.sh config_ctx.yaml -SLURM_JOB_ID=$SLURM_JOB_ID NODES=2 NP=8 ./slurm_launch.sh ./run_single.sh config_ctx.yaml +SLURM_JOB_ID=$SLURM_JOB_ID NODES=1 NP=4 ./slurm_launch.sh ./run.sh config_ctx.yaml +SLURM_JOB_ID=$SLURM_JOB_ID NODES=2 NP=8 ./slurm_launch.sh ./run.sh config_ctx.yaml +``` + +### Batched run + +By specifying a list for `--batch-size` on the command line (or `batch_size` in the YAML file), the script runs multiple configurations in a single process. This significantly reduces the total runtime because it avoids repeated library and model initialization. + +Supported list arguments: +- `--batch-size` (or `batch_size` in YAML) +- `--seq-len-q` (or `seq_len_q` in YAML) +- `--seq-len-kv-cache` (or `seq_len_kv_cache` in YAML) +- `--balance-ratio` (or `balance_ratio` in YAML) + +Command-line arguments are comma separated, for example, `--batch-size 1,2,4`. Configs in the YAML file are lists, for example, `batch_size: [1, 2, 4]`. + +Run with OpenMPI: + +```bash +NP=4 ./mpi_launch.sh ./run.sh config_ctx.yaml --batch-size 1,2,4 --seq-len-q 1024,8192 +NP=4 ./mpi_launch.sh ./run.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP --batch-size 32,64,128,256,512 --seq-len-q 1,2,3,4 ``` ## Parse profiles -Coming soon. +Run the following command in the container: + +```bash +python3 parse.py --world-size 4 + +# Specify the location of the .nsys-rep file +python3 parse.py --profile-dir ./profiles --world-size 4 --rank 0 +``` + +It can parse only GEN-phase profiles for now. + +You will receive three reports, each containing kernel timing statistics grouped by module: +1. A printed report on stdout +2. A CSV report at `profiles/report_np4_rank0.csv` +3.
An HTML report at `profiles/report_np4_rank0.html` ## Trouble shooting diff --git a/examples/layer_wise_benchmarks/parse.py b/examples/layer_wise_benchmarks/parse.py new file mode 100644 index 00000000000..36248d8f114 --- /dev/null +++ b/examples/layer_wise_benchmarks/parse.py @@ -0,0 +1,449 @@ +import argparse +import bisect +import csv +import json +import re +import sqlite3 +import subprocess +from pathlib import Path + +import jinja2 +import numpy as np +import pandas as pd + +# Parse cmdline +parser = argparse.ArgumentParser() +parser.add_argument("--profile-dir", type=str, default="profiles") +parser.add_argument("--world-size", "--np", type=int) +parser.add_argument("--rank", type=int, default=0) +parser.add_argument("--warmup-times", type=int) +group = parser.add_mutually_exclusive_group(required=False) +group.add_argument("--error-on-unknown-kernel", action="store_true", dest="error_on_unknown_kernel") +group.add_argument( + "--no-error-on-unknown-kernel", action="store_false", dest="error_on_unknown_kernel" +) +parser.set_defaults(error_on_unknown_kernel=None) +args = parser.parse_args() +print(args) + + +def lazy_convert_sqlite(nsys_rep_file_path, sqlite_file_path): + if ( + not sqlite_file_path.is_file() + or nsys_rep_file_path.stat().st_mtime > sqlite_file_path.stat().st_mtime + ): + subprocess.check_call( + [ + "nsys", + "export", + "--type", + "sqlite", + "-o", + sqlite_file_path, + "--force-overwrite=true", + nsys_rep_file_path, + ] + ) + + +def shortest_common_supersequence(a, b): + # Merge two lists into their shortest common supersequence, + # so that both `a` and `b` are subsequences of the result. + # Uses dynamic programming to compute the shortest common supersequence, then reconstructs it. + m, n = len(a), len(b) + dp = [[0] * (n + 1) for _ in range(m + 1)] + for i in range(m + 1): + dp[i][0] = i + for j in range(n + 1): + dp[0][j] = j + for i in range(1, m + 1): + for j in range(1, n + 1): + if a[i - 1] == b[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + 1 + else: + dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1) + # Backtrack to build the merged sequence + res = [] + i, j = m, n + while i > 0 and j > 0: + if a[i - 1] == b[j - 1]: + res.append(a[i - 1]) + i -= 1 + j -= 1 + elif dp[i - 1][j] < dp[i][j - 1]: + res.append(a[i - 1]) + i -= 1 + else: + res.append(b[j - 1]) + j -= 1 + while i > 0: + res.append(a[i - 1]) + i -= 1 + while j > 0: + res.append(b[j - 1]) + j -= 1 + res.reverse() + return res + + +profile_dir = Path(args.profile_dir) +nsys_rep_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.nsys-rep" +sqlite_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.sqlite" +csv_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.csv" +html_file_path = profile_dir / f"report_np{args.world_size}_rank{args.rank}.html" +lazy_convert_sqlite(nsys_rep_file_path, sqlite_file_path) + +conn = sqlite3.connect(f"file:{sqlite_file_path}?mode=ro", uri=True) + +query = "SELECT * FROM ENUM_NSYS_EVENT_TYPE" +df = pd.read_sql_query(query, conn) +event_id_NvtxDomainCreate = df[df["name"] == "NvtxDomainCreate"].iloc[0]["id"].tolist() +event_id_NvtxPushPopRange = df[df["name"] == "NvtxPushPopRange"].iloc[0]["id"].tolist() + +query = "SELECT domainId FROM NVTX_EVENTS WHERE eventType = ? AND text = ?" 
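+# Look up the NVTX domain id registered by NCCL so that NCCL's own NVTX ranges can be excluded below when kernels are attributed to model modules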
+df = pd.read_sql_query(query, conn, params=(event_id_NvtxDomainCreate, "NCCL")) +nccl_domain_id = -1 if df.empty else df.iloc[0]["domainId"].tolist() + +query = """SELECT T1.start, T2.value AS text + FROM NVTX_EVENTS AS T1 + JOIN StringIds AS T2 ON T1.textId = T2.id + WHERE eventType = ? AND T2.value LIKE ?""" +df = pd.read_sql_query(query, conn, params=(event_id_NvtxPushPopRange, "layer_wise_benchmarks %")) +problem_start = [] +problem_set = [] +for start, text in df.itertuples(index=False): + if text.startswith("layer_wise_benchmarks args {"): + run_args = json.loads(text[len("layer_wise_benchmarks args") :]) + elif text.startswith("layer_wise_benchmarks problem_spec {"): + problem_start.append(start) + problem_set.append( + { + "spec": json.loads(text[len("layer_wise_benchmarks problem_spec") :]), + "text": "", + "runs": [], + "runs_end": [], + "ranges": [], + } + ) + +query = """SELECT T1.start, T1.end, T2.value AS text + FROM NVTX_EVENTS AS T1 + JOIN StringIds AS T2 ON T1.textId = T2.id + WHERE eventType = ? AND T2.value NOT LIKE ? AND T2.value NOT LIKE ? AND domainId != ?""" +df = pd.read_sql_query( + query, + conn, + params=(event_id_NvtxPushPopRange, "layer_wise_benchmarks %", "[DG]%", nccl_domain_id), +) +for start, end, text in df.itertuples(index=False): + problem_id = bisect.bisect(problem_start, start) - 1 + assert problem_id != -1 + if re.match(r"b=\d+ s=\d+ ", text): + problem_set[problem_id]["text"] = text + problem_set[problem_id]["runs"].append(start) + problem_set[problem_id]["runs_end"].append(end) + else: + problem_set[problem_id]["ranges"].append((start, end, text)) + +query = """SELECT name FROM sqlite_master WHERE type = ?""" +df = pd.read_sql_query(query, conn, params=("table",)) +tables = df["name"].tolist() +unified_subquery = """SELECT T1.start, T1.end, T1.demangledName, T1.correlationId, T1.graphNodeId + FROM CUPTI_ACTIVITY_KIND_KERNEL AS T1""" +if "CUPTI_ACTIVITY_KIND_MEMCPY" in tables: + unified_subquery += """ UNION ALL + SELECT T2.start, T2.end, -2 AS demangledName, T2.correlationId, T2.graphNodeId + FROM CUPTI_ACTIVITY_KIND_MEMCPY AS T2""" +if "CUPTI_ACTIVITY_KIND_MEMSET" in tables: + unified_subquery += """ UNION ALL + SELECT T3.start, T3.end, -3 AS demangledName, T3.correlationId, T3.graphNodeId + FROM CUPTI_ACTIVITY_KIND_MEMSET AS T3""" +query = f"""SELECT unified.start, unified.end, unified.demangledName, + R.start AS runtime_start, R.end AS runtime_end, + CGE2.start AS capture_start, CGE2.end AS capture_end +FROM ({unified_subquery}) AS unified +JOIN CUPTI_ACTIVITY_KIND_RUNTIME AS R ON unified.correlationId = R.correlationId +LEFT JOIN CUDA_GRAPH_NODE_EVENTS AS CGE1 ON unified.graphNodeId = CGE1.graphNodeId AND + CGE1.originalGraphNodeId IS NOT NULL +LEFT JOIN CUDA_GRAPH_NODE_EVENTS AS CGE2 ON CGE1.originalGraphNodeId = CGE2.graphNodeId""" +df = pd.read_sql_query(query, conn) +kernel_list = [] +for ( + start, + end, + demangledName, + runtime_start, + runtime_end, + capture_start, + capture_end, +) in df.itertuples(index=False): + problem_id = bisect.bisect(problem_start, start) - 1 + run_id = bisect.bisect(problem_set[problem_id]["runs"], runtime_start) - 1 + if ( + run_id == -1 + or run_id == len(problem_set[problem_id]["runs"]) + or runtime_start >= problem_set[problem_id]["runs_end"][run_id] + ): + run_id = -1 + ranges = [ + text + for range_start, range_end, text in problem_set[problem_id]["ranges"] + if capture_start >= range_start and capture_end <= range_end + ] + kernel_list.append( + ( + problem_id, + run_id, + ranges, + start, + end, + 
demangledName, + runtime_start, + runtime_end, + capture_start, + capture_end, + ) + ) +# TODO: Parse CTX phases + +query = "SELECT * FROM StringIds" +df = pd.read_sql_query(query, conn) +string_ids = dict(zip(df["id"], df["value"])) + +conn.close() + +kernel_list.sort(key=lambda t: (t[6], t[8])) +kernels = [[[] for _ in problem["runs"]] for problem in problem_set] +for ( + problem_id, + run_id, + ranges, + start, + end, + demangledName, + runtime_start, + runtime_end, + capture_start, + capture_end, +) in kernel_list: + if run_id != -1: + kernels[problem_id][run_id].append((demangledName, start, end, ranges)) +for problem_id in range(len(kernels)): + required_seq = [demangledName for demangledName, _, _, _ in kernels[problem_id][0]] + for run_id in range(len(kernels[problem_id])): + seq = [demangledName for demangledName, _, _, _ in kernels[problem_id][run_id]] + assert seq == required_seq + + +parser_keywords = [ + ("cuBLASGemm", "nvjet"), + ("splitKreduce", "splitKreduce_kernel"), + ("fusedAGemm", "fused_a_gemm_kernel"), + ("RMSNorm", "RMSNormKernel"), + ("torchCat", "CatArrayBatchedCopy"), + ("applyMLARope", "applyMLARope"), + ("fmhaSm100f", "fmhaSm100fKernel_Qkv"), + ("fmhaReduction", "fmhaReductionKernel"), + ("quant", "quantize_with_block_size"), + ("AllGather", "ncclDevKernel_AllGather_"), + ("ReduceScatter", "ncclDevKernel_ReduceScatter_"), + ("allreduce_oneshot", "allreduce_fusion_kernel_oneshot_lamport"), + ("allreduce_twoshot", "allreduce_fusion_kernel_twoshot_sync"), + ("expandInput", "expandInputRowsKernel"), + ("computeStrides", "computeStridesTmaWarpSpecializedKernel"), + ("cutlassGroupGemm", "cutlass::device_kernel", "at::native::CUDAFunctorOnSelf_add"), + ("convert_req_index", "_convert_req_index_to_global_index_kernel_with_stride_factor"), + ("preprocess_after_permute", "_preprocess_after_permute_kernel"), + ("masked_index_copy_quant", "_masked_index_copy_group_quant_fp8"), + ("swiglu_quant", "_silu_and_mul_post_quant_kernel"), + ("masked_index_gather", "masked_index_gather_kernel"), + ("finalizeMoeRouting", "tensorrt_llm::kernels::cutlass_kernels::finalizeMoeRoutingKernel<"), + ("fused_qkvzba_split", "fused_qkvzba_split_reshape_cat_kernel"), + ("causal_conv1d_update", "tensorrt_llm::kernels::causal_conv1d::causal_conv1d_update_kernel<"), + ("fused_delta_rule_update", "fused_sigmoid_gating_delta_rule_update_kernel"), + ("layer_norm_fwd_1pass", "_layer_norm_fwd_1pass_kernel"), + ("torchGatherTopK", "at::native::sbtopk::gatherTopK<"), + ("softmax_warp_forward", "softmax_warp_forward<"), + ("torchSigmoid", "at::native::sigmoid_kernel_cuda"), + ("torchMul", "at::native::binary_internal::MulFunctor<"), + ("applyBiasRopeUpdateKVCache", "tensorrt_llm::kernels::applyBiasRopeUpdateKVCacheV2<"), + ("routingIndicesHistogramScores", "routingRenormalize::routingIndicesHistogramScoresKernel<"), + ("routingIndicesHistogram", "routingIndicesHistogramKernel<"), + ("routingIndicesOffsets", "routingIndicesOffsetsKernel<"), + ("torchReduceSum", ["at::native::reduce_kernel<", "at::native::sum_functor<"]), + ("CuteDSLMoePermute", "cute_dsl::moePermuteKernel"), + ( + "CuteDSLGroupedGemmSwiglu", + ["cute_dsl_kernels", "blockscaled_contiguous_grouped_gemm_swiglu_fusion"], + ), + ( + "CuteDSLGroupedGemmFinalize", + ["cute_dsl_kernels", "blockscaled_contiguous_grouped_gemm_finalize_fusion"], + ), +] +warned_names = set() + + +def parse_kernel_name(demangledName): + if demangledName == -2: + return "Memcpy" + if demangledName == -3: + return "Memset" + name = string_ids[demangledName] + for dst, 
src in parser_keywords: + if not isinstance(src, (tuple, list)): + src = [src] + if all(keyword in name for keyword in src): + return dst + if name not in warned_names: + print(f"Unknown kernel name: {name}") + warned_names.add(name) + if args.error_on_unknown_kernel: + raise NotImplementedError(f"Unknown kernel name: {name}") + return name[:30] + + +converted_seqs = [] +for runs in kernels: + warmup_times = run_args["warmup_times"] if args.warmup_times is None else args.warmup_times + converted_seq = [] + # Kernel time + for i, (demangledName, _, _, ranges) in enumerate(runs[0]): + name = parse_kernel_name(demangledName) + category = (*ranges, name) + time_list = [run[i][2] - run[i][1] for run in runs] + t = np.mean(time_list[warmup_times:]).tolist() + converted_seq.append((category, t)) + # Space and Overlap + overlap_list = [] + space_list = [] + for run in runs: + sorted_run = sorted(run, key=lambda op: op[1]) + last_end = sorted_run[0][1] + overlap_time = 0 + space_time = 0 + for _, start, end, _ in sorted_run: + if start > last_end: + space_time += start - last_end + else: + overlap_time += min(last_end, end) - start + last_end = max(last_end, end) + overlap_list.append(-overlap_time) + space_list.append(space_time) + converted_seq.append((("Overlap",), np.mean(overlap_list[warmup_times:]).tolist())) + converted_seq.append((("Space",), np.mean(space_list[warmup_times:]).tolist())) + converted_seq.append((("Total",), sum(t for _, t in converted_seq))) + converted_seqs.append(converted_seq) + +merged_title = [] +for converted_seq in converted_seqs: + title = [name for name, _ in converted_seq] + merged_title = shortest_common_supersequence(merged_title, title) + +merged_data = [[0.0] * len(problem_set) for _ in merged_title] +for problem_id, converted_seq in enumerate(converted_seqs): + cur = 0 + for category, t in converted_seq: + cur = merged_title.index(category, cur) + merged_data[cur][problem_id] = t + cur += 1 + +print("Problem set:") +for problem in problem_set: + print( + f'- "{problem["text"]}" {len(problem["runs"])} runs' + f" Ranges: [{', '.join(text for _, _, text in problem['ranges'])}]" + ) + +stack = [] +csv_data = [["", *[problem["text"] for problem in problem_set]]] +js_data = [] +js_stack = [js_data] +max_title_len = max((len(title) - 1) * 3 + len(title[-1]) for title in merged_title) +for title, time_data in zip(merged_title, merged_data): + while stack != list(title[: len(stack)]): + level_title = stack[-1] + stack.pop() + js_stack[-2].append( + { + "name": level_title, + "children": js_stack[-1], + } + ) + js_stack.pop() + while len(stack) != len(title) - 1: + level_title = title[len(stack)] + stack.append(level_title) + level = len(stack) + print("|--" * (level - 1) + level_title) + csv_data.append(["|--" * (level - 1) + level_title]) + js_stack.append([]) + level = len(stack) + 1 + print( + "|--" * (level - 1) + title[-1] + " " * (max_title_len - (level - 1) * 3 - len(title[-1])), + *[f"{x / 1000:-6.1f}" for x in time_data], + ) + csv_data.append(["|--" * (level - 1) + title[-1], *[f"{x / 1000:.1f}" for x in time_data]]) + if title != ("Total",): + js_stack[-1].append( + { + "name": title[-1], + "time": [x / 1000 for x in time_data], + } + ) +# TODO: Group repeated modules +with csv_file_path.open("w", newline="") as f: + csv_writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL) + for row in csv_data: + csv_writer.writerow(row) +js_header_config = [{"name": problem["text"]} for problem in problem_set] +loader = jinja2.FileSystemLoader(Path(__file__).parent) +template 
= jinja2.Environment(loader=loader).get_template("template.html") +with html_file_path.open("w") as f: + f.write( + template.render( + headerConfig=js_header_config, rawData=js_data, runArgs=json.dumps(run_args, indent=4) + ) + ) diff --git a/examples/layer_wise_benchmarks/run.py b/examples/layer_wise_benchmarks/run.py new file mode 100644 index 00000000000..6a074cb67d9 --- /dev/null +++ b/examples/layer_wise_benchmarks/run.py @@ -0,0 +1,235 @@ +import argparse +import itertools +import json + +import numpy as np +import nvtx +import torch +import yaml + +from tensorrt_llm._torch.autotuner import AutoTuner, autotune +from tensorrt_llm._torch.modules.multi_stream_utils import with_multi_stream +from tensorrt_llm._utils import local_mpi_rank, mpi_rank, mpi_world_size +from tensorrt_llm.tools.layer_wise_benchmarks import BalanceMethod, get_runner_cls, mark_ranges + + +def comma_separated_ints(s): + return [int(x) for x in s.split(",")] + + +def comma_separated_floats(s): + return [float(x) for x in s.split(",")] + + +# Parse cmdline +parser = argparse.ArgumentParser() +parser.add_argument("config_path", type=str) +parser.add_argument("--model", type=str, help="Pretrained model name or path") +parser.add_argument( + "--layer-indices", + type=comma_separated_ints, + help="Comma separated indices of layers, should be a contiguous range", +) +parser.add_argument("--run-type", type=str, choices=["CTX", "GEN"]) +parser.add_argument("--scaled-from", type=int) +# KV cache related args +parser.add_argument("--max-batch-size", type=int) +parser.add_argument("--tokens-per-block", type=int) +parser.add_argument("--max-seq-len", type=int) +group = parser.add_mutually_exclusive_group(required=False) +group.add_argument("--enable-attention-dp", action="store_true", dest="enable_attention_dp") +group.add_argument("--no-enable-attention-dp", action="store_false", dest="enable_attention_dp") +parser.set_defaults(enable_attention_dp=None) +# Model init args +parser.add_argument("--max-num-tokens", type=int) +parser.add_argument("--moe-backend", type=str) +parser.add_argument("--moe-max-num-tokens", type=int) +group = parser.add_mutually_exclusive_group(required=False) +group.add_argument("--use-cuda-graph", action="store_true", dest="use_cuda_graph") +group.add_argument("--no-use-cuda-graph", action="store_false", dest="use_cuda_graph") +parser.set_defaults(use_cuda_graph=None) +# Per iteration args +parser.add_argument("--batch-size", type=comma_separated_ints, dest="batch_size_list") +parser.add_argument("--seq-len-q", type=comma_separated_ints, dest="seq_len_q_list") +parser.add_argument("--seq-len-kv-cache", type=comma_separated_ints, dest="seq_len_kv_cache_list") +parser.add_argument("--balance-method", type=str) +parser.add_argument("--balance-ratio", type=comma_separated_floats, dest="balance_ratio_list") +# Schedule +parser.add_argument("--warmup-times", type=int, default=20) +parser.add_argument("--run-times", type=int, default=100) +args = parser.parse_args() +# Load YAML file +with open(args.config_path) as f: + config = yaml.safe_load(f) +del args.config_path +for k, v in vars(args).items(): + if k.endswith("_list"): + config_key = k[: -len("_list")] + if v is None and config_key in config: + v = config[config_key] + if isinstance(v, list): + pass + elif v is None or isinstance(v, (int, float)): + v = [v] + else: + raise ValueError(f'Config "{config_key}" in YAML should be a value or a list') + setattr(args, k, v) + else: + config_key = k + if v is None and config_key in config: + v = 
config[config_key] + setattr(args, k, v) + if config_key in config: + del config[config_key] +if config: + raise ValueError(f"Config {','.join(config.keys())} from file are not options") +# Set default values +if args.max_batch_size is None: + args.max_batch_size = max(args.batch_size_list) +if args.max_num_tokens is None: + args.max_num_tokens = args.max_batch_size * max(args.seq_len_q_list) +print(args) + +# MPI args +rank = mpi_rank() +world_size = mpi_world_size() +local_rank = local_mpi_rank() +torch.cuda.set_device(local_rank) + +# Create KV cache manager +mark_ranges() +Runner = get_runner_cls(args.model) +mapping = Runner.create_mapping(enable_attention_dp=args.enable_attention_dp) +kv_cache_manager = Runner.create_kv_cache_manager( + args.model, + mapping, + tokens_per_block=args.tokens_per_block, + max_batch_size=args.max_batch_size, + max_seq_len=args.max_seq_len, + layer_indices=args.layer_indices, +) +attn_workspace = torch.empty((0,), device="cuda", dtype=torch.int8) + +# Create other global objects +AutoTuner.get().clear_cache() +capture_stream = torch.cuda.Stream() + +# Create Runner +runner = Runner( + args.model, + mapping, + moe_backend=args.moe_backend, + layer_indices=args.layer_indices, + scaled_from=args.scaled_from, + max_seq_len=args.max_seq_len, + max_num_tokens=args.max_num_tokens, + moe_max_num_tokens=args.moe_max_num_tokens, + use_cuda_graph=args.use_cuda_graph, +) + +# Warm up +for autotune_flag, batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in [ + [ + True, + max(args.batch_size_list), + max(args.seq_len_q_list), + args.seq_len_kv_cache_list[0], + args.balance_ratio_list[0], + ], + *itertools.product( + [False], + args.batch_size_list, + args.seq_len_q_list, + args.seq_len_kv_cache_list, + args.balance_ratio_list, + ), +]: + assert batch_size <= args.max_batch_size + assert seq_len_q + seq_len_kv_cache <= args.max_seq_len + run_pack = runner.create_run_pack( + args.run_type, + batch_size=batch_size, + seq_len_q=seq_len_q, + seq_len_kv_cache=seq_len_kv_cache, + kv_cache_manager=kv_cache_manager, + attn_workspace=attn_workspace, + ) + with runner.replace_routing_method_ctx( + balance_method=BalanceMethod[args.balance_method], balance_ratio=balance_ratio + ): + capture_stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(capture_stream): + if autotune_flag: + with autotune(): + run_pack() + run_pack() + torch.cuda.current_stream().wait_stream(capture_stream) +torch.cuda.synchronize() + +events = [ + torch.cuda.Event(enable_timing=True) for _ in range(args.warmup_times + args.run_times + 1) +] +[e.record() for e in events] # Explicitly warmup events because torch is lazy + +torch.cuda.cudart().cudaProfilerStart() +with nvtx.annotate(f"layer_wise_benchmarks args {json.dumps(args.__dict__)}"): + pass # Use `annotate` instead of `mark` to avoid addition lines on the Nsight Systems UI +for batch_size, seq_len_q, seq_len_kv_cache, balance_ratio in itertools.product( + args.batch_size_list, args.seq_len_q_list, args.seq_len_kv_cache_list, args.balance_ratio_list +): + # Profile: capture graph and replay it + problem_spec = { + "batch_size": batch_size, + "seq_len_q": seq_len_q, + "seq_len_kv_cache": seq_len_kv_cache, + "balance_ratio": balance_ratio, + } + with nvtx.annotate(f"layer_wise_benchmarks problem_spec {json.dumps(problem_spec)}"): + pass + run_pack = runner.create_run_pack( + args.run_type, + batch_size=batch_size, + seq_len_q=seq_len_q, + seq_len_kv_cache=seq_len_kv_cache, + kv_cache_manager=kv_cache_manager, + 
attn_workspace=attn_workspace, + ) + with runner.replace_routing_method_ctx( + balance_method=BalanceMethod[args.balance_method], balance_ratio=balance_ratio + ): + if args.use_cuda_graph: + with with_multi_stream(True): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g, stream=capture_stream, capture_error_mode="global"): + run_pack() + + balance_ratio_str = "" if balance_ratio is None else f" balance={balance_ratio:.2g}" + nvtx_message = f"b={batch_size} s={seq_len_q} past={seq_len_kv_cache}{balance_ratio_str} NP{world_size}" + for i in range(args.warmup_times + args.run_times): + events[i].record() + with nvtx.annotate(nvtx_message): + if args.use_cuda_graph: + g.replay() + else: + run_pack() + events[-1].record() + torch.cuda.synchronize() + + # Print statistics + # Print before `cudaProfilerStop` to ensure messages are included in the profile + time_list = [start.elapsed_time(stop) for start, stop in zip(events, events[1:])] + time_list = time_list[args.warmup_times :] + print( + f"[RANK {rank}]" + f" batch_size {batch_size}" + f" seq_len_q {seq_len_q}" + f" seq_len_kv_cache {seq_len_kv_cache}" + + ("" if balance_ratio is None else f" balance_ratio {balance_ratio:.2g}") + + f" mean {np.mean(time_list) * 1000:.1f}" + f" median {np.median(time_list) * 1000:.1f}" + f" min {np.min(time_list) * 1000:.1f}" + f" max {np.max(time_list) * 1000:.1f}" + f" P90 {np.percentile(time_list, 90) * 1000:.1f}" + f" (us)" + ) +torch.cuda.cudart().cudaProfilerStop() diff --git a/examples/layer_wise_benchmarks/run_single.sh b/examples/layer_wise_benchmarks/run.sh similarity index 74% rename from examples/layer_wise_benchmarks/run_single.sh rename to examples/layer_wise_benchmarks/run.sh index be9aa6e5a4d..3c45da1324a 100755 --- a/examples/layer_wise_benchmarks/run_single.sh +++ b/examples/layer_wise_benchmarks/run.sh @@ -13,17 +13,18 @@ if [ "$RANK" -eq 0 ]; then export TLLM_LOG_LEVEL=INFO fi +PROFILE_DIR=${PROFILE_DIR:-profiles} +mkdir -p ${PROFILE_DIR} + PROFILE=${PROFILE:-1} GPU_METRICS=${GPU_METRICS:-0} if [ "$PROFILE" -eq 1 ]; then - PROFILE_FOLDER=profiles/run_single - mkdir -p ${PROFILE_FOLDER} PROFILE_CMD="nsys profile -t cuda,nvtx -s none --cpuctxsw none --cuda-event-trace false --cuda-graph-trace node -c cudaProfilerApi --capture-range-end stop - -o ${PROFILE_FOLDER}/run_single_ep${WORLD_SIZE}_rank${RANK}.nsys-rep + -o ${PROFILE_DIR}/report_np${WORLD_SIZE}_rank${RANK}.nsys-rep --force-overwrite true" if [ "$GPU_METRICS" -eq 1 ]; then PROFILE_CMD+=" --gpu-metrics-devices $LOCAL_RANK @@ -34,4 +35,6 @@ else fi set -x -$PROFILE_CMD python3 -u run_single.py "$@" +$PROFILE_CMD bash -c \ + "python3 -u run.py \"\$@\" 2>&1 | tee \"$PROFILE_DIR/report_np${WORLD_SIZE}_rank${RANK}.log\"" \ + bash "$@" diff --git a/examples/layer_wise_benchmarks/run_single.py b/examples/layer_wise_benchmarks/run_single.py deleted file mode 100644 index 77b95c35a6c..00000000000 --- a/examples/layer_wise_benchmarks/run_single.py +++ /dev/null @@ -1,157 +0,0 @@ -import argparse - -import numpy as np -import nvtx -import torch -import yaml - -from tensorrt_llm._torch.autotuner import AutoTuner, autotune -from tensorrt_llm._torch.modules.multi_stream_utils import with_multi_stream -from tensorrt_llm._utils import local_mpi_rank, mpi_rank, mpi_world_size -from tensorrt_llm.tools.layer_wise_benchmarks import BalanceMethod, get_runner_cls - - -def comma_separated_ints(s): - return [int(x) for x in s.split(",")] - - -# Parse cmdline -parser = argparse.ArgumentParser() -parser.add_argument("config_path", type=str) 
-parser.add_argument("--model", type=str, help="Pretrained model name or path") -parser.add_argument( - "--layer-indices", - type=comma_separated_ints, - help="Comma separated indices of layers, should be a contiguous range", -) -parser.add_argument("--run-type", type=str, choices=["CTX", "GEN"]) -parser.add_argument("--scaled-from", type=int) -# KV cache related args -parser.add_argument("--max-batch-size", type=int) -parser.add_argument("--tokens-per-block", type=int) -parser.add_argument("--max-seq-len", type=int) -group = parser.add_mutually_exclusive_group(required=False) -group.add_argument("--enable-attention-dp", action="store_true", dest="enable_attention_dp") -group.add_argument("--no-enable-attention-dp", action="store_false", dest="enable_attention_dp") -parser.set_defaults(enable_attention_dp=None) -# Model init args -parser.add_argument("--max-num-tokens", type=int) -parser.add_argument("--moe-backend", type=str) -parser.add_argument("--moe-max-num-tokens", type=int) -group = parser.add_mutually_exclusive_group(required=False) -group.add_argument("--use-cuda-graph", action="store_true", dest="use_cuda_graph") -group.add_argument("--no-use-cuda-graph", action="store_false", dest="use_cuda_graph") -parser.set_defaults(use_cuda_graph=None) -# Per iteration args -parser.add_argument("--batch-size", type=int) -parser.add_argument("--seq-len-q", type=int) -parser.add_argument("--seq-len-kv-cache", type=int) -parser.add_argument("--balance-method", type=str) -parser.add_argument("--balance-ratio", type=float) -args = parser.parse_args() -with open(args.config_path) as f: - config = yaml.safe_load(f) -del args.config_path -for k, v in vars(args).items(): - if v is None and k in config: - setattr(args, k, config[k]) -if args.max_batch_size is None: - args.max_batch_size = args.batch_size -if args.max_num_tokens is None: - args.max_num_tokens = args.max_batch_size * args.seq_len_q -print(args) - -# MPI args -rank = mpi_rank() -world_size = mpi_world_size() -local_rank = local_mpi_rank() -torch.cuda.set_device(local_rank) - -# Create KV cache manager -Runner = get_runner_cls(args.model) -mapping = Runner.create_mapping(enable_attention_dp=args.enable_attention_dp) -kv_cache_manager = Runner.create_kv_cache_manager( - args.model, - mapping, - tokens_per_block=args.tokens_per_block, - max_batch_size=args.max_batch_size, - max_seq_len=args.max_seq_len, - layer_indices=args.layer_indices, -) -attn_workspace = torch.empty((0,), device="cuda", dtype=torch.int8) - -# Create other global objects -AutoTuner.get().clear_cache() -capture_stream = torch.cuda.Stream() - -# Create Runner -runner = Runner( - args.model, - mapping, - moe_backend=args.moe_backend, - layer_indices=args.layer_indices, - scaled_from=args.scaled_from, - max_seq_len=args.max_seq_len, - max_num_tokens=args.max_num_tokens, - moe_max_num_tokens=args.moe_max_num_tokens, - use_cuda_graph=args.use_cuda_graph, -) - -# Warm up -assert args.batch_size <= args.max_batch_size -assert args.seq_len_q + args.seq_len_kv_cache <= args.max_seq_len -run_pack = runner.create_run_pack( - args.run_type, - batch_size=args.batch_size, - seq_len_q=args.seq_len_q, - seq_len_kv_cache=args.seq_len_kv_cache, - kv_cache_manager=kv_cache_manager, - attn_workspace=attn_workspace, -) -runner.replace_routing_method( - balance_method=BalanceMethod[args.balance_method], balance_ratio=args.balance_ratio -) -capture_stream.wait_stream(torch.cuda.current_stream()) -with torch.cuda.stream(capture_stream): - run_pack() - with autotune(): - run_pack() 
-torch.cuda.current_stream().wait_stream(capture_stream) -torch.cuda.synchronize() - -# Profile: capture graph and replay it -torch.cuda.cudart().cudaProfilerStart() -if args.use_cuda_graph: - with with_multi_stream(True): - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(g, stream=capture_stream, capture_error_mode="global"): - run_pack() - -warmup_times = 20 -run_times = 100 -events = [torch.cuda.Event(enable_timing=True) for _ in range(warmup_times + run_times + 1)] -for i in range(warmup_times + run_times): - events[i].record() - with nvtx.annotate(f"b={args.batch_size} s={args.seq_len_q} EP{world_size}"): - if args.use_cuda_graph: - g.replay() - else: - run_pack() -events[-1].record() -torch.cuda.synchronize() - -# Print statistics -# Print before `cudaProfilerStop` to ensure messages are included in the profile -time_list = [start.elapsed_time(stop) for start, stop in zip(events, events[1:])] -time_list = time_list[warmup_times:] -print( - f"[RANK {rank}]" - f" min {np.min(time_list) * 1000:.1f}" - f" max {np.max(time_list) * 1000:.1f}" - f" mean {np.mean(time_list) * 1000:.1f}" - f" median {np.median(time_list) * 1000:.1f}" - f" P90 {np.percentile(time_list, 90) * 1000:.1f}" - f" (us)" -) - -torch.cuda.cudart().cudaProfilerStop() diff --git a/examples/layer_wise_benchmarks/slurm_init_containers.sh b/examples/layer_wise_benchmarks/slurm_init_containers.sh index 59d783b183a..08e77c5623a 100755 --- a/examples/layer_wise_benchmarks/slurm_init_containers.sh +++ b/examples/layer_wise_benchmarks/slurm_init_containers.sh @@ -3,6 +3,7 @@ set -euo pipefail # CONTAINER_IMAGE= +CONTAINER_NAME=${CONTAINER_NAME:-layer_wise_benchmarks} CONTAINER_MOUNTS=$(realpath "$(pwd)/../.."):$(realpath "$(pwd)/../..") if [ "${SLURM_JOB_ID:-}" == "" ]; then @@ -46,7 +47,7 @@ set -x srun -N "$NODES" \ --ntasks-per-node 1 \ --container-image "$CONTAINER_IMAGE" \ - --container-name "layer_wise_benchmarks" \ + --container-name "$CONTAINER_NAME" \ --container-mounts "$CONTAINER_MOUNTS" \ --container-workdir "$WORKDIR" \ bash -c "pip install -U packaging && diff --git a/examples/layer_wise_benchmarks/slurm_query_container_name.sh b/examples/layer_wise_benchmarks/slurm_query_container_name.sh new file mode 100755 index 00000000000..a7f6b1ba815 --- /dev/null +++ b/examples/layer_wise_benchmarks/slurm_query_container_name.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -euo pipefail + +prefix="pyxis_${SLURM_JOB_ID}_" +matches=$(printf "%s\n" "$(srun -N 1 enroot list)" | grep "^${prefix}" || true) +count=$(printf "%s\n" "$matches" | wc -l) + +if [ "$count" -eq 0 ]; then + echo "Error: No container found" >&2 + exit 1 +fi + +if [ "$count" -gt 1 ]; then + echo "Error: Multiple containers found" >&2 + while IFS= read -r match; do + echo "- ${match#$prefix}" >&2 + done <<< "$matches" + exit 1 +fi + +suffix=${matches#$prefix} +echo "Container name: $suffix" >&2 +echo "$suffix" diff --git a/examples/layer_wise_benchmarks/template.html b/examples/layer_wise_benchmarks/template.html new file mode 100644 index 00000000000..37017cae941 --- /dev/null +++ b/examples/layer_wise_benchmarks/template.html @@ -0,0 +1,766 @@ + + + + + + CUDA Kernel Latency Dashboard + + + + + +
+ <!-- template.html (abridged): CUDA Kernel Latency Dashboard with a controls bar (arrow keys to move, +/- to toggle, click charts to select), a collapsible "🔧 Configuration" section that renders {{ runArgs }}, and "Row Analysis" / "Column Hierarchy" chart panels -->
+ + + + diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/__init__.py b/tensorrt_llm/tools/layer_wise_benchmarks/__init__.py index 607e110b62e..e347df3ca8e 100644 --- a/tensorrt_llm/tools/layer_wise_benchmarks/__init__.py +++ b/tensorrt_llm/tools/layer_wise_benchmarks/__init__.py @@ -1,7 +1,5 @@ +from .mark_utils import mark_ranges from .runner_factory import get_runner_cls from .runner_interface import BalanceMethod -__all__ = [ - "BalanceMethod", - "get_runner_cls", -] +__all__ = ["BalanceMethod", "get_runner_cls", "mark_ranges"] diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py b/tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py index 307e591dd27..b2445b18b22 100644 --- a/tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py +++ b/tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py @@ -1,124 +1,16 @@ -import functools from typing import List, Optional import torch -import tensorrt_llm._torch.models.modeling_deepseekv3 from tensorrt_llm._torch.model_config import ModelConfig -from tensorrt_llm._torch.models.modeling_deepseekv3 import DeepseekV3DecoderLayer, DeepseekV3Gate +from tensorrt_llm._torch.models.modeling_deepseekv3 import DeepseekV3DecoderLayer from tensorrt_llm._torch.modules.rms_norm import RMSNorm from tensorrt_llm._torch.utils import AuxStreamType -from tensorrt_llm._utils import mpi_rank, mpi_world_size from tensorrt_llm.functional import AllReduceStrategy from tensorrt_llm.mapping import Mapping -from .runner_interface import BalanceMethod, RunnerBase -from .runner_utils import RunnerMixin, ceil_div - - -class RoutingMethod(DeepseekV3Gate): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.world_size = mpi_world_size() - self.rank = mpi_rank() - self.balance_method = BalanceMethod.NotModified - self.balance_ratio = None - - def apply(self, router_logits) -> (torch.Tensor, torch.Tensor): - token_selected_experts, token_final_scales = super().apply(router_logits) - num_experts = self.weight.shape[0] - if self.balance_method == BalanceMethod.NotModified: - pass - elif self.balance_method == BalanceMethod.Balanced: - token_selected_experts = RoutingMethod.get_balanced_selection( - token_selected_experts.shape[0], - token_selected_experts.shape[1], - num_experts, - token_selected_experts.dtype, - self.world_size, - self.rank, - ) - elif self.balance_method == BalanceMethod.ImbalancedRanks: - token_selected_experts = RoutingMethod.get_all_to_one_selection( - token_selected_experts.shape[0], - token_selected_experts.shape[1], - num_experts, - self.balance_ratio, - token_selected_experts.dtype, - self.world_size, - self.rank, - ) - elif self.balance_method == BalanceMethod.ImbalancedExperts: - token_selected_experts = RoutingMethod.get_balanced_rank_imbalanced_expert_selection( - token_selected_experts.shape[0], - token_selected_experts.shape[1], - num_experts, - self.balance_ratio, - token_selected_experts.dtype, - self.world_size, - self.rank, - ) - else: - raise NotImplementedError(f"Not support balance_method {self.balance_method}") - return token_selected_experts, token_final_scales - - @staticmethod - @functools.cache - def get_balanced_selection(num_tokens, top_k, num_experts, dtype, world_size, rank): - a = torch.arange(num_tokens * world_size * top_k, dtype=dtype, device="cuda").view( - num_tokens, world_size, top_k - )[:, rank] - experts = ( - a * (num_experts // world_size + 1) + a // num_experts * (num_experts // world_size) - ) % num_experts - return experts.contiguous() - - 
@staticmethod - def apply_balance_ratio(imbalanced_experts, num_experts, balance_ratio, world_size, rank): - num_tokens, top_k = imbalanced_experts.shape - dtype = imbalanced_experts.dtype - balanced_experts = RoutingMethod.get_balanced_selection( - num_tokens, top_k, num_experts, dtype, world_size, rank - ) - num_balanced_tokens = round(num_tokens * balance_ratio) - if balance_ratio != 0: - # Activate all experts - num_balanced_tokens = max( - num_balanced_tokens, ceil_div(num_experts, world_size * top_k) - ) - mixed_experts = balanced_experts.clone() - mixed_experts[num_balanced_tokens:] = imbalanced_experts[num_balanced_tokens:] - return mixed_experts - - @staticmethod - @functools.cache - def get_all_to_one_selection( - num_tokens, top_k, num_experts, balance_ratio, dtype, world_size, rank - ): - assert num_experts // world_size >= top_k - imbalanced_experts = torch.arange(num_tokens * top_k, dtype=dtype, device="cuda").view( - num_tokens, top_k - ) % (num_experts // world_size) - return RoutingMethod.apply_balance_ratio( - imbalanced_experts, num_experts, balance_ratio, world_size, rank - ) - - @staticmethod - @functools.cache - def get_balanced_rank_imbalanced_expert_selection( - num_tokens, top_k, num_experts, balance_ratio, dtype, world_size, rank - ): - experts_per_rank = num_experts // world_size - activate_experts_per_rank = ceil_div(top_k, world_size) - a = torch.arange(num_tokens * top_k, dtype=dtype, device="cuda").view(num_tokens, top_k) - narrow_experts = a % (activate_experts_per_rank * world_size) - imbalanced_experts = ( - narrow_experts * experts_per_rank % num_experts - + narrow_experts // world_size % experts_per_rank - ) - return RoutingMethod.apply_balance_ratio( - imbalanced_experts, num_experts, balance_ratio, world_size, rank - ) +from .runner_interface import RunnerBase +from .runner_utils import RunnerMixin class DeepSeekV3Runner(RunnerMixin, RunnerBase): @@ -139,10 +31,6 @@ def __init__( moe_max_num_tokens: int, use_cuda_graph: bool, ): - # Temporally replace the gate class - gate_cls_orig = tensorrt_llm._torch.models.modeling_deepseekv3.DeepseekV3Gate - tensorrt_llm._torch.models.modeling_deepseekv3.DeepseekV3Gate = RoutingMethod - self.model_config = ModelConfig.from_pretrained( pretrained_model_name_or_path, mapping=mapping, @@ -206,20 +94,3 @@ def __init__( layers[-1].next_layer_layernorm = next_layer_layernorm self.layers = layers - tensorrt_llm._torch.models.modeling_deepseekv3.DeepseekV3Gate = gate_cls_orig - - def replace_routing_method(self, balance_method: BalanceMethod, balance_ratio: float): - if self.model_config.moe_backend not in [ - "CUTLASS", - "DEEPGEMM", - "TRTLLM", - "WIDEEP", - "CUTEDSL", - ]: - raise NotImplementedError( - f'Not support replace routing method for moe_backend "{self.model_config.moe_backend}",' - f' please set balance_method to "NotModified"' - ) - for layer in self.layers: - layer.mlp.gate.balance_method = balance_method - layer.mlp.gate.balance_ratio = balance_ratio diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/mark_utils.py b/tensorrt_llm/tools/layer_wise_benchmarks/mark_utils.py new file mode 100644 index 00000000000..72625d10598 --- /dev/null +++ b/tensorrt_llm/tools/layer_wise_benchmarks/mark_utils.py @@ -0,0 +1,24 @@ +import nvtx + +from tensorrt_llm._torch.models.modeling_deepseekv3 import DeepseekV3Gate +from tensorrt_llm._torch.models.modeling_qwen3_next import ( + Qwen3NextGatedDeltaNet, + Qwen3NextSparseMoeBlock, +) +from tensorrt_llm._torch.modules.attention import MLA, Attention +from 
tensorrt_llm._torch.modules.fused_moe.interface import MoE +from tensorrt_llm._torch.modules.gated_mlp import GatedMLP + + +def mark_ranges(): + DeepseekV3Gate.forward = nvtx.annotate("DeepseekV3Gate")(DeepseekV3Gate.forward) + Qwen3NextGatedDeltaNet.forward = nvtx.annotate("Qwen3NextGatedDeltaNet")( + Qwen3NextGatedDeltaNet.forward + ) + Qwen3NextSparseMoeBlock.forward = nvtx.annotate("Qwen3NextSparseMoeBlock")( + Qwen3NextSparseMoeBlock.forward + ) + MLA.forward = nvtx.annotate("MLA")(MLA.forward) + Attention.forward = nvtx.annotate("Attention")(Attention.forward) + MoE.forward = nvtx.annotate("MoE")(MoE.forward) + GatedMLP.forward = nvtx.annotate("GatedMLP")(GatedMLP.forward) diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/runner_interface.py b/tensorrt_llm/tools/layer_wise_benchmarks/runner_interface.py index 20d672da167..9451124e20c 100644 --- a/tensorrt_llm/tools/layer_wise_benchmarks/runner_interface.py +++ b/tensorrt_llm/tools/layer_wise_benchmarks/runner_interface.py @@ -28,7 +28,7 @@ def create_run_pack( pass @abstractmethod - def replace_routing_method(self, balance_method: BalanceMethod, balance_ratio: float): + def replace_routing_method_ctx(self, balance_method: BalanceMethod, balance_ratio: float): pass @staticmethod diff --git a/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py b/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py index ea7201e7f2b..8ef5266b9e4 100644 --- a/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py +++ b/tensorrt_llm/tools/layer_wise_benchmarks/runner_utils.py @@ -1,5 +1,7 @@ import contextlib +import functools import os +import unittest.mock import weakref from abc import ABC, abstractmethod from typing import Optional @@ -9,6 +11,8 @@ from tensorrt_llm._torch.attention_backend.utils import get_attention_backend from tensorrt_llm._torch.metadata import KVCacheParams from tensorrt_llm._torch.model_config import ModelConfig +from tensorrt_llm._torch.modules.fused_moe.fused_moe_cutlass import CutlassFusedMoE +from tensorrt_llm._torch.modules.fused_moe.fused_moe_trtllm_gen import TRTLLMGenFusedMoE from tensorrt_llm._torch.modules.fused_moe.fused_moe_wide_ep import WideEPMoE from tensorrt_llm._torch.modules.linear import Linear, WeightMode from tensorrt_llm._torch.modules.mamba.mamba2_metadata import Mamba2Metadata @@ -33,6 +37,166 @@ def round_up(a, b): return ceil_div(a, b) * b +def get_balanced_selection_no_cache( + num_tokens, top_k, num_experts, dtype, device, world_size, rank +): + # First, each sender selects target rank + target_rank_before_mod = torch.arange(num_tokens * world_size * top_k).view( + num_tokens, world_size, top_k + ) + target_rank_before_mod += top_k * torch.arange(num_tokens).view( + num_tokens, 1, 1 + ) # Shift `top_k` ranks for the next token on each rank, to balance network traffic + target_rank = target_rank_before_mod % world_size + # Second, each receiver selects target expert + target_expert = torch.empty_like(target_rank) + for reciever_rank in range(world_size): + mask = target_rank == reciever_rank + experts_per_rank = num_experts // world_size + local_expert = torch.arange(num_tokens * top_k) % experts_per_rank + target_expert[mask] = (reciever_rank * experts_per_rank) + local_expert + token_selected_experts = target_expert[:, rank].sort(dim=-1).values + return token_selected_experts.contiguous().to(dtype=dtype, device=device) + + +get_balanced_selection = functools.cache(get_balanced_selection_no_cache) + + +def test_get_balanced_selection(): + dtype = torch.long + for num_tokens in range(1, 
33): + for num_experts in range(1, 65): + print(f"{num_tokens=} {num_experts=}") + for top_k in range(1, min(11, num_experts)): + for world_size in range(1, 65): + if num_experts % world_size == 0: + tokens_per_expert = torch.zeros(num_experts) + for rank in range(world_size): + token_selected_experts = get_balanced_selection_no_cache( + num_tokens, top_k, num_experts, dtype, "cpu", world_size, rank + ) + sorted_selection = token_selected_experts.sort(dim=-1).values + if (sorted_selection[:, :-1] == sorted_selection[:, 1:]).any(): + raise ValueError(f"duplicated experts on rank {rank}") + experts_per_rank = num_experts // world_size + tokens_per_rank = ( + (token_selected_experts // experts_per_rank) + .view(-1) + .bincount(minlength=world_size) + ) + if tokens_per_rank.max() - tokens_per_rank.min() > 1: + raise ValueError(f"tokens sent from rank {rank} is not balanced") + tokens_per_expert += token_selected_experts.view(-1).bincount( + minlength=num_experts + ) + if tokens_per_expert.max() - tokens_per_expert.min() > 1: + raise ValueError("tokens per expert is not balanced") + + +def apply_balance_ratio(imbalanced_experts, num_experts, balance_ratio, world_size, rank): + num_tokens, top_k = imbalanced_experts.shape + dtype = imbalanced_experts.dtype + device = imbalanced_experts.device + balanced_experts = get_balanced_selection_no_cache( + num_tokens, top_k, num_experts, dtype, device, world_size, rank + ) + if balance_ratio == 0.0: + num_balanced_tokens = 0 + else: + # Activate all experts + min_num_balanced_tokens = min(num_tokens, ceil_div(num_experts, world_size * top_k)) + num_balanced_tokens = min_num_balanced_tokens + round( + (num_tokens - min_num_balanced_tokens) * balance_ratio + ) + mixed_experts = torch.cat( + [balanced_experts[:num_balanced_tokens], imbalanced_experts[num_balanced_tokens:]] + ) + return mixed_experts + + +@functools.cache +def get_all_to_one_selection( + num_tokens, top_k, num_experts, balance_ratio, dtype, device, world_size, rank +): + experts_per_rank = num_experts // world_size + if top_k > experts_per_rank: + raise ValueError( + "Cannot send all tokens to a single rank because `top_k > experts_per_rank`" + ) + imbalanced_experts = ( + torch.arange( + rank * num_tokens * top_k, (rank + 1) * num_tokens * top_k, dtype=dtype, device=device + ).view(num_tokens, top_k) + % experts_per_rank + ) + imbalanced_experts = imbalanced_experts.sort(dim=-1).values + return apply_balance_ratio(imbalanced_experts, num_experts, balance_ratio, world_size, rank) + + +@functools.cache +def get_balanced_rank_imbalanced_expert_selection( + num_tokens, top_k, num_experts, balance_ratio, dtype, device, world_size, rank +): + experts_per_rank = num_experts // world_size + active_experts_per_rank = ceil_div(top_k, world_size) + # Select expert from [0, active_experts_per_rank * world_size), + # then scale to [0, experts_per_rank * world_size) + narrow_experts = get_balanced_selection_no_cache( + num_tokens, top_k, active_experts_per_rank * world_size, dtype, device, world_size, rank + ) + imbalanced_experts = ( + narrow_experts // active_experts_per_rank * experts_per_rank + + narrow_experts % active_experts_per_rank + ) + return apply_balance_ratio(imbalanced_experts, num_experts, balance_ratio, world_size, rank) + + +def make_balanced_routing_method( + apply_method_orig, num_experts, balance_method, balance_ratio, world_size, rank +): + def balanced_routing_method(router_logits): + token_selected_experts, token_final_scales = apply_method_orig(router_logits) + if balance_method 
== BalanceMethod.NotModified: + pass + elif balance_method == BalanceMethod.Balanced: + token_selected_experts = get_balanced_selection( + token_selected_experts.shape[0], + token_selected_experts.shape[1], + num_experts, + token_selected_experts.dtype, + token_selected_experts.device, + world_size, + rank, + ) + elif balance_method == BalanceMethod.ImbalancedRanks: + token_selected_experts = get_all_to_one_selection( + token_selected_experts.shape[0], + token_selected_experts.shape[1], + num_experts, + balance_ratio, + token_selected_experts.dtype, + token_selected_experts.device, + world_size, + rank, + ) + elif balance_method == BalanceMethod.ImbalancedExperts: + token_selected_experts = get_balanced_rank_imbalanced_expert_selection( + token_selected_experts.shape[0], + token_selected_experts.shape[1], + num_experts, + balance_ratio, + token_selected_experts.dtype, + token_selected_experts.device, + world_size, + rank, + ) + else: + raise NotImplementedError(f"Not support balance_method {balance_method}") + return token_selected_experts, token_final_scales + + return balanced_routing_method + + class RunnerMixin(ABC): @staticmethod @abstractmethod @@ -63,23 +227,60 @@ def scaled_from_ctx(scaled_from, mapping, pretrained_config): pretrained_config.n_routed_experts = ( pretrained_config.n_routed_experts // scaled_from * mapping.moe_ep_size ) - select_alltoall_method_type_orig = WideEPMoE.select_alltoall_method_type - - def select_alltoall_method_type(cls: type, mapping: Mapping, top_k: int, *args, **kwargs): - # Replace the condition `mapping.moe_ep_size <= top_k` with `scaled_from <= top_k` - # by replacing `top_k` with `fake_top_k` - if scaled_from <= top_k: - fake_top_k = mapping.moe_ep_size + 1 - else: - fake_top_k = mapping.moe_ep_size - 1 - assert (mapping.moe_ep_size <= fake_top_k) == (scaled_from <= top_k) - return select_alltoall_method_type_orig(mapping, fake_top_k, *args, **kwargs) - - WideEPMoE.select_alltoall_method_type = select_alltoall_method_type + + def make_select_alltoall_method_type(select_alltoall_method_type_orig): + def select_alltoall_method_type( + cls: type, mapping: Mapping, top_k: int, *args, **kwargs + ): + # Replace the condition `mapping.moe_ep_size <= top_k` with `scaled_from <= top_k` + # by replacing `top_k` with `fake_top_k` + if scaled_from <= top_k: + fake_top_k = mapping.moe_ep_size + 1 + else: + fake_top_k = mapping.moe_ep_size - 1 + assert (mapping.moe_ep_size <= fake_top_k) == (scaled_from <= top_k) + return select_alltoall_method_type_orig(mapping, fake_top_k, *args, **kwargs) + + return select_alltoall_method_type + + def make_select_alltoall_method_type_2(select_alltoall_method_type_orig): + def select_alltoall_method_type(self): + # Replace the condition `mapping.moe_ep_size <= top_k` with `scaled_from <= top_k` + # by replacing `top_k` with `fake_top_k` + top_k = self.routing_method.experts_per_token + if scaled_from <= top_k: + fake_top_k = mapping.moe_ep_size + 1 + else: + fake_top_k = mapping.moe_ep_size - 1 + assert (mapping.moe_ep_size <= fake_top_k) == (scaled_from <= top_k) + with unittest.mock.patch.object( + self.routing_method.__class__, + "experts_per_token", + new_callable=unittest.mock.PropertyMock, + ) as mock_top_k: + mock_top_k.return_value = fake_top_k + return select_alltoall_method_type_orig(self) + + return select_alltoall_method_type + + select_alltoall_method_type_cutlass = CutlassFusedMoE.select_alltoall_method_type + select_alltoall_method_type_trtllm_gen = TRTLLMGenFusedMoE.select_alltoall_method_type + 
+        select_alltoall_method_type_cutlass = CutlassFusedMoE.select_alltoall_method_type
+        select_alltoall_method_type_trtllm_gen = TRTLLMGenFusedMoE.select_alltoall_method_type
+        select_alltoall_method_type_wide_ep = WideEPMoE.select_alltoall_method_type
+        CutlassFusedMoE.select_alltoall_method_type = make_select_alltoall_method_type_2(
+            select_alltoall_method_type_cutlass
+        )
+        TRTLLMGenFusedMoE.select_alltoall_method_type = make_select_alltoall_method_type_2(
+            select_alltoall_method_type_trtllm_gen
+        )
+        WideEPMoE.select_alltoall_method_type = make_select_alltoall_method_type(
+            select_alltoall_method_type_wide_ep
+        )
         try:
             yield
         finally:
-            WideEPMoE.select_alltoall_method_type = select_alltoall_method_type_orig
+            CutlassFusedMoE.select_alltoall_method_type = select_alltoall_method_type_cutlass
+            TRTLLMGenFusedMoE.select_alltoall_method_type = select_alltoall_method_type_trtllm_gen
+            WideEPMoE.select_alltoall_method_type = select_alltoall_method_type_wide_ep
 
     @staticmethod
     def apply_quant_config_exclude_modules(layers, quant_config):
@@ -186,9 +387,44 @@ def run_pack():
 
         return run_pack
 
-    def replace_routing_method(self, balance_method: BalanceMethod, balance_ratio: float):
-        if balance_method != BalanceMethod.NotModified:
-            raise NotImplementedError("not support replacing routing method for this runner")
+    @contextlib.contextmanager
+    def replace_routing_method_ctx(self, balance_method: BalanceMethod, balance_ratio: float):
+        if balance_method == BalanceMethod.NotModified:
+            pass
+        elif self.model_config.moe_backend not in [
+            "CUTEDSL",
+            "CUTLASS",
+            "DEEPGEMM",
+            "TRTLLM",
+            "WIDEEP",
+        ]:
+            raise NotImplementedError(
+                f'Replacing the routing method is not supported for moe_backend "{self.model_config.moe_backend}",'
+                f' please set balance_method to "NotModified"'
+            )
+        elif (
+            self.model_config.moe_backend == "TRTLLM"
+            and not self.model_config.mapping.enable_attention_dp
+        ):
+            raise NotImplementedError(
+                'Replacing the routing method is not supported for moe_backend "TRTLLM" with attention TP,'
+                ' please set balance_method to "NotModified"'
+            )
+        apply_methods_orig = [layer.mlp.experts.routing_method.apply for layer in self.layers]
+        try:
+            for layer, apply_method_orig in zip(self.layers, apply_methods_orig):
+                layer.mlp.experts.routing_method.apply = make_balanced_routing_method(
+                    apply_method_orig,
+                    layer.mlp.experts.num_experts,
+                    balance_method,
+                    balance_ratio,
+                    layer.mlp.experts.ep_size,
+                    layer.mlp.experts.ep_rank,
+                )
+            yield
+        finally:
+            for layer, apply_method_orig in zip(self.layers, apply_methods_orig):
+                layer.mlp.experts.routing_method.apply = apply_method_orig
 
     @staticmethod
     def create_kv_cache_manager(
diff --git a/tests/unittest/tools/test_layer_wise_benchmarks.py b/tests/unittest/tools/test_layer_wise_benchmarks.py
index 14a02a9ae07..affc946bb53 100644
--- a/tests/unittest/tools/test_layer_wise_benchmarks.py
+++ b/tests/unittest/tools/test_layer_wise_benchmarks.py
@@ -11,10 +11,11 @@ def test_deepseek_r1_ctx_dep(llm_root, world_size):
     if torch.cuda.device_count() < world_size:
         pytest.skip(f"needs {world_size:d} GPUs to run this test")
     model_root = llm_models_root(check=True)
+    profile_dir = f"profiles/test_deepseek_r1_ctx_dep_{world_size}"
     check_call(
         [
             "./mpi_launch.sh",
-            "./run_single.sh",
+            "./run.sh",
             "config_ctx.yaml",
             "--model",
             model_root / "DeepSeek-R1" / "DeepSeek-R1-0528-FP4-v2",
@@ -23,6 +24,7 @@
         env={
             **os.environ,
             "NP": f"{world_size:d}",
+            "PROFILE_DIR": profile_dir,
         },
     )
 
@@ -32,20 +34,23 @@ def test_deepseek_r1_ctx_tep(llm_root, world_size):
     if torch.cuda.device_count() < world_size:
         pytest.skip(f"needs {world_size:d} GPUs to run this test")
     model_root = llm_models_root(check=True)
+    profile_dir = f"profiles/test_deepseek_r1_ctx_tep_{world_size}"
     check_call(
         [
             "./mpi_launch.sh",
-            "./run_single.sh",
+            "./run.sh",
             "config_ctx.yaml",
             "--model",
             model_root / "DeepSeek-R1" / "DeepSeek-R1-0528-FP4-v2",
             "--no-enable-attention-dp",
             "--moe-backend=TRTLLM",
+            "--balance-method=NotModified",
         ],
         cwd=llm_root / "examples" / "layer_wise_benchmarks",
         env={
             **os.environ,
             "NP": f"{world_size:d}",
+            "PROFILE_DIR": profile_dir,
             "TRTLLM_ENABLE_PDL": "1",
         },
     )
@@ -56,10 +61,11 @@ def test_deepseek_v32_ctx_dep(llm_root, world_size):
     if torch.cuda.device_count() < world_size:
         pytest.skip(f"needs {world_size:d} GPUs to run this test")
     model_root = llm_models_root(check=True)
+    profile_dir = f"profiles/test_deepseek_v32_ctx_dep_{world_size}"
     check_call(
         [
             "./mpi_launch.sh",
-            "./run_single.sh",
+            "./run.sh",
             "config_ctx.yaml",
             "--model",
             model_root / "DeepSeek-V3.2-Exp-hf",
@@ -70,6 +76,7 @@
         env={
             **os.environ,
             "NP": f"{world_size:d}",
+            "PROFILE_DIR": profile_dir,
         },
     )
 
@@ -79,10 +86,11 @@ def test_deepseek_r1_gen_scaled_from_16_dep(llm_root, world_size):
     if torch.cuda.device_count() < world_size:
         pytest.skip(f"needs {world_size:d} GPUs to run this test")
     model_root = llm_models_root(check=True)
+    profile_dir = f"profiles/test_deepseek_r1_gen_scaled_from_16_dep_{world_size}"
     check_call(
         [
             "./mpi_launch.sh",
-            "./run_single.sh",
+            "./run.sh",
             "config_gen.yaml",
             "--model",
             model_root / "DeepSeek-R1" / "DeepSeek-R1-0528-FP4-v2",
@@ -94,8 +102,13 @@
         env={
             **os.environ,
             "NP": f"{world_size:d}",
+            "PROFILE_DIR": profile_dir,
         },
     )
+    check_call(
+        ["python3", "parse.py", "--profile-dir", profile_dir, f"--world-size={world_size}"],
+        cwd=llm_root / "examples" / "layer_wise_benchmarks",
+    )
 
 
 @pytest.mark.parametrize("world_size", [1, 4])
@@ -103,10 +116,11 @@ def test_qwen3_next_gen_tep(llm_root, world_size):
     if torch.cuda.device_count() < world_size:
         pytest.skip(f"needs {world_size:d} GPUs to run this test")
     model_root = llm_models_root(check=True)
+    profile_dir = f"profiles/test_qwen3_next_gen_tep_{world_size}"
     check_call(
         [
             "./mpi_launch.sh",
-            "./run_single.sh",
+            "./run.sh",
             "config_gen.yaml",
             "--model",
             model_root / "Qwen3" / "Qwen3-Next-80B-A3B-Instruct",
@@ -119,6 +133,11 @@
         env={
             **os.environ,
             "NP": f"{world_size:d}",
+            "PROFILE_DIR": profile_dir,
             "TRTLLM_ENABLE_PDL": "1",
         },
     )
+    check_call(
+        ["python3", "parse.py", "--profile-dir", profile_dir, f"--world-size={world_size}"],
+        cwd=llm_root / "examples" / "layer_wise_benchmarks",
+    )