diff --git a/components/backends/sglang/slurm_jobs/scripts/gap/bench.sh b/components/backends/sglang/slurm_jobs/scripts/gap/bench.sh index ccb4708ddfc6..7eee4fee37d9 100755 --- a/components/backends/sglang/slurm_jobs/scripts/gap/bench.sh +++ b/components/backends/sglang/slurm_jobs/scripts/gap/bench.sh @@ -30,14 +30,14 @@ set -e warmup_model $head_node $head_port $SERVED_MODEL_NAME $MODEL_PATH "${chosen_isl}x${chosen_osl}x10000x10000x250" set +e -genai_perf_warmup_workers=$(python3 -c "print(max(${DP:-0}, ${prefill_workers:-0}, ${decode_workers:-0}))") +aiperf_warmup_workers=$(python3 -c "print(max(${DP:-0}, ${prefill_workers:-0}, ${decode_workers:-0}))") IFS='x' read -r -a concurrency_list <<< "$chosen_concurrencies" profile_folder="/logs/gap_isl_${chosen_isl}_osl_${chosen_osl}" mkdir -p $profile_folder -tmp_work_dir=$(mktemp -d -t genai-perf-XXXXXXXX) +tmp_work_dir=$(mktemp -d -t aiperf-XXXXXXXX) for concurrency in ${concurrency_list[@]}; do export_folder="${tmp_work_dir}/concurrency_${concurrency}" mkdir -p $export_folder @@ -46,7 +46,7 @@ for concurrency in ${concurrency_list[@]}; do echo "Run benchmark for concurrency $concurrency; ISL $chosen_isl; OSL $chosen_osl" command=( - genai-perf profile + aiperf profile -m ${SERVED_MODEL_NAME} --tokenizer ${MODEL_PATH} --endpoint-type chat @@ -55,7 +55,7 @@ for concurrency in ${concurrency_list[@]}; do --streaming --concurrency ${concurrency} - --warmup-request-count $(( 2*genai_perf_warmup_workers )) + --warmup-request-count $(( 2*aiperf_warmup_workers )) --request-count $(( 5*concurrency )) --synthetic-input-tokens-mean ${chosen_isl} --synthetic-input-tokens-stddev 0 @@ -69,13 +69,11 @@ for concurrency in ${concurrency_list[@]}; do --tokenizer-trust-remote-code --num-dataset-entries 3000 - -- - --max-threads ${concurrency} ) set -e ${command[@]} set +e - cp $export_folder/*/*_genai_perf.json $profile_folder + cp $export_folder/*/*_aiperf.json $profile_folder done diff --git 
a/components/backends/trtllm/deploy/README.md b/components/backends/trtllm/deploy/README.md index 8e2d24425a44..ac507663612f 100644 --- a/components/backends/trtllm/deploy/README.md +++ b/components/backends/trtllm/deploy/README.md @@ -271,7 +271,7 @@ args: ## Benchmarking -To benchmark your deployment with GenAI-Perf, see this utility script: [perf.sh](../../../../benchmarks/llm/perf.sh) +To benchmark your deployment with AIPerf, see this utility script: [perf.sh](../../../../benchmarks/llm/perf.sh) Configure the `model` name and `host` based on your deployment. diff --git a/components/backends/trtllm/performance_sweeps/README.md b/components/backends/trtllm/performance_sweeps/README.md index aaec28f5436a..0e3a4de174c8 100644 --- a/components/backends/trtllm/performance_sweeps/README.md +++ b/components/backends/trtllm/performance_sweeps/README.md @@ -38,7 +38,7 @@ Please note that: 1. `submit_disagg.sh` - Main entry point for submitting benchmark jobs for disaggregated configurations. This includes WideEP optimization for DEP>=16. 2. `submit_agg.sh` - Main entry point for submitting benchmark jobs for aggregated configurations. -3. `post_process.py` - Scan the genai-perf results to produce a json with entries to each config point. +3. `post_process.py` - Scans the aiperf results to produce a json file with an entry for each config point. 4. `plot_performance_comparison.py` - Takes the json result file for disaggregated and/or aggregated configuration sweeps and plots a pareto line for better visualization. For more finer grained details on how to launch TRTLLM backend workers with DeepSeek R1 on GB200 slurm, please refer [multinode-examples.md](../../../../docs/backends/trtllm/multinode/multinode-examples.md). This guide shares similar assumption to the multinode examples guide. 
@@ -117,9 +117,9 @@ export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4" ## Post-Processing Results -The above jobs use genAI-perf tool to benchmark each configuration point across different concurrency values. These get stored in `dynamo_disagg-bm-8150-1024//genai_perf_artifacts` and `dynamo_agg-bm-8150-1024//genai_perf_artifacts` for disaggregated and aggregated respectively. +The above jobs use the aiperf tool to benchmark each configuration point across different concurrency values. The results are stored in `dynamo_disagg-bm-8150-1024//aiperf_artifacts` and `dynamo_agg-bm-8150-1024//aiperf_artifacts` for disaggregated and aggregated configurations, respectively. -After your benchmarking jobs have completed, you can use the `post_process.py` script to aggregate and summarize the results from the generated genai_perf_artifacts. +After your benchmarking jobs have completed, you can use the `post_process.py` script to aggregate and summarize the results from the generated `aiperf_artifacts` directories. To run the post-processing script, use: @@ -149,6 +149,6 @@ Refer to [Beyond the Buzz: A Pragmatic Take on Inference Disaggregation](https:/ ## Known Issues -- Some jobs may time out if genai-perf requires more time to complete all concurrency levels. +- Some jobs may time out if aiperf requires more time to complete all concurrency levels. - Workers may encounter out-of-memory (OOM) errors during inference, especially with larger configurations. - Configurations affected by these issues will result in missing data points on the performance plot. 
diff --git a/components/backends/trtllm/performance_sweeps/benchmark_agg.slurm b/components/backends/trtllm/performance_sweeps/benchmark_agg.slurm index 7d5ca6323547..693274840de5 100755 --- a/components/backends/trtllm/performance_sweeps/benchmark_agg.slurm +++ b/components/backends/trtllm/performance_sweeps/benchmark_agg.slurm @@ -40,7 +40,7 @@ if [ "${enable_attention_dp}" = "false" ]; then fi full_logdir=${sub_dir} -artifacts_dir=${full_logdir}/genai_perf_artifacts +artifacts_dir=${full_logdir}/aiperf_artifacts mkdir -p ${artifacts_dir} diff --git a/components/backends/trtllm/performance_sweeps/post_process.py b/components/backends/trtllm/performance_sweeps/post_process.py index c0a045411067..a9e5ed28c557 100755 --- a/components/backends/trtllm/performance_sweeps/post_process.py +++ b/components/backends/trtllm/performance_sweeps/post_process.py @@ -124,7 +124,7 @@ def extract_throughput_data(csv_path: str) -> Tuple[Optional[float], Optional[fl Extract throughput data from CSV file Args: - csv_path: Path to profile_export_genai_perf.csv + csv_path: Path to profile_export_aiperf.csv Returns: Tuple of (output_token_throughput, output_token_throughput_per_user) @@ -184,10 +184,10 @@ def process_directory(dir_path: str) -> Optional[List[Dict[str, Any]]]: Dictionary containing extracted data, or None if processing failed """ dir_path_obj = Path(dir_path) - artifacts_path = dir_path_obj / "genai_perf_artifacts" + artifacts_path = dir_path_obj / "aiperf_artifacts" if not artifacts_path.exists(): - print(f"Warning: No genai_perf_artifacts directory found in {dir_path}") + print(f"Warning: No aiperf_artifacts directory found in {dir_path}") return None # Parse deployment configuration @@ -205,7 +205,7 @@ def process_directory(dir_path: str) -> Optional[List[Dict[str, Any]]]: csv_files = [] for item in artifacts_path.iterdir(): if item.is_dir(): - csv_path = item / "profile_export_genai_perf.csv" + csv_path = item / "profile_export_aiperf.csv" if csv_path.exists(): 
csv_files.append(str(csv_path)) diff --git a/components/backends/trtllm/performance_sweeps/scripts/bench.sh b/components/backends/trtllm/performance_sweeps/scripts/bench.sh index c7141b44bbd3..e79c03f08db8 100755 --- a/components/backends/trtllm/performance_sweeps/scripts/bench.sh +++ b/components/backends/trtllm/performance_sweeps/scripts/bench.sh @@ -54,8 +54,8 @@ set -x config_file=${log_path}/config.yaml -# install genai-perf -pip install genai-perf +# install aiperf +pip install aiperf # Create artifacts root directory if it doesn't exist if [ ! -d "${artifacts_dir}" ]; then @@ -153,7 +153,7 @@ for concurrency in ${concurrency_list}; do num_prompts=$((concurrency * multi_round)) echo "Benchmarking with concurrency ${concurrency} ... ${num_prompts} prompts" mkdir -p ${log_path}/concurrency_${concurrency} - genai-perf profile \ + aiperf profile \ --model ${model} \ --tokenizer ${model_path} \ --endpoint-type chat \ @@ -174,9 +174,7 @@ for concurrency in ${concurrency_list}; do --num-dataset-entries ${num_prompts} \ --random-seed 100 \ --artifact-dir ${artifacts_dir} \ - -- \ -v \ - --max-threads ${concurrency} \ -H 'Authorization: Bearer NOT USED' \ -H 'Accept: text/event-stream' echo "Benchmark with concurrency ${concurrency} done" diff --git a/docs/backends/trtllm/README.md b/docs/backends/trtllm/README.md index f5d5fa1d1ba6..12fbb9e5a9db 100644 --- a/docs/backends/trtllm/README.md +++ b/docs/backends/trtllm/README.md @@ -196,7 +196,7 @@ NOTE: To send a request to a multi-node deployment, target the node which is run ### Benchmarking -To benchmark your deployment with GenAI-Perf, see this utility script, configuring the +To benchmark your deployment with AIPerf, see this utility script, configuring the `model` name and `host` based on your deployment: [perf.sh](../../../benchmarks/llm/perf.sh) @@ -236,7 +236,7 @@ NOTE: To send a request to a multi-node deployment, target the node which is run ## Benchmarking -To benchmark your deployment with GenAI-Perf, see 
this utility script, configuring the +To benchmark your deployment with AIPerf, see this utility script, configuring the `model` name and `host` based on your deployment: [perf.sh](../../../benchmarks/llm/perf.sh) ## Multimodal support diff --git a/docs/backends/trtllm/gpt-oss.md b/docs/backends/trtllm/gpt-oss.md index 071b88bb2e2d..948f15780b6e 100644 --- a/docs/backends/trtllm/gpt-oss.md +++ b/docs/backends/trtllm/gpt-oss.md @@ -404,7 +404,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" ### Performance Testing with AIPerf -The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main?tab=readme-ov-file#aiperf), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment. +The Dynamo container includes [AIPerf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/aiperf/README.html), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment. **Run the following benchmark from inside the container** (after completing the deployment steps above):