-`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf/genai_perf).
+`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main/aiperf).
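For intuition about what such a generator does, below is a minimal sketch of sinusoidal load generation. It is not the actual `sin_synth.py` implementation; the JSONL field names (`timestamp`, `input_length`, `output_length`) and the default parameters are assumptions for illustration, and the real mooncake-style schema and script options may differ.

```python
# Minimal sketch of sinusoidal synthetic load generation (NOT the actual
# sin_synth.py). Field names and defaults are assumptions for illustration.
import json
import math
import random

def generate(duration_s=600.0, base_rps=5.0, amp_rps=3.0, period_s=120.0,
             isl=3000, ratio_base=20.0, ratio_amp=10.0, out_path="synthetic.jsonl"):
    t = 0.0
    with open(out_path, "w") as f:
        while t < duration_s:
            phase = 2 * math.pi * t / period_s
            # Request rate oscillates sinusoidally around base_rps.
            rate = max(0.1, base_rps + amp_rps * math.sin(phase))
            t += random.expovariate(rate)  # Poisson-style arrivals at the current rate
            # The isl/osl ratio also oscillates, so osl drifts over the period.
            ratio = max(1.0, ratio_base + ratio_amp * math.sin(phase))
            record = {
                "timestamp": int(t * 1000),                 # ms offset (assumed field)
                "input_length": isl,                        # assumed field name
                "output_length": max(1, int(isl / ratio)),  # assumed field name
            }
            f.write(json.dumps(record) + "\n")

if __name__ == "__main__":
    generate()
```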
-The Dynamo container includes [AIPerf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/aiperf/README.html), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.
+The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.
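As a rough illustration of the kind of throughput and latency numbers AIPerf reports, the sketch below times sequential requests against an OpenAI-compatible endpoint. It is not AIPerf and not the benchmark command from these docs; the URL, model name, and prompt are placeholders.

```python
# Rough latency/throughput measurement against an OpenAI-compatible endpoint.
# This only illustrates the metrics AIPerf automates; the URL and model name
# below are placeholders, not values from the Dynamo docs.
import json
import time
import urllib.request

URL = "http://localhost:8000/v1/chat/completions"  # placeholder endpoint
MODEL = "your-model-name"                           # placeholder model id

def one_request(prompt: str) -> float:
    body = json.dumps({
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 64,
    }).encode()
    req = urllib.request.Request(URL, data=body,
                                 headers={"Content-Type": "application/json"})
    start = time.perf_counter()
    with urllib.request.urlopen(req) as resp:
        resp.read()
    return time.perf_counter() - start  # end-to-end request latency in seconds

if __name__ == "__main__":
    n = 20
    t0 = time.perf_counter()
    latencies = [one_request("Hello, world") for _ in range(n)]
    elapsed = time.perf_counter() - t0
    print(f"avg latency: {sum(latencies) / n:.3f} s")
    print(f"throughput:  {n / elapsed:.2f} req/s (sequential, single client)")
```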
**Run the following benchmark from inside the container** (after completing the deployment steps above):
docs/performance/tuning.md (3 additions, 3 deletions)
@@ -56,11 +56,11 @@ Typically, the number of GPUs vs the performance follows the following pattern:
| 2 | 269 | 135 | 1.19x |
| 4 | 578 | 144 | 1.28x |
-The best number of GPUs to use in the prefill and decode engines can be determined by running a few fixed ISL/OSL/concurrency test using [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf) and compare with the SLA.
-GenAI-Perf is pre-installed in the dynamo container.
+The best number of GPUs to use in the prefill and decode engines can be determined by running a few fixed ISL/OSL/concurrency tests using [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main) and comparing the results with the SLA.
+AIPerf is pre-installed in the Dynamo container.
> [!Tip]
-> If you are unfamiliar with GenAI-Perf, please see this helpful [tutorial](https://github.com/triton-inference-server/perf_analyzer/blob/main/genai-perf/docs/tutorial.md) to get you started.
+> If you are unfamiliar with AIPerf, please see this helpful [tutorial](https://github.com/ai-dynamo/aiperf/blob/main/docs/tutorial.md) to get you started.
Besides the parallelization mapping, other common knobs to tune are maximum batch size, maximum number of tokens, and block size.
For prefill engines, usually a small batch size and large `max_num_token` is preferred.
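To make the "compare with the SLA" step above concrete, here is a small sketch that takes measured (GPU count, throughput, latency) points, the kind a fixed ISL/OSL/concurrency sweep would produce, and picks the smallest engine size that meets the targets. All numbers and thresholds are illustrative placeholders, not measurements from the table above.

```python
# Pick the smallest GPU count whose measured numbers satisfy the SLA.
# Measurements and SLA thresholds below are illustrative placeholders,
# not results from the table in this document.
from dataclasses import dataclass

@dataclass
class Measurement:
    gpus: int
    tokens_per_s: float    # aggregate output throughput
    p99_latency_ms: float  # per-request latency at the tested concurrency

SLA_MIN_TOKENS_PER_S = 500.0
SLA_MAX_P99_LATENCY_MS = 200.0

measurements = [
    Measurement(gpus=1, tokens_per_s=230.0, p99_latency_ms=310.0),
    Measurement(gpus=2, tokens_per_s=460.0, p99_latency_ms=220.0),
    Measurement(gpus=4, tokens_per_s=900.0, p99_latency_ms=150.0),
]

def smallest_passing(points):
    # Keep only the configurations that meet both SLA targets, then take
    # the one with the fewest GPUs.
    passing = [m for m in points
               if m.tokens_per_s >= SLA_MIN_TOKENS_PER_S
               and m.p99_latency_ms <= SLA_MAX_P99_LATENCY_MS]
    return min(passing, key=lambda m: m.gpus) if passing else None

if __name__ == "__main__":
    best = smallest_passing(measurements)
    print(best or "No configuration meets the SLA; test more GPUs or retune.")
```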
-You can also benchmark the performance of the endpoint by [GenAI-Perf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html)
+You can also benchmark the performance of the endpoint with [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md)