ai-dynamo · saturley-hall · Oct 16, 2025 · Oct 16, 2025
diff --git a/benchmarks/llm/perf.sh b/benchmarks/llm/perf.sh
@@ -235,9 +235,8 @@ for concurrency in "${concurrency_array[@]}"; do
     --num-dataset-entries $(($concurrency*12)) \
     --random-seed 100 \
     --artifact-dir ${artifact_dir} \
-    -- \
+    --ui simple \
     -v \
-    --max-threads ${concurrency} \
     -H 'Authorization: Bearer NOT USED' \
     -H 'Accept: text/event-stream'
 

@@ -256,7 +256,7 @@ async def run_profile(args):
                     base_url=base_url,
                 )
                 if aiperf_result is not None:
-                    ttft = aiperf_result["records"]["ttft"]["avg"]
+                    ttft = aiperf_result["time_to_first_token"]["avg"]
 
                 logger.info("Cleaning up deployment...")
                 await client.delete_deployment()
@@ -435,11 +435,9 @@ async def run_profile(args):
                             base_url=base_url,
                         )
                         if aiperf_result is not None:
-                            itl = aiperf_result["records"]["inter_token_latency"]["avg"]
+                            itl = aiperf_result["inter_token_latency"]["avg"]
                             thpt_per_gpu = (
-                                aiperf_result["records"]["output_token_throughput"][
-                                    "avg"
-                                ]
+                                aiperf_result["output_token_throughput"]["avg"]
                                 / num_gpus
                             )
 

@@ -124,10 +124,8 @@ def get_itl_and_thpt_per_gpu(isl, osl, num_request):
             base_url=url,
         )
         if aiperf_result is not None:
-            itl = aiperf_result["records"]["inter_token_latency"]["avg"]
-            thpt_per_gpu = (
-                aiperf_result["records"]["output_token_throughput"]["avg"] / num_gpus
-            )
+            itl = aiperf_result["inter_token_latency"]["avg"]
+            thpt_per_gpu = aiperf_result["output_token_throughput"]["avg"] / num_gpus
             return itl, thpt_per_gpu
         return None, None
 

@@ -90,7 +90,7 @@ def get_ttft(isl):
             base_url=url,
         )
         if aiperf_result is not None:
-            return aiperf_result["records"]["ttft"]["avg"]
+            return aiperf_result["time_to_first_token"]["avg"]
         return None
 
     return _profile_prefill_helper(

diff --git a/benchmarks/router/README.md b/benchmarks/router/README.md
@@ -13,7 +13,7 @@ This directory contains scripts for benchmarking the Dynamo router with prefix c
 - etcd and NATS running (required for Dynamo coordination)
 - Required Python packages:
   - `dynamo` package (with vllm and frontend modules)
-  - `genai-perf` for benchmarking
+  - `aiperf` for benchmarking
   - `matplotlib` for plotting results
   - `data-generator` package (install with `pip install -e ./benchmarks` from repo root)
 
@@ -230,11 +230,11 @@ python real_data_benchmark.py --input-dataset trace.jsonl --prefix-root-multipli
 ```
 
 > [!Note]
-> At the time of writing this documentation, you may need to install the latest genai-perf from the main source branch to loadgen on the trace files:
+> At the time of writing this documentation, you may need to install the latest aiperf from the main source branch to run load generation on the trace files:
 > ```bash
-> pip install git+https://github.com/triton-inference-server/perf_analyzer.git#subdirectory=genai-perf
+> pip install git+https://github.com/ai-dynamo/aiperf.git
 > ```
-> However, by the time of release, the genai-perf version included in the vLLM runtime container should be up to date enough to use as-is.
+> However, by the time of release, the aiperf version included in the vLLM runtime container should be up to date enough to use as-is.
 
 ## Troubleshooting
 

diff --git a/benchmarks/sin_load_generator/README.md b/benchmarks/sin_load_generator/README.md
@@ -5,7 +5,7 @@ SPDX-License-Identifier: Apache-2.0
 
 # Sinusoidal Load Generator
 
-`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf/genai_perf).
+`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [AIPerf](https://github.com/ai-dynamo/aiperf).
 
 ## Usage
 

diff --git a/docs/backends/trtllm/gpt-oss.md b/docs/backends/trtllm/gpt-oss.md
@@ -402,9 +402,9 @@ curl localhost:8000/v1/chat/completions   -H "Content-Type: application/json"
 ```
 ## Benchmarking
 
-### Performance Testing with GenAI-Perf
+### Performance Testing with AIPerf
 
-The Dynamo container includes [GenAI-Perf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.
+The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main?tab=readme-ov-file#aiperf), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.
 
 **Run the following benchmark from inside the container** (after completing the deployment steps above):
 
@@ -413,7 +413,7 @@ The Dynamo container includes [GenAI-Perf](https://docs.nvidia.com/deeplearning/
 mkdir -p /tmp/benchmark-results
 
 # Run the benchmark - this command tests the deployment with high-concurrency synthetic workload
-genai-perf profile \
+aiperf profile \
     --model openai/gpt-oss-120b \
     --tokenizer /model \
     --endpoint-type chat \
@@ -434,9 +434,7 @@ genai-perf profile \
     --num-dataset-entries 8000 \
     --random-seed 100 \
     --artifact-dir /tmp/benchmark-results \
-    -- \
     -v \
-    --max-threads 500 \
     -H 'Authorization: Bearer NOT USED' \
     -H 'Accept: text/event-stream'
 ```
@@ -457,13 +455,13 @@ Key parameters you can adjust:
 - `--output-tokens-mean`: Average output length (tests decode throughput)
 - `--request-count`: Total number of requests for the benchmark
 
-### Installing GenAI-Perf Outside the Container
+### Installing AIPerf Outside the Container
 
 If you prefer to run benchmarks from outside the container:
 
 ```bash
-# Install GenAI-Perf
-pip install genai-perf
+# Install AIPerf
+pip install aiperf
 
 # Then run the same benchmark command, adjusting the tokenizer path if needed
 ```
@@ -520,4 +518,4 @@ flowchart TD
 - **Production Deployment**: For multi-node deployments, see the [Multi-node Guide](../../../examples/basics/multinode/README.md)
 - **Advanced Configuration**: Explore TensorRT-LLM engine building options for further optimization
 - **Monitoring**: Set up Prometheus and Grafana for production monitoring
-- **Performance Benchmarking**: Use GenAI-Perf to measure and optimize your deployment performance
+- **Performance Benchmarking**: Use AIPerf to measure and optimize your deployment performance
@@ -57,19 +57,20 @@ spec:
             aiperf profile --artifact-dir $ARTIFACT_DIR \
                 --model $TARGET_MODEL \
                 --tokenizer /model-cache/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a  \
-                --endpoint-type chat  --endpoint /v1/chat/completions \
+                --endpoint-type chat  \
+                --endpoint /v1/chat/completions \
                 --streaming \
                 --url http://$ENDPOINT \
                 --synthetic-input-tokens-mean $isl \
                 --synthetic-input-tokens-stddev 0 \
                 --output-tokens-mean $osl \
                 --output-tokens-stddev 0 \
-                --extra-inputs "{\"max_tokens\":$osl}" \
-                --extra-inputs "{\"min_tokens\":$osl}" \
-                --extra-inputs "{\"ignore_eos\":true}" \
+                --extra-inputs "max_tokens:$osl" \
+                --extra-inputs "min_tokens:$osl" \
+                --extra-inputs "ignore_eos:true" \
                 --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
-                --extra-inputs "{\"repetition_penalty\":1.0}" \
-                --extra-inputs "{\"temperature\": 0.0}" \
+                --extra-inputs "repetition_penalty:1.0" \
+                --extra-inputs "temperature: 0.0" \
                 --concurrency $concurrency \
                 --request-count $((10*concurrency)) \
                 --warmup-request-count $concurrency \

@@ -50,7 +50,8 @@ spec:
             aiperf profile --artifact-dir $ARTIFACT_DIR \
                 --model $TARGET_MODEL \
                 --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
-                --endpoint-type chat  --endpoint /v1/chat/completions \
+                --endpoint-type chat  \
+                --endpoint /v1/chat/completions \
                 --streaming \
                 --url http://$ENDPOINT \
                 --synthetic-input-tokens-mean $isl \

@@ -50,7 +50,8 @@ spec:
             aiperf profile --artifact-dir $ARTIFACT_DIR \
                 --model $TARGET_MODEL \
                 --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
-                --endpoint-type chat  --endpoint /v1/chat/completions \
+                --endpoint-type chat  \
+                --endpoint /v1/chat/completions \
                 --streaming \
                 --url http://$ENDPOINT \
                 --synthetic-input-tokens-mean $isl \

@@ -50,7 +50,8 @@ spec:
             aiperf profile --artifact-dir $ARTIFACT_DIR \
                 --model $TARGET_MODEL \
                 --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
-                --endpoint-type chat  --endpoint /v1/chat/completions \
+                --endpoint-type chat  \
+                --endpoint /v1/chat/completions \
                 --streaming \
                 --url http://$ENDPOINT \
                 --synthetic-input-tokens-mean $isl \

@@ -383,27 +383,19 @@ def log_summary_metrics(
             with open(profile_json) as f:
                 metrics = json.load(f)
 
-            # Extract key metrics from AI-Perf format
-            records = metrics.get("records", {})
-
-            # Request count from request_count record
-            request_count_record = records.get("request_count", {})
-            request_count = (
-                int(request_count_record.get("avg", 0)) if request_count_record else 0
-            )
+            # Request count
+            request_count = int(metrics.get("request_count", {}).get("avg", 0))
 
             # Check for errors
-            error_summary = metrics.get("error_summary", [])
-            error_count = len(error_summary)
+            error_count = len(metrics.get("error_summary", []))
 
             # Latency metrics (in milliseconds)
-            request_latency = records.get("request_latency", {})
+            request_latency = metrics.get("request_latency", {})
             avg_latency = request_latency.get("avg", 0) / 1000.0  # Convert to seconds
             p99_latency = request_latency.get("p99", 0) / 1000.0  # Convert to seconds
 
             # Throughput metrics
-            request_throughput = records.get("request_throughput", {})
-            throughput = request_throughput.get("avg", 0)
+            throughput = metrics.get("request_throughput", {}).get("avg", 0)
 
             # Log summary
             logger.info(
@@ -417,7 +409,7 @@ def log_summary_metrics(
 
             # Log success rate
             if request_count > 0:
-                success_rate = (request_count - error_count) / request_count * 100
+                success_rate = ((request_count - error_count) / request_count) * 100
                 logger.info(f"Success rate: {success_rate:.1f}%")
 
             # Also write summary to CSV file for aggregation

@@ -293,63 +293,44 @@ def parse_aiperf_client_results(log_dir: str) -> Dict[str, Any]:
                 with open(profile_json) as f:
                     client_metrics = json.load(f)
 
-                # AI-Perf format has "records" dictionary at the top level
-                records = client_metrics.get("records", {})
-
-                # Extract request count (this is the total requests made)
-                request_count_record = records.get("request_count", {})
-                request_count = (
-                    int(request_count_record.get("avg", 0))
-                    if request_count_record
-                    else 0
+                # Extract request count (this is the total successful requests made)
+                request_count = int(
+                    client_metrics.get("request_count", {}).get("avg", 0)
                 )
 
                 # Check for errors in error_summary
-                error_summary = client_metrics.get("error_summary", [])
-                error_count = len(error_summary)
+                error_count = len(client_metrics.get("error_summary", []))
 
                 # Check if test was cancelled
-                was_cancelled = client_metrics.get("was_cancelled", False)
-                if was_cancelled:
+                if client_metrics.get("was_cancelled", False):
                     error_count = request_count  # Mark all as failed if cancelled
 
                 all_metrics["total_requests"] += request_count
                 all_metrics["successful_requests"] += request_count - error_count
                 all_metrics["failed_requests"] += error_count
 
-                # Extract latency from request_latency record
-                request_latency = records.get("request_latency", {})
-
+                # Extract latency metrics
+                request_latency = client_metrics.get("request_latency", None)
                 if request_latency:
-                    # Convert milliseconds to seconds for consistency
-                    if "avg" in request_latency:
-                        all_metrics["latencies"].append(request_latency["avg"] / 1000.0)
-                    if "p50" in request_latency:
-                        all_metrics["p50_latencies"].append(
-                            request_latency["p50"] / 1000.0
-                        )
-                    if "p90" in request_latency:
-                        all_metrics["p90_latencies"].append(
-                            request_latency["p90"] / 1000.0
-                        )
-                    if "p99" in request_latency:
-                        all_metrics["p99_latencies"].append(
-                            request_latency["p99"] / 1000.0
-                        )
-
-                # Time to first token (if available in records)
-                ttft = records.get("time_to_first_token", {}) or records.get("ttft", {})
-                if ttft and "avg" in ttft:
-                    all_metrics["ttft"].append(ttft["avg"] / 1000.0)  # Convert ms to s
-
-                # Inter-token latency (if available in records)
-                itl = records.get("inter_token_latency", {}) or records.get("itl", {})
-                if itl and "avg" in itl:
-                    all_metrics["itl"].append(itl["avg"] / 1000.0)  # Convert ms to s
+                    all_metrics["latencies"].append(request_latency["avg"] / 1000.0)
+                    all_metrics["p50_latencies"].append(request_latency["p50"] / 1000.0)
+                    all_metrics["p90_latencies"].append(request_latency["p90"] / 1000.0)
+                    all_metrics["p99_latencies"].append(request_latency["p99"] / 1000.0)
+
+                # Time to first token
+                ttft = client_metrics.get("time_to_first_token", {}).get("avg", None)
+                if ttft:
+                    all_metrics["ttft"].append(ttft / 1000.0)  # Convert ms to s
+
+                # Inter-token latency
+                itl = client_metrics.get("inter_token_latency", {}).get("avg", None)
+                if itl:
+                    all_metrics["itl"].append(itl / 1000.0)  # Convert ms to s
 
                 # Throughput from request_throughput record
-                request_throughput = records.get("request_throughput", {})
-                req_throughput = request_throughput.get("avg", 0)
+                req_throughput = client_metrics.get("request_throughput", {}).get(
+                    "avg", 0
+                )
                 if req_throughput:
                     all_metrics["throughputs"].append(req_throughput)
 

@@ -215,10 +215,10 @@ When running deployment with sla-planner, to reduce the image pulling time, depl
 kubectl apply -f ./perf_test_configs/image_cache_daemonset.yaml -n <namespace>
 ```
 
-Then, port-forward or shell into the frontend pod and run GenAI-Perf to get the goodput:
+Then, port-forward or shell into the frontend pod and run AIPerf to get the goodput:
 
 ```bash
-genai-perf profile \
+aiperf profile \
   --model nvidia/Llama-3.1-8B-Instruct-FP8 \
   --tokenizer nvidia/Llama-3.1-8B-Instruct-FP8 \
   --endpoint-type chat \
@@ -227,11 +227,11 @@ genai-perf profile \
   --input-file payload:/workspace/rr-5-45_i3000o300.jsonl \ # path to the generated load dataset \
   --fixed-schedule True \
   --goodput time_to_first_token:200 inter_token_latency:10 \
-  -- -v -max-threads 64 \
+  -v
 ```
 
 > [!NOTE]
-> Sometimes, when sla planner scales down the number of workers, a few requests will error out and cause GenAI-Perf to stuck. We are aware of this issue and are working on fixing it.
+> Sometimes, when the SLA planner scales down the number of workers, a few requests will error out and cause AIPerf to get stuck. We are aware of this issue and are working on fixing it.
 
 #### E2E Perf Test Results