ai-dynamo · ajcasagrande · Oct 16, 2025 · Oct 16, 2025 · Oct 16, 2025
diff --git a/benchmarks/llm/perf.sh b/benchmarks/llm/perf.sh
@@ -235,9 +235,8 @@ for concurrency in "${concurrency_array[@]}"; do
     --num-dataset-entries $(($concurrency*12)) \
     --random-seed 100 \
     --artifact-dir ${artifact_dir} \
-    -- \
+    --ui simple \
     -v \
-    --max-threads ${concurrency} \
     -H 'Authorization: Bearer NOT USED' \
     -H 'Accept: text/event-stream'
 

@@ -253,7 +253,7 @@ async def run_profile(args):
                     base_url=base_url,
                 )
                 if aiperf_result is not None:
-                    ttft = aiperf_result["records"]["ttft"]["avg"]
+                    ttft = aiperf_result["time_to_first_token"]["avg"]
 
                 logger.info("Cleaning up deployment...")
                 await client.delete_deployment()
@@ -432,11 +432,9 @@ async def run_profile(args):
                             base_url=base_url,
                         )
                         if aiperf_result is not None:
-                            itl = aiperf_result["records"]["inter_token_latency"]["avg"]
+                            itl = aiperf_result["inter_token_latency"]["avg"]
                             thpt_per_gpu = (
-                                aiperf_result["records"]["output_token_throughput"][
-                                    "avg"
-                                ]
+                                aiperf_result["output_token_throughput"]["avg"]
                                 / num_gpus
                             )
 

@@ -124,10 +124,8 @@ def get_itl_and_thpt_per_gpu(isl, osl, num_request):
             base_url=url,
         )
         if aiperf_result is not None:
-            itl = aiperf_result["records"]["inter_token_latency"]["avg"]
-            thpt_per_gpu = (
-                aiperf_result["records"]["output_token_throughput"]["avg"] / num_gpus
-            )
+            itl = aiperf_result["inter_token_latency"]["avg"]
+            thpt_per_gpu = aiperf_result["output_token_throughput"]["avg"] / num_gpus
             return itl, thpt_per_gpu
         return None, None
 

@@ -90,7 +90,7 @@ def get_ttft(isl):
             base_url=url,
         )
         if aiperf_result is not None:
-            return aiperf_result["records"]["ttft"]["avg"]
+            return aiperf_result["time_to_first_token"]["avg"]
         return None
 
     return _profile_prefill_helper(

diff --git a/benchmarks/router/README.md b/benchmarks/router/README.md
@@ -232,7 +232,7 @@ python real_data_benchmark.py --input-dataset trace.jsonl --prefix-root-multipli
 > [!Note]
 > At the time of writing this documentation, you may need to install the latest aiperf from the main source branch to loadgen on the trace files:
 > ```bash
-> pip install git+https://github.com/ai-dynamo/aiperf.git#subdirectory=aiperf
+> pip install git+https://github.com/ai-dynamo/aiperf.git
 > ```
 > However, by the time of release, the aiperf version included in the vLLM runtime container should be up to date enough to use as-is.
 

diff --git a/benchmarks/sin_load_generator/README.md b/benchmarks/sin_load_generator/README.md
@@ -5,7 +5,7 @@ SPDX-License-Identifier: Apache-2.0
 
 # Sinusoidal Load Generator
 
-`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main/aiperf).
+`sin_synth.py` is a simple script that generates synthetic load with a sinusoidal request rate and ISL/OSL ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) JSONL format, which can be used directly with [AIPerf](https://github.com/ai-dynamo/aiperf).
 
 ## Usage
 

diff --git a/docs/backends/trtllm/gpt-oss.md b/docs/backends/trtllm/gpt-oss.md
@@ -404,7 +404,7 @@ curl localhost:8000/v1/chat/completions   -H "Content-Type: application/json"
 
 ### Performance Testing with AIPerf
 
-The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.
+The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main?tab=readme-ov-file#aiperf), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.
 
 **Run the following benchmark from inside the container** (after completing the deployment steps above):
 

@@ -57,19 +57,20 @@ spec:
             aiperf profile --artifact-dir $ARTIFACT_DIR \
                 --model $TARGET_MODEL \
                 --tokenizer /model-cache/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a  \
-                --endpoint-type chat  --endpoint /v1/chat/completions \
+                --endpoint-type chat  \
+                --endpoint /v1/chat/completions \
                 --streaming \
                 --url http://$ENDPOINT \
                 --synthetic-input-tokens-mean $isl \
                 --synthetic-input-tokens-stddev 0 \
                 --output-tokens-mean $osl \
                 --output-tokens-stddev 0 \
-                --extra-inputs "{\"max_tokens\":$osl}" \
-                --extra-inputs "{\"min_tokens\":$osl}" \
-                --extra-inputs "{\"ignore_eos\":true}" \
+                --extra-inputs "max_tokens:$osl" \
+                --extra-inputs "min_tokens:$osl" \
+                --extra-inputs "ignore_eos:true" \
                 --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
-                --extra-inputs "{\"repetition_penalty\":1.0}" \
-                --extra-inputs "{\"temperature\": 0.0}" \
+                --extra-inputs "repetition_penalty:1.0" \
+                --extra-inputs "temperature: 0.0" \
                 --concurrency $concurrency \
                 --request-count $((10*concurrency)) \
                 --warmup-request-count $concurrency \

@@ -49,7 +49,8 @@ spec:
             aiperf profile --artifact-dir $ARTIFACT_DIR \
                 --model $TARGET_MODEL \
                 --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
-                --endpoint-type chat  --endpoint /v1/chat/completions \
+                --endpoint-type chat  \
+                --endpoint /v1/chat/completions \
                 --streaming \
                 --url http://$ENDPOINT \
                 --synthetic-input-tokens-mean $isl \

@@ -49,7 +49,8 @@ spec:
             aiperf profile --artifact-dir $ARTIFACT_DIR \
                 --model $TARGET_MODEL \
                 --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
-                --endpoint-type chat  --endpoint /v1/chat/completions \
+                --endpoint-type chat  \
+                --endpoint /v1/chat/completions \
                 --streaming \
                 --url http://$ENDPOINT \
                 --synthetic-input-tokens-mean $isl \

@@ -49,7 +49,8 @@ spec:
             aiperf profile --artifact-dir $ARTIFACT_DIR \
                 --model $TARGET_MODEL \
                 --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
-                --endpoint-type chat  --endpoint /v1/chat/completions \
+                --endpoint-type chat  \
+                --endpoint /v1/chat/completions \
                 --streaming \
                 --url http://$ENDPOINT \
                 --synthetic-input-tokens-mean $isl \

@@ -383,27 +383,19 @@ def log_summary_metrics(
             with open(profile_json) as f:
                 metrics = json.load(f)
 
-            # Extract key metrics from AI-Perf format
-            records = metrics.get("records", {})
-
-            # Request count from request_count record
-            request_count_record = records.get("request_count", {})
-            request_count = (
-                int(request_count_record.get("avg", 0)) if request_count_record else 0
-            )
+            # Request count
+            request_count = int(metrics.get("request_count", {}).get("avg", 0))
 
             # Check for errors
-            error_summary = metrics.get("error_summary", [])
-            error_count = len(error_summary)
+            error_count = len(metrics.get("error_summary", []))
 
             # Latency metrics (in milliseconds)
-            request_latency = records.get("request_latency", {})
+            request_latency = metrics.get("request_latency", {})
             avg_latency = request_latency.get("avg", 0) / 1000.0  # Convert to seconds
             p99_latency = request_latency.get("p99", 0) / 1000.0  # Convert to seconds
 
             # Throughput metrics
-            request_throughput = records.get("request_throughput", {})
-            throughput = request_throughput.get("avg", 0)
+            throughput = metrics.get("request_throughput", {}).get("avg", 0)
 
             # Log summary
             logger.info(
@@ -417,7 +409,7 @@ def log_summary_metrics(
 
             # Log success rate
             if request_count > 0:
-                success_rate = (request_count - error_count) / request_count * 100
+                success_rate = ((request_count - error_count) / request_count) * 100
                 logger.info(f"Success rate: {success_rate:.1f}%")
 
             # Also write summary to CSV file for aggregation

@@ -293,63 +293,44 @@ def parse_aiperf_client_results(log_dir: str) -> Dict[str, Any]:
                 with open(profile_json) as f:
                     client_metrics = json.load(f)
 
-                # AI-Perf format has "records" dictionary at the top level
-                records = client_metrics.get("records", {})
-
-                # Extract request count (this is the total requests made)
-                request_count_record = records.get("request_count", {})
-                request_count = (
-                    int(request_count_record.get("avg", 0))
-                    if request_count_record
-                    else 0
+                # Extract request count (this is the total successful requests made)
+                request_count = int(
+                    client_metrics.get("request_count", {}).get("avg", 0)
                 )
 
                 # Check for errors in error_summary
-                error_summary = client_metrics.get("error_summary", [])
-                error_count = len(error_summary)
+                error_count = len(client_metrics.get("error_summary", []))
 
                 # Check if test was cancelled
-                was_cancelled = client_metrics.get("was_cancelled", False)
-                if was_cancelled:
+                if client_metrics.get("was_cancelled", False):
                     error_count = request_count  # Mark all as failed if cancelled
 
                 all_metrics["total_requests"] += request_count
                 all_metrics["successful_requests"] += request_count - error_count
                 all_metrics["failed_requests"] += error_count
 
-                # Extract latency from request_latency record
-                request_latency = records.get("request_latency", {})
-
+                # Extract latency metrics
+                request_latency = client_metrics.get("request_latency", None)
                 if request_latency:
-                    # Convert milliseconds to seconds for consistency
-                    if "avg" in request_latency:
-                        all_metrics["latencies"].append(request_latency["avg"] / 1000.0)
-                    if "p50" in request_latency:
-                        all_metrics["p50_latencies"].append(
-                            request_latency["p50"] / 1000.0
-                        )
-                    if "p90" in request_latency:
-                        all_metrics["p90_latencies"].append(
-                            request_latency["p90"] / 1000.0
-                        )
-                    if "p99" in request_latency:
-                        all_metrics["p99_latencies"].append(
-                            request_latency["p99"] / 1000.0
-                        )
-
-                # Time to first token (if available in records)
-                ttft = records.get("time_to_first_token", {}) or records.get("ttft", {})
-                if ttft and "avg" in ttft:
-                    all_metrics["ttft"].append(ttft["avg"] / 1000.0)  # Convert ms to s
-
-                # Inter-token latency (if available in records)
-                itl = records.get("inter_token_latency", {}) or records.get("itl", {})
-                if itl and "avg" in itl:
-                    all_metrics["itl"].append(itl["avg"] / 1000.0)  # Convert ms to s
+                    all_metrics["latencies"].append(request_latency["avg"] / 1000.0)
+                    all_metrics["p50_latencies"].append(request_latency["p50"] / 1000.0)
+                    all_metrics["p90_latencies"].append(request_latency["p90"] / 1000.0)
+                    all_metrics["p99_latencies"].append(request_latency["p99"] / 1000.0)
+
+                # Time to first token
+                ttft = client_metrics.get("time_to_first_token", {}).get("avg", None)
+                if ttft:
+                    all_metrics["ttft"].append(ttft / 1000.0)  # Convert ms to s
+
+                # Inter-token latency
+                itl = client_metrics.get("inter_token_latency", {}).get("avg", None)
+                if itl:
+                    all_metrics["itl"].append(itl / 1000.0)  # Convert ms to s
 
                 # Throughput from request_throughput record
-                request_throughput = records.get("request_throughput", {})
-                req_throughput = request_throughput.get("avg", 0)
+                req_throughput = client_metrics.get("request_throughput", {}).get(
+                    "avg", 0
+                )
                 if req_throughput:
                     all_metrics["throughputs"].append(req_throughput)
 

diff --git a/tests/planner/README.md b/tests/planner/README.md
@@ -227,7 +227,7 @@ aiperf profile \
   --input-file payload:/workspace/rr-5-45_i3000o300.jsonl \ # path to the generated load dataset \
   --fixed-schedule True \
   --goodput time_to_first_token:200 inter_token_latency:10 \
-  -v \
+  -v
 ```
 
 > [!NOTE]

@@ -116,8 +116,6 @@ async def generate_load(
             str(params["request_rate"]),
             "--request-count",
             str(request_count),  # Use request count to limit test duration
-            "--stability-percentage",
-            "50",
             "--num-dataset-entries",
             str(
                 max(20, int(params["request_rate"] * 10))
@@ -210,35 +208,16 @@ def _parse_aiperf_results(self, artifact_dir: str) -> Dict[str, Any]:
             logger.info(f"Parsing results from: {results_file}")
 
             with open(results_file, "r") as f:
-                data = json.load(f)
-
-            results = {}
-            if "experiments" in data and data["experiments"]:
-                exp = data["experiments"][0]
-                if "perf_metrics" in exp:
-                    metrics = exp["perf_metrics"]
-                    results.update(
-                        {
-                            "throughput": metrics.get("throughput", {}).get("avg", 0),
-                            "ttft_mean": metrics.get("ttft", {}).get("avg", 0),
-                            "itl_mean": metrics.get("inter_token_latency", {}).get(
-                                "avg", 0
-                            ),
-                            "end_to_end_latency_mean": metrics.get(
-                                "request_latency", {}
-                            ).get("avg", 0),
-                        }
-                    )
-            if not results and "profile_export_aiperf" in data:
-                summary = data.get("summary", {})
-                results.update(
-                    {
-                        "throughput": summary.get("throughput", 0),
-                        "ttft_mean": summary.get("time_to_first_token_ms", 0),
-                        "itl_mean": summary.get("inter_token_latency_ms", 0),
-                    }
-                )
-
+                metrics = json.load(f)
+
+            results = {
+                "throughput": metrics.get("output_token_throughput", {}).get("avg", 0),
+                "ttft_mean": metrics.get("time_to_first_token", {}).get("avg", 0),
+                "itl_mean": metrics.get("inter_token_latency", {}).get("avg", 0),
+                "end_to_end_latency_mean": metrics.get("request_latency", {}).get(
+                    "avg", 0
+                ),
+            }
             logger.info(f"Parsed results: {results}")
             return results