Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions benchmarks/llm/perf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -235,9 +235,8 @@ for concurrency in "${concurrency_array[@]}"; do
--num-dataset-entries $(($concurrency*12)) \
--random-seed 100 \
--artifact-dir ${artifact_dir} \
-- \
--ui simple \
-v \
--max-threads ${concurrency} \
-H 'Authorization: Bearer NOT USED' \
-H 'Accept: text/event-stream'

Expand Down
8 changes: 3 additions & 5 deletions benchmarks/profiler/profile_sla.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ async def run_profile(args):
base_url=base_url,
)
if aiperf_result is not None:
ttft = aiperf_result["records"]["ttft"]["avg"]
ttft = aiperf_result["time_to_first_token"]["avg"]

logger.info("Cleaning up deployment...")
await client.delete_deployment()
Expand Down Expand Up @@ -432,11 +432,9 @@ async def run_profile(args):
base_url=base_url,
)
if aiperf_result is not None:
itl = aiperf_result["records"]["inter_token_latency"]["avg"]
itl = aiperf_result["inter_token_latency"]["avg"]
thpt_per_gpu = (
aiperf_result["records"]["output_token_throughput"][
"avg"
]
aiperf_result["output_token_throughput"]["avg"]
/ num_gpus
)

Expand Down
6 changes: 2 additions & 4 deletions benchmarks/profiler/utils/profile_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,8 @@ def get_itl_and_thpt_per_gpu(isl, osl, num_request):
base_url=url,
)
if aiperf_result is not None:
itl = aiperf_result["records"]["inter_token_latency"]["avg"]
thpt_per_gpu = (
aiperf_result["records"]["output_token_throughput"]["avg"] / num_gpus
)
itl = aiperf_result["inter_token_latency"]["avg"]
thpt_per_gpu = aiperf_result["output_token_throughput"]["avg"] / num_gpus
return itl, thpt_per_gpu
return None, None

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/profiler/utils/profile_prefill.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def get_ttft(isl):
base_url=url,
)
if aiperf_result is not None:
return aiperf_result["records"]["ttft"]["avg"]
return aiperf_result["time_to_first_token"]["avg"]
return None

return _profile_prefill_helper(
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/router/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ python real_data_benchmark.py --input-dataset trace.jsonl --prefix-root-multipli
> [!Note]
> At the time of writing this documentation, you may need to install the latest aiperf from the main source branch to run load generation on the trace files:
> ```bash
> pip install git+https://github.com/ai-dynamo/aiperf.git#subdirectory=aiperf
> pip install git+https://github.com/ai-dynamo/aiperf.git
> ```
> However, by the time of release, the aiperf version included in the vLLM runtime container should be up to date enough to use as-is.

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/sin_load_generator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ SPDX-License-Identifier: Apache-2.0

# Sinusoidal Load Generator

`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main/aiperf).
`sin_synth.py` is a simple script that generates synthetic load with a sinusoidal request rate and ISL/OSL ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be used directly in [AIPerf](https://github.com/ai-dynamo/aiperf).

## Usage

Expand Down
2 changes: 1 addition & 1 deletion docs/backends/trtllm/gpt-oss.md
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json"

### Performance Testing with AIPerf

The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.
The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main?tab=readme-ov-file#aiperf), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.

**Run the following benchmark from inside the container** (after completing the deployment steps above):

Expand Down
13 changes: 7 additions & 6 deletions recipes/gpt-oss-120b/trtllm/agg/perf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,20 @@ spec:
aiperf profile --artifact-dir $ARTIFACT_DIR \
--model $TARGET_MODEL \
--tokenizer /model-cache/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a \
--endpoint-type chat --endpoint /v1/chat/completions \
--endpoint-type chat \
--endpoint /v1/chat/completions \
--streaming \
--url http://$ENDPOINT \
--synthetic-input-tokens-mean $isl \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean $osl \
--output-tokens-stddev 0 \
--extra-inputs "{\"max_tokens\":$osl}" \
--extra-inputs "{\"min_tokens\":$osl}" \
--extra-inputs "{\"ignore_eos\":true}" \
--extra-inputs "max_tokens:$osl" \
--extra-inputs "min_tokens:$osl" \
--extra-inputs "ignore_eos:true" \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--extra-inputs "{\"repetition_penalty\":1.0}" \
--extra-inputs "{\"temperature\": 0.0}" \
--extra-inputs "repetition_penalty:1.0" \
--extra-inputs "temperature: 0.0" \
--concurrency $concurrency \
--request-count $((10*concurrency)) \
--warmup-request-count $concurrency \
Expand Down
3 changes: 2 additions & 1 deletion recipes/llama-3-70b/vllm/agg/perf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ spec:
aiperf profile --artifact-dir $ARTIFACT_DIR \
--model $TARGET_MODEL \
--tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --endpoint /v1/chat/completions \
--endpoint-type chat \
--endpoint /v1/chat/completions \
--streaming \
--url http://$ENDPOINT \
--synthetic-input-tokens-mean $isl \
Expand Down
3 changes: 2 additions & 1 deletion recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ spec:
aiperf profile --artifact-dir $ARTIFACT_DIR \
--model $TARGET_MODEL \
--tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --endpoint /v1/chat/completions \
--endpoint-type chat \
--endpoint /v1/chat/completions \
--streaming \
--url http://$ENDPOINT \
--synthetic-input-tokens-mean $isl \
Expand Down
3 changes: 2 additions & 1 deletion recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ spec:
aiperf profile --artifact-dir $ARTIFACT_DIR \
--model $TARGET_MODEL \
--tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --endpoint /v1/chat/completions \
--endpoint-type chat \
--endpoint /v1/chat/completions \
--streaming \
--url http://$ENDPOINT \
--synthetic-input-tokens-mean $isl \
Expand Down
20 changes: 6 additions & 14 deletions tests/fault_tolerance/deploy/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,27 +383,19 @@ def log_summary_metrics(
with open(profile_json) as f:
metrics = json.load(f)

# Extract key metrics from AI-Perf format
records = metrics.get("records", {})

# Request count from request_count record
request_count_record = records.get("request_count", {})
request_count = (
int(request_count_record.get("avg", 0)) if request_count_record else 0
)
# Request count
request_count = int(metrics.get("request_count", {}).get("avg", 0))

# Check for errors
error_summary = metrics.get("error_summary", [])
error_count = len(error_summary)
error_count = len(metrics.get("error_summary", []))

# Latency metrics (in milliseconds)
request_latency = records.get("request_latency", {})
request_latency = metrics.get("request_latency", {})
avg_latency = request_latency.get("avg", 0) / 1000.0 # Convert to seconds
p99_latency = request_latency.get("p99", 0) / 1000.0 # Convert to seconds

# Throughput metrics
request_throughput = records.get("request_throughput", {})
throughput = request_throughput.get("avg", 0)
throughput = metrics.get("request_throughput", {}).get("avg", 0)

# Log summary
logger.info(
Expand All @@ -417,7 +409,7 @@ def log_summary_metrics(

# Log success rate
if request_count > 0:
success_rate = (request_count - error_count) / request_count * 100
success_rate = ((request_count - error_count) / request_count) * 100
logger.info(f"Success rate: {success_rate:.1f}%")

# Also write summary to CSV file for aggregation
Expand Down
67 changes: 24 additions & 43 deletions tests/fault_tolerance/deploy/parse_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,63 +293,44 @@ def parse_aiperf_client_results(log_dir: str) -> Dict[str, Any]:
with open(profile_json) as f:
client_metrics = json.load(f)

# AI-Perf format has "records" dictionary at the top level
records = client_metrics.get("records", {})

# Extract request count (this is the total requests made)
request_count_record = records.get("request_count", {})
request_count = (
int(request_count_record.get("avg", 0))
if request_count_record
else 0
# Extract request count (this is the total successful requests made)
request_count = int(
client_metrics.get("request_count", {}).get("avg", 0)
)

# Check for errors in error_summary
error_summary = client_metrics.get("error_summary", [])
error_count = len(error_summary)
error_count = len(client_metrics.get("error_summary", []))

# Check if test was cancelled
was_cancelled = client_metrics.get("was_cancelled", False)
if was_cancelled:
if client_metrics.get("was_cancelled", False):
error_count = request_count # Mark all as failed if cancelled

all_metrics["total_requests"] += request_count
all_metrics["successful_requests"] += request_count - error_count
all_metrics["failed_requests"] += error_count

# Extract latency from request_latency record
request_latency = records.get("request_latency", {})

# Extract latency metrics
request_latency = client_metrics.get("request_latency", None)
if request_latency:
# Convert milliseconds to seconds for consistency
if "avg" in request_latency:
all_metrics["latencies"].append(request_latency["avg"] / 1000.0)
if "p50" in request_latency:
all_metrics["p50_latencies"].append(
request_latency["p50"] / 1000.0
)
if "p90" in request_latency:
all_metrics["p90_latencies"].append(
request_latency["p90"] / 1000.0
)
if "p99" in request_latency:
all_metrics["p99_latencies"].append(
request_latency["p99"] / 1000.0
)

# Time to first token (if available in records)
ttft = records.get("time_to_first_token", {}) or records.get("ttft", {})
if ttft and "avg" in ttft:
all_metrics["ttft"].append(ttft["avg"] / 1000.0) # Convert ms to s

# Inter-token latency (if available in records)
itl = records.get("inter_token_latency", {}) or records.get("itl", {})
if itl and "avg" in itl:
all_metrics["itl"].append(itl["avg"] / 1000.0) # Convert ms to s
all_metrics["latencies"].append(request_latency["avg"] / 1000.0)
all_metrics["p50_latencies"].append(request_latency["p50"] / 1000.0)
all_metrics["p90_latencies"].append(request_latency["p90"] / 1000.0)
all_metrics["p99_latencies"].append(request_latency["p99"] / 1000.0)

# Time to first token
ttft = client_metrics.get("time_to_first_token", {}).get("avg", None)
if ttft:
all_metrics["ttft"].append(ttft / 1000.0) # Convert ms to s

# Inter-token latency
itl = client_metrics.get("inter_token_latency", {}).get("avg", None)
if itl:
all_metrics["itl"].append(itl / 1000.0) # Convert ms to s

# Throughput from request_throughput record
request_throughput = records.get("request_throughput", {})
req_throughput = request_throughput.get("avg", 0)
req_throughput = client_metrics.get("request_throughput", {}).get(
"avg", 0
)
if req_throughput:
all_metrics["throughputs"].append(req_throughput)

Expand Down
2 changes: 1 addition & 1 deletion tests/planner/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ aiperf profile \
--input-file payload:/workspace/rr-5-45_i3000o300.jsonl \ # path to the generated load dataset \
--fixed-schedule True \
--goodput time_to_first_token:200 inter_token_latency:10 \
-v \
-v
```

> [!NOTE]
Expand Down
41 changes: 10 additions & 31 deletions tests/planner/utils/load_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,6 @@ async def generate_load(
str(params["request_rate"]),
"--request-count",
str(request_count), # Use request count to limit test duration
"--stability-percentage",
"50",
"--num-dataset-entries",
str(
max(20, int(params["request_rate"] * 10))
Expand Down Expand Up @@ -210,35 +208,16 @@ def _parse_aiperf_results(self, artifact_dir: str) -> Dict[str, Any]:
logger.info(f"Parsing results from: {results_file}")

with open(results_file, "r") as f:
data = json.load(f)

results = {}
if "experiments" in data and data["experiments"]:
exp = data["experiments"][0]
if "perf_metrics" in exp:
metrics = exp["perf_metrics"]
results.update(
{
"throughput": metrics.get("throughput", {}).get("avg", 0),
"ttft_mean": metrics.get("ttft", {}).get("avg", 0),
"itl_mean": metrics.get("inter_token_latency", {}).get(
"avg", 0
),
"end_to_end_latency_mean": metrics.get(
"request_latency", {}
).get("avg", 0),
}
)
if not results and "profile_export_aiperf" in data:
summary = data.get("summary", {})
results.update(
{
"throughput": summary.get("throughput", 0),
"ttft_mean": summary.get("time_to_first_token_ms", 0),
"itl_mean": summary.get("inter_token_latency_ms", 0),
}
)

metrics = json.load(f)

results = {
"throughput": metrics.get("output_token_throughput", {}).get("avg", 0),
"ttft_mean": metrics.get("time_to_first_token", {}).get("avg", 0),
"itl_mean": metrics.get("inter_token_latency", {}).get("avg", 0),
"end_to_end_latency_mean": metrics.get("request_latency", {}).get(
"avg", 0
),
}
logger.info(f"Parsed results: {results}")
return results

Expand Down
Loading