
Commit 914ea61

feat: Replace genai-perf with aiperf (#3533)

lkomali authored and athreesh committed
Signed-off-by: lkomali <[email protected]>
1 parent: fadf0e1

16 files changed: +86 -102 lines changed

benchmarks/router/README.md

Lines changed: 4 additions & 4 deletions

@@ -13,7 +13,7 @@ This directory contains scripts for benchmarking the Dynamo router with prefix c
 - etcd and NATS running (required for Dynamo coordination)
 - Required Python packages:
   - `dynamo` package (with vllm and frontend modules)
-  - `genai-perf` for benchmarking
+  - `aiperf` for benchmarking
   - `matplotlib` for plotting results
   - `data-generator` package (install with `pip install -e ./benchmarks` from repo root)

@@ -230,11 +230,11 @@ python real_data_benchmark.py --input-dataset trace.jsonl --prefix-root-multipli
 ```

 > [!Note]
-> At the time of writing this documentation, you may need to install the latest genai-perf from the main source branch to run loadgen on the trace files:
+> At the time of writing this documentation, you may need to install the latest aiperf from the main source branch to run loadgen on the trace files:
 > ```bash
-> pip install git+https://github.com/triton-inference-server/perf_analyzer.git#subdirectory=genai-perf
+> pip install git+https://github.com/ai-dynamo/aiperf.git#subdirectory=aiperf
 > ```
-> However, by the time of release, the genai-perf version included in the vLLM runtime container should be up to date enough to use as-is.
+> However, by the time of release, the aiperf version included in the vLLM runtime container should be up to date enough to use as-is.

 ## Troubleshooting
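For readers landing here from the README note, a minimal sketch of what trace-driven load generation looks like after this change. The flags are taken from the `real_data_benchmark.py` diff further down; the model name and endpoint URL are placeholders, not values from the repo:

```python
import subprocess

# Minimal sketch, assuming aiperf is installed and an endpoint is serving at
# localhost:8888. Flags mirror the ones this commit adds in
# real_data_benchmark.py; "your-model" is a placeholder.
aiperf_cmd = [
    "aiperf",
    "profile",
    "--model", "your-model",
    "--url", "http://localhost:8888",
    "--input-file", "trace.jsonl",   # bare path; the old "payload:" prefix is gone
    "--fixed-schedule-auto-offset",  # replay the trace's own timestamps
    "--artifact-dir", "artifacts/trace_run",
]
subprocess.run(aiperf_cmd, check=True)
```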
benchmarks/router/prefix_ratio_benchmark.py

Lines changed: 26 additions & 29 deletions

@@ -27,7 +27,7 @@
 logger.addHandler(console_handler)


-def get_genai_perf_cmd(
+def get_aiperf_cmd(
     model,
     tokenizer,  # Add tokenizer parameter
     prefix_ratio,

@@ -40,12 +40,12 @@ def get_genai_perf_cmd(
     artifact_dir,
     url="http://localhost:8888",
 ):
-    """Build genai-perf command based on prefix ratio"""
+    """Build aiperf command based on prefix ratio"""
     prefix_length = int(isl * prefix_ratio)
     synthetic_input_length = int(isl * (1 - prefix_ratio))

     return [
-        "genai-perf",
+        "aiperf",
         "profile",
         "--model",
         model,

@@ -84,28 +84,25 @@ def get_genai_perf_cmd(
         str(num_prefix_prompts),
         "--artifact-dir",
         artifact_dir,
-        "--",
         "-v",
-        "--max-threads",
-        "256",
         "-H",
         "Authorization: Bearer NOT USED",
         "-H",
         "Accept: text/event-stream",
     ]


-def get_gap_result(artifact_dir: str) -> dict:
-    """Parse genai-perf results from JSON file"""
+def get_aiperf_result(artifact_dir: str) -> dict:
+    """Parse aiperf results from JSON file"""
     json_file_path = None
     for root, _, files in os.walk(artifact_dir):
-        if "profile_export_genai_perf.json" in files:
-            json_file_path = os.path.join(root, "profile_export_genai_perf.json")
+        if "profile_export_aiperf.json" in files:
+            json_file_path = os.path.join(root, "profile_export_aiperf.json")
             break

     if json_file_path is None:
         raise FileNotFoundError(
-            f"profile_export_genai_perf.json not found in {artifact_dir}"
+            f"profile_export_aiperf.json not found in {artifact_dir}"
         )

     with open(json_file_path, "r") as f:

@@ -125,8 +122,8 @@ def run_benchmark_single_url(
     artifact_dir,
     url,
 ) -> Optional[Dict]:
-    """Run genai-perf benchmark for a single URL"""
-    genai_perf_cmd = get_genai_perf_cmd(
+    """Run aiperf benchmark for a single URL"""
+    aiperf_cmd = get_aiperf_cmd(
         model,
         tokenizer,  # Pass tokenizer parameter
         prefix_ratio,

@@ -140,21 +137,21 @@ def run_benchmark_single_url(
         url,
     )

-    logger.info(f"Running command for URL {url}: {' '.join(genai_perf_cmd)}")
+    logger.info(f"Running command for URL {url}: {' '.join(aiperf_cmd)}")

     try:
-        gap_process = subprocess.run(
-            genai_perf_cmd, capture_output=True, text=True, check=True
+        aiperf_process = subprocess.run(
+            aiperf_cmd, capture_output=True, text=True, check=True
         )

-        logger.info(f"Genai-perf profiling completed successfully for URL {url}")
-        logger.info(gap_process.stdout)
+        logger.info(f"AIPerf profiling completed successfully for URL {url}")
+        logger.info(aiperf_process.stdout)

-        gap_result = get_gap_result(artifact_dir)
-        return gap_result
+        aiperf_result = get_aiperf_result(artifact_dir)
+        return aiperf_result

     except subprocess.CalledProcessError as e:
-        logger.error(f"Genai-perf failed for URL {url} with error code: {e.returncode}")
+        logger.error(f"AIPerf failed for URL {url} with error code: {e.returncode}")
         logger.error(f"stderr: {e.stderr}")
         return None

@@ -197,7 +194,7 @@ def run_benchmark(
     output_dir,
     urls,
 ) -> Optional[Dict]:
-    """Run genai-perf benchmark for a specific prefix ratio"""
+    """Run aiperf benchmark for a specific prefix ratio"""
     logger.info(
         f"Running benchmark with prefix_ratio={prefix_ratio}, seed={seed}, URLs={urls}"
     )

@@ -242,7 +239,7 @@ def run_benchmark(
         os.makedirs(artifact_dir, exist_ok=True)
         artifact_dirs.append(artifact_dir)

-        genai_perf_cmd = get_genai_perf_cmd(
+        aiperf_cmd = get_aiperf_cmd(
             model,
             tokenizer,  # Pass tokenizer parameter
             prefix_ratio,

@@ -256,10 +253,10 @@ def run_benchmark(
             url,
         )

-        logger.info(f"Launching process for URL {url}: {' '.join(genai_perf_cmd)}")
+        logger.info(f"Launching process for URL {url}: {' '.join(aiperf_cmd)}")

         process = subprocess.Popen(
-            genai_perf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+            aiperf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
         )
         processes.append((process, url, artifact_dir))

@@ -269,18 +266,18 @@ def run_benchmark(
         stdout, stderr = process.communicate()

         if process.returncode == 0:
-            logger.info(f"Genai-perf completed successfully for URL {url}")
+            logger.info(f"AIPerf completed successfully for URL {url}")
             logger.info(stdout)

             try:
-                gap_result = get_gap_result(artifact_dir)
-                results.append(gap_result)
+                aiperf_result = get_aiperf_result(artifact_dir)
+                results.append(aiperf_result)
             except Exception as e:
                 logger.error(f"Failed to get results for URL {url}: {e}")
                 results.append(None)
         else:
             logger.error(
-                f"Genai-perf failed for URL {url} with error code: {process.returncode}"
+                f"AIPerf failed for URL {url} with error code: {process.returncode}"
             )
             logger.error(f"stderr: {stderr}")
             results.append(None)
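A small but load-bearing detail in `get_aiperf_cmd` is how the prefix ratio splits the input sequence length (ISL) between the shared prefix and per-request synthetic tokens. A worked example of the arithmetic from the diff above:

```python
# The split computed in get_aiperf_cmd: prefix_ratio is the share of the ISL
# that forms the common prefix; the rest is synthetic per-request input.
isl = 1000
prefix_ratio = 0.4

prefix_length = int(isl * prefix_ratio)                 # 400 shared-prefix tokens
synthetic_input_length = int(isl * (1 - prefix_ratio))  # 600 synthetic tokens

# The parts add back up to the ISL here, but int() truncates, so ratios that
# don't divide the ISL evenly can drop a token to rounding.
assert prefix_length + synthetic_input_length == isl
```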

benchmarks/router/real_data_benchmark.py

Lines changed: 13 additions & 17 deletions

@@ -24,7 +24,7 @@
 logger.addHandler(console_handler)


-def get_genai_perf_cmd_for_trace(
+def get_aiperf_cmd_for_trace(
     model,
     tokenizer,
     input_dataset,

@@ -33,7 +33,7 @@ def get_genai_perf_cmd_for_trace(
     url="http://localhost:8888",
 ):
     return [
-        "genai-perf",
+        "aiperf",
         "profile",
         "--model",
         model,

@@ -47,17 +47,13 @@ def get_genai_perf_cmd_for_trace(
         "--url",
         url,
         "--input-file",
-        f"payload:{input_dataset}",
-        "--fixed-schedule",
-        "True",
+        f"{input_dataset}",
+        "--fixed-schedule-auto-offset",
         "--random-seed",
         str(seed),
         "--artifact-dir",
         artifact_dir,
-        "--",
         "-v",
-        "--max-threads",
-        "256",
         "-H",
         "Authorization: Bearer NOT USED",
         "-H",

@@ -73,8 +69,8 @@ def run_benchmark_with_trace(
     url,
     seed,
 ):
-    """Run genai-perf benchmark with a trace dataset"""
-    genai_perf_cmd = get_genai_perf_cmd_for_trace(
+    """Run aiperf benchmark with a trace dataset"""
+    aiperf_cmd = get_aiperf_cmd_for_trace(
         model,
         tokenizer,
         trace_dataset,

@@ -83,17 +79,17 @@ def run_benchmark_with_trace(
         url,
     )

-    logger.info(f"Running genai-perf with trace dataset: {trace_dataset}")
-    logger.info(f"Command: {' '.join(genai_perf_cmd)}")
+    logger.info(f"Running aiperf with trace dataset: {trace_dataset}")
+    logger.info(f"Command: {' '.join(aiperf_cmd)}")

     try:
-        # Run genai-perf and let it output directly to terminal
-        subprocess.run(genai_perf_cmd, check=True)
+        # Run aiperf and let it output directly to terminal
+        subprocess.run(aiperf_cmd, check=True)

-        logger.info("Genai-perf profiling completed successfully")
+        logger.info("AIPerf profiling completed successfully")

     except subprocess.CalledProcessError as e:
-        logger.error(f"Genai-perf failed with error code: {e.returncode}")
+        logger.error(f"AIPerf failed with error code: {e.returncode}")
         logger.error(f"stderr: {e.stderr}")
         raise

@@ -301,7 +297,7 @@ def main():
     logger.info(f"Synthetic trace data saved to: {trace_dataset_path}")

     # Run benchmark with the trace dataset
-    artifact_dir = os.path.join(args.output_dir, "genai_perf_artifacts")
+    artifact_dir = os.path.join(args.output_dir, "aiperf_artifacts")
     os.makedirs(artifact_dir, exist_ok=True)

     run_benchmark_with_trace(
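Two of the changes in this file are behavioral, not just renames, so they deserve a call-out. A side-by-side sketch of the old and new trace-invocation shapes, reconstructed from the diff above:

```python
# Old genai-perf shape: the trace path carried a "payload:" prefix,
# fixed-schedule mode was an explicit boolean flag, and extra perf_analyzer
# options were passed through after a "--" separator.
legacy_cmd = [
    "genai-perf", "profile",
    "--input-file", "payload:trace.jsonl",
    "--fixed-schedule", "True",
    "--", "-v", "--max-threads", "256",
]

# New aiperf shape: the trace path is passed bare, fixed-schedule replay is a
# single --fixed-schedule-auto-offset switch, and options like -v are given
# directly with no passthrough separator.
new_cmd = [
    "aiperf", "profile",
    "--input-file", "trace.jsonl",
    "--fixed-schedule-auto-offset",
    "-v",
]
```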

benchmarks/sin_load_generator/README.md

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ SPDX-License-Identifier: Apache-2.0

 # Sinusoidal Load Generator

-`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf/genai_perf).
+`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main/aiperf).

 ## Usage
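To make "mooncake-style jsonl" concrete: each line is one request with an arrival timestamp and token lengths. The sketch below writes such a trace with a sinusoidal arrival rate; the field names (`timestamp`, `input_length`, `output_length`, `hash_ids`) are my reading of the Mooncake trace convention, not taken from `sin_synth.py`, and the ISL/OSL are held fixed for brevity:

```python
import json
import math

# Hedged sketch: emit a mooncake-style jsonl trace whose request rate follows
# a sine wave. Field names are assumed from the Mooncake trace format.
def write_sine_trace(path, duration_s=60.0, base_rps=5.0, amplitude=4.0, period_s=30.0):
    t = 0.0
    with open(path, "w") as f:
        while t < duration_s:
            rps = base_rps + amplitude * math.sin(2 * math.pi * t / period_s)
            rps = max(rps, 0.1)  # keep the instantaneous rate positive
            record = {
                "timestamp": int(t * 1000),  # arrival time in milliseconds
                "input_length": 512,
                "output_length": 128,
                "hash_ids": [],  # prefix-block hashes; empty means no shared prefix
            }
            f.write(json.dumps(record) + "\n")
            t += 1.0 / rps  # spacing implied by the current rate

write_sine_trace("sine_trace.jsonl")
```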

File renamed without changes.

benchmarks/utils/workflow.py

Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@
 from pathlib import Path
 from typing import Dict, List

-from benchmarks.utils.genai import run_concurrency_sweep
+from benchmarks.utils.aiperf import run_concurrency_sweep
 from deploy.utils.kubernetes import is_running_in_cluster


docs/backends/trtllm/gpt-oss.md

Lines changed: 1 addition & 1 deletion

@@ -404,7 +404,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json"

 ### Performance Testing with AIPerf

-The Dynamo container includes [AIPerf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/aiperf/README.html), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.
+The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.

 **Run the following benchmark from inside the container** (after completing the deployment steps above):
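The benchmark command itself sits outside this hunk, but for orientation, an in-container run of the kind the paragraph describes might look like the sketch below. Only flags that appear elsewhere in this commit are used; the model name is a placeholder, and load-shaping options should be checked against `aiperf profile --help`:

```python
import subprocess

# Sketch of an in-container AIPerf smoke run against the deployment above.
# Model name is a placeholder; match whatever the server actually serves.
subprocess.run(
    [
        "aiperf", "profile",
        "--model", "openai/gpt-oss-120b",  # placeholder
        "--url", "http://localhost:8000",
        "--artifact-dir", "artifacts/gpt_oss",
    ],
    check=True,
)
```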

docs/benchmarks/benchmarking.md

Lines changed: 2 additions & 2 deletions

@@ -283,7 +283,7 @@ results/ # Client-side: ./benchmarks/results/ or custom
 │   └── avg_time_to_first_token_vs_concurrency.png
 ├── <your-benchmark-name>/ # Results for your benchmark (uses your custom name)
 │   ├── c1/ # Concurrency level 1
-│   │   └── profile_export_genai_perf.json
+│   │   └── profile_export_aiperf.json
 │   ├── c2/ # Concurrency level 2
 │   ├── c5/ # Concurrency level 5
 │   └── ... # Other concurrency levels (10, 50, 100, 250)

@@ -457,7 +457,7 @@ Results are stored in `/data/results` and follow the same structure as client-si
 /data/results/
 └── <benchmark-name>/ # Results for your benchmark name
     ├── c1/ # Concurrency level 1
-    │   └── profile_export_genai_perf.json
+    │   └── profile_export_aiperf.json
     ├── c2/ # Concurrency level 2
     └── ... # Other concurrency levels
 ```
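Given that layout, collecting per-concurrency numbers is a short loop over the `c*` directories. A sketch (the schema inside `profile_export_aiperf.json` varies by aiperf version, so the metric extraction is left as a comment):

```python
import json
import os

def collect_aiperf_results(results_root: str) -> dict:
    """Gather each concurrency level's profile_export_aiperf.json."""
    summary = {}
    for entry in sorted(os.listdir(results_root)):
        json_path = os.path.join(results_root, entry, "profile_export_aiperf.json")
        if entry.startswith("c") and os.path.isfile(json_path):
            with open(json_path) as f:
                summary[entry] = json.load(f)  # e.g., pull TTFT stats from here
    return summary

# Usage against the client-side layout above:
# results = collect_aiperf_results("results/<your-benchmark-name>")
```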

docs/performance/tuning.md

Lines changed: 3 additions & 3 deletions

@@ -56,11 +56,11 @@ Typically, the number of GPUs vs the performance follows the following pattern:
 | 2 | 269 | 135 | 1.19x |
 | 4 | 578 | 144 | 1.28x |

-The best number of GPUs to use in the prefill and decode engines can be determined by running a few fixed ISL/OSL/concurrency tests using [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf) and comparing with the SLA.
-GenAI-Perf is pre-installed in the dynamo container.
+The best number of GPUs to use in the prefill and decode engines can be determined by running a few fixed ISL/OSL/concurrency tests using [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main) and comparing with the SLA.
+AIPerf is pre-installed in the dynamo container.

 > [!Tip]
-> If you are unfamiliar with GenAI-Perf, please see this helpful [tutorial](https://github.com/triton-inference-server/perf_analyzer/blob/main/genai-perf/docs/tutorial.md) to get you started.
+> If you are unfamiliar with AIPerf, please see this helpful [tutorial](https://github.com/ai-dynamo/aiperf/blob/main/docs/tutorial.md) to get you started.

 Besides the parallelization mapping, other common knobs to tune are maximum batch size, maximum number of tokens, and block size.
 For prefill engines, usually a small batch size and large `max_num_token` is preferred.
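To ground the sweep that section describes, one way to script it is below. The `--synthetic-input-tokens-mean`, `--output-tokens-mean`, and `--concurrency` flag names are assumptions carried over from genai-perf's CLI and are not confirmed by this commit; verify against `aiperf profile --help` before relying on this:

```python
import subprocess

# Hedged sketch: fixed ISL/OSL, swept concurrency; compare each run's latency
# report against the SLA afterwards. ISL/OSL/concurrency flag names are
# assumed (genai-perf heritage), not taken from this commit.
for concurrency in (1, 8, 32):
    subprocess.run(
        [
            "aiperf", "profile",
            "--model", "your-model",                  # placeholder
            "--url", "http://localhost:8888",
            "--artifact-dir", f"artifacts/c{concurrency}",
            "--synthetic-input-tokens-mean", "3000",  # fixed ISL (assumed flag)
            "--output-tokens-mean", "150",            # fixed OSL (assumed flag)
            "--concurrency", str(concurrency),        # load level (assumed flag)
        ],
        check=True,
    )
    # Then parse artifacts/c{n}/profile_export_aiperf.json and check TTFT/ITL vs SLA.
```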

examples/basics/kubernetes/Distributed_Inference/README.md

Lines changed: 1 addition & 1 deletion

@@ -54,4 +54,4 @@ curl localhost:8000/v1/chat/completions \
   "max_tokens": 30
 }'
 ```
-You can also benchmark the performance of the endpoint with [GenAI-Perf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html).
+You can also benchmark the performance of the endpoint with [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md).
