diff --git a/benchmarks/llm/perf.sh b/benchmarks/llm/perf.sh index 446ec1f74d0c..a1f66dc01710 100644 --- a/benchmarks/llm/perf.sh +++ b/benchmarks/llm/perf.sh @@ -235,9 +235,8 @@ for concurrency in "${concurrency_array[@]}"; do --num-dataset-entries $(($concurrency*12)) \ --random-seed 100 \ --artifact-dir ${artifact_dir} \ - -- \ + --ui simple \ -v \ - --max-threads ${concurrency} \ -H 'Authorization: Bearer NOT USED' \ -H 'Accept: text/event-stream' diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 8c670986cffa..757c073b2851 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -256,7 +256,7 @@ async def run_profile(args): base_url=base_url, ) if aiperf_result is not None: - ttft = aiperf_result["records"]["ttft"]["avg"] + ttft = aiperf_result["time_to_first_token"]["avg"] logger.info("Cleaning up deployment...") await client.delete_deployment() @@ -435,11 +435,9 @@ async def run_profile(args): base_url=base_url, ) if aiperf_result is not None: - itl = aiperf_result["records"]["inter_token_latency"]["avg"] + itl = aiperf_result["inter_token_latency"]["avg"] thpt_per_gpu = ( - aiperf_result["records"]["output_token_throughput"][ - "avg" - ] + aiperf_result["output_token_throughput"]["avg"] / num_gpus ) diff --git a/benchmarks/profiler/utils/profile_decode.py b/benchmarks/profiler/utils/profile_decode.py index 1a9cbf3d96fa..f0a819ec5dec 100644 --- a/benchmarks/profiler/utils/profile_decode.py +++ b/benchmarks/profiler/utils/profile_decode.py @@ -124,10 +124,8 @@ def get_itl_and_thpt_per_gpu(isl, osl, num_request): base_url=url, ) if aiperf_result is not None: - itl = aiperf_result["records"]["inter_token_latency"]["avg"] - thpt_per_gpu = ( - aiperf_result["records"]["output_token_throughput"]["avg"] / num_gpus - ) + itl = aiperf_result["inter_token_latency"]["avg"] + thpt_per_gpu = aiperf_result["output_token_throughput"]["avg"] / num_gpus return itl, thpt_per_gpu return None, None diff 
--git a/benchmarks/profiler/utils/profile_prefill.py b/benchmarks/profiler/utils/profile_prefill.py index d7f5dae91bf0..48171bdd7e63 100644 --- a/benchmarks/profiler/utils/profile_prefill.py +++ b/benchmarks/profiler/utils/profile_prefill.py @@ -90,7 +90,7 @@ def get_ttft(isl): base_url=url, ) if aiperf_result is not None: - return aiperf_result["records"]["ttft"]["avg"] + return aiperf_result["time_to_first_token"]["avg"] return None return _profile_prefill_helper( diff --git a/benchmarks/router/README.md b/benchmarks/router/README.md index 8ea830b759ac..40d8f127fd6b 100644 --- a/benchmarks/router/README.md +++ b/benchmarks/router/README.md @@ -13,7 +13,7 @@ This directory contains scripts for benchmarking the Dynamo router with prefix c - etcd and NATS running (required for Dynamo coordination) - Required Python packages: - `dynamo` package (with vllm and frontend modules) - - `genai-perf` for benchmarking + - `aiperf` for benchmarking - `matplotlib` for plotting results - `data-generator` package (install with `pip install -e ./benchmarks` from repo root) @@ -230,11 +230,11 @@ python real_data_benchmark.py --input-dataset trace.jsonl --prefix-root-multipli ``` > [!Note] -> At the time of writing this documentation, you may need to install the latest genai-perf from the main source branch to loadgen on the trace files: +> At the time of writing this documentation, you may need to install the latest aiperf from the main source branch to loadgen on the trace files: > ```bash -> pip install git+https://github.com/triton-inference-server/perf_analyzer.git#subdirectory=genai-perf +> pip install git+https://github.com/ai-dynamo/aiperf.git > ``` -> However, by the time of release, the genai-perf version included in the vLLM runtime container should be up to date enough to use as-is. +> However, by the time of release, the aiperf version included in the vLLM runtime container should be up to date enough to use as-is. 
## Troubleshooting diff --git a/benchmarks/sin_load_generator/README.md b/benchmarks/sin_load_generator/README.md index 7c3ec5cf303a..82b7dee5b9c5 100644 --- a/benchmarks/sin_load_generator/README.md +++ b/benchmarks/sin_load_generator/README.md @@ -5,7 +5,7 @@ SPDX-License-Identifier: Apache-2.0 # Sinusoidal Load Generator -`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf/genai_perf). +`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [AIPerf](https://github.com/ai-dynamo/aiperf). ## Usage diff --git a/docs/backends/trtllm/gpt-oss.md b/docs/backends/trtllm/gpt-oss.md index 6a11712724ae..071b88bb2e2d 100644 --- a/docs/backends/trtllm/gpt-oss.md +++ b/docs/backends/trtllm/gpt-oss.md @@ -402,9 +402,9 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" ``` ## Benchmarking -### Performance Testing with GenAI-Perf +### Performance Testing with AIPerf -The Dynamo container includes [GenAI-Perf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment. +The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main?tab=readme-ov-file#aiperf), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment. 
**Run the following benchmark from inside the container** (after completing the deployment steps above): @@ -413,7 +413,7 @@ The Dynamo container includes [GenAI-Perf](https://docs.nvidia.com/deeplearning/ mkdir -p /tmp/benchmark-results # Run the benchmark - this command tests the deployment with high-concurrency synthetic workload -genai-perf profile \ +aiperf profile \ --model openai/gpt-oss-120b \ --tokenizer /model \ --endpoint-type chat \ @@ -434,9 +434,7 @@ genai-perf profile \ --num-dataset-entries 8000 \ --random-seed 100 \ --artifact-dir /tmp/benchmark-results \ - -- \ -v \ - --max-threads 500 \ -H 'Authorization: Bearer NOT USED' \ -H 'Accept: text/event-stream' ``` @@ -457,13 +455,13 @@ Key parameters you can adjust: - `--output-tokens-mean`: Average output length (tests decode throughput) - `--request-count`: Total number of requests for the benchmark -### Installing GenAI-Perf Outside the Container +### Installing AIPerf Outside the Container If you prefer to run benchmarks from outside the container: ```bash -# Install GenAI-Perf -pip install genai-perf +# Install AIPerf +pip install aiperf # Then run the same benchmark command, adjusting the tokenizer path if needed ``` @@ -520,4 +518,4 @@ flowchart TD - **Production Deployment**: For multi-node deployments, see the [Multi-node Guide](../../../examples/basics/multinode/README.md) - **Advanced Configuration**: Explore TensorRT-LLM engine building options for further optimization - **Monitoring**: Set up Prometheus and Grafana for production monitoring -- **Performance Benchmarking**: Use GenAI-Perf to measure and optimize your deployment performance +- **Performance Benchmarking**: Use AIPerf to measure and optimize your deployment performance diff --git a/recipes/gpt-oss-120b/trtllm/agg/perf.yaml b/recipes/gpt-oss-120b/trtllm/agg/perf.yaml index eed5d69addbf..a1dbbd696aba 100644 --- a/recipes/gpt-oss-120b/trtllm/agg/perf.yaml +++ b/recipes/gpt-oss-120b/trtllm/agg/perf.yaml @@ -57,19 +57,20 @@ spec: 
aiperf profile --artifact-dir $ARTIFACT_DIR \ --model $TARGET_MODEL \ --tokenizer /model-cache/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a \ - --endpoint-type chat --endpoint /v1/chat/completions \ + --endpoint-type chat \ + --endpoint /v1/chat/completions \ --streaming \ --url http://$ENDPOINT \ --synthetic-input-tokens-mean $isl \ --synthetic-input-tokens-stddev 0 \ --output-tokens-mean $osl \ --output-tokens-stddev 0 \ - --extra-inputs "{\"max_tokens\":$osl}" \ - --extra-inputs "{\"min_tokens\":$osl}" \ - --extra-inputs "{\"ignore_eos\":true}" \ + --extra-inputs "max_tokens:$osl" \ + --extra-inputs "min_tokens:$osl" \ + --extra-inputs "ignore_eos:true" \ --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ - --extra-inputs "{\"repetition_penalty\":1.0}" \ - --extra-inputs "{\"temperature\": 0.0}" \ + --extra-inputs "repetition_penalty:1.0" \ + --extra-inputs "temperature: 0.0" \ --concurrency $concurrency \ --request-count $((10*concurrency)) \ --warmup-request-count $concurrency \ diff --git a/recipes/llama-3-70b/vllm/agg/perf.yaml b/recipes/llama-3-70b/vllm/agg/perf.yaml index 8c5a470f119c..5773214bf438 100644 --- a/recipes/llama-3-70b/vllm/agg/perf.yaml +++ b/recipes/llama-3-70b/vllm/agg/perf.yaml @@ -50,7 +50,8 @@ spec: aiperf profile --artifact-dir $ARTIFACT_DIR \ --model $TARGET_MODEL \ --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ - --endpoint-type chat --endpoint /v1/chat/completions \ + --endpoint-type chat \ + --endpoint /v1/chat/completions \ --streaming \ --url http://$ENDPOINT \ --synthetic-input-tokens-mean $isl \ diff --git a/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml b/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml index e2326e45873b..8b24296f828b 100644 --- a/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml +++ b/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml @@ -50,7 +50,8 @@ 
spec: aiperf profile --artifact-dir $ARTIFACT_DIR \ --model $TARGET_MODEL \ --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ - --endpoint-type chat --endpoint /v1/chat/completions \ + --endpoint-type chat \ + --endpoint /v1/chat/completions \ --streaming \ --url http://$ENDPOINT \ --synthetic-input-tokens-mean $isl \ diff --git a/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml b/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml index 61e53aa5a79e..c2ac8445c589 100644 --- a/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml +++ b/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml @@ -50,7 +50,8 @@ spec: aiperf profile --artifact-dir $ARTIFACT_DIR \ --model $TARGET_MODEL \ --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ - --endpoint-type chat --endpoint /v1/chat/completions \ + --endpoint-type chat \ + --endpoint /v1/chat/completions \ --streaming \ --url http://$ENDPOINT \ --synthetic-input-tokens-mean $isl \ diff --git a/tests/fault_tolerance/deploy/client.py b/tests/fault_tolerance/deploy/client.py index e3053008c43b..e8a6ec24c231 100644 --- a/tests/fault_tolerance/deploy/client.py +++ b/tests/fault_tolerance/deploy/client.py @@ -383,27 +383,19 @@ def log_summary_metrics( with open(profile_json) as f: metrics = json.load(f) - # Extract key metrics from AI-Perf format - records = metrics.get("records", {}) - - # Request count from request_count record - request_count_record = records.get("request_count", {}) - request_count = ( - int(request_count_record.get("avg", 0)) if request_count_record else 0 - ) + # Request count + request_count = int(metrics.get("request_count", {}).get("avg", 0)) # Check for errors - error_summary = metrics.get("error_summary", []) - error_count = len(error_summary) + error_count = len(metrics.get("error_summary", [])) # Latency 
metrics (in milliseconds) - request_latency = records.get("request_latency", {}) + request_latency = metrics.get("request_latency", {}) avg_latency = request_latency.get("avg", 0) / 1000.0 # Convert to seconds p99_latency = request_latency.get("p99", 0) / 1000.0 # Convert to seconds # Throughput metrics - request_throughput = records.get("request_throughput", {}) - throughput = request_throughput.get("avg", 0) + throughput = metrics.get("request_throughput", {}).get("avg", 0) # Log summary logger.info( @@ -417,7 +409,7 @@ def log_summary_metrics( # Log success rate if request_count > 0: - success_rate = (request_count - error_count) / request_count * 100 + success_rate = ((request_count - error_count) / request_count) * 100 logger.info(f"Success rate: {success_rate:.1f}%") # Also write summary to CSV file for aggregation diff --git a/tests/fault_tolerance/deploy/parse_results.py b/tests/fault_tolerance/deploy/parse_results.py index e41275cd44c7..77f28894d31f 100644 --- a/tests/fault_tolerance/deploy/parse_results.py +++ b/tests/fault_tolerance/deploy/parse_results.py @@ -293,63 +293,44 @@ def parse_aiperf_client_results(log_dir: str) -> Dict[str, Any]: with open(profile_json) as f: client_metrics = json.load(f) - # AI-Perf format has "records" dictionary at the top level - records = client_metrics.get("records", {}) - - # Extract request count (this is the total requests made) - request_count_record = records.get("request_count", {}) - request_count = ( - int(request_count_record.get("avg", 0)) - if request_count_record - else 0 + # Extract request count (this is the total successful requests made) + request_count = int( + client_metrics.get("request_count", {}).get("avg", 0) ) # Check for errors in error_summary - error_summary = client_metrics.get("error_summary", []) - error_count = len(error_summary) + error_count = len(client_metrics.get("error_summary", [])) # Check if test was cancelled - was_cancelled = client_metrics.get("was_cancelled", False) - if 
was_cancelled: + if client_metrics.get("was_cancelled", False): error_count = request_count # Mark all as failed if cancelled all_metrics["total_requests"] += request_count all_metrics["successful_requests"] += request_count - error_count all_metrics["failed_requests"] += error_count - # Extract latency from request_latency record - request_latency = records.get("request_latency", {}) - + # Extract latency metrics + request_latency = client_metrics.get("request_latency", None) if request_latency: - # Convert milliseconds to seconds for consistency - if "avg" in request_latency: - all_metrics["latencies"].append(request_latency["avg"] / 1000.0) - if "p50" in request_latency: - all_metrics["p50_latencies"].append( - request_latency["p50"] / 1000.0 - ) - if "p90" in request_latency: - all_metrics["p90_latencies"].append( - request_latency["p90"] / 1000.0 - ) - if "p99" in request_latency: - all_metrics["p99_latencies"].append( - request_latency["p99"] / 1000.0 - ) - - # Time to first token (if available in records) - ttft = records.get("time_to_first_token", {}) or records.get("ttft", {}) - if ttft and "avg" in ttft: - all_metrics["ttft"].append(ttft["avg"] / 1000.0) # Convert ms to s - - # Inter-token latency (if available in records) - itl = records.get("inter_token_latency", {}) or records.get("itl", {}) - if itl and "avg" in itl: - all_metrics["itl"].append(itl["avg"] / 1000.0) # Convert ms to s + all_metrics["latencies"].append(request_latency["avg"] / 1000.0) + all_metrics["p50_latencies"].append(request_latency["p50"] / 1000.0) + all_metrics["p90_latencies"].append(request_latency["p90"] / 1000.0) + all_metrics["p99_latencies"].append(request_latency["p99"] / 1000.0) + + # Time to first token + ttft = client_metrics.get("time_to_first_token", {}).get("avg", None) + if ttft: + all_metrics["ttft"].append(ttft / 1000.0) # Convert ms to s + + # Inter-token latency + itl = client_metrics.get("inter_token_latency", {}).get("avg", None) + if itl: + 
all_metrics["itl"].append(itl / 1000.0) # Convert ms to s # Throughput from request_throughput record - request_throughput = records.get("request_throughput", {}) - req_throughput = request_throughput.get("avg", 0) + req_throughput = client_metrics.get("request_throughput", {}).get( + "avg", 0 + ) if req_throughput: all_metrics["throughputs"].append(req_throughput) diff --git a/tests/planner/README.md b/tests/planner/README.md index 4c1566cc1beb..e9fdcbe44372 100644 --- a/tests/planner/README.md +++ b/tests/planner/README.md @@ -215,10 +215,10 @@ When running deployment with sla-planner, to reduce the image pulling time, depl kubectl apply -f ./perf_test_configs/image_cache_daemonset.yaml -n ``` -Then, port-forward or shell into the frontend pod and run GenAI-Perf to get the goodput: +Then, port-forward or shell into the frontend pod and run AIPerf to get the goodput: ```bash -genai-perf profile \ +aiperf profile \ --model nvidia/Llama-3.1-8B-Instruct-FP8 \ --tokenizer nvidia/Llama-3.1-8B-Instruct-FP8 \ --endpoint-type chat \ @@ -227,11 +227,11 @@ genai-perf profile \ --input-file payload:/workspace/rr-5-45_i3000o300.jsonl \ # path to the generated load dataset \ --fixed-schedule True \ --goodput time_to_first_token:200 inter_token_latency:10 \ - -- -v -max-threads 64 \ + -v ``` > [!NOTE] -> Sometimes, when sla planner scales down the number of workers, a few requests will error out and cause GenAI-Perf to stuck. We are aware of this issue and are working on fixing it. +> Sometimes, when sla planner scales down the number of workers, a few requests will error out and cause AIPerf to get stuck. We are aware of this issue and are working on fixing it. #### E2E Perf Test Results diff --git a/tests/planner/utils/load_generator.py b/tests/planner/utils/load_generator.py index 69943de09411..6ac5e38a76d0 100644 --- a/tests/planner/utils/load_generator.py +++ b/tests/planner/utils/load_generator.py @@ -4,7 +4,7 @@ """ Load generation script for SLA planner scaling tests.
-This script uses genai-perf to generate load at specific request rates +This script uses aiperf to generate load at specific request rates to test the planner's scaling behavior. """ @@ -24,7 +24,7 @@ class LoadGenerator: - """Generate load using genai-perf to test planner scaling.""" + """Generate load using aiperf to test planner scaling.""" def __init__( self, @@ -40,12 +40,12 @@ def __init__( self.osl = osl self.save_results = save_results - def _calculate_genai_perf_params( + def _calculate_aiperf_params( self, req_per_sec: float, ) -> Dict[str, Any]: """ - Calculate genai-perf parameters to approximate desired request rate. + Calculate aiperf parameters to approximate desired request rate. Args: req_per_sec: Desired requests per second @@ -71,15 +71,15 @@ async def generate_load( Args: req_per_sec: Target requests per second duration_sec: Duration to generate load (seconds) - artifact_dir: Directory to store genai-perf artifacts + artifact_dir: Directory to store aiperf artifacts Returns: Dictionary with load test results """ logger.info(f"Generating load: {req_per_sec} req/s for {duration_sec}s") - # Calculate genai-perf parameters - params = self._calculate_genai_perf_params(req_per_sec) + # Calculate aiperf parameters + params = self._calculate_aiperf_params(req_per_sec) logger.info(f"Using request_rate={params['request_rate']} req/s") # Create artifact directory if not provided @@ -95,9 +95,9 @@ async def generate_load( f"Adjusted parameters: duration={duration_sec}s, request_count={request_count}" ) - # Build genai-perf command based on coworker's successful approach + # Build aiperf command based on coworker's successful approach cmd = [ - "genai-perf", + "aiperf", "profile", "--model", self.model, @@ -116,18 +116,13 @@ async def generate_load( str(params["request_rate"]), "--request-count", str(request_count), # Use request count to limit test duration - "--stability-percentage", - "50", "--num-dataset-entries", str( max(20, int(params["request_rate"] 
* 10)) ), # Generate reasonable dataset size "--artifact-dir", artifact_dir, - "--", "-v", - "-max-threads", - "64", ] logger.info(f"Running command: {' '.join(cmd)}") @@ -135,7 +130,7 @@ async def generate_load( f"Expected duration: {duration_sec}s, timeout: {max(duration_sec * 2 + 120, int(duration_sec * 2.5))}s" ) - # Run genai-perf (async) + # Run aiperf (async) start_time = time.time() # More generous timeout for high-load tests - allow 2x duration + 2 minutes buffer timeout = max(duration_sec * 2 + 120, int(duration_sec * 2.5)) @@ -152,7 +147,7 @@ async def generate_load( except asyncio.TimeoutError: proc.kill() await proc.communicate() - logger.error("genai-perf timed out") + logger.error("aiperf timed out") raise RuntimeError("Load generation timed out") end_time = time.time() @@ -160,13 +155,9 @@ async def generate_load( # Persist logs for debugging try: - with open( - os.path.join(artifact_dir, "genai_perf.stdout.log"), "wb" - ) as f: + with open(os.path.join(artifact_dir, "aiperf.stdout.log"), "wb") as f: f.write(stdout or b"") - with open( - os.path.join(artifact_dir, "genai_perf.stderr.log"), "wb" - ) as f: + with open(os.path.join(artifact_dir, "aiperf.stderr.log"), "wb") as f: f.write(stderr or b"") except Exception: pass @@ -174,31 +165,31 @@ async def generate_load( if proc.returncode == 0: logger.info("Load generation completed successfully") logger.info(f"Actual duration: {actual_duration:.2f}s") - results = self._parse_genai_perf_results(artifact_dir) + results = self._parse_aiperf_results(artifact_dir) results.update( { "requested_req_per_sec": req_per_sec, "actual_duration": actual_duration, "target_duration": duration_sec, - "genai_perf_params": params, + "aiperf_params": params, "artifact_dir": artifact_dir, "success": True, } ) return results else: - logger.error(f"genai-perf failed with return code {proc.returncode}") - raise RuntimeError("genai-perf failed; see logs in artifact dir") + logger.error(f"aiperf failed with return code 
{proc.returncode}") + raise RuntimeError("aiperf failed; see logs in artifact dir") except RuntimeError: raise except Exception as e: - logger.error(f"genai-perf execution error: {e}") + logger.error(f"aiperf execution error: {e}") raise - def _parse_genai_perf_results(self, artifact_dir: str) -> Dict[str, Any]: - """Parse genai-perf results from artifact directory.""" + def _parse_aiperf_results(self, artifact_dir: str) -> Dict[str, Any]: + """Parse aiperf results from artifact directory.""" try: - # Look for the profile_export_genai_perf.json file + # Look for the profile_export_aiperf.json file json_files = [f for f in os.listdir(artifact_dir) if f.endswith(".json")] if not json_files: logger.warning("No JSON results found in artifact directory") @@ -207,7 +198,7 @@ def _parse_genai_perf_results(self, artifact_dir: str) -> Dict[str, Any]: # Main results file results_file = None for json_file in json_files: - if "profile_export" in json_file or "genai_perf" in json_file: + if "profile_export" in json_file or "aiperf" in json_file: results_file = os.path.join(artifact_dir, json_file) break @@ -217,40 +208,21 @@ def _parse_genai_perf_results(self, artifact_dir: str) -> Dict[str, Any]: logger.info(f"Parsing results from: {results_file}") with open(results_file, "r") as f: - data = json.load(f) - - results = {} - if "experiments" in data and data["experiments"]: - exp = data["experiments"][0] - if "perf_metrics" in exp: - metrics = exp["perf_metrics"] - results.update( - { - "throughput": metrics.get("throughput", {}).get("avg", 0), - "ttft_mean": metrics.get("ttft", {}).get("avg", 0), - "itl_mean": metrics.get("inter_token_latency", {}).get( - "avg", 0 - ), - "end_to_end_latency_mean": metrics.get( - "request_latency", {} - ).get("avg", 0), - } - ) - if not results and "profile_export_genai_perf" in data: - summary = data.get("summary", {}) - results.update( - { - "throughput": summary.get("throughput", 0), - "ttft_mean": summary.get("time_to_first_token_ms", 0), - 
"itl_mean": summary.get("inter_token_latency_ms", 0), - } - ) - + metrics = json.load(f) + + results = { + "throughput": metrics.get("output_token_throughput", {}).get("avg", 0), + "ttft_mean": metrics.get("time_to_first_token", {}).get("avg", 0), + "itl_mean": metrics.get("inter_token_latency", {}).get("avg", 0), + "end_to_end_latency_mean": metrics.get("request_latency", {}).get( + "avg", 0 + ), + } logger.info(f"Parsed results: {results}") return results except Exception as e: - logger.warning(f"Failed to parse genai-perf results: {e}") + logger.warning(f"Failed to parse aiperf results: {e}") return {} async def run_scaling_test(self) -> Dict[str, Any]: