-
Notifications
You must be signed in to change notification settings - Fork 933
[Tests][Qwen3-Omni] Add performance test cases #2011
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -118,10 +118,6 @@ def benchmark_params(request, omni_server): | |
| if param_index >= len(all_params): | ||
| raise ValueError(f"No benchmark parameters found for index {param_index} in test: {test_name}") | ||
|
|
||
| if all_params[param_index]["dataset_name"] == "random-mm": | ||
| # TODO: Due to known issues, skip the random-mm dataset. | ||
| pytest.skip("Skipping parameter for random-mm dataset.") | ||
|
|
||
| current = param_index + 1 | ||
| total = len(all_params) | ||
|
Comment on lines
118
to
122
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Removing the Useful? React with 👍 / 👎. |
||
| print(f"\n Running benchmark {current}/{total} for {test_name}") | ||
|
|
@@ -132,15 +128,70 @@ def benchmark_params(request, omni_server): | |
| } | ||
|
|
||
|
|
||
| def assert_result(result, params, num_prompt): | ||
| def _resolve_baseline_value( | ||
| baseline_raw: Any, | ||
| *, | ||
| sweep_index: int | None, | ||
| max_concurrency: Any = None, | ||
| request_rate: Any = None, | ||
| ) -> Any: | ||
| """Pick the baseline threshold for this sweep step. | ||
|
|
||
| Supported shapes per metric: | ||
| - **Scalar** — same threshold for every concurrency / QPS. | ||
| - **List** — aligned with ``max_concurrency`` / ``request_rate`` sweep order; use ``sweep_index``. | ||
| - **Dict** — keyed by concurrency or rate, e.g. ``{"1": 500, "4": 800}`` (keys are strings in JSON). | ||
|
|
||
| For dict lookup, ``max_concurrency`` is preferred when both are set (concurrency sweep). | ||
| """ | ||
| if baseline_raw is None: | ||
| # If no baseline is set, the maximum value will be used. | ||
| return 100000 | ||
| if isinstance(baseline_raw, dict): | ||
| if max_concurrency is not None: | ||
| for key in (max_concurrency, str(max_concurrency)): | ||
| if key in baseline_raw: | ||
| return baseline_raw[key] | ||
| if request_rate is not None: | ||
| for key in (request_rate, str(request_rate)): | ||
| if key in baseline_raw: | ||
| return baseline_raw[key] | ||
| raise KeyError( | ||
| f"baseline dict has no key for max_concurrency={max_concurrency!r} " | ||
| f"or request_rate={request_rate!r}; keys={list(baseline_raw.keys())!r}" | ||
| ) | ||
| if isinstance(baseline_raw, (list, tuple)): | ||
| return baseline_raw[sweep_index] | ||
| return baseline_raw | ||
|
|
||
|
|
||
| def assert_result( | ||
| result, | ||
| params, | ||
| num_prompt, | ||
| *, | ||
| sweep_index: int | None = None, | ||
| max_concurrency: Any = None, | ||
| request_rate: Any = None, | ||
| ) -> None: | ||
| assert result["completed"] == num_prompt, "Request failures exist" | ||
| baseline_data = params.get("baseline", {}) | ||
| for metric_name, baseline_value in baseline_data.items(): | ||
| for metric_name, baseline_raw in baseline_data.items(): | ||
| current_value = result[metric_name] | ||
| baseline_value = _resolve_baseline_value( | ||
| baseline_raw, | ||
| sweep_index=sweep_index, | ||
| max_concurrency=max_concurrency, | ||
| request_rate=request_rate, | ||
| ) | ||
| if "throughput" in metric_name: | ||
| assert current_value >= baseline_value, f"{metric_name}: {current_value} < {baseline_value}" | ||
| if current_value <= baseline_value: | ||
| print( | ||
| f"ERROR: Throughput test results were below baseline: {metric_name}: {current_value} > {baseline_value}" | ||
| ) | ||
| else: | ||
| assert current_value <= baseline_value, f"{metric_name}: {current_value} > {baseline_value}" | ||
| if current_value >= baseline_value: | ||
| print(f"ERROR: Test results exceeded baseline: {metric_name}: {current_value} < {baseline_value}") | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("omni_server", test_params, indirect=True) | ||
|
|
@@ -195,8 +246,8 @@ def to_list(value, default=None): | |
| elif not isinstance(value, bool): | ||
| args.extend([arg_name, str(value)]) | ||
|
|
||
| # QPS test | ||
| for qps, num_prompt in zip(qps_list, num_prompt_list): | ||
| # QPS test (sweep_index aligns with qps_list / num_prompt_list for this loop) | ||
| for i, (qps, num_prompt) in enumerate(zip(qps_list, num_prompt_list)): | ||
| args = args + ["--request-rate", str(qps), "--num-prompts", str(num_prompt)] | ||
| result = run_benchmark( | ||
| args=args, | ||
|
|
@@ -205,10 +256,16 @@ def to_list(value, default=None): | |
| dataset_name=dataset_name, | ||
| num_prompt=num_prompt, | ||
| ) | ||
| assert_result(result, params, num_prompt=num_prompt) | ||
| assert_result( | ||
| result, | ||
| params, | ||
| num_prompt=num_prompt, | ||
| sweep_index=i, | ||
| request_rate=qps, | ||
| ) | ||
|
|
||
| # concurrency test | ||
| for concurrency, num_prompt in zip(max_concurrency_list, num_prompt_list): | ||
| # concurrency test (sweep_index aligns with max_concurrency_list for separate thresholds per concurrency) | ||
| for i, (concurrency, num_prompt) in enumerate(zip(max_concurrency_list, num_prompt_list)): | ||
| args = args + ["--max-concurrency", str(concurrency), "--num-prompts", str(num_prompt), "--request-rate", "inf"] | ||
| result = run_benchmark( | ||
| args=args, | ||
|
|
@@ -217,4 +274,10 @@ def to_list(value, default=None): | |
| dataset_name=dataset_name, | ||
| num_prompt=num_prompt, | ||
| ) | ||
| assert_result(result, params, num_prompt=num_prompt) | ||
| assert_result( | ||
| result, | ||
| params, | ||
| num_prompt=num_prompt, | ||
| sweep_index=i, | ||
| max_concurrency=concurrency, | ||
| ) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,9 +25,9 @@ | |
| "ignore_eos": true, | ||
| "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", | ||
| "baseline": { | ||
| "mean_ttft_ms": 100000, | ||
| "mean_audio_ttfp_ms": 100000, | ||
| "mean_audio_rtf": 100000 | ||
| "mean_ttft_ms": [1000, 3000, 5000], | ||
| "mean_audio_ttfp_ms": [8000, 10000, 13000], | ||
| "mean_audio_rtf": [0.2, 0.25, 0.45] | ||
| } | ||
| }, | ||
| { | ||
|
|
@@ -39,10 +39,10 @@ | |
| 40, | ||
| 100 | ||
| ], | ||
| "max_concurrency": [ | ||
| 1, | ||
| 4, | ||
| 10 | ||
| "request_rate": [ | ||
| 0.1, | ||
| 0.3, | ||
| 0.5 | ||
| ], | ||
| "random_input_len": 100, | ||
| "random_output_len": 100, | ||
|
|
@@ -62,9 +62,31 @@ | |
| }, | ||
| "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", | ||
| "baseline": { | ||
| "mean_ttft_ms": 100000, | ||
| "mean_audio_ttfp_ms": 100000, | ||
| "mean_audio_rtf": 100000 | ||
| "mean_ttft_ms": [2000, 4000, 6000], | ||
| "mean_audio_ttfp_ms": [10000, 13000, 15000], | ||
| "mean_audio_rtf": [0.25, 0.35, 0.45] | ||
| } | ||
| }, | ||
| { | ||
| "dataset_name": "random", | ||
| "backend": "openai-chat-omni", | ||
| "endpoint": "/v1/chat/completions", | ||
| "num_prompts": [ | ||
| 4, | ||
| 16 | ||
| ], | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why don't we set num_prompts to be the same as the short input/output case, using 10 and 40? Or change short input/output case to 4, 16 to maintain consistency. |
||
| "max_concurrency": [ | ||
| 1, | ||
| 4 | ||
| ], | ||
| "random_input_len": 2500, | ||
| "random_output_len": 900, | ||
| "ignore_eos": true, | ||
| "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", | ||
| "baseline": { | ||
| "mean_ttft_ms": [1000, 3000], | ||
| "mean_audio_ttfp_ms": [30000, 60000], | ||
| "mean_audio_rtf": [0.35, 0.45] | ||
| } | ||
| } | ||
| ] | ||
|
|
@@ -113,9 +135,9 @@ | |
| "ignore_eos": true, | ||
| "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", | ||
| "baseline": { | ||
| "mean_ttft_ms": 100000, | ||
| "mean_audio_ttfp_ms": 100000, | ||
| "mean_audio_rtf": 100000 | ||
| "mean_ttft_ms": [1000, 3000, 5000], | ||
| "mean_audio_ttfp_ms": [1000, 3000, 5000], | ||
| "mean_audio_rtf": [0.2, 0.35, 0.6] | ||
| } | ||
| }, | ||
| { | ||
|
|
@@ -127,10 +149,10 @@ | |
| 40, | ||
| 100 | ||
| ], | ||
| "max_concurrency": [ | ||
| 1, | ||
| 4, | ||
| 10 | ||
| "request_rate": [ | ||
| 0.1, | ||
| 0.3, | ||
| 0.5 | ||
| ], | ||
| "random_input_len": 100, | ||
| "random_output_len": 100, | ||
|
|
@@ -150,9 +172,31 @@ | |
| }, | ||
| "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", | ||
| "baseline": { | ||
| "mean_ttft_ms": 100000, | ||
| "mean_audio_ttfp_ms": 100000, | ||
| "mean_audio_rtf": 100000 | ||
| "mean_ttft_ms": [2000, 4000, 6000], | ||
| "mean_audio_ttfp_ms": [2000, 4000, 6000], | ||
| "mean_audio_rtf": [0.25, 0.4, 0.7] | ||
| } | ||
| }, | ||
| { | ||
| "dataset_name": "random", | ||
| "backend": "openai-chat-omni", | ||
| "endpoint": "/v1/chat/completions", | ||
| "num_prompts": [ | ||
| 4, | ||
| 16 | ||
| ], | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why don't we set num_prompts to be the same as the short input/output case, using 10 and 40? Or change short input/output case to 4, 16 to maintain consistency.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Long input requests take too long to test, and if the settings are the same as for short inputs, it may cause test cases to time out. |
||
| "max_concurrency": [ | ||
| 1, | ||
| 4 | ||
| ], | ||
| "random_input_len": 2500, | ||
| "random_output_len": 900, | ||
| "ignore_eos": true, | ||
| "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", | ||
| "baseline": { | ||
| "mean_ttft_ms": [1000, 3000], | ||
| "mean_audio_ttfp_ms": [1000, 3000], | ||
| "mean_audio_rtf": [0.35, 0.45] | ||
| } | ||
| } | ||
| ] | ||
|
|
@@ -183,8 +227,8 @@ | |
| }, | ||
| "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", | ||
| "baseline": { | ||
| "mean_audio_ttfp_ms": 100000, | ||
| "mean_audio_rtf": 100000 | ||
| "mean_audio_ttfp_ms": [6000, 6000], | ||
| "mean_audio_rtf": [0.3, 0.3] | ||
| } | ||
| } | ||
| ] | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The skip was added with
TODO: Due to known issues. Is the underlying issue actually resolved, or is this just removing the guard? Would be good to link the fix if there is one.