diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md index 9cb6550bc22..b78d3a0efb1 100644 --- a/docs/contributing/ci/CI_5levels.md +++ b/docs/contributing/ci/CI_5levels.md @@ -567,15 +567,15 @@ L4 level testing is a comprehensive quality audit before a version release. It e { "dataset_name": "random", "num_prompts": [10, 20], - "request_rate": [0.5, 1], + "max_concurrency": [1, 4], "random_input_len": 2500, "random_output_len": 900, "ignore_eos": true, "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { - "mean_ttft_ms": 100000, - "mean_audio_ttfp_ms": 100000, - "mean_audio_rtf": 100000 + "mean_ttft_ms": [500, 800], + "mean_audio_ttfp_ms": [2000, 3500], + "mean_audio_rtf": [0.25, 0.35] } } ] @@ -634,15 +634,15 @@ L4 level testing is a comprehensive quality audit before a version release. It e 1. Change the ---xxx-xx-xx running parameters to xxx_xx_xx format and fill them as keys in the JSON file. 2. For boolean variables in the running parameters, modify them to forms such as ignore_eos: true/false and fill them into the JSON file. - 3. Add the baseline parameter to specify the required validation values, ensuring the validation metric names match those in the result.json generated by the benchmark. + 3. Optionally add a `baseline` object (see **Baseline thresholds** below). If you omit `baseline` or leave it empty, the performance test still runs but does not assert metric thresholds from this field. 4. The qps and concurrency modes are mutually exclusive. For detailed explanations, see the table below: | Parameter | Type | Required | Example/Values | Description | | --------------- | ----------- | -------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | num_prompts | int / array | Yes | 10,[10, 20, 30] | Number of requests. Supports single values or arrays. If a single value is used, it will be automatically expanded to match the number of qps or max_concurrency, e.g., [10,10,10]. If an array is used, its length must match the number of qps or max_concurrency. | | request_rate | int / array | No | 1, [1, 2, 3] | Queries per second. Supports single values or arrays. If a single value is used, it will be automatically expanded to match the number of num_prompts, e.g., [1,1,1]. If an array is used, its length must match the number of num_prompts. | - | max_concurrency | int / array | No | 1, [1, 2, 3] | Queries per second. Supports single values or arrays. If a single value is used, it will be automatically expanded to match the number of num_prompts, e.g., [1,1,1]. If an array is used, its length must match the number of num_prompts. | - + | max_concurrency | int / array | No | 1, [1, 2, 3] | Maximum concurrent in-flight requests. Same array / expansion rules as `request_rate` (mutually exclusive with QPS mode). | + | baseline | object | No | see above | Optional per-metric thresholds; keys must match benchmark output fields. Scalar, list (per sweep step), or object (keyed by concurrency or QPS string). | - - ***Run Command***: (Specific commands would depend on the performance testing tool and configuration defined in `nightly.json`). diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py index bc6abbb0202..9e375fa9fec 100644 --- a/tests/dfx/perf/scripts/run_benchmark.py +++ b/tests/dfx/perf/scripts/run_benchmark.py @@ -118,10 +118,6 @@ def benchmark_params(request, omni_server): if param_index >= len(all_params): raise ValueError(f"No benchmark parameters found for index {param_index} in test: {test_name}") - if all_params[param_index]["dataset_name"] == "random-mm": - # TODO: Due to known issues, skip the random-mm dataset. - pytest.skip("Skipping parameter for random-mm dataset.") - current = param_index + 1 total = len(all_params) print(f"\n Running benchmark {current}/{total} for {test_name}") @@ -132,15 +128,70 @@ def benchmark_params(request, omni_server): } -def assert_result(result, params, num_prompt): +def _resolve_baseline_value( + baseline_raw: Any, + *, + sweep_index: int | None, + max_concurrency: Any = None, + request_rate: Any = None, +) -> Any: + """Pick the baseline threshold for this sweep step. + + Supported shapes per metric: + - **Scalar** — same threshold for every concurrency / QPS. + - **List** — aligned with ``max_concurrency`` / ``request_rate`` sweep order; use ``sweep_index``. + - **Dict** — keyed by concurrency or rate, e.g. ``{"1": 500, "4": 800}`` (keys are strings in JSON). + + For dict lookup, ``max_concurrency`` is preferred when both are set (concurrency sweep). + """ + if baseline_raw is None: + # If no baseline is set, the maximum value will be used. + return 100000 + if isinstance(baseline_raw, dict): + if max_concurrency is not None: + for key in (max_concurrency, str(max_concurrency)): + if key in baseline_raw: + return baseline_raw[key] + if request_rate is not None: + for key in (request_rate, str(request_rate)): + if key in baseline_raw: + return baseline_raw[key] + raise KeyError( + f"baseline dict has no key for max_concurrency={max_concurrency!r} " + f"or request_rate={request_rate!r}; keys={list(baseline_raw.keys())!r}" + ) + if isinstance(baseline_raw, (list, tuple)): + return baseline_raw[sweep_index] + return baseline_raw + + +def assert_result( + result, + params, + num_prompt, + *, + sweep_index: int | None = None, + max_concurrency: Any = None, + request_rate: Any = None, +) -> None: assert result["completed"] == num_prompt, "Request failures exist" baseline_data = params.get("baseline", {}) - for metric_name, baseline_value in baseline_data.items(): + for metric_name, baseline_raw in baseline_data.items(): current_value = result[metric_name] + baseline_value = _resolve_baseline_value( + baseline_raw, + sweep_index=sweep_index, + max_concurrency=max_concurrency, + request_rate=request_rate, + ) if "throughput" in metric_name: - assert current_value >= baseline_value, f"{metric_name}: {current_value} < {baseline_value}" + if current_value <= baseline_value: + print( + f"ERROR: Throughput test results were below baseline: {metric_name}: {current_value} > {baseline_value}" + ) else: - assert current_value <= baseline_value, f"{metric_name}: {current_value} > {baseline_value}" + if current_value >= baseline_value: + print(f"ERROR: Test results exceeded baseline: {metric_name}: {current_value} < {baseline_value}") @pytest.mark.parametrize("omni_server", test_params, indirect=True) @@ -195,8 +246,8 @@ def to_list(value, default=None): elif not isinstance(value, bool): args.extend([arg_name, str(value)]) - # QPS test - for qps, num_prompt in zip(qps_list, num_prompt_list): + # QPS test (sweep_index aligns with qps_list / num_prompt_list for this loop) + for i, (qps, num_prompt) in enumerate(zip(qps_list, num_prompt_list)): args = args + ["--request-rate", str(qps), "--num-prompts", str(num_prompt)] result = run_benchmark( args=args, @@ -205,10 +256,16 @@ def to_list(value, default=None): dataset_name=dataset_name, num_prompt=num_prompt, ) - assert_result(result, params, num_prompt=num_prompt) + assert_result( + result, + params, + num_prompt=num_prompt, + sweep_index=i, + request_rate=qps, + ) - # concurrency test - for concurrency, num_prompt in zip(max_concurrency_list, num_prompt_list): + # concurrency test (sweep_index aligns with max_concurrency_list for separate thresholds per concurrency) + for i, (concurrency, num_prompt) in enumerate(zip(max_concurrency_list, num_prompt_list)): args = args + ["--max-concurrency", str(concurrency), "--num-prompts", str(num_prompt), "--request-rate", "inf"] result = run_benchmark( args=args, @@ -217,4 +274,10 @@ def to_list(value, default=None): dataset_name=dataset_name, num_prompt=num_prompt, ) - assert_result(result, params, num_prompt=num_prompt) + assert_result( + result, + params, + num_prompt=num_prompt, + sweep_index=i, + max_concurrency=concurrency, + ) diff --git a/tests/dfx/perf/tests/test.json b/tests/dfx/perf/tests/test.json index 98b2ef25cc0..fe7e3804698 100644 --- a/tests/dfx/perf/tests/test.json +++ b/tests/dfx/perf/tests/test.json @@ -25,9 +25,9 @@ "ignore_eos": true, "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { - "mean_ttft_ms": 100000, - "mean_audio_ttfp_ms": 100000, - "mean_audio_rtf": 100000 + "mean_ttft_ms": [1000, 3000, 5000], + "mean_audio_ttfp_ms": [8000, 10000, 13000], + "mean_audio_rtf": [0.2, 0.25, 0.45] } }, { @@ -39,10 +39,10 @@ 40, 100 ], - "max_concurrency": [ - 1, - 4, - 10 + "request_rate": [ + 0.1, + 0.3, + 0.5 ], "random_input_len": 100, "random_output_len": 100, @@ -62,9 +62,31 @@ }, "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { - "mean_ttft_ms": 100000, - "mean_audio_ttfp_ms": 100000, - "mean_audio_rtf": 100000 + "mean_ttft_ms": [2000, 4000, 6000], + "mean_audio_ttfp_ms": [10000, 13000, 15000], + "mean_audio_rtf": [0.25, 0.35, 0.45] + } + }, + { + "dataset_name": "random", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [ + 4, + 16 + ], + "max_concurrency": [ + 1, + 4 + ], + "random_input_len": 2500, + "random_output_len": 900, + "ignore_eos": true, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_ttft_ms": [1000, 3000], + "mean_audio_ttfp_ms": [30000, 60000], + "mean_audio_rtf": [0.35, 0.45] } } ] @@ -113,9 +135,9 @@ "ignore_eos": true, "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { - "mean_ttft_ms": 100000, - "mean_audio_ttfp_ms": 100000, - "mean_audio_rtf": 100000 + "mean_ttft_ms": [1000, 3000, 5000], + "mean_audio_ttfp_ms": [1000, 3000, 5000], + "mean_audio_rtf": [0.2, 0.35, 0.6] } }, { @@ -127,10 +149,10 @@ 40, 100 ], - "max_concurrency": [ - 1, - 4, - 10 + "request_rate": [ + 0.1, + 0.3, + 0.5 ], "random_input_len": 100, "random_output_len": 100, @@ -150,9 +172,31 @@ }, "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { - "mean_ttft_ms": 100000, - "mean_audio_ttfp_ms": 100000, - "mean_audio_rtf": 100000 + "mean_ttft_ms": [2000, 4000, 6000], + "mean_audio_ttfp_ms": [2000, 4000, 6000], + "mean_audio_rtf": [0.25, 0.4, 0.7] + } + }, + { + "dataset_name": "random", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [ + 4, + 16 + ], + "max_concurrency": [ + 1, + 4 + ], + "random_input_len": 2500, + "random_output_len": 900, + "ignore_eos": true, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_ttft_ms": [1000, 3000], + "mean_audio_ttfp_ms": [1000, 3000], + "mean_audio_rtf": [0.35, 0.45] } } ] @@ -183,8 +227,8 @@ }, "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { - "mean_audio_ttfp_ms": 100000, - "mean_audio_rtf": 100000 + "mean_audio_ttfp_ms": [6000, 6000], + "mean_audio_rtf": [0.3, 0.3] } } ]