-
Notifications
You must be signed in to change notification settings - Fork 1.2k
[TEST]Add initial prefix cache case for nightly test #3709
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,104 @@ | ||
| # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. | ||
| # Copyright 2023 The vLLM team. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
| # This file is a part of the vllm-ascend project. | ||
| # | ||
| import json | ||
|
|
||
| import pytest | ||
| from vllm.utils import get_open_port | ||
|
|
||
| from tests.e2e.conftest import RemoteOpenAIServer | ||
| from tools.aisbench import get_TTFT, run_aisbench_cases | ||
|
|
||
# Model identifiers that test_models is parametrized over.
MODELS = [
    "vllm-ascend/Qwen3-32B-W8A8",
]
|
|
||
# Case list that test_models passes to run_aisbench_cases right after each
# server start; the value returned for it is not used by the test.
# NOTE(review): presumably a warm-up pass (only 2 output tokens per request,
# baseline 0) so the measured run does not pay first-request costs - confirm.
aisbench_warm_up = [{
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in1024-bs210",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 210,
    "max_out_len": 2,
    "batch_size": 1000,
    "baseline": 0,
    "threshold": 0.97
}]
|
|
||
# Measured case whose TTFT is the reference value (TTFT0) in the final
# assertion of test_models.
# NOTE(review): the dataset name suggests prompts with 0% shared prefix and
# ~3500 input tokens - confirm against the dataset itself.
aisbench_cases0 = [{
    "case_type": "performance",
    "dataset_path": "vllm-ascend/prefix0-in3500-bs210",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 210,
    "max_out_len": 1500,
    "batch_size": 48,
    "baseline": 1,
    "threshold": 0.97
}]
|
|
||
# Measured case whose TTFT (TTFT75) must be below 0.4 * TTFT0 in the final
# assertion of test_models.
# NOTE(review): the dataset name suggests prompts with 75% shared prefix and
# ~3500 input tokens - confirm against the dataset itself.
aisbench_cases75 = [{
    "case_type": "performance",
    "dataset_path": "vllm-ascend/prefix75-in3500-bs210",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 210,
    "max_out_len": 1500,
    "batch_size": 48,
    "baseline": 1,
    "threshold": 0.97
}]
|
|
||
|
|
||
def _measure_ttft(model, port, server_args, env_dict, measured_cases):
    """Start a fresh server, run the warm-up, then return the TTFT of the cases.

    The server is only alive for the duration of this call; it is shut down
    by the context manager before the value is handed back.
    """
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False):
        # The warm-up result is deliberately ignored.
        run_aisbench_cases(model, port, aisbench_warm_up)
        return get_TTFT(run_aisbench_cases(model, port, measured_cases))


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
async def test_models(model: str) -> None:
    """Compare TTFT of the prefix75 dataset against the prefix0 dataset.

    Each dataset is measured against its own freshly started server (same
    port, same arguments). The test passes when the prefix75 TTFT is below
    0.4 times the prefix0 TTFT.
    """
    port = get_open_port()
    launch_env = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
    extra_config = json.dumps({
        "ascend_scheduler_config": {
            "enabled": False
        },
        "enable_weight_nz_layout": True
    })
    server_args = [
        "--quantization", "ascend",
        "--reasoning-parser", "qwen3",
        "--tensor-parallel-size", "4",
        "--port", str(port),
        "--max-model-len", "8192",
        "--max-num-batched-tokens", "8192",
        "--max-num-seqs", "256",
        "--trust-remote-code",
        "--gpu-memory-utilization", "0.9",
        "--additional-config", extra_config,
    ]

    ttft_prefix0 = _measure_ttft(model, port, server_args, launch_env,
                                 aisbench_cases0)
    ttft_prefix75 = _measure_ttft(model, port, server_args, launch_env,
                                  aisbench_cases75)

    assert ttft_prefix75 < 0.4 * ttft_prefix0, f"The TTFT for prefix75 {ttft_prefix75} is not less than 0.4*TTFT for prefix0 {ttft_prefix0}."
    print(
        f"The TTFT for prefix75 {ttft_prefix75} is less than 0.4*TTFT for prefix0 {ttft_prefix0}."
    )
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -205,15 +205,16 @@ def _get_result_performance(self): | |||||||||||||||||||
| f"{dataset_type}dataset.csv") | ||||||||||||||||||||
| result_json_file = os.path.join(result_dir, | ||||||||||||||||||||
| f"{dataset_type}dataset.json") | ||||||||||||||||||||
| self.result_csv = pd.read_csv(result_csv_file) | ||||||||||||||||||||
| self.result_csv = pd.read_csv(result_csv_file, index_col=0) | ||||||||||||||||||||
| print("Getting performance results from file: ", result_json_file) | ||||||||||||||||||||
| with open(result_json_file, 'r', encoding='utf-8') as f: | ||||||||||||||||||||
| self.result_json = json.load(f) | ||||||||||||||||||||
| self.result = [self.result_csv, self.result_json] | ||||||||||||||||||||
|
|
||||||||||||||||||||
def _get_result_accuracy(self):
    """Load the accuracy score of the finished run into ``self.result``.

    The CSV path is taken from the text following "write csv to " in
    ``self.result_line``. The score is the last column of the first data
    row of that file.

    Returns:
        The score as a float. It is also stored on ``self.result``, so
        callers may use either.

    Raises:
        ValueError: if ``self.result_line`` contains no "write csv to "
            marker.
    """
    match = re.search(r'write csv to (.*)', self.result_line)
    if match is None:
        raise ValueError(
            f"No 'write csv to <path>' marker found in: {self.result_line!r}")
    df = pd.read_csv(match.group(1))
    # Explicit positional access: ``df.loc[0][-1]`` depends on the
    # deprecated positional fallback of ``Series.__getitem__``.
    self.result = float(df.iloc[0, -1])
    return self.result
|
|
||||||||||||||||||||
| def _performance_verify(self): | ||||||||||||||||||||
| self._get_result_performance() | ||||||||||||||||||||
|
|
@@ -224,21 +225,30 @@ def _performance_verify(self): | |||||||||||||||||||
| ) >= self.threshold * self.baseline, f"Performance verification failed. The current Output Token Throughput is {output_throughput} token/s, which is not greater than or equal to {self.threshold} * baseline {self.baseline}." | ||||||||||||||||||||
|
|
||||||||||||||||||||
def _accuracy_verify(self):
    """Check that the measured accuracy lies within ``threshold`` of ``baseline``.

    Runs ``_get_result_accuracy`` once and reads the score from
    ``self.result``.

    Raises:
        AssertionError: if the score is outside
            ``[baseline - threshold, baseline + threshold]``.
    """
    # Extract exactly once; the value is published on ``self.result``.
    self._get_result_accuracy()
    acc_value = self.result
    assert self.baseline - self.threshold <= acc_value <= self.baseline + self.threshold, f"Accuracy verification failed. The accuracy of {self.dataset_path} is {acc_value}, which is not within {self.threshold} relative to baseline {self.baseline}."
|
|
||||||||||||||||||||
|
|
||||||||||||||||||||
def run_aisbench_cases(model, port, aisbench_cases):
    """Run each aisbench case once and collect the runners' results.

    Args:
        model: model identifier forwarded to ``AisbenchRunner``.
        port: server port forwarded to ``AisbenchRunner``.
        aisbench_cases: iterable of case configurations, run in order.

    Returns:
        A list with the ``result`` attribute of each case's runner, in
        input order.

    Raises:
        AssertionError: if any case raised. Every case is attempted first
            and each failure is printed before the assertion fires.
    """
    aisbench_results = []
    aisbench_errors = []
    for aisbench_case in aisbench_cases:
        try:
            # One runner per case; its result is captured while the runner
            # context is still open.
            with AisbenchRunner(model, port, aisbench_case) as aisbench:
                aisbench_results.append(aisbench.result)
        except Exception as e:
            # Placeholder entry for the failed case.
            aisbench_results.append("")
            aisbench_errors.append([aisbench_case, e])
            print(e)
    for failed_case, error_info in aisbench_errors:
        print(
            f"The following aisbench case failed: {failed_case}, reason is {error_info}."
        )
    assert not aisbench_errors, "some aisbench cases failed, info were shown above."
    return aisbench_results
|
|
||||||||||||||||||||
|
|
||||||||||||||||||||
def get_TTFT(result):
    """Return the average TTFT from a list of aisbench performance results.

    Args:
        result: list as returned by ``run_aisbench_cases``. ``result[0][0]``
            must be a DataFrame indexed by metric name with an ``"Average"``
            column.

    Returns:
        The leading numeric part of the ``("TTFT", "Average")`` cell as a
        float. Any unit suffix is dropped without conversion.

    Raises:
        ValueError: if the cell does not start with a number.
    """
    raw_value = result[0][0].loc["TTFT", "Average"]
    # Parse the number itself rather than slicing off a fixed-width suffix,
    # so "12.5 ms", "12.5ms", "12.5 s" and plain numeric cells all work.
    match = re.match(r"\s*([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)",
                     str(raw_value))
    if match is None:
        raise ValueError(f"Cannot parse TTFT value: {raw_value!r}")
    return float(match.group(1))
|
Comment on lines
+252
to
+254
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Slicing with `[:-3]` …
Suggested change
|
||||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Starting the `RemoteOpenAIServer` twice is inefficient and significantly slows down the test. The server can be started once to run both test cases, and the warm-up also only needs to be run once. This will make the test execute much faster.