Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions tests/e2e/nightly/features/test_prefix_cache_qwen3_32b_int8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json

import pytest
from vllm.utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import get_TTFT, run_aisbench_cases

# Model identifiers the test below is parametrized over.
MODELS = ["vllm-ascend/Qwen3-32B-W8A8"]


def _aisbench_performance_case(dataset_path, max_out_len, batch_size,
                               baseline):
    """Return one aisbench "performance" case dict.

    Only the four arguments differ between the cases declared in this
    module; every other entry is the same for all of them.  Keys are emitted
    in a fixed order: case_type, dataset_path, request_conf, dataset_conf,
    num_prompts, max_out_len, batch_size, baseline, threshold.
    """
    return {
        "case_type": "performance",
        "dataset_path": dataset_path,
        "request_conf": "vllm_api_stream_chat",
        "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
        "num_prompts": 210,
        "max_out_len": max_out_len,
        "batch_size": batch_size,
        "baseline": baseline,
        "threshold": 0.97,
    }


# Case list run ahead of each measured case list: at most two output tokens
# per request and a baseline of 0.
aisbench_warm_up = [
    _aisbench_performance_case(
        dataset_path="vllm-ascend/GSM8K-in1024-bs210",
        max_out_len=2,
        batch_size=1000,
        baseline=0,
    )
]

# Measured case on the "prefix0" dataset.
# NOTE(review): the dataset name suggests 0% shared prompt prefix - confirm.
aisbench_cases0 = [
    _aisbench_performance_case(
        dataset_path="vllm-ascend/prefix0-in3500-bs210",
        max_out_len=1500,
        batch_size=48,
        baseline=1,
    )
]

# Measured case on the "prefix75" dataset; otherwise identical to the
# prefix0 case.
# NOTE(review): the dataset name suggests 75% shared prompt prefix - confirm.
aisbench_cases75 = [
    _aisbench_performance_case(
        dataset_path="vllm-ascend/prefix75-in3500-bs210",
        max_out_len=1500,
        batch_size=48,
        baseline=1,
    )
]


# End-to-end test: serve `model` through RemoteOpenAIServer, run the aisbench
# "prefix0" and "prefix75" performance cases against it, and record the
# time-to-first-token (TTFT) reported for each so the two can be compared.
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
async def test_models(model: str) -> None:
# One port is picked up front and reused for the server CLI and for aisbench.
port = get_open_port()
# Environment variables handed to the server via `env_dict`.
env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}
# Serialized to JSON below and passed as the value of --additional-config.
additional_config = {
"ascend_scheduler_config": {
"enabled": False
},
"enable_weight_nz_layout": True
}
# Command-line arguments for the served model; note that --port carries the
# same port that is passed as `server_port` below.
server_args = [
"--quantization", "ascend", "--reasoning-parser", "qwen3",
"--tensor-parallel-size", "4", "--port",
str(port), "--max-model-len", "8192", "--max-num-batched-tokens",
"8192", "--max-num-seqs", "256", "--trust-remote-code",
"--gpu-memory-utilization", "0.9", "--additional-config",
json.dumps(additional_config)
]
# First server context: warm-up cases, then the prefix0 cases whose TTFT is
# kept as TTFT0. auto_port=False goes together with the explicit server_port.
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False):
run_aisbench_cases(model, port, aisbench_warm_up)
result = run_aisbench_cases(model, port, aisbench_cases0)
TTFT0 = get_TTFT(result)
# Second server context, opened with exactly the same arguments: warm-up
# cases again, then the prefix75 cases whose TTFT is kept as TTFT75.
# NOTE(review): presumably a separate server is used so the prefix0 run
# cannot influence the prefix75 measurement - confirm before merging the two.
with RemoteOpenAIServer(model,
server_args,
server_port=port,
env_dict=env_dict,
auto_port=False):
run_aisbench_cases(model, port, aisbench_warm_up)
result = run_aisbench_cases(model, port, aisbench_cases75)
TTFT75 = get_TTFT(result)
Comment on lines +85 to +100
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Starting the RemoteOpenAIServer twice is inefficient and significantly slows down the test. The server can be started once to run both test cases, and the warm-up also only needs to be run once. This will make the test execute much faster.

    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False):
        run_aisbench_cases(model, port, aisbench_warm_up)
        result = run_aisbench_cases(model, port, aisbench_cases0)
        TTFT0 = get_TTFT(result)
        result = run_aisbench_cases(model, port, aisbench_cases75)
        TTFT75 = get_TTFT(result)

# Pass criterion: the TTFT measured for prefix75 must be strictly below 40% of
# the TTFT measured for prefix0.
assert TTFT75 < 0.4 * TTFT0, f"The TTFT for prefix75 {TTFT75} is not less than 0.4*TTFT for prefix0 {TTFT0}."
# Reached only when the assertion above held; prints both measurements.
print(
f"The TTFT for prefix75 {TTFT75} is less than 0.4*TTFT for prefix0 {TTFT0}."
)
20 changes: 15 additions & 5 deletions tools/aisbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,15 +205,16 @@ def _get_result_performance(self):
f"{dataset_type}dataset.csv")
result_json_file = os.path.join(result_dir,
f"{dataset_type}dataset.json")
self.result_csv = pd.read_csv(result_csv_file)
self.result_csv = pd.read_csv(result_csv_file, index_col=0)
print("Getting performance results from file: ", result_json_file)
with open(result_json_file, 'r', encoding='utf-8') as f:
self.result_json = json.load(f)
self.result = [self.result_csv, self.result_json]

def _get_result_accuracy(self):
    """Load the accuracy score from the CSV named in ``self.result_line``.

    ``self.result_line`` is searched for the text ``write csv to <path>``.
    That CSV is read with pandas and the last column of its first row is
    converted to ``float``.

    The score is stored in ``self.result`` *and* returned, so callers may
    either use the return value or read ``self.result`` afterwards.

    Returns:
        The accuracy score as a ``float``.

    Raises:
        ValueError: if ``self.result_line`` has no ``write csv to`` marker.
    """
    match = re.search(r'write csv to (.*)', self.result_line)
    if match is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(
            "Could not find 'write csv to <path>' in the aisbench output "
            f"line: {self.result_line!r}")
    acc_file = match.group(1)
    df = pd.read_csv(acc_file)
    # iloc is purely positional (first row, last column); ``df.loc[0][-1]``
    # relied on the deprecated positional fallback of Series.__getitem__.
    self.result = float(df.iloc[0, -1])
    return self.result

def _performance_verify(self):
self._get_result_performance()
Expand All @@ -224,21 +225,30 @@ def _performance_verify(self):
) >= self.threshold * self.baseline, f"Performance verification failed. The current Output Token Throughput is {output_throughput} token/s, which is not greater than or equal to {self.threshold} * baseline {self.baseline}."

def _accuracy_verify(self):
    """Assert that the measured accuracy is within ``threshold`` of ``baseline``.

    The accuracy reader is invoked exactly once.  Its return value is used
    when it returns one; when it returns ``None`` the score is taken from
    ``self.result``, which covers a reader that only stores its result.

    Raises:
        AssertionError: if the accuracy lies outside the closed interval
            ``[baseline - threshold, baseline + threshold]``.
    """
    acc_value = self._get_result_accuracy()
    if acc_value is None:
        # Reader stored the score on the instance instead of returning it.
        acc_value = self.result
    assert self.baseline - self.threshold <= acc_value <= self.baseline + self.threshold, f"Accuracy verification failed. The accuracy of {self.dataset_path} is {acc_value}, which is not within {self.threshold} relative to baseline {self.baseline}."


def run_aisbench_cases(model, port, aisbench_cases):
    """Run every aisbench case once and return their results in order.

    Each case is executed inside an ``AisbenchRunner(model, port, case)``
    context and the runner's ``result`` attribute is collected.  A case that
    raises does not stop the remaining cases: its error is printed, an empty
    string is recorded in its place, and the failure is reported after all
    cases have been attempted.

    Args:
        model: model identifier forwarded to ``AisbenchRunner``.
        port: server port forwarded to ``AisbenchRunner``.
        aisbench_cases: iterable of case descriptions.

    Returns:
        A list with exactly one entry per case, in input order.

    Raises:
        AssertionError: if at least one case failed.
    """
    aisbench_results = []
    aisbench_errors = []
    for aisbench_case in aisbench_cases:
        try:
            with AisbenchRunner(model, port, aisbench_case) as aisbench:
                case_result = aisbench.result
        except Exception as e:
            # Keep going so one broken case cannot hide the others; the
            # placeholder keeps the result list aligned with the case list.
            aisbench_results.append("")
            aisbench_errors.append([aisbench_case, e])
            print(e)
        else:
            # Appended only after the context exited cleanly, so a failure
            # while leaving the context cannot add a second entry for the case.
            aisbench_results.append(case_result)
    for failed_case, error_info in aisbench_errors:
        print(
            f"The following aisbench case failed: {failed_case}, reason is {error_info}."
        )
    assert not aisbench_errors, "some aisbench cases failed, info were shown above."
    return aisbench_results


def get_TTFT(result):
    """Return the average TTFT of the first aisbench case as a float.

    ``result[0][0]`` must be a table supporting ``.loc["TTFT", "Average"]``.
    The cell may be a plain number or a string made of a number followed by
    an optional unit, e.g. ``"123.45 ms"``.  Only the numeric part is
    returned; no unit conversion is performed.

    Args:
        result: the list returned by ``run_aisbench_cases``.

    Returns:
        The numeric TTFT value as a ``float``.

    Raises:
        ValueError: if the cell is a string that is not "<number> [unit]".
    """
    raw_ttft = result[0][0].loc["TTFT", "Average"]
    if not isinstance(raw_ttft, str):
        # The cell was already parsed as a number; nothing to strip.
        return float(raw_ttft)
    # Match the whole cell instead of slicing a fixed-width suffix, so the
    # unit may have any length ("s", "ms", ...) and malformed text is
    # rejected rather than silently truncated.
    match = re.fullmatch(
        r"\s*([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)\s*[A-Za-zµ]*\s*",
        raw_ttft)
    if match is None:
        raise ValueError(f"Could not parse TTFT value from {raw_ttft!r}")
    return float(match.group(1))
Comment on lines +252 to +254
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Slicing with [:-3] is brittle because it assumes the units will always be 3 characters long (e.g., ' ms'). If the units change (e.g., to 's') or are formatted differently, this will break or produce incorrect results. It's more robust to parse the numeric value from the string using a regular expression.

Suggested change
def get_TTFT(result):
TTFT = result[0][0].loc["TTFT", "Average"][:-3]
return float(TTFT)
def get_TTFT(result):
ttft_str = result[0][0].loc["TTFT", "Average"]
match = re.match(r"^\d+(\.\d*)?", ttft_str)
if not match:
raise ValueError(f"Could not parse TTFT value from '{ttft_str}'")
return float(match.group(0))

Loading