@@ -63,10 +63,7 @@
 }]
 
 
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("mode", MODES)
-async def test_models(model: str, mode: str) -> None:
+def config():
     port = get_open_port()
     env_dict = {
         "OMP_NUM_THREADS": "10",
@@ -85,6 +82,13 @@ async def test_models(model: str, mode: str) -> None:
         "--speculative-config",
         json.dumps(speculative_config)
     ]
+    return port, env_dict, additional_config, server_args
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("mode", MODES)
+async def test_models(model: str, mode: str) -> None:
+    port, env_dict, additional_config, server_args = config()
     if mode == "single":
         server_args.append("--enforce-eager")
     server_args.extend(["--additional-config", json.dumps(additional_config)])
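Note: the hunks above extract the server setup that previously lived inline in test_models into a module-level config() helper returning (port, env_dict, additional_config, server_args), so sibling test modules can rebuild the same baseline and layer extra flags on top. A minimal, abbreviated sketch of the resulting pattern follows; the real helper also builds speculative_config and the full server_args list, and the get_open_port import path is an assumption based on vLLM's usual test helpers:

import json

import pytest

from vllm.utils import get_open_port  # assumed import path


def config():
    # Build the shared baseline; each call returns fresh objects,
    # so individual tests can mutate them safely.
    port = get_open_port()
    env_dict = {"OMP_NUM_THREADS": "10"}
    additional_config: dict = {}
    server_args = ["--port", str(port)]
    return port, env_dict, additional_config, server_args


@pytest.mark.asyncio
async def test_models() -> None:
    # Rebuild the baseline, then append test-specific flags on top.
    port, env_dict, additional_config, server_args = config()
    server_args.extend(["--additional-config", json.dumps(additional_config)])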
@@ -23,18 +23,7 @@
 
 from tests.e2e.conftest import RemoteOpenAIServer
 from tools.aisbench import run_aisbench_cases
-
-MODELS = [
-    "vllm-ascend/DeepSeek-R1-0528-W8A8",
-]
-
-prompts = [
-    "San Francisco is a",
-]
-
-api_keyword_args = {
-    "max_tokens": 10,
-}
+from .test_deepseek_r1_0528_w8a8 import *
 
 aisbench_cases = [{
     "case_type": "accuracy",
@@ -50,46 +39,23 @@
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", MODELS)
-async def test_models(model: str) -> None:
-    port = get_open_port()
-    env_dict = {
-        "OMP_NUM_THREADS": "100",
-        "OMP_PROC_BIND": "false",
-        "HCCL_BUFFSIZE": "200",
-        "VLLM_RPC_TIMEOUT": "3600000",
-        "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000",
-        "DISABLE_L2_CACHE": "1",
-        "DYNAMIC_EPLB": "true",
-    }
-    speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
-    compilation_config = {
-        "cudagraph_capture_sizes": [24],
-        "cudagraph_mode": "FULL_DECODE_ONLY"
-    }
-    additional_config: dict[str, Any] = {
-        "enable_shared_expert_dp": False,
-        "multistream_overlap_shared_expert": False,
-        "eplb_config": {
-            "dynamic_eplb": True,
-            "expert_heat_collection_interval": 512,
-            "algorithm_execution_interval": 100,
-            "num_redundant_experts": 0
-        }
-    }
-    server_args = [
-        "--quantization", "ascend", "--seed", "1024",
-        "--no-enable-prefix-caching", "--data-parallel-size", "4",
-        "--tensor-parallel-size", "4", "--enable-expert-parallel", "--port",
-        str(port), "--max-model-len", "40000", "--max-num-batched-tokens",
-        "4096", "--max-num-seqs", "12", "--trust-remote-code",
-        "--gpu-memory-utilization", "0.92"
-    ]
-    server_args.extend(
-        ["--speculative-config",
-         json.dumps(speculative_config)])
-    server_args.extend(
-        ["--compilation-config",
-         json.dumps(compilation_config)])
+async def test_models_eplb(model: str) -> None:
+    port, env_dict, additional_config, server_args = config()
+    additional_config.update(
+        {
+            "eplb_config": {
+                "dynamic_eplb": "true",
+                "expert_heat_collection_interval": 1000,
+                "algorithm_execution_interval": 50,
+                "eplb_policy_type": 3,
+            }
+        }
+    )
+    env_dict.update(
+        {
+            "DYNAMIC_EPLB": "true",
+        }
+    )
     server_args.extend(["--additional-config", json.dumps(additional_config)])
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
@@ -113,3 +79,4 @@ async def test_models(model: str) -> None:
         port,
         aisbench_cases,
         server_args=server_args)
+
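Note: additional_config.update() here is a shallow dict update, so whatever "eplb_config" value config() may have set is replaced wholesale rather than deep-merged; any nested key the EPLB run still needs (e.g. num_redundant_experts) has to be restated in the new dict. A small self-contained illustration of that semantics:

# Shallow update: the whole "eplb_config" value is swapped out, not merged.
additional_config = {"eplb_config": {"num_redundant_experts": 0}, "keep": 1}
additional_config.update(
    {"eplb_config": {"dynamic_eplb": "true", "eplb_policy_type": 3}})
assert "num_redundant_experts" not in additional_config["eplb_config"]
assert additional_config["keep"] == 1  # keys outside the update are untouched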
@@ -23,64 +23,30 @@
 
 from tests.e2e.conftest import RemoteOpenAIServer
 from tools.aisbench import run_aisbench_cases
-
-MODELS = [
-    "vllm-ascend/Qwen3-235B-A22B-W8A8",
-]
-
-prompts = [
-    "San Francisco is a",
-]
-
-api_keyword_args = {
-    "max_tokens": 10,
-}
-
-aisbench_cases = [{
-    "case_type": "accuracy",
-    "dataset_path": "vllm-ascend/gsm8k-lite",
-    "request_conf": "vllm_api_general_chat",
-    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
-    "max_out_len": 32768,
-    "batch_size": 32,
-    "top_k": 20,
-    "baseline": 95,
-    "threshold": 5
-}]
+from .test_qwen3_235b_w8a8 import *
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", MODELS)
-async def test_models(model: str) -> None:
-    port = get_open_port()
-    env_dict = {
-        "OMP_NUM_THREADS": "10",
-        "OMP_PROC_BIND": "false",
-        "HCCL_BUFFSIZE": "1024",
-        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
-        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
-    }
+async def test_models_eplb(model: str) -> None:
+    port, aisbench_cases, env_dict, compilation_config, server_args = config()
+    env_dict.update(
+        {
+            "DYNAMIC_EPLB": "true",
+        }
+    )
     additional_config: dict[str, Any] = {}
-    compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
-    server_args = [
-        "--quantization", "ascend", "--async-scheduling",
-        "--data-parallel-size", "4", "--tensor-parallel-size", "4",
-        "--enable-expert-parallel", "--port",
-        str(port), "--max-model-len", "40960", "--max-num-batched-tokens",
-        "8192", "--max-num-seqs", "12", "--trust-remote-code",
-        "--gpu-memory-utilization", "0.9"
-    ]
-    env_dict["DYNAMIC_EPLB"] = "true"
     additional_config["eplb_config"] = {
-        "dynamic_eplb": True,
-        "expert_heat_collection_interval": 512,
-        "algorithm_execution_interval": 100,
-        "num_redundant_experts": 0
+        "dynamic_eplb": "true",
+        "expert_heat_collection_interval": 600,
+        "algorithm_execution_interval": 50,
+        "num_redundant_experts": 16,
+        "eplb_policy_type": 2,
     }
-    server_args.extend(["--additional-config", json.dumps(additional_config)])
     server_args.extend(
         ["--compilation-config",
          json.dumps(compilation_config)])
+    server_args.extend(["--additional-config", json.dumps(additional_config)])
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
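Note: both EPLB variants rely on a star import (from .test_deepseek_r1_0528_w8a8 import * above, from .test_qwen3_235b_w8a8 import * here) to pull in MODELS, api_keyword_args, config(), and the base module's other public names. One pytest property of this pattern, sketched below with hypothetical file names: the star import also binds the base module's test function, so pytest collects it a second time in the importing file unless the base module defines __all__ without it (whether that re-collection is desired depends on the suite's intent):

# test_base.py (hypothetical)
def config():
    return ()  # shared setup, abbreviated


async def test_models():  # collected from this file
    ...


# test_base_eplb.py (hypothetical)
from .test_base import *  # binds config, MODELS, ... and also test_models,
                          # which pytest re-collects in this file too


async def test_models_eplb():
    ...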
tests/e2e/nightly/single_node/models/test_qwen3_235b_w8a8.py (21 additions & 18 deletions)
@@ -38,24 +38,19 @@
     "max_tokens": 10,
 }
 
-aisbench_cases = [{
-    "case_type": "accuracy",
-    "dataset_path": "vllm-ascend/gsm8k-lite",
-    "request_conf": "vllm_api_general_chat",
-    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
-    "max_out_len": 32768,
-    "batch_size": 32,
-    "top_k": 20,
-    "baseline": 95,
-    "threshold": 5
-}]
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("mode", MODES)
-async def test_models(model: str, mode: str) -> None:
+def config():
     port = get_open_port()
+    aisbench_cases = [{
+        "case_type": "accuracy",
+        "dataset_path": "vllm-ascend/gsm8k-lite",
+        "request_conf": "vllm_api_general_chat",
+        "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
+        "max_out_len": 32768,
+        "batch_size": 32,
+        "top_k": 20,
+        "baseline": 95,
+        "threshold": 5
+    }]
     env_dict = {
         "OMP_NUM_THREADS": "10",
         "OMP_PROC_BIND": "false",
@@ -72,11 +67,19 @@ async def test_models(model: str, mode: str) -> None:
         "8192", "--max-num-seqs", "12", "--trust-remote-code",
         "--gpu-memory-utilization", "0.9"
     ]
+    return port, aisbench_cases, env_dict, compilation_config, server_args
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("mode", MODES)
+async def test_models(model: str, mode: str) -> None:
+    port, aisbench_cases, env_dict, compilation_config, server_args = config()
     if mode == "piecewise":
         compilation_config["cudagraph_mode"] = "PIECEWISE"
     server_args.extend(
         ["--compilation-config",
-         json.dumps(compilation_config)])
+        json.dumps(compilation_config)])
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
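Note: the mode parametrization only mutates compilation_config before the config dicts are serialized onto the command line with json.dumps. For illustration, and assuming the FULL_DECODE_ONLY baseline carried over from config() as in the sibling Qwen3 test above, the flags the server receives when mode == "piecewise" come out as follows:

import json

compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}  # assumed baseline from config()
mode = "piecewise"
if mode == "piecewise":
    compilation_config["cudagraph_mode"] = "PIECEWISE"
server_args = ["--compilation-config", json.dumps(compilation_config)]
# server_args == ['--compilation-config', '{"cudagraph_mode": "PIECEWISE"}']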