@@ -63,10 +63,7 @@
 }]
 
 
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("mode", MODES)
-async def test_models(model: str, mode: str) -> None:
+def config():
     port = get_open_port()
     env_dict = {
         "OMP_NUM_THREADS": "10",
@@ -85,6 +82,13 @@ async def test_models(model: str, mode: str) -> None:
         "--speculative-config",
         json.dumps(speculative_config)
     ]
+    return port, env_dict, additional_config, server_args
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("mode", MODES)
+async def test_models(model: str, mode: str) -> None:
+    port, env_dict, additional_config, server_args = config()
     if mode == "single":
         server_args.append("--enforce-eager")
     server_args.extend(["--additional-config", json.dumps(additional_config)])
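Note: the hunks above extract the server setup that previously lived inline in test_models into a module-level config() helper returning (port, env_dict, additional_config, server_args), so sibling test modules can rebuild the same baseline and layer extra flags on top. A minimal, abbreviated sketch of the resulting pattern follows; the real helper also builds speculative_config and the full server_args list, and the get_open_port import path is an assumption based on vLLM's usual test helpers:

import json

import pytest

from vllm.utils import get_open_port  # assumed import path


def config():
    # Build the shared baseline; each call returns fresh objects,
    # so individual tests can mutate them safely.
    port = get_open_port()
    env_dict = {"OMP_NUM_THREADS": "10"}
    additional_config: dict = {}
    server_args = ["--port", str(port)]
    return port, env_dict, additional_config, server_args


@pytest.mark.asyncio
async def test_models() -> None:
    # Rebuild the baseline, then append test-specific flags on top.
    port, env_dict, additional_config, server_args = config()
    server_args.extend(["--additional-config", json.dumps(additional_config)])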
@@ -23,18 +23,7 @@
 
 from tests.e2e.conftest import RemoteOpenAIServer
 from tools.aisbench import run_aisbench_cases
-
-MODELS = [
-    "vllm-ascend/DeepSeek-R1-0528-W8A8",
-]
-
-prompts = [
-    "San Francisco is a",
-]
-
-api_keyword_args = {
-    "max_tokens": 10,
-}
+from .test_deepseek_r1_0528_w8a8 import *
 
 aisbench_cases = [{
     "case_type": "accuracy",
@@ -50,46 +39,23 @@
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", MODELS)
-async def test_models(model: str) -> None:
-    port = get_open_port()
-    env_dict = {
-        "OMP_NUM_THREADS": "100",
-        "OMP_PROC_BIND": "false",
-        "HCCL_BUFFSIZE": "200",
-        "VLLM_RPC_TIMEOUT": "3600000",
-        "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000",
-        "DISABLE_L2_CACHE": "1",
-        "DYNAMIC_EPLB": "true",
-    }
-    speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
-    compilation_config = {
-        "cudagraph_capture_sizes": [24],
-        "cudagraph_mode": "FULL_DECODE_ONLY"
-    }
-    additional_config: dict[str, Any] = {
-        "enable_shared_expert_dp": False,
-        "multistream_overlap_shared_expert": False,
-        "eplb_config": {
-            "dynamic_eplb": True,
-            "expert_heat_collection_interval": 512,
-            "algorithm_execution_interval": 100,
-            "num_redundant_experts": 0
-        }
-    }
-    server_args = [
-        "--quantization", "ascend", "--seed", "1024",
-        "--no-enable-prefix-caching", "--data-parallel-size", "4",
-        "--tensor-parallel-size", "4", "--enable-expert-parallel", "--port",
-        str(port), "--max-model-len", "40000", "--max-num-batched-tokens",
-        "4096", "--max-num-seqs", "12", "--trust-remote-code",
-        "--gpu-memory-utilization", "0.92"
-    ]
-    server_args.extend(
-        ["--speculative-config",
-         json.dumps(speculative_config)])
-    server_args.extend(
-        ["--compilation-config",
-         json.dumps(compilation_config)])
+async def test_models_eplb(model: str) -> None:
+    port, env_dict, additional_config, server_args = config()
+    additional_config.update(
+        {
+            "eplb_config": {
+                "dynamic_eplb": "true",
+                "expert_heat_collection_interval": 1000,
+                "algorithm_execution_interval": 50,
+                "eplb_policy_type": 3,
+            }
+        }
+    )
+    env_dict.update(
+        {
+            "DYNAMIC_EPLB": "true",
+        }
+    )
     server_args.extend(["--additional-config", json.dumps(additional_config)])
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
@@ -113,3 +79,4 @@ async def test_models(model: str) -> None:
         port,
         aisbench_cases,
         server_args=server_args)
+
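Note: additional_config.update() here is a shallow dict update, so whatever "eplb_config" value config() may have set is replaced wholesale rather than deep-merged; any nested key the EPLB run still needs (e.g. num_redundant_experts) has to be restated in the new dict. A small self-contained illustration of that semantics:

# Shallow update: the whole "eplb_config" value is swapped out, not merged.
additional_config = {"eplb_config": {"num_redundant_experts": 0}, "keep": 1}
additional_config.update(
    {"eplb_config": {"dynamic_eplb": "true", "eplb_policy_type": 3}})
assert "num_redundant_experts" not in additional_config["eplb_config"]
assert additional_config["keep"] == 1  # keys outside the update are untouched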
@@ -23,64 +23,30 @@
 
 from tests.e2e.conftest import RemoteOpenAIServer
 from tools.aisbench import run_aisbench_cases
-
-MODELS = [
-    "vllm-ascend/Qwen3-235B-A22B-W8A8",
-]
-
-prompts = [
-    "San Francisco is a",
-]
-
-api_keyword_args = {
-    "max_tokens": 10,
-}
-
-aisbench_cases = [{
-    "case_type": "accuracy",
-    "dataset_path": "vllm-ascend/gsm8k-lite",
-    "request_conf": "vllm_api_general_chat",
-    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
-    "max_out_len": 32768,
-    "batch_size": 32,
-    "top_k": 20,
-    "baseline": 95,
-    "threshold": 5
-}]
+from .test_qwen3_235b_w8a8 import *
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", MODELS)
-async def test_models(model: str) -> None:
-    port = get_open_port()
-    env_dict = {
-        "OMP_NUM_THREADS": "10",
-        "OMP_PROC_BIND": "false",
-        "HCCL_BUFFSIZE": "1024",
-        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
-        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"
-    }
+async def test_models_eplb(model: str) -> None:
+    port, aisbench_cases, env_dict, compilation_config, server_args = config()
+    env_dict.update(
+        {
+            "DYNAMIC_EPLB": "true",
+        }
+    )
     additional_config: dict[str, Any] = {}
-    compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}
-    server_args = [
-        "--quantization", "ascend", "--async-scheduling",
-        "--data-parallel-size", "4", "--tensor-parallel-size", "4",
-        "--enable-expert-parallel", "--port",
-        str(port), "--max-model-len", "40960", "--max-num-batched-tokens",
-        "8192", "--max-num-seqs", "12", "--trust-remote-code",
-        "--gpu-memory-utilization", "0.9"
-    ]
-    env_dict["DYNAMIC_EPLB"] = "true"
     additional_config["eplb_config"] = {
-        "dynamic_eplb": True,
-        "expert_heat_collection_interval": 512,
-        "algorithm_execution_interval": 100,
-        "num_redundant_experts": 0
+        "dynamic_eplb": "true",
+        "expert_heat_collection_interval": 600,
+        "algorithm_execution_interval": 50,
+        "num_redundant_experts": 16,
+        "eplb_policy_type": 2,
     }
-    server_args.extend(["--additional-config", json.dumps(additional_config)])
     server_args.extend(
         ["--compilation-config",
          json.dumps(compilation_config)])
+    server_args.extend(["--additional-config", json.dumps(additional_config)])
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
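Note: both EPLB variants rely on a star import (from .test_deepseek_r1_0528_w8a8 import * above, from .test_qwen3_235b_w8a8 import * here) to pull in MODELS, api_keyword_args, config(), and the base module's other public names. One pytest property of this pattern, sketched below with hypothetical file names: the star import also binds the base module's test function, so pytest collects it a second time in the importing file unless the base module defines __all__ without it (whether that re-collection is desired depends on the suite's intent):

# test_base.py (hypothetical)
def config():
    return ()  # shared setup, abbreviated


async def test_models():  # collected from this file
    ...


# test_base_eplb.py (hypothetical)
from .test_base import *  # binds config, MODELS, ... and also test_models,
                          # which pytest re-collects in this file too


async def test_models_eplb():
    ...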
tests/e2e/nightly/single_node/models/test_qwen3_235b_w8a8.py (21 additions & 18 deletions)
@@ -38,24 +38,19 @@
     "max_tokens": 10,
 }
 
-aisbench_cases = [{
-    "case_type": "accuracy",
-    "dataset_path": "vllm-ascend/gsm8k-lite",
-    "request_conf": "vllm_api_general_chat",
-    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
-    "max_out_len": 32768,
-    "batch_size": 32,
-    "top_k": 20,
-    "baseline": 95,
-    "threshold": 5
-}]
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("mode", MODES)
-async def test_models(model: str, mode: str) -> None:
+def config():
     port = get_open_port()
+    aisbench_cases = [{
+        "case_type": "accuracy",
+        "dataset_path": "vllm-ascend/gsm8k-lite",
+        "request_conf": "vllm_api_general_chat",
+        "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
+        "max_out_len": 32768,
+        "batch_size": 32,
+        "top_k": 20,
+        "baseline": 95,
+        "threshold": 5
+    }]
     env_dict = {
         "OMP_NUM_THREADS": "10",
         "OMP_PROC_BIND": "false",
@@ -72,11 +67,19 @@ async def test_models(model: str, mode: str) -> None:
         "8192", "--max-num-seqs", "12", "--trust-remote-code",
         "--gpu-memory-utilization", "0.9"
     ]
+    return port, aisbench_cases, env_dict, compilation_config, server_args
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("mode", MODES)
+async def test_models(model: str, mode: str) -> None:
+    port, aisbench_cases, env_dict, compilation_config, server_args = config()
     if mode == "piecewise":
         compilation_config["cudagraph_mode"] = "PIECEWISE"
     server_args.extend(
         ["--compilation-config",
-         json.dumps(compilation_config)])
+        json.dumps(compilation_config)])
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
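Note: the mode parametrization only mutates compilation_config before the config dicts are serialized onto the command line with json.dumps. For illustration, and assuming the FULL_DECODE_ONLY baseline carried over from config() as in the sibling Qwen3 test above, the flags the server receives when mode == "piecewise" come out as follows:

import json

compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"}  # assumed baseline from config()
mode = "piecewise"
if mode == "piecewise":
    compilation_config["cudagraph_mode"] = "PIECEWISE"
server_args = ["--compilation-config", json.dumps(compilation_config)]
# server_args == ['--compilation-config', '{"cudagraph_mode": "PIECEWISE"}']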