vllm-project · wangxiyuan · Mar 20, 2026 · Mar 19, 2026
@@ -76,12 +76,6 @@ def test_qwen3_moe_distributed_aiv_tp2():
 
 @pytest.mark.asyncio
 async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
-    from vllm_ascend.utils import vllm_version_is
-
-    if not vllm_version_is("0.17.0"):
-        pytest.skip(
-            "EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408",
-        )
     model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
     port = get_open_port()
     compilation_config = json.dumps({"cudagraph_capture_sizes": [8]})

@@ -364,6 +364,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
 
         eplb_config = self.ascend_config.eplb_config
         self.dynamic_eplb = eplb_config.dynamic_eplb
+        self.eplb_enable = self.dynamic_eplb or (eplb_config.expert_map_path is not None)
         if self.dynamic_eplb:
             self.is_eplb_warmuped = False
             self.policy_type = eplb_config.eplb_policy_type
@@ -2554,7 +2555,9 @@ def load_model(self) -> None:
         logger.info("Starting to load model %s...", self.model_config.model)
 
         with DeviceMemoryProfiler() as m:  # noqa: SIM117
-            self.model = get_model(vllm_config=self.vllm_config)
+            if self.eplb_enable:
+                self.vllm_config.parallel_config.enable_eplb = True
+            self.model: nn.Module = get_model(vllm_config=self.vllm_config)
             if self.dynamic_eplb:
                 model_register(self.model)
             if self.drafter: