Skip to content

Commit 9ff7384

Browse files
committed
fix
1 parent 6405c37 commit 9ff7384

File tree

4 files changed

+7
-11
lines changed

4 files changed

+7
-11
lines changed

vllm_ascend/ascend_forward_context.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,9 @@ def set_ascend_forward_context(
             batch_descriptor=batch_descriptor,
     ):
         forward_context = get_forward_context()
+        if moe_comm_method == "allgather" and with_prefill:
+            moe_comm_method = "naivemulticast"
+
         forward_context.moe_comm_method_name = moe_comm_method + "commimpl"
         forward_context.with_prefill = with_prefill
         ep_size = (get_ep_group().world_size if

vllm_ascend/ops/moe/fused_moe_prepare_and_finalize.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,8 @@ def prepare(self,
                 rm_router_logits: bool = False,
                 replace_allreduce: bool = False,
                 gate=None) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        self.enable_shared_expert_dp = enable_shared_expert_dp
+
         if self.moe_config.dp_size > 1:
             self.cu_tokens_across_dp_cpu = get_forward_context(
             ).dp_metadata.cu_tokens_across_dp_cpu

vllm_ascend/ops/moe/moe_comm_method.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -90,16 +90,8 @@ def fused_experts(
             # For load balance
             log2phy: torch.Tensor = None,
             global_redundant_expert_num: int = 0,
-            fusion_mlp: bool = False,
             need_trans: bool = False) -> torch.Tensor:
         # Check constraints
-        assert hidden_states.shape[1] == w1.shape[1], (
-            f"Hidden size mismatch {hidden_states.shape[1]} != {w1.shape[1]}")
-        assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
-        assert hidden_states.is_contiguous(
-        ), "Hidden_states must be contiguous"
-        assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
-        assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
         assert hidden_states.dtype in [
             torch.float32, torch.float16, torch.bfloat16
         ]
@@ -137,7 +129,7 @@ def fused_experts(
                 w2_scale_bias=w2_scale_bias,
                 with_quant=use_int8_w8a8
                 or use_int4_w4a8,
-                fusion=fusion_mlp,
+                fusion=use_int8_w8a8,
                 need_trans=need_trans)

         hidden_states[:] = self.token_dispatcher.token_combine(

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,8 +265,7 @@ def apply(
             global_redundant_expert_num=global_redundant_expert_num,
             shared_experts=shared_experts,
             shared_gate_up=shared_gate_up,
-            shared_dequant_scale=shared_dequant_scale,
-            fusion_mlp=True
+            shared_dequant_scale=shared_dequant_scale
         )

         # return unified_fused_experts_eager(

0 commit comments

Comments (0)