diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py
index 06f5df1db17..8243cca46b6 100644
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -244,7 +244,7 @@ def select_moe_comm_method(num_tokens: int,
     # TODO: drop the EP-size guard when dispatch_ffn_combine supports larger EP sizes
     # TODO: drop speculative method guard when dispatch_gmm_combine_decode supports w16a16
     fused_mc2_enable = envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 and quant_type == "w8a8_dynamic"
-    dispatch_ffn_combine_enable = get_ep_group().world_size <= 16 and (
+    dispatch_ffn_combine_enable = get_ep_group().world_size <= 32 and (
         not is_draft_model) and (not dynamic_eplb)
     if num_tokens <= mc2_tokens_capacity:
         fused_decode_enable = fused_mc2_enable
diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index 2c1fae149fa..bc31abd1179 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -123,7 +123,7 @@
     # Whether to enable fused mc2(`dispatch_gmm_combine_decode`/`dispatch_ffn_combine` operator)
     # 0, or not set: default ALLTOALL and MC2 will be used.
     # 1: ALLTOALL and MC2 might be replaced by `dispatch_ffn_combine` operator.
-    # `dispatch_ffn_combine` can be used only for moe layer with W8A8, EP<=16, non-mtp, non-dynamic-eplb.
+    # `dispatch_ffn_combine` can be used only for moe layer with W8A8, EP<=32, non-mtp, non-dynamic-eplb.
     # 2: MC2 might be replaced by `dispatch_gmm_combine_decode` operator.
     # `dispatch_gmm_combine_decode` can be used only for **decode node** moe layer
     # with W8A8. And MTP layer must be W8A8.