[ROCm][AITER][Bugfix] Disable emulation for MoE #41226
heachary wants to merge 2 commits into vllm-project:main
Conversation
Signed-off-by: Hemanth Acharya <heachary@amd.com>
Code Review
This pull request adds support for the w_mxfp4_a_mxfp4 scheme to the native AITER CK path in Quark MoE, reducing reliance on emulation. A new test case verifies this behavior. However, the logic for setting the emulate flag is incomplete as it does not check if AITER is actually enabled, which could lead to runtime errors. Additionally, the new test should be parametrized to cover more hardware configurations, and the use of enums for scheme names is suggested for better type safety.
```python
# for `w_mxfp4` (w4a16) and `w_mxfp4_a_mxfp4`; mixed schemes like
# `w_mxfp4_a_mxfp6_*` fall through to QuantMethod.NO and raise
# "Unsupported kernel config for moe heuristic dispatch".
_AITER_NATIVE_OCP_MX_SCHEMES = ("w_mxfp4", "w_mxfp4_a_mxfp4")
```
The addition of w_mxfp4_a_mxfp4 to _AITER_NATIVE_OCP_MX_SCHEMES highlights a significant bug in the self.emulate logic on lines 1034-1039. Currently, if current_platform.supports_mx() is True and the scheme is in _AITER_NATIVE_OCP_MX_SCHEMES, self.emulate will be False even if self.use_rocm_aiter_moe is False. This is because the and condition on line 1037 evaluates to True (since self.mxfp4_backend is initialized to NONE at line 992 and not yet updated for w_mxfp4), making the result dependent only on the first part of the expression. This will lead to a crash when apply() attempts to use the AITER path while it is disabled. The logic should be corrected to ensure emulation is used whenever AITER is unavailable and no other native backend exists. Additionally, consider using the OCP_MX_Scheme enum members for better type safety.
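To make the requested fix concrete, here is a minimal sketch of the emulation decision the comment describes. The helper `should_emulate` and its signature are hypothetical, not vLLM's actual code; the real logic lives inside `QuarkOCP_MX_MoEMethod.__init__`:

```python
def should_emulate(
    supports_mx: bool,
    aiter_moe_enabled: bool,
    ocp_mx_scheme: str,
    native_schemes: tuple[str, ...] = ("w_mxfp4", "w_mxfp4_a_mxfp4"),
) -> bool:
    """Emulate unless the platform supports MX, AITER fused MoE is enabled,
    and the scheme has a native AITER CK kernel (hypothetical helper)."""
    return not (
        supports_mx
        and aiter_moe_enabled
        and ocp_mx_scheme in native_schemes
    )


# AITER disabled on MX-capable hardware: emulation must stay on.
assert should_emulate(True, False, "w_mxfp4_a_mxfp4") is True
# Native path available: emulation can be turned off.
assert should_emulate(True, True, "w_mxfp4_a_mxfp4") is False
```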
Suggested change:
```python
_AITER_NATIVE_OCP_MX_SCHEMES = (OCP_MX_Scheme.w_mxfp4,
                                OCP_MX_Scheme.w_mxfp4_a_mxfp4)
```

The new unit test added in this PR:
```python
def test_moe_emulation_w_mxfp4_a_mxfp4():
    """w_mxfp4_a_mxfp4 on gfx950 (supports_mx + aiter) must not fall back
    to emulation — it should use the native AITER CK path."""
    from unittest.mock import MagicMock, patch

    import torch

    from vllm.model_executor.layers.fused_moe import FusedMoEConfig, MoEActivation
    from vllm.model_executor.layers.fused_moe.config import (
        FusedMoEParallelConfig,
        RoutingMethodType,
    )
    from vllm.model_executor.layers.quantization.quark.quark_moe import (
        QuarkOCP_MX_MoEMethod,
    )

    weight_config = {"dtype": "fp4", "qscheme": "per_group", "is_dynamic": False}
    input_config = {"dtype": "fp4", "qscheme": "per_group", "is_dynamic": True}

    parallel_config = FusedMoEParallelConfig(
        tp_size=1,
        pcp_size=1,
        dp_size=1,
        ep_size=1,
        tp_rank=0,
        pcp_rank=0,
        dp_rank=0,
        ep_rank=0,
        sp_size=1,
        use_ep=False,
        all2all_backend="",
        enable_eplb=False,
    )
    moe = FusedMoEConfig(
        num_experts=8,
        experts_per_token=2,
        hidden_dim=256,
        intermediate_size_per_partition=512,
        num_local_experts=8,
        num_logical_experts=8,
        activation=MoEActivation.SILU,
        device="gpu",
        routing_method=RoutingMethodType.Default,
        moe_parallel_config=parallel_config,
        in_dtype=torch.bfloat16,
    )

    mock_vllm_config = MagicMock()

    with (
        patch(
            "vllm.model_executor.layers.quantization.quark.quark_moe.current_platform"
        ) as mock_platform,
        patch(
            "vllm.model_executor.layers.quantization.quark.quark_moe.rocm_aiter_ops"
        ) as mock_aiter,
        patch(
            "vllm.model_executor.layers.quantization.quark.quark_moe"
            ".get_current_vllm_config",
            return_value=mock_vllm_config,
        ),
    ):
        mock_platform.supports_mx.return_value = True
        mock_aiter.is_fused_moe_enabled.return_value = True

        method = QuarkOCP_MX_MoEMethod(
            weight_config=weight_config,
            input_config=input_config,
            moe=moe,
        )

        assert method.ocp_mx_scheme == "w_mxfp4_a_mxfp4"
        assert method.emulate is False, (
            "w_mxfp4_a_mxfp4 on gfx950 (supports_mx + aiter) must not emulate"
        )
```
The unit test should be parametrized to cover cases where MX support or AITER is disabled. This would have caught the logic bug in self.emulate where emulation is incorrectly disabled when AITER is unavailable on MX-supporting hardware.
Suggested change:
```python
import pytest
import torch


@pytest.mark.parametrize("supports_mx", [True, False])
@pytest.mark.parametrize("is_aiter_enabled", [True, False])
def test_moe_emulation_w_mxfp4_a_mxfp4(supports_mx, is_aiter_enabled):
    """w_mxfp4_a_mxfp4 must use the native AITER CK path (emulate=False)
    only when both MX support and AITER are available; in all other cases
    it must fall back to emulation."""
    from unittest.mock import MagicMock, patch

    from vllm.model_executor.layers.fused_moe import FusedMoEConfig, MoEActivation
    from vllm.model_executor.layers.fused_moe.config import (
        FusedMoEParallelConfig,
        RoutingMethodType,
    )
    from vllm.model_executor.layers.quantization.quark.quark_moe import (
        QuarkOCP_MX_MoEMethod,
    )

    weight_config = {"dtype": "fp4", "qscheme": "per_group", "is_dynamic": False}
    input_config = {"dtype": "fp4", "qscheme": "per_group", "is_dynamic": True}

    parallel_config = FusedMoEParallelConfig(
        tp_size=1,
        pcp_size=1,
        dp_size=1,
        ep_size=1,
        tp_rank=0,
        pcp_rank=0,
        dp_rank=0,
        ep_rank=0,
        sp_size=1,
        use_ep=False,
        all2all_backend="",
        enable_eplb=False,
    )
    moe = FusedMoEConfig(
        num_experts=8,
        experts_per_token=2,
        hidden_dim=256,
        intermediate_size_per_partition=512,
        num_local_experts=8,
        num_logical_experts=8,
        activation=MoEActivation.SILU,
        device="gpu",
        routing_method=RoutingMethodType.Default,
        moe_parallel_config=parallel_config,
        in_dtype=torch.bfloat16,
    )

    mock_vllm_config = MagicMock()

    with (
        patch(
            "vllm.model_executor.layers.quantization.quark.quark_moe.current_platform"
        ) as mock_platform,
        patch(
            "vllm.model_executor.layers.quantization.quark.quark_moe.rocm_aiter_ops"
        ) as mock_aiter,
        patch(
            "vllm.model_executor.layers.quantization.quark.quark_moe"
            ".get_current_vllm_config",
            return_value=mock_vllm_config,
        ),
    ):
        mock_platform.supports_mx.return_value = supports_mx
        mock_aiter.is_fused_moe_enabled.return_value = is_aiter_enabled

        method = QuarkOCP_MX_MoEMethod(
            weight_config=weight_config,
            input_config=input_config,
            moe=moe,
        )

        assert method.ocp_mx_scheme == "w_mxfp4_a_mxfp4"
        # Emulation should be off only when both MX is supported and AITER
        # is enabled.
        expected_emulate = not (supports_mx and is_aiter_enabled)
        assert method.emulate == expected_emulate, (
            f"Emulation mismatch for supports_mx={supports_mx}, "
            f"is_aiter_enabled={is_aiter_enabled}"
        )
```
duplicate of #41175
Purpose
PR #39801 introduced a regression for models using the w_mxfp4_a_mxfp4 scheme (e.g. Kimi-K2-Thinking-MXFP4). The _AITER_NATIVE_OCP_MX_SCHEMES tuple only included w_mxfp4, so w_mxfp4_a_mxfp4 was not recognized as a natively supported scheme and fell back to emulation even though AITER's CK MoE kernel supports it. This PR adds w_mxfp4_a_mxfp4 to the supported set, restoring native execution.
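A minimal illustration of the regression mechanism described above; the tuple values come from the diff, while the `print` lines are illustrative only:

```python
# Before this PR: only the weight-only scheme was listed as native,
# so the membership check failed for w_mxfp4_a_mxfp4 and forced emulation.
_AITER_NATIVE_OCP_MX_SCHEMES = ("w_mxfp4",)
print("w_mxfp4_a_mxfp4" in _AITER_NATIVE_OCP_MX_SCHEMES)  # False -> emulation fallback

# After this PR: the weight+activation MXFP4 scheme is recognized as native.
_AITER_NATIVE_OCP_MX_SCHEMES = ("w_mxfp4", "w_mxfp4_a_mxfp4")
print("w_mxfp4_a_mxfp4" in _AITER_NATIVE_OCP_MX_SCHEMES)  # True -> native AITER CK path
```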
Test Plan
- [x] Added a unit test to make sure `emulate` is set correctly

Test Result