diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 07a9a0a8b522..0f2800149726 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1217,6 +1217,7 @@ def should_moe_wna16_use_cuda(
 ):
     return (
         current_platform.is_cuda()
+        and not current_platform.is_rocm()
         and bit == 4
         and group_size in [32, 64, 128]
         and num_valid_tokens / num_experts <= 6
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index ab4c3e0740a9..2c30d54de6fd 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -336,6 +336,7 @@ class RocmPlatform(Platform):
         "petit_nvfp4",
         "torchao",
         "bitsandbytes",
+        "moe_wna16",
     ]
 
     @classmethod