From 080d293753c1abfc4843440ffa035bafc44062d2 Mon Sep 17 00:00:00 2001 From: Bruce Changlong Xu Date: Fri, 27 Feb 2026 23:52:04 -0700 Subject: [PATCH] [ROCm][Quantization] Enable moe_wna16 quantization on ROCm via Triton path Enable WNA16 (W4A16/W8A16) MoE quantization on ROCm by: - Adding "moe_wna16" to RocmPlatform.supported_quantization - Excluding ROCm from should_moe_wna16_use_cuda() so the Triton fallback kernel (invoke_fused_moe_wna16_triton_kernel) is used instead of the CUDA-only moe_wna16_gemm op. The Triton WNA16 MoE kernel already works on ROCm. Linear layers within moe_wna16 models fall through to non-Marlin AWQ/GPTQ paths since check_marlin_supports_layer returns False on ROCm. This enables popular 4-bit GPTQ/AWQ-quantized MoE models (Mixtral, DeepSeek, etc.) on AMD GPUs. Signed-off-by: Bruce Changlong Xu --- vllm/model_executor/layers/fused_moe/fused_moe.py | 1 + vllm/platforms/rocm.py | 1 + 2 files changed, 2 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 07a9a0a8b522..0f2800149726 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1217,6 +1217,7 @@ def should_moe_wna16_use_cuda( ): return ( current_platform.is_cuda() + and not current_platform.is_rocm() and bit == 4 and group_size in [32, 64, 128] and num_valid_tokens / num_experts <= 6 diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index ab4c3e0740a9..2c30d54de6fd 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -336,6 +336,7 @@ class RocmPlatform(Platform): "petit_nvfp4", "torchao", "bitsandbytes", + "moe_wna16", ] @classmethod