From 080d293753c1abfc4843440ffa035bafc44062d2 Mon Sep 17 00:00:00 2001
From: Bruce Changlong Xu <brucechanglongxu@gmail.com>
Date: Fri, 27 Feb 2026 23:52:04 -0700
Subject: [PATCH] [ROCm][Quantization] Enable moe_wna16 quantization on ROCm
 via Triton path

Enable WNA16 (W4A16/W8A16) MoE quantization on ROCm by:
- Adding "moe_wna16" to RocmPlatform.supported_quantization
- Excluding ROCm from should_moe_wna16_use_cuda() so the Triton
  fallback kernel (invoke_fused_moe_wna16_triton_kernel) is used
  instead of the CUDA-only moe_wna16_gemm op

The Triton WNA16 MoE kernel already works on ROCm. Linear layers
within moe_wna16 models fall through to non-Marlin AWQ/GPTQ paths
since check_marlin_supports_layer returns False on ROCm.

This enables popular 4-bit GPTQ/AWQ-quantized MoE models (Mixtral,
DeepSeek, etc.) to run on AMD GPUs.

Signed-off-by: Bruce Changlong Xu <brucechanglongxu@gmail.com>
---
 vllm/model_executor/layers/fused_moe/fused_moe.py | 1 +
 vllm/platforms/rocm.py                            | 1 +
 2 files changed, 2 insertions(+)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 07a9a0a8b522..0f2800149726 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1217,6 +1217,7 @@ def should_moe_wna16_use_cuda(
 ):
     return (
         current_platform.is_cuda()
+        and not current_platform.is_rocm()
         and bit == 4
         and group_size in [32, 64, 128]
         and num_valid_tokens / num_experts <= 6
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index ab4c3e0740a9..2c30d54de6fd 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -336,6 +336,7 @@ class RocmPlatform(Platform):
         "petit_nvfp4",
         "torchao",
         "bitsandbytes",
+        "moe_wna16",
     ]
 
     @classmethod