98 commits
b313689
fix issues with nvfp4 dense emulation in vllm (squash)
fxmarty-amd Mar 2, 2026
bc6ff39
address comments
fxmarty-amd Mar 2, 2026
14bc668
nvfp4 moe emulation support
fxmarty-amd Mar 2, 2026
a11d131
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Mar 2, 2026
95c6a4a
wip use TritonExperts
fxmarty-amd Mar 2, 2026
5a2cf8c
wip cleanup
fxmarty-amd Mar 2, 2026
0ea8f82
wip cleanup
fxmarty-amd Mar 2, 2026
d99373e
wip cleanup
fxmarty-amd Mar 2, 2026
7a5f2ba
fix activation quantization
fxmarty-amd Mar 2, 2026
457f9df
address comment
fxmarty-amd Mar 2, 2026
86d6316
aot weight dequantization
fxmarty-amd Mar 3, 2026
2cb040b
use emulation_dequantize_weights for quark OCP MX as well
fxmarty-amd Mar 3, 2026
7a67180
tiny fix
fxmarty-amd Mar 3, 2026
01b4dce
enable test on non-blackwell devices
fxmarty-amd Mar 3, 2026
aef916d
Merge branch 'upstream-nvfp4-simulation-support-moe' into upstream-nv…
fxmarty-amd Mar 3, 2026
c4aff81
add test
fxmarty-amd Mar 3, 2026
4710a00
add test
fxmarty-amd Mar 3, 2026
affdda7
support quark dense and moe nvfp4
fxmarty-amd Mar 3, 2026
da111bd
wip cleanup
fxmarty-amd Mar 3, 2026
0cc4207
bug fixes and add test
fxmarty-amd Mar 3, 2026
1d6c770
Merge branch 'main' into upstream-nvfp4-simulation-support-moe
fxmarty-amd Mar 4, 2026
cf189ef
cleanup
fxmarty-amd Mar 4, 2026
b83ea66
aot weight dequantization
fxmarty-amd Mar 4, 2026
b74afa8
use emulation_dequantize_weights for quark OCP MX as well
fxmarty-amd Mar 3, 2026
913824f
tiny fix
fxmarty-amd Mar 3, 2026
c473004
add test
fxmarty-amd Mar 3, 2026
43345ed
add test
fxmarty-amd Mar 3, 2026
6cc2a0d
Merge branch 'upstream-nvfp4-simulation-support-moe' into upstream-nv…
fxmarty-amd Mar 4, 2026
a8c7ee8
fix
fxmarty-amd Mar 4, 2026
ca2c2b8
Merge branch 'upstream-nvfp4-simulation-aot-weight-dequantization' in…
fxmarty-amd Mar 4, 2026
dbc5fb5
fix moe_mk.apply
fxmarty-amd Mar 4, 2026
6db0c7b
Merge branch 'main-upstream' into upstream-nvfp4-simulation-support-rocm
fxmarty-amd Mar 4, 2026
ec1f4b8
address comment
fxmarty-amd Mar 4, 2026
309cefb
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Mar 4, 2026
cca5040
fix
fxmarty-amd Mar 4, 2026
6f08a2d
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Mar 4, 2026
c7cfa6b
Merge branch 'upstream-nvfp4-simulation-aot-weight-dequantization' in…
fxmarty-amd Mar 4, 2026
0094cb9
important note about parallel layers
fxmarty-amd Mar 4, 2026
d440f75
fix wrong inversion
fxmarty-amd Mar 4, 2026
80b6b6c
Merge branch 'upstream-nvfp4-simulation-aot-weight-dequantization' in…
fxmarty-amd Mar 4, 2026
797b856
remove weight scale inversion
fxmarty-amd Mar 4, 2026
dc16065
use min for a13_scale
fxmarty-amd Mar 4, 2026
9007357
Merge branch 'main' into upstream-nvfp4-simulation-support-rocm
fxmarty-amd Mar 5, 2026
e7d72f5
address bowen's comments
fxmarty-amd Mar 6, 2026
e3a8ebd
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Mar 6, 2026
311d47d
linting
fxmarty-amd Mar 6, 2026
74e6eec
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Mar 6, 2026
bf46483
use a single global scale for a2 in MOE, following flashinfer default…
fxmarty-amd Mar 6, 2026
0b47522
do not modify test_blackwell_moe
fxmarty-amd Mar 6, 2026
4a5c5c1
fix test and typo
fxmarty-amd Mar 6, 2026
6ed0611
fix typo
fxmarty-amd Mar 6, 2026
80a37f6
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Mar 6, 2026
35c88a8
simplify test
fxmarty-amd Mar 6, 2026
d495ef7
Merge branch 'upstream-nvfp4-simulation-support-moe' into upstream-nv…
fxmarty-amd Mar 6, 2026
d439e80
remove outdated comment
fxmarty-amd Mar 6, 2026
de79775
Merge branch 'upstream-nvfp4-simulation-support-moe' into upstream-nv…
fxmarty-amd Mar 6, 2026
58b90f1
Merge branch 'upstream-nvfp4-simulation-aot-weight-dequantization' in…
fxmarty-amd Mar 6, 2026
a5da270
revert min change
fxmarty-amd Mar 6, 2026
2d9e65c
Merge branch 'main' into upstream-nvfp4-simulation-support-rocm
fxmarty-amd Mar 24, 2026
c6791f7
address Michael's comments
fxmarty-amd Mar 26, 2026
1fa136e
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Mar 30, 2026
56dd2bf
Merge branch 'main' into upstream-nvfp4-simulation-support-rocm
fxmarty-amd Apr 1, 2026
ad93d2a
linting
fxmarty-amd Apr 1, 2026
0d788d8
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Apr 1, 2026
e8a596f
Update vllm/model_executor/layers/quantization/compressed_tensors/sch…
fxmarty-amd Apr 1, 2026
c6adfe8
Update vllm/model_executor/layers/quantization/compressed_tensors/sch…
fxmarty-amd Apr 1, 2026
e36296a
move unsupported reasons warning in is_backend_supported
fxmarty-amd Apr 1, 2026
33f118f
Merge branch 'upstream-nvfp4-simulation-support-rocm' of https://gith…
fxmarty-amd Apr 1, 2026
44aadca
fix input
fxmarty-amd Apr 1, 2026
3f36269
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Apr 2, 2026
911b316
addres Michael's comments
fxmarty-amd Apr 2, 2026
90a54e3
simulation -> emulation
fxmarty-amd Apr 2, 2026
74b9212
linting
fxmarty-amd Apr 2, 2026
d930b84
Merge branch 'main' into upstream-nvfp4-simulation-support-rocm
fxmarty-amd Apr 2, 2026
24ec4ce
pre-commit passes locally and should not take 50min
fxmarty-amd Apr 2, 2026
58439aa
Merge branch 'upstream-nvfp4-simulation-support-rocm' into upstream-n…
fxmarty-amd Apr 3, 2026
34fba54
Merge branch 'upstream-nvfp4-simulation-support-moe' into upstream-nv…
fxmarty-amd Apr 3, 2026
70b2d5d
remove unnecessary changes
fxmarty-amd Apr 3, 2026
0b6b325
fix
fxmarty-amd Apr 3, 2026
0b2de40
fix
fxmarty-amd Apr 3, 2026
8e61be3
Merge branch 'main' into upstream-nvfp4-simulation-support-moe
fxmarty-amd Apr 8, 2026
f2204ce
refactor OCP MX MOE emulation and address comment about moe_kernel_qu…
fxmarty-amd Apr 8, 2026
ca07f68
move to experts subfolder
fxmarty-amd Apr 8, 2026
223c275
simplifications
fxmarty-amd Apr 9, 2026
d8e9283
linting
fxmarty-amd Apr 9, 2026
1e1d139
Merge branch 'main' into upstream-nvfp4-simulation-support-moe
fxmarty-amd Apr 9, 2026
757c1bc
Merge branch 'upstream-nvfp4-simulation-support-moe' into upstream-nv…
fxmarty-amd Apr 9, 2026
68257c5
remove unnecessary changes
fxmarty-amd Apr 9, 2026
2b74e98
fix issues
fxmarty-amd Apr 9, 2026
896ef66
linting
fxmarty-amd Apr 9, 2026
3663f59
fix quant_dtype
fxmarty-amd Apr 9, 2026
28ef57d
outdated comment
fxmarty-amd Apr 9, 2026
adfb9da
precise comment about maybe_roundup_sizes
fxmarty-amd Apr 13, 2026
9513361
add Qwen3-30B-A3B-NVFP4, Qwen3.5-35B-A3B-MXFP4-TP2 to gfx942 tests
fxmarty-amd Apr 13, 2026
c06e387
Merge branch 'main' into upstream-nvfp4-simulation-support-moe
fxmarty-amd Apr 13, 2026
df32bf3
Merge branch 'upstream-nvfp4-simulation-support-moe' of https://githu…
fxmarty-amd Apr 13, 2026
58c499b
Merge branch 'upstream-nvfp4-simulation-support-moe' into upstream-nv…
fxmarty-amd Apr 13, 2026
dee3b31
update to use the kernel abstraction
fxmarty-amd Apr 13, 2026
2 changes: 2 additions & 0 deletions tests/evals/gsm8k/configs/models-mi3xx.txt
@@ -2,3 +2,5 @@ DeepSeek-R1-TP_MI325.yaml
DeepSeek-R1-DP_MI325.yaml
DeepSeek-V3.2-TP_MI325.yaml
DeepSeek-V3.2-DP_MI325.yaml
Qwen3-30B-A3B-NVFP4.yaml
Qwen3.5-35B-A3B-MXFP4-TP2.yaml
17 changes: 17 additions & 0 deletions tests/models/quantization/test_nvfp4.py
@@ -120,3 +120,20 @@ def test_nvfp4(vllm_runner, model, eager, backend, monkeypatch):
with vllm_runner(model, enforce_eager=eager) as llm:
output = llm.generate_greedy(["1 2 3 4 5"], max_tokens=2)
assert output[0][1] == "1 2 3 4 5 6"


# Qwen3-30B-A3B is 60 GB, versus 210 GB for Llama-4-Scout-17B-16E-Instruct-FP4.
@pytest.mark.parametrize(
"model",
[
"nvidia/Qwen3-30B-A3B-NVFP4",
"RedHatAI/Qwen3-30B-A3B-NVFP4",
],
)
@pytest.mark.parametrize("eager", EAGER)
@pytest.mark.parametrize("backend", ["emulation"])
def test_nvfp4_moe(vllm_runner, model, eager, backend, monkeypatch):
monkeypatch.setenv("VLLM_NVFP4_GEMM_BACKEND", backend)
with vllm_runner(model, enforce_eager=eager, moe_backend="emulation") as llm:
output = llm.generate_greedy(["1 2 3 4 5"], max_tokens=2)
assert output[0][1] == "1 2 3 4 5 6"
59 changes: 57 additions & 2 deletions tests/quantization/test_quark.py
@@ -228,6 +228,8 @@ def get_model_args(
model_name="fxmarty/qwen1.5_moe_a2.7b_chat_w_fp6_e3m2_a_fp6_e3m2",
excepted_value=10.6,
),
# This one raises `RuntimeError: wrong! device_gemm with the specified compilation
# parameters does not support this GEMM problem` on MI355X.
AccuracyTestConfig(
model_name="fxmarty/qwen_1.5-moe-a2.7b-mxfp4", excepted_value=12.4
),
@@ -238,8 +240,13 @@
not QUARK_MXFP4_AVAILABLE,
reason=f"amd-quark>={QUARK_MXFP4_MIN_VERSION} is not available",
)
@pytest.mark.parametrize("config", WIKITEXT_ACCURACY_CONFIGS)
@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize(
"config",
[pytest.param(val, id=f"config:{val}") for val in WIKITEXT_ACCURACY_CONFIGS],
)
@pytest.mark.parametrize(
"tp_size", [pytest.param(val, id=f"tp_size:{val}") for val in [1, 2]]
)
def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int):
device_count = torch.accelerator.device_count()
if device_count < tp_size:
@@ -266,6 +273,54 @@ def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int):
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"


@pytest.mark.skipif(
not QUARK_MXFP4_AVAILABLE,
reason=f"amd-quark>={QUARK_MXFP4_MIN_VERSION} is not available",
)
@pytest.mark.parametrize("tp_size", [1, 2])
def test_nvfp4_wikitext_correctness(tp_size: int):
device_count = torch.accelerator.device_count()
if device_count < tp_size:
pytest.skip(f"This test requires >={tp_size} gpus, got only {device_count}")

# NOTE: expected_value from nvidia/Qwen3-30B-A3B-NVFP4
expected_value = 11.2391

model_name = "amd-quark/Qwen3-30B-A3B-nvfp4-quark"
task = "wikitext"

rtol = 0.25

config = AccuracyTestConfig(
model_name=model_name,
excepted_value=expected_value,
)

model_args = config.get_model_args(
tp_size=tp_size,
kwargs={
# Smaller cudagraph_capture_sizes to speed up the test.
"cudagraph_capture_sizes": [16],
},
)
model_args.pop("add_bos_token")

results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks=task,
batch_size=64,
)

EXPECTED_VALUE = config.excepted_value
measured_value = results["results"][task]["word_perplexity,none"]
assert (
measured_value < EXPECTED_VALUE + rtol
and measured_value > EXPECTED_VALUE - rtol
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"


@pytest.mark.parametrize("config", GSM8K_ACCURACY_CONFIGS)
@pytest.mark.skipif(
not QUARK_MXFP4_AVAILABLE,
6 changes: 5 additions & 1 deletion vllm/config/kernel.py
@@ -115,6 +115,7 @@ def with_default(
"flashinfer_cutedsl",
"marlin",
"aiter",
"emulation",
]


@@ -142,7 +143,10 @@ class KernelConfig:
- "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
- "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)
- "marlin": Use Marlin kernels (weight-only quantization)
- "aiter": Use AMD AITer kernels (ROCm only)"""
- "aiter": Use AMD AITer kernels (ROCm only)
- "emulation": Use BF16/FP16 GEMM, dequantizing weights and running
quantize-dequantize (QDQ) on activations.
"""

@field_validator("moe_backend", mode="before")
@classmethod
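The "emulation" backend added to the `KernelConfig` docstring above relies on fake quantize-dequantize (QDQ) of activations. As a rough illustration only (not vLLM's actual implementation, which operates on torch tensors), NVFP4 QDQ with one scale per block of 16 elements can be sketched in NumPy; the real path additionally quantizes block scales to FP8 E4M3 and applies a global scale, which this sketch omits:

```python
import numpy as np

# Non-negative values representable by FP4 E2M1, the NVFP4 element format.
FP4_VALUES = np.array([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])


def nvfp4_fake_quant(x: np.ndarray, block_size: int = 16) -> np.ndarray:
    """Quantize-dequantize x to NVFP4 with one scale per block of 16 values."""
    orig_shape = x.shape
    x = x.reshape(-1, block_size)
    # Map each block's absolute maximum onto the largest FP4 value, 6.0.
    scale = np.abs(x).max(axis=-1, keepdims=True) / 6.0
    scale = np.where(scale == 0.0, 1.0, scale)
    scaled = x / scale
    # Round every element to the nearest representable FP4 magnitude.
    idx = np.abs(np.abs(scaled)[..., None] - FP4_VALUES).argmin(axis=-1)
    q = FP4_VALUES[idx] * np.sign(scaled)
    # Dequantize back: the result stays in the original floating dtype.
    return (q * scale).reshape(orig_shape)
```

QDQ is idempotent (re-quantizing an already quantized tensor is a no-op), which makes it a cheap way to reproduce quantization error on hardware that has no native NVFP4 kernels.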
164 changes: 164 additions & 0 deletions vllm/model_executor/layers/fused_moe/experts/nvfp4_emulation_moe.py
@@ -0,0 +1,164 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
NVFP4 quantization emulation for MoE.

This file implements NVFP4 MoE emulation for hardware that does not natively
support NVFP4.

Weights are dequantized on the fly during each forward pass, the computation
falls back to `TritonExperts` in BF16, and a fake NVFP4 quantize-dequantize
(QDQ) is applied to the activations `a13` and `a2`.
"""

import torch

import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEQuantConfig,
)
from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (
dequantize_to_dtype,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
QuantKey,
kNvfp4Dynamic,
kNvfp4Static,
)

logger = init_logger(__name__)


class Nvfp4QuantizationEmulationTritonExperts(TritonExperts):
"""
Extension of TritonExperts to support emulated NVFP4 MoE experts.

It may be used for NVFP4 models when the device does not have
native support for this dtype.
"""

def __init__(
self,
moe_config: FusedMoEConfig,
quant_config: FusedMoEQuantConfig,
):
super().__init__(moe_config, quant_config)
logger.warning_once(
"Using Nvfp4QuantizationEmulationTritonExperts MOE backend. This will"
" dequantize weights on the fly and may be slower than native"
" quantized MOE. Consider using a device with native quantization"
" support (e.g. Nvidia Blackwell) for better performance."
)

# `TritonExperts.apply` expects pre-dequantized weights,
# which we handle in `apply` below.
self.w1_scale_val = self.quant_config.w1_scale
self.w2_scale_val = self.quant_config.w2_scale

self.quant_config._w1.scale = None
self.quant_config._w2.scale = None

self.quantization_emulation = True

@property
def quant_dtype(self) -> torch.dtype | str | None:
return "nvfp4"

@property
def expects_unquantized_inputs(self) -> bool:
return True

@staticmethod
def _supports_quant_scheme(
weight_key: QuantKey | None,
activation_key: QuantKey | None,
) -> bool:
return (weight_key, activation_key) == (kNvfp4Static, kNvfp4Dynamic)

def apply(
self,
output: torch.Tensor,
hidden_states: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
activation: MoEActivation,
global_num_experts: int,
expert_map: torch.Tensor | None,
a1q_scale: torch.Tensor | None,
a2_scale: torch.Tensor | None,
workspace13: torch.Tensor,
workspace2: torch.Tensor,
expert_tokens_meta: mk.ExpertTokensMetadata | None,
apply_router_weight_on_input: bool,
):
"""
Apply emulated quantized MoE computation.

This dequantizes the weights on the fly, applies fake NVFP4 quantization
to the input activations, and delegates to `TritonExperts.apply`.
"""
# Dequantize weights if they are quantized
# For NVFP4, weights are packed in uint8 format
# w1 shape: [num_experts, 2*intermediate_size, hidden_size//2]
# w2 shape: [num_experts, hidden_size, intermediate_size//2]
assert w1.dtype == torch.uint8
assert w2.dtype == torch.uint8

# Dequantize w1 from packed NVFP4 to fp16/bf16
w13_global_scale = self.quant_config.g1_alphas

w1_dequant = dequantize_to_dtype(
tensor_fp4=w1,
tensor_sf=self.w1_scale_val,
global_scale=w13_global_scale,
dtype=hidden_states.dtype,
block_size=16,
swizzle=False,
)

# Dequantize w2 from packed NVFP4 to fp16/bf16
w2_global_scale = self.quant_config.g2_alphas

w2_dequant = dequantize_to_dtype(
tensor_fp4=w2,
tensor_sf=self.w2_scale_val,
global_scale=w2_global_scale,
dtype=hidden_states.dtype,
block_size=16,
swizzle=False,
)

hidden_states, _ = moe_kernel_quantize_input(
A=hidden_states,
A_scale=self.quant_config.a1_gscale,
quant_dtype="nvfp4",
per_act_token_quant=False,
quantization_emulation=True,
)

# The a2 activation quantization/dequantization is deferred to
# `moe_kernel_quantize_input` inside `TritonExperts.apply`, via `a2_scale`.
super().apply(
output=output,
hidden_states=hidden_states,
w1=w1_dequant,
w2=w2_dequant,
topk_weights=topk_weights,
topk_ids=topk_ids,
activation=activation,
global_num_experts=global_num_experts,
expert_map=expert_map,
a1q_scale=None,
a2_scale=self.quant_config.a2_gscale,
workspace13=workspace13,
workspace2=workspace2,
expert_tokens_meta=expert_tokens_meta,
apply_router_weight_on_input=apply_router_weight_on_input,
)
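The weight-side counterpart of the activation QDQ above is what `dequantize_to_dtype` does conceptually: unpack two FP4 codes per uint8 byte and apply the per-block and global scales. A minimal NumPy sketch, assuming low-nibble-first packing, unswizzled float block scales, and flat layout (all assumptions; the real helper works on torch tensors with FP8 E4M3 block scales):

```python
import numpy as np

# FP4 E2M1 lookup table indexed by the 4-bit code; the MSB is the sign bit.
FP4_LUT = np.array([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
                    -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0])


def dequantize_nvfp4(packed: np.ndarray, block_scales: np.ndarray,
                     global_scale: float, block_size: int = 16) -> np.ndarray:
    """Unpack uint8-packed FP4 weights and rescale them to float."""
    lo = packed & 0x0F  # first element of each pair (assumed low nibble)
    hi = packed >> 4    # second element of each pair
    codes = np.stack([lo, hi], axis=-1).reshape(*packed.shape[:-1], -1)
    values = FP4_LUT[codes]
    # Each consecutive run of `block_size` elements shares one block scale.
    blocks = values.reshape(*values.shape[:-1], -1, block_size)
    blocks = blocks * block_scales[..., None] * global_scale
    return blocks.reshape(values.shape)
```

In the MoE path above this dequantization runs on every forward pass, which is the overhead the warning in `Nvfp4QuantizationEmulationTritonExperts.__init__` refers to.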