vllm-project · juhi10071998 · May 12, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
@@ -7,6 +7,7 @@
 
 import os
 from typing import NoReturn
+from unittest.mock import MagicMock, patch
 
 import pytest
 import torch
@@ -285,3 +286,61 @@ def test_modelopt_nvfp4_config_dispatches_w4a16_method():
     assert config.LinearMethodCls is ModelOptNvFp4W4A16LinearMethod
     assert config.LinearMethodCls is not ModelOptNvFp4LinearMethod
     assert config.quant_method == "W4A16_NVFP4"
+
+
+@pytest.mark.parametrize(
+    "quant_method, expected_use_a16, act_key_is_none",
+    [
+        ("NVFP4", False, False),  # W4A4 default
+        ("W4A16_NVFP4", True, True),  # native W4A16 ckpt
+    ],
+)
+def test_modelopt_nvfp4_moe_dispatches_to_marlin_when_w4a16(
+    quant_method, expected_use_a16, act_key_is_none
+):
+    """Check the ``activation_key`` passed to ``select_nvfp4_moe_backend``.
+
+    ``W4A16_NVFP4`` ckpts must pass ``None`` (plain ``NVFP4``: ``kNvfp4Dynamic``).
+    ``None`` filters out every W4A4 backend (their ``_supports_quant_scheme``
+    requires ``(kNvfp4Static, kNvfp4Dynamic)`` exactly); Marlin only checks
+    ``weight_key``. A regression would silently send a W4A16 ckpt down the
+    cutlass W4A4 path. The selector is mocked: only its arguments are checked.
+    """
+    from vllm.model_executor.layers.quantization.modelopt import (
+        ModelOptNvFp4Config,
+        ModelOptNvFp4FusedMoE,
+    )
+    from vllm.model_executor.layers.quantization.utils.quant_utils import (
+        kNvfp4Dynamic,
+        kNvfp4Static,
+    )
+
+    config = ModelOptNvFp4Config(
+        quant_method=quant_method,
+        is_checkpoint_nvfp4_serialized=True,
+        kv_cache_quant_algo=None,
+        exclude_modules=[],
+        group_size=16,
+    )
+
+    mock_select = MagicMock(return_value=(MagicMock(), MagicMock()))
+    with (
+        patch(
+            "vllm.model_executor.layers.quantization.modelopt.select_nvfp4_moe_backend",
+            mock_select,
+        ),
+        patch(
+            "vllm.model_executor.layers.quantization.modelopt."
+            "is_global_sf_supported_for_nvfp4_backend",
+            return_value=False,
+        ),
+    ):
+        moe = ModelOptNvFp4FusedMoE(config, MagicMock())
+
+    assert moe.use_a16 is expected_use_a16
+    _, kwargs = mock_select.call_args  # (positional args, keyword args)
+    assert kwargs["weight_key"] is kNvfp4Static
+    if act_key_is_none:
+        assert kwargs["activation_key"] is None
+    else:
+        assert kwargs["activation_key"] is kNvfp4Dynamic
@@ -1391,11 +1391,17 @@ def __init__(
     ) -> None:
         super().__init__(moe_config)
         self.quant_config = quant_config
-        # Select experts implementation.
+        # W4A16 mode fires for W4A16_NVFP4 on-disk checkpoints. With
+        # activation_key=None every W4A4 backend's _supports_quant_scheme
+        # rejects itself (they all require (kNvfp4Static, kNvfp4Dynamic)
+        # exactly); only Marlin survives. Marlin's MoE path drops
+        # activation scales in convert_to_nvfp4_moe_kernel_format, so no
+        # other change is needed.
+        self.use_a16 = quant_config.quant_method == "W4A16_NVFP4"
         self.nvfp4_backend, self.experts_cls = select_nvfp4_moe_backend(
             config=self.moe,
             weight_key=kNvfp4Static,
-            activation_key=kNvfp4Dynamic,
+            activation_key=None if self.use_a16 else kNvfp4Dynamic,
         )
 
         self.use_global_sf = is_global_sf_supported_for_nvfp4_backend(