vllm-project · vllm-agent · May 13, 2026
@@ -24,7 +24,6 @@
     CompressedTensorsConfig,
     CompressedTensorsLinearMethod,
     CompressedTensorsW4A4Fp4,
-    CompressedTensorsW4A4Mxfp4,
     CompressedTensorsW4A8Fp8,
     CompressedTensorsW4A16Fp4,
     CompressedTensorsW8A8Fp8,
@@ -690,31 +689,3 @@ def check_model(model):
         llm.apply_model(check_model)
         output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
-
-
-@pytest.mark.skipif(
-    not current_platform.is_cuda() or not current_platform.has_device_capability(80),
-    reason="MXFP4 requires ampere or newer",
-)
-def test_compressed_tensors_mxfp4(vllm_runner):
-    model_path = "nm-testing/TinyLlama-1.1B-Chat-v1.0-MXFP4"
-    with vllm_runner(model_path, enforce_eager=True) as llm:
-
-        def check_model(model):
-            layer = model.model.layers[0]
-
-            qkv_proj = layer.self_attn.qkv_proj
-            o_proj = layer.self_attn.o_proj
-            gate_up_proj = layer.mlp.gate_up_proj
-            down_proj = layer.mlp.down_proj
-
-            for proj in (qkv_proj, o_proj, gate_up_proj, down_proj):
-                assert isinstance(proj.quant_method, CompressedTensorsLinearMethod)
-                assert isinstance(proj.scheme, CompressedTensorsW4A4Mxfp4)
-
-                # Verify group size
-                assert proj.scheme.group_size == 32
-
-        llm.apply_model(check_model)
-        output = llm.generate_greedy("Hello my name is", max_tokens=4)
-        assert output
diff --git a/vllm/model_executor/kernels/linear/__init__.py b/vllm/model_executor/kernels/linear/__init__.py
@@ -58,16 +58,6 @@
     XPUW4A8IntLinearKernel,
     XPUwNa16LinearKernel,
 )
-from vllm.model_executor.kernels.linear.mxfp4 import (
-    MxFp4LinearKernel,
-    MxFp4LinearLayerConfig,
-)
-from vllm.model_executor.kernels.linear.mxfp4.flashinfer import (
-    FlashInferMxFp4LinearKernel,
-)
-from vllm.model_executor.kernels.linear.mxfp4.marlin import (
-    MarlinMxFp4LinearKernel,
-)
 from vllm.model_executor.kernels.linear.mxfp8 import (
     Mxfp8LinearKernel,
     Mxfp8LinearLayerConfig,
@@ -286,13 +276,6 @@
     ],
 }
 
-_POSSIBLE_MXFP4_KERNELS: dict[PlatformEnum, list[type[MxFp4LinearKernel]]] = {
-    PlatformEnum.CUDA: [
-        FlashInferMxFp4LinearKernel,
-        MarlinMxFp4LinearKernel,
-    ],
-}
-
 # TODO make all kernels inherit from MMLinearKernel
 # then bound _KernelT only to MMLinearKernel
 _KernelT = TypeVar("_KernelT", bound=ScaledMMLinearKernel | MMLinearKernel)
@@ -587,48 +570,6 @@ def init_mxfp8_linear_kernel() -> Mxfp8LinearKernel:
     )
 
 
-def init_mxfp4_linear_kernel() -> MxFp4LinearKernel:
-    """Select and instantiate the best MXFP4 linear kernel for the
-    current platform."""
-    force_kernel: type[MxFp4LinearKernel] | None = None
-    if envs.VLLM_MXFP4_USE_MARLIN:
-        force_kernel = MarlinMxFp4LinearKernel
-
-    if force_kernel is not None:
-        is_supported, reason = force_kernel.is_supported()
-        if not is_supported:
-            raise ValueError(
-                f"Forced MXFP4 kernel {force_kernel.__name__} is not "
-                f"supported: {reason}"
-            )
-        logger.info_once("Using %s for MXFP4 GEMM", force_kernel.__name__)
-        return force_kernel(MxFp4LinearLayerConfig())
-
-    platform = current_platform._enum
-    possible = _POSSIBLE_MXFP4_KERNELS.get(platform, [])
-
-    failure_reasons = []
-    for kernel_cls in possible:
-        if kernel_cls.__name__ in envs.VLLM_DISABLED_KERNELS:
-            failure_reasons.append(
-                f" {kernel_cls.__name__} disabled by environment variable"
-            )
-            continue
-
-        is_supported, reason = kernel_cls.is_supported()
-        if not is_supported:
-            failure_reasons.append(f"{kernel_cls.__name__}: {reason}")
-            continue
-
-        logger.info_once("Using %s for MXFP4 GEMM", kernel_cls.__name__)
-        return kernel_cls(MxFp4LinearLayerConfig())
-
-    raise ValueError(
-        "Failed to find a kernel that can implement the "
-        "MXFP4 linear layer. Reasons: \n" + "\n".join(failure_reasons)
-    )
-
-
 def init_wfp8_a16_linear_kernel(
     weight_quant_key: QuantKey,
     activation_quant_key: QuantKey,
@@ -789,10 +730,6 @@ def register_linear_kernel(
         if platform not in _POSSIBLE_NVFP4_KERNELS:
             _POSSIBLE_NVFP4_KERNELS[platform] = []
         _POSSIBLE_NVFP4_KERNELS[platform].append(kernel_class)
-    elif kernel_type == "mxfp4":
-        if platform not in _POSSIBLE_MXFP4_KERNELS:
-            _POSSIBLE_MXFP4_KERNELS[platform] = []
-        _POSSIBLE_MXFP4_KERNELS[platform].append(kernel_class)
     else:
         raise ValueError(f"Unrecognized kernel type: {kernel_type}")
 
@@ -840,11 +777,6 @@ def register_linear_kernel(
     "init_mxfp8_linear_kernel",
     "Mxfp8LinearKernel",
     "Mxfp8LinearLayerConfig",
-    "init_mxfp4_linear_kernel",
-    "MxFp4LinearKernel",
-    "MxFp4LinearLayerConfig",
-    "FlashInferMxFp4LinearKernel",
-    "MarlinMxFp4LinearKernel",
     "FlashInferCutlassMxfp8LinearKernel",
     "MarlinMxfp8LinearKernel",
     "XPUMxFp8LinearKernel",

diff --git a/vllm/model_executor/kernels/linear/mxfp4/__init__.py b/vllm/model_executor/kernels/linear/mxfp4/__init__.py
diff --git a/vllm/model_executor/kernels/linear/mxfp4/base.py b/vllm/model_executor/kernels/linear/mxfp4/base.py
diff --git a/vllm/model_executor/kernels/linear/mxfp4/flashinfer.py b/vllm/model_executor/kernels/linear/mxfp4/flashinfer.py
diff --git a/vllm/model_executor/kernels/linear/mxfp4/marlin.py b/vllm/model_executor/kernels/linear/mxfp4/marlin.py
@@ -42,10 +42,10 @@
     CompressedTensors24,
     CompressedTensorsScheme,
     CompressedTensorsW4A4Fp4,
-    CompressedTensorsW4A4Mxfp4,
     CompressedTensorsW4A8Fp8,
     CompressedTensorsW4A8Int,
     CompressedTensorsW4A16Fp4,
+    CompressedTensorsW4A16Mxfp4,
     CompressedTensorsW8A8Fp8,
     CompressedTensorsW8A8Int8,
     CompressedTensorsW8A8Mxfp8,
@@ -625,7 +625,7 @@ def _get_scheme_from_parts(
             return CompressedTensorsW4A16Fp4()
 
         if self._is_mxfp4(weight_quant):
-            return CompressedTensorsW4A4Mxfp4()
+            return CompressedTensorsW4A16Mxfp4()
 
         if self._is_mxfp8(weight_quant):
             return CompressedTensorsW8A8Mxfp8()

@@ -42,7 +42,6 @@ def __init__(self, moe):
         super().__init__(moe)
         self.group_size = 32
         self.mxfp4_backend = Mxfp4MoeBackend.MARLIN
-        # use cutlass if supported, otherwise fallback to marlin for weight-only FP4
         self.use_cutlass_mxfp4 = CutlassExpertsMxfp4._supports_current_device()
         self.experts_cls: type[mk.FusedMoEExperts]
         if self.use_cutlass_mxfp4: