vllm-project · vllm-agent · May 16, 2026
@@ -90,9 +90,9 @@ def test_models(example_prompts, model_name) -> None:
 EAGER = [True, False]
 
 SM_100_NVFP4_BACKENDS = [
-    "flashinfer_cudnn",
-    "flashinfer_trtllm",
-    "flashinfer_cutlass",
+    "flashinfer-cudnn",
+    "flashinfer-trtllm",
+    "flashinfer-cutlass",
 ]
 
 
@@ -102,12 +102,12 @@ def test_models(example_prompts, model_name) -> None:
     "backend",
     [
         "emulation",
-        "flashinfer_cudnn",
-        "flashinfer_trtllm",  # the small seq_len ensures trtllm_8x4_layout backend is used
-        "flashinfer_cutlass",
+        "flashinfer-cudnn",
+        "flashinfer-trtllm",  # the small seq_len ensures trtllm_8x4_layout backend is used
+        "flashinfer-cutlass",
     ],
 )
-def test_nvfp4(vllm_runner, model, eager, backend):
+def test_nvfp4(vllm_runner, model, eager, backend, monkeypatch):
     if (
         not current_platform.has_device_capability(100)
         and backend in SM_100_NVFP4_BACKENDS
@@ -116,7 +116,8 @@ def test_nvfp4(vllm_runner, model, eager, backend):
             f"The backend {backend} is not supported with current_platform.has_device_capability(100) == False"
         )
 
-    with vllm_runner(model, enforce_eager=eager, linear_backend=backend) as llm:
+    monkeypatch.setenv("VLLM_NVFP4_GEMM_BACKEND", backend)
+    with vllm_runner(model, enforce_eager=eager) as llm:
         output = llm.generate_greedy(["1 2 3 4 5"], max_tokens=2)
     assert output[0][1] == "1 2 3 4 5 6"
 

@@ -135,24 +135,6 @@ def with_default(
     "emulation",
 ]
 
-LinearBackend = Literal[
-    "auto",
-    "cutlass",
-    "flashinfer_cutlass",
-    "flashinfer_trtllm",
-    "flashinfer_cudnn",
-    "marlin",
-    "triton",
-    "deep_gemm",
-    "torch",
-    "aiter",
-    "machete",
-    "fbgemm",
-    "conch",
-    "exllama",
-    "emulation",
-]
-
 
 @config
 class KernelConfig:
@@ -186,39 +168,13 @@ class KernelConfig:
                    running QDQ on activations.
     """
 
-    linear_backend: LinearBackend = "auto"
-    """Backend for quantized linear layer GEMM kernels. Available options:
-
-    - "auto": Automatically select the best backend based on model and hardware
-    - "cutlass": Use CUTLASS-based kernels
-    - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
-    - "flashinfer_trtllm": Use FlashInfer with TensorRT-LLM kernels
-    - "flashinfer_cudnn": Use FlashInfer with cuDNN kernels
-    - "marlin": Use Marlin kernels
-    - "triton": Use Triton-based kernels
-    - "deep_gemm": Use DeepGEMM kernels
-    - "torch": Use PyTorch native scaled_mm kernels
-    - "aiter": Use AMD AITer kernels (ROCm only)
-    - "machete": Use Machete kernels (mixed-precision)
-    - "fbgemm": Use FBGEMM kernels
-    - "conch": Use Conch mixed-precision kernels
-    - "exllama": Use Exllama mixed-precision kernels
-    - "emulation": Use slow dequant-to-BF16 emulation (for testing only)"""
-
     @field_validator("moe_backend", mode="before")
     @classmethod
     def _normalize_moe_backend(cls, value: Any) -> Any:
         if isinstance(value, str):
             return value.lower().replace("-", "_")
         return value
 
-    @field_validator("linear_backend", mode="before")
-    @classmethod
-    def _normalize_linear_backend(cls, value: Any) -> Any:
-        if isinstance(value, str):
-            return value.lower().replace("-", "_")
-        return value
-
     def compute_hash(self) -> str:
         """
         Produces a hash unique to the pass configuration.

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -71,7 +71,7 @@
     PrefixCachingHashAlgo,
 )
 from vllm.config.device import Device
-from vllm.config.kernel import IrOpPriorityConfig, LinearBackend, MoEBackend
+from vllm.config.kernel import IrOpPriorityConfig, MoEBackend
 from vllm.config.lora import MaxLoRARanks
 from vllm.config.mamba import MambaBackendEnum
 from vllm.config.model import (
@@ -477,7 +477,6 @@ class EngineArgs:
     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
     enable_ep_weight_filter: bool = ParallelConfig.enable_ep_weight_filter
     moe_backend: MoEBackend = KernelConfig.moe_backend
-    linear_backend: LinearBackend = KernelConfig.linear_backend
     all2all_backend: All2AllBackend = ParallelConfig.all2all_backend
     enable_elastic_ep: bool = ParallelConfig.enable_elastic_ep
     enable_dbo: bool = ParallelConfig.enable_dbo
@@ -1413,9 +1412,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         moe_backend_kwargs = kernel_kwargs["moe_backend"]
         moe_backend_kwargs["type"] = lambda s: s.lower().replace("-", "_")
         kernel_group.add_argument("--moe-backend", **moe_backend_kwargs)
-        linear_backend_kwargs = kernel_kwargs["linear_backend"]
-        linear_backend_kwargs["type"] = lambda s: s.lower().replace("-", "_")
-        kernel_group.add_argument("--linear-backend", **linear_backend_kwargs)
 
         # vLLM arguments
         vllm_kwargs = get_kwargs(VllmConfig)
@@ -2089,8 +2085,6 @@ def create_engine_config(
             kernel_config.enable_flashinfer_autotune = self.enable_flashinfer_autotune
         if self.moe_backend != "auto":
             kernel_config.moe_backend = self.moe_backend
-        if self.linear_backend != "auto":
-            kernel_config.linear_backend = self.linear_backend
 
         # Transfer top-level ir_op_priority into KernelConfig.ir_op_priority
         for op_name, op_priority in asdict(self.ir_op_priority).items():