diff --git a/tests/models/quantization/test_nvfp4.py b/tests/models/quantization/test_nvfp4.py index 5ca307a4b191..afbcb1e5aeca 100644 --- a/tests/models/quantization/test_nvfp4.py +++ b/tests/models/quantization/test_nvfp4.py @@ -90,9 +90,9 @@ def test_models(example_prompts, model_name) -> None: EAGER = [True, False] SM_100_NVFP4_BACKENDS = [ - "flashinfer_cudnn", - "flashinfer_trtllm", - "flashinfer_cutlass", + "flashinfer-cudnn", + "flashinfer-trtllm", + "flashinfer-cutlass", ] @@ -102,12 +102,12 @@ def test_models(example_prompts, model_name) -> None: "backend", [ "emulation", - "flashinfer_cudnn", - "flashinfer_trtllm", # the small seq_len ensures trtllm_8x4_layout backend is used - "flashinfer_cutlass", + "flashinfer-cudnn", + "flashinfer-trtllm", # the small seq_len ensures trtllm_8x4_layout backend is used + "flashinfer-cutlass", ], ) -def test_nvfp4(vllm_runner, model, eager, backend): +def test_nvfp4(vllm_runner, model, eager, backend, monkeypatch): if ( not current_platform.has_device_capability(100) and backend in SM_100_NVFP4_BACKENDS @@ -116,7 +116,8 @@ def test_nvfp4(vllm_runner, model, eager, backend): f"The backend {backend} is not supported with current_platform.has_device_capability(100) == False" ) - with vllm_runner(model, enforce_eager=eager, linear_backend=backend) as llm: + monkeypatch.setenv("VLLM_NVFP4_GEMM_BACKEND", backend) + with vllm_runner(model, enforce_eager=eager) as llm: output = llm.generate_greedy(["1 2 3 4 5"], max_tokens=2) assert output[0][1] == "1 2 3 4 5 6" diff --git a/vllm/config/kernel.py b/vllm/config/kernel.py index 66b805383fcb..f450dd9b32c2 100644 --- a/vllm/config/kernel.py +++ b/vllm/config/kernel.py @@ -135,24 +135,6 @@ def with_default( "emulation", ] -LinearBackend = Literal[ - "auto", - "cutlass", - "flashinfer_cutlass", - "flashinfer_trtllm", - "flashinfer_cudnn", - "marlin", - "triton", - "deep_gemm", - "torch", - "aiter", - "machete", - "fbgemm", - "conch", - "exllama", - "emulation", -] - @config class KernelConfig: @@ -186,25 +168,6 @@ class KernelConfig: running QDQ on activations. """ - linear_backend: LinearBackend = "auto" - """Backend for quantized linear layer GEMM kernels. Available options: - - - "auto": Automatically select the best backend based on model and hardware - - "cutlass": Use CUTLASS-based kernels - - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels - - "flashinfer_trtllm": Use FlashInfer with TensorRT-LLM kernels - - "flashinfer_cudnn": Use FlashInfer with cuDNN kernels - - "marlin": Use Marlin kernels - - "triton": Use Triton-based kernels - - "deep_gemm": Use DeepGEMM kernels - - "torch": Use PyTorch native scaled_mm kernels - - "aiter": Use AMD AITer kernels (ROCm only) - - "machete": Use Machete kernels (mixed-precision) - - "fbgemm": Use FBGEMM kernels - - "conch": Use Conch mixed-precision kernels - - "exllama": Use Exllama mixed-precision kernels - - "emulation": Use slow dequant-to-BF16 emulation (for testing only)""" - @field_validator("moe_backend", mode="before") @classmethod def _normalize_moe_backend(cls, value: Any) -> Any: @@ -212,13 +175,6 @@ def _normalize_moe_backend(cls, value: Any) -> Any: return value.lower().replace("-", "_") return value - @field_validator("linear_backend", mode="before") - @classmethod - def _normalize_linear_backend(cls, value: Any) -> Any: - if isinstance(value, str): - return value.lower().replace("-", "_") - return value - def compute_hash(self) -> str: """ Produces a hash unique to the pass configuration. diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d6e3770d4c35..2b5e242bb2f4 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -71,7 +71,7 @@ PrefixCachingHashAlgo, ) from vllm.config.device import Device -from vllm.config.kernel import IrOpPriorityConfig, LinearBackend, MoEBackend +from vllm.config.kernel import IrOpPriorityConfig, MoEBackend from vllm.config.lora import MaxLoRARanks from vllm.config.mamba import MambaBackendEnum from vllm.config.model import ( @@ -477,7 +477,6 @@ class EngineArgs: enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel enable_ep_weight_filter: bool = ParallelConfig.enable_ep_weight_filter moe_backend: MoEBackend = KernelConfig.moe_backend - linear_backend: LinearBackend = KernelConfig.linear_backend all2all_backend: All2AllBackend = ParallelConfig.all2all_backend enable_elastic_ep: bool = ParallelConfig.enable_elastic_ep enable_dbo: bool = ParallelConfig.enable_dbo @@ -1413,9 +1412,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: moe_backend_kwargs = kernel_kwargs["moe_backend"] moe_backend_kwargs["type"] = lambda s: s.lower().replace("-", "_") kernel_group.add_argument("--moe-backend", **moe_backend_kwargs) - linear_backend_kwargs = kernel_kwargs["linear_backend"] - linear_backend_kwargs["type"] = lambda s: s.lower().replace("-", "_") - kernel_group.add_argument("--linear-backend", **linear_backend_kwargs) # vLLM arguments vllm_kwargs = get_kwargs(VllmConfig) @@ -2089,8 +2085,6 @@ def create_engine_config( kernel_config.enable_flashinfer_autotune = self.enable_flashinfer_autotune if self.moe_backend != "auto": kernel_config.moe_backend = self.moe_backend - if self.linear_backend != "auto": - kernel_config.linear_backend = self.linear_backend # Transfer top-level ir_op_priority into KernelConfig.ir_op_priority for op_name, op_priority in asdict(self.ir_op_priority).items(): diff --git a/vllm/model_executor/kernels/linear/__init__.py b/vllm/model_executor/kernels/linear/__init__.py index 277fb28f3c91..83d925637369 100644 --- a/vllm/model_executor/kernels/linear/__init__.py +++ b/vllm/model_executor/kernels/linear/__init__.py @@ -13,7 +13,6 @@ import stability. """ -import warnings from typing import TypeVar import torch @@ -162,96 +161,6 @@ logger = init_logger(__name__) - -def _get_linear_backend() -> str: - """Get the linear_backend setting from the current vllm config.""" - from vllm.config import get_current_vllm_config_or_none - - config = get_current_vllm_config_or_none() - if config is not None: - return config.kernel_config.linear_backend - return "auto" - - -# Mapping from linear_backend name to the set of kernel classes it covers. -# When a user sets --linear-backend , only kernels in the corresponding -# set are considered candidates. If none can implement the layer config, -# an error is raised to respect the user's explicit intent. -_LINEAR_BACKEND_KERNEL_MAP: dict[str, set[type]] = { - "cutlass": { - CutlassInt8ScaledMMLinearKernel, - CutlassFP8ScaledMMLinearKernel, - CutlassFp8BlockScaledMMKernel, - CutlassW4A8LinearKernel, - CutlassNvFp4LinearKernel, - }, - "flashinfer_cutlass": { - FlashInferFP8ScaledMMLinearKernel, - FlashInferFp8DeepGEMMDynamicBlockScaledKernel, - FlashInferCutlassMxfp8LinearKernel, - FlashInferCutlassNvFp4LinearKernel, - FlashInferMxFp4LinearKernel, - }, - "flashinfer_trtllm": { - FlashInferTrtllmNvFp4LinearKernel, - }, - "flashinfer_cudnn": { - FlashInferCudnnNvFp4LinearKernel, - }, - "marlin": { - MarlinFP8ScaledMMLinearKernel, - MarlinLinearKernel, - MarlinMxfp8LinearKernel, - MarlinNvFp4LinearKernel, - MarlinMxFp4LinearKernel, - }, - "triton": { - TritonInt8ScaledMMLinearKernel, - TritonFp8BlockScaledMMKernel, - TritonW4A16LinearKernel, - }, - "deep_gemm": { - DeepGemmFp8BlockScaledMMKernel, - }, - "torch": { - PerTensorTorchFP8ScaledMMLinearKernel, - ChannelWiseTorchFP8ScaledMMLinearKernel, - RowWiseTorchFP8ScaledMMLinearKernel, - }, - "aiter": { - AiterInt8ScaledMMLinearKernel, - AiterFp8BlockScaledMMKernel, - AiterPerTokenFp8ScaledMMLinearKernel, - AiterPreshuffledPerTokenFp8ScaledMMLinearKernel, - }, - "machete": { - MacheteLinearKernel, - }, - "fbgemm": { - FbgemmNvFp4LinearKernel, - }, - "conch": { - ConchLinearKernel, - }, - "exllama": { - ExllamaLinearKernel, - }, - "emulation": { - EmulationMxfp8LinearKernel, - EmulationNvFp4LinearKernel, - }, -} - - -def _filter_kernels_by_backend( - backend: str, - kernels: list[type], -) -> list[type]: - """Filter a kernel priority list to only those matching the backend.""" - backend_kernels = _LINEAR_BACKEND_KERNEL_MAP.get(backend, set()) - return [k for k in kernels if k in backend_kernels] - - # in priority/performance order (when available) _POSSIBLE_INT8_KERNELS: dict[PlatformEnum, list[type[Int8ScaledMMLinearKernel]]] = { PlatformEnum.CPU: [CPUInt8ScaledMMLinearKernel], @@ -466,20 +375,7 @@ def choose_scaled_mm_linear_kernel( scope="global", ) - platform_kernels = possible_kernels[current_platform._enum] - - # Apply --linear-backend filtering when set. - linear_backend = _get_linear_backend() - if linear_backend != "auto": - filtered = _filter_kernels_by_backend(linear_backend, platform_kernels) - if not filtered: - raise ValueError( - f"--linear-backend={linear_backend} was requested but no " - f"'{linear_backend}' kernel exists for this layer type." - ) - platform_kernels = filtered - - for kernel in platform_kernels: + for kernel in possible_kernels[current_platform._enum]: is_supported_and_can_implement, failure_reason = ( is_supported_and_can_implement_kernel(kernel, config, compute_capability) ) @@ -630,21 +526,8 @@ def choose_mp_linear_kernel( if _cc is not None: compute_capability = _cc[0] * 10 + _cc[1] - platform_kernels = _POSSIBLE_KERNELS[current_platform._enum] - - # Apply --linear-backend filtering when set. - linear_backend = _get_linear_backend() - if linear_backend != "auto": - filtered = _filter_kernels_by_backend(linear_backend, platform_kernels) - if not filtered: - raise ValueError( - f"--linear-backend={linear_backend} was requested but no " - f"'{linear_backend}' kernel exists for mixed-precision layers." - ) - platform_kernels = filtered - failure_reasons = [] - for kernel in platform_kernels: + for kernel in _POSSIBLE_KERNELS[current_platform._enum]: if kernel.__name__ in envs.VLLM_DISABLED_KERNELS: failure_reasons.append( f" {kernel.__name__} disabled by environment variable" @@ -681,18 +564,7 @@ def init_mxfp8_linear_kernel() -> Mxfp8LinearKernel: config = Mxfp8LinearLayerConfig() platform = current_platform._enum - possible = list(_POSSIBLE_MXFP8_KERNELS.get(platform, [])) - - # Apply --linear-backend filtering when set. - linear_backend = _get_linear_backend() - if linear_backend != "auto": - filtered = _filter_kernels_by_backend(linear_backend, possible) - if not filtered: - raise ValueError( - f"--linear-backend={linear_backend} was requested but no " - f"'{linear_backend}' kernel exists for MXFP8 layers." - ) - possible = filtered + possible = _POSSIBLE_MXFP8_KERNELS.get(platform, []) failure_reasons = [] for kernel_cls in possible: @@ -724,10 +596,8 @@ def init_mxfp8_linear_kernel() -> Mxfp8LinearKernel: def init_mxfp4_linear_kernel() -> MxFp4LinearKernel: """Select and instantiate the best MXFP4 linear kernel for the current platform.""" - linear_backend = _get_linear_backend() - force_kernel: type[MxFp4LinearKernel] | None = None - if linear_backend == "auto" and envs.VLLM_MXFP4_USE_MARLIN: + if envs.VLLM_MXFP4_USE_MARLIN: force_kernel = MarlinMxFp4LinearKernel if force_kernel is not None: @@ -741,17 +611,7 @@ def init_mxfp4_linear_kernel() -> MxFp4LinearKernel: return force_kernel(MxFp4LinearLayerConfig()) platform = current_platform._enum - possible = list(_POSSIBLE_MXFP4_KERNELS.get(platform, [])) - - # Apply --linear-backend filtering when set. - if linear_backend != "auto": - filtered = _filter_kernels_by_backend(linear_backend, possible) - if not filtered: - raise ValueError( - f"--linear-backend={linear_backend} was requested but no " - f"'{linear_backend}' kernel exists for MXFP4 layers." - ) - possible = filtered + possible = _POSSIBLE_MXFP4_KERNELS.get(platform, []) failure_reasons = [] for kernel_cls in possible: @@ -826,59 +686,26 @@ def init_nvfp4_linear_kernel() -> NvFp4LinearKernel: current platform.""" config = NvFp4LinearLayerConfig() - # VLLM_BATCH_INVARIANT unconditionally forces emulation for deterministic - # execution. It overrides both --linear-backend and the deprecated env - # vars below. + # Env-var overrides. force_kernel: type[NvFp4LinearKernel] | None = None - linear_backend = _get_linear_backend() if envs.VLLM_BATCH_INVARIANT: - if linear_backend not in ("auto", "emulation"): - logger.warning_once( - "VLLM_BATCH_INVARIANT overrides --linear-backend=%s; using " - "the emulation backend for deterministic execution.", - linear_backend, - ) - else: - logger.info_once( - "VLLM_BATCH_INVARIANT forces NVFP4 linear to use the " - "emulation backend for deterministic execution." - ) + logger.info_once( + "VLLM_BATCH_INVARIANT forces NVFP4 linear to use the " + "emulation backend for deterministic execution." + ) force_kernel = EmulationNvFp4LinearKernel - elif linear_backend == "auto": - # Deprecated env-var overrides — only honoured when --linear-backend - # is "auto". Will be removed in v0.21; users should migrate to - # --linear-backend. - if envs.VLLM_USE_FBGEMM: - warnings.warn( - "VLLM_USE_FBGEMM is deprecated and will be removed in " - "v0.21. Use --linear-backend fbgemm instead.", - DeprecationWarning, - stacklevel=2, - ) - force_kernel = FbgemmNvFp4LinearKernel - elif envs.VLLM_USE_NVFP4_CT_EMULATIONS: - warnings.warn( - "VLLM_USE_NVFP4_CT_EMULATIONS is deprecated and will be " - "removed in v0.21. Use --linear-backend emulation instead.", - DeprecationWarning, - stacklevel=2, - ) - force_kernel = EmulationNvFp4LinearKernel - elif envs.VLLM_NVFP4_GEMM_BACKEND is not None: - warnings.warn( - "VLLM_NVFP4_GEMM_BACKEND is deprecated and will be " - "removed in v0.21. Use --linear-backend instead.", - DeprecationWarning, - stacklevel=2, + elif envs.VLLM_USE_FBGEMM: + force_kernel = FbgemmNvFp4LinearKernel + elif envs.VLLM_USE_NVFP4_CT_EMULATIONS: + force_kernel = EmulationNvFp4LinearKernel + elif envs.VLLM_NVFP4_GEMM_BACKEND is not None: + backend_name = envs.VLLM_NVFP4_GEMM_BACKEND + force_kernel = _NVFP4_BACKEND_TO_KERNEL.get(backend_name) + if force_kernel is None: + raise ValueError( + f"Unknown VLLM_NVFP4_GEMM_BACKEND={backend_name!r}. " + f"Valid choices: {list(_NVFP4_BACKEND_TO_KERNEL.keys())}" ) - backend_name = envs.VLLM_NVFP4_GEMM_BACKEND - force_kernel = _NVFP4_BACKEND_TO_KERNEL.get(backend_name) - if force_kernel is None: - raise ValueError( - f"Unknown VLLM_NVFP4_GEMM_BACKEND={backend_name!r}. " - f"Valid choices: " - f"{list(_NVFP4_BACKEND_TO_KERNEL.keys())}" - ) if force_kernel is not None: is_supported, reason = force_kernel.is_supported() @@ -890,19 +717,9 @@ def init_nvfp4_linear_kernel() -> NvFp4LinearKernel: logger.info_once("Using %s for NVFP4 GEMM", force_kernel.__name__) return force_kernel(config) - # Auto-select from registry (or --linear-backend filtered). + # Auto-select from registry. platform = current_platform._enum - possible = list(_POSSIBLE_NVFP4_KERNELS.get(platform, [])) - - # Apply --linear-backend filtering when set. - if linear_backend != "auto": - filtered = _filter_kernels_by_backend(linear_backend, possible) - if not filtered: - raise ValueError( - f"--linear-backend={linear_backend} was requested but no " - f"'{linear_backend}' kernel exists for NVFP4 layers." - ) - possible = filtered + possible = _POSSIBLE_NVFP4_KERNELS.get(platform, []) failure_reasons = [] for kernel_cls in possible: