Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions tests/models/quantization/test_nvfp4.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,9 @@ def test_models(example_prompts, model_name) -> None:
EAGER = [True, False]

SM_100_NVFP4_BACKENDS = [
"flashinfer_cudnn",
"flashinfer_trtllm",
"flashinfer_cutlass",
"flashinfer-cudnn",
"flashinfer-trtllm",
"flashinfer-cutlass",
]


Expand All @@ -102,12 +102,12 @@ def test_models(example_prompts, model_name) -> None:
"backend",
[
"emulation",
"flashinfer_cudnn",
"flashinfer_trtllm", # the small seq_len ensures trtllm_8x4_layout backend is used
"flashinfer_cutlass",
"flashinfer-cudnn",
"flashinfer-trtllm", # the small seq_len ensures trtllm_8x4_layout backend is used
"flashinfer-cutlass",
],
)
def test_nvfp4(vllm_runner, model, eager, backend):
def test_nvfp4(vllm_runner, model, eager, backend, monkeypatch):
if (
not current_platform.has_device_capability(100)
and backend in SM_100_NVFP4_BACKENDS
Expand All @@ -116,7 +116,8 @@ def test_nvfp4(vllm_runner, model, eager, backend):
f"The backend {backend} is not supported with current_platform.has_device_capability(100) == False"
)

with vllm_runner(model, enforce_eager=eager, linear_backend=backend) as llm:
monkeypatch.setenv("VLLM_NVFP4_GEMM_BACKEND", backend)
with vllm_runner(model, enforce_eager=eager) as llm:
output = llm.generate_greedy(["1 2 3 4 5"], max_tokens=2)
assert output[0][1] == "1 2 3 4 5 6"

Expand Down
44 changes: 0 additions & 44 deletions vllm/config/kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,24 +135,6 @@ def with_default(
"emulation",
]

LinearBackend = Literal[
"auto",
"cutlass",
"flashinfer_cutlass",
"flashinfer_trtllm",
"flashinfer_cudnn",
"marlin",
"triton",
"deep_gemm",
"torch",
"aiter",
"machete",
"fbgemm",
"conch",
"exllama",
"emulation",
]


@config
class KernelConfig:
Expand Down Expand Up @@ -186,39 +168,13 @@ class KernelConfig:
running QDQ on activations.
"""

linear_backend: LinearBackend = "auto"
"""Backend for quantized linear layer GEMM kernels. Available options:

- "auto": Automatically select the best backend based on model and hardware
- "cutlass": Use CUTLASS-based kernels
- "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
- "flashinfer_trtllm": Use FlashInfer with TensorRT-LLM kernels
- "flashinfer_cudnn": Use FlashInfer with cuDNN kernels
- "marlin": Use Marlin kernels
- "triton": Use Triton-based kernels
- "deep_gemm": Use DeepGEMM kernels
- "torch": Use PyTorch native scaled_mm kernels
- "aiter": Use AMD AITer kernels (ROCm only)
- "machete": Use Machete kernels (mixed-precision)
- "fbgemm": Use FBGEMM kernels
- "conch": Use Conch mixed-precision kernels
- "exllama": Use Exllama mixed-precision kernels
- "emulation": Use slow dequant-to-BF16 emulation (for testing only)"""

@field_validator("moe_backend", mode="before")
@classmethod
def _normalize_moe_backend(cls, value: Any) -> Any:
if isinstance(value, str):
return value.lower().replace("-", "_")
return value

@field_validator("linear_backend", mode="before")
@classmethod
def _normalize_linear_backend(cls, value: Any) -> Any:
if isinstance(value, str):
return value.lower().replace("-", "_")
return value

def compute_hash(self) -> str:
"""
Produces a hash unique to the pass configuration.
Expand Down
8 changes: 1 addition & 7 deletions vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@
PrefixCachingHashAlgo,
)
from vllm.config.device import Device
from vllm.config.kernel import IrOpPriorityConfig, LinearBackend, MoEBackend
from vllm.config.kernel import IrOpPriorityConfig, MoEBackend
from vllm.config.lora import MaxLoRARanks
from vllm.config.mamba import MambaBackendEnum
from vllm.config.model import (
Expand Down Expand Up @@ -477,7 +477,6 @@ class EngineArgs:
enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
enable_ep_weight_filter: bool = ParallelConfig.enable_ep_weight_filter
moe_backend: MoEBackend = KernelConfig.moe_backend
linear_backend: LinearBackend = KernelConfig.linear_backend
all2all_backend: All2AllBackend = ParallelConfig.all2all_backend
enable_elastic_ep: bool = ParallelConfig.enable_elastic_ep
enable_dbo: bool = ParallelConfig.enable_dbo
Expand Down Expand Up @@ -1413,9 +1412,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
moe_backend_kwargs = kernel_kwargs["moe_backend"]
moe_backend_kwargs["type"] = lambda s: s.lower().replace("-", "_")
kernel_group.add_argument("--moe-backend", **moe_backend_kwargs)
linear_backend_kwargs = kernel_kwargs["linear_backend"]
linear_backend_kwargs["type"] = lambda s: s.lower().replace("-", "_")
kernel_group.add_argument("--linear-backend", **linear_backend_kwargs)

# vLLM arguments
vllm_kwargs = get_kwargs(VllmConfig)
Expand Down Expand Up @@ -2089,8 +2085,6 @@ def create_engine_config(
kernel_config.enable_flashinfer_autotune = self.enable_flashinfer_autotune
if self.moe_backend != "auto":
kernel_config.moe_backend = self.moe_backend
if self.linear_backend != "auto":
kernel_config.linear_backend = self.linear_backend

# Transfer top-level ir_op_priority into KernelConfig.ir_op_priority
for op_name, op_priority in asdict(self.ir_op_priority).items():
Expand Down
Loading
Loading