Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 0 additions & 29 deletions tests/quantization/test_compressed_tensors.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
CompressedTensorsConfig,
CompressedTensorsLinearMethod,
CompressedTensorsW4A4Fp4,
CompressedTensorsW4A4Mxfp4,
CompressedTensorsW4A8Fp8,
CompressedTensorsW4A16Fp4,
CompressedTensorsW8A8Fp8,
Expand Down Expand Up @@ -690,31 +689,3 @@ def check_model(model):
llm.apply_model(check_model)
output = llm.generate_greedy("Hello my name is", max_tokens=4)
assert output


@pytest.mark.skipif(
    not (current_platform.is_cuda() and current_platform.has_device_capability(80)),
    reason="MXFP4 requires ampere or newer",
)
def test_compressed_tensors_mxfp4(vllm_runner):
    """Check that an MXFP4 compressed-tensors checkpoint loads and generates.

    Every linear projection of the first layer must be handled by
    ``CompressedTensorsLinearMethod`` with a ``CompressedTensorsW4A4Mxfp4``
    scheme whose group size is 32, and a short greedy generation must return
    a truthy result.
    """

    def check_model(model):
        # Only the first layer is inspected.
        first_layer = model.model.layers[0]
        attn = first_layer.self_attn
        mlp = first_layer.mlp

        projections = (
            attn.qkv_proj,
            attn.o_proj,
            mlp.gate_up_proj,
            mlp.down_proj,
        )
        for proj in projections:
            assert isinstance(proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(proj.scheme, CompressedTensorsW4A4Mxfp4)

            # Verify group size
            assert proj.scheme.group_size == 32

    with vllm_runner(
        "nm-testing/TinyLlama-1.1B-Chat-v1.0-MXFP4", enforce_eager=True
    ) as llm:
        llm.apply_model(check_model)
        assert llm.generate_greedy("Hello my name is", max_tokens=4)
68 changes: 0 additions & 68 deletions vllm/model_executor/kernels/linear/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,16 +58,6 @@
XPUW4A8IntLinearKernel,
XPUwNa16LinearKernel,
)
from vllm.model_executor.kernels.linear.mxfp4 import (
MxFp4LinearKernel,
MxFp4LinearLayerConfig,
)
from vllm.model_executor.kernels.linear.mxfp4.flashinfer import (
FlashInferMxFp4LinearKernel,
)
from vllm.model_executor.kernels.linear.mxfp4.marlin import (
MarlinMxFp4LinearKernel,
)
from vllm.model_executor.kernels.linear.mxfp8 import (
Mxfp8LinearKernel,
Mxfp8LinearLayerConfig,
Expand Down Expand Up @@ -286,13 +276,6 @@
],
}

# Candidate MXFP4 linear kernels per platform. init_mxfp4_linear_kernel walks
# the list for the current platform front to back and returns the first kernel
# that is neither disabled nor unsupported, so order expresses preference.
# register_linear_kernel may append further entries for kernel type "mxfp4".
_POSSIBLE_MXFP4_KERNELS: dict[PlatformEnum, list[type[MxFp4LinearKernel]]] = {
    PlatformEnum.CUDA: [
        FlashInferMxFp4LinearKernel,
        MarlinMxFp4LinearKernel,
    ],
}

# TODO make all kernels inherit from MMLinearKernel
# then bound _KernelT only to MMLinearKernel
_KernelT = TypeVar("_KernelT", bound=ScaledMMLinearKernel | MMLinearKernel)
Expand Down Expand Up @@ -587,48 +570,6 @@ def init_mxfp8_linear_kernel() -> Mxfp8LinearKernel:
)


def init_mxfp4_linear_kernel() -> MxFp4LinearKernel:
    """Pick and construct the MXFP4 linear kernel for the current platform.

    When ``envs.VLLM_MXFP4_USE_MARLIN`` is truthy, ``MarlinMxFp4LinearKernel``
    is forced and no other kernel is considered. Otherwise the candidates
    listed in ``_POSSIBLE_MXFP4_KERNELS`` for the current platform are tried
    in order, skipping any whose class name appears in
    ``envs.VLLM_DISABLED_KERNELS`` or whose ``is_supported()`` reports
    failure.

    Returns:
        An instance of the chosen kernel class, built with a default
        ``MxFp4LinearLayerConfig()``.

    Raises:
        ValueError: If the forced kernel is not supported, or if no candidate
            kernel is usable on the current platform.
    """
    if envs.VLLM_MXFP4_USE_MARLIN:
        # Forced selection: fail loudly rather than silently falling back.
        forced_cls: type[MxFp4LinearKernel] = MarlinMxFp4LinearKernel
        supported, why_not = forced_cls.is_supported()
        if not supported:
            raise ValueError(
                f"Forced MXFP4 kernel {forced_cls.__name__} is not "
                f"supported: {why_not}"
            )
        logger.info_once("Using %s for MXFP4 GEMM", forced_cls.__name__)
        return forced_cls(MxFp4LinearLayerConfig())

    # Collected only so the final error can explain each rejection.
    rejections: list[str] = []
    for candidate in _POSSIBLE_MXFP4_KERNELS.get(current_platform._enum, []):
        kernel_name = candidate.__name__
        if kernel_name in envs.VLLM_DISABLED_KERNELS:
            rejections.append(f" {kernel_name} disabled by environment variable")
        else:
            supported, why_not = candidate.is_supported()
            if supported:
                logger.info_once("Using %s for MXFP4 GEMM", kernel_name)
                return candidate(MxFp4LinearLayerConfig())
            rejections.append(f"{kernel_name}: {why_not}")

    raise ValueError(
        "Failed to find a kernel that can implement the "
        "MXFP4 linear layer. Reasons: \n" + "\n".join(rejections)
    )


def init_wfp8_a16_linear_kernel(
weight_quant_key: QuantKey,
activation_quant_key: QuantKey,
Expand Down Expand Up @@ -789,10 +730,6 @@ def register_linear_kernel(
if platform not in _POSSIBLE_NVFP4_KERNELS:
_POSSIBLE_NVFP4_KERNELS[platform] = []
_POSSIBLE_NVFP4_KERNELS[platform].append(kernel_class)
elif kernel_type == "mxfp4":
if platform not in _POSSIBLE_MXFP4_KERNELS:
_POSSIBLE_MXFP4_KERNELS[platform] = []
_POSSIBLE_MXFP4_KERNELS[platform].append(kernel_class)
else:
raise ValueError(f"Unrecognized kernel type: {kernel_type}")

Expand Down Expand Up @@ -840,11 +777,6 @@ def register_linear_kernel(
"init_mxfp8_linear_kernel",
"Mxfp8LinearKernel",
"Mxfp8LinearLayerConfig",
"init_mxfp4_linear_kernel",
"MxFp4LinearKernel",
"MxFp4LinearLayerConfig",
"FlashInferMxFp4LinearKernel",
"MarlinMxFp4LinearKernel",
"FlashInferCutlassMxfp8LinearKernel",
"MarlinMxfp8LinearKernel",
"XPUMxFp8LinearKernel",
Expand Down
12 changes: 0 additions & 12 deletions vllm/model_executor/kernels/linear/mxfp4/__init__.py

This file was deleted.

67 changes: 0 additions & 67 deletions vllm/model_executor/kernels/linear/mxfp4/base.py

This file was deleted.

74 changes: 0 additions & 74 deletions vllm/model_executor/kernels/linear/mxfp4/flashinfer.py

This file was deleted.

52 changes: 0 additions & 52 deletions vllm/model_executor/kernels/linear/mxfp4/marlin.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@
CompressedTensors24,
CompressedTensorsScheme,
CompressedTensorsW4A4Fp4,
CompressedTensorsW4A4Mxfp4,
CompressedTensorsW4A8Fp8,
CompressedTensorsW4A8Int,
CompressedTensorsW4A16Fp4,
CompressedTensorsW4A16Mxfp4,
CompressedTensorsW8A8Fp8,
CompressedTensorsW8A8Int8,
CompressedTensorsW8A8Mxfp8,
Expand Down Expand Up @@ -625,7 +625,7 @@ def _get_scheme_from_parts(
return CompressedTensorsW4A16Fp4()

if self._is_mxfp4(weight_quant):
return CompressedTensorsW4A4Mxfp4()
return CompressedTensorsW4A16Mxfp4()

if self._is_mxfp8(weight_quant):
return CompressedTensorsW8A8Mxfp8()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ def __init__(self, moe):
super().__init__(moe)
self.group_size = 32
self.mxfp4_backend = Mxfp4MoeBackend.MARLIN
# use cutlass if supported, otherwise fall back to marlin for weight-only FP4
self.use_cutlass_mxfp4 = CutlassExpertsMxfp4._supports_current_device()
self.experts_cls: type[mk.FusedMoEExperts]
if self.use_cutlass_mxfp4:
Expand Down
Loading
Loading