From 2143aa6182ba4b1d793c996444e06421f52bdde4 Mon Sep 17 00:00:00 2001 From: yewentao256 Date: Sun, 17 May 2026 17:01:46 +0000 Subject: [PATCH] remove dead code Signed-off-by: yewentao256 --- .../compressed_tensors/compressed_tensors.py | 16 +--- .../compressed_tensors/schemes/__init__.py | 4 - .../schemes/compressed_tensors_24.py | 54 ----------- .../layers/quantization/schema.py | 90 ------------------- .../quantization/turboquant/quantizer.py | 6 -- 5 files changed, 2 insertions(+), 168 deletions(-) delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py delete mode 100644 vllm/model_executor/layers/quantization/schema.py delete mode 100644 vllm/model_executor/layers/quantization/turboquant/quantizer.py diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 85f12a464fe1..47d733043d8e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -39,7 +39,6 @@ ) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( WNA16_SUPPORTED_BITS, - CompressedTensors24, CompressedTensorsScheme, CompressedTensorsW4A4Fp4, CompressedTensorsW4A4Mxfp4, @@ -760,19 +759,8 @@ def get_scheme( input_quant=input_quant, sparsity_scheme=sparsity_scheme, ): - # Have a valid sparsity scheme - # Validate layer is supported by Cutlass 2:4 Kernel - model_compression_config = ( - None - if sparsity_scheme is None or sparsity_scheme.format == "dense" - else self.config - ) - - scheme = CompressedTensors24( - quantized=weight_quant is not None or input_quant is not None, - weight_quant=weight_quant, - input_quant=input_quant, - model_compression_config=model_compression_config, + raise NotImplementedError( + "Sparse24 models are no longer supported by vLLM." ) elif weight_quant is None: # Falling back to UnquantizedLinearMethod diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index 0b0d8c230617..6aacd9e7ae57 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -13,9 +13,6 @@ from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8 from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS, CompressedTensorsWNA16 -# This avoids circular import error -from .compressed_tensors_24 import CompressedTensors24 # isort: skip - __all__ = [ "CompressedTensorsScheme", "CompressedTensorsWNA16", @@ -23,7 +20,6 @@ "CompressedTensorsW8A8Int8", "CompressedTensorsW8A8Fp8", "WNA16_SUPPORTED_BITS", - "CompressedTensors24", "CompressedTensorsW4A16Fp4", "CompressedTensorsW4A4Mxfp4", "CompressedTensorsW4A4Fp4", diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py deleted file mode 100644 index e28bc36368be..000000000000 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections.abc import Callable -from typing import Any - -import torch -from compressed_tensors.quantization import ( - QuantizationArgs, -) - -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsScheme, -) - -__all__ = ["CompressedTensors24"] - - -class CompressedTensors24(CompressedTensorsScheme): - def __init__( - self, - quantized: bool = False, - weight_quant: QuantizationArgs | None = None, - input_quant: QuantizationArgs | None = None, - model_compression_config: dict[str, Any] | None = None, - ): - raise NotImplementedError("Sparse24 models are no longer supported by vLLM") - - @classmethod - def get_min_capability(cls) -> int: - raise NotImplementedError("Sparse24 models are no longer supported by vLLM") - - def create_weights( - self, - layer: torch.nn.Module, - input_size: int, - output_partition_sizes: list[int], - input_size_per_partition: int, - params_dtype: torch.dtype, - weight_loader: Callable, - **kwargs, - ): - raise NotImplementedError("Sparse24 models are no longer supported by vLLM") - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - raise NotImplementedError("Sparse24 models are no longer supported by vLLM") - - def apply_weights( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: torch.Tensor | None = None, - ) -> torch.Tensor: - raise NotImplementedError("Sparse24 models are no longer supported by vLLM") diff --git a/vllm/model_executor/layers/quantization/schema.py b/vllm/model_executor/layers/quantization/schema.py deleted file mode 100644 index 669bd9d6ed83..000000000000 --- a/vllm/model_executor/layers/quantization/schema.py +++ /dev/null @@ -1,90 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -This file contains the Pydantic schemas for various quantization-related -parameters. When a relevant quantization technique is specified, these -parameters are loaded in the form of a JSON alongside the model weights -and augment the model with additional information needed for use of that -technique. The format of this JSON should be specified by one or more -schemas contained here. - -For example, when the KV cache is quantized to FP8-E4M3 (currently only -possible on ROCm), the model can be optionally augmented with KV cache -scaling factors. -""" - -from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator - - -class KVCacheQuantSchema(BaseModel): - dtype: str - # Each key is a TP rank. Each value is a dictionary mapping a TP rank's - # layer indices to their per-tensor KV cache scaling factor. - # TODO: Consider pulling this and its validation methods out into its - # own schema class (tricky as its members are variable) - scaling_factor: dict[int, dict[int, float]] - - @model_validator(mode="after") - def check_is_fp8(self) -> "KVCacheQuantSchema": - assert self.dtype == "float8_e4m3fn", ( - "Loaded scaling factors intended for KV cache dtype = " - f"{self.dtype} rather than float8_e4m3fn!" - ) - return self - - @model_validator(mode="after") - def check_tp_ranks(self, info: ValidationInfo) -> "KVCacheQuantSchema": - context = info.context - if context: - tp_size = context["tp_size"] - num_hidden_layers = context["num_hidden_layers"] - assert len(self.scaling_factor) == tp_size, ( - f"Loaded dictionary has TP size {len(self.scaling_factor)} " - f"but LLM engine is currently running with TP size {tp_size}." - ) - for tp_rank, layer_maps in self.scaling_factor.items(): - assert len(layer_maps) == num_hidden_layers, ( - f"KV cache scales map for TP rank {tp_rank} is malformed. " - f"Expected {num_hidden_layers} layers, got " - f"{len(layer_maps)}." - ) - for i in range(tp_size): - assert i in self.scaling_factor, ( - f"KV cache scales map for TP rank {i} not found." - ) - return self - - @model_validator(mode="after") - def check_current_rank(self, info: ValidationInfo) -> "KVCacheQuantSchema": - context = info.context - if context: - tp_rank = context["tp_rank"] - num_hidden_layers = context["num_hidden_layers"] - layer_scales_map = self.scaling_factor[tp_rank] - for i in range(num_hidden_layers): - assert i in layer_scales_map, ( - f"Could not find KV cache scales for layer {i} in " - f"TP rank {tp_rank}." - ) - return self - - -class QuantParamSchema(BaseModel): - # TODO: Generalize and extend with more fields - # (e.g. weights/activations params) once functionality is enabled - model_config = ConfigDict(protected_namespaces=()) - model_type: str | None - kv_cache: KVCacheQuantSchema - - @model_validator(mode="after") - def check_model_type(self, info: ValidationInfo) -> "QuantParamSchema": - context = info.context - if context: - model_type = context.get("model_type", None) - if model_type is not None: - assert model_type == self.model_type, ( - f"Model type is {model_type} but loaded " - f"scaling factors belonging to different " - f"model type {self.model_type}!" - ) - return self diff --git a/vllm/model_executor/layers/quantization/turboquant/quantizer.py b/vllm/model_executor/layers/quantization/turboquant/quantizer.py deleted file mode 100644 index 82a0c3391ce8..000000000000 --- a/vllm/model_executor/layers/quantization/turboquant/quantizer.py +++ /dev/null @@ -1,6 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""TurboQuant quantizer utilities. - -Triton kernels handle all quantization, packing, and dequantization on GPU. -"""