From 2143aa6182ba4b1d793c996444e06421f52bdde4 Mon Sep 17 00:00:00 2001
From: yewentao256 <zhyanwentao@126.com>
Date: Sun, 17 May 2026 17:01:46 +0000
Subject: [PATCH] Remove dead code: Sparse24 scheme, quantization schema.py, empty turboquant quantizer.py

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 .../compressed_tensors/compressed_tensors.py  | 16 +---
 .../compressed_tensors/schemes/__init__.py    |  4 -
 .../schemes/compressed_tensors_24.py          | 54 -----------
 .../layers/quantization/schema.py             | 90 -------------------
 .../quantization/turboquant/quantizer.py      |  6 --
 5 files changed, 2 insertions(+), 168 deletions(-)
 delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
 delete mode 100644 vllm/model_executor/layers/quantization/schema.py
 delete mode 100644 vllm/model_executor/layers/quantization/turboquant/quantizer.py

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 85f12a464fe1..47d733043d8e 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -39,7 +39,6 @@
 )
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     WNA16_SUPPORTED_BITS,
-    CompressedTensors24,
     CompressedTensorsScheme,
     CompressedTensorsW4A4Fp4,
     CompressedTensorsW4A4Mxfp4,
@@ -760,19 +759,8 @@ def get_scheme(
             input_quant=input_quant,
             sparsity_scheme=sparsity_scheme,
         ):
-            # Have a valid sparsity scheme
-            # Validate layer is supported by Cutlass 2:4 Kernel
-            model_compression_config = (
-                None
-                if sparsity_scheme is None or sparsity_scheme.format == "dense"
-                else self.config
-            )
-
-            scheme = CompressedTensors24(
-                quantized=weight_quant is not None or input_quant is not None,
-                weight_quant=weight_quant,
-                input_quant=input_quant,
-                model_compression_config=model_compression_config,
+            raise NotImplementedError(
+                "Sparse24 models are no longer supported by vLLM."
             )
         elif weight_quant is None:
             # Falling back to UnquantizedLinearMethod
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
index 0b0d8c230617..6aacd9e7ae57 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -13,9 +13,6 @@
 from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8
 from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS, CompressedTensorsWNA16
 
-# This avoids circular import error
-from .compressed_tensors_24 import CompressedTensors24  # isort: skip
-
 __all__ = [
     "CompressedTensorsScheme",
     "CompressedTensorsWNA16",
@@ -23,7 +20,6 @@
     "CompressedTensorsW8A8Int8",
     "CompressedTensorsW8A8Fp8",
     "WNA16_SUPPORTED_BITS",
-    "CompressedTensors24",
     "CompressedTensorsW4A16Fp4",
     "CompressedTensorsW4A4Mxfp4",
     "CompressedTensorsW4A4Fp4",
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
deleted file mode 100644
index e28bc36368be..000000000000
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from collections.abc import Callable
-from typing import Any
-
-import torch
-from compressed_tensors.quantization import (
-    QuantizationArgs,
-)
-
-from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme,
-)
-
-__all__ = ["CompressedTensors24"]
-
-
-class CompressedTensors24(CompressedTensorsScheme):
-    def __init__(
-        self,
-        quantized: bool = False,
-        weight_quant: QuantizationArgs | None = None,
-        input_quant: QuantizationArgs | None = None,
-        model_compression_config: dict[str, Any] | None = None,
-    ):
-        raise NotImplementedError("Sparse24 models are no longer supported by vLLM")
-
-    @classmethod
-    def get_min_capability(cls) -> int:
-        raise NotImplementedError("Sparse24 models are no longer supported by vLLM")
-
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        input_size: int,
-        output_partition_sizes: list[int],
-        input_size_per_partition: int,
-        params_dtype: torch.dtype,
-        weight_loader: Callable,
-        **kwargs,
-    ):
-        raise NotImplementedError("Sparse24 models are no longer supported by vLLM")
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        raise NotImplementedError("Sparse24 models are no longer supported by vLLM")
-
-    def apply_weights(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: torch.Tensor | None = None,
-    ) -> torch.Tensor:
-        raise NotImplementedError("Sparse24 models are no longer supported by vLLM")
diff --git a/vllm/model_executor/layers/quantization/schema.py b/vllm/model_executor/layers/quantization/schema.py
deleted file mode 100644
index 669bd9d6ed83..000000000000
--- a/vllm/model_executor/layers/quantization/schema.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This file contains the Pydantic schemas for various quantization-related
-parameters. When a relevant quantization technique is specified, these
-parameters are loaded in the form of a JSON alongside the model weights
-and augment the model with additional information needed for use of that
-technique. The format of this JSON should be specified by one or more
-schemas contained here.
-
-For example, when the KV cache is quantized to FP8-E4M3 (currently only
-possible on ROCm), the model can be optionally augmented with KV cache
-scaling factors.
-"""
-
-from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator
-
-
-class KVCacheQuantSchema(BaseModel):
-    dtype: str
-    # Each key is a TP rank. Each value is a dictionary mapping a TP rank's
-    # layer indices to their per-tensor KV cache scaling factor.
-    # TODO: Consider pulling this and its validation methods out into its
-    # own schema class (tricky as its members are variable)
-    scaling_factor: dict[int, dict[int, float]]
-
-    @model_validator(mode="after")
-    def check_is_fp8(self) -> "KVCacheQuantSchema":
-        assert self.dtype == "float8_e4m3fn", (
-            "Loaded scaling factors intended for KV cache dtype = "
-            f"{self.dtype} rather than float8_e4m3fn!"
-        )
-        return self
-
-    @model_validator(mode="after")
-    def check_tp_ranks(self, info: ValidationInfo) -> "KVCacheQuantSchema":
-        context = info.context
-        if context:
-            tp_size = context["tp_size"]
-            num_hidden_layers = context["num_hidden_layers"]
-            assert len(self.scaling_factor) == tp_size, (
-                f"Loaded dictionary has TP size {len(self.scaling_factor)} "
-                f"but LLM engine is currently running with TP size {tp_size}."
-            )
-            for tp_rank, layer_maps in self.scaling_factor.items():
-                assert len(layer_maps) == num_hidden_layers, (
-                    f"KV cache scales map for TP rank {tp_rank} is malformed. "
-                    f"Expected {num_hidden_layers} layers, got "
-                    f"{len(layer_maps)}."
-                )
-            for i in range(tp_size):
-                assert i in self.scaling_factor, (
-                    f"KV cache scales map for TP rank {i} not found."
-                )
-        return self
-
-    @model_validator(mode="after")
-    def check_current_rank(self, info: ValidationInfo) -> "KVCacheQuantSchema":
-        context = info.context
-        if context:
-            tp_rank = context["tp_rank"]
-            num_hidden_layers = context["num_hidden_layers"]
-            layer_scales_map = self.scaling_factor[tp_rank]
-            for i in range(num_hidden_layers):
-                assert i in layer_scales_map, (
-                    f"Could not find KV cache scales for layer {i} in "
-                    f"TP rank {tp_rank}."
-                )
-        return self
-
-
-class QuantParamSchema(BaseModel):
-    # TODO: Generalize and extend with more fields
-    # (e.g. weights/activations params) once functionality is enabled
-    model_config = ConfigDict(protected_namespaces=())
-    model_type: str | None
-    kv_cache: KVCacheQuantSchema
-
-    @model_validator(mode="after")
-    def check_model_type(self, info: ValidationInfo) -> "QuantParamSchema":
-        context = info.context
-        if context:
-            model_type = context.get("model_type", None)
-            if model_type is not None:
-                assert model_type == self.model_type, (
-                    f"Model type is {model_type} but loaded "
-                    f"scaling factors belonging to different "
-                    f"model type {self.model_type}!"
-                )
-        return self
diff --git a/vllm/model_executor/layers/quantization/turboquant/quantizer.py b/vllm/model_executor/layers/quantization/turboquant/quantizer.py
deleted file mode 100644
index 82a0c3391ce8..000000000000
--- a/vllm/model_executor/layers/quantization/turboquant/quantizer.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""TurboQuant quantizer utilities.
-
-Triton kernels handle all quantization, packing, and dequantization on GPU.
-"""