Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions tests/quantization/test_compressed_tensors.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,8 +589,6 @@ def _make_ct_config(*, target: str = "Linear") -> CompressedTensorsConfig:
},
ignore=[],
quant_format="pack-quantized",
sparsity_scheme_map={},
sparsity_ignore_list=[],
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,7 @@
from typing import TYPE_CHECKING, Any, Literal, cast

import torch
from compressed_tensors.config import (
CompressionFormat,
SparsityCompressionConfig,
SparsityStructure,
)
from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
from compressed_tensors.quantization import (
QuantizationArgs,
QuantizationStrategy,
Expand Down Expand Up @@ -81,8 +77,6 @@ def __init__(
target_scheme_map: dict[str, Any],
ignore: list[str],
quant_format: str,
sparsity_scheme_map: dict[str, SparsityCompressionConfig],
sparsity_ignore_list: list[str],
kv_cache_scheme: dict[str, Any] | None = None,
config: dict[str, Any] | None = None,
transform_config: dict[str, Any] | None = None,
Expand All @@ -95,8 +89,6 @@ def __init__(
# Map from [target -> scheme]
self.target_scheme_map = target_scheme_map
self.kv_cache_scheme = kv_cache_scheme
self.sparsity_scheme_map = sparsity_scheme_map
self.sparsity_ignore_list = sparsity_ignore_list
self.config = config
self.total_num_heads = total_num_heads
self.total_num_kv_heads = total_num_kv_heads
Expand Down Expand Up @@ -147,8 +139,6 @@ def _apply_list(lst: list) -> list:

self.target_scheme_map = _apply_dict(self.target_scheme_map)
self.ignore = _apply_list(self.ignore)
self.sparsity_scheme_map = _apply_dict(self.sparsity_scheme_map)
self.sparsity_ignore_list = _apply_list(self.sparsity_ignore_list)
if self.kv_cache_scheme is not None:
self.kv_cache_scheme = _apply_dict(self.kv_cache_scheme)

Expand Down Expand Up @@ -233,16 +223,14 @@ def from_config(cls, config: dict[str, Any]) -> "CompressedTensorsConfig":
ignore: list[str] = cast(list[str], config.get("ignore", []))
quant_format = cast(str, config.get("format"))
target_scheme_map = cls._quantization_scheme_map_from_config(config=config)
sparsity_scheme_map, sparsity_ignore_list = cls._parse_sparsity_config(
config=config
)

# Check for deprecated sparsity config
cls._parse_sparsity_config(config=config)

return cls(
target_scheme_map=target_scheme_map,
ignore=ignore,
quant_format=quant_format,
sparsity_scheme_map=sparsity_scheme_map,
sparsity_ignore_list=sparsity_ignore_list,
config=config,
transform_config=config.get("transform_config"),
kv_cache_scheme=config.get("kv_cache_scheme"),
Expand All @@ -269,6 +257,14 @@ def _parse_sparsity_config(
target: sparsity_config for target in sparsity_config.targets or list()
}
sparsity_ignore_list = sparsity_config.ignore or list()

# Raise DeprecationWarning if a non-empty sparse_scheme_map is detected
if sparse_scheme_map:
raise DeprecationWarning(
"Sparsity support has been removed from compressed-tensors. "
"Please use a model without sparsity configuration."
)

return sparse_scheme_map, sparsity_ignore_list

@classmethod
Expand Down Expand Up @@ -739,30 +735,7 @@ def get_scheme(
input_quant = scheme_dict.get("input_activations")
format = scheme_dict.get("format")

# Find the sparsity scheme of the layer
# assume that fused layers inherit first component's sparsity scheme
sparsity_targets = self.sparsity_scheme_map.keys() - set(
self.sparsity_ignore_list
)
sparsity_scheme: SparsityCompressionConfig | None = None
matched_target = find_matched_target(
layer_name=layer_name,
module=layer,
targets=sparsity_targets,
fused_mapping=self.packed_modules_mapping,
)
if matched_target is not None:
sparsity_scheme = self.sparsity_scheme_map[matched_target]

if self.supports_cutlass_24(
weight_quant=weight_quant,
input_quant=input_quant,
sparsity_scheme=sparsity_scheme,
):
raise NotImplementedError(
"Sparse24 models are no longer supported by vLLM."
)
elif weight_quant is None:
if weight_quant is None:
# Falling back to UnquantizedLinearMethod
return None

Expand Down Expand Up @@ -826,72 +799,6 @@ def has_blocked_weights(self) -> bool:
return True
return False

@staticmethod
def supports_cutlass_24(
    weight_quant: QuantizationArgs | None,
    input_quant: QuantizationArgs | None,
    sparsity_scheme: SparsityCompressionConfig | None = None,
) -> bool:
    """
    Check if the layer is supported by the Cutlass 2:4 Kernel
    Conditions:
        - Overarching condition: Sparsity Structure is 2:4 and the sparsity
          format is either dense or sparse_24_bitmask
        - Unquantized cases are supported
        - Weight only quantization is not-supported
        - Input only quantization is not-supported
        - Supported weight quantization strategies are TENSOR and CHANNEL
        - Supported input quantization strategies are TENSOR and TOKEN
        - Only 8 bit quantization is supported

    :param weight_quant: quantization args for the weights, or None
    :param input_quant: quantization args for the input activations, or None
    :param sparsity_scheme: sparsity config matched for the layer, or None
    :return: True if the layer is supported by the Cutlass 2:4 Kernel
        False otherwise
    """
    if sparsity_scheme is None:
        return False

    # Overarching condition: the structure must be 2:4 ...
    if sparsity_scheme.sparsity_structure != SparsityStructure.TWO_FOUR.value:
        return False

    # ... and the sparsity format must be one of the accepted compressors.
    valid_compressors = {
        CompressionFormat.dense.value,
        CompressionFormat.sparse_24_bitmask.value,
    }
    if sparsity_scheme.format not in valid_compressors:
        return False

    # Unquantized cases are supported
    if weight_quant is None and input_quant is None:
        return True

    # Quantizing only one side (weights only, or inputs only) is
    # not-supported. An explicit check is used instead of `assert` so that
    # this predicate returns False rather than raising, and so that it
    # behaves the same when assertions are stripped with `python -O`.
    if weight_quant is None or input_quant is None:
        return False

    supported_weight_quant_strategies = (
        QuantizationStrategy.TENSOR.value,
        QuantizationStrategy.CHANNEL.value,
    )
    if weight_quant.strategy not in supported_weight_quant_strategies:
        return False

    supported_input_quant_strategies = (
        QuantizationStrategy.TENSOR.value,
        QuantizationStrategy.TOKEN.value,
    )
    if input_quant.strategy not in supported_input_quant_strategies:
        return False

    # Only 8 bit quantization (on both weights and inputs) is supported
    return weight_quant.num_bits == input_quant.num_bits == 8


class CompressedTensorsLinearMethod(LinearMethodBase):
def __init__(self, quantization_config: CompressedTensorsConfig):
Expand Down
Loading