
Commit fbeaeba

apply
Signed-off-by: Kyle Sayers <[email protected]>
1 parent caa1ecf commit fbeaeba

File tree

7 files changed: +102 −200 lines changed


src/compressed_tensors/modeling/attention.py

Lines changed: 4 additions & 8 deletions
@@ -17,12 +17,7 @@
 from weakref import ref
 
 from compressed_tensors.modeling.kvcache import initialize_hooked_kv_cache
-from compressed_tensors.quantization import (
-    QuantizationArgs,
-    QuantizationScheme,
-    QuantizationStrategy,
-    forward_quantize,
-)
+from compressed_tensors.quantization.lifecycle.forward import forward_quantize
 from compressed_tensors.utils import getattr_chain
 from compressed_tensors.utils.internal import InternalModule
 from torch import Tensor
@@ -60,11 +55,12 @@ class QuantizedAttentionImpl(InternalModule):
     :param attn_module: parent attention module
     """
 
+    _original_impl = "eager"
+
     def __init__(self, config: PretrainedConfig, attn_module: Module):
         super().__init__()
         self.config = config
         self.attn_module = ref(attn_module)  # avoid circular references
-        self._qparams_initialized = False
 
     def forward(
         self,
@@ -79,7 +75,7 @@ def forward(
         quant_args_attr = "quantization_scheme.input_activations"
         quant_args = getattr_chain(module, quant_args_attr, None)
         quant_enabled = getattr(module, "quantization_enabled", True)
-        if quant_args is not None and quant_enabled and self._qparams_initialized:
+        if quant_args is not None and quant_enabled:
             query = forward_quantize(module, query, "q", quant_args)
 
         # original attention
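
Note on this change: attention input quantization is now gated only on whether the module carries an input-activation scheme and `quantization_enabled` has not been set to False; the per-impl `_qparams_initialized` flag is removed. A minimal, self-contained sketch of that gating (toy objects, not the library's `getattr_chain`):

from types import SimpleNamespace

def should_quantize(module) -> bool:
    # stand-in for getattr_chain(module, "quantization_scheme.input_activations", None)
    scheme = getattr(module, "quantization_scheme", None)
    quant_args = getattr(scheme, "input_activations", None) if scheme is not None else None
    quant_enabled = getattr(module, "quantization_enabled", True)
    # no _qparams_initialized check anymore
    return quant_args is not None and quant_enabled

attn = SimpleNamespace(quantization_scheme=SimpleNamespace(input_activations="int8 args"))
print(should_quantize(attn))   # True: scheme attached, not disabled
attn.quantization_enabled = False
print(should_quantize(attn))   # False: explicitly disabled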

src/compressed_tensors/modeling/kvcache.py

Lines changed: 2 additions & 6 deletions
@@ -16,10 +16,7 @@
 from typing import Callable, Optional, Tuple
 from weakref import ref
 
-# from compressed_tensors.quantization import QuantizationStrategy, forward_quantize
-# from compressed_tensors.quantization.lifecycle.initialize import (
-#     _initialize_scale_zero_point,
-# )
+from compressed_tensors.quantization.lifecycle.forward import forward_quantize
 from compressed_tensors.utils import getattr_chain
 from compressed_tensors.utils.internal import InternalModule
 from torch import Tensor
@@ -59,7 +56,6 @@ def __init__(self, config: PretrainedConfig, attn_module: Module):
         self.config = config
         self.attn_module = ref(attn_module)  # avoid circular reference
         self.past_key_values: Optional[Cache] = None
-        self._qparams_initialized = False
 
     def update(self, *args, **kwargs) -> Tuple[Tensor, Tensor]:
         return self(*args, **kwargs)
@@ -76,7 +72,7 @@ def forward(
         quant_args_attr = "quantization_scheme.input_activations"
         quant_args = getattr_chain(module, quant_args_attr, None)
         quant_enabled = getattr(module, "quantization_enabled", True)
-        if quant_args is not None and quant_enabled and self._qparams_initialized:
+        if quant_args is not None and quant_enabled:
             key_states = forward_quantize(module, key_states, "k", quant_args)
             value_states = forward_quantize(module, value_states, "v", quant_args)
 
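
The hooked KV cache now applies the same simplified gate to both key and value states. As a rough illustration of what quantizing cached key/value tensors amounts to, here is a toy per-tensor int8 fake-quantize (an illustrative stand-in only, not the library's `forward_quantize`, which uses the module's `QuantizationArgs` and its own scales):

import torch

def fake_quantize_int8(x: torch.Tensor) -> torch.Tensor:
    # quantize to the int8 range with a per-tensor scale, then dequantize
    scale = x.abs().amax().clamp(min=1e-8) / 127.0
    return (x / scale).round().clamp(-127, 127) * scale

key_states = torch.randn(1, 8, 16, 64)    # (batch, heads, seq_len, head_dim)
value_states = torch.randn(1, 8, 16, 64)
key_states = fake_quantize_int8(key_states)
value_states = fake_quantize_int8(value_states)
print(key_states.shape, value_states.shape)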

src/compressed_tensors/quantization/lifecycle/apply.py

Lines changed: 37 additions & 102 deletions
@@ -21,21 +21,26 @@
 
 import torch
 from compressed_tensors.config import CompressionFormat
+from compressed_tensors.modeling import (
+    initialize_hooked_attention,
+    initialize_hooked_kv_cache,
+)
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,
+    is_attention_module,
 )
 from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.quantization.quant_config import (
     QuantizationConfig,
     QuantizationStatus,
 )
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
-from compressed_tensors.quantization.utils import (
-    KV_CACHE_TARGETS,
-    is_kv_cache_quant_scheme,
-)
 from compressed_tensors.utils.helpers import deprecated, replace_module
-from compressed_tensors.utils.match import match_named_modules, match_targets
+from compressed_tensors.utils.match import (
+    is_narrow_match,
+    match_named_modules,
+    match_targets,
+)
 from compressed_tensors.utils.offload import update_parameter_data
 from compressed_tensors.utils.safetensors_load import get_safetensors_folder
 from safetensors import safe_open
@@ -126,8 +131,24 @@ def apply_quantization_config(
     if config is None:  # see PR #180
         return dict()
 
-    # preprocess to support kv cache scheme
-    config = process_quantization_config(config)
+    # force zero points during initialization
+    force_zero_point = config.quantization_status != QuantizationStatus.COMPRESSED
+
+    # apply kv cache quantization before any attention quantization
+    # because attention quantization is a superset of kv cache quantization
+    if config.kv_cache_scheme is not None:
+        scheme = QuantizationScheme(
+            targets=".*self_attn$", input_activations=config.kv_cache_scheme
+        )
+        for submodule in model.modules():
+            if is_attention_module(submodule):
+                submodule.quantization_scheme = scheme
+                initialize_hooked_kv_cache(model, submodule)
+                initialize_module_for_quantization(
+                    submodule,
+                    force_zero_point=force_zero_point,
+                )
+                submodule.quantization_status = config.quantization_status
 
     # build mapping of targets to schemes for easier matching
     # use ordered dict to preserve target ordering in config
@@ -163,51 +184,19 @@ def apply_quantization_config(
             replace_module(model, name, compressed_linear)
 
         else:
+            if is_attention_module(submodule) and is_narrow_match(
+                model, scheme.targets, name
+            ):
+                initialize_hooked_attention(model, submodule)
+
             initialize_module_for_quantization(
                 submodule,
-                force_zero_point=config.quantization_status
-                != QuantizationStatus.COMPRESSED,
+                force_zero_point=force_zero_point,
             )
 
         submodule.quantization_status = config.quantization_status
 
 
-def process_quantization_config(config: QuantizationConfig) -> QuantizationConfig:
-    """
-    Preprocess the raw QuantizationConfig
-
-    :param config: the raw QuantizationConfig
-    :return: the processed QuantizationConfig
-    """
-    if config.kv_cache_scheme is not None:
-        config = process_kv_cache_config(config)
-
-    return config
-
-
-def process_kv_cache_config(
-    config: QuantizationConfig, targets: Union[List[str], str] = KV_CACHE_TARGETS
-) -> QuantizationConfig:
-    """
-    Reformulate the `config.kv_cache` as a `config_group`
-    and add it to the set of existing `config.groups`
-
-    :param config: the QuantizationConfig
-    :return: the QuantizationConfig with additional "kv_cache" group
-    """
-    if targets == KV_CACHE_TARGETS:
-        _LOGGER.info(f"KV cache targets set to default value of: {KV_CACHE_TARGETS}")
-
-    kv_cache_dict = config.kv_cache_scheme.model_dump()
-    kv_cache_scheme = QuantizationScheme(
-        output_activations=QuantizationArgs(**kv_cache_dict),
-        targets=targets,
-    )
-    kv_cache_group = dict(kv_cache=kv_cache_scheme)
-    config.config_groups.update(kv_cache_group)
-    return config
-
-
 @deprecated(
     message="This function is deprecated and will be removed in a future release."
     "Please use `match_targets` from `compressed_tensors.utils.match` instead."
@@ -282,60 +271,6 @@ def _scheme_from_targets(
     targets: List[str],
     name: str,
 ) -> QuantizationScheme:
-    if len(targets) == 1:
-        # if `targets` iterable contains a single element
-        # use it as the key
-        return target_to_scheme[targets[0]]
-
-    # otherwise, we need to merge QuantizationSchemes corresponding
-    # to multiple targets. This is most likely because `name` module
-    # is being target both as an ordinary quantization target, as well
-    # as kv cache quantization target
-    schemes_to_merge = [target_to_scheme[target] for target in targets]
-    return _merge_schemes(schemes_to_merge, name)
-
-
-def _merge_schemes(
-    schemes_to_merge: List[QuantizationScheme], name: str
-) -> QuantizationScheme:
-    kv_cache_quantization_scheme = [
-        scheme for scheme in schemes_to_merge if is_kv_cache_quant_scheme(scheme)
-    ]
-    if not kv_cache_quantization_scheme:
-        # if the schemes_to_merge do not contain any
-        # kv cache QuantizationScheme
-        # return the first scheme (the prioritized one,
-        # since the order of schemes_to_merge matters)
-        return schemes_to_merge[0]
-    else:
-        # fetch the kv cache QuantizationScheme and the highest
-        # priority non-kv cache QuantizationScheme and merge them
-        kv_cache_quantization_scheme = kv_cache_quantization_scheme[0]
-        quantization_scheme = [
-            scheme
-            for scheme in schemes_to_merge
-            if not is_kv_cache_quant_scheme(scheme)
-        ][0]
-        schemes_to_merge = [kv_cache_quantization_scheme, quantization_scheme]
-    merged_scheme = {}
-    for scheme in schemes_to_merge:
-        scheme_dict = {
-            k: v for k, v in scheme.model_dump().items() if v is not None
-        }
-        # when merging multiple schemes, the final target will be
-        # the `name` argument - hence erase the original targets
-        del scheme_dict["targets"]
-        # make sure that schemes do not "clash" with each other
-        overlapping_keys = set(merged_scheme.keys()) & set(scheme_dict.keys())
-        if overlapping_keys:
-            raise ValueError(
-                f"The module: {name} is being modified by two clashing "
-                f"quantization schemes, that jointly try to override "
-                f"properties: {overlapping_keys}. Fix the quantization config "
-                "so that it is not ambiguous."
-            )
-        merged_scheme.update(scheme_dict)
-
-    merged_scheme.update(targets=[name])
-
-    return QuantizationScheme(**merged_scheme)
+    # return the first scheme (the prioritized one,
+    # since the order of target_to_scheme matters)
+    return target_to_scheme[targets[0]]
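
The net effect in apply.py: a `kv_cache_scheme` is no longer rewritten into an output-activation "kv_cache" config group and merged per module; it is attached directly to attention modules as an input-activation scheme before the grouped pass. A self-contained sketch of that preprocessing step (toy classes and a regex stand-in for `is_attention_module`; not the library's code):

import re
from dataclasses import dataclass
from typing import Optional

@dataclass
class ToyScheme:
    targets: str
    input_activations: Optional[dict] = None

@dataclass
class ToyAttention:
    name: str
    quantization_scheme: Optional[ToyScheme] = None

def apply_kv_cache_scheme(modules, kv_cache_scheme: dict) -> None:
    # build one input-activation scheme targeting attention modules
    scheme = ToyScheme(targets=".*self_attn$", input_activations=kv_cache_scheme)
    for module in modules:
        if re.search(scheme.targets, module.name):  # stand-in for is_attention_module
            module.quantization_scheme = scheme     # hooks/qparams would be initialized here

attns = [ToyAttention("model.layers.0.self_attn"), ToyAttention("model.layers.1.self_attn")]
apply_kv_cache_scheme(attns, {"num_bits": 8, "type": "int", "symmetric": True})
print(attns[0].quantization_scheme.input_activations)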

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 2 additions & 7 deletions
@@ -37,11 +37,7 @@
 from compressed_tensors.quantization.lifecycle.forward import (
     wrap_module_forward_quantized,
 )
-from compressed_tensors.quantization.utils import (
-    is_fp4,
-    is_kv_cache_quant_scheme,
-    strategy_cdiv,
-)
+from compressed_tensors.quantization.utils import is_fp4, strategy_cdiv
 from compressed_tensors.utils import (
     disable_hf_hook,
     get_execution_device,
@@ -129,8 +125,7 @@ def initialize_module_for_quantization(
             force_zero_point=force_zero_point,
         )
 
-    output_is_kv_cache = is_kv_cache_quant_scheme(scheme)
-    if scheme.output_activations is not None and not output_is_kv_cache:
+    if scheme.output_activations is not None:
         initialize_qparams(
             module,
             "output",
