NVIDIA · alessiodevoto · Sep 3, 2025 · Jan 7, 2025 · Jan 7, 2025 · Jan 13, 2025
diff --git a/README.md b/README.md
@@ -23,6 +23,27 @@ git clone https://github.com/NVIDIA/kvpress.git
 cd kvpress
 uv sync --all-groups
 ```
+<details><summary>
+Advanced installation settings
+</summary>
+
+To install optional packages, you can use [uv](https://docs.astral.sh/uv/). 
+To install with flash attention, just run:
+
+```bash
+git clone https://github.com/NVIDIA/kvpress.git
+cd kvpress
+uv sync --extra flash-attn
+```
+
+To install with dependencies for evaluation, run:
+
+```bash
+git clone https://github.com/NVIDIA/kvpress.git
+cd kvpress
+uv sync --extra eval
+```
+</details>
 
 ## Usage
 
@@ -203,25 +224,4 @@ with press(model):
 
 However, the `generate` method does not allow to exclude the question from the compression, which would artificially favors methods such as SnapKV. Ideally, we want a compression method that works whatever comes after the context (_e.g._ for use cases such as chat or document question answering). Finally the `generate` method does not allow to provide generation for multiple questions at once.
 
-</details>
-
-
-## Advanced installation settings
-To install optional packages, you can use [uv](https://docs.astral.sh/uv/). 
-To install with flash attention, just run:
-
-```bash
-git clone https://github.com/NVIDIA/kvpress.git
-cd kvpress
-uv sync --extra flash-attn
-```
-
-To install with dependencies for evaluation, run 
-
-```bash
-git clone https://github.com/NVIDIA/kvpress.git
-cd kvpress
-uv sync --extra eval
-```
-
-Notice that optional dependecies can be combined.
+</details>
diff --git a/kvpress/attention_patch.py b/kvpress/attention_patch.py
@@ -78,6 +78,10 @@ def wrapper(module, query, key, value, attention_mask, dropout, **kwargs):
             batch_indices, head_indices, seq_indices = module.masked_key_indices
             key[batch_indices, head_indices, seq_indices] = k[batch_indices, head_indices]
 
+        # see https://github.com/NVIDIA/kvpress/pull/115#issuecomment-3183785597
+        # cu_seq_lens_k is only present in kwargs when model.generate is used; sync its last entry with the key length.
+        if "cu_seq_lens_k" in kwargs:
+            kwargs["cu_seq_lens_k"][-1] = key.shape[-2]
         return func(module, query, key, value, attention_mask, dropout, **kwargs)
 
     return wrapper

diff --git a/kvpress/pipeline.py b/kvpress/pipeline.py
@@ -7,15 +7,14 @@
 from typing import Optional
 
 import torch
-from transformers import AutoModelForCausalLM, Cache, DynamicCache, Pipeline
+from transformers import AutoModelForCausalLM, Cache, DynamicCache, Pipeline, QuantizedCache
 from transformers.pipelines import PIPELINE_REGISTRY
 from transformers.pipelines.base import GenericTensor
 
 from kvpress.presses.base_press import BasePress
 from kvpress.presses.finch_press import FinchPress
 from kvpress.presses.key_rerotation_press import KeyRerotationPress
 from kvpress.presses.observed_attention_press import ObservedAttentionPress
-from kvpress.presses.per_layer_compression_press import PerLayerCompressionPress
 
 logger = logging.getLogger(__name__)
 
@@ -224,20 +223,6 @@ def _forward(
 
         return answers
 
-    def output_attentions(self, press: BasePress):
-        if isinstance(press, ObservedAttentionPress):
-            return True
-        if isinstance(press, (KeyRerotationPress, PerLayerCompressionPress)) and isinstance(
-            press.press, ObservedAttentionPress
-        ):
-            return True
-        return False
-
-    def postprocess(self, model_outputs, single_question):
-        if single_question:
-            return {"answer": model_outputs[0]}
-        return {"answers": model_outputs}
-
     def generate_answer(
         self, question_ids: torch.Tensor, cache: Cache, context_length: int, max_new_tokens: int
     ) -> str:
@@ -260,7 +245,6 @@ def generate_answer(
         str
             The generated answer.
         """
-
         cache_seq_lengths = [cache.get_seq_length(layer_idx) for layer_idx in range(len(cache))]
         position_ids = torch.arange(
             context_length, context_length + question_ids.shape[1], device=self.model.device
@@ -292,28 +276,34 @@ def generate_answer(
             if new_id.item() in should_stop_token_ids:
                 break
         answer = self.tokenizer.decode(torch.stack(generated_ids), skip_special_tokens=True)
-
         # Remove the generated tokens from the cache
-        cache.key_cache = [
-            cache.key_cache[layer_idx][:, :, :sequence_length]
-            for layer_idx, sequence_length in enumerate(cache_seq_lengths)
-        ]
-        cache.value_cache = [
-            cache.value_cache[layer_idx][:, :, :sequence_length]
-            for layer_idx, sequence_length in enumerate(cache_seq_lengths)
-        ]
-        if hasattr(cache, "_quantized_key_cache"):
-            cache._quantized_key_cache = [
-                cache._quantized_key_cache[layer_idx][:, :, :sequence_length]
-                for layer_idx, sequence_length in enumerate(cache_seq_lengths)
-            ]
-            cache._quantized_value_cache = [
-                cache._quantized_value_cache[layer_idx][:, :, :sequence_length]
-                for layer_idx, sequence_length in enumerate(cache_seq_lengths)
-            ]
+        for layer_idx, sequence_length in enumerate(cache_seq_lengths):
+            cache.layers[layer_idx].keys = cache.layers[layer_idx].keys[:, :, :sequence_length]
+            cache.layers[layer_idx].values = cache.layers[layer_idx].values[:, :, :sequence_length]
+
+        if isinstance(cache, QuantizedCache):
+            for layer_idx, sequence_length in enumerate(cache_seq_lengths):
+                cache.layers[layer_idx]._quantized_keys = cache.layers[layer_idx]._quantized_keys[
+                    :, :, :sequence_length
+                ]
+                cache.layers[layer_idx]._quantized_values = cache.layers[layer_idx]._quantized_values[
+                    :, :, :sequence_length
+                ]
 
         return answer
 
+    def output_attentions(self, press: BasePress):
+        if isinstance(press, ObservedAttentionPress):
+            return True
+        if hasattr(press, "press") and isinstance(press.press, ObservedAttentionPress):
+            return True
+        return False
+
+    def postprocess(self, model_outputs, single_question):
+        if single_question:
+            return {"answer": model_outputs[0]}
+        return {"answers": model_outputs}
+
 
 PIPELINE_REGISTRY.register_pipeline(
     "kv-press-text-generation",

diff --git a/kvpress/presses/base_press.py b/kvpress/presses/base_press.py
@@ -107,7 +107,7 @@ def forward_hook(self, module: nn.Module, input: list[torch.Tensor], kwargs: dic
         kwargs : dict
             Keyword arguments passed to the attention layer's forward method, including:
             - hidden_states: Input embeddings to the attention layer
-            - past_key_value: The KV cache object being modified
+            - past_key_values: The KV cache object being modified
             - cache_position: Position indices indicating where we are in the sequence
             - position_embeddings: RoPE embeddings if applicable
         output : list
@@ -123,31 +123,37 @@ def forward_hook(self, module: nn.Module, input: list[torch.Tensor], kwargs: dic
         """
 
         hidden_states = kwargs["hidden_states"]
-        cache = kwargs["past_key_value"]
+        cache = kwargs["past_key_values"]
         q_len = hidden_states.shape[1]
 
         # Don't compress after pre-filling
         if kwargs["cache_position"][-1] > q_len:
             return output
 
+        cache_layer = cache.layers[module.layer_idx]
         if isinstance(cache, QuantizedCache):
-            keys = cache._dequantize(cache._quantized_key_cache[module.layer_idx])
-            values = cache._dequantize(cache._quantized_value_cache[module.layer_idx])
+            keys = cache_layer._dequantize(  # type: ignore[index]
+                cache_layer._quantized_keys  # type: ignore[index]
+            )
+            values = cache_layer._dequantize(  # type: ignore[index]
+                cache_layer._quantized_values  # type: ignore[index]
+            )
+
         else:
-            keys = cache.key_cache[module.layer_idx]
-            values = cache.value_cache[module.layer_idx]
+            keys = cache_layer.keys
+            values = cache_layer.values
 
         keys, values = self.compress(module, hidden_states, keys, values, output[1], kwargs)
 
         if isinstance(cache, QuantizedCache):
-            cache._quantized_key_cache[module.layer_idx] = cache._quantize(keys, axis=cache.axis_key)
-            cache._quantized_value_cache[module.layer_idx] = cache._quantize(values, axis=cache.axis_value)
-            cache.key_cache[module.layer_idx] = torch.zeros(0, dtype=keys.dtype, device=keys.device)
-            cache.value_cache[module.layer_idx] = torch.zeros(0, dtype=keys.dtype, device=keys.device)
-            cache._seen_tokens = keys.shape[2]
+            cache_layer._quantized_keys = cache_layer._quantize(keys, axis=cache_layer.axis_key)
+            cache_layer._quantized_values = cache_layer._quantize(values, axis=cache_layer.axis_value)
+            cache_layer.keys = torch.zeros(0, dtype=keys.dtype, device=keys.device)  # type: ignore[index]
+            cache_layer.values = torch.zeros(0, dtype=keys.dtype, device=keys.device)  # type: ignore[index]
+            cache_layer.cumulative_length = keys.shape[2]
         else:
-            cache.key_cache[module.layer_idx] = keys
-            cache.value_cache[module.layer_idx] = values
+            cache_layer.keys = keys
+            cache_layer.values = values
 
         return output
 

diff --git a/kvpress/presses/kvzip_press.py b/kvpress/presses/kvzip_press.py
@@ -6,7 +6,7 @@
 from contextlib import contextmanager
 from dataclasses import dataclass
 from types import MethodType
-from typing import Any, Generator, List, cast
+from typing import Generator, List
 
 import torch
 from torch import nn
@@ -116,7 +116,10 @@ def __call__(self, model: PreTrainedModel) -> Generator:
 
         def wrapped_forward(model_self, *args, **kwargs):
             self._context_ids = kwargs["input_ids"]
-            self._cache = kwargs["past_key_values"]
+            assert (
+                "past_key_value" in kwargs or "past_key_values" in kwargs
+            ), f"KVzipPress requires 'past_key_value' or 'past_key_values' during prefilling. Got {kwargs.keys()}"
+            self._cache = kwargs.get("past_key_values", None) or kwargs.get("past_key_value", None)
             return original_forward(*args, **kwargs)
 
         model.model.forward = MethodType(wrapped_forward, model.model)
@@ -149,29 +152,34 @@ def forward_hook(self, module: nn.Module, input: list[torch.Tensor], kwargs: dic
         """
 
         hidden_states = kwargs["hidden_states"]
-        cache = kwargs["past_key_value"]
+        cache = kwargs.get("past_key_values", None) or kwargs.get("past_key_value", None)
 
+        cache_layer = cache.layers[module.layer_idx]
         if isinstance(cache, QuantizedCache):
-            keys = cache._dequantize(cache._quantized_key_cache[module.layer_idx])  # type: ignore[attr-defined]
-            values = cache._dequantize(cache._quantized_value_cache[module.layer_idx])  # type: ignore[attr-defined]
+            keys = cache_layer._dequantize(  # type: ignore[index]
+                cache_layer._quantized_keys  # type: ignore[index]
+            )
+            values = cache_layer._dequantize(  # type: ignore[index]
+                cache_layer._quantized_values  # type: ignore[index]
+            )
+
         else:
-            keys = cache.key_cache[module.layer_idx]
-            values = cache.value_cache[module.layer_idx]
+            keys = cache_layer.keys
+            values = cache_layer.values
 
         # Compute importance scores for KV pairs in the prefilled context,
         # retaining only the originally prefilled KV pairs.
         keys, values = self.score_kvzip(module, hidden_states, keys, values, output[1], kwargs)
 
         if isinstance(cache, QuantizedCache):
-            cache = cast(Any, cache)  # to ignore attr-defined style errors
-            cache._quantized_key_cache[module.layer_idx] = cache._quantize(keys, axis=cache.axis_key)
-            cache._quantized_value_cache[module.layer_idx] = cache._quantize(values, axis=cache.axis_value)
-            cache.key_cache[module.layer_idx] = torch.zeros(0, dtype=keys.dtype, device=keys.device)
-            cache.value_cache[module.layer_idx] = torch.zeros(0, dtype=keys.dtype, device=keys.device)
-            cache._seen_tokens = keys.shape[2]
+            cache_layer._quantized_keys = cache_layer._quantize(keys, axis=cache_layer.axis_key)
+            cache_layer._quantized_values = cache_layer._quantize(values, axis=cache_layer.axis_value)
+            cache_layer.keys = torch.zeros(0, dtype=keys.dtype, device=keys.device)  # type: ignore[index]
+            cache_layer.values = torch.zeros(0, dtype=keys.dtype, device=keys.device)  # type: ignore[index]
+            cache_layer.cumulative_length = keys.shape[2]
         else:
-            cache.key_cache[module.layer_idx] = keys
-            cache.value_cache[module.layer_idx] = values
+            cache_layer.keys = keys
+            cache_layer.values = values
 
         return output
 

diff --git a/kvpress/presses/observed_attention_press.py b/kvpress/presses/observed_attention_press.py
@@ -30,23 +30,19 @@ class ObservedAttentionPress(ScorerPress):
     ----------
     compression_ratio : float, default=0.0
         Fraction of key-value pairs to remove during compression.
-    output_attentions : bool, default=False
-        Whether to return attention weights in model output.
-        Controls whether attention weights are included in output after compression.
-        Attention weights are always needed internally for scoring but can be removed
-        from output to save memory.
+    output_attentions : bool, default=True
+        Whether to output the attention weights. Must be True; the parameter is kept only for backward compatibility.
     """
 
     compression_ratio: float = 0.0
-    output_attentions: bool = False
+    output_attentions: bool = True
 
     def __post_init__(self):
         if not self.output_attentions:
-            logger.warning(
-                "Model will not return attentions in its output to save memory. "
-                "Set output_attentions=True if attentions are needed in the output."
+            # keep for backward compatibility, remove in version 1.0
+            raise ValueError(
+                "With transformers >= 4.54, " "ObservedAttentionPress will only work with output_attentions=True"
             )
-        super().__post_init__()
 
     def score(
         self,
@@ -64,12 +60,3 @@ def score(
         scores = scores / n_tokens_in_sum
         scores = scores.view(bsz, num_key_value_heads, -1, n_tokens).mean(2)
         return scores
-
-    def forward_hook(self, module: nn.Module, input: list[torch.Tensor], kwargs: dict, output: list):
-        output = super().forward_hook(module, input, kwargs, output)
-        # attentions are needed as input for the hook, but unless the user wants to return them in the output,
-        # we can remove them to save memory
-        if not self.output_attentions:
-            output = (output[0], None)
-
-        return output
diff --git a/notebooks/speed_and_memory.ipynb b/notebooks/speed_and_memory.ipynb
@@ -125,10 +125,9 @@
     "        if cache_implementation == \"dynamic\":\n",
     "            cache = DynamicCache()\n",
     "        elif cache_implementation == \"quantized\":\n",
-    "            config = QuantizedCacheConfig(nbits=4)\n",
-    "            cache = QuantoQuantizedCache(config)\n",
+    "            cache = QuantoQuantizedCache(config=model.config, nbits=4)\n",
     "        else:\n",
-    "            raise NotImplementedError(f\"Cache {cache_impl} not yet implemented\")\n",
+    "            raise NotImplementedError(f\"Cache {cache_implementation} not yet implemented\")\n",
     "\n",
     "        start = time()\n",
     "        model(inputs, num_logits_to_keep=1, past_key_values=cache)\n",