Commit a2f8029

Author: Griffin Adams (committed)

Remove mutable args and remove costly index sorting for window attn.

1 parent 538314b · commit a2f8029

File tree: 3 files changed, 45 additions (+), 24 deletions (−)


cache.py

Lines changed: 29 additions & 17 deletions
@@ -13,7 +13,13 @@ class KVCache(ABC, nn.Module):
     relevant_kwargs = ["max_cache_length"]
 
     def __init__(
-        self, max_batch_size, n_heads, head_dim, dtype=torch.bfloat16, head_specific=False, **kwargs
+        self,
+        max_batch_size,
+        n_heads,
+        head_dim,
+        dtype=torch.bfloat16,
+        head_specific=False,
+        **kwargs,
     ):
         super().__init__()
 
@@ -28,7 +34,15 @@ def __init__(
         # We use n_heads as an optional second dimension to allow for head-specific evictions.
         self.register_buffer(
             "pos",
-            torch.full((max_batch_size, n_heads if head_specific else 1, self.max_cache_length), -1, dtype=torch.int),
+            torch.full(
+                (
+                    max_batch_size,
+                    n_heads if head_specific else 1,
+                    self.max_cache_length,
+                ),
+                -1,
+                dtype=torch.int,
+            ),
         )
 
         self.updates = 0
@@ -49,7 +63,7 @@ def reset(self):
         self.pos.fill_(-1)
         self.insertions = 0
         self.updates = 0
-
+
     def update(self, input_pos, k_val, v_val):
         """
         Updates the cache with the given input positions, keys, and values.
@@ -73,7 +87,9 @@ def update(self, input_pos, k_val, v_val):
         # Truncate the unfilled part of the cache
         # Since we always fill in-order it will be at the end
         truncate_idx = min(self.insertions, self.max_cache_length)
-        return self.k_cache[:, :, :truncate_idx, :], self.v_cache[:, :, :truncate_idx, :]
+        return self.k_cache[:, :, :truncate_idx, :], self.v_cache[
+            :, :, :truncate_idx, :
+        ]
 
     @abstractmethod
     def _update(self, input_pos, k_val, v_val):
@@ -116,19 +132,16 @@ def __init__(
     def mark_global_tokens(self) -> bool:
         """
         Update POS tensor to give global tokens highest priority.
-
+
         Return a boolean indicating whether or not all global tokens were filled.
 
         If it returns True, this function won't be called again to save computation.
         """
         # We put max priority on leading "global" tokens
-        global_mask = torch.logical_and(
-            self.pos < self.global_tokens, self.pos >= 0
-        )
+        global_mask = torch.logical_and(self.pos < self.global_tokens, self.pos >= 0)
         # Give self.score an arbitrary high value for global tokens so that they are not replaced
         self.pos.masked_fill_(global_mask, LARGE_INTEGER)
-        return global_mask.sum() == self.global_tokens
-
+        return (global_mask.sum() == self.global_tokens).item()
 
     def _update(self, input_pos, k_val, v_val):
         # Prefill case: If prompt > window, then we need to chop off early positions
@@ -144,19 +157,18 @@ def _update(self, input_pos, k_val, v_val):
             input_pos = input_pos[keep_idxs]
             k_val = k_val[:, :, keep_idxs]
             v_val = v_val[:, :, keep_idxs]
-
+
         # Identify the lowest positions in the cache that are not filled
-        # For window, all heads are the same so let's just use the first head for "pos"
         pos = self.pos[:, 0, :].squeeze(1)
         _, min_k_indices = pos.topk(input_pos.shape[0], largest=False)
+        min_k_indices = min_k_indices.squeeze(0)
 
-        # Sort the indices in ascending order
-        min_k_indices, _ = min_k_indices.squeeze(0).sort()
-
-        self.fill(fill_indices=min_k_indices, input_pos=input_pos, k_val=k_val, v_val=v_val)
+        self.fill(
+            fill_indices=min_k_indices, input_pos=input_pos, k_val=k_val, v_val=v_val
+        )
 
         # This is a potentially costly operation which doesn't need to be repeated once we've filled the global tokens
-        self.global_filled |= self.mark_global_tokens()
+        self.global_filled = self.global_filled or self.mark_global_tokens()
 
 
 def get_cache_constructor(cache_strategy):
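
Note on the window-attention change above: the fill indices come from a topk over the position buffer, and the subsequent fill writes each key/value at an explicit index, so sorting those indices first buys nothing. A minimal sketch of the idea, assuming an index_copy_-style write (the tensor names are illustrative, not the repository's):

import torch

# Sketch only: writing into a cache by explicit indices gives the same result
# whether or not the indices are sorted, so the extra .sort() can be dropped.
cache = torch.zeros(8, 4)            # stand-in for one head's cache slots
new_vals = torch.randn(3, 4)         # three incoming key vectors

unsorted_idx = torch.tensor([5, 1, 3])
sorted_idx, order = unsorted_idx.sort()

a = cache.clone().index_copy_(0, unsorted_idx, new_vals)
b = cache.clone().index_copy_(0, sorted_idx, new_vals[order])

assert torch.equal(a, b)             # same cache contents either way

The switch from "self.global_filled |= self.mark_global_tokens()" to "self.global_filled = self.global_filled or self.mark_global_tokens()" follows the same cost argument: "or" short-circuits, so the relatively expensive mark_global_tokens() stops being evaluated once the global tokens are known to be filled, whereas "|=" would call it on every update.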

generate.py

Lines changed: 7 additions & 6 deletions
@@ -99,7 +99,7 @@ def decode_n_tokens(
     cur_token: torch.Tensor,
     input_pos: torch.Tensor,
     num_new_tokens: int,
-    terminator_ids: Optional[list] = [],
+    terminator_ids: Optional[list] = None,
     callback=lambda _: _,
     **sampling_kwargs,
 ):
@@ -200,8 +200,8 @@ def generate(
     speculate_k: Optional[int] = 8,
     max_cache_length: Optional[float] = 1.0,
     callback=lambda x: x,
-    terminator_ids: Optional[list] = [],
-    cache_kwargs: dict = {"max_cache_length": 1.0},
+    terminator_ids: Optional[list] = None,
+    cache_kwargs: dict = None,
     **sampling_kwargs,
 ) -> torch.Tensor:
     """
@@ -235,7 +235,9 @@ def generate(
     ), f"Specified max cache length ({max_cache_length}) must be less than max_seq_length ({max_seq_length})."
     cache_kwargs["max_cache_length"] = max_cache_length
 
-    assert cache_kwargs["global_tokens"] <= max_cache_length, "Global tokens must be less than max_cache_length."
+    assert (
+        cache_kwargs["global_tokens"] <= max_cache_length
+    ), "Global tokens must be less than max_cache_length."
 
     with torch.device(device):
         model.setup_caches(max_batch_size=1, **cache_kwargs)
@@ -615,10 +617,9 @@ def callback(x):
         args.max_cache_length == 1.0
     ), "Full cache strategy only supports max_cache_length=1.0."
 
-    # TODO Nicer way to bundle these?
     cache_kwargs = {
-        "max_cache_length": args.max_cache_length,
         "cache_strategy": args.cache_strategy,
+        "max_cache_length": args.max_cache_length,
         "global_tokens": args.global_tokens,
     }
 
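
The terminator_ids and cache_kwargs changes above replace mutable default arguments with None. Python evaluates default values once, at function definition time, so a default list or dict is shared across every call and can silently accumulate state. A small, self-contained illustration of the pitfall (the function names are made up for this example):

def bad_append(x, acc=[]):        # one shared list for every call
    acc.append(x)
    return acc

bad_append(1)
print(bad_append(2))              # [1, 2] -- state leaked from the first call

def good_append(x, acc=None):     # None sentinel: a fresh list per call
    if acc is None:
        acc = []
    acc.append(x)
    return acc

print(good_append(2))             # [2]

The usual companion to this change, not visible in the hunks shown, is an "if cache_kwargs is None:" guard inside the function body that builds a fresh dict per call.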

model.py

Lines changed: 9 additions & 1 deletion
@@ -244,6 +244,7 @@ def forward(
         q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
 
         is_prefill = self.kv_cache.is_prefill()
+
         cache_k, cache_v = self.kv_cache.update(input_pos, k, v)
 
         # If we are in the prefill stage, we use the provided prompt kv-pairs
@@ -253,7 +254,14 @@ def forward(
 
         k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
         v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
-        y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
+        y = F.scaled_dot_product_attention(
+            q,
+            k,
+            v,
+            is_causal=False,
+            attn_mask=mask,
+            dropout_p=0.0,
+        )
 
         y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)
 
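
On the scaled_dot_product_attention change: with a KV cache the key/value length generally differs from the query length, so causality has to be encoded in the attn_mask that the caller supplies; spelling out is_causal=False makes that explicit, and PyTorch rejects combining is_causal=True with an explicit attn_mask anyway. A shape-level sketch of a decode-step call (shapes are illustrative, not taken from model.py):

import torch
import torch.nn.functional as F

# One new query token attends over kv_len cached key/value slots.
bsz, n_head, q_len, kv_len, head_dim = 1, 2, 1, 16, 8
q = torch.randn(bsz, n_head, q_len, head_dim)
k = torch.randn(bsz, n_head, kv_len, head_dim)
v = torch.randn(bsz, n_head, kv_len, head_dim)

# Boolean mask, True = attend; here it simply marks every cache slot as valid.
mask = torch.ones(bsz, 1, q_len, kv_len, dtype=torch.bool)

y = F.scaled_dot_product_attention(
    q, k, v, is_causal=False, attn_mask=mask, dropout_p=0.0
)
print(y.shape)  # torch.Size([1, 2, 1, 8])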
