pytorch · cccclai · May 24, 2024
@@ -88,12 +88,13 @@ def tok_decode(self, tokens):
 
     def _model_call(self, inps):
         if self._use_kv_cache:
-            result_logits = []
-            for pos in range(self._max_seq_length):
-                pos_tensor = torch.tensor([pos], dtype=torch.int64)
-                logits = self._model(inps[:, pos : pos + 1], pos_tensor)
-                result_logits.append(logits)
-            return torch.cat(result_logits, dim=1)
+            pos_tensor = torch.arange(
+                self._max_seq_length, dtype=torch.int64, device=self.device
+            )
+            # Batch process the whole sequence in one model call. NOTE(review):
+            # assumes inps has >= _max_seq_length tokens, matching pos_tensor; confirm.
+            logits = self._model(inps[:, : self._max_seq_length], pos_tensor)
+            return logits
         else:
             return self._model(inps)