Commit e5f8de5

Guang Yang (guangy10) authored and committed
fix seq_len dim for models using hybrid cache
1 parent a1a1968 commit e5f8de5

File tree

5 files changed: +43 -27 lines

install_dev.py
optimum/executorch/modeling.py
optimum/exporters/executorch/integrations.py
optimum/exporters/executorch/utils.py
tests/models/test_modeling_phi4.py
install_dev.py

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ def install_dep_from_source():
             "-m",
             "pip",
             "install",
-            "git+https://github.com/huggingface/transformers@ea013348737fbd0efdefa38f9cad30443a810fd3#egg=transformers",
+            "git+https://github.com/huggingface/transformers@37367c7d9fd23401c26e79a2b26253ab2d1b7236#egg=transformers",
         ]
     )
     subprocess.check_call(
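
For reference, the script above pins transformers to a specific upstream commit. A minimal standalone sketch of the same pin, using only the hash visible in the diff (the constant name and the use of sys.executable are illustrative, not part of the commit):

import subprocess
import sys

# Pin transformers to the commit that the dev install now targets.
TRANSFORMERS_PIN = (
    "git+https://github.com/huggingface/transformers"
    "@37367c7d9fd23401c26e79a2b26253ab2d1b7236#egg=transformers"
)
subprocess.check_call([sys.executable, "-m", "pip", "install", TRANSFORMERS_PIN])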

optimum/executorch/modeling.py

Lines changed: 2 additions & 2 deletions
@@ -687,18 +687,18 @@ def generate(
                 cache_position=torch.tensor([i], dtype=torch.long, device=self.device),
             )
             self.stats.on_sampling_end()
+            next_token = torch.argmax(logits, dim=-1).item()
         else:
             self.stats.on_sampling_begin()
             logits = self.forward(
                 input_ids=torch.tensor(prompt_tokens, dtype=torch.long, device=self.device).unsqueeze(0),
                 cache_position=torch.arange(len(prompt_tokens), dtype=torch.long, device=self.device),
             )
             self.stats.on_sampling_end()
-
+            next_token = torch.argmax(logits, dim=-1)[0, -1].item()
         self.stats.on_prompt_eval_end()
         first_token_generated = False

-        next_token = torch.argmax(logits, dim=-1)[0, -1].item()
         generated_tokens = prompt_tokens + [next_token]

         while len(generated_tokens) < max_seq_len:
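
Moving the argmax into each branch reflects the different logits shapes of the two prefill paths: the per-token path produces logits for a single position, while the parallel-prefill path produces logits for every prompt position and only the last one seeds generation. A small sketch of that shape assumption (shapes here are illustrative, not read from the runtime):

import torch

vocab_size = 32000  # illustrative

# Parallel prefill: logits for the whole prompt, shape [batch=1, prompt_len, vocab].
prefill_logits = torch.randn(1, 5, vocab_size)
next_from_prefill = torch.argmax(prefill_logits, dim=-1)[0, -1].item()  # keep only the last position

# Per-token prefill / decode step: logits for a single position, shape [1, vocab].
step_logits = torch.randn(1, vocab_size)
next_from_decode = torch.argmax(step_logits, dim=-1).item()  # already a single position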

optimum/exporters/executorch/integrations.py

Lines changed: 14 additions & 3 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import logging
 from typing import Dict, Optional

 import torch
@@ -44,6 +45,7 @@ def __init__(self, model, use_custom_kv_cache=False):
         self.config = model.config
         self.use_custom_kv_cache = use_custom_kv_cache
         self.metadata = save_config_to_constant_methods(model.config, model.generation_config)
+        logging.info(f"Metadata to be recorded in PTE: {self.metadata}")

     def export(
         self,
@@ -58,21 +60,30 @@ def export(
         )

         max_batch_size = 1
-        max_cache_len = 4094
         seq_length = 3  # Make the sequence length dim dynamic in order to leverage parallel prefill in the ExecuTorch runtime.
         example_input_ids = input_ids if input_ids is not None else torch.zeros((1, seq_length), dtype=torch.long)
         example_cache_position = (
             cache_position if cache_position is not None else torch.arange(seq_length, dtype=torch.long)
         )
         seq_len_dim = torch.export.Dim(
-            "seq_length_dim", max=min(self.metadata["get_max_seq_len"], max_cache_len) - 1
+            "seq_length_dim",
+            max=min(
+                self.metadata.get("get_max_seq_len"),
+                self.metadata.get("sliding_window", float("inf")),
+            )
+            - 1,
         )
         dynamic_shapes = {"input_ids": {1: seq_len_dim}, "cache_position": {0: seq_len_dim}}
         strict = parse(torch.__version__) != parse(
             "2.7.0"
         )  # Due to bug https://github.com/pytorch/pytorch/issues/150994

-        exportable_module = TorchExportableModuleForDecoderOnlyLM(self.model, max_batch_size, max_cache_len)
+        exportable_module = TorchExportableModuleForDecoderOnlyLM(
+            self.model,
+            max_batch_size=max_batch_size,
+            max_cache_len=self.metadata.get("get_max_seq_len"),
+        )
+
         if self.use_custom_kv_cache:
             from optimum.executorch.attentions.custom_kv_cache import (
                 replace_with_et_custom_kv_cache,
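
This hunk is the core of the fix for hybrid-cache models: when a model uses sliding-window attention, the static cache for those layers only holds sliding_window entries, so the dynamic prompt-length dimension must be capped by the window as well as by the model's maximum sequence length. A minimal sketch of the resulting bound (the helper name is illustrative, and it assumes metadata entries whose value is None are not present for models without a sliding window):

import torch

def prefill_seq_len_dim(metadata):
    # Dynamic prompt-length bound: the model's max sequence length, additionally
    # capped by the sliding window for hybrid-cache models. float("inf") leaves
    # non-hybrid models bounded only by max_seq_len.
    bound = min(
        metadata["get_max_seq_len"],
        metadata.get("sliding_window", float("inf")),
    )
    return torch.export.Dim("seq_length_dim", max=bound - 1)

# Illustrative hybrid-cache metadata: 4096 max positions, 512-token sliding window.
dim = prefill_seq_len_dim({"get_max_seq_len": 4096, "sliding_window": 512})  # max = 511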

optimum/exporters/executorch/utils.py

Lines changed: 2 additions & 4 deletions
@@ -42,16 +42,14 @@ def save_config_to_constant_methods(
         "get_vocab_size": getattr(config, "vocab_size", None),
         "get_max_batch_size": 1,
         "get_max_seq_len": getattr(config, "max_position_embeddings", None),
+        "use_kv_cache": getattr(generation_config, "use_cache", None),
+        "sliding_window": getattr(config, "sliding_window", None),
         "decoder_start_token_id": getattr(config, "decoder_start_token_id", None),
         "use_sdpa_with_kv_cache": "custom_sdpa" in config._attn_implementation,
     }

     # Safely access fields from generation_config if it exists
     if generation_config is not None:
-        # Get use_cache with default value
-        use_cache = getattr(generation_config, "use_cache", None)
-        metadata["use_kv_cache"] = use_cache
-
         # Check for cache_config and its attributes
         cache_config = getattr(generation_config, "cache_config", None)
         if cache_config is not None:
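
With use_kv_cache and sliding_window now recorded up front via getattr, models that do not define a sliding window simply yield None for that entry. A tiny sketch of the pattern with stand-in configs (the SimpleNamespace objects below are placeholders, not the real transformers configs):

from types import SimpleNamespace

# Stand-in configs: a hybrid-cache model with a sliding window, and one without.
hybrid_cfg = SimpleNamespace(max_position_embeddings=4096, sliding_window=512)
plain_cfg = SimpleNamespace(max_position_embeddings=4096)

print(getattr(hybrid_cfg, "sliding_window", None))  # 512
print(getattr(plain_cfg, "sliding_window", None))   # None: nothing to cap the seq_len dim with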

tests/models/test_modeling_phi4.py

Lines changed: 24 additions & 17 deletions
@@ -20,7 +20,7 @@

 import pytest
 import torchao
-from executorch import version as executorch_version
+import transformers
 from executorch.extension.pybindings.portable_lib import ExecuTorchModule
 from packaging.version import parse
 from transformers import AutoConfig, AutoTokenizer
@@ -43,41 +43,49 @@ def __init__(self, *args, **kwargs):
     @slow
     @pytest.mark.run_slow
     @pytest.mark.skipif(
-        is_ci,
-        reason="Test Phi-4-mini (3.8B) will require runner to be configured with larger RAM",
+        parse(transformers.__version__) < parse("4.52.0") or parse(torchao.__version__) < parse("0.11.0"),
+        reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0",
     )
-    def test_phi4_text_generation(self):
+    def test_phi4_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self):
         model_id = "microsoft/Phi-4-mini-instruct"
         config = AutoConfig.from_pretrained(model_id)
         # NOTE: To make the model exportable we need to set the rope scaling to default to avoid hitting
         # the data-dependent control flow in _longrope_frequency_update. Alternatively, we can rewrite
         # that function to avoid the data-dependent control flow.
         if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
             config.rope_scaling["type"] = "default"
-        model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe="xnnpack", config=config)
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_id,
+            recipe="xnnpack",
+            config=config,
+            attn_implementation="custom_sdpa",
+            use_custom_kv_cache=True,
+            **{"qlinear": True, "qembedding": True},
+        )
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)

         tokenizer = AutoTokenizer.from_pretrained(model_id)
         generated_text = model.text_generation(
             tokenizer=tokenizer,
             prompt="My favourite condiment is ",
-            max_seq_len=32,
+            max_seq_len=64,
         )
         logging.info(f"\nGenerated text:\n\t{generated_text}")
-        generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids

-        # Free memory before loading eager for quality check
-        del model
-        del tokenizer
-        gc.collect()
+        if not is_ci:
+            generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
+
+            # Free memory before loading eager for quality check
+            del model
+            del tokenizer
+            gc.collect()

-        self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
+            self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))

     @slow
     @pytest.mark.run_slow
-    @pytest.mark.skipif(
-        parse(executorch_version.__version__) > parse("0.6.0"),
+    @pytest.mark.skip(
         reason="Require cache_position support in executorch runtime. Re-enable when available.",
     )
     def test_phi4_text_generation_with_quantized_pte_from_hub(self):
@@ -119,9 +127,8 @@ def test_phi4_text_generation_with_quantized_pte_from_hub(self):

     @slow
     @pytest.mark.run_slow
-    @pytest.mark.skipif(
-        parse(torchao.__version__) < parse("0.11.0.dev0"),
-        reason="Only available on torchao >= 0.11.0.dev0",
+    @pytest.mark.skip(
+        reason="Require cache_position support in executorch runtime. Re-enable when available.",
     )
     def test_phi4_text_generation_with_quantized_ckp(self):
         model_id = "pytorch/Phi-4-mini-instruct-8da4w"
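
The renamed test now exercises export with custom SDPA, the custom KV cache, and 8-bit dynamic-activation / 4-bit-weight linear plus 8-bit-weight embedding quantization (the 8da4w_8we suffix). A condensed standalone sketch of the same flow, with arguments taken from the test body (prompt and max_seq_len values are illustrative, and the import path is assumed to match the package's public API):

from transformers import AutoConfig, AutoTokenizer
from optimum.executorch import ExecuTorchModelForCausalLM

model_id = "microsoft/Phi-4-mini-instruct"
config = AutoConfig.from_pretrained(model_id)
if getattr(config, "rope_scaling", None) is not None:
    config.rope_scaling["type"] = "default"  # avoid data-dependent control flow during export

model = ExecuTorchModelForCausalLM.from_pretrained(
    model_id,
    recipe="xnnpack",
    config=config,
    attn_implementation="custom_sdpa",
    use_custom_kv_cache=True,
    qlinear=True,
    qembedding=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
print(model.text_generation(tokenizer=tokenizer, prompt="My favourite condiment is ", max_seq_len=64))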
