
Commit 2af1cd4

Guang Yang committed
Support lowering quantized checkpoint from Hub
1 parent da80c9e commit 2af1cd4
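
What this commit enables, in practice: a quantized Phi-4-mini artifact hosted on the Hub can either be loaded as a pre-lowered .pte program or be lowered from its quantized checkpoint at load time. A minimal sketch mirroring the new tests below (the model id, file name, recipe, and prompt are taken from the test code and are illustrative, not an API guarantee):

    from transformers import AutoConfig, AutoTokenizer

    from optimum.executorch import ExecuTorchModelForCausalLM

    model_id = "pytorch/Phi-4-mini-instruct-8da4w"
    config = AutoConfig.from_pretrained(model_id)
    if getattr(config, "rope_scaling", None) is not None:
        # Same workaround as the tests: avoid data-dependent control flow in _longrope_frequency_update.
        config.rope_scaling["type"] = "default"

    # Path 1: load the pre-lowered, quantized .pte program directly from the Hub.
    model = ExecuTorchModelForCausalLM.from_pretrained(
        model_id, recipe="xnnpack", config=config, file_name="phi4-mini-8da4w.pte"
    )

    # Path 2: lower the quantized checkpoint at load time instead.
    # model = ExecuTorchModelForCausalLM.from_pretrained(
    #     model_id, recipe="xnnpack", config=config, export=True
    # )

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    print(model.text_generation(tokenizer=tokenizer, prompt="My favourite condiment is ", max_seq_len=64))

Both paths end up with a single ExecuTorchModule exposed as `model.model`, which is what the modeling.py change below standardizes.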

File tree (2 files changed: +113, −10 lines)

  optimum/executorch/modeling.py
  tests/models/test_modeling_phi4.py

optimum/executorch/modeling.py

Lines changed: 15 additions & 9 deletions
@@ -92,8 +92,12 @@ def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedCon
                 f"This attribute is used to identify the corresponding AutoModel class."
             )
 
-        for key, value in models.items():
-            setattr(self, key, value)
+        if len(models) == 1:
+            # For single PTE, always set the attr to "model"
+            setattr(self, "model", next(iter(models.values())))
+        else:
+            for key, value in models.items():
+                setattr(self, key, value)
 
         self.stats = Stats()

@@ -570,8 +574,8 @@ class ExecuTorchModelForCausalLM(ExecuTorchModelBase):
             Data type of the model parameters.
         bos_token_id (`int`):
             Beginning-of-sequence token ID.
-        eos_token_id (`int`):
-            End-of-sequence token ID.
+        eos_token_ids (`List[int]`):
+            End-of-sequence token IDs.
         vocab_size (`int`):
             Size of the model vocabulary.
     """

@@ -594,8 +598,10 @@ def __init__(self, models: Dict[str, "ExecuTorchModule"], config: "PretrainedCon
         self.dtype = self.model.run_method("get_dtype")[0]
         if "get_bos_id" in metadata:
             self.bos_token_id = self.model.run_method("get_bos_id")[0]
-        if "get_eos_id" in metadata:
-            self.eos_token_id = self.model.run_method("get_eos_id")[0]
+        for key in ("get_eos_id", "get_eos_ids"):
+            if key in metadata:
+                self.eos_token_ids = self.model.run_method("get_eos_ids")
+                break
         if "get_vocab_size" in metadata:
             self.vocab_size = self.model.run_method("get_vocab_size")[0]
         if "use_sdpa_with_kv_cache" in metadata:

@@ -694,7 +700,7 @@ def generate(
             next_token = torch.argmax(logits, dim=-1).item()
             generated_tokens.append(next_token)
 
-            if next_token == self.eos_token_id:
+            if next_token in self.eos_token_ids:
                 break
 
         self.stats.set_num_generated_tokens(len(generated_tokens) - len(prompt_tokens))

@@ -730,9 +736,9 @@ def text_generation(
             raise ValueError(
                 f"The tokenizer's bos_token_id={self.tokenizer.bos_token_id} must be the same as the model's bos_token_id={self.bos_token_id}."
             )
-        if self.tokenizer.eos_token_id is not None and self.tokenizer.eos_token_id != self.eos_token_id:
+        if self.tokenizer.eos_token_id is not None and self.tokenizer.eos_token_id not in self.eos_token_ids:
             raise ValueError(
-                f"The tokenizer's eos_token_id={self.tokenizer.eos_token_id} must be the same as the model's eos_token_id={self.eos_token_id}."
+                f"The tokenizer's eos_token_id={self.tokenizer.eos_token_id} must match with the model's eos_token_ids={self.eos_token_ids}."
             )
 
         # Reset stats for a new generation
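
The caller-visible effect of this file: end-of-sequence handling is now list-based (`eos_token_ids` replaces the scalar `eos_token_id`), and a single-program artifact is always exposed via the `model` attribute regardless of its key in the models dict. A rough caller-side sketch, assuming `model` and `generated_tokens` come from an `ExecuTorchModelForCausalLM` loaded as in the example above:

    # Caller-side sketch (hypothetical names): eos handling is now list-based.
    eos_ids = model.eos_token_ids          # List[int], replaces the scalar eos_token_id
    last_token = generated_tokens[-1]      # assume these ids came from model.generate(...)
    if last_token in eos_ids:              # membership test replaces `== eos_token_id`
        print("generation stopped at an end-of-sequence token")

    # With a single PTE program, the ExecuTorchModule is always exposed as `model.model`.
    assert hasattr(model, "model")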

tests/models/test_modeling_phi4.py

Lines changed: 98 additions & 1 deletion
@@ -15,10 +15,13 @@
 
 import gc
 import logging
+import os
 import unittest
 
 import pytest
+import torchao
 from executorch.extension.pybindings.portable_lib import ExecuTorchModule
+from packaging.version import parse
 from transformers import AutoConfig, AutoTokenizer
 from transformers.testing_utils import slow

@@ -27,13 +30,18 @@
 from ..utils import check_causal_lm_output_quality
 
 
-@pytest.mark.skip(reason="Test Phi-4-mini (3.8B) will require runner to be configured with larger RAM")
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+is_ci = os.environ.get("GITHUB_ACTIONS") == "true"
+
+
 class ExecuTorchModelIntegrationTest(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
     @slow
     @pytest.mark.run_slow
+    @pytest.mark.skip(is_ci, reason="Test Phi-4-mini (3.8B) will require runner to be configured with larger RAM")
     def test_phi4_text_generation(self):
         model_id = "microsoft/Phi-4-mini-instruct"
         config = AutoConfig.from_pretrained(model_id)

@@ -61,3 +69,92 @@ def test_phi4_text_generation(self):
         gc.collect()
 
         self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
+
+    @slow
+    @pytest.mark.run_slow
+    @pytest.mark.skipif(
+        parse(torchao.__version__) < parse("0.11.0.dev0"),
+        reason="Only available on torchao >= 0.11.0.dev0",
+    )
+    def test_phi4_text_generation_with_quantized_pte_from_hub(self):
+        model_id = "pytorch/Phi-4-mini-instruct-8da4w"
+        config = AutoConfig.from_pretrained(model_id)
+        # NOTE: To make the model exportable we need to set the rope scaling to default to avoid hitting
+        # the data-dependent control flow in _longrope_frequency_update. Alternatively, we can rewrite
+        # that function to avoid the data-dependent control flow.
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            config.rope_scaling["type"] = "default"
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_id, recipe="xnnpack", config=config, file_name="phi4-mini-8da4w.pte"
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt="My favourite condiment is ",
+            max_seq_len=64,
+        )
+        logging.info(f"\nGenerated text:\n\t{generated_text}")
+
+        if not is_ci:
+            generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
+
+            # Free memory before loading eager for quality check
+            del model
+            del tokenizer
+            gc.collect()
+
+            self.assertTrue(
+                check_causal_lm_output_quality(
+                    "microsoft/Phi-4-mini-instruct",
+                    generated_tokens,
+                )
+            )
+
+    @slow
+    @pytest.mark.run_slow
+    @pytest.mark.skipif(
+        parse(torchao.__version__) < parse("0.11.0.dev0"),
+        reason="Only available on torchao >= 0.11.0.dev0",
+    )
+    def test_phi4_text_generation_with_quantized_ckp(self):
+        model_id = "pytorch/Phi-4-mini-instruct-8da4w"
+        config = AutoConfig.from_pretrained(model_id)
+        # NOTE: To make the model exportable we need to set the rope scaling to default to avoid hitting
+        # the data-dependent control flow in _longrope_frequency_update. Alternatively, we can rewrite
+        # that function to avoid the data-dependent control flow.
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            config.rope_scaling["type"] = "default"
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_id,
+            recipe="xnnpack",
+            config=config,
+            export=True,
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt="My favourite condiment is ",
+            max_seq_len=64,
+        )
+        logging.info(f"\nGenerated text:\n\t{generated_text}")
+
+        if not is_ci:
+            generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
+
+            # Free memory before loading eager for quality check
+            del model
+            del tokenizer
+            gc.collect()
+
+            self.assertTrue(
+                check_causal_lm_output_quality(
+                    "microsoft/Phi-4-mini-instruct",
+                    generated_tokens,
+                )
+            )
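
Both new tests are gated by the slow markers and a torchao >= 0.11.0.dev0 check, and the output-quality assertion is skipped on CI runners. Assuming the repository's usual slow-test convention (the `@slow` decorator from `transformers.testing_utils` honors the `RUN_SLOW` environment variable), they can be run locally with something like `RUN_SLOW=1 pytest tests/models/test_modeling_phi4.py -k quantized`.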
