Skip to content

Commit 2c461b4

Browse files
Author: Guang Yang (committed)
Commit message: rebase on gemma3 ci and log pte file size
1 parent: bf5605b · commit: 2c461b4

File tree

5 files changed: +15 additions, −11 deletions

optimum/commands/export/executorch.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,8 @@ def parse_args_executorch(parser):
5858
help="For decoder-only models to use custom sdpa with static kv cache to boost performance. Defaults to False.",
5959
)
6060
required_group.add_argument(
61-
"-q",
62-
"--quantize",
61+
"-qmode",
62+
"--quantization_mode",
6363
required=False,
6464
choices=["8da4w"],
6565
help="Quantization recipe to use. Defaults to None.",
@@ -79,8 +79,8 @@ def run(self):
7979
kwargs = {}
8080
if self.args.use_custom_sdpa:
8181
kwargs["use_custom_sdpa"] = self.args.use_custom_sdpa
82-
if self.args.quantize:
83-
kwargs["quantize"] = self.args.quantize
82+
if self.args.quantization_mode:
83+
kwargs["quantization_mode"] = self.args.quantization_mode
8484

8585
main_export(
8686
model_name_or_path=self.args.model,

optimum/executorch/modeling.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,9 @@ def _from_pretrained(
180180
local_files_only=local_files_only,
181181
)
182182
model = _load_for_executorch(model_cache_path)
183-
logging.info(f"Loaded model from {model_cache_path}")
183+
logging.info(
184+
f"Loaded model from {model_cache_path} ({os.path.getsize(model_cache_path) / (1024 * 1024):.2f} MB)"
185+
)
184186

185187
return {default_file_name.removesuffix(_PTE_SUFFIX): model}
186188

optimum/exporters/executorch/convert.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,6 @@
2626
from .recipe_registry import discover_recipes, recipe_registry
2727

2828

29-
logger = logging.getLogger(__name__)
30-
3129
AttentionInterface.register("custom_sdpa", custom_sdpa_with_start_pos_forward)
3230

3331

@@ -82,6 +80,8 @@ def export_to_executorch(
8280
full_path = os.path.join(f"{output_dir}", f"{name}.pte")
8381
with open(full_path, "wb") as f:
8482
prog.write_to_file(f)
85-
logger.info(f"Saved exported program to {full_path}")
83+
logging.info(
84+
f"Saved exported program to {full_path} ({os.path.getsize(full_path) / (1024 * 1024):.2f} MB)"
85+
)
8686

8787
return executorch_progs

optimum/exporters/executorch/tasks/causal_lm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def load_causal_lm_model(model_name_or_path: str, **kwargs) -> CausalLMExportabl
5757
cache_implementation = kwargs.get("cache_implementation", "static")
5858
max_length = kwargs.get("max_length", 2048)
5959
config = kwargs.get("config", None)
60-
quantization_recipe = kwargs.get("quantize", None)
60+
quantization_mode = kwargs.get("quantization_mode", None)
6161

6262
eager_model = AutoModelForCausalLM.from_pretrained(
6363
model_name_or_path,
@@ -77,7 +77,7 @@ def load_causal_lm_model(model_name_or_path: str, **kwargs) -> CausalLMExportabl
7777
),
7878
)
7979

80-
if quantization_recipe == "8da4w":
80+
if quantization_mode == "8da4w":
8181
if parse(torchao.__version__) < parse("0.11.0.dev0"):
8282
raise RuntimeError("Quantization 8da4w requires torchao >= 0.11.0. Please upgrade torchao.")
8383

tests/models/test_modeling_gemma3.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,9 @@ def test_gemma3_text_generation_with_custom_sdpa_float16(self):
177177
reason="Only available on torchao >= 0.11.0.dev0",
178178
)
179179
def test_gemma3_text_generation_with_custom_sdpa_8da4w(self):
180-
model_id = "google/gemma-3-1b-it"
180+
# TODO: Until https://github.com/huggingface/optimum/issues/2127 is fixed, have to use non-gated model on CI
181+
# model_id = "google/gemma-3-1b-it"
182+
model_id = "unsloth/gemma-3-1b-it"
181183
prompt = "Write a poem about a machine learning."
182184
tokenizer = AutoTokenizer.from_pretrained(model_id)
183185
kwargs = {"quantize": "8da4w"}

Comments (0)