From 091d81c16564458e687164a31448318819a2fc29 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Sun, 8 Mar 2026 21:12:59 +0800 Subject: [PATCH 01/10] Support GLM-Image model quantization Signed-off-by: lvliang-intel --- auto_round/autoround.py | 8 +- auto_round/compressors/mllm/compressor.py | 1 + auto_round/compressors/mllm/template.py | 1 + auto_round/compressors/mllm/utils.py | 1 + auto_round/compressors/shard_writer.py | 6 +- .../export/export_to_autogptq/export.py | 10 +- .../export/export_to_autoround/export.py | 14 +- auto_round/export/utils.py | 127 +++++ auto_round/special_model_handler.py | 33 +- auto_round/utils/common.py | 1 + auto_round/utils/model.py | 141 ++++- test/test_cpu/models/test_glm_image.py | 528 ++++++++++++++++++ 12 files changed, 856 insertions(+), 15 deletions(-) create mode 100644 test/test_cpu/models/test_glm_image.py diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 4c3abe4ba..9888bfe8e 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -161,7 +161,13 @@ def __new__( model_cls = [] - if (extra_config and not extra_config.mllm_config.is_default()) or is_mllm_model(model, platform=platform): + has_multimodal_assets = kwargs.get("processor") is not None or kwargs.get("image_processor") is not None + + if ( + (extra_config and not extra_config.mllm_config.is_default()) + or has_multimodal_assets + or is_mllm_model(model, platform=platform) + ): logger.info("using MLLM mode for multimodal model.") model_cls.append(MLLMCompressor) if extra_config: diff --git a/auto_round/compressors/mllm/compressor.py b/auto_round/compressors/mllm/compressor.py index 165d4f3d3..12061ce10 100644 --- a/auto_round/compressors/mllm/compressor.py +++ b/auto_round/compressors/mllm/compressor.py @@ -493,6 +493,7 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k format=format, inplace=inplace, processor=self.processor, + image_processor=self.image_processor, quant_nontext_module=self.quant_nontext_module if hasattr(self, "quant_nontext_module") else False, **kwargs, ) diff --git a/auto_round/compressors/mllm/template.py b/auto_round/compressors/mllm/template.py index 75190a091..09a315f6f 100644 --- a/auto_round/compressors/mllm/template.py +++ b/auto_round/compressors/mllm/template.py @@ -119,6 +119,7 @@ def _register_template( _register_template("qwen2_vl", default_dataset="NeelNanda/pile-10k", processor=PROCESSORS["qwen2_vl"]) _register_template("qwen2_5_vl", default_dataset="NeelNanda/pile-10k", processor=PROCESSORS["qwen2_vl"]) +_register_template("glm_image", default_dataset="NeelNanda/pile-10k", processor=PROCESSORS["hf"]) _register_template("mllama", default_dataset="liuhaotian/llava", processor=PROCESSORS["hf"]) _register_template("deepseek_vl_v2", default_dataset="NeelNanda/pile-10k", processor=PROCESSORS["deepseek_v2"]) _register_template("mistral3", default_dataset="NeelNanda/pile-10k", processor=PROCESSORS["hf"]) diff --git a/auto_round/compressors/mllm/utils.py b/auto_round/compressors/mllm/utils.py index e8535666c..547c0503d 100644 --- a/auto_round/compressors/mllm/utils.py +++ b/auto_round/compressors/mllm/utils.py @@ -27,6 +27,7 @@ "audio", "talker", "token2wav", + "vqmodel", "multi_modal_projector", "vision_tower", "multimodal_projector", diff --git a/auto_round/compressors/shard_writer.py b/auto_round/compressors/shard_writer.py index 39964319a..dc81f2959 100644 --- a/auto_round/compressors/shard_writer.py +++ b/auto_round/compressors/shard_writer.py @@ -60,7 +60,11 @@ def __init__(self, rounder):
self.total_param_size_bytes = 0 # Directory Setup - self.output_dir = os.path.join(rounder._get_save_folder_name(rounder.formats[0]), "") + base_dir = rounder._get_save_folder_name(rounder.formats[0]) + subfolder = getattr(self.model, "_autoround_pipeline_subfolder", None) + if subfolder: + base_dir = os.path.join(base_dir, subfolder) + self.output_dir = os.path.join(base_dir, "") os.makedirs(self.output_dir, exist_ok=True) def _parse_size(self, size_str: str) -> int: diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py index 75e9b0f3d..acfed7772 100644 --- a/auto_round/export/export_to_autogptq/export.py +++ b/auto_round/export/export_to_autogptq/export.py @@ -53,6 +53,7 @@ filter_quantization_config, get_autogptq_packing_qlinear, release_layer_safely, + resolve_pipeline_export_layout, save_model, ) from auto_round.schemes import QuantizationScheme @@ -211,12 +212,17 @@ def save_quantized_as_autogptq( safe_serialization = kwargs.get("safe_serialization", True) # --- Save metadata (tokenizer, processor, etc.) --- + processor_output_dir = output_dir + model_output_dir = output_dir + if output_dir: + model_output_dir, processor_output_dir, _ = resolve_pipeline_export_layout(model, output_dir) + if output_dir: # if os.path.exists(output_dir): # logger.info(f"{output_dir} already exists, may cause overwrite conflicts.") for comp in (tokenizer, processor, image_processor): if comp is not None and hasattr(comp, "save_pretrained"): - comp.save_pretrained(output_dir) + comp.save_pretrained(processor_output_dir) # --- Handle quantization structure --- all_blocks = quant_block_list @@ -319,6 +325,6 @@ def wrapper(name): dtype = torch.float16 ##force dtype to fp16 save_model( - model, output_dir, safe_serialization=safe_serialization, dtype=dtype, config_file="quantize_config.json" + model, model_output_dir, safe_serialization=safe_serialization, dtype=dtype, config_file="quantize_config.json" ) return model diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index c57bf452c..467deab2c 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -35,6 +35,7 @@ filter_quantization_config, get_autogptq_packing_qlinear, release_layer_safely, + resolve_pipeline_export_layout, save_model, ) from auto_round.formats import AutoRoundExportFormat @@ -334,19 +335,24 @@ def wrapper(name): return model # if os.path.exists(output_dir): # logger.info(f"{output_dir} already exists, this may cause model conflict") + model_output_dir = output_dir + processor_output_dir = output_dir + if output_dir: + model_output_dir, processor_output_dir, _ = resolve_pipeline_export_layout(model, output_dir) + if tokenizer is not None and hasattr(tokenizer, "save_pretrained"): - tokenizer.save_pretrained(output_dir) + tokenizer.save_pretrained(processor_output_dir) if processor is not None: - processor.save_pretrained(output_dir) + processor.save_pretrained(processor_output_dir) if image_processor is not None: - image_processor.save_pretrained(output_dir) + image_processor.save_pretrained(processor_output_dir) if quantization_config.get("act_bits", 16) <= 8: dtype = torch.bfloat16 elif "awq" in quantization_config.get("packing_format", "auto_round:auto_gptq"): dtype = torch.float16 ## awq kernel only supports float16 on cuda else: dtype = None - save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype) + save_model(model, model_output_dir, 
safe_serialization=safe_serialization, dtype=dtype) return model diff --git a/auto_round/export/utils.py b/auto_round/export/utils.py index 42ae86d5f..5581fc46c 100644 --- a/auto_round/export/utils.py +++ b/auto_round/export/utils.py @@ -13,12 +13,139 @@ # limitations under the License. import json import os +import shutil import torch.nn as nn from auto_round.utils import copy_python_files_from_model_cache, logger, unsupported_meta_device +def is_local_pipeline_model_dir(model_dir: str) -> bool: + if not model_dir or not os.path.isdir(model_dir): + return False + return os.path.isfile(os.path.join(model_dir, "model_index.json")) + + +def is_remote_pipeline_model_dir(model_dir: str) -> bool: + if not model_dir or os.path.isdir(model_dir): + return False + try: + from huggingface_hub import list_repo_files + + return "model_index.json" in list_repo_files(model_dir) + except Exception: + return False + + +def is_pipeline_model_dir(model_dir: str) -> bool: + return is_local_pipeline_model_dir(model_dir) or is_remote_pipeline_model_dir(model_dir) + + +def _resolve_pipeline_source_dir(model: nn.Module) -> str | None: + candidates = [ + getattr(model, "name_or_path", None), + getattr(getattr(model, "config", None), "_name_or_path", None), + getattr(getattr(model, "config", None), "name_or_path", None), + ] + for candidate in candidates: + if isinstance(candidate, str) and is_pipeline_model_dir(candidate): + return candidate + return None + + +def _copy_pipeline_artifact(model_dir: str, relative_path: str, output_dir: str) -> None: + target_path = os.path.join(output_dir, relative_path) + os.makedirs(os.path.dirname(target_path), exist_ok=True) + if is_local_pipeline_model_dir(model_dir): + source_path = os.path.join(model_dir, relative_path) + else: + from huggingface_hub import hf_hub_download + + source_path = hf_hub_download(model_dir, relative_path) + shutil.copy2(source_path, target_path) + + +def _copy_pipeline_artifacts(source_dir: str, output_dir: str, exclude_components: set[str] | None = None): + exclude_components = exclude_components or set() + os.makedirs(output_dir, exist_ok=True) + + model_index_path = os.path.join(source_dir, "model_index.json") if is_local_pipeline_model_dir(source_dir) else None + if model_index_path: + with open(model_index_path, "r", encoding="utf-8") as f: + model_index = json.load(f) + else: + from huggingface_hub import hf_hub_download, list_repo_files + + with open(hf_hub_download(source_dir, "model_index.json"), "r", encoding="utf-8") as f: + model_index = json.load(f) + + component_dirs = [k for k, v in model_index.items() if not k.startswith("_") and isinstance(v, list)] + is_local = is_local_pipeline_model_dir(source_dir) + + # Copy root-level files + if is_local: + for name in os.listdir(source_dir): + src = os.path.join(source_dir, name) + if os.path.isfile(src) and ( + name in ("model_index.json", ".gitattributes") or name.lower().startswith(("readme", "license")) + ): + shutil.copy2(src, os.path.join(output_dir, name)) + else: + all_files = list(list_repo_files(source_dir)) + for name in all_files: + if "/" not in name and ( + name in ("model_index.json", ".gitattributes") or name.lower().startswith(("readme", "license")) + ): + _copy_pipeline_artifact(source_dir, name, output_dir) + + # Copy component directories + for component_name in component_dirs: + if component_name in exclude_components: + continue + if is_local: + src = os.path.join(source_dir, component_name) + dst = os.path.join(output_dir, component_name) + if os.path.isdir(src): + 
shutil.copytree(src, dst, dirs_exist_ok=True) + else: + prefix = f"{component_name}/" + for f in all_files: + if f.startswith(prefix): + _copy_pipeline_artifact(source_dir, f, output_dir) + + +def resolve_pipeline_export_layout(model: nn.Module, output_dir: str) -> tuple[str, str, bool]: + model_component = getattr(model, "_autoround_pipeline_subfolder", None) + if model_component is None: + return output_dir, output_dir, False + + source_dir = _resolve_pipeline_source_dir(model) + processor_component = None + if source_dir is not None: + try: + model_index_path = os.path.join(source_dir, "model_index.json") if is_local_pipeline_model_dir(source_dir) else None + if model_index_path: + with open(model_index_path, "r", encoding="utf-8") as f: + model_index = json.load(f) + else: + from huggingface_hub import hf_hub_download + + with open(hf_hub_download(source_dir, "model_index.json"), "r", encoding="utf-8") as f: + model_index = json.load(f) + if "processor" in model_index and isinstance(model_index["processor"], list): + processor_component = "processor" + excluded = {model_component} + if processor_component: + excluded.add(processor_component) + _copy_pipeline_artifacts(source_dir, output_dir, exclude_components=excluded) + except Exception as e: + logger.warning("Failed to copy pipeline artifacts from %s: %s", source_dir, e) + + model_output_dir = os.path.join(output_dir, model_component) + processor_output_dir = os.path.join(output_dir, processor_component) if processor_component else output_dir + return model_output_dir, processor_output_dir, True + + def save_model( model: nn.Module, save_dir: str, diff --git a/auto_round/special_model_handler.py b/auto_round/special_model_handler.py index f051b9673..695b43f51 100644 --- a/auto_round/special_model_handler.py +++ b/auto_round/special_model_handler.py @@ -21,7 +21,7 @@ from auto_round.modeling.fused_moe.replace_modules import apply_replacements, release_original_module_ from auto_round.utils import is_moe_model_via_config, logger -mllms_with_limited_bs = ("llava", "qwen2_vl", "phi3_v", "mllama") # Limitations on batch_size +mllms_with_limited_bs = ("llava", "qwen2_vl", "phi3_v", "mllama", "glm_image") # Limitations on batch_size SUPPORT_ONLY_TEXT_MODELS = [ "phi3_v", @@ -35,6 +35,7 @@ "llama4", "internvl_chat", "glm4v_moe", + "glm_image", "qwen3_vl_moe", "gemma3", ] @@ -80,7 +81,35 @@ def _get_deepseek_vl2_multimodal_block(model, quant_vision=False): return block_names -SPECIAL_MULTIMODAL_BLOCK = {"deepseek_vl_v2": _get_deepseek_vl2_multimodal_block} +def _get_glm_image_multimodal_block(model, quant_vision=False): + """Get block names for GLM-Image AR model. + + GLM-Image AR model structure: + - model.visual.blocks: vision encoder + - model.language_model.layers: autoregressive text backbone + + By default, only text backbone is quantized. Set quant_vision=True to include + the visual encoder blocks. 
+ """ + block_names = [] + + if quant_vision and hasattr(model, "model") and hasattr(model.model, "visual"): + if hasattr(model.model.visual, "blocks"): + block_names.append([f"model.visual.blocks.{i}" for i in range(len(model.model.visual.blocks))]) + + if hasattr(model, "model") and hasattr(model.model, "language_model"): + if hasattr(model.model.language_model, "layers"): + block_names.append( + [f"model.language_model.layers.{i}" for i in range(len(model.model.language_model.layers))] + ) + + return block_names + + +SPECIAL_MULTIMODAL_BLOCK = { + "deepseek_vl_v2": _get_deepseek_vl2_multimodal_block, + "glm_image": _get_glm_image_multimodal_block, +} def _deepseek_vl2_forward( diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py index c494c2959..5981e9de7 100644 --- a/auto_round/utils/common.py +++ b/auto_round/utils/common.py @@ -193,6 +193,7 @@ def __getitem__(self, key): "audio", "talker", "token2wav", + "vqmodel", "vision_model", "audio_tower", "vision_encoder", diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index f17398fcc..de561b106 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -339,6 +339,97 @@ def llm_load_model( return model, tokenizer +def _find_pipeline_model_subfolder_local(model_dir: str) -> tuple: + """Find model/processor subfolders from a local pipeline directory with model_index.json. + + Scans component subdirectories to find the one whose config.json has 'architectures', + and looks for a 'processor' component. + + Returns: + (model_subfolder, processor_subfolder, config_dict) + """ + index_path = os.path.join(model_dir, "model_index.json") + if not os.path.exists(index_path): + raise FileNotFoundError(f"No config.json or model_index.json found under {model_dir}") + + with open(index_path, "r", encoding="utf-8") as f: + model_index = json.load(f) + + processor_subfolder = None + for name, value in model_index.items(): + if name == "processor" and isinstance(value, list): + processor_subfolder = "processor" + break + + candidates = [] + for name, value in model_index.items(): + if name.startswith("_") or not isinstance(value, list) or len(value) < 2: + continue + comp_config_path = os.path.join(model_dir, name, "config.json") + if not os.path.isfile(comp_config_path): + continue + with open(comp_config_path, "r", encoding="utf-8") as f: + comp_config = json.load(f) + if "architectures" in comp_config: + candidates.append((name, comp_config)) + + if not candidates: + raise FileNotFoundError( + f"model_index.json found in {model_dir} but no component with 'architectures' in its config.json" + ) + + for name, comp_config in candidates: + arch = comp_config["architectures"][0] + if "CausalLM" in arch or "ConditionalGeneration" in arch: + return name, processor_subfolder, comp_config + + return candidates[0][0], processor_subfolder, candidates[0][1] + + +def _find_pipeline_model_subfolder_remote(repo_id: str, file_list: list) -> tuple: + """Find model/processor subfolders from a remote HF repo with model_index.json. 
+ + Returns: + (model_subfolder, processor_subfolder, config_dict) + """ + from huggingface_hub import hf_hub_download + + index_path = hf_hub_download(repo_id, "model_index.json") + with open(index_path, "r", encoding="utf-8") as f: + model_index = json.load(f) + + processor_subfolder = None + for name, value in model_index.items(): + if name == "processor" and isinstance(value, list): + processor_subfolder = "processor" + break + + candidates = [] + for name, value in model_index.items(): + if name.startswith("_") or not isinstance(value, list) or len(value) < 2: + continue + comp_config_file = f"{name}/config.json" + if comp_config_file not in file_list: + continue + comp_config_path = hf_hub_download(repo_id, comp_config_file) + with open(comp_config_path, "r", encoding="utf-8") as f: + comp_config = json.load(f) + if "architectures" in comp_config: + candidates.append((name, comp_config)) + + if not candidates: + raise FileNotFoundError( + f"model_index.json found for {repo_id} but no component with 'architectures' in its config.json" + ) + + for name, comp_config in candidates: + arch = comp_config["architectures"][0] + if "CausalLM" in arch or "ConditionalGeneration" in arch: + return name, processor_subfolder, comp_config + + return candidates[0][0], processor_subfolder, candidates[0][1] + + def mllm_load_model( pretrained_model_name_or_path: str, platform: str = "hf", @@ -377,17 +468,29 @@ def mllm_load_model( torch_dtype = "auto" if device_str is not None and "hpu" in device_str: torch_dtype = torch.bfloat16 + model_subfolder = None + processor_subfolder = None if os.path.isdir(pretrained_model_name_or_path): - config = json.load(open(os.path.join(pretrained_model_name_or_path, "config.json"))) + config_path = os.path.join(pretrained_model_name_or_path, "config.json") + if os.path.exists(config_path): + with open(config_path, "r", encoding="utf-8") as f: + config = json.load(f) + else: + model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder_local( + pretrained_model_name_or_path + ) else: from huggingface_hub import hf_hub_download, list_repo_files file_list = list_repo_files(pretrained_model_name_or_path) if "config.json" in file_list: - # Load plain JSON config_path = hf_hub_download(pretrained_model_name_or_path, "config.json") with open(config_path, "r", encoding="utf-8") as f: config = json.load(f) + elif "model_index.json" in file_list: + model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder_remote( + pretrained_model_name_or_path, file_list + ) elif "config.json.gz" in file_list: # Load gzipped JSON import gzip @@ -436,20 +539,28 @@ def mllm_load_model( else: cls = AutoModelForCausalLM try: + model_load_kwargs = {} + if model_subfolder is not None: + model_load_kwargs["subfolder"] = model_subfolder model = cls.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map="auto" if use_auto_mapping else None, + **model_load_kwargs, ) except ValueError as e: if "FP8 quantized" in str(e): with override_cuda_device_capability(): + model_load_kwargs = {} + if model_subfolder is not None: + model_load_kwargs["subfolder"] = model_subfolder model = cls.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map="auto" if use_auto_mapping else None, + **model_load_kwargs, ) logger.warning("the support for fp8 model as input is experimental, please use with caution.") else: @@ -463,11 +574,18 @@ def mllm_load_model( 
else: tokenizer = MistralTokenizer.from_hf_hub(pretrained_model_name_or_path) else: + processor_load_kwargs = {} + if processor_subfolder is not None: + processor_load_kwargs["subfolder"] = processor_subfolder tokenizer = AutoTokenizer.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **processor_load_kwargs, ) processor = AutoProcessor.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **processor_load_kwargs, ) try: if platform == "model_scope": @@ -475,17 +593,30 @@ def mllm_load_model( else: from transformers import AutoImageProcessor + image_processor_load_kwargs = {} + if processor_subfolder is not None: + image_processor_load_kwargs["subfolder"] = processor_subfolder image_processor = AutoImageProcessor.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **image_processor_load_kwargs, ) except Exception as e: pass + if model_type == "glm_image" and image_processor is not None: + from transformers.models.glm_image.processing_glm_image import GlmImageProcessor + + processor = GlmImageProcessor(image_processor=image_processor, tokenizer=tokenizer) + model = model.eval() check_and_mark_quantized_module(model) handle_generation_config(model) model = _to_model_dtype(model, model_dtype) + if model_subfolder is not None: + model._autoround_pipeline_subfolder = model_subfolder + return model, processor, tokenizer, image_processor diff --git a/test/test_cpu/models/test_glm_image.py b/test/test_cpu/models/test_glm_image.py new file mode 100644 index 000000000..af0e3f12f --- /dev/null +++ b/test/test_cpu/models/test_glm_image.py @@ -0,0 +1,528 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for GLM-Image quantize and inference helpers. + +These tests are purely local and do not load model weights from disk or +download anything. A fake model hierarchy built with ``torch.nn.Module`` +and ``types.SimpleNamespace`` is used to exercise the logic under test. +""" + +import json +import os +import types + +import pytest +import torch.nn as nn + +from auto_round.special_model_handler import _get_glm_image_multimodal_block +from auto_round.utils.model import _find_pipeline_model_subfolder_local + + +# --------------------------------------------------------------------------- +# Helpers – fake model hierarchy +# --------------------------------------------------------------------------- + +def _make_glm_image_model(n_vision_blocks: int = 4, n_lm_layers: int = 28): + """Return a minimal fake GlmImageForConditionalGeneration-like model. 
+ + Structure mirrors the real model:: + + model + ├── visual + │ └── blocks: ModuleList[n_vision_blocks] + └── language_model + └── layers: ModuleList[n_lm_layers] + """ + + class _Blocks(nn.ModuleList): + pass + + class _Visual(nn.Module): + def __init__(self): + super().__init__() + self.blocks = _Blocks([nn.Linear(8, 8) for _ in range(n_vision_blocks)]) + + class _LM(nn.Module): + def __init__(self): + super().__init__() + self.layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(n_lm_layers)]) + + class _Inner(nn.Module): + def __init__(self): + super().__init__() + self.visual = _Visual() + self.language_model = _LM() + + class _GlmImageModel(nn.Module): + def __init__(self): + super().__init__() + self.model = _Inner() + + return _GlmImageModel() + + +# --------------------------------------------------------------------------- +# Tests for _get_glm_image_multimodal_block +# --------------------------------------------------------------------------- + +class TestGetGlmImageMultimodalBlock: + """Unit tests for the GLM-Image block-name discovery helper.""" + + def test_text_only_returns_one_block_group(self): + """Default (quant_vision=False): only language_model layers are returned.""" + model = _make_glm_image_model(n_vision_blocks=4, n_lm_layers=28) + block_names = _get_glm_image_multimodal_block(model, quant_vision=False) + + assert len(block_names) == 1, "Expected exactly one block group (LM layers only)" + expected = [f"model.language_model.layers.{i}" for i in range(28)] + assert block_names[0] == expected + + def test_quant_vision_true_returns_two_block_groups(self): + """quant_vision=True: visual encoder blocks prepended before LM layers.""" + model = _make_glm_image_model(n_vision_blocks=4, n_lm_layers=28) + block_names = _get_glm_image_multimodal_block(model, quant_vision=True) + + assert len(block_names) == 2, "Expected two block groups: visual + LM" + expected_visual = [f"model.visual.blocks.{i}" for i in range(4)] + expected_lm = [f"model.language_model.layers.{i}" for i in range(28)] + assert block_names[0] == expected_visual + assert block_names[1] == expected_lm + + def test_quant_vision_false_ignores_visual_blocks(self): + """quant_vision=False must not include visual blocks even if they exist.""" + model = _make_glm_image_model(n_vision_blocks=8, n_lm_layers=10) + block_names = _get_glm_image_multimodal_block(model, quant_vision=False) + + flat = [name for group in block_names for name in group] + assert not any("visual" in name for name in flat), ( + "visual blocks must be excluded when quant_vision=False" + ) + + def test_missing_language_model_returns_empty(self): + """If the model has no language_model attribute, result is empty.""" + + class _NoLM(nn.Module): + def __init__(self): + super().__init__() + self.model = nn.Module() # no visual, no language_model + + block_names = _get_glm_image_multimodal_block(_NoLM(), quant_vision=False) + assert block_names == [] + + def test_missing_visual_blocks_with_quant_vision(self): + """quant_vision=True but visual.blocks missing: only LM layers returned.""" + + class _NoVisualBlocks(nn.Module): + def __init__(self): + super().__init__() + self.model = types.SimpleNamespace( + language_model=types.SimpleNamespace( + layers=nn.ModuleList([nn.Linear(8, 8) for _ in range(6)]) + ) + # no .visual attribute + ) + + block_names = _get_glm_image_multimodal_block(_NoVisualBlocks(), quant_vision=True) + assert len(block_names) == 1 + assert block_names[0] == [f"model.language_model.layers.{i}" for i in range(6)] + + def 
test_block_count_matches_actual_module_list_length(self): + """Block name count must equal the actual ModuleList size.""" + n_lm = 32 + model = _make_glm_image_model(n_vision_blocks=0, n_lm_layers=n_lm) + block_names = _get_glm_image_multimodal_block(model, quant_vision=False) + + assert len(block_names) == 1 + assert len(block_names[0]) == n_lm + + +# --------------------------------------------------------------------------- +# Helpers – temp filesystem for pipeline loading tests +# --------------------------------------------------------------------------- + +def _make_pipeline_dir(tmp_path, components, has_processor=True): + """Write a minimal diffusers-style pipeline directory. + + Args: + tmp_path: pytest tmp_path fixture directory. + components: dict mapping component_name → dict to write as config.json. + has_processor: if True, add a ``processor`` entry to model_index.json. + """ + model_index = {"_class_name": "GlmImagePipeline", "_diffusers_version": "0.0.1"} + if has_processor: + model_index["processor"] = ["transformers", "GlmImageProcessor"] + + for name, cfg in components.items(): + comp_dir = tmp_path / name + comp_dir.mkdir(parents=True) + (comp_dir / "config.json").write_text(json.dumps(cfg), encoding="utf-8") + model_index[name] = ["transformers", cfg.get("architectures", ["Unknown"])[0]] + + (tmp_path / "model_index.json").write_text(json.dumps(model_index), encoding="utf-8") + return str(tmp_path) + + +# --------------------------------------------------------------------------- +# Tests for _find_pipeline_model_subfolder_local +# --------------------------------------------------------------------------- + +class TestFindPipelineModelSubfolderLocal: + """Unit tests for the local pipeline subfolder discovery helper.""" + + def test_finds_vision_language_encoder_subfolder(self, tmp_path): + """The component containing GlmImageForConditionalGeneration is returned.""" + pipeline_dir = _make_pipeline_dir( + tmp_path, + { + "vision_language_encoder": { + "architectures": ["GlmImageForConditionalGeneration"], + "model_type": "glm_image", + }, + "vae": {"model_type": "autoencoder_kl"}, # no architectures → ignored + }, + ) + model_subfolder, processor_subfolder, cfg = _find_pipeline_model_subfolder_local(pipeline_dir) + + assert model_subfolder == "vision_language_encoder" + assert processor_subfolder == "processor" + assert cfg["architectures"][0] == "GlmImageForConditionalGeneration" + + def test_prefers_conditional_generation_over_encoder(self, tmp_path): + """ConditionalGeneration architecture is preferred over plain encoder.""" + pipeline_dir = _make_pipeline_dir( + tmp_path, + { + "text_encoder": {"architectures": ["T5EncoderModel"]}, + "vision_language_encoder": { + "architectures": ["GlmImageForConditionalGeneration"], + "model_type": "glm_image", + }, + }, + has_processor=False, + ) + model_subfolder, processor_subfolder, cfg = _find_pipeline_model_subfolder_local(pipeline_dir) + + assert model_subfolder == "vision_language_encoder" + assert processor_subfolder is None # no processor entry + + def test_no_processor_returns_none(self, tmp_path): + """When model_index.json has no 'processor' key, processor_subfolder is None.""" + pipeline_dir = _make_pipeline_dir( + tmp_path, + {"vision_language_encoder": {"architectures": ["GlmImageForConditionalGeneration"]}}, + has_processor=False, + ) + _, processor_subfolder, _ = _find_pipeline_model_subfolder_local(pipeline_dir) + assert processor_subfolder is None + + def test_with_processor_returns_processor_subfolder(self, 
tmp_path): + """When model_index.json has a 'processor' key, processor_subfolder=='processor'.""" + pipeline_dir = _make_pipeline_dir( + tmp_path, + {"vision_language_encoder": {"architectures": ["GlmImageForConditionalGeneration"]}}, + has_processor=True, + ) + _, processor_subfolder, _ = _find_pipeline_model_subfolder_local(pipeline_dir) + assert processor_subfolder == "processor" + + def test_raises_when_no_model_index(self, tmp_path): + """FileNotFoundError raised when neither config.json nor model_index.json exists.""" + with pytest.raises(FileNotFoundError, match="model_index.json"): + _find_pipeline_model_subfolder_local(str(tmp_path)) + + def test_raises_when_no_component_has_architectures(self, tmp_path): + """FileNotFoundError raised when no component config contains 'architectures'.""" + pipeline_dir = _make_pipeline_dir( + tmp_path, + { + "vae": {"model_type": "autoencoder_kl"}, + "scheduler": {}, + }, + ) + with pytest.raises(FileNotFoundError, match="architectures"): + _find_pipeline_model_subfolder_local(pipeline_dir) + + def test_falls_back_to_first_candidate_when_no_preferred_arch(self, tmp_path): + """When no ConditionalGeneration/CausalLM arch exists, first candidate is used.""" + pipeline_dir = _make_pipeline_dir( + tmp_path, + { + "text_encoder": {"architectures": ["T5EncoderModel"]}, + "image_encoder": {"architectures": ["CLIPVisionModel"]}, + }, + has_processor=False, + ) + model_subfolder, _, cfg = _find_pipeline_model_subfolder_local(pipeline_dir) + # Must be one of the candidates, not crash + assert model_subfolder in ("text_encoder", "image_encoder") + assert "architectures" in cfg + + +# --------------------------------------------------------------------------- +# Tests for GlmImageProcessor construction path +# --------------------------------------------------------------------------- + +class TestGlmImageProcessorConstruction: + """Unit-test the GlmImageProcessor assembly logic in mllm_load_model. + + Without loading full model weights we directly exercise the branching + code that wraps image_processor + tokenizer into GlmImageProcessor when + model_type == "glm_image". GlmImageProcessor itself is patched so the + test does not depend on transformers' internal input validation. 
+ """ + + @pytest.fixture() + def mock_components(self): + """Return minimal fake tokenizer and image_processor objects.""" + tokenizer = types.SimpleNamespace(pad_token_id=0, eos_token_id=2) + image_processor = types.SimpleNamespace(size={"height": 448, "width": 448}) + return tokenizer, image_processor + + def test_glm_image_processor_wraps_components(self, mock_components): + """GlmImageProcessor must be called with image_processor= and tokenizer=.""" + from unittest.mock import MagicMock, patch + + tokenizer, image_processor = mock_components + fake_processor = object() + mock_cls = MagicMock(return_value=fake_processor) + + # Patch away the real GlmImageProcessor so we only test the branch logic + with patch.dict("sys.modules", {"transformers.models.glm_image.processing_glm_image": types.ModuleType("_fake")}): + import sys + + sys.modules["transformers.models.glm_image.processing_glm_image"].GlmImageProcessor = mock_cls + + model_type = "glm_image" + processor = None + if model_type == "glm_image" and image_processor is not None: + from transformers.models.glm_image.processing_glm_image import GlmImageProcessor + + processor = GlmImageProcessor(image_processor=image_processor, tokenizer=tokenizer) + + mock_cls.assert_called_once_with(image_processor=image_processor, tokenizer=tokenizer) + assert processor is fake_processor + + def test_non_glm_image_model_type_skips_wrapping(self, mock_components): + """For any other model_type, the GlmImageProcessor wrapping is not applied.""" + tokenizer, image_processor = mock_components + + model_type = "qwen2_vl" + processor = None # simulate AutoProcessor result already in place + if model_type == "glm_image" and image_processor is not None: + processor = object() # should never be reached + + assert processor is None # wrapping must NOT happen + + def test_skipped_when_image_processor_is_none(self, mock_components): + """image_processor=None prevents GlmImageProcessor from being built.""" + tokenizer, _ = mock_components + + model_type = "glm_image" + image_processor = None + processor = None + if model_type == "glm_image" and image_processor is not None: + processor = object() # must not be reached + + assert processor is None + + +# --------------------------------------------------------------------------- +# Helpers – minimal PIL Image factory (no file I/O) +# --------------------------------------------------------------------------- + +def _make_rgb_image(width: int = 64, height: int = 64): + """Return a tiny solid-colour PIL Image in RGB mode.""" + from PIL import Image + + return Image.new("RGB", (width, height), color=(128, 64, 32)) + + +# --------------------------------------------------------------------------- +# Tests for image-to-image inference call logic (run_glm_image.py) +# --------------------------------------------------------------------------- + +class TestGlmImageI2ICallLogic: + """Unit tests for the image-to-image pipeline invocation logic. + + The pattern under test mirrors run_glm_image.main():: + + condition_images = [load_image(p) for p in args.reference_image] or None + result = pipe(prompt=..., image=condition_images, height=..., width=..., ...) + + No real pipeline or model weights are required. 
+ """ + + def test_no_reference_images_passes_none_to_pipeline(self): + """Empty reference_image list must yield image=None (text-to-image mode).""" + from unittest.mock import MagicMock + + reference_image_paths = [] # T2I: no reference images provided + condition_images = [_make_rgb_image() for _ in reference_image_paths] or None + + pipe = MagicMock() + pipe.return_value = MagicMock(images=[_make_rgb_image()]) + pipe(prompt="a fox", image=condition_images, height=1024, width=1024) + + _, kwargs = pipe.call_args + assert kwargs["image"] is None, "T2I: image kwarg must be None" + + def test_single_reference_image_passed_as_list(self): + """Single reference image must be wrapped in a list (not passed bare).""" + from unittest.mock import MagicMock + + ref_img = _make_rgb_image() + reference_image_paths = ["dummy_path.jpg"] + # Simulate load_image returning ref_img for each path + condition_images = [ref_img for _ in reference_image_paths] or None + + pipe = MagicMock() + pipe.return_value = MagicMock(images=[_make_rgb_image()]) + pipe(prompt="edit the sky", image=condition_images, height=33 * 32, width=32 * 32) + + _, kwargs = pipe.call_args + assert isinstance(kwargs["image"], list), "I2I: image must be a list" + assert len(kwargs["image"]) == 1 + assert kwargs["image"][0] is ref_img + + def test_multi_image_list_preserved(self): + """Multiple reference images must all be forwarded as a list.""" + from unittest.mock import MagicMock + + imgs = [_make_rgb_image() for _ in range(3)] + condition_images = imgs or None # non-empty list stays as-is + + pipe = MagicMock() + pipe.return_value = MagicMock(images=[_make_rgb_image()]) + pipe(prompt="merge subjects", image=condition_images, height=32 * 32, width=32 * 32) + + _, kwargs = pipe.call_args + assert kwargs["image"] == imgs + assert len(kwargs["image"]) == 3 + + def test_height_width_not_divisible_by_32_raises(self): + """run_glm_image.main() raises ValueError when dimensions are not multiples of 32.""" + height, width = 33 * 32 + 1, 32 * 32 # 1057 is not divisible by 32 + + with pytest.raises(ValueError, match="divisible by 32"): + if height % 32 != 0 or width % 32 != 0: + raise ValueError("GLM-Image requires height and width to be divisible by 32.") + + def test_height_width_divisible_by_32_passes(self): + """Dimensions that are multiples of 32 must not raise.""" + for height, width in [(33 * 32, 32 * 32), (1024, 768), (32, 32)]: + # Should not raise + if height % 32 != 0 or width % 32 != 0: + raise AssertionError(f"Unexpected non-multiple: {height}x{width}") + + def test_i2i_prompt_forwarded_correctly(self): + """The prompt string must be forwarded verbatim to the pipeline call.""" + from unittest.mock import MagicMock + + prompt = "Replace the background with an underground station." + ref_img = _make_rgb_image() + condition_images = [ref_img] + + pipe = MagicMock() + pipe.return_value = MagicMock(images=[_make_rgb_image()]) + pipe(prompt=prompt, image=condition_images, height=33 * 32, width=32 * 32) + + _, kwargs = pipe.call_args + assert kwargs["prompt"] == prompt + + +# --------------------------------------------------------------------------- +# Tests for load_image helper (run_glm_image.py) +# --------------------------------------------------------------------------- + +class TestLoadImage: + """Unit tests for the load_image() helper in run_glm_image. + + Covers local file loading and the URL-vs-path dispatch logic without + making any real network requests. 
+ """ + + @pytest.fixture(autouse=True) + def _import_load_image(self): + """Import load_image from run_glm_image into the test namespace.""" + import importlib + import sys + + # Ensure the workspace root is on sys.path so run_glm_image can be imported + root = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ) + if root not in sys.path: + sys.path.insert(0, root) + mod = importlib.import_module("run_glm_image") + self.load_image = mod.load_image + + def test_load_local_rgb_image(self, tmp_path): + """load_image() opens a local file and returns an RGB PIL Image.""" + from PIL import Image + + img = Image.new("RGBA", (32, 32), color=(10, 20, 30, 255)) + img_path = str(tmp_path / "test.png") + img.save(img_path) + + result = self.load_image(img_path) + + assert isinstance(result, Image.Image) + assert result.mode == "RGB" + assert result.size == (32, 32) + + def test_load_image_converts_rgba_to_rgb(self, tmp_path): + """RGBA images saved locally must be converted to RGB.""" + from PIL import Image + + img = Image.new("RGBA", (16, 16), color=(255, 0, 0, 128)) + img_path = str(tmp_path / "rgba.png") + img.save(img_path) + + result = self.load_image(img_path) + assert result.mode == "RGB" + + def test_url_branch_calls_requests_get(self): + """http/https paths must use requests.get, not PIL.Image.open directly.""" + from unittest.mock import MagicMock, patch + from io import BytesIO + from PIL import Image + + fake_img = Image.new("RGB", (8, 8), color=(0, 128, 255)) + buf = BytesIO() + fake_img.save(buf, format="PNG") + buf.seek(0) + + mock_response = MagicMock() + mock_response.raw = buf + + with patch("requests.get", return_value=mock_response) as mock_get: + result = self.load_image("https://example.com/image.png") + + mock_get.assert_called_once_with("https://example.com/image.png", timeout=60) + assert isinstance(result, Image.Image) + assert result.mode == "RGB" + + def test_local_path_does_not_call_requests(self, tmp_path): + """Local file paths must not trigger requests.get.""" + from unittest.mock import patch + from PIL import Image + + img = Image.new("RGB", (4, 4)) + img_path = str(tmp_path / "local.png") + img.save(img_path) + + with patch("requests.get") as mock_get: + self.load_image(img_path) + + mock_get.assert_not_called() From b367bf35c60db4bdd2efdc58300ce081d710c135 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 8 Mar 2026 13:26:44 +0000 Subject: [PATCH 02/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/utils.py | 4 +++- test/test_cpu/models/test_glm_image.py | 29 +++++++++++++++----------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/auto_round/export/utils.py b/auto_round/export/utils.py index 5581fc46c..ccbb94823 100644 --- a/auto_round/export/utils.py +++ b/auto_round/export/utils.py @@ -123,7 +123,9 @@ def resolve_pipeline_export_layout(model: nn.Module, output_dir: str) -> tuple[s processor_component = None if source_dir is not None: try: - model_index_path = os.path.join(source_dir, "model_index.json") if is_local_pipeline_model_dir(source_dir) else None + model_index_path = ( + os.path.join(source_dir, "model_index.json") if is_local_pipeline_model_dir(source_dir) else None + ) if model_index_path: with open(model_index_path, "r", encoding="utf-8") as f: model_index = json.load(f) diff --git a/test/test_cpu/models/test_glm_image.py 
b/test/test_cpu/models/test_glm_image.py index af0e3f12f..d87459cb5 100644 --- a/test/test_cpu/models/test_glm_image.py +++ b/test/test_cpu/models/test_glm_image.py @@ -28,11 +28,11 @@ from auto_round.special_model_handler import _get_glm_image_multimodal_block from auto_round.utils.model import _find_pipeline_model_subfolder_local - # --------------------------------------------------------------------------- # Helpers – fake model hierarchy # --------------------------------------------------------------------------- + def _make_glm_image_model(n_vision_blocks: int = 4, n_lm_layers: int = 28): """Return a minimal fake GlmImageForConditionalGeneration-like model. @@ -76,6 +76,7 @@ def __init__(self): # Tests for _get_glm_image_multimodal_block # --------------------------------------------------------------------------- + class TestGetGlmImageMultimodalBlock: """Unit tests for the GLM-Image block-name discovery helper.""" @@ -105,9 +106,7 @@ def test_quant_vision_false_ignores_visual_blocks(self): block_names = _get_glm_image_multimodal_block(model, quant_vision=False) flat = [name for group in block_names for name in group] - assert not any("visual" in name for name in flat), ( - "visual blocks must be excluded when quant_vision=False" - ) + assert not any("visual" in name for name in flat), "visual blocks must be excluded when quant_vision=False" def test_missing_language_model_returns_empty(self): """If the model has no language_model attribute, result is empty.""" @@ -127,9 +126,7 @@ class _NoVisualBlocks(nn.Module): def __init__(self): super().__init__() self.model = types.SimpleNamespace( - language_model=types.SimpleNamespace( - layers=nn.ModuleList([nn.Linear(8, 8) for _ in range(6)]) - ) + language_model=types.SimpleNamespace(layers=nn.ModuleList([nn.Linear(8, 8) for _ in range(6)])) # no .visual attribute ) @@ -151,6 +148,7 @@ def test_block_count_matches_actual_module_list_length(self): # Helpers – temp filesystem for pipeline loading tests # --------------------------------------------------------------------------- + def _make_pipeline_dir(tmp_path, components, has_processor=True): """Write a minimal diffusers-style pipeline directory. @@ -177,6 +175,7 @@ def _make_pipeline_dir(tmp_path, components, has_processor=True): # Tests for _find_pipeline_model_subfolder_local # --------------------------------------------------------------------------- + class TestFindPipelineModelSubfolderLocal: """Unit tests for the local pipeline subfolder discovery helper.""" @@ -273,6 +272,7 @@ def test_falls_back_to_first_candidate_when_no_preferred_arch(self, tmp_path): # Tests for GlmImageProcessor construction path # --------------------------------------------------------------------------- + class TestGlmImageProcessorConstruction: """Unit-test the GlmImageProcessor assembly logic in mllm_load_model. 
@@ -298,7 +298,9 @@ def test_glm_image_processor_wraps_components(self, mock_components): mock_cls = MagicMock(return_value=fake_processor) # Patch away the real GlmImageProcessor so we only test the branch logic - with patch.dict("sys.modules", {"transformers.models.glm_image.processing_glm_image": types.ModuleType("_fake")}): + with patch.dict( + "sys.modules", {"transformers.models.glm_image.processing_glm_image": types.ModuleType("_fake")} + ): import sys sys.modules["transformers.models.glm_image.processing_glm_image"].GlmImageProcessor = mock_cls @@ -341,6 +343,7 @@ def test_skipped_when_image_processor_is_none(self, mock_components): # Helpers – minimal PIL Image factory (no file I/O) # --------------------------------------------------------------------------- + def _make_rgb_image(width: int = 64, height: int = 64): """Return a tiny solid-colour PIL Image in RGB mode.""" from PIL import Image @@ -352,6 +355,7 @@ def _make_rgb_image(width: int = 64, height: int = 64): # Tests for image-to-image inference call logic (run_glm_image.py) # --------------------------------------------------------------------------- + class TestGlmImageI2ICallLogic: """Unit tests for the image-to-image pipeline invocation logic. @@ -445,6 +449,7 @@ def test_i2i_prompt_forwarded_correctly(self): # Tests for load_image helper (run_glm_image.py) # --------------------------------------------------------------------------- + class TestLoadImage: """Unit tests for the load_image() helper in run_glm_image. @@ -459,9 +464,7 @@ def _import_load_image(self): import sys # Ensure the workspace root is on sys.path so run_glm_image can be imported - root = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - ) + root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) if root not in sys.path: sys.path.insert(0, root) mod = importlib.import_module("run_glm_image") @@ -494,8 +497,9 @@ def test_load_image_converts_rgba_to_rgb(self, tmp_path): def test_url_branch_calls_requests_get(self): """http/https paths must use requests.get, not PIL.Image.open directly.""" - from unittest.mock import MagicMock, patch from io import BytesIO + from unittest.mock import MagicMock, patch + from PIL import Image fake_img = Image.new("RGB", (8, 8), color=(0, 128, 255)) @@ -516,6 +520,7 @@ def test_url_branch_calls_requests_get(self): def test_local_path_does_not_call_requests(self, tmp_path): """Local file paths must not trigger requests.get.""" from unittest.mock import patch + from PIL import Image img = Image.new("RGB", (4, 4)) From e73a31d106ecaa90a86447e79637a245e2dcdd03 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Sun, 8 Mar 2026 21:26:47 +0800 Subject: [PATCH 03/10] fix test script Signed-off-by: lvliang-intel --- test/test_cpu/models/test_glm_image.py | 87 -------------------------- 1 file changed, 87 deletions(-) diff --git a/test/test_cpu/models/test_glm_image.py b/test/test_cpu/models/test_glm_image.py index af0e3f12f..61c805979 100644 --- a/test/test_cpu/models/test_glm_image.py +++ b/test/test_cpu/models/test_glm_image.py @@ -439,90 +439,3 @@ def test_i2i_prompt_forwarded_correctly(self): _, kwargs = pipe.call_args assert kwargs["prompt"] == prompt - - -# --------------------------------------------------------------------------- -# Tests for load_image helper (run_glm_image.py) -# --------------------------------------------------------------------------- - -class TestLoadImage: - """Unit tests for the load_image() helper in 
run_glm_image. - - Covers local file loading and the URL-vs-path dispatch logic without - making any real network requests. - """ - - @pytest.fixture(autouse=True) - def _import_load_image(self): - """Import load_image from run_glm_image into the test namespace.""" - import importlib - import sys - - # Ensure the workspace root is on sys.path so run_glm_image can be imported - root = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - ) - if root not in sys.path: - sys.path.insert(0, root) - mod = importlib.import_module("run_glm_image") - self.load_image = mod.load_image - - def test_load_local_rgb_image(self, tmp_path): - """load_image() opens a local file and returns an RGB PIL Image.""" - from PIL import Image - - img = Image.new("RGBA", (32, 32), color=(10, 20, 30, 255)) - img_path = str(tmp_path / "test.png") - img.save(img_path) - - result = self.load_image(img_path) - - assert isinstance(result, Image.Image) - assert result.mode == "RGB" - assert result.size == (32, 32) - - def test_load_image_converts_rgba_to_rgb(self, tmp_path): - """RGBA images saved locally must be converted to RGB.""" - from PIL import Image - - img = Image.new("RGBA", (16, 16), color=(255, 0, 0, 128)) - img_path = str(tmp_path / "rgba.png") - img.save(img_path) - - result = self.load_image(img_path) - assert result.mode == "RGB" - - def test_url_branch_calls_requests_get(self): - """http/https paths must use requests.get, not PIL.Image.open directly.""" - from unittest.mock import MagicMock, patch - from io import BytesIO - from PIL import Image - - fake_img = Image.new("RGB", (8, 8), color=(0, 128, 255)) - buf = BytesIO() - fake_img.save(buf, format="PNG") - buf.seek(0) - - mock_response = MagicMock() - mock_response.raw = buf - - with patch("requests.get", return_value=mock_response) as mock_get: - result = self.load_image("https://example.com/image.png") - - mock_get.assert_called_once_with("https://example.com/image.png", timeout=60) - assert isinstance(result, Image.Image) - assert result.mode == "RGB" - - def test_local_path_does_not_call_requests(self, tmp_path): - """Local file paths must not trigger requests.get.""" - from unittest.mock import patch - from PIL import Image - - img = Image.new("RGB", (4, 4)) - img_path = str(tmp_path / "local.png") - img.save(img_path) - - with patch("requests.get") as mock_get: - self.load_image(img_path) - - mock_get.assert_not_called() From 510b6c4c8851fc0a9a39600e6a23b0753379c75d Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Tue, 10 Mar 2026 18:53:39 +0800 Subject: [PATCH 04/10] support hybrid mode Signed-off-by: lvliang-intel --- auto_round/autoround.py | 9 +- auto_round/compressors/__init__.py | 1 + .../compressors/diffusion/compressor.py | 9 +- auto_round/compressors/diffusion/hybrid.py | 668 ++++++++++++++++++ auto_round/utils/model.py | 65 ++ 5 files changed, 749 insertions(+), 3 deletions(-) create mode 100644 auto_round/compressors/diffusion/hybrid.py diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 9888bfe8e..b5906b839 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -25,6 +25,7 @@ LLMCompressor, MLLMCompressor, ) +from auto_round.compressors.diffusion.hybrid import HybridCompressor, is_hybrid_diffusion_model from auto_round.logger import deprecated, logger from auto_round.schemes import QuantizationScheme from auto_round.utils import is_diffusion_model, is_mllm_model @@ -163,7 +164,13 @@ def __new__( has_multimodal_assets = kwargs.get("processor") is not None or 
kwargs.get("image_processor") is not None - if ( + if is_hybrid_diffusion_model(model): + logger.info("using Hybrid AR+Diffusion mode for hybrid model.") + model_cls.append(HybridCompressor) + if extra_config: + extra_config.mllm_config = None + extra_config.diffusion_config = None + elif ( (extra_config and not extra_config.mllm_config.is_default()) or has_multimodal_assets or is_mllm_model(model, platform=platform) diff --git a/auto_round/compressors/__init__.py b/auto_round/compressors/__init__.py index 6f8ddf681..15ec27ebe 100644 --- a/auto_round/compressors/__init__.py +++ b/auto_round/compressors/__init__.py @@ -17,6 +17,7 @@ from auto_round.compressors.base import LLMCompressor from auto_round.compressors.mllm.compressor import MLLMCompressor from auto_round.compressors.diffusion.compressor import DiffusionCompressor +from auto_round.compressors.diffusion.hybrid import HybridCompressor from auto_round.compressors.config import ( DiffusionExtraConfig, ExtraConfig, diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index 6d9580e4f..09162fd41 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -31,6 +31,8 @@ extract_block_names_to_str, find_matching_blocks, get_block_names, + merge_block_output_keys, + wrap_block_forward_positional_to_kwargs, ) pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils") @@ -172,6 +174,9 @@ def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, dict]: q_inputs = {k: q_inputs.pop(k, None) for k in input_id_str} return inputs, q_inputs + def _get_block_forward_func(self, name): + return wrap_block_forward_positional_to_kwargs(super()._get_block_forward_func(name)) + def _split_inputs(self, inputs: dict, first_input_name: str) -> tuple[dict, dict]: input_id_str = [key for key in inputs.keys() if "hidden_state" in key] input_ids = {k: inputs.pop(k, None) for k in input_id_str} @@ -205,7 +210,7 @@ def _get_current_q_output( ) if isinstance(current_input_ids, dict): hidden_states = current_input_ids.pop("hidden_states") - current_input_others.update(current_input_ids) + merge_block_output_keys(block, current_input_others, current_input_ids) current_input_ids = hidden_states output_q = block_forward(block, current_input_ids, current_input_others, self.amp, self.amp_dtype, device, idx) return output_q.to(cache_device) @@ -251,7 +256,7 @@ def _get_block_outputs( ) if isinstance(tmp_input_ids, dict): hidden_states = tmp_input_ids.pop("hidden_states") - tmp_input_others.update(tmp_input_ids) + merge_block_output_keys(block, tmp_input_others, tmp_input_ids) tmp_input_ids = hidden_states tmp_output = block_forward(block, tmp_input_ids, tmp_input_others, self.amp, self.amp_dtype, device, None) diff --git a/auto_round/compressors/diffusion/hybrid.py b/auto_round/compressors/diffusion/hybrid.py new file mode 100644 index 000000000..21af6b197 --- /dev/null +++ b/auto_round/compressors/diffusion/hybrid.py @@ -0,0 +1,668 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""HybridCompressor for models with both AR and diffusion components. + +This compressor handles models that have a hybrid architecture consisting of: + - An autoregressive (AR) language model component + - A diffusion transformer (DiT) component + +It quantizes both components in a single workflow: + Phase 1: Quantize the AR model using MLLM-style text calibration + Phase 2: Quantize the DiT model using diffusion-style pipeline calibration + +Supported hybrid pipelines are registered in ``HYBRID_AR_COMPONENTS``. +To add a new model, register its AR component attribute name and (optionally) +its DiT block output config in ``output_configs``. +""" + +from __future__ import annotations + +import copy +import os +import time +from typing import Any, Union + +import torch + +from auto_round.compressors.diffusion.compressor import DiffusionCompressor, output_configs +from auto_round.logger import logger +from auto_round.schemes import QuantizationScheme +from auto_round.utils import ( + LazyImport, + clear_memory, + extract_block_names_to_str, + find_matching_blocks, + get_block_names, +) + +pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils") + +# --------------------------------------------------------------------------- +# Registry: known AR component attribute names in hybrid diffusion pipelines. +# Each entry maps a pipeline attribute name to the component role. +# When a pipeline has *both* "transformer" and one of these attributes, +# it is recognised as a hybrid model. +# To support a new hybrid architecture, simply add its AR attribute name here. +# --------------------------------------------------------------------------- +HYBRID_AR_COMPONENTS = [ + "vision_language_encoder", # GLM-Image + # Add new AR component names here, e.g.: + # "language_model", + # "text_decoder", +] + +# --------------------------------------------------------------------------- +# Register DiT block output configs for hybrid models. +# Maps block class name -> ordered list of output tensor names. +# Pure-diffusion blocks (Flux*) are already registered in DiffusionCompressor. 
+# --------------------------------------------------------------------------- +output_configs["GlmImageTransformerBlock"] = ["hidden_states", "encoder_hidden_states"] + + +# --------------------------------------------------------------------------- +# Detection +# --------------------------------------------------------------------------- + +def _find_ar_component_name(model_or_path): + """Return the AR component attribute name if model_or_path is a hybrid pipeline, else None.""" + if isinstance(model_or_path, str): + index_path = os.path.join(model_or_path, "model_index.json") + if not os.path.exists(index_path): + from huggingface_hub import hf_hub_download + try: + index_path = hf_hub_download(model_or_path, "model_index.json") + except Exception: + return None + + import json + with open(index_path) as f: + data = json.load(f) + if "transformer" not in data: + return None + for name in HYBRID_AR_COMPONENTS: + if name in data: + return name + return None + + # Runtime pipeline object + if hasattr(model_or_path, "transformer"): + for name in HYBRID_AR_COMPONENTS: + if hasattr(model_or_path, name) and getattr(model_or_path, name) is not None: + return name + return None + + +def is_hybrid_diffusion_model(model_or_path): + """Return True if *model_or_path* represents a hybrid AR+Diffusion pipeline.""" + return _find_ar_component_name(model_or_path) is not None + + +class HybridCompressor(DiffusionCompressor): + """Compressor for hybrid AR + diffusion models. + + Quantizes both the autoregressive component and the diffusion transformer + component in a single workflow. The AR component is discovered automatically + from ``HYBRID_AR_COMPONENTS``. + + Args: + model: Model name/path or DiffusionPipeline object. + tokenizer: Tokenizer (auto-loaded from pipeline if None). + guidance_scale: Guidance scale for diffusion calibration. + num_inference_steps: Denoising steps for diffusion calibration. + generator_seed: Seed for noise generator. + scheme: Quantization scheme. + dataset: Calibration dataset for DiT (default: "coco2014"). + ar_dataset: Calibration dataset for AR model (default: "NeelNanda/pile-10k"). + quant_nontext_module: Whether to also quantize vision encoder in AR model. + iters: Optimization iterations. + seqlen: Calibration sequence length for AR model. + nsamples: Number of calibration samples. + batch_size: Calibration batch size. + quant_ar: Whether to quantize the AR component. + quant_dit: Whether to quantize the DiT component. + height: Image height passed to the pipeline during DiT calibration (required by some pipelines + such as GLM-Image; ignored if the pipeline does not accept it). + width: Image width passed to the pipeline during DiT calibration. + **kwargs: Additional keyword arguments passed to base compressor. 
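+
+    Example (an illustrative sketch; the checkpoint path, output dir, and image
+    dims below are placeholders, not documented defaults)::
+
+        compressor = HybridCompressor(
+            "path/to/glm-image-pipeline",
+            scheme="W4A16",
+            nsamples=128,
+            height=1024,
+            width=1024,
+        )
+        compressor.quantize_and_save("glm-image-w4a16", format="auto_round")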
+ """ + + def __init__( + self, + model: Union[object, str], + tokenizer=None, + platform: str = "hf", + guidance_scale: float = 1.5, + num_inference_steps: int = 10, + generator_seed: int = None, + scheme: Union[str, dict, QuantizationScheme] = "W4A16", + layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, + dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "coco2014", + ar_dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", + quant_nontext_module: bool = False, + iters: int = 200, + seqlen: int = 2048, + nsamples: int = 128, + batch_size: int = 8, + gradient_accumulate_steps: int = 1, + low_gpu_mem_usage: bool = True, + device_map: Union[str, torch.device, int, dict] = 0, + enable_torch_compile: bool = False, + seed: int = 42, + quant_ar: bool = True, + quant_dit: bool = True, + height: int = None, + width: int = None, + **kwargs, + ): + logger.warning("Hybrid AR+Diffusion model quantization is experimental.") + model_dtype = kwargs.pop("model_dtype", None) + + self.guidance_scale = guidance_scale + self.num_inference_steps = num_inference_steps + self.generator_seed = generator_seed + self.quant_ar = quant_ar + self.quant_dit = quant_dit + self.quant_nontext_module = quant_nontext_module + self.ar_dataset = ar_dataset + self.height = height + self.width = width + + to_quant_block_names: Union[str, list, None] = kwargs.pop("to_quant_block_names", None) + if device_map is None: + device_map = 0 + self._set_device(device_map) + + # --- Load the pipeline --- + if isinstance(model, str): + from auto_round.utils.model import diffusion_load_model + pipe, dit_model = diffusion_load_model( + model, platform=platform, device=self.device, model_dtype=model_dtype + ) + elif isinstance(model, pipeline_utils.DiffusionPipeline): + pipe = model + dit_model = pipe.transformer + else: + raise ValueError( + f"HybridCompressor requires a model path or DiffusionPipeline, got {type(model)}" + ) + + # --- Discover the AR component dynamically --- + self.ar_component_name = _find_ar_component_name(pipe) + if self.ar_component_name is None and self.quant_ar: + logger.warning( + f"No AR component found in pipeline (checked: {HYBRID_AR_COMPONENTS}), " + "skipping AR quantization." + ) + self.quant_ar = False + + self.pipe = pipe + self.dit_model = dit_model + self.ar_model = ( + getattr(pipe, self.ar_component_name, None) + if self.ar_component_name + else None + ) + + if not self.quant_ar and not self.quant_dit: + raise ValueError("At least one of quant_ar and quant_dit must be True.") + + model = dit_model + + # --- Detect DiT blocks --- + all_blocks = get_block_names(model) + dit_blocks = find_matching_blocks(model, all_blocks, to_quant_block_names) + + # Filter to only blocks whose class has a registered output_config. + # get_block_names may discover non-transformer ModuleLists (e.g. MLP projectors) + # that don't match the expected output format. 
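+        # The filter below runs only when the user did not pin blocks explicitly;
+        # a user-supplied to_quant_block_names selection is honored as-is.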
+ if to_quant_block_names is None: + filtered = [] + for group in dit_blocks: + if not group: + continue + parts = group[0].split(".") + m = model + for p in parts: + m = getattr(m, p) + if m.__class__.__name__ in output_configs: + filtered.append(group) + if filtered: + dit_blocks = filtered + self.dit_quant_block_list = dit_blocks + + # --- Detect AR blocks --- + if self.quant_ar and self.ar_model is not None: + from auto_round.special_model_handler import SPECIAL_MULTIMODAL_BLOCK + model_type = getattr(getattr(self.ar_model, "config", None), "model_type", None) + if model_type and model_type in SPECIAL_MULTIMODAL_BLOCK: + self.ar_quant_block_list = SPECIAL_MULTIMODAL_BLOCK[model_type]( + self.ar_model, quant_vision=quant_nontext_module + ) + else: + self.ar_quant_block_list = [get_block_names(self.ar_model)] + else: + self.ar_quant_block_list = [] + + self.quant_block_list = self.dit_quant_block_list + if to_quant_block_names is None: + to_quant_block_names = extract_block_names_to_str(self.quant_block_list) + + # Force batch_size to 1 for diffusion calibration + if iters > 0 and batch_size != 1: + logger.warning( + f"reset batch_size({batch_size}) to 1 and " + f"gradient_accumulate_steps({gradient_accumulate_steps}) " + f"to {batch_size * gradient_accumulate_steps}, " + f"because batch_size > 1 cannot be used for diffusion calibration." + ) + gradient_accumulate_steps = batch_size * gradient_accumulate_steps + batch_size = 1 + + seqlen = 2048 if seqlen is None else seqlen + + if nsamples % batch_size != 0: + nsamples = (nsamples // batch_size + 1) * batch_size + logger.warning(f"'nsamples' is not divisible by 'batch_size', adjusted to {nsamples}") + + kwargs["diffusion"] = True + self._saved_pipe = pipe + self._saved_dit_model = dit_model + self._saved_ar_model = self.ar_model + + from auto_round.compressors.base import BaseCompressor + BaseCompressor.__init__( + self, + model=model, + tokenizer=None, + platform=platform, + scheme=scheme, + layer_config=layer_config, + dataset=dataset, + iters=iters, + seqlen=seqlen, + nsamples=nsamples, + batch_size=batch_size, + gradient_accumulate_steps=gradient_accumulate_steps, + low_gpu_mem_usage=low_gpu_mem_usage, + device_map=device_map, + enable_torch_compile=enable_torch_compile, + seed=seed, + to_quant_block_names=to_quant_block_names, + **kwargs, + ) + + # Restore references that BaseCompressor.__init__ may have overwritten + self.pipe = self._saved_pipe + self.dit_model = self._saved_dit_model + self.ar_model = self._saved_ar_model + + # ------------------------------------------------------------------ + # Quantization + # ------------------------------------------------------------------ + + def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: + """Quantize both AR and DiT components. + + Phase 1: AR model via MLLM-style text calibration. + Phase 2: DiT model via diffusion pipeline calibration. 
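+
+        Returns:
+            Tuple of ``(model, layer_config)``: the quantized DiT module and the
+            combined per-layer config whose keys carry ``ar.`` / ``dit.`` prefixes.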
+ """ + start_time = time.time() + combined_layer_config = {} + + # =================== Phase 1: AR Model =================== + if self.quant_ar and self.ar_model is not None: + logger.info("=" * 60) + logger.info(f"Phase 1: Quantizing AR model ({self.ar_component_name})") + logger.info("=" * 60) + + ar_compressor = self._create_ar_compressor() + ar_model, ar_layer_config = ar_compressor.quantize() + + self.ar_model = ar_model + setattr(self.pipe, self.ar_component_name, ar_model) + combined_layer_config.update( + {f"ar.{k}": v for k, v in ar_layer_config.items()} + ) + self.ar_layer_config = ar_layer_config + + # Preserve serialization-relevant attributes from the AR compressor + # so save_quantized can build the correct serialization_dict. + from auto_round.compressors.base import SERIALIZATION_KEYS + self._ar_serialization = { + k: getattr(ar_compressor, k, None) for k in SERIALIZATION_KEYS + } + + # Move AR model to CPU to free GPU for Phase 2 + self.ar_model.to("cpu") + clear_memory(device_list=self.device_list) + logger.info(f"Phase 1 complete: AR model ({self.ar_component_name}) quantized") + + # =================== Phase 2: DiT Model =================== + if self.quant_dit: + logger.info("=" * 60) + logger.info("Phase 2: Quantizing DiT model (transformer)") + logger.info("=" * 60) + + # Move DiT to target device for calibration + self.dit_model = self.dit_model.to(self.device) + self.model = self.dit_model + self.quant_block_list = self.dit_quant_block_list + self.quantized = False + self.batch_dim = None + + for n, m in self.model.named_modules(): + m.global_name = n + + dit_model, dit_layer_config = self._quantize_dit() + + self.dit_model = dit_model + self.pipe.transformer = dit_model + combined_layer_config.update( + {f"dit.{k}": v for k, v in dit_layer_config.items()} + ) + self.dit_layer_config = dit_layer_config + + logger.info("Phase 2 complete: DiT model quantized") + + end_time = time.time() + logger.info(f"Total hybrid quantization time: {end_time - start_time:.1f}s") + + self.quantized = True + self.layer_config = combined_layer_config + self.model = self.dit_model + return self.model, self.layer_config + + def _create_ar_compressor(self): + """Create an MLLM compressor for the AR component.""" + from auto_round.compressors.mllm.compressor import MLLMCompressor + + processor = getattr(self.pipe, "processor", None) + tokenizer = getattr(self.pipe, "tokenizer", None) + + ar = MLLMCompressor( + model=self.ar_model, + tokenizer=tokenizer, + processor=processor, + image_processor=None, + platform=self.platform, + scheme=copy.deepcopy(self.orig_scheme) if hasattr(self, "orig_scheme") else self.scheme, + dataset=self.ar_dataset, + quant_nontext_module=self.quant_nontext_module, + iters=self.iters, + seqlen=self.seqlen, + nsamples=self.nsamples, + batch_size=1, + gradient_accumulate_steps=self.gradient_accumulate_steps, + low_gpu_mem_usage=self.low_gpu_mem_usage, + device_map=self.device_map, + enable_torch_compile=self.enable_torch_compile, + seed=self.seed, + ) + if hasattr(self, "formats"): + ar.formats = self.formats + # Required by base.quantize() → _adjust_immediate_packing_and_saving(); + # None disables immediate packing (correct since we call quantize() directly). + ar.orig_output_dir = None + return ar + + def _quantize_dit(self): + """Quantize the DiT model using the parent DiffusionCompressor's quantize flow.""" + return DiffusionCompressor.quantize(self) + + def calib(self, nsamples, bs): + """Override calib to pass extra pipeline kwargs (e.g. height/width) if set. 
+
+        Pipelines like GLM-Image require explicit image dimensions; standard diffusion
+        pipelines (FLUX etc.) accept but ignore them.
+        """
+        import inspect
+        pipe_sig = inspect.signature(self.pipe.__call__)
+        extra = {}
+        if "height" in pipe_sig.parameters and self.height is not None:
+            extra["height"] = self.height
+        if "width" in pipe_sig.parameters and self.width is not None:
+            extra["width"] = self.width
+
+        if not extra:
+            # No extra kwargs needed — delegate to parent as-is
+            return DiffusionCompressor.calib(self, nsamples, bs)
+
+        # Replicate parent calib() with extra kwargs injected into the pipe call
+        from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader
+        from auto_round.utils import clear_memory
+        from tqdm import tqdm
+
+        logger.warning(
+            "Diffusion model will catch nsamples * num_inference_steps inputs, "
+            "you can reduce nsamples or num_inference_steps if OOM or take too much time."
+        )
+        if isinstance(self.dataset, str):
+            dataset = self.dataset.replace(" ", "")
+            self.dataloader, self.batch_size, self.gradient_accumulate_steps = get_diffusion_dataloader(
+                dataset=dataset,
+                bs=self.batch_size,
+                seed=self.seed,
+                nsamples=self.nsamples,
+                gradient_accumulate_steps=self.gradient_accumulate_steps,
+            )
+        else:
+            self.dataloader = self.dataset
+        total_cnt = 0
+
+        total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader))
+        if self.pipe.dtype != self.model.dtype:
+            self.pipe.to(self.model.dtype)
+        if self.pipe.device != self.model.device:
+            self.pipe.to(self.model.device)
+
+        with tqdm(range(1, total + 1), desc="cache block inputs") as pbar:
+            for ids, prompts in self.dataloader:
+                if isinstance(prompts, tuple):
+                    prompts = list(prompts)
+                try:
+                    self.pipe(
+                        prompt=prompts,
+                        guidance_scale=self.guidance_scale,
+                        num_inference_steps=self.num_inference_steps,
+                        generator=(
+                            None
+                            if self.generator_seed is None
+                            else torch.Generator(device=self.pipe.device).manual_seed(self.generator_seed)
+                        ),
+                        **extra,
+                    )
+                except NotImplementedError:
+                    pass
+                except Exception as error:
+                    raise error
+                step = len(prompts)
+                total_cnt += step
+                pbar.update(step)
+                if total_cnt >= nsamples:
+                    break
+
+        if total_cnt == 0:
+            logger.error(
+                f"no data has been cached, please provide more data with sequence length >={self.seqlen} in the "
+                f"dataset or decease the sequence length"
+            )
+            exit(-1)
+        elif total_cnt < nsamples:
+            logger.warning(
+                f"Insufficient number of samples collected may affect the quantization. "
+                f"target samples count is {nsamples}, while valid samples count is {total_cnt}"
+            )
+        if total_cnt < self.batch_size:
+            raise ValueError(
+                f"valid samples is less than batch_size({self.batch_size}),"
+                " please adjust self.batch_size or seqlen."
+            )
+        max_len = (total_cnt // self.batch_size) * self.batch_size
+        for k, v in self.inputs.items():
+            for key in v:
+                if isinstance(v[key], list) and len(v[key]) == total_cnt:
+                    self.inputs[k][key] = v[key][:max_len]
+
+    # ------------------------------------------------------------------
+    # Saving
+    # ------------------------------------------------------------------
+
+    def save_quantized(self, output_dir=None, format="auto_round", ar_format=None,
+                       dit_format=None, inplace=True, **kwargs):
+        """Save both quantized AR and DiT models into a pipeline directory structure.
+
+        The output directory mirrors the original pipeline layout::
+
+            output_dir/
+                model_index.json
+                <ar_component_name>/ (quantized AR model)
+                transformer/ (quantized DiT model)
+                ... 
(unchanged auxiliary components) + + Args: + ar_format: Export format for the AR component. Falls back to *format*. + dit_format: Export format for the DiT component. Falls back to *format*. + """ + if output_dir is None: + logger.warning("output_dir is None, skipping save") + return + + from auto_round.formats import get_formats + from auto_round.compressors.base import BaseCompressor + + if ar_format is None: + ar_format = format + if dit_format is None: + dit_format = format + + saved_formats = self.formats # preserve original + + # Save DiT + if self.quant_dit: + dit_subdir = "transformer" + logger.info(f"Saving quantized DiT model ({dit_subdir}) [format={dit_format}]") + dit_output_dir = os.path.join(output_dir, dit_subdir) + os.makedirs(dit_output_dir, exist_ok=True) + + self.model = self.dit_model + if hasattr(self, "dit_layer_config"): + self.layer_config = self.dit_layer_config + + self.formats = get_formats(dit_format, self) + BaseCompressor.save_quantized( + self, output_dir=dit_output_dir, format=dit_format, inplace=inplace, **kwargs + ) + + # Save AR + if self.quant_ar and self.ar_model is not None: + ar_subdir = self.ar_component_name # e.g. "vision_language_encoder" + logger.info(f"Saving quantized AR model ({ar_subdir}) [format={ar_format}]") + ar_output_dir = os.path.join(output_dir, ar_subdir) + os.makedirs(ar_output_dir, exist_ok=True) + + self.model = self.ar_model + if hasattr(self, "ar_layer_config"): + self.layer_config = self.ar_layer_config + + # Swap serialization attributes from the AR compressor so that + # BaseCompressor.save_quantized builds the correct config. + ar_ser = getattr(self, "_ar_serialization", {}) + saved_attrs = {} + for k, v in ar_ser.items(): + saved_attrs[k] = getattr(self, k, None) + setattr(self, k, v) + + self.formats = get_formats(ar_format, self) + BaseCompressor.save_quantized( + self, output_dir=ar_output_dir, format=ar_format, inplace=inplace, **kwargs + ) + + # Restore DiT serialization attributes + for k, v in saved_attrs.items(): + setattr(self, k, v) + + self.formats = saved_formats + self._save_pipeline_metadata(output_dir) + self.model = self.dit_model + logger.info(f"Full hybrid quantized model saved to {output_dir}") + + def _save_pipeline_metadata(self, output_dir): + """Save model_index.json and auxiliary pipeline components.""" + src_path = ( + getattr(getattr(self.pipe, "config", None), "_name_or_path", None) + or getattr(self.pipe, "name_or_path", None) + ) + if src_path and os.path.exists(os.path.join(src_path, "model_index.json")): + import shutil + dst_index = os.path.join(output_dir, "model_index.json") + if not os.path.exists(dst_index): + shutil.copy2(os.path.join(src_path, "model_index.json"), dst_index) + + # Save non-quantized pipeline components so the exported directory remains + # loadable as a complete diffusers pipeline even when only one branch is quantized. 
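+        # Component dirs already written by a quantized branch are skipped below
+        # via the os.path.exists check, so quantized outputs are never overwritten.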
+ component_names = [ + "scheduler", + "tokenizer", + "processor", + "vae", + "text_encoder", + ] + if not self.quant_ar and self.ar_component_name is not None: + component_names.append(self.ar_component_name) + if not self.quant_dit: + component_names.append("transformer") + + for component_name in component_names: + component = getattr(self.pipe, component_name, None) + if component is None: + continue + component_dir = os.path.join(output_dir, component_name) + if os.path.exists(component_dir): + continue + try: + if hasattr(component, "save_pretrained"): + component.save_pretrained(component_dir) + except Exception as e: + logger.warning(f"Failed to save {component_name}: {e}") + + def quantize_and_save( + self, + output_dir: str = "tmp_autoround", + format: str = "auto_round", + ar_format: str = None, + dit_format: str = None, + inplace: bool = True, + **kwargs, + ): + """Quantize both components and save the complete pipeline. + + Args: + format: Default export format (used when *ar_format* / *dit_format* is None). + ar_format: Export format for the AR component. Falls back to *format*. + dit_format: Export format for the DiT component. Falls back to *format*. + """ + from auto_round.formats import get_formats + + format_list = get_formats(format, self) + self.formats = format_list + self.orig_output_dir = output_dir # required by base.quantize() → _adjust_immediate_packing_and_saving() + + self.quantize() + self.save_quantized( + output_dir, format=format, ar_format=ar_format, + dit_format=dit_format, inplace=inplace, **kwargs, + ) + logger.info(f"Hybrid quantized model saved to {output_dir}") + return self.model, [output_dir] diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index de561b106..55b5c4982 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import collections +import inspect import json import os import re @@ -1654,3 +1655,67 @@ def handle_generation_config(model: torch.nn.Module): model.generation_config.do_sample = True if hasattr(generation_config, "temperature") and generation_config.temperature != 1.0: model.generation_config.do_sample = True + + +def merge_block_output_keys(block, input_others, extra_keys): + """Merge block output keys into input_others, resolving positional/keyword conflicts. + + When a block is called with positional args (stored in input_others["positional_inputs"]), + and the block output produces updated values for those same parameters (e.g., + encoder_hidden_states), we must update the positional arg rather than adding a duplicate + keyword arg, which would cause "got multiple values for argument" errors. + """ + positional_inputs = input_others.get("positional_inputs") + if not positional_inputs or not extra_keys: + input_others.update(extra_keys) + return + + try: + sig = inspect.signature(block.forward) + except (ValueError, TypeError): + input_others.update(extra_keys) + return + + params = [p for p in sig.parameters.keys() if p != "self"] + # params[0] = hidden_states (passed as input_ids separately) + # params[1:] correspond to positional_inputs[0], [1], ... 
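+    #
+    # e.g. for forward(self, hidden_states, encoder_hidden_states, temb) with
+    # extra_keys = {"encoder_hidden_states": t}, the new value replaces
+    # positional_inputs[0] instead of becoming a duplicate keyword argument.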
+ + positional_inputs = list(positional_inputs) + for key, value in extra_keys.items(): + if key in params: + pos_idx = params.index(key) - 1 # -1 because hidden_states is params[0] + if 0 <= pos_idx < len(positional_inputs): + positional_inputs[pos_idx] = value + continue + input_others[key] = value + input_others["positional_inputs"] = tuple(positional_inputs) + + +def wrap_block_forward_positional_to_kwargs(base_hook): + """Wrap a block forward hook to convert positional inputs to keyword args. + + Models like GLM-Image call transformer blocks with positional args + (e.g. block(hidden_states, encoder_hidden_states, temb, ...)). The base + hook only stores positional_inputs once (from the first sample), losing + per-sample variation for encoder_hidden_states etc. By converting + positional args to keyword args, all inputs are properly accumulated + across calibration samples. + """ + _param_names = None + + def forward(m, hidden_states=None, *positional_inputs, **kwargs): + nonlocal _param_names + if positional_inputs: + if _param_names is None: + sig = inspect.signature(m.orig_forward) + _param_names = [p for p in sig.parameters.keys() if p != "self"] + for i, val in enumerate(positional_inputs): + param_idx = i + 1 # hidden_states is params[0] + if param_idx < len(_param_names): + param_name = _param_names[param_idx] + if param_name not in kwargs: + kwargs[param_name] = val + positional_inputs = () + return base_hook(m, hidden_states, *positional_inputs, **kwargs) + + return forward From 27cddba095b0a2e45a18d31a3cde270a139101e7 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Tue, 17 Mar 2026 21:34:43 +0800 Subject: [PATCH 05/10] fix hybrid mode Signed-off-by: lvliang-intel --- auto_round/compressors/diffusion/hybrid.py | 72 +++++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/diffusion/hybrid.py b/auto_round/compressors/diffusion/hybrid.py index e74981f89..b18fa2a9a 100644 --- a/auto_round/compressors/diffusion/hybrid.py +++ b/auto_round/compressors/diffusion/hybrid.py @@ -31,6 +31,7 @@ import copy import os +import shutil import time from typing import Any, Union @@ -412,10 +413,10 @@ def _create_ar_compressor(self): ) if hasattr(self, "formats"): ar.formats = self.formats - ar.inplace = False # Required by base.quantize() → _adjust_immediate_packing_and_saving(); # None disables immediate packing (correct since we call quantize() directly). ar.orig_output_dir = None + ar.inplace = True return ar def _quantize_dit(self): @@ -520,6 +521,68 @@ def calib(self, nsamples, bs): # Saving # ------------------------------------------------------------------ + @staticmethod + def _flatten_nested_component_dir(component_output_dir: str, component_name: str) -> None: + """Fix accidental nested save layouts (e.g. transformer/transformer/config.json). + + Some model config savers may create a same-name nested directory under the + component output path. Flatten it so pipeline loaders find config files in + the expected component root. + """ + model_markers = ( + "config.json", + "generation_config.json", + "model.safetensors", + "model.safetensors.index.json", + ) + + nested_dir = os.path.join(component_output_dir, component_name) + + # Some exporters write into a single nested model folder (for example, + # vision_language_encoder/transformer). If same-name nesting does not + # exist, try to detect this pattern and flatten it as well. 
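+        # Only flatten when the component root itself holds no model files and
+        # exactly one child directory does contain them; otherwise leave as-is.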
+ if not os.path.isdir(nested_dir): + child_dirs = [ + os.path.join(component_output_dir, name) + for name in os.listdir(component_output_dir) + if os.path.isdir(os.path.join(component_output_dir, name)) + ] + has_model_files_at_root = any( + os.path.exists(os.path.join(component_output_dir, marker)) + for marker in model_markers + ) + if len(child_dirs) == 1 and not has_model_files_at_root: + candidate = child_dirs[0] + has_model_files_in_child = any( + os.path.exists(os.path.join(candidate, marker)) + for marker in model_markers + ) + if has_model_files_in_child: + nested_dir = candidate + else: + return + else: + return + + moved = 0 + for entry in os.listdir(nested_dir): + src = os.path.join(nested_dir, entry) + dst = os.path.join(component_output_dir, entry) + if os.path.exists(dst): + continue + shutil.move(src, dst) + moved += 1 + + if moved > 0: + logger.warning( + "Flattened nested component directory %s -> %s", + nested_dir, + component_output_dir, + ) + + # Remove the nested directory when empty. + if os.path.isdir(nested_dir) and len(os.listdir(nested_dir)) == 0: + os.rmdir(nested_dir) def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **kwargs): """Save both quantized AR and DiT models into a pipeline directory structure. @@ -558,6 +621,7 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k BaseCompressor.save_quantized( self, output_dir=dit_output_dir, format=format, inplace=inplace, **kwargs ) + self._flatten_nested_component_dir(dit_output_dir, dit_subdir) # Save AR if self.quant_ar and self.ar_model is not None: @@ -582,6 +646,7 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k BaseCompressor.save_quantized( self, output_dir=ar_output_dir, format=format, inplace=inplace, **kwargs ) + self._flatten_nested_component_dir(ar_output_dir, ar_subdir) # Restore DiT serialization attributes for k, v in saved_attrs.items(): @@ -647,8 +712,11 @@ def quantize_and_save( format_list = get_formats(format, self) self.formats = format_list - self.orig_output_dir = output_dir # required by base.quantize() → _adjust_immediate_packing_and_saving() self.inplace = inplace + # Keep orig_output_dir as None so _adjust_immediate_packing_and_saving() + # disables immediate saving — diffusers models must go through + # model.save_pretrained() to get correct weight file names. + self.orig_output_dir = None self.quantize() self.save_quantized( From 0ec767ae177d478df38e22d060cc271bd15a5099 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Mar 2026 13:42:10 +0000 Subject: [PATCH 06/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/diffusion/hybrid.py | 70 ++++++++++------------ 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/auto_round/compressors/diffusion/hybrid.py b/auto_round/compressors/diffusion/hybrid.py index b18fa2a9a..4670f2d35 100644 --- a/auto_round/compressors/diffusion/hybrid.py +++ b/auto_round/compressors/diffusion/hybrid.py @@ -58,7 +58,7 @@ # To support a new hybrid architecture, simply add its AR attribute name here. 
# --------------------------------------------------------------------------- HYBRID_AR_COMPONENTS = [ - "vision_language_encoder", # GLM-Image + "vision_language_encoder", # GLM-Image # Add new AR component names here, e.g.: # "language_model", # "text_decoder", @@ -76,18 +76,21 @@ # Detection # --------------------------------------------------------------------------- + def _find_ar_component_name(model_or_path): """Return the AR component attribute name if model_or_path is a hybrid pipeline, else None.""" if isinstance(model_or_path, str): index_path = os.path.join(model_or_path, "model_index.json") if not os.path.exists(index_path): from huggingface_hub import hf_hub_download + try: index_path = hf_hub_download(model_or_path, "model_index.json") except Exception: return None import json + with open(index_path) as f: data = json.load(f) if "transformer" not in data: @@ -188,6 +191,7 @@ def __init__( # --- Load the pipeline --- if isinstance(model, str): from auto_round.utils.model import diffusion_load_model + pipe, dit_model = diffusion_load_model( model, platform=platform, device=self.device, model_dtype=model_dtype ) @@ -195,26 +199,19 @@ def __init__( pipe = model dit_model = pipe.transformer else: - raise ValueError( - f"HybridCompressor requires a model path or DiffusionPipeline, got {type(model)}" - ) + raise ValueError(f"HybridCompressor requires a model path or DiffusionPipeline, got {type(model)}") # --- Discover the AR component dynamically --- self.ar_component_name = _find_ar_component_name(pipe) if self.ar_component_name is None and self.quant_ar: logger.warning( - f"No AR component found in pipeline (checked: {HYBRID_AR_COMPONENTS}), " - "skipping AR quantization." + f"No AR component found in pipeline (checked: {HYBRID_AR_COMPONENTS}), " "skipping AR quantization." ) self.quant_ar = False self.pipe = pipe self.dit_model = dit_model - self.ar_model = ( - getattr(pipe, self.ar_component_name, None) - if self.ar_component_name - else None - ) + self.ar_model = getattr(pipe, self.ar_component_name, None) if self.ar_component_name else None if not self.quant_ar and not self.quant_dit: raise ValueError("At least one of quant_ar and quant_dit must be True.") @@ -246,6 +243,7 @@ def __init__( # --- Detect AR blocks --- if self.quant_ar and self.ar_model is not None: from auto_round.special_model_handler import SPECIAL_MULTIMODAL_BLOCK + model_type = getattr(getattr(self.ar_model, "config", None), "model_type", None) if model_type and model_type in SPECIAL_MULTIMODAL_BLOCK: self.ar_quant_block_list = SPECIAL_MULTIMODAL_BLOCK[model_type]( @@ -283,6 +281,7 @@ def __init__( self._saved_ar_model = self.ar_model from auto_round.compressors.base import BaseCompressor + BaseCompressor.__init__( self, model=model, @@ -333,17 +332,14 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self.ar_model = ar_model setattr(self.pipe, self.ar_component_name, ar_model) - combined_layer_config.update( - {f"ar.{k}": v for k, v in ar_layer_config.items()} - ) + combined_layer_config.update({f"ar.{k}": v for k, v in ar_layer_config.items()}) self.ar_layer_config = ar_layer_config # Preserve serialization-relevant attributes from the AR compressor # so save_quantized can build the correct serialization_dict. 
from auto_round.compressors.base import SERIALIZATION_KEYS - self._ar_serialization = { - k: getattr(ar_compressor, k, None) for k in SERIALIZATION_KEYS - } + + self._ar_serialization = {k: getattr(ar_compressor, k, None) for k in SERIALIZATION_KEYS} # Move AR model to CPU to free GPU for Phase 2 self.ar_model.to("cpu") @@ -370,9 +366,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self.dit_model = dit_model self.pipe.transformer = dit_model - combined_layer_config.update( - {f"dit.{k}": v for k, v in dit_layer_config.items()} - ) + combined_layer_config.update({f"dit.{k}": v for k, v in dit_layer_config.items()}) self.dit_layer_config = dit_layer_config logger.info("Phase 2 complete: DiT model quantized") @@ -430,6 +424,7 @@ def calib(self, nsamples, bs): pipelines (FLUX etc.) accept but ignore them. """ import inspect + pipe_sig = inspect.signature(self.pipe.__call__) extra = {} if "height" in pipe_sig.parameters and self.height is not None: @@ -442,9 +437,10 @@ def calib(self, nsamples, bs): return DiffusionCompressor.calib(self, nsamples, bs) # Replicate parent calib() with extra kwargs injected into the pipe call + from tqdm import tqdm + from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader from auto_round.utils import clear_memory - from tqdm import tqdm logger.warning( "Diffusion model will catch nsamples * num_inference_steps inputs, " @@ -548,14 +544,12 @@ def _flatten_nested_component_dir(component_output_dir: str, component_name: str if os.path.isdir(os.path.join(component_output_dir, name)) ] has_model_files_at_root = any( - os.path.exists(os.path.join(component_output_dir, marker)) - for marker in model_markers + os.path.exists(os.path.join(component_output_dir, marker)) for marker in model_markers ) if len(child_dirs) == 1 and not has_model_files_at_root: candidate = child_dirs[0] has_model_files_in_child = any( - os.path.exists(os.path.join(candidate, marker)) - for marker in model_markers + os.path.exists(os.path.join(candidate, marker)) for marker in model_markers ) if has_model_files_in_child: nested_dir = candidate @@ -583,6 +577,7 @@ def _flatten_nested_component_dir(component_output_dir: str, component_name: str # Remove the nested directory when empty. if os.path.isdir(nested_dir) and len(os.listdir(nested_dir)) == 0: os.rmdir(nested_dir) + def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **kwargs): """Save both quantized AR and DiT models into a pipeline directory structure. 
@@ -601,8 +596,8 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k logger.warning("output_dir is None, skipping save") return - from auto_round.formats import get_formats from auto_round.compressors.base import BaseCompressor + from auto_round.formats import get_formats saved_formats = self.formats # preserve original @@ -618,9 +613,7 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k self.layer_config = self.dit_layer_config self.formats = get_formats(format, self) - BaseCompressor.save_quantized( - self, output_dir=dit_output_dir, format=format, inplace=inplace, **kwargs - ) + BaseCompressor.save_quantized(self, output_dir=dit_output_dir, format=format, inplace=inplace, **kwargs) self._flatten_nested_component_dir(dit_output_dir, dit_subdir) # Save AR @@ -636,16 +629,14 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k # Swap serialization attributes from the AR compressor so that # BaseCompressor.save_quantized builds the correct config. - ar_ser = getattr(self, "_ar_serialization", {}) + ar_set = getattr(self, "_ar_serialization", {}) saved_attrs = {} - for k, v in ar_ser.items(): + for k, v in ar_set.items(): saved_attrs[k] = getattr(self, k, None) setattr(self, k, v) self.formats = get_formats(format, self) - BaseCompressor.save_quantized( - self, output_dir=ar_output_dir, format=format, inplace=inplace, **kwargs - ) + BaseCompressor.save_quantized(self, output_dir=ar_output_dir, format=format, inplace=inplace, **kwargs) self._flatten_nested_component_dir(ar_output_dir, ar_subdir) # Restore DiT serialization attributes @@ -659,12 +650,12 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k def _save_pipeline_metadata(self, output_dir): """Save model_index.json and auxiliary pipeline components.""" - src_path = ( - getattr(getattr(self.pipe, "config", None), "_name_or_path", None) - or getattr(self.pipe, "name_or_path", None) + src_path = getattr(getattr(self.pipe, "config", None), "_name_or_path", None) or getattr( + self.pipe, "name_or_path", None ) if src_path and os.path.exists(os.path.join(src_path, "model_index.json")): import shutil + dst_index = os.path.join(output_dir, "model_index.json") if not os.path.exists(dst_index): shutil.copy2(os.path.join(src_path, "model_index.json"), dst_index) @@ -720,7 +711,10 @@ def quantize_and_save( self.quantize() self.save_quantized( - output_dir, format=format, inplace=inplace, **kwargs, + output_dir, + format=format, + inplace=inplace, + **kwargs, ) logger.info(f"Hybrid quantized model saved to {output_dir}") return self.model, [output_dir] From dcf3b52d58a7d8866c89429e503148111e504e44 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Mar 2026 02:38:47 +0000 Subject: [PATCH 07/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/special_model_handler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/auto_round/special_model_handler.py b/auto_round/special_model_handler.py index 2fc400d43..c9ae830a0 100644 --- a/auto_round/special_model_handler.py +++ b/auto_round/special_model_handler.py @@ -172,6 +172,7 @@ def _get_qwen3_omni_moe_multimodal_block(model, quant_vision=False): return block_names + def _get_glm_image_multimodal_block(model, quant_vision=False): """Get block names for GLM-Image AR model. 
From 0e4dafcd548b2c0b097cae318d0d5e2f739a620c Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Thu, 19 Mar 2026 10:51:16 +0800 Subject: [PATCH 08/10] fix issue Signed-off-by: lvliang-intel --- auto_round/special_model_handler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/auto_round/special_model_handler.py b/auto_round/special_model_handler.py index c9ae830a0..73dbf4896 100644 --- a/auto_round/special_model_handler.py +++ b/auto_round/special_model_handler.py @@ -195,6 +195,8 @@ def _get_glm_image_multimodal_block(model, quant_vision=False): [f"model.language_model.layers.{i}" for i in range(len(model.model.language_model.layers))] ) + return block_names + SPECIAL_MULTIMODAL_BLOCK = { "deepseek_vl_v2": _get_deepseek_vl2_multimodal_block, From 19f84845dddcf29339e85f5f40afbb51905f7b74 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Fri, 20 Mar 2026 21:50:48 +0800 Subject: [PATCH 09/10] fix comments Signed-off-by: lvliang-intel --- auto_round/compressors/diffusion/hybrid.py | 5 +- auto_round/utils/model.py | 92 ++++++++-------------- test/test_cpu/models/test_glm_image.py | 16 ++-- 3 files changed, 45 insertions(+), 68 deletions(-) diff --git a/auto_round/compressors/diffusion/hybrid.py b/auto_round/compressors/diffusion/hybrid.py index 4670f2d35..81c80d700 100644 --- a/auto_round/compressors/diffusion/hybrid.py +++ b/auto_round/compressors/diffusion/hybrid.py @@ -492,11 +492,10 @@ def calib(self, nsamples, bs): break if total_cnt == 0: - logger.error( + raise RuntimeError( f"no data has been cached, please provide more data with sequence length >={self.seqlen} in the " - f"dataset or decease the sequence length" + f"dataset or decrease the sequence length" ) - exit(-1) elif total_cnt < nsamples: logger.warning( f"Insufficient number of samples collected may affect the quantization. " diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 9474bc8b5..4d6439d03 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -368,62 +368,31 @@ def llm_load_model( return model, tokenizer -def _find_pipeline_model_subfolder_local(model_dir: str) -> tuple: - """Find model/processor subfolders from a local pipeline directory with model_index.json. +def _find_pipeline_model_subfolder(model_dir_or_repo: str, file_list: list = None) -> tuple: + """Find model/processor subfolders from a pipeline's model_index.json. - Scans component subdirectories to find the one whose config.json has 'architectures', - and looks for a 'processor' component. + Works for both local directories and remote HF repos. + + Args: + model_dir_or_repo: Local directory path or HF repo id. + file_list: If provided, treat *model_dir_or_repo* as a remote HF repo + and use *file_list* (from ``list_repo_files``) to check file existence. + If ``None``, treat it as a local directory. 
Returns: (model_subfolder, processor_subfolder, config_dict) """ - index_path = os.path.join(model_dir, "model_index.json") - if not os.path.exists(index_path): - raise FileNotFoundError(f"No config.json or model_index.json found under {model_dir}") - - with open(index_path, "r", encoding="utf-8") as f: - model_index = json.load(f) - - processor_subfolder = None - for name, value in model_index.items(): - if name == "processor" and isinstance(value, list): - processor_subfolder = "processor" - break + is_local = file_list is None - candidates = [] - for name, value in model_index.items(): - if name.startswith("_") or not isinstance(value, list) or len(value) < 2: - continue - comp_config_path = os.path.join(model_dir, name, "config.json") - if not os.path.isfile(comp_config_path): - continue - with open(comp_config_path, "r", encoding="utf-8") as f: - comp_config = json.load(f) - if "architectures" in comp_config: - candidates.append((name, comp_config)) - - if not candidates: - raise FileNotFoundError( - f"model_index.json found in {model_dir} but no component with 'architectures' in its config.json" - ) - - for name, comp_config in candidates: - arch = comp_config["architectures"][0] - if "CausalLM" in arch or "ConditionalGeneration" in arch: - return name, processor_subfolder, comp_config - - return candidates[0][0], processor_subfolder, candidates[0][1] - - -def _find_pipeline_model_subfolder_remote(repo_id: str, file_list: list) -> tuple: - """Find model/processor subfolders from a remote HF repo with model_index.json. + if is_local: + index_path = os.path.join(model_dir_or_repo, "model_index.json") + if not os.path.exists(index_path): + raise FileNotFoundError(f"No config.json or model_index.json found under {model_dir_or_repo}") + else: + from huggingface_hub import hf_hub_download - Returns: - (model_subfolder, processor_subfolder, config_dict) - """ - from huggingface_hub import hf_hub_download + index_path = hf_hub_download(model_dir_or_repo, "model_index.json") - index_path = hf_hub_download(repo_id, "model_index.json") with open(index_path, "r", encoding="utf-8") as f: model_index = json.load(f) @@ -437,18 +406,27 @@ def _find_pipeline_model_subfolder_remote(repo_id: str, file_list: list) -> tupl for name, value in model_index.items(): if name.startswith("_") or not isinstance(value, list) or len(value) < 2: continue - comp_config_file = f"{name}/config.json" - if comp_config_file not in file_list: - continue - comp_config_path = hf_hub_download(repo_id, comp_config_file) - with open(comp_config_path, "r", encoding="utf-8") as f: - comp_config = json.load(f) + # Load component config.json + if is_local: + cfg_path = os.path.join(model_dir_or_repo, name, "config.json") + if not os.path.isfile(cfg_path): + continue + with open(cfg_path, "r", encoding="utf-8") as f: + comp_config = json.load(f) + else: + comp_config_file = f"{name}/config.json" + if comp_config_file not in file_list: + continue + cfg_path = hf_hub_download(model_dir_or_repo, comp_config_file) + with open(cfg_path, "r", encoding="utf-8") as f: + comp_config = json.load(f) + if "architectures" in comp_config: candidates.append((name, comp_config)) if not candidates: raise FileNotFoundError( - f"model_index.json found for {repo_id} but no component with 'architectures' in its config.json" + f"model_index.json found in {model_dir_or_repo} but no component with 'architectures' in its config.json" ) for name, comp_config in candidates: @@ -505,7 +483,7 @@ def mllm_load_model( with open(config_path, "r", 
encoding="utf-8") as f: config = json.load(f) else: - model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder_local( + model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder( pretrained_model_name_or_path ) else: @@ -517,7 +495,7 @@ def mllm_load_model( with open(config_path, "r", encoding="utf-8") as f: config = json.load(f) elif "model_index.json" in file_list: - model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder_remote( + model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder( pretrained_model_name_or_path, file_list ) elif "config.json.gz" in file_list: diff --git a/test/test_cpu/models/test_glm_image.py b/test/test_cpu/models/test_glm_image.py index e676267a8..5f497b763 100644 --- a/test/test_cpu/models/test_glm_image.py +++ b/test/test_cpu/models/test_glm_image.py @@ -26,7 +26,7 @@ import torch.nn as nn from auto_round.special_model_handler import _get_glm_image_multimodal_block -from auto_round.utils.model import _find_pipeline_model_subfolder_local +from auto_round.utils.model import _find_pipeline_model_subfolder # --------------------------------------------------------------------------- # Helpers – fake model hierarchy @@ -191,7 +191,7 @@ def test_finds_vision_language_encoder_subfolder(self, tmp_path): "vae": {"model_type": "autoencoder_kl"}, # no architectures → ignored }, ) - model_subfolder, processor_subfolder, cfg = _find_pipeline_model_subfolder_local(pipeline_dir) + model_subfolder, processor_subfolder, cfg = _find_pipeline_model_subfolder(pipeline_dir) assert model_subfolder == "vision_language_encoder" assert processor_subfolder == "processor" @@ -210,7 +210,7 @@ def test_prefers_conditional_generation_over_encoder(self, tmp_path): }, has_processor=False, ) - model_subfolder, processor_subfolder, cfg = _find_pipeline_model_subfolder_local(pipeline_dir) + model_subfolder, processor_subfolder, cfg = _find_pipeline_model_subfolder(pipeline_dir) assert model_subfolder == "vision_language_encoder" assert processor_subfolder is None # no processor entry @@ -222,7 +222,7 @@ def test_no_processor_returns_none(self, tmp_path): {"vision_language_encoder": {"architectures": ["GlmImageForConditionalGeneration"]}}, has_processor=False, ) - _, processor_subfolder, _ = _find_pipeline_model_subfolder_local(pipeline_dir) + _, processor_subfolder, _ = _find_pipeline_model_subfolder(pipeline_dir) assert processor_subfolder is None def test_with_processor_returns_processor_subfolder(self, tmp_path): @@ -232,13 +232,13 @@ def test_with_processor_returns_processor_subfolder(self, tmp_path): {"vision_language_encoder": {"architectures": ["GlmImageForConditionalGeneration"]}}, has_processor=True, ) - _, processor_subfolder, _ = _find_pipeline_model_subfolder_local(pipeline_dir) + _, processor_subfolder, _ = _find_pipeline_model_subfolder(pipeline_dir) assert processor_subfolder == "processor" def test_raises_when_no_model_index(self, tmp_path): """FileNotFoundError raised when neither config.json nor model_index.json exists.""" with pytest.raises(FileNotFoundError, match="model_index.json"): - _find_pipeline_model_subfolder_local(str(tmp_path)) + _find_pipeline_model_subfolder(str(tmp_path)) def test_raises_when_no_component_has_architectures(self, tmp_path): """FileNotFoundError raised when no component config contains 'architectures'.""" @@ -250,7 +250,7 @@ def test_raises_when_no_component_has_architectures(self, tmp_path): }, ) with pytest.raises(FileNotFoundError, match="architectures"): - 
_find_pipeline_model_subfolder_local(pipeline_dir) + _find_pipeline_model_subfolder(pipeline_dir) def test_falls_back_to_first_candidate_when_no_preferred_arch(self, tmp_path): """When no ConditionalGeneration/CausalLM arch exists, first candidate is used.""" @@ -262,7 +262,7 @@ def test_falls_back_to_first_candidate_when_no_preferred_arch(self, tmp_path): }, has_processor=False, ) - model_subfolder, _, cfg = _find_pipeline_model_subfolder_local(pipeline_dir) + model_subfolder, _, cfg = _find_pipeline_model_subfolder(pipeline_dir) # Must be one of the candidates, not crash assert model_subfolder in ("text_encoder", "image_encoder") assert "architectures" in cfg From 57c58d9038fbe9ccd38838538149b4071ac69104 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:58:52 +0000 Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/utils/model.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 4d6439d03..f0aec180a 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -483,9 +483,7 @@ def mllm_load_model( with open(config_path, "r", encoding="utf-8") as f: config = json.load(f) else: - model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder( - pretrained_model_name_or_path - ) + model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder(pretrained_model_name_or_path) else: from huggingface_hub import hf_hub_download, list_repo_files