From 091d81c16564458e687164a31448318819a2fc29 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Sun, 8 Mar 2026 21:12:59 +0800 Subject: [PATCH 01/10] Support GLM-Image model quantization Signed-off-by: lvliang-intel --- auto_round/autoround.py | 8 +- auto_round/compressors/mllm/compressor.py | 1 + auto_round/compressors/mllm/template.py | 1 + auto_round/compressors/mllm/utils.py | 1 + auto_round/compressors/shard_writer.py | 6 +- .../export/export_to_autogptq/export.py | 10 +- .../export/export_to_autoround/export.py | 14 +- auto_round/export/utils.py | 127 +++++ auto_round/special_model_handler.py | 33 +- auto_round/utils/common.py | 1 + auto_round/utils/model.py | 141 ++++- test/test_cpu/models/test_glm_image.py | 528 ++++++++++++++++++ 12 files changed, 856 insertions(+), 15 deletions(-) create mode 100644 test/test_cpu/models/test_glm_image.py diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 4c3abe4ba..9888bfe8e 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -161,7 +161,13 @@ def __new__( model_cls = [] - if (extra_config and not extra_config.mllm_config.is_default()) or is_mllm_model(model, platform=platform): + has_multimodal_assets = kwargs.get("processor") is not None or kwargs.get("image_processor") is not None + + if ( + (extra_config and not extra_config.mllm_config.is_default()) + or has_multimodal_assets + or is_mllm_model(model, platform=platform) + ): logger.info("using MLLM mode for multimodal model.") model_cls.append(MLLMCompressor) if extra_config: diff --git a/auto_round/compressors/mllm/compressor.py b/auto_round/compressors/mllm/compressor.py index 165d4f3d3..12061ce10 100644 --- a/auto_round/compressors/mllm/compressor.py +++ b/auto_round/compressors/mllm/compressor.py @@ -493,6 +493,7 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k format=format, inplace=inplace, processor=self.processor, + image_processor=self.image_processor, quant_nontext_module=self.quant_nontext_module if hasattr(self, "quant_nontext_module") else False, **kwargs, ) diff --git a/auto_round/compressors/mllm/template.py b/auto_round/compressors/mllm/template.py index 75190a091..09a315f6f 100644 --- a/auto_round/compressors/mllm/template.py +++ b/auto_round/compressors/mllm/template.py @@ -119,6 +119,7 @@ def _register_template( _register_template("qwen2_vl", default_dataset="NeelNanda/pile-10k", processor=PROCESSORS["qwen2_vl"]) _register_template("qwen2_5_vl", default_dataset="NeelNanda/pile-10k", processor=PROCESSORS["qwen2_vl"]) +_register_template("glm_image", default_dataset="NeelNanda/pile-10k", processor=PROCESSORS["hf"]) _register_template("mllama", default_dataset="liuhaotian/llava", processor=PROCESSORS["hf"]) _register_template("deepseek_vl_v2", default_dataset="NeelNanda/pile-10k", processor=PROCESSORS["deepseek_v2"]) _register_template("mistral3", default_dataset="NeelNanda/pile-10k", processor=PROCESSORS["hf"]) diff --git a/auto_round/compressors/mllm/utils.py b/auto_round/compressors/mllm/utils.py index e8535666c..547c0503d 100644 --- a/auto_round/compressors/mllm/utils.py +++ b/auto_round/compressors/mllm/utils.py @@ -27,6 +27,7 @@ "audio", "talker", "token2wav", + "vqmodel", "multi_modal_projector", "vision_tower", "multimodal_projector", diff --git a/auto_round/compressors/shard_writer.py b/auto_round/compressors/shard_writer.py index 39964319a..dc81f2959 100644 --- a/auto_round/compressors/shard_writer.py +++ b/auto_round/compressors/shard_writer.py @@ -60,7 +60,11 @@ def __init__(self, rounder):
self.total_param_size_bytes = 0 # Directory Setup - self.output_dir = os.path.join(rounder._get_save_folder_name(rounder.formats[0]), "") + base_dir = rounder._get_save_folder_name(rounder.formats[0]) + subfolder = getattr(self.model, "_autoround_pipeline_subfolder", None) + if subfolder: + base_dir = os.path.join(base_dir, subfolder) + self.output_dir = os.path.join(base_dir, "") os.makedirs(self.output_dir, exist_ok=True) def _parse_size(self, size_str: str) -> int: diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py index 75e9b0f3d..acfed7772 100644 --- a/auto_round/export/export_to_autogptq/export.py +++ b/auto_round/export/export_to_autogptq/export.py @@ -53,6 +53,7 @@ filter_quantization_config, get_autogptq_packing_qlinear, release_layer_safely, + resolve_pipeline_export_layout, save_model, ) from auto_round.schemes import QuantizationScheme @@ -211,12 +212,17 @@ def save_quantized_as_autogptq( safe_serialization = kwargs.get("safe_serialization", True) # --- Save metadata (tokenizer, processor, etc.) --- + processor_output_dir = output_dir + model_output_dir = output_dir + if output_dir: + model_output_dir, processor_output_dir, _ = resolve_pipeline_export_layout(model, output_dir) + if output_dir: # if os.path.exists(output_dir): # logger.info(f"{output_dir} already exists, may cause overwrite conflicts.") for comp in (tokenizer, processor, image_processor): if comp is not None and hasattr(comp, "save_pretrained"): - comp.save_pretrained(output_dir) + comp.save_pretrained(processor_output_dir) # --- Handle quantization structure --- all_blocks = quant_block_list @@ -319,6 +325,6 @@ def wrapper(name): dtype = torch.float16 ##force dtype to fp16 save_model( - model, output_dir, safe_serialization=safe_serialization, dtype=dtype, config_file="quantize_config.json" + model, model_output_dir, safe_serialization=safe_serialization, dtype=dtype, config_file="quantize_config.json" ) return model diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index c57bf452c..467deab2c 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -35,6 +35,7 @@ filter_quantization_config, get_autogptq_packing_qlinear, release_layer_safely, + resolve_pipeline_export_layout, save_model, ) from auto_round.formats import AutoRoundExportFormat @@ -334,19 +335,24 @@ def wrapper(name): return model # if os.path.exists(output_dir): # logger.info(f"{output_dir} already exists, this may cause model conflict") + model_output_dir = output_dir + processor_output_dir = output_dir + if output_dir: + model_output_dir, processor_output_dir, _ = resolve_pipeline_export_layout(model, output_dir) + if tokenizer is not None and hasattr(tokenizer, "save_pretrained"): - tokenizer.save_pretrained(output_dir) + tokenizer.save_pretrained(processor_output_dir) if processor is not None: - processor.save_pretrained(output_dir) + processor.save_pretrained(processor_output_dir) if image_processor is not None: - image_processor.save_pretrained(output_dir) + image_processor.save_pretrained(processor_output_dir) if quantization_config.get("act_bits", 16) <= 8: dtype = torch.bfloat16 elif "awq" in quantization_config.get("packing_format", "auto_round:auto_gptq"): dtype = torch.float16 ## awq kernel only supports float16 on cuda else: dtype = None - save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype) + save_model(model, model_output_dir, 
safe_serialization=safe_serialization, dtype=dtype) return model diff --git a/auto_round/export/utils.py b/auto_round/export/utils.py index 42ae86d5f..5581fc46c 100644 --- a/auto_round/export/utils.py +++ b/auto_round/export/utils.py @@ -13,12 +13,139 @@ # limitations under the License. import json import os +import shutil import torch.nn as nn from auto_round.utils import copy_python_files_from_model_cache, logger, unsupported_meta_device +def is_local_pipeline_model_dir(model_dir: str) -> bool: + if not model_dir or not os.path.isdir(model_dir): + return False + return os.path.isfile(os.path.join(model_dir, "model_index.json")) + + +def is_remote_pipeline_model_dir(model_dir: str) -> bool: + if not model_dir or os.path.isdir(model_dir): + return False + try: + from huggingface_hub import list_repo_files + + return "model_index.json" in list_repo_files(model_dir) + except Exception: + return False + + +def is_pipeline_model_dir(model_dir: str) -> bool: + return is_local_pipeline_model_dir(model_dir) or is_remote_pipeline_model_dir(model_dir) + + +def _resolve_pipeline_source_dir(model: nn.Module) -> str | None: + candidates = [ + getattr(model, "name_or_path", None), + getattr(getattr(model, "config", None), "_name_or_path", None), + getattr(getattr(model, "config", None), "name_or_path", None), + ] + for candidate in candidates: + if isinstance(candidate, str) and is_pipeline_model_dir(candidate): + return candidate + return None + + +def _copy_pipeline_artifact(model_dir: str, relative_path: str, output_dir: str) -> None: + target_path = os.path.join(output_dir, relative_path) + os.makedirs(os.path.dirname(target_path), exist_ok=True) + if is_local_pipeline_model_dir(model_dir): + source_path = os.path.join(model_dir, relative_path) + else: + from huggingface_hub import hf_hub_download + + source_path = hf_hub_download(model_dir, relative_path) + shutil.copy2(source_path, target_path) + + +def _copy_pipeline_artifacts(source_dir: str, output_dir: str, exclude_components: set[str] | None = None): + exclude_components = exclude_components or set() + os.makedirs(output_dir, exist_ok=True) + + model_index_path = os.path.join(source_dir, "model_index.json") if is_local_pipeline_model_dir(source_dir) else None + if model_index_path: + with open(model_index_path, "r", encoding="utf-8") as f: + model_index = json.load(f) + else: + from huggingface_hub import hf_hub_download, list_repo_files + + with open(hf_hub_download(source_dir, "model_index.json"), "r", encoding="utf-8") as f: + model_index = json.load(f) + + component_dirs = [k for k, v in model_index.items() if not k.startswith("_") and isinstance(v, list)] + is_local = is_local_pipeline_model_dir(source_dir) + + # Copy root-level files + if is_local: + for name in os.listdir(source_dir): + src = os.path.join(source_dir, name) + if os.path.isfile(src) and ( + name in ("model_index.json", ".gitattributes") or name.lower().startswith(("readme", "license")) + ): + shutil.copy2(src, os.path.join(output_dir, name)) + else: + all_files = list(list_repo_files(source_dir)) + for name in all_files: + if "/" not in name and ( + name in ("model_index.json", ".gitattributes") or name.lower().startswith(("readme", "license")) + ): + _copy_pipeline_artifact(source_dir, name, output_dir) + + # Copy component directories + for component_name in component_dirs: + if component_name in exclude_components: + continue + if is_local: + src = os.path.join(source_dir, component_name) + dst = os.path.join(output_dir, component_name) + if os.path.isdir(src): + 
shutil.copytree(src, dst, dirs_exist_ok=True) + else: + prefix = f"{component_name}/" + for f in all_files: + if f.startswith(prefix): + _copy_pipeline_artifact(source_dir, f, output_dir) + + +def resolve_pipeline_export_layout(model: nn.Module, output_dir: str) -> tuple[str, str, bool]: + model_component = getattr(model, "_autoround_pipeline_subfolder", None) + if model_component is None: + return output_dir, output_dir, False + + source_dir = _resolve_pipeline_source_dir(model) + processor_component = None + if source_dir is not None: + try: + model_index_path = os.path.join(source_dir, "model_index.json") if is_local_pipeline_model_dir(source_dir) else None + if model_index_path: + with open(model_index_path, "r", encoding="utf-8") as f: + model_index = json.load(f) + else: + from huggingface_hub import hf_hub_download + + with open(hf_hub_download(source_dir, "model_index.json"), "r", encoding="utf-8") as f: + model_index = json.load(f) + if "processor" in model_index and isinstance(model_index["processor"], list): + processor_component = "processor" + excluded = {model_component} + if processor_component: + excluded.add(processor_component) + _copy_pipeline_artifacts(source_dir, output_dir, exclude_components=excluded) + except Exception as e: + logger.warning("Failed to copy pipeline artifacts from %s: %s", source_dir, e) + + model_output_dir = os.path.join(output_dir, model_component) + processor_output_dir = os.path.join(output_dir, processor_component) if processor_component else output_dir + return model_output_dir, processor_output_dir, True + + def save_model( model: nn.Module, save_dir: str, diff --git a/auto_round/special_model_handler.py b/auto_round/special_model_handler.py index f051b9673..695b43f51 100644 --- a/auto_round/special_model_handler.py +++ b/auto_round/special_model_handler.py @@ -21,7 +21,7 @@ from auto_round.modeling.fused_moe.replace_modules import apply_replacements, release_original_module_ from auto_round.utils import is_moe_model_via_config, logger -mllms_with_limited_bs = ("llava", "qwen2_vl", "phi3_v", "mllama") # Limitations on batch_size +mllms_with_limited_bs = ("llava", "qwen2_vl", "phi3_v", "mllama", "glm_image") # Limitations on batch_size SUPPORT_ONLY_TEXT_MODELS = [ "phi3_v", @@ -35,6 +35,7 @@ "llama4", "internvl_chat", "glm4v_moe", + "glm_image", "qwen3_vl_moe", "gemma3", ] @@ -80,7 +81,35 @@ def _get_deepseek_vl2_multimodal_block(model, quant_vision=False): return block_names -SPECIAL_MULTIMODAL_BLOCK = {"deepseek_vl_v2": _get_deepseek_vl2_multimodal_block} +def _get_glm_image_multimodal_block(model, quant_vision=False): + """Get block names for GLM-Image AR model. + + GLM-Image AR model structure: + - model.visual.blocks: vision encoder + - model.language_model.layers: autoregressive text backbone + + By default, only text backbone is quantized. Set quant_vision=True to include + the visual encoder blocks. 
+ """ + block_names = [] + + if quant_vision and hasattr(model, "model") and hasattr(model.model, "visual"): + if hasattr(model.model.visual, "blocks"): + block_names.append([f"model.visual.blocks.{i}" for i in range(len(model.model.visual.blocks))]) + + if hasattr(model, "model") and hasattr(model.model, "language_model"): + if hasattr(model.model.language_model, "layers"): + block_names.append( + [f"model.language_model.layers.{i}" for i in range(len(model.model.language_model.layers))] + ) + + return block_names + + +SPECIAL_MULTIMODAL_BLOCK = { + "deepseek_vl_v2": _get_deepseek_vl2_multimodal_block, + "glm_image": _get_glm_image_multimodal_block, +} def _deepseek_vl2_forward( diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py index c494c2959..5981e9de7 100644 --- a/auto_round/utils/common.py +++ b/auto_round/utils/common.py @@ -193,6 +193,7 @@ def __getitem__(self, key): "audio", "talker", "token2wav", + "vqmodel", "vision_model", "audio_tower", "vision_encoder", diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index f17398fcc..de561b106 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -339,6 +339,97 @@ def llm_load_model( return model, tokenizer +def _find_pipeline_model_subfolder_local(model_dir: str) -> tuple: + """Find model/processor subfolders from a local pipeline directory with model_index.json. + + Scans component subdirectories to find the one whose config.json has 'architectures', + and looks for a 'processor' component. + + Returns: + (model_subfolder, processor_subfolder, config_dict) + """ + index_path = os.path.join(model_dir, "model_index.json") + if not os.path.exists(index_path): + raise FileNotFoundError(f"No config.json or model_index.json found under {model_dir}") + + with open(index_path, "r", encoding="utf-8") as f: + model_index = json.load(f) + + processor_subfolder = None + for name, value in model_index.items(): + if name == "processor" and isinstance(value, list): + processor_subfolder = "processor" + break + + candidates = [] + for name, value in model_index.items(): + if name.startswith("_") or not isinstance(value, list) or len(value) < 2: + continue + comp_config_path = os.path.join(model_dir, name, "config.json") + if not os.path.isfile(comp_config_path): + continue + with open(comp_config_path, "r", encoding="utf-8") as f: + comp_config = json.load(f) + if "architectures" in comp_config: + candidates.append((name, comp_config)) + + if not candidates: + raise FileNotFoundError( + f"model_index.json found in {model_dir} but no component with 'architectures' in its config.json" + ) + + for name, comp_config in candidates: + arch = comp_config["architectures"][0] + if "CausalLM" in arch or "ConditionalGeneration" in arch: + return name, processor_subfolder, comp_config + + return candidates[0][0], processor_subfolder, candidates[0][1] + + +def _find_pipeline_model_subfolder_remote(repo_id: str, file_list: list) -> tuple: + """Find model/processor subfolders from a remote HF repo with model_index.json. 
+ + Returns: + (model_subfolder, processor_subfolder, config_dict) + """ + from huggingface_hub import hf_hub_download + + index_path = hf_hub_download(repo_id, "model_index.json") + with open(index_path, "r", encoding="utf-8") as f: + model_index = json.load(f) + + processor_subfolder = None + for name, value in model_index.items(): + if name == "processor" and isinstance(value, list): + processor_subfolder = "processor" + break + + candidates = [] + for name, value in model_index.items(): + if name.startswith("_") or not isinstance(value, list) or len(value) < 2: + continue + comp_config_file = f"{name}/config.json" + if comp_config_file not in file_list: + continue + comp_config_path = hf_hub_download(repo_id, comp_config_file) + with open(comp_config_path, "r", encoding="utf-8") as f: + comp_config = json.load(f) + if "architectures" in comp_config: + candidates.append((name, comp_config)) + + if not candidates: + raise FileNotFoundError( + f"model_index.json found for {repo_id} but no component with 'architectures' in its config.json" + ) + + for name, comp_config in candidates: + arch = comp_config["architectures"][0] + if "CausalLM" in arch or "ConditionalGeneration" in arch: + return name, processor_subfolder, comp_config + + return candidates[0][0], processor_subfolder, candidates[0][1] + + def mllm_load_model( pretrained_model_name_or_path: str, platform: str = "hf", @@ -377,17 +468,29 @@ def mllm_load_model( torch_dtype = "auto" if device_str is not None and "hpu" in device_str: torch_dtype = torch.bfloat16 + model_subfolder = None + processor_subfolder = None if os.path.isdir(pretrained_model_name_or_path): - config = json.load(open(os.path.join(pretrained_model_name_or_path, "config.json"))) + config_path = os.path.join(pretrained_model_name_or_path, "config.json") + if os.path.exists(config_path): + with open(config_path, "r", encoding="utf-8") as f: + config = json.load(f) + else: + model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder_local( + pretrained_model_name_or_path + ) else: from huggingface_hub import hf_hub_download, list_repo_files file_list = list_repo_files(pretrained_model_name_or_path) if "config.json" in file_list: - # Load plain JSON config_path = hf_hub_download(pretrained_model_name_or_path, "config.json") with open(config_path, "r", encoding="utf-8") as f: config = json.load(f) + elif "model_index.json" in file_list: + model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder_remote( + pretrained_model_name_or_path, file_list + ) elif "config.json.gz" in file_list: # Load gzipped JSON import gzip @@ -436,20 +539,28 @@ def mllm_load_model( else: cls = AutoModelForCausalLM try: + model_load_kwargs = {} + if model_subfolder is not None: + model_load_kwargs["subfolder"] = model_subfolder model = cls.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map="auto" if use_auto_mapping else None, + **model_load_kwargs, ) except ValueError as e: if "FP8 quantized" in str(e): with override_cuda_device_capability(): + model_load_kwargs = {} + if model_subfolder is not None: + model_load_kwargs["subfolder"] = model_subfolder model = cls.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype, device_map="auto" if use_auto_mapping else None, + **model_load_kwargs, ) logger.warning("the support for fp8 model as input is experimental, please use with caution.") else: @@ -463,11 +574,18 @@ def mllm_load_model( 
else: tokenizer = MistralTokenizer.from_hf_hub(pretrained_model_name_or_path) else: + processor_load_kwargs = {} + if processor_subfolder is not None: + processor_load_kwargs["subfolder"] = processor_subfolder tokenizer = AutoTokenizer.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **processor_load_kwargs, ) processor = AutoProcessor.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **processor_load_kwargs, ) try: if platform == "model_scope": @@ -475,17 +593,30 @@ def mllm_load_model( else: from transformers import AutoImageProcessor + image_processor_load_kwargs = {} + if processor_subfolder is not None: + image_processor_load_kwargs["subfolder"] = processor_subfolder image_processor = AutoImageProcessor.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **image_processor_load_kwargs, ) except Exception as e: pass + if model_type == "glm_image" and image_processor is not None: + from transformers.models.glm_image.processing_glm_image import GlmImageProcessor + + processor = GlmImageProcessor(image_processor=image_processor, tokenizer=tokenizer) + model = model.eval() check_and_mark_quantized_module(model) handle_generation_config(model) model = _to_model_dtype(model, model_dtype) + if model_subfolder is not None: + model._autoround_pipeline_subfolder = model_subfolder + return model, processor, tokenizer, image_processor diff --git a/test/test_cpu/models/test_glm_image.py b/test/test_cpu/models/test_glm_image.py new file mode 100644 index 000000000..af0e3f12f --- /dev/null +++ b/test/test_cpu/models/test_glm_image.py @@ -0,0 +1,528 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for GLM-Image quantize and inference helpers. + +These tests are purely local and do not load model weights from disk or +download anything. A fake model hierarchy built with ``torch.nn.Module`` +and ``types.SimpleNamespace`` is used to exercise the logic under test. +""" + +import json +import os +import types + +import pytest +import torch.nn as nn + +from auto_round.special_model_handler import _get_glm_image_multimodal_block +from auto_round.utils.model import _find_pipeline_model_subfolder_local + + +# --------------------------------------------------------------------------- +# Helpers – fake model hierarchy +# --------------------------------------------------------------------------- + +def _make_glm_image_model(n_vision_blocks: int = 4, n_lm_layers: int = 28): + """Return a minimal fake GlmImageForConditionalGeneration-like model. 
+ + Structure mirrors the real model:: + + model + ├── visual + │ └── blocks: ModuleList[n_vision_blocks] + └── language_model + └── layers: ModuleList[n_lm_layers] + """ + + class _Blocks(nn.ModuleList): + pass + + class _Visual(nn.Module): + def __init__(self): + super().__init__() + self.blocks = _Blocks([nn.Linear(8, 8) for _ in range(n_vision_blocks)]) + + class _LM(nn.Module): + def __init__(self): + super().__init__() + self.layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(n_lm_layers)]) + + class _Inner(nn.Module): + def __init__(self): + super().__init__() + self.visual = _Visual() + self.language_model = _LM() + + class _GlmImageModel(nn.Module): + def __init__(self): + super().__init__() + self.model = _Inner() + + return _GlmImageModel() + + +# --------------------------------------------------------------------------- +# Tests for _get_glm_image_multimodal_block +# --------------------------------------------------------------------------- + +class TestGetGlmImageMultimodalBlock: + """Unit tests for the GLM-Image block-name discovery helper.""" + + def test_text_only_returns_one_block_group(self): + """Default (quant_vision=False): only language_model layers are returned.""" + model = _make_glm_image_model(n_vision_blocks=4, n_lm_layers=28) + block_names = _get_glm_image_multimodal_block(model, quant_vision=False) + + assert len(block_names) == 1, "Expected exactly one block group (LM layers only)" + expected = [f"model.language_model.layers.{i}" for i in range(28)] + assert block_names[0] == expected + + def test_quant_vision_true_returns_two_block_groups(self): + """quant_vision=True: visual encoder blocks prepended before LM layers.""" + model = _make_glm_image_model(n_vision_blocks=4, n_lm_layers=28) + block_names = _get_glm_image_multimodal_block(model, quant_vision=True) + + assert len(block_names) == 2, "Expected two block groups: visual + LM" + expected_visual = [f"model.visual.blocks.{i}" for i in range(4)] + expected_lm = [f"model.language_model.layers.{i}" for i in range(28)] + assert block_names[0] == expected_visual + assert block_names[1] == expected_lm + + def test_quant_vision_false_ignores_visual_blocks(self): + """quant_vision=False must not include visual blocks even if they exist.""" + model = _make_glm_image_model(n_vision_blocks=8, n_lm_layers=10) + block_names = _get_glm_image_multimodal_block(model, quant_vision=False) + + flat = [name for group in block_names for name in group] + assert not any("visual" in name for name in flat), ( + "visual blocks must be excluded when quant_vision=False" + ) + + def test_missing_language_model_returns_empty(self): + """If the model has no language_model attribute, result is empty.""" + + class _NoLM(nn.Module): + def __init__(self): + super().__init__() + self.model = nn.Module() # no visual, no language_model + + block_names = _get_glm_image_multimodal_block(_NoLM(), quant_vision=False) + assert block_names == [] + + def test_missing_visual_blocks_with_quant_vision(self): + """quant_vision=True but visual.blocks missing: only LM layers returned.""" + + class _NoVisualBlocks(nn.Module): + def __init__(self): + super().__init__() + self.model = types.SimpleNamespace( + language_model=types.SimpleNamespace( + layers=nn.ModuleList([nn.Linear(8, 8) for _ in range(6)]) + ) + # no .visual attribute + ) + + block_names = _get_glm_image_multimodal_block(_NoVisualBlocks(), quant_vision=True) + assert len(block_names) == 1 + assert block_names[0] == [f"model.language_model.layers.{i}" for i in range(6)] + + def 
test_block_count_matches_actual_module_list_length(self): + """Block name count must equal the actual ModuleList size.""" + n_lm = 32 + model = _make_glm_image_model(n_vision_blocks=0, n_lm_layers=n_lm) + block_names = _get_glm_image_multimodal_block(model, quant_vision=False) + + assert len(block_names) == 1 + assert len(block_names[0]) == n_lm + + +# --------------------------------------------------------------------------- +# Helpers – temp filesystem for pipeline loading tests +# --------------------------------------------------------------------------- + +def _make_pipeline_dir(tmp_path, components, has_processor=True): + """Write a minimal diffusers-style pipeline directory. + + Args: + tmp_path: pytest tmp_path fixture directory. + components: dict mapping component_name → dict to write as config.json. + has_processor: if True, add a ``processor`` entry to model_index.json. + """ + model_index = {"_class_name": "GlmImagePipeline", "_diffusers_version": "0.0.1"} + if has_processor: + model_index["processor"] = ["transformers", "GlmImageProcessor"] + + for name, cfg in components.items(): + comp_dir = tmp_path / name + comp_dir.mkdir(parents=True) + (comp_dir / "config.json").write_text(json.dumps(cfg), encoding="utf-8") + model_index[name] = ["transformers", cfg.get("architectures", ["Unknown"])[0]] + + (tmp_path / "model_index.json").write_text(json.dumps(model_index), encoding="utf-8") + return str(tmp_path) + + +# --------------------------------------------------------------------------- +# Tests for _find_pipeline_model_subfolder_local +# --------------------------------------------------------------------------- + +class TestFindPipelineModelSubfolderLocal: + """Unit tests for the local pipeline subfolder discovery helper.""" + + def test_finds_vision_language_encoder_subfolder(self, tmp_path): + """The component containing GlmImageForConditionalGeneration is returned.""" + pipeline_dir = _make_pipeline_dir( + tmp_path, + { + "vision_language_encoder": { + "architectures": ["GlmImageForConditionalGeneration"], + "model_type": "glm_image", + }, + "vae": {"model_type": "autoencoder_kl"}, # no architectures → ignored + }, + ) + model_subfolder, processor_subfolder, cfg = _find_pipeline_model_subfolder_local(pipeline_dir) + + assert model_subfolder == "vision_language_encoder" + assert processor_subfolder == "processor" + assert cfg["architectures"][0] == "GlmImageForConditionalGeneration" + + def test_prefers_conditional_generation_over_encoder(self, tmp_path): + """ConditionalGeneration architecture is preferred over plain encoder.""" + pipeline_dir = _make_pipeline_dir( + tmp_path, + { + "text_encoder": {"architectures": ["T5EncoderModel"]}, + "vision_language_encoder": { + "architectures": ["GlmImageForConditionalGeneration"], + "model_type": "glm_image", + }, + }, + has_processor=False, + ) + model_subfolder, processor_subfolder, cfg = _find_pipeline_model_subfolder_local(pipeline_dir) + + assert model_subfolder == "vision_language_encoder" + assert processor_subfolder is None # no processor entry + + def test_no_processor_returns_none(self, tmp_path): + """When model_index.json has no 'processor' key, processor_subfolder is None.""" + pipeline_dir = _make_pipeline_dir( + tmp_path, + {"vision_language_encoder": {"architectures": ["GlmImageForConditionalGeneration"]}}, + has_processor=False, + ) + _, processor_subfolder, _ = _find_pipeline_model_subfolder_local(pipeline_dir) + assert processor_subfolder is None + + def test_with_processor_returns_processor_subfolder(self, 
tmp_path): + """When model_index.json has a 'processor' key, processor_subfolder=='processor'.""" + pipeline_dir = _make_pipeline_dir( + tmp_path, + {"vision_language_encoder": {"architectures": ["GlmImageForConditionalGeneration"]}}, + has_processor=True, + ) + _, processor_subfolder, _ = _find_pipeline_model_subfolder_local(pipeline_dir) + assert processor_subfolder == "processor" + + def test_raises_when_no_model_index(self, tmp_path): + """FileNotFoundError raised when neither config.json nor model_index.json exists.""" + with pytest.raises(FileNotFoundError, match="model_index.json"): + _find_pipeline_model_subfolder_local(str(tmp_path)) + + def test_raises_when_no_component_has_architectures(self, tmp_path): + """FileNotFoundError raised when no component config contains 'architectures'.""" + pipeline_dir = _make_pipeline_dir( + tmp_path, + { + "vae": {"model_type": "autoencoder_kl"}, + "scheduler": {}, + }, + ) + with pytest.raises(FileNotFoundError, match="architectures"): + _find_pipeline_model_subfolder_local(pipeline_dir) + + def test_falls_back_to_first_candidate_when_no_preferred_arch(self, tmp_path): + """When no ConditionalGeneration/CausalLM arch exists, first candidate is used.""" + pipeline_dir = _make_pipeline_dir( + tmp_path, + { + "text_encoder": {"architectures": ["T5EncoderModel"]}, + "image_encoder": {"architectures": ["CLIPVisionModel"]}, + }, + has_processor=False, + ) + model_subfolder, _, cfg = _find_pipeline_model_subfolder_local(pipeline_dir) + # Must be one of the candidates, not crash + assert model_subfolder in ("text_encoder", "image_encoder") + assert "architectures" in cfg + + +# --------------------------------------------------------------------------- +# Tests for GlmImageProcessor construction path +# --------------------------------------------------------------------------- + +class TestGlmImageProcessorConstruction: + """Unit-test the GlmImageProcessor assembly logic in mllm_load_model. + + Without loading full model weights we directly exercise the branching + code that wraps image_processor + tokenizer into GlmImageProcessor when + model_type == "glm_image". GlmImageProcessor itself is patched so the + test does not depend on transformers' internal input validation. 
+ """ + + @pytest.fixture() + def mock_components(self): + """Return minimal fake tokenizer and image_processor objects.""" + tokenizer = types.SimpleNamespace(pad_token_id=0, eos_token_id=2) + image_processor = types.SimpleNamespace(size={"height": 448, "width": 448}) + return tokenizer, image_processor + + def test_glm_image_processor_wraps_components(self, mock_components): + """GlmImageProcessor must be called with image_processor= and tokenizer=.""" + from unittest.mock import MagicMock, patch + + tokenizer, image_processor = mock_components + fake_processor = object() + mock_cls = MagicMock(return_value=fake_processor) + + # Patch away the real GlmImageProcessor so we only test the branch logic + with patch.dict("sys.modules", {"transformers.models.glm_image.processing_glm_image": types.ModuleType("_fake")}): + import sys + + sys.modules["transformers.models.glm_image.processing_glm_image"].GlmImageProcessor = mock_cls + + model_type = "glm_image" + processor = None + if model_type == "glm_image" and image_processor is not None: + from transformers.models.glm_image.processing_glm_image import GlmImageProcessor + + processor = GlmImageProcessor(image_processor=image_processor, tokenizer=tokenizer) + + mock_cls.assert_called_once_with(image_processor=image_processor, tokenizer=tokenizer) + assert processor is fake_processor + + def test_non_glm_image_model_type_skips_wrapping(self, mock_components): + """For any other model_type, the GlmImageProcessor wrapping is not applied.""" + tokenizer, image_processor = mock_components + + model_type = "qwen2_vl" + processor = None # simulate AutoProcessor result already in place + if model_type == "glm_image" and image_processor is not None: + processor = object() # should never be reached + + assert processor is None # wrapping must NOT happen + + def test_skipped_when_image_processor_is_none(self, mock_components): + """image_processor=None prevents GlmImageProcessor from being built.""" + tokenizer, _ = mock_components + + model_type = "glm_image" + image_processor = None + processor = None + if model_type == "glm_image" and image_processor is not None: + processor = object() # must not be reached + + assert processor is None + + +# --------------------------------------------------------------------------- +# Helpers – minimal PIL Image factory (no file I/O) +# --------------------------------------------------------------------------- + +def _make_rgb_image(width: int = 64, height: int = 64): + """Return a tiny solid-colour PIL Image in RGB mode.""" + from PIL import Image + + return Image.new("RGB", (width, height), color=(128, 64, 32)) + + +# --------------------------------------------------------------------------- +# Tests for image-to-image inference call logic (run_glm_image.py) +# --------------------------------------------------------------------------- + +class TestGlmImageI2ICallLogic: + """Unit tests for the image-to-image pipeline invocation logic. + + The pattern under test mirrors run_glm_image.main():: + + condition_images = [load_image(p) for p in args.reference_image] or None + result = pipe(prompt=..., image=condition_images, height=..., width=..., ...) + + No real pipeline or model weights are required. 
+ """ + + def test_no_reference_images_passes_none_to_pipeline(self): + """Empty reference_image list must yield image=None (text-to-image mode).""" + from unittest.mock import MagicMock + + reference_image_paths = [] # T2I: no reference images provided + condition_images = [_make_rgb_image() for _ in reference_image_paths] or None + + pipe = MagicMock() + pipe.return_value = MagicMock(images=[_make_rgb_image()]) + pipe(prompt="a fox", image=condition_images, height=1024, width=1024) + + _, kwargs = pipe.call_args + assert kwargs["image"] is None, "T2I: image kwarg must be None" + + def test_single_reference_image_passed_as_list(self): + """Single reference image must be wrapped in a list (not passed bare).""" + from unittest.mock import MagicMock + + ref_img = _make_rgb_image() + reference_image_paths = ["dummy_path.jpg"] + # Simulate load_image returning ref_img for each path + condition_images = [ref_img for _ in reference_image_paths] or None + + pipe = MagicMock() + pipe.return_value = MagicMock(images=[_make_rgb_image()]) + pipe(prompt="edit the sky", image=condition_images, height=33 * 32, width=32 * 32) + + _, kwargs = pipe.call_args + assert isinstance(kwargs["image"], list), "I2I: image must be a list" + assert len(kwargs["image"]) == 1 + assert kwargs["image"][0] is ref_img + + def test_multi_image_list_preserved(self): + """Multiple reference images must all be forwarded as a list.""" + from unittest.mock import MagicMock + + imgs = [_make_rgb_image() for _ in range(3)] + condition_images = imgs or None # non-empty list stays as-is + + pipe = MagicMock() + pipe.return_value = MagicMock(images=[_make_rgb_image()]) + pipe(prompt="merge subjects", image=condition_images, height=32 * 32, width=32 * 32) + + _, kwargs = pipe.call_args + assert kwargs["image"] == imgs + assert len(kwargs["image"]) == 3 + + def test_height_width_not_divisible_by_32_raises(self): + """run_glm_image.main() raises ValueError when dimensions are not multiples of 32.""" + height, width = 33 * 32 + 1, 32 * 32 # 1057 is not divisible by 32 + + with pytest.raises(ValueError, match="divisible by 32"): + if height % 32 != 0 or width % 32 != 0: + raise ValueError("GLM-Image requires height and width to be divisible by 32.") + + def test_height_width_divisible_by_32_passes(self): + """Dimensions that are multiples of 32 must not raise.""" + for height, width in [(33 * 32, 32 * 32), (1024, 768), (32, 32)]: + # Should not raise + if height % 32 != 0 or width % 32 != 0: + raise AssertionError(f"Unexpected non-multiple: {height}x{width}") + + def test_i2i_prompt_forwarded_correctly(self): + """The prompt string must be forwarded verbatim to the pipeline call.""" + from unittest.mock import MagicMock + + prompt = "Replace the background with an underground station." + ref_img = _make_rgb_image() + condition_images = [ref_img] + + pipe = MagicMock() + pipe.return_value = MagicMock(images=[_make_rgb_image()]) + pipe(prompt=prompt, image=condition_images, height=33 * 32, width=32 * 32) + + _, kwargs = pipe.call_args + assert kwargs["prompt"] == prompt + + +# --------------------------------------------------------------------------- +# Tests for load_image helper (run_glm_image.py) +# --------------------------------------------------------------------------- + +class TestLoadImage: + """Unit tests for the load_image() helper in run_glm_image. + + Covers local file loading and the URL-vs-path dispatch logic without + making any real network requests. 
+ """ + + @pytest.fixture(autouse=True) + def _import_load_image(self): + """Import load_image from run_glm_image into the test namespace.""" + import importlib + import sys + + # Ensure the workspace root is on sys.path so run_glm_image can be imported + root = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ) + if root not in sys.path: + sys.path.insert(0, root) + mod = importlib.import_module("run_glm_image") + self.load_image = mod.load_image + + def test_load_local_rgb_image(self, tmp_path): + """load_image() opens a local file and returns an RGB PIL Image.""" + from PIL import Image + + img = Image.new("RGBA", (32, 32), color=(10, 20, 30, 255)) + img_path = str(tmp_path / "test.png") + img.save(img_path) + + result = self.load_image(img_path) + + assert isinstance(result, Image.Image) + assert result.mode == "RGB" + assert result.size == (32, 32) + + def test_load_image_converts_rgba_to_rgb(self, tmp_path): + """RGBA images saved locally must be converted to RGB.""" + from PIL import Image + + img = Image.new("RGBA", (16, 16), color=(255, 0, 0, 128)) + img_path = str(tmp_path / "rgba.png") + img.save(img_path) + + result = self.load_image(img_path) + assert result.mode == "RGB" + + def test_url_branch_calls_requests_get(self): + """http/https paths must use requests.get, not PIL.Image.open directly.""" + from unittest.mock import MagicMock, patch + from io import BytesIO + from PIL import Image + + fake_img = Image.new("RGB", (8, 8), color=(0, 128, 255)) + buf = BytesIO() + fake_img.save(buf, format="PNG") + buf.seek(0) + + mock_response = MagicMock() + mock_response.raw = buf + + with patch("requests.get", return_value=mock_response) as mock_get: + result = self.load_image("https://example.com/image.png") + + mock_get.assert_called_once_with("https://example.com/image.png", timeout=60) + assert isinstance(result, Image.Image) + assert result.mode == "RGB" + + def test_local_path_does_not_call_requests(self, tmp_path): + """Local file paths must not trigger requests.get.""" + from unittest.mock import patch + from PIL import Image + + img = Image.new("RGB", (4, 4)) + img_path = str(tmp_path / "local.png") + img.save(img_path) + + with patch("requests.get") as mock_get: + self.load_image(img_path) + + mock_get.assert_not_called() From b367bf35c60db4bdd2efdc58300ce081d710c135 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 8 Mar 2026 13:26:44 +0000 Subject: [PATCH 02/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/utils.py | 4 +++- test/test_cpu/models/test_glm_image.py | 29 +++++++++++++++----------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/auto_round/export/utils.py b/auto_round/export/utils.py index 5581fc46c..ccbb94823 100644 --- a/auto_round/export/utils.py +++ b/auto_round/export/utils.py @@ -123,7 +123,9 @@ def resolve_pipeline_export_layout(model: nn.Module, output_dir: str) -> tuple[s processor_component = None if source_dir is not None: try: - model_index_path = os.path.join(source_dir, "model_index.json") if is_local_pipeline_model_dir(source_dir) else None + model_index_path = ( + os.path.join(source_dir, "model_index.json") if is_local_pipeline_model_dir(source_dir) else None + ) if model_index_path: with open(model_index_path, "r", encoding="utf-8") as f: model_index = json.load(f) diff --git a/test/test_cpu/models/test_glm_image.py 
b/test/test_cpu/models/test_glm_image.py index af0e3f12f..d87459cb5 100644 --- a/test/test_cpu/models/test_glm_image.py +++ b/test/test_cpu/models/test_glm_image.py @@ -28,11 +28,11 @@ from auto_round.special_model_handler import _get_glm_image_multimodal_block from auto_round.utils.model import _find_pipeline_model_subfolder_local - # --------------------------------------------------------------------------- # Helpers – fake model hierarchy # --------------------------------------------------------------------------- + def _make_glm_image_model(n_vision_blocks: int = 4, n_lm_layers: int = 28): """Return a minimal fake GlmImageForConditionalGeneration-like model. @@ -76,6 +76,7 @@ def __init__(self): # Tests for _get_glm_image_multimodal_block # --------------------------------------------------------------------------- + class TestGetGlmImageMultimodalBlock: """Unit tests for the GLM-Image block-name discovery helper.""" @@ -105,9 +106,7 @@ def test_quant_vision_false_ignores_visual_blocks(self): block_names = _get_glm_image_multimodal_block(model, quant_vision=False) flat = [name for group in block_names for name in group] - assert not any("visual" in name for name in flat), ( - "visual blocks must be excluded when quant_vision=False" - ) + assert not any("visual" in name for name in flat), "visual blocks must be excluded when quant_vision=False" def test_missing_language_model_returns_empty(self): """If the model has no language_model attribute, result is empty.""" @@ -127,9 +126,7 @@ class _NoVisualBlocks(nn.Module): def __init__(self): super().__init__() self.model = types.SimpleNamespace( - language_model=types.SimpleNamespace( - layers=nn.ModuleList([nn.Linear(8, 8) for _ in range(6)]) - ) + language_model=types.SimpleNamespace(layers=nn.ModuleList([nn.Linear(8, 8) for _ in range(6)])) # no .visual attribute ) @@ -151,6 +148,7 @@ def test_block_count_matches_actual_module_list_length(self): # Helpers – temp filesystem for pipeline loading tests # --------------------------------------------------------------------------- + def _make_pipeline_dir(tmp_path, components, has_processor=True): """Write a minimal diffusers-style pipeline directory. @@ -177,6 +175,7 @@ def _make_pipeline_dir(tmp_path, components, has_processor=True): # Tests for _find_pipeline_model_subfolder_local # --------------------------------------------------------------------------- + class TestFindPipelineModelSubfolderLocal: """Unit tests for the local pipeline subfolder discovery helper.""" @@ -273,6 +272,7 @@ def test_falls_back_to_first_candidate_when_no_preferred_arch(self, tmp_path): # Tests for GlmImageProcessor construction path # --------------------------------------------------------------------------- + class TestGlmImageProcessorConstruction: """Unit-test the GlmImageProcessor assembly logic in mllm_load_model. 
@@ -298,7 +298,9 @@ def test_glm_image_processor_wraps_components(self, mock_components): mock_cls = MagicMock(return_value=fake_processor) # Patch away the real GlmImageProcessor so we only test the branch logic - with patch.dict("sys.modules", {"transformers.models.glm_image.processing_glm_image": types.ModuleType("_fake")}): + with patch.dict( + "sys.modules", {"transformers.models.glm_image.processing_glm_image": types.ModuleType("_fake")} + ): import sys sys.modules["transformers.models.glm_image.processing_glm_image"].GlmImageProcessor = mock_cls @@ -341,6 +343,7 @@ def test_skipped_when_image_processor_is_none(self, mock_components): # Helpers – minimal PIL Image factory (no file I/O) # --------------------------------------------------------------------------- + def _make_rgb_image(width: int = 64, height: int = 64): """Return a tiny solid-colour PIL Image in RGB mode.""" from PIL import Image @@ -352,6 +355,7 @@ def _make_rgb_image(width: int = 64, height: int = 64): # Tests for image-to-image inference call logic (run_glm_image.py) # --------------------------------------------------------------------------- + class TestGlmImageI2ICallLogic: """Unit tests for the image-to-image pipeline invocation logic. @@ -445,6 +449,7 @@ def test_i2i_prompt_forwarded_correctly(self): # Tests for load_image helper (run_glm_image.py) # --------------------------------------------------------------------------- + class TestLoadImage: """Unit tests for the load_image() helper in run_glm_image. @@ -459,9 +464,7 @@ def _import_load_image(self): import sys # Ensure the workspace root is on sys.path so run_glm_image can be imported - root = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - ) + root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) if root not in sys.path: sys.path.insert(0, root) mod = importlib.import_module("run_glm_image") @@ -494,8 +497,9 @@ def test_load_image_converts_rgba_to_rgb(self, tmp_path): def test_url_branch_calls_requests_get(self): """http/https paths must use requests.get, not PIL.Image.open directly.""" - from unittest.mock import MagicMock, patch from io import BytesIO + from unittest.mock import MagicMock, patch + from PIL import Image fake_img = Image.new("RGB", (8, 8), color=(0, 128, 255)) @@ -516,6 +520,7 @@ def test_url_branch_calls_requests_get(self): def test_local_path_does_not_call_requests(self, tmp_path): """Local file paths must not trigger requests.get.""" from unittest.mock import patch + from PIL import Image img = Image.new("RGB", (4, 4)) From e73a31d106ecaa90a86447e79637a245e2dcdd03 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Sun, 8 Mar 2026 21:26:47 +0800 Subject: [PATCH 03/10] fix test script Signed-off-by: lvliang-intel --- test/test_cpu/models/test_glm_image.py | 87 -------------------------- 1 file changed, 87 deletions(-) diff --git a/test/test_cpu/models/test_glm_image.py b/test/test_cpu/models/test_glm_image.py index af0e3f12f..61c805979 100644 --- a/test/test_cpu/models/test_glm_image.py +++ b/test/test_cpu/models/test_glm_image.py @@ -439,90 +439,3 @@ def test_i2i_prompt_forwarded_correctly(self): _, kwargs = pipe.call_args assert kwargs["prompt"] == prompt - - -# --------------------------------------------------------------------------- -# Tests for load_image helper (run_glm_image.py) -# --------------------------------------------------------------------------- - -class TestLoadImage: - """Unit tests for the load_image() helper in 
run_glm_image. - - Covers local file loading and the URL-vs-path dispatch logic without - making any real network requests. - """ - - @pytest.fixture(autouse=True) - def _import_load_image(self): - """Import load_image from run_glm_image into the test namespace.""" - import importlib - import sys - - # Ensure the workspace root is on sys.path so run_glm_image can be imported - root = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - ) - if root not in sys.path: - sys.path.insert(0, root) - mod = importlib.import_module("run_glm_image") - self.load_image = mod.load_image - - def test_load_local_rgb_image(self, tmp_path): - """load_image() opens a local file and returns an RGB PIL Image.""" - from PIL import Image - - img = Image.new("RGBA", (32, 32), color=(10, 20, 30, 255)) - img_path = str(tmp_path / "test.png") - img.save(img_path) - - result = self.load_image(img_path) - - assert isinstance(result, Image.Image) - assert result.mode == "RGB" - assert result.size == (32, 32) - - def test_load_image_converts_rgba_to_rgb(self, tmp_path): - """RGBA images saved locally must be converted to RGB.""" - from PIL import Image - - img = Image.new("RGBA", (16, 16), color=(255, 0, 0, 128)) - img_path = str(tmp_path / "rgba.png") - img.save(img_path) - - result = self.load_image(img_path) - assert result.mode == "RGB" - - def test_url_branch_calls_requests_get(self): - """http/https paths must use requests.get, not PIL.Image.open directly.""" - from unittest.mock import MagicMock, patch - from io import BytesIO - from PIL import Image - - fake_img = Image.new("RGB", (8, 8), color=(0, 128, 255)) - buf = BytesIO() - fake_img.save(buf, format="PNG") - buf.seek(0) - - mock_response = MagicMock() - mock_response.raw = buf - - with patch("requests.get", return_value=mock_response) as mock_get: - result = self.load_image("https://example.com/image.png") - - mock_get.assert_called_once_with("https://example.com/image.png", timeout=60) - assert isinstance(result, Image.Image) - assert result.mode == "RGB" - - def test_local_path_does_not_call_requests(self, tmp_path): - """Local file paths must not trigger requests.get.""" - from unittest.mock import patch - from PIL import Image - - img = Image.new("RGB", (4, 4)) - img_path = str(tmp_path / "local.png") - img.save(img_path) - - with patch("requests.get") as mock_get: - self.load_image(img_path) - - mock_get.assert_not_called() From 510b6c4c8851fc0a9a39600e6a23b0753379c75d Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Tue, 10 Mar 2026 18:53:39 +0800 Subject: [PATCH 04/10] support hybrid mode Signed-off-by: lvliang-intel --- auto_round/autoround.py | 9 +- auto_round/compressors/__init__.py | 1 + .../compressors/diffusion/compressor.py | 9 +- auto_round/compressors/diffusion/hybrid.py | 668 ++++++++++++++++++ auto_round/utils/model.py | 65 ++ 5 files changed, 749 insertions(+), 3 deletions(-) create mode 100644 auto_round/compressors/diffusion/hybrid.py diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 9888bfe8e..b5906b839 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -25,6 +25,7 @@ LLMCompressor, MLLMCompressor, ) +from auto_round.compressors.diffusion.hybrid import HybridCompressor, is_hybrid_diffusion_model from auto_round.logger import deprecated, logger from auto_round.schemes import QuantizationScheme from auto_round.utils import is_diffusion_model, is_mllm_model @@ -163,7 +164,13 @@ def __new__( has_multimodal_assets = kwargs.get("processor") is not None or 
kwargs.get("image_processor") is not None - if ( + if is_hybrid_diffusion_model(model): + logger.info("using Hybrid AR+Diffusion mode for hybrid model.") + model_cls.append(HybridCompressor) + if extra_config: + extra_config.mllm_config = None + extra_config.diffusion_config = None + elif ( (extra_config and not extra_config.mllm_config.is_default()) or has_multimodal_assets or is_mllm_model(model, platform=platform) diff --git a/auto_round/compressors/__init__.py b/auto_round/compressors/__init__.py index 6f8ddf681..15ec27ebe 100644 --- a/auto_round/compressors/__init__.py +++ b/auto_round/compressors/__init__.py @@ -17,6 +17,7 @@ from auto_round.compressors.base import LLMCompressor from auto_round.compressors.mllm.compressor import MLLMCompressor from auto_round.compressors.diffusion.compressor import DiffusionCompressor +from auto_round.compressors.diffusion.hybrid import HybridCompressor from auto_round.compressors.config import ( DiffusionExtraConfig, ExtraConfig, diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index 6d9580e4f..09162fd41 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -31,6 +31,8 @@ extract_block_names_to_str, find_matching_blocks, get_block_names, + merge_block_output_keys, + wrap_block_forward_positional_to_kwargs, ) pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils") @@ -172,6 +174,9 @@ def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, dict]: q_inputs = {k: q_inputs.pop(k, None) for k in input_id_str} return inputs, q_inputs + def _get_block_forward_func(self, name): + return wrap_block_forward_positional_to_kwargs(super()._get_block_forward_func(name)) + def _split_inputs(self, inputs: dict, first_input_name: str) -> tuple[dict, dict]: input_id_str = [key for key in inputs.keys() if "hidden_state" in key] input_ids = {k: inputs.pop(k, None) for k in input_id_str} @@ -205,7 +210,7 @@ def _get_current_q_output( ) if isinstance(current_input_ids, dict): hidden_states = current_input_ids.pop("hidden_states") - current_input_others.update(current_input_ids) + merge_block_output_keys(block, current_input_others, current_input_ids) current_input_ids = hidden_states output_q = block_forward(block, current_input_ids, current_input_others, self.amp, self.amp_dtype, device, idx) return output_q.to(cache_device) @@ -251,7 +256,7 @@ def _get_block_outputs( ) if isinstance(tmp_input_ids, dict): hidden_states = tmp_input_ids.pop("hidden_states") - tmp_input_others.update(tmp_input_ids) + merge_block_output_keys(block, tmp_input_others, tmp_input_ids) tmp_input_ids = hidden_states tmp_output = block_forward(block, tmp_input_ids, tmp_input_others, self.amp, self.amp_dtype, device, None) diff --git a/auto_round/compressors/diffusion/hybrid.py b/auto_round/compressors/diffusion/hybrid.py new file mode 100644 index 000000000..21af6b197 --- /dev/null +++ b/auto_round/compressors/diffusion/hybrid.py @@ -0,0 +1,668 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""HybridCompressor for models with both AR and diffusion components. + +This compressor handles models that have a hybrid architecture consisting of: + - An autoregressive (AR) language model component + - A diffusion transformer (DiT) component + +It quantizes both components in a single workflow: + Phase 1: Quantize the AR model using MLLM-style text calibration + Phase 2: Quantize the DiT model using diffusion-style pipeline calibration + +Supported hybrid pipelines are registered in ``HYBRID_AR_COMPONENTS``. +To add a new model, register its AR component attribute name and (optionally) +its DiT block output config in ``output_configs``. +""" + +from __future__ import annotations + +import copy +import os +import time +from typing import Any, Union + +import torch + +from auto_round.compressors.diffusion.compressor import DiffusionCompressor, output_configs +from auto_round.logger import logger +from auto_round.schemes import QuantizationScheme +from auto_round.utils import ( + LazyImport, + clear_memory, + extract_block_names_to_str, + find_matching_blocks, + get_block_names, +) + +pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils") + +# --------------------------------------------------------------------------- +# Registry: known AR component attribute names in hybrid diffusion pipelines. +# Each entry maps a pipeline attribute name to the component role. +# When a pipeline has *both* "transformer" and one of these attributes, +# it is recognised as a hybrid model. +# To support a new hybrid architecture, simply add its AR attribute name here. +# --------------------------------------------------------------------------- +HYBRID_AR_COMPONENTS = [ + "vision_language_encoder", # GLM-Image + # Add new AR component names here, e.g.: + # "language_model", + # "text_decoder", +] + +# --------------------------------------------------------------------------- +# Register DiT block output configs for hybrid models. +# Maps block class name -> ordered list of output tensor names. +# Pure-diffusion blocks (Flux*) are already registered in DiffusionCompressor. 
+# --------------------------------------------------------------------------- +output_configs["GlmImageTransformerBlock"] = ["hidden_states", "encoder_hidden_states"] + + +# --------------------------------------------------------------------------- +# Detection +# --------------------------------------------------------------------------- + +def _find_ar_component_name(model_or_path): + """Return the AR component attribute name if model_or_path is a hybrid pipeline, else None.""" + if isinstance(model_or_path, str): + index_path = os.path.join(model_or_path, "model_index.json") + if not os.path.exists(index_path): + from huggingface_hub import hf_hub_download + try: + index_path = hf_hub_download(model_or_path, "model_index.json") + except Exception: + return None + + import json + with open(index_path) as f: + data = json.load(f) + if "transformer" not in data: + return None + for name in HYBRID_AR_COMPONENTS: + if name in data: + return name + return None + + # Runtime pipeline object + if hasattr(model_or_path, "transformer"): + for name in HYBRID_AR_COMPONENTS: + if hasattr(model_or_path, name) and getattr(model_or_path, name) is not None: + return name + return None + + +def is_hybrid_diffusion_model(model_or_path): + """Return True if *model_or_path* represents a hybrid AR+Diffusion pipeline.""" + return _find_ar_component_name(model_or_path) is not None + + +class HybridCompressor(DiffusionCompressor): + """Compressor for hybrid AR + diffusion models. + + Quantizes both the autoregressive component and the diffusion transformer + component in a single workflow. The AR component is discovered automatically + from ``HYBRID_AR_COMPONENTS``. + + Args: + model: Model name/path or DiffusionPipeline object. + tokenizer: Tokenizer (auto-loaded from pipeline if None). + guidance_scale: Guidance scale for diffusion calibration. + num_inference_steps: Denoising steps for diffusion calibration. + generator_seed: Seed for noise generator. + scheme: Quantization scheme. + dataset: Calibration dataset for DiT (default: "coco2014"). + ar_dataset: Calibration dataset for AR model (default: "NeelNanda/pile-10k"). + quant_nontext_module: Whether to also quantize vision encoder in AR model. + iters: Optimization iterations. + seqlen: Calibration sequence length for AR model. + nsamples: Number of calibration samples. + batch_size: Calibration batch size. + quant_ar: Whether to quantize the AR component. + quant_dit: Whether to quantize the DiT component. + height: Image height passed to the pipeline during DiT calibration (required by some pipelines + such as GLM-Image; ignored if the pipeline does not accept it). + width: Image width passed to the pipeline during DiT calibration. + **kwargs: Additional keyword arguments passed to base compressor. 
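+
+    Example (an illustrative sketch; the checkpoint path, output dir, and image
+    dims below are placeholders, not documented defaults)::
+
+        compressor = HybridCompressor(
+            "path/to/glm-image-pipeline",
+            scheme="W4A16",
+            nsamples=128,
+            height=1024,
+            width=1024,
+        )
+        compressor.quantize_and_save("glm-image-w4a16", format="auto_round")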
+ """ + + def __init__( + self, + model: Union[object, str], + tokenizer=None, + platform: str = "hf", + guidance_scale: float = 1.5, + num_inference_steps: int = 10, + generator_seed: int = None, + scheme: Union[str, dict, QuantizationScheme] = "W4A16", + layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, + dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "coco2014", + ar_dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k", + quant_nontext_module: bool = False, + iters: int = 200, + seqlen: int = 2048, + nsamples: int = 128, + batch_size: int = 8, + gradient_accumulate_steps: int = 1, + low_gpu_mem_usage: bool = True, + device_map: Union[str, torch.device, int, dict] = 0, + enable_torch_compile: bool = False, + seed: int = 42, + quant_ar: bool = True, + quant_dit: bool = True, + height: int = None, + width: int = None, + **kwargs, + ): + logger.warning("Hybrid AR+Diffusion model quantization is experimental.") + model_dtype = kwargs.pop("model_dtype", None) + + self.guidance_scale = guidance_scale + self.num_inference_steps = num_inference_steps + self.generator_seed = generator_seed + self.quant_ar = quant_ar + self.quant_dit = quant_dit + self.quant_nontext_module = quant_nontext_module + self.ar_dataset = ar_dataset + self.height = height + self.width = width + + to_quant_block_names: Union[str, list, None] = kwargs.pop("to_quant_block_names", None) + if device_map is None: + device_map = 0 + self._set_device(device_map) + + # --- Load the pipeline --- + if isinstance(model, str): + from auto_round.utils.model import diffusion_load_model + pipe, dit_model = diffusion_load_model( + model, platform=platform, device=self.device, model_dtype=model_dtype + ) + elif isinstance(model, pipeline_utils.DiffusionPipeline): + pipe = model + dit_model = pipe.transformer + else: + raise ValueError( + f"HybridCompressor requires a model path or DiffusionPipeline, got {type(model)}" + ) + + # --- Discover the AR component dynamically --- + self.ar_component_name = _find_ar_component_name(pipe) + if self.ar_component_name is None and self.quant_ar: + logger.warning( + f"No AR component found in pipeline (checked: {HYBRID_AR_COMPONENTS}), " + "skipping AR quantization." + ) + self.quant_ar = False + + self.pipe = pipe + self.dit_model = dit_model + self.ar_model = ( + getattr(pipe, self.ar_component_name, None) + if self.ar_component_name + else None + ) + + if not self.quant_ar and not self.quant_dit: + raise ValueError("At least one of quant_ar and quant_dit must be True.") + + model = dit_model + + # --- Detect DiT blocks --- + all_blocks = get_block_names(model) + dit_blocks = find_matching_blocks(model, all_blocks, to_quant_block_names) + + # Filter to only blocks whose class has a registered output_config. + # get_block_names may discover non-transformer ModuleLists (e.g. MLP projectors) + # that don't match the expected output format. 
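+        # The filter below runs only when the user did not pin blocks explicitly;
+        # a user-supplied to_quant_block_names selection is honored as-is.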
+ if to_quant_block_names is None: + filtered = [] + for group in dit_blocks: + if not group: + continue + parts = group[0].split(".") + m = model + for p in parts: + m = getattr(m, p) + if m.__class__.__name__ in output_configs: + filtered.append(group) + if filtered: + dit_blocks = filtered + self.dit_quant_block_list = dit_blocks + + # --- Detect AR blocks --- + if self.quant_ar and self.ar_model is not None: + from auto_round.special_model_handler import SPECIAL_MULTIMODAL_BLOCK + model_type = getattr(getattr(self.ar_model, "config", None), "model_type", None) + if model_type and model_type in SPECIAL_MULTIMODAL_BLOCK: + self.ar_quant_block_list = SPECIAL_MULTIMODAL_BLOCK[model_type]( + self.ar_model, quant_vision=quant_nontext_module + ) + else: + self.ar_quant_block_list = [get_block_names(self.ar_model)] + else: + self.ar_quant_block_list = [] + + self.quant_block_list = self.dit_quant_block_list + if to_quant_block_names is None: + to_quant_block_names = extract_block_names_to_str(self.quant_block_list) + + # Force batch_size to 1 for diffusion calibration + if iters > 0 and batch_size != 1: + logger.warning( + f"reset batch_size({batch_size}) to 1 and " + f"gradient_accumulate_steps({gradient_accumulate_steps}) " + f"to {batch_size * gradient_accumulate_steps}, " + f"because batch_size > 1 cannot be used for diffusion calibration." + ) + gradient_accumulate_steps = batch_size * gradient_accumulate_steps + batch_size = 1 + + seqlen = 2048 if seqlen is None else seqlen + + if nsamples % batch_size != 0: + nsamples = (nsamples // batch_size + 1) * batch_size + logger.warning(f"'nsamples' is not divisible by 'batch_size', adjusted to {nsamples}") + + kwargs["diffusion"] = True + self._saved_pipe = pipe + self._saved_dit_model = dit_model + self._saved_ar_model = self.ar_model + + from auto_round.compressors.base import BaseCompressor + BaseCompressor.__init__( + self, + model=model, + tokenizer=None, + platform=platform, + scheme=scheme, + layer_config=layer_config, + dataset=dataset, + iters=iters, + seqlen=seqlen, + nsamples=nsamples, + batch_size=batch_size, + gradient_accumulate_steps=gradient_accumulate_steps, + low_gpu_mem_usage=low_gpu_mem_usage, + device_map=device_map, + enable_torch_compile=enable_torch_compile, + seed=seed, + to_quant_block_names=to_quant_block_names, + **kwargs, + ) + + # Restore references that BaseCompressor.__init__ may have overwritten + self.pipe = self._saved_pipe + self.dit_model = self._saved_dit_model + self.ar_model = self._saved_ar_model + + # ------------------------------------------------------------------ + # Quantization + # ------------------------------------------------------------------ + + def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: + """Quantize both AR and DiT components. + + Phase 1: AR model via MLLM-style text calibration. + Phase 2: DiT model via diffusion pipeline calibration. 
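+
+        Returns:
+            Tuple of ``(model, layer_config)``: the quantized DiT module and the
+            combined per-layer config whose keys carry ``ar.`` / ``dit.`` prefixes.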
+ """ + start_time = time.time() + combined_layer_config = {} + + # =================== Phase 1: AR Model =================== + if self.quant_ar and self.ar_model is not None: + logger.info("=" * 60) + logger.info(f"Phase 1: Quantizing AR model ({self.ar_component_name})") + logger.info("=" * 60) + + ar_compressor = self._create_ar_compressor() + ar_model, ar_layer_config = ar_compressor.quantize() + + self.ar_model = ar_model + setattr(self.pipe, self.ar_component_name, ar_model) + combined_layer_config.update( + {f"ar.{k}": v for k, v in ar_layer_config.items()} + ) + self.ar_layer_config = ar_layer_config + + # Preserve serialization-relevant attributes from the AR compressor + # so save_quantized can build the correct serialization_dict. + from auto_round.compressors.base import SERIALIZATION_KEYS + self._ar_serialization = { + k: getattr(ar_compressor, k, None) for k in SERIALIZATION_KEYS + } + + # Move AR model to CPU to free GPU for Phase 2 + self.ar_model.to("cpu") + clear_memory(device_list=self.device_list) + logger.info(f"Phase 1 complete: AR model ({self.ar_component_name}) quantized") + + # =================== Phase 2: DiT Model =================== + if self.quant_dit: + logger.info("=" * 60) + logger.info("Phase 2: Quantizing DiT model (transformer)") + logger.info("=" * 60) + + # Move DiT to target device for calibration + self.dit_model = self.dit_model.to(self.device) + self.model = self.dit_model + self.quant_block_list = self.dit_quant_block_list + self.quantized = False + self.batch_dim = None + + for n, m in self.model.named_modules(): + m.global_name = n + + dit_model, dit_layer_config = self._quantize_dit() + + self.dit_model = dit_model + self.pipe.transformer = dit_model + combined_layer_config.update( + {f"dit.{k}": v for k, v in dit_layer_config.items()} + ) + self.dit_layer_config = dit_layer_config + + logger.info("Phase 2 complete: DiT model quantized") + + end_time = time.time() + logger.info(f"Total hybrid quantization time: {end_time - start_time:.1f}s") + + self.quantized = True + self.layer_config = combined_layer_config + self.model = self.dit_model + return self.model, self.layer_config + + def _create_ar_compressor(self): + """Create an MLLM compressor for the AR component.""" + from auto_round.compressors.mllm.compressor import MLLMCompressor + + processor = getattr(self.pipe, "processor", None) + tokenizer = getattr(self.pipe, "tokenizer", None) + + ar = MLLMCompressor( + model=self.ar_model, + tokenizer=tokenizer, + processor=processor, + image_processor=None, + platform=self.platform, + scheme=copy.deepcopy(self.orig_scheme) if hasattr(self, "orig_scheme") else self.scheme, + dataset=self.ar_dataset, + quant_nontext_module=self.quant_nontext_module, + iters=self.iters, + seqlen=self.seqlen, + nsamples=self.nsamples, + batch_size=1, + gradient_accumulate_steps=self.gradient_accumulate_steps, + low_gpu_mem_usage=self.low_gpu_mem_usage, + device_map=self.device_map, + enable_torch_compile=self.enable_torch_compile, + seed=self.seed, + ) + if hasattr(self, "formats"): + ar.formats = self.formats + # Required by base.quantize() → _adjust_immediate_packing_and_saving(); + # None disables immediate packing (correct since we call quantize() directly). + ar.orig_output_dir = None + return ar + + def _quantize_dit(self): + """Quantize the DiT model using the parent DiffusionCompressor's quantize flow.""" + return DiffusionCompressor.quantize(self) + + def calib(self, nsamples, bs): + """Override calib to pass extra pipeline kwargs (e.g. height/width) if set. 
+
+        Pipelines like GLM-Image require explicit image dimensions; standard diffusion
+        pipelines (FLUX etc.) accept but ignore them.
+        """
+        import inspect
+        pipe_sig = inspect.signature(self.pipe.__call__)
+        extra = {}
+        if "height" in pipe_sig.parameters and self.height is not None:
+            extra["height"] = self.height
+        if "width" in pipe_sig.parameters and self.width is not None:
+            extra["width"] = self.width
+
+        if not extra:
+            # No extra kwargs needed — delegate to parent as-is
+            return DiffusionCompressor.calib(self, nsamples, bs)
+
+        # Replicate parent calib() with extra kwargs injected into the pipe call
+        from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader
+        from auto_round.utils import clear_memory
+        from tqdm import tqdm
+
+        logger.warning(
+            "Diffusion model will catch nsamples * num_inference_steps inputs, "
+            "you can reduce nsamples or num_inference_steps if OOM or take too much time."
+        )
+        if isinstance(self.dataset, str):
+            dataset = self.dataset.replace(" ", "")
+            self.dataloader, self.batch_size, self.gradient_accumulate_steps = get_diffusion_dataloader(
+                dataset=dataset,
+                bs=self.batch_size,
+                seed=self.seed,
+                nsamples=self.nsamples,
+                gradient_accumulate_steps=self.gradient_accumulate_steps,
+            )
+        else:
+            self.dataloader = self.dataset
+        total_cnt = 0
+
+        total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader))
+        if self.pipe.dtype != self.model.dtype:
+            self.pipe.to(self.model.dtype)
+        if self.pipe.device != self.model.device:
+            self.pipe.to(self.model.device)
+
+        with tqdm(range(1, total + 1), desc="cache block inputs") as pbar:
+            for ids, prompts in self.dataloader:
+                if isinstance(prompts, tuple):
+                    prompts = list(prompts)
+                try:
+                    self.pipe(
+                        prompt=prompts,
+                        guidance_scale=self.guidance_scale,
+                        num_inference_steps=self.num_inference_steps,
+                        generator=(
+                            None
+                            if self.generator_seed is None
+                            else torch.Generator(device=self.pipe.device).manual_seed(self.generator_seed)
+                        ),
+                        **extra,
+                    )
+                except NotImplementedError:
+                    pass
+                except Exception as error:
+                    raise error
+                step = len(prompts)
+                total_cnt += step
+                pbar.update(step)
+                if total_cnt >= nsamples:
+                    break
+
+        if total_cnt == 0:
+            logger.error(
+                f"no data has been cached, please provide more data with sequence length >={self.seqlen} in the "
+                f"dataset or decease the sequence length"
+            )
+            exit(-1)
+        elif total_cnt < nsamples:
+            logger.warning(
+                f"Insufficient number of samples collected may affect the quantization. "
+                f"target samples count is {nsamples}, while valid samples count is {total_cnt}"
+            )
+        if total_cnt < self.batch_size:
+            raise ValueError(
+                f"valid samples is less than batch_size({self.batch_size}),"
+                " please adjust self.batch_size or seqlen."
+            )
+        max_len = (total_cnt // self.batch_size) * self.batch_size
+        for k, v in self.inputs.items():
+            for key in v:
+                if isinstance(v[key], list) and len(v[key]) == total_cnt:
+                    self.inputs[k][key] = v[key][:max_len]
+
+    # ------------------------------------------------------------------
+    # Saving
+    # ------------------------------------------------------------------
+
+    def save_quantized(self, output_dir=None, format="auto_round", ar_format=None,
+                       dit_format=None, inplace=True, **kwargs):
+        """Save both quantized AR and DiT models into a pipeline directory structure.
+
+        The output directory mirrors the original pipeline layout::
+
+            output_dir/
+                model_index.json
+                <ar_component_name>/ (quantized AR model)
+                transformer/ (quantized DiT model)
+                ... 
(unchanged auxiliary components) + + Args: + ar_format: Export format for the AR component. Falls back to *format*. + dit_format: Export format for the DiT component. Falls back to *format*. + """ + if output_dir is None: + logger.warning("output_dir is None, skipping save") + return + + from auto_round.formats import get_formats + from auto_round.compressors.base import BaseCompressor + + if ar_format is None: + ar_format = format + if dit_format is None: + dit_format = format + + saved_formats = self.formats # preserve original + + # Save DiT + if self.quant_dit: + dit_subdir = "transformer" + logger.info(f"Saving quantized DiT model ({dit_subdir}) [format={dit_format}]") + dit_output_dir = os.path.join(output_dir, dit_subdir) + os.makedirs(dit_output_dir, exist_ok=True) + + self.model = self.dit_model + if hasattr(self, "dit_layer_config"): + self.layer_config = self.dit_layer_config + + self.formats = get_formats(dit_format, self) + BaseCompressor.save_quantized( + self, output_dir=dit_output_dir, format=dit_format, inplace=inplace, **kwargs + ) + + # Save AR + if self.quant_ar and self.ar_model is not None: + ar_subdir = self.ar_component_name # e.g. "vision_language_encoder" + logger.info(f"Saving quantized AR model ({ar_subdir}) [format={ar_format}]") + ar_output_dir = os.path.join(output_dir, ar_subdir) + os.makedirs(ar_output_dir, exist_ok=True) + + self.model = self.ar_model + if hasattr(self, "ar_layer_config"): + self.layer_config = self.ar_layer_config + + # Swap serialization attributes from the AR compressor so that + # BaseCompressor.save_quantized builds the correct config. + ar_ser = getattr(self, "_ar_serialization", {}) + saved_attrs = {} + for k, v in ar_ser.items(): + saved_attrs[k] = getattr(self, k, None) + setattr(self, k, v) + + self.formats = get_formats(ar_format, self) + BaseCompressor.save_quantized( + self, output_dir=ar_output_dir, format=ar_format, inplace=inplace, **kwargs + ) + + # Restore DiT serialization attributes + for k, v in saved_attrs.items(): + setattr(self, k, v) + + self.formats = saved_formats + self._save_pipeline_metadata(output_dir) + self.model = self.dit_model + logger.info(f"Full hybrid quantized model saved to {output_dir}") + + def _save_pipeline_metadata(self, output_dir): + """Save model_index.json and auxiliary pipeline components.""" + src_path = ( + getattr(getattr(self.pipe, "config", None), "_name_or_path", None) + or getattr(self.pipe, "name_or_path", None) + ) + if src_path and os.path.exists(os.path.join(src_path, "model_index.json")): + import shutil + dst_index = os.path.join(output_dir, "model_index.json") + if not os.path.exists(dst_index): + shutil.copy2(os.path.join(src_path, "model_index.json"), dst_index) + + # Save non-quantized pipeline components so the exported directory remains + # loadable as a complete diffusers pipeline even when only one branch is quantized. 
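+        # Component dirs already written by a quantized branch are skipped below
+        # via the os.path.exists check, so quantized outputs are never overwritten.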
+ component_names = [ + "scheduler", + "tokenizer", + "processor", + "vae", + "text_encoder", + ] + if not self.quant_ar and self.ar_component_name is not None: + component_names.append(self.ar_component_name) + if not self.quant_dit: + component_names.append("transformer") + + for component_name in component_names: + component = getattr(self.pipe, component_name, None) + if component is None: + continue + component_dir = os.path.join(output_dir, component_name) + if os.path.exists(component_dir): + continue + try: + if hasattr(component, "save_pretrained"): + component.save_pretrained(component_dir) + except Exception as e: + logger.warning(f"Failed to save {component_name}: {e}") + + def quantize_and_save( + self, + output_dir: str = "tmp_autoround", + format: str = "auto_round", + ar_format: str = None, + dit_format: str = None, + inplace: bool = True, + **kwargs, + ): + """Quantize both components and save the complete pipeline. + + Args: + format: Default export format (used when *ar_format* / *dit_format* is None). + ar_format: Export format for the AR component. Falls back to *format*. + dit_format: Export format for the DiT component. Falls back to *format*. + """ + from auto_round.formats import get_formats + + format_list = get_formats(format, self) + self.formats = format_list + self.orig_output_dir = output_dir # required by base.quantize() → _adjust_immediate_packing_and_saving() + + self.quantize() + self.save_quantized( + output_dir, format=format, ar_format=ar_format, + dit_format=dit_format, inplace=inplace, **kwargs, + ) + logger.info(f"Hybrid quantized model saved to {output_dir}") + return self.model, [output_dir] diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index de561b106..55b5c4982 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import collections +import inspect import json import os import re @@ -1654,3 +1655,67 @@ def handle_generation_config(model: torch.nn.Module): model.generation_config.do_sample = True if hasattr(generation_config, "temperature") and generation_config.temperature != 1.0: model.generation_config.do_sample = True + + +def merge_block_output_keys(block, input_others, extra_keys): + """Merge block output keys into input_others, resolving positional/keyword conflicts. + + When a block is called with positional args (stored in input_others["positional_inputs"]), + and the block output produces updated values for those same parameters (e.g., + encoder_hidden_states), we must update the positional arg rather than adding a duplicate + keyword arg, which would cause "got multiple values for argument" errors. + """ + positional_inputs = input_others.get("positional_inputs") + if not positional_inputs or not extra_keys: + input_others.update(extra_keys) + return + + try: + sig = inspect.signature(block.forward) + except (ValueError, TypeError): + input_others.update(extra_keys) + return + + params = [p for p in sig.parameters.keys() if p != "self"] + # params[0] = hidden_states (passed as input_ids separately) + # params[1:] correspond to positional_inputs[0], [1], ... 
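+    #
+    # e.g. for forward(self, hidden_states, encoder_hidden_states, temb) with
+    # extra_keys = {"encoder_hidden_states": t}, the new value replaces
+    # positional_inputs[0] instead of becoming a duplicate keyword argument.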
+ + positional_inputs = list(positional_inputs) + for key, value in extra_keys.items(): + if key in params: + pos_idx = params.index(key) - 1 # -1 because hidden_states is params[0] + if 0 <= pos_idx < len(positional_inputs): + positional_inputs[pos_idx] = value + continue + input_others[key] = value + input_others["positional_inputs"] = tuple(positional_inputs) + + +def wrap_block_forward_positional_to_kwargs(base_hook): + """Wrap a block forward hook to convert positional inputs to keyword args. + + Models like GLM-Image call transformer blocks with positional args + (e.g. block(hidden_states, encoder_hidden_states, temb, ...)). The base + hook only stores positional_inputs once (from the first sample), losing + per-sample variation for encoder_hidden_states etc. By converting + positional args to keyword args, all inputs are properly accumulated + across calibration samples. + """ + _param_names = None + + def forward(m, hidden_states=None, *positional_inputs, **kwargs): + nonlocal _param_names + if positional_inputs: + if _param_names is None: + sig = inspect.signature(m.orig_forward) + _param_names = [p for p in sig.parameters.keys() if p != "self"] + for i, val in enumerate(positional_inputs): + param_idx = i + 1 # hidden_states is params[0] + if param_idx < len(_param_names): + param_name = _param_names[param_idx] + if param_name not in kwargs: + kwargs[param_name] = val + positional_inputs = () + return base_hook(m, hidden_states, *positional_inputs, **kwargs) + + return forward From 27cddba095b0a2e45a18d31a3cde270a139101e7 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Tue, 17 Mar 2026 21:34:43 +0800 Subject: [PATCH 05/10] fix hybrid mode Signed-off-by: lvliang-intel --- auto_round/compressors/diffusion/hybrid.py | 72 +++++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/diffusion/hybrid.py b/auto_round/compressors/diffusion/hybrid.py index e74981f89..b18fa2a9a 100644 --- a/auto_round/compressors/diffusion/hybrid.py +++ b/auto_round/compressors/diffusion/hybrid.py @@ -31,6 +31,7 @@ import copy import os +import shutil import time from typing import Any, Union @@ -412,10 +413,10 @@ def _create_ar_compressor(self): ) if hasattr(self, "formats"): ar.formats = self.formats - ar.inplace = False # Required by base.quantize() → _adjust_immediate_packing_and_saving(); # None disables immediate packing (correct since we call quantize() directly). ar.orig_output_dir = None + ar.inplace = True return ar def _quantize_dit(self): @@ -520,6 +521,68 @@ def calib(self, nsamples, bs): # Saving # ------------------------------------------------------------------ + @staticmethod + def _flatten_nested_component_dir(component_output_dir: str, component_name: str) -> None: + """Fix accidental nested save layouts (e.g. transformer/transformer/config.json). + + Some model config savers may create a same-name nested directory under the + component output path. Flatten it so pipeline loaders find config files in + the expected component root. + """ + model_markers = ( + "config.json", + "generation_config.json", + "model.safetensors", + "model.safetensors.index.json", + ) + + nested_dir = os.path.join(component_output_dir, component_name) + + # Some exporters write into a single nested model folder (for example, + # vision_language_encoder/transformer). If same-name nesting does not + # exist, try to detect this pattern and flatten it as well. 
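+        # Only flatten when the component root itself holds no model files and
+        # exactly one child directory does contain them; otherwise leave as-is.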
+ if not os.path.isdir(nested_dir): + child_dirs = [ + os.path.join(component_output_dir, name) + for name in os.listdir(component_output_dir) + if os.path.isdir(os.path.join(component_output_dir, name)) + ] + has_model_files_at_root = any( + os.path.exists(os.path.join(component_output_dir, marker)) + for marker in model_markers + ) + if len(child_dirs) == 1 and not has_model_files_at_root: + candidate = child_dirs[0] + has_model_files_in_child = any( + os.path.exists(os.path.join(candidate, marker)) + for marker in model_markers + ) + if has_model_files_in_child: + nested_dir = candidate + else: + return + else: + return + + moved = 0 + for entry in os.listdir(nested_dir): + src = os.path.join(nested_dir, entry) + dst = os.path.join(component_output_dir, entry) + if os.path.exists(dst): + continue + shutil.move(src, dst) + moved += 1 + + if moved > 0: + logger.warning( + "Flattened nested component directory %s -> %s", + nested_dir, + component_output_dir, + ) + + # Remove the nested directory when empty. + if os.path.isdir(nested_dir) and len(os.listdir(nested_dir)) == 0: + os.rmdir(nested_dir) def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **kwargs): """Save both quantized AR and DiT models into a pipeline directory structure. @@ -558,6 +621,7 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k BaseCompressor.save_quantized( self, output_dir=dit_output_dir, format=format, inplace=inplace, **kwargs ) + self._flatten_nested_component_dir(dit_output_dir, dit_subdir) # Save AR if self.quant_ar and self.ar_model is not None: @@ -582,6 +646,7 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k BaseCompressor.save_quantized( self, output_dir=ar_output_dir, format=format, inplace=inplace, **kwargs ) + self._flatten_nested_component_dir(ar_output_dir, ar_subdir) # Restore DiT serialization attributes for k, v in saved_attrs.items(): @@ -647,8 +712,11 @@ def quantize_and_save( format_list = get_formats(format, self) self.formats = format_list - self.orig_output_dir = output_dir # required by base.quantize() → _adjust_immediate_packing_and_saving() self.inplace = inplace + # Keep orig_output_dir as None so _adjust_immediate_packing_and_saving() + # disables immediate saving — diffusers models must go through + # model.save_pretrained() to get correct weight file names. + self.orig_output_dir = None self.quantize() self.save_quantized( From 0ec767ae177d478df38e22d060cc271bd15a5099 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Mar 2026 13:42:10 +0000 Subject: [PATCH 06/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/diffusion/hybrid.py | 70 ++++++++++------------ 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/auto_round/compressors/diffusion/hybrid.py b/auto_round/compressors/diffusion/hybrid.py index b18fa2a9a..4670f2d35 100644 --- a/auto_round/compressors/diffusion/hybrid.py +++ b/auto_round/compressors/diffusion/hybrid.py @@ -58,7 +58,7 @@ # To support a new hybrid architecture, simply add its AR attribute name here. 
# --------------------------------------------------------------------------- HYBRID_AR_COMPONENTS = [ - "vision_language_encoder", # GLM-Image + "vision_language_encoder", # GLM-Image # Add new AR component names here, e.g.: # "language_model", # "text_decoder", @@ -76,18 +76,21 @@ # Detection # --------------------------------------------------------------------------- + def _find_ar_component_name(model_or_path): """Return the AR component attribute name if model_or_path is a hybrid pipeline, else None.""" if isinstance(model_or_path, str): index_path = os.path.join(model_or_path, "model_index.json") if not os.path.exists(index_path): from huggingface_hub import hf_hub_download + try: index_path = hf_hub_download(model_or_path, "model_index.json") except Exception: return None import json + with open(index_path) as f: data = json.load(f) if "transformer" not in data: @@ -188,6 +191,7 @@ def __init__( # --- Load the pipeline --- if isinstance(model, str): from auto_round.utils.model import diffusion_load_model + pipe, dit_model = diffusion_load_model( model, platform=platform, device=self.device, model_dtype=model_dtype ) @@ -195,26 +199,19 @@ def __init__( pipe = model dit_model = pipe.transformer else: - raise ValueError( - f"HybridCompressor requires a model path or DiffusionPipeline, got {type(model)}" - ) + raise ValueError(f"HybridCompressor requires a model path or DiffusionPipeline, got {type(model)}") # --- Discover the AR component dynamically --- self.ar_component_name = _find_ar_component_name(pipe) if self.ar_component_name is None and self.quant_ar: logger.warning( - f"No AR component found in pipeline (checked: {HYBRID_AR_COMPONENTS}), " - "skipping AR quantization." + f"No AR component found in pipeline (checked: {HYBRID_AR_COMPONENTS}), " "skipping AR quantization." ) self.quant_ar = False self.pipe = pipe self.dit_model = dit_model - self.ar_model = ( - getattr(pipe, self.ar_component_name, None) - if self.ar_component_name - else None - ) + self.ar_model = getattr(pipe, self.ar_component_name, None) if self.ar_component_name else None if not self.quant_ar and not self.quant_dit: raise ValueError("At least one of quant_ar and quant_dit must be True.") @@ -246,6 +243,7 @@ def __init__( # --- Detect AR blocks --- if self.quant_ar and self.ar_model is not None: from auto_round.special_model_handler import SPECIAL_MULTIMODAL_BLOCK + model_type = getattr(getattr(self.ar_model, "config", None), "model_type", None) if model_type and model_type in SPECIAL_MULTIMODAL_BLOCK: self.ar_quant_block_list = SPECIAL_MULTIMODAL_BLOCK[model_type]( @@ -283,6 +281,7 @@ def __init__( self._saved_ar_model = self.ar_model from auto_round.compressors.base import BaseCompressor + BaseCompressor.__init__( self, model=model, @@ -333,17 +332,14 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self.ar_model = ar_model setattr(self.pipe, self.ar_component_name, ar_model) - combined_layer_config.update( - {f"ar.{k}": v for k, v in ar_layer_config.items()} - ) + combined_layer_config.update({f"ar.{k}": v for k, v in ar_layer_config.items()}) self.ar_layer_config = ar_layer_config # Preserve serialization-relevant attributes from the AR compressor # so save_quantized can build the correct serialization_dict. 
from auto_round.compressors.base import SERIALIZATION_KEYS - self._ar_serialization = { - k: getattr(ar_compressor, k, None) for k in SERIALIZATION_KEYS - } + + self._ar_serialization = {k: getattr(ar_compressor, k, None) for k in SERIALIZATION_KEYS} # Move AR model to CPU to free GPU for Phase 2 self.ar_model.to("cpu") @@ -370,9 +366,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self.dit_model = dit_model self.pipe.transformer = dit_model - combined_layer_config.update( - {f"dit.{k}": v for k, v in dit_layer_config.items()} - ) + combined_layer_config.update({f"dit.{k}": v for k, v in dit_layer_config.items()}) self.dit_layer_config = dit_layer_config logger.info("Phase 2 complete: DiT model quantized") @@ -430,6 +424,7 @@ def calib(self, nsamples, bs): pipelines (FLUX etc.) accept but ignore them. """ import inspect + pipe_sig = inspect.signature(self.pipe.__call__) extra = {} if "height" in pipe_sig.parameters and self.height is not None: @@ -442,9 +437,10 @@ def calib(self, nsamples, bs): return DiffusionCompressor.calib(self, nsamples, bs) # Replicate parent calib() with extra kwargs injected into the pipe call + from tqdm import tqdm + from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader from auto_round.utils import clear_memory - from tqdm import tqdm logger.warning( "Diffusion model will catch nsamples * num_inference_steps inputs, " @@ -548,14 +544,12 @@ def _flatten_nested_component_dir(component_output_dir: str, component_name: str if os.path.isdir(os.path.join(component_output_dir, name)) ] has_model_files_at_root = any( - os.path.exists(os.path.join(component_output_dir, marker)) - for marker in model_markers + os.path.exists(os.path.join(component_output_dir, marker)) for marker in model_markers ) if len(child_dirs) == 1 and not has_model_files_at_root: candidate = child_dirs[0] has_model_files_in_child = any( - os.path.exists(os.path.join(candidate, marker)) - for marker in model_markers + os.path.exists(os.path.join(candidate, marker)) for marker in model_markers ) if has_model_files_in_child: nested_dir = candidate @@ -583,6 +577,7 @@ def _flatten_nested_component_dir(component_output_dir: str, component_name: str # Remove the nested directory when empty. if os.path.isdir(nested_dir) and len(os.listdir(nested_dir)) == 0: os.rmdir(nested_dir) + def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **kwargs): """Save both quantized AR and DiT models into a pipeline directory structure. 
@@ -601,8 +596,8 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k logger.warning("output_dir is None, skipping save") return - from auto_round.formats import get_formats from auto_round.compressors.base import BaseCompressor + from auto_round.formats import get_formats saved_formats = self.formats # preserve original @@ -618,9 +613,7 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k self.layer_config = self.dit_layer_config self.formats = get_formats(format, self) - BaseCompressor.save_quantized( - self, output_dir=dit_output_dir, format=format, inplace=inplace, **kwargs - ) + BaseCompressor.save_quantized(self, output_dir=dit_output_dir, format=format, inplace=inplace, **kwargs) self._flatten_nested_component_dir(dit_output_dir, dit_subdir) # Save AR @@ -636,16 +629,14 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k # Swap serialization attributes from the AR compressor so that # BaseCompressor.save_quantized builds the correct config. - ar_ser = getattr(self, "_ar_serialization", {}) + ar_set = getattr(self, "_ar_serialization", {}) saved_attrs = {} - for k, v in ar_ser.items(): + for k, v in ar_set.items(): saved_attrs[k] = getattr(self, k, None) setattr(self, k, v) self.formats = get_formats(format, self) - BaseCompressor.save_quantized( - self, output_dir=ar_output_dir, format=format, inplace=inplace, **kwargs - ) + BaseCompressor.save_quantized(self, output_dir=ar_output_dir, format=format, inplace=inplace, **kwargs) self._flatten_nested_component_dir(ar_output_dir, ar_subdir) # Restore DiT serialization attributes @@ -659,12 +650,12 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k def _save_pipeline_metadata(self, output_dir): """Save model_index.json and auxiliary pipeline components.""" - src_path = ( - getattr(getattr(self.pipe, "config", None), "_name_or_path", None) - or getattr(self.pipe, "name_or_path", None) + src_path = getattr(getattr(self.pipe, "config", None), "_name_or_path", None) or getattr( + self.pipe, "name_or_path", None ) if src_path and os.path.exists(os.path.join(src_path, "model_index.json")): import shutil + dst_index = os.path.join(output_dir, "model_index.json") if not os.path.exists(dst_index): shutil.copy2(os.path.join(src_path, "model_index.json"), dst_index) @@ -720,7 +711,10 @@ def quantize_and_save( self.quantize() self.save_quantized( - output_dir, format=format, inplace=inplace, **kwargs, + output_dir, + format=format, + inplace=inplace, + **kwargs, ) logger.info(f"Hybrid quantized model saved to {output_dir}") return self.model, [output_dir] From dcf3b52d58a7d8866c89429e503148111e504e44 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Mar 2026 02:38:47 +0000 Subject: [PATCH 07/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/special_model_handler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/auto_round/special_model_handler.py b/auto_round/special_model_handler.py index 2fc400d43..c9ae830a0 100644 --- a/auto_round/special_model_handler.py +++ b/auto_round/special_model_handler.py @@ -172,6 +172,7 @@ def _get_qwen3_omni_moe_multimodal_block(model, quant_vision=False): return block_names + def _get_glm_image_multimodal_block(model, quant_vision=False): """Get block names for GLM-Image AR model. 
From 0e4dafcd548b2c0b097cae318d0d5e2f739a620c Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Thu, 19 Mar 2026 10:51:16 +0800 Subject: [PATCH 08/10] fix issue Signed-off-by: lvliang-intel --- auto_round/special_model_handler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/auto_round/special_model_handler.py b/auto_round/special_model_handler.py index c9ae830a0..73dbf4896 100644 --- a/auto_round/special_model_handler.py +++ b/auto_round/special_model_handler.py @@ -195,6 +195,8 @@ def _get_glm_image_multimodal_block(model, quant_vision=False): [f"model.language_model.layers.{i}" for i in range(len(model.model.language_model.layers))] ) + return block_names + SPECIAL_MULTIMODAL_BLOCK = { "deepseek_vl_v2": _get_deepseek_vl2_multimodal_block, From 19f84845dddcf29339e85f5f40afbb51905f7b74 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Fri, 20 Mar 2026 21:50:48 +0800 Subject: [PATCH 09/10] fix comments Signed-off-by: lvliang-intel --- auto_round/compressors/diffusion/hybrid.py | 5 +- auto_round/utils/model.py | 92 ++++++++-------------- test/test_cpu/models/test_glm_image.py | 16 ++-- 3 files changed, 45 insertions(+), 68 deletions(-) diff --git a/auto_round/compressors/diffusion/hybrid.py b/auto_round/compressors/diffusion/hybrid.py index 4670f2d35..81c80d700 100644 --- a/auto_round/compressors/diffusion/hybrid.py +++ b/auto_round/compressors/diffusion/hybrid.py @@ -492,11 +492,10 @@ def calib(self, nsamples, bs): break if total_cnt == 0: - logger.error( + raise RuntimeError( f"no data has been cached, please provide more data with sequence length >={self.seqlen} in the " - f"dataset or decease the sequence length" + f"dataset or decrease the sequence length" ) - exit(-1) elif total_cnt < nsamples: logger.warning( f"Insufficient number of samples collected may affect the quantization. " diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 9474bc8b5..4d6439d03 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -368,62 +368,31 @@ def llm_load_model( return model, tokenizer -def _find_pipeline_model_subfolder_local(model_dir: str) -> tuple: - """Find model/processor subfolders from a local pipeline directory with model_index.json. +def _find_pipeline_model_subfolder(model_dir_or_repo: str, file_list: list = None) -> tuple: + """Find model/processor subfolders from a pipeline's model_index.json. - Scans component subdirectories to find the one whose config.json has 'architectures', - and looks for a 'processor' component. + Works for both local directories and remote HF repos. + + Args: + model_dir_or_repo: Local directory path or HF repo id. + file_list: If provided, treat *model_dir_or_repo* as a remote HF repo + and use *file_list* (from ``list_repo_files``) to check file existence. + If ``None``, treat it as a local directory. 
Returns: (model_subfolder, processor_subfolder, config_dict) """ - index_path = os.path.join(model_dir, "model_index.json") - if not os.path.exists(index_path): - raise FileNotFoundError(f"No config.json or model_index.json found under {model_dir}") - - with open(index_path, "r", encoding="utf-8") as f: - model_index = json.load(f) - - processor_subfolder = None - for name, value in model_index.items(): - if name == "processor" and isinstance(value, list): - processor_subfolder = "processor" - break + is_local = file_list is None - candidates = [] - for name, value in model_index.items(): - if name.startswith("_") or not isinstance(value, list) or len(value) < 2: - continue - comp_config_path = os.path.join(model_dir, name, "config.json") - if not os.path.isfile(comp_config_path): - continue - with open(comp_config_path, "r", encoding="utf-8") as f: - comp_config = json.load(f) - if "architectures" in comp_config: - candidates.append((name, comp_config)) - - if not candidates: - raise FileNotFoundError( - f"model_index.json found in {model_dir} but no component with 'architectures' in its config.json" - ) - - for name, comp_config in candidates: - arch = comp_config["architectures"][0] - if "CausalLM" in arch or "ConditionalGeneration" in arch: - return name, processor_subfolder, comp_config - - return candidates[0][0], processor_subfolder, candidates[0][1] - - -def _find_pipeline_model_subfolder_remote(repo_id: str, file_list: list) -> tuple: - """Find model/processor subfolders from a remote HF repo with model_index.json. + if is_local: + index_path = os.path.join(model_dir_or_repo, "model_index.json") + if not os.path.exists(index_path): + raise FileNotFoundError(f"No config.json or model_index.json found under {model_dir_or_repo}") + else: + from huggingface_hub import hf_hub_download - Returns: - (model_subfolder, processor_subfolder, config_dict) - """ - from huggingface_hub import hf_hub_download + index_path = hf_hub_download(model_dir_or_repo, "model_index.json") - index_path = hf_hub_download(repo_id, "model_index.json") with open(index_path, "r", encoding="utf-8") as f: model_index = json.load(f) @@ -437,18 +406,27 @@ def _find_pipeline_model_subfolder_remote(repo_id: str, file_list: list) -> tupl for name, value in model_index.items(): if name.startswith("_") or not isinstance(value, list) or len(value) < 2: continue - comp_config_file = f"{name}/config.json" - if comp_config_file not in file_list: - continue - comp_config_path = hf_hub_download(repo_id, comp_config_file) - with open(comp_config_path, "r", encoding="utf-8") as f: - comp_config = json.load(f) + # Load component config.json + if is_local: + cfg_path = os.path.join(model_dir_or_repo, name, "config.json") + if not os.path.isfile(cfg_path): + continue + with open(cfg_path, "r", encoding="utf-8") as f: + comp_config = json.load(f) + else: + comp_config_file = f"{name}/config.json" + if comp_config_file not in file_list: + continue + cfg_path = hf_hub_download(model_dir_or_repo, comp_config_file) + with open(cfg_path, "r", encoding="utf-8") as f: + comp_config = json.load(f) + if "architectures" in comp_config: candidates.append((name, comp_config)) if not candidates: raise FileNotFoundError( - f"model_index.json found for {repo_id} but no component with 'architectures' in its config.json" + f"model_index.json found in {model_dir_or_repo} but no component with 'architectures' in its config.json" ) for name, comp_config in candidates: @@ -505,7 +483,7 @@ def mllm_load_model( with open(config_path, "r", 
encoding="utf-8") as f: config = json.load(f) else: - model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder_local( + model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder( pretrained_model_name_or_path ) else: @@ -517,7 +495,7 @@ def mllm_load_model( with open(config_path, "r", encoding="utf-8") as f: config = json.load(f) elif "model_index.json" in file_list: - model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder_remote( + model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder( pretrained_model_name_or_path, file_list ) elif "config.json.gz" in file_list: diff --git a/test/test_cpu/models/test_glm_image.py b/test/test_cpu/models/test_glm_image.py index e676267a8..5f497b763 100644 --- a/test/test_cpu/models/test_glm_image.py +++ b/test/test_cpu/models/test_glm_image.py @@ -26,7 +26,7 @@ import torch.nn as nn from auto_round.special_model_handler import _get_glm_image_multimodal_block -from auto_round.utils.model import _find_pipeline_model_subfolder_local +from auto_round.utils.model import _find_pipeline_model_subfolder # --------------------------------------------------------------------------- # Helpers – fake model hierarchy @@ -191,7 +191,7 @@ def test_finds_vision_language_encoder_subfolder(self, tmp_path): "vae": {"model_type": "autoencoder_kl"}, # no architectures → ignored }, ) - model_subfolder, processor_subfolder, cfg = _find_pipeline_model_subfolder_local(pipeline_dir) + model_subfolder, processor_subfolder, cfg = _find_pipeline_model_subfolder(pipeline_dir) assert model_subfolder == "vision_language_encoder" assert processor_subfolder == "processor" @@ -210,7 +210,7 @@ def test_prefers_conditional_generation_over_encoder(self, tmp_path): }, has_processor=False, ) - model_subfolder, processor_subfolder, cfg = _find_pipeline_model_subfolder_local(pipeline_dir) + model_subfolder, processor_subfolder, cfg = _find_pipeline_model_subfolder(pipeline_dir) assert model_subfolder == "vision_language_encoder" assert processor_subfolder is None # no processor entry @@ -222,7 +222,7 @@ def test_no_processor_returns_none(self, tmp_path): {"vision_language_encoder": {"architectures": ["GlmImageForConditionalGeneration"]}}, has_processor=False, ) - _, processor_subfolder, _ = _find_pipeline_model_subfolder_local(pipeline_dir) + _, processor_subfolder, _ = _find_pipeline_model_subfolder(pipeline_dir) assert processor_subfolder is None def test_with_processor_returns_processor_subfolder(self, tmp_path): @@ -232,13 +232,13 @@ def test_with_processor_returns_processor_subfolder(self, tmp_path): {"vision_language_encoder": {"architectures": ["GlmImageForConditionalGeneration"]}}, has_processor=True, ) - _, processor_subfolder, _ = _find_pipeline_model_subfolder_local(pipeline_dir) + _, processor_subfolder, _ = _find_pipeline_model_subfolder(pipeline_dir) assert processor_subfolder == "processor" def test_raises_when_no_model_index(self, tmp_path): """FileNotFoundError raised when neither config.json nor model_index.json exists.""" with pytest.raises(FileNotFoundError, match="model_index.json"): - _find_pipeline_model_subfolder_local(str(tmp_path)) + _find_pipeline_model_subfolder(str(tmp_path)) def test_raises_when_no_component_has_architectures(self, tmp_path): """FileNotFoundError raised when no component config contains 'architectures'.""" @@ -250,7 +250,7 @@ def test_raises_when_no_component_has_architectures(self, tmp_path): }, ) with pytest.raises(FileNotFoundError, match="architectures"): - 
_find_pipeline_model_subfolder_local(pipeline_dir) + _find_pipeline_model_subfolder(pipeline_dir) def test_falls_back_to_first_candidate_when_no_preferred_arch(self, tmp_path): """When no ConditionalGeneration/CausalLM arch exists, first candidate is used.""" @@ -262,7 +262,7 @@ def test_falls_back_to_first_candidate_when_no_preferred_arch(self, tmp_path): }, has_processor=False, ) - model_subfolder, _, cfg = _find_pipeline_model_subfolder_local(pipeline_dir) + model_subfolder, _, cfg = _find_pipeline_model_subfolder(pipeline_dir) # Must be one of the candidates, not crash assert model_subfolder in ("text_encoder", "image_encoder") assert "architectures" in cfg From 57c58d9038fbe9ccd38838538149b4071ac69104 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:58:52 +0000 Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/utils/model.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 4d6439d03..f0aec180a 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -483,9 +483,7 @@ def mllm_load_model( with open(config_path, "r", encoding="utf-8") as f: config = json.load(f) else: - model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder( - pretrained_model_name_or_path - ) + model_subfolder, processor_subfolder, config = _find_pipeline_model_subfolder(pretrained_model_name_or_path) else: from huggingface_hub import hf_hub_download, list_repo_files