From cf301fe75b3e71c47ecce1035cb48e49fb5813cb Mon Sep 17 00:00:00 2001 From: Hongsheng Liu Date: Thu, 30 Apr 2026 15:05:26 +0800 Subject: [PATCH] =?UTF-8?q?Revert=20"[Quantization]=20Enable=20FP8=20onlin?= =?UTF-8?q?e=20quantization=20for=20Z-image=20text=20encode=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit ac662827466dcf0cb7e0d174e968ec8d590feb56. --- docs/user_guide/quantization/fp8.md | 18 +-- .../autoencoders/autoencoder_kl.py | 6 - .../model_loader/diffusers_loader.py | 23 +--- vllm_omni/diffusion/models/utils.py | 126 ------------------ .../models/z_image/pipeline_z_image.py | 34 +---- 5 files changed, 19 insertions(+), 188 deletions(-) diff --git a/docs/user_guide/quantization/fp8.md b/docs/user_guide/quantization/fp8.md index 7373a39ffb4..e89bc76ca77 100644 --- a/docs/user_guide/quantization/fp8.md +++ b/docs/user_guide/quantization/fp8.md @@ -32,15 +32,15 @@ guide. FP8 on Ampere may use a weight-only path where available. ### Diffusion Model (Qwen-Image, Wan2.2) -| Model | HF models | Online | Pre-calibrated | Recommendation | `ignored_layers` | Text-Encoder quantization | -|-------|-----------|:-------:|:------:|----------------|------------------|------------------| -| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Yes | Yes | Skip sensitive image-stream MLPs when quality regresses | `img_mlp` | | -| Wan2.2 | Wan2.2 diffusion pipelines | Not validated | Not validated | Validate against BF16 before documenting as supported | TBD | | -| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | Yes | Yes | All layers | None | ✅︎ | -| FLUX.1 | `black-forest-labs/FLUX.1-dev`, `black-forest-labs/FLUX.1-schnell` | Yes | Yes | All layers | None | | -| FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B` | Yes | Yes | All layers | None | | -| HunyuanImage-3.0 | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` | Yes | Yes | All layers; use the Hunyuan stage config for multi-stage runs | None | | -| HunyuanVideo-1.5 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `720p_t2v`, `480p_i2v` | Yes | Yes | All layers | None | | +| Model | HF models | Online | Pre-calibrated | Recommendation | `ignored_layers` | +|-------|-----------|:-------:|:------:|----------------|------------------| +| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Yes | Yes | Skip sensitive image-stream MLPs when quality regresses | `img_mlp` | +| Wan2.2 | Wan2.2 diffusion pipelines | Not validated | Not validated | Validate against BF16 before documenting as supported | TBD | +| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | Yes | Yes | All layers | None | +| FLUX.1 | `black-forest-labs/FLUX.1-dev`, `black-forest-labs/FLUX.1-schnell` | Yes | Yes | All layers | None | +| FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B` | Yes | Yes | All layers | None | +| HunyuanImage-3.0 | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` | Yes | Yes | All layers; use the Hunyuan stage config for multi-stage runs | None | +| HunyuanVideo-1.5 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `720p_t2v`, `480p_i2v` | Yes | Yes | All layers | None | ### Multi-Stage Omni/TTS Model (Qwen3-Omni, Qwen3-TTS) diff --git a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py index 1045a6135ff..0084719a8ab 100644 --- a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py +++ b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py @@ -28,12 +28,6 @@ def from_pretrained(cls, *args: Any, **kwargs: Any): model.init_distributed() return model - @classmethod - def from_config(cls, *args: Any, **kwargs: Any): - model = super().from_config(*args, **kwargs) - model.init_distributed() - return model - def tile_split(self, z: torch.Tensor) -> tuple[list[TileTask], GridSpec]: # mostly copy from AutoencoderKL overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor)) diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py index 446bf6dd65b..91f3574b185 100644 --- a/vllm_omni/diffusion/model_loader/diffusers_loader.py +++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py @@ -26,7 +26,6 @@ multi_thread_safetensors_weights_iterator, safetensors_weights_iterator, ) -from vllm.transformers_utils.repo_utils import file_exists from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.torch_utils import set_default_torch_dtype @@ -50,8 +49,6 @@ def _natural_sort_key(filepath: str) -> list: MODEL_INDEX = "model_index.json" DIFFUSION_MODEL_WEIGHTS_INDEX = "diffusion_pytorch_model.safetensors.index.json" -TRANSFORMER_WEIGHTS_INDEX = "model.safetensors.index.json" -INDEX_FILES = [DIFFUSION_MODEL_WEIGHTS_INDEX, TRANSFORMER_WEIGHTS_INDEX] class DiffusersPipelineLoader: @@ -102,22 +99,8 @@ def _prepare_weights( is_local = os.path.isdir(model_name_or_path) load_format = self.load_config.load_format use_safetensors = False - possible_index_files = [ - f"{subfolder}/{index_file}" if subfolder is not None else index_file for index_file in INDEX_FILES - ] - available_index_file = list( - filter(lambda f: file_exists(model_name_or_path, f, revision=revision), possible_index_files) - ) - if len(available_index_file) > 1: - raise ValueError( - f"Multiple index files found in {model_name_or_path} with subfolder {subfolder}: {available_index_file}" - ) - index_file_with_subfolder = available_index_file[0] if len(available_index_file) == 1 else None - index_file = ( - index_file_with_subfolder.split("/")[-1] - if index_file_with_subfolder and subfolder is not None - else index_file_with_subfolder - ) + index_file = DIFFUSION_MODEL_WEIGHTS_INDEX + index_file_with_subfolder = f"{subfolder}/{index_file}" if subfolder else index_file # only hf is supported currently if load_format == "auto": @@ -180,7 +163,7 @@ def _prepare_weights( hf_weights_files = filter_duplicate_safetensors_files( hf_weights_files, filter_folder, - index_file or "", + index_file, ) else: hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files) diff --git a/vllm_omni/diffusion/models/utils.py b/vllm_omni/diffusion/models/utils.py index ce5fa34322a..ba0d8dda20c 100644 --- a/vllm_omni/diffusion/models/utils.py +++ b/vllm_omni/diffusion/models/utils.py @@ -5,132 +5,6 @@ import json import os -from typing import TYPE_CHECKING, Literal - -import torch -import torch.nn as nn -from vllm.model_executor.layers.linear import ( - ColumnParallelLinear, - ReplicatedLinear, - RowParallelLinear, -) -from vllm.model_executor.models.transformers.utils import init_on_device_without_buffers -from vllm.model_executor.models.utils import maybe_prefix - -from vllm_omni.diffusion.data import OmniDiffusionConfig -from vllm_omni.diffusion.quantization import get_vllm_quant_config_for_layers - -if TYPE_CHECKING: - from transformers import PretrainedConfig, PreTrainedModel - from transformers.models.auto.auto_factory import _BaseAutoModelClass - from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig, - ) - - -Style = Literal["colwise", "colwise_rep", "rowwise", "rowwise_rep", "replicate"] - - -def replace_linear_class( - linear: nn.Linear, - style: Style = "replicate", - quant_config: QuantizationConfig | None = None, - *, - prefix: str = "", -) -> ColumnParallelLinear | RowParallelLinear | ReplicatedLinear: - """ - Replace nn.Linear with one of vLLM's tensor parallel linear classes. - - Args: - linear: `nn.Linear` to be replaced. - style: Tensor parallel style of the new linear, e.g. "colwise". - quant_config: Quantization config for the new linear. - Returns: - The new linear. - """ - - if not isinstance(style, str): - raise ValueError(f"Unsupported parallel style type {type(style)}, expected str") - - vllm_linear_maps = { - "colwise": (ColumnParallelLinear, {}), - "colwise_rep": (ColumnParallelLinear, {"gather_output": True}), - "rowwise": (RowParallelLinear, {}), - "rowwise_rep": (RowParallelLinear, {"input_is_parallel": False}), - "replicate": (ReplicatedLinear, {}), - } - vllm_linear_cls, vllm_linear_kwargs = vllm_linear_maps[style] - - return vllm_linear_cls( - input_size=linear.in_features, - output_size=linear.out_features, - bias=linear.bias is not None, - quant_config=quant_config, - prefix=prefix, - return_bias=False, - **vllm_linear_kwargs, - ) - - -def recursive_replace_linear(model: nn.Module, od_config: OmniDiffusionConfig): - """Recursively replace modules in the model as needed. - Currently, this replaces: - - `nn.Linear` with vLLM's tensor parallel linear classes - """ - # Prefix the patterns because we always start from `self.model` - quant_config = get_vllm_quant_config_for_layers(od_config.quantization_config) - - def _recursive_replace(module: nn.Module, prefix: str): - for child_name, child_module in module.named_children(): - new_module = child_module - qual_name = maybe_prefix(prefix, child_name) - # Replace modules as needed - if isinstance(child_module, nn.Linear): - style = "replicate" - new_module = replace_linear_class(child_module, style, quant_config, prefix=qual_name) - else: - _recursive_replace(child_module, prefix=qual_name) - if new_module is not child_module: - setattr(module, child_name, new_module) - - _recursive_replace(model, prefix="") - - -def init_parameters( - module: nn.Module, - dtype: torch.dtype | None, - device: torch.device | None = None, -): - for name, param in module.named_parameters(recurse=False): - if param.device == torch.device("meta"): - new_param = nn.Parameter( - torch.empty_like( - param.data, - dtype=dtype, - device=device, - ), - requires_grad=param.requires_grad, - ) - setattr(module, name, new_param) - for child in module.children(): - init_parameters(child, dtype, device) - - -def create_transformers_model( - auto_cls: _BaseAutoModelClass, - od_config: OmniDiffusionConfig, - hf_config: PretrainedConfig, - dtype: torch.dtype | None = None, - device: torch.device | None = None, -) -> PreTrainedModel: - """Create a HuggingFace model using the given auto class and model name.""" - dtype = dtype or od_config.dtype - device = device or torch.get_default_device() - with init_on_device_without_buffers("meta"): - model = auto_cls.from_config(hf_config) - recursive_replace_linear(model, od_config) - init_parameters(model, dtype=dtype, device=device) - return model def _load_json(model_path: str, filename: str, local_files_only: bool = True) -> dict: diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py index 5ead64566a2..296f1e57f9e 100644 --- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py +++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py @@ -28,7 +28,7 @@ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler from diffusers.utils import logging from diffusers.utils.torch_utils import randn_tensor -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModel, AutoTokenizer from vllm.model_executor.models.utils import AutoWeightsLoader from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig @@ -36,7 +36,6 @@ from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.model_loader.hub_prefetch import prefetch_subfolders -from vllm_omni.diffusion.models.utils import create_transformers_model from vllm_omni.diffusion.models.z_image.z_image_transformer import ( ZImageTransformer2DModel, ) @@ -170,25 +169,13 @@ def __init__( super().__init__() self.od_config = od_config self.weights_sources = [ - DiffusersPipelineLoader.ComponentSource( - model_or_path=od_config.model, - subfolder="text_encoder", - revision=od_config.revision, - prefix="text_encoder.", - ), DiffusersPipelineLoader.ComponentSource( model_or_path=od_config.model, subfolder="transformer", - revision=od_config.revision, + revision=None, prefix="transformer.", fall_back_to_pt=True, - ), - DiffusersPipelineLoader.ComponentSource( - model_or_path=od_config.model, - subfolder="vae", - revision=od_config.revision, - prefix="vae.", - ), + ) ] self._execution_device = get_local_device() model = od_config.model @@ -205,19 +192,12 @@ def __init__( model, subfolder="scheduler", local_files_only=local_files_only ) - text_encoder_config = AutoConfig.from_pretrained( + self.text_encoder = AutoModel.from_pretrained( model, subfolder="text_encoder", local_files_only=local_files_only - ) - self.text_encoder = create_transformers_model( - AutoModelForCausalLM, - od_config, - hf_config=text_encoder_config, ).to(self._execution_device) - if text_encoder_config.tie_word_embeddings: - self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight - - vae_config = DistributedAutoencoderKL.load_config(model, subfolder="vae", local_files_only=local_files_only) - self.vae = DistributedAutoencoderKL.from_config(vae_config).to(self._execution_device) + self.vae = DistributedAutoencoderKL.from_pretrained( + model, subfolder="vae", local_files_only=local_files_only + ).to(self._execution_device) self.transformer = ZImageTransformer2DModel(quant_config=od_config.quantization_config) self.tokenizer = AutoTokenizer.from_pretrained(model, subfolder="tokenizer", local_files_only=local_files_only)