From cf301fe75b3e71c47ecce1035cb48e49fb5813cb Mon Sep 17 00:00:00 2001
From: Hongsheng Liu <liuhongsheng4@huawei.com>
Date: Thu, 30 Apr 2026 15:05:26 +0800
Subject: [PATCH] =?UTF-8?q?Revert=20"[Quantization]=20Enable=20FP8=20onlin?=
 =?UTF-8?q?e=20quantization=20for=20Z-image=20text=20encode=E2=80=A6"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit ac662827466dcf0cb7e0d174e968ec8d590feb56.
---
 docs/user_guide/quantization/fp8.md           |  18 +--
 .../autoencoders/autoencoder_kl.py            |   6 -
 .../model_loader/diffusers_loader.py          |  23 +---
 vllm_omni/diffusion/models/utils.py           | 126 ------------------
 .../models/z_image/pipeline_z_image.py        |  34 +----
 5 files changed, 19 insertions(+), 188 deletions(-)

diff --git a/docs/user_guide/quantization/fp8.md b/docs/user_guide/quantization/fp8.md
index 7373a39ffb4..e89bc76ca77 100644
--- a/docs/user_guide/quantization/fp8.md
+++ b/docs/user_guide/quantization/fp8.md
@@ -32,15 +32,15 @@ guide. FP8 on Ampere may use a weight-only path where available.
 
 ### Diffusion Model (Qwen-Image, Wan2.2)
 
-| Model | HF models | Online | Pre-calibrated | Recommendation | `ignored_layers` | Text-Encoder quantization |
-|-------|-----------|:-------:|:------:|----------------|------------------|------------------|
-| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Yes | Yes | Skip sensitive image-stream MLPs when quality regresses | `img_mlp` | |
-| Wan2.2 | Wan2.2 diffusion pipelines | Not validated | Not validated | Validate against BF16 before documenting as supported | TBD | |
-| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | Yes | Yes | All layers | None | ✅︎ |
-| FLUX.1 | `black-forest-labs/FLUX.1-dev`, `black-forest-labs/FLUX.1-schnell` | Yes | Yes | All layers | None | |
-| FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B` | Yes | Yes | All layers | None | |
-| HunyuanImage-3.0 | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` | Yes | Yes | All layers; use the Hunyuan stage config for multi-stage runs | None | |
-| HunyuanVideo-1.5 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `720p_t2v`, `480p_i2v` | Yes | Yes | All layers | None | |
+| Model | HF models | Online | Pre-calibrated | Recommendation | `ignored_layers` |
+|-------|-----------|:-------:|:------:|----------------|------------------|
+| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Yes | Yes | Skip sensitive image-stream MLPs when quality regresses | `img_mlp` |
+| Wan2.2 | Wan2.2 diffusion pipelines | Not validated | Not validated | Validate against BF16 before documenting as supported | TBD |
+| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | Yes | Yes | All layers | None |
+| FLUX.1 | `black-forest-labs/FLUX.1-dev`, `black-forest-labs/FLUX.1-schnell` | Yes | Yes | All layers | None |
+| FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B` | Yes | Yes | All layers | None |
+| HunyuanImage-3.0 | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` | Yes | Yes | All layers; use the Hunyuan stage config for multi-stage runs | None |
+| HunyuanVideo-1.5 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `720p_t2v`, `480p_i2v` | Yes | Yes | All layers | None |
 
 ### Multi-Stage Omni/TTS Model (Qwen3-Omni, Qwen3-TTS)
 
diff --git a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py
index 1045a6135ff..0084719a8ab 100644
--- a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py
+++ b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py
@@ -28,12 +28,6 @@ def from_pretrained(cls, *args: Any, **kwargs: Any):
         model.init_distributed()
         return model
 
-    @classmethod
-    def from_config(cls, *args: Any, **kwargs: Any):
-        model = super().from_config(*args, **kwargs)
-        model.init_distributed()
-        return model
-
     def tile_split(self, z: torch.Tensor) -> tuple[list[TileTask], GridSpec]:
         # mostly copy from AutoencoderKL
         overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py
index 446bf6dd65b..91f3574b185 100644
--- a/vllm_omni/diffusion/model_loader/diffusers_loader.py
+++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py
@@ -26,7 +26,6 @@
     multi_thread_safetensors_weights_iterator,
     safetensors_weights_iterator,
 )
-from vllm.transformers_utils.repo_utils import file_exists
 from vllm.utils.import_utils import resolve_obj_by_qualname
 from vllm.utils.torch_utils import set_default_torch_dtype
 
@@ -50,8 +49,6 @@ def _natural_sort_key(filepath: str) -> list:
 
 MODEL_INDEX = "model_index.json"
 DIFFUSION_MODEL_WEIGHTS_INDEX = "diffusion_pytorch_model.safetensors.index.json"
-TRANSFORMER_WEIGHTS_INDEX = "model.safetensors.index.json"
-INDEX_FILES = [DIFFUSION_MODEL_WEIGHTS_INDEX, TRANSFORMER_WEIGHTS_INDEX]
 
 
 class DiffusersPipelineLoader:
@@ -102,22 +99,8 @@ def _prepare_weights(
         is_local = os.path.isdir(model_name_or_path)
         load_format = self.load_config.load_format
         use_safetensors = False
-        possible_index_files = [
-            f"{subfolder}/{index_file}" if subfolder is not None else index_file for index_file in INDEX_FILES
-        ]
-        available_index_file = list(
-            filter(lambda f: file_exists(model_name_or_path, f, revision=revision), possible_index_files)
-        )
-        if len(available_index_file) > 1:
-            raise ValueError(
-                f"Multiple index files found in {model_name_or_path} with subfolder {subfolder}: {available_index_file}"
-            )
-        index_file_with_subfolder = available_index_file[0] if len(available_index_file) == 1 else None
-        index_file = (
-            index_file_with_subfolder.split("/")[-1]
-            if index_file_with_subfolder and subfolder is not None
-            else index_file_with_subfolder
-        )
+        index_file = DIFFUSION_MODEL_WEIGHTS_INDEX
+        index_file_with_subfolder = f"{subfolder}/{index_file}" if subfolder else index_file
 
         # only hf is supported currently
         if load_format == "auto":
@@ -180,7 +163,7 @@ def _prepare_weights(
             hf_weights_files = filter_duplicate_safetensors_files(
                 hf_weights_files,
                 filter_folder,
-                index_file or "",
+                index_file,
             )
         else:
             hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)
diff --git a/vllm_omni/diffusion/models/utils.py b/vllm_omni/diffusion/models/utils.py
index ce5fa34322a..ba0d8dda20c 100644
--- a/vllm_omni/diffusion/models/utils.py
+++ b/vllm_omni/diffusion/models/utils.py
@@ -5,132 +5,6 @@
 
 import json
 import os
-from typing import TYPE_CHECKING, Literal
-
-import torch
-import torch.nn as nn
-from vllm.model_executor.layers.linear import (
-    ColumnParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.models.transformers.utils import init_on_device_without_buffers
-from vllm.model_executor.models.utils import maybe_prefix
-
-from vllm_omni.diffusion.data import OmniDiffusionConfig
-from vllm_omni.diffusion.quantization import get_vllm_quant_config_for_layers
-
-if TYPE_CHECKING:
-    from transformers import PretrainedConfig, PreTrainedModel
-    from transformers.models.auto.auto_factory import _BaseAutoModelClass
-    from vllm.model_executor.layers.quantization.base_config import (
-        QuantizationConfig,
-    )
-
-
-Style = Literal["colwise", "colwise_rep", "rowwise", "rowwise_rep", "replicate"]
-
-
-def replace_linear_class(
-    linear: nn.Linear,
-    style: Style = "replicate",
-    quant_config: QuantizationConfig | None = None,
-    *,
-    prefix: str = "",
-) -> ColumnParallelLinear | RowParallelLinear | ReplicatedLinear:
-    """
-    Replace nn.Linear with one of vLLM's tensor parallel linear classes.
-
-    Args:
-        linear: `nn.Linear` to be replaced.
-        style: Tensor parallel style of the new linear, e.g. "colwise".
-        quant_config: Quantization config for the new linear.
-    Returns:
-        The new linear.
-    """
-
-    if not isinstance(style, str):
-        raise ValueError(f"Unsupported parallel style type {type(style)}, expected str")
-
-    vllm_linear_maps = {
-        "colwise": (ColumnParallelLinear, {}),
-        "colwise_rep": (ColumnParallelLinear, {"gather_output": True}),
-        "rowwise": (RowParallelLinear, {}),
-        "rowwise_rep": (RowParallelLinear, {"input_is_parallel": False}),
-        "replicate": (ReplicatedLinear, {}),
-    }
-    vllm_linear_cls, vllm_linear_kwargs = vllm_linear_maps[style]
-
-    return vllm_linear_cls(
-        input_size=linear.in_features,
-        output_size=linear.out_features,
-        bias=linear.bias is not None,
-        quant_config=quant_config,
-        prefix=prefix,
-        return_bias=False,
-        **vllm_linear_kwargs,
-    )
-
-
-def recursive_replace_linear(model: nn.Module, od_config: OmniDiffusionConfig):
-    """Recursively replace modules in the model as needed.
-    Currently, this replaces:
-    - `nn.Linear` with vLLM's tensor parallel linear classes
-    """
-    # Prefix the patterns because we always start from `self.model`
-    quant_config = get_vllm_quant_config_for_layers(od_config.quantization_config)
-
-    def _recursive_replace(module: nn.Module, prefix: str):
-        for child_name, child_module in module.named_children():
-            new_module = child_module
-            qual_name = maybe_prefix(prefix, child_name)
-            # Replace modules as needed
-            if isinstance(child_module, nn.Linear):
-                style = "replicate"
-                new_module = replace_linear_class(child_module, style, quant_config, prefix=qual_name)
-            else:
-                _recursive_replace(child_module, prefix=qual_name)
-            if new_module is not child_module:
-                setattr(module, child_name, new_module)
-
-    _recursive_replace(model, prefix="")
-
-
-def init_parameters(
-    module: nn.Module,
-    dtype: torch.dtype | None,
-    device: torch.device | None = None,
-):
-    for name, param in module.named_parameters(recurse=False):
-        if param.device == torch.device("meta"):
-            new_param = nn.Parameter(
-                torch.empty_like(
-                    param.data,
-                    dtype=dtype,
-                    device=device,
-                ),
-                requires_grad=param.requires_grad,
-            )
-            setattr(module, name, new_param)
-    for child in module.children():
-        init_parameters(child, dtype, device)
-
-
-def create_transformers_model(
-    auto_cls: _BaseAutoModelClass,
-    od_config: OmniDiffusionConfig,
-    hf_config: PretrainedConfig,
-    dtype: torch.dtype | None = None,
-    device: torch.device | None = None,
-) -> PreTrainedModel:
-    """Create a HuggingFace model using the given auto class and model name."""
-    dtype = dtype or od_config.dtype
-    device = device or torch.get_default_device()
-    with init_on_device_without_buffers("meta"):
-        model = auto_cls.from_config(hf_config)
-    recursive_replace_linear(model, od_config)
-    init_parameters(model, dtype=dtype, device=device)
-    return model
 
 
 def _load_json(model_path: str, filename: str, local_files_only: bool = True) -> dict:
diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
index 5ead64566a2..296f1e57f9e 100644
--- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
+++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
@@ -28,7 +28,7 @@
 from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
 from diffusers.utils import logging
 from diffusers.utils.torch_utils import randn_tensor
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModel, AutoTokenizer
 from vllm.model_executor.models.utils import AutoWeightsLoader
 
 from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig
@@ -36,7 +36,6 @@
 from vllm_omni.diffusion.distributed.utils import get_local_device
 from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader
 from vllm_omni.diffusion.model_loader.hub_prefetch import prefetch_subfolders
-from vllm_omni.diffusion.models.utils import create_transformers_model
 from vllm_omni.diffusion.models.z_image.z_image_transformer import (
     ZImageTransformer2DModel,
 )
@@ -170,25 +169,13 @@ def __init__(
         super().__init__()
         self.od_config = od_config
         self.weights_sources = [
-            DiffusersPipelineLoader.ComponentSource(
-                model_or_path=od_config.model,
-                subfolder="text_encoder",
-                revision=od_config.revision,
-                prefix="text_encoder.",
-            ),
             DiffusersPipelineLoader.ComponentSource(
                 model_or_path=od_config.model,
                 subfolder="transformer",
-                revision=od_config.revision,
+                revision=None,
                 prefix="transformer.",
                 fall_back_to_pt=True,
-            ),
-            DiffusersPipelineLoader.ComponentSource(
-                model_or_path=od_config.model,
-                subfolder="vae",
-                revision=od_config.revision,
-                prefix="vae.",
-            ),
+            )
         ]
         self._execution_device = get_local_device()
         model = od_config.model
@@ -205,19 +192,12 @@ def __init__(
             model, subfolder="scheduler", local_files_only=local_files_only
         )
 
-        text_encoder_config = AutoConfig.from_pretrained(
+        self.text_encoder = AutoModel.from_pretrained(
             model, subfolder="text_encoder", local_files_only=local_files_only
-        )
-        self.text_encoder = create_transformers_model(
-            AutoModelForCausalLM,
-            od_config,
-            hf_config=text_encoder_config,
         ).to(self._execution_device)
-        if text_encoder_config.tie_word_embeddings:
-            self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight
-
-        vae_config = DistributedAutoencoderKL.load_config(model, subfolder="vae", local_files_only=local_files_only)
-        self.vae = DistributedAutoencoderKL.from_config(vae_config).to(self._execution_device)
+        self.vae = DistributedAutoencoderKL.from_pretrained(
+            model, subfolder="vae", local_files_only=local_files_only
+        ).to(self._execution_device)
         self.transformer = ZImageTransformer2DModel(quant_config=od_config.quantization_config)
         self.tokenizer = AutoTokenizer.from_pretrained(model, subfolder="tokenizer", local_files_only=local_files_only)