From e790e33b1e4cc7e028a564806a3f4741cc9a7207 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 11 Feb 2026 21:09:37 +0800
Subject: [PATCH 01/23] draft

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../diffusion/model_loader/diffusers_loader.py   | 12 ++++++------
 .../diffusion/models/z_image/pipeline_z_image.py | 16 ++++++++++++++--
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py
index b61f70b697c..d9090d95c40 100644
--- a/vllm_omni/diffusion/model_loader/diffusers_loader.py
+++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py
@@ -262,9 +262,9 @@ def load_weights(self, model: nn.Module) -> None:
         # We only enable strict check for non-quantized models
         # that have loaded weights tracking currently.
         if loaded_weights is not None:
-            _ = weights_to_load - loaded_weights
-        #     if weights_not_loaded:
-        #         raise ValueError(
-        #             "Following weights were not initialized from "
-        #             f"checkpoint: {weights_not_loaded}"
-        #         )
+            weights_not_loaded = weights_to_load - loaded_weights
+            if weights_not_loaded:
+                raise ValueError(
+                    "Following weights were not initialized from "
+                    f"checkpoint: {weights_not_loaded}"
+                )
diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
index 8a391319538..341f9b7d7d4 100644
--- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
+++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
@@ -156,10 +156,22 @@ def __init__(
             DiffusersPipelineLoader.ComponentSource(
                 model_or_path=od_config.model,
                 subfolder="transformer",
-                revision=None,
+                revision=od_config.revision,
                 prefix="transformer.",
                 fall_back_to_pt=True,
-            )
+            ),
+            DiffusersPipelineLoader.ComponentSource(
+                model_or_path=od_config.model,
+                subfolder="text_encoder",
+                revision=od_config.revision,
+                prefix="text_encoder.",
+            ),
+            DiffusersPipelineLoader.ComponentSource(
+                model_or_path=od_config.model,
+                subfolder="vae",
+                revision=od_config.revision,
+                prefix="vae.",
+            ),
         ]
         self._execution_device = get_local_device()
         model = od_config.model

From 0fe72c7cd111592a8d43886d985aa0c72a031121 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 11 Feb 2026 22:28:01 +0800
Subject: [PATCH 02/23] update

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../model_loader/diffusers_loader.py          |   1 +
 vllm_omni/diffusion/models/utils.py           | 101 ++++++++++++++++++
 .../models/z_image/pipeline_z_image.py        |  21 ++--
 3 files changed, 114 insertions(+), 9 deletions(-)
 create mode 100644 vllm_omni/diffusion/models/utils.py

diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py
index d9090d95c40..201b442b4b3 100644
--- a/vllm_omni/diffusion/model_loader/diffusers_loader.py
+++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py
@@ -32,6 +32,7 @@
 
 MODEL_INDEX = "model_index.json"
 DIFFUSION_MODEL_WEIGHTS_INDEX = "diffusion_pytorch_model.safetensors.index.json"
+TRANSFORMER_WEIGHTS_INDEX = "model.safetensors.index.json"
 
 
 class DiffusersPipelineLoader:
diff --git a/vllm_omni/diffusion/models/utils.py b/vllm_omni/diffusion/models/utils.py
new file mode 100644
index 00000000000..0b4b5097175
--- /dev/null
+++ b/vllm_omni/diffusion/models/utils.py
@@ -0,0 +1,101 @@
+from typing import Literal, TYPE_CHECKING
+
+import torch
+import torch.nn as nn
+
+from vllm_omni.diffusion.data import OmniDiffusionConfig
+from vllm_omni.diffusion.quantization import get_vllm_quant_config_for_layers
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.models.utils import maybe_prefix
+
+if TYPE_CHECKING:
+    from vllm.model_executor.layers.quantization.base_config import (
+        QuantizationConfig,
+    )
+
+
+Style = Literal["colwise", "colwise_rep", "rowwise", "rowwise_rep", "replicate"]
+
+
+def replace_linear_class(
+    linear: nn.Linear,
+    style: Style = "replicate",
+    quant_config: "QuantizationConfig | None" = None,
+    *,
+    prefix: str = "",
+) -> ColumnParallelLinear | RowParallelLinear | ReplicatedLinear:
+    """
+    Replace nn.Linear with one of vLLM's tensor parallel linear classes.
+
+    Args:
+        linear: `nn.Linear` to be replaced; only its sizes and bias flag are read.
+        style: Tensor parallel style, e.g. "colwise"; a non-`str` raises ValueError.
+        quant_config: Quantization config for the new linear.
+        prefix: Forwarded unchanged as the new linear's `prefix` argument.
+    Returns:
+        The new linear, constructed with `return_bias=False`.
+    """
+    if not isinstance(style, str):
+        raise ValueError(f"Unsupported parallel style type {type(style)}, expected str")
+    # A `str` that is not a key below silently falls back to ReplicatedLinear.
+    vllm_linear_cls, vllm_linear_kwargs = {
+        "colwise": (ColumnParallelLinear, {}),
+        "colwise_rep": (ColumnParallelLinear, {"gather_output": True}),
+        "rowwise": (RowParallelLinear, {}),
+        "rowwise_rep": (RowParallelLinear, {"input_is_parallel": False}),
+        "replicate": (ReplicatedLinear, {}),
+    }.get(style, (ReplicatedLinear, {}))
+
+    return vllm_linear_cls(
+        input_size=linear.in_features,
+        output_size=linear.out_features,
+        bias=linear.bias is not None,
+        quant_config=quant_config,
+        prefix=prefix,
+        return_bias=False,
+        **vllm_linear_kwargs,
+    )
+
+
+def recursive_replace_linear(model: nn.Module, od_config: OmniDiffusionConfig):
+    """Recursively replace `nn.Linear` children of `model`, in place.
+    Each `nn.Linear` is swapped (via `setattr` on its parent) for the layer
+    `replace_linear_class` builds with the "replicate" style; every other
+    module is only recursed into. `*RMSNorm` modules are not replaced here.
+    """
+    # Prefix the patterns because we always start from `self.model`
+    quant_config = get_vllm_quant_config_for_layers(od_config.quantization_config)
+    def _recursive_replace(module: nn.Module, prefix: str):
+        for child_name, child_module in module.named_children():
+            new_module = child_module
+            qual_name = maybe_prefix(prefix, child_name)
+            # Replace modules as needed
+            if isinstance(child_module, nn.Linear):
+                style = "replicate"
+                new_module = replace_linear_class(
+                    child_module, style, quant_config, prefix=qual_name
+                )
+            else:
+                _recursive_replace(child_module, prefix=qual_name)
+            if new_module is not child_module:
+                setattr(module, child_name, new_module)
+    _recursive_replace(model, prefix="")
+
+
+def init_parameters(module: nn.Module, dtype: torch.dtype | None, device: torch.device | None = None,):
+    for name, param in module.named_parameters(recurse=False):
+        if param.device == torch.device("meta"):
+            new_param = nn.Parameter(
+                torch.empty_like(
+                    param.data,
+                    dtype=dtype,
+                    device=device,
+                )
+            )
+            setattr(module, name, new_param)
+    for child in module.children():
+        init_parameters(child, dtype, device)
\ No newline at end of file
diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
index 341f9b7d7d4..788497f3c7c 100644
--- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
+++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
@@ -28,7 +28,8 @@
 from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
 from diffusers.utils import logging
 from diffusers.utils.torch_utils import randn_tensor
-from transformers import AutoModel, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+from vllm.model_executor.models.transformers.utils import init_on_device_without_buffers
 from vllm.model_executor.models.utils import AutoWeightsLoader
 
 from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig
@@ -42,6 +43,7 @@
 from vllm_omni.model_executor.model_loader.weight_utils import (
     download_weights_from_hf_specific,
 )
+from vllm_omni.diffusion.models.utils import recursive_replace_linear, init_parameters
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
@@ -166,12 +168,6 @@ def __init__(
                 revision=od_config.revision,
                 prefix="text_encoder.",
             ),
-            DiffusersPipelineLoader.ComponentSource(
-                model_or_path=od_config.model,
-                subfolder="vae",
-                revision=od_config.revision,
-                prefix="vae.",
-            ),
         ]
         self._execution_device = get_local_device()
         model = od_config.model
@@ -180,9 +176,14 @@ def __init__(
             model, subfolder="scheduler", local_files_only=local_files_only
         )
 
-        self.text_encoder = AutoModel.from_pretrained(
+        text_encoder_config = AutoConfig.from_pretrained(
             model, subfolder="text_encoder", local_files_only=local_files_only
         )
+        with init_on_device_without_buffers("meta"):
+            self.text_encoder = AutoModelForCausalLM.from_config(text_encoder_config)
+        recursive_replace_linear(self.text_encoder, od_config)
+        init_parameters(self.text_encoder, dtype=od_config.dtype)
+
         self.vae = AutoencoderKL.from_pretrained(model, subfolder="vae", local_files_only=local_files_only).to(
             self._execution_device
         )
@@ -656,4 +657,6 @@ def forward(
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        loaded_weights = loader.load_weights(weights)
+        loaded_weights |= {name for name, _ in self.vae.named_parameters()}
+        return loaded_weights

From 51070f4b6bc9b74ddc578bb77ab5fd4b0011f644 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 11 Feb 2026 22:42:22 +0800
Subject: [PATCH 03/23] fix

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm_omni/diffusion/models/z_image/pipeline_z_image.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
index 788497f3c7c..5bb536d47d3 100644
--- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
+++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
@@ -182,7 +182,9 @@ def __init__(
         with init_on_device_without_buffers("meta"):
             self.text_encoder = AutoModelForCausalLM.from_config(text_encoder_config)
         recursive_replace_linear(self.text_encoder, od_config)
-        init_parameters(self.text_encoder, dtype=od_config.dtype)
+        init_parameters(self.text_encoder, dtype=od_config.dtype, device=self._execution_device)
+        if text_encoder_config.tie_word_embeddings:
+            self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight
 
         self.vae = AutoencoderKL.from_pretrained(model, subfolder="vae", local_files_only=local_files_only).to(
             self._execution_device
@@ -658,5 +660,5 @@ def forward(
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         loaded_weights = loader.load_weights(weights)
-        loaded_weights |= {name for name, _ in self.vae.named_parameters()}
+        loaded_weights |= {f"vae.{name}" for name, _ in self.vae.named_parameters()}
         return loaded_weights

From 4bb6989155f590b052a03b1853b1b617215d5041 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Thu, 12 Feb 2026 22:43:15 +0800
Subject: [PATCH 04/23] update

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm_omni/diffusion/models/utils.py           | 22 ++++++++++++++++++-
 .../models/z_image/pipeline_z_image.py        | 13 ++++++-----
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/vllm_omni/diffusion/models/utils.py b/vllm_omni/diffusion/models/utils.py
index 0b4b5097175..25104699780 100644
--- a/vllm_omni/diffusion/models/utils.py
+++ b/vllm_omni/diffusion/models/utils.py
@@ -3,6 +3,8 @@
 import torch
 import torch.nn as nn
 
+from vllm.model_executor.models.transformers.utils import init_on_device_without_buffers
+
 from vllm_omni.diffusion.data import OmniDiffusionConfig
 from vllm_omni.diffusion.quantization import get_vllm_quant_config_for_layers
 from vllm.model_executor.layers.linear import (
@@ -13,6 +15,8 @@
 from vllm.model_executor.models.utils import maybe_prefix
 
 if TYPE_CHECKING:
+    from transformers import PretrainedConfig, PreTrainedModel
+    from transformers.models.auto.auto_factory import _BaseAutoModelClass
     from vllm.model_executor.layers.quantization.base_config import (
         QuantizationConfig,
     )
@@ -98,4 +102,20 @@ def init_parameters(module: nn.Module, dtype: torch.dtype | None, device: torch.
             )
             setattr(module, name, new_param)
     for child in module.children():
-        init_parameters(child, dtype, device)
\ No newline at end of file
+        init_parameters(child, dtype, device)
+
+
+def create_transformers_model(
+    auto_cls: "_BaseAutoModelClass",
+    od_config: OmniDiffusionConfig,
+    hf_config: "PretrainedConfig",
+    dtype: torch.dtype | None = None,
+    device: torch.device | None = None,
+) -> PreTrainedModel:
+    """Create a HuggingFace model using the given auto class and model name."""
+    dtype = dtype or od_config.dtype
+    with init_on_device_without_buffers("meta"):
+        model = auto_cls.from_config(hf_config)
+    recursive_replace_linear(model, od_config)
+    init_parameters(model, dtype=dtype, device=device)
+    return model
diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
index 5bb536d47d3..6c6ce17b4f7 100644
--- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
+++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
@@ -29,7 +29,6 @@
 from diffusers.utils import logging
 from diffusers.utils.torch_utils import randn_tensor
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
-from vllm.model_executor.models.transformers.utils import init_on_device_without_buffers
 from vllm.model_executor.models.utils import AutoWeightsLoader
 
 from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig
@@ -43,7 +42,7 @@
 from vllm_omni.model_executor.model_loader.weight_utils import (
     download_weights_from_hf_specific,
 )
-from vllm_omni.diffusion.models.utils import recursive_replace_linear, init_parameters
+from vllm_omni.diffusion.models.utils import create_transformers_model
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
@@ -179,10 +178,12 @@ def __init__(
         text_encoder_config = AutoConfig.from_pretrained(
             model, subfolder="text_encoder", local_files_only=local_files_only
         )
-        with init_on_device_without_buffers("meta"):
-            self.text_encoder = AutoModelForCausalLM.from_config(text_encoder_config)
-        recursive_replace_linear(self.text_encoder, od_config)
-        init_parameters(self.text_encoder, dtype=od_config.dtype, device=self._execution_device)
+        self.text_encoder = create_transformers_model(
+            AutoModelForCausalLM,
+            od_config,
+            hf_config=text_encoder_config,
+            device=self._execution_device,
+        )
         if text_encoder_config.tie_word_embeddings:
             self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight
 

From f6ebce1451d0d171e5c25bfd50f8cd7d4e5c511f Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Thu, 12 Feb 2026 23:14:19 +0800
Subject: [PATCH 05/23] fix

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../model_loader/diffusers_loader.py          | 29 +++++++++++++++----
 vllm_omni/diffusion/models/utils.py           | 25 +++++++++-------
 .../models/z_image/pipeline_z_image.py        |  4 +--
 3 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py
index 201b442b4b3..2cf6e3eef30 100644
--- a/vllm_omni/diffusion/model_loader/diffusers_loader.py
+++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py
@@ -22,6 +22,7 @@
     maybe_download_from_modelscope,
     safetensors_weights_iterator,
 )
+from vllm.transformers_utils.repo_utils import file_exists
 from vllm.utils.torch_utils import set_default_torch_dtype
 
 from vllm_omni.diffusion.data import OmniDiffusionConfig
@@ -33,6 +34,12 @@
 MODEL_INDEX = "model_index.json"
 DIFFUSION_MODEL_WEIGHTS_INDEX = "diffusion_pytorch_model.safetensors.index.json"
 TRANSFORMER_WEIGHTS_INDEX = "model.safetensors.index.json"
+INDEX_FILES = [DIFFUSION_MODEL_WEIGHTS_INDEX, TRANSFORMER_WEIGHTS_INDEX]
+
+
+def get_subfolder_file_path(subfolder: str | None, file: str) -> str:
+    """Return `file` prefixed with `subfolder/`, or `file` alone if subfolder is None."""
+    return f"{subfolder}/{file}" if subfolder is not None else file
 
 
 class DiffusersPipelineLoader:
@@ -95,8 +102,21 @@ def _prepare_weights(
         is_local = os.path.isdir(model_name_or_path)
         load_format = self.load_config.load_format
         use_safetensors = False
-        index_file = DIFFUSION_MODEL_WEIGHTS_INDEX
-        index_file_with_subfolder = f"{subfolder}/{index_file}" if subfolder else index_file
+        possible_index_files = [
+            f"{subfolder}/{index_file}" if subfolder is not None else index_file for index_file in INDEX_FILES
+        ]
+        available_index_file = list(
+            filter(lambda f: file_exists(model_name_or_path, f, revision), possible_index_files)
+        )
+        assert len(available_index_file) <= 1, (
+            f"Multiple index files found in {model_name_or_path} with subfolder {subfolder}: {available_index_file}"
+        )
+        index_file_with_subfolder = available_index_file[0] if len(available_index_file) == 1 else None
+        index_file = (
+            index_file_with_subfolder.split("/")[-1]
+            if index_file_with_subfolder and subfolder is not None
+            else index_file_with_subfolder
+        )
 
         # only hf is supported currently
         if load_format == "auto":
@@ -265,7 +285,4 @@ def load_weights(self, model: nn.Module) -> None:
         if loaded_weights is not None:
             weights_not_loaded = weights_to_load - loaded_weights
             if weights_not_loaded:
-                raise ValueError(
-                    "Following weights were not initialized from "
-                    f"checkpoint: {weights_not_loaded}"
-                )
+                raise ValueError(f"Following weights were not initialized from checkpoint: {weights_not_loaded}")
diff --git a/vllm_omni/diffusion/models/utils.py b/vllm_omni/diffusion/models/utils.py
index 25104699780..fdfdcc4037a 100644
--- a/vllm_omni/diffusion/models/utils.py
+++ b/vllm_omni/diffusion/models/utils.py
@@ -1,19 +1,18 @@
-from typing import Literal, TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 import torch
 import torch.nn as nn
-
-from vllm.model_executor.models.transformers.utils import init_on_device_without_buffers
-
-from vllm_omni.diffusion.data import OmniDiffusionConfig
-from vllm_omni.diffusion.quantization import get_vllm_quant_config_for_layers
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     ReplicatedLinear,
     RowParallelLinear,
 )
+from vllm.model_executor.models.transformers.utils import init_on_device_without_buffers
 from vllm.model_executor.models.utils import maybe_prefix
 
+from vllm_omni.diffusion.data import OmniDiffusionConfig
+from vllm_omni.diffusion.quantization import get_vllm_quant_config_for_layers
+
 if TYPE_CHECKING:
     from transformers import PretrainedConfig, PreTrainedModel
     from transformers.models.auto.auto_factory import _BaseAutoModelClass
@@ -73,6 +72,7 @@ def recursive_replace_linear(model: nn.Module, od_config: OmniDiffusionConfig):
     """
     # Prefix the patterns because we always start from `self.model`
     quant_config = get_vllm_quant_config_for_layers(od_config.quantization_config)
+
     def _recursive_replace(module: nn.Module, prefix: str):
         for child_name, child_module in module.named_children():
             new_module = child_module
@@ -80,17 +80,20 @@ def _recursive_replace(module: nn.Module, prefix: str):
             # Replace modules as needed
             if isinstance(child_module, nn.Linear):
                 style = "replicate"
-                new_module = replace_linear_class(
-                    child_module, style, quant_config, prefix=qual_name
-                )
+                new_module = replace_linear_class(child_module, style, quant_config, prefix=qual_name)
             else:
                 _recursive_replace(child_module, prefix=qual_name)
             if new_module is not child_module:
                 setattr(module, child_name, new_module)
+
     _recursive_replace(model, prefix="")
 
 
-def init_parameters(module: nn.Module, dtype: torch.dtype | None, device: torch.device | None = None,):
+def init_parameters(
+    module: nn.Module,
+    dtype: torch.dtype | None,
+    device: torch.device | None = None,
+):
     for name, param in module.named_parameters(recurse=False):
         if param.device == torch.device("meta"):
             new_param = nn.Parameter(
@@ -111,7 +114,7 @@ def create_transformers_model(
     hf_config: "PretrainedConfig",
     dtype: torch.dtype | None = None,
     device: torch.device | None = None,
-) -> PreTrainedModel:
+) -> "PreTrainedModel":
     """Create a HuggingFace model using the given auto class and model name."""
     dtype = dtype or od_config.dtype
     with init_on_device_without_buffers("meta"):
diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
index 6c6ce17b4f7..f56c095b7c0 100644
--- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
+++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
@@ -28,12 +28,13 @@
 from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
 from diffusers.utils import logging
 from diffusers.utils.torch_utils import randn_tensor
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 from vllm.model_executor.models.utils import AutoWeightsLoader
 
 from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig
 from vllm_omni.diffusion.distributed.utils import get_local_device
 from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader
+from vllm_omni.diffusion.models.utils import create_transformers_model
 from vllm_omni.diffusion.models.z_image.z_image_transformer import (
     ZImageTransformer2DModel,
 )
@@ -42,7 +43,6 @@
 from vllm_omni.model_executor.model_loader.weight_utils import (
     download_weights_from_hf_specific,
 )
-from vllm_omni.diffusion.models.utils import create_transformers_model
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 

From 8a6d59fae6ef0c5e5d7ae2c2671069236f11ac01 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Thu, 12 Feb 2026 23:35:40 +0800
Subject: [PATCH 06/23] fix

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm_omni/diffusion/model_loader/diffusers_loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py
index 2cf6e3eef30..778e7154555 100644
--- a/vllm_omni/diffusion/model_loader/diffusers_loader.py
+++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py
@@ -106,7 +106,7 @@ def _prepare_weights(
             f"{subfolder}/{index_file}" if subfolder is not None else index_file for index_file in INDEX_FILES
         ]
         available_index_file = list(
-            filter(lambda f: file_exists(model_name_or_path, f, revision), possible_index_files)
+            filter(lambda f: file_exists(model_name_or_path, f, revision=revision), possible_index_files)
         )
         assert len(available_index_file) <= 1, (
             f"Multiple index files found in {model_name_or_path} with subfolder {subfolder}: {available_index_file}"

From 59eb19dbeba97bd19d819301501ed34fc4801268 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Thu, 12 Feb 2026 23:37:27 +0800
Subject: [PATCH 07/23] revert weights tracking

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm_omni/diffusion/model_loader/diffusers_loader.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py
index 778e7154555..99d1a7dcd08 100644
--- a/vllm_omni/diffusion/model_loader/diffusers_loader.py
+++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py
@@ -283,6 +283,9 @@ def load_weights(self, model: nn.Module) -> None:
         # We only enable strict check for non-quantized models
         # that have loaded weights tracking currently.
         if loaded_weights is not None:
-            weights_not_loaded = weights_to_load - loaded_weights
-            if weights_not_loaded:
-                raise ValueError(f"Following weights were not initialized from checkpoint: {weights_not_loaded}")
+            _ = weights_to_load - loaded_weights
+        #     if weights_not_loaded:
+        #         raise ValueError(
+        #             "Following weights were not initialized from "
+        #             f"checkpoint: {weights_not_loaded}"
+        #         )
\ No newline at end of file

From c0b8d16bedccbb7f77a94932a64af3faf9e9b1c4 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Thu, 12 Feb 2026 23:38:10 +0800
Subject: [PATCH 08/23] code format

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm_omni/diffusion/model_loader/diffusers_loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py
index 99d1a7dcd08..454c701b248 100644
--- a/vllm_omni/diffusion/model_loader/diffusers_loader.py
+++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py
@@ -288,4 +288,4 @@ def load_weights(self, model: nn.Module) -> None:
         #         raise ValueError(
         #             "Following weights were not initialized from "
         #             f"checkpoint: {weights_not_loaded}"
-        #         )
\ No newline at end of file
+        #         )

From e0810ebf58fd68f34b980869b91ac3a41693f62c Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Thu, 12 Feb 2026 23:48:11 +0800
Subject: [PATCH 09/23] fix codex

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm_omni/diffusion/models/z_image/pipeline_z_image.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
index f56c095b7c0..a7d8d040fa7 100644
--- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
+++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
@@ -182,7 +182,6 @@ def __init__(
             AutoModelForCausalLM,
             od_config,
             hf_config=text_encoder_config,
-            device=self._execution_device,
         )
         if text_encoder_config.tie_word_embeddings:
             self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight

From fd75eae28b831751cd5713a3eaf96ee7bc28fcdf Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Fri, 13 Feb 2026 00:05:36 +0800
Subject: [PATCH 10/23] fix device

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm_omni/diffusion/models/z_image/pipeline_z_image.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
index a7d8d040fa7..d6e81cea904 100644
--- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
+++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
@@ -182,6 +182,7 @@ def __init__(
             AutoModelForCausalLM,
             od_config,
             hf_config=text_encoder_config,
+            device=torch.get_default_device(),
         )
         if text_encoder_config.tie_word_embeddings:
             self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight

From d35ba0151a965c01cbb581f9beb27873630c39b8 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Fri, 13 Feb 2026 00:39:12 +0800
Subject: [PATCH 11/23] clean

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm_omni/diffusion/models/utils.py                    | 1 +
 vllm_omni/diffusion/models/z_image/pipeline_z_image.py | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_omni/diffusion/models/utils.py b/vllm_omni/diffusion/models/utils.py
index fdfdcc4037a..12f0c4471fa 100644
--- a/vllm_omni/diffusion/models/utils.py
+++ b/vllm_omni/diffusion/models/utils.py
@@ -117,6 +117,7 @@ def create_transformers_model(
 ) -> "PreTrainedModel":
     """Create a HuggingFace model using the given auto class and model name."""
     dtype = dtype or od_config.dtype
+    device = device or torch.get_default_device()
     with init_on_device_without_buffers("meta"):
         model = auto_cls.from_config(hf_config)
     recursive_replace_linear(model, od_config)
diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
index d6e81cea904..a7d8d040fa7 100644
--- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
+++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
@@ -182,7 +182,6 @@ def __init__(
             AutoModelForCausalLM,
             od_config,
             hf_config=text_encoder_config,
-            device=torch.get_default_device(),
         )
         if text_encoder_config.tie_word_embeddings:
             self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight

From 53c28f5daf6590cf2df14e462928564ad9b1f713 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 21 Feb 2026 21:14:48 +0800
Subject: [PATCH 12/23] vae use loader

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../diffusion/model_loader/diffusers_loader.py   |  2 +-
 .../diffusion/models/z_image/pipeline_z_image.py | 16 ++++++++++------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py
index 40318d5f8fb..3d1221f831e 100644
--- a/vllm_omni/diffusion/model_loader/diffusers_loader.py
+++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py
@@ -180,7 +180,7 @@ def _prepare_weights(
             hf_weights_files = filter_duplicate_safetensors_files(
                 hf_weights_files,
                 filter_folder,
-                index_file,
+                index_file or "",
             )
         else:
             hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)
diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
index eb6efc6cdcc..71c086b7838 100644
--- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
+++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
@@ -154,6 +154,12 @@ def __init__(
         super().__init__()
         self.od_config = od_config
         self.weights_sources = [
+            DiffusersPipelineLoader.ComponentSource(
+                model_or_path=od_config.model,
+                subfolder="text_encoder",
+                revision=od_config.revision,
+                prefix="text_encoder.",
+            ),
             DiffusersPipelineLoader.ComponentSource(
                 model_or_path=od_config.model,
                 subfolder="transformer",
@@ -163,9 +169,9 @@ def __init__(
             ),
             DiffusersPipelineLoader.ComponentSource(
                 model_or_path=od_config.model,
-                subfolder="text_encoder",
+                subfolder="vae",
                 revision=od_config.revision,
-                prefix="text_encoder.",
+                prefix="vae.",
             ),
         ]
         self._execution_device = get_local_device()
@@ -186,9 +192,8 @@ def __init__(
         if text_encoder_config.tie_word_embeddings:
             self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight
 
-        self.vae = AutoencoderKL.from_pretrained(model, subfolder="vae", local_files_only=local_files_only).to(
-            self._execution_device
-        )
+        vae_config = AutoencoderKL.load_config(model, subfolder="vae", local_files_only=local_files_only)
+        self.vae = AutoencoderKL.from_config(vae_config).to(self._execution_device)
         # Get vLLM quantization config for linear layers
         quant_config = get_vllm_quant_config_for_layers(od_config.quantization_config)
         self.transformer = ZImageTransformer2DModel(quant_config=quant_config)
@@ -660,5 +665,4 @@ def forward(
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         loaded_weights = loader.load_weights(weights)
-        loaded_weights |= {f"vae.{name}" for name, _ in self.vae.named_parameters()}
         return loaded_weights

From b644f56f26eced97f945df1eb24c11541a3585f1 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 21 Feb 2026 21:18:48 +0800
Subject: [PATCH 13/23] clean

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm_omni/diffusion/models/z_image/pipeline_z_image.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
index 71c086b7838..5ece92dceb8 100644
--- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
+++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
@@ -664,5 +664,4 @@ def forward(
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
-        loaded_weights = loader.load_weights(weights)
-        return loaded_weights
+        return loader.load_weights(weights)

From 75fd6f7e807f23e7b2e44ed34f5231d6bcafd296 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 21 Feb 2026 21:26:42 +0800
Subject: [PATCH 14/23] raise value error for multiple index

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm_omni/diffusion/model_loader/diffusers_loader.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py
index 3d1221f831e..981d936a59d 100644
--- a/vllm_omni/diffusion/model_loader/diffusers_loader.py
+++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py
@@ -109,9 +109,10 @@ def _prepare_weights(
         available_index_file = list(
             filter(lambda f: file_exists(model_name_or_path, f, revision=revision), possible_index_files)
         )
-        assert len(available_index_file) <= 1, (
-            f"Multiple index files found in {model_name_or_path} with subfolder {subfolder}: {available_index_file}"
-        )
+        if len(available_index_file) > 1:
+            raise ValueError(
+                f"Multiple index files found in {model_name_or_path} with subfolder {subfolder}: {available_index_file}"
+            )
         index_file_with_subfolder = available_index_file[0] if len(available_index_file) == 1 else None
         index_file = (
             index_file_with_subfolder.split("/")[-1]

From 805507a4018d38757984bd2a7ca2203c0b55dd96 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 22 Feb 2026 10:47:41 +0800
Subject: [PATCH 15/23] remove unused function

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm_omni/diffusion/model_loader/diffusers_loader.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py
index 981d936a59d..51d1abc08cf 100644
--- a/vllm_omni/diffusion/model_loader/diffusers_loader.py
+++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py
@@ -38,11 +38,6 @@
 INDEX_FILES = [DIFFUSION_MODEL_WEIGHTS_INDEX, TRANSFORMER_WEIGHTS_INDEX]
 
 
-def get_subfolder_file_path(subfolder: str | None, file: str) -> str:
-    """Get the subfolder path."""
-    return f"{subfolder}/" if subfolder is not None else file
-
-
 class DiffusersPipelineLoader:
     """Model loader that can load diffusers pipeline components from disk."""
 

From bfd7cd2fa96f1786ae0a5b5b66d41d8b5902870a Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 22 Feb 2026 11:02:23 +0800
Subject: [PATCH 16/23] fix nits

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm_omni/diffusion/models/utils.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/diffusion/models/utils.py b/vllm_omni/diffusion/models/utils.py
index 12f0c4471fa..a8e43ed4045 100644
--- a/vllm_omni/diffusion/models/utils.py
+++ b/vllm_omni/diffusion/models/utils.py
@@ -45,13 +45,14 @@ def replace_linear_class(
     if not isinstance(style, str):
         raise ValueError(f"Unsupported parallel style type {type(style)}, expected str")
 
-    vllm_linear_cls, vllm_linear_kwargs = {
+    vllm_linear_maps = {
         "colwise": (ColumnParallelLinear, {}),
         "colwise_rep": (ColumnParallelLinear, {"gather_output": True}),
         "rowwise": (RowParallelLinear, {}),
         "rowwise_rep": (RowParallelLinear, {"input_is_parallel": False}),
         "replicate": (ReplicatedLinear, {}),
-    }.get(style, (ReplicatedLinear, {}))
+    }
+    vllm_linear_cls, vllm_linear_kwargs = vllm_linear_maps[style]
 
     return vllm_linear_cls(
         input_size=linear.in_features,
@@ -68,7 +69,6 @@ def recursive_replace_linear(model: nn.Module, od_config: OmniDiffusionConfig):
     """Recursively replace modules in the model as needed.
     Currently, this replaces:
     - `nn.Linear` with vLLM's tensor parallel linear classes
-    - `*RMSNorm` with vLLM's `RMSNorm`
     """
     # Prefix the patterns because we always start from `self.model`
     quant_config = get_vllm_quant_config_for_layers(od_config.quantization_config)
@@ -101,7 +101,8 @@ def init_parameters(
                     param.data,
                     dtype=dtype,
                     device=device,
-                )
+                ),
+                requires_grad=param.requires_grad,
             )
             setattr(module, name, new_param)
     for child in module.children():

From 865218b99d8f6fe115e23172f787bf0f2be0ea84 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 18 Mar 2026 01:21:29 +0800
Subject: [PATCH 17/23] fix

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../diffusion/distributed/autoencoders/autoencoder_kl.py    | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py
index 7df2d6a8add..a249aeb291f 100644
--- a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py
+++ b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py
@@ -28,6 +28,12 @@ def from_pretrained(cls, *args: Any, **kwargs: Any):
         model.init_distributed()
         return model
 
+    @classmethod
+    def from_config(cls, *args: Any, **kwargs: Any):
+        model = super().from_config(*args, **kwargs)
+        model.init_distributed()
+        return model
+
     def tile_split(self, z: torch.Tensor) -> tuple[list[TileTask], GridSpec]:
         # mostly copy from AutoencoderKL
         overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))

From 65e8f4d105a5fd9e08009fa7b800091fe2d5886a Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 4 Apr 2026 14:34:09 +0800
Subject: [PATCH 18/23] fix

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm_omni/diffusion/models/z_image/pipeline_z_image.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
index 79e55ad6382..694eddcc3c8 100644
--- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
+++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py
@@ -183,12 +183,12 @@ def __init__(
 
         text_encoder_config = AutoConfig.from_pretrained(
             model, subfolder="text_encoder", local_files_only=local_files_only
-        ).to(self._execution_device)
+        )
         self.text_encoder = create_transformers_model(
             AutoModelForCausalLM,
             od_config,
             hf_config=text_encoder_config,
-        )
+        ).to(self._execution_device)
         if text_encoder_config.tie_word_embeddings:
             self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight
 

From 0b6fe43acecec59884cad5585adbddeb2390d75a Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Thu, 30 Apr 2026 12:18:49 +0800
Subject: [PATCH 19/23] update doc

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 docs/user_guide/diffusion/quantization/fp8.md    | 16 ++++++++--------
 .../diffusion/quantization/overview.md           |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/user_guide/diffusion/quantization/fp8.md b/docs/user_guide/diffusion/quantization/fp8.md
index 9906631b625..bfa3679d34e 100644
--- a/docs/user_guide/diffusion/quantization/fp8.md
+++ b/docs/user_guide/diffusion/quantization/fp8.md
@@ -58,14 +58,14 @@ The available `ignored_layers` names depend on the model architecture (e.g., `to
 
 ## Supported Models
 
-| Model | HF Models | Recommendation | `ignored_layers` |
-|-------|-----------|---------------|------------------|
-| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | All layers | None |
-| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Skip sensitive layers | `img_mlp` |
-| Flux | `black-forest-labs/FLUX.1-dev` | All layers | None |
-| HunyuanImage-3 | `tencent/HunyuanImage3` | All layers | None |
-| HunyuanVideo-1.5 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `720p_t2v`, `480p_i2v` | All layers | None |
-| Helios | `BestWishYsh/Helios-Base`, `BestWishYsh/Helios-Mid`, `BestWishYsh/Helios-Distilled` | All layers | None |
+| Model | HF Models | Recommendation | `ignored_layers` | Text-encoder quantization |
+|-------|-----------|----------------|------------------|---------------------------|
+| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | All layers | None | ✅︎ |
+| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Skip sensitive layers | `img_mlp` | |
+| Flux | `black-forest-labs/FLUX.1-dev` | All layers | None | |
+| HunyuanImage-3 | `tencent/HunyuanImage3` | All layers | None | |
+| HunyuanVideo-1.5 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `720p_t2v`, `480p_i2v` | All layers | None | |
+| Helios | `BestWishYsh/Helios-Base`, `BestWishYsh/Helios-Mid`, `BestWishYsh/Helios-Distilled` | All layers | None | |
 
 ## Combining with Other Features
 
diff --git a/docs/user_guide/diffusion/quantization/overview.md b/docs/user_guide/diffusion/quantization/overview.md
index 25d7fa5c756..821ec008eb4 100644
--- a/docs/user_guide/diffusion/quantization/overview.md
+++ b/docs/user_guide/diffusion/quantization/overview.md
@@ -54,7 +54,7 @@ When `--quantization fp8` is enabled for diffusion models:
 | Component | What Gets Quantized | Mechanism |
 |-----------|-------------------|-----------|
 | **DiT (transformer)** | `nn.Linear` layers | vLLM W8A8 FP8 compute (Ada/Hopper) or weight-only (older GPUs) |
-| **Text encoder** | `nn.Linear` layers | FP8 weight storage, BF16 compute |
+| **Text encoder** | `nn.Linear` layers | vLLM W8A8 FP8 compute (Ada/Hopper) or weight-only (older GPUs) |
 | **VAE** | `nn.Conv2d`, `nn.Conv3d` layers | FP8 weight storage, BF16 compute |
 
 ### Multi-stage Omni Models

From 97f2466cf2817c5341e48355eec338517e290c87 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Thu, 30 Apr 2026 12:32:00 +0800
Subject: [PATCH 20/23] update doc

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 docs/user_guide/quantization/fp8.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/user_guide/quantization/fp8.md b/docs/user_guide/quantization/fp8.md
index e89bc76ca77..7373a39ffb4 100644
--- a/docs/user_guide/quantization/fp8.md
+++ b/docs/user_guide/quantization/fp8.md
@@ -32,15 +32,15 @@ guide. FP8 on Ampere may use a weight-only path where available.
 
 ### Diffusion Model (Qwen-Image, Wan2.2)
 
-| Model | HF models | Online | Pre-calibrated | Recommendation | `ignored_layers` |
-|-------|-----------|:-------:|:------:|----------------|------------------|
-| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Yes | Yes | Skip sensitive image-stream MLPs when quality regresses | `img_mlp` |
-| Wan2.2 | Wan2.2 diffusion pipelines | Not validated | Not validated | Validate against BF16 before documenting as supported | TBD |
-| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | Yes | Yes | All layers | None |
-| FLUX.1 | `black-forest-labs/FLUX.1-dev`, `black-forest-labs/FLUX.1-schnell` | Yes | Yes | All layers | None |
-| FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B` | Yes | Yes | All layers | None |
-| HunyuanImage-3.0 | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` | Yes | Yes | All layers; use the Hunyuan stage config for multi-stage runs | None |
-| HunyuanVideo-1.5 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `720p_t2v`, `480p_i2v` | Yes | Yes | All layers | None |
+| Model | HF models | Online | Pre-calibrated | Recommendation | `ignored_layers` | Text-encoder quantization |
+|-------|-----------|:-------:|:------:|----------------|------------------|------------------|
+| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Yes | Yes | Skip sensitive image-stream MLPs when quality regresses | `img_mlp` | |
+| Wan2.2 | Wan2.2 diffusion pipelines | Not validated | Not validated | Validate against BF16 before documenting as supported | TBD | |
+| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | Yes | Yes | All layers | None | ✅︎ |
+| FLUX.1 | `black-forest-labs/FLUX.1-dev`, `black-forest-labs/FLUX.1-schnell` | Yes | Yes | All layers | None | |
+| FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B` | Yes | Yes | All layers | None | |
+| HunyuanImage-3.0 | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` | Yes | Yes | All layers; use the Hunyuan stage config for multi-stage runs | None | |
+| HunyuanVideo-1.5 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `720p_t2v`, `480p_i2v` | Yes | Yes | All layers | None | |
 
 ### Multi-Stage Omni/TTS Model (Qwen3-Omni, Qwen3-TTS)
 

From a8303ba7dbbc8a0c721eb7e7be245d725b4160b1 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Thu, 30 Apr 2026 18:13:05 +0800
Subject: [PATCH 21/23] fix import

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm_omni/diffusion/models/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/models/utils.py b/vllm_omni/diffusion/models/utils.py
index ce5fa34322a..122646219ff 100644
--- a/vllm_omni/diffusion/models/utils.py
+++ b/vllm_omni/diffusion/models/utils.py
@@ -18,7 +18,7 @@
 from vllm.model_executor.models.utils import maybe_prefix
 
 from vllm_omni.diffusion.data import OmniDiffusionConfig
-from vllm_omni.diffusion.quantization import get_vllm_quant_config_for_layers
+from vllm_omni.quantization import build_quant_config
 
 if TYPE_CHECKING:
     from transformers import PretrainedConfig, PreTrainedModel
@@ -78,7 +78,7 @@ def recursive_replace_linear(model: nn.Module, od_config: OmniDiffusionConfig):
     - `nn.Linear` with vLLM's tensor parallel linear classes
     """
     # Prefix the patterns because we always start from `self.model`
-    quant_config = get_vllm_quant_config_for_layers(od_config.quantization_config)
+    quant_config = build_quant_config(od_config.quantization_config)
 
     def _recursive_replace(module: nn.Module, prefix: str):
         for child_name, child_module in module.named_children():

From b9ef9ba0f7ec05307a473927e99fcf4b8e48b312 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Thu, 30 Apr 2026 21:50:22 +0800
Subject: [PATCH 22/23] fix weights downloading

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../model_loader/diffusers_loader.py          | 34 ++++++-------------
 1 file changed, 10 insertions(+), 24 deletions(-)

diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py
index 446bf6dd65b..1abcae42ced 100644
--- a/vllm_omni/diffusion/model_loader/diffusers_loader.py
+++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py
@@ -112,12 +112,7 @@ def _prepare_weights(
             raise ValueError(
                 f"Multiple index files found in {model_name_or_path} with subfolder {subfolder}: {available_index_file}"
             )
-        index_file_with_subfolder = available_index_file[0] if len(available_index_file) == 1 else None
-        index_file = (
-            index_file_with_subfolder.split("/")[-1]
-            if index_file_with_subfolder and subfolder is not None
-            else index_file_with_subfolder
-        )
+        index_file = available_index_file[0] if len(available_index_file) == 1 else ""
 
         # only hf is supported currently
         if load_format == "auto":
@@ -135,20 +130,21 @@ def _prepare_weights(
         if allow_patterns_overrides is not None:
             allow_patterns = allow_patterns_overrides
 
-        if subfolder is not None:
-            allow_patterns = [f"{subfolder}/{pattern}" for pattern in allow_patterns]
-
         if not is_local:
             hf_folder = download_weights_from_hf(
                 model_name_or_path,
                 self.load_config.download_dir,
                 allow_patterns,
                 revision,
+                subfolder=subfolder,
                 ignore_patterns=self.load_config.ignore_patterns,
             )
         else:
             hf_folder = model_name_or_path
 
+        if subfolder is not None:
+            hf_folder = os.path.join(hf_folder, subfolder)
+
         hf_weights_files: list[str] = []
         for pattern in allow_patterns:
             hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
@@ -166,22 +162,12 @@ def _prepare_weights(
             if not is_local:
                 download_safetensors_index_file_from_hf(
                     model_name_or_path,
-                    index_file_with_subfolder,
-                    self.load_config.download_dir,
-                    revision,
+                    index_file,
+                    cache_dir=self.load_config.download_dir,
+                    subfolder=subfolder,
+                    revision=revision,
                 )
-            # Some diffusers pipelines keep component weights under a
-            # subfolder (e.g. "transformer/") and the corresponding index file
-            # uses filenames relative to that subfolder. vLLM's
-            # `filter_duplicate_safetensors_files` expects weight_map entries
-            # to be relative to the `hf_folder` we pass in, so we point it to
-            # the component subfolder to avoid filtering out all shards.
-            filter_folder = os.path.join(hf_folder, subfolder) if subfolder is not None else hf_folder
-            hf_weights_files = filter_duplicate_safetensors_files(
-                hf_weights_files,
-                filter_folder,
-                index_file or "",
-            )
+            hf_weights_files = filter_duplicate_safetensors_files(hf_weights_files, hf_folder, index_file)
         else:
             hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)
 

From 326c6783b85950b4d4f2536f93d938b85a34df24 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 3 May 2026 23:45:23 +0800
Subject: [PATCH 23/23] Update
 vllm_omni/diffusion/model_loader/diffusers_loader.py

Co-authored-by: SYLAR <125541396+lishunyang12@users.noreply.github.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
---
 vllm_omni/diffusion/model_loader/diffusers_loader.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py
index 1abcae42ced..4c6fb070997 100644
--- a/vllm_omni/diffusion/model_loader/diffusers_loader.py
+++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py
@@ -105,14 +105,14 @@ def _prepare_weights(
         possible_index_files = [
             f"{subfolder}/{index_file}" if subfolder is not None else index_file for index_file in INDEX_FILES
         ]
-        available_index_file = list(
-            filter(lambda f: file_exists(model_name_or_path, f, revision=revision), possible_index_files)
-        )
+        available_index_file = [
+            f for f in possible_index_files if file_exists(model_name_or_path, f, revision=revision)
+        ]
         if len(available_index_file) > 1:
             raise ValueError(
                 f"Multiple index files found in {model_name_or_path} with subfolder {subfolder}: {available_index_file}"
             )
-        index_file = available_index_file[0] if len(available_index_file) == 1 else ""
+        index_file = available_index_file[0] if available_index_file else ""
 
         # only hf is supported currently
         if load_format == "auto":