From e790e33b1e4cc7e028a564806a3f4741cc9a7207 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 11 Feb 2026 21:09:37 +0800 Subject: [PATCH 01/23] draft Signed-off-by: Isotr0py --- .../diffusion/model_loader/diffusers_loader.py | 12 ++++++------ .../diffusion/models/z_image/pipeline_z_image.py | 16 ++++++++++++++-- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py index b61f70b697c..d9090d95c40 100644 --- a/vllm_omni/diffusion/model_loader/diffusers_loader.py +++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py @@ -262,9 +262,9 @@ def load_weights(self, model: nn.Module) -> None: # We only enable strict check for non-quantized models # that have loaded weights tracking currently. if loaded_weights is not None: - _ = weights_to_load - loaded_weights - # if weights_not_loaded: - # raise ValueError( - # "Following weights were not initialized from " - # f"checkpoint: {weights_not_loaded}" - # ) + weights_not_loaded = weights_to_load - loaded_weights + if weights_not_loaded: + raise ValueError( + "Following weights were not initialized from " + f"checkpoint: {weights_not_loaded}" + ) diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py index 8a391319538..341f9b7d7d4 100644 --- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py +++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py @@ -156,10 +156,22 @@ def __init__( DiffusersPipelineLoader.ComponentSource( model_or_path=od_config.model, subfolder="transformer", - revision=None, + revision=od_config.revision, prefix="transformer.", fall_back_to_pt=True, - ) + ), + DiffusersPipelineLoader.ComponentSource( + model_or_path=od_config.model, + subfolder="text_encoder", + revision=od_config.revision, + prefix="text_encoder.", + ), + DiffusersPipelineLoader.ComponentSource( + model_or_path=od_config.model, + subfolder="vae", + revision=od_config.revision, + prefix="vae.", + ), ] self._execution_device = get_local_device() model = od_config.model From 0fe72c7cd111592a8d43886d985aa0c72a031121 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 11 Feb 2026 22:28:01 +0800 Subject: [PATCH 02/23] update Signed-off-by: Isotr0py --- .../model_loader/diffusers_loader.py | 1 + vllm_omni/diffusion/models/utils.py | 101 ++++++++++++++++++ .../models/z_image/pipeline_z_image.py | 21 ++-- 3 files changed, 114 insertions(+), 9 deletions(-) create mode 100644 vllm_omni/diffusion/models/utils.py diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py index d9090d95c40..201b442b4b3 100644 --- a/vllm_omni/diffusion/model_loader/diffusers_loader.py +++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py @@ -32,6 +32,7 @@ MODEL_INDEX = "model_index.json" DIFFUSION_MODEL_WEIGHTS_INDEX = "diffusion_pytorch_model.safetensors.index.json" +TRANSFORMER_WEIGHTS_INDEX = "model.safetensors.index.json" class DiffusersPipelineLoader: diff --git a/vllm_omni/diffusion/models/utils.py b/vllm_omni/diffusion/models/utils.py new file mode 100644 index 00000000000..0b4b5097175 --- /dev/null +++ b/vllm_omni/diffusion/models/utils.py @@ -0,0 +1,101 @@ +from typing import Literal, TYPE_CHECKING + +import torch +import torch.nn as nn + +from vllm_omni.diffusion.data import OmniDiffusionConfig +from vllm_omni.diffusion.quantization import get_vllm_quant_config_for_layers +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) +from vllm.model_executor.models.utils import maybe_prefix + +if TYPE_CHECKING: + from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + ) + + +Style = Literal["colwise", "colwise_rep", "rowwise", "rowwise_rep", "replicate"] + + +def replace_linear_class( + linear: nn.Linear, + style: Style = "replicate", + quant_config: "QuantizationConfig | None" = None, + *, + prefix: str = "", +) -> ColumnParallelLinear | RowParallelLinear | ReplicatedLinear: + """ + Replace nn.Linear with one of vLLM's tensor parallel linear classes. + + Args: + linear: `nn.Linear` to be replaced. + style: Tensor parallel style of the new linear, e.g. "colwise". + quant_config: Quantization config for the new linear. + Returns: + The new linear. + """ + + if not isinstance(style, str): + raise ValueError(f"Unsupported parallel style type {type(style)}, expected str") + + vllm_linear_cls, vllm_linear_kwargs = { + "colwise": (ColumnParallelLinear, {}), + "colwise_rep": (ColumnParallelLinear, {"gather_output": True}), + "rowwise": (RowParallelLinear, {}), + "rowwise_rep": (RowParallelLinear, {"input_is_parallel": False}), + "replicate": (ReplicatedLinear, {}), + }.get(style, (ReplicatedLinear, {})) + + return vllm_linear_cls( + input_size=linear.in_features, + output_size=linear.out_features, + bias=linear.bias is not None, + quant_config=quant_config, + prefix=prefix, + return_bias=False, + **vllm_linear_kwargs, + ) + + +def recursive_replace_linear(model: nn.Module, od_config: OmniDiffusionConfig): + """Recursively replace modules in the model as needed. + Currently, this replaces: + - `nn.Linear` with vLLM's tensor parallel linear classes + - `*RMSNorm` with vLLM's `RMSNorm` + """ + # Prefix the patterns because we always start from `self.model` + quant_config = get_vllm_quant_config_for_layers(od_config.quantization_config) + def _recursive_replace(module: nn.Module, prefix: str): + for child_name, child_module in module.named_children(): + new_module = child_module + qual_name = maybe_prefix(prefix, child_name) + # Replace modules as needed + if isinstance(child_module, nn.Linear): + style = "replicate" + new_module = replace_linear_class( + child_module, style, quant_config, prefix=qual_name + ) + else: + _recursive_replace(child_module, prefix=qual_name) + if new_module is not child_module: + setattr(module, child_name, new_module) + _recursive_replace(model, prefix="") + + +def init_parameters(module: nn.Module, dtype: torch.dtype | None, device: torch.device | None = None,): + for name, param in module.named_parameters(recurse=False): + if param.device == torch.device("meta"): + new_param = nn.Parameter( + torch.empty_like( + param.data, + dtype=dtype, + device=device, + ) + ) + setattr(module, name, new_param) + for child in module.children(): + init_parameters(child, dtype, device) \ No newline at end of file diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py index 341f9b7d7d4..788497f3c7c 100644 --- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py +++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py @@ -28,7 +28,8 @@ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler from diffusers.utils import logging from diffusers.utils.torch_utils import randn_tensor -from transformers import AutoModel, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig +from vllm.model_executor.models.transformers.utils import init_on_device_without_buffers from vllm.model_executor.models.utils import AutoWeightsLoader from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig @@ -42,6 +43,7 @@ from vllm_omni.model_executor.model_loader.weight_utils import ( download_weights_from_hf_specific, ) +from vllm_omni.diffusion.models.utils import recursive_replace_linear, init_parameters logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -166,12 +168,6 @@ def __init__( revision=od_config.revision, prefix="text_encoder.", ), - DiffusersPipelineLoader.ComponentSource( - model_or_path=od_config.model, - subfolder="vae", - revision=od_config.revision, - prefix="vae.", - ), ] self._execution_device = get_local_device() model = od_config.model @@ -180,9 +176,14 @@ def __init__( model, subfolder="scheduler", local_files_only=local_files_only ) - self.text_encoder = AutoModel.from_pretrained( + text_encoder_config = AutoConfig.from_pretrained( model, subfolder="text_encoder", local_files_only=local_files_only ) + with init_on_device_without_buffers("meta"): + self.text_encoder = AutoModelForCausalLM.from_config(text_encoder_config) + recursive_replace_linear(self.text_encoder, od_config) + init_parameters(self.text_encoder, dtype=od_config.dtype) + self.vae = AutoencoderKL.from_pretrained(model, subfolder="vae", local_files_only=local_files_only).to( self._execution_device ) @@ -656,4 +657,6 @@ def forward( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) + loaded_weights = loader.load_weights(weights) + loaded_weights |= {name for name, _ in self.vae.named_parameters()} + return loaded_weights From 51070f4b6bc9b74ddc578bb77ab5fd4b0011f644 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 11 Feb 2026 22:42:22 +0800 Subject: [PATCH 03/23] fix Signed-off-by: Isotr0py --- vllm_omni/diffusion/models/z_image/pipeline_z_image.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py index 788497f3c7c..5bb536d47d3 100644 --- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py +++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py @@ -182,7 +182,9 @@ def __init__( with init_on_device_without_buffers("meta"): self.text_encoder = AutoModelForCausalLM.from_config(text_encoder_config) recursive_replace_linear(self.text_encoder, od_config) - init_parameters(self.text_encoder, dtype=od_config.dtype) + init_parameters(self.text_encoder, dtype=od_config.dtype, device=self._execution_device) + if text_encoder_config.tie_word_embeddings: + self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight self.vae = AutoencoderKL.from_pretrained(model, subfolder="vae", local_files_only=local_files_only).to( self._execution_device @@ -658,5 +660,5 @@ def forward( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) loaded_weights = loader.load_weights(weights) - loaded_weights |= {name for name, _ in self.vae.named_parameters()} + loaded_weights |= {f"vae.{name}" for name, _ in self.vae.named_parameters()} return loaded_weights From 4bb6989155f590b052a03b1853b1b617215d5041 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 12 Feb 2026 22:43:15 +0800 Subject: [PATCH 04/23] update Signed-off-by: Isotr0py --- vllm_omni/diffusion/models/utils.py | 22 ++++++++++++++++++- .../models/z_image/pipeline_z_image.py | 13 ++++++----- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/vllm_omni/diffusion/models/utils.py b/vllm_omni/diffusion/models/utils.py index 0b4b5097175..25104699780 100644 --- a/vllm_omni/diffusion/models/utils.py +++ b/vllm_omni/diffusion/models/utils.py @@ -3,6 +3,8 @@ import torch import torch.nn as nn +from vllm.model_executor.models.transformers.utils import init_on_device_without_buffers + from vllm_omni.diffusion.data import OmniDiffusionConfig from vllm_omni.diffusion.quantization import get_vllm_quant_config_for_layers from vllm.model_executor.layers.linear import ( @@ -13,6 +15,8 @@ from vllm.model_executor.models.utils import maybe_prefix if TYPE_CHECKING: + from transformers import PretrainedConfig, PreTrainedModel + from transformers.models.auto.auto_factory import _BaseAutoModelClass from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, ) @@ -98,4 +102,20 @@ def init_parameters(module: nn.Module, dtype: torch.dtype | None, device: torch. ) setattr(module, name, new_param) for child in module.children(): - init_parameters(child, dtype, device) \ No newline at end of file + init_parameters(child, dtype, device) + + +def create_transformers_model( + auto_cls: "_BaseAutoModelClass", + od_config: OmniDiffusionConfig, + hf_config: "PretrainedConfig", + dtype: torch.dtype | None = None, + device: torch.device | None = None, +) -> PreTrainedModel: + """Create a HuggingFace model using the given auto class and model name.""" + dtype = dtype or od_config.dtype + with init_on_device_without_buffers("meta"): + model = auto_cls.from_config(hf_config) + recursive_replace_linear(model, od_config) + init_parameters(model, dtype=dtype, device=device) + return model diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py index 5bb536d47d3..6c6ce17b4f7 100644 --- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py +++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py @@ -29,7 +29,6 @@ from diffusers.utils import logging from diffusers.utils.torch_utils import randn_tensor from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig -from vllm.model_executor.models.transformers.utils import init_on_device_without_buffers from vllm.model_executor.models.utils import AutoWeightsLoader from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig @@ -43,7 +42,7 @@ from vllm_omni.model_executor.model_loader.weight_utils import ( download_weights_from_hf_specific, ) -from vllm_omni.diffusion.models.utils import recursive_replace_linear, init_parameters +from vllm_omni.diffusion.models.utils import create_transformers_model logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -179,10 +178,12 @@ def __init__( text_encoder_config = AutoConfig.from_pretrained( model, subfolder="text_encoder", local_files_only=local_files_only ) - with init_on_device_without_buffers("meta"): - self.text_encoder = AutoModelForCausalLM.from_config(text_encoder_config) - recursive_replace_linear(self.text_encoder, od_config) - init_parameters(self.text_encoder, dtype=od_config.dtype, device=self._execution_device) + self.text_encoder = create_transformers_model( + AutoModelForCausalLM, + od_config, + hf_config=text_encoder_config, + device=self._execution_device, + ) if text_encoder_config.tie_word_embeddings: self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight From f6ebce1451d0d171e5c25bfd50f8cd7d4e5c511f Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 12 Feb 2026 23:14:19 +0800 Subject: [PATCH 05/23] fix Signed-off-by: Isotr0py --- .../model_loader/diffusers_loader.py | 29 +++++++++++++++---- vllm_omni/diffusion/models/utils.py | 25 +++++++++------- .../models/z_image/pipeline_z_image.py | 4 +-- 3 files changed, 39 insertions(+), 19 deletions(-) diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py index 201b442b4b3..2cf6e3eef30 100644 --- a/vllm_omni/diffusion/model_loader/diffusers_loader.py +++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py @@ -22,6 +22,7 @@ maybe_download_from_modelscope, safetensors_weights_iterator, ) +from vllm.transformers_utils.repo_utils import file_exists from vllm.utils.torch_utils import set_default_torch_dtype from vllm_omni.diffusion.data import OmniDiffusionConfig @@ -33,6 +34,12 @@ MODEL_INDEX = "model_index.json" DIFFUSION_MODEL_WEIGHTS_INDEX = "diffusion_pytorch_model.safetensors.index.json" TRANSFORMER_WEIGHTS_INDEX = "model.safetensors.index.json" +INDEX_FILES = [DIFFUSION_MODEL_WEIGHTS_INDEX, TRANSFORMER_WEIGHTS_INDEX] + + +def get_subfolder_file_path(subfolder: str | None, file: str) -> str: + """Get the subfolder path.""" + return f"{subfolder}/" if subfolder is not None else file class DiffusersPipelineLoader: @@ -95,8 +102,21 @@ def _prepare_weights( is_local = os.path.isdir(model_name_or_path) load_format = self.load_config.load_format use_safetensors = False - index_file = DIFFUSION_MODEL_WEIGHTS_INDEX - index_file_with_subfolder = f"{subfolder}/{index_file}" if subfolder else index_file + possible_index_files = [ + f"{subfolder}/{index_file}" if subfolder is not None else index_file for index_file in INDEX_FILES + ] + available_index_file = list( + filter(lambda f: file_exists(model_name_or_path, f, revision), possible_index_files) + ) + assert len(available_index_file) <= 1, ( + f"Multiple index files found in {model_name_or_path} with subfolder {subfolder}: {available_index_file}" + ) + index_file_with_subfolder = available_index_file[0] if len(available_index_file) == 1 else None + index_file = ( + index_file_with_subfolder.split("/")[-1] + if index_file_with_subfolder and subfolder is not None + else index_file_with_subfolder + ) # only hf is supported currently if load_format == "auto": @@ -265,7 +285,4 @@ def load_weights(self, model: nn.Module) -> None: if loaded_weights is not None: weights_not_loaded = weights_to_load - loaded_weights if weights_not_loaded: - raise ValueError( - "Following weights were not initialized from " - f"checkpoint: {weights_not_loaded}" - ) + raise ValueError(f"Following weights were not initialized from checkpoint: {weights_not_loaded}") diff --git a/vllm_omni/diffusion/models/utils.py b/vllm_omni/diffusion/models/utils.py index 25104699780..fdfdcc4037a 100644 --- a/vllm_omni/diffusion/models/utils.py +++ b/vllm_omni/diffusion/models/utils.py @@ -1,19 +1,18 @@ -from typing import Literal, TYPE_CHECKING +from typing import TYPE_CHECKING, Literal import torch import torch.nn as nn - -from vllm.model_executor.models.transformers.utils import init_on_device_without_buffers - -from vllm_omni.diffusion.data import OmniDiffusionConfig -from vllm_omni.diffusion.quantization import get_vllm_quant_config_for_layers from vllm.model_executor.layers.linear import ( ColumnParallelLinear, ReplicatedLinear, RowParallelLinear, ) +from vllm.model_executor.models.transformers.utils import init_on_device_without_buffers from vllm.model_executor.models.utils import maybe_prefix +from vllm_omni.diffusion.data import OmniDiffusionConfig +from vllm_omni.diffusion.quantization import get_vllm_quant_config_for_layers + if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel from transformers.models.auto.auto_factory import _BaseAutoModelClass @@ -73,6 +72,7 @@ def recursive_replace_linear(model: nn.Module, od_config: OmniDiffusionConfig): """ # Prefix the patterns because we always start from `self.model` quant_config = get_vllm_quant_config_for_layers(od_config.quantization_config) + def _recursive_replace(module: nn.Module, prefix: str): for child_name, child_module in module.named_children(): new_module = child_module @@ -80,17 +80,20 @@ def _recursive_replace(module: nn.Module, prefix: str): # Replace modules as needed if isinstance(child_module, nn.Linear): style = "replicate" - new_module = replace_linear_class( - child_module, style, quant_config, prefix=qual_name - ) + new_module = replace_linear_class(child_module, style, quant_config, prefix=qual_name) else: _recursive_replace(child_module, prefix=qual_name) if new_module is not child_module: setattr(module, child_name, new_module) + _recursive_replace(model, prefix="") -def init_parameters(module: nn.Module, dtype: torch.dtype | None, device: torch.device | None = None,): +def init_parameters( + module: nn.Module, + dtype: torch.dtype | None, + device: torch.device | None = None, +): for name, param in module.named_parameters(recurse=False): if param.device == torch.device("meta"): new_param = nn.Parameter( @@ -111,7 +114,7 @@ def create_transformers_model( hf_config: "PretrainedConfig", dtype: torch.dtype | None = None, device: torch.device | None = None, -) -> PreTrainedModel: +) -> "PreTrainedModel": """Create a HuggingFace model using the given auto class and model name.""" dtype = dtype or od_config.dtype with init_on_device_without_buffers("meta"): diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py index 6c6ce17b4f7..f56c095b7c0 100644 --- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py +++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py @@ -28,12 +28,13 @@ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler from diffusers.utils import logging from diffusers.utils.torch_utils import randn_tensor -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from vllm.model_executor.models.utils import AutoWeightsLoader from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader +from vllm_omni.diffusion.models.utils import create_transformers_model from vllm_omni.diffusion.models.z_image.z_image_transformer import ( ZImageTransformer2DModel, ) @@ -42,7 +43,6 @@ from vllm_omni.model_executor.model_loader.weight_utils import ( download_weights_from_hf_specific, ) -from vllm_omni.diffusion.models.utils import create_transformers_model logger = logging.get_logger(__name__) # pylint: disable=invalid-name From 8a6d59fae6ef0c5e5d7ae2c2671069236f11ac01 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 12 Feb 2026 23:35:40 +0800 Subject: [PATCH 06/23] fix Signed-off-by: Isotr0py --- vllm_omni/diffusion/model_loader/diffusers_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py index 2cf6e3eef30..778e7154555 100644 --- a/vllm_omni/diffusion/model_loader/diffusers_loader.py +++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py @@ -106,7 +106,7 @@ def _prepare_weights( f"{subfolder}/{index_file}" if subfolder is not None else index_file for index_file in INDEX_FILES ] available_index_file = list( - filter(lambda f: file_exists(model_name_or_path, f, revision), possible_index_files) + filter(lambda f: file_exists(model_name_or_path, f, revision=revision), possible_index_files) ) assert len(available_index_file) <= 1, ( f"Multiple index files found in {model_name_or_path} with subfolder {subfolder}: {available_index_file}" From 59eb19dbeba97bd19d819301501ed34fc4801268 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 12 Feb 2026 23:37:27 +0800 Subject: [PATCH 07/23] revert weights tracking Signed-off-by: Isotr0py --- vllm_omni/diffusion/model_loader/diffusers_loader.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py index 778e7154555..99d1a7dcd08 100644 --- a/vllm_omni/diffusion/model_loader/diffusers_loader.py +++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py @@ -283,6 +283,9 @@ def load_weights(self, model: nn.Module) -> None: # We only enable strict check for non-quantized models # that have loaded weights tracking currently. if loaded_weights is not None: - weights_not_loaded = weights_to_load - loaded_weights - if weights_not_loaded: - raise ValueError(f"Following weights were not initialized from checkpoint: {weights_not_loaded}") + _ = weights_to_load - loaded_weights + # if weights_not_loaded: + # raise ValueError( + # "Following weights were not initialized from " + # f"checkpoint: {weights_not_loaded}" + # ) \ No newline at end of file From c0b8d16bedccbb7f77a94932a64af3faf9e9b1c4 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 12 Feb 2026 23:38:10 +0800 Subject: [PATCH 08/23] code format Signed-off-by: Isotr0py --- vllm_omni/diffusion/model_loader/diffusers_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py index 99d1a7dcd08..454c701b248 100644 --- a/vllm_omni/diffusion/model_loader/diffusers_loader.py +++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py @@ -288,4 +288,4 @@ def load_weights(self, model: nn.Module) -> None: # raise ValueError( # "Following weights were not initialized from " # f"checkpoint: {weights_not_loaded}" - # ) \ No newline at end of file + # ) From e0810ebf58fd68f34b980869b91ac3a41693f62c Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 12 Feb 2026 23:48:11 +0800 Subject: [PATCH 09/23] fix codex Signed-off-by: Isotr0py --- vllm_omni/diffusion/models/z_image/pipeline_z_image.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py index f56c095b7c0..a7d8d040fa7 100644 --- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py +++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py @@ -182,7 +182,6 @@ def __init__( AutoModelForCausalLM, od_config, hf_config=text_encoder_config, - device=self._execution_device, ) if text_encoder_config.tie_word_embeddings: self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight From fd75eae28b831751cd5713a3eaf96ee7bc28fcdf Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 13 Feb 2026 00:05:36 +0800 Subject: [PATCH 10/23] fix device Signed-off-by: Isotr0py --- vllm_omni/diffusion/models/z_image/pipeline_z_image.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py index a7d8d040fa7..d6e81cea904 100644 --- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py +++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py @@ -182,6 +182,7 @@ def __init__( AutoModelForCausalLM, od_config, hf_config=text_encoder_config, + device=torch.get_default_device(), ) if text_encoder_config.tie_word_embeddings: self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight From d35ba0151a965c01cbb581f9beb27873630c39b8 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 13 Feb 2026 00:39:12 +0800 Subject: [PATCH 11/23] clean Signed-off-by: Isotr0py --- vllm_omni/diffusion/models/utils.py | 1 + vllm_omni/diffusion/models/z_image/pipeline_z_image.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/diffusion/models/utils.py b/vllm_omni/diffusion/models/utils.py index fdfdcc4037a..12f0c4471fa 100644 --- a/vllm_omni/diffusion/models/utils.py +++ b/vllm_omni/diffusion/models/utils.py @@ -117,6 +117,7 @@ def create_transformers_model( ) -> "PreTrainedModel": """Create a HuggingFace model using the given auto class and model name.""" dtype = dtype or od_config.dtype + device = device or torch.get_default_device() with init_on_device_without_buffers("meta"): model = auto_cls.from_config(hf_config) recursive_replace_linear(model, od_config) diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py index d6e81cea904..a7d8d040fa7 100644 --- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py +++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py @@ -182,7 +182,6 @@ def __init__( AutoModelForCausalLM, od_config, hf_config=text_encoder_config, - device=torch.get_default_device(), ) if text_encoder_config.tie_word_embeddings: self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight From 53c28f5daf6590cf2df14e462928564ad9b1f713 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 21 Feb 2026 21:14:48 +0800 Subject: [PATCH 12/23] vae use loader Signed-off-by: Isotr0py --- .../diffusion/model_loader/diffusers_loader.py | 2 +- .../diffusion/models/z_image/pipeline_z_image.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py index 40318d5f8fb..3d1221f831e 100644 --- a/vllm_omni/diffusion/model_loader/diffusers_loader.py +++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py @@ -180,7 +180,7 @@ def _prepare_weights( hf_weights_files = filter_duplicate_safetensors_files( hf_weights_files, filter_folder, - index_file, + index_file or "", ) else: hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files) diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py index eb6efc6cdcc..71c086b7838 100644 --- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py +++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py @@ -154,6 +154,12 @@ def __init__( super().__init__() self.od_config = od_config self.weights_sources = [ + DiffusersPipelineLoader.ComponentSource( + model_or_path=od_config.model, + subfolder="text_encoder", + revision=od_config.revision, + prefix="text_encoder.", + ), DiffusersPipelineLoader.ComponentSource( model_or_path=od_config.model, subfolder="transformer", @@ -163,9 +169,9 @@ def __init__( ), DiffusersPipelineLoader.ComponentSource( model_or_path=od_config.model, - subfolder="text_encoder", + subfolder="vae", revision=od_config.revision, - prefix="text_encoder.", + prefix="vae.", ), ] self._execution_device = get_local_device() @@ -186,9 +192,8 @@ def __init__( if text_encoder_config.tie_word_embeddings: self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight - self.vae = AutoencoderKL.from_pretrained(model, subfolder="vae", local_files_only=local_files_only).to( - self._execution_device - ) + vae_config = AutoencoderKL.load_config(model, subfolder="vae", local_files_only=local_files_only) + self.vae = AutoencoderKL.from_config(vae_config).to(self._execution_device) # Get vLLM quantization config for linear layers quant_config = get_vllm_quant_config_for_layers(od_config.quantization_config) self.transformer = ZImageTransformer2DModel(quant_config=quant_config) @@ -660,5 +665,4 @@ def forward( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) loaded_weights = loader.load_weights(weights) - loaded_weights |= {f"vae.{name}" for name, _ in self.vae.named_parameters()} return loaded_weights From b644f56f26eced97f945df1eb24c11541a3585f1 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 21 Feb 2026 21:18:48 +0800 Subject: [PATCH 13/23] clean Signed-off-by: Isotr0py --- vllm_omni/diffusion/models/z_image/pipeline_z_image.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py index 71c086b7838..5ece92dceb8 100644 --- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py +++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py @@ -664,5 +664,4 @@ def forward( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) - loaded_weights = loader.load_weights(weights) - return loaded_weights + return loader.load_weights(weights) From 75fd6f7e807f23e7b2e44ed34f5231d6bcafd296 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 21 Feb 2026 21:26:42 +0800 Subject: [PATCH 14/23] raise value error for multiple index Signed-off-by: Isotr0py --- vllm_omni/diffusion/model_loader/diffusers_loader.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py index 3d1221f831e..981d936a59d 100644 --- a/vllm_omni/diffusion/model_loader/diffusers_loader.py +++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py @@ -109,9 +109,10 @@ def _prepare_weights( available_index_file = list( filter(lambda f: file_exists(model_name_or_path, f, revision=revision), possible_index_files) ) - assert len(available_index_file) <= 1, ( - f"Multiple index files found in {model_name_or_path} with subfolder {subfolder}: {available_index_file}" - ) + if len(available_index_file) > 1: + raise ValueError( + f"Multiple index files found in {model_name_or_path} with subfolder {subfolder}: {available_index_file}" + ) index_file_with_subfolder = available_index_file[0] if len(available_index_file) == 1 else None index_file = ( index_file_with_subfolder.split("/")[-1] From 805507a4018d38757984bd2a7ca2203c0b55dd96 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 22 Feb 2026 10:47:41 +0800 Subject: [PATCH 15/23] remove unused function Signed-off-by: Isotr0py --- vllm_omni/diffusion/model_loader/diffusers_loader.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py index 981d936a59d..51d1abc08cf 100644 --- a/vllm_omni/diffusion/model_loader/diffusers_loader.py +++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py @@ -38,11 +38,6 @@ INDEX_FILES = [DIFFUSION_MODEL_WEIGHTS_INDEX, TRANSFORMER_WEIGHTS_INDEX] -def get_subfolder_file_path(subfolder: str | None, file: str) -> str: - """Get the subfolder path.""" - return f"{subfolder}/" if subfolder is not None else file - - class DiffusersPipelineLoader: """Model loader that can load diffusers pipeline components from disk.""" From bfd7cd2fa96f1786ae0a5b5b66d41d8b5902870a Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 22 Feb 2026 11:02:23 +0800 Subject: [PATCH 16/23] fix nits Signed-off-by: Isotr0py --- vllm_omni/diffusion/models/utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm_omni/diffusion/models/utils.py b/vllm_omni/diffusion/models/utils.py index 12f0c4471fa..a8e43ed4045 100644 --- a/vllm_omni/diffusion/models/utils.py +++ b/vllm_omni/diffusion/models/utils.py @@ -45,13 +45,14 @@ def replace_linear_class( if not isinstance(style, str): raise ValueError(f"Unsupported parallel style type {type(style)}, expected str") - vllm_linear_cls, vllm_linear_kwargs = { + vllm_linear_maps = { "colwise": (ColumnParallelLinear, {}), "colwise_rep": (ColumnParallelLinear, {"gather_output": True}), "rowwise": (RowParallelLinear, {}), "rowwise_rep": (RowParallelLinear, {"input_is_parallel": False}), "replicate": (ReplicatedLinear, {}), - }.get(style, (ReplicatedLinear, {})) + } + vllm_linear_cls, vllm_linear_kwargs = vllm_linear_maps[style] return vllm_linear_cls( input_size=linear.in_features, @@ -68,7 +69,6 @@ def recursive_replace_linear(model: nn.Module, od_config: OmniDiffusionConfig): """Recursively replace modules in the model as needed. Currently, this replaces: - `nn.Linear` with vLLM's tensor parallel linear classes - - `*RMSNorm` with vLLM's `RMSNorm` """ # Prefix the patterns because we always start from `self.model` quant_config = get_vllm_quant_config_for_layers(od_config.quantization_config) @@ -101,7 +101,8 @@ def init_parameters( param.data, dtype=dtype, device=device, - ) + ), + requires_grad=param.requires_grad, ) setattr(module, name, new_param) for child in module.children(): From 865218b99d8f6fe115e23172f787bf0f2be0ea84 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 18 Mar 2026 01:21:29 +0800 Subject: [PATCH 17/23] fix Signed-off-by: Isotr0py --- .../diffusion/distributed/autoencoders/autoencoder_kl.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py index 7df2d6a8add..a249aeb291f 100644 --- a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py +++ b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py @@ -28,6 +28,12 @@ def from_pretrained(cls, *args: Any, **kwargs: Any): model.init_distributed() return model + @classmethod + def from_config(cls, *args: Any, **kwargs: Any): + model = super().from_config(*args, **kwargs) + model.init_distributed() + return model + def tile_split(self, z: torch.Tensor) -> tuple[list[TileTask], GridSpec]: # mostly copy from AutoencoderKL overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor)) From 65e8f4d105a5fd9e08009fa7b800091fe2d5886a Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 4 Apr 2026 14:34:09 +0800 Subject: [PATCH 18/23] fix Signed-off-by: Isotr0py --- vllm_omni/diffusion/models/z_image/pipeline_z_image.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py index 79e55ad6382..694eddcc3c8 100644 --- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py +++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py @@ -183,12 +183,12 @@ def __init__( text_encoder_config = AutoConfig.from_pretrained( model, subfolder="text_encoder", local_files_only=local_files_only - ).to(self._execution_device) + ) self.text_encoder = create_transformers_model( AutoModelForCausalLM, od_config, hf_config=text_encoder_config, - ) + ).to(self._execution_device) if text_encoder_config.tie_word_embeddings: self.text_encoder.lm_head.weight = self.text_encoder.get_input_embeddings().weight From 0b6fe43acecec59884cad5585adbddeb2390d75a Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 30 Apr 2026 12:18:49 +0800 Subject: [PATCH 19/23] update doc Signed-off-by: Isotr0py --- docs/user_guide/diffusion/quantization/fp8.md | 16 ++++++++-------- .../diffusion/quantization/overview.md | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/user_guide/diffusion/quantization/fp8.md b/docs/user_guide/diffusion/quantization/fp8.md index 9906631b625..bfa3679d34e 100644 --- a/docs/user_guide/diffusion/quantization/fp8.md +++ b/docs/user_guide/diffusion/quantization/fp8.md @@ -58,14 +58,14 @@ The available `ignored_layers` names depend on the model architecture (e.g., `to ## Supported Models -| Model | HF Models | Recommendation | `ignored_layers` | -|-------|-----------|---------------|------------------| -| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | All layers | None | -| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Skip sensitive layers | `img_mlp` | -| Flux | `black-forest-labs/FLUX.1-dev` | All layers | None | -| HunyuanImage-3 | `tencent/HunyuanImage3` | All layers | None | -| HunyuanVideo-1.5 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `720p_t2v`, `480p_i2v` | All layers | None | -| Helios | `BestWishYsh/Helios-Base`, `BestWishYsh/Helios-Mid`, `BestWishYsh/Helios-Distilled` | All layers | None | +| Model | HF Models | Recommendation | `ignored_layers` | Text-encoder quantization | +|-------|-----------|----------------|------------------|---------------------------| +| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | All layers | None | ✅︎ | +| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Skip sensitive layers | `img_mlp` | | +| Flux | `black-forest-labs/FLUX.1-dev` | All layers | None | | +| HunyuanImage-3 | `tencent/HunyuanImage3` | All layers | None | | +| HunyuanVideo-1.5 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `720p_t2v`, `480p_i2v` | All layers | None | | +| Helios | `BestWishYsh/Helios-Base`, `BestWishYsh/Helios-Mid`, `BestWishYsh/Helios-Distilled` | All layers | None | | ## Combining with Other Features diff --git a/docs/user_guide/diffusion/quantization/overview.md b/docs/user_guide/diffusion/quantization/overview.md index 25d7fa5c756..821ec008eb4 100644 --- a/docs/user_guide/diffusion/quantization/overview.md +++ b/docs/user_guide/diffusion/quantization/overview.md @@ -54,7 +54,7 @@ When `--quantization fp8` is enabled for diffusion models: | Component | What Gets Quantized | Mechanism | |-----------|-------------------|-----------| | **DiT (transformer)** | `nn.Linear` layers | vLLM W8A8 FP8 compute (Ada/Hopper) or weight-only (older GPUs) | -| **Text encoder** | `nn.Linear` layers | FP8 weight storage, BF16 compute | +| **Text encoder** | `nn.Linear` layers | vLLM W8A8 FP8 compute (Ada/Hopper) or weight-only (older GPUs) | | **VAE** | `nn.Conv2d`, `nn.Conv3d` layers | FP8 weight storage, BF16 compute | ### Multi-stage Omni Models From 97f2466cf2817c5341e48355eec338517e290c87 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 30 Apr 2026 12:32:00 +0800 Subject: [PATCH 20/23] update doc Signed-off-by: Isotr0py --- docs/user_guide/quantization/fp8.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/user_guide/quantization/fp8.md b/docs/user_guide/quantization/fp8.md index e89bc76ca77..7373a39ffb4 100644 --- a/docs/user_guide/quantization/fp8.md +++ b/docs/user_guide/quantization/fp8.md @@ -32,15 +32,15 @@ guide. FP8 on Ampere may use a weight-only path where available. ### Diffusion Model (Qwen-Image, Wan2.2) -| Model | HF models | Online | Pre-calibrated | Recommendation | `ignored_layers` | -|-------|-----------|:-------:|:------:|----------------|------------------| -| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Yes | Yes | Skip sensitive image-stream MLPs when quality regresses | `img_mlp` | -| Wan2.2 | Wan2.2 diffusion pipelines | Not validated | Not validated | Validate against BF16 before documenting as supported | TBD | -| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | Yes | Yes | All layers | None | -| FLUX.1 | `black-forest-labs/FLUX.1-dev`, `black-forest-labs/FLUX.1-schnell` | Yes | Yes | All layers | None | -| FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B` | Yes | Yes | All layers | None | -| HunyuanImage-3.0 | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` | Yes | Yes | All layers; use the Hunyuan stage config for multi-stage runs | None | -| HunyuanVideo-1.5 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `720p_t2v`, `480p_i2v` | Yes | Yes | All layers | None | +| Model | HF models | Online | Pre-calibrated | Recommendation | `ignored_layers` | Text-Encoder quantization | +|-------|-----------|:-------:|:------:|----------------|------------------|------------------| +| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | Yes | Yes | Skip sensitive image-stream MLPs when quality regresses | `img_mlp` | | +| Wan2.2 | Wan2.2 diffusion pipelines | Not validated | Not validated | Validate against BF16 before documenting as supported | TBD | | +| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | Yes | Yes | All layers | None | ✅︎ | +| FLUX.1 | `black-forest-labs/FLUX.1-dev`, `black-forest-labs/FLUX.1-schnell` | Yes | Yes | All layers | None | | +| FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B` | Yes | Yes | All layers | None | | +| HunyuanImage-3.0 | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` | Yes | Yes | All layers; use the Hunyuan stage config for multi-stage runs | None | | +| HunyuanVideo-1.5 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `720p_t2v`, `480p_i2v` | Yes | Yes | All layers | None | | ### Multi-Stage Omni/TTS Model (Qwen3-Omni, Qwen3-TTS) From a8303ba7dbbc8a0c721eb7e7be245d725b4160b1 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 30 Apr 2026 18:13:05 +0800 Subject: [PATCH 21/23] fix import Signed-off-by: Isotr0py --- vllm_omni/diffusion/models/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/models/utils.py b/vllm_omni/diffusion/models/utils.py index ce5fa34322a..122646219ff 100644 --- a/vllm_omni/diffusion/models/utils.py +++ b/vllm_omni/diffusion/models/utils.py @@ -18,7 +18,7 @@ from vllm.model_executor.models.utils import maybe_prefix from vllm_omni.diffusion.data import OmniDiffusionConfig -from vllm_omni.diffusion.quantization import get_vllm_quant_config_for_layers +from vllm_omni.quantization import build_quant_config if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel @@ -78,7 +78,7 @@ def recursive_replace_linear(model: nn.Module, od_config: OmniDiffusionConfig): - `nn.Linear` with vLLM's tensor parallel linear classes """ # Prefix the patterns because we always start from `self.model` - quant_config = get_vllm_quant_config_for_layers(od_config.quantization_config) + quant_config = build_quant_config(od_config.quantization_config) def _recursive_replace(module: nn.Module, prefix: str): for child_name, child_module in module.named_children(): From b9ef9ba0f7ec05307a473927e99fcf4b8e48b312 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 30 Apr 2026 21:50:22 +0800 Subject: [PATCH 22/23] fix weights downloading Signed-off-by: Isotr0py --- .../model_loader/diffusers_loader.py | 34 ++++++------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py index 446bf6dd65b..1abcae42ced 100644 --- a/vllm_omni/diffusion/model_loader/diffusers_loader.py +++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py @@ -112,12 +112,7 @@ def _prepare_weights( raise ValueError( f"Multiple index files found in {model_name_or_path} with subfolder {subfolder}: {available_index_file}" ) - index_file_with_subfolder = available_index_file[0] if len(available_index_file) == 1 else None - index_file = ( - index_file_with_subfolder.split("/")[-1] - if index_file_with_subfolder and subfolder is not None - else index_file_with_subfolder - ) + index_file = available_index_file[0] if len(available_index_file) == 1 else "" # only hf is supported currently if load_format == "auto": @@ -135,20 +130,21 @@ def _prepare_weights( if allow_patterns_overrides is not None: allow_patterns = allow_patterns_overrides - if subfolder is not None: - allow_patterns = [f"{subfolder}/{pattern}" for pattern in allow_patterns] - if not is_local: hf_folder = download_weights_from_hf( model_name_or_path, self.load_config.download_dir, allow_patterns, revision, + subfolder=subfolder, ignore_patterns=self.load_config.ignore_patterns, ) else: hf_folder = model_name_or_path + if subfolder is not None: + hf_folder = os.path.join(hf_folder, subfolder) + hf_weights_files: list[str] = [] for pattern in allow_patterns: hf_weights_files += glob.glob(os.path.join(hf_folder, pattern)) @@ -166,22 +162,12 @@ def _prepare_weights( if not is_local: download_safetensors_index_file_from_hf( model_name_or_path, - index_file_with_subfolder, - self.load_config.download_dir, - revision, + index_file, + cache_dir=self.load_config.download_dir, + subfolder=subfolder, + revision=revision, ) - # Some diffusers pipelines keep component weights under a - # subfolder (e.g. "transformer/") and the corresponding index file - # uses filenames relative to that subfolder. vLLM's - # `filter_duplicate_safetensors_files` expects weight_map entries - # to be relative to the `hf_folder` we pass in, so we point it to - # the component subfolder to avoid filtering out all shards. - filter_folder = os.path.join(hf_folder, subfolder) if subfolder is not None else hf_folder - hf_weights_files = filter_duplicate_safetensors_files( - hf_weights_files, - filter_folder, - index_file or "", - ) + hf_weights_files = filter_duplicate_safetensors_files(hf_weights_files, hf_folder, index_file) else: hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files) From 326c6783b85950b4d4f2536f93d938b85a34df24 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 3 May 2026 23:45:23 +0800 Subject: [PATCH 23/23] Update vllm_omni/diffusion/model_loader/diffusers_loader.py Co-authored-by: SYLAR <125541396+lishunyang12@users.noreply.github.com> Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm_omni/diffusion/model_loader/diffusers_loader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py index 1abcae42ced..4c6fb070997 100644 --- a/vllm_omni/diffusion/model_loader/diffusers_loader.py +++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py @@ -105,14 +105,14 @@ def _prepare_weights( possible_index_files = [ f"{subfolder}/{index_file}" if subfolder is not None else index_file for index_file in INDEX_FILES ] - available_index_file = list( - filter(lambda f: file_exists(model_name_or_path, f, revision=revision), possible_index_files) - ) + available_index_file = [ + f for f in possible_index_files if file_exists(model_name_or_path, f, revision=revision) + ] if len(available_index_file) > 1: raise ValueError( f"Multiple index files found in {model_name_or_path} with subfolder {subfolder}: {available_index_file}" ) - index_file = available_index_file[0] if len(available_index_file) == 1 else "" + index_file = available_index_file[0] if available_index_file else "" # only hf is supported currently if load_format == "auto":