From fc18d8d98aa88d9ba62d9b352dc2fdf7b24e810c Mon Sep 17 00:00:00 2001 From: betta18 Date: Mon, 13 Apr 2026 16:38:51 +0800 Subject: [PATCH 1/6] [Feature][NPU] support hyimage3 offline quant by vllm-ascend. Signed-off-by: betta18 --- vllm_omni/diffusion/data.py | 2 +- .../hunyuan_image_3_transformer.py | 25 ++++++++---- .../pipeline_hunyuan_image_3.py | 11 +++++- vllm_omni/diffusion/registry.py | 19 +++++++++ .../diffusion/worker/diffusion_worker.py | 12 ++++++ vllm_omni/platforms/interface.py | 8 ++++ .../platforms/npu/models/hunyuan_fused_moe.py | 6 --- vllm_omni/platforms/npu/platform.py | 14 +++++++ .../hunyuan_image3_moe_dit_quant_ascend.yaml | 39 +++++++++++++++++++ 9 files changed, 121 insertions(+), 15 deletions(-) create mode 100644 vllm_omni/platforms/npu/stage_configs/hunyuan_image3_moe_dit_quant_ascend.yaml diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py index 488378b40ff..8e46cda6bc1 100644 --- a/vllm_omni/diffusion/data.py +++ b/vllm_omni/diffusion/data.py @@ -629,7 +629,7 @@ def from_kwargs(cls, **kwargs: Any) -> "OmniDiffusionConfig": # Backwards-compatibility: map "quantization" to "quantization_config" # so callers using the old field name still work. 
- if "quantization" in kwargs and "quantization_config" not in kwargs: + if "quantization" in kwargs and kwargs.get("quantization_config", None) is None: kwargs["quantization_config"] = kwargs.pop("quantization") else: kwargs.pop("quantization", None) diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py index 69b39974b35..94824dff73f 100644 --- a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py +++ b/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py @@ -1481,7 +1481,7 @@ def __init__( config.hidden_size, config.num_experts, bias=False, - quant_config=None, + quant_config=quant_config, prefix=f"{prefix}.gate", ) if config.use_mixed_mlp_moe > 0: @@ -1655,8 +1655,10 @@ def forward( custom_pos_emb: tuple[torch.FloatTensor] | None = None, **kwargs, ) -> torch.Tensor: - bsz, q_len, _ = hidden_states.size() + bsz, q_len, hidden_size = hidden_states.size() + hidden_states = hidden_states.reshape(-1, hidden_size) qkv, _ = self.qkv_proj(hidden_states) + qkv = qkv.reshape(bsz, q_len, -1) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) past_key_value: Cache | None = kwargs.get("past_key_value", None) @@ -1719,7 +1721,7 @@ def __init__( rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, - quant_config=None, + quant_config=quant_config, bias=attention_bias, cache_config=None, prefix=f"{prefix}.self_attn", @@ -1929,7 +1931,7 @@ def __init__(self, config: HunyuanImage3Config, quant_config=None, prefix: str = layer_idx=int(prefix.split(".")[-1]), prefix=prefix, ), - prefix=f"{prefix}.layers", + prefix=f"{prefix}.layers" if prefix else "layers", ) if get_pp_group().is_last_rank: self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -1944,7 +1946,7 @@ def _split_qkv_weight(self, qkv: torch.Tensor): num_attention_heads = 
self.config.num_attention_heads num_kv_heads = getattr(self.config, "num_key_value_heads", self.config.num_attention_heads) num_key_value_groups = num_attention_heads // num_kv_heads - hidden_size = self.config.hidden_size + hidden_size = qkv.shape[1] if hasattr(self.config, "head_dim"): attention_head_dim = self.config.head_dim @@ -1997,12 +1999,19 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): split_params_mapping = [ (".gate_up_proj", ".gate_and_up_proj", 2, [(1, 1), (0, 1)], None), ( - ".qkv_proj", - ".qkv_proj", + ".qkv_proj.weight", + ".qkv_proj.weight", num_attention_heads + num_kv_heads * 2, [("q", num_attention_heads), ("k", num_kv_heads), ("v", num_kv_heads)], self._split_qkv_weight, ), + ( + ".qkv_proj.weight_scale", + ".qkv_proj.weight_scale", + num_attention_heads + num_kv_heads * 2, + [("q", num_attention_heads), ("k", num_kv_heads), ("v", num_kv_heads)], + self._split_qkv_weight, + ) ] params_dict = dict(self.named_parameters()) @@ -2097,6 +2106,8 @@ def contains_unexpected_keyword(name, keywords): continue if "mlp.experts" in name: continue + if ".qkv_proj" in name and not name.endswith(weight_name): + continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py b/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py index c19e8a65a80..330c6236deb 100644 --- a/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py +++ b/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py @@ -15,7 +15,7 @@ from transformers.models.siglip2 import Siglip2VisionConfig, Siglip2VisionModel from transformers.utils.generic import ModelOutput from vllm.config.vllm import get_current_vllm_config -from vllm.model_executor.models.utils import AutoWeightsLoader +from vllm.model_executor.models.utils import AutoWeightsLoader, WeightsMapper from vllm.transformers_utils.config import get_config from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig @@ -64,6 +64,15 @@ def to_device(data, device): class HunyuanImage3Pipeline(HunyuanImage3PreTrainedModel, GenerationMixin, DiffusionPipelineProfilerMixin): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "model.": "", + }, + orig_to_new_substr={ + "mlp.gate.wg.": "mlp.gate.", + "gate_and_up_proj.": "gate_up_proj.", + }, + ) _PROFILER_TARGETS = [ "model.forward", "model.layers[0].forward", diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index 994dac04ad4..1637ae7a8e5 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -5,6 +5,7 @@ import torch.nn as nn from vllm.logger import init_logger +from vllm.model_executor.model_loader.utils import configure_quant_config from vllm.model_executor.models.registry import _LazyRegisteredModel, _ModelRegistry from vllm_omni.diffusion.data import OmniDiffusionConfig @@ -12,6 +13,7 @@ from vllm_omni.diffusion.distributed.sp_plan import SequenceParallelConfig, get_sp_plan_from_model from vllm_omni.diffusion.forward_context import get_forward_context from 
vllm_omni.diffusion.hooks.sequence_parallel import apply_sequence_parallel +from vllm_omni.platforms import current_omni_platform logger = init_logger(__name__) @@ -176,6 +178,22 @@ } +def _prepare_diffusion_quant_config( + od_config: OmniDiffusionConfig, + model_class: type[nn.Module], +) -> None: + """Prepare diffusion quant config using vLLM-style model bindings.""" + quant_config = od_config.quantization_config + if quant_config is None: + return + if hasattr(quant_config, "maybe_update_config"): + quant_config.maybe_update_config(od_config.model, revision=od_config.revision) + model_class.packed_modules_mapping = current_omni_platform.get_diffusion_packed_modules_mapping( + model_class + ) + configure_quant_config(quant_config, model_class) + + def initialize_model( od_config: OmniDiffusionConfig, ) -> nn.Module: @@ -198,6 +216,7 @@ def initialize_model( """ model_class = DiffusionModelRegistry._try_load_model_cls(od_config.model_class_name) if model_class is not None: + _prepare_diffusion_quant_config(od_config, model_class) model = model_class(od_config=od_config) vae_pp_size = od_config.parallel_config.vae_patch_parallel_size diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py index 6e1cabba0ce..7c16265f71d 100644 --- a/vllm_omni/diffusion/worker/diffusion_worker.py +++ b/vllm_omni/diffusion/worker/diffusion_worker.py @@ -14,6 +14,7 @@ from collections.abc import Iterable from contextlib import AbstractContextManager, nullcontext from typing import Any +from types import SimpleNamespace import torch import zmq @@ -23,6 +24,7 @@ from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.mem_utils import GiB_bytes from vllm.v1.worker.workspace import init_workspace_manager +from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm_omni.diffusion.data import ( DiffusionOutput, @@ -118,6 +120,16 @@ def init_device(self) -> None: 
vllm_config.parallel_config.tensor_parallel_size = self.od_config.parallel_config.tensor_parallel_size vllm_config.parallel_config.data_parallel_size = self.od_config.parallel_config.data_parallel_size vllm_config.parallel_config.enable_expert_parallel = self.od_config.parallel_config.enable_expert_parallel + hf_config = get_config(self.od_config.model, trust_remote_code=self.od_config.trust_remote_code) + hf_text_config = get_hf_text_config(hf_config) + vllm_config.model_config = SimpleNamespace( + hf_config=hf_config, + hf_text_config=hf_text_config, + enforce_eager=self.od_config.enforce_eager, + dtype=self.od_config.dtype, + enable_return_routed_experts=False + ) + vllm_config.quant_config = self.od_config.quantization_config self.vllm_config = vllm_config # Initialize distributed environment diff --git a/vllm_omni/platforms/interface.py b/vllm_omni/platforms/interface.py index 7739cec78b9..4525903f18f 100644 --- a/vllm_omni/platforms/interface.py +++ b/vllm_omni/platforms/interface.py @@ -5,6 +5,7 @@ from typing import Any import torch +import torch.nn as nn from vllm.platforms import Platform @@ -63,6 +64,13 @@ def get_diffusion_model_impl_qualname(cls, op_name: str) -> str: def prepare_diffusion_op_runtime(cls, op_name: str, **kwargs: Any) -> None: return None + @classmethod + def get_diffusion_packed_modules_mapping( + cls, + model_class: type[nn.Module], + ) -> dict[str, list[str]] | None: + return None + @classmethod def get_diffusion_attn_backend_cls( cls, diff --git a/vllm_omni/platforms/npu/models/hunyuan_fused_moe.py b/vllm_omni/platforms/npu/models/hunyuan_fused_moe.py index fad4c0edfc3..05079a7e4ae 100644 --- a/vllm_omni/platforms/npu/models/hunyuan_fused_moe.py +++ b/vllm_omni/platforms/npu/models/hunyuan_fused_moe.py @@ -107,12 +107,6 @@ class AscendHunyuanFusedMoE(AscendSharedFusedMoE): def __init__(self, *, prefix: str = "", **kwargs: Any) -> None: super().__init__(prefix=prefix, **kwargs) self._prefix = prefix - self._init_hook_handle = 
self.register_forward_pre_hook(self._initialize_kernel_hook, with_kwargs=True) - - def _initialize_kernel_hook(self, module: Any, args: Any, kwargs: Any) -> None: - if self.quant_method: - self.quant_method.process_weights_after_loading(self) - self._init_hook_handle.remove() def forward(self, hidden_states: Any, router_logits: Any) -> Any: _set_hunyuan_fused_moe_forward_context(hidden_states.shape[0]) diff --git a/vllm_omni/platforms/npu/platform.py b/vllm_omni/platforms/npu/platform.py index 1d6bea7cb5d..aa845fe67a6 100644 --- a/vllm_omni/platforms/npu/platform.py +++ b/vllm_omni/platforms/npu/platform.py @@ -4,6 +4,7 @@ from typing import Any import torch +import torch.nn as nn from vllm.logger import init_logger from vllm_ascend.platform import NPUPlatform @@ -12,6 +13,12 @@ logger = init_logger(__name__) +_DIFFUSION_PACKED_MODULES_MAPPING = { + "HunyuanImage3Pipeline": { + "experts": ["experts.0.gate_up_proj", "experts.0.down_proj"], + }, +} + class NPUOmniPlatform(OmniPlatform, NPUPlatform): """NPU/Ascend implementation of OmniPlatform. @@ -52,6 +59,13 @@ def prepare_diffusion_op_runtime(cls, op_name: str, **kwargs: Any) -> None: prepare_hunyuan_fused_moe_runtime() + @classmethod + def get_diffusion_packed_modules_mapping( + cls, + model_class: type[nn.Module], + ) -> dict[str, list[str]] | None: + return _DIFFUSION_PACKED_MODULES_MAPPING.get(model_class.__name__, None) + @classmethod def get_diffusion_attn_backend_cls( cls, diff --git a/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_moe_dit_quant_ascend.yaml b/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_moe_dit_quant_ascend.yaml new file mode 100644 index 00000000000..3b1d38b75e9 --- /dev/null +++ b/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_moe_dit_quant_ascend.yaml @@ -0,0 +1,39 @@ +# Stage config for running Hunyuan-Image3.0 DiT on NPU. +# The following config has been verified on 8x A3-64G NPUs. 
+ +# Stage 0: Diffusion (DiT + VAE) +# This stage receives noise and timesteps and performs denoising + VAE decode. +stage_args: + - stage_id: 0 + stage_type: diffusion + runtime: + devices: "0,1,2,3,4,5,6,7" + engine_args: + max_num_seqs: 1 + model_stage: dit + gpu_memory_utilization: 0.65 + enforce_eager: true + trust_remote_code: true + engine_output_type: image + distributed_executor_backend: "mp" + enable_prefix_caching: false + max_num_batched_tokens: 32768 + parallel_config: + tensor_parallel_size: 8 + enable_expert_parallel: true + quantization: ascend + omni_kv_config: + need_recv_cache: true + + final_output: true + final_output_type: image + is_comprehension: false + default_sampling_params: + seed: 42 + +# Runtime defaults +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 From 5a1a56757a6182290f4ce69c465af294981697f9 Mon Sep 17 00:00:00 2001 From: betta18 Date: Tue, 21 Apr 2026 14:12:43 +0800 Subject: [PATCH 2/6] fix bug Signed-off-by: betta18 --- .../diffusion/quantization/msmodelslim.md | 56 +++++++++++++++++++ .../text_to_image/text_to_image.py | 1 + .../hunyuan_image3_transformer.py | 2 +- vllm_omni/diffusion/registry.py | 6 +- .../diffusion/worker/diffusion_worker.py | 14 +++-- vllm_omni/engine/async_omni_engine.py | 7 +++ .../hunyuan_image3_moe_dit_quant_ascend.yaml | 39 ------------- 7 files changed, 76 insertions(+), 49 deletions(-) create mode 100644 docs/user_guide/diffusion/quantization/msmodelslim.md delete mode 100644 vllm_omni/platforms/npu/stage_configs/hunyuan_image3_moe_dit_quant_ascend.yaml diff --git a/docs/user_guide/diffusion/quantization/msmodelslim.md b/docs/user_guide/diffusion/quantization/msmodelslim.md new file mode 100644 index 00000000000..5492cd9272b --- /dev/null +++ b/docs/user_guide/diffusion/quantization/msmodelslim.md @@ -0,0 +1,56 @@ +# msModelSlim Quantization + +## Overview + +[msModelSlim](https://github.com/Ascend/msmodelslim) is an Ascend-friendly compression tool focused on acceleration, 
using compression techniques, and built for Ascend hardware. It includes a series of inference optimization technologies such as quantization and compression, aiming to accelerate dense large language models, MoE models, multimodal understanding models, multimodal generation models, etc. + +Once you have a quantized model generated by **msModelSlim**, you can run inference with vLLM Omni by passing the `--quantization ascend` option, which enables the quantization features. + +### Supported Schemes + +| Scheme | Bits | Status | +|--------|------|--------| +| W8A8 | 8 | ✅ Supported | +| W4A4 | 4 | Planned | + +W8A8 is the first supported scheme. Additional schemes will be added in future releases. + +## Model Quantization + +The following example shows how to generate W8A8 quantized weights for the [Wan2_2 model](https://gitcode.com/Ascend/msmodelslim/blob/master/example/multimodal_sd/Wan2_2/README.md). + +**Quantization Script:** + +```bash +msmodelslim quant \ + --model_path /path/to/wan2_2_t2v_float_weights \ + --save_path /path/to/wan2_2_t2v_quantized_weights \ + --device npu \ + --model_type Wan2_2 \ + --config_path /lab_practice/wan2_2/wan2_2_w8a8f8_mxfp_t2v.yaml \ + --trust_remote_code True +``` + +After quantization completes, the output directory will contain the quantized model files. + +For more examples, refer to the [official examples](https://gitcode.com/Ascend/msit/tree/master/msmodelslim/example). + +## Configuration + +1. **CLI**: pass `--quantization ascend`. + +```bash +# Offline inference +python text_to_image.py --model /path/to/quantized_weights --quantization ascend + +# Online serving +vllm serve /path/to/quantized_weights --omni --quantization ascend +``` + +## Supported Models + +| Model | HF Models | Recommendation | `ignored_layers` | +|-------|-----------|---------------|------------------| +| HunyuanImage-3.0 | - | All layers | None | + +Currently, quantized HunyuanImage-3.0 weights have not been uploaded to public model platforms such as Hugging Face. 
You can use a [HunyuanImage-3.0-adapted msModelSlim version](https://gitcode.com/betta18/msmodelslim/tree/hyimage3_mxfp8) to generate the quantized weights manually. We will upload the quantized weights as soon as possible. diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py index 3b3f8e77cfb..df7a6351670 100644 --- a/examples/offline_inference/text_to_image/text_to_image.py +++ b/examples/offline_inference/text_to_image/text_to_image.py @@ -267,6 +267,7 @@ def parse_args() -> argparse.Namespace: default=None, help=("Custom system prompt. Used when --use-system-prompt is custom. "), ) + current_omni_platform.pre_register_and_update(parser) return parser.parse_args() diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py index 88cd063d4e8..0f3c33389c5 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py @@ -2015,7 +2015,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): num_attention_heads + num_kv_heads * 2, [("q", num_attention_heads), ("k", num_kv_heads), ("v", num_kv_heads)], self._split_qkv_weight, - ) + ), ] params_dict = dict(self.named_parameters()) diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index 64ba8e78c47..12ad79171b3 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -238,10 +238,8 @@ def _prepare_diffusion_quant_config( if quant_config is None: return if hasattr(quant_config, "maybe_update_config"): - quant_config.maybe_update_config(od_config.model, revision=od_config.revision) - model_class.packed_modules_mapping = current_omni_platform.get_diffusion_packed_modules_mapping( - model_class - ) + quant_config.maybe_update_config(od_config.model) + 
model_class.packed_modules_mapping = current_omni_platform.get_diffusion_packed_modules_mapping(model_class) configure_quant_config(quant_config, model_class) diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py index 3fe7a0866b5..927bbeb1a2a 100644 --- a/vllm_omni/diffusion/worker/diffusion_worker.py +++ b/vllm_omni/diffusion/worker/diffusion_worker.py @@ -13,8 +13,8 @@ import os from collections.abc import Iterable from contextlib import AbstractContextManager, nullcontext -from typing import Any from types import SimpleNamespace +from typing import Any import torch import zmq @@ -22,10 +22,10 @@ from vllm.distributed.device_communicators.shm_broadcast import MessageQueue from vllm.logger import init_logger from vllm.profiler.wrapper import CudaProfilerWrapper, WorkerProfiler +from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.mem_utils import GiB_bytes from vllm.v1.worker.workspace import init_workspace_manager -from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm_omni.diffusion.data import ( DiffusionOutput, @@ -122,14 +122,18 @@ def init_device(self) -> None: vllm_config.parallel_config.data_parallel_size = self.od_config.parallel_config.data_parallel_size vllm_config.parallel_config.enable_expert_parallel = self.od_config.parallel_config.enable_expert_parallel vllm_config.profiler_config = self.od_config.profiler_config - hf_config = get_config(self.od_config.model, trust_remote_code=self.od_config.trust_remote_code) - hf_text_config = get_hf_text_config(hf_config) + try: + hf_config = get_config(self.od_config.model, trust_remote_code=self.od_config.trust_remote_code) + except ValueError: + hf_config = None + logger.info("Skipping hf_config loading for diffusion model %r", self.od_config.model_class_name) + hf_text_config = get_hf_text_config(hf_config) if hf_config is not 
None else None vllm_config.model_config = SimpleNamespace( hf_config=hf_config, hf_text_config=hf_text_config, enforce_eager=self.od_config.enforce_eager, dtype=self.od_config.dtype, - enable_return_routed_experts=False + enable_return_routed_experts=False, ) vllm_config.quant_config = self.od_config.quantization_config self.vllm_config = vllm_config diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 7b98c13b828..ee439fb921d 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -1429,6 +1429,13 @@ def _resolve_stage_configs(self, model: str, kwargs: dict[str, Any]) -> tuple[st or cfg.engine_args.quantization_config is None ): cfg.engine_args.quantization_config = quantization_config + quantization = kwargs.get("quantization") + if quantization is not None: + if ( + not hasattr(cfg.engine_args, "quantization") + or cfg.engine_args.quantization is None + ): + cfg.engine_args.quantization = quantization except Exception as e: logger.warning("Failed to inject LoRA config for stage: %s", e) diff --git a/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_moe_dit_quant_ascend.yaml b/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_moe_dit_quant_ascend.yaml deleted file mode 100644 index 3b1d38b75e9..00000000000 --- a/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_moe_dit_quant_ascend.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Stage config for running Hunyuan-Image3.0 DiT on NPU. -# The following config has been verified on 8x A3-64G NPUs. - -# Stage 0: Diffusion (DiT + VAE) -# This stage receives noise and timesteps and performs denoising + VAE decode. 
-stage_args: - - stage_id: 0 - stage_type: diffusion - runtime: - devices: "0,1,2,3,4,5,6,7" - engine_args: - max_num_seqs: 1 - model_stage: dit - gpu_memory_utilization: 0.65 - enforce_eager: true - trust_remote_code: true - engine_output_type: image - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - parallel_config: - tensor_parallel_size: 8 - enable_expert_parallel: true - quantization: ascend - omni_kv_config: - need_recv_cache: true - - final_output: true - final_output_type: image - is_comprehension: false - default_sampling_params: - seed: 42 - -# Runtime defaults -runtime: - enabled: true - defaults: - window_size: -1 - max_inflight: 1 From ce5f3f1f5761c869b6a68be6a8bdf6da6bbdde6e Mon Sep 17 00:00:00 2001 From: betta18 Date: Thu, 23 Apr 2026 16:14:17 +0800 Subject: [PATCH 3/6] fix pre-commit Signed-off-by: betta18 --- vllm_omni/engine/async_omni_engine.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index ee439fb921d..eb8e0f3a205 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -1431,10 +1431,7 @@ def _resolve_stage_configs(self, model: str, kwargs: dict[str, Any]) -> tuple[st cfg.engine_args.quantization_config = quantization_config quantization = kwargs.get("quantization") if quantization is not None: - if ( - not hasattr(cfg.engine_args, "quantization") - or cfg.engine_args.quantization is None - ): + if (not hasattr(cfg.engine_args, "quantization") or cfg.engine_args.quantization is None): cfg.engine_args.quantization = quantization except Exception as e: logger.warning("Failed to inject LoRA config for stage: %s", e) From 631ddc77c54d02e295714f40ff1b373f8ed2b380 Mon Sep 17 00:00:00 2001 From: betta18 Date: Thu, 23 Apr 2026 16:15:58 +0800 Subject: [PATCH 4/6] fix pre-commit Signed-off-by: betta18 --- vllm_omni/engine/async_omni_engine.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index eb8e0f3a205..693fcdc47b0 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -1431,7 +1431,7 @@ def _resolve_stage_configs(self, model: str, kwargs: dict[str, Any]) -> tuple[st cfg.engine_args.quantization_config = quantization_config quantization = kwargs.get("quantization") if quantization is not None: - if (not hasattr(cfg.engine_args, "quantization") or cfg.engine_args.quantization is None): + if not hasattr(cfg.engine_args, "quantization") or cfg.engine_args.quantization is None: cfg.engine_args.quantization = quantization except Exception as e: logger.warning("Failed to inject LoRA config for stage: %s", e) From 9fce16e31e01e79a065cf419dfd8987ba4792754 Mon Sep 17 00:00:00 2001 From: betta18 Date: Fri, 24 Apr 2026 14:13:32 +0800 Subject: [PATCH 5/6] fix bug when model_class.packed_modules_mapping has default value. 
Signed-off-by: betta18 --- vllm_omni/diffusion/registry.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index 12ad79171b3..e96c73d107c 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -239,7 +239,9 @@ def _prepare_diffusion_quant_config( return if hasattr(quant_config, "maybe_update_config"): quant_config.maybe_update_config(od_config.model) - model_class.packed_modules_mapping = current_omni_platform.get_diffusion_packed_modules_mapping(model_class) + diffusion_packed_modules_mapping = current_omni_platform.get_diffusion_packed_modules_mapping(model_class) + if diffusion_packed_modules_mapping is not None: + model_class.packed_modules_mapping = diffusion_packed_modules_mapping configure_quant_config(quant_config, model_class) From 99a9fb96aa1deeac8c0f58ac687d3fe87f2a1f80 Mon Sep 17 00:00:00 2001 From: jiangmengyu18 <56633611+jiangmengyu18@users.noreply.github.com> Date: Fri, 24 Apr 2026 15:53:43 +0800 Subject: [PATCH 6/6] resolve conflicts Signed-off-by: jiangmengyu18 <56633611+jiangmengyu18@users.noreply.github.com> --- vllm_omni/engine/async_omni_engine.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 7ed931b46d2..1c3620c5cd3 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -1450,10 +1450,7 @@ def _resolve_stage_configs(self, model: str, kwargs: dict[str, Any]) -> tuple[st if lora_scale is not None: if not hasattr(cfg.engine_args, "lora_scale") or cfg.engine_args.lora_scale is None: cfg.engine_args.lora_scale = lora_scale - # Prefer explicit quantization_config; fallback to legacy --quantization. 
quantization_config = kwargs.get("quantization_config") - if quantization_config is None: - quantization_config = kwargs.get("quantization") if quantization_config is not None: if ( not hasattr(cfg.engine_args, "quantization_config")