diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index fc4b75e78d5..8993c8dadca 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -149,7 +149,7 @@ def main(): for p in prompts: token_ids = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type) - prompt_dict: dict = {"prompt_token_ids": token_ids} + prompt_dict: dict = {"prompt_token_ids": token_ids, "prompt": p} if args.modality == "text2img": prompt_dict["modalities"] = ["image"] diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_fused_moe.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_fused_moe.py index 3f69cd00af2..63ba0d724c6 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_fused_moe.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_fused_moe.py @@ -29,12 +29,6 @@ class HunyuanFusedMoEDefault(FusedMoE): def __init__(self, *, prefix: str = "", **kwargs: Any) -> None: super().__init__(prefix=prefix, **kwargs) self._prefix = prefix - self._init_hook_handle = self.register_forward_pre_hook(self._initialize_kernel_hook, with_kwargs=True) - - def _initialize_kernel_hook(self, module: Any, args: Any, kwargs: Any) -> None: - if self.quant_method: - self.quant_method.process_weights_after_loading(self) - self._init_hook_handle.remove() def forward(self, hidden_states: Any, router_logits: Any) -> Any: _set_forward_context_num_tokens(hidden_states.shape[0]) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py index e1acc4cdc65..cdd79b9fddb 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py @@ -1514,7 +1514,6 @@ def __init__( top_k=top_k, hidden_size=config.hidden_size, intermediate_size=intermediate_size, - reduce_results=False, renormalize=top_k > 1, quant_config=quant_config, prefix=f"{prefix}.experts", @@ -1532,11 +1531,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) final_hidden_states = self.experts(hidden_states=hidden_states, router_logits=router_logits) - if self.shared_mlp is not None: - final_hidden_states = final_hidden_states[0] + final_hidden_states[1] - - if self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(final_hidden_states) return final_hidden_states.view(orig_shape) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index dbbad4e7197..0646d77e6df 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -11,7 +11,6 @@ from diffusers.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler from transformers.generation.configuration_utils import GenerationConfig from transformers.generation.utils import ALL_CACHE_NAMES, GenerationMixin -from transformers.models.siglip2 import Siglip2VisionConfig, Siglip2VisionModel from transformers.utils.generic import ModelOutput from vllm.config.vllm import get_current_vllm_config from vllm.model_executor.models.utils import AutoWeightsLoader, WeightsMapper @@ -22,6 +21,7 @@ from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.model_executor.models.hunyuan_image3.siglip2 import Siglip2VisionTransformer from .autoencoder import AutoencoderKLConv3D from .hunyuan_image3_tokenizer import TokenizerWrapper @@ -111,9 +111,7 @@ def __init__(self, od_config: OmniDiffusionConfig) -> None: self._pipeline = None self._tkwrapper = TokenizerWrapper(od_config.model) self.image_processor = HunyuanImage3ImageProcessor(self.hf_config) - self.hf_config.vit.pop("use_return_dict", None) - vision_config = Siglip2VisionConfig(**self.hf_config.vit) - self.vision_model = Siglip2VisionModel(vision_config).vision_model + self.vision_model = Siglip2VisionTransformer(self.hf_config.vit) # self.vision_model = vision_model.vision_model self.vision_aligner = LightProjector(self.hf_config.vit_aligner) self.timestep_emb = TimestepEmbedder(hidden_size=self.hf_config.hidden_size)