vllm-project · Gaohan123 · May 5, 2026 · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
@@ -135,17 +135,19 @@ python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
 
 ## Prompt Format
 
-HunyuanImage-3.0 uses a pretrain template format:
+HunyuanImage-3.0-Instruct uses an instruct chat template:
 
 ```
-<|startoftext|>{system_prompt}{<img>}{trigger_tag}{user_prompt}
+<|startoftext|>{system_prompt}\n\nUser: {<img>?}{user_prompt}\n\nAssistant: {trigger_tag?}
 ```
 
-- `<img>`: Placeholder for each input image (auto-inserted by `prompt_utils.py`)
-- Trigger tags: `<think>` (CoT), `<recaption>` (recaptioning)
+- `<img>`: Placeholder for each input image (single token; expanded by the multimodal pipeline)
+- Trigger tags: `<think>` (CoT), `<recaption>` (recaptioning) — placed AFTER `Assistant: `
 - System prompt: Auto-selected based on task
+- `t2i_vanilla` is the only task that uses the bare pretrain template (no chat structure)
 
-The `prompt_utils.build_prompt()` handles this formatting automatically.
+The shared `vllm_omni.diffusion.models.hunyuan_image3.prompt_utils.build_prompt_tokens()`
+helper handles segment-by-segment tokenization (matches HF `apply_chat_template` byte-for-byte).
 
 ------
 

@@ -16,23 +16,12 @@
 import argparse
 import os
 
-from vllm_omni.diffusion.models.hunyuan_image3.system_prompt import (
-    get_system_prompt,
+from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+    build_prompt_tokens,
 )
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.inputs.data import OmniPromptType
 
-# task → (sys_type, bot_task, trigger_tag)
-_TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
-    "t2t": ("en_unified", None, None),
-    "i2t": ("en_unified", None, None),
-    "it2i_think": ("en_unified", "think", "<think>"),
-    "it2i_recaption": ("en_unified", "recaption", "<recaption>"),
-    "t2i_think": ("en_unified", "think", "<think>"),
-    "t2i_recaption": ("en_unified", "recaption", "<recaption>"),
-    "t2i_vanilla": ("en_vanilla", "image", None),
-}
-
 # Modality → prompt_utils task mapping
 _MODALITY_TASK_MAP = {
     "text2img": "t2i_think",
@@ -42,36 +31,6 @@
 }
 
 
-def build_prompt(
-    user_prompt: str,
-    task: str = "it2i_think",
-    sys_type: str | None = None,
-    custom_system_prompt: str | None = None,
-) -> str:
-    """Build a HunyuanImage-3.0 prompt using pretrain template format."""
-    if task not in _TASK_PRESETS:
-        raise ValueError(f"Unknown task {task!r}. Choose from: {sorted(_TASK_PRESETS)}")
-
-    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
-    effective_sys_type = sys_type or preset_sys_type
-
-    system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt)
-    sys_text = system_prompt.strip() if system_prompt else ""
-
-    has_image_input = task.startswith("i2t") or task.startswith("it2i")
-
-    parts = ["<|startoftext|>"]
-    if sys_text:
-        parts.append(sys_text)
-    if has_image_input:
-        parts.append("<img>")
-    if trigger_tag:
-        parts.append(trigger_tag)
-    parts.append(user_prompt)
-
-    return "".join(parts)
-
-
 # Modality → default stage config
 _MODALITY_DEFAULT_CONFIG = {
     "text2img": "hunyuan_image3_t2i.yaml",
@@ -179,12 +138,18 @@ def main():
 
         input_image = Image.open(args.image_path).convert("RGB")
 
+    # Load tokenizer for segment-wise prompt tokenization (matches HF
+    # apply_chat_template byte-for-byte; see build_prompt_tokens docstring).
+    from transformers import AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+
     # Format prompts
     formatted_prompts: list[OmniPromptType] = []
     for p in prompts:
-        formatted_text = build_prompt(p, task=task, sys_type=args.sys_type)
+        token_ids = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type)
 
-        prompt_dict: dict = {"prompt": formatted_text}
+        prompt_dict: dict = {"prompt_token_ids": token_ids}
 
         if args.modality == "text2img":
             prompt_dict["modalities"] = ["image"]