Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
d71981e
fix(hunyuan_image3): handle list pixel_values from Siglip2 in transfo…
zuiho-kai Apr 29, 2026
d360569
fix(hunyuan_image3): use instruct chat format for T2T prompt
zuiho-kai Apr 29, 2026
27083f9
fix(hunyuan_image3): use instruct chat template for all chat tasks
zuiho-kai Apr 29, 2026
ea80934
fix(hunyuan_image3): use Assistant: prefix to match HF tokenizer
zuiho-kai Apr 29, 2026
7bd429e
fix(hunyuan_image3): segment-tokenize prompt to bypass BPE merges
zuiho-kai Apr 29, 2026
3d415e1
docs(hunyuan_image3): document timestep-slot equivalence with HF
zuiho-kai Apr 29, 2026
8a1a4af
docs(hunyuan_image3): document image preprocessing alignment with HF
zuiho-kai Apr 29, 2026
41d2943
fix(hunyuan_image3): cast VAE pixels at encoder boundary, not in proc…
zuiho-kai Apr 30, 2026
31c2fa5
fix(hunyuan_image3): route MoE in fp32 to match HF reference
zuiho-kai Apr 30, 2026
07d8cf0
fix(hunyuan_image3): stop AR-only output at </think> for i2t/t2t
zuiho-kai Apr 30, 2026
6978fd7
fix(hunyuan_image3): import SharedFusedMoE
zuiho-kai Apr 30, 2026
fd4793e
refactor(hunyuan_image3): extract prompt_utils as shared builder
zuiho-kai Apr 30, 2026
36b1408
test(hunyuan_image3): regression tests for AR prompt template (PR #3243)
zuiho-kai Apr 30, 2026
78b8173
test(hunyuan_image3): apply ruff format to prompt_utils regression tests
zuiho-kai Apr 30, 2026
b298435
Merge branch 'main' into feature/hunyuan-t2t-sdpa-fa
Gaohan123 May 5, 2026
c5089d1
fix(hunyuan_image3): adapt to vllm 0.20 SharedFusedMoE removal
zuiho-kai May 5, 2026
0ec3d17
fix(hunyuan_image3): drop unused tensor_model_parallel_all_reduce import
zuiho-kai May 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions examples/offline_inference/hunyuan_image3/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -135,17 +135,19 @@ python end2end.py --model tencent/HunyuanImage-3.0-Instruct \

## Prompt Format

HunyuanImage-3.0 uses a pretrain template format:
HunyuanImage-3.0-Instruct uses an instruct chat template:

```
<|startoftext|>{system_prompt}{<img>}{trigger_tag}{user_prompt}
<|startoftext|>{system_prompt}\n\nUser: {<img>?}{user_prompt}\n\nAssistant: {trigger_tag?}
```

- `<img>`: Placeholder for each input image (auto-inserted by `prompt_utils.py`)
- Trigger tags: `<think>` (CoT), `<recaption>` (recaptioning)
- `<img>`: Placeholder for each input image (single token; expanded by the multimodal pipeline)
- Trigger tags: `<think>` (CoT), `<recaption>` (recaptioning) — placed AFTER `Assistant: `
- System prompt: Auto-selected based on task
- `t2i_vanilla` is the only task that uses the bare pretrain template (no chat structure)

`prompt_utils.build_prompt()` handles this formatting automatically.
The shared `vllm_omni.diffusion.models.hunyuan_image3.prompt_utils.build_prompt_tokens()`
helper handles segment-by-segment tokenization (matches HF `apply_chat_template` byte-for-byte).

------

Expand Down
55 changes: 10 additions & 45 deletions examples/offline_inference/hunyuan_image3/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,23 +16,12 @@
import argparse
import os

from vllm_omni.diffusion.models.hunyuan_image3.system_prompt import (
get_system_prompt,
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
build_prompt_tokens,
)
from vllm_omni.entrypoints.omni import Omni
from vllm_omni.inputs.data import OmniPromptType

# Per-task prompt presets looked up by build_prompt().
# task → (sys_type, bot_task, trigger_tag)
#   sys_type:    default system-prompt type; build_prompt() uses it only when
#                the caller does not pass its own sys_type.
#   bot_task:    forwarded to get_system_prompt() as its second argument
#                (None for the plain t2t / i2t tasks).
#   trigger_tag: literal tag build_prompt() places directly before the user
#                prompt; None means no tag is emitted.
_TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
"t2t": ("en_unified", None, None),
"i2t": ("en_unified", None, None),
"it2i_think": ("en_unified", "think", "<think>"),
"it2i_recaption": ("en_unified", "recaption", "<recaption>"),
"t2i_think": ("en_unified", "think", "<think>"),
"t2i_recaption": ("en_unified", "recaption", "<recaption>"),
"t2i_vanilla": ("en_vanilla", "image", None),
}

# Modality → prompt_utils task mapping
_MODALITY_TASK_MAP = {
"text2img": "t2i_think",
Expand All @@ -42,36 +31,6 @@
}


def build_prompt(
    user_prompt: str,
    task: str = "it2i_think",
    sys_type: str | None = None,
    custom_system_prompt: str | None = None,
) -> str:
    """Assemble a HunyuanImage-3.0 prompt string in the pretrain template layout.

    The result is the concatenation, with no separators, of:
    ``<|startoftext|>``, the stripped system prompt (if any), ``<img>`` for
    tasks whose name starts with ``i2t`` or ``it2i``, the task's trigger tag
    (if any), and finally ``user_prompt``.

    Args:
        user_prompt: Text appended at the end of the prompt.
        task: Key into ``_TASK_PRESETS`` selecting the preset to use.
        sys_type: Optional override for the preset's system-prompt type.
        custom_system_prompt: Forwarded unchanged to ``get_system_prompt``.

    Returns:
        The fully concatenated prompt string.

    Raises:
        ValueError: If ``task`` is not a key of ``_TASK_PRESETS``.
    """
    if task not in _TASK_PRESETS:
        raise ValueError(f"Unknown task {task!r}. Choose from: {sorted(_TASK_PRESETS)}")

    default_sys_type, bot_task, trigger = _TASK_PRESETS[task]

    # A caller-supplied sys_type wins over the preset's default.
    raw_system = get_system_prompt(sys_type or default_sys_type, bot_task, custom_system_prompt)
    system_text = raw_system.strip() if raw_system else ""

    # Only the image-conditioned tasks carry an image placeholder.
    image_tag = "<img>" if task.startswith(("i2t", "it2i")) else ""

    # Absent pieces are empty strings, so they vanish in the join.
    segments = ("<|startoftext|>", system_text, image_tag, trigger or "", user_prompt)
    return "".join(segments)


# Modality → default stage config
_MODALITY_DEFAULT_CONFIG = {
"text2img": "hunyuan_image3_t2i.yaml",
Expand Down Expand Up @@ -179,12 +138,18 @@ def main():

input_image = Image.open(args.image_path).convert("RGB")

# Load tokenizer for segment-wise prompt tokenization (matches HF
# apply_chat_template byte-for-byte; see build_prompt_tokens docstring).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)

# Format prompts
formatted_prompts: list[OmniPromptType] = []
for p in prompts:
formatted_text = build_prompt(p, task=task, sys_type=args.sys_type)
token_ids = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type)

prompt_dict: dict = {"prompt": formatted_text}
prompt_dict: dict = {"prompt_token_ids": token_ids}

if args.modality == "text2img":
prompt_dict["modalities"] = ["image"]
Expand Down
Loading
Loading